import numpy as np
import pandas as pd
def remove_columns_matching(df, column, match):
    """
    Drop every row whose value in `column` is equal to `match`.

    Despite the function name, it is rows that are removed, not columns: the values of
    `column` are compared element-wise against `match` and only the rows that differ
    are kept. The supplied ``DataFrame`` is left untouched.

    :param df: Pandas ``DataFrame``
    :param column: Column indexer
    :param match: ``str`` match target
    :return: Pandas ``DataFrame`` without the matching rows
    """
    frame = df.copy()
    # True for the rows to keep, i.e. those that do NOT equal the target.
    keep = frame[column].values != match
    return frame.iloc[keep, :]
def remove_columns_containing(df, column, match):
    """
    Drop every row whose value in `column` contains `match` as a substring.

    Despite the function name, it is rows that are removed, not columns. Each value of
    `column` is converted with ``str()`` before the substring test, so non-string
    values (numbers, ``NaN``) are compared via their string representation.
    The supplied ``DataFrame`` is left untouched.

    :param df: Pandas ``DataFrame``
    :param column: Column indexer
    :param match: ``str`` match target
    :return: Pandas ``DataFrame`` without the rows containing `match`
    """
    frame = df.copy()
    cells = frame[column].values
    # One boolean per row: True means "keep" (target not found in the cell text).
    keep = [match not in str(cell) for cell in cells]
    return frame.iloc[keep, :]
def remove_reverse(df):
    """
    Remove rows flagged with a "+" in the 'Reverse' column.

    Delegates to :func:`remove_columns_containing`, so any row whose 'Reverse' value
    contains a "+" is dropped. Used to discard entries matched as reverse.

    :param df: Pandas ``DataFrame``
    :return: filtered Pandas ``DataFrame``
    """
    return remove_columns_containing(df, column='Reverse', match='+')
def remove_contaminants(df):
    """
    Remove rows flagged with a "+" in the 'Contaminant' column.

    Delegates to :func:`remove_columns_containing`, so any row whose 'Contaminant'
    value contains a "+" is dropped. Note the column name is the singular
    'Contaminant'. Used to discard entries marked as contaminants.

    :param df: Pandas ``DataFrame``
    :return: filtered Pandas ``DataFrame``
    """
    return remove_columns_containing(df, column='Contaminant', match='+')
def remove_potential_contaminants(df):
    """
    Remove rows flagged with a "+" in the 'Potential contaminant' column.

    Delegates to :func:`remove_columns_containing`, so any row whose
    'Potential contaminant' value contains a "+" is dropped. Used to discard entries
    marked as potential contaminants.

    :param df: Pandas ``DataFrame``
    :return: filtered Pandas ``DataFrame``
    """
    return remove_columns_containing(df, column='Potential contaminant', match='+')
def remove_only_identified_by_site(df):
    """
    Remove rows flagged with a "+" in the 'Only identified by site' column.

    Delegates to :func:`remove_columns_containing`, so any row whose
    'Only identified by site' value contains a "+" is dropped. Used to discard
    entries that were only identified by site.

    :param df: Pandas ``DataFrame``
    :return: filtered Pandas ``DataFrame``
    """
    return remove_columns_containing(df, column='Only identified by site', match='+')
def filter_localization_probability(df, threshold=0.75):
    """
    Keep only rows whose 'Localization prob' is at least `threshold`.

    Rows with a value below `threshold` (default 0.75) in the 'Localization prob'
    column are discarded; ``NaN`` values fail the comparison and are discarded too.
    Filters data to remove poorly localized peptides (non Class-I by default).
    The supplied ``DataFrame`` is left untouched.

    :param df: Pandas ``DataFrame``
    :param threshold: Cut-off below which rows are discarded (default 0.75)
    :return: Pandas ``DataFrame``
    """
    frame = df.copy()
    probabilities = frame['Localization prob'].values
    well_localized = probabilities >= threshold
    return frame.iloc[well_localized, :]
def minimum_valid_values_in_any_group(df, levels=None, n=1, invalid=np.nan):
    """
    Filter ``DataFrame`` by at least n valid values in at least one group.

    Taking a Pandas ``DataFrame`` with a ``MultiIndex`` column index, keeps only the rows that
    have at least `n` valid values in at least one group. Groups are defined by the `levels`
    parameter indexing into the column index. For example, a ``MultiIndex`` with top and second
    level Group (A,B,C) and Replicate (1,2,3) using ``levels=[0,1]`` would filter on `n` valid
    values per replicate. Alternatively, ``levels=[0]`` would filter on `n` valid values at the
    Group level only, e.g. A, B or C.

    If `levels` is not given, the column level named 'Group' is used when present; otherwise
    all columns are treated as a single group.

    By default a value is invalid when it is missing (``NaN``). Any NaN-like scalar passed as
    `invalid` is handled the same way; any other value is compared with ``!=``.

    :param df: Pandas ``DataFrame``
    :param levels: ``list`` of ``int`` specifying levels of column ``MultiIndex`` to group by
    :param n: ``int`` minimum number of valid values threshold
    :param invalid: matching invalid value
    :return: filtered Pandas ``DataFrame``
    """
    df = df.copy()

    if levels is None and 'Group' in df.columns.names:
        levels = [df.columns.names.index('Group')]

    # NaN never compares equal to itself, so a missing-value marker must be detected
    # with isna() rather than by identity or equality.
    if pd.api.types.is_scalar(invalid) and pd.isna(invalid):
        valid = df.notna()
    else:
        valid = df != invalid

    if levels is None:
        # No grouping information available: the whole row is one group.
        best = valid.sum(axis=1)
    else:
        # Count valid values per group. Grouping the transposed frame avoids the
        # ``level=`` argument of ``DataFrame.sum`` that newer pandas no longer accepts.
        counts = valid.astype(int).T.groupby(level=levels).sum().T
        best = counts.max(axis=1)

    mask = (best >= n).values
    return df.iloc[mask, :]
def search(df, match, columns=('Proteins', 'Protein names', 'Gene names')):
    """
    Search for a given string in a set of columns in a processed ``DataFrame``.

    Returns a filtered ``DataFrame`` containing the rows where `match` is a substring of the
    value in at least one of the `columns`. The index is reset before searching, so index
    levels with one of the given names are searched as well. Names in `columns` that are not
    present are skipped. Values are converted with ``str()`` before the substring test.

    :param df: Pandas ``DataFrame``
    :param match: ``str`` to search for in columns
    :param columns: ``list`` of ``str`` (or a single ``str``) naming the columns to search
    :return: filtered Pandas ``DataFrame``
    """
    if isinstance(columns, str):
        columns = [columns]

    # Index levels become ordinary columns so they can be searched by name.
    flat = df.reset_index()
    mask = np.zeros(flat.shape[0], dtype=bool)

    for name in columns:
        if name in flat.columns:
            # Explicit bool dtype keeps the OR valid even for a frame with no rows.
            hits = np.array([match in str(value) for value in flat[name].values], dtype=bool)
            mask = mask | hits

    return df.iloc[mask]
def filter_exclude(df, s):
    """
    Filter dataframe to exclude matching columns, based on search for "s".

    A column is dropped when ``s in label`` is true for its label: a substring test for
    string labels, a membership test for tuple labels (as produced by a ``MultiIndex``).

    :param df: Pandas ``DataFrame``
    :param s: string to search for, exclude matching columns
    :return: Pandas ``DataFrame`` without the matching columns
    """
    # dtype=bool keeps the mask invertible when there are no columns at all; an empty
    # list would otherwise become a float array, for which ``~`` is not defined.
    matches = np.array([s in label for label in df.columns.values], dtype=bool)
    return df.iloc[:, ~matches]
def filter_select_columns(df, columns):
    """
    Filter dataframe to include specified columns, retaining any Intensity columns.

    Columns whose label starts with 'LFQ Intensity' or 'Intensity' are always kept. The
    entries of `columns` are joined into the regular expression as-is (not escaped), so
    they are interpreted as patterns that must match the whole column label.

    :param df: Pandas ``DataFrame``
    :param columns: iterable of ``str`` column names (regex patterns) to keep
    :return: Pandas ``DataFrame`` restricted to the selected columns
    """
    alternatives = '|'.join(columns)
    pattern = '^(LFQ Intensity.*|Intensity(.*)|%s)$' % alternatives
    return df.filter(regex=pattern)
def filter_intensity(df, label=""):
    """
    Filter to include only the Intensity values with optional specified label, excluding other
    Intensity measurements, but retaining all other columns.

    Every column whose label does not start with 'Intensity' is kept unchanged. Of the
    remaining columns, only those are kept whose label contains 'Intensity', then `label`,
    and ends in a double underscore followed by a single digit. `label` is inserted into
    the regular expression unescaped.

    :param df: Pandas ``DataFrame``
    :param label: optional ``str`` (regex fragment) the Intensity column labels must contain
    :return: Pandas ``DataFrame`` with non-Intensity columns first, then the selected ones
    """
    others = df.filter(regex=r"^(?!Intensity).*$")
    selected = df.filter(regex=r'^(.*Intensity.*%s.*__\d)$' % label)
    # The selection pattern also matches labels such as 'LFQ Intensity X__1' that are
    # already part of `others`; drop them here so no column appears twice in the result.
    selected = selected.loc[:, ~selected.columns.isin(others.columns)]
    return pd.concat([others, selected], axis=1)
def filter_intensity_lfq(df, label=""):
    """
    Filter to include only the LFQ Intensity values with optional specified label, excluding
    other LFQ Intensity measurements, but retaining all other columns.

    Every column whose label does not start with 'LFQ Intensity' is kept unchanged. Of the
    remaining columns, only those are kept whose label contains 'Intensity', then `label`,
    and ends in a double underscore followed by a single digit. `label` is inserted into
    the regular expression unescaped.

    :param df: Pandas ``DataFrame``
    :param label: optional ``str`` (regex fragment) the LFQ Intensity column labels must contain
    :return: Pandas ``DataFrame`` with non-LFQ columns first, then the selected LFQ ones
    """
    others = df.filter(regex=r"^(?!LFQ Intensity).*$")
    selected = df.filter(regex=r'^(.*Intensity.*%s.*__\d)$' % label)
    # The selection pattern also matches plain 'Intensity ...__N' labels that are already
    # part of `others`; drop them here so no column appears twice in the result.
    selected = selected.loc[:, ~selected.columns.isin(others.columns)]
    return pd.concat([others, selected], axis=1)