Source code for padua.filters

import numpy as np


def remove_columns_matching(df, column, match):
    """
    Return a ``DataFrame`` with rows removed where the `column` value equals `match`.

    The selected `column` series of values from the supplied Pandas ``DataFrame``
    is compared to `match`, and the rows that match are removed from the
    DataFrame.

    :param df: Pandas ``DataFrame``
    :param column: Column indexer
    :param match: ``str`` match target
    :return: Pandas ``DataFrame`` filtered
    """
    df = df.copy()
    mask = df[column].values != match
    return df.iloc[mask, :]
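
# Illustrative usage sketch, not part of the padua source; the toy DataFrame and
# its column names below are hypothetical. Only rows whose 'Reverse' value is
# exactly '+' are dropped; any other value is kept.
import pandas as pd

matching_example = pd.DataFrame({
    'Reverse': ['', '+', ''],
    'Intensity': [1500.0, 200.0, 980.0],
})
matching_kept = remove_columns_matching(matching_example, 'Reverse', '+')
# matching_kept contains the two rows where 'Reverse' is not exactly '+'.
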
def remove_columns_containing(df, column, match):
    """
    Return a ``DataFrame`` with rows removed where the `column` value contains `match`.

    The selected `column` series of values from the supplied Pandas ``DataFrame``
    is compared to `match`, and the rows that contain it are removed from the
    DataFrame.

    :param df: Pandas ``DataFrame``
    :param column: Column indexer
    :param match: ``str`` match target
    :return: Pandas ``DataFrame`` filtered
    """
    df = df.copy()
    mask = [match not in str(v) for v in df[column].values]
    return df.iloc[mask, :]
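
# Illustrative sketch, not part of the padua source (hypothetical data): unlike
# remove_columns_matching above, the comparison here is a substring test, so a
# value such as 'CON__P06396' is removed when matching on 'CON__'.
import pandas as pd

containing_example = pd.DataFrame({
    'Proteins': ['P12345', 'CON__P06396', 'Q67890'],
})
containing_kept = remove_columns_containing(containing_example, 'Proteins', 'CON__')
# containing_kept retains only the 'P12345' and 'Q67890' rows.
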
def remove_reverse(df):
    """
    Remove rows with a "+" in the 'Reverse' column.

    Return a ``DataFrame`` where rows containing a "+" in the 'Reverse' column
    are removed. Filters data to remove peptides matched as reverse hits.

    :param df: Pandas ``DataFrame``
    :return: filtered Pandas ``DataFrame``
    """
    return remove_columns_containing(df, 'Reverse', '+')
def remove_potential_contaminants(df):
    """
    Remove rows with a "+" in the 'Potential contaminant' column.

    Return a ``DataFrame`` where rows containing a "+" in the
    'Potential contaminant' column are removed. Filters data to remove peptides
    matched as potential contaminants.

    :param df: Pandas ``DataFrame``
    :return: filtered Pandas ``DataFrame``
    """
    return remove_columns_containing(df, 'Potential contaminant', '+')
def remove_only_identified_by_site(df):
    """
    Remove rows with a "+" in the 'Only identified by site' column.

    Return a ``DataFrame`` where rows containing a "+" in the
    'Only identified by site' column are removed. Filters data to remove
    proteins identified only by a modification site.

    :param df: Pandas ``DataFrame``
    :return: filtered Pandas ``DataFrame``
    """
    return remove_columns_containing(df, 'Only identified by site', '+')
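
# Illustrative sketch, not part of the padua source: the three helpers above are
# typically chained on a MaxQuant-style output table. The toy DataFrame below is
# hypothetical; real tables come from MaxQuant output files.
import pandas as pd

maxquant_like = pd.DataFrame({
    'Reverse': ['', '+', '', ''],
    'Potential contaminant': ['', '', '+', ''],
    'Only identified by site': ['', '', '', '+'],
    'Intensity': [100.0, 50.0, 75.0, 20.0],
})
cleaned = remove_reverse(maxquant_like)
cleaned = remove_potential_contaminants(cleaned)
cleaned = remove_only_identified_by_site(cleaned)
# Only the first row survives all three filters.
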
def filter_localization_probability(df, threshold=0.75):
    """
    Remove rows with a localization probability below `threshold`.

    Return a ``DataFrame`` where rows with a value < `threshold` (default 0.75)
    in the 'Localization prob' column are removed. Filters data to remove
    poorly localized peptides (non Class-I by default).

    :param df: Pandas ``DataFrame``
    :param threshold: Cut-off below which rows are discarded (default 0.75)
    :return: Pandas ``DataFrame``
    """
    df = df.copy()
    localization_probability_mask = df['Localization prob'].values >= threshold
    return df.iloc[localization_probability_mask, :]
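
# Illustrative sketch, not part of the padua source (hypothetical data): keep
# only Class-I sites, i.e. rows with 'Localization prob' >= 0.75, or relax the
# cut-off via the threshold parameter.
import pandas as pd

sites = pd.DataFrame({
    'Localization prob': [0.99, 0.60, 0.80],
    'Intensity': [10.0, 3.0, 7.0],
})
class_i = filter_localization_probability(sites)                 # default threshold 0.75
relaxed = filter_localization_probability(sites, threshold=0.5)  # keeps all three rows
# class_i keeps the first and third rows only.
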
def minimum_valid_values_in_any_group(df, levels=None, n=1, invalid=np.nan):
    """
    Filter ``DataFrame`` by at least `n` valid values in at least one group.

    Taking a Pandas ``DataFrame`` with a ``MultiIndex`` column index, filters
    rows, removing those that do not have at least `n` valid values in any one
    group. Groups are defined by the `levels` parameter indexing into the
    column index. For example, a ``MultiIndex`` with top and second levels
    Group (A, B, C) and Replicate (1, 2, 3), using ``levels=[0,1]``, would
    filter on `n` valid values per replicate. Alternatively, ``levels=[0]``
    would filter on `n` valid values at the Group level only, e.g. A, B or C.

    By default, invalid values are `np.nan`; an alternative sentinel can be
    supplied via `invalid`.

    :param df: Pandas ``DataFrame``
    :param levels: ``list`` of ``int`` specifying levels of column ``MultiIndex`` to group by
    :param n: ``int`` minimum number of valid values threshold
    :param invalid: matching invalid value
    :return: filtered Pandas ``DataFrame``
    """
    df = df.copy()

    if levels is None:
        if 'Group' in df.columns.names:
            levels = [df.columns.names.index('Group')]

    # Mark valid values, count them per group, and keep rows where at least
    # one group reaches the threshold n
    if invalid is np.nan:
        dfx = ~np.isnan(df)
    else:
        dfx = df != invalid

    dfc = dfx.astype(int).sum(axis=1, level=levels)
    dfm = dfc.max(axis=1) >= n

    mask = dfm.values

    return df.iloc[mask, :]
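
# Illustrative sketch, not part of the padua source: a hypothetical table with a
# two-level column MultiIndex (Group, Replicate). With levels=[0] and n=2, a row
# is kept if any single group holds at least two non-NaN values. Assumes a
# pandas version where DataFrame.sum(level=...) is still supported, as used by
# the function above.
import pandas as pd

group_columns = pd.MultiIndex.from_product([['A', 'B'], [1, 2]],
                                           names=['Group', 'Replicate'])
group_data = pd.DataFrame(
    [[1.0, 2.0, np.nan, np.nan],     # kept: group A has 2 valid values
     [1.0, np.nan, np.nan, np.nan],  # dropped: no group reaches 2
     [np.nan, np.nan, 3.0, 4.0]],    # kept: group B has 2 valid values
    columns=group_columns,
)
group_filtered = minimum_valid_values_in_any_group(group_data, levels=[0], n=2)
# group_filtered contains the first and third rows only.
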