Source code for padua.analysis

import pandas as pd
import numpy as np
import requests

try:
    import sklearn
except ImportError:
    sklearn = False
else:
    from sklearn.decomposition import PCA    



try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

from . import filters
from .utils import get_protein_id


[docs]def correlation(df, rowvar=False):
    """
    Calculate column-wise Pearson correlations using ``numpy.ma.corrcoef``

    Input data is masked to ignore NaNs when calculating correlations. Data is returned as
    a Pandas ``DataFrame`` of column_n x column_n dimensions, with column index copied to
    both axes.

    :param df: Pandas DataFrame
    :return: Pandas DataFrame (n_columns x n_columns) of column-wise correlations
    """

    # Create a correlation matrix for all correlations
    # of the columns (filled with na for all values)
    df = df.copy()
    maskv = np.ma.masked_where(np.isnan(df.values), df.values)
    cdf = np.ma.corrcoef(maskv, rowvar=False)
    cdf = pd.DataFrame(np.array(cdf))
    cdf.columns = df.columns
    cdf.index = df.columns

    return cdf
    
    
[docs]def pca(df, n_components=2, mean_center=False, **kwargs):
    """
    Principal Component Analysis, based on `sklearn.decomposition.PCA`

    Performs a principal component analysis (PCA) on the supplied dataframe, selecting the first ``n_components`` components
    in the resulting model. The model scores and weights are returned.

    For more information on PCA and the algorithm used, see the `scikit-learn documentation <http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html>`_.

    :param df: Pandas ``DataFrame`` to perform the analysis on
    :param n_components: ``int`` number of components to select
    :param mean_center: ``bool`` mean center the data before performing PCA
    :param kwargs: additional keyword arguments to `sklearn.decomposition.PCA`
    :return: scores ``DataFrame`` of PCA scores n_components x n_samples
             weights ``DataFrame`` of PCA scores n_variables x n_components
    """

    if not sklearn:
        assert('This library depends on scikit-learn (sklearn) to perform PCA analysis')
        
    from sklearn.decomposition import PCA

    df = df.copy()
    
    # We have to zero fill, nan errors in PCA
    df[ np.isnan(df) ] = 0

    if mean_center:
        mean = np.mean(df.values, axis=0)
        df = df - mean

    pca = PCA(n_components=n_components, **kwargs)
    pca.fit(df.values.T)

    scores = pd.DataFrame(pca.transform(df.values.T)).T
    scores.index =  ['Principal Component %d' % (n+1) for n in range(0, scores.shape[0])]
    scores.columns = df.columns

    weights = pd.DataFrame(pca.components_).T
    weights.index = df.index
    weights.columns =  ['Weights on Principal Component %d' % (n+1) for n in range(0, weights.shape[1])]
       
    return scores, weights


[docs]def enrichment(df):
    """
    Calculate relative enrichment of peptide modifications.

    Taking a modifiedsitepeptides ``DataFrame`` returns the relative enrichment of the specified
    modification in the table.

    The returned data columns are generated from the input data columns.

    :param df: Pandas ``DataFrame``
    :return: Pandas ``DataFrame`` of percentage modifications in the supplied data.
    """

    values = []
    groups = []
    #totals = []

    dfr = df.sum(axis=1, level=0)
    for c in dfr.columns.values:
        dfx = dfr[c]
        dfx = dfx.dropna(axis=0, how='any')
        #total = len([m for m in dfx.index.values if m != 'Unmodified'])
        total = dfx.index.values.shape[0]
        # Sum up the number of phosphosites
        dfx = dfx.reset_index().filter(regex='Sequence|Modifications').set_index('Sequence').sum(axis=0, level=0)
        phosp = dfx[dfx > 0].shape[0]

        values.append((phosp, total-phosp))
        groups.append(c)
        #totals.append(total)

    return pd.DataFrame(np.array(values).T, columns=groups, index=["",""])
    

[docs]def sitespeptidesproteins(df, site_localization_probability=0.75):
    """
    Generate summary count of modified sites, peptides and proteins in a processed dataset ``DataFrame``.

    Returns the number of sites, peptides and proteins as calculated as follows:

    - `sites` (>0.75; or specified site localization probability) count of all sites > threshold
    - `peptides` the set of `Sequence windows` in the dataset (unique peptides)
    - `proteins` the set of unique leading peptides in the dataset

    :param df: Pandas ``DataFrame`` of processed data
    :param site_localization_probability: ``float`` site localization probability threshold (for sites calculation)
    :return: ``tuple`` of ``int``, containing sites, peptides, proteins
    """

    sites = filters.filter_localization_probability(df, site_localization_probability)['Sequence window']
    peptides = set(df['Sequence window'])
    proteins = set([p.split(';')[0] for p in df['Proteins']])
    return len(sites), len(peptides), len(proteins)


[docs]def modifiedaminoacids(df):
    """
    Calculate the number of modified amino acids in supplied ``DataFrame``.

    Returns the total of all modifications and the total for each amino acid individually, as an ``int`` and a
    ``dict`` of ``int``, keyed by amino acid, respectively.

    :param df: Pandas ``DataFrame`` containing processed data.
    :return: total_aas ``int`` the total number of all modified amino acids
             quants ``dict`` of ``int`` keyed by amino acid, giving individual counts for each aa.
    """

    amino_acids = list(df['Amino acid'].values)
    aas = set(amino_acids)
    quants = {}

    for aa in aas:
        quants[aa] = amino_acids.count(aa)
        
    total_aas = len(amino_acids)

    return total_aas, quants


[docs]def go_enrichment(df, enrichment='function', organism='Homo sapiens', summary=True, fdr=0.05, ids_from=['Proteins','Protein IDs']):
    """
    Calculate gene ontology (GO) enrichment for a specified set of indices, using the PantherDB GO enrichment service.

    Provided with a processed data ``DataFrame`` will calculate the GO ontology enrichment specified by `enrichment`,
    for the specified `organism`. The IDs to use for genes are taken from the field `ids_from`, which by default is
    compatible with both proteinGroups and modified peptide tables. Setting the `fdr` parameter (default=0.05) sets
    the cut-off to use for filtering the results. If `summary` is ``True`` (default) the returned ``DataFrame``
    contains just the ontology summary and FDR.

    :param df: Pandas ``DataFrame`` to
    :param enrichment: ``str`` GO enrichment method to use (one of 'function', 'process', 'cellular_location', 'protein_class', 'pathway')
    :param organism: ``str`` organism name (e.g. "Homo sapiens")
    :param summary: ``bool`` return full, or summarised dataset
    :param fdr: ``float`` FDR cut-off to use for returned GO enrichments
    :param ids_from: ``list`` of ``str`` containing the index levels to select IDs from (genes, protein IDs, etc.) default=['Proteins','Protein IDs']
    :return: Pandas ``DataFrame`` containing enrichments, sorted by P value.
    """

    if isinstance(df, pd.DataFrame) or isinstance(df, pd.Series):
        l = list(set(ids_from) & set(df.index.names))[0]
        data = "\n".join([get_protein_id(s) for s in df.index.get_level_values(l)])
    else:
        data = "\n".join([get_protein_id(s) for s in df])

    r = requests.post("http://www.pantherdb.org/webservices/garuda/enrichment.jsp", data={
            'organism': organism,
            'type': 'enrichment',
            'enrichmentType': enrichment},
            files = {
            'geneList': ('genelist.txt', StringIO(data) ),

            }
        )

    try:
        go = pd.read_csv(StringIO(r.text), sep='\t', skiprows=5, lineterminator='\n', header=None)
    except ValueError:
        return None

    go.columns = ["GO", "Name", "Protein", "P"]
    go = go.set_index(["GO","Name"])
    if summary:
        go = go.drop("Protein", axis=1).mean(axis=0, level=["GO","Name"])

    if fdr:
        go = go[ go["P"] < fdr ]

    return go.sort("P", ascending=True)