Source code for padua.visualize

"""
Visualization tools for proteomic data, using standard Pandas dataframe structures
from imported data. These functions make some assumptions about the structure of
data, but generally try to accommodate.

Depends on scikit-learn for PCA analysis
"""

import pandas as pd
import numpy as np
import scipy as sp

import warnings

import scipy.spatial.distance as distance
import scipy.cluster.hierarchy as sch

import matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.image as mplimg
import matplotlib.cm as cm

from matplotlib.patches import Ellipse
from matplotlib.colors import colorConverter
from matplotlib.backends.backend_agg import FigureCanvasAgg

from io import BytesIO, StringIO

from . import analysis
from . import process
from .utils import qvalues, get_protein_id, get_protein_ids, get_protein_id_list, get_shortstr, get_index_list, build_combined_label, \
                   hierarchical_match, chunks, calculate_s0_curve, find_nearest_idx


from PIL import Image

import requests
from requests_toolbelt import MultipartEncoder

import re

uniprot_kegg_cache = {}



# Add ellipses for confidence intervals, with thanks to Joe Kington
# http://stackoverflow.com/questions/12301071/multidimensional-confidence-intervals
def plot_point_cov(points, nstd=2, **kwargs):
    """
    Plots an `nstd` sigma ellipse based on the mean and covariance of a point
    "cloud" (points, an Nx2 array).

    Parameters
    ----------
    points : An Nx2 array of the data points.
    nstd : The radius of the ellipse in numbers of standard deviations.
        Defaults to 2 standard deviations.

    Additional keyword arguments are passed on to the ellipse patch.

    Returns
    -------
    A matplotlib ellipse artist
    """
    pos = points.mean(axis=0)
    cov = np.cov(points, rowvar=False)
    return plot_cov_ellipse(cov, pos, nstd, **kwargs)
def plot_cov_ellipse(cov, pos, nstd=2, **kwargs):
    """
    Plots an `nstd` sigma error ellipse based on the specified covariance
    matrix (`cov`). Additional keyword arguments are passed on to the
    ellipse patch artist.

    Parameters
    ----------
    cov : The 2x2 covariance matrix to base the ellipse on
    pos : The location of the center of the ellipse. Expects a 2-element
        sequence of [x0, y0].
    nstd : The radius of the ellipse in numbers of standard deviations.
        Defaults to 2 standard deviations.

    Returns
    -------
    A matplotlib ellipse artist
    """
    def eigsorted(cov):
        # Eigendecompose the covariance; order by descending eigenvalue
        vals, vecs = np.linalg.eigh(cov)
        order = vals.argsort()[::-1]
        return vals[order], vecs[:, order]

    vals, vecs = eigsorted(cov)
    theta = np.degrees(np.arctan2(*vecs[:, 0][::-1]))

    # Width and height are "full" widths, not radius
    width, height = 2 * nstd * np.sqrt(vals)
    ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, fill=False, **kwargs)

    return ellip
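# A minimal usage sketch for the two covariance-ellipse helpers above
# (illustrative only; the random data and styling are assumptions, not part
# of this module):
#
#     import numpy as np
#     import matplotlib.pyplot as plt
#
#     points = np.random.multivariate_normal([0, 0], [[1, 0.6], [0.6, 2]], size=200)
#     fig, ax = plt.subplots()
#     ax.scatter(points[:, 0], points[:, 1], s=10)
#     ax.add_artist(plot_point_cov(points, nstd=2, edgecolor='r'))  # 2-sigma ellipse
#     plt.show()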
def _pca_scores(scores, pc1=0, pc2=1, fcol=None, ecol=None, marker='o', markersize=30, label_scores=None,
                show_covariance_ellipse=True, **kwargs):
    """
    Plot a scores plot for two principal components as AxB scatter plot.

    Returns the plotted axis.

    :param scores: DataFrame containing scores
    :param pc1: Column indexer into scores for PC1
    :param pc2: Column indexer into scores for PC2
    :param fcol: Face (fill) color definition
    :param ecol: Edge color definition
    :param marker: Marker style (matplotlib; default 'o')
    :param markersize: int Size of the marker
    :param label_scores: Index level to label markers with
    :param show_covariance_ellipse: Plot covariance (2*std) ellipse around each grouping
    :param kwargs:
    :return: Generated axes
    """
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)

    for c in set(scores.columns.get_level_values('Group')):
        try:
            data = scores[c].values.reshape(2, -1)
        except Exception:
            continue

        fc = hierarchical_match(fcol, c, 'k')
        ec = hierarchical_match(ecol, c)
        if ec is None:
            ec = fc

        if type(markersize) == str:
            # Use as a key into this column index level's values
            idx = scores.columns.names.index(markersize)
            s = c[idx]
        elif callable(markersize):
            s = markersize(c)
        else:
            s = markersize

        ax.scatter(data[pc1, :], data[pc2, :], s=s, marker=marker, edgecolors=ec, c=fc)

        if show_covariance_ellipse and data.shape[1] > 2:
            cov = data[[pc1, pc2], :].T
            ellip = plot_point_cov(cov, nstd=2, linestyle='dashed', linewidth=0.5, edgecolor=ec or fc,
                                   alpha=0.8)  # **kwargs for ellipse styling
            ax.add_artist(ellip)

    if label_scores:
        scores_f = scores.iloc[[pc1, pc2]]
        idxs = get_index_list(scores_f.columns.names, label_scores)

        for n, (x, y) in enumerate(scores_f.T.values):
            r, ha = (30, 'left')
            ax.text(x, y, build_combined_label(scores_f.columns.values[n], idxs, ', '),
                    rotation=r, ha=ha, va='baseline', rotation_mode='anchor',
                    bbox=dict(boxstyle='round,pad=0.3', fc='#ffffff', ec='none', alpha=0.6))

    ax.set_xlabel(scores.index[pc1], fontsize=16)
    ax.set_ylabel(scores.index[pc2], fontsize=16)
    fig.tight_layout()
    return ax


def _pca_weights(weights, pc, threshold=None, label_threshold=None, label_weights=None, **kwargs):
    """
    Plot the weights of a single principal component.

    :param weights: DataFrame containing weights
    :param pc: `int` index of the principal component to plot
    :param threshold: `float` weight threshold (draws horizontal cut-off lines)
    :param label_threshold: `float` weight threshold over which to draw labels
    :param label_weights: Index level(s) to label weights with
    :param kwargs:
    :return: Generated axes
    """
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(weights.iloc[:, pc].values)

    ylim = np.max(np.abs(weights.values)) * 1.1
    ax.set_ylim(-ylim, +ylim)
    ax.set_xlim(0, weights.shape[0])
    ax.set_aspect(1. / ax.get_data_ratio())

    wts = weights.iloc[:, pc]

    if threshold:
        if label_threshold is None:
            label_threshold = threshold

        if label_weights:

            FILTER_UP = wts.values >= label_threshold
            FILTER_DOWN = wts.values <= -label_threshold
            FILTER = FILTER_UP | FILTER_DOWN

            wti = np.arange(0, weights.shape[0])
            wti = wti[FILTER]

            idxs = get_index_list(wts.index.names, label_weights)
            for x in wti:
                y = wts.iloc[x]
                r, ha = (30, 'left') if y >= 0 else (-30, 'left')
                ax.text(x, y, build_combined_label(wts.index.values[x], idxs),
                        rotation=r, ha=ha, va='baseline', rotation_mode='anchor',
                        bbox=dict(boxstyle='round,pad=0.3', fc='#ffffff', ec='none', alpha=0.4))

        ax.axhline(threshold, 0, 1)
        ax.axhline(-threshold, 0, 1)

    ax.set_ylabel("Weights on Principal Component %d" % (pc + 1), fontsize=16)
    fig.tight_layout()
    return ax
def pca(df, n_components=2, mean_center=False, fcol=None, ecol=None, marker='o', markersize=40, threshold=None,
        label_threshold=None, label_weights=None, label_scores=None, return_df=False, show_covariance_ellipse=False,
        *args, **kwargs):
    """
    Perform Principal Component Analysis (PCA) from input DataFrame and generate scores and weights plots.

    Principal Component Analysis is a technique for identifying the largest source of variation in a dataset. This
    function uses the implementation available in scikit-learn. The PCA is calculated via `analysis.pca` and will
    therefore give identical results.

    Resulting scores and weights plots are generated showing the distribution of samples within the resulting
    PCA space. Sample color and marker size can be controlled by label, lookup and calculation (lambda) to
    generate complex plots highlighting sample separation.

    For further information see the examples included in the documentation.

    :param df: Pandas `DataFrame`
    :param n_components: `int` number of principal components to return
    :param mean_center: `bool` mean center the data before performing PCA
    :param fcol: `dict` of indexers:colors, where colors are hex colors or matplotlib color names
    :param ecol: `dict` of indexers:colors, where colors are hex colors or matplotlib color names
    :param marker: `str` matplotlib marker name (default "o")
    :param markersize: `int` or `callable` which returns an `int` for a given indexer
    :param threshold: `float` weight threshold for plot (horizontal line)
    :param label_threshold: `float` weight threshold over which to draw labels
    :param label_weights: `list` of `str` index levels to label weights with
    :param label_scores: `list` of `str` index levels to label scores with
    :param return_df: `bool` return the resulting scores, weights as pandas DataFrames
    :param show_covariance_ellipse: `bool` show the covariance ellipse around each group
    :param args: additional arguments passed to analysis.pca
    :param kwargs: additional arguments passed to analysis.pca
    :return: scores, weights DataFrames (if `return_df`), else the scores axis and a list of weights axes
    """
    scores, weights = analysis.pca(df, n_components=n_components, *args, **kwargs)

    scores_ax = _pca_scores(scores, fcol=fcol, ecol=ecol, marker=marker, markersize=markersize,
                            label_scores=label_scores, show_covariance_ellipse=show_covariance_ellipse)

    weights_ax = []
    for pc in range(0, weights.shape[1]):
        weights_ax.append(_pca_weights(weights, pc, threshold=threshold, label_threshold=label_threshold,
                                       label_weights=label_weights))

    if return_df:
        return scores, weights
    else:
        return scores_ax, weights_ax
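# A minimal usage sketch (illustrative; the group names and colors are
# assumptions -- `df` must carry a 'Group' level in its column MultiIndex for
# the scores plot grouping to work):
#
#     scores_ax, weights_ax = pca(df, n_components=2,
#                                 fcol={'Control': '#1f77b4', 'Treated': '#d62728'},
#                                 show_covariance_ellipse=True)
#     scores, weights = pca(df, n_components=2, return_df=True)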
def plsda(df, a, b, n_components=2, mean_center=False, scale=True, fcol=None, ecol=None, marker='o', markersize=40,
          threshold=None, label_threshold=None, label_weights=None, label_scores=None, return_df=False,
          show_covariance_ellipse=False, *args, **kwargs):
    """
    Partial Least Squares Discriminant Analysis, based on `sklearn.cross_decomposition.PLSRegression`

    Performs a partial least squares discriminant analysis (PLS-DA) on the supplied dataframe ``df``
    against the two groups selected by the ``a`` and ``b`` indexers, selecting the first ``n_components``.

    For more information on PLS regression and the algorithm used, see the `scikit-learn documentation
    <http://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.PLSRegression.html>`_.

    Resulting scores and weights plots are generated showing the distribution of samples within the resulting
    scores space. Sample color and marker size can be controlled by label, lookup and calculation (lambda) to
    generate complex plots highlighting sample separation.

    For further information see the examples included in the documentation.

    :param df: Pandas `DataFrame`
    :param a: Column selector for group a
    :param b: Column selector for group b
    :param n_components: `int` number of components to return
    :param mean_center: `bool` mean center the data before performing analysis
    :param scale: `bool` scale the input data
    :param fcol: `dict` of indexers:colors, where colors are hex colors or matplotlib color names
    :param ecol: `dict` of indexers:colors, where colors are hex colors or matplotlib color names
    :param marker: `str` matplotlib marker name (default "o")
    :param markersize: `int` or `callable` which returns an `int` for a given indexer
    :param threshold: `float` weight threshold for plot (horizontal line)
    :param label_threshold: `float` weight threshold over which to draw labels
    :param label_weights: `list` of `str`
    :param label_scores: `list` of `str`
    :param return_df: `bool` return the resulting scores, weights as pandas DataFrames
    :param show_covariance_ellipse: `bool` show the covariance ellipse around each group
    :param args: additional arguments passed to analysis.plsda
    :param kwargs: additional arguments passed to analysis.plsda
    :return: scores, weights DataFrames (if `return_df`), else the scores axis and a list of weights axes
    """
    scores, weights, loadings = analysis.plsda(df, a, b, n_components=n_components, scale=scale, *args, **kwargs)

    scores_ax = _pca_scores(scores, fcol=fcol, ecol=ecol, marker=marker, markersize=markersize,
                            label_scores=label_scores, show_covariance_ellipse=show_covariance_ellipse)

    weights_ax = []
    for pc in range(0, weights.shape[1]):
        weights_ax.append(_pca_weights(weights, pc, threshold=threshold, label_threshold=label_threshold,
                                       label_weights=label_weights))

    if return_df:
        return scores, weights
    else:
        return scores_ax, weights_ax
def plsr(df, v, n_components=2, mean_center=False, scale=True, fcol=None, ecol=None, marker='o', markersize=40,
         threshold=None, label_threshold=None, label_weights=None, label_scores=None, return_df=False,
         show_covariance_ellipse=False, *args, **kwargs):
    """
    Partial Least Squares Regression Analysis, based on `sklearn.cross_decomposition.PLSRegression`

    Performs a partial least squares regression (PLS-R) on the supplied dataframe ``df``
    against the provided continuous variable ``v``, selecting the first ``n_components``.

    For more information on PLS regression and the algorithm used, see the `scikit-learn documentation
    <http://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.PLSRegression.html>`_.

    Resulting scores, weights and regression plots are generated showing the distribution of samples within the
    resulting scores space. Sample color and marker size can be controlled by label, lookup and calculation
    (lambda) to generate complex plots highlighting sample separation.

    For further information see the examples included in the documentation.

    :param df: Pandas `DataFrame`
    :param v: Continuous variable to perform regression against
    :param n_components: `int` number of components to return
    :param mean_center: `bool` mean center the data before performing analysis
    :param scale: `bool` scale the input data
    :param fcol: `dict` of indexers:colors, where colors are hex colors or matplotlib color names
    :param ecol: `dict` of indexers:colors, where colors are hex colors or matplotlib color names
    :param marker: `str` matplotlib marker name (default "o")
    :param markersize: `int` or `callable` which returns an `int` for a given indexer
    :param threshold: `float` weight threshold for plot (horizontal line)
    :param label_threshold: `float` weight threshold over which to draw labels
    :param label_weights: `list` of `str`
    :param label_scores: `list` of `str`
    :param return_df: `bool` return the resulting scores, weights as pandas DataFrames
    :param show_covariance_ellipse: `bool` show the covariance ellipse around each group
    :param args: additional arguments passed to analysis.plsr
    :param kwargs: additional arguments passed to analysis.plsr
    :return: scores, weights, loadings and predicted DataFrames (if `return_df`), else the scores axis, a list of
        weights axes and the regression axis
    """
    scores, weights, loadings, predicted = analysis.plsr(df, v, n_components=n_components, scale=scale,
                                                         *args, **kwargs)

    scores_ax = _pca_scores(scores, fcol=fcol, ecol=ecol, marker=marker, markersize=markersize,
                            label_scores=label_scores, show_covariance_ellipse=show_covariance_ellipse)

    weights_ax = []
    for pc in range(0, weights.shape[1]):
        weights_ax.append(_pca_weights(weights, pc, threshold=threshold, label_threshold=label_threshold,
                                       label_weights=label_weights))

    # Actual vs. predicted regression plot
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)

    slope, intercept, r, p, se = sp.stats.linregress(v, predicted.flatten())

    # Add regression line
    xmin, xmax = np.min(v), np.max(v)
    ax.plot([xmin, xmax], [xmin * slope + intercept, xmax * slope + intercept], lw=1, c='k')

    ax.scatter(v, predicted, s=50, alpha=0.5)
    ax.set_xlabel("Actual values")
    ax.set_ylabel("Predicted values")
    ax.set_aspect(1. / ax.get_data_ratio())

    ax.text(0.05, 0.95, '$y = %.2f+%.2fx$' % (intercept, slope), horizontalalignment='left',
            transform=ax.transAxes, color='black', fontsize=14)
    ax.text(0.95, 0.15, '$r^2$ = %.2f' % (r ** 2), horizontalalignment='right',
            transform=ax.transAxes, color='black', fontsize=14)
    ax.text(0.95, 0.10, '$p$ = %.2f' % p, horizontalalignment='right',
            transform=ax.transAxes, color='black', fontsize=14)
    ax.text(0.95, 0.05, '$SE$ = %.2f' % se, horizontalalignment='right',
            transform=ax.transAxes, color='black', fontsize=14)

    if return_df:
        return scores, weights, loadings, predicted
    else:
        return scores_ax, weights_ax, ax
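# A minimal usage sketch (illustrative; the dose values are assumptions --
# `v` must supply one value per sample column in `df`):
#
#     import numpy as np
#     v = np.array([0, 10, 20, 40, 80, 160])   # e.g. dose per sample
#     scores_ax, weights_ax, regression_ax = plsr(df, v, n_components=2)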
def enrichment(dfenr, include=None):
    """
    Generates an enrichment pie chart series from a calculated enrichment table.

    :param dfenr: Pandas `DataFrame` containing calculated enrichment values
    :param include: `list` of columns to include in the output
    :return: array of matplotlib axes
    """
    enrichment = dfenr.values.flatten()
    dfenr = pd.DataFrame([enrichment, 1 - enrichment], columns=dfenr.columns, index=["", ""])

    if include:
        dfenr = dfenr[include]

    axes = dfenr.plot(kind='pie', subplots=True, figsize=(dfenr.shape[1] * 4, 3))

    for n, ax in enumerate(axes):
        modified, unmodified = dfenr.values[:, n]
        total = modified + unmodified
        enrichment = modified / total

        ax.annotate("%.1f%%" % (100 * enrichment), xy=(0.3, 0.6), xycoords='axes fraction', color='w', size=18)
        ax.set_xlabel(ax.get_ylabel(), fontsize=18)
        ax.set_ylabel("")
        ax.set_aspect('equal', 'datalim')

    return axes
def volcano(df, a, b=None, fdr=0.05, figsize=(8, 10), show_numbers=True, threshold=2, minimum_sample_n=0,
            estimate_qvalues=False, labels_from=None, labels_for=None, title=None, label_format=None, markersize=64,
            s0=0.00001, draw_fdr=True, is_log2=False, fillna=None, label_sig_only=True, ax=None, xlim=None, ylim=None,
            fc='grey', fc_sig='blue', fc_sigr='red'):
    """
    Volcano plot of two sample groups showing t-test p value vs. log2(fc).

    Generates a volcano plot for two sample groups, selected from `df` using `a` and `b` indexers. The mean of
    each group is calculated along the y-axis (per protein) and used to generate a log2 ratio. If a log2-transformed
    dataset is supplied set `is_log2=True` (a warning will be given when negative values are present).

    A two-sample independent t-test is performed between each group. If `minimum_sample_n` is supplied, any values
    (proteins) without this number of samples will be dropped from the analysis.

    Individual data points can be labelled in the resulting plot by passing `labels_from` with an index name, and
    `labels_for` with a list of matching values for which to plot labels.

    :param df: Pandas `DataFrame`
    :param a: `tuple` or `str` indexer for group A
    :param b: `tuple` or `str` indexer for group B
    :param fdr: `float` false discovery rate cut-off
    :param figsize: `tuple` figure size (when no axis is supplied)
    :param show_numbers: `bool` annotate the plot with counts of points in each category
    :param threshold: `float` log2(fc) ratio cut-off
    :param minimum_sample_n: `int` minimum sample for t-test
    :param estimate_qvalues: `bool` estimate Q values (adjusted P)
    :param labels_from: `str` or `int` index level to get labels from
    :param labels_for: `list` of `str` matching labels to show
    :param title: `str` title for plot
    :param label_format: format string applied to labels
    :param markersize: `int` size of markers
    :param s0: `float` smoothing factor between fdr/fc cutoff
    :param draw_fdr: `bool` draw the fdr/fc curve
    :param is_log2: `bool` is the data log2 transformed already?
    :param fillna: `float` fill NaN values with value (default: 0)
    :param label_sig_only: `bool` only label significant values
    :param ax: matplotlib `axis` on which to draw
    :param xlim: `float` x-axis limit (symmetric about zero)
    :param ylim: `float` y-axis upper limit
    :param fc: `str` hex or matplotlib color code, default color of points
    :param fc_sig: `str` color for significant (FDR) points
    :param fc_sigr: `str` color for significant (FDR and ratio) points
    :return: axis, p values, log2 ratios and the significance filter
    """
    df = df.copy()

    if np.any(df.values < 0) and not is_log2:
        warnings.warn("Input data has negative values. If data is log2 transformed, set is_log2=True.")

    if fillna is not None:
        df = df.fillna(fillna)

    if labels_from is None:
        labels_from = list(df.index.names)

    if b is not None:
        # Calculate ratio between two groups
        g1, g2 = df[a].values, df[b].values

        if is_log2:
            dr = np.nanmean(g2, axis=1) - np.nanmean(g1, axis=1)
        else:
            dr = np.log2(np.nanmean(g2, axis=1) / np.nanmean(g1, axis=1))

        ginv = ((~np.isnan(g1)).sum(axis=1) < minimum_sample_n) | ((~np.isnan(g2)).sum(axis=1) < minimum_sample_n)

        g1 = np.ma.masked_where(np.isnan(g1), g1)
        g2 = np.ma.masked_where(np.isnan(g2), g2)

        # Calculate the p value between two groups (t-test)
        t, p = sp.stats.mstats.ttest_ind(g1.T, g2.T)

    else:
        g1 = df[a].values
        dr = np.nanmean(g1, axis=1)

        ginv = (~np.isnan(g1)).sum(axis=1) < minimum_sample_n

        # Calculate the p value (one sample t-test)
        g1 = np.ma.masked_where(np.isnan(g1), g1)
        t, p = sp.stats.mstats.ttest_1samp(g1.T, popmean=0)

    p = np.array(p)  # Unmask

    # Set p values to nan where not >= minimum_sample_n values
    p[ginv] = np.nan

    if ax is None:
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(1, 1, 1)

    if estimate_qvalues:
        p[~np.isnan(p)] = qvalues(p[~np.isnan(p)])
        ax.set_ylabel('-log$_{10}$(Q)')
    else:
        ax.set_ylabel('-log$_{10}$(p)')

    # There are values below the fdr
    s0x, s0y, s0fn = calculate_s0_curve(s0, fdr, min(fdr / 2, np.nanmin(p)), np.log2(threshold),
                                        np.nanmax(np.abs(dr)), curve_interval=0.001)

    if draw_fdr is True:
        ax.plot(s0x, -np.log10(s0y), fc_sigr, lw=1)
        ax.plot(-s0x, -np.log10(s0y), fc_sigr, lw=1)

    # Select data based on s0 curve
    _FILTER_IN = []

    for x, y in zip(dr, p):
        x = np.abs(x)
        spy = s0fn(x)

        if len(s0x) == 0 or x < np.min(s0x):
            _FILTER_IN.append(False)
            continue

        if y <= spy:
            _FILTER_IN.append(True)
        else:
            _FILTER_IN.append(False)

    _FILTER_IN = np.array(_FILTER_IN)
    _FILTER_OUT = ~_FILTER_IN

    def scatter(ax, f, c, alpha=0.5):
        if type(markersize) == str:
            # Use as a key into this index level's values
            s = df.index.get_level_values(markersize)
        elif callable(markersize):
            s = np.array([markersize(c) for c in df.index.values])
        else:
            s = np.ones((df.shape[0],)) * markersize

        ax.scatter(dr[f], -np.log10(p[f]), c=c, s=s[f], linewidths=0, alpha=alpha)

    _FILTER_OUT1 = _FILTER_OUT & ~np.isnan(p) & (np.array(p) > fdr)
    scatter(ax, _FILTER_OUT1, fc, alpha=0.3)

    _FILTER_OUT2 = _FILTER_OUT & (np.array(p) <= fdr)
    scatter(ax, _FILTER_OUT2, fc_sig, alpha=0.3)

    if labels_for:
        idxs = get_index_list(df.index.names, labels_from)
        for shown, label, x, y in zip(_FILTER_IN, df.index.values, dr, -np.log10(p)):

            if shown or not label_sig_only:
                label = build_combined_label(label, idxs, label_format=label_format)

                if labels_for == True or any([l in label for l in labels_for]):
                    r, ha, ofx, ofy = (30, 'left', 0.15, 0.02) if x >= 0 else (-30, 'right', -0.15, 0.02)
                    t = ax.text(x + ofx, y + ofy, label, rotation=r, ha=ha, va='baseline', rotation_mode='anchor',
                                bbox=dict(boxstyle='round,pad=0.3', fc='#ffffff', ec='none', alpha=0.4))

    scatter(ax, _FILTER_IN, fc_sigr)

    ax.set_xlabel('log$_2$ ratio')

    # Centre the plot horizontally
    if xlim is None:
        xmin, xmax = ax.get_xlim()
        xlim = np.max(np.abs([xmin, xmax]))
    ax.set_xlim((-xlim, xlim))

    if ylim is None:
        _, ylim = ax.get_ylim()
    ax.set_ylim((0, ylim))

    if title:
        ax.set_title(title)

    if show_numbers:
        # Annotate axes with the numbers of points in each category (up & down):
        # sig+ratio (red), sig-only (blue), nonsig (grey)
        dr_up, dr_down = dr > 0, dr < 0
        grey_up, blue_up, red_up = (np.sum(_FILTER_OUT1 & dr_up), np.sum(_FILTER_OUT2 & dr_up),
                                    np.sum(_FILTER_IN & dr_up))
        grey_do, blue_do, red_do = (np.sum(_FILTER_OUT1 & dr_down), np.sum(_FILTER_OUT2 & dr_down),
                                    np.sum(_FILTER_IN & dr_down))

        ax.text(0.95, 0.95, '%d' % red_up, horizontalalignment='right', transform=ax.transAxes,
                color=fc_sigr, fontsize=14)
        ax.text(0.95, 0.90, '%d' % blue_up, horizontalalignment='right', transform=ax.transAxes,
                color=fc_sig, fontsize=14)
        ax.text(0.95, 0.05, '%d' % grey_up, horizontalalignment='right', transform=ax.transAxes,
                color=fc, fontsize=14)

        ax.text(0.05, 0.95, '%d' % red_do, horizontalalignment='left', transform=ax.transAxes,
                color=fc_sigr, fontsize=14)
        ax.text(0.05, 0.90, '%d' % blue_do, horizontalalignment='left', transform=ax.transAxes,
                color=fc_sig, fontsize=14)
        ax.text(0.05, 0.05, '%d' % grey_do, horizontalalignment='left', transform=ax.transAxes,
                color=fc, fontsize=14)

    return ax, p, dr, _FILTER_IN
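# A minimal usage sketch (illustrative; the group indexers and protein names
# are assumptions -- `a`/`b` must match your column MultiIndex):
#
#     ax, p, dr, sig = volcano(df, a='Control', b='Treated', fdr=0.05,
#                              is_log2=True, fillna=0,
#                              labels_from='Protein names',
#                              labels_for=['AKT1', 'MTOR'])
#     print("Significant points:", sig.sum())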
def _bartoplabel(ax, name, mx, offset):
    """
    Attach a percentage label above the first bar of each container.

    :param ax: matplotlib axis containing the bar containers
    :param name: values used to calculate the percentage labels
    :param mx: total used as the denominator for the percentages
    :param offset: unused; retained for call compatibility
    :return:
    """
    # attach some text labels
    for ii, container in enumerate(ax.containers):
        rect = container.patches[0]
        height = rect.get_height()
        rect.set_width(rect.get_width() - 0.05)
        ax.text(rect.get_x() + rect.get_width() / 2., height + 150,
                '%.2f%%' % (name[ii] / mx),
                ha='center', va='bottom', fontsize=15)
def modifiedaminoacids(df, kind='pie'):
    """
    Generate a plot of relative numbers of modified amino acids in source DataFrame.

    Plot a pie or bar chart showing the number and percentage of modified amino
    acids in the supplied data frame. The amino acids displayed will be
    determined from the supplied data/modification type.

    :param df: processed DataFrame
    :param kind: `str` type of plot; one of "pie", "bar" or "both"
    :return: matplotlib ax (or tuple of axes for "both")
    """
    colors = ['#6baed6', '#c6dbef', '#bdbdbd']
    total_aas, quants = analysis.modifiedaminoacids(df)

    df = pd.DataFrame()
    for a, n in quants.items():
        df[a] = [n]

    df = df.sort_index(axis=1)

    ax1 = ax2 = None

    if kind == 'bar' or kind == 'both':
        ax1 = df.plot(kind='bar', figsize=(7, 7), color=colors)
        ax1.set_ylabel('Number of phosphorylated amino acids')
        ax1.set_xlabel('Amino acid')
        ax1.set_xticks([])
        ylim = np.max(df.values) + 1000
        ax1.set_ylim(0, ylim)
        _bartoplabel(ax1, 100 * df.values[0], total_aas, ylim)

        ax1.set_xlim((-0.3, 0.3))
        if kind == 'bar':
            return ax1

    if kind == 'pie' or kind == 'both':
        dfp = df.T
        residues = dfp.index.values
        dfp.index = ["%.2f%% (%d)" % (100 * df[i].values[0] / total_aas, df[i].values[0])
                     for i in dfp.index.values]

        ax2 = dfp.plot(kind='pie', y=0, colors=colors)
        ax2.legend(residues, loc='upper left', bbox_to_anchor=(1.0, 1.0))
        ax2.set_ylabel('')
        ax2.set_xlabel('')
        ax2.figure.set_size_inches(6, 6)

        for t in ax2.texts:
            t.set_fontsize(15)

        if kind == 'pie':
            return ax2

    return ax1, ax2
def modificationlocalization(df):
    """
    Plot the % of Class I, II and III localised peptides according to standard thresholds.

    Generates a pie chart showing the % of peptides that fall within the Class I, II and III
    classifications based on localisation probability. These definitions are::

        Class I     x > 0.75
        Class II    0.50 < x <= 0.75
        Class III   0.25 < x <= 0.50

    Any peptides with a localisation score of <= 0.25 are excluded.

    :param df: Pandas `DataFrame` with a 'Localization prob' column
    :return: matplotlib axis
    """
    colors = ["#78c679", "#d9f0a3", "#ffffe5"]

    lp = df['Localization prob'].values
    class_i = np.sum(lp > 0.75)
    class_ii = np.sum((lp > 0.5) & (lp <= 0.75))
    class_iii = np.sum((lp > 0.25) & (lp <= 0.5))

    cl = [class_i, class_ii, class_iii]
    total_v = class_i + class_ii + class_iii

    df = pd.DataFrame(cl)
    df.index = ["%.2f%% (%d)" % (100 * v / total_v, v) for n, v in enumerate(cl)]

    ax = df.plot(kind='pie', y=0, colors=colors)

    ax.legend(['Class I (> 0.75)', 'Class II (> 0.5, ≤ 0.75)', 'Class III (> 0.25, ≤ 0.5)'],
              loc='upper left', bbox_to_anchor=(1.0, 1.0))
    ax.set_ylabel('')
    ax.set_xlabel('')

    for t in ax.texts:
        t.set_fontsize(15)

    ax.figure.set_size_inches(6, 6)
    return ax
def box(df, s=None, title_from=None, subplots=False, figsize=(18, 6), groups=None, fcol=None, ecol=None, hatch=None,
        ylabel="", xlabel=""):
    """
    Generate a box plot from pandas DataFrame with sample grouping.

    Plot group mean, median and deviations for specific values (proteins) in the dataset. Plotting is controlled via
    the `s` param, which is used as a search string along the y-axis. All matching values will be returned and
    plotted. Multiple search values can be provided as a `list` of `str` and these will be searched as an `and`
    query.

    Box fill and edge colors can be controlled on a full-index basis by passing a `dict` of indexer:color to
    `fcol` and `ecol` respectively. Box hatching can be controlled by passing a `dict` of indexer:hatch to `hatch`.

    :param df: Pandas `DataFrame`
    :param s: `str` search y-axis for matching values (case-insensitive)
    :param title_from: `list` of `str` of index levels to generate title from
    :param subplots: `list` of column groups to separate into subplots, or `False` to derive from the column index
    :param figsize: `tuple` of `int` size of resulting figure
    :param groups: `list` of column groups to plot (in order)
    :param fcol: `dict` of `str` indexer:color where color is hex value or matplotlib color code
    :param ecol: `dict` of `str` indexer:color where color is hex value or matplotlib color code
    :param hatch: `dict` of `str` indexer:hatch where hatch is matplotlib hatch descriptor
    :param ylabel: `str` ylabel for boxplot
    :param xlabel: `str` xlabel for boxplot
    :return: `list` of `Figure`
    """
    df = df.copy()

    if type(s) == str:
        s = [s]

    if title_from is None:
        title_from = list(df.index.names)

    if groups is None:
        groups = list(set(df.columns.get_level_values(0)))

    # Build the combined name/info string using title_from; replace the index
    title_idxs = get_index_list(df.index.names, title_from)
    df.index = [build_combined_label(r, title_idxs) for r in df.index.values]

    if s:
        # Filter the table on the match string (s)
        df = df.iloc[[all([str(si).lower() in l.lower() for si in s]) for l in df.index.values]]

    figures = []
    # Iterate each matching row, building the correct structure dataframe
    for ix in range(df.shape[0]):

        dfi = pd.DataFrame(df.iloc[ix]).T
        label = dfi.index.values[0]
        dfi = process.fold_columns_to_rows(dfi, levels_from=len(df.columns.names) - 1)

        if subplots:
            gs = gridspec.GridSpec(1, len(subplots), width_ratios=[dfi[sp].shape[1] for sp in subplots])
            subplotl = subplots
        elif isinstance(dfi.columns, pd.MultiIndex) and len(dfi.columns.levels) > 1:
            subplotl = dfi.columns.levels[0]
            gs = gridspec.GridSpec(1, len(subplotl), width_ratios=[dfi[sp].shape[1] for sp in subplotl])
        else:
            # Single subplot
            subplotl = [None]
            gs = gridspec.GridSpec(1, 1)

        first_ax = None

        fig = plt.figure(figsize=figsize)

        for n, sp in enumerate(subplotl):

            if sp is None:
                dfp = dfi
            else:
                dfp = dfi[sp]

            ax = fig.add_subplot(gs[n], sharey=first_ax)

            medians = dfp.median(axis=1, level=0).reset_index()

            if groups and all([g in medians.columns.get_level_values(0) for g in groups]):
                medians = medians[groups]

            ax, dic = medians.plot(
                kind='box',
                return_type='both',
                patch_artist=True,
                ax=ax,
            )

            ax.set_xlabel('')
            for bn, c in enumerate(medians.columns.values):
                if sp is None:
                    hier = []
                else:
                    hier = [sp]

                if type(c) == tuple:
                    hier.extend(c)
                else:
                    hier.append(c)

                if fcol:
                    color = hierarchical_match(fcol, hier, None)
                    if color:
                        dic['boxes'][bn].set_color(color)

                if ecol:
                    color = hierarchical_match(ecol, hier, None)
                    if color:
                        dic['boxes'][bn].set_edgecolor(color)

                if hatch:
                    dic['boxes'][bn].set_hatch(hierarchical_match(hatch, hier, ''))

            ax.set_xlabel(xlabel)
            ax.tick_params(axis='both', which='major', labelsize=12)

            if first_ax is None:
                first_ax = ax
            else:
                for yl in ax.get_yticklabels():
                    yl.set_visible(False)

        first_ax.set_ylabel(ylabel, fontsize=14)
        fig.subplots_adjust(wspace=0.05)
        fig.suptitle(label)

        figures.append(fig)

    return figures
def column_correlations(df, cmap=cm.Reds):
    """
    Plot the column-wise correlation matrix of the supplied DataFrame as a heatmap.

    :param df: Pandas `DataFrame`
    :param cmap: matplotlib colormap for the heatmap (default cm.Reds)
    :return: matplotlib axis
    """
    df = df.copy()
    df = df.astype(float)

    dfc = analysis.column_correlations(df)
    dfc = dfc.values

    # Plot the distributions
    fig = plt.figure()
    fig.set_size_inches(12, 12)
    ax = fig.add_subplot(1, 1, 1)

    i = ax.imshow(dfc, cmap=cmap, vmin=0.5, vmax=1.0, interpolation='none')
    ax.figure.colorbar(i)
    ax.set_xticks([])
    ax.set_yticks([])

    return ax
def _process_ix(i, idx):
    """
    Reduce an index to the set of values at the given level(s).

    :param i: Pandas index
    :param idx: list of index level names, or None to use the complete index
    :return: set of (tuples of) index values
    """
    if idx is None:
        return set(i)

    if len(idx) > 1:
        return set(zip(*[i.get_level_values(l) for l in idx]))
    else:
        return set(i.get_level_values(idx[0]))
def venn(df1, df2, df3=None, labels=None, ix1=None, ix2=None, ix3=None, return_intersection=False, fcols=None):
    """
    Plot a 2 or 3-part venn diagram showing the overlap between 2 or 3 pandas DataFrames.

    Provided with two or three Pandas DataFrames, this will return a venn diagram showing the overlap calculated
    between the DataFrame indexes provided as ix1, ix2, ix3. Labels for each DataFrame can be provided as a list in
    the same order, while `fcols` can be used to specify the colors of each section.

    :param df1: Pandas DataFrame
    :param df2: Pandas DataFrame
    :param df3: Pandas DataFrame (optional)
    :param labels: List of labels for the provided dataframes
    :param ix1: Index level name(s) of DataFrame 1 to use for comparison (as a list)
    :param ix2: Index level name(s) of DataFrame 2 to use for comparison (as a list)
    :param ix3: Index level name(s) of DataFrame 3 to use for comparison (as a list)
    :param return_intersection: Return the intersection of the supplied indices
    :param fcols: `dict` of label:color for the provided dataframes
    :return: ax, or ax with intersection
    """
    try:
        import matplotlib_venn as mplv
    except ImportError:
        raise ImportError("To plot venn diagrams, install matplotlib-venn package: pip install matplotlib-venn")

    plt.gcf().clear()

    if labels is None:
        labels = ["A", "B", "C"]

    s1 = _process_ix(df1.index, ix1)
    s2 = _process_ix(df2.index, ix2)
    if df3 is not None:
        s3 = _process_ix(df3.index, ix3)

    kwargs = {}
    if fcols:
        kwargs['set_colors'] = [fcols[l] for l in labels]

    if df3 is not None:
        vn = mplv.venn3([s1, s2, s3], set_labels=labels, **kwargs)
        intersection = s1 & s2 & s3
    else:
        vn = mplv.venn2([s1, s2], set_labels=labels, **kwargs)
        intersection = s1 & s2

    ax = plt.gca()

    if return_intersection:
        return ax, list(intersection)
    else:
        return ax
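# A minimal usage sketch (illustrative; the 'Proteins' level name is an
# assumption -- pass ix* as lists of index level names present in each
# DataFrame):
#
#     ax = venn(df1, df2, labels=['Control', 'Treated'],
#               ix1=['Proteins'], ix2=['Proteins'])
#     ax, common = venn(df1, df2, ix1=['Proteins'], ix2=['Proteins'],
#                       return_intersection=True)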
def sitespeptidesproteins(df, labels=None, colors=None, site_localization_probability=0.75):
    """
    Plot the number of sites, peptides and proteins in the dataset.

    Generates a plot with sites, peptides and proteins displayed hierarchically in chevrons. The site count is
    limited to Class I (localization probability >= 0.75) by default, but this may be altered using the
    `site_localization_probability` parameter.

    Labels and alternate colours may be supplied as a 3-entry iterable.

    :param df: pandas DataFrame to calculate numbers from
    :param labels: list/tuple of 3 strings containing labels
    :param colors: list/tuple of 3 colours as hex codes or matplotlib color codes
    :param site_localization_probability: the cut-off for site inclusion (default=0.75; Class I)
    :return: matplotlib axis
    """
    fig = plt.figure(figsize=(4, 6))
    ax = fig.add_subplot(1, 1, 1)

    shift = 0.5
    values = analysis.sitespeptidesproteins(df, site_localization_probability)

    if labels is None:
        labels = ['Sites (Class I)', 'Peptides', 'Proteins']
    if colors is None:
        colors = ['#756bb1', '#bcbddc', '#dadaeb']

    for n, (c, l, v) in enumerate(zip(colors, labels, values)):
        ax.fill_between([0, 1, 2], np.array([shift, 0, shift]) + n, np.array([1 + shift, 1, 1 + shift]) + n,
                        color=c, alpha=0.5)
        ax.text(1, 0.5 + n, "{}\n{:,}".format(l, v), ha='center', color='k', fontsize=16)

    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_axis_off()

    return ax
def rankintensity(df, colors=None, labels_from='Protein names', number_of_annotations=3, show_go_enrichment=False,
                  go_ids_from=None, go_enrichment='function', go_max_labels=8, go_fdr=None, progress_callback=None):
    """
    Rank intensity plot, showing intensity order vs. raw intensity value S curve.

    Generates a plot showing detected protein intensity plotted against protein intensity rank. A series of colors
    can be provided to segment the S curve into regions. Gene ontology enrichments (as calculated via
    `analysis.go_enrichment`) can be overlaid on the output. Note that since the ranking reflects simple abundance
    there is little meaning to enrichment (FDR will remove most if not all items) and it is best considered an
    annotation of the 'types' of proteins in that region.

    :param df: Pandas DataFrame
    :param colors: `list` of colors to segment the plot into
    :param labels_from: Take labels from this column
    :param number_of_annotations: Number of protein annotations at each tip
    :param show_go_enrichment: Overlay plot with GO enrichment terms
    :param go_ids_from: Get IDs for GO enrichment from this column
    :param go_enrichment: Type of GO enrichment to show
    :param go_max_labels: Maximum number of GO enrichment labels per segment
    :param go_fdr: FDR cutoff to apply to the GO enrichment terms
    :param progress_callback: Callable receiving progress as a `float` in [0, 1]
    :return: matplotlib Axes
    """
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)

    labels = df[labels_from].values

    if go_ids_from:
        ids = df[go_ids_from]
    else:
        if 'Proteins' in df.columns.values:
            ids = df['Proteins']
        elif 'Protein IDs' in df.columns.values:
            ids = df['Protein IDs']
        else:
            ids = None

    y = np.log10(df['Intensity'].values)

    filt = ~np.array(np.isnan(y) | np.isinf(y))
    y = y[filt]
    labels = labels[filt]
    if ids is not None:
        ids = ids[filt]

    sort = np.argsort(y)
    y = y[sort]
    labels = labels[sort]
    if ids is not None:
        ids = ids[sort]

    x_max = y.shape[0]
    y_min, y_max = np.min(y), np.max(y)
    yrange = y_max - y_min

    x = np.arange(x_max)

    ax.set_xlim((-x_max // 3, x_max + x_max // 3))
    ax.set_ylim((y_min - 1, y_max + 1))

    # Set the dimensions so we can plot the labels correctly.
    ax.scatter(x, y, alpha=0, zorder=-1)

    ax.set_ylabel("Peptide intensity ($log_{10}$)")
    ax.set_xlabel("Peptide rank")

    # Defines ranges over which text can be slotted in (avoid y overlapping)
    slot_size = 0.03  # FIXME: We should calculate this
    text_y_slots = np.arange(0.1, 0.95, slot_size)
    text_y_cross = 0.55

    # Build set of standard x offsets at defined y data points
    # For each y slot, find the (nearest) data y, therefore the x
    text_x_slots = []
    inv = ax.transLimits.inverted()
    for ys in text_y_slots:
        _, yd = inv.transform((0, ys))
        text_x_slots.append(ax.transLimits.transform((x[find_nearest_idx(y, yd)], 0))[0])
    text_x_slots = np.array(text_x_slots)

    text_x_slots[text_y_slots < text_y_cross] += 0.15
    text_x_slots[text_y_slots > text_y_cross] -= 0.15

    def annotate_obj(ax, n, labels, xs, ys, idx, yd, ha):
        """Annotate up to n points, re-using text slots for duplicate labels."""
        ni = 1
        previous = {}
        for l, xi, yi in zip(labels, xs, ys):
            if type(l) == str:
                l = get_shortstr(l)

                if l in previous:  # Use previous text slot for annotation; skip annotation part
                    axf, ayf = previous[l]

                elif text_y_slots[idx]:
                    axf = text_x_slots[idx]
                    ayf = text_y_slots[idx]
                    text_x_slots[idx] = np.nan
                    text_y_slots[idx] = np.nan
                    ax.text(axf, ayf, l, transform=ax.transAxes, ha=ha, va='center', color='k')
                    idx += yd
                    ni += 1

                previous[l] = (axf, ayf)

                ax.annotate("", xy=(xi, yi), xycoords='data', xytext=(axf, ayf), textcoords='axes fraction',
                            va='center', color='k',
                            arrowprops=dict(arrowstyle='-', connectionstyle="arc3", ec='k', lw=1), zorder=100)

            if ni > n:
                break

    _n = np.min([labels.shape[0], 20])
    annotate_obj(ax, number_of_annotations, labels[-1:-_n:-1], x[-1:-_n:-1], y[-1:-_n:-1], -1, -1, 'right')
    annotate_obj(ax, number_of_annotations, labels[:_n], x[:_n], y[:_n], 0, +1, 'left')

    if show_go_enrichment and ids is not None:

        # Calculate orders of magnitude (range)
        oomr = int(np.round(np.min(y))), int(np.round(np.max(y)))
        segments = [[x, x + 1] for x in range(*oomr)]
        # Extend the first and last segments to catch the tails
        segments[0][0] -= 3
        segments[-1][1] += 3

        if not isinstance(colors, list):
            if not isinstance(colors, tuple):
                colors = ('#1f77b4', '#d62728')

            # Build a continuous scale from low to high
            cmap = mpl.colors.LinearSegmentedColormap.from_list('custom', list(colors), len(segments))
            colors = [mpl.colors.rgb2hex(cmap(n)) for n in range(len(segments))]

        # Calculate GO enrichment terms for each region
        for n, (s, e) in enumerate(segments):
            if progress_callback:
                progress_callback(float(n) / len(segments))

            mask = (y > s) & (y < e)
            c = x[mask]

            # Comparison relative to background
            gids = list(set(ids[mask]) - set(ids[~mask]))
            go = analysis.go_enrichment(gids, enrichment=go_enrichment, fdr=go_fdr)

            if go is not None:
                labels = [gi[1] for gi in go.index]

                # Filter out less specific GO terms where specific terms exist (simple text matching)
                labels_f = []
                for l in labels:
                    for ll in labels:
                        if l in ll and l != ll:
                            break
                    else:
                        labels_f.append(l)
                labels = labels_f[:go_max_labels]

                # Find axis label point for both start and end
                yr = (ax.transLimits.transform((0, y[c[0]]))[1],
                      ax.transLimits.transform((0, y[c[-1]]))[1])

                if yr[0] < text_y_cross:
                    yr = yr[0] - slot_size, yr[1]
                else:
                    yr = yr[0] + slot_size, yr[1]

                yr = find_nearest_idx(text_y_slots, yr[0]), find_nearest_idx(text_y_slots, yr[1])
                yrange = list(range(yr[0], yr[1]))

                # Center-ish
                if len(yrange) > len(labels):
                    crop = (len(yrange) - len(labels)) // 2
                    if crop > 1:
                        yrange = yrange[crop:-crop]

                # Display ranked top to bottom
                for idx, l in zip(yrange, labels):
                    axf = text_x_slots[idx]
                    ayf = text_y_slots[idx]
                    text_x_slots[idx] = np.nan
                    text_y_slots[idx] = np.nan

                    if ayf > text_y_cross:
                        ha = 'right'
                    else:
                        ha = 'left'

                    ax.text(axf, ayf, l, transform=ax.transAxes, ha=ha, color=colors[n])

            ax.scatter(x[mask], y[mask], s=15, c=colors[n], lw=0, zorder=100)
    else:
        ax.scatter(x, y, s=15, c='k', lw=0, zorder=100)

    return ax
def hierarchical(df, cluster_cols=True, cluster_rows=False, n_col_clusters=False, n_row_clusters=False,
                 row_labels=True, col_labels=True, fcol=None, z_score=0, method='ward', cmap=cm.PuOr_r,
                 return_clusters=False, rdistance_fn=distance.pdist, cdistance_fn=distance.pdist):
    """
    Hierarchical clustering of samples or proteins

    Perform a hierarchical clustering on a pandas DataFrame and display the resulting clustering as a heatmap.
    The axis of clustering can be controlled with `cluster_cols` and `cluster_rows`. By default clustering is
    performed along the X-axis, therefore to cluster samples transpose the DataFrame as it is passed, using `df.T`.

    Samples are z-scored along the 0-axis (y) by default. To override this use the `z_score` param with the axis to
    `z_score` or alternatively, `None`, to turn it off.

    If a `n_col_clusters` or `n_row_clusters` is specified, this defines the number of clusters to identify and
    highlight in the resulting heatmap. At *least* this number of clusters will be selected, in some instances there
    will be more if 2 clusters rank equally at the determined cutoff.

    If specified `fcol` will be used to colour the axes for matching samples.

    :param df: Pandas ``DataFrame`` to cluster
    :param cluster_cols: ``bool`` if ``True`` cluster along column axis
    :param cluster_rows: ``bool`` if ``True`` cluster along row axis
    :param n_col_clusters: ``int`` the ideal number of highlighted clusters in cols
    :param n_row_clusters: ``int`` the ideal number of highlighted clusters in rows
    :param row_labels: ``True`` to label rows with all index levels, or a list of levels to use
    :param col_labels: ``True`` to label columns with all column levels, or a list of levels to use
    :param fcol: ``dict`` of label:colors to be applied along the axes
    :param z_score: ``int`` to specify the axis to Z score or `None` to disable
    :param method: ``str`` describing cluster method, default ward
    :param cmap: matplotlib colourmap for heatmap
    :param return_clusters: ``bool`` return clusters in addition to axis
    :param rdistance_fn: distance function for row clustering (default `scipy.spatial.distance.pdist`)
    :param cdistance_fn: distance function for column clustering (default `scipy.spatial.distance.pdist`)
    :return: matplotlib axis, or axis and cluster data
    """

    def clean_axis(ax):
        """Remove ticks, tick labels, and frame from axis"""
        ax.get_xaxis().set_ticks([])
        ax.get_yaxis().set_ticks([])
        ax.set_facecolor('#ffffff')
        for sp in ax.spines.values():
            sp.set_visible(False)

    def optimize_clusters(clusters, denD, target_n):
        target_n = target_n - 1  # We return edges; not regions
        threshold = np.max(clusters)
        max_iterations = threshold

        i = 0
        while i < max_iterations:
            cc = sch.fcluster(clusters, threshold, 'distance')
            cco = cc[denD['leaves']]
            edges = [n for n in range(cco.shape[0] - 1) if cco[n] != cco[n + 1]]
            n_clusters = len(edges)

            if n_clusters == target_n:
                break

            if n_clusters < target_n:
                threshold = threshold // 2
            elif n_clusters > target_n:
                threshold = int(threshold * 1.5)

            i += 1

        return edges

    dfc = df.copy()

    if z_score is None:
        pass
    elif z_score == 0:
        dfc = (dfc - dfc.median(axis=0)) / dfc.std(axis=0)
    elif z_score == 1:
        dfc = ((dfc.T - dfc.median(axis=1).T) / dfc.std(axis=1).T).T

    # Remove nan/infs
    dfc[np.isinf(dfc)] = 0
    dfc[np.isnan(dfc)] = 0

    # Make norm symmetric about zero
    vmin = dfc.min().min()
    vmax = dfc.max().max()
    vmax = max([vmax, abs(vmin)])  # choose larger of vmin and vmax
    vmin = vmax * -1
    my_norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)

    # Dendrogram single color
    sch.set_link_color_palette(['black'])

    # Cluster
    if cluster_rows:
        row_pairwise_dists = distance.squareform(rdistance_fn(dfc))
        row_clusters = sch.linkage(row_pairwise_dists, method=method)

    if cluster_cols:
        col_pairwise_dists = distance.squareform(cdistance_fn(dfc.T))
        col_clusters = sch.linkage(col_pairwise_dists, method=method)

    # Heatmap with row names
    fig = plt.figure(figsize=(12, 12))
    heatmapGS = gridspec.GridSpec(2, 2, wspace=0.0, hspace=0.0, width_ratios=[0.25, 1], height_ratios=[0.25, 1])

    if cluster_cols:
        # Col dendrogram
        col_denAX = fig.add_subplot(heatmapGS[0, 1])
        col_denD = sch.dendrogram(col_clusters, color_threshold=np.inf)
        clean_axis(col_denAX)
    else:
        col_denD = {'leaves': list(range(0, dfc.shape[1]))}

    rowGSSS = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=heatmapGS[1, 0], wspace=0.0, hspace=0.0,
                                               width_ratios=[1, 0.05])

    if cluster_rows:
        # Row dendrogram
        row_denAX = fig.add_subplot(rowGSSS[0, 0])
        row_denD = sch.dendrogram(row_clusters, color_threshold=np.inf, orientation='right')
        clean_axis(row_denAX)
    else:
        row_denD = {'leaves': list(range(0, dfc.shape[0]))}

    # Row colorbar
    if fcol and 'Group' in dfc.index.names:
        class_idx = dfc.index.names.index('Group')
        classcol = [fcol[x] for x in dfc.index.get_level_values(0)[row_denD['leaves']]]
        classrgb = np.array([colorConverter.to_rgb(c) for c in classcol]).reshape(-1, 1, 3)
        row_cbAX = fig.add_subplot(rowGSSS[0, 1])
        row_axi = row_cbAX.imshow(classrgb, interpolation='nearest', aspect='auto', origin='lower')
        clean_axis(row_cbAX)

    # Heatmap
    heatmapAX = fig.add_subplot(heatmapGS[1, 1])
    axi = heatmapAX.imshow(dfc.iloc[row_denD['leaves'], col_denD['leaves']], interpolation='nearest', aspect='auto',
                           origin='lower', norm=my_norm, cmap=cmap)
    clean_axis(heatmapAX)

    def build_labels(index, ixs):
        zstr = zip(*[index.get_level_values(x) for x in ixs])
        return np.array([" ".join([str(t) for t in i]) if type(i) == tuple else str(i) for i in zstr])

    # Row labels
    if dfc.shape[0] <= 100:
        heatmapAX.set_yticks(range(dfc.shape[0]))
        heatmapAX.yaxis.set_ticks_position('right')
        if row_labels is True:
            row_labels = list(range(len(dfc.index.names)))
        ylabels = build_labels(dfc.index, row_labels)[row_denD['leaves']]
        heatmapAX.set_yticklabels(ylabels)

    # Col labels
    if dfc.shape[1] <= 100:
        heatmapAX.set_xticks(range(dfc.shape[1]))
        if col_labels is True:
            col_labels = list(range(len(dfc.columns.names)))
        xlabels = build_labels(dfc.columns, col_labels)[col_denD['leaves']]
        xlabelsL = heatmapAX.set_xticklabels(xlabels)

        # Rotate labels 90 degrees
        for label in xlabelsL:
            label.set_rotation(90)

    # Remove the tick lines
    for l in heatmapAX.get_xticklines() + heatmapAX.get_yticklines():
        l.set_markersize(0)

    heatmapAX.grid(False)

    edges = None

    if cluster_cols and n_col_clusters:
        edges = optimize_clusters(col_clusters, col_denD, n_col_clusters)
        for edge in edges:
            heatmapAX.axvline(edge + 0.5, color='k', lw=3)

    if cluster_rows and n_row_clusters:
        edges = optimize_clusters(row_clusters, row_denD, n_row_clusters)
        for edge in edges:
            heatmapAX.axhline(edge + 0.5, color='k', lw=3)

    if return_clusters:
        return fig, dfc.iloc[row_denD['leaves'], col_denD['leaves']], edges
    else:
        return fig
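# A minimal usage sketch (illustrative; cluster counts are assumptions --
# transpose with `df.T` to cluster samples rather than values):
#
#     fig = hierarchical(df, cluster_cols=True, n_col_clusters=3, z_score=0)
#     fig, clustered, edges = hierarchical(df, return_clusters=True)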
def correlation(df, cm=cm.PuOr_r, vmin=None, vmax=None, labels=None, show_scatter=False):
    """
    Generate a column-wise correlation plot from the provided data.

    The columns of the supplied dataframe will be correlated (using `analysis.correlation`) to generate a Pearson
    correlation plot heatmap. Scatter plots of correlated samples can also be generated over the redundant half of
    the plot to give a visual indication of the protein distribution.

    :param df: `pandas.DataFrame`
    :param cm: Matplotlib colormap (default cm.PuOr_r)
    :param vmin: Minimum value for colormap normalization
    :param vmax: Maximum value for colormap normalization
    :param labels: Index column to retrieve labels from
    :param show_scatter: Show overlaid scatter plots for each sample in lower-left half. Note that this is slow for
        large numbers of samples.
    :return: `matplotlib.Figure` generated Figure.
    """
    data = analysis.correlation(df).values

    # Plot the distributions
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)

    if vmin is None:
        vmin = np.nanmin(data)

    if vmax is None:
        vmax = np.nanmax(data)

    n_dims = data.shape[0]

    # If showing scatter plots, set the inlay portion to np.nan
    if show_scatter:
        # Get the triangle, other values will be zeroed
        idx = np.tril_indices(n_dims)
        data[idx] = np.nan

    cm.set_bad('w', 1.)
    i = ax.imshow(data, cmap=cm, vmin=vmin, vmax=vmax, interpolation='none')
    fig.colorbar(i)
    fig.axes[0].grid(False)

    if show_scatter:
        figo = mpl.figure.Figure(figsize=(n_dims, n_dims), dpi=300)
        # Create a dummy Agg canvas so we don't have to display/output this intermediate
        canvas = FigureCanvasAgg(figo)

        for x in range(0, n_dims):
            for y in range(x, n_dims):
                ax = figo.add_subplot(n_dims, n_dims, y * n_dims + x + 1)

                if x != y:
                    xd = df.values[:, x]
                    yd = df.values[:, y]
                    ax.scatter(xd, yd, lw=0, s=5, c='k', alpha=0.2)

                ax.grid(False)
                ax.axis('off')

        figo.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0)

        raw = BytesIO()
        figo.savefig(raw, format='png', bbox_inches=0, transparent=True)
        del figo

        raw.seek(0)
        img = mplimg.imread(raw)
        ax2 = fig.add_axes(fig.axes[0].get_position(), label='image', zorder=1)
        ax2.axis('off')
        ax2.imshow(img)

    if labels:
        # Build labels from the supplied axis
        labels = df.columns.get_level_values(labels)

        fig.axes[0].set_xticks(range(n_dims))
        fig.axes[0].set_xticklabels(labels, rotation=90)

        fig.axes[0].set_yticks(range(n_dims))
        fig.axes[0].set_yticklabels(labels)

    return fig
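# A minimal usage sketch (illustrative; the 'Sample' level name is an
# assumption -- pass whichever column index level carries your sample names):
#
#     fig = correlation(df, labels='Sample', show_scatter=False)
#     fig.savefig('correlation.png', dpi=150)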
def comparedist(df1, df2, bins=50):
    """
    Compare the distributions of two DataFrames giving visualisations of:

    - individual and combined distributions
    - distribution of non-common values
    - distribution of non-common values vs. each side

    Plot distribution as area (fill_between) + mean, median vertical bars.

    :param df1: `pandas.DataFrame`
    :param df2: `pandas.DataFrame`
    :param bins: `int` number of bins for histogram
    :return: Figure
    """
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(30, 10))

    xr = (np.nanmin([np.nanmin(df1), np.nanmin(df2)]),
          np.nanmax([np.nanmax(df1), np.nanmax(df2)]))

    def areadist(ax, v, xr, c, bins=100, by=None, alpha=1):
        """
        Plot the histogram distribution but as an area plot
        """
        y, x = np.histogram(v[~np.isnan(v)], bins=bins, range=xr)
        x = x[:-1]

        if by is None:
            by = np.zeros((bins,))

        ax.fill_between(x, y, by, facecolor=c, alpha=alpha)
        return y

    ax1.set_title('Distributions of A and B')
    areadist(ax1, df2.values, xr, c='r', bins=bins)
    areadist(ax1, df1.values, xr, c='k', bins=bins, alpha=0.3)
    ax1.set_xlabel('Value')
    ax1.set_ylabel('Count')

    ax2.set_title('Distributions of A and values unique to B')
    # Calculate what is different and isolate those values
    areadist(ax2, df2.values[df2.values != df1.values], xr, c='r', bins=bins)
    areadist(ax2, df1.values, xr, c='k', bins=bins, alpha=0.3)
    ax2.set_xlabel('Value')
    ax2.set_ylabel('Count')

    # Calculate (at the lowest column index level) the difference between
    # distribution of unique values, vs. those in common
    # Get indices of difference
    dfc = df1.copy()
    dfc[:] = (df2.values != df1.values).astype(int)
    for i in dfc.columns.values:
        dfc[i[:-1]] = np.max(dfc[i[:-1]].values, axis=1)

    ax3.set_title('Distributions of associated values of A and substituted values in B')
    areadist(ax3, df2.values[df2.values != df1.values], xr, c='r', bins=bins)
    areadist(ax3, df1.values[dfc.values == 1], xr, c='k', bins=bins, alpha=0.3)
    ax3.set_xlabel('Value')
    ax3.set_ylabel('Count')

    return fig
def kegg_pathway(df, pathway, a, b=None, ids_from="Proteins", cmap=cm.PuOr_r, is_log2=False, fillna=None, z_score=1):
    """
    Visualize data on a KEGG pathway.

    Maps per-protein values onto a KEGG pathway diagram, using the KEGG color
    pathway web service to generate the annotated image.

    :param df: Pandas `DataFrame`
    :param pathway: `str` KEGG pathway identifier (e.g. "hsa04110")
    :param a: indexer for the first sample group
    :param b: indexer for the second sample group (optional; if given a log2 ratio of the groups is mapped)
    :param ids_from: `str` index level containing UniProt protein IDs
    :param cmap: matplotlib colormap used to color the nodes
    :param is_log2: `bool` is the data log2 transformed already?
    :param fillna: `float` fill NaN values with value
    :param z_score: `int` to specify the axis to Z score or `None` to disable
    :return: `PIL.Image` of the colored pathway (or `None` if the request fails)
    """
    try:
        import uniprot as up
    except ImportError:
        raise ImportError("Mapping from KEGG to UniProt IDs requires uniprot package; pip install uniprot")

    df = df.copy()

    if np.any(df.values < 0) and not is_log2:
        warnings.warn("Input data has negative values. If data is log2 transformed, set is_log2=True.")

    if fillna is not None:
        df = df.fillna(fillna)

    if z_score is None:
        pass
    elif z_score == 0:
        df = (df - df.median(axis=0)) / df.std(axis=0)
    elif z_score == 1:
        df = ((df.T - df.median(axis=1).T) / df.std(axis=1).T).T

    if b is not None:
        # Calculate ratio between two groups
        g1, g2 = df[a].values, df[b].values

        if is_log2:
            dr = np.nanmean(g2, axis=1) - np.nanmean(g1, axis=1)
        else:
            dr = np.log2(np.nanmean(g2, axis=1) / np.nanmean(g1, axis=1))

    else:
        g1 = df[a].values
        dr = np.nanmean(g1, axis=1)

    maxi = np.max(abs(dr))
    norm = mpl.colors.Normalize(vmin=-maxi, vmax=maxi)
    mapper = cm.ScalarMappable(norm=norm, cmap=cmap)  # Orange up

    node_colors = {}
    for p, v in zip(df.index.get_level_values(ids_from), dr):
        pid = str(p).split(";")[-1]
        if "_" in pid:
            pid = pid[:pid.index("_")]
        node_colors[pid] = mpl.colors.rgb2hex(mapper.to_rgba(v))

    global uniprot_kegg_cache

    # Only look up IDs we haven't seen before
    upids = list(node_colors.keys())
    upids = [p for p in upids if p not in uniprot_kegg_cache.keys()]

    if upids:
        new_pairs = up.batch_uniprot_id_mapping_pairs('ACC+ID', 'KEGG_ID', upids)
        uniprot_kegg_cache.update(dict(new_pairs))

        for p in upids:
            if p not in uniprot_kegg_cache:
                uniprot_kegg_cache[p] = None  # Not found, don't look again

    with StringIO() as f:
        f.write('#hsa\tData\n')
        for k, c in list(node_colors.items()):
            if k in uniprot_kegg_cache and uniprot_kegg_cache[k] is not None:
                kid = uniprot_kegg_cache[k]
                f.write('%s\t%s\n' % (kid.split(':')[-1], c))

        # Reset file
        f.seek(0)

        url = 'http://www.kegg.jp/kegg-bin/mcolor_pathway'
        m = MultipartEncoder(
            fields={
                'map': pathway,
                'mapping_list': ('filename', f),
                'mode': 'color',
                'submit': 'Exec',
                'reference': 'white',
            }
        )

        r = requests.post(url, data=m, headers={'Content-Type': m.content_type})

    if r.status_code == 200:
        ms = re.finditer("document.pathwayimage.src='(/tmp/mark_pathway[^']*?)'.*?>(.*?)<", r.text)
        m = list(ms)[0]

        # Download image data
        image = Image.open(requests.get('http://www.kegg.jp%s' % m.group(1), stream=True).raw)

        width, height = image.size  # Get dimensions
        image = image.crop((1, 1, width - 1, height - 1))  # Crop black outline

        return image
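# A minimal usage sketch (illustrative; the pathway ID and group indexers are
# assumptions -- requires network access to the KEGG color pathway service):
#
#     img = kegg_pathway(df, 'hsa04110', a='Control', b='Treated', is_log2=True)
#     if img is not None:
#         img.save('pathway.png')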
def _barrightlabel(ax, name):
    """
    Attach a count label to the right of each horizontal bar.

    :param ax: matplotlib axis containing the bar container
    :param name: values used for the labels (one per bar)
    :return:
    """
    # attach some text labels
    for ii, rect in enumerate(ax.containers[0].patches):
        width = rect.get_width()
        ax.text(width * 1.1, rect.get_y() + rect.get_height() / 2.,
                "{:,}".format(name[ii]),
                ha='left', va='center', fontsize=12, color='k')
def quality_control(df):
    """
    Plot a quality-control summary of the supplied DataFrame as a horizontal bar chart.

    Counts the total number of rows, the number removed by the standard filter columns
    ('Reverse', 'Potential contaminant', 'Contaminant', 'Only identified by site') where
    present, and the number of remaining rows carrying at least one Intensity/Ratio
    quantification.

    :param df: Pandas `DataFrame` (e.g. a MaxQuant output table)
    :return: matplotlib axis
    """
    labels = []
    values = []

    dfc = df.copy()

    labels_f = []
    values_f = []

    for k in ['Reverse', 'Potential contaminant', 'Contaminant', 'Only identified by site']:
        if k in df.columns:
            values_f.append(df[df[k].astype(str) == "+"].shape[0])
            labels_f.append(k)
            dfc = dfc[dfc[k].astype(str) != "+"]

    dfc = dfc.filter(regex='(Intensity|Ratio).*')
    dfc[dfc == 0] = np.nan
    dfq = dfc.dropna(how='all', axis=0)

    values.append(dfq.shape[0])
    labels.append("Quantified")

    values.append(dfc.shape[0])
    labels.append("Filtered")

    values.extend(values_f)
    labels.extend(labels_f)

    labels.append("Total")
    values.append(df.shape[0])

    dfs = pd.DataFrame(values, index=labels)
    ax = dfs.plot(kind='barh', legend=None, logx=True)
    _barrightlabel(ax, dfs.values.flatten())

    return ax