Source code for padua.io

import pandas as pd
import numpy as np


def read_maxquant(f, header=0, **kwargs):
    """
    Read a tab-delimited MaxQuant result table into a dataframe indexed by ``id``.

    Typical inputs are the quantified tables produced by a MaxQuant run, e.g.

        - Proteingroups.txt
        - Phospho (STY)Sites.txt

    :param f: Source file (path or file-like object accepted by ``pandas.read_csv``)
    :param header: Row number(s) holding the column names, forwarded to ``pandas.read_csv``
    :param kwargs: Additional keyword arguments forwarded to ``pandas.read_csv``
    :return: Pandas dataframe of imported data, with the ``id`` column as its index
    """
    table = pd.read_csv(f, delimiter='\t', header=header, **kwargs)
    # The ``id`` column becomes the row index.
    return table.set_index('id')
def read_perseus(f):
    """
    Read a tab-delimited table exported from Perseus.

    The first four rows of the file are consumed as column header rows; only
    the label from the first of those rows is kept for each column.

    :param f: Source file (path or file-like object accepted by ``pandas.read_csv``)
    :return: Pandas dataframe of imported data
    """
    table = pd.read_csv(f, delimiter='\t', header=[0, 1, 2, 3], low_memory=False)

    # Discard the lower three header levels, keeping each top-level label
    # wrapped in a 1-tuple so the columns remain a MultiIndex.
    top_labels = table.columns.get_level_values(0)
    table.columns = pd.MultiIndex.from_tuples([(label,) for label in top_labels])
    return table
def write_perseus(f, df, sep="\t"):
    """
    Export a dataframe to Perseus; recreating the format

    The output has no index column and two leading rows: the column labels,
    followed by a type-annotation row whose first cell is prefixed with
    ``#!{Type}``. Column types are looked up by label; any label that is not
    recognised is written as ``E``.

    The table is written tab-delimited by default, which is what
    :func:`read_perseus` reads back. Pass ``sep=","`` to obtain the
    comma-separated output of earlier versions.

    :param f: Target file (path or file-like object accepted by ``DataFrame.to_csv``)
    :param df: Dataframe to export; it is copied, not modified
    :param sep: Field delimiter used in the output file
    :return: None
    """

    # Perseus type code for each known column label
    # (presumably C = categorical, N = numerical, T = text; E is the fallback
    # for everything else -- confirm against the Perseus documentation).
    FIELD_TYPE_MAP = {
        'Amino acid': 'C',
        'Charge': 'C',
        'Reverse': 'C',
        'Potential contaminant': 'C',
        'Multiplicity': 'C',
        'Localization prob': 'N',
        'PEP': 'N',
        'Score': 'N',
        'Delta score': 'N',
        'Score for localization': 'N',
        'Mass error [ppm]': 'N',
        'Intensity': 'N',
        'Position': 'N',
        'Proteins': 'T',
        'Positions within proteins': 'T',
        'Leading proteins': 'T',
        'Protein names': 'T',
        'Gene names': 'T',
        'Sequence window': 'T',
        'Unique identifier': 'T',
    }

    def map_field_type(n, c):
        # Unknown labels fall back to "E" instead of raising.
        t = FIELD_TYPE_MAP.get(c, "E")

        # In the first element, add type indicator
        if n == 0:
            t = "#!{Type}%s" % t

        return t

    df = df.copy()
    df.columns = pd.MultiIndex.from_tuples(
        [(k, map_field_type(n, k)) for n, k in enumerate(df.columns)],
        names=["Label", "Type"],
    )

    # Move both header levels into the body as the first two rows, so they are
    # written as ordinary data lines.
    df = df.transpose().reset_index().transpose()
    df.to_csv(f, sep=sep, index=False, header=False)
def write_phosphopath(df, f):
    """
    Write the phosphosites of a data frame as a four-column table::

        protein, protein-Rsite, Rsite, multiplicity
        Q13619  Q13619-S10  S10 1
        Q9H3Z4  Q9H3Z4-S10  S10 1
        Q6GQQ9  Q6GQQ9-S100 S100    1
        Q86YP4  Q86YP4-S100 S100    1
        Q9H307  Q9H307-S100 S100    1
        Q8NEY1  Q8NEY1-S1000    S1000   1

    The values are taken from the index levels ``Proteins``, ``Amino acid``,
    ``Positions within proteins`` and ``Multiplicity``. For proteins and
    positions only the leading entry is used; for multiplicity only the last
    character is used.

    The file is written tab-separated, without header or index, to ``f``.

    :param df: Dataframe whose index carries the four levels named above
    :param f: Target file (path or file-like object accepted by ``DataFrame.to_csv``)
    :return: None
    """

    def _leading_entry(text):
        # Cut at the first ';', then ' ', then '_', then '-'.
        for delimiter in (';', ' ', '_', '-'):
            text = text.split(delimiter)[0]
        return text

    index = df.index

    proteins = [_leading_entry(entry) for entry in index.get_level_values('Proteins')]
    residues = index.get_level_values('Amino acid')
    positions = [
        _leading_entry(entry)
        for entry in index.get_level_values('Positions within proteins')
    ]
    multiplicity = [entry[-1] for entry in index.get_level_values('Multiplicity')]

    # Residue + position, e.g. "S10"
    sites = []
    for residue_and_position in zip(residues, positions):
        sites.append("%s%s" % residue_and_position)

    # Protein + site, e.g. "Q13619-S10"
    protein_sites = []
    for protein_and_site in zip(proteins, sites):
        protein_sites.append("%s-%s" % protein_and_site)

    rows = list(zip(proteins, protein_sites, sites, multiplicity))
    table = pd.DataFrame(np.array(rows))
    table.to_csv(f, sep='\t', index=None, header=None)
def write_phosphopath_ratio(df, f, v, a=None, b=None):
    """
    Write out the data frame ratio between two groups
    protein-Rsite-multiplicity-timepoint::

        ID  Ratio
        Q13619-S10-1-1  0.5
        Q9H3Z4-S10-1-1  0.502
        Q6GQQ9-S100-1-1 0.504
        Q86YP4-S100-1-1 0.506
        Q9H307-S100-1-1 0.508
        Q8NEY1-S1000-1-1    0.51
        Q13541-S101-1-1 0.512
        O95785-S1012-2-1    0.514
        O95785-S1017-2-1    0.516
        Q9Y4G8-S1022-1-1    0.518
        P35658-S1023-1-1    0.52

    The ID is built from the index levels ``Proteins``, ``Amino acid``,
    ``Positions within proteins`` and ``Multiplicity``; the timepoint field is
    always written as ``1``. The file is written tab-separated with an
    ``ID``/``Ratio`` header row and no index.

    :param df: Dataframe whose index carries the four levels named above
    :param f: Target file (path or file-like object accepted by ``DataFrame.to_csv``)
    :param v: Ratio values, paired with the rows of ``df`` in order
    :param a: Unused; accepted for backward compatibility
    :param b: Unused; accepted for backward compatibility
    :return: None
    """

    def _protein_id(s):
        # Same reduction as in write_phosphopath: keep the text before the
        # first ';', ' ', '_' and '-'. Defined locally so this function does
        # not depend on a helper that is not available in this module.
        return s.split(';')[0].split(' ')[0].split('_')[0].split('-')[0]

    index = df.index

    proteins = [_protein_id(k) for k in index.get_level_values('Proteins')]
    amino_acids = index.get_level_values('Amino acid')
    positions = [_protein_id(k) for k in index.get_level_values('Positions within proteins')]
    multiplicity = [k[-1] for k in index.get_level_values('Multiplicity')]

    apos = ["%s%s" % x for x in zip(amino_acids, positions)]
    # protein-Rsite-multiplicity-timepoint, with the timepoint fixed to 1
    prar = ["%s-%s-%s-1" % x for x in zip(proteins, apos, multiplicity)]

    # Stringify as before; the reshape keeps a two-column shape even when
    # there are no rows, so the header can still be attached and written.
    rows = np.array(list(zip(prar, v)), dtype=str).reshape(-1, 2)
    phdf = pd.DataFrame(rows, columns=["ID", "Ratio"])
    phdf.to_csv(f, sep='\t', index=False)
def write_r(df, f, sep=",", index_join="@", columns_join="."):
    """
    Export dataframe in a format easily importable to R

    Index fields are joined with "@" and column fields by "." by default.
    Labels that are not tuples (i.e. a flat index or flat columns) are simply
    converted to strings.

    :param df: Dataframe to export; it is copied, not modified
    :param f: Target file (path or file-like object accepted by ``DataFrame.to_csv``)
    :param sep: Field delimiter used in the output file
    :param index_join: String used to join the fields of each index label
    :param columns_join: String used to join the fields of each column label
    :return: None
    """

    def _flatten(labels, joiner):
        # Multi-level labels arrive as tuples; flat labels are single values
        # and must not be iterated (a string would be split into characters).
        return [
            joiner.join(str(part) for part in label) if isinstance(label, tuple) else str(label)
            for label in labels
        ]

    df = df.copy()
    df.index = _flatten(df.index.values, index_join)
    df.columns = _flatten(df.columns.values, columns_join)
    df.to_csv(f, sep=sep)