import numpy as np
import scipy as sp
import scipy.interpolate
[docs]def qvalues(pv, m = None, verbose = False, lowmem = False, pi0 = None):
"""
Copyright (c) 2012, Nicolo Fusi, University of Sheffield
All rights reserved.
Estimates q-values from p-values
Args
=====
m: number of tests. If not specified m = pv.size
verbose: print verbose messages? (default False)
lowmem: use memory-efficient in-place algorithm
pi0: if None, it's estimated as suggested in Storey and Tibshirani, 2003.
For most GWAS this is not necessary, since pi0 is extremely likely to be
1
:param pv:
:param m:
:param verbose:
:param lowmem:
:param pi0:
:return:
"""
assert(pv.min() >= 0 and pv.max() <= 1), "p-values should be between 0 and 1"
original_shape = pv.shape
pv = pv.ravel() # flattens the array in place, more efficient than flatten()
if m == None:
m = float(len(pv))
else:
# the user has supplied an m
m *= 1.0
# if the number of hypotheses is small, just set pi0 to 1
if len(pv) < 100 and pi0 == None:
pi0 = 1.0
elif pi0 != None:
pi0 = pi0
else:
# evaluate pi0 for different lambdas
pi0 = []
lam = sp.arange(0, 0.90, 0.01)
counts = sp.array([(pv > i).sum() for i in sp.arange(0, 0.9, 0.01)])
for l in range(len(lam)):
pi0.append(counts[l]/(m*(1-lam[l])))
pi0 = sp.array(pi0)
# fit natural cubic spline
tck = sp.interpolate.splrep(lam, pi0, k = 3)
pi0 = sp.interpolate.splev(lam[-1], tck)
if pi0 > 1:
if verbose:
print("got pi0 > 1 (%.3f) while estimating qvalues, setting it to 1" % pi0)
pi0 = 1.0
assert(pi0 >= 0 and pi0 <= 1), "pi0 is not between 0 and 1: %f" % pi0
if lowmem:
# low memory version, only uses 1 pv and 1 qv matrices
qv = sp.zeros((len(pv),))
last_pv = pv.argmax()
qv[last_pv] = (pi0*pv[last_pv]*m)/float(m)
pv[last_pv] = -sp.inf
prev_qv = last_pv
for i in range(int(len(pv))-2, -1, -1):
cur_max = pv.argmax()
qv_i = (pi0*m*pv[cur_max]/float(i+1))
pv[cur_max] = -sp.inf
qv_i1 = prev_qv
qv[cur_max] = min(qv_i, qv_i1)
prev_qv = qv[cur_max]
else:
p_ordered = sp.argsort(pv)
pv = pv[p_ordered]
qv = pi0 * m/len(pv) * pv
qv[-1] = min(qv[-1],1.0)
for i in range(len(pv)-2, -1, -1):
qv[i] = min(pi0*m*pv[i]/(i+1.0), qv[i+1])
# reorder qvalues
qv_temp = qv.copy()
qv = sp.zeros_like(qv)
qv[p_ordered] = qv_temp
# reshape qvalues
qv = qv.reshape(original_shape)
return qv
[docs]def get_protein_id(s):
"""
Return a shortened string, split on spaces, underlines and semicolons.
Extract the first, highest-ranked protein ID from a string containing
protein IDs in MaxQuant output format: e.g. P07830;P63267;Q54A44;P63268
Long names (containing species information) are eliminated (split on ' ') and
isoforms are removed (split on '_').
:param s: protein IDs in MaxQuant format
:type s: str or unicode
:return: string
"""
return str(s).split(';')[0].split(' ')[0].split('_')[0]
[docs]def get_protein_ids(s):
"""
Return a list of shortform protein IDs.
Extract all protein IDs from a string containing
protein IDs in MaxQuant output format: e.g. P07830;P63267;Q54A44;P63268
Long names (containing species information) are eliminated (split on ' ') and
isoforms are removed (split on '_').
:param s: protein IDs in MaxQuant format
:type s: str or unicode
:return: list of string ids
"""
return [p.split(' ')[0].split('_')[0] for p in s.split(';') ]
[docs]def get_protein_id_list(df, level=0):
"""
Return a complete list of shortform IDs from a DataFrame
Extract all protein IDs from a dataframe from multiple rows containing
protein IDs in MaxQuant output format: e.g. P07830;P63267;Q54A44;P63268
Long names (containing species information) are eliminated (split on ' ') and
isoforms are removed (split on '_').
:param df: DataFrame
:type df: pandas.DataFrame
:param level: Level of DataFrame index to extract IDs from
:type level: int or str
:return: list of string ids
"""
protein_list = []
for s in df.index.get_level_values(level):
protein_list.extend( get_protein_ids(s) )
return list(set(protein_list))
[docs]def get_shortstr(s):
"""
Return the first part of a string before a semicolon.
Extract the first, highest-ranked protein ID from a string containing
protein IDs in MaxQuant output format: e.g. P07830;P63267;Q54A44;P63268
:param s: protein IDs in MaxQuant format
:type s: str or unicode
:return: string
"""
return str(s).split(';')[0]
[docs]def get_index_list(l, ms):
"""
:param l:
:param ms:
:return:
"""
if type(ms) != list and type(ms) != tuple:
ms = [ms]
return [l.index(s) for s in ms if s in l]
[docs]def build_combined_label(sl, idxs, sep=' '):
"""
Generate a combined label from a list of indexes
into sl, by joining them with `sep` (str).
:param sl: Strings to combine
:type sl: dict of str
:param idxs: Indexes into sl
:type idxs: list of sl keys
:param sep:
:return: `str` of combined label
"""
return sep.join([get_shortstr(str(sl[n])) for n in idxs])
[docs]def hierarchical_match(d, k, default=None):
"""
Match a key against a dict, simplifying element at a time
:param df: DataFrame
:type df: pandas.DataFrame
:param level: Level of DataFrame index to extract IDs from
:type level: int or str
:return: hiearchically matched value or default
"""
if d is None:
return default
if type(k) != list and type(k) != tuple:
k = [k]
for n, _ in enumerate(k):
key = tuple(k[0:len(k)-n])
if len(key) == 1:
key = key[0]
try:
d[key]
except:
pass
else:
return d[key]
return default
[docs]def chunks(seq, num):
"""
Separate `seq` (`np.array`) into `num` series of as-near-as possible equal
length values.
:param seq: Sequence to split
:type seq: np.array
:param num: Number of parts to split sequence into
:type num: int
:return: np.array of split parts
"""
avg = len(seq) / float(num)
out = []
last = 0.0
while last < len(seq):
out.append(seq[int(last):int(last + avg)])
last += avg
return np.array(out)
[docs]def calculate_s0_curve(s0, minpval, maxpval, minratio, maxratio, curve_interval=0.1):
"""
Calculate s0 curve for volcano plot.
Taking an min and max p value, and a min and max ratio, calculate an smooth
curve starting from parameter `s0` in each direction.
The `curve_interval` parameter defines the smoothness of the resulting curve.
:param s0: `float` offset of curve from interset
:param minpval: `float` minimum p value
:param maxpval: `float` maximum p value
:param minratio: `float` minimum ratio
:param maxratio: `float` maximum ratio
:param curve_interval: `float` stepsize (smoothness) of curve generator
:return: x, y, fn x,y points of curve, and fn generator
"""
mminpval = -np.log10(minpval)
mmaxpval = -np.log10(maxpval)
maxpval_adjust = mmaxpval - mminpval
ax0 = (s0 + maxpval_adjust * minratio) / maxpval_adjust
edge_offset = (maxratio-ax0) % curve_interval
max_x = maxratio-edge_offset
if (max_x > ax0):
x = np.arange(ax0, max_x, curve_interval)
else:
x = np.arange(max_x, ax0, curve_interval)
fn = lambda x: 10 ** (-s0/(x-minratio) - mminpval)
y = fn(x)
return x, y, fn
[docs]def find_nearest_idx(array,value):
"""
:param array:
:param value:
:return:
"""
array = array.copy()
array[np.isnan(array)] = 1
idx = (np.abs(array-value)).argmin()
return idx