Source code for padua.imputation

Algorithms for imputing missing values in data

import numpy as np

try:
    import sklearn
except ImportError:
    sklearn = False
else:
    from sklearn.cross_decomposition import PLSRegression

def gaussian(df, width=0.3, downshift=-1.8, prefix=None):
    """
    Impute missing values by drawing from a down-shifted normal distribution.

    For every selected column the mean and standard deviation of the measured
    (non-null) values are computed, and each missing entry is replaced by a
    random draw from a normal distribution centred on
    ``mean + downshift * std`` with scale ``width * std``.

    :param df: DataFrame holding the values to impute. It is copied, so the
        caller's frame is left untouched.
    :param width: Scale factor for the imputed distribution relative to the
        standard deviation of measured values. Can be a single number or a
        sequence (list, tuple, array) with one entry per selected column.
    :param downshift: Shift the imputed values down, in units of std. dev.
        Can be a single number or a sequence with one entry per selected
        column.
    :param prefix: The column prefix for imputed columns. When falsy, every
        column is imputed.
    :return: Tuple ``(df, imputed)``: the imputed copy of the data, and a
        boolean DataFrame marking every cell of the input that was null
        before imputation (including cells in columns that were not selected).
    :raises ValueError: If ``width`` or ``downshift`` is a sequence whose
        length differs from the number of selected columns.
    """
    df = df.copy()
    imputed = df.isnull()  # Keep track of what's real

    # Positional indices of the columns that will be imputed.
    mycols = np.arange(0, df.shape[1])
    if prefix:
        # dtype=bool keeps the mask usable even when the frame has no columns.
        mask = np.array(
            [label.startswith(prefix) for label in df.columns.values],
            dtype=bool,
        )
        mycols = mycols[mask]

    # Broadcast scalar parameters to one value per selected column.
    if np.ndim(width) == 0:
        width = [width] * len(mycols)
    elif len(mycols) != len(width):
        raise ValueError("Length of iterable 'width' does not match # of columns")

    if np.ndim(downshift) == 0:
        downshift = [downshift] * len(mycols)
    elif len(mycols) != len(downshift):
        raise ValueError("Length of iterable 'downshift' does not match # of columns")

    # The parameter sequences are aligned with ``mycols`` (not with the full
    # column list), so walk them together instead of indexing the parameters
    # by absolute column position.
    for col, col_width, col_downshift in zip(mycols, width, downshift):
        data = df.iloc[:, col]
        mask = data.isnull().values
        mean = data.mean(axis=0)
        stddev = data.std(axis=0)

        m = mean + col_downshift * stddev
        s = stddev * col_width

        # Draw one value per row and keep only those at missing positions;
        # drawing the full column keeps seeded results identical to the
        # previous implementation.
        values = np.random.normal(loc=m, scale=s, size=df.shape[0])

        # Now fill them in
        df.iloc[mask, col] = values[mask]

    return df, imputed
def pls(df):
    """
    A simple implementation of a least-squares approach to imputation using
    partial least squares regression (PLS).

    Rows without any missing value are used as predictors. For every row that
    has missing values, the gaps are temporarily filled with the row median, a
    two-component PLS model is fitted with the complete rows (transposed, so
    each column of ``df`` is one sample) as input and the row as target, and
    the model's predictions for the originally-missing columns are written
    into the result.

    Infinite values are treated as missing.

    :param df: Numeric DataFrame to impute. It is copied, so the caller's
        frame is left untouched.
    :return: Tuple ``(dfi, imputed)``: the imputed copy of the data, and a
        boolean DataFrame marking the cells that were missing (NaN or
        infinite) in the input.
    :raises ImportError: If scikit-learn is not installed.

    NOTE(review): the fit needs enough complete rows for a two-component
    model; with too few, scikit-learn is expected to raise -- confirm the
    desired behaviour for that case.
    """
    # Imported here so the rest of the module works without scikit-learn.
    try:
        from sklearn.cross_decomposition import PLSRegression
    except ImportError as err:
        raise ImportError(
            'This library depends on scikit-learn (sklearn) to perform PLS-based imputation'
        ) from err

    df = df.copy()
    df[np.isinf(df)] = np.nan

    # Complete rows: the predictor matrix for every model.
    dfo = df.dropna(how='any', axis=0)
    dfo = dfo.astype(np.float64)
    # Transposed so that each column of df is a sample and each complete row
    # is a feature; hoisted because it is identical for every fitted row.
    features = dfo.values.T

    dfi = df.copy()
    imputed = df.isnull()  # Keep track of what's real

    # Positions of the rows (proteins) with at least one missing value.
    # Working by position avoids label/position ambiguity when writing back.
    missing_rows = np.flatnonzero(imputed.any(axis=1).values)
    total_n = len(missing_rows)

    plsr = PLSRegression(n_components=2)

    for n, row in enumerate(missing_rows):
        # Generate model for this protein from missing data
        target = df.iloc[row].values.astype(np.float64)  # astype copies
        gaps = np.isnan(target)
        ixes = np.flatnonzero(gaps)

        # Fill missing values with row median for calculation
        target[gaps] = np.nanmedian(target)
        plsr.fit(features, target)

        # For each missing value, calculate imputed value from the column
        # data input. predict() needs a 2-D array and may return a 1-D or
        # 2-D result depending on the scikit-learn version, hence ravel().
        predicted = np.ravel(plsr.predict(features[ixes]))
        dfi.iloc[row, ixes] = predicted

        print("%d%%" % ((n / total_n) * 100), end="\r")

    return dfi, imputed