Source code for uncurl.preprocessing

"""
Misc functions...
"""

import numpy as np
from scipy import sparse

from uncurl.sparse_utils import sparse_cell_normalize, sparse_means_var_csc

[docs]def sparse_mean_var(data):
    """
    Calculates the variance for each row of a sparse matrix,
    using the relationship Var = E[x^2] - E[x]^2.

    Returns:
        pair of matrices mean, variance.
    """
    data = sparse.csc_matrix(data)
    return sparse_means_var_csc(data.data,
            data.indices,
            data.indptr,
            data.shape[1],
            data.shape[0])

[docs]def max_variance_genes(data, nbins=5, frac=0.2):
    """
    This function identifies the genes that have the max variance
    across a number of bins sorted by mean.

    Args:
        data (array): genes x cells
        nbins (int): number of bins to sort genes by mean expression level. Default: 10.
        frac (float): fraction of genes to return per bin - between 0 and 1. Default: 0.1

    Returns:
        list of gene indices (list of ints)
    """
    # TODO: profile, make more efficient for large matrices
    # 8000 cells: 0.325 seconds
    # top time: sparse.csc_tocsr, csc_matvec, astype, copy, mul_scalar
    # 73233 cells: 5.347 seconds, 4.762 s in sparse_var
    # csc_tocsr: 1.736 s
    # copy: 1.028 s
    # astype: 0.999 s
    # there is almost certainly something superlinear in this method
    # maybe it's to_csr?
    indices = []
    if sparse.issparse(data):
        means, var = sparse_mean_var(data)
    else:
        means = data.mean(1)
        var = data.var(1)
    mean_indices = means.argsort()
    n_elements = int(data.shape[0]/nbins)
    frac_elements = int(n_elements*frac)
    for i in range(nbins):
        bin_i = mean_indices[i*n_elements : (i+1)*n_elements]
        if i==nbins-1:
            bin_i = mean_indices[i*n_elements :]
        var_i = var[bin_i]
        var_sorted = var_i.argsort()
        top_var_indices = var_sorted[len(bin_i) - frac_elements:]
        ind = bin_i[top_var_indices]
        # filter out genes with zero variance
        ind = [index for index in ind if var[index]>0]
        indices.extend(ind)
    return indices

[docs]def cell_normalize(data):
    """
    Returns the data where the expression is normalized so that the total
    count per cell is equal.
    """
    if sparse.issparse(data):
        data = sparse.csc_matrix(data.astype(float))
        # normalize in-place
        sparse_cell_normalize(data.data,
                data.indices,
                data.indptr,
                data.shape[1],
                data.shape[0])
        return data
    data_norm = data.astype(float)
    total_umis = []
    for i in range(data.shape[1]):
        di = data_norm[:,i]
        total_umis.append(di.sum())
        di /= total_umis[i]
    med = np.median(total_umis)
    data_norm *= med
    return data_norm

[docs]def log1p(data):
    """
    Returns ln(data+1), whether the original data is dense or sparse.
    """
    if sparse.issparse(data):
        return data.log1p()
    else:
        return np.log1p(data)