Source code for uncurl.state_estimation

# state estimation with a Poisson convex mixture model

from .clustering import kmeans_pp, poisson_cluster
from uncurl.nolips import nolips_update_w, objective, sparse_objective
from uncurl.nolips import sparse_nolips_update_w
# try to use the parallel W update; fall back to the serial version if the
# parallel module is unavailable
try:
    from uncurl.nolips_parallel import sparse_nolips_update_w as parallel_sparse_nolips_update_w
except ImportError:
    print('Warning: cannot import parallel sparse nolips; parallel updates are disabled')
    # leave a sentinel so that poisson_estimate_state can detect that the
    # parallel update function is unavailable and fall back to serial updates
    parallel_sparse_nolips_update_w = None
from .preprocessing import cell_normalize, log1p

import numpy as np
from scipy import sparse
from scipy.optimize import minimize
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.extmath import randomized_svd

eps = 1e-8

def _create_w_objective(m, X):
    """
    Creates an objective function and its derivative for W, given M and X (data)

    Args:
        m (array): genes x clusters
        X (array): genes x cells
    """
    genes, clusters = m.shape
    cells = X.shape[1]
    m_sum = m.sum(0)
    def objective(w):
        # w is passed in as a flat vector for optimization purposes,
        # so reshape it into a clusters x cells matrix first
        w = w.reshape((m.shape[1], X.shape[1]))
        d = m.dot(w)+eps
        # derivative of the objective with respect to w_{jk}:
        # sum_i m_{ij} - sum_i m_{ij} * x_{ik} / (Mw)_{ik}
        temp = X/d
        m2 = m.T.dot(temp)
        deriv = m_sum.reshape((clusters, 1)) - m2
        return np.sum(d - X*np.log(d))/genes, deriv.flatten()/genes
    return objective
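# Illustrative sketch: the closure returned by _create_w_objective yields
# (objective value, flattened gradient), so it can be handed directly to
# scipy.optimize.minimize with jac=True, exactly as the L-BFGS-B branch of
# _estimate_w does below (m, X and the starting point w0 are assumed given):
#
#   obj = _create_w_objective(m, X)
#   bounds = [(0, None)] * (m.shape[1]*X.shape[1])
#   res = minimize(obj, w0.flatten(), method='L-BFGS-B', jac=True,
#                  bounds=bounds)
#   w_opt = res.x.reshape((m.shape[1], X.shape[1]))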

def _create_m_objective(w, X):
    """
    Creates an objective function and its derivative for M, given W and X

    Args:
        w (array): clusters x cells
        X (array): genes x cells
    """
    clusters, cells = w.shape
    genes = X.shape[0]
    w_sum = w.sum(1)
    def objective(m):
        m = m.reshape((X.shape[0], w.shape[0]))
        d = m.dot(w)+eps
        temp = X/d
        w2 = w.dot(temp.T)
        deriv = w_sum - w2.T
        return np.sum(d - X*np.log(d))/genes, deriv.flatten()/genes
    return objective

def initialize_from_assignments(assignments, k, max_assign_weight=0.75):
    """
    Creates a weight initialization matrix from Poisson clustering
    assignments.

    Args:
        assignments (array): 1D array of integers, of length cells
        k (int): number of states/clusters
        max_assign_weight (float, optional): between 0 and 1 - how much
            weight to assign to the assigned cluster. Default: 0.75

    Returns:
        init_W (array): k x cells
    """
    cells = len(assignments)
    init_W = np.zeros((k, cells))
    for i, a in enumerate(assignments):
        # this weighting is somewhat arbitrary; it might be better to
        # scale the weights based on k
        init_W[a, i] = max_assign_weight
        for a2 in range(k):
            if a2 != a:
                init_W[a2, i] = (1 - max_assign_weight)/(k - 1)
    return init_W/init_W.sum(0)
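# Worked example (values are illustrative): with assignments [0, 1, 1], k=2
# and the default max_assign_weight=0.75, each cell puts 0.75 on its own
# cluster and splits the remaining 0.25 evenly over the rest:
#
#   initialize_from_assignments(np.array([0, 1, 1]), 2)
#   # array([[0.75, 0.25, 0.25],
#   #        [0.25, 0.75, 0.75]])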
def initialize_means(data, clusters, k):
    """
    Initializes the M matrix given the data and a set of cluster labels.
    Cluster centers are set to the mean of each cluster.

    Args:
        data (array): genes x cells
        clusters (array): 1d array of ints (0...k-1)
        k (int): number of clusters
    """
    init_w = np.zeros((data.shape[0], k))
    if sparse.issparse(data):
        for i in range(k):
            if data[:, clusters == i].shape[1] == 0:
                # if a cluster is empty, use a random cell as its center
                point = np.random.randint(0, data.shape[1])
                init_w[:, i] = data[:, point].toarray().flatten()
            else:
                # memory usage might be a problem here?
                init_w[:, i] = np.array(data[:, clusters == i].mean(1)).flatten() + eps
    else:
        for i in range(k):
            if data[:, clusters == i].shape[1] == 0:
                point = np.random.randint(0, data.shape[1])
                init_w[:, i] = data[:, point].flatten()
            else:
                init_w[:, i] = data[:, clusters == i].mean(1) + eps
    return init_w
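# Illustrative sketch: initialize_means and initialize_from_assignments can
# be combined to build init_means/init_weights by hand from a clustering,
# mirroring what initialize_means_weights does with initialization='cluster'
# (the value k=8 here is arbitrary):
#
#   assignments, _ = poisson_cluster(data, 8)
#   init_M = initialize_means(data, assignments, 8)        # genes x 8
#   init_W = initialize_from_assignments(assignments, 8)   # 8 x cells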
def initialize_weights_nn(data, means, lognorm=True):
    """
    Initializes the weights with a nearest-neighbor approach using the means.

    Note: this function is currently an unfinished stub.
    """
    # TODO
    genes, cells = data.shape
    k = means.shape[1]
    if lognorm:
        data = log1p(cell_normalize(data))
    for i in range(cells):
        for j in range(k):
            pass
def _estimate_w(X, w_init, means, Xsum, update_fn, objective_fn,
        is_sparse=True, parallel=True, threads=4, method='NoLips', tol=1e-10,
        disp=False, inner_max_iters=100, output_name='W', regularization=0.0):
    clusters, cells = w_init.shape
    if method == 'NoLips':
        nolips_iters = inner_max_iters
        m_sum = means.sum(0)
        lams = 1/(2*Xsum)
        for j in range(nolips_iters):
            if is_sparse and parallel:
                w_new = update_fn(X.data, X.indices, X.indptr, X.shape[1],
                        X.shape[0], means, w_init, lams, m_sum,
                        n_threads=threads, regularization=regularization)
            else:
                w_new = update_fn(X, means, w_init, Xsum)
            if tol > 0:
                # stop the inner iteration early once the RMSE change in W
                # falls below tol
                w_diff = np.sqrt(np.sum((w_new - w_init)**2)/(clusters*cells))
                if disp:
                    print('inner iter {0}: w_diff={1}'.format(j, w_diff))
                w_init = w_new
                if w_diff < tol:
                    break
            else:
                w_init = w_new
    elif method == 'L-BFGS-B':
        w_objective = _create_w_objective(means, X)
        w_bounds = [(0, None) for c in range(clusters*cells)]
        w_res = minimize(w_objective, w_init.flatten(), method='L-BFGS-B',
                jac=True, bounds=w_bounds,
                options={'disp': disp, 'maxiter': inner_max_iters})
        w_new = w_res.x.reshape((clusters, cells))
        w_init = w_new
    return w_new

def _call_sparse_obj(X, M, W):
    return sparse_objective(X.data, X.indices, X.indptr, X.shape[1],
            X.shape[0], M, W)
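# Note: _estimate_w handles both halves of the alternating minimization.
# Since X is approximated by M.dot(W), transposing gives X.T ~ W.T.dot(M.T),
# so updating M given W is the same problem as updating W given M on the
# transposed data; poisson_estimate_state below therefore calls
# _estimate_w(XT, means.T, w_new.T, Xsum_m, ...) for the M update.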
def initialize_means_weights(data, clusters, init_means=None,
        init_weights=None, initialization='tsvd', max_assign_weight=0.75):
    """
    Generates initial means and weights for state estimation.

    Returns:
        means (array): genes x clusters
        w_init (array): clusters x cells
    """
    genes, cells = data.shape
    if init_means is None:
        if init_weights is not None:
            if len(init_weights.shape) == 1:
                # init_weights is a 1d array of cluster assignments
                means = initialize_means(data, init_weights, clusters)
            else:
                means = initialize_means(data, init_weights.argmax(0), clusters)
        elif initialization == 'cluster':
            assignments, means = poisson_cluster(data, clusters)
            if init_weights is None:
                init_weights = initialize_from_assignments(assignments,
                        clusters, max_assign_weight=max_assign_weight)
        elif initialization == 'kmpp':
            means, assignments = kmeans_pp(data, clusters)
        elif initialization == 'km':
            km = KMeans(clusters)
            assignments = km.fit_predict(log1p(cell_normalize(data)).T)
            init_weights = initialize_from_assignments(assignments, clusters,
                    max_assign_weight)
            means = initialize_means(data, assignments, clusters)
        elif initialization == 'tsvd':
            n_components = min(50, genes - 1)
            km = KMeans(clusters)
            # use randomized_svd rather than sklearn's TruncatedSVD, because
            # TruncatedSVD has a bug that prevents it from working properly
            # on inputs with more than 2**31 elements
            U, Sigma, VT = randomized_svd(log1p(cell_normalize(data)).T,
                    n_components)
            data_reduced = U*Sigma
            assignments = km.fit_predict(data_reduced)
            init_weights = initialize_from_assignments(assignments, clusters,
                    max_assign_weight)
            means = initialize_means(data, assignments, clusters)
    else:
        means = init_means.copy()
    means = means.astype(float)
    if init_weights is None:
        if init_means is not None:
            if initialization == 'cluster':
                assignments, means = poisson_cluster(data, clusters,
                        init=init_means, max_iters=1)
                w_init = initialize_from_assignments(assignments, clusters,
                        max_assign_weight)
            elif initialization == 'km':
                km = KMeans(clusters, init=log1p(init_means.T), max_iter=1)
                assignments = km.fit_predict(log1p(cell_normalize(data)).T)
                w_init = initialize_from_assignments(assignments, clusters,
                        max_assign_weight)
            else:
                w_init = np.random.random((clusters, cells))
                w_init = w_init/w_init.sum(0)
        else:
            w_init = np.random.random((clusters, cells))
            w_init = w_init/w_init.sum(0)
    else:
        if len(init_weights.shape) == 1:
            # init_weights given as assignments; convert to a weight matrix
            init_weights = initialize_from_assignments(init_weights, clusters,
                    max_assign_weight)
        w_init = init_weights.copy()
    return means, w_init
def poisson_estimate_state(data, clusters, init_means=None, init_weights=None,
        method='NoLips', max_iters=30, tol=1e-10, disp=False,
        inner_max_iters=100, normalize=True, initialization='tsvd',
        parallel=True, threads=4, max_assign_weight=0.75, run_w_first=True,
        constrain_w=False, regularization=0.0):
    """
    Uses a Poisson Convex Mixture model to estimate cell states and
    cell state mixing weights.

    To lower computational costs, use a sparse matrix, set disp to False,
    and set tol to 0.

    Args:
        data (array): genes x cells array or sparse matrix.
        clusters (int): number of mixture components
        init_means (array, optional): initial centers - genes x clusters.
            Default: determined by the initialization argument
        init_weights (array, optional): initial weights - clusters x cells,
            or assignments as produced by clustering. Default: determined by
            the initialization argument
        method (str, optional): optimization method. Current options are
            'NoLips' and 'L-BFGS-B'. Default: 'NoLips'
        max_iters (int, optional): maximum number of iterations. Default: 30
        tol (float, optional): if both M and W change by less than tol
            (RMSE), the iteration is stopped. Default: 1e-10
        disp (bool, optional): whether or not to display optimization
            progress. Default: False
        inner_max_iters (int, optional): number of iterations to run in the
            optimization subroutine for M and W. Default: 100
        normalize (bool, optional): True if the resulting W should sum to 1
            for each cell. Default: True
        initialization (str, optional): how means and weights are
            initialized when they are not provided. Options: 'cluster'
            (Poisson clustering for means and weights), 'kmpp' (kmeans++ for
            means, random weights), 'km' (regular k-means), 'tsvd' (tsvd(50)
            + k-means). Default: 'tsvd'
        parallel (bool, optional): whether to use parallel updates (sparse
            NoLips only). Default: True
        threads (int, optional): how many threads to use in the parallel
            computation. Default: 4
        max_assign_weight (float, optional): for clustering-based
            initializations, how much weight to assign to the max-weight
            cluster. Default: 0.75
        run_w_first (bool, optional): whether to optimize W first (if False,
            M is optimized first). Default: True
        constrain_w (bool, optional): if True, W is normalized after every
            iteration. Default: False
        regularization (float, optional): regularization coefficient for M
            and W. Default: 0 (no regularization)

    Returns:
        M (array): genes x clusters - state means
        W (array): clusters x cells - state mixing components for each cell
        ll (float): final log-likelihood
    """
    genes, cells = data.shape
    means, w_init = initialize_means_weights(data, clusters, init_means,
            init_weights, initialization, max_assign_weight)
    if parallel and parallel_sparse_nolips_update_w is None:
        # the parallel module failed to import; fall back to serial updates
        parallel = False
    X = data.astype(float)
    XT = X.T
    is_sparse = False
    if sparse.issparse(X):
        is_sparse = True
        update_fn = sparse_nolips_update_w
        # convert to csc for efficient column access
        X = sparse.csc_matrix(X)
        XT = sparse.csc_matrix(XT)
        if parallel:
            update_fn = parallel_sparse_nolips_update_w
        Xsum = np.asarray(X.sum(0)).flatten()
        Xsum_m = np.asarray(X.sum(1)).flatten()
        # L-BFGS-B currently doesn't work for sparse matrices
        method = 'NoLips'
        objective_fn = _call_sparse_obj
    else:
        objective_fn = objective
        update_fn = nolips_update_w
        Xsum = X.sum(0)
        Xsum_m = X.sum(1)
        # With NoLips, if the dense data is mostly zeros (fewer than 40%
        # nonzero entries), convert it to a sparse matrix: this almost
        # always improves performance and does not lower accuracy.
        if method == 'NoLips':
            if np.count_nonzero(X) < 0.4*genes*cells:
                is_sparse = True
                X = sparse.csc_matrix(X)
                XT = sparse.csc_matrix(XT)
                update_fn = sparse_nolips_update_w
                if parallel:
                    update_fn = parallel_sparse_nolips_update_w
                objective_fn = _call_sparse_obj
    w_new = w_init
    for i in range(max_iters):
        # TODO: write progress
        if disp:
            print('iter: {0}'.format(i))
        if run_w_first:
            # step 1: given M, estimate W
            w_new = _estimate_w(X, w_new, means, Xsum, update_fn, objective_fn,
                    is_sparse, parallel, threads, method, tol, disp,
                    inner_max_iters, 'W', regularization)
            if constrain_w:
                w_new = w_new/w_new.sum(0)
            if disp:
                w_ll = objective_fn(X, means, w_new)
                print('Finished updating W. Objective value: {0}'.format(w_ll))
            # step 2: given W, update M
            means = _estimate_w(XT, means.T, w_new.T, Xsum_m, update_fn,
                    objective_fn, is_sparse, parallel, threads, method, tol,
                    disp, inner_max_iters, 'M', regularization)
            means = means.T
            if disp:
                w_ll = objective_fn(X, means, w_new)
                print('Finished updating M. Objective value: {0}'.format(w_ll))
        else:
            # step 1: given W, update M
            means = _estimate_w(XT, means.T, w_new.T, Xsum_m, update_fn,
                    objective_fn, is_sparse, parallel, threads, method, tol,
                    disp, inner_max_iters, 'M', regularization)
            means = means.T
            if disp:
                w_ll = objective_fn(X, means, w_new)
                print('Finished updating M. Objective value: {0}'.format(w_ll))
            # step 2: given M, estimate W
            w_new = _estimate_w(X, w_new, means, Xsum, update_fn, objective_fn,
                    is_sparse, parallel, threads, method, tol, disp,
                    inner_max_iters, 'W', regularization)
            if constrain_w:
                w_new = w_new/w_new.sum(0)
            if disp:
                w_ll = objective_fn(X, means, w_new)
                print('Finished updating W. Objective value: {0}'.format(w_ll))
    if normalize:
        w_new = w_new/w_new.sum(0)
    m_ll = objective_fn(X, means, w_new)
    return means, w_new, m_ll
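# Minimal usage sketch (illustrative only, with synthetic data): estimate
# k=4 states from a random genes x cells count matrix. The shapes and
# defaults follow the docstring of poisson_estimate_state above.
if __name__ == '__main__':
    np.random.seed(0)
    genes, cells, k = 200, 100, 4
    # synthetic genes x cells count matrix (stands in for real data)
    toy_data = np.random.poisson(5, size=(genes, cells)).astype(float)
    M, W, ll = poisson_estimate_state(toy_data, k, max_iters=10)
    print('M shape:', M.shape)        # (genes, k)
    print('W shape:', W.shape)        # (k, cells)
    print('final objective:', ll)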