Source code for uncurl.robust.state_estimation

# state estimation with poisson convex mixture model

from uncurl.clustering import kmeans_pp
from uncurl.state_estimation import initialize_from_assignments, nolips_update_w, _create_w_objective, _create_m_objective

import numpy as np
from scipy.optimize import minimize
from scipy.special import gammaln

eps=1e-10

def _poisson_calculate_lls(X, M, W, use_constant=True, add_eps=True):
    """
    For hard thresholding: this calculates the log-likelihood of each
    gene, and returns a list of log-likelihoods.
    """
    genes, cells = X.shape
    L = np.zeros(genes)
    d = M.dot(W)
    if add_eps:
        d += 1e-30
    # d[d==0] = np.min(d[d>0])/1e4
    LLs = X*np.log(d) - d
    if use_constant:
        LLs -= gammaln(X+1)
    L = np.sum(LLs, 1)
    L[np.isnan(L)] = -np.inf
    return L


[docs]def robust_estimate_state(data, clusters, dist='Poiss', init_means=None, init_weights=None, method='NoLips', max_iters=10, tol=1e-10, disp=True, inner_max_iters=100, reps=1, normalize=True, gene_portion=0.2, use_constant=True):
    """
    Uses a Poisson Covex Mixture model to estimate cell states and
    cell state mixing weights.

    Args:
        data (array): genes x cells
        clusters (int): number of mixture components
        dist (string, optional): Distribution used - only 'Poiss' is implemented. Default: 'Poiss'
        init_means (array, optional): initial centers - genes x clusters. Default: kmeans++ initializations
        init_weights (array, optional): initial weights - clusters x cells, or assignments as produced by clustering. Default: random(0,1)
        method (str, optional): optimization method. Options include 'NoLips' or 'L-BFGS-B'. Default: 'NoLips'.
        max_iters (int, optional): maximum number of iterations. Default: 10
        tol (float, optional): if both M and W change by less than tol, then the iteration is stopped. Default: 1e-10
        disp (bool, optional): whether or not to display optimization parameters. Default: True
        inner_max_iters (int, optional): Number of iterations to run in the optimization subroutine for M and W. Default: 100
        normalize (bool, optional): True if the resulting W should sum to 1 for each cell. Default: True.
        gene_portion (float, optional): The proportion of genes to use for estimating W after hard thresholding. Default: 0.2

    Returns:
        M (array): genes x clusters - state means
        W (array): clusters x cells - state mixing components for each cell
        ll (float): final log-likelihood
        genes (array): 1d array of all genes used in final iteration.
    """
    genes, cells = data.shape
    if init_means is None:
        means, assignments = kmeans_pp(data, clusters)
    else:
        means = init_means.copy()
    clusters = means.shape[1]
    w_init = np.random.random((clusters, cells))
    if init_weights is not None:
        if len(init_weights.shape)==1:
            init_weights = initialize_from_assignments(init_weights, clusters)
    # repeat steps 1 and 2 until convergence:
    ll = np.inf
    # objective functions...
    w_obj = _create_w_objective
    m_obj = _create_m_objective
    ll_func = _poisson_calculate_lls
    included_genes = np.arange(genes)
    num_genes = int(np.ceil(gene_portion*genes))
    if disp:
        print('num_genes: {0}'.format(num_genes))
    nolips_iters = inner_max_iters
    X = data.astype(float)
    Xsum = X.sum(0)
    Xsum_m = X.sum(1)
    for i in range(max_iters):
        if disp:
            print('iter: {0}'.format(i))
        # step 1: given M, estimate W
        w_objective = w_obj(means[included_genes,:], data[included_genes,:])
        if method == 'NoLips':
            for j in range(nolips_iters):
                w_new = nolips_update_w(X[included_genes,:], means[included_genes,:], w_init, Xsum)
                #w_new = w_res.x.reshape((clusters, cells))
                #w_new = w_new/w_new.sum(0)
                w_init = w_new
        elif method == 'L-BFGS-B':
            w_bounds = [(0, None) for c in range(clusters*cells)]
            w_res = minimize(w_objective, w_init.flatten(),
                    method='L-BFGS-B', jac=True, bounds=w_bounds,
                    options={'disp':disp, 'maxiter':inner_max_iters})
            w_new = w_res.x.reshape((clusters, cells))
            w_init = w_new
        w_ll, w_deriv = w_objective(w_new.reshape(clusters*cells))
        #w_diff = np.sqrt(np.sum((w_res.x-w_init)**2))/w_init.size
        #w_init = w_res.x
        #w_new = w_res.x.reshape((clusters, cells))
        # step 2: given W, update M
        w_ll, w_deriv = w_objective(w_new.reshape(clusters*cells))
        if disp:
            print('Finished updating W. Objective value: {0}'.format(w_ll))
        # step 2: given W, update M
        m_objective = m_obj(w_new, data[included_genes,:])
        if method == 'NoLips':
            for j in range(nolips_iters):
                m_new = nolips_update_w(X[included_genes,:].T, w_new.T, means[included_genes,:].T, Xsum_m)
                means[included_genes,:] = m_new.T
        elif method == 'L-BFGS-B':
            m_bounds = [(0, None) for c in range(clusters*len(included_genes))]
            m_res = minimize(m_objective, means[included_genes,:].flatten(),
                    method='L-BFGS-B', jac=True, bounds=m_bounds,
                    options={'disp':disp, 'maxiter':inner_max_iters})
            means[included_genes,:] = m_res.x.reshape((len(included_genes), clusters))
        m_ll, m_deriv = m_objective(means[included_genes,:].reshape(len(included_genes)*clusters))
        if disp:
            print('Finished updating M. Objective value: {0}'.format(m_ll))

        # step 3: hard thresholding/gene subset selection
        lls = ll_func(data, means, w_new, use_constant)
        if i < max_iters - 1:
            included_genes = lls.argsort()[::-1][:num_genes]
            if disp:
                print(lls[included_genes])
                print(included_genes)
                print('selected number of genes: ' + str(len(included_genes)))
    if normalize:
        w_new = w_new/w_new.sum(0)
    return means, w_new, m_ll, included_genes