Source code for uncurl.nb_state_estimation

# state estimation with NB convex mixture model

from .clustering import kmeans_pp
from .nb_clustering import nb_fit, find_nb_genes
from .state_estimation import initialize_from_assignments

import numpy as np
from scipy.optimize import minimize

eps=1e-8

def _create_w_objective(m, X, R):
    """
    Creates an objective function and its derivative for W, given M and X (data)

    Args:
        m (array): genes x clusters
        X (array): genes x cells
        R (array): 1 x genes
    """
    genes, clusters = m.shape
    cells = X.shape[1]
    R1 = R.reshape((genes, 1)).dot(np.ones((1, cells)))
    def objective(w):
        # convert w into a matrix first... because it's a vector for
        # optimization purposes
        w = w.reshape((m.shape[1], X.shape[1]))
        d = m.dot(w)+eps
        return np.sum((X + R1)*np.log(d + R1) - X*np.log(d))/genes
    def deriv(w):
        # derivative of objective wrt all elements of w
        # for w_{ij}, the derivative is... m_j1+...+m_jn sum over genes minus 
        # x_ij
        w2 = w.reshape((m.shape[1], X.shape[1]))
        d = m.dot(w2)+eps
        temp = X/d
        temp2 = (X+R1)/(d+R1)
        m1 = m.T.dot(temp2)
        m2 = m.T.dot(temp)
        deriv = m1 - m2
        return deriv.flatten()/genes
    return objective, deriv

def _create_m_objective(w, X, R):
    """
    Creates an objective function and its derivative for M, given W and X

    Args:
        w (array): clusters x cells
        X (array): genes x cells
        R (array): 1 x genes
    """
    clusters, cells = w.shape
    genes = X.shape[0]
    R1 = R.reshape((genes, 1)).dot(np.ones((1, cells)))
    def objective(m):
        m = m.reshape((X.shape[0], w.shape[0]))
        d = m.dot(w)+eps
        return np.sum((X+R1)*np.log(d + R1) - X*np.log(d))/genes
    def deriv(m):
        m2 = m.reshape((X.shape[0], w.shape[0]))
        d = m2.dot(w)+eps
        temp = X/d
        temp2 = (X+R1)/(d+R1)
        w1 = w.dot(temp2.T)
        w2 = w.dot(temp.T)
        deriv = w1.T - w2.T
        return deriv.flatten()/genes
    return objective, deriv

[docs]def nb_estimate_state(data, clusters, R=None, init_means=None, init_weights=None, max_iters=10, tol=1e-4, disp=True, inner_max_iters=400, normalize=True): """ Uses a Negative Binomial Mixture model to estimate cell states and cell state mixing weights. If some of the genes do not fit a negative binomial distribution (mean > var), then the genes are discarded from the analysis. Args: data (array): genes x cells clusters (int): number of mixture components R (array, optional): vector of length genes containing the dispersion estimates for each gene. Default: use nb_fit init_means (array, optional): initial centers - genes x clusters. Default: kmeans++ initializations init_weights (array, optional): initial weights - clusters x cells. Default: random(0,1) max_iters (int, optional): maximum number of iterations. Default: 10 tol (float, optional): if both M and W change by less than tol (in RMSE), then the iteration is stopped. Default: 1e-4 disp (bool, optional): whether or not to display optimization parameters. Default: True inner_max_iters (int, optional): Number of iterations to run in the scipy minimizer for M and W. Default: 400 normalize (bool, optional): True if the resulting W should sum to 1 for each cell. Default: True. Returns: M (array): genes x clusters - state centers W (array): clusters x cells - state mixing components for each cell R (array): 1 x genes - NB dispersion parameter for each gene ll (float): Log-likelihood of final iteration """ # TODO: deal with non-NB data... just ignore it? or do something else? data_subset = data.copy() genes, cells = data_subset.shape # 1. use nb_fit to get inital Rs if R is None: nb_indices = find_nb_genes(data) data_subset = data[nb_indices, :] if init_means is not None and len(init_means) > sum(nb_indices): init_means = init_means[nb_indices, :] genes, cells = data_subset.shape R = np.zeros(genes) P, R = nb_fit(data_subset) if init_means is None: means, assignments = kmeans_pp(data_subset, clusters) else: means = init_means.copy() clusters = means.shape[1] w_init = np.random.random(cells*clusters) if init_weights is not None: if len(init_weights.shape)==1: init_weights = initialize_from_assignments(init_weights, clusters) w_init = init_weights.reshape(cells*clusters) m_init = means.reshape(genes*clusters) ll = np.inf # repeat steps 1 and 2 until convergence: for i in range(max_iters): if disp: print('iter: {0}'.format(i)) w_bounds = [(0, 1.0) for x in w_init] m_bounds = [(0, None) for x in m_init] # step 1: given M, estimate W w_objective, w_deriv = _create_w_objective(means, data_subset, R) w_res = minimize(w_objective, w_init, method='L-BFGS-B', jac=w_deriv, bounds=w_bounds, options={'disp':disp, 'maxiter':inner_max_iters}) w_diff = np.sqrt(np.sum((w_res.x-w_init)**2))/w_init.size w_new = w_res.x.reshape((clusters, cells)) w_init = w_res.x # step 2: given W, update M m_objective, m_deriv = _create_m_objective(w_new, data_subset, R) # method could be 'L-BFGS-B' or 'SLSQP'... SLSQP gives a memory error... # or use TNC... m_res = minimize(m_objective, m_init, method='L-BFGS-B', jac=m_deriv, bounds=m_bounds, options={'disp':disp, 'maxiter':inner_max_iters}) m_diff = np.sqrt(np.sum((m_res.x-m_init)**2))/m_init.size m_new = m_res.x.reshape((genes, clusters)) m_init = m_res.x ll = m_res.fun means = m_new if w_diff < tol and m_diff < tol: break if normalize: w_new = w_new/w_new.sum(0) return m_new, w_new, R, ll