Source code for uncurl.nb_state_estimation

# state estimation with NB convex mixture model

from .clustering import kmeans_pp
from .nb_clustering import nb_fit, find_nb_genes
from .state_estimation import initialize_from_assignments

import numpy as np
from scipy.optimize import minimize

eps=1e-8

def _create_w_objective(m, X, R):
    """
    Creates an objective function and its derivative for W, given M and X (data)

    Args:
        m (array): genes x clusters
        X (array): genes x cells
        R (array): 1 x genes
    """
    genes, clusters = m.shape
    cells = X.shape[1]
    R1 = R.reshape((genes, 1)).dot(np.ones((1, cells)))
    def objective(w):
        # convert w into a matrix first... because it's a vector for
        # optimization purposes
        w = w.reshape((m.shape[1], X.shape[1]))
        d = m.dot(w)+eps
        return np.sum((X + R1)*np.log(d + R1) - X*np.log(d))/genes
    def deriv(w):
        # derivative of objective wrt all elements of w
        # for w_{ij}, the derivative is... m_j1+...+m_jn sum over genes minus 
        # x_ij
        w2 = w.reshape((m.shape[1], X.shape[1]))
        d = m.dot(w2)+eps
        temp = X/d
        temp2 = (X+R1)/(d+R1)
        m1 = m.T.dot(temp2)
        m2 = m.T.dot(temp)
        deriv = m1 - m2
        return deriv.flatten()/genes
    return objective, deriv

def _create_m_objective(w, X, R):
    """
    Creates an objective function and its derivative for M, given W and X

    Args:
        w (array): clusters x cells
        X (array): genes x cells
        R (array): 1 x genes
    """
    clusters, cells = w.shape
    genes = X.shape[0]
    R1 = R.reshape((genes, 1)).dot(np.ones((1, cells)))
    def objective(m):
        m = m.reshape((X.shape[0], w.shape[0]))
        d = m.dot(w)+eps
        return np.sum((X+R1)*np.log(d + R1) - X*np.log(d))/genes
    def deriv(m):
        m2 = m.reshape((X.shape[0], w.shape[0]))
        d = m2.dot(w)+eps
        temp = X/d
        temp2 = (X+R1)/(d+R1)
        w1 = w.dot(temp2.T)
        w2 = w.dot(temp.T)
        deriv = w1.T - w2.T
        return deriv.flatten()/genes
    return objective, deriv

[docs]def nb_estimate_state(data, clusters, R=None, init_means=None, init_weights=None, max_iters=10, tol=1e-4, disp=True, inner_max_iters=400, normalize=True):
    """
    Uses a Negative Binomial Mixture model to estimate cell states and
    cell state mixing weights.

    If some of the genes do not fit a negative binomial distribution
    (mean > var), then the genes are discarded from the analysis.

    Args:
        data (array): genes x cells
        clusters (int): number of mixture components
        R (array, optional): vector of length genes containing the dispersion estimates for each gene. Default: use nb_fit
        init_means (array, optional): initial centers - genes x clusters. Default: kmeans++ initializations
        init_weights (array, optional): initial weights - clusters x cells. Default: random(0,1)
        max_iters (int, optional): maximum number of iterations. Default: 10
        tol (float, optional): if both M and W change by less than tol (in RMSE), then the iteration is stopped. Default: 1e-4
        disp (bool, optional): whether or not to display optimization parameters. Default: True
        inner_max_iters (int, optional): Number of iterations to run in the scipy minimizer for M and W. Default: 400
        normalize (bool, optional): True if the resulting W should sum to 1 for each cell. Default: True.

    Returns:
        M (array): genes x clusters - state centers
        W (array): clusters x cells - state mixing components for each cell
        R (array): 1 x genes - NB dispersion parameter for each gene
        ll (float): Log-likelihood of final iteration
    """
    # TODO: deal with non-NB data... just ignore it? or do something else?
    data_subset = data.copy()
    genes, cells = data_subset.shape
    # 1. use nb_fit to get inital Rs
    if R is None:
        nb_indices = find_nb_genes(data)
        data_subset = data[nb_indices, :]
        if init_means is not None and len(init_means) > sum(nb_indices):
            init_means = init_means[nb_indices, :]
        genes, cells = data_subset.shape
        R = np.zeros(genes)
        P, R = nb_fit(data_subset)
    if init_means is None:
        means, assignments = kmeans_pp(data_subset, clusters)
    else:
        means = init_means.copy()
    clusters = means.shape[1]
    w_init = np.random.random(cells*clusters)
    if init_weights is not None:
        if len(init_weights.shape)==1:
            init_weights = initialize_from_assignments(init_weights, clusters)
        w_init = init_weights.reshape(cells*clusters)
    m_init = means.reshape(genes*clusters)
    ll = np.inf
    # repeat steps 1 and 2 until convergence:
    for i in range(max_iters):
        if disp:
            print('iter: {0}'.format(i))
        w_bounds = [(0, 1.0) for x in w_init]
        m_bounds = [(0, None) for x in m_init]
        # step 1: given M, estimate W
        w_objective, w_deriv = _create_w_objective(means, data_subset, R)
        w_res = minimize(w_objective, w_init, method='L-BFGS-B', jac=w_deriv, bounds=w_bounds, options={'disp':disp, 'maxiter':inner_max_iters})
        w_diff = np.sqrt(np.sum((w_res.x-w_init)**2))/w_init.size
        w_new = w_res.x.reshape((clusters, cells))
        w_init = w_res.x
        # step 2: given W, update M
        m_objective, m_deriv = _create_m_objective(w_new, data_subset, R)
        # method could be 'L-BFGS-B' or 'SLSQP'... SLSQP gives a memory error...
        # or use TNC...
        m_res = minimize(m_objective, m_init, method='L-BFGS-B', jac=m_deriv, bounds=m_bounds, options={'disp':disp, 'maxiter':inner_max_iters})
        m_diff = np.sqrt(np.sum((m_res.x-m_init)**2))/m_init.size
        m_new = m_res.x.reshape((genes, clusters))
        m_init = m_res.x
        ll = m_res.fun
        means = m_new
        if w_diff < tol and m_diff < tol:
            break
    if normalize:
        w_new = w_new/w_new.sum(0)
    return m_new, w_new, R, ll