# state estimation with poisson convex mixture model
from .clustering import kmeans_pp, poisson_cluster
from uncurl.nolips import nolips_update_w, objective, sparse_objective
from uncurl.nolips import sparse_nolips_update_w
# try to use parallel; otherwise
#from uncurl.nolips_parallel import sparse_nolips_update_w as parallel_sparse_nolips_update_w
try:
from uncurl.nolips_parallel import sparse_nolips_update_w as parallel_sparse_nolips_update_w
except:
print('Warning: cannot import sparse nolips')
# if parallel can't be used, do not use parallel update function...
pass
from .preprocessing import cell_normalize, log1p
import numpy as np
from scipy import sparse
from scipy.optimize import minimize
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.extmath import randomized_svd
eps=1e-8
def _create_w_objective(m, X):
"""
Creates an objective function and its derivative for W, given M and X (data)
Args:
m (array): genes x clusters
X (array): genes x cells
"""
genes, clusters = m.shape
cells = X.shape[1]
m_sum = m.sum(0)
def objective(w):
# convert w into a matrix first... because it's a vector for
# optimization purposes
w = w.reshape((m.shape[1], X.shape[1]))
d = m.dot(w)+eps
# derivative of objective wrt all elements of w
# for w_{ij}, the derivative is... m_j1+...+m_jn sum over genes minus
# x_ij
temp = X/d
m2 = m.T.dot(temp)
deriv = m_sum.reshape((clusters, 1)) - m2
return np.sum(d - X*np.log(d))/genes, deriv.flatten()/genes
return objective
def _create_m_objective(w, X):
"""
Creates an objective function and its derivative for M, given W and X
Args:
w (array): clusters x cells
X (array): genes x cells
"""
clusters, cells = w.shape
genes = X.shape[0]
w_sum = w.sum(1)
def objective(m):
m = m.reshape((X.shape[0], w.shape[0]))
d = m.dot(w)+eps
temp = X/d
w2 = w.dot(temp.T)
deriv = w_sum - w2.T
return np.sum(d - X*np.log(d))/genes, deriv.flatten()/genes
return objective
[docs]def initialize_from_assignments(assignments, k, max_assign_weight=0.75):
"""
Creates a weight initialization matrix from Poisson clustering assignments.
Args:
assignments (array): 1D array of integers, of length cells
k (int): number of states/clusters
max_assign_weight (float, optional): between 0 and 1 - how much weight to assign to the highest cluster. Default: 0.75
Returns:
init_W (array): k x cells
"""
cells = len(assignments)
init_W = np.zeros((k, cells))
for i, a in enumerate(assignments):
# entirely arbitrary... maybe it would be better to scale
# the weights based on k?
init_W[a, i] = max_assign_weight
for a2 in range(k):
if a2!=a:
init_W[a2, i] = (1-max_assign_weight)/(k-1)
return init_W/init_W.sum(0)
[docs]def initialize_means(data, clusters, k):
"""
Initializes the M matrix given the data and a set of cluster labels.
Cluster centers are set to the mean of each cluster.
Args:
data (array): genes x cells
clusters (array): 1d array of ints (0...k-1)
k (int): number of clusters
"""
init_w = np.zeros((data.shape[0], k))
if sparse.issparse(data):
for i in range(k):
if data[:,clusters==i].shape[1]==0:
point = np.random.randint(0, data.shape[1])
init_w[:,i] = data[:,point].toarray().flatten()
else:
# memory usage might be a problem here?
init_w[:,i] = np.array(data[:,clusters==i].mean(1)).flatten() + eps
else:
for i in range(k):
if data[:,clusters==i].shape[1]==0:
point = np.random.randint(0, data.shape[1])
init_w[:,i] = data[:,point].flatten()
else:
init_w[:,i] = data[:,clusters==i].mean(1) + eps
return init_w
[docs]def initialize_weights_nn(data, means, lognorm=True):
"""
Initializes the weights with a nearest-neighbor approach using the means.
"""
# TODO
genes, cells = data.shape
k = means.shape[1]
if lognorm:
data = log1p(cell_normalize(data))
for i in range(cells):
for j in range(k):
pass
def _estimate_w(X, w_init, means, Xsum, update_fn, objective_fn, is_sparse=True, parallel=True, threads=4, method='NoLips', tol=1e-10, disp=False, inner_max_iters=100, output_name='W', regularization=0.0):
clusters, cells = w_init.shape
if method=='NoLips':
nolips_iters = inner_max_iters
m_sum = means.sum(0)
lams = 1/(2*Xsum)
for j in range(nolips_iters):
if is_sparse and parallel:
w_new = update_fn(X.data, X.indices, X.indptr, X.shape[1], X.shape[0], means, w_init, lams, m_sum, n_threads=threads, regularization=regularization)
else:
w_new = update_fn(X, means, w_init, Xsum)
#w_new = w_res.x.reshape((clusters, cells))
#w_new = w_new/w_new.sum(0)
if tol > 0:
w_diff = np.sqrt(np.sum((w_new - w_init)**2)/(clusters*cells))
if disp:
print('inner iter {0}: w_diff={1}'.format(j, w_diff))
w_init = w_new
if w_diff < tol:
break
else:
w_init = w_new
elif method=='L-BFGS-B':
w_objective = _create_w_objective(means, X)
w_bounds = [(0, None) for c in range(clusters*cells)]
w_res = minimize(w_objective, w_init.flatten(),
method='L-BFGS-B', jac=True, bounds=w_bounds,
options={'disp':disp, 'maxiter':inner_max_iters})
w_new = w_res.x.reshape((clusters, cells))
w_init = w_new
return w_new
def _call_sparse_obj(X, M, W):
return sparse_objective(X.data, X.indices, X.indptr, X.shape[1], X.shape[0],
M, W)
[docs]def initialize_means_weights(data, clusters, init_means=None, init_weights=None, initialization='tsvd', max_assign_weight=0.75):
"""
Generates initial means and weights for state estimation.
"""
genes, cells = data.shape
if init_means is None:
if init_weights is not None:
if len(init_weights.shape)==1:
means = initialize_means(data, init_weights, clusters)
else:
means = initialize_means(data, init_weights.argmax(0),
clusters, max_assign_weight=max_assign_weight)
elif initialization=='cluster':
assignments, means = poisson_cluster(data, clusters)
if init_weights is None:
init_weights = initialize_from_assignments(assignments, clusters,
max_assign_weight=max_assign_weight)
elif initialization=='kmpp':
means, assignments = kmeans_pp(data, clusters)
elif initialization=='km':
km = KMeans(clusters)
assignments = km.fit_predict(log1p(cell_normalize(data)).T)
init_weights = initialize_from_assignments(assignments, clusters,
max_assign_weight)
means = initialize_means(data, assignments, clusters)
elif initialization=='tsvd':
n_components = min(50, genes-1)
#tsvd = TruncatedSVD(min(50, genes-1))
km = KMeans(clusters)
# remove dependence on sklearn tsvd b/c it has a bug that
# prevents it from working properly on long inputs
# if num elements > 2**31
#data_reduced = tsvd.fit_transform(log1p(cell_normalize(data)).T)
U, Sigma, VT = randomized_svd(log1p(cell_normalize(data)).T,
n_components)
data_reduced = U*Sigma
assignments = km.fit_predict(data_reduced)
init_weights = initialize_from_assignments(assignments, clusters,
max_assign_weight)
means = initialize_means(data, assignments, clusters)
else:
means = init_means.copy()
means = means.astype(float)
if init_weights is None:
if init_means is not None:
if initialization == 'cluster':
assignments, means = poisson_cluster(data, clusters,
init=init_means, max_iters=1)
w_init = initialize_from_assignments(assignments, clusters,
max_assign_weight)
elif initialization == 'km':
km = KMeans(clusters, init=log1p(init_means.T), max_iter=1)
assignments = km.fit_predict(log1p(cell_normalize(data)).T)
w_init = initialize_from_assignments(assignments, clusters,
max_assign_weight)
else:
w_init = np.random.random((clusters, cells))
w_init = w_init/w_init.sum(0)
else:
w_init = np.random.random((clusters, cells))
w_init = w_init/w_init.sum(0)
else:
if len(init_weights.shape)==1:
init_weights = initialize_from_assignments(init_weights, clusters,
max_assign_weight)
w_init = init_weights.copy()
return means, w_init
[docs]def poisson_estimate_state(data, clusters, init_means=None, init_weights=None, method='NoLips', max_iters=30, tol=1e-10, disp=False, inner_max_iters=100, normalize=True, initialization='tsvd', parallel=True, threads=4, max_assign_weight=0.75, run_w_first=True, constrain_w=False, regularization=0.0):
"""
Uses a Poisson Covex Mixture model to estimate cell states and
cell state mixing weights.
To lower computational costs, use a sparse matrix, set disp to False, and set tol to 0.
Args:
data (array): genes x cells array or sparse matrix.
clusters (int): number of mixture components
init_means (array, optional): initial centers - genes x clusters. Default: from Poisson kmeans
init_weights (array, optional): initial weights - clusters x cells, or assignments as produced by clustering. Default: from Poisson kmeans
method (str, optional): optimization method. Current options are 'NoLips' and 'L-BFGS-B'. Default: 'NoLips'.
max_iters (int, optional): maximum number of iterations. Default: 30
tol (float, optional): if both M and W change by less than tol (RMSE), then the iteration is stopped. Default: 1e-10
disp (bool, optional): whether or not to display optimization progress. Default: False
inner_max_iters (int, optional): Number of iterations to run in the optimization subroutine for M and W. Default: 100
normalize (bool, optional): True if the resulting W should sum to 1 for each cell. Default: True.
initialization (str, optional): If initial means and weights are not provided, this describes how they are initialized. Options: 'cluster' (poisson cluster for means and weights), 'kmpp' (kmeans++ for means, random weights), 'km' (regular k-means), 'tsvd' (tsvd(50) + k-means). Default: tsvd.
parallel (bool, optional): Whether to use parallel updates (sparse NoLips only). Default: True
threads (int, optional): How many threads to use in the parallel computation. Default: 4
max_assign_weight (float, optional): If using a clustering-based initialization, how much weight to assign to the max weight cluster. Default: 0.75
run_w_first (bool, optional): Whether or not to optimize W first (if false, M will be optimized first). Default: True
constrain_w (bool, optional): If True, then W is normalized after every iteration. Default: False
regularization (float, optional): Regularization coefficient for M and W. Default: 0 (no regularization).
Returns:
M (array): genes x clusters - state means
W (array): clusters x cells - state mixing components for each cell
ll (float): final log-likelihood
"""
genes, cells = data.shape
means, w_init = initialize_means_weights(data, clusters, init_means, init_weights, initialization, max_assign_weight)
X = data.astype(float)
XT = X.T
is_sparse = False
if sparse.issparse(X):
is_sparse = True
update_fn = sparse_nolips_update_w
# convert to csc
X = sparse.csc_matrix(X)
XT = sparse.csc_matrix(XT)
if parallel:
update_fn = parallel_sparse_nolips_update_w
Xsum = np.asarray(X.sum(0)).flatten()
Xsum_m = np.asarray(X.sum(1)).flatten()
# L-BFGS-B won't work right now for sparse matrices
method = 'NoLips'
objective_fn = _call_sparse_obj
else:
objective_fn = objective
update_fn = nolips_update_w
Xsum = X.sum(0)
Xsum_m = X.sum(1)
# If method is NoLips, converting to a sparse matrix
# will always improve the performance (?) and never lower accuracy...
# will almost always improve performance?
# if sparsity is below 40%?
if method == 'NoLips':
if np.count_nonzero(X) < 0.4*genes*cells:
is_sparse = True
X = sparse.csc_matrix(X)
XT = sparse.csc_matrix(XT)
update_fn = sparse_nolips_update_w
if parallel:
update_fn = parallel_sparse_nolips_update_w
objective_fn = _call_sparse_obj
w_new = w_init
for i in range(max_iters):
# TODO: write progress
if disp:
print('iter: {0}'.format(i))
if run_w_first:
# step 1: given M, estimate W
w_new = _estimate_w(X, w_new, means, Xsum, update_fn, objective_fn, is_sparse, parallel, threads, method, tol, disp, inner_max_iters, 'W', regularization)
if constrain_w:
w_new = w_new/w_new.sum(0)
if disp:
w_ll = objective_fn(X, means, w_new)
print('Finished updating W. Objective value: {0}'.format(w_ll))
# step 2: given W, update M
means = _estimate_w(XT, means.T, w_new.T, Xsum_m, update_fn, objective_fn, is_sparse, parallel, threads, method, tol, disp, inner_max_iters, 'M', regularization)
means = means.T
if disp:
w_ll = objective_fn(X, means, w_new)
print('Finished updating M. Objective value: {0}'.format(w_ll))
else:
# step 1: given W, update M
means = _estimate_w(XT, means.T, w_new.T, Xsum_m, update_fn, objective_fn, is_sparse, parallel, threads, method, tol, disp, inner_max_iters, 'M', regularization)
means = means.T
if disp:
w_ll = objective_fn(X, means, w_new)
print('Finished updating M. Objective value: {0}'.format(w_ll))
# step 2: given M, estimate W
w_new = _estimate_w(X, w_new, means, Xsum, update_fn, objective_fn, is_sparse, parallel, threads, method, tol, disp, inner_max_iters, 'W', regularization)
if constrain_w:
w_new = w_new/w_new.sum(0)
if disp:
w_ll = objective_fn(X, means, w_new)
print('Finished updating W. Objective value: {0}'.format(w_ll))
if normalize:
w_new = w_new/w_new.sum(0)
m_ll = objective_fn(X, means, w_new)
return means, w_new, m_ll