# wrapper for various NMF methods
import numpy as np
from scipy import sparse
from sklearn.decomposition import NMF, non_negative_factorization
from .preprocessing import log1p, cell_normalize
from .state_estimation import initialize_from_assignments
[docs]def nmf_init(data, clusters, k, init='enhanced'):
"""
Generates initial M and W given a data set and an array of cluster labels.
There are 3 options for init:
enhanced - uses EIn-NMF from Gong 2013
basic - uses means for M, assigns W such that the chosen cluster for a given cell has value 0.75 and all others have 0.25/(k-1).
nmf - uses means for M, and assigns W using the NMF objective while holding M constant.
"""
init_m = np.zeros((data.shape[0], k))
if sparse.issparse(data):
for i in range(k):
if data[:,clusters==i].shape[1]==0:
point = np.random.randint(0, data.shape[1])
init_m[:,i] = data[:,point].toarray().flatten()
else:
init_m[:,i] = np.array(data[:,clusters==i].mean(1)).flatten()
else:
for i in range(k):
if data[:,clusters==i].shape[1]==0:
point = np.random.randint(0, data.shape[1])
init_m[:,i] = data[:,point].flatten()
else:
init_m[:,i] = data[:,clusters==i].mean(1)
init_w = np.zeros((k, data.shape[1]))
if init == 'enhanced':
distances = np.zeros((k, data.shape[1]))
for i in range(k):
for j in range(data.shape[1]):
distances[i,j] = np.sqrt(((data[:,j] - init_m[:,i])**2).sum())
for i in range(k):
for j in range(data.shape[1]):
init_w[i,j] = 1/((distances[:,j]/distances[i,j])**(-2)).sum()
elif init == 'basic':
init_w = initialize_from_assignments(clusters, k)
elif init == 'nmf':
init_w_, _, n_iter = non_negative_factorization(data.T, n_components=k, init='custom', update_W=False, W=init_m.T)
init_w = init_w_.T
return init_m, init_w
# TODO: initialization if init_w is a cluster list?
[docs]def log_norm_nmf(data, k, normalize_w=True, return_cost=True, init_weights=None, init_means=None, **kwargs):
"""
Args:
data (array): dense or sparse array with shape (genes, cells)
k (int): number of cell types
normalize_w (bool, optional): True if W should be normalized (so that each column sums to 1). Default: True
return_cost (bool, optional): True if the NMF objective value (squared error) should be returned. Default: True
init_weights (array, optional): Initial value for W. Default: None
init_means (array, optional): Initial value for M. Default: None
**kwargs: misc arguments to NMF
Returns:
Two matrices M of shape (genes, k) and W of shape (k, cells). They correspond to M and M in Poisson state estimation. If return_cost is True (which it is by default), then the cost will also be returned. This might be prohibitably costly
"""
init = None
if init_weights is not None or init_means is not None:
init = 'custom'
if init_weights is None:
init_weights_, _, n_iter = non_negative_factorization(data.T, n_components=k, init='custom', update_W=False, W=init_means.T)
init_weights = init_weights_.T
elif init_means is None:
init_means, _, n_iter = non_negative_factorization(data, n_components=k, init='custom', update_W=False, W=init_weights)
nmf = NMF(k, init=init, **kwargs)
data = log1p(cell_normalize(data))
M = nmf.fit_transform(data, W=init_means, H=init_weights)
W = nmf.components_
if normalize_w:
W = W/W.sum(0)
if return_cost:
cost = 0
if sparse.issparse(data):
ws = sparse.csr_matrix(M)
hs = sparse.csr_matrix(W)
cost = 0.5*((data - ws.dot(hs)).power(2)).sum()
else:
cost = 0.5*((data - M.dot(W))**2).sum()
return M, W, cost
else:
return M, W
# TODO: initialization
[docs]def norm_nmf(data, k, init_weights=None, init_means=None, normalize_w=True, **kwargs):
"""
Args:
data (array): dense or sparse array with shape (genes, cells)
k (int): number of cell types
normalize_w (bool): True if W should be normalized (so that each column sums to 1)
init_weights (array, optional): Initial value for W. Default: None
init_means (array, optional): Initial value for M. Default: None
**kwargs: misc arguments to NMF
Returns:
Two matrices M of shape (genes, k) and W of shape (k, cells)
"""
init = None
if init_weights is not None or init_means is not None:
init = 'custom'
if init_weights is None:
init_weights_, _, n_iter = non_negative_factorization(data.T, n_components=k, init='custom', update_W=False, W=init_means.T)
init_weights = init_weights_.T
elif init_means is None:
init_means, _, n_iter = non_negative_factorization(data, n_components=k, init='custom', update_W=False, W=init_weights)
nmf = NMF(k, init=init, **kwargs)
data = log1p(cell_normalize(data))
M = nmf.fit_transform(data, W=init_means, H=init_weights)
W = nmf.components_
if normalize_w:
W = W/W.sum(0)
return M, W