Source code for uncurl.nmf_wrapper

# wrapper for various NMF methods

import numpy as np
from scipy import sparse
from sklearn.decomposition import NMF, non_negative_factorization

from .preprocessing import log1p, cell_normalize
from .state_estimation import initialize_from_assignments

[docs]def nmf_init(data, clusters, k, init='enhanced'):
    """
    Generates initial M and W given a data set and an array of cluster labels.

    There are 3 options for init:
        enhanced - uses EIn-NMF from Gong 2013
        basic - uses means for M, assigns W such that the chosen cluster for a given cell has value 0.75 and all others have 0.25/(k-1).
        nmf - uses means for M, and assigns W using the NMF objective while holding M constant.
    """
    init_m = np.zeros((data.shape[0], k))
    if sparse.issparse(data):
        for i in range(k):
            if data[:,clusters==i].shape[1]==0:
                point = np.random.randint(0, data.shape[1])
                init_m[:,i] = data[:,point].toarray().flatten()
            else:
                init_m[:,i] = np.array(data[:,clusters==i].mean(1)).flatten()
    else:
        for i in range(k):
            if data[:,clusters==i].shape[1]==0:
                point = np.random.randint(0, data.shape[1])
                init_m[:,i] = data[:,point].flatten()
            else:
                init_m[:,i] = data[:,clusters==i].mean(1)
    init_w = np.zeros((k, data.shape[1]))
    if init == 'enhanced':
        distances = np.zeros((k, data.shape[1]))
        for i in range(k):
            for j in range(data.shape[1]):
                distances[i,j] = np.sqrt(((data[:,j] - init_m[:,i])**2).sum())
        for i in range(k):
            for j in range(data.shape[1]):
                init_w[i,j] = 1/((distances[:,j]/distances[i,j])**(-2)).sum()
    elif init == 'basic':
        init_w = initialize_from_assignments(clusters, k)
    elif init == 'nmf':
        init_w_, _, n_iter = non_negative_factorization(data.T, n_components=k, init='custom', update_W=False, W=init_m.T)
        init_w = init_w_.T
    return init_m, init_w

# TODO: initialization if init_w is a cluster list?
[docs]def log_norm_nmf(data, k, normalize_w=True, return_cost=True, init_weights=None, init_means=None, **kwargs):
    """
    Args:
        data (array): dense or sparse array with shape (genes, cells)
        k (int): number of cell types
        normalize_w (bool, optional): True if W should be normalized (so that each column sums to 1). Default: True
        return_cost (bool, optional): True if the NMF objective value (squared error) should be returned. Default: True
        init_weights (array, optional): Initial value for W. Default: None
        init_means (array, optional): Initial value for M. Default: None
        **kwargs: misc arguments to NMF

    Returns:
        Two matrices M of shape (genes, k) and W of shape (k, cells). They correspond to M and M in Poisson state estimation. If return_cost is True (which it is by default), then the cost will also be returned. This might be prohibitably costly
    """
    init = None
    if init_weights is not None or init_means is not None:
        init = 'custom'
        if init_weights is None:
            init_weights_, _, n_iter = non_negative_factorization(data.T, n_components=k, init='custom', update_W=False, W=init_means.T)
            init_weights = init_weights_.T
        elif init_means is None:
            init_means, _, n_iter = non_negative_factorization(data, n_components=k, init='custom', update_W=False, W=init_weights)
    nmf = NMF(k, init=init, **kwargs)
    data = log1p(cell_normalize(data))
    M = nmf.fit_transform(data, W=init_means, H=init_weights)
    W = nmf.components_
    if normalize_w:
        W = W/W.sum(0)
    if return_cost:
        cost = 0
        if sparse.issparse(data):
            ws = sparse.csr_matrix(M)
            hs = sparse.csr_matrix(W)
            cost = 0.5*((data - ws.dot(hs)).power(2)).sum()
        else:
            cost = 0.5*((data - M.dot(W))**2).sum()
        return M, W, cost
    else:
        return M, W

# TODO: initialization
[docs]def norm_nmf(data, k, init_weights=None, init_means=None, normalize_w=True, **kwargs):
    """
    Args:
        data (array): dense or sparse array with shape (genes, cells)
        k (int): number of cell types
        normalize_w (bool): True if W should be normalized (so that each column sums to 1)
        init_weights (array, optional): Initial value for W. Default: None
        init_means (array, optional): Initial value for M. Default: None
        **kwargs: misc arguments to NMF

    Returns:
        Two matrices M of shape (genes, k) and W of shape (k, cells)
    """
    init = None
    if init_weights is not None or init_means is not None:
        init = 'custom'
        if init_weights is None:
            init_weights_, _, n_iter = non_negative_factorization(data.T, n_components=k, init='custom', update_W=False, W=init_means.T)
            init_weights = init_weights_.T
        elif init_means is None:
            init_means, _, n_iter = non_negative_factorization(data, n_components=k, init='custom', update_W=False, W=init_weights)
    nmf = NMF(k, init=init, **kwargs)
    data = log1p(cell_normalize(data))
    M = nmf.fit_transform(data, W=init_means, H=init_weights)
    W = nmf.components_
    if normalize_w:
        W = W/W.sum(0)
    return M, W