Source code for uncurl.clustering

# poisson clustering

import numpy as np
from scipy import sparse

from .pois_ll import poisson_ll, poisson_dist

eps = 1e-10

[docs]def kmeans_pp(data, k, centers=None):
    """
    Generates kmeans++ initial centers.

    Args:
        data (array): A 2d array- genes x cells
        k (int): Number of clusters
        centers (array, optional): if provided, these are one or more known cluster centers. 2d array of genes x number of centers (<=k).

    Returns:
        centers - a genes x k array of cluster means.
        assignments - a cells x 1 array of cluster assignments
    """
    # TODO: what if there is missing data for a given gene?
    # missing data could be if all the entires are -1.
    genes, cells = data.shape
    if sparse.issparse(data) and not sparse.isspmatrix_csc(data):
        data = sparse.csc_matrix(data)
    num_known_centers = 0
    if centers is None:
        centers = np.zeros((genes, k))
    else:
        num_known_centers = centers.shape[1]
        centers = np.concatenate((centers, np.zeros((genes, k-num_known_centers))), 1)
    distances = np.zeros((cells, k))
    distances[:] = np.inf
    if num_known_centers == 0:
        init = np.random.randint(0, cells)
        if sparse.issparse(data):
            centers[:,0] = data[:, init].toarray().flatten()
        else:
            centers[:,0] = data[:, init]
        num_known_centers+=1
    available_cells = list(range(cells))
    for c in range(num_known_centers, k):
        c2 = c-1
        # use different formulation for distance... if sparse, use lls
        # if not sparse, use poisson_dist
        if sparse.issparse(data):
            lls = poisson_ll(data, centers[:,c2:c2+1]).flatten()
            distances[:,c2] = 1 + lls.max() - lls
            distances[:,c2] /= distances[:,c2].max()
        else:
            for cell in range(cells):
                distances[cell, c2] = poisson_dist(data[:,cell], centers[:,c2])
        # choose a new data point as center... probability proportional
        # to distance^2
        min_distances = np.min(distances, 1)
        min_distances = min_distances**2
        min_distances = min_distances[available_cells]
        # should be sampling without replacement
        min_dist = np.random.choice(available_cells,
                p=min_distances/min_distances.sum())
        available_cells.pop(available_cells.index(min_dist))
        if sparse.issparse(data):
            centers[:,c] = data[:, min_dist].toarray().flatten()
        else:
            centers[:,c] = data[:, min_dist]
    lls = poisson_ll(data, centers)
    new_assignments = np.argmax(lls, 1)
    centers[centers==0.0] = eps
    return centers, new_assignments

[docs]def poisson_cluster(data, k, init=None, max_iters=100):
    """
    Performs Poisson hard EM on the given data.

    Args:
        data (array): A 2d array- genes x cells. Can be dense or sparse; for best performance, sparse matrices should be in CSC format.
        k (int): Number of clusters
        init (array, optional): Initial centers - genes x k array. Default: None, use kmeans++
        max_iters (int, optional): Maximum number of iterations. Default: 100

    Returns:
        a tuple of two arrays: a cells x 1 vector of cluster assignments,
        and a genes x k array of cluster means.
    """
    # TODO: be able to use a combination of fixed and unknown starting points
    # e.g., have init values only for certain genes, have a row of all
    # zeros indicating that kmeans++ should be used for that row.
    genes, cells = data.shape
    #print 'starting: ', centers
    if sparse.issparse(data) and not sparse.isspmatrix_csc(data):
        data = sparse.csc_matrix(data)
    init, assignments = kmeans_pp(data, k, centers=init)
    centers = np.copy(init)
    assignments = np.zeros(cells)
    for it in range(max_iters):
        lls = poisson_ll(data, centers)
        #cluster_dists = np.zeros((cells, k))
        new_assignments = np.argmax(lls, 1)
        if np.equal(assignments, new_assignments).all():
            #print 'ending: ', centers
            return new_assignments, centers
        for c in range(k):
            if sparse.issparse(data):
                if data[:,new_assignments==c].shape[0]==0:
                    # re-initialize centers?
                    new_c, _ = kmeans_pp(data, k, centers[:,:c])
                    centers[:,c] = new_c[:,c]
                else:
                    centers[:,c] = np.asarray(data[:,new_assignments==c].mean(1)).flatten()
            else:
                if len(data[:,new_assignments==c])==0:
                    new_c, _ = kmeans_pp(data, k, centers[:,:c])
                    centers[:,c] = new_c[:,c]
                else:
                    centers[:,c] = np.mean(data[:,new_assignments==c], 1)
        assignments = new_assignments
    return assignments, centers