# poisson clustering
import numpy as np
from scipy import sparse
from .pois_ll import poisson_ll, poisson_dist
eps = 1e-10
[docs]def kmeans_pp(data, k, centers=None):
"""
Generates kmeans++ initial centers.
Args:
data (array): A 2d array- genes x cells
k (int): Number of clusters
centers (array, optional): if provided, these are one or more known cluster centers. 2d array of genes x number of centers (<=k).
Returns:
centers - a genes x k array of cluster means.
assignments - a cells x 1 array of cluster assignments
"""
# TODO: what if there is missing data for a given gene?
# missing data could be if all the entires are -1.
genes, cells = data.shape
if sparse.issparse(data) and not sparse.isspmatrix_csc(data):
data = sparse.csc_matrix(data)
num_known_centers = 0
if centers is None:
centers = np.zeros((genes, k))
else:
num_known_centers = centers.shape[1]
centers = np.concatenate((centers, np.zeros((genes, k-num_known_centers))), 1)
distances = np.zeros((cells, k))
distances[:] = np.inf
if num_known_centers == 0:
init = np.random.randint(0, cells)
if sparse.issparse(data):
centers[:,0] = data[:, init].toarray().flatten()
else:
centers[:,0] = data[:, init]
num_known_centers+=1
available_cells = list(range(cells))
for c in range(num_known_centers, k):
c2 = c-1
# use different formulation for distance... if sparse, use lls
# if not sparse, use poisson_dist
if sparse.issparse(data):
lls = poisson_ll(data, centers[:,c2:c2+1]).flatten()
distances[:,c2] = 1 + lls.max() - lls
distances[:,c2] /= distances[:,c2].max()
else:
for cell in range(cells):
distances[cell, c2] = poisson_dist(data[:,cell], centers[:,c2])
# choose a new data point as center... probability proportional
# to distance^2
min_distances = np.min(distances, 1)
min_distances = min_distances**2
min_distances = min_distances[available_cells]
# should be sampling without replacement
min_dist = np.random.choice(available_cells,
p=min_distances/min_distances.sum())
available_cells.pop(available_cells.index(min_dist))
if sparse.issparse(data):
centers[:,c] = data[:, min_dist].toarray().flatten()
else:
centers[:,c] = data[:, min_dist]
lls = poisson_ll(data, centers)
new_assignments = np.argmax(lls, 1)
centers[centers==0.0] = eps
return centers, new_assignments
[docs]def poisson_cluster(data, k, init=None, max_iters=100):
"""
Performs Poisson hard EM on the given data.
Args:
data (array): A 2d array- genes x cells. Can be dense or sparse; for best performance, sparse matrices should be in CSC format.
k (int): Number of clusters
init (array, optional): Initial centers - genes x k array. Default: None, use kmeans++
max_iters (int, optional): Maximum number of iterations. Default: 100
Returns:
a tuple of two arrays: a cells x 1 vector of cluster assignments,
and a genes x k array of cluster means.
"""
# TODO: be able to use a combination of fixed and unknown starting points
# e.g., have init values only for certain genes, have a row of all
# zeros indicating that kmeans++ should be used for that row.
genes, cells = data.shape
#print 'starting: ', centers
if sparse.issparse(data) and not sparse.isspmatrix_csc(data):
data = sparse.csc_matrix(data)
init, assignments = kmeans_pp(data, k, centers=init)
centers = np.copy(init)
assignments = np.zeros(cells)
for it in range(max_iters):
lls = poisson_ll(data, centers)
#cluster_dists = np.zeros((cells, k))
new_assignments = np.argmax(lls, 1)
if np.equal(assignments, new_assignments).all():
#print 'ending: ', centers
return new_assignments, centers
for c in range(k):
if sparse.issparse(data):
if data[:,new_assignments==c].shape[0]==0:
# re-initialize centers?
new_c, _ = kmeans_pp(data, k, centers[:,:c])
centers[:,c] = new_c[:,c]
else:
centers[:,c] = np.asarray(data[:,new_assignments==c].mean(1)).flatten()
else:
if len(data[:,new_assignments==c])==0:
new_c, _ = kmeans_pp(data, k, centers[:,:c])
centers[:,c] = new_c[:,c]
else:
centers[:,c] = np.mean(data[:,new_assignments==c], 1)
assignments = new_assignments
return assignments, centers