# Source code for uncurl.evaluation (clustering evaluation metrics).
from collections import Counter
import numpy as np
from sklearn.neighbors import BallTree
def purity(labels, true_labels):
    """
    Calculates the purity score for the given labels.

    Args:
        labels (array): 1D array of integers - cluster assignments
        true_labels (array): 1D array of integers - true labels

    Returns:
        purity score - a float between 0 and 1. Closer to 1 is better.
    """
    total_matched = 0.0
    for i in set(labels):
        # boolean mask selecting the cells assigned to cluster i
        # (assumes labels/true_labels are numpy arrays so that == broadcasts)
        indices = (labels == i)
        true_clusters = true_labels[indices]
        if len(true_clusters) == 0:
            continue
        # credit the cluster with the size of its dominant true label
        counts = Counter(true_clusters)
        _, count = counts.most_common(1)[0]
        total_matched += count
    return float(total_matched) / len(labels)
def nne(dim_red, true_labels):
    """
    Calculates the nearest neighbor accuracy (basically leave-one-out cross
    validation with a 1NN classifier).

    Args:
        dim_red (array): dimensions (k, cells)
        true_labels (array): 1d array of integers

    Returns:
        Nearest neighbor accuracy - fraction of points for which the 1NN
        classifier returns the correct value.
    """
    true_labels = np.asarray(true_labels)
    # use sklearn's BallTree; rows = cells after transposing
    bt = BallTree(dim_red.T)
    # single batched query; k=2 because the nearest hit of each point is
    # (normally) the point itself, so column 1 is its nearest neighbor.
    # NOTE(review): with exact duplicate points the self-hit may land in
    # column 1 instead - same caveat as the original per-point loop.
    _, ind = bt.query(dim_red.T, k=2)
    nearest = ind[:, 1]
    correct = int((true_labels[nearest] == true_labels).sum())
    return float(correct) / len(true_labels)
def mdl(ll, k, data):
    """
    Returns the minimum description length score of the model given its
    log-likelihood and k, the number of cell types. A lower cost is better.

    Derivation (N = no. of genes, m = no. of cells, k = no. of cell types,
    R = data.sum() i.e. total no. of reads):

        TotCost = C + (N*m + m*k) * log(R / (N*k))

    where C is the cost from the cost function (here the log-likelihood ll).

    Args:
        ll (float): log-likelihood / base cost of the model
        k (int): number of cell types
        data (array): genes x cells count matrix

    Returns:
        float: MDL cost (lower is better)
    """
    N, m = data.shape
    # description length = base cost + bits to encode the model parameters
    cost = ll + (N*m + m*k)*(np.log(data.sum()/(N*k)))
    return cost