# Source code for uncurl.evaluation

from collections import Counter

import numpy as np
from sklearn.neighbors import BallTree

def purity(labels, true_labels):
    """
    Calculates the purity score for the given labels.

    For every distinct predicted label, the most frequent true label among
    the points assigned to it is found; the sizes of those majority groups
    are summed and divided by the total number of points.

    Args:
        labels (array-like): 1D sequence of predicted cluster labels.
        true_labels (array-like): 1D sequence of true labels, aligned
            element-for-element with ``labels``.

    Returns:
        purity score - a float between 0 and 1. Closer to 1 is better.

    Raises:
        ZeroDivisionError: if ``labels`` is empty.
    """
    # Coerce to arrays so that plain lists/tuples work: the boolean mask
    # below relies on elementwise comparison, which Python lists lack.
    labels = np.asarray(labels)
    true_labels = np.asarray(true_labels)

    majority_total = 0
    for cluster in np.unique(labels):
        members = true_labels[labels == cluster]
        # Only the size of the dominant true class matters, not its identity.
        majority_total += Counter(members.tolist()).most_common(1)[0][1]
    return float(majority_total) / len(labels)
def nne(dim_red, true_labels):
    """
    Calculates the nearest neighbor accuracy (basically leave-one-out cross
    validation with a 1NN classifier).

    Args:
        dim_red (array): dimensions (k, cells)
        true_labels (array): 1d array of integers, one entry per cell
            (i.e. per column of ``dim_red``).

    Returns:
        Nearest neighbor accuracy - fraction of points for which the
        1NN classifier returns the correct value.
    """
    # BallTree expects one sample per row, hence the transpose.
    points = np.asarray(dim_red).T
    true_labels = np.asarray(true_labels)

    # use sklearn's BallTree; query every cell at once instead of one
    # Python-level call per cell.
    bt = BallTree(points)
    _, ind = bt.query(points, k=2)

    # Each cell is its own zero-distance neighbour. Normally it occupies
    # column 0, but when another cell has identical coordinates the tie can
    # put the cell itself in column 1. Pick whichever of the two neighbours
    # is not the query cell so a point is never scored against its own label.
    own_index = np.arange(points.shape[0])
    closest_cell = np.where(ind[:, 1] == own_index, ind[:, 0], ind[:, 1])

    correct = np.count_nonzero(true_labels[closest_cell] == true_labels)
    return float(correct) / len(true_labels)
def mdl(ll, k, data):
    """
    Compute the minimum description length score of a model.

    The score is ``ll`` plus a penalty term derived from ``k`` and from the
    shape and total sum of ``data``. A lower cost is better.

    Args:
        ll: cost of the fitted model (described in the original notes as
            its log-likelihood / the value of the cost function).
        k: the number of cell types.
        data: 2D array; its shape is unpacked as ``(N, m)`` and its
            entries are summed.

    Returns:
        The total cost ``ll + (N*m + m*k) * log(data.sum() / (N*k))``.
    """
    # Notes carried over from the original implementation:
    #   N - no. of genes
    #   n - no. of cells
    #   k - no. of cell types
    #   R - sum(Dataset) i.e. total no. of reads
    #
    #   function TotCost = TotBits(N,m,p,R,C)
    #   # C is the cost from the cost function
    #   TotCost = C + (N*m + m*p)*(log(R/(N*p)));
    N, m = data.shape
    total_reads = data.sum()
    weight = N*m + m*k
    log_term = np.log(total_reads/(N*k))
    return ll + weight*log_term