Source code for uncurl.lineage

# Lineage tracing and pseudotime calculation
import heapq

import numpy as np
from scipy.optimize import curve_fit
from scipy.spatial.distance import pdist, squareform

from .dimensionality_reduction import dim_reduce

[docs]def fourier_series(x, *a): """ Arbitrary dimensionality fourier series. The first parameter is a_0, and the second parameter is the interval/scale parameter. The parameters are altering sin and cos paramters. n = (len(a)-2)/2 """ output = 0 output += a[0]/2 w = a[1] for n in range(2, len(a), 2): n_ = n/2 val1 = a[n] val2 = a[n+1] output += val1*np.sin(n_*x*w) output += val2*np.cos(n_*x*w) return output
[docs]def graph_distances(start, edges, distances): """ Given an undirected adjacency list and a pairwise distance matrix between all nodes: calculates distances along graph from start node. Args: start (int): start node edges (list): adjacency list of tuples distances (array): 2d array of distances between nodes Returns: dict of node to distance from start """ # convert adjacency list to adjacency dict adj = {x: [] for x in range(len(distances))} for n1, n2 in edges: adj[n1].append(n2) adj[n2].append(n1) # run dijkstra's algorithm to_visit = [] new_dist = {} for n in adj[start]: heapq.heappush(to_visit, (distances[start, n], n)) while to_visit: d, next_node = heapq.heappop(to_visit) if next_node not in new_dist: new_dist[next_node] = d for n in adj[next_node]: if n not in new_dist: heapq.heappush(to_visit, (d + distances[next_node, n], n)) return new_dist
[docs]def poly_curve(x, *a): """ Arbitrary dimension polynomial. """ output = 0.0 for n in range(0, len(a)): output += a[n]*x**n return output
[docs]def lineage(means, weights, curve_function='poly', curve_dimensions=6): """ Lineage graph produced by minimum spanning tree Args: means (array): genes x clusters - output of state estimation weights (array): clusters x cells - output of state estimation curve_function (string): either 'poly' or 'fourier'. Default: 'poly' curve_dimensions (int): number of parameters for the curve. Default: 6 Returns: curve parameters: list of lists for each cluster smoothed data in 2d space: 2 x cells list of edges: pairs of cell indices cell cluster assignments: list of ints """ if curve_function=='poly': func = poly_curve elif curve_function=='fourier': func = fourier_series # step 1: dimensionality reduction X = dim_reduce(means, weights, 2) reduced_data = np.dot(X.T, weights) if X.shape[0]==2: reduced_data = np.dot(X, weights) # 2. identifying dominant cell types - max weight for each cell cells = weights.shape[1] clusters = means.shape[1] cell_cluster_assignments = weights.argmax(0) # 3. fit smooth curve over cell types -5th order fourier series # cluster_curves contains the parameters for each curve. cluster_curves = [] # cluster_fitted_vals is a 2 x cells array cluster_fitted_vals = reduced_data.copy() # cluster_edges contain a list of ordered pairs (indices) connecting cells # in each cluster. cluster_edges = [] for c in range(clusters): cluster_cells = reduced_data[:, cell_cluster_assignments==c] if len(cluster_cells) == 0: cluster_edges.append([]) continue if cluster_cells.shape[1] < 2: cluster_edges.append([]) continue elif cluster_cells.shape[1] < curve_dimensions: tc = cluster_cells.shape[1]-1 else: tc = curve_dimensions # y = f(x) if curve_function=='fourier': p0 = [1.0]*tc # scipy is bad at finding the correct scale p0[1] = 0.0001 bounds = (-np.inf, np.inf) else: p0 = [1.0]*tc bounds = (-np.inf, np.inf) p_x, pcov_x = curve_fit(func, cluster_cells[0,:], cluster_cells[1,:], p0=p0, bounds=bounds) perr_x = np.sum(np.sqrt(np.diag(pcov_x))) # x = f(y) p_y, pcov_y = curve_fit(func, cluster_cells[1,:], cluster_cells[0,:], p0=p0, bounds=bounds) perr_y = np.sum(np.sqrt(np.diag(pcov_y))) if perr_x <= perr_y: x_vals = reduced_data[0,:] cluster_curves.append(p_x) y_vals = np.array([func(x, *p_x) for x in x_vals]) #print 'error:', np.sum(np.sqrt((y_vals - reduced_data[1,:])**2)[cell_cluster_assignments==c]) fitted_vals = np.array([x_vals, y_vals]) cluster_fitted_vals[:,cell_cluster_assignments==c] = fitted_vals[:,cell_cluster_assignments==c] # sort points by increasing X, connect points x_indices = np.argsort(x_vals) x_indices = [x for x in x_indices if cell_cluster_assignments[x]==c] new_cluster_edges = [] for i, j in zip(x_indices[:-1], x_indices[1:]): new_cluster_edges.append((i,j)) cluster_edges.append(new_cluster_edges) else: y_vals = reduced_data[1,:] cluster_curves.append(p_y) x_vals = np.array([func(x, *p_y) for x in y_vals]) #print 'error:', np.sum(np.sqrt((x_vals - reduced_data[0,:])**2)[cell_cluster_assignments==c]) fitted_vals = np.array([x_vals, y_vals]) cluster_fitted_vals[:,cell_cluster_assignments==c] = fitted_vals[:,cell_cluster_assignments==c] # sort points by increasing Y, connect points y_indices = np.argsort(y_vals) y_indices = [x for x in y_indices if cell_cluster_assignments[x]==c] new_cluster_edges = [] for i,j in zip(y_indices[:-1], y_indices[1:]): new_cluster_edges.append((i,j)) cluster_edges.append(new_cluster_edges) # 4. connect each cluster together # for each cluster, find the closest point in another cluster, and connect # those points. Add that point to cluster_edges. # build a distance matrix between the reduced points... distances = squareform(pdist(cluster_fitted_vals.T)) for c1 in range(clusters): min_dist = np.inf min_index = None if sum(cell_cluster_assignments==c1)==0: continue for c2 in range(clusters): if sum(cell_cluster_assignments==c2)==0: continue if c1!=c2: distances_c = distances[cell_cluster_assignments==c1,:][:, cell_cluster_assignments==c2] mindex = np.unravel_index(distances_c.argmin(), distances_c.shape) if distances_c[mindex] < min_dist: min_dist = distances_c[mindex] min_index = np.where(distances==min_dist) min_index = (min_index[0][0], min_index[1][0]) cluster_edges[c1].append(min_index) # flatten cluster_edges? cluster_edges = [i for sublist in cluster_edges for i in sublist] return cluster_curves, cluster_fitted_vals, cluster_edges, cell_cluster_assignments
[docs]def pseudotime(starting_node, edges, fitted_vals): """ Args: starting_node (int): index of the starting node edges (list): list of tuples (node1, node2) fitted_vals (array): output of lineage (2 x cells) Returns: A 1d array containing the pseudotime value of each cell. """ # TODO # 1. calculate a distance matrix... distances = np.array([[sum((x - y)**2) for x in fitted_vals.T] for y in fitted_vals.T]) # 2. start from the root node/cell, calculate distance along graph distance_dict = graph_distances(starting_node, edges, distances) output = [] for i in range(fitted_vals.shape[1]): output.append(distance_dict[i]) return np.array(output)