Source code for uncurl.experiment_runner

# general framework for running purity experiments on 10x dataset

# two steps: dimensionality reduction/preprocessing, and clustering
# preprocessing -> dim_red -> clustering?
# ex. uncurl_mw -> tsne -> km???

# preprocessing returns a matrix of shape (k, cells), where k <= genes
from __future__ import print_function

import time
import sys

try:
    # optional dependencies...
    import matplotlib.pyplot as plt
except:
    pass

import numpy as np
from scipy import sparse
from scipy.special import log1p

from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
from sklearn.metrics.cluster import adjusted_rand_score as ari

from sklearn.decomposition import NMF, TruncatedSVD, PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN
from sklearn.cluster.bicluster import SpectralBiclustering, SpectralCoclustering

try:
    import Cluster_Ensembles as CE
except:
    pass

# optional dependencies?
try:
    import SIMLR
    from ZIFA import ZIFA
    from ZIFA import block_ZIFA
except:
    # optional dependencies?
    pass

# magic (requires python 3)
try:
    import pandas as pd
    import magic
except:
    pass

from .state_estimation import poisson_estimate_state
from .dimensionality_reduction import dim_reduce
from .evaluation import purity, nne
from .preprocessing import cell_normalize
try:
    from . import ensemble
    from .ensemble import nmf_ensemble, nmf_kfold, nmf_tsne, poisson_se_tsne, poisson_se_multiclust, lightlda_se_tsne
except:
    print('unable to import ensemble methods.')
from .clustering import poisson_cluster
from .lightlda_utils import lightlda_estimate_state
from .plda_utils import plda_estimate_state
from .vis import visualize_dim_red

from uncurl.sparse_utils import symmetric_kld, jensen_shannon


[docs]class Preprocess(object):
    """
    Pre-processing methods take in a genes x cells data matrix of integer
    counts, and return a k x cells matrix, where k <= genes.

    Preprocessing methods can return multiple outputs. the outputs are

    If k=2, then the method can be used for visualization...

    This class represents a 'blank' preprocessing.
    """

    def __init__(self, **params):
        if 'output_names' in params:
            self.output_names = params['output_names']
            params.pop('output_names')
        self.params = params
        if not hasattr(self, 'output_names'):
            self.output_names = ['Data']

[docs]    def run(self, data):
        """
        should return a list of output matrices of the same length
        as self.output_names, and an objective value.

        data is of shape (genes, cells).
        """
        return [data], 0

[docs]class Log(Preprocess):
    """
    Takes the natural log of the data+1.
    """

    def __init__(self, **params):
        self.output_names = ['LogData']
        super(Log, self).__init__(**params)

[docs]    def run(self, data):
        if sparse.issparse(data):
            return [data.log1p()], 0
        else:
            return [np.log1p(data)], 0

[docs]class LogNorm(Preprocess):
    """
    First, normalizes the counts per cell, and then takes log(normalized_counts+1).
    """

    def __init__(self, **params):
        self.output_names = ['LogNorm']
        super(LogNorm, self).__init__(**params)

[docs]    def run(self, data):
        data_norm = cell_normalize(data)
        if sparse.issparse(data_norm):
            return [data_norm.log1p()], 0
        else:
            return [np.log1p(data_norm)], 0

[docs]class TSVD(Preprocess):
    """
    Runs truncated SVD on the data. the input param k is the number of
    dimensions.
    """

    def __init__(self, **params):
        self.output_names = ['TSVD']
        self.tsvd = TruncatedSVD(params['k'])
        super(TSVD, self).__init__(**params)

[docs]    def run(self, data):
        return [self.tsvd.fit_transform(data.T).T], 0

[docs]class Tsne(Preprocess):
    """
    2d tsne dimensionality reduction - tsne always uses 2d

    metric is a string that could be any metric usable with tsne, or 'kld'
    or 'jensen-shannon'
    """

    def __init__(self, metric='euclidean', **params):
        """
        metric (str) can be any metric usable with tsne.
        """
        self.output_names = ['TSNE']
        if metric != 'euclidean':
            self.output_names = ['TSNE_' + metric]
        if metric=='kld':
            metric = symmetric_kld
        elif metric == 'jensen-shannon':
            metric = jensen_shannon
        self.tsne = TSNE(2, metric=metric)
        super(Tsne, self).__init__(**params)

[docs]    def run(self, data):
        if sparse.issparse(data):
            data = data.toarray()
        return [self.tsne.fit_transform(data.T).T], 0

[docs]class Pca(Preprocess):
    """
    PCA preprocessing
    """

    def __init__(self, **params):
        self.output_names = ['PCA']
        self.pca = PCA(params['k'])
        super(Pca, self).__init__(**params)

[docs]    def run(self, data):
        if sparse.issparse(data):
            data = data.toarray()
        return [self.pca.fit_transform(data.T).T], 0

[docs]class Zifa(Preprocess):
    """
    ZIFA preprocessing
    """

    def __init__(self, **params):
        self.output_names = ['ZIFA']
        self.k = params['k']
        super(Zifa, self).__init__(**params)

[docs]    def run(self, data):
        if sparse.issparse(data):
            data = data.toarray()
        data = np.log1p(data)
        Z, model_params = block_ZIFA.fitModel(data.T, self.k)
        return [Z.T], 0


[docs]class PoissonSE(Preprocess):
    """
    Runs Poisson State Estimation, returning W and MW.

    Requires a 'k' parameter.

    Optional args: return_m=True: returns M in outputs
    return_mw=True: returns MW in outputs
    """

    def __init__(self, return_w=True, return_m=False, return_mw=False,
            return_mds=False, normalize_data=False, **params):
        self.output_names = []
        self.return_w = return_w
        if return_w:
            self.output_names.append('Poisson_W')
        self.return_m = return_m
        if return_m:
            self.output_names.append('Poisson_M')
        self.return_mw = return_mw
        if return_mw:
            self.output_names.append('Poisson_MW')
        self.return_mds = return_mds
        if return_mds:
            self.output_names.append('Poisson_MDS')
        self.normalize_data = normalize_data
        super(PoissonSE, self).__init__(**params)

[docs]    def run(self, data):
        """
        Returns:
            list of W, M*W
            ll
        """
        if self.normalize_data:
            data = cell_normalize(data)
        M, W, ll = poisson_estimate_state(data, **self.params)
        outputs = []
        if self.return_w:
            outputs.append(W)
        if self.return_m:
            outputs.append(M)
        if self.return_mw:
            outputs.append(M.dot(W))
        if self.return_mds:
            X = dim_reduce(M, W, 2)
            outputs.append(X.T.dot(W))
        return outputs, ll


[docs]class LightLDASE(Preprocess):
    """
    Runs LightLDA State Estimation, returning W and MW.
    Requires a 'k' parameter.
    """

    def __init__(self, **params):
        self.output_names = ['LightLDA_W']
        self.return_w = True
        self.return_m = False
        self.return_mw = False
        self.return_mds = False
        if 'return_mw' in params and params['return_mw']:
            self.output_names.append('LightLDA_MW')
            self.return_mw = True
            params.pop('return_mw')
        if 'return_mds' in params and params['return_mds']:
            self.output_names.append('LightLDA_MDS')
            self.return_mds = True
            params.pop('return_mds')
        super(LightLDASE, self).__init__(**params)

[docs]    def run(self, data):
        M, W, ll = lightlda_estimate_state(data, **self.params)
        output = [W]
        if self.return_mw:
            output.append(M.dot(W))
        if self.return_mds:
            X = dim_reduce(M, W, 2)
            output.append(X.T.dot(W))
        return output, ll

[docs]class PLDASE(Preprocess):
    """
    Runs PLDA State Estimation, returning W and MW.
    Requires a 'k' parameter.
    """

    def __init__(self, **params):
        self.output_names = ['PLDA_W']
        if 'return_mw' in params and params['return_mw']:
            self.output_names.append('PLDA_MW')
            self.return_mw = True
            params.pop('return_mw')
        if 'return_mds' in params and params['return_mds']:
            self.output_names.append('PLDA_MDS')
            self.return_mds = True
            params.pop('return_mds')
        super(PLDASE, self).__init__(**params)

[docs]    def run(self, data):
        M, W = plda_estimate_state(data, **self.params)
        output = [W]
        if self.return_mw:
            output.append(M.dot(W))
        if self.return_mds:
            X = dim_reduce(M, W, 2)
            output.append(X.T.dot(W))
        return output, 0



[docs]class EnsembleTsneLightLDASE(Preprocess):
    """
    Runs tsne-based LightLDA Poisson state estimation
    """

    def __init__(self, **params):
        self.output_names = ['LightLDA_Ensemble_W']
        if 'return_mw' in params and params['return_mw']:
            self.output_names.append('LightLDA_Ensemble_MW')
            self.return_mw = True
            params.pop('return_mw')
        if 'return_mds' in params and params['return_mds']:
            self.output_names.append('LightLDA_Ensemble_MDS')
            self.return_mds = True
            params.pop('return_mds')
        super(EnsembleTsneLightLDASE, self).__init__(**params)

[docs]    def run(self, data):
        M, W, ll = lightlda_se_tsne(data, **self.params)
        output = [W]
        if self.return_mw:
            output.append(M.dot(W))
        if self.return_mds:
            X = dim_reduce(M, W, 2)
            output.append(X.T.dot(W))
        return output, ll


[docs]class LogNMF(Preprocess):
    """
    Runs NMF on log(normalize(data)+1), returning H and W*H.

    Requires a 'k' parameter, which is the rank of the matrices.
    """

    def __init__(self, return_h=True, return_w=False, return_mds=False, return_wh=False, **params):
        super(LogNMF, self).__init__(**params)
        self.output_names = []
        self.nmf = NMF(params['k'])
        self.return_h = return_h
        if return_h:
            self.output_names.append('logNMF_H')
        self.return_w = return_w
        if return_w:
            self.output_names.append('logNMF_W')
        self.return_mds = return_mds
        if return_mds:
            self.output_names.append('logNMF_MDS')
        self.return_wh = return_wh
        if return_wh:
            self.output_names.append('logNMF_WH')

[docs]    def run(self, data):
        data_norm = cell_normalize(data)
        if sparse.issparse(data_norm):
            data_norm = data_norm.log1p()
        else:
            data_norm = log1p(data_norm)
        W = self.nmf.fit_transform(data_norm)
        H = self.nmf.components_
        if sparse.issparse(data_norm):
            cost = 0
            #ws = sparse.csr_matrix(W)
            #hs = sparse.csr_matrix(H)
            #cost = 0.5*((data_norm - ws.dot(hs)).power(2)).sum()
        else:
            cost = 0.5*((data_norm - W.dot(H))**2).sum()
        if 'normalize_h' in self.params:
            print('normalize h')
            H = H/H.sum(0)
        output = []
        if self.return_h:
            output.append(H)
        if self.return_w:
            output.append(W)
        if self.return_wh:
            output.append(W.dot(H))
        if self.return_mds:
            X = dim_reduce(W, H, 2)
            output.append(X.T.dot(H))
        return output, cost

[docs]class BasicNMF(Preprocess):
    """
    Runs NMF on data, returning H and W*H.

    Requires a 'k' parameter, which is the rank of the matrices.
    """

    def __init__(self, return_h=True, return_w=False, return_mds=False, return_wh=False, **params):
        super(BasicNMF, self).__init__(**params)
        self.nmf = NMF(params['k'])
        self.output_names = []
        self.return_h = return_h
        if return_h:
            self.output_names.append('NMF_H')
        self.return_w = return_w
        if return_w:
            self.output_names.append('NMF_W')
        self.return_mds = return_mds
        if return_mds:
            self.output_names.append('NMF_MDS')
        self.return_wh = return_wh
        if return_wh:
            self.output_names.append('NMF_WH')

[docs]    def run(self, data):
        data_norm = cell_normalize(data)
        W = self.nmf.fit_transform(data_norm)
        H = self.nmf.components_
        if sparse.issparse(data):
            ws = sparse.csr_matrix(W)
            hs = sparse.csr_matrix(H)
            cost = 0.5*((data - ws.dot(hs)).power(2)).sum()
        else:
            cost = 0.5*((data - W.dot(H))**2).sum()
        if 'normalize_h' in self.params:
            H = H/H.sum(0)
        output = []
        if self.return_h:
            output.append(H)
        if self.return_w:
            output.append(W)
        if self.return_wh:
            output.append(W.dot(H))
        if self.return_mds:
            X = dim_reduce(W, H, 2)
            output.append(X.T.dot(H))
        return output, cost

[docs]class EnsembleNMF(Preprocess):
    """
    Runs Ensemble NMF on log(data+1), returning the consensus
    results for H and W*H.

    Requires a 'k' parameter, which is the rank of the matrices.
    """

    def __init__(self, **params):
        self.output_names = ['H', 'WH']
        super(EnsembleNMF, self).__init__(**params)

[docs]    def run(self, data):
        data_norm = cell_normalize(data)
        if sparse.issparse(data_norm):
            data_norm = data_norm.log1p()
        else:
            data_norm = log1p(data_norm)
        W, H = nmf_ensemble(data_norm, **self.params)
        if sparse.issparse(data_norm):
            ws = sparse.csr_matrix(W)
            hs = sparse.csr_matrix(H)
            cost = 0.5*((data_norm - ws.dot(hs)).power(2)).sum()
        else:
            cost = 0.5*((data_norm - W.dot(H))**2).sum()
        return [H, W.dot(H)], cost

[docs]class KFoldNMF(Preprocess):
    """
    Runs K-fold ensemble NMF on log(data+1), returning the consensus
    results for H and W*H.

    Requires a 'k' parameter, which is the rank of the matrices.
    """

    def __init__(self, **params):
        self.output_names = ['H', 'WH']
        super(KFoldNMF, self).__init__(**params)

[docs]    def run(self, data):
        data_norm = cell_normalize(data)
        if sparse.issparse(data_norm):
            data_norm = data_norm.log1p()
        else:
            data_norm = log1p(data_norm)
        W, H = nmf_kfold(data_norm, **self.params)
        if sparse.issparse(data_norm):
            ws = sparse.csr_matrix(W)
            hs = sparse.csr_matrix(H)
            cost = 0.5*((data_norm - ws.dot(hs)).power(2)).sum()
        else:
            cost = 0.5*((data_norm - W.dot(H))**2).sum()
        return [H, W.dot(H)], cost

[docs]class EnsembleTsneNMF(Preprocess):
    """
    Runs tsne-based ensemble NMF
    """

    def __init__(self, **params):
        self.output_names = ['Ensemble_NMF_H', 'Ensemble_NMF_WH']
        super(EnsembleTsneNMF, self).__init__(**params)

[docs]    def run(self, data):
        data_norm = cell_normalize(data)
        if sparse.issparse(data_norm):
            data_norm = data_norm.log1p()
        else:
            data_norm = log1p(data_norm)
        W, H = nmf_tsne(data_norm, **self.params)
        if sparse.issparse(data_norm):
            ws = sparse.csr_matrix(W)
            hs = sparse.csr_matrix(H)
            cost = 0.5*((data_norm - ws.dot(hs)).power(2)).sum()
        else:
            cost = 0.5*((data_norm - W.dot(H))**2).sum()
        return [H, W.dot(H)], cost

[docs]class EnsembleTsnePoissonSE(Preprocess):
    """
    Runs tsne-based ensemble Poisson state estimation
    """

    def __init__(self, **params):
        self.output_names = ['Ensemble_W', 'Ensemble_MW']
        super(EnsembleTsnePoissonSE, self).__init__(**params)

[docs]    def run(self, data):
        M, W, obj = poisson_se_tsne(data, **self.params)
        outputs = []
        outputs.append(W)
        if 'return_m' in self.params and self.params['return_m']:
            outputs.append(M)
        if 'return_mw' in self.params and self.params['return_mw']:
            outputs.append(M.dot(W))
        return outputs, obj

[docs]class EnsembleTSVDPoissonSE(Preprocess):
    """
    Runs Poisson state estimation initialized from 8 runs of tsvd-km.

    params: k - dimensionality
    """

    def __init__(self, **params):
        self.output_names = ['TSVDEnsemble_W']
        self.return_m = False
        self.return_mw = False
        self.return_mds = False
        if 'return_m' in params and params['return_m']:
            self.output_names.append('TSVDEnsemble_M')
            self.return_m = True
            params.pop('return_m')
        if 'return_mw' in params and params['return_mw']:
            self.output_names.append('TSVDEnsemble_MW')
            self.return_mw = True
            params.pop('return_mw')
        if 'return_mds' in params and params['return_mds']:
            self.output_names.append('TSVDEnsemble_MDS')
            self.return_mds = True
            params.pop('return_mds')
        super(EnsembleTSVDPoissonSE, self).__init__(**params)

[docs]    def run(self, data):
        M, W, obj = poisson_se_multiclust(data, **self.params)
        outputs = []
        outputs.append(W)
        if self.return_m:
            outputs.append(M)
        if self.return_mw:
            outputs.append(M.dot(W))
        if self.return_mds:
            X = dim_reduce(M, W, 2)
            outputs.append(X.T.dot(W))
        return outputs, obj

[docs]class EnsembleClusterPoissonSE(Preprocess):
    """
    Runs Poisson state estimation initialized from the consensus
    of 10 runs of Poisson KM.

    params: k - dimensionality
    """

    def __init__(self, **params):
        self.output_names = ['ConsensusPoisson_W']
        self.return_m = False
        self.return_mw = False
        self.return_mds = False
        if 'return_m' in params and params['return_m']:
            self.output_names.append('ConsensusPoisson_M')
            self.return_m = True
            params.pop('return_m')
        if 'return_mw' in params and params['return_mw']:
            self.output_names.append('ConsensusPoisson_MW')
            self.return_mw = True
            params.pop('return_mw')
        if 'return_mds' in params and params['return_mds']:
            self.output_names.append('ConsensusPoisson_MDS')
            self.return_mds = True
            params.pop('return_mds')
        super(EnsembleClusterPoissonSE, self).__init__(**params)

[docs]    def run(self, data):
        # make the data sparse for runtime improvements in poisson clustering
        data = sparse.csc_matrix(data)
        M, W, obj = ensemble.poisson_consensus_se(data, **self.params)
        outputs = []
        outputs.append(W)
        if self.return_m:
            outputs.append(M)
        if self.return_mw:
            outputs.append(M.dot(W))
        if self.return_mds:
            X = dim_reduce(M, W, 2)
            outputs.append(X.T.dot(W))
        return outputs, obj

[docs]class Simlr(Preprocess):

    def __init__(self, **params):
        self.output_names = ['PCA50_SIMLR']
        # TODO: make params tunable... what do these numbers even mean???
        self.simlr = SIMLR.SIMLR_LARGE(params['k'], 30, 0)
        super(Simlr, self).__init__(**params)

[docs]    def run(self, data):
        X_log = np.log1p(data)
        if 'n_pca_components' in self.params:
            n_components = self.params['n_pca_components']
        else:
            n_components = 50
        X = SIMLR.helper.fast_pca(X_log.T, n_components)
        S, F, val, ind = self.simlr.fit(X)
        return [F.T], 0

[docs]class SimlrSmall(Preprocess):
    """
    Simlr for small-scale datasets (no PCA preprocessing)
    """

    def __init__(self, **params):
        self.output_names = ['SIMLR']
        # TODO: make params tunable... what do these numbers even mean???
        self.simlr = SIMLR.SIMLR(params['k'], 30, 0)
        super(Simlr, self).__init__(**params)

[docs]    def run(self, data):
        X = np.log1p(data)
        # TODO: the python implementation of simlr only contains the
        # large-scale (PCA-dependent) methods.
        if 'n_pca_components' in self.params:
            n_components = self.params['n_pca_components']
        else:
            n_components = 50
        X = SIMLR.helper.fast_pca(data.T, n_components)
        S, F, val, ind = self.simlr.fit(X)
        return [F.T], 0

[docs]class Magic(Preprocess):
    # TODO: this requires python 3

    def __init__(self, use_magic=True, use_tsne=False, use_pca=False, **params):
        self.output_names = []
        self.use_magic = use_magic
        self.use_tsne = use_tsne
        self.use_pca = use_pca
        if self.use_magic:
            self.output_names.append('magic')
        if self.use_tsne:
            self.output_names.append('magic_tsne')
        if self.use_pca:
            self.output_names.append('magic_pca')
        super(Magic, self).__init__(**params)

[docs]    def run(self, data):
        if 'n_pca_components' in self.params:
            n_components = self.params['n_pca_components']
        else:
            n_components = 20
        if sparse.issparse(data):
            data = data.toarray()
        data_array = pd.DataFrame(data.T)
        data_array.columns = data_array.columns.astype(str)
        scdata = magic.mg.SCData(pd.DataFrame(data_array), data_type='sc-seq')
        scdata = scdata.normalize_scseq_data()
        scdata.run_magic(n_pca_components=n_components, random_pca=True,
                t=6, k=30, ka=10, epsilon=1, rescale_percent=99)
        #scdata.run_tsne()
        outputs = []
        if self.use_magic:
            outputs.append(scdata.magic.data.as_matrix().T)
        if self.use_tsne:
            scdata.magic.run_tsne()
            outputs.append(scdata.magic.tsne.as_matrix().T)
        if self.use_pca:
            scdata.magic.run_pca()
            outputs.append(scdata.magic.pca.as_matrix().T)
        return outputs, 0

[docs]class LoadPreproc(Preprocess):
    """
    takes preprocessed data matrix, just return that when run is called
    """
    def __init__(self, datasets, **params):
        self.datasets = datasets
        super(LoadPreproc, self).__init__(**params)

[docs]    def run(self, data):
        return self.datasets, 0


[docs]class Cluster(object):
    """
    Clustering methods take in a matrix of shape k x cells, and
    return an array of integers in (0, n_classes-1).

    They should be able to run on the output of pre-processing...
    """

    def __init__(self, n_classes, **params):
        self.n_classes = n_classes
        self.name = ''
        if 'name' in params:
            self.name = params.pop('name')
        self.params = params

[docs]    def run(self, data):
        pass

[docs]class Argmax(Cluster):

    def __init__(self, n_classes, **params):
        super(Argmax, self).__init__(n_classes, **params)
        self.name = 'argmax'

[docs]    def run(self, data):
        assert(data.shape[0]==self.n_classes)
        return data.argmax(0)

[docs]class KM(Cluster):
    """
    k-means clustering
    """

    def __init__(self, n_classes, **params):
        super(KM, self).__init__(n_classes, **params)
        self.name = 'km'
        self.km = KMeans(n_classes)

[docs]    def run(self, data):
        return self.km.fit_predict(data.T)

[docs]class DBScan(Cluster):
    """
    dbscan clustering
    """

    def __init__(self, n_classes, **params):
        super(DBScan, self).__init__(n_classes, **params)
        self.name = 'dbscan'
        self.dbscan = DBSCAN()

[docs]    def run(self, data):
        return self.dbscan.fit_predict(data.T)

[docs]class PoissonCluster(Cluster):
    """
    Poisson k-means clustering
    """

    def __init__(self, n_classes, **params):
        super(PoissonCluster, self).__init__(n_classes, **params)
        self.name = 'poisson_km'

[docs]    def run(self, data):
        assignments, means = poisson_cluster(data, self.n_classes, **self.params)
        return assignments


[docs]class PcaKm(Cluster):
    """
    PCA + kmeans

    Requires a parameter k, where k is the dimensionality
    of PCA.
    """

    def __init__(self, n_classes, use_log=False, name='pca_km', **params):
        super(PcaKm, self).__init__(n_classes, **params)
        self.use_log = use_log
        self.pca = PCA(params['k'])
        self.km = KMeans(n_classes)
        self.name = name

[docs]    def run(self, data):
        if sparse.issparse(data):
            data = data.toarray()
        if self.use_log:
            data = log1p(data)
        data_pca = self.pca.fit_transform(data.T)
        labels = self.km.fit_predict(data_pca)
        return labels

[docs]class TsneKm(Cluster):
    """
    TSNE(2) + Kmeans
    """

    def __init__(self, n_classes, use_log=False, name='tsne_km',
            metric='euclidean', use_exp=False, **params):
        super(TsneKm, self).__init__(n_classes, **params)
        self.use_log=use_log
        if metric=='kld':
            metric = symmetric_kld
        if 'k' in self.params:
            self.tsne = TSNE(self.params['k'], metric=metric)
        else:
            self.tsne = TSNE(2, metric=metric)
        self.km = KMeans(n_classes)
        self.name = name
        self.use_exp = use_exp

[docs]    def run(self, data):
        if sparse.issparse(data):
            data = data.toarray()
        if self.use_log:
            data = log1p(data)
        if self.use_exp:
            data = (10**data) - 1
        data_tsne = self.tsne.fit_transform(data.T)
        labels = self.km.fit_predict(data_tsne)
        return labels

[docs]class SimlrKm(Cluster):
    """
    Fast minibatch Kmeans from the simlr library
    """

    def __init__(self, n_classes, **params):
        super(SimlrKm, self).__init__(n_classes, **params)
        self.simlr = SIMLR.SIMLR_LARGE(8, 30, 0)
        self.name = 'km'

[docs]    def run(self, data):
        y_pred = self.simlr.fast_minibatch_kmeans(data.T, 8)
        return y_pred.flatten()

[docs]class Bicluster(Cluster):
    """
    Spectral Biclustering
    """

    def __init__(self, n_classes, n_gene_classes=10, **params):
        super(Bicluster, self).__init__(n_classes, **params)
        self.n_gene_classes = n_gene_classes
        self.name = 'SpectralBicluster'

[docs]    def run(self, data):
        bc = SpectralBiclustering(n_clusters=(self.n_gene_classes, self.n_classes))
        bc.fit(data)
        gene_clusters = bc.row_labels_
        cell_clusters = bc.column_labels_
        return cell_clusters

[docs]class Cocluster(Cluster):
    """
    Spectral Coclustering
    """

    def __init__(self, n_classes, n_gene_classes=10, **params):
        super(Cocluster, self).__init__(n_classes, **params)
        self.n_gene_classes = n_gene_classes
        self.name = 'SpectralCocluster'

[docs]    def run(self, data):
        bc = SpectralCoclustering(n_clusters=(self.n_gene_classes, self.n_classes))
        bc.fit(data)
        gene_clusters = bc.row_labels_
        cell_clusters = bc.column_labels_
        return cell_clusters



[docs]def run_experiment(methods, data, n_classes, true_labels, n_runs=10, use_purity=True, use_nmi=False, use_ari=False, use_nne=False, consensus=False):
    """
    runs a pre-processing + clustering experiment...

    exactly one of use_purity, use_nmi, or use_ari can be true

    Args:
        methods: list of 2-tuples. The first element is either a single Preprocess object or a list of Preprocess objects, to be applied in sequence to the data. The second element is either a single Cluster object, a list of Cluster objects, or a list of lists, where each list is a sequence of Preprocess objects with the final element being a Cluster object.
        data: genes x cells array
        true_labels: 1d array of length cells
        consensus: if true, runs a consensus on cluster results for each method at the very end.
        use_purity, use_nmi, use_ari, use_nne: which error metric to use (at most one can be True)

    Returns:
        purities (list of lists)
        names (list of lists)
        other (dict): keys: timing, preprocessing, clusterings
    """
    results = []
    names = []
    clusterings = {}
    other_results = {}
    other_results['timing'] = {}
    other_results['preprocessing'] = {}
    if use_purity:
        purity_method = purity
    elif use_nmi:
        purity_method = nmi
    elif use_ari:
        purity_method = ari
    elif use_nne:
        purity_method = nne
    for i in range(n_runs):
        print('run {0}'.format(i))
        purities = []
        r = 0
        method_index = 0
        for preproc, cluster in methods:
            t0 = time.time()
            if isinstance(preproc, Preprocess):
                preprocessed, ll = preproc.run(data)
                output_names = preproc.output_names
            else:
                # if the input is a list, only use the first preproc result
                p1 = data
                output_names = ['']
                for p in preproc:
                    p1, ll = p.run(p1)
                    p1 = p1[0]
                    if output_names[0] != '':
                        output_names[0] = output_names[0] + '_' + p.output_names[0]
                    else:
                        output_names[0] = p.output_names[0]
                preprocessed = [p1]
            t1 = time.time() - t0
            for name, pre in zip(output_names, preprocessed):
                starting_index = method_index
                if isinstance(cluster, Cluster):
                    #try:
                        t0 = time.time()
                        labels = cluster.run(pre)
                        t2 = t1 + time.time() - t0
                        if use_nne:
                            purities.append(purity_method(pre, true_labels))
                        else:
                            purities.append(purity_method(labels, true_labels))
                        if i==0:
                            names.append(name + '_' + cluster.name)
                            clusterings[names[-1]] = []
                            other_results['timing'][names[-1]] = []
                        print(names[r])
                        clusterings[names[r]].append(labels)
                        print('time: ' + str(t2))
                        other_results['timing'][names[r]].append(t2)
                        print(purities[-1])
                        r += 1
                        method_index += 1
                    #except:
                    #    print('failed to do clustering')
                elif type(cluster) == list:
                    for c in cluster:
                        if isinstance(c, list):
                            t2 = t1
                            name2 = name
                            sub_data = pre.copy()
                            for subproc in c[:-1]:
                                t0 = time.time()
                                subproc_out, ll = subproc.run(sub_data)
                                sub_data = subproc_out[0]
                                name2 = name2 + '_' + subproc.output_names[0]
                                t2 += time.time() - t0
                            t0 = time.time()
                            labels = c[-1].run(sub_data)
                            t2 += time.time() - t0
                            if use_nne:
                                purities.append(purity_method(sub_data, true_labels))
                            else:
                                purities.append(purity_method(labels, true_labels))
                            if i==0:
                                names.append(name2 + '_' + c[-1].name)
                                clusterings[names[-1]] = []
                                other_results['timing'][names[-1]] = []
                            print(names[r])
                            clusterings[names[r]].append(labels)
                            other_results['timing'][names[r]].append(t2)
                            print('time: ' + str(t2))
                            print(purities[-1])
                            r += 1
                            method_index += 1
                        else:
                            try:
                                t0 = time.time()
                                labels = c.run(pre)
                                t2 = t1 + time.time() - t0
                                if i==0:
                                    names.append(name + '_' + c.name)
                                    clusterings[names[-1]] = []
                                    other_results['timing'][names[-1]] = []
                                if use_nne:
                                    purities.append(purity_method(pre, true_labels))
                                else:
                                    purities.append(purity_method(labels, true_labels))
                                print(names[r])
                                clusterings[names[r]].append(labels)
                                other_results['timing'][names[r]].append(t2)
                                print('time: ' + str(t2))
                                print(purities[-1])
                                r += 1
                                method_index += 1
                            except:
                                print('failed to do clustering')
                # find the highest purity for the pre-processing method
                # save the preprocessing result with the highest NMI
                num_clustering_results = method_index - starting_index
                clustering_results = purities[-num_clustering_results:]
                if i > 0 and len(clustering_results) > 0:
                    old_clustering_results = results[-1][starting_index:method_index]
                    if max(old_clustering_results) < max(clustering_results):
                        other_results['preprocessing'][name] = pre
                else:
                    other_results['preprocessing'][name] = pre
        print('\t'.join(names))
        print('purities: ' + '\t'.join(map(str, purities)))
        results.append(purities)
    consensus_purities = []
    if consensus:
        other_results['consensus'] = {}
        k = len(np.unique(true_labels))
        for name, clusts in clusterings.items():
            print(name)
            clusts = np.vstack(clusts)
            consensus_clust = CE.cluster_ensembles(clusts, verbose=False, N_clusters_max=k)
            other_results['consensus'][name] = consensus_clust
            if use_purity:
                consensus_purity = purity(consensus_clust.flatten(), true_labels)
                print('consensus purity: ' + str(consensus_purity))
                consensus_purities.append(consensus_purity)
            if use_nmi:
                consensus_nmi = nmi(true_labels, consensus_clust)
                print('consensus NMI: ' + str(consensus_nmi))
                consensus_purities.append(consensus_nmi)
            if use_ari:
                consensus_ari = ari(true_labels, consensus_clust)
                print('consensus ARI: ' + str(consensus_ari))
                consensus_purities.append(consensus_ari)
        print('consensus results: ' + '\t'.join(map(str, consensus_purities)))
    other_results['clusterings'] = clusterings
    return results, names, other_results

[docs]def generate_visualizations(methods, data, true_labels, base_dir = 'visualizations',
        figsize=(18,10), **scatter_options):
    """
    Generates visualization scatters for all the methods.

    Args:
        methods: follows same format as run_experiments. List of tuples.
        data: genes x cells
        true_labels: array of integers
        base_dir: base directory to save all the plots
        figsize: tuple of ints representing size of figure
        scatter_options: options for plt.scatter
    """
    plt.figure(figsize=figsize)
    for method in methods:
        preproc= method[0]
        if isinstance(preproc, Preprocess):
            preprocessed, ll = preproc.run(data)
            output_names = preproc.output_names
        else:
            # if the input is a list, only use the first preproc result
            p1 = data
            output_names = ['']
            for p in preproc:
                p1, ll = p.run(p1)
                p1 = p1[0]
                output_names[0] = output_names[0] + p.output_names[0]
            preprocessed = [p1]
        for r, name in zip(preprocessed, output_names):
            # TODO: cluster labels
            print(name)
            # if it's 2d, just display it... else, do tsne to reduce to 2d
            if r.shape[0]==2:
                r_dim_red = r
            else:
                # sometimes the data is too big to do tsne... (for sklearn)
                if sparse.issparse(r) and r.shape[0] > 100:
                    name = 'tsvd_' + name
                    tsvd = TruncatedSVD(50)
                    r_dim_red = tsvd.fit_transform(r.T)
                    try:
                        tsne = TSNE(2)
                        r_dim_red = tsne.fit_transform(r_dim_red).T
                        name = 'tsne_' + name
                    except:
                        tsvd2 = TruncatedSVD(2)
                        r_dim_red = tsvd2.fit_transform(r_dim_red).T
                else:
                    name = 'tsne_' + name
                    tsne = TSNE(2)
                    r_dim_red = tsne.fit_transform(r.T).T
            if isinstance(method[1], list):
                for clustering_method in method[1]:
                    try:
                        cluster_labels = clustering_method.run(r)
                    except:
                        print('clustering failed')
                        continue
                    output_path = base_dir + '/{0}_{1}_labels.png'.format(name, clustering_method.name)
                    visualize_dim_red(r_dim_red, cluster_labels, output_path, **scatter_options)
            else:
                clustering_method = method[1]
                try:
                    cluster_labels = clustering_method.run(r)
                except:
                    print('clustering failed')
                    continue
                output_path = base_dir + '/{0}_{1}_labels.png'.format(name, clustering_method.name)
                visualize_dim_red(r_dim_red, cluster_labels, output_path, **scatter_options)
            output_path = base_dir + '/{0}_true_labels.png'.format(name)
            visualize_dim_red(r_dim_red, true_labels, output_path, **scatter_options)