Python Examples of umap.UMAP

Source File: helper.py From BERMUDA with MIT License

6 votes

def cal_UMAP(code, pca_dim=50, n_neighbors=30, min_dist=0.1, n_components=2, metric='cosine'):
    """ Embed a feature matrix with UMAP, optionally pre-reducing it with PCA.

    PCA is applied only when the number of columns of ``code`` exceeds
    ``pca_dim``. UMAP is always fitted with ``random_state=0``.

    Args:
        code: num_cells * num_features
        pca_dim: column-count threshold; above it the input is first reduced
            to ``pca_dim`` principal components
        n_neighbors: forwarded to umap.UMAP
        min_dist: forwarded to umap.UMAP
        n_components: forwarded to umap.UMAP
        metric: forwarded to umap.UMAP
    Returns:
        umap_code: num_cells * n_components
    """
    n_features = code.shape[1]
    if n_features > pca_dim:
        code = PCA(n_components=pca_dim).fit_transform(code)

    umap_kwargs = dict(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric,
        random_state=0,
    )
    reducer = umap.UMAP(**umap_kwargs)
    return reducer.fit_transform(code)

Source File: EmbeddingsResolver.py From scattertext with Apache License 2.0

6 votes

def project_embeddings(self, projection_model=None, x_dim=0, y_dim=1):
        '''
        Project the embeddings and align the resulting coordinates with the corpus terms.

        Terms of the corpus that have no coordinates are removed from ``self.corpus_``.

        :param projection_model: sklearn unsupervised model (e.g., PCA); passed to ``self.project``,
            whose default is umap.UMAP (requires UMAP to be installed)
        :param x_dim: int, default 0, dimension of transformation matrix for x-axis
        :param y_dim: int, default 1, dimension of transformation matrix for y-axis
        :return: tuple of (updated corpus, DataFrame indexed by term with 'x' and 'y' columns)
        '''
        projected = self.project(projection_model)
        coords = pd.DataFrame({'term': list(self.vocab_),
                               'x': projected.T[x_dim],
                               'y': projected.T[y_dim]})
        coords = coords.set_index('term')
        # Keep only the rows for terms the corpus actually contains.
        coords = coords.reindex(pd.Series(self.corpus_.get_terms())).dropna()

        terms_without_coords = set(self.corpus_.get_terms()) - set(coords.index)
        self.corpus_ = self.corpus_.remove_terms(terms_without_coords)
        # Re-align the rows with the term order of the pruned corpus.
        coords = coords.reindex(self.corpus_.get_terms()).dropna()

        return self.corpus_, coords

Source File: EmbeddingsResolver.py From scattertext with Apache License 2.0

6 votes

def project(self, projection_model=None):
        '''
        Fit ``projection_model`` on the stored embeddings and return the transformed result.

        :param projection_model: sklearn unsupervised model (e.g., PCA). When None,
            ``umap.UMAP(min_dist=0.5, metric='cosine')`` is used, which requires UMAP
            to be installed.

        :return: whatever ``projection_model.fit_transform(self.embeddings_)`` returns
        :raises Exception: if no embeddings have been set, or if the default model is
            requested and ``umap`` cannot be imported
        '''
        if self.embeddings_ is None:
            raise Exception("Run set_embeddings_model or set_embeddings to get embeddings")
        if projection_model is None:
            try:
                import umap
            except ImportError as e:
                # Only a failed import means "not installed"; any other error raised
                # while importing umap should surface with its own message.
                raise Exception(
                    "Please install umap (pip install umap-learn) to use the default projection_model."
                ) from e
            projection_model = umap.UMAP(min_dist=0.5, metric='cosine')
        axes = projection_model.fit_transform(self.embeddings_)
        return axes

Source File: test_umap_trustworthiness.py From umap with BSD 3-Clause "New" or "Revised" License

6 votes

def test_discrete_metric_supervised_umap_trustworthiness():
    """Supervised UMAP with target_metric="ordinal" must reach trustworthiness >= 0.95 on blobs."""
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="ordinal",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    # The message used to be two adjacent literals with no separating space,
    # which rendered as "...embedding forblobs dataset".
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust),
    )

Source File: train.py From B-SOID with GNU General Public License v3.0

6 votes

def main(train_folders: list):
    """
    Run the training pipeline: load data, extract features, embed with UMAP,
    cluster with HDBSCAN and train the neural-net classifier.
    :param train_folders: list, training data folders
    :return f_10fps: 2D array, features
    :return f_10fps_sc: second output of bsoid_feats (passed to the UMAP embedding step)
    :return umap_embeddings: 2D array, embedded UMAP space
    :return hdb_assignments: HDBSCAN assignments
    :return soft_assignments: soft assignments from bsoid_hdbscan, used to train the classifier
    :return soft_clusters: soft cluster output of bsoid_hdbscan
    :return nn_classifier: obj, MLP classifier
    :return scores: 1D array, cross-validated accuracy
    :return nn_assignments: neural net predictions
    """
    import bsoid_umap.utils.likelihoodprocessing
    filenames, training_data, perc_rect = bsoid_umap.utils.likelihoodprocessing.main(train_folders)
    f_10fps, f_10fps_sc = bsoid_feats(training_data)
    trained_umap, umap_embeddings = bsoid_umap_embed(f_10fps_sc)
    hdb_assignments, soft_clusters, soft_assignments = bsoid_hdbscan(umap_embeddings)
    nn_classifier, scores, nn_assignments = bsoid_nn(f_10fps, soft_assignments)

    if PLOT:
        timestr = time.strftime("_%Y%m%d_%H%M")
        # Only points with a non-negative HDBSCAN label are plotted.
        fig1 = plot_classes(umap_embeddings[hdb_assignments >= 0], hdb_assignments[hdb_assignments >= 0])
        svg_name = ''.join(('hdb_soft_assignments', timestr, '.svg'))
        fig1.savefig(os.path.join(OUTPUT_PATH, svg_name))
        plot_accuracy(scores)

    return (f_10fps, f_10fps_sc, umap_embeddings, hdb_assignments, soft_assignments, soft_clusters,
            nn_classifier, scores, nn_assignments)

Source File: test_umap_on_iris.py From umap with BSD 3-Clause "New" or "Revised" License

6 votes

def test_umap_sparse_transform_on_iris(iris, iris_selection):
    """Fit UMAP on the selected iris rows (as CSR), transform the remaining rows,
    and require trustworthiness >= 0.80 for the transformed points."""
    data = sparse.csr_matrix(iris.data[iris_selection])
    assert sparse.issparse(data)
    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        force_approximation_algorithm=True,
    ).fit(data)

    new_data = sparse.csr_matrix(iris.data[~iris_selection])
    assert sparse.issparse(new_data)
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    # The message used to be two adjacent literals with no separating space,
    # which rendered as "...transform foriris dataset".
    assert_greater_equal(
        trust,
        0.80,
        "Insufficiently trustworthy transform for iris dataset: {}".format(trust),
    )


# UMAP Clusterability on Iris
# ---------------------------

Source File: test_umap_trustworthiness.py From umap with BSD 3-Clause "New" or "Revised" License

6 votes

def test_string_metric_supervised_umap_trustworthiness():
    """Supervised UMAP with string labels and target_metric="string" must reach
    trustworthiness >= 0.95 on blobs."""
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    # Map the integer blob labels onto string labels.
    labels = np.array(["this", "that", "other"])[labels]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="string",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    # The message used to be two adjacent literals with no separating space,
    # which rendered as "...embedding forblobs dataset".
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust),
    )

Source File: test_ingest.py From scanpy with BSD 3-Clause "New" or "Revised" License

6 votes

def test_ingest_map_embedding_umap():
    """Ingest's UMAP embedding of the new data must match a direct UMAP fit/transform."""
    reference = sc.AnnData(X)
    query = sc.AnnData(T)

    sc.pp.neighbors(
        reference, method='umap', use_rep='X', n_neighbors=4, random_state=0
    )
    sc.tl.umap(reference, random_state=0)

    ingest = sc.tl.Ingest(reference)
    ingest.fit(query)
    ingest.map_embedding(method='umap')

    # Expected coordinates: UMAP fitted on X and applied to T directly.
    direct_umap = UMAP(min_dist=0.5, random_state=0, n_neighbors=4)
    direct_umap.fit(X)
    expected = direct_umap.transform(T)

    assert np.allclose(ingest._obsm['X_umap'], expected)

Source File: test_umap_trustworthiness.py From umap with BSD 3-Clause "New" or "Revised" License

6 votes

def test_metric_supervised_umap_trustworthiness():
    """Supervised UMAP with target_metric="l1" must reach trustworthiness >= 0.95 on blobs."""
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="l1",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    # The message used to be two adjacent literals with no separating space,
    # which rendered as "...embedding forblobs dataset".
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust),
    )

Source File: train.py From B-SOID with GNU General Public License v3.0

6 votes

def bsoid_hdbscan(umap_embeddings, hdbscan_params=HDBSCAN_PARAMS):
    """
    Trains HDBSCAN (unsupervised) given learned UMAP space.
    Fits one model per candidate minimum cluster size (0.6% to 2.0% of the
    number of instances, in 0.1% steps) and keeps the first model that yields
    the largest number of distinct labels.
    :param umap_embeddings: 2D array, embedded UMAP space
    :param hdbscan_params: dict, HDBSCAN params in GLOBAL_CONFIG
    :return assignments: HDBSCAN assignments (labels_ of the selected model)
    :return soft_clusters: membership vectors for all points from the selected model
    :return soft_assignments: argmax of soft_clusters along axis 1
    """
    # np.inf rather than the np.infty alias, which NumPy 2.0 removed.
    highest_numulab = -np.inf
    best_clf = None
    min_cluster_range = range(6, 21)
    logging.info('Running HDBSCAN on {} instances in {} D space...'.format(*umap_embeddings.shape))
    for min_c in min_cluster_range:
        trained_classifier = hdbscan.HDBSCAN(prediction_data=True,
                                             min_cluster_size=int(round(0.001 * min_c * umap_embeddings.shape[0])),
                                             **hdbscan_params).fit(umap_embeddings)
        n_labels = len(np.unique(trained_classifier.labels_))
        # Strict '>' keeps the earliest (smallest min_cluster_size) model on ties.
        if n_labels > highest_numulab:
            logging.info('Adjusting minimum cluster size to maximize cluster number...')
            highest_numulab = n_labels
            best_clf = trained_classifier
    assignments = best_clf.labels_
    soft_clusters = hdbscan.all_points_membership_vectors(best_clf)
    soft_assignments = np.argmax(soft_clusters, axis=1)
    logging.info('Done predicting labels for {} instances in {} D space...'.format(*umap_embeddings.shape))
    return assignments, soft_clusters, soft_assignments

Source File: train.py From B-SOID with GNU General Public License v3.0

6 votes

def bsoid_umap_embed(f_10fps_sc, umap_params=UMAP_PARAMS):
    """
    Fits UMAP (unsupervised) on the transposed feature matrix.
    n_neighbors is set to the rounded square root of the number of instances.
    :param f_10fps_sc: 2D array, standardized/session features; transposed before fitting
    :param umap_params: dict, UMAP params in GLOBAL_CONFIG
    :return trained_umap: object, trained UMAP transformer
    :return umap_embeddings: 2D array, embedded UMAP space (embedding_ of the fitted model)
    """
    feats_train = f_10fps_sc.T
    n_instances, n_dims = feats_train.shape[0], feats_train.shape[1]
    logging.info('Transforming all {} instances from {} D into {} D'.format(n_instances,
                                                                            n_dims,
                                                                            umap_params.get('n_components')))
    neighbor_count = int(round(np.sqrt(feats_train.shape[0])))  # power law
    trained_umap = umap.UMAP(n_neighbors=neighbor_count, **umap_params).fit(feats_train)
    umap_embeddings = trained_umap.embedding_
    logging.info('Done non-linear transformation with UMAP from {} D into {} D.'.format(feats_train.shape[1],
                                                                                        umap_embeddings.shape[1]))
    return trained_umap, umap_embeddings

Source File: umap_reconstruction.py From scikit-lego with MIT License

6 votes

def fit(self, X, y=None):
        """
        Validate the input and hyperparameters, then fit the underlying UMAP model.

        :param X: array-like training data; validated with ``check_array`` as float data.
        :param y: forwarded to the UMAP ``fit`` call; kept in for pipeline support
        :return: Returns an instance of self.
        :raises ValueError: if ``n_components`` is below two or ``threshold`` is falsy.
        """
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        if self.n_components < 2:
            raise ValueError("Number of components must be at least two.")
        # NOTE(review): this is a truthiness test, so a threshold of 0 is rejected
        # too, not only None - confirm that is intended.
        if not self.threshold:
            raise ValueError("The `threshold` value cannot be `None`.")

        umap_settings = dict(
            n_components=self.n_components,
            n_neighbors=self.n_neighbors,
            min_dist=self.min_dist,
            metric=self.metric,
            random_state=self.random_state,
        )
        self.umap_ = umap.UMAP(**umap_settings)
        self.umap_.fit(X, y)
        self.offset_ = -self.threshold
        return self

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_bad_init(nn_data):
    """Fitting with an unrecognized string ``init`` must raise ValueError."""
    model = UMAP(init="foobar")
    assert_raises(ValueError, model.fit, nn_data)

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_negative_sample_rate(nn_data):
    """Fitting with ``negative_sample_rate=-1`` must raise ValueError."""
    model = UMAP(negative_sample_rate=-1)
    assert_raises(ValueError, model.fit, nn_data)

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_bad_metric(nn_data):
    """Fitting with an integer ``metric`` must raise ValueError."""
    model = UMAP(metric=45)
    assert_raises(ValueError, model.fit, nn_data)

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_bad_numeric_init(nn_data):
    """Fitting with an integer ``init`` must raise ValueError."""
    model = UMAP(init=42)
    assert_raises(ValueError, model.fit, nn_data)

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_bad_matrix_init(nn_data):
    """Fitting with a 2x3 zero matrix as ``init`` must raise ValueError."""
    bad_init = np.array([[0, 0, 0], [0, 0, 0]])
    model = UMAP(init=bad_init)
    assert_raises(ValueError, model.fit, nn_data)

Source File: umap_view.py From phy with BSD 3-Clause "New" or "Revised" License

5 votes

def umap(x):
    """Return the embedding of the array x computed by a default-configured UMAP."""
    # Imported lazily, inside the function, exactly as before.
    from umap import UMAP
    reducer = UMAP()
    return reducer.fit_transform(x)

Source File: visAnnos.py From scMatch with MIT License

5 votes

def CalCords(savefolder, em, visMethod):
    """Return 2-D coordinates for the columns of ``em``, cached as CSV in ``savefolder``.

    If ``cords_<visMethod>.csv`` already exists in ``savefolder`` it is read and
    returned without any computation. Otherwise the transposed values of ``em``
    are standardized and reduced to 10 principal components, from which the
    coordinates are derived with the requested method; the result is written to
    the cache file and returned.

    Args:
        savefolder: directory holding the cache file.
        em: object exposing ``.T.values`` and ``.columns``; each column becomes
            one row of the result (presumably a pandas DataFrame - confirm
            against the caller).
        visMethod: 'PCA' (first two principal components), 'tSNE' or 'UMAP'.

    Returns:
        DataFrame with columns 'x' and 'y'; when freshly computed it is indexed
        by ``em.columns`` with the index named 'barcode'.

    Raises:
        ValueError: if no cache file exists and ``visMethod`` is not one of the
            supported methods.
    """
    cordFile = os.path.join(savefolder, 'cords_%s.csv' % visMethod)
    if os.path.exists(cordFile):
        return pd.read_csv(cordFile, index_col=0, header=0)

    # Reject unknown methods before the expensive scaling/PCA step; previously
    # they fell through the if/elif chain and hit an unbound `cords`.
    if visMethod not in ('PCA', 'tSNE', 'UMAP'):
        raise ValueError("Unknown visMethod %r; expected 'PCA', 'tSNE' or 'UMAP'" % (visMethod,))

    x = em.T.values
    from sklearn.preprocessing import StandardScaler
    x = StandardScaler().fit_transform(x)
    from sklearn.decomposition import PCA
    pca = PCA(n_components=10)
    principalComponents = pca.fit_transform(x)

    if visMethod == 'PCA':
        components = principalComponents[:, :2]
    elif visMethod == 'tSNE':
        from sklearn.manifold import TSNE
        components = TSNE(n_components=2).fit_transform(principalComponents)
    else:  # 'UMAP'
        import umap
        components = umap.UMAP(n_components=2).fit_transform(principalComponents)

    cords = pd.DataFrame(data=components, columns=['x', 'y'], index=em.columns)
    cords.index.name = 'barcode'
    cords.to_csv(cordFile, index=True, header=True)
    return cords

Source File: umap_view.py From phy with BSD 3-Clause "New" or "Revised" License

5 votes

def attach_to_controller(self, controller):
        def coords(cluster_ids):
            """Return a Bunch with pos, spike_ids and spike_clusters for the given clusters."""
            # Only 200 spikes are selected from the requested clusters.
            # WARNING: lasso and split will work but will *only split the shown subselection* of
            # spikes. Loading all spikes first (the `load_all` keyword argument to `coords()`)
            # is deliberately not supported here: (1) it could be prohibitively long with UMAP,
            # and (2) the coordinates change whenever the dimension reduction is recomputed on
            # all spikes, so the split would be meaningless anyway.
            # A warning is displayed when trying to split on a view that does not accept the
            # `load_all` keyword argument, because it means that all relevant spikes (even not
            # shown ones) are not going to be split.
            spike_ids = controller.selector(200, cluster_ids)
            # Cluster id of each selected spike.
            spike_clusters = controller.supervisor.clustering.spike_clusters[spike_ids]
            # Waveforms are fetched across all channels so that every cluster shares
            # the same dimensions.
            waveforms = controller.model.get_waveforms(spike_ids, None)
            n_spikes, n_samples, n_channels = waveforms.shape
            # Reorder to (n_spikes, n_channels, n_samples), then flatten each spike to one
            # row so the 2D array can be passed to UMAP.
            flat = waveforms.transpose((0, 2, 1)).reshape((n_spikes, n_samples * n_channels))
            # Dimension reduction.
            return Bunch(pos=umap(flat), spike_ids=spike_ids, spike_clusters=spike_clusters)

        def create_view():
            """Create and return a WaveformUMAPView whose coords function goes through the controller's cache."""
            cached_coords = controller.context.cache(coords)
            return WaveformUMAPView(coords=cached_coords)

        # Register the factory: view name -> zero-argument function returning a view.
        controller.view_creator['WaveformUMAPView'] = create_view

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_negative_learning_rate(nn_data):
    """Fitting with ``learning_rate=-1.5`` must raise ValueError."""
    model = UMAP(learning_rate=-1.5)
    assert_raises(ValueError, model.fit, nn_data)

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_too_many_neighbors_warns(nn_data):
    """Fit with n_neighbors=2000 on only 100 rows and check that the explicit
    ``a`` and ``b`` values end up unchanged in ``_a`` and ``_b``."""
    model = UMAP(a=1.2, b=1.75, n_neighbors=2000, n_epochs=11, init="random")
    first_hundred = nn_data[:100,]
    model.fit(first_hundred)
    assert_equal(model._a, 1.2)
    assert_equal(model._b, 1.75)

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_negative_n_neighbours(nn_data):
    """Fitting with ``n_neighbors=-1`` must raise ValueError."""
    model = UMAP(n_neighbors=-1)
    assert_raises(ValueError, model.fit, nn_data)

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_too_small_n_neighbours(nn_data):
    """Fitting with ``n_neighbors=0.5`` must raise ValueError."""
    model = UMAP(n_neighbors=0.5)
    assert_raises(ValueError, model.fit, nn_data)

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_non_integer_n_components(nn_data):
    """Fitting with ``n_components=1.5`` must raise ValueError."""
    model = UMAP(n_components=1.5)
    assert_raises(ValueError, model.fit, nn_data)

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_negative_min_dist(nn_data):
    """Fitting with ``min_dist=-1`` must raise ValueError."""
    model = UMAP(min_dist=-1)
    assert_raises(ValueError, model.fit, nn_data)

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_bad_too_large_min_dist(nn_data):
    """Fitting with ``min_dist=2.0`` must raise ValueError."""
    model = UMAP(min_dist=2.0)
    # The a,b curve fitting is expected to emit a RuntimeWarning about division
    # by zero; it is silenced for the duration of this check.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        assert_raises(ValueError, model.fit, nn_data)

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_too_large_op(nn_data):
    """Fitting with ``set_op_mix_ratio=1.5`` must raise ValueError."""
    model = UMAP(set_op_mix_ratio=1.5)
    assert_raises(ValueError, model.fit, nn_data)

Source File: test_umap_validation_params.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_umap_negative_op(nn_data):
    """Fitting with ``set_op_mix_ratio=-1.0`` must raise ValueError."""
    model = UMAP(set_op_mix_ratio=-1.0)
    assert_raises(ValueError, model.fit, nn_data)

Source File: test_umap_repeated_data.py From umap with BSD 3-Clause "New" or "Revised" License

5 votes

def test_repeated_points_large_n(repetition_dense):
    """After fitting with ``n_neighbors=5, unique=True`` on the repeated-points
    fixture, the model's ``_n_neighbors`` must equal 3."""
    fitted = UMAP(n_neighbors=5, unique=True)
    fitted = fitted.fit(repetition_dense)
    assert_equal(fitted._n_neighbors, 3)

Python umap.UMAP Examples