Python umap.UMAP Examples

The following are 30 code examples of umap.UMAP(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module umap, or try the search function.
Example #1
Source File: helper.py    From BERMUDA with MIT License 6 votes vote down vote up
def cal_UMAP(code, pca_dim = 50, n_neighbors = 30, min_dist=0.1, n_components=2, metric='cosine'):
    """Embed ``code`` with UMAP, running PCA first when it has too many columns.

    Args:
        code: 2-D array laid out as num_cells * num_features
        pca_dim: PCA (keeping ``pca_dim`` components) is applied only when
            ``code`` has more than ``pca_dim`` columns
        n_neighbors: forwarded to ``umap.UMAP``
        min_dist: forwarded to ``umap.UMAP``
        n_components: forwarded to ``umap.UMAP``
        metric: forwarded to ``umap.UMAP``
    Returns:
        Output of ``fit_transform`` on the (possibly PCA-reduced) input,
        documented as num_cells * n_components
    """
    too_wide = code.shape[1] > pca_dim
    reduced = PCA(n_components=pca_dim).fit_transform(code) if too_wide else code

    # random_state is pinned to 0, so every call uses the same seed.
    umap_kwargs = dict(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric,
        random_state=0,
    )
    reducer = umap.UMAP(**umap_kwargs)
    return reducer.fit_transform(reduced)
Example #2
Source File: EmbeddingsResolver.py    From scattertext with Apache License 2.0 6 votes vote down vote up
def project_embeddings(self, projection_model=None, x_dim=0, y_dim=1):
        '''
        Project the embeddings and drop corpus terms that did not receive coordinates.

        :param projection_model: passed unchanged to ``self.project``; described as an sklearn
            unsupervised model (e.g., PCA). By default the recommended model is umap.UMAP,
            which requires UMAP to be installed
        :param x_dim: int, default 0, dimension of transformation matrix for x-axis
        :param y_dim: int, default 1, dimension of transformation matrix for y-axis
        :return: tuple of (``self.corpus_`` after term removal, data frame indexed by term
            with 'x' and 'y' columns)
        '''
        projected = self.project(projection_model)
        by_dimension = projected.T
        coords = pd.DataFrame({'term': list(self.vocab_),
                               'x': by_dimension[x_dim],
                               'y': by_dimension[y_dim]})
        coords = coords.set_index('term')
        # Align on the corpus terms; terms without a projection become NaN rows and are dropped.
        coords = coords.reindex(pd.Series(self.corpus_.get_terms()))
        coords = coords.dropna()

        unplaced_terms = set(self.corpus_.get_terms()) - set(coords.index)
        self.corpus_ = self.corpus_.remove_terms(unplaced_terms)
        coords = coords.reindex(self.corpus_.get_terms()).dropna()

        return self.corpus_, coords
Example #3
Source File: EmbeddingsResolver.py    From scattertext with Apache License 2.0 6 votes vote down vote up
def project(self, projection_model=None):
        '''
        Fit ``projection_model`` on the stored embeddings and return the projected coordinates.

        :param projection_model: object exposing ``fit_transform`` (e.g., an sklearn PCA instance).
            When None, ``umap.UMAP(min_dist=0.5, metric='cosine')`` is used, which requires
            the umap-learn package to be installed

        :return: the value of ``projection_model.fit_transform(self.embeddings_)``
            (originally documented as an array, shape (num dimension, vocab size))
        :raises Exception: if ``self.embeddings_`` is None, or if the default model is requested
            but umap cannot be imported
        '''
        if self.embeddings_ is None:
            raise Exception("Run set_embeddings_model or set_embeddings to get embeddings")
        if projection_model is None:
            try:
                import umap
            except ImportError as err:
                # Only a failed import means "umap is missing"; anything else
                # (KeyboardInterrupt, SystemExit, unrelated errors) must propagate untouched.
                raise Exception(
                    "Please install umap (pip install umap-learn) to use the default projection_model."
                ) from err
            projection_model = umap.UMAP(min_dist=0.5, metric='cosine')
        return projection_model.fit_transform(self.embeddings_)
Example #4
Source File: test_umap_trustworthiness.py    From umap with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_discrete_metric_supervised_umap_trustworthiness():
    """Supervised UMAP with target_metric="ordinal" must embed 50 blob samples with trustworthiness >= 0.95."""
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="ordinal",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    # Trustworthiness is evaluated with 10 neighbours, matching n_neighbors above.
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust),
    )
Example #5
Source File: train.py    From B-SOID with GNU General Public License v3.0 6 votes vote down vote up
def main(train_folders: list):
    """
    Run the training pipeline: load data, extract features, embed with UMAP, cluster with HDBSCAN,
    train the classifier, and optionally save plots when PLOT is set.

    :param train_folders: list, training data folders
    :return f_10fps: 2D array, features
    :return f_10fps_sc: second output of bsoid_feats (passed on to the UMAP embedding step)
    :return umap_embeddings: 2D array, embedded UMAP space
    :return hdb_assignments: first output of bsoid_hdbscan
    :return soft_assignments: third output of bsoid_hdbscan (used as classifier targets)
    :return soft_clusters: second output of bsoid_hdbscan
    :return nn_classifier: obj, MLP classifier
    :return scores: 1D array, cross-validated accuracy
    :return nn_assignments: neural net predictions
    """
    import bsoid_umap.utils.likelihoodprocessing
    # Only the training data is used here; the other two outputs are discarded.
    _, training_data, _ = bsoid_umap.utils.likelihoodprocessing.main(train_folders)
    f_10fps, f_10fps_sc = bsoid_feats(training_data)
    # The fitted UMAP object itself is not part of this function's return value.
    _, umap_embeddings = bsoid_umap_embed(f_10fps_sc)
    hdb_assignments, soft_clusters, soft_assignments = bsoid_hdbscan(umap_embeddings)
    nn_classifier, scores, nn_assignments = bsoid_nn(f_10fps, soft_assignments)
    if PLOT:
        stamp = time.strftime("_%Y%m%d_%H%M")
        # Plot only the points with a non-negative HDBSCAN assignment.
        assigned = hdb_assignments >= 0
        figure = plot_classes(umap_embeddings[assigned], hdb_assignments[assigned])
        svg_name = ''.join(('hdb_soft_assignments', stamp, '.svg'))
        figure.savefig(os.path.join(OUTPUT_PATH, svg_name))
        plot_accuracy(scores)
    return (f_10fps, f_10fps_sc, umap_embeddings, hdb_assignments, soft_assignments, soft_clusters,
            nn_classifier, scores, nn_assignments)
Example #6
Source File: test_umap_on_iris.py    From umap with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_umap_sparse_transform_on_iris(iris, iris_selection):
    """Fit UMAP on a sparse iris subset, transform the held-out rows, and require trustworthiness >= 0.80."""
    data = sparse.csr_matrix(iris.data[iris_selection])
    assert sparse.issparse(data)
    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        force_approximation_algorithm=True,
    ).fit(data)

    # The rows excluded from fitting are the ones pushed through transform().
    new_data = sparse.csr_matrix(iris.data[~iris_selection])
    assert sparse.issparse(new_data)
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    assert_greater_equal(
        trust,
        0.80,
        "Insufficiently trustworthy transform for iris dataset: {}".format(trust),
    )


# UMAP Clusterability on Iris
# --------------------------- 
Example #7
Source File: test_umap_trustworthiness.py    From umap with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_string_metric_supervised_umap_trustworthiness():
    """Supervised UMAP with string labels and target_metric="string" must reach trustworthiness >= 0.95."""
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    # Replace the integer blob labels with string labels via index lookup.
    labels = np.array(["this", "that", "other"])[labels]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="string",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust),
    )
Example #8
Source File: test_ingest.py    From scanpy with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_ingest_map_embedding_umap():
    """The UMAP embedding mapped by Ingest must match fitting UMAP on X and transforming T directly."""
    reference = sc.AnnData(X)
    query = sc.AnnData(T)

    neighbor_kwargs = dict(method='umap', use_rep='X', n_neighbors=4, random_state=0)
    sc.pp.neighbors(reference, **neighbor_kwargs)
    sc.tl.umap(reference, random_state=0)

    ingest = sc.tl.Ingest(reference)
    ingest.fit(query)
    ingest.map_embedding(method='umap')

    # Same data, seed and neighbour count, but going through UMAP directly.
    direct = UMAP(min_dist=0.5, random_state=0, n_neighbors=4)
    direct.fit(X)
    expected = direct.transform(T)

    assert np.allclose(ingest._obsm['X_umap'], expected)
Example #9
Source File: test_umap_trustworthiness.py    From umap with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_metric_supervised_umap_trustworthiness():
    """Supervised UMAP with target_metric="l1" must embed 50 blob samples with trustworthiness >= 0.95."""
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="l1",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    # Trustworthiness is evaluated with 10 neighbours, matching n_neighbors above.
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust),
    )
Example #10
Source File: train.py    From B-SOID with GNU General Public License v3.0 6 votes vote down vote up
def bsoid_hdbscan(umap_embeddings, hdbscan_params=HDBSCAN_PARAMS):
    """
    Trains HDBSCAN (unsupervised) given learned UMAP space, keeping the fit that yields
    the largest number of distinct labels over a sweep of minimum cluster sizes
    :param umap_embeddings: 2D array, embedded UMAP space
    :param hdbscan_params: dict, HDBSCAN params in GLOBAL_CONFIG
    :return assignments: ``labels_`` of the selected HDBSCAN fit
    :return soft_clusters: output of hdbscan.all_points_membership_vectors for the selected fit
    :return soft_assignments: argmax of soft_clusters along axis 1
    """
    # np.inf rather than the np.infty alias, which NumPy 2.0 removed.
    highest_numulab = -np.inf
    min_cluster_range = range(6, 21)
    logging.info('Running HDBSCAN on {} instances in {} D space...'.format(*umap_embeddings.shape))
    for min_c in min_cluster_range:
        # min_cluster_size sweeps 0.6% .. 2.0% of the number of instances.
        trained_classifier = hdbscan.HDBSCAN(prediction_data=True,
                                             min_cluster_size=int(round(0.001 * min_c * umap_embeddings.shape[0])),
                                             **hdbscan_params).fit(umap_embeddings)
        numulab = len(np.unique(trained_classifier.labels_))
        # Strict comparison: on ties the earlier (smaller min_cluster_size) fit is kept.
        if numulab > highest_numulab:
            logging.info('Adjusting minimum cluster size to maximize cluster number...')
            highest_numulab = numulab
            best_clf = trained_classifier
    assignments = best_clf.labels_
    soft_clusters = hdbscan.all_points_membership_vectors(best_clf)
    soft_assignments = np.argmax(soft_clusters, axis=1)
    logging.info('Done predicting labels for {} instances in {} D space...'.format(*umap_embeddings.shape))
    return assignments, soft_clusters, soft_assignments
Example #11
Source File: train.py    From B-SOID with GNU General Public License v3.0 6 votes vote down vote up
def bsoid_umap_embed(f_10fps_sc, umap_params=UMAP_PARAMS):
    """
    Fits UMAP (unsupervised) on the transposed feature matrix and returns the fitted model
    together with its embedding
    :param f_10fps_sc: 2D array, standardized/session features
    :param umap_params: dict, UMAP params in GLOBAL_CONFIG
    :return trained_umap: object, trained UMAP transformer
    :return umap_embeddings: 2D array, embedded UMAP space (``trained_umap.embedding_``)
    """
    feats_train = f_10fps_sc.T
    n_instances = feats_train.shape[0]
    n_features = feats_train.shape[1]
    start_msg = 'Transforming all {} instances from {} D into {} D'
    logging.info(start_msg.format(n_instances, n_features, umap_params.get('n_components')))

    # power law: neighbourhood size is the rounded square root of the instance count
    n_neighbors = int(round(np.sqrt(n_instances)))
    trained_umap = umap.UMAP(n_neighbors=n_neighbors, **umap_params).fit(feats_train)
    umap_embeddings = trained_umap.embedding_

    done_msg = 'Done non-linear transformation with UMAP from {} D into {} D.'
    logging.info(done_msg.format(n_features, umap_embeddings.shape[1]))
    return trained_umap, umap_embeddings
Example #12
Source File: umap_reconstruction.py    From scikit-lego with MIT License 6 votes vote down vote up
def fit(self, X, y=None):
        """
        Validate the input and hyperparameters, then fit a UMAP model on X.

        :param X: array-like training data, validated with ``check_array``
            (presumably laid out as (n_samples, n_features) -- confirm against callers).
        :param y: forwarded to the underlying UMAP ``fit`` call; kept for pipeline support
        :return: Returns an instance of self.
        :raises ValueError: if ``n_components`` is below two or ``threshold`` is falsy
            (this includes ``None`` and ``0``).
        """
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        if self.n_components < 2:
            raise ValueError("Number of components must be at least two.")
        if not self.threshold:
            raise ValueError(f"The `threshold` value cannot be `None`.")

        umap_kwargs = {
            "n_components": self.n_components,
            "n_neighbors": self.n_neighbors,
            "min_dist": self.min_dist,
            "metric": self.metric,
            "random_state": self.random_state,
        }
        self.umap_ = umap.UMAP(**umap_kwargs)
        self.umap_.fit(X, y)
        # The offset is stored as the negated threshold.
        self.offset_ = -self.threshold
        return self
Example #13
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_bad_init(nn_data):
    """Fitting must raise ValueError when ``init`` is the unrecognised string "foobar"."""
    assert_raises(ValueError, UMAP(init="foobar").fit, nn_data)
Example #14
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_negative_sample_rate(nn_data):
    """Fitting must raise ValueError when ``negative_sample_rate`` is -1."""
    assert_raises(ValueError, UMAP(negative_sample_rate=-1).fit, nn_data)
Example #15
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_bad_metric(nn_data):
    """Fitting must raise ValueError when ``metric`` is the integer 45."""
    assert_raises(ValueError, UMAP(metric=45).fit, nn_data)
Example #16
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_bad_numeric_init(nn_data):
    """Fitting must raise ValueError when ``init`` is the bare number 42."""
    assert_raises(ValueError, UMAP(init=42).fit, nn_data)
Example #17
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_bad_matrix_init(nn_data):
    """Fitting must raise ValueError when ``init`` is a 2x3 matrix of zeros."""
    bad_init = np.array([[0, 0, 0], [0, 0, 0]])
    assert_raises(ValueError, UMAP(init=bad_init).fit, nn_data)
Example #18
Source File: umap_view.py    From phy with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def umap(x):
    """Return the UMAP embedding of the array x, using a default-configured UMAP."""
    # Imported locally; note that this function shares its name with the package.
    from umap import UMAP
    reducer = UMAP()
    return reducer.fit_transform(x)
Example #19
Source File: visAnnos.py    From scMatch with MIT License 5 votes vote down vote up
def CalCords(savefolder, em, visMethod):
    """Return 2-D plotting coordinates for the columns of ``em``, cached as a CSV file.

    If ``<savefolder>/cords_<visMethod>.csv`` already exists it is read back and
    returned without any computation. Otherwise the transposed values of ``em``
    are standardised, reduced to 10 principal components, and then mapped to two
    dimensions with the requested method; the result is written to that CSV file.

    Args:
        savefolder: directory holding (or receiving) the cached coordinates file.
        em: object exposing ``.T.values`` and ``.columns`` (presumably a pandas
            DataFrame with one column per barcode -- confirm against callers).
        visMethod: 'PCA' (first two principal components), 'tSNE' or 'UMAP'.
    Returns:
        DataFrame with columns 'x' and 'y'; when freshly computed it is indexed
        by ``em.columns`` with the index named 'barcode'.
    Raises:
        ValueError: if no cached file exists and ``visMethod`` is not supported.
    """
    cordFile = os.path.join(savefolder, 'cords_%s.csv' % visMethod)
    if os.path.exists(cordFile):
        return pd.read_csv(cordFile, index_col=0, header=0)

    # Reject unknown methods before the expensive scaling/PCA work instead of
    # failing afterwards on an unassigned result variable.
    if visMethod not in ('PCA', 'tSNE', 'UMAP'):
        raise ValueError("Unsupported visMethod %r; expected 'PCA', 'tSNE' or 'UMAP'" % (visMethod,))

    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    x = StandardScaler().fit_transform(em.T.values)
    principalComponents = PCA(n_components=10).fit_transform(x)

    if visMethod == 'PCA':
        components = principalComponents[:, :2]
    elif visMethod == 'tSNE':
        from sklearn.manifold import TSNE
        components = TSNE(n_components=2).fit_transform(principalComponents)
    else:  # 'UMAP'
        import umap
        components = umap.UMAP(n_components=2).fit_transform(principalComponents)

    cords = pd.DataFrame(data=components, columns=['x', 'y'], index=em.columns)
    cords.index.name = 'barcode'
    cords.to_csv(cordFile, index=True, header=True)
    return cords
Example #20
Source File: umap_view.py    From phy with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def attach_to_controller(self, controller):
        """Register a creator for WaveformUMAPView in ``controller.view_creator``."""

        def coords(cluster_ids):
            """Must return a Bunch object with pos, spike_ids, spike_clusters."""
            # We select 200 spikes from the selected clusters.
            # WARNING: lasso and split will work but will *only split the shown subselection*
            # of spikes. The `load_all` keyword argument to `coords()` would load all spikes
            # before computing the spikes inside the lasso, however (1) this could be
            # prohibitively long with UMAP, and (2) the coordinates change when the dimension
            # reduction is redone on all spikes, so the splitting would be meaningless anyway.
            # A warning is displayed when trying to split on a view that does not accept
            # `load_all`, because not all relevant spikes (even those not shown) get split.
            spike_ids = controller.selector(200, cluster_ids)
            # Cluster ids of the chosen spikes.
            spike_clusters = controller.supervisor.clustering.spike_clusters[spike_ids]
            # Waveforms across all channels, so every cluster shares the same dimensions.
            waveforms = controller.model.get_waveforms(spike_ids, None)
            n_spikes, n_samples, n_channels = waveforms.shape
            # (n_spikes, n_samples, n_channels) -> (n_spikes, n_channels, n_samples)
            channel_major = waveforms.transpose((0, 2, 1))
            # Flatten to one row per spike so it can be fed to the UMAP reduction.
            flat = channel_major.reshape((n_spikes, n_samples * n_channels))
            return Bunch(pos=umap(flat), spike_ids=spike_ids, spike_clusters=spike_clusters)

        def create_view():
            """Create and return a WaveformUMAPView fed by the cached ``coords`` function."""
            cached_coords = controller.context.cache(coords)
            return WaveformUMAPView(coords=cached_coords)

        # Maps a view name to a zero-argument function that returns a view.
        controller.view_creator['WaveformUMAPView'] = create_view
Example #21
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_negative_learning_rate(nn_data):
    """Fitting must raise ValueError when ``learning_rate`` is -1.5."""
    assert_raises(ValueError, UMAP(learning_rate=-1.5).fit, nn_data)
Example #22
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_too_many_neighbors_warns(nn_data):
    """Fit with n_neighbors=2000 on only 100 rows and check the explicit a/b values are kept.

    Despite the name, no assertion is made here about a warning being emitted.
    """
    model = UMAP(a=1.2, b=1.75, n_neighbors=2000, n_epochs=11, init="random")
    first_hundred = nn_data[:100,]
    model.fit(first_hundred)
    for attribute, expected in (("_a", 1.2), ("_b", 1.75)):
        assert_equal(getattr(model, attribute), expected)
Example #23
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_negative_n_neighbours(nn_data):
    """Fitting must raise ValueError when ``n_neighbors`` is -1."""
    assert_raises(ValueError, UMAP(n_neighbors=-1).fit, nn_data)
Example #24
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_too_small_n_neighbours(nn_data):
    """Fitting must raise ValueError when ``n_neighbors`` is 0.5."""
    assert_raises(ValueError, UMAP(n_neighbors=0.5).fit, nn_data)
Example #25
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_non_integer_n_components(nn_data):
    """Fitting must raise ValueError when ``n_components`` is the non-integer 1.5."""
    assert_raises(ValueError, UMAP(n_components=1.5).fit, nn_data)
Example #26
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_negative_min_dist(nn_data):
    """Fitting must raise ValueError when ``min_dist`` is -1."""
    assert_raises(ValueError, UMAP(min_dist=-1).fit, nn_data)
Example #27
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_bad_too_large_min_dist(nn_data):
    """Fitting must raise ValueError when ``min_dist`` is 2.0."""
    model = UMAP(min_dist=2.0)
    with warnings.catch_warnings():
        # A RuntimeWarning about division by zero in the a, b curve fitting is
        # expected here; it is silenced so only the ValueError is checked.
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        assert_raises(ValueError, model.fit, nn_data)
Example #28
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_too_large_op(nn_data):
    """Fitting must raise ValueError when ``set_op_mix_ratio`` is 1.5."""
    assert_raises(ValueError, UMAP(set_op_mix_ratio=1.5).fit, nn_data)
Example #29
Source File: test_umap_validation_params.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_umap_negative_op(nn_data):
    """Fitting must raise ValueError when ``set_op_mix_ratio`` is -1.0."""
    assert_raises(ValueError, UMAP(set_op_mix_ratio=-1.0).fit, nn_data)
Example #30
Source File: test_umap_repeated_data.py    From umap with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_repeated_points_large_n(repetition_dense):
    """With unique=True and n_neighbors=5, fitting on repetition_dense must leave ``_n_neighbors`` at 3."""
    reducer = UMAP(n_neighbors=5, unique=True)
    model = reducer.fit(repetition_dense)
    assert_equal(model._n_neighbors, 3)