Python gensim.models.FastText() Examples

The following are 21 code examples of gensim.models.FastText(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models, or try the search function.
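Before the project examples, here is a minimal self-contained sketch of the typical call pattern. The toy corpus is made up, and the parameter names follow the gensim 3.x API used throughout the examples below (in gensim 4, size and iter were renamed to vector_size and epochs):

from gensim.models import FastText

# Toy corpus: a list of tokenized sentences (hypothetical data).
sentences = [["hello", "world"], ["machine", "learning", "with", "gensim"]]

# Train a small model; size is the embedding dimensionality, iter the number of epochs.
model = FastText(sentences, size=50, min_count=1, iter=5)

# Thanks to subword n-grams, FastText can also embed out-of-vocabulary words.
in_vocab_vector = model.wv["hello"]
oov_vector = model.wv["helloo"]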
Example #1
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 6 votes
def test_check_pre_train_statistics(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)

        for v in se.wv.vocab:
            se.wv.vocab[v].count = 1

        # Just emits multiple warnings
        se._check_pre_training_sanity(1, 1, 1)

        with self.assertRaises(ValueError):
            se._check_pre_training_sanity(0, 1, 1)
        with self.assertRaises(ValueError):
            se._check_pre_training_sanity(1, 0, 1)
        with self.assertRaises(ValueError):
            se._check_pre_training_sanity(1, 1, 0) 
Example #2
Source File: entity_discoverer.py    From HarvestText with MIT License 6 votes
def train_emb(self, sent_words, word2id, id2word, emb_dim, min_count, ft_iters, use_subword, min_n, max_n):
        """因为fasttext的词频筛选策略(>=5),word2id和id2word会发生改变,但是要保持按照词频的排序

        :return: emb_mat, word2id, id2word
            - emb_mat: np.array [num_entities, emb_dim]
            - word2id
            - id2word
        """
        print("Training fasttext")
        model = FastText(sent_words, size=emb_dim, min_count=min_count,
                         iter=ft_iters, word_ngrams=int(use_subword), min_n=min_n, max_n=max_n)
        id2word = [wd for wd in id2word if wd in model.wv.vocab]
        word2id = {wd: i for (i, wd) in enumerate(id2word)}
        emb_mat = np.zeros((len(id2word), emb_dim))
        for i, wd in enumerate(id2word):
            emb_mat[i, :] = model.wv[wd]

        return emb_mat, word2id, id2word

    # clustering 
Example #3
Source File: pre_train.py    From embeddings with Apache License 2.0 6 votes
def train_fasttext(input_file, output_file, skipgram, loss, size, epochs):
    """
    Takes the input file, the output file, and the model
    hyperparameters as arguments, trains the model accordingly,
    and saves it at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output path to save the model.
    skipgram : Training architecture (0 - CBOW, 1 - Skipgram)
    loss : Loss function (0 - Negative Sampling, 1 - Hierarchical Softmax)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)

    model = FastText(sentence, sg=skipgram, hs=loss, size=size,
                     alpha=0.05, window=5, min_count=5, min_n=2,
                     max_n=5, workers=3, iter=epochs)

    model.save(output_file) 
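As a usage note, a hypothetical invocation of the function above (both file paths are placeholders, not taken from the project):

train_fasttext("wiki_preprocessed.txt", "fasttext.model",
               skipgram=1, loss=0, size=300, epochs=5)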
Example #4
Source File: test_average.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 6 votes
def test_average_train_np_ft(self):
        ft = FastText(min_count=1, size=DIM)
        ft.build_vocab(SENTENCES)
        m = Average(ft)
        m.prep.prepare_vectors(
            sv=m.sv, total_sentences=len(self.sentences), update=False
        )
        m._pre_train_calls()
        m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32)
        m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams, 2, dtype=np.float32)
        mem = m._get_thread_working_mem()
        output = train_average_np(m, self.sentences, m.sv.vectors, mem)
        self.assertEqual((4, 10), output)
        self.assertTrue((1.0 == m.sv[0]).all())
        self.assertTrue((1.5 == m.sv[2]).all())
        self.assertTrue((2 == m.sv[3]).all())
        # "go" -> [1,1...]
        # oov: "12345" -> (14 hashes * 2) / 14 =  2
        # (2 + 1) / 2 = 1.5 
Example #5
Source File: test_average.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 6 votes
def test_average_train_cy_ft(self):
        ft = FastText(min_count=1, size=DIM)
        ft.build_vocab(SENTENCES)
        m = Average(ft)
        m.prep.prepare_vectors(
            sv=m.sv, total_sentences=len(self.sentences), update=False
        )
        m._pre_train_calls()
        m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32)
        m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams, 2, dtype=np.float32)
        mem = m._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy

        output = train_average_cy(m, self.sentences, m.sv.vectors, mem)
        self.assertEqual((4, 10), output)
        self.assertTrue((1.0 + EPS == m.sv[0]).all())
        self.assertTrue(np.allclose(1.5, m.sv[2]))
        self.assertTrue(np.allclose(2, m.sv[3])) 
Example #6
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 6 votes
def test_map_all_vectors_to_disk(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)

        p = Path("fse/test/test_data/test_emb")
        p_vecs = Path("fse/test/test_data/test_emb_wv.vectors")
        p_ngrams = Path("fse/test/test_data/test_emb_ngrams.vectors")
        p_vocab = Path("fse/test/test_data/test_emb_vocab.vectors")

        se = BaseSentence2VecModel(ft, wv_mapfile_path=str(p))

        self.assertTrue(p_vecs.exists())
        self.assertTrue(p_ngrams.exists())
        self.assertTrue(p_vocab.exists())

        for p in [p_vecs, p_ngrams, p_vocab]:
            p.unlink() 
Example #7
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_init_w_empty_ft_model(self):
        ft = FastText(min_count=1, size=DIM)
        ft.wv.vectors = np.zeros(10)
        ft.wv.vectors_ngrams = None
        with self.assertRaises(RuntimeError):
            BaseSentence2VecModel(ft) 
Example #8
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_check_pre_train_dtypes(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)

        se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float64)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32)

        se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float16)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float32)

        se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float16)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float32)

        se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=int)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32)

        se.word_weights = np.ones(len(se.wv.vocab), dtype=bool)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.word_weights = np.ones(len(se.wv.vocab), dtype=np.float32) 
Example #9
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_check_pre_train_san_incos_len(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.word_weights = np.ones(20)
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1) 
Example #10
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_check_pre_train_san_no_word_weights(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.word_weights = None
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1) 
Example #11
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_check_pre_train_san_no_sv_vecs(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.sv.vectors = None
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1) 
Example #12
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_check_pre_train_san_no_wv_len(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.wv.vectors = []
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1) 
Example #13
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_check_pre_train_san_no_wv(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.wv = None
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1) 
Example #14
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_estimate_memory(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        self.assertEqual(2040025124, se.estimate_memory(int(1e8))["Total"]) 
Example #15
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_save_load_with_memmap(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        shape = (1000, 1000)
        ft.wv.vectors = np.zeros(shape, np.float32)

        p = Path("fse/test/test_data/test_emb")
        p_vecs = Path("fse/test/test_data/test_emb_wv.vectors")
        p_ngrams = Path("fse/test/test_data/test_emb_ngrams.vectors")
        p_vocab = Path("fse/test/test_data/test_emb_vocab.vectors")

        p_not_exists = Path("fse/test/test_data/test_emb.wv.vectors.npy")

        se = BaseSentence2VecModel(ft, wv_mapfile_path=str(p))
        self.assertTrue(p_vecs.exists())
        self.assertTrue(p_ngrams.exists())
        self.assertTrue(p_vocab.exists())

        se.save(str(p.absolute()))
        self.assertTrue(p.exists())
        self.assertFalse(p_not_exists.exists())

        se = BaseSentence2VecModel.load(str(p.absolute()))
        self.assertFalse(se.wv.vectors_vocab.flags.writeable)
        self.assertEqual(shape, se.wv.vectors.shape)
        self.assertEqual((2000000, 5), se.wv.vectors_ngrams.shape)

        for p in [p, p_vecs, p_ngrams, p_vocab]:
            p.unlink() 
Example #16
Source File: scdv.py    From redshells with MIT License 5 votes
def __init__(self, documents: List[List[str]], cluster_size: int, sparsity_percentage: float, gaussian_mixture_kwargs: Dict[Any, Any],
                 dictionary: gensim.corpora.Dictionary, w2v: Union[FastText, Word2Vec]) -> None:
        """

        :param documents: documents for training.
        :param cluster_size:  word cluster size.
        :param sparsity_percentage: sparsity percentage. This must be in [0, 1].
        :param gaussian_mixture_kwargs: Arguments to build `sklearn.mixture.GaussianMixture` except cluster_size. Please see `sklearn.mixture.GaussianMixture.__init__` for details.
        :param dictionary: `gensim.corpora.Dictionary`.
        :param w2v: trained word embedding model (`FastText` or `Word2Vec`).
        """
        logger.info('_build_dictionary...')
        self._dictionary = dictionary
        vocabulary_size = len(self._dictionary.token2id)
        embedding_size = w2v.wv.vector_size

        logger.info('_build_word_embeddings...')
        self._word_embeddings = self._build_word_embeddings(self._dictionary, w2v)
        assert self._word_embeddings.shape == (vocabulary_size, embedding_size)

        logger.info('_build_word_cluster_probabilities...')
        self._word_cluster_probabilities = self._build_word_cluster_probabilities(self._word_embeddings, cluster_size, gaussian_mixture_kwargs)
        assert self._word_cluster_probabilities.shape == (vocabulary_size, cluster_size)

        logger.info('_build_idf...')
        self._idf = self._build_idf(self._dictionary)
        assert self._idf.shape == (vocabulary_size, )

        logger.info('_build_word_cluster_vectors...')
        word_cluster_vectors = self._build_word_cluster_vectors(self._word_embeddings, self._word_cluster_probabilities)
        assert word_cluster_vectors.shape == (vocabulary_size, cluster_size, embedding_size)

        logger.info('_build_word_topic_vectors...')
        word_topic_vectors = self._build_word_topic_vectors(self._idf, word_cluster_vectors)
        assert word_topic_vectors.shape == (vocabulary_size, (cluster_size * embedding_size))

        logger.info('_build_sparsity_threshold...')
        self._sparse_threshold = self._build_sparsity_threshold(word_topic_vectors, self._dictionary, documents, sparsity_percentage) 
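To make the shape bookkeeping above concrete, a hypothetical configuration (all numbers are illustrative, not from the project):

# Suppose vocabulary_size = 1000, cluster_size = 10, embedding_size = 200.
# word_embeddings            : (1000, 200)
# word_cluster_probabilities : (1000, 10)
# idf                        : (1000,)
# word_cluster_vectors       : (1000, 10, 200)
# word_topic_vectors         : (1000, 10 * 200) = (1000, 2000)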
Example #17
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_init_w_ft_model_wo_vecs(self):
        ft = FastText(SENTENCES, size=5)
        with self.assertRaises(RuntimeError):
            ft.wv.vectors_vocab = None
            BaseSentence2VecModel(ft)
        with self.assertRaises(RuntimeError):
            ft.wv.vectors_ngrams = None
            BaseSentence2VecModel(ft) 
Example #18
Source File: test_average.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_cy_equal_np_ft_random(self):
        ft = FastText(size=20, min_count=1)
        ft.build_vocab(SENTENCES)

        m1 = Average(ft)
        m1.prep.prepare_vectors(
            sv=m1.sv, total_sentences=len(self.sentences), update=False
        )
        m1._pre_train_calls()

        from fse.models.average_inner import MAX_NGRAMS_IN_BATCH

        m1.batch_ngrams = MAX_NGRAMS_IN_BATCH
        mem1 = m1._get_thread_working_mem()
        o1 = train_average_np(m1, self.sentences[:2], m1.sv.vectors, mem1)

        m2 = Average(ft)
        m2.prep.prepare_vectors(
            sv=m2.sv, total_sentences=len(self.sentences), update=False
        )
        m2._pre_train_calls()
        mem2 = m2._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy

        o2 = train_average_cy(m2, self.sentences[:2], m2.sv.vectors, mem2)

        self.assertEqual(o1, o2)
        self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6)) 
Example #19
Source File: embedding.py    From fancy-nlp with GNU General Public License v3.0 5 votes
def train_fasttext(corpus: List[List[str]],
                   vocabulary: Dict[str, int],
                   zero_init_indices: Union[int, List[int]] = 0,
                   rand_init_indices: Union[int, List[int]] = 1,
                   embedding_dim: int = 300) -> np.ndarray:
    """Use fasttext to train on corpus to obtain embedding

    Args:
        corpus: List of List of str. List of tokenized texts, the corpus to train on, like ``[['我',
            '是', '中', '国', '人'], ...]``.
        vocabulary: Dict[str, int]. A mapping of words to indices.
        zero_init_indices: int or a List of int. The indices which use zero initialization. These
            indices usually represent the padding token.
        rand_init_indices: int or a List of int. The indices which use random initialization. These
            indices usually represent other special tokens, such as the "unk" token.
        embedding_dim: int. Dimensionality of embedding

    Returns: np.ndarray, a word embedding matrix, shaped [vocab_size, embedding_dim].

    """
    model = FastText(size=embedding_dim, min_count=1, window=5, sg=1, word_ngrams=1)
    model.build_vocab(sentences=corpus)
    model.train(sentences=corpus, total_examples=len(corpus), epochs=10)

    emb = np.zeros(shape=(len(vocabulary), embedding_dim), dtype='float32')

    for w, i in vocabulary.items():
        emb[i, :] = model.wv[w]  # note that oov words can still have word vectors

    if isinstance(zero_init_indices, int):
        zero_init_indices = [zero_init_indices]
    if isinstance(rand_init_indices, int):
        rand_init_indices = [rand_init_indices]
    for idx in zero_init_indices:
        emb[idx] = np.zeros(embedding_dim)
    for idx in rand_init_indices:
        emb[idx] = np.random.normal(0, 0.05, embedding_dim)

    return emb 
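A hypothetical call of this helper, assuming vocabulary indices 0 and 1 are reserved for the padding and "unk" tokens (the corpus and vocabulary below are toy data):

corpus = [["我", "是", "中", "国", "人"]]
vocabulary = {"<pad>": 0, "<unk>": 1, "我": 2, "是": 3, "中": 4, "国": 5, "人": 6}
emb = train_fasttext(corpus, vocabulary,
                     zero_init_indices=0, rand_init_indices=1,
                     embedding_dim=300)
# emb.shape == (7, 300); row 0 is all zeros, row 1 is drawn from N(0, 0.05).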
Example #20
Source File: langmodelfeat.py    From holoclean with Apache License 2.0 5 votes
def specific_setup(self):
        self.name = 'LangModelFeaturizer'
        self.emb_size = 10
        self.all_attrs = self.ds.get_attributes()
        self.attrs_number = len(self.all_attrs)
        self.attr_language_model = {}
        raw_data = self.ds.get_raw_data()
        for attr in self.all_attrs:
            attr_corpus = list(zip(raw_data[attr].tolist()))
            model = FastText(attr_corpus, min_count=1, size=self.emb_size)
            self.attr_language_model[attr] = model 
Example #21
Source File: scdv.py    From redshells with MIT License 5 votes
def _build_word_embeddings(dictionary: gensim.corpora.Dictionary, w2v: Union[FastText, Word2Vec]) -> np.ndarray:
        embeddings = np.zeros((len(dictionary.token2id), w2v.vector_size))
        for token, idx in dictionary.token2id.items():
            if token in w2v.wv:
                embeddings[idx] = w2v.wv[token]
        return sklearn.preprocessing.normalize(embeddings, axis=1, norm='l2')