Python gensim.models.FastText() Examples

The following are 21 code examples of gensim.models.FastText(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models, or try the search function.
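Before the project examples, here is a minimal self-contained sketch of the typical call pattern. The toy corpus is made up, and the parameter names follow the gensim 3.x API used throughout the examples below (in gensim 4, size and iter were renamed to vector_size and epochs):

from gensim.models import FastText

# Toy corpus: a list of tokenized sentences (hypothetical data).
sentences = [["hello", "world"], ["machine", "learning", "with", "gensim"]]

# Train a small model; size is the embedding dimensionality, iter the number of epochs.
model = FastText(sentences, size=50, min_count=1, iter=5)

# Thanks to subword n-grams, FastText can also embed out-of-vocabulary words.
in_vocab_vector = model.wv["hello"]
oov_vector = model.wv["helloo"]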
Example #1
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 6 votes
def test_check_pre_train_statistics(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)

        for v in se.wv.vocab:
            se.wv.vocab[v].count = 1

        # Just emits multiple warnings
        se._check_pre_training_sanity(1, 1, 1)

        with self.assertRaises(ValueError):
            se._check_pre_training_sanity(0, 1, 1)
        with self.assertRaises(ValueError):
            se._check_pre_training_sanity(1, 0, 1)
        with self.assertRaises(ValueError):
            se._check_pre_training_sanity(1, 1, 0) 
Example #2
Source File: entity_discoverer.py    From HarvestText with MIT License 6 votes
def train_emb(self, sent_words, word2id, id2word, emb_dim, min_count, ft_iters, use_subword, min_n, max_n):
        """因为fasttext的词频筛选策略(>=5),word2id和id2word会发生改变,但是要保持按照词频的排序

        :return: emb_mat, word2id, id2word
            - emb_mat: np.array [num_entities, emb_dim]
            - word2id
            - id2word
        """
        print("Training fasttext")
        model = FastText(sent_words, size=emb_dim, min_count=min_count,
                         iter=ft_iters, word_ngrams=int(use_subword), min_n=min_n, max_n=max_n)
        id2word = [wd for wd in id2word if wd in model.wv.vocab]
        word2id = {wd: i for (i, wd) in enumerate(id2word)}
        emb_mat = np.zeros((len(id2word), emb_dim))
        for i, wd in enumerate(id2word):
            emb_mat[i, :] = model.wv[wd]

        return emb_mat, word2id, id2word

    # clustering 
Example #3
Source File: pre_train.py    From embeddings with Apache License 2.0 6 votes
def train_fasttext(input_file, output_file, skipgram, loss, size, epochs):
    """
    Takes the input file, the output file, and the model
    hyperparameters as arguments, trains the model accordingly,
    and saves it at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output path to save the model.
    skipgram : Training architecture (0 - CBOW, 1 - Skipgram)
    loss : Loss function (0 - Negative Sampling, 1 - Hierarchical Softmax)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)

    model = FastText(sentence, sg=skipgram, hs=loss, size=size,
                     alpha=0.05, window=5, min_count=5, min_n=2,
                     max_n=5, workers=3, iter=epochs)

    model.save(output_file) 
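As a usage note, a hypothetical invocation of the function above (both file paths are placeholders, not taken from the project):

train_fasttext("wiki_preprocessed.txt", "fasttext.model",
               skipgram=1, loss=0, size=300, epochs=5)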
Example #4
Source File: test_average.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 6 votes
def test_average_train_np_ft(self):
        ft = FastText(min_count=1, size=DIM)
        ft.build_vocab(SENTENCES)
        m = Average(ft)
        m.prep.prepare_vectors(
            sv=m.sv, total_sentences=len(self.sentences), update=False
        )
        m._pre_train_calls()
        m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32)
        m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams, 2, dtype=np.float32)
        mem = m._get_thread_working_mem()
        output = train_average_np(m, self.sentences, m.sv.vectors, mem)
        self.assertEqual((4, 10), output)
        self.assertTrue((1.0 == m.sv[0]).all())
        self.assertTrue((1.5 == m.sv[2]).all())
        self.assertTrue((2 == m.sv[3]).all())
        # "go" -> [1,1...]
        # oov: "12345" -> (14 hashes * 2) / 14 =  2
        # (2 + 1) / 2 = 1.5 
Example #5
Source File: test_average.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 6 votes
def test_average_train_cy_ft(self):
        ft = FastText(min_count=1, size=DIM)
        ft.build_vocab(SENTENCES)
        m = Average(ft)
        m.prep.prepare_vectors(
            sv=m.sv, total_sentences=len(self.sentences), update=False
        )
        m._pre_train_calls()
        m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32)
        m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams, 2, dtype=np.float32)
        mem = m._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy

        output = train_average_cy(m, self.sentences, m.sv.vectors, mem)
        self.assertEqual((4, 10), output)
        self.assertTrue((1.0 + EPS == m.sv[0]).all())
        self.assertTrue(np.allclose(1.5, m.sv[2]))
        self.assertTrue(np.allclose(2, m.sv[3])) 
Example #6
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 6 votes
def test_map_all_vectors_to_disk(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)

        p = Path("fse/test/test_data/test_emb")
        p_vecs = Path("fse/test/test_data/test_emb_wv.vectors")
        p_ngrams = Path("fse/test/test_data/test_emb_ngrams.vectors")
        p_vocab = Path("fse/test/test_data/test_emb_vocab.vectors")

        se = BaseSentence2VecModel(ft, wv_mapfile_path=str(p))

        self.assertTrue(p_vecs.exists())
        self.assertTrue(p_ngrams.exists())
        self.assertTrue(p_vocab.exists())

        for p in [p_vecs, p_ngrams, p_vocab]:
            p.unlink() 
Example #7
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_init_w_empty_ft_model(self):
        ft = FastText(min_count=1, size=DIM)
        ft.wv.vectors = np.zeros(10)
        ft.wv.vectors_ngrams = None
        with self.assertRaises(RuntimeError):
            BaseSentence2VecModel(ft) 
Example #8
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_check_pre_train_dtypes(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)

        se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float64)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32)

        se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float16)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float32)

        se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float16)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float32)

        se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=int)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32)

        se.word_weights = np.ones(len(se.wv.vocab), dtype=bool)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.word_weights = np.ones(len(se.wv.vocab), dtype=np.float32) 
Example #9
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_check_pre_train_san_incos_len(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.word_weights = np.ones(20)
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1) 
Example #10
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_check_pre_train_san_no_word_weights(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.word_weights = None
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1) 
Example #11
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_check_pre_train_san_no_sv_vecs(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.sv.vectors = None
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1) 
Example #12
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_check_pre_train_san_no_wv_len(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.wv.vectors = []
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1) 
Example #13
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_check_pre_train_san_no_wv(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        se.wv = None
        with self.assertRaises(RuntimeError):
            se._check_pre_training_sanity(1, 1, 1) 
Example #14
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_estimate_memory(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)
        self.assertEqual(2040025124, se.estimate_memory(int(1e8))["Total"]) 
Example #15
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_save_load_with_memmap(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        shape = (1000, 1000)
        ft.wv.vectors = np.zeros(shape, np.float32)

        p = Path("fse/test/test_data/test_emb")
        p_vecs = Path("fse/test/test_data/test_emb_wv.vectors")
        p_ngrams = Path("fse/test/test_data/test_emb_ngrams.vectors")
        p_vocab = Path("fse/test/test_data/test_emb_vocab.vectors")

        p_not_exists = Path("fse/test/test_data/test_emb.wv.vectors.npy")

        se = BaseSentence2VecModel(ft, wv_mapfile_path=str(p))
        self.assertTrue(p_vecs.exists())
        self.assertTrue(p_ngrams.exists())
        self.assertTrue(p_vocab.exists())

        se.save(str(p.absolute()))
        self.assertTrue(p.exists())
        self.assertFalse(p_not_exists.exists())

        se = BaseSentence2VecModel.load(str(p.absolute()))
        self.assertFalse(se.wv.vectors_vocab.flags.writeable)
        self.assertEqual(shape, se.wv.vectors.shape)
        self.assertEqual((2000000, 5), se.wv.vectors_ngrams.shape)

        for p in [p, p_vecs, p_ngrams, p_vocab]:
            p.unlink() 
Example #16
Source File: scdv.py    From redshells with MIT License 5 votes
def __init__(self, documents: List[List[str]], cluster_size: int, sparsity_percentage: float, gaussian_mixture_kwargs: Dict[Any, Any],
                 dictionary: gensim.corpora.Dictionary, w2v: Union[FastText, Word2Vec]) -> None:
        """

        :param documents: documents for training.
        :param cluster_size:  word cluster size.
        :param sparsity_percentage: sparsity percentage. This must be in [0, 1].
        :param gaussian_mixture_kwargs: Arguments to build `sklearn.mixture.GaussianMixture` except cluster_size. Please see `sklearn.mixture.GaussianMixture.__init__` for details.
        :param dictionary: `gensim.corpora.Dictionary`.
        :param w2v: trained word embedding model (`FastText` or `Word2Vec`).
        """
        logger.info('_build_dictionary...')
        self._dictionary = dictionary
        vocabulary_size = len(self._dictionary.token2id)
        embedding_size = w2v.wv.vector_size

        logger.info('_build_word_embeddings...')
        self._word_embeddings = self._build_word_embeddings(self._dictionary, w2v)
        assert self._word_embeddings.shape == (vocabulary_size, embedding_size)

        logger.info('_build_word_cluster_probabilities...')
        self._word_cluster_probabilities = self._build_word_cluster_probabilities(self._word_embeddings, cluster_size, gaussian_mixture_kwargs)
        assert self._word_cluster_probabilities.shape == (vocabulary_size, cluster_size)

        logger.info('_build_idf...')
        self._idf = self._build_idf(self._dictionary)
        assert self._idf.shape == (vocabulary_size, )

        logger.info('_build_word_cluster_vectors...')
        word_cluster_vectors = self._build_word_cluster_vectors(self._word_embeddings, self._word_cluster_probabilities)
        assert word_cluster_vectors.shape == (vocabulary_size, cluster_size, embedding_size)

        logger.info('_build_word_topic_vectors...')
        word_topic_vectors = self._build_word_topic_vectors(self._idf, word_cluster_vectors)
        assert word_topic_vectors.shape == (vocabulary_size, (cluster_size * embedding_size))

        logger.info('_build_sparsity_threshold...')
        self._sparse_threshold = self._build_sparsity_threshold(word_topic_vectors, self._dictionary, documents, sparsity_percentage) 
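To make the shape bookkeeping above concrete, a hypothetical configuration (all numbers are illustrative, not from the project):

# Suppose vocabulary_size = 1000, cluster_size = 10, embedding_size = 200.
# word_embeddings            : (1000, 200)
# word_cluster_probabilities : (1000, 10)
# idf                        : (1000,)
# word_cluster_vectors       : (1000, 10, 200)
# word_topic_vectors         : (1000, 10 * 200) = (1000, 2000)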
Example #17
Source File: test_base_s2v.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_init_w_ft_model_wo_vecs(self):
        ft = FastText(SENTENCES, size=5)
        with self.assertRaises(RuntimeError):
            ft.wv.vectors_vocab = None
            BaseSentence2VecModel(ft)
        with self.assertRaises(RuntimeError):
            ft.wv.vectors_ngrams = None
            BaseSentence2VecModel(ft) 
Example #18
Source File: test_average.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0 5 votes
def test_cy_equal_np_ft_random(self):
        ft = FastText(size=20, min_count=1)
        ft.build_vocab(SENTENCES)

        m1 = Average(ft)
        m1.prep.prepare_vectors(
            sv=m1.sv, total_sentences=len(self.sentences), update=False
        )
        m1._pre_train_calls()

        from fse.models.average_inner import MAX_NGRAMS_IN_BATCH

        m1.batch_ngrams = MAX_NGRAMS_IN_BATCH
        mem1 = m1._get_thread_working_mem()
        o1 = train_average_np(m1, self.sentences[:2], m1.sv.vectors, mem1)

        m2 = Average(ft)
        m2.prep.prepare_vectors(
            sv=m2.sv, total_sentences=len(self.sentences), update=False
        )
        m2._pre_train_calls()
        mem2 = m2._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy

        o2 = train_average_cy(m2, self.sentences[:2], m2.sv.vectors, mem2)

        self.assertEqual(o1, o2)
        self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6)) 
Example #19
Source File: embedding.py    From fancy-nlp with GNU General Public License v3.0 5 votes
def train_fasttext(corpus: List[List[str]],
                   vocabulary: Dict[str, int],
                   zero_init_indices: Union[int, List[int]] = 0,
                   rand_init_indices: Union[int, List[int]] = 1,
                   embedding_dim: int = 300) -> np.ndarray:
    """Use fasttext to train on corpus to obtain embedding

    Args:
        corpus: List of List of str. List of tokenized texts, the corpus to train on, like ``[['我',
            '是', '中', '国', '人'], ...]``.
        vocabulary: Dict[str, int]. A mapping of words to indices.
        zero_init_indices: int or a List of int. The indices which use zero initialization. These
            indices usually represent the padding token.
        rand_init_indices: int or a List of int. The indices which use random initialization. These
            indices usually represent other special tokens, such as the "unk" token.
        embedding_dim: int. Dimensionality of embedding

    Returns: np.ndarray, a word embedding matrix, shaped [vocab_size, embedding_dim].

    """
    model = FastText(size=embedding_dim, min_count=1, window=5, sg=1, word_ngrams=1)
    model.build_vocab(sentences=corpus)
    model.train(sentences=corpus, total_examples=len(corpus), epochs=10)

    emb = np.zeros(shape=(len(vocabulary), embedding_dim), dtype='float32')

    for w, i in vocabulary.items():
        emb[i, :] = model.wv[w]  # note that oov words can still have word vectors

    if isinstance(zero_init_indices, int):
        zero_init_indices = [zero_init_indices]
    if isinstance(rand_init_indices, int):
        rand_init_indices = [rand_init_indices]
    for idx in zero_init_indices:
        emb[idx] = np.zeros(embedding_dim)
    for idx in rand_init_indices:
        emb[idx] = np.random.normal(0, 0.05, embedding_dim)

    return emb 
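A hypothetical call of this helper, assuming vocabulary indices 0 and 1 are reserved for the padding and "unk" tokens (the corpus and vocabulary below are toy data):

corpus = [["我", "是", "中", "国", "人"]]
vocabulary = {"<pad>": 0, "<unk>": 1, "我": 2, "是": 3, "中": 4, "国": 5, "人": 6}
emb = train_fasttext(corpus, vocabulary,
                     zero_init_indices=0, rand_init_indices=1,
                     embedding_dim=300)
# emb.shape == (7, 300); row 0 is all zeros, row 1 is drawn from N(0, 0.05).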
Example #20
Source File: langmodelfeat.py    From holoclean with Apache License 2.0 5 votes
def specific_setup(self):
        self.name = 'LangModelFeaturizer'
        self.emb_size = 10
        self.all_attrs = self.ds.get_attributes()
        self.attrs_number = len(self.all_attrs)
        self.attr_language_model = {}
        raw_data = self.ds.get_raw_data()
        for attr in self.all_attrs:
            attr_corpus = list(zip(raw_data[attr].tolist()))
            model = FastText(attr_corpus, min_count=1, size=self.emb_size)
            self.attr_language_model[attr] = model 
Example #21
Source File: scdv.py    From redshells with MIT License 5 votes
def _build_word_embeddings(dictionary: gensim.corpora.Dictionary, w2v: Union[FastText, Word2Vec]) -> np.ndarray:
        embeddings = np.zeros((len(dictionary.token2id), w2v.vector_size))
        for token, idx in dictionary.token2id.items():
            if token in w2v.wv:
                embeddings[idx] = w2v.wv[token]
        return sklearn.preprocessing.normalize(embeddings, axis=1, norm='l2')