Python gensim.models.FastText() Examples
The following are 21 code examples of gensim.models.FastText(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the gensim.models module.
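Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the basic gensim FastText workflow these snippets build on. It assumes the gensim 3.x-style API used throughout this page (size=, iter=); gensim 4.x renames these to vector_size and epochs.

from gensim.models import FastText

sentences = [["hello", "world"], ["fasttext", "learns", "subword", "vectors"]]

# Build and train a small model (size -> vector_size in gensim 4.x)
model = FastText(size=100, window=3, min_count=1)
model.build_vocab(sentences=sentences)
model.train(sentences=sentences, total_examples=model.corpus_count, epochs=5)

print(model.wv["world"])   # vector for an in-vocabulary word
print(model.wv["worlds"])  # subword n-grams also yield vectors for out-of-vocabulary words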
Example #1
Source File: test_base_s2v.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 6 votes |
def test_check_pre_train_statistics(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)
    se = BaseSentence2VecModel(ft)

    for v in se.wv.vocab:
        se.wv.vocab[v].count = 1

    # Just throws multiple warnings
    se._check_pre_training_sanity(1, 1, 1)

    with self.assertRaises(ValueError):
        se._check_pre_training_sanity(0, 1, 1)
    with self.assertRaises(ValueError):
        se._check_pre_training_sanity(1, 0, 1)
    with self.assertRaises(ValueError):
        se._check_pre_training_sanity(1, 1, 0)
Example #2
Source File: entity_discoverer.py From HarvestText with MIT License | 6 votes |
def train_emb(self, sent_words, word2id, id2word, emb_dim, min_count, ft_iters, use_subword, min_n, max_n):
    """Because of fasttext's word-frequency filtering strategy (>=5), word2id and id2word
    will change, but the ordering by word frequency must be preserved.

    :return: emb_mat, word2id, id2word
        - emb_mat: np.array [num_entities, emb_dim]
        - word2id
        - id2word
    """
    print("Training fasttext")
    model = FastText(sent_words, size=emb_dim, min_count=min_count, iter=ft_iters,
                     word_ngrams=int(use_subword), min_n=min_n, max_n=max_n)
    id2word = [wd for wd in id2word if wd in model.wv.vocab]
    word2id = {wd: i for (i, wd) in enumerate(id2word)}
    emb_mat = np.zeros((len(id2word), emb_dim))
    for i, wd in enumerate(id2word):
        emb_mat[i, :] = model.wv[wd]
    return emb_mat, word2id, id2word

# clustering
Example #3
Source File: pre_train.py From embeddings with Apache License 2.0 | 6 votes |
def train_fasttext(input_file, output_file, skipgram, loss, size, epochs):
    """
    train_fasttext(**args) -> Takes the input file, the output file and the model
    hyperparameters as arguments and trains the model accordingly. The model is
    saved at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output directory to save the model.
    skipgram : Training algorithm (0 - CBOW, 1 - Skipgram)
    loss : Loss function (0 - Negative Sampling, 1 - Hierarchical Softmax)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)
    model = FastText(sentence, sg=skipgram, hs=loss, size=size, alpha=0.05, window=5,
                     min_count=5, min_n=2, max_n=5, workers=3, iter=epochs)
    model.save(output_file)
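For reference, a hedged usage sketch of the function above; the file paths are hypothetical placeholders, not part of the original project.

# Train a skip-gram FastText model with hierarchical softmax on a pre-processed wiki dump.
train_fasttext("data/wiki_preprocessed.txt", "models/fasttext_wiki.model",
               skipgram=1, loss=1, size=300, epochs=5)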
Example #4
Source File: test_average.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 6 votes |
def test_average_train_np_ft(self):
    ft = FastText(min_count=1, size=DIM)
    ft.build_vocab(SENTENCES)
    m = Average(ft)
    m.prep.prepare_vectors(
        sv=m.sv, total_sentences=len(self.sentences), update=False
    )
    m._pre_train_calls()
    m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32)
    m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams, 2, dtype=np.float32)
    mem = m._get_thread_working_mem()
    output = train_average_np(m, self.sentences, m.sv.vectors, mem)
    self.assertEqual((4, 10), output)
    self.assertTrue((1.0 == m.sv[0]).all())
    self.assertTrue((1.5 == m.sv[2]).all())
    self.assertTrue((2 == m.sv[3]).all())
    # "go" -> [1, 1, ...]
    # oov: "12345" -> (14 hashes * 2) / 14 = 2
    # (2 + 1) / 2 = 1.5
Example #5
Source File: test_average.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 6 votes |
def test_average_train_cy_ft(self):
    ft = FastText(min_count=1, size=DIM)
    ft.build_vocab(SENTENCES)
    m = Average(ft)
    m.prep.prepare_vectors(
        sv=m.sv, total_sentences=len(self.sentences), update=False
    )
    m._pre_train_calls()
    m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32)
    m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams, 2, dtype=np.float32)
    mem = m._get_thread_working_mem()

    from fse.models.average_inner import train_average_cy

    output = train_average_cy(m, self.sentences, m.sv.vectors, mem)
    self.assertEqual((4, 10), output)
    self.assertTrue((1.0 + EPS == m.sv[0]).all())
    self.assertTrue(np.allclose(1.5, m.sv[2]))
    self.assertTrue(np.allclose(2, m.sv[3]))
Example #6
Source File: test_base_s2v.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 6 votes |
def test_map_all_vectors_to_disk(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)

    p = Path("fse/test/test_data/test_emb")
    p_vecs = Path("fse/test/test_data/test_emb_wv.vectors")
    p_ngrams = Path("fse/test/test_data/test_emb_ngrams.vectors")
    p_vocab = Path("fse/test/test_data/test_emb_vocab.vectors")

    se = BaseSentence2VecModel(ft, wv_mapfile_path=str(p))

    self.assertTrue(p_vecs.exists())
    self.assertTrue(p_ngrams.exists())
    self.assertTrue(p_vocab.exists())

    for p in [p_vecs, p_ngrams, p_vocab]:
        p.unlink()
Example #7
Source File: test_base_s2v.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 5 votes |
def test_init_w_empty_ft_model(self):
    ft = FastText(min_count=1, size=DIM)
    ft.wv.vectors = np.zeros(10)
    ft.wv.vectors_ngrams = None
    with self.assertRaises(RuntimeError):
        BaseSentence2VecModel(ft)
Example #8
Source File: test_base_s2v.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 5 votes |
def test_check_pre_train_dtypes(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)
    se = BaseSentence2VecModel(ft)

    se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float64)
    with self.assertRaises(TypeError):
        se._check_pre_training_sanity(1, 1, 1)
    se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32)

    se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float16)
    with self.assertRaises(TypeError):
        se._check_pre_training_sanity(1, 1, 1)
    se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float32)

    se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float16)
    with self.assertRaises(TypeError):
        se._check_pre_training_sanity(1, 1, 1)
    se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float32)

    se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=int)
    with self.assertRaises(TypeError):
        se._check_pre_training_sanity(1, 1, 1)
    se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32)

    se.word_weights = np.ones(len(se.wv.vocab), dtype=bool)
    with self.assertRaises(TypeError):
        se._check_pre_training_sanity(1, 1, 1)
    se.word_weights = np.ones(len(se.wv.vocab), dtype=np.float32)
Example #9
Source File: test_base_s2v.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 5 votes |
def test_check_pre_train_san_incos_len(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)
    se = BaseSentence2VecModel(ft)
    se.word_weights = np.ones(20)
    with self.assertRaises(RuntimeError):
        se._check_pre_training_sanity(1, 1, 1)
Example #10
Source File: test_base_s2v.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 5 votes |
def test_check_pre_train_san_no_word_weights(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)
    se = BaseSentence2VecModel(ft)
    se.word_weights = None
    with self.assertRaises(RuntimeError):
        se._check_pre_training_sanity(1, 1, 1)
Example #11
Source File: test_base_s2v.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 5 votes |
def test_check_pre_train_san_no_sv_vecs(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)
    se = BaseSentence2VecModel(ft)
    se.sv.vectors = None
    with self.assertRaises(RuntimeError):
        se._check_pre_training_sanity(1, 1, 1)
Example #12
Source File: test_base_s2v.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 5 votes |
def test_check_pre_train_san_no_wv_len(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)
    se = BaseSentence2VecModel(ft)
    se.wv.vectors = []
    with self.assertRaises(RuntimeError):
        se._check_pre_training_sanity(1, 1, 1)
Example #13
Source File: test_base_s2v.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 5 votes |
def test_check_pre_train_san_no_wv(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)
    se = BaseSentence2VecModel(ft)
    se.wv = None
    with self.assertRaises(RuntimeError):
        se._check_pre_training_sanity(1, 1, 1)
Example #14
Source File: test_base_s2v.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 5 votes |
def test_estimate_memory(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)
    se = BaseSentence2VecModel(ft)
    self.assertEqual(2040025124, se.estimate_memory(int(1e8))["Total"])
Example #15
Source File: test_base_s2v.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 5 votes |
def test_save_load_with_memmap(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)
    shape = (1000, 1000)
    ft.wv.vectors = np.zeros(shape, np.float32)

    p = Path("fse/test/test_data/test_emb")
    p_vecs = Path("fse/test/test_data/test_emb_wv.vectors")
    p_ngrams = Path("fse/test/test_data/test_emb_ngrams.vectors")
    p_vocab = Path("fse/test/test_data/test_emb_vocab.vectors")
    p_not_exists = Path("fse/test/test_data/test_emb.wv.vectors.npy")

    se = BaseSentence2VecModel(ft, wv_mapfile_path=str(p))
    self.assertTrue(p_vecs.exists())
    self.assertTrue(p_ngrams.exists())
    self.assertTrue(p_vocab.exists())

    se.save(str(p.absolute()))
    self.assertTrue(p.exists())
    self.assertFalse(p_not_exists.exists())

    se = BaseSentence2VecModel.load(str(p.absolute()))
    self.assertFalse(se.wv.vectors_vocab.flags.writeable)
    self.assertEqual(shape, se.wv.vectors.shape)
    self.assertEqual((2000000, 5), se.wv.vectors_ngrams.shape)

    for p in [p, p_vecs, p_ngrams, p_vocab]:
        p.unlink()
Example #16
Source File: scdv.py From redshells with MIT License | 5 votes |
def __init__(self, documents: List[List[str]], cluster_size: int, sparsity_percentage: float,
             gaussian_mixture_kwargs: Dict[Any, Any], dictionary: gensim.corpora.Dictionary,
             w2v: Union[FastText, Word2Vec]) -> None:
    """
    :param documents: documents for training.
    :param cluster_size: word cluster size.
    :param sparsity_percentage: sparsity percentage. This must be in [0, 1].
    :param gaussian_mixture_kwargs: Arguments to build `sklearn.mixture.GaussianMixture` except cluster_size.
        Please see `sklearn.mixture.GaussianMixture.__init__` for details.
    :param dictionary: `gensim.corpora.Dictionary`.
    """
    logger.info('_build_dictionary...')
    self._dictionary = dictionary
    vocabulary_size = len(self._dictionary.token2id)
    embedding_size = w2v.wv.vector_size

    logger.info('_build_word_embeddings...')
    self._word_embeddings = self._build_word_embeddings(self._dictionary, w2v)
    assert self._word_embeddings.shape == (vocabulary_size, embedding_size)

    logger.info('_build_word_cluster_probabilities...')
    self._word_cluster_probabilities = self._build_word_cluster_probabilities(
        self._word_embeddings, cluster_size, gaussian_mixture_kwargs)
    assert self._word_cluster_probabilities.shape == (vocabulary_size, cluster_size)

    logger.info('_build_idf...')
    self._idf = self._build_idf(self._dictionary)
    assert self._idf.shape == (vocabulary_size, )

    logger.info('_build_word_cluster_vectors...')
    word_cluster_vectors = self._build_word_cluster_vectors(self._word_embeddings, self._word_cluster_probabilities)
    assert word_cluster_vectors.shape == (vocabulary_size, cluster_size, embedding_size)

    logger.info('_build_word_topic_vectors...')
    word_topic_vectors = self._build_word_topic_vectors(self._idf, word_cluster_vectors)
    assert word_topic_vectors.shape == (vocabulary_size, (cluster_size * embedding_size))

    logger.info('_build_sparsity_threshold...')
    self._sparse_threshold = self._build_sparsity_threshold(word_topic_vectors, self._dictionary, documents,
                                                            sparsity_percentage)
Example #17
Source File: test_base_s2v.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 5 votes |
def test_init_w_ft_model_wo_vecs(self):
    ft = FastText(SENTENCES, size=5)
    with self.assertRaises(RuntimeError):
        ft.wv.vectors_vocab = None
        BaseSentence2VecModel(ft)
    with self.assertRaises(RuntimeError):
        ft.wv.vectors_ngrams = None
        BaseSentence2VecModel(ft)
Example #18
Source File: test_average.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 5 votes |
def test_cy_equal_np_ft_random(self):
    ft = FastText(size=20, min_count=1)
    ft.build_vocab(SENTENCES)

    m1 = Average(ft)
    m1.prep.prepare_vectors(
        sv=m1.sv, total_sentences=len(self.sentences), update=False
    )
    m1._pre_train_calls()

    from fse.models.average_inner import MAX_NGRAMS_IN_BATCH

    m1.batch_ngrams = MAX_NGRAMS_IN_BATCH
    mem1 = m1._get_thread_working_mem()
    o1 = train_average_np(m1, self.sentences[:2], m1.sv.vectors, mem1)

    m2 = Average(ft)
    m2.prep.prepare_vectors(
        sv=m2.sv, total_sentences=len(self.sentences), update=False
    )
    m2._pre_train_calls()
    mem2 = m2._get_thread_working_mem()

    from fse.models.average_inner import train_average_cy

    o2 = train_average_cy(m2, self.sentences[:2], m2.sv.vectors, mem2)

    self.assertEqual(o1, o2)
    self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6))
Example #19
Source File: embedding.py From fancy-nlp with GNU General Public License v3.0 | 5 votes |
def train_fasttext(corpus: List[List[str]],
                   vocabulary: Dict[str, int],
                   zero_init_indices: Union[int, List[int]] = 0,
                   rand_init_indices: Union[int, List[int]] = 1,
                   embedding_dim: int = 300) -> np.ndarray:
    """Use fasttext to train on corpus to obtain embedding

    Args:
        corpus: List of List of str. List of tokenized texts, the corpus to train on,
            like ``[['我', '是', '中', '国', '人'], ...]``.
        vocabulary: Dict[str, int]. A mapping of words to indices.
        zero_init_indices: int or a List of int. The indices which use zero-initialization.
            These indices usually represent the padding token.
        rand_init_indices: int or a List of int. The indices which use random initialization.
            These indices usually represent other special tokens, such as the "unk" token.
        embedding_dim: int. Dimensionality of embedding.

    Returns:
        np.ndarray, a word embedding matrix, shaped [vocab_size, embedding_dim].
    """
    model = FastText(size=embedding_dim, min_count=1, window=5, sg=1, word_ngrams=1)
    model.build_vocab(sentences=corpus)
    model.train(sentences=corpus, total_examples=len(corpus), epochs=10)

    emb = np.zeros(shape=(len(vocabulary), embedding_dim), dtype='float32')
    for w, i in vocabulary.items():
        emb[i, :] = model.wv[w]  # note that oov words can still have word vectors

    if isinstance(zero_init_indices, int):
        zero_init_indices = [zero_init_indices]
    if isinstance(rand_init_indices, int):
        rand_init_indices = [rand_init_indices]
    for idx in zero_init_indices:
        emb[idx] = np.zeros(embedding_dim)
    for idx in rand_init_indices:
        emb[idx] = np.random.normal(0, 0.05, embedding_dim)
    return emb
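A small usage sketch of the function above (not from the original project); the tiny corpus and vocabulary are illustrative only, with index 0 reserved for padding and index 1 for the unknown token, matching the function's defaults.

corpus = [["hello", "world"], ["hello", "fasttext"]]
vocabulary = {"<pad>": 0, "<unk>": 1, "hello": 2, "world": 3, "fasttext": 4}

emb_matrix = train_fasttext(corpus, vocabulary, embedding_dim=50)
print(emb_matrix.shape)  # (5, 50)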
Example #20
Source File: langmodelfeat.py From holoclean with Apache License 2.0 | 5 votes |
def specific_setup(self):
    self.name = 'LangModelFeaturizer'
    self.emb_size = 10
    self.all_attrs = self.ds.get_attributes()
    self.attrs_number = len(self.all_attrs)
    self.attr_language_model = {}
    raw_data = self.ds.get_raw_data()
    for attr in self.all_attrs:
        attr_corpus = list(zip(raw_data[attr].tolist()))
        model = FastText(attr_corpus, min_count=1, size=self.emb_size)
        self.attr_language_model[attr] = model
Example #21
Source File: scdv.py From redshells with MIT License | 5 votes |
def _build_word_embeddings(dictionary: gensim.corpora.Dictionary, w2v: Union[FastText, Word2Vec]) -> np.ndarray:
    embeddings = np.zeros((len(dictionary.token2id), w2v.vector_size))
    for token, idx in dictionary.token2id.items():
        if token in w2v.wv:
            embeddings[idx] = w2v.wv[token]
    return sklearn.preprocessing.normalize(embeddings, axis=1, norm='l2')