Python gensim.corpora.TextCorpus() Examples

The following are 19 code examples of gensim.corpora.TextCorpus(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.corpora, or try the search function.
Example #1
Source File: test_miislita.py    From topical_word_embeddings with MIT License 6 votes vote down vote up
def test_save_load_ability(self):
        """
        Check that a TextCorpus built from a file path survives a save/load
        (pickle) round trip. File-like inputs are out of scope here because
        they cannot be pickled.
        """
        # build the corpus from the on-disk fixture
        original = CorpusMiislita(datapath('miIslita.cor'))

        # write the pickle to a temporary file, then read it straight back
        pickle_path = get_tmpfile('tc_test.cpickle')
        original.save(pickle_path)
        restored = CorpusMiislita.load(pickle_path)

        # the document count and the token-to-id mapping must both match
        self.assertEqual(len(original), len(restored))
        self.assertEqual(original.dictionary.token2id, restored.dictionary.token2id)
Example #2
Source File: test_miislita.py    From topical_word_embeddings with MIT License 6 votes vote down vote up
def test_save_load_ability(self):
        """
        Check that a TextCorpus built from a file path survives a save/load
        (pickle) round trip. File-like inputs are out of scope here because
        they cannot be pickled.
        """
        # build the corpus from the on-disk fixture
        original = CorpusMiislita(datapath('miIslita.cor'))

        # write the pickle to a temporary file, then read it straight back
        pickle_path = get_tmpfile('tc_test.cpickle')
        original.save(pickle_path)
        restored = CorpusMiislita.load(pickle_path)

        # the document count and the token-to-id mapping must both match
        self.assertEqual(len(original), len(restored))
        self.assertEqual(original.dictionary.token2id, restored.dictionary.token2id)
Example #3
Source File: test_miislita.py    From topical_word_embeddings with MIT License 6 votes vote down vote up
def test_save_load_ability(self):
        """
        Check that a TextCorpus built from a file path survives a save/load
        (pickle) round trip. File-like inputs are out of scope here because
        they cannot be pickled.
        """
        # build the corpus from the on-disk fixture
        original = CorpusMiislita(datapath('miIslita.cor'))

        # write the pickle to a temporary file, then read it straight back
        pickle_path = get_tmpfile('tc_test.cpickle')
        original.save(pickle_path)
        restored = CorpusMiislita.load(pickle_path)

        # the document count and the token-to-id mapping must both match
        self.assertEqual(len(original), len(restored))
        self.assertEqual(original.dictionary.token2id, restored.dictionary.token2id)
Example #4
Source File: test_miislita.py    From topical_word_embeddings with MIT License 6 votes vote down vote up
def test_save_load_ability(self):
        """
        Check that a TextCorpus built from a file path survives a save/load
        (pickle) round trip. File-like inputs are out of scope here because
        they cannot be pickled.
        """
        # build the corpus from the on-disk fixture
        original = CorpusMiislita(datapath('miIslita.cor'))

        # write the pickle to a temporary file, then read it straight back
        pickle_path = get_tmpfile('tc_test.cpickle')
        original.save(pickle_path)
        restored = CorpusMiislita.load(pickle_path)

        # the document count and the token-to-id mapping must both match
        self.assertEqual(len(original), len(restored))
        self.assertEqual(original.dictionary.token2id, restored.dictionary.token2id)
Example #5
Source File: test_miislita.py    From topical_word_embeddings with MIT License 5 votes vote down vote up
def test_textcorpus(self):
        """Verify that a TextCorpus written with MmCorpus.save_corpus reads back unchanged."""
        # build the corpus from the fixture file
        source_corpus = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))

        # serializing must leave a file at the temporary path
        mm_path = get_tmpfile('test_textcorpus.mm')
        corpora.MmCorpus.save_corpus(mm_path, source_corpus)
        self.assertTrue(os.path.exists(mm_path))

        # reading the file back must yield the same sequence of documents
        reloaded = corpora.MmCorpus(mm_path)
        self.assertEqual(list(source_corpus), list(reloaded))
Example #6
Source File: test_miislita.py    From topical_word_embeddings with MIT License 5 votes vote down vote up
def test_textcorpus(self):
        """Verify that a TextCorpus written with MmCorpus.save_corpus reads back unchanged."""
        # build the corpus from the fixture file
        source_corpus = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))

        # serializing must leave a file at the temporary path
        mm_path = get_tmpfile('test_textcorpus.mm')
        corpora.MmCorpus.save_corpus(mm_path, source_corpus)
        self.assertTrue(os.path.exists(mm_path))

        # reading the file back must yield the same sequence of documents
        reloaded = corpora.MmCorpus(mm_path)
        self.assertEqual(list(source_corpus), list(reloaded))
Example #7
Source File: test_miislita.py    From topical_word_embeddings with MIT License 5 votes vote down vote up
def test_textcorpus(self):
        """Verify that a TextCorpus written with MmCorpus.save_corpus reads back unchanged."""
        # build the corpus from the fixture file
        source_corpus = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))

        # serializing must leave a file at the temporary path
        mm_path = get_tmpfile('test_textcorpus.mm')
        corpora.MmCorpus.save_corpus(mm_path, source_corpus)
        self.assertTrue(os.path.exists(mm_path))

        # reading the file back must yield the same sequence of documents
        reloaded = corpora.MmCorpus(mm_path)
        self.assertEqual(list(source_corpus), list(reloaded))
Example #8
Source File: test_miislita.py    From topical_word_embeddings with MIT License 5 votes vote down vote up
def test_textcorpus(self):
        """Verify that a TextCorpus written with MmCorpus.save_corpus reads back unchanged."""
        # build the corpus from the fixture file
        source_corpus = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))

        # serializing must leave a file at the temporary path
        mm_path = get_tmpfile('test_textcorpus.mm')
        corpora.MmCorpus.save_corpus(mm_path, source_corpus)
        self.assertTrue(os.path.exists(mm_path))

        # reading the file back must yield the same sequence of documents
        reloaded = corpora.MmCorpus(mm_path)
        self.assertEqual(list(source_corpus), list(reloaded))
Example #9
Source File: GensimLDA.py    From termite-data-server with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, corpusPath, tokenRegexStr):
		"""
		Store the tokenizer pattern and hand the corpus path to TextCorpus.

		corpusPath -- forwarded unchanged to corpora.TextCorpus.__init__.
		tokenRegexStr -- regular-expression pattern string; None selects
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX.
		"""
		# Fall back to the class-level default only when no pattern was supplied.
		self.tokenRegexStr = (
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX
			if tokenRegexStr is None
			else tokenRegexStr
		)
		self.tokenRegex = re.compile(self.tokenRegexStr)
		self.docIds = []
		# NOTE(review): the base initializer runs last, presumably because it
		# needs the attributes above already in place -- confirm.
		corpora.TextCorpus.__init__(self, corpusPath)
Example #10
Source File: GensimLDA.py    From termite-data-server with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, corpusPath, tokenRegexStr):
		"""
		Store the tokenizer pattern and hand the corpus path to TextCorpus.

		corpusPath -- forwarded unchanged to corpora.TextCorpus.__init__.
		tokenRegexStr -- regular-expression pattern string; None selects
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX.
		"""
		# Fall back to the class-level default only when no pattern was supplied.
		self.tokenRegexStr = (
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX
			if tokenRegexStr is None
			else tokenRegexStr
		)
		self.tokenRegex = re.compile(self.tokenRegexStr)
		self.docIds = []
		# NOTE(review): the base initializer runs last, presumably because it
		# needs the attributes above already in place -- confirm.
		corpora.TextCorpus.__init__(self, corpusPath)
Example #11
Source File: GensimLDA.py    From termite-data-server with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, corpusPath, tokenRegexStr):
		"""
		Store the tokenizer pattern and hand the corpus path to TextCorpus.

		corpusPath -- forwarded unchanged to corpora.TextCorpus.__init__.
		tokenRegexStr -- regular-expression pattern string; None selects
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX.
		"""
		# Fall back to the class-level default only when no pattern was supplied.
		self.tokenRegexStr = (
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX
			if tokenRegexStr is None
			else tokenRegexStr
		)
		self.tokenRegex = re.compile(self.tokenRegexStr)
		self.docIds = []
		# NOTE(review): the base initializer runs last, presumably because it
		# needs the attributes above already in place -- confirm.
		corpora.TextCorpus.__init__(self, corpusPath)
Example #12
Source File: GensimLDA.py    From termite-data-server with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, corpusPath, tokenRegexStr):
		"""
		Store the tokenizer pattern and hand the corpus path to TextCorpus.

		corpusPath -- forwarded unchanged to corpora.TextCorpus.__init__.
		tokenRegexStr -- regular-expression pattern string; None selects
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX.
		"""
		# Fall back to the class-level default only when no pattern was supplied.
		self.tokenRegexStr = (
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX
			if tokenRegexStr is None
			else tokenRegexStr
		)
		self.tokenRegex = re.compile(self.tokenRegexStr)
		self.docIds = []
		# NOTE(review): the base initializer runs last, presumably because it
		# needs the attributes above already in place -- confirm.
		corpora.TextCorpus.__init__(self, corpusPath)
Example #13
Source File: GensimLDA.py    From termite-data-server with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, corpusPath, tokenRegexStr):
		"""
		Store the tokenizer pattern and hand the corpus path to TextCorpus.

		corpusPath -- forwarded unchanged to corpora.TextCorpus.__init__.
		tokenRegexStr -- regular-expression pattern string; None selects
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX.
		"""
		# Fall back to the class-level default only when no pattern was supplied.
		self.tokenRegexStr = (
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX
			if tokenRegexStr is None
			else tokenRegexStr
		)
		self.tokenRegex = re.compile(self.tokenRegexStr)
		self.docIds = []
		# NOTE(review): the base initializer runs last, presumably because it
		# needs the attributes above already in place -- confirm.
		corpora.TextCorpus.__init__(self, corpusPath)
Example #14
Source File: GensimLDA.py    From termite-data-server with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, corpusPath, tokenRegexStr):
		"""
		Store the tokenizer pattern and hand the corpus path to TextCorpus.

		corpusPath -- forwarded unchanged to corpora.TextCorpus.__init__.
		tokenRegexStr -- regular-expression pattern string; None selects
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX.
		"""
		# Fall back to the class-level default only when no pattern was supplied.
		self.tokenRegexStr = (
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX
			if tokenRegexStr is None
			else tokenRegexStr
		)
		self.tokenRegex = re.compile(self.tokenRegexStr)
		self.docIds = []
		# NOTE(review): the base initializer runs last, presumably because it
		# needs the attributes above already in place -- confirm.
		corpora.TextCorpus.__init__(self, corpusPath)
Example #15
Source File: GensimLDA.py    From termite-data-server with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, corpusPath, tokenRegexStr):
		"""
		Store the tokenizer pattern and hand the corpus path to TextCorpus.

		corpusPath -- forwarded unchanged to corpora.TextCorpus.__init__.
		tokenRegexStr -- regular-expression pattern string; None selects
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX.
		"""
		# Fall back to the class-level default only when no pattern was supplied.
		self.tokenRegexStr = (
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX
			if tokenRegexStr is None
			else tokenRegexStr
		)
		self.tokenRegex = re.compile(self.tokenRegexStr)
		self.docIds = []
		# NOTE(review): the base initializer runs last, presumably because it
		# needs the attributes above already in place -- confirm.
		corpora.TextCorpus.__init__(self, corpusPath)
Example #16
Source File: GensimLDA.py    From termite-data-server with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, corpusPath, tokenRegexStr):
		"""
		Store the tokenizer pattern and hand the corpus path to TextCorpus.

		corpusPath -- forwarded unchanged to corpora.TextCorpus.__init__.
		tokenRegexStr -- regular-expression pattern string; None selects
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX.
		"""
		# Fall back to the class-level default only when no pattern was supplied.
		self.tokenRegexStr = (
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX
			if tokenRegexStr is None
			else tokenRegexStr
		)
		self.tokenRegex = re.compile(self.tokenRegexStr)
		self.docIds = []
		# NOTE(review): the base initializer runs last, presumably because it
		# needs the attributes above already in place -- confirm.
		corpora.TextCorpus.__init__(self, corpusPath)
Example #17
Source File: GensimLDA.py    From termite-data-server with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, corpusPath, tokenRegexStr):
		"""
		Store the tokenizer pattern and hand the corpus path to TextCorpus.

		corpusPath -- forwarded unchanged to corpora.TextCorpus.__init__.
		tokenRegexStr -- regular-expression pattern string; None selects
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX.
		"""
		# Fall back to the class-level default only when no pattern was supplied.
		self.tokenRegexStr = (
			GensimTermiteCorpusReader.DEFAULT_TOKEN_REGEX
			if tokenRegexStr is None
			else tokenRegexStr
		)
		self.tokenRegex = re.compile(self.tokenRegexStr)
		self.docIds = []
		# NOTE(review): the base initializer runs last, presumably because it
		# needs the attributes above already in place -- confirm.
		corpora.TextCorpus.__init__(self, corpusPath)
Example #18
Source File: test_miislita.py    From topical_word_embeddings with MIT License 5 votes vote down vote up
def test_textcorpus(self):
        """Verify that a TextCorpus written with MmCorpus.save_corpus reads back unchanged."""
        # build the corpus from the fixture file
        source_corpus = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))

        # serializing must leave a file at the temporary path
        mm_path = get_tmpfile('test_textcorpus.mm')
        corpora.MmCorpus.save_corpus(mm_path, source_corpus)
        self.assertTrue(os.path.exists(mm_path))

        # reading the file back must yield the same sequence of documents
        reloaded = corpora.MmCorpus(mm_path)
        self.assertEqual(list(source_corpus), list(reloaded))
Example #19
Source File: test_miislita.py    From topical_word_embeddings with MIT License 5 votes vote down vote up
def test_textcorpus(self):
        """Verify that a TextCorpus written with MmCorpus.save_corpus reads back unchanged."""
        # build the corpus from the fixture file
        source_corpus = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))

        # serializing must leave a file at the temporary path
        mm_path = get_tmpfile('test_textcorpus.mm')
        corpora.MmCorpus.save_corpus(mm_path, source_corpus)
        self.assertTrue(os.path.exists(mm_path))

        # reading the file back must yield the same sequence of documents
        reloaded = corpora.MmCorpus(mm_path)
        self.assertEqual(list(source_corpus), list(reloaded))