Python nltk.tokenize.RegexpTokenizer() Examples

The following are 30 code examples of nltk.tokenize.RegexpTokenizer(), drawn from open-source projects. You can go to the original project or source file via the reference above each example. You may also want to check out all available functions/classes of the module nltk.tokenize.
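RegexpTokenizer works in two modes: by default the pattern describes the tokens to keep, while gaps=True makes the pattern describe the separators to split on. A minimal sketch of both modes (the sample strings below are only illustrative):

from nltk.tokenize import RegexpTokenizer

# Match mode: the pattern describes the tokens themselves.
word_tokenizer = RegexpTokenizer(r'\w+')
print(word_tokenizer.tokenize("Hello, world! It's 2020."))
# ['Hello', 'world', 'It', 's', '2020']

# Gap mode: the pattern describes the separators between tokens.
line_tokenizer = RegexpTokenizer('\n', gaps=True)
print(line_tokenizer.tokenize("first line\nsecond line"))
# ['first line', 'second line']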
Example #1
Source File: preprocessing.py    From toxic_comments with MIT License
def clean_text(df, wrong_words_dict, autocorrect=True):
    df.fillna("__NA__", inplace=True)
    tokenizer = RegexpTokenizer(r'\w+')
    regexps = [re.compile("([a-zA-Z]+)([0-9]+)"), re.compile("([0-9]+)([a-zA-Z]+)")]
    texts = df.tolist()
    result = []
    for text in tqdm(texts):
        tokens = tokenizer.tokenize(text.lower())
        tokens = [split_text_and_digits(token, regexps) for token in tokens]
        tokens = [substitute_repeats(token, 3) for token in tokens]
        text = ' '.join(tokens)
        if autocorrect:
            for wrong, right in wrong_words_dict.items():
                text = text.replace(wrong, right)
        result.append(text)
    return result 
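The helpers split_text_and_digits and substitute_repeats are not part of this excerpt, so the snippet is not runnable on its own. Below is a self-contained sketch of the letter/digit splitting step with the same regexps; the helper is a hypothetical reimplementation, not the project's code:

import re
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
regexps = [re.compile("([a-zA-Z]+)([0-9]+)"), re.compile("([0-9]+)([a-zA-Z]+)")]

def split_text_and_digits(token, regexps):
    # Hypothetical reimplementation: separate letter runs from digit runs.
    for regexp in regexps:
        match = regexp.match(token)
        if match:
            return ' '.join(match.groups())
    return token

tokens = tokenizer.tokenize("Room42 is great!!!".lower())
print([split_text_and_digits(t, regexps) for t in tokens])
# ['room 42', 'is', 'great']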
Example #2
Source File: aligned.py    From razzy-spinner with GNU General Public License v3.0
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding='latin1'):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader 
Example #3
Source File: aligned.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def __init__(
        self,
        root,
        fileids,
        sep='/',
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding='latin1',
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader 
Example #4
Source File: preprocessing.py    From neupy with MIT License
def transform(self, texts, y=None):
        tokenizer = RegexpTokenizer(r'[a-z]+|\d+')

        tokenized_texts = []
        stoplist = []

        if self.ignore_stopwords:
            stoplist = stopwords.words('english')

        for text in texts:
            tokenized_text = []
            for word in tokenizer.tokenize(text.lower()):
                if word not in stoplist:
                    tokenized_text.append(word.strip())

            tokenized_texts.append(tokenized_text)
        return tokenized_texts 
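The pattern r'[a-z]+|\d+' keeps runs of lowercase letters and runs of digits as separate tokens, so strings like "abc123" come apart into two tokens. A standalone sketch of the same tokenization plus stop-word filtering, assuming the NLTK stopwords corpus has been downloaded (the sample sentence is made up):

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'[a-z]+|\d+')
stoplist = set(stopwords.words('english'))

text = "The room42 was booked for 3 nights"
tokens = [w for w in tokenizer.tokenize(text.lower()) if w not in stoplist]
print(tokens)  # ['room', '42', 'booked', '3', 'nights']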
Example #5
Source File: topic_modeler.py    From Artificial-Intelligence-with-Python with MIT License
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer 
    stemmer = SnowballStemmer('english')

    # Get the list of stop words 
    stop_words = stopwords.words('english')
    
    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words 
    tokens = [x for x in tokens if not x in stop_words]
    
    # Perform stemming on the tokenized words 
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed 
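process() assumes RegexpTokenizer, SnowballStemmer and stopwords are already imported at module level. A self-contained sketch of the same tokenize, stop-word filter, and stem pipeline; the example sentence is made up and the exact stems depend on the Snowball rules:

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

tokens = tokenizer.tokenize("Running faster than the other runners".lower())
print([stemmer.stem(t) for t in tokens if t not in stop_words])
# e.g. ['run', 'faster', 'runner']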
Example #6
Source File: vocab_index_descriptions.py    From caml-mimic with MIT License
def vocab_index_descriptions(vocab_file, vectors_file):
    #load lookups
    vocab = set()
    with open(vocab_file, 'r') as vocabfile:
        for i,line in enumerate(vocabfile):
            line = line.strip()
            if line != '':
                vocab.add(line)
    ind2w = {i+1:w for i,w in enumerate(sorted(vocab))}
    w2ind = {w:i for i,w in ind2w.items()}
    desc_dict = datasets.load_code_descriptions()
        
    tokenizer = RegexpTokenizer(r'\w+')

    with open(vectors_file, 'w') as of:
        w = csv.writer(of, delimiter=' ')
        w.writerow(["CODE", "VECTOR"])
        for code, desc in tqdm(desc_dict.items()):
            #same preprocessing steps as in get_discharge_summaries
            tokens = [t.lower() for t in tokenizer.tokenize(desc) if not t.isnumeric()]
            inds = [w2ind[t] if t in w2ind.keys() else len(w2ind)+1 for t in tokens]
            w.writerow([code] + [str(i) for i in inds]) 
Example #7
Source File: extract_baseline_features.py    From Sarcasm-Detection with MIT License
def get_ngram_features_from_map(tweets, ngram_map, n):
    regexp_tknzr = RegexpTokenizer(r'\w+')
    tweet_tknzr = TweetTokenizer()
    features = []
    for tweet in tweets:
        feature_list = np.zeros(len(ngram_map))
        tweet = tweet.lower()
        ngram_list = get_ngram_list(tweet_tknzr, tweet, 1)
        if n > 1:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 2)
        if n > 2:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 3)
        for gram in ngram_list:
            if gram in ngram_map:
                feature_list[ngram_map[gram]] += 1.0
        features.append(feature_list)
    return features 
Example #8
Source File: aligned.py    From luscan-devel with GNU General Public License v2.0
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding=None):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt')

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader 
Example #9
Source File: topic_modeling.py    From Raspberry-Pi-3-Cookbook-for-Python-Programmers-Third-Edition with MIT License
def __init__(self):
        # Create a regular expression tokenizer
        self.tokenizer = RegexpTokenizer(r'\w+')

        # get the list of stop words
        self.english_stop_words= stopwords.words('english')

        # Create a Snowball stemmer
        self.snowball_stemmer = SnowballStemmer('english')

    # Tokenizing, stop word removal, and stemming 
Example #10
Source File: query_processing.py    From DarkDarkGo with MIT License
def clean_text(dirty_text):
    """
        Given a string, this function tokenizes the words of that string.
        :param dirty_text: string
        :return: list
        input = "American artist accomplishments american"
        output = ['accomplishments', 'american', 'artist']
    """

    lower_dirty_text = dirty_text.lower()
    regex_pattern = r"[\w']+"
    tokenizer = RegexpTokenizer(regex_pattern)
    tokens = tokenizer.tokenize(lower_dirty_text)
    unique_tokens = list(set(tokens))
    return unique_tokens 
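A quick, self-contained check of the docstring example; note that set() does not guarantee ordering, so the sorted() call below (my addition, not part of the project) is what makes the output match the docstring:

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r"[\w']+")
tokens = tokenizer.tokenize("American artist accomplishments american".lower())
print(sorted(set(tokens)))  # ['accomplishments', 'american', 'artist']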
Example #11
Source File: data.py    From tagan with Apache License 2.0
def split_sentence_into_words(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(sentence.lower()) 
Example #12
Source File: ycoe.py    From razzy-spinner with GNU General Public License v3.0
def __init__(self, root, items, encoding='utf8'):
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe. 
Example #13
Source File: ycoe.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def __init__(self, root, items, encoding='utf8'):
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(
            self, root, items, sep='_', sent_tokenizer=sent_tokenizer
        )


#: A list of all documents and their titles in ycoe. 
Example #14
Source File: rte_classify.py    From razzy-spinner with GNU General Public License v3.0
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words 
Example #15
Source File: datasets.py    From AttnGAN with MIT License
def load_captions(self, data_dir, filenames):
        all_captions = []
        for i in range(len(filenames)):
            cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
            with open(cap_path, "r") as f:
                captions = f.read().decode('utf8').split('\n')
                cnt = 0
                for cap in captions:
                    if len(cap) == 0:
                        continue
                    cap = cap.replace("\ufffd\ufffd", " ")
                    # picks out sequences of alphanumeric characters as tokens
                    # and drops everything else
                    tokenizer = RegexpTokenizer(r'\w+')
                    tokens = tokenizer.tokenize(cap.lower())
                    # print('tokens', tokens)
                    if len(tokens) == 0:
                        print('cap', cap)
                        continue

                    tokens_new = []
                    for t in tokens:
                        t = t.encode('ascii', 'ignore').decode('ascii')
                        if len(t) > 0:
                            tokens_new.append(t)
                    all_captions.append(tokens_new)
                    cnt += 1
                    if cnt == self.embeddings_num:
                        break
                if cnt < self.embeddings_num:
                    print('ERROR: the captions for %s less than %d'
                          % (filenames[i], cnt))
        return all_captions 
Example #16
Source File: helpers.py    From chirp with MIT License
def get_tokens(text):
    """Tokenize the input text."""
    soup = BeautifulSoup(text, "html.parser")
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(soup.get_text()) 
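A minimal sketch of the same HTML-stripping tokenization, assuming beautifulsoup4 is installed; the HTML snippet is made up:

from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer

html = "<p>Hello <b>world</b>, again!</p>"
soup = BeautifulSoup(html, "html.parser")
tokenizer = RegexpTokenizer(r'\w+')
print(tokenizer.tokenize(soup.get_text()))  # ['Hello', 'world', 'again']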
Example #17
Source File: lda_model_calculator.py    From moviegeek with MIT License
def build_lda_model(self, data, docs, n_topics=5):

        texts = []
        tokenizer = RegexpTokenizer(r'\w+')
        for d in tqdm(data):
            raw = d.lower()

            tokens = tokenizer.tokenize(raw)

            stopped_tokens = self.remove_stopwords(tokens)

            stemmed_tokens = stopped_tokens
            #stemmer = PorterStemmer()
            #stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]

            texts.append(stemmed_tokens)

        dictionary = corpora.Dictionary(texts)

        corpus = [dictionary.doc2bow(text) for text in texts]

        lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                                 num_topics=n_topics)

        index = similarities.MatrixSimilarity(corpus)

        self.save_lda_model(lda_model, corpus, dictionary, index)
        self.save_similarities(index, docs)

        return dictionary, texts, lda_model 
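build_lda_model relies on gensim (corpora, models, similarities) plus helper methods of the surrounding class. A minimal, self-contained sketch of the core RegexpTokenizer -> Dictionary -> bag-of-words -> LdaModel flow, with made-up documents and without the stop-word and persistence steps:

from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models

docs = ["Space ships and space travel", "Cooking pasta and cooking rice"]
tokenizer = RegexpTokenizer(r'\w+')
texts = [tokenizer.tokenize(d.lower()) for d in docs]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
print(lda_model.print_topics())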
Example #18
Source File: lda_model_calculator.py    From moviegeek with MIT License
def tokenize(self, data):
        tokenizer = RegexpTokenizer(r'\w+')

        return [tokenizer.tokenize(d) for d in data] 
Example #19
Source File: datasets.py    From DM-GAN with MIT License
def load_captions(self, data_dir, filenames):
        all_captions = []
        for i in range(len(filenames)):
            cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
            with open(cap_path, "r") as f:
                captions = f.read().decode('utf8').split('\n')
                cnt = 0
                for cap in captions:
                    if len(cap) == 0:
                        continue
                    cap = cap.replace("\ufffd\ufffd", " ")
                    # picks out sequences of alphanumeric characters as tokens
                    # and drops everything else
                    tokenizer = RegexpTokenizer(r'\w+')
                    tokens = tokenizer.tokenize(cap.lower())
                    # print('tokens', tokens)
                    if len(tokens) == 0:
                        print('cap', cap)
                        continue

                    tokens_new = []
                    for t in tokens:
                        t = t.encode('ascii', 'ignore').decode('ascii')
                        if len(t) > 0:
                            tokens_new.append(t)
                    all_captions.append(tokens_new)
                    cnt += 1
                    if cnt == self.embeddings_num:
                        break
                if cnt < self.embeddings_num:
                    print('ERROR: the captions for %s less than %d'
                          % (filenames[i], cnt))
        return all_captions 
Example #20
Source File: extract_baseline_features.py    From Sarcasm-Detection with MIT License
def get_ngrams(tweets, n):
    unigrams = Counter()
    bigrams = Counter()
    trigrams = Counter()
    regexp_tknzr = RegexpTokenizer(r'\w+')
    tweet_tknzr = TweetTokenizer()
    for tweet in tweets:
        tweet = tweet.lower()
        # Get the unigram list for this tweet and update the unigram counter
        unigram_list = get_ngram_list(tweet_tknzr, tweet, 1)
        unigrams.update(unigram_list)
        # Get the bigram list for this tweet and update the bigram counter
        if n > 1:
            bigram_list = get_ngram_list(regexp_tknzr, tweet, 2)
            bigrams.update(bigram_list)
            # Get the trigram list for this tweet and update the trigram counter
            if n > 2:
                trigram_list = get_ngram_list(regexp_tknzr, tweet, 3)
                trigrams.update(trigram_list)
    # Update the counters such that each n-gram appears at least min_occurrence times
    min_occurrence = 2
    unigram_tokens = [k for k, c in unigrams.items() if c >= min_occurrence]
    # In case using just unigrams, make the bigrams and trigrams empty
    bigram_tokens = trigram_tokens = []
    if n > 1:
        bigram_tokens = [k for k, c in bigrams.items() if c >= min_occurrence]
    if n > 2:
        trigram_tokens = [k for k, c in trigrams.items() if c >= min_occurrence]
    return unigram_tokens, bigram_tokens, trigram_tokens 
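get_ngram_list is not included in this excerpt; judging by how it is called, it probably tokenizes the tweet and joins each n-gram into a single string. A hedged guess at such a helper, built on nltk.util.ngrams (an assumption, not the project's code):

from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams

def get_ngram_list(tokenizer, text, n):
    # Hypothetical helper: tokenize, then join each n-gram into one string.
    tokens = tokenizer.tokenize(text)
    return [' '.join(gram) for gram in ngrams(tokens, n)]

print(get_ngram_list(RegexpTokenizer(r'\w+'), "not sarcastic at all", 2))
# ['not sarcastic', 'sarcastic at', 'at all']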
Example #21
Source File: data.py    From dong_iccv_2017 with MIT License
def split_sentence_into_words(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(sentence.lower()) 
Example #22
Source File: IdentifyingTopic.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def cleanDocuments(self):
        tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        en_stop = set(stopwords.words('english'))
        self.cleaned = []
        for doc in self.documents:
            lowercase_doc = doc.lower()
            words = tokenizer.tokenize(lowercase_doc)
            non_stopped_words = [i for i in words if not i in en_stop]
            self.cleaned.append(non_stopped_words)
        print("INFO: Clearning {} documents completed".format(len(self.documents))) 
Example #23
Source File: semantic_image_synthesis_dataset.py    From DMIT with MIT License
def load_captions(self, data_dir, filenames):
        all_captions = []
        for i in range(len(filenames)):
            cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
            with open(cap_path, "r") as f:
                captions = f.read().decode('utf8').split('\n')
                cnt = 0
                for cap in captions:
                    if len(cap) == 0:
                        continue
                    cap = cap.replace("\ufffd\ufffd", " ")
                    # picks out sequences of alphanumeric characters as tokens
                    # and drops everything else
                    tokenizer = RegexpTokenizer(r'\w+')
                    tokens = tokenizer.tokenize(cap.lower())
                    # print('tokens', tokens)
                    if len(tokens) == 0:
                        print('cap', cap)
                        continue

                    tokens_new = []
                    for t in tokens:
                        t = t.encode('ascii', 'ignore').decode('ascii')
                        if len(t) > 0:
                            tokens_new.append(t)
                    all_captions.append(tokens_new)
                    cnt += 1
                    if cnt == self.embeddings_num:
                        break
                if cnt < self.embeddings_num:
                    print('ERROR: the captions for %s less than %d'
                          % (filenames[i], cnt))
        return all_captions 
Example #24
Source File: ycoe.py    From luscan-devel with GNU General Public License v2.0
def __init__(self, root, items, encoding=None):
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe. 
Example #25
Source File: Paste.py    From AIL-framework with GNU Affero General Public License v3.0
def _get_top_words(self, sort=False):
        """
        Tokenising method: Returning a sorted list or a dict of the paste's words

        :param sort: Selecting the output: sorted list or dict (dict by default)

        :return: dict or sorted list of tuples [(word, occurrence), ...]

        :Example: PST._get_top_words(False)

        """
        words = {}
        tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
                                    gaps=True, discard_empty=True)

        blob = TextBlob(clean( (self.get_p_content()) ), tokenizer=tokenizer)

        for word in blob.tokens:
            if word in words.keys():
                num = words[word]
            else:
                num = 0
            words[word] = num + 1
        if sort:
            var = sorted(words.items(), key=operator.itemgetter(1), reverse=True)
        else:
            var = words

        return var 
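clean() and get_p_content() are project helpers that are not shown here. The TextBlob part with the gap tokenizer can be sketched on its own; collections.Counter below stands in for the manual counting dict and is my substitution, not the project's code:

from collections import Counter
from nltk.tokenize import RegexpTokenizer
from textblob import TextBlob

tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
                            gaps=True, discard_empty=True)
blob = TextBlob("user@example.com, password=1234!", tokenizer=tokenizer)
print(list(blob.tokens))          # ['user', 'example', 'com', 'password', '1234']
word_counts = Counter(blob.tokens)  # same idea as the manual words dict above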
Example #26
Source File: datasets.py    From multiple-objects-gan with MIT License
def load_captions(self, data_dir, filenames):
        all_captions = []
        for i in range(len(filenames)):
            cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
            with open(cap_path, "r") as f:
                captions = f.read().decode('utf8').split('\n')
                cnt = 0
                for cap in captions:
                    if len(cap) == 0:
                        continue
                    cap = cap.replace("\ufffd\ufffd", " ")
                    # picks out sequences of alphanumeric characters as tokens
                    # and drops everything else
                    tokenizer = RegexpTokenizer(r'\w+')
                    tokens = tokenizer.tokenize(cap.lower())
                    # print('tokens', tokens)
                    if len(tokens) == 0:
                        print('cap', cap)
                        continue

                    tokens_new = []
                    for t in tokens:
                        t = t.encode('ascii', 'ignore').decode('ascii')
                        if len(t) > 0:
                            tokens_new.append(t)
                    all_captions.append(tokens_new)
                    cnt += 1
                    if cnt == self.embeddings_num:
                        break
                if cnt < self.embeddings_num:
                    print('ERROR: the captions for %s less than %d'
                          % (filenames[i], cnt))
        return all_captions 
Example #27
Source File: datasets.py    From semantic-object-accuracy-for-generative-text-to-image-synthesis with MIT License
def load_captions(self, data_dir, filenames):
        all_captions = []
        for i in range(len(filenames)):
            cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
            with open(cap_path, "r") as f:
                captions = f.read().decode('utf8').split('\n')
                cnt = 0
                for cap in captions:
                    if len(cap) == 0:
                        continue
                    cap = cap.replace("\ufffd\ufffd", " ")
                    # picks out sequences of alphanumeric characters as tokens
                    # and drops everything else
                    tokenizer = RegexpTokenizer(r'\w+')
                    tokens = tokenizer.tokenize(cap.lower())
                    # print('tokens', tokens)
                    if len(tokens) == 0:
                        print('cap', cap)
                        continue

                    tokens_new = []
                    for t in tokens:
                        t = t.encode('ascii', 'ignore').decode('ascii')
                        if len(t) > 0:
                            tokens_new.append(t)
                    all_captions.append(tokens_new)
                    cnt += 1
                    if cnt == self.embeddings_num:
                        break
                if cnt < self.embeddings_num:
                    print('ERROR: the captions for %s less than %d'
                          % (filenames[i], cnt))
        return all_captions 
Example #28
Source File: rte_classify.py    From luscan-devel with GNU General Public License v2.0
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to',
                              'have', 'is', 'are', 'were', 'and', 'very', '.',','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set([lemmatize(token) for token in self.text_tokens])
            self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words 
Example #29
Source File: datasets.py    From attn-gan with MIT License
def load_captions(self, data_dir, filenames):
        all_captions = []
        for i in range(len(filenames)):
            cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
            with open(cap_path, "r") as f:
                captions = f.read().decode('utf8').split('\n')
                cnt = 0
                for cap in captions:
                    if len(cap) == 0:
                        continue
                    cap = cap.replace("\ufffd\ufffd", " ")
                    # picks out sequences of alphanumeric characters as tokens
                    # and drops everything else
                    tokenizer = RegexpTokenizer(r'\w+')
                    tokens = tokenizer.tokenize(cap.lower())
                    # print('tokens', tokens)
                    if len(tokens) == 0:
                        print('cap', cap)
                        continue

                    tokens_new = []
                    for t in tokens:
                        t = t.encode('ascii', 'ignore').decode('ascii')
                        if len(t) > 0:
                            tokens_new.append(t)
                    all_captions.append(tokens_new)
                    cnt += 1
                    if cnt == self.embeddings_num:
                        break
                if cnt < self.embeddings_num:
                    print('ERROR: the captions for %s less than %d'
                          % (filenames[i], cnt))
        return all_captions 
Example #30
Source File: topic_extractor.py    From TBBTCorpus with Apache License 2.0
def __tokenize(self, docs):
		output = []
		for doc in docs:
			tokenizer = RegexpTokenizer(r'\w\w\w\w\w+')
			output.append(tokenizer.tokenize(doc.lower()))
		return output
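The pattern r'\w\w\w\w\w+' only matches runs of five or more word characters, so it doubles as a crude short-word filter. A quick illustration with a made-up sentence:

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w\w\w\w\w+')
print(tokenizer.tokenize("sheldon knocks on the door three times".lower()))
# ['sheldon', 'knocks', 'three', 'times']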