Python nltk.bigrams() Examples
The following are 19 code examples of nltk.bigrams(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk, or try the search function.
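Before the project examples, a minimal interactive sketch of what nltk.bigrams() returns: it lazily yields each contiguous pair of tokens from a sequence, so it is usually wrapped in list() or fed into a FreqDist.

>>> import nltk
>>> list(nltk.bigrams(['the', 'quick', 'brown', 'fox']))
[('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]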
Example #1
Source File: utils.py From BERT with Apache License 2.0 | 8 votes |
def bigram_counts(word_list):
    bgs = nltk.bigrams(word_list)
    fdist = nltk.FreqDist(bgs)
    d = Counter()
    for k, v in fdist.items():
        d[k] = v
    return d
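A minimal usage sketch for the helper above. The input tokens here are hypothetical, not from the BERT project; Counter comes from collections and nltk is imported at module level, as in utils.py.

tokens = ['to', 'be', 'or', 'not', 'to', 'be']
counts = bigram_counts(tokens)
print(counts[('to', 'be')])  # 2 -- this pair occurs twice in the token list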
Example #2
Source File: maxent-nltk.py From twitter-sentiment-analysis with MIT License | 6 votes |
def get_data_from_file(file_name, isTrain=True):
    data = []
    with open(train_csv_file, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if isTrain:
                tag = line.split(',')[1]
                bag_of_words = line.split(',')[2].split()
                if USE_BIGRAMS:
                    bag_of_words_bigram = list(nltk.bigrams(line.split(',')[2].split()))
                    bag_of_words = bag_of_words+bag_of_words_bigram
            else:
                tag = '5'
                bag_of_words = line.split(',')[1].split()
                if USE_BIGRAMS:
                    bag_of_words_bigram = list(nltk.bigrams(line.split(',')[1].split()))
                    bag_of_words = bag_of_words+bag_of_words_bigram
            data.append((bag_of_words, tag))
    return data
Example #3
Source File: loader.py From BERT with Apache License 2.0 | 6 votes |
def __init__(self, data_paths, batch_size, unroll, level):
    self.batch_size = batch_size
    self.unroll = unroll

    train_data, valid_data, test_data, token_to_id, frequencies, hist_freqs, train_tokens = load_text_data(
        data_paths, level)
    self.bg_counts = bigram_counts(train_tokens)
    self.tg_counts = trigram_counts(train_tokens)
    self.token_to_id = token_to_id
    # NOTE extends the vocabulary
    self.token_to_id['<_>'] = len(self.token_to_id)
    self.id_to_token = dict((v, k) for k, v in self.token_to_id.iteritems())

    train_data = _reshape_data(train_data, batch_size, unroll)
    valid_data = _reshape_data(valid_data, batch_size, unroll)
    test_data = _reshape_data(test_data, batch_size, unroll)
    self.split_data = {"train": train_data, "valid": valid_data, "test": test_data}

    self.frequencies = frequencies
    self.frequencies_cumsum = np.cumsum(frequencies)
    self.hist_freqs = hist_freqs
    self.hist_freqs_cumsum = np.cumsum(hist_freqs)
    self.continuations = build_continuations(self.bg_counts)

    bgs = nltk.bigrams(train_tokens)
    if level == "word":
        self.D1, self.D2, self.D3p, self.N1_lookup, self.N2_lookup, self.N3p_lookup = estimate_modkn_discounts(
            bgs)
Example #4
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param unigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
    return features

#////////////////////////////////////////////////////////////
#{ Helper Functions
#////////////////////////////////////////////////////////////
Example #5
Source File: atis_world.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(" ".join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index, index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])
    return string_linking_scores
Example #6
Source File: sc_bigramcount.py From atap with Apache License 2.0 | 5 votes |
def count_bigrams(corpus):
    text = corpus.map(itemgetter(1))
    sents = text.flatMap(nltk.sent_tokenize)
    sents = sents.map(lambda s: list(nltk.word_tokenize(s)))
    bigrams = sents.flatMap(lambda s: list(nltk.bigrams(s)))

    unique_bigrams = bigrams.distinct().count()
    print("unique bigrams: {}".format(unique_bigrams))

    bigram_counts = bigrams.map(lambda g: (g, 1)).reduceByKey(add).toDF()
    print(bigram_counts.head())

## Main functionality
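A hedged driver sketch for the Spark job above. It assumes an active SparkContext bound to sc, a SparkSession for the final toDF() call, and a corpus stored as one text file per document; the path and variable names are illustrative, not from the atap project.

# Illustrative only: builds an RDD of (filename, text) pairs and runs the job.
corpus = sc.wholeTextFiles("corpus/*.txt")
count_bigrams(corpus)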
Example #7
Source File: large-scale-evaluation-freebase.py From BREDS with GNU Lesser General Public License v3.0 | 5 votes |
def extract_bigrams(text):
    tokens = word_tokenize(text)
    return [gram[0]+' '+gram[1] for gram in bigrams(tokens)]

# ########################################
# Estimations of sets and intersections  #
# ########################################
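A quick illustration with hypothetical input; it assumes word_tokenize and bigrams are imported from nltk, as in the source project.

extract_bigrams("the quick brown fox")
# ['the quick', 'quick brown', 'brown fox']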
Example #8
Source File: util.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param unigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(
            document
        )
    return features

# ////////////////////////////////////////////////////////////
# { Helper Functions
# ////////////////////////////////////////////////////////////
Example #9
Source File: read.py From CrisisLex with MIT License | 5 votes |
def get_tweet_terms(tweet, stem_map = None, bigrams_map = None):
    words, bigrams = get_stemmed_terms_list(tweet, stem_map, bigrams_map)
    filtered_words = [w for w in words if not w in stopwords.words('english')]
    bigrams = nltk.bigrams(filtered_words)

    words_set = set(filtered_words)
    terms_dict = {}
    for w in words_set:
        terms_dict['%s'%w] = 'y'
    for b in bigrams:
        terms_dict['%s %s'%(b[0],b[1])] = 'y'
    return terms_dict
Example #10
Source File: read.py From CrisisLex with MIT License | 5 votes |
def get_stemmed_terms_list(doc, stem_words_map = None, stem_bigrams_map = None):
    ps = PorterStemmer()
    local_map = dict()
    word_list = []

    clean_doc = [(w.strip()).lower() for w in doc.split() if len(w) in range(3,16)]
    filtered_words = [w.strip('.,;?!:)(#') for w in clean_doc if not w.strip('.,;?!:)(#') in stopwords.words('english')]

    for w in filtered_words:
        if w.isalpha():
            w_temp = ps.stem_word(w)
            if stem_words_map is not None:
                if w_temp not in stem_words_map:
                    stem_words_map[w_temp] = dict()
                stem_words_map[w_temp][w] = stem_words_map[w_temp].get(w, 0)+1
            local_map[w_temp] = w
            word_list.append(w_temp)

    bigrams = nltk.bigrams(word_list)
    for b in bigrams:
        bigram_org = (local_map[b[0]],local_map[b[1]])
        if stem_bigrams_map is not None:
            if b not in stem_bigrams_map:
                stem_bigrams_map[b] = dict()
            stem_bigrams_map[b][bigram_org] = stem_bigrams_map[b].get(bigram_org, 0)+1

    return word_list, bigrams

# keeps track of the exact form of the stemmed bigrams, not only the one of the words
Example #11
Source File: adaptive_collect.py From CrisisLex with MIT License | 5 votes |
def update_terms_stats(terms_fd, json_tweet, lex):
    tweet = utils.extract_tweet_from_json(json_tweet)
    tweet_terms = []
    if tweet is None:
        return False

    tokenizer = nltk.RegexpTokenizer('\#?[\w\d]+')
    doc = tokenizer.tokenize(tweet)
    for w_raw in doc:
        w = w_raw.strip('\"\'.,;?!:)(@/*&')
        if not (w.strip('#')).isalpha():
            w_aux = ''
            # ignore non-ascii characters
            for s in w:
                if ord(s) < 128:
                    w_aux += s
                else:
                    break
            w = w_aux
        w = w.lower()
        if (w not in stopwords.words('english') and w not in set(['rt','http','amp'])) and len(w) in range(3, 16):
            if w in lex:
                continue
            tweet_terms.append(w)
            terms_fd.inc(w)

    bigrams = nltk.bigrams(tweet_terms)
    for b in bigrams:
        if b[1]+" "+b[0] in lex or b[0]+" "+b[1] in lex:
            continue
        if b[1]+" "+b[0] in terms_fd:
            terms_fd.inc(b[1]+" "+b[0])
        else:
            terms_fd.inc(b[0]+" "+b[1])
    return True
Example #12
Source File: SVM.py From codenn with MIT License | 5 votes |
def tokenize(text):
    # text = NB.remove_punctuation(text)
    try:
        text = text.decode('utf-8').encode('ascii', 'replace').strip().lower()
    except:
        text = text.encode('ascii', 'replace').strip().lower()
    word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]  # split punctuations but dont split single quotes for words like don't
    biword = [b for b in nltk.bigrams(word)]
    triword = [t for t in nltk.trigrams(word)]
    # word = [w for w in word if w not in stopwords.words('english')]
    return word  # triword
Example #13
Source File: bigrams.py From textkit with MIT License | 5 votes |
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)

    [output(sep.join(bigram)) for bigram in bigrams]
Example #14
Source File: lex_sem_ft.py From DL-text with MIT License | 5 votes |
def train_bigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        sent = sent.split()
        for w1, w2 in bigrams(sent, pad_right=True, pad_left=True):
            model[w1][w2] += 1
    total_count = 0
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count
    return model

#Total Sum Of Bigram Probablity Of A Sentence[Returns Float]:
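A minimal usage sketch, assuming `from collections import defaultdict` and `from nltk import bigrams` as in the source file; the training sentences are hypothetical. The padded bigrams mean the model also learns sentence-start and sentence-end transitions keyed on None.

model = train_bigram(["the cat sat", "the cat ran"])
print(model["the"]["cat"])  # 1.0 -- 'cat' is the only word observed after 'the'
print(model["cat"]["sat"])  # 0.5 -- 'sat' follows 'cat' in one of the two sentences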
Example #15
Source File: spelling_checker.py From kaggle-HomeDepot with MIT License | 5 votes |
def get_valid_bigram_words(self, words):
    _words = []
    for i in nltk.bigrams(words):
        if (len(i[0]) >= self.min_len) and (len(i[1]) >= self.min_len):
            if (not self.exclude_stopwords) or ((i[0] not in config.STOP_WORDS) and (i[1] not in config.STOP_WORDS)):
                if (not self.skip_digit) or ((len(re.findall(re.compile("\d+"), i[0])) == 0) and (len(re.findall(re.compile("\d+"), i[1])) == 0)):
                    _words.append(" ".join(i))
    return _words
Example #16
Source File: utils.py From BERT with Apache License 2.0 | 5 votes |
def estimate_modkn_discounts(ngrams):
    # Get counts
    counts = Counter(ngrams)
    N1 = float(len([k for k in counts if counts[k] == 1]))
    N2 = float(len([k for k in counts if counts[k] == 2]))
    N3 = float(len([k for k in counts if counts[k] == 3]))
    N4 = float(len([k for k in counts if counts[k] == 4]))
    N3p = float(len([k for k in counts if counts[k] >= 3]))

    # Estimate discounting parameters
    Y = N1 / (N1 + 2 * N2)
    D1 = 1 - 2 * Y * (N2 / N1)
    D2 = 2 - 3 * Y * (N3 / N2)
    D3p = 3 - 4 * Y * (N4 / N3)

    # FIXME(zxie) Assumes bigrams for now
    # Also compute N1/N2/N3p lookups (context -> n-grams with count 1/2/3+)
    N1_lookup = Counter()
    N2_lookup = Counter()
    N3p_lookup = Counter()
    for bg in counts:
        if counts[bg] == 1:
            N1_lookup[bg[0]] += 1
        elif counts[bg] == 2:
            N2_lookup[bg[0]] += 1
        else:
            N3p_lookup[bg[0]] += 1

    return D1, D2, D3p, N1_lookup, N2_lookup, N3p_lookup
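A hedged sketch of calling the estimator on a toy list of bigrams. The data is hypothetical and chosen so that counts 1 through 4 each occur at least once; otherwise the discount formulas above divide by zero. Counter comes from collections, as in utils.py.

toy_bigrams = ([('a', 'b')] * 4 + [('b', 'c')] * 3 +
               [('c', 'd')] * 2 + [('d', 'e')])
D1, D2, D3p, N1_lookup, N2_lookup, N3p_lookup = estimate_modkn_discounts(toy_bigrams)
print(D1, D2, D3p)  # roughly 0.33, 1.0, 1.67 with these counts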
Example #17
Source File: lex_sem_ft.py From DeepLearn with MIT License | 5 votes |
def train_bigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        sent = sent.split()
        for w1, w2 in bigrams(sent, pad_right=True, pad_left=True):
            model[w1][w2] += 1
    total_count = 0
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count
    return model

#Total Sum Of Bigram Probablity Of A Sentence[Returns Float]:
Example #18
Source File: atis_world.py From allennlp-semparse with Apache License 2.0 | 4 votes |
def add_dates_to_number_linking_scores(
    self,
    number_linking_scores: Dict[str, Tuple[str, str, List[int]]],
    current_tokenized_utterance: List[Token],
) -> None:

    month_reverse_lookup = {
        str(number): string for string, number in atis_tables.MONTH_NUMBERS.items()
    }
    day_reverse_lookup = {
        str(number): string for string, number in atis_tables.DAY_NUMBERS.items()
    }

    if self.dates:
        for date in self.dates:
            # Add the year linking score
            entity_linking = [0 for token in current_tokenized_utterance]
            for token_index, token in enumerate(current_tokenized_utterance):
                if token.text == str(date.year):
                    entity_linking[token_index] = 1
            action = format_action(
                nonterminal="year_number",
                right_hand_side=str(date.year),
                is_number=True,
                keywords_to_uppercase=KEYWORDS,
            )
            number_linking_scores[action] = ("year_number", str(date.year), entity_linking)

            entity_linking = [0 for token in current_tokenized_utterance]
            for token_index, token in enumerate(current_tokenized_utterance):
                if token.text == month_reverse_lookup[str(date.month)]:
                    entity_linking[token_index] = 1
            action = format_action(
                nonterminal="month_number",
                right_hand_side=str(date.month),
                is_number=True,
                keywords_to_uppercase=KEYWORDS,
            )
            number_linking_scores[action] = ("month_number", str(date.month), entity_linking)

            entity_linking = [0 for token in current_tokenized_utterance]
            for token_index, token in enumerate(current_tokenized_utterance):
                if token.text == day_reverse_lookup[str(date.day)]:
                    entity_linking[token_index] = 1
            for bigram_index, bigram in enumerate(
                bigrams([token.text for token in current_tokenized_utterance])
            ):
                if " ".join(bigram) == day_reverse_lookup[str(date.day)]:
                    entity_linking[bigram_index] = 1
                    entity_linking[bigram_index + 1] = 1
            action = format_action(
                nonterminal="day_number",
                right_hand_side=str(date.day),
                is_number=True,
                keywords_to_uppercase=KEYWORDS,
            )
            number_linking_scores[action] = ("day_number", str(date.day), entity_linking)
Example #19
Source File: dataset.py From qb with MIT License | 4 votes |
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    def tokenizer(text):
        if strip_qb_patterns:
            text = re.sub(
                '\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)])

            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams
    return tokenizer
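A brief usage sketch with hypothetical input. Passing strip_qb_patterns=False keeps the closure from touching the module-level regex_pattern (not shown above), so the example only relies on nltk being installed with its punkt tokenizer data.

tokenize = create_qb_tokenizer(unigrams=False, bigrams=True, strip_qb_patterns=False)
tokenize("the quick brown fox")
# ['the++quick', 'quick++brown', 'brown++fox']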