Python nltk.bigrams() Examples

The following are 19 code examples of nltk.bigrams(), each taken from an open-source project. The source file, project, and license are noted above each example.
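As a quick orientation before the examples: nltk.bigrams() lazily yields consecutive token pairs from any sequence, so it is usually wrapped in list() before being inspected or reused. A minimal sketch:

import nltk

tokens = "ice is melting due to global warming".split()
pairs = list(nltk.bigrams(tokens))  # materialize the generator
print(pairs[:3])  # [('ice', 'is'), ('is', 'melting'), ('melting', 'due')]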
Example #1
Source File: utils.py    From BERT with Apache License 2.0
def bigram_counts(word_list):
	bgs = nltk.bigrams(word_list)
	fdist = nltk.FreqDist(bgs)
	d = Counter()
	for k, v in fdist.items():
		d[k] = v
	return d 
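A minimal usage sketch for the helper above (it assumes the same imports as the source file, nltk and collections.Counter):

import nltk
from collections import Counter

counts = bigram_counts("to be or not to be".split())
print(counts[("to", "be")])  # 2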
Example #2
Source File: maxent-nltk.py    From twitter-sentiment-analysis with MIT License
def get_data_from_file(file_name, isTrain=True):
    data = []
    with open(file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if isTrain:
                tag = line.split(',')[1]
                bag_of_words = line.split(',')[2].split()
                if USE_BIGRAMS:
                    bag_of_words_bigram = list(nltk.bigrams(line.split(',')[2].split()))
                    bag_of_words = bag_of_words+bag_of_words_bigram
            else :
                tag = '5'
                bag_of_words = line.split(',')[1].split()
                if USE_BIGRAMS:
                    bag_of_words_bigram = list(nltk.bigrams(line.split(',')[1].split()))
                    bag_of_words = bag_of_words+bag_of_words_bigram
            data.append((bag_of_words, tag))
    return data 
Example #3
Source File: loader.py    From BERT with Apache License 2.0
def __init__(self, data_paths, batch_size, unroll, level):
    self.batch_size = batch_size
    self.unroll = unroll
    train_data, valid_data, test_data, token_to_id, frequencies, hist_freqs, train_tokens = load_text_data(
        data_paths, level)
    self.bg_counts = bigram_counts(train_tokens)
    self.tg_counts = trigram_counts(train_tokens)
    self.token_to_id = token_to_id
    # NOTE extends the vocabulary
    self.token_to_id['<_>'] = len(self.token_to_id)
    self.id_to_token = dict((v, k) for k, v in self.token_to_id.iteritems())
    train_data = _reshape_data(train_data, batch_size, unroll)
    valid_data = _reshape_data(valid_data, batch_size, unroll)
    test_data = _reshape_data(test_data, batch_size, unroll)
    self.split_data = {"train": train_data, "valid": valid_data,
                       "test": test_data}
    self.frequencies = frequencies
    self.frequencies_cumsum = np.cumsum(frequencies)
    self.hist_freqs = hist_freqs
    self.hist_freqs_cumsum = np.cumsum(hist_freqs)
    self.continuations = build_continuations(self.bg_counts)
    bgs = nltk.bigrams(train_tokens)
    if level == "word":
      self.D1, self.D2, self.D3p, self.N1_lookup, self.N2_lookup, self.N3p_lookup = estimate_modkn_discounts(
          bgs) 
Example #4
Source File: util.py    From razzy-spinner with GNU General Public License v3.0
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
    return features

#////////////////////////////////////////////////////////////
#{ Helper Functions
#//////////////////////////////////////////////////////////// 
Example #5
Source File: atis_world.py    From allennlp-semparse with Apache License 2.0
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(" ".join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index, index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])
    return string_linking_scores 
Example #6
Source File: sc_bigramcount.py    From atap with Apache License 2.0
def count_bigrams(corpus):
    text = corpus.map(itemgetter(1))
    sents = text.flatMap(nltk.sent_tokenize)
    sents = sents.map(lambda s: list(nltk.word_tokenize(s)))

    bigrams = sents.flatMap(lambda s: list(nltk.bigrams(s)))
    unique_bigrams = bigrams.distinct().count()
    print("unique bigrams: {}".format(unique_bigrams))

    bigram_counts = bigrams.map(lambda g: (g, 1)).reduceByKey(add).toDF()
    print(bigram_counts.head())


## Main functionality 
Example #7
Source File: large-scale-evaluation-freebase.py    From BREDS with GNU Lesser General Public License v3.0
def extract_bigrams(text):
    tokens = word_tokenize(text)
    return [gram[0]+' '+gram[1] for gram in bigrams(tokens)]


# ########################################
# Estimations of sets and intersections #
# ######################################## 
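A short usage sketch for extract_bigrams (assuming the original module imports word_tokenize and bigrams from nltk, and that NLTK's tokenizer data has been downloaded):

from nltk import word_tokenize, bigrams

print(extract_bigrams("the sky is blue"))
# ['the sky', 'sky is', 'is blue']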
Example #8
Source File: util.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(
            document
        )
    return features


# ////////////////////////////////////////////////////////////
# { Helper Functions
# //////////////////////////////////////////////////////////// 
Example #9
Source File: read.py    From CrisisLex with MIT License
def get_tweet_terms(tweet, stem_map = None, bigrams_map = None):
    words, bigrams = get_stemmed_terms_list(tweet, stem_map, bigrams_map)
    filtered_words = [w for w in words if not w in stopwords.words('english')]

    bigrams = nltk.bigrams(filtered_words)
    words_set = set(filtered_words)
    terms_dict = {}

    for w in words_set:
        terms_dict['%s'%w] = 'y'

    for b in bigrams:
        terms_dict['%s %s'%(b[0],b[1])] = 'y'

    return terms_dict 
Example #10
Source File: read.py    From CrisisLex with MIT License
def get_stemmed_terms_list(doc, stem_words_map = None, stem_bigrams_map = None):
    ps = PorterStemmer()
    local_map = dict()
    word_list = []

    clean_doc = [(w.strip()).lower() for w in doc.split() if len(w) in range(3,16)]
    filtered_words = [w.strip('.,;?!:)(#') for w in clean_doc if not w.strip('.,;?!:)(#') in stopwords.words('english')]

    for w in filtered_words:
        if w.isalpha():
            w_temp = ps.stem_word(w)
            if stem_words_map is not None:
                if w_temp not in stem_words_map:
                    stem_words_map[w_temp] = dict()
                stem_words_map[w_temp][w] = stem_words_map[w_temp].get(w, 0)+1
                local_map[w_temp] = w
            word_list.append(w_temp)

    bigrams = list(nltk.bigrams(word_list))  # materialize; the generator is consumed below and also returned
    for b in bigrams:
        bigram_org = (local_map[b[0]],local_map[b[1]])
        if stem_bigrams_map is not None:
                if b not in stem_bigrams_map:
                    stem_bigrams_map[b] = dict()
                stem_bigrams_map[b][bigram_org] = stem_bigrams_map[b].get(bigram_org, 0)+1

    return word_list, bigrams

# keeps track of the exact form of the stemmed bigrams, not only the one of the words 
Example #11
Source File: adaptive_collect.py    From CrisisLex with MIT License
def update_terms_stats(terms_fd, json_tweet, lex):
    tweet = utils.extract_tweet_from_json(json_tweet)
    tweet_terms = []
    if tweet is None:
        return False
    tokenizer = nltk.RegexpTokenizer('\#?[\w\d]+')
    doc = tokenizer.tokenize(tweet)
    for w_raw in doc:
        w = w_raw.strip('\"\'.,;?!:)(@/*&')
        if not (w.strip('#')).isalpha():
            w_aux = ''
            #ignore non-ascii characters
            for s in w:
                if ord(s) < 128:
                    w_aux += s
                else:
                    break
            w = w_aux
        w = w.lower()
        if (w not in stopwords.words('english') and w not in set(['rt','http','amp'])) and len(w) in range(3, 16):
            if w in lex:
                continue
            tweet_terms.append(w)
            terms_fd.inc(w)
    bigrams = nltk.bigrams(tweet_terms)
    for b in bigrams:
        if b[1]+" "+b[0] in lex or b[0]+" "+b[1] in lex:
            continue
        if b[1]+" "+b[0] in terms_fd:
            terms_fd.inc(b[1]+" "+b[0])
        else:
            terms_fd.inc(b[0]+" "+b[1])
    return True 
Example #12
Source File: SVM.py    From codenn with MIT License
def tokenize(text):
      # text = NB.remove_punctuation(text)
      try:
        text = text.decode('utf-8').encode('ascii', 'replace').strip().lower()
      except:
        text = text.encode('ascii', 'replace').strip().lower()
      word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]   # split punctuations but dont split single quotes for words like don't
      biword =  [b for b in nltk.bigrams(word)]
      triword =  [t for t in nltk.trigrams(word)]
      # word = [w for w in word if w not in stopwords.words('english')]
      return  word # triword 
Example #13
Source File: bigrams.py    From textkit with MIT License
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(sep.join(bigram)) for bigram in bigrams] 
Example #14
Source File: lex_sem_ft.py    From DL-text with MIT License
def train_bigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))

    for sent in lst:
        sent = sent.split()
        for w1, w2 in bigrams(sent, pad_right=True, pad_left=True):
            model[w1][w2] += 1  
    total_count = 0      
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count
    return model

#Total Sum Of Bigram Probability Of A Sentence [Returns Float]:
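A minimal usage sketch for train_bigram (assuming the module's from collections import defaultdict and from nltk import bigrams imports):

from collections import defaultdict
from nltk import bigrams

model = train_bigram(["the cat sat", "the cat ran"])
print(model["the"]["cat"])  # 1.0 -- "cat" always follows "the"
print(model["cat"]["sat"])  # 0.5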
Example #15
Source File: spelling_checker.py    From kaggle-HomeDepot with MIT License
def get_valid_bigram_words(self, words):
        _words = []
        for i in nltk.bigrams(words):
            if (len(i[0]) >= self.min_len) and (len(i[1]) >= self.min_len):
                if (not self.exclude_stopwords) or ((i[0] not in config.STOP_WORDS) and (i[1] not in config.STOP_WORDS)):
                    if (not self.skip_digit) or ((len(re.findall(re.compile("\d+"), i[0])) == 0) and (len(re.findall(re.compile("\d+"), i[1])) == 0)):
                        _words.append(" ".join(i))
        return _words 
Example #16
Source File: utils.py    From BERT with Apache License 2.0
def estimate_modkn_discounts(ngrams):
	# Get counts
	counts = Counter(ngrams)
	N1 = float(len([k for k in counts if counts[k] == 1]))
	N2 = float(len([k for k in counts if counts[k] == 2]))
	N3 = float(len([k for k in counts if counts[k] == 3]))
	N4 = float(len([k for k in counts if counts[k] == 4]))
	N3p = float(len([k for k in counts if counts[k] >= 3]))

	# Estimate discounting parameters
	Y = N1 / (N1 + 2 * N2)
	D1 = 1 - 2 * Y * (N2 / N1)
	D2 = 2 - 3 * Y * (N3 / N2)
	D3p = 3 - 4 * Y * (N4 / N3)

	# FIXME(zxie) Assumes bigrams for now
	# Also compute N1/N2/N3p lookups (context -> n-grams with count 1/2/3+)
	N1_lookup = Counter()
	N2_lookup = Counter()
	N3p_lookup = Counter()
	for bg in counts:
		if counts[bg] == 1:
			N1_lookup[bg[0]] += 1
		elif counts[bg] == 2:
			N2_lookup[bg[0]] += 1
		else:
			N3p_lookup[bg[0]] += 1

	return D1, D2, D3p, N1_lookup, N2_lookup, N3p_lookup 
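A small usage sketch (assuming nltk and collections.Counter are imported as in the source file); the toy corpus is chosen so that bigram counts of 1, 2, and 3 all occur, which keeps the discount formulas free of division by zero:

import nltk
from collections import Counter

tokens = "a b a b a b c d c d e f".split()
D1, D2, D3p, N1_lookup, N2_lookup, N3p_lookup = estimate_modkn_discounts(
    nltk.bigrams(tokens))
print(D1, D2, D3p)     # 0.5 1.25 3.0 for this toy corpus
print(N1_lookup["d"])  # 2: ('d', 'c') and ('d', 'e') each appear once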
Example #17
Source File: lex_sem_ft.py    From DeepLearn with MIT License
def train_bigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))

    for sent in lst:
        sent = sent.split()
        for w1, w2 in bigrams(sent, pad_right=True, pad_left=True):
            model[w1][w2] += 1  
    total_count = 0      
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count
    return model

#Total Sum Of Bigram Probability Of A Sentence [Returns Float]:
Example #18
Source File: atis_world.py    From allennlp-semparse with Apache License 2.0
def add_dates_to_number_linking_scores(
        self,
        number_linking_scores: Dict[str, Tuple[str, str, List[int]]],
        current_tokenized_utterance: List[Token],
    ) -> None:

        month_reverse_lookup = {
            str(number): string for string, number in atis_tables.MONTH_NUMBERS.items()
        }
        day_reverse_lookup = {
            str(number): string for string, number in atis_tables.DAY_NUMBERS.items()
        }

        if self.dates:
            for date in self.dates:
                # Add the year linking score
                entity_linking = [0 for token in current_tokenized_utterance]
                for token_index, token in enumerate(current_tokenized_utterance):
                    if token.text == str(date.year):
                        entity_linking[token_index] = 1
                action = format_action(
                    nonterminal="year_number",
                    right_hand_side=str(date.year),
                    is_number=True,
                    keywords_to_uppercase=KEYWORDS,
                )
                number_linking_scores[action] = ("year_number", str(date.year), entity_linking)

                entity_linking = [0 for token in current_tokenized_utterance]
                for token_index, token in enumerate(current_tokenized_utterance):
                    if token.text == month_reverse_lookup[str(date.month)]:
                        entity_linking[token_index] = 1
                action = format_action(
                    nonterminal="month_number",
                    right_hand_side=str(date.month),
                    is_number=True,
                    keywords_to_uppercase=KEYWORDS,
                )

                number_linking_scores[action] = ("month_number", str(date.month), entity_linking)

                entity_linking = [0 for token in current_tokenized_utterance]
                for token_index, token in enumerate(current_tokenized_utterance):
                    if token.text == day_reverse_lookup[str(date.day)]:
                        entity_linking[token_index] = 1
                for bigram_index, bigram in enumerate(
                    bigrams([token.text for token in current_tokenized_utterance])
                ):
                    if " ".join(bigram) == day_reverse_lookup[str(date.day)]:
                        entity_linking[bigram_index] = 1
                        entity_linking[bigram_index + 1] = 1
                action = format_action(
                    nonterminal="day_number",
                    right_hand_side=str(date.day),
                    is_number=True,
                    keywords_to_uppercase=KEYWORDS,
                )
                number_linking_scores[action] = ("day_number", str(date.day), entity_linking) 
Example #19
Source File: dataset.py    From qb with MIT License
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    def tokenizer(text):
        if strip_qb_patterns:
            text = re.sub(
                '\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)])

            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams

    return tokenizer
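A minimal usage sketch (strip_qb_patterns is disabled here because the module-level regex_pattern is outside this excerpt; NLTK's punkt tokenizer data is assumed to be installed):

tokenizer = create_qb_tokenizer(unigrams=True, bigrams=True, strip_qb_patterns=False)
print(tokenizer("Name this Danish physicist"))
# ['Name', 'this', 'Danish', 'physicist', 'Name++this', 'this++Danish', 'Danish++physicist']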