Python nltk.bigrams() Examples

The following are 19 code examples of nltk.bigrams(), each taken from an open-source project. The source file, project, and license are noted above each example.
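As a quick orientation before the examples: nltk.bigrams() lazily yields consecutive token pairs from any sequence, so it is usually wrapped in list() before being inspected or reused. A minimal sketch:

import nltk

tokens = "ice is melting due to global warming".split()
pairs = list(nltk.bigrams(tokens))  # materialize the generator
print(pairs[:3])  # [('ice', 'is'), ('is', 'melting'), ('melting', 'due')]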
Example #1
Source File: utils.py    From BERT with Apache License 2.0
def bigram_counts(word_list):
	bgs = nltk.bigrams(word_list)
	fdist = nltk.FreqDist(bgs)
	d = Counter()
	for k, v in fdist.items():
		d[k] = v
	return d 
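A minimal usage sketch for the helper above (it assumes the same imports as the source file, nltk and collections.Counter):

import nltk
from collections import Counter

counts = bigram_counts("to be or not to be".split())
print(counts[("to", "be")])  # 2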
Example #2
Source File: maxent-nltk.py    From twitter-sentiment-analysis with MIT License
def get_data_from_file(file_name, isTrain=True):
    data = []
    with open(file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if isTrain:
                tag = line.split(',')[1]
                bag_of_words = line.split(',')[2].split()
                if USE_BIGRAMS:
                    bag_of_words_bigram = list(nltk.bigrams(line.split(',')[2].split()))
                    bag_of_words = bag_of_words+bag_of_words_bigram
            else :
                tag = '5'
                bag_of_words = line.split(',')[1].split()
                if USE_BIGRAMS:
                    bag_of_words_bigram = list(nltk.bigrams(line.split(',')[1].split()))
                    bag_of_words = bag_of_words+bag_of_words_bigram
            data.append((bag_of_words, tag))
    return data 
Example #3
Source File: loader.py    From BERT with Apache License 2.0
def __init__(self, data_paths, batch_size, unroll, level):
    self.batch_size = batch_size
    self.unroll = unroll
    train_data, valid_data, test_data, token_to_id, frequencies, hist_freqs, train_tokens = load_text_data(
        data_paths, level)
    self.bg_counts = bigram_counts(train_tokens)
    self.tg_counts = trigram_counts(train_tokens)
    self.token_to_id = token_to_id
    # NOTE extends the vocabulary
    self.token_to_id['<_>'] = len(self.token_to_id)
    self.id_to_token = dict((v, k) for k, v in self.token_to_id.iteritems())
    train_data = _reshape_data(train_data, batch_size, unroll)
    valid_data = _reshape_data(valid_data, batch_size, unroll)
    test_data = _reshape_data(test_data, batch_size, unroll)
    self.split_data = {"train": train_data, "valid": valid_data,
                       "test": test_data}
    self.frequencies = frequencies
    self.frequencies_cumsum = np.cumsum(frequencies)
    self.hist_freqs = hist_freqs
    self.hist_freqs_cumsum = np.cumsum(hist_freqs)
    self.continuations = build_continuations(self.bg_counts)
    bgs = nltk.bigrams(train_tokens)
    if level == "word":
      self.D1, self.D2, self.D3p, self.N1_lookup, self.N2_lookup, self.N3p_lookup = estimate_modkn_discounts(
          bgs) 
Example #4
Source File: util.py    From razzy-spinner with GNU General Public License v3.0
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
    return features

#////////////////////////////////////////////////////////////
#{ Helper Functions
#//////////////////////////////////////////////////////////// 
Example #5
Source File: atis_world.py    From allennlp-semparse with Apache License 2.0
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(" ".join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index, index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])
    return string_linking_scores 
Example #6
Source File: sc_bigramcount.py    From atap with Apache License 2.0
def count_bigrams(corpus):
    text = corpus.map(itemgetter(1))
    sents = text.flatMap(nltk.sent_tokenize)
    sents = sents.map(lambda s: list(nltk.word_tokenize(s)))

    bigrams = sents.flatMap(lambda s: list(nltk.bigrams(s)))
    unique_bigrams = bigrams.distinct().count()
    print("unique bigrams: {}".format(unique_bigrams))

    bigram_counts = bigrams.map(lambda g: (g, 1)).reduceByKey(add).toDF()
    print(bigram_counts.head())


## Main functionality 
Example #7
Source File: large-scale-evaluation-freebase.py    From BREDS with GNU Lesser General Public License v3.0
def extract_bigrams(text):
    tokens = word_tokenize(text)
    return [gram[0]+' '+gram[1] for gram in bigrams(tokens)]


# ########################################
# Estimations of sets and intersections #
# ######################################## 
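A short usage sketch for extract_bigrams (assuming the original module imports word_tokenize and bigrams from nltk, and that NLTK's tokenizer data has been downloaded):

from nltk import word_tokenize, bigrams

print(extract_bigrams("the sky is blue"))
# ['the sky', 'sky is', 'is blue']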
Example #8
Source File: util.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(
            document
        )
    return features


# ////////////////////////////////////////////////////////////
# { Helper Functions
# //////////////////////////////////////////////////////////// 
Example #9
Source File: read.py    From CrisisLex with MIT License
def get_tweet_terms(tweet, stem_map = None, bigrams_map = None):
    words, bigrams = get_stemmed_terms_list(tweet, stem_map, bigrams_map)
    filtered_words = [w for w in words if not w in stopwords.words('english')]

    bigrams = nltk.bigrams(filtered_words)
    words_set = set(filtered_words)
    terms_dict = {}

    for w in words_set:
        terms_dict['%s'%w] = 'y'

    for b in bigrams:
        terms_dict['%s %s'%(b[0],b[1])] = 'y'

    return terms_dict 
Example #10
Source File: read.py    From CrisisLex with MIT License
def get_stemmed_terms_list(doc, stem_words_map = None, stem_bigrams_map = None):
    ps = PorterStemmer()
    local_map = dict()
    word_list = []

    clean_doc = [(w.strip()).lower() for w in doc.split() if len(w) in range(3,16)]
    filtered_words = [w.strip('.,;?!:)(#') for w in clean_doc if not w.strip('.,;?!:)(#') in stopwords.words('english')]

    for w in filtered_words:
        if w.isalpha():
            w_temp = ps.stem_word(w)
            if stem_words_map is not None:
                if w_temp not in stem_words_map:
                    stem_words_map[w_temp] = dict()
                stem_words_map[w_temp][w] = stem_words_map[w_temp].get(w, 0)+1
                local_map[w_temp] = w
            word_list.append(w_temp)

    bigrams = list(nltk.bigrams(word_list))  # materialize; the generator is consumed below and also returned
    for b in bigrams:
        bigram_org = (local_map[b[0]],local_map[b[1]])
        if stem_bigrams_map is not None:
                if b not in stem_bigrams_map:
                    stem_bigrams_map[b] = dict()
                stem_bigrams_map[b][bigram_org] = stem_bigrams_map[b].get(bigram_org, 0)+1

    return word_list, bigrams

# keeps track of the exact form of the stemmed bigrams, not only the one of the words 
Example #11
Source File: adaptive_collect.py    From CrisisLex with MIT License
def update_terms_stats(terms_fd, json_tweet, lex):
    tweet = utils.extract_tweet_from_json(json_tweet)
    tweet_terms = []
    if tweet is None:
        return False
    tokenizer = nltk.RegexpTokenizer('\#?[\w\d]+')
    doc = tokenizer.tokenize(tweet)
    for w_raw in doc:
        w = w_raw.strip('\"\'.,;?!:)(@/*&')
        if not (w.strip('#')).isalpha():
            w_aux = ''
            #ignore non-ascii characters
            for s in w:
                if ord(s) < 128:
                    w_aux += s
                else:
                    break
            w = w_aux
        w = w.lower()
        if (w not in stopwords.words('english') and w not in set(['rt','http','amp'])) and len(w) in range(3, 16):
            if w in lex:
                continue
            tweet_terms.append(w)
            terms_fd.inc(w)
    bigrams = nltk.bigrams(tweet_terms)
    for b in bigrams:
        if b[1]+" "+b[0] in lex or b[0]+" "+b[1] in lex:
            continue
        if b[1]+" "+b[0] in terms_fd:
            terms_fd.inc(b[1]+" "+b[0])
        else:
            terms_fd.inc(b[0]+" "+b[1])
    return True 
Example #12
Source File: SVM.py    From codenn with MIT License
def tokenize(text):
      # text = NB.remove_punctuation(text)
      try:
        text = text.decode('utf-8').encode('ascii', 'replace').strip().lower()
      except:
        text = text.encode('ascii', 'replace').strip().lower()
      word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]   # split punctuations but dont split single quotes for words like don't
      biword =  [b for b in nltk.bigrams(word)]
      triword =  [t for t in nltk.trigrams(word)]
      # word = [w for w in word if w not in stopwords.words('english')]
      return  word # triword 
Example #13
Source File: bigrams.py    From textkit with MIT License
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(sep.join(bigram)) for bigram in bigrams] 
Example #14
Source File: lex_sem_ft.py    From DL-text with MIT License
def train_bigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))

    for sent in lst:
        sent = sent.split()
        for w1, w2 in bigrams(sent, pad_right=True, pad_left=True):
            model[w1][w2] += 1  
    total_count = 0      
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count
    return model

#Total Sum Of Bigram Probability Of A Sentence [Returns Float]:
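A minimal usage sketch for train_bigram (assuming the module's from collections import defaultdict and from nltk import bigrams imports):

from collections import defaultdict
from nltk import bigrams

model = train_bigram(["the cat sat", "the cat ran"])
print(model["the"]["cat"])  # 1.0 -- "cat" always follows "the"
print(model["cat"]["sat"])  # 0.5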
Example #15
Source File: spelling_checker.py    From kaggle-HomeDepot with MIT License
def get_valid_bigram_words(self, words):
        _words = []
        for i in nltk.bigrams(words):
            if (len(i[0]) >= self.min_len) and (len(i[1]) >= self.min_len):
                if (not self.exclude_stopwords) or ((i[0] not in config.STOP_WORDS) and (i[1] not in config.STOP_WORDS)):
                    if (not self.skip_digit) or ((len(re.findall(re.compile("\d+"), i[0])) == 0) and (len(re.findall(re.compile("\d+"), i[1])) == 0)):
                        _words.append(" ".join(i))
        return _words 
Example #16
Source File: utils.py    From BERT with Apache License 2.0
def estimate_modkn_discounts(ngrams):
	# Get counts
	counts = Counter(ngrams)
	N1 = float(len([k for k in counts if counts[k] == 1]))
	N2 = float(len([k for k in counts if counts[k] == 2]))
	N3 = float(len([k for k in counts if counts[k] == 3]))
	N4 = float(len([k for k in counts if counts[k] == 4]))
	N3p = float(len([k for k in counts if counts[k] >= 3]))

	# Estimate discounting parameters
	Y = N1 / (N1 + 2 * N2)
	D1 = 1 - 2 * Y * (N2 / N1)
	D2 = 2 - 3 * Y * (N3 / N2)
	D3p = 3 - 4 * Y * (N4 / N3)

	# FIXME(zxie) Assumes bigrams for now
	# Also compute N1/N2/N3p lookups (context -> n-grams with count 1/2/3+)
	N1_lookup = Counter()
	N2_lookup = Counter()
	N3p_lookup = Counter()
	for bg in counts:
		if counts[bg] == 1:
			N1_lookup[bg[0]] += 1
		elif counts[bg] == 2:
			N2_lookup[bg[0]] += 1
		else:
			N3p_lookup[bg[0]] += 1

	return D1, D2, D3p, N1_lookup, N2_lookup, N3p_lookup 
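A small usage sketch (assuming nltk and collections.Counter are imported as in the source file); the toy corpus is chosen so that bigram counts of 1, 2, and 3 all occur, which keeps the discount formulas free of division by zero:

import nltk
from collections import Counter

tokens = "a b a b a b c d c d e f".split()
D1, D2, D3p, N1_lookup, N2_lookup, N3p_lookup = estimate_modkn_discounts(
    nltk.bigrams(tokens))
print(D1, D2, D3p)     # 0.5 1.25 3.0 for this toy corpus
print(N1_lookup["d"])  # 2: ('d', 'c') and ('d', 'e') each appear once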
Example #17
Source File: lex_sem_ft.py    From DeepLearn with MIT License
def train_bigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))

    for sent in lst:
        sent = sent.split()
        for w1, w2 in bigrams(sent, pad_right=True, pad_left=True):
            model[w1][w2] += 1  
    total_count = 0      
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count
    return model

#Total Sum Of Bigram Probability Of A Sentence [Returns Float]:
Example #18
Source File: atis_world.py    From allennlp-semparse with Apache License 2.0
def add_dates_to_number_linking_scores(
        self,
        number_linking_scores: Dict[str, Tuple[str, str, List[int]]],
        current_tokenized_utterance: List[Token],
    ) -> None:

        month_reverse_lookup = {
            str(number): string for string, number in atis_tables.MONTH_NUMBERS.items()
        }
        day_reverse_lookup = {
            str(number): string for string, number in atis_tables.DAY_NUMBERS.items()
        }

        if self.dates:
            for date in self.dates:
                # Add the year linking score
                entity_linking = [0 for token in current_tokenized_utterance]
                for token_index, token in enumerate(current_tokenized_utterance):
                    if token.text == str(date.year):
                        entity_linking[token_index] = 1
                action = format_action(
                    nonterminal="year_number",
                    right_hand_side=str(date.year),
                    is_number=True,
                    keywords_to_uppercase=KEYWORDS,
                )
                number_linking_scores[action] = ("year_number", str(date.year), entity_linking)

                entity_linking = [0 for token in current_tokenized_utterance]
                for token_index, token in enumerate(current_tokenized_utterance):
                    if token.text == month_reverse_lookup[str(date.month)]:
                        entity_linking[token_index] = 1
                action = format_action(
                    nonterminal="month_number",
                    right_hand_side=str(date.month),
                    is_number=True,
                    keywords_to_uppercase=KEYWORDS,
                )

                number_linking_scores[action] = ("month_number", str(date.month), entity_linking)

                entity_linking = [0 for token in current_tokenized_utterance]
                for token_index, token in enumerate(current_tokenized_utterance):
                    if token.text == day_reverse_lookup[str(date.day)]:
                        entity_linking[token_index] = 1
                for bigram_index, bigram in enumerate(
                    bigrams([token.text for token in current_tokenized_utterance])
                ):
                    if " ".join(bigram) == day_reverse_lookup[str(date.day)]:
                        entity_linking[bigram_index] = 1
                        entity_linking[bigram_index + 1] = 1
                action = format_action(
                    nonterminal="day_number",
                    right_hand_side=str(date.day),
                    is_number=True,
                    keywords_to_uppercase=KEYWORDS,
                )
                number_linking_scores[action] = ("day_number", str(date.day), entity_linking) 
Example #19
Source File: dataset.py    From qb with MIT License
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    def tokenizer(text):
        if strip_qb_patterns:
            text = re.sub(
                '\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)])

            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams

    return tokenizer
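A minimal usage sketch (strip_qb_patterns is disabled here because the module-level regex_pattern is outside this excerpt; NLTK's punkt tokenizer data is assumed to be installed):

tokenizer = create_qb_tokenizer(unigrams=True, bigrams=True, strip_qb_patterns=False)
print(tokenizer("Name this Danish physicist"))
# ['Name', 'this', 'Danish', 'physicist', 'Name++this', 'this++Danish', 'Danish++physicist']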