Python nltk.trigrams() Examples
The following are 7
code examples of nltk.trigrams().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
nltk
, or try the search function
.
Example #1
Source File: lex_sem_ft.py From DeepLearn with MIT License | 5 votes |
def train_trigram(lst): model = defaultdict(lambda: defaultdict(lambda: 0)) for sent in lst: sent = sent.split() for w1, w2, w3 in trigrams(sent, pad_right=True, pad_left=True): model[(w1,w2)][w2] += 1 total_count = 0 for w1,w2 in model: total_count = float(sum(model[(w1, w2)].values())) for w3 in model[(w1,w2)]: model[(w1, w2)][w3] /= total_count #Total Sum Of Trigram Probablity Of A Sentence[Returns Float]:
Example #2
Source File: utils.py From BERT with Apache License 2.0 | 5 votes |
def trigram_counts(word_list): tgs = nltk.trigrams(word_list) fdist = nltk.FreqDist(tgs) d = Counter() for k, v in fdist.items(): d[k] = v return d
Example #3
Source File: lex_sem_ft.py From DL-text with MIT License | 5 votes |
def train_trigram(lst): model = defaultdict(lambda: defaultdict(lambda: 0)) for sent in lst: sent = sent.split() for w1, w2, w3 in trigrams(sent, pad_right=True, pad_left=True): model[(w1,w2)][w2] += 1 total_count = 0 for w1,w2 in model: total_count = float(sum(model[(w1, w2)].values())) for w3 in model[(w1,w2)]: model[(w1, w2)][w3] /= total_count #Total Sum Of Trigram Probablity Of A Sentence[Returns Float]:
Example #4
Source File: SVM.py From codenn with MIT License | 5 votes |
def tokenize(text): # text = NB.remove_punctuation(text) try: text = text.decode('utf-8').encode('ascii', 'replace').strip().lower() except: text = text.encode('ascii', 'replace').strip().lower() word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)] # split punctuations but dont split single quotes for words like don't biword = [b for b in nltk.bigrams(word)] triword = [t for t in nltk.trigrams(word)] # word = [w for w in word if w not in stopwords.words('english')] return word # triword
Example #5
Source File: words2map.py From words2map with MIT License | 5 votes |
def get_collocations(words): # returns n-grams up to trigrams that appear at least 3 times, with pruning of grams that are redundant minimum_frequency = 3 ngrams = {"_".join(ngram): frequency/len(words) for ngram, frequency in FreqDist(everygrams(words, max_len=3)).items() if frequency > minimum_frequency} collocations = dict(ngrams) for ngram, likelihood in dict(ngrams).iteritems(): grams = ngram.split("_") if len(grams) != 1: gram_likelihoods = [ngrams[gram] for gram in grams] if likelihood < 0.5 * np.prod(gram_likelihoods)**(1 / len(grams)): collocations.pop(ngram, None) else: for gram in grams: collocations.pop(gram, None) return sorted(collocations.items(), key=itemgetter(1), reverse=True)
Example #6
Source File: lang_model_2.py From jakaton_feminicidios with MIT License | 5 votes |
def test(): lm1 = pickle.load(open("lm.bin", 'rb')) tweets_list = deviger.load_dataset('test.txt') for line in tweets_list: sentences = nltk.sent_tokenize(line.strip()) print("Tweet sentences:", sentences) for sent in sentences: words = nltk.word_tokenize(sent) word_trigrams = nltk.trigrams(words) sum_log_prob = 0 for trigram in word_trigrams: logprob = lm1.log_prob(trigram) sum_log_prob += logprob print("(", sum_log_prob / len(words), ")")
Example #7
Source File: dataset.py From qb with MIT License | 4 votes |
def create_qb_tokenizer( unigrams=True, bigrams=False, trigrams=False, zero_length_token='zerolengthunk', strip_qb_patterns=True): def tokenizer(text): if strip_qb_patterns: text = re.sub( '\s+', ' ', re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE) ).strip().capitalize() import nltk tokens = nltk.word_tokenize(text) if len(tokens) == 0: return [zero_length_token] else: ngrams = [] if unigrams: ngrams.extend(tokens) if bigrams: ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)]) if trigrams: ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)]) if len(ngrams) == 0: ngrams.append(zero_length_token) return ngrams return tokenizer