Python nltk.tokenize.TreebankWordTokenizer() Examples
The following are 15 code examples of nltk.tokenize.TreebankWordTokenizer().
Each example is taken from an open-source project; the source file, project, and license are noted above it. You may also want to check out the other functions and classes available in the nltk.tokenize module.
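Before the individual examples, here is a minimal, self-contained sketch of the two TreebankWordTokenizer methods most of them rely on: tokenize(), which returns Penn Treebank style word tokens, and span_tokenize(), which yields (start, end) character offsets into the original string. The sample sentence is illustrative only.

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
text = "Good muffins cost $3.88 in New York."

# Penn Treebank style tokens; e.g. '$' and '3.88' come out as separate tokens.
print(tokenizer.tokenize(text))

# Character offsets of each token into the original string.
for start, end in tokenizer.span_tokenize(text):
    print(start, end, text[start:end])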
Example #1
Source File: phrasemachine.py, from scattertext (Apache License 2.0)

def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    #return pkgutil.get_data('scattertext',
    #                        'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__)+'/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    #tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    #tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
    # http://www.nltk.org/book/ch05.html
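The comments in this example make a practical point: TreebankWordTokenizer is implemented purely with regular expressions following the Penn Treebank conventions, so instantiating it directly needs no NLTK data download, whereas nltk.word_tokenize first runs the punkt sentence splitter and therefore requires the 'punkt' resource. A minimal sketch of that distinction (the sample sentence is illustrative only):

from nltk.tokenize import TreebankWordTokenizer

# Works offline: PTB-style tokenization is regex-only, no model files needed.
tokenize = TreebankWordTokenizer().tokenize
print(tokenize("Phrase extraction doesn't need the downloader."))

# By contrast, nltk.word_tokenize(text) first applies the punkt sentence
# tokenizer, which must be fetched once with nltk.download('punkt').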
Example #2
Source File: phrasemachine.py, from phrasemachine (MIT License)

def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
    # http://www.nltk.org/book/ch05.html
Example #3
Source File: msrvtt_tagging.py, from Semantics-AssistedVideoCaptioning (MIT License)

def main(tag_gt, word2idx, zipname):
    # Read the caption annotations from the zip archive, tokenize each caption,
    # and set tag_gt[video, word] = 1 for every caption word found in word2idx.
    with zf.ZipFile(zipname) as myzip:
        namelist = myzip.namelist()
        print('namelist:', namelist)
        datainfo = myzip.open(namelist[-1], 'r')
        info_dict = json.load(datainfo)
        sentences = info_dict['sentences']
    tokenizer = TreebankWordTokenizer()
    for sentence in sentences:
        video_id = sentence['video_id']
        video_idx = int(video_id[5:])
        caption = sentence['caption']
        words = tokenizer.tokenize(caption)
        for word in words:
            if word in word2idx:
                tag_gt[video_idx, word2idx[word]] = 1
Example #4
Source File: preprocessing.py, from question-generation (MIT License)

def tokenise(text, asbytes=True, append_eos=False):
    text = text.decode() if asbytes else text
    if use_nltk:
        sents = [s for s in sent_tokenize(text)]
        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
    else:
        for char in string.punctuation+'()-–':
            text = text.replace(char, ' '+char+' ')
        tokens = text.lower().split(' ')
    tokens = [w.encode() if asbytes else w for w in tokens if w.strip() != '']
    if append_eos:
        tokens.append(EOS.encode() if asbytes else EOS)
    # tokens = np.asarray(tokens)
    # return np.asarray(tokens)
    return tokens
Example #5
Source File: lang_proc.py, from SearchingReddit (MIT License)

def stem_and_tokenize_text(text):
    sents = sent_tokenize(text)
    tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
    terms = [Term(token) for token in tokens]
    return filter(lambda term: not term.is_punctuation(), terms)
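This sentence-split-then-tokenize chaining (punkt for sentence boundaries, TreebankWordTokenizer for words) recurs in several of the examples below. A minimal standalone version; the helper name and input text are illustrative:

from nltk.tokenize import TreebankWordTokenizer, sent_tokenize

def words(text):
    # sent_tokenize needs the punkt model (nltk.download('punkt'));
    # the word tokenizer itself needs no downloaded data.
    return [tok for sent in sent_tokenize(text)
            for tok in TreebankWordTokenizer().tokenize(sent)]

print(words("Sentence one is short. Sentence two isn't much longer."))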
Example #6
Source File: UDParser.py, from PredPatt (BSD 3-Clause "New" or "Revised" License)

def tokenize(sentence):
    "Tokenize sentence the way parser expects."
    tokenizer = TreebankWordTokenizer()
    s = tokenizer.tokenize(sentence)
    s = ' '.join(s)
    # character replacements
    s = ''.join(REPLACEMENTS_R.get(x,x) for x in s)
    return s
Example #7
Source File: UDParser.py, from PredPatt (BSD 3-Clause "New" or "Revised" License)

def fresh(self, s, tokenized=False):
    """UD-parse and POS-tag sentence `s`. Returns (UDParse, PTB-parse-string).

    Pass in `tokenized=True` if `s` has already been tokenized, otherwise we
    apply `nltk.tokenize.TreebankWordTokenizer`.

    """
    if self.process is None:
        self._start_subprocess()
    s = str(s.strip())
    if not tokenized:
        s = tokenize(s)
    s = s.strip()
    assert '\n' not in s, "No newline characters allowed %r" % s
    try:
        self.process.stdin.write(s.encode('utf-8'))
    except IOError as e:
        #if e.errno == 32:    # broken pipe
        #    self.process = None
        #    return self(s)   # retry will restart process
        raise e
    self.process.stdin.write(b'\n')
    self.process.stdin.flush()
    out = self.process.stdout.readline()
    if sys.version_info[0] == 3:
        out = out.decode()
    return self.to_ud(out)
Example #8
Source File: bridge.py, from castor (Apache License 2.0)

def parse(self, sentence):
    s_toks = TreebankWordTokenizer().tokenize(sentence)
    sentence = ' '.join(s_toks).lower()
    return sentence
Example #9
Source File: reader.py, from variational-text-tensorflow (MIT License)

def get(self, text=["medical"]):
    if type(text) == str:
        text = text.lower()
        text = TreebankWordTokenizer().tokenize(text)
    try:
        data = np.array(map(self.vocab.get, text))
        return self.onehot(data), data
    except:
        unknowns = []
        for word in text:
            if self.vocab.get(word) == None:
                unknowns.append(word)
        raise Exception(" [!] unknown words: %s" % ",".join(unknowns))
Example #10
Source File: preprocessing.py, from question-generation (MIT License)

def char_pos_to_word(text, tokens, char_pos, asbytes=True):
    ix = 0
    text = text.decode() if asbytes else text
    if use_nltk:
        sents = [s for s in sent_tokenize(text)]
        spans = [[s for s in TreebankWordTokenizer().span_tokenize(sent)] for sent in sents]
        # lens = [len(sent)+1 for sent in sents]
        offsets = []
        for i, sent in enumerate(sents):
            offsets.append(text.find(sent, offsets[i-1]+len(sents[i-1]) if i > 0 else 0))  # can we do this faster?
        spans = [(span[0]+offsets[i], span[1]+offsets[i]) for i, sent in enumerate(spans) for span in sent]
        # print(char_pos)
        for ix, s in enumerate(spans):
            # print(s, tokens[ix])
            if s[1] > char_pos:
                return ix
        print('couldnt find the char pos via nltk')
        print(text, char_pos, len(text))
    else:
        tokens = [t.decode() for t in tokens]
        if char_pos > len(text):
            print('Char pos doesnt fall within size of text!')
        for t, token in enumerate(tokens):
            for char in token:
                ix = text.find(char, ix)
                ix += 1
                if ix >= char_pos:
                    return t
        print('couldnt find the char pos')
        print(text, tokens, char_pos, len(text))

# Filter a complete context down to the sentence containing the start of the answer span
Example #11
Source File: preprocessing.py, from question-generation (MIT License)

def filter_context(ctxt, char_pos, window_size_before=0, window_size_after=0, max_tokens=-1):
    sents = [s for s in sent_tokenize(ctxt)]
    spans = [[s for s in TreebankWordTokenizer().span_tokenize(sent)] for sent in sents]
    # lens = [len(sent)+1 for sent in sents]
    offsets = []
    for i, sent in enumerate(sents):
        # print(ctxt.find(sent, offsets[i-1]+len(sents[i-1]) if i>0 else 0))
        # print(len(sents[i-1]) if i>0 else 0)
        # print(offsets[i-1] if i>0 else 0)
        # print(offsets[i-1]+len(sents[i-1]) if i>0 else 0)
        offsets.append(ctxt.find(sent, offsets[i-1]+len(sents[i-1]) if i > 0 else 0))  # can we do this faster?
    spans = [[(span[0]+offsets[i], span[1]+offsets[i]) for span in sent] for i, sent in enumerate(spans)]
    for ix, sent in enumerate(spans):
        # print(sent[0][0], sent[-1][1], char_pos)
        if char_pos >= sent[0][0] and char_pos < sent[-1][1]:
            start = max(0, ix-window_size_before)
            end = min(len(sents)-1, ix+window_size_after)
            # print(start, end, start, offsets[start])
            # new_ix = char_pos-offsets[start]
            # print(new_ix)
            # print(" ".join(sents[start:end+1])[new_ix:new_ix+10])
            flat_spans = [span for sen in spans for span in sen]
            if max_tokens > -1 and len([span for sen in spans[start:end+1] for span in sen]) > max_tokens:
                for i, span in enumerate(flat_spans):
                    if char_pos < span[1]:
                        tok_ix = i
                        # print(span, char_pos)
                        break
                start_ix = max(spans[start][0][0], flat_spans[max(tok_ix-max_tokens, 0)][0])
                end_ix = min(spans[end][-1][1], flat_spans[min(tok_ix+max_tokens, len(flat_spans)-1)][1])
                # if len(flat_spans[start_tok:end_tok+1]) > 21:
                #     print(start_tok, end_tok, tok_ix)
                #     print(flat_spans[tok_ix])
                #     print(flat_spans[start_tok:end_tok])
                #     print(ctxt[flat_spans[start_tok][0]:flat_spans[end_tok][1]])
                return ctxt[start_ix:end_ix], char_pos-start_ix
            else:
                return " ".join(sents[start:end+1]), char_pos - offsets[start]
    print('couldnt find the char pos')
    print(ctxt, char_pos, len(ctxt))
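Examples #10 and #11 both lean on span_tokenize() to translate a character offset (for instance, the start of an answer span in SQuAD-style data) into a token index. A stripped-down sketch of the same lookup for a single sentence; the helper name, sentence, and offset are illustrative:

from nltk.tokenize import TreebankWordTokenizer

def char_to_token_index(text, char_pos):
    # Return the index of the first token whose span ends after char_pos.
    for i, (start, end) in enumerate(TreebankWordTokenizer().span_tokenize(text)):
        if end > char_pos:
            return i
    return None

sentence = "The answer starts somewhere in this sentence."
print(char_to_token_index(sentence, sentence.find("somewhere")))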
Example #12
Source File: loader.py, from question-generation (MIT License)

def get_vocab(corpus, vocab_size=2000):
    def tokenise(text):
        sents = [s for s in sent_tokenize(text)]
        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
        return tokens
    vocab = {PAD:0, OOV:1, SOS:2, EOS:3}
    word_count = defaultdict(float)
    for l in corpus:
        # for w in l.lower().split():
        for w in tokenise(l):
            word_count[w] += 1
    vocab_list = sorted(word_count, key=word_count.__getitem__, reverse=True)[:min(vocab_size, len(word_count))]
    for w in vocab_list:
        vocab[w] = len(vocab)
    return vocab
Example #13
Source File: loader.py, from question-generation (MIT License)

def get_glove_vocab(path, size=2000, d=200, variant='6B', filter_to_squad=False):
    # this is a copy of the function in preprocessing.py - but we can't use it as we'd get a circular import!
    def tokenise(text):
        sents = [s for s in sent_tokenize(text)]
        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
        return tokens
    vocab = {PAD:0, OOV:1, SOS:2, EOS:3}
    if filter_to_squad:
        squad_words = set()
        squad_train = load_squad_triples(path, dev=False)
        squad_dev = load_squad_triples(path, dev=True)
        for triple in squad_train+squad_dev:
            squad_words |= set(tokenise(triple[0]))
            squad_words |= set(tokenise(triple[1]))
            squad_words |= set(tokenise(triple[2]))
    with open(path+'glove.'+variant+'/glove.'+variant+'.'+str(d)+'d.txt') as fp:
        entries = fp.readlines()
    for i, row in enumerate(entries):
        if len(vocab)-4 >= size and size > 0:
            break
        cols = row.strip().split(' ')
        if len(cols) < d+1:
            print(row)
        if (filter_to_squad and cols[0] in squad_words) or not filter_to_squad:
            vocab[cols[0]] = len(vocab)
    return vocab

# def get_vocab(corpus, vocab_size=1000):
#     lines = [re.sub(r'([\,\?\!\.]+)',r' \1 ', line).lower() for line in corpus]
#     # lines = re.split('[\n]+',raw_data.lower())
#     vocab = {PAD:0,OOV:1, SOS:2, EOS:3}
#     word_count = defaultdict(float)
#     for l in lines:
#         for w in l.split():
#             word_count[w] +=1
#     vocab_list = sorted(word_count, key=word_count.__getitem__,reverse=True)[:min(vocab_size,len(word_count))]
#     for w in vocab_list:
#         vocab[w] = len(vocab)
#     return vocab
Example #14
Source File: bbn2conll.py, from entity-recognition-datasets (MIT License)

def tokenizeit(store):
    #NOTE: how to tokenize stuff with &, like AT&T, S&L or S&P ? Note this
    # seems to be done differently in different corpora.
    tokenizer = TreebankWordTokenizer()
    do_not_tokenize = ['Mr.','Dr.','Mrs.','Ms.','Prof.','Jr.','Sr.','Rep.',
        'Sen.','Rev.','St.','Lt.','Gov.','Gen.','Brig.','Maj.','Col.','Capt.',
        'Sgt.',
        'U.S.','U.K.','U.N.','L.A.','U.S.S.R.','U.S.A.','B.C.',
        'N.V.','G.m.b.H.','S.p.A.','B.V.','N.A.',
        'Pty.','S.A.','Ltd.','Inc.','Bros.','Corp.','Co.','CORP.','L.P.','A.G.',
        'Ltda.','E.U.','I.B.M.','D.T.',
        'Nov.', 'Dec.','Jan.','Feb.','Aug.','Sept.','Sep.','Oct.','a.m.','p.m.',
        'Mass.','Calif.','N.J.','N.M.','N.Y.','N.C.','N.H.','R.I.','Ky.','Va.',
        'S.C.','Neb.',
        'Wash.','Mich.','Conn.','D.C.','Ark.','Pa.','Ind.','Ariz.','Miss.','Fla.',
        'Del.','Nev.','Ore.','Tenn.','Mont.','Ill.','Ala.','Wis.','Ga.','La.',
        'Mo.','Vt.',
        'Blvd.','Ave.','Ln.','Rd.',
        'No.']
    pat = re.compile(r'[0-9][.,]{0,1}[0-9]*')
    for i, x in enumerate(store):
        if x[0] == '\n':
            store[i] = ([x[0]], store[i][1])
        #elif any([i in x[0] for i in do_not_tokenize]) and
        #elif '$' not in x[0] and '%' not in x[0]: #x[0] in do_not_tokenize: #{'Mr.','Dr.'}:
        elif x[0] in do_not_tokenize:
            toks = [x[0]]
            store[i] = (toks, store[i][1])
        elif shall_use_split(x[0], do_not_tokenize):
            #x[0][-4:] in {'Inc.','N.V.','Ltd.'} or x[0][-5:] in {'Corp.'} or x[0][-6:] in {'S.p.A.'} or x[0][-3:] in {'Co.'}:
            toks = x[0].split(' ')
            #print 'Plain split on: ', x[0]
            store[i] = (toks, store[i][1])
        else:
            toks = tokenizer.tokenize(x[0])
            # if '$' not in x[0] and '%' not in x[0] and "'" not in x[0] and "`" not in x[0] and x[0][-1]!='.' and not pat.match(x[0]):
            #     toks = regtok(x[0])
            # elif x[0][-4:] in {'Inc.','N.V.','Ltd.'} or x[0][-5:] in {'Corp.'} or x[0][-6:] in {'S.p.A.'} or x[0][-3:] in {'Co.'}:
            #     toks = x[0].split(' ')
            # elif x[0][0:4] in do_not_tokenize:
            #     toks = [x[0][0:4]]
            #     toks.extend(x[0][4:].split(' '))
            #     toks = [i for i in toks if i!='']
            #     print toks
            # else:
            #     toks = word_tokenize(x[0])
            store[i] = (toks, store[i][1])
    return store
Example #15
Source File: i2b2toconll.py, from entity-recognition-datasets (MIT License)

def tokenizeit(store):
    #NOTE: how to tokenize stuff with &, like AT&T, S&L or S&P ? Note this
    # seems to be done differently in different corpora.
    tokenizer = TreebankWordTokenizer()
    do_not_tokenize = ['Mr.','Dr.','Mrs.','Ms.','Prof.','Jr.','Sr.','Rep.',
        'Sen.','Rev.','St.','Lt.','Gov.','Gen.','Brig.','Maj.','Col.','Capt.',
        'Sgt.','M.D.',
        'U.S.','U.K.','U.N.','L.A.','U.S.S.R.','U.S.A.','B.C.',
        'N.V.','G.m.b.H.','S.p.A.','B.V.','N.A.',
        'Pty.','S.A.','Ltd.','Inc.','Bros.','Corp.','Co.','CORP.','L.P.','A.G.',
        'Ltda.','E.U.','I.B.M.','D.T.',
        'Nov.', 'Dec.','Jan.','Feb.','Aug.','Sept.','Sep.','Oct.','a.m.','p.m.',
        'Mass.','Calif.','N.J.','N.M.','N.Y.','N.C.','N.H.','R.I.','Ky.','Va.',
        'S.C.','Neb.',
        'Wash.','Mich.','Conn.','D.C.','Ark.','Pa.','Ind.','Ariz.','Miss.','Fla.',
        'Del.','Nev.','Ore.','Tenn.','Mont.','Ill.','Ala.','Wis.','Ga.','La.',
        'Mo.','Vt.',
        'Blvd.','Ave.','Ln.','Rd.',
        'No.']
    pat = re.compile(r'[0-9][.,]{0,1}[0-9]*')
    for i, x in enumerate(store):
        if x[0] == '\n':
            store[i] = ([x[0]], store[i][1])
        #elif any([i in x[0] for i in do_not_tokenize]) and
        #elif '$' not in x[0] and '%' not in x[0]: #x[0] in do_not_tokenize: #{'Mr.','Dr.'}:
        elif x[0] in do_not_tokenize:
            toks = [x[0]]
            store[i] = (toks, store[i][1])
        elif shall_use_split(x[0], do_not_tokenize):
            #x[0][-4:] in {'Inc.','N.V.','Ltd.'} or x[0][-5:] in {'Corp.'} or x[0][-6:] in {'S.p.A.'} or x[0][-3:] in {'Co.'}:
            toks = x[0].split(' ')
            #print 'Plain split on: ', x[0]
            store[i] = (toks, store[i][1])
        elif '%' in x[0]:
            toks = tokenizer.tokenize(x[0])
            store[i] = (toks, store[i][1])
        else:
            #NOTE It seems like this is already tokenized inline in the xml,
            # so this way (just splitting spaces) may be best here.
            toks = x[0].split(' ')
            store[i] = (toks, store[i][1])
    return store
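The two CoNLL conversion scripts above share the same guard: strings on the do_not_tokenize whitelist bypass the tokenizer, because the Treebank rules normally split a string-final period off as its own token, which would break abbreviations such as 'Corp.'. A minimal sketch of that pattern, with a hypothetical helper and an illustrative subset of the whitelist:

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
do_not_tokenize = {'Corp.', 'Dr.', 'U.S.'}  # illustrative subset of the full list

def safe_tokenize(text):
    # Keep whitelisted abbreviations as single tokens; tokenize everything else.
    if text in do_not_tokenize:
        return [text]
    return tokenizer.tokenize(text)

print(safe_tokenize('Corp.'))       # ['Corp.']
print(tokenizer.tokenize('Corp.'))  # expected to split the final period: ['Corp', '.']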