Python nltk.tokenize.TreebankWordTokenizer() Examples
The following are 15 code examples of nltk.tokenize.TreebankWordTokenizer().
Each example is taken from an open-source project; the source file, project, and license are noted above it. You may also want to check out the other functions and classes available in the nltk.tokenize module.
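Before the individual examples, here is a minimal, self-contained sketch of the two TreebankWordTokenizer methods most of them rely on: tokenize(), which returns Penn Treebank style word tokens, and span_tokenize(), which yields (start, end) character offsets into the original string. The sample sentence is illustrative only.

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
text = "Good muffins cost $3.88 in New York."

# Penn Treebank style tokens; e.g. '$' and '3.88' come out as separate tokens.
print(tokenizer.tokenize(text))

# Character offsets of each token into the original string.
for start, end in tokenizer.span_tokenize(text):
    print(start, end, text[start:end])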
Example #1
Source File: phrasemachine.py, from scattertext (Apache License 2.0)

def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    #return pkgutil.get_data('scattertext',
    #                        'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__)+'/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    #tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    #tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
    # http://www.nltk.org/book/ch05.html
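The comments in this example make a practical point: TreebankWordTokenizer is implemented purely with regular expressions following the Penn Treebank conventions, so instantiating it directly needs no NLTK data download, whereas nltk.word_tokenize first runs the punkt sentence splitter and therefore requires the 'punkt' resource. A minimal sketch of that distinction (the sample sentence is illustrative only):

from nltk.tokenize import TreebankWordTokenizer

# Works offline: PTB-style tokenization is regex-only, no model files needed.
tokenize = TreebankWordTokenizer().tokenize
print(tokenize("Phrase extraction doesn't need the downloader."))

# By contrast, nltk.word_tokenize(text) first applies the punkt sentence
# tokenizer, which must be fetched once with nltk.download('punkt').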
Example #2
Source File: phrasemachine.py, from phrasemachine (MIT License)

def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
    # http://www.nltk.org/book/ch05.html
Example #3
Source File: msrvtt_tagging.py, from Semantics-AssistedVideoCaptioning (MIT License)

def main(tag_gt, word2idx, zipname):
    # Read the caption annotations from the zip archive, tokenize each caption,
    # and set tag_gt[video, word] = 1 for every caption word found in word2idx.
    with zf.ZipFile(zipname) as myzip:
        namelist = myzip.namelist()
        print('namelist:', namelist)
        datainfo = myzip.open(namelist[-1], 'r')
        info_dict = json.load(datainfo)
        sentences = info_dict['sentences']
    tokenizer = TreebankWordTokenizer()
    for sentence in sentences:
        video_id = sentence['video_id']
        video_idx = int(video_id[5:])
        caption = sentence['caption']
        words = tokenizer.tokenize(caption)
        for word in words:
            if word in word2idx:
                tag_gt[video_idx, word2idx[word]] = 1
Example #4
Source File: preprocessing.py, from question-generation (MIT License)

def tokenise(text, asbytes=True, append_eos=False):
    text = text.decode() if asbytes else text
    if use_nltk:
        sents = [s for s in sent_tokenize(text)]
        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
    else:
        for char in string.punctuation+'()-–':
            text = text.replace(char, ' '+char+' ')
        tokens = text.lower().split(' ')
    tokens = [w.encode() if asbytes else w for w in tokens if w.strip() != '']
    if append_eos:
        tokens.append(EOS.encode() if asbytes else EOS)
    # tokens = np.asarray(tokens)
    # return np.asarray(tokens)
    return tokens
Example #5
Source File: lang_proc.py, from SearchingReddit (MIT License)

def stem_and_tokenize_text(text):
    sents = sent_tokenize(text)
    tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
    terms = [Term(token) for token in tokens]
    return filter(lambda term: not term.is_punctuation(), terms)
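This sentence-split-then-tokenize chaining (punkt for sentence boundaries, TreebankWordTokenizer for words) recurs in several of the examples below. A minimal standalone version; the helper name and input text are illustrative:

from nltk.tokenize import TreebankWordTokenizer, sent_tokenize

def words(text):
    # sent_tokenize needs the punkt model (nltk.download('punkt'));
    # the word tokenizer itself needs no downloaded data.
    return [tok for sent in sent_tokenize(text)
            for tok in TreebankWordTokenizer().tokenize(sent)]

print(words("Sentence one is short. Sentence two isn't much longer."))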
Example #6
Source File: UDParser.py, from PredPatt (BSD 3-Clause "New" or "Revised" License)

def tokenize(sentence):
    "Tokenize sentence the way parser expects."
    tokenizer = TreebankWordTokenizer()
    s = tokenizer.tokenize(sentence)
    s = ' '.join(s)
    # character replacements
    s = ''.join(REPLACEMENTS_R.get(x,x) for x in s)
    return s
Example #7
Source File: UDParser.py, from PredPatt (BSD 3-Clause "New" or "Revised" License)

def fresh(self, s, tokenized=False):
    """UD-parse and POS-tag sentence `s`. Returns (UDParse, PTB-parse-string).

    Pass in `tokenized=True` if `s` has already been tokenized, otherwise we
    apply `nltk.tokenize.TreebankWordTokenizer`.

    """
    if self.process is None:
        self._start_subprocess()
    s = str(s.strip())
    if not tokenized:
        s = tokenize(s)
    s = s.strip()
    assert '\n' not in s, "No newline characters allowed %r" % s
    try:
        self.process.stdin.write(s.encode('utf-8'))
    except IOError as e:
        #if e.errno == 32:    # broken pipe
        #    self.process = None
        #    return self(s)   # retry will restart process
        raise e
    self.process.stdin.write(b'\n')
    self.process.stdin.flush()
    out = self.process.stdout.readline()
    if sys.version_info[0] == 3:
        out = out.decode()
    return self.to_ud(out)
Example #8
Source File: bridge.py, from castor (Apache License 2.0)

def parse(self, sentence):
    s_toks = TreebankWordTokenizer().tokenize(sentence)
    sentence = ' '.join(s_toks).lower()
    return sentence
Example #9
Source File: reader.py, from variational-text-tensorflow (MIT License)

def get(self, text=["medical"]):
    if type(text) == str:
        text = text.lower()
        text = TreebankWordTokenizer().tokenize(text)
    try:
        data = np.array(map(self.vocab.get, text))
        return self.onehot(data), data
    except:
        unknowns = []
        for word in text:
            if self.vocab.get(word) == None:
                unknowns.append(word)
        raise Exception(" [!] unknown words: %s" % ",".join(unknowns))
Example #10
Source File: preprocessing.py, from question-generation (MIT License)

def char_pos_to_word(text, tokens, char_pos, asbytes=True):
    ix = 0
    text = text.decode() if asbytes else text
    if use_nltk:
        sents = [s for s in sent_tokenize(text)]
        spans = [[s for s in TreebankWordTokenizer().span_tokenize(sent)] for sent in sents]
        # lens = [len(sent)+1 for sent in sents]
        offsets = []
        for i, sent in enumerate(sents):
            offsets.append(text.find(sent, offsets[i-1]+len(sents[i-1]) if i > 0 else 0))  # can we do this faster?
        spans = [(span[0]+offsets[i], span[1]+offsets[i]) for i, sent in enumerate(spans) for span in sent]
        # print(char_pos)
        for ix, s in enumerate(spans):
            # print(s, tokens[ix])
            if s[1] > char_pos:
                return ix
        print('couldnt find the char pos via nltk')
        print(text, char_pos, len(text))
    else:
        tokens = [t.decode() for t in tokens]
        if char_pos > len(text):
            print('Char pos doesnt fall within size of text!')
        for t, token in enumerate(tokens):
            for char in token:
                ix = text.find(char, ix)
                ix += 1
                if ix >= char_pos:
                    return t
        print('couldnt find the char pos')
        print(text, tokens, char_pos, len(text))

# Filter a complete context down to the sentence containing the start of the answer span
Example #11
Source File: preprocessing.py, from question-generation (MIT License)

def filter_context(ctxt, char_pos, window_size_before=0, window_size_after=0, max_tokens=-1):
    sents = [s for s in sent_tokenize(ctxt)]
    spans = [[s for s in TreebankWordTokenizer().span_tokenize(sent)] for sent in sents]
    # lens = [len(sent)+1 for sent in sents]
    offsets = []
    for i, sent in enumerate(sents):
        # print(ctxt.find(sent, offsets[i-1]+len(sents[i-1]) if i>0 else 0))
        # print(len(sents[i-1]) if i>0 else 0)
        # print(offsets[i-1] if i>0 else 0)
        # print(offsets[i-1]+len(sents[i-1]) if i>0 else 0)
        offsets.append(ctxt.find(sent, offsets[i-1]+len(sents[i-1]) if i > 0 else 0))  # can we do this faster?
    spans = [[(span[0]+offsets[i], span[1]+offsets[i]) for span in sent] for i, sent in enumerate(spans)]
    for ix, sent in enumerate(spans):
        # print(sent[0][0], sent[-1][1], char_pos)
        if char_pos >= sent[0][0] and char_pos < sent[-1][1]:
            start = max(0, ix-window_size_before)
            end = min(len(sents)-1, ix+window_size_after)
            # print(start, end, start, offsets[start])
            # new_ix = char_pos-offsets[start]
            # print(new_ix)
            # print(" ".join(sents[start:end+1])[new_ix:new_ix+10])
            flat_spans = [span for sen in spans for span in sen]
            if max_tokens > -1 and len([span for sen in spans[start:end+1] for span in sen]) > max_tokens:
                for i, span in enumerate(flat_spans):
                    if char_pos < span[1]:
                        tok_ix = i
                        # print(span, char_pos)
                        break
                start_ix = max(spans[start][0][0], flat_spans[max(tok_ix-max_tokens, 0)][0])
                end_ix = min(spans[end][-1][1], flat_spans[min(tok_ix+max_tokens, len(flat_spans)-1)][1])
                # if len(flat_spans[start_tok:end_tok+1]) > 21:
                #     print(start_tok, end_tok, tok_ix)
                #     print(flat_spans[tok_ix])
                #     print(flat_spans[start_tok:end_tok])
                #     print(ctxt[flat_spans[start_tok][0]:flat_spans[end_tok][1]])
                return ctxt[start_ix:end_ix], char_pos-start_ix
            else:
                return " ".join(sents[start:end+1]), char_pos - offsets[start]
    print('couldnt find the char pos')
    print(ctxt, char_pos, len(ctxt))
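Examples #10 and #11 both lean on span_tokenize() to translate a character offset (for instance, the start of an answer span in SQuAD-style data) into a token index. A stripped-down sketch of the same lookup for a single sentence; the helper name, sentence, and offset are illustrative:

from nltk.tokenize import TreebankWordTokenizer

def char_to_token_index(text, char_pos):
    # Return the index of the first token whose span ends after char_pos.
    for i, (start, end) in enumerate(TreebankWordTokenizer().span_tokenize(text)):
        if end > char_pos:
            return i
    return None

sentence = "The answer starts somewhere in this sentence."
print(char_to_token_index(sentence, sentence.find("somewhere")))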
Example #12
Source File: loader.py, from question-generation (MIT License)

def get_vocab(corpus, vocab_size=2000):
    def tokenise(text):
        sents = [s for s in sent_tokenize(text)]
        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
        return tokens
    vocab = {PAD:0, OOV:1, SOS:2, EOS:3}
    word_count = defaultdict(float)
    for l in corpus:
        # for w in l.lower().split():
        for w in tokenise(l):
            word_count[w] += 1
    vocab_list = sorted(word_count, key=word_count.__getitem__, reverse=True)[:min(vocab_size, len(word_count))]
    for w in vocab_list:
        vocab[w] = len(vocab)
    return vocab
Example #13
Source File: loader.py, from question-generation (MIT License)

def get_glove_vocab(path, size=2000, d=200, variant='6B', filter_to_squad=False):
    # this is a copy of the function in preprocessing.py - but we can't use it as we'd get a circular import!
    def tokenise(text):
        sents = [s for s in sent_tokenize(text)]
        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
        return tokens
    vocab = {PAD:0, OOV:1, SOS:2, EOS:3}
    if filter_to_squad:
        squad_words = set()
        squad_train = load_squad_triples(path, dev=False)
        squad_dev = load_squad_triples(path, dev=True)
        for triple in squad_train+squad_dev:
            squad_words |= set(tokenise(triple[0]))
            squad_words |= set(tokenise(triple[1]))
            squad_words |= set(tokenise(triple[2]))
    with open(path+'glove.'+variant+'/glove.'+variant+'.'+str(d)+'d.txt') as fp:
        entries = fp.readlines()
    for i, row in enumerate(entries):
        if len(vocab)-4 >= size and size > 0:
            break
        cols = row.strip().split(' ')
        if len(cols) < d+1:
            print(row)
        if (filter_to_squad and cols[0] in squad_words) or not filter_to_squad:
            vocab[cols[0]] = len(vocab)
    return vocab

# def get_vocab(corpus, vocab_size=1000):
#     lines = [re.sub(r'([\,\?\!\.]+)',r' \1 ', line).lower() for line in corpus]
#     # lines = re.split('[\n]+',raw_data.lower())
#     vocab = {PAD:0,OOV:1, SOS:2, EOS:3}
#     word_count = defaultdict(float)
#     for l in lines:
#         for w in l.split():
#             word_count[w] +=1
#     vocab_list = sorted(word_count, key=word_count.__getitem__,reverse=True)[:min(vocab_size,len(word_count))]
#     for w in vocab_list:
#         vocab[w] = len(vocab)
#     return vocab
Example #14
Source File: bbn2conll.py, from entity-recognition-datasets (MIT License)

def tokenizeit(store):
    #NOTE: how to tokenize stuff with &, like AT&T, S&L or S&P ? Note this
    # seems to be done differently in different corpora.
    tokenizer = TreebankWordTokenizer()
    do_not_tokenize = ['Mr.','Dr.','Mrs.','Ms.','Prof.','Jr.','Sr.','Rep.',
        'Sen.','Rev.','St.','Lt.','Gov.','Gen.','Brig.','Maj.','Col.','Capt.',
        'Sgt.',
        'U.S.','U.K.','U.N.','L.A.','U.S.S.R.','U.S.A.','B.C.',
        'N.V.','G.m.b.H.','S.p.A.','B.V.','N.A.',
        'Pty.','S.A.','Ltd.','Inc.','Bros.','Corp.','Co.','CORP.','L.P.','A.G.',
        'Ltda.','E.U.','I.B.M.','D.T.',
        'Nov.', 'Dec.','Jan.','Feb.','Aug.','Sept.','Sep.','Oct.','a.m.','p.m.',
        'Mass.','Calif.','N.J.','N.M.','N.Y.','N.C.','N.H.','R.I.','Ky.','Va.',
        'S.C.','Neb.',
        'Wash.','Mich.','Conn.','D.C.','Ark.','Pa.','Ind.','Ariz.','Miss.','Fla.',
        'Del.','Nev.','Ore.','Tenn.','Mont.','Ill.','Ala.','Wis.','Ga.','La.',
        'Mo.','Vt.',
        'Blvd.','Ave.','Ln.','Rd.',
        'No.']
    pat = re.compile(r'[0-9][.,]{0,1}[0-9]*')
    for i, x in enumerate(store):
        if x[0] == '\n':
            store[i] = ([x[0]], store[i][1])
        #elif any([i in x[0] for i in do_not_tokenize]) and
        #elif '$' not in x[0] and '%' not in x[0]: #x[0] in do_not_tokenize: #{'Mr.','Dr.'}:
        elif x[0] in do_not_tokenize:
            toks = [x[0]]
            store[i] = (toks, store[i][1])
        elif shall_use_split(x[0], do_not_tokenize):
            #x[0][-4:] in {'Inc.','N.V.','Ltd.'} or x[0][-5:] in {'Corp.'} or x[0][-6:] in {'S.p.A.'} or x[0][-3:] in {'Co.'}:
            toks = x[0].split(' ')
            #print 'Plain split on: ', x[0]
            store[i] = (toks, store[i][1])
        else:
            toks = tokenizer.tokenize(x[0])
            # if '$' not in x[0] and '%' not in x[0] and "'" not in x[0] and "`" not in x[0] and x[0][-1]!='.' and not pat.match(x[0]):
            #     toks = regtok(x[0])
            # elif x[0][-4:] in {'Inc.','N.V.','Ltd.'} or x[0][-5:] in {'Corp.'} or x[0][-6:] in {'S.p.A.'} or x[0][-3:] in {'Co.'}:
            #     toks = x[0].split(' ')
            # elif x[0][0:4] in do_not_tokenize:
            #     toks = [x[0][0:4]]
            #     toks.extend(x[0][4:].split(' '))
            #     toks = [i for i in toks if i!='']
            #     print toks
            # else:
            #     toks = word_tokenize(x[0])
            store[i] = (toks, store[i][1])
    return store
Example #15
Source File: i2b2toconll.py, from entity-recognition-datasets (MIT License)

def tokenizeit(store):
    #NOTE: how to tokenize stuff with &, like AT&T, S&L or S&P ? Note this
    # seems to be done differently in different corpora.
    tokenizer = TreebankWordTokenizer()
    do_not_tokenize = ['Mr.','Dr.','Mrs.','Ms.','Prof.','Jr.','Sr.','Rep.',
        'Sen.','Rev.','St.','Lt.','Gov.','Gen.','Brig.','Maj.','Col.','Capt.',
        'Sgt.','M.D.',
        'U.S.','U.K.','U.N.','L.A.','U.S.S.R.','U.S.A.','B.C.',
        'N.V.','G.m.b.H.','S.p.A.','B.V.','N.A.',
        'Pty.','S.A.','Ltd.','Inc.','Bros.','Corp.','Co.','CORP.','L.P.','A.G.',
        'Ltda.','E.U.','I.B.M.','D.T.',
        'Nov.', 'Dec.','Jan.','Feb.','Aug.','Sept.','Sep.','Oct.','a.m.','p.m.',
        'Mass.','Calif.','N.J.','N.M.','N.Y.','N.C.','N.H.','R.I.','Ky.','Va.',
        'S.C.','Neb.',
        'Wash.','Mich.','Conn.','D.C.','Ark.','Pa.','Ind.','Ariz.','Miss.','Fla.',
        'Del.','Nev.','Ore.','Tenn.','Mont.','Ill.','Ala.','Wis.','Ga.','La.',
        'Mo.','Vt.',
        'Blvd.','Ave.','Ln.','Rd.',
        'No.']
    pat = re.compile(r'[0-9][.,]{0,1}[0-9]*')
    for i, x in enumerate(store):
        if x[0] == '\n':
            store[i] = ([x[0]], store[i][1])
        #elif any([i in x[0] for i in do_not_tokenize]) and
        #elif '$' not in x[0] and '%' not in x[0]: #x[0] in do_not_tokenize: #{'Mr.','Dr.'}:
        elif x[0] in do_not_tokenize:
            toks = [x[0]]
            store[i] = (toks, store[i][1])
        elif shall_use_split(x[0], do_not_tokenize):
            #x[0][-4:] in {'Inc.','N.V.','Ltd.'} or x[0][-5:] in {'Corp.'} or x[0][-6:] in {'S.p.A.'} or x[0][-3:] in {'Co.'}:
            toks = x[0].split(' ')
            #print 'Plain split on: ', x[0]
            store[i] = (toks, store[i][1])
        elif '%' in x[0]:
            toks = tokenizer.tokenize(x[0])
            store[i] = (toks, store[i][1])
        else:
            #NOTE It seems like this is already tokenized inline in the xml,
            # so this way (just splitting spaces) may be best here.
            toks = x[0].split(' ')
            store[i] = (toks, store[i][1])
    return store
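The two CoNLL conversion scripts above share the same guard: strings on the do_not_tokenize whitelist bypass the tokenizer, because the Treebank rules normally split a string-final period off as its own token, which would break abbreviations such as 'Corp.'. A minimal sketch of that pattern, with a hypothetical helper and an illustrative subset of the whitelist:

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
do_not_tokenize = {'Corp.', 'Dr.', 'U.S.'}  # illustrative subset of the full list

def safe_tokenize(text):
    # Keep whitelisted abbreviations as single tokens; tokenize everything else.
    if text in do_not_tokenize:
        return [text]
    return tokenizer.tokenize(text)

print(safe_tokenize('Corp.'))       # ['Corp.']
print(tokenizer.tokenize('Corp.'))  # expected to split the final period: ['Corp', '.']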