Python nltk.RegexpParser() Examples
Example #1
def text_to_num(text):
    """Replace number-like phrases in *text* with the output of ``t2n.text2num``.

    The text is tokenized and POS-tagged with NLTK, then chunked with a single
    ``Chunk0`` rule.  Each chunk's words are joined with single spaces and
    handed to ``t2n.text2num``; the stringified result replaces every
    occurrence of that phrase in the text.  A chunk that fails to convert is
    left as-is and the error is printed (best-effort behaviour).

    Intermediate results (tags, chunk tree, each phrase, final text) are
    printed to stdout.

    :param text: input string
    :return: the text after all successful replacements
    """
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)

    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    # Compare labels with ==: the previous `label in "Chunk0"` was a substring
    # test and would also accept labels such as "" or "Chunk".
    for subtree in chunkedData.subtrees(filter=lambda t: t.label() == "Chunk0"):
        exp = " ".join(str(leaf[0]) for leaf in subtree.leaves())
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            # Deliberately best-effort: report and continue with the next chunk.
            print("error text2num ->", e.args)

    print("text2num -> ", text)
    return text
Example #2
def test_baseline():
    """Print the evaluation of an empty-grammar chunker on CoNLL-2000.

    A ``RegexpParser`` built from an empty grammar is scored against the
    NP-chunked sentences of the corpus file ``test.txt``; the score object is
    printed, nothing is returned.
    """
    baseline_chunker = nltk.RegexpParser("")
    gold_sentences = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    print(baseline_chunker.evaluate(gold_sentences))
Example #3
def __init__(self):
    """Store the chunk grammar and build an ``nltk.RegexpParser`` from it.

    Sets ``self.grammar`` (the grammar text, one rule per line) and
    ``self.cp`` (the parser built from it).
    """
    # NOTE(review): tag names such as ADJ_SIM / V_PRS / N_SING presumably
    # come from the POS tagger used by the caller -- confirm the tag set.
    chunk_rules = r"""
        VP: {<ADJ_SIM><V_PRS>}
        VP: {<ADJ_INO><V.*>}
        VP: {<V_PRS><N_SING><V_SUB>}
        NP: {<N_SING><ADJ.*><N_SING>}
        NP: {<N.*><PRO>}
        VP: {<N_SING><V_.*>}
        VP: {<V.*>+}
        NP: {<ADJ.*>?<N.*>+ <ADJ.*>?}
        DNP: {<DET><NP>}
        PP: {<ADJ_CMPR><P>}
        PP: {<ADJ_SIM><P>}
        PP: {<P><N_SING>}
        PP: {<P>*}
        DDNP: {<NP><DNP>}
        NPP: {<PP><NP>+}
        """
    self.grammar = chunk_rules
    self.cp = nltk.RegexpParser(chunk_rules)
Example #4
def myParser():
    """Return an ``nltk.RegexpParser`` with three ``NP`` chunk rules.

    The rules are applied as separate stages, in the order listed: optional
    determiners followed by a proper noun, optional adjectives followed by a
    noun, and runs of proper nouns.
    """
    np_rules = (
        'NP: {<DT>*<NNP>}',
        'NP: {<JJ>*<NN>}',
        'NP: {<NNP>+}',
    )
    return nltk.RegexpParser('\n'.join(np_rules))
Example #5
def test_regexp():
    """Print the evaluation of a one-rule NP chunker on CoNLL-2000.

    The rule chunks any run of tags whose first letter is C, D, J, N or P.
    The score against the NP-chunked sentences of ``test.txt`` is printed;
    nothing is returned.
    """
    np_chunker = nltk.RegexpParser(r"NP: {<[CDJNP].*>+}")
    gold_sentences = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    print(np_chunker.evaluate(gold_sentences))
Example #6
def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text

    The text is tokenized, English stop words are dropped (both the raw token
    and its lemma are checked), the remaining tokens are POS-tagged, and runs
    of proper nouns (``NNP``) are chunked.  For every chunk of at least two
    tokens that contains the word "experience" (case-insensitive), the text
    following the first occurrence of that word is collected.

    :param resume_text: Plain resume text
    :return: list of strings, one per matching chunk
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # word tokenization
    word_tokens = nltk.word_tokenize(resume_text)

    # remove stop words and lemmatize
    filtered_sentence = [
        w for w in word_tokens
        if w not in stop_words
        and wordnet_lemmatizer.lemmatize(w) not in stop_words
    ]
    sent = nltk.pos_tag(filtered_sentence)

    # parse regex
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)

    keyword = 'experience'
    experience = []
    for chunk in cs.subtrees(filter=lambda t: t.label() == 'P'):
        leaves = chunk.leaves()
        # Single-token chunks never contributed to the result; skip them
        # up front instead of appending an empty string and filtering later.
        if len(leaves) < 2:
            continue
        phrase = " ".join(leaf[0] for leaf in leaves)
        position = phrase.lower().find(keyword)
        if position != -1:
            # Keep everything after the keyword (was a hard-coded "+ 10").
            experience.append(phrase[position + len(keyword):])
    return experience
Example #7
def setup_extractor(self):
    """Prepare the sentence splitter, the chunk parser and the token lookup.

    * ``self.splitter`` is set to a ``PunktSentenceSplitter`` for
      ``self.language``.
    * ``self.parser`` is set to a ``RegexpParser`` built from the grammar
      registered for ``self.language`` in ``self.grammars``.
    * every value of ``self.lemma_to_token`` is replaced by a set of its
      lower-cased tokens.

    :raises ValueError: if ``self.grammars`` has no (non-empty) grammar for
        ``self.language``
    """
    self.splitter = PunktSentenceSplitter(self.language)

    grammar = self.grammars.get(self.language)
    if not grammar:
        raise ValueError(
            "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % (
                self.language, list(self.grammars.keys()))
        )
    self.parser = RegexpParser(grammar)

    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for lemma, match_tokens in self.lemma_to_token.items():
        self.lemma_to_token[lemma] = {match.lower() for match in match_tokens}
Example #8
def get_parse_info(parsestr, stemmer, language, stoplist):
    """Extract POS information and candidate phrases from a bracketed parse.

    The parse string is read with ``Tree.fromstring``; every subtree of
    height 2 (a POS node directly above a word) yields a ``(word, pos)``
    pair.  The pairs are chunked with a language-specific grammar and the
    chunk tree is passed to ``get_terms`` to obtain the phrases.

    :param parsestr: bracketed parse tree string
    :param stemmer: object with a ``stem(word)`` method
    :param language: ``'german'`` or ``'english'``
    :param stoplist: forwarded unchanged to ``get_terms``
    :return: tuple ``(hash_token_pos, phrase_list)`` where ``hash_token_pos``
        is an ``OrderedDict`` mapping each stemmed word to ``"word::pos"``
        and ``phrase_list`` holds the space-joined non-empty terms
    :raises ValueError: if *language* is not supported (previously this
        surfaced as an ``UnboundLocalError`` on ``grammar``)
    """
    hash_token_pos = OrderedDict()

    if language == 'german':
        grammar = r"""
            NBAR:
                {<N.*|ADJ.*>*<N.*>}  # Nouns and Adjectives, terminated with Nouns
            VP:
                {<V.*>}  # terminated with Verbs
            NP:
                {<NBAR>}
                {<NBAR><APPR><NBAR>}  # Above, connected with in/of/etc...
        """
    elif language == 'english':
        # Taken from Su Nam Kim Paper...
        grammar = r"""
            NBAR:
                {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
            VP:
                {<V.*>}  # terminated with Verbs
            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        """
    else:
        raise ValueError(
            "Unsupported language: %r (expected 'german' or 'english')" % (language,)
        )

    chunker = RegexpParser(grammar)

    postoks = []
    for node in Tree.fromstring(parsestr).subtrees():
        # Height 2 means the node's child is a leaf, i.e. a POS tag over a word.
        if node.height() == 2:
            word, pos = node[0], node.label()
            hash_token_pos[stemmer.stem(word)] = word + u"::" + pos
            postoks.append((word, pos))

    chunk_tree = chunker.parse(postoks)
    phrases = get_terms(chunk_tree, stemmer, stoplist)
    phrase_list = [' '.join(term) for term in phrases if term]
    return hash_token_pos, phrase_list
Example #9
def nltk_parse_clause(sentence):
    """Chunk a POS-tagged sentence with a cascaded NP/PP/VP/CLAUSE grammar.

    Natural Language Toolkit: code_cascaded_chunker

    Example input::

        [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
         ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]

    :param sentence: list of ``(word, tag)`` pairs
    :return: the chunk tree produced by the parser (the original computed
        this tree but discarded it, so the function always returned ``None``)
    """
    grammar = r"""
      NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
      PP: {<IN><NP>}               # Chunk prepositions followed by NP
      VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
      CLAUSE: {<NP><VP>}           # Chunk NP, VP
      """
    cp = nltk.RegexpParser(grammar)
    parsed_sentence = cp.parse(sentence)
    return parsed_sentence
Example #10
def extract_candidates(text_obj, no_subset=False):
    """
    Collect candidate keyphrases (noun phrases of at most five words).

    The grammar is obtained from ``get_grammar(text_obj.lang)``; every ``NP``
    subtree found in ``text_obj.pos_tagged`` contributes its words joined by
    single spaces.

    :param text_obj: Input text representation, see @InputTextObj (its
        ``lang`` and ``pos_tagged`` attributes are read)
    :param no_subset: if true, the candidates are passed through
        ``unique_ngram_candidates`` so that a candidate which is a subset of
        another candidate is not kept
    :return: list of candidate phrases (string) when ``no_subset`` is false,
        otherwise whatever ``unique_ngram_candidates`` returns
    """
    np_chunker = nltk.RegexpParser(get_grammar(text_obj.lang))
    # parse_sents yields one tree per tagged sentence.
    sentence_trees = np_chunker.parse_sents(text_obj.pos_tagged)

    phrases = set()
    for sentence_tree in sentence_trees:
        noun_phrases = sentence_tree.subtrees(filter=lambda node: node.label() == 'NP')
        for noun_phrase in noun_phrases:
            tokens = [word for word, tag in noun_phrase.leaves()]
            phrases.add(' '.join(tokens))

    # Discard phrases longer than five whitespace-separated words.
    short_phrases = {phrase for phrase in phrases if len(phrase.split()) <= 5}

    if no_subset:
        return unique_ngram_candidates(short_phrases)
    return list(short_phrases)
Example #11
def generate_tree(text):
    """Return the chunk tree for *text*.

    Curly quotes and apostrophes are first rewritten to their plain ASCII
    forms (to preserve quotes in text, primarily news content), then the text
    is passed through ``unidecode``, tokenized, POS-tagged and chunked.

    The chunk grammar is read from the name ``grammar``, which is not defined
    inside this function and must exist in an enclosing scope.

    :param text: raw input text
    :return: the tree produced by ``nltk.RegexpParser(grammar).parse``
    """
    quote_substitutions = (
        ('“', '"'),
        ('”', '"'),
        ('’', "'"),
    )
    for fancy, plain in quote_substitutions:
        text = text.replace(fancy, plain)
    ascii_text = unidecode(text)

    chunker = nltk.RegexpParser(grammar)
    tokens = nltk.tokenize.word_tokenize(ascii_text)
    tagged_tokens = nltk.tag.pos_tag(tokens)
    return chunker.parse(tagged_tokens)
Example #12
def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text

    The text is tokenized, English stop words are dropped (both the raw token
    and its lemma are checked), the remaining tokens are POS-tagged, and runs
    of proper nouns (``NNP``) are chunked.  For every chunk of at least two
    tokens that contains the word "experience" (case-insensitive), the text
    following the first occurrence of that word is collected.

    :param resume_text: Plain resume text
    :return: list of strings, one per matching chunk
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # word tokenization
    word_tokens = nltk.word_tokenize(resume_text)

    # remove stop words and lemmatize
    filtered_sentence = [
        w for w in word_tokens
        if w not in stop_words
        and wordnet_lemmatizer.lemmatize(w) not in stop_words
    ]
    sent = nltk.pos_tag(filtered_sentence)

    # parse regex
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)

    keyword = 'experience'
    experience = []
    for chunk in cs.subtrees(filter=lambda t: t.label() == 'P'):
        leaves = chunk.leaves()
        # Single-token chunks never contributed to the result; skip them
        # up front instead of appending an empty string and filtering later.
        if len(leaves) < 2:
            continue
        phrase = " ".join(leaf[0] for leaf in leaves)
        position = phrase.lower().find(keyword)
        if position != -1:
            # Keep everything after the keyword (was a hard-coded "+ 10").
            experience.append(phrase[position + len(keyword):])
    return experience
Example #13
def keywords_syntax_nltk(sentence):
    """Return the keyword phrases found in *sentence* by NP chunking.

    The sentence is split into phrases with ``NLP_sent.phrase_splitting``.
    Phrases of two characters or fewer are ignored.  For every other phrase
    the terms are taken from the module-level ``text_terms`` cache when
    present; otherwise the lower-cased phrase is split into words,
    POS-tagged, chunked, and the result of ``get_terms`` is stored in the
    cache.

    :param sentence: input sentence
    :return: list of space-joined, non-empty terms
    """
    global text_terms

    grammar = r"""
        NBAR:
            # Nouns and Adjectives, terminated with Nouns
            {<NN.*|JJ>*<NN.*>}
        NP:
            {<NBAR>}
            # Above, connected with in/of/etc...
            {<NBAR><IN><NBAR>}
    """
    # The parser does not depend on the phrase, so build it at most once per
    # call (and not at all when every phrase is already cached).
    cp = None

    terms = []
    for phrase in NLP_sent.phrase_splitting(sentence):
        if len(phrase) <= 2:  # e.g. 'ii'
            continue
        if phrase in text_terms:
            phrase_terms = text_terms[phrase]
        else:
            # ------------------- POS tagging output
            words = NLP_word.word_splitting(phrase.lower())
            pos_tags = NLP_word.word_pos_tagging(words)
            # ------------------- parsed tree
            if cp is None:
                cp = nltk.RegexpParser(grammar, loop=2)
            cp_tree = cp.parse(pos_tags)
            phrase_terms = get_terms(cp_tree)
            text_terms[phrase] = phrase_terms
        terms += phrase_terms

    keywords = [' '.join(term) for term in terms if term]
    return keywords

# Ref to
#from nltk.stem.wordnet import WordNetLemmatizer
Example #14
def fetch_all_organizations(resume_text):
    """Return the noun phrases of *resume_text* that contain an organization.

    Each sentence is POS-tagged once and analysed twice: a custom grammar
    collects noun phrases, and ``nltk.ne_chunk`` finds ``ORGANIZATION``
    entities.  A noun phrase is kept when it contains the first token of an
    organization entity and that token is not listed by
    ``utilities.get_avoid_organizations()``.

    :param resume_text: plain resume text
    :return: set of noun phrases, each passed through ``str.capitalize``
    """
    organizations = set()
    tokenized_sentences = nltk.sent_tokenize(resume_text)

    # Custom grammar with NLTK
    # NP - Noun Phrase
    # NN - Noun
    # NNP - Proper Noun
    # V - Verb
    # JJ - Adjective

    # In a sentence that contains NN NNP V NN NN JJ NN.
    # The noun-phrases fetched are:
    # NP: NN NNP
    # NP: NN NN
    # NP: NN

    # Ex, "Application Developer at Delta Force"
    # => ["Application Developer", "Delta Force"]
    grammar = r"""NP: {<NN|NNP>+}"""
    parser = nltk.RegexpParser(grammar)

    avoid_organizations = utilities.get_avoid_organizations()

    for sentence in tokenized_sentences:
        # tags all parts of speech in the tokenized sentences
        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

        # then chunks with the custom grammar;
        # np_chunks are instances of class nltk.tree.Tree
        np_chunks = parser.parse(tagged_words)

        # For every top-level 'NP' chunk, build a space separated string of
        # all leaves under it.
        noun_phrases = [
            ' '.join(word for word, _tag in np_chunk.leaves())
            for np_chunk in np_chunks
            if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP'
        ]

        # Using named entity chunker to get all the organizations
        chunks = nltk.ne_chunk(tagged_words)
        for chunk in chunks:
            if not (isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION'):
                continue

            # Only the first token of the entity is used for matching.
            (organization, tag) = chunk[0]

            # The exclusion check does not depend on the noun phrase, so do
            # it once per entity rather than once per noun phrase.
            if organization in avoid_organizations:
                continue

            # if organization is in the noun_phrase, it means that there is a
            # high chance of noun_phrase containing the employer name
            # eg, Delta Force is added to organizations even if only Delta is
            # recognized as an organization but Delta Force is a noun-phrase
            for noun_phrase in noun_phrases:
                if organization in noun_phrase:
                    # NOTE(review): str.capitalize() lower-cases everything
                    # after the first character ("Delta Force" is stored as
                    # "Delta force"); kept because callers may rely on it.
                    organizations.add(noun_phrase.capitalize())

    return organizations
Example #15
def keywords_syntax_nltk(sentence):
    """Return the keyword phrases found in *sentence* by NP chunking.

    The sentence is split into phrases with ``NLP_sent.phrase_splitting``.
    Phrases of two characters or fewer are ignored.  For every other phrase
    the terms are taken from the module-level ``text_terms`` cache when
    present; otherwise the lower-cased phrase is split into words,
    POS-tagged, chunked, and the result of ``get_terms`` is stored in the
    cache.

    :param sentence: input sentence
    :return: list of space-joined, non-empty terms
    """
    global text_terms

    grammar = r"""
        NBAR:
            # Nouns and Adjectives, terminated with Nouns
            {<NN.*|JJ>*<NN.*>}
        NP:
            {<NBAR>}
            # Above, connected with in/of/etc...
            {<NBAR><IN><NBAR>}
    """
    # The parser does not depend on the phrase, so build it at most once per
    # call (and not at all when every phrase is already cached).
    cp = None

    terms = []
    for phrase in NLP_sent.phrase_splitting(sentence):
        if len(phrase) <= 2:  # e.g. 'ii'
            continue
        if phrase in text_terms:
            phrase_terms = text_terms[phrase]
        else:
            # ------------------- POS tagging output
            words = NLP_word.word_splitting(phrase.lower())
            pos_tags = NLP_word.word_pos_tagging(words)
            # ------------------- parsed tree
            if cp is None:
                cp = nltk.RegexpParser(grammar, loop=2)
            cp_tree = cp.parse(pos_tags)
            phrase_terms = get_terms(cp_tree)
            text_terms[phrase] = phrase_terms
        terms += phrase_terms

    keywords = [' '.join(term) for term in terms if term]
    return keywords

# Ref to
#from nltk.stem.wordnet import WordNetLemmatizer