Python nltk.RegexpParser() Examples
The following are 15 code examples of nltk.RegexpParser(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk, or try the search function.
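Before the project snippets, here is a minimal, self-contained sketch of the usual nltk.RegexpParser() workflow: tokenize, POS-tag, then chunk with a small grammar. The sentence and the NP grammar are illustrative choices, not taken from any of the projects below.

import nltk

# One-time downloads for the tokenizer and tagger models
# (package names may differ slightly between NLTK versions).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

sentence = "The quick brown fox jumps over the lazy dog"
tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

# A small noun-phrase grammar: optional determiner, any adjectives, then a noun.
grammar = r"NP: {<DT>?<JJ>*<NN>}"
chunker = nltk.RegexpParser(grammar)
tree = chunker.parse(tagged)
print(tree)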
Example #1
Source File: math_expression_calculator.py From JARVIS with Apache License 2.0 | 8 votes |
def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    for subtree in chunkedData.subtrees(filter=lambda t: t.label() in "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
    print("text2num -> ", text)
    return text
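A hypothetical invocation of the helper above. It assumes the module-level nltk import and the text-to-number helper imported as t2n (the original JARVIS script calls t2n.text2num()); the input sentence is made up for illustration.

# Hypothetical usage; requires the original script's module-level `nltk` and `t2n` imports.
result = text_to_num("add twenty five and thirty seven")
print(result)  # spelled-out numbers should come back replaced with digits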
Example #2
Source File: Training.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 6 votes |
def test_baseline():
    cp = nltk.RegexpParser("")
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    # print(len(test_sents[0]))
    # print(test_sents[0])
    print(cp.evaluate(test_sents))
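Running this baseline needs the CoNLL-2000 chunking corpus and the corresponding import; a short setup sketch, assuming the same module-level imports as the original Training.py:

# Setup sketch: the snippet assumes `import nltk` and
# `from nltk.corpus import conll2000` at module level.
import nltk
from nltk.corpus import conll2000

nltk.download('conll2000')  # one-time corpus download
test_baseline()  # prints the ChunkParse score of the empty-grammar baseline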
Example #3
Source File: chunker.py From Parsivar with MIT License | 6 votes |
def __init__(self):
    self.grammar = r"""
        VP: {<ADJ_SIM><V_PRS>}
        VP: {<ADJ_INO><V.*>}
        VP: {<V_PRS><N_SING><V_SUB>}
        NP: {<N_SING><ADJ.*><N_SING>}
        NP: {<N.*><PRO>}
        VP: {<N_SING><V_.*>}
        VP: {<V.*>+}
        NP: {<ADJ.*>?<N.*>+ <ADJ.*>?}
        DNP: {<DET><NP>}
        PP: {<ADJ_CMPR><P>}
        PP: {<ADJ_SIM><P>}
        PP: {<P><N_SING>}
        PP: {<P>*}
        DDNP: {<NP><DNP>}
        NPP: {<PP><NP>+}
    """
    self.cp = nltk.RegexpParser(self.grammar)
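The grammar above targets Parsivar's Persian POS tagset (tags such as N_SING, ADJ_SIM, V_PRS). A rough sketch of the idea with a two-rule subset of that grammar; the (token, tag) pairs are hand-made for illustration, not Parsivar output.

# Illustrative only: a subset of the grammar above applied to made-up Persian tags.
import nltk

grammar = r"""
    NP: {<ADJ.*>?<N.*>+ <ADJ.*>?}
    VP: {<V.*>+}
"""
cp = nltk.RegexpParser(grammar)
tagged = [("کتاب", "N_SING"), ("خوب", "ADJ_SIM"), ("است", "V_PRS")]
print(cp.parse(tagged))  # groups the noun/adjective run as NP and the verb as VP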
Example #4
Source File: Training.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def myParser():
    grammar = '\n'.join([
        'NP: {<DT>*<NNP>}',
        'NP: {<JJ>*<NN>}',
        'NP: {<NNP>+}',
    ])
    return nltk.RegexpParser(grammar)
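A quick sketch of using the returned parser on a POS-tagged sentence; the example sentence is arbitrary.

import nltk

cp = myParser()
tagged = nltk.pos_tag(nltk.word_tokenize("John bought a new car"))
print(cp.parse(tagged))  # noun phrases are grouped under NP nodes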
Example #5
Source File: Training.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def test_regexp():
    grammar = r"NP: {<[CDJNP].*>+}"
    cp = nltk.RegexpParser(grammar)
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    print(cp.evaluate(test_sents))
Example #6
Source File: utils.py From ResumeParser with MIT License | 5 votes |
def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text
    :param resume_text: Plain resume text
    :return: list of experience
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # word tokenization
    word_tokens = nltk.word_tokenize(resume_text)

    # remove stop words and lemmatize
    filtered_sentence = [w for w in word_tokens
                         if w not in stop_words
                         and wordnet_lemmatizer.lemmatize(w) not in stop_words]
    sent = nltk.pos_tag(filtered_sentence)

    # parse regex
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)

    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)

    test = []
    for vp in list(cs.subtrees(filter=lambda x: x.label() == 'P')):
        test.append(" ".join([i[0] for i in vp.leaves()
                              if len(vp.leaves()) >= 2]))

    # Search the word 'experience' in the chunk and then print out the text after it
    x = [x[x.lower().index('experience') + 10:]
         for i, x in enumerate(test)
         if x and 'experience' in x.lower()]
    return x
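This helper relies on module-level imports from the original utils.py (nltk, WordNetLemmatizer, stopwords) plus the corresponding NLTK data. A hedged setup-and-call sketch; the resume snippet is invented.

# Sketch of the imports and downloads the function depends on (assumed here,
# mirroring what the original ResumeParser utils.py imports at module level).
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

for pkg in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(pkg)

resume_text = "Three years of experience as a Software Engineer at Acme Corp"
print(extract_experience(resume_text))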
Example #7
Source File: extract_sentences.py From StrepHit with GNU General Public License v3.0 | 5 votes |
def setup_extractor(self):
    self.splitter = PunktSentenceSplitter(self.language)

    grammar = self.grammars.get(self.language)
    if grammar:
        self.parser = RegexpParser(grammar)
    else:
        raise ValueError(
            "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % (
                self.language, self.grammars.keys())
        )

    for lemma, match_tokens in self.lemma_to_token.iteritems():
        self.lemma_to_token[lemma] = set([match.lower() for match in match_tokens])
Example #8
Source File: data_helpers.py From acl2017-interactive_summarizer with Apache License 2.0 | 5 votes |
def get_parse_info(parsestr, stemmer, language, stoplist):
    hash_token_pos = OrderedDict()
    if language == 'german':
        grammar = r"""
            NBAR: {<N.*|ADJ.*>*<N.*>}  # Nouns and Adjectives, terminated with Nouns
            VP: {<V.*>}                # terminated with Verbs
            NP: {<NBAR>}
                {<NBAR><APPR><NBAR>}   # Above, connected with in/of/etc...
        """
    if language == 'english':
        # Taken from Su Nam Kim Paper...
        grammar = r"""
            NBAR: {<NN.*|JJ>*<NN.*>}   # Nouns and Adjectives, terminated with Nouns
            VP: {<V.*>}                # terminated with Verbs
            NP: {<NBAR>}
                {<NBAR><IN><NBAR>}     # Above, connected with in/of/etc...
        """
    chunker = RegexpParser(grammar)

    postoks = []
    for i in Tree.fromstring(parsestr).subtrees():
        if i.height() == 2:
            word, pos = i[0], i.label()
            hash_token_pos[stemmer.stem(word)] = word + u"::" + pos
            postoks.append((word, pos))

    chunk_tree = chunker.parse(postoks)
    phrases = get_terms(chunk_tree, stemmer, stoplist)
    phrase_list = [' '.join(term) for term in phrases if term]
    return hash_token_pos, phrase_list
Example #9
Source File: do_benchmark.py From PyRATA with Apache License 2.0 | 5 votes |
def nltk_parse_clause(sentence):
    """
    Natural Language Toolkit: code_cascaded_chunker
    http://www.nltk.org/book/ch07.html#code-cascaded-chunker
    """
    grammar = r"""
        NP: {<DT|JJ|NN.*>+}           # Chunk sequences of DT, JJ, NN
        PP: {<IN><NP>}                # Chunk prepositions followed by NP
        VP: {<VB.*><NP|PP|CLAUSE>+$}  # Chunk verbs and their arguments
        CLAUSE: {<NP><VP>}            # Chunk NP, VP
    """
    cp = nltk.RegexpParser(grammar)
    # sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
    #             ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
    parsed_sentence = cp.parse(sentence)
    # print('parsed_sentence=', parsed_sentence)
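The commented-out sentence in the snippet shows the expected input format: a list of (word, POS-tag) pairs. A sketch of calling it with that same sentence (note the function only parses; the print is commented out in the original, so nothing is returned or shown):

sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
            ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
nltk_parse_clause(sentence)  # builds the NP/PP/VP/CLAUSE chunk tree internally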
Example #10
Source File: extractor.py From ai-research-keyphrase-extraction with Apache License 2.0 | 5 votes |
def extract_candidates(text_obj, no_subset=False):
    """
    Based on part of speech return a list of candidate phrases
    :param text_obj: Input text Representation see @InputTextObj
    :param no_subset: if true won't put a candidate which is the subset of an other candidate
    :param lang: language (currently en, fr and de are supported)
    :return: list of candidate phrases (string)
    """
    keyphrase_candidate = set()

    np_parser = nltk.RegexpParser(get_grammar(text_obj.lang))  # Noun phrase parser
    trees = np_parser.parse_sents(text_obj.pos_tagged)  # Generator with one tree per sentence

    for tree in trees:
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):  # For each nounphrase
            # Concatenate the token with a space
            keyphrase_candidate.add(' '.join(word for word, tag in subtree.leaves()))

    keyphrase_candidate = {kp for kp in keyphrase_candidate if len(kp.split()) <= 5}

    if no_subset:
        keyphrase_candidate = unique_ngram_candidates(keyphrase_candidate)
    else:
        keyphrase_candidate = list(keyphrase_candidate)

    return keyphrase_candidate
Example #11
Source File: phrases_extractor.py From word2vec-recommender with MIT License | 5 votes |
def generate_tree(text):
    text = text.replace('“', '"')  # to preserve quotes in text, primarily news content
    text = text.replace('”', '"')
    text = text.replace('’', "'")
    text = unidecode(text)
    chunker = nltk.RegexpParser(grammar)
    tokenized_text = nltk.tokenize.word_tokenize(text)
    postoks = nltk.tag.pos_tag(tokenized_text)
    tree = chunker.parse(postoks)
    return tree
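generate_tree() refers to a module-level grammar that is not included in this snippet, and it needs unidecode plus the usual NLTK tokenizer/tagger data. A hedged sketch with a placeholder NP grammar standing in for the project's real one:

# Stand-in setup: the real word2vec-recommender module defines its own `grammar`;
# this NP grammar is only a placeholder so the sketch can run.
import nltk
from unidecode import unidecode

grammar = r"NP: {<DT>?<JJ>*<NN.*>+}"
tree = generate_tree("The “smart” assistant summarizes today’s news.")
print(tree)  # curly quotes are normalized before tokenizing and chunking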
Example #12
Source File: utils.py From pyresparser with GNU General Public License v3.0 | 4 votes |
def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text
    :param resume_text: Plain resume text
    :return: list of experience
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # word tokenization
    word_tokens = nltk.word_tokenize(resume_text)

    # remove stop words and lemmatize
    filtered_sentence = [
        w for w in word_tokens
        if w not in stop_words
        and wordnet_lemmatizer.lemmatize(w) not in stop_words
    ]
    sent = nltk.pos_tag(filtered_sentence)

    # parse regex
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)

    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)

    test = []
    for vp in list(
        cs.subtrees(filter=lambda x: x.label() == 'P')
    ):
        test.append(" ".join([
            i[0] for i in vp.leaves()
            if len(vp.leaves()) >= 2
        ]))

    # Search the word 'experience' in the chunk and
    # then print out the text after it
    x = [
        x[x.lower().index('experience') + 10:]
        for i, x in enumerate(test)
        if x and 'experience' in x.lower()
    ]
    return x
Example #13
Source File: sentence_keywords.py From Valx with GNU General Public License v3.0 | 4 votes |
def keywords_syntax_nltk(sentence):
    global text_terms
    terms = []
    phrases = NLP_sent.phrase_splitting(sentence)
    for phrase in phrases:
        if len(phrase) <= 2:  # e.g. 'ii'
            continue
        if phrase in text_terms:
            phrase_terms = text_terms[phrase]
        else:
            # ------------------- POS tagging output
            words = NLP_word.word_splitting(phrase.lower())
            pos_tags = NLP_word.word_pos_tagging(words)
            # ------------------- parsed tree
            grammar = r"""
                NBAR:  # Nouns and Adjectives, terminated with Nouns
                    {<NN.*|JJ>*<NN.*>}
                NP:
                    {<NBAR>}  # Above, connected with in/of/etc...
                    {<NBAR><IN><NBAR>}
            """
            cp = nltk.RegexpParser(grammar, loop=2)
            cp_tree = cp.parse(pos_tags)
            phrase_terms = get_terms(cp_tree)
            text_terms[phrase] = phrase_terms
        terms += phrase_terms

    keywords = []
    for term in terms:
        if len(term) > 0:
            keywords.append(' '.join(term))
    return keywords

# Ref to https://gist.github.com/879414
# from nltk.stem.wordnet import WordNetLemmatizer
Example #14
Source File: language_parser.py From cvscan with MIT License | 4 votes |
def fetch_all_organizations(resume_text):
    organizations = set()
    tokenized_sentences = nltk.sent_tokenize(resume_text)

    # Custom grammar with NLTK
    #   NP  - Noun Phrase
    #   NN  - Noun
    #   NNP - Proper Noun
    #   V   - Verb
    #   JJ  - Adjective
    #
    # In a sentence that contains NN NNP V NN NN JJ NN,
    # the noun-phrases fetched are:
    #   NP: NN NNP
    #   NP: NN NN
    #   NP: NN
    #
    # Ex, "Application Developer at Delta Force"
    # => ["Application Developer", "Delta Force"]
    grammar = r"""NP: {<NN|NNP>+}"""
    parser = nltk.RegexpParser(grammar)

    avoid_organizations = utilities.get_avoid_organizations()

    for sentence in tokenized_sentences:

        # tags all parts of speech in the tokenized sentences
        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

        # then chunks with the customized grammar
        # np_chunks are instances of class nltk.tree.Tree
        np_chunks = parser.parse(tagged_words)
        noun_phrases = []

        for np_chunk in np_chunks:
            if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP':
                # if np_chunk is of grammar 'NP' then create a space-separated
                # string of all leaves under the 'NP' tree
                noun_phrase = ""
                for (org, tag) in np_chunk.leaves():
                    noun_phrase += org + ' '
                noun_phrases.append(noun_phrase.rstrip())

        # Using the named entity chunker to get all the organizations
        chunks = nltk.ne_chunk(tagged_words)
        for chunk in chunks:
            if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION':
                (organization, tag) = chunk[0]

                # if organization is in the noun_phrase, there is a high chance the
                # noun_phrase contains the employer name; e.g. "Delta Force" is added
                # to organizations even if only "Delta" is recognized as an
                # organization, because "Delta Force" is a noun-phrase
                for noun_phrase in noun_phrases:
                    if organization in noun_phrase and organization not in avoid_organizations:
                        organizations.add(noun_phrase.capitalize())

    return organizations
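fetch_all_organizations() combines the custom NP grammar with NLTK's named-entity chunker, so it needs the ne_chunk models and the cvscan project's utilities module. A hedged call sketch with an invented resume line:

# Sketch only: `utilities.get_avoid_organizations()` comes from the cvscan project
# and is assumed to be importable; the downloads below are the usual prerequisites
# for pos_tag and ne_chunk (names may differ in newer NLTK releases).
import nltk

for pkg in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(pkg)

resume_text = "Worked as an Application Developer at Delta Force."
print(fetch_all_organizations(resume_text))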
Example #15
Source File: sentence_keywords.py From DeepEHR with MIT License | 4 votes |
def keywords_syntax_nltk(sentence):
    global text_terms
    terms = []
    phrases = NLP_sent.phrase_splitting(sentence)
    for phrase in phrases:
        if len(phrase) <= 2:  # e.g. 'ii'
            continue
        if phrase in text_terms:
            phrase_terms = text_terms[phrase]
        else:
            # ------------------- POS tagging output
            words = NLP_word.word_splitting(phrase.lower())
            pos_tags = NLP_word.word_pos_tagging(words)
            # ------------------- parsed tree
            grammar = r"""
                NBAR:  # Nouns and Adjectives, terminated with Nouns
                    {<NN.*|JJ>*<NN.*>}
                NP:
                    {<NBAR>}  # Above, connected with in/of/etc...
                    {<NBAR><IN><NBAR>}
            """
            cp = nltk.RegexpParser(grammar, loop=2)
            cp_tree = cp.parse(pos_tags)
            phrase_terms = get_terms(cp_tree)
            text_terms[phrase] = phrase_terms
        terms += phrase_terms

    keywords = []
    for term in terms:
        if len(term) > 0:
            keywords.append(' '.join(term))
    return keywords

# Ref to https://gist.github.com/879414
# from nltk.stem.wordnet import WordNetLemmatizer