Python nltk.RegexpParser() Examples
The following are 15 code examples of nltk.RegexpParser(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk, or try the search function.
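Before the project snippets, here is a minimal, self-contained sketch of the usual nltk.RegexpParser() workflow: tokenize, POS-tag, then chunk with a small grammar. The sentence and the NP grammar are illustrative choices, not taken from any of the projects below.

import nltk

# One-time downloads for the tokenizer and tagger models
# (package names may differ slightly between NLTK versions).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

sentence = "The quick brown fox jumps over the lazy dog"
tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

# A small noun-phrase grammar: optional determiner, any adjectives, then a noun.
grammar = r"NP: {<DT>?<JJ>*<NN>}"
chunker = nltk.RegexpParser(grammar)
tree = chunker.parse(tagged)
print(tree)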
Example #1
Source File: math_expression_calculator.py From JARVIS with Apache License 2.0 | 8 votes |
def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    for subtree in chunkedData.subtrees(filter=lambda t: t.label() in "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
    print("text2num -> ", text)
    return text
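A hypothetical invocation of the helper above. It assumes the module-level nltk import and the text-to-number helper imported as t2n (the original JARVIS script calls t2n.text2num()); the input sentence is made up for illustration.

# Hypothetical usage; requires the original script's module-level `nltk` and `t2n` imports.
result = text_to_num("add twenty five and thirty seven")
print(result)  # spelled-out numbers should come back replaced with digits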
Example #2
Source File: Training.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 6 votes |
def test_baseline():
    cp = nltk.RegexpParser("")
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    # print(len(test_sents[0]))
    # print(test_sents[0])
    print(cp.evaluate(test_sents))
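Running this baseline needs the CoNLL-2000 chunking corpus and the corresponding import; a short setup sketch, assuming the same module-level imports as the original Training.py:

# Setup sketch: the snippet assumes `import nltk` and
# `from nltk.corpus import conll2000` at module level.
import nltk
from nltk.corpus import conll2000

nltk.download('conll2000')  # one-time corpus download
test_baseline()  # prints the ChunkParse score of the empty-grammar baseline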
Example #3
Source File: chunker.py From Parsivar with MIT License | 6 votes |
def __init__(self):
    self.grammar = r"""
        VP: {<ADJ_SIM><V_PRS>}
        VP: {<ADJ_INO><V.*>}
        VP: {<V_PRS><N_SING><V_SUB>}
        NP: {<N_SING><ADJ.*><N_SING>}
        NP: {<N.*><PRO>}
        VP: {<N_SING><V_.*>}
        VP: {<V.*>+}
        NP: {<ADJ.*>?<N.*>+ <ADJ.*>?}
        DNP: {<DET><NP>}
        PP: {<ADJ_CMPR><P>}
        PP: {<ADJ_SIM><P>}
        PP: {<P><N_SING>}
        PP: {<P>*}
        DDNP: {<NP><DNP>}
        NPP: {<PP><NP>+}
    """
    self.cp = nltk.RegexpParser(self.grammar)
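The grammar above targets Parsivar's Persian POS tagset (tags such as N_SING, ADJ_SIM, V_PRS). A rough sketch of the idea with a two-rule subset of that grammar; the (token, tag) pairs are hand-made for illustration, not Parsivar output.

# Illustrative only: a subset of the grammar above applied to made-up Persian tags.
import nltk

grammar = r"""
    NP: {<ADJ.*>?<N.*>+ <ADJ.*>?}
    VP: {<V.*>+}
"""
cp = nltk.RegexpParser(grammar)
tagged = [("کتاب", "N_SING"), ("خوب", "ADJ_SIM"), ("است", "V_PRS")]
print(cp.parse(tagged))  # groups the noun/adjective run as NP and the verb as VP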
Example #4
Source File: Training.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def myParser():
    grammar = '\n'.join([
        'NP: {<DT>*<NNP>}',
        'NP: {<JJ>*<NN>}',
        'NP: {<NNP>+}',
    ])
    return nltk.RegexpParser(grammar)
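A quick sketch of using the returned parser on a POS-tagged sentence; the example sentence is arbitrary.

import nltk

cp = myParser()
tagged = nltk.pos_tag(nltk.word_tokenize("John bought a new car"))
print(cp.parse(tagged))  # noun phrases are grouped under NP nodes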
Example #5
Source File: Training.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def test_regexp():
    grammar = r"NP: {<[CDJNP].*>+}"
    cp = nltk.RegexpParser(grammar)
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    print(cp.evaluate(test_sents))
Example #6
Source File: utils.py From ResumeParser with MIT License | 5 votes |
def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text
    :param resume_text: Plain resume text
    :return: list of experience
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # word tokenization
    word_tokens = nltk.word_tokenize(resume_text)

    # remove stop words and lemmatize
    filtered_sentence = [w for w in word_tokens
                         if w not in stop_words
                         and wordnet_lemmatizer.lemmatize(w) not in stop_words]
    sent = nltk.pos_tag(filtered_sentence)

    # parse regex
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)

    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)

    test = []
    for vp in list(cs.subtrees(filter=lambda x: x.label() == 'P')):
        test.append(" ".join([i[0] for i in vp.leaves()
                              if len(vp.leaves()) >= 2]))

    # Search the word 'experience' in the chunk and then print out the text after it
    x = [x[x.lower().index('experience') + 10:]
         for i, x in enumerate(test)
         if x and 'experience' in x.lower()]
    return x
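This helper relies on module-level imports from the original utils.py (nltk, WordNetLemmatizer, stopwords) plus the corresponding NLTK data. A hedged setup-and-call sketch; the resume snippet is invented.

# Sketch of the imports and downloads the function depends on (assumed here,
# mirroring what the original ResumeParser utils.py imports at module level).
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

for pkg in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(pkg)

resume_text = "Three years of experience as a Software Engineer at Acme Corp"
print(extract_experience(resume_text))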
Example #7
Source File: extract_sentences.py From StrepHit with GNU General Public License v3.0 | 5 votes |
def setup_extractor(self):
    self.splitter = PunktSentenceSplitter(self.language)

    grammar = self.grammars.get(self.language)
    if grammar:
        self.parser = RegexpParser(grammar)
    else:
        raise ValueError(
            "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % (
                self.language, self.grammars.keys())
        )

    for lemma, match_tokens in self.lemma_to_token.iteritems():
        self.lemma_to_token[lemma] = set([match.lower() for match in match_tokens])
Example #8
Source File: data_helpers.py From acl2017-interactive_summarizer with Apache License 2.0 | 5 votes |
def get_parse_info(parsestr, stemmer, language, stoplist):
    hash_token_pos = OrderedDict()
    if language == 'german':
        grammar = r"""
            NBAR: {<N.*|ADJ.*>*<N.*>}  # Nouns and Adjectives, terminated with Nouns
            VP: {<V.*>}                # terminated with Verbs
            NP: {<NBAR>}
                {<NBAR><APPR><NBAR>}   # Above, connected with in/of/etc...
        """
    if language == 'english':
        # Taken from Su Nam Kim Paper...
        grammar = r"""
            NBAR: {<NN.*|JJ>*<NN.*>}   # Nouns and Adjectives, terminated with Nouns
            VP: {<V.*>}                # terminated with Verbs
            NP: {<NBAR>}
                {<NBAR><IN><NBAR>}     # Above, connected with in/of/etc...
        """
    chunker = RegexpParser(grammar)

    postoks = []
    for i in Tree.fromstring(parsestr).subtrees():
        if i.height() == 2:
            word, pos = i[0], i.label()
            hash_token_pos[stemmer.stem(word)] = word + u"::" + pos
            postoks.append((word, pos))

    chunk_tree = chunker.parse(postoks)
    phrases = get_terms(chunk_tree, stemmer, stoplist)
    phrase_list = [' '.join(term) for term in phrases if term]
    return hash_token_pos, phrase_list
Example #9
Source File: do_benchmark.py From PyRATA with Apache License 2.0 | 5 votes |
def nltk_parse_clause(sentence):
    """
    Natural Language Toolkit: code_cascaded_chunker
    http://www.nltk.org/book/ch07.html#code-cascaded-chunker
    """
    grammar = r"""
        NP: {<DT|JJ|NN.*>+}           # Chunk sequences of DT, JJ, NN
        PP: {<IN><NP>}                # Chunk prepositions followed by NP
        VP: {<VB.*><NP|PP|CLAUSE>+$}  # Chunk verbs and their arguments
        CLAUSE: {<NP><VP>}            # Chunk NP, VP
    """
    cp = nltk.RegexpParser(grammar)
    # sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
    #             ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
    parsed_sentence = cp.parse(sentence)
    # print('parsed_sentence=', parsed_sentence)
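The commented-out sentence in the snippet shows the expected input format: a list of (word, POS-tag) pairs. A sketch of calling it with that same sentence (note the function only parses; the print is commented out in the original, so nothing is returned or shown):

sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
            ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
nltk_parse_clause(sentence)  # builds the NP/PP/VP/CLAUSE chunk tree internally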
Example #10
Source File: extractor.py From ai-research-keyphrase-extraction with Apache License 2.0 | 5 votes |
def extract_candidates(text_obj, no_subset=False):
    """
    Based on part of speech return a list of candidate phrases
    :param text_obj: Input text Representation see @InputTextObj
    :param no_subset: if true won't put a candidate which is the subset of an other candidate
    :param lang: language (currently en, fr and de are supported)
    :return: list of candidate phrases (string)
    """
    keyphrase_candidate = set()

    np_parser = nltk.RegexpParser(get_grammar(text_obj.lang))  # Noun phrase parser
    trees = np_parser.parse_sents(text_obj.pos_tagged)  # Generator with one tree per sentence

    for tree in trees:
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):  # For each nounphrase
            # Concatenate the token with a space
            keyphrase_candidate.add(' '.join(word for word, tag in subtree.leaves()))

    keyphrase_candidate = {kp for kp in keyphrase_candidate if len(kp.split()) <= 5}

    if no_subset:
        keyphrase_candidate = unique_ngram_candidates(keyphrase_candidate)
    else:
        keyphrase_candidate = list(keyphrase_candidate)

    return keyphrase_candidate
Example #11
Source File: phrases_extractor.py From word2vec-recommender with MIT License | 5 votes |
def generate_tree(text):
    text = text.replace('“', '"')  # to preserve quotes in text, primarily news content
    text = text.replace('”', '"')
    text = text.replace('’', "'")
    text = unidecode(text)
    chunker = nltk.RegexpParser(grammar)
    tokenized_text = nltk.tokenize.word_tokenize(text)
    postoks = nltk.tag.pos_tag(tokenized_text)
    tree = chunker.parse(postoks)
    return tree
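generate_tree() refers to a module-level grammar that is not included in this snippet, and it needs unidecode plus the usual NLTK tokenizer/tagger data. A hedged sketch with a placeholder NP grammar standing in for the project's real one:

# Stand-in setup: the real word2vec-recommender module defines its own `grammar`;
# this NP grammar is only a placeholder so the sketch can run.
import nltk
from unidecode import unidecode

grammar = r"NP: {<DT>?<JJ>*<NN.*>+}"
tree = generate_tree("The “smart” assistant summarizes today’s news.")
print(tree)  # curly quotes are normalized before tokenizing and chunking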
Example #12
Source File: utils.py From pyresparser with GNU General Public License v3.0 | 4 votes |
def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text
    :param resume_text: Plain resume text
    :return: list of experience
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # word tokenization
    word_tokens = nltk.word_tokenize(resume_text)

    # remove stop words and lemmatize
    filtered_sentence = [
        w for w in word_tokens
        if w not in stop_words
        and wordnet_lemmatizer.lemmatize(w) not in stop_words
    ]
    sent = nltk.pos_tag(filtered_sentence)

    # parse regex
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)

    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)

    test = []
    for vp in list(
        cs.subtrees(filter=lambda x: x.label() == 'P')
    ):
        test.append(" ".join([
            i[0] for i in vp.leaves()
            if len(vp.leaves()) >= 2
        ]))

    # Search the word 'experience' in the chunk and
    # then print out the text after it
    x = [
        x[x.lower().index('experience') + 10:]
        for i, x in enumerate(test)
        if x and 'experience' in x.lower()
    ]
    return x
Example #13
Source File: sentence_keywords.py From Valx with GNU General Public License v3.0 | 4 votes |
def keywords_syntax_nltk(sentence):
    global text_terms
    terms = []
    phrases = NLP_sent.phrase_splitting(sentence)
    for phrase in phrases:
        if len(phrase) <= 2:  # e.g. 'ii'
            continue
        if phrase in text_terms:
            phrase_terms = text_terms[phrase]
        else:
            # ------------------- POS tagging output
            words = NLP_word.word_splitting(phrase.lower())
            pos_tags = NLP_word.word_pos_tagging(words)
            # ------------------- parsed tree
            grammar = r"""
                NBAR:  # Nouns and Adjectives, terminated with Nouns
                    {<NN.*|JJ>*<NN.*>}
                NP:
                    {<NBAR>}  # Above, connected with in/of/etc...
                    {<NBAR><IN><NBAR>}
            """
            cp = nltk.RegexpParser(grammar, loop=2)
            cp_tree = cp.parse(pos_tags)
            phrase_terms = get_terms(cp_tree)
            text_terms[phrase] = phrase_terms
        terms += phrase_terms

    keywords = []
    for term in terms:
        if len(term) > 0:
            keywords.append(' '.join(term))
    return keywords

# Ref to https://gist.github.com/879414
# from nltk.stem.wordnet import WordNetLemmatizer
Example #14
Source File: language_parser.py From cvscan with MIT License | 4 votes |
def fetch_all_organizations(resume_text):
    organizations = set()
    tokenized_sentences = nltk.sent_tokenize(resume_text)

    # Custom grammar with NLTK
    #   NP  - Noun Phrase
    #   NN  - Noun
    #   NNP - Proper Noun
    #   V   - Verb
    #   JJ  - Adjective
    #
    # In a sentence that contains NN NNP V NN NN JJ NN,
    # the noun-phrases fetched are:
    #   NP: NN NNP
    #   NP: NN NN
    #   NP: NN
    #
    # Ex, "Application Developer at Delta Force"
    # => ["Application Developer", "Delta Force"]
    grammar = r"""NP: {<NN|NNP>+}"""
    parser = nltk.RegexpParser(grammar)

    avoid_organizations = utilities.get_avoid_organizations()

    for sentence in tokenized_sentences:

        # tags all parts of speech in the tokenized sentences
        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

        # then chunks with the customized grammar
        # np_chunks are instances of class nltk.tree.Tree
        np_chunks = parser.parse(tagged_words)
        noun_phrases = []

        for np_chunk in np_chunks:
            if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP':
                # if np_chunk is of grammar 'NP' then create a space-separated
                # string of all leaves under the 'NP' tree
                noun_phrase = ""
                for (org, tag) in np_chunk.leaves():
                    noun_phrase += org + ' '
                noun_phrases.append(noun_phrase.rstrip())

        # Using the named entity chunker to get all the organizations
        chunks = nltk.ne_chunk(tagged_words)
        for chunk in chunks:
            if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION':
                (organization, tag) = chunk[0]

                # if organization is in the noun_phrase, there is a high chance the
                # noun_phrase contains the employer name; e.g. "Delta Force" is added
                # to organizations even if only "Delta" is recognized as an
                # organization, because "Delta Force" is a noun-phrase
                for noun_phrase in noun_phrases:
                    if organization in noun_phrase and organization not in avoid_organizations:
                        organizations.add(noun_phrase.capitalize())

    return organizations
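fetch_all_organizations() combines the custom NP grammar with NLTK's named-entity chunker, so it needs the ne_chunk models and the cvscan project's utilities module. A hedged call sketch with an invented resume line:

# Sketch only: `utilities.get_avoid_organizations()` comes from the cvscan project
# and is assumed to be importable; the downloads below are the usual prerequisites
# for pos_tag and ne_chunk (names may differ in newer NLTK releases).
import nltk

for pkg in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(pkg)

resume_text = "Worked as an Application Developer at Delta Force."
print(fetch_all_organizations(resume_text))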
Example #15
Source File: sentence_keywords.py From DeepEHR with MIT License | 4 votes |
def keywords_syntax_nltk(sentence):
    global text_terms
    terms = []
    phrases = NLP_sent.phrase_splitting(sentence)
    for phrase in phrases:
        if len(phrase) <= 2:  # e.g. 'ii'
            continue
        if phrase in text_terms:
            phrase_terms = text_terms[phrase]
        else:
            # ------------------- POS tagging output
            words = NLP_word.word_splitting(phrase.lower())
            pos_tags = NLP_word.word_pos_tagging(words)
            # ------------------- parsed tree
            grammar = r"""
                NBAR:  # Nouns and Adjectives, terminated with Nouns
                    {<NN.*|JJ>*<NN.*>}
                NP:
                    {<NBAR>}  # Above, connected with in/of/etc...
                    {<NBAR><IN><NBAR>}
            """
            cp = nltk.RegexpParser(grammar, loop=2)
            cp_tree = cp.parse(pos_tags)
            phrase_terms = get_terms(cp_tree)
            text_terms[phrase] = phrase_terms
        terms += phrase_terms

    keywords = []
    for term in terms:
        if len(term) > 0:
            keywords.append(' '.join(term))
    return keywords

# Ref to https://gist.github.com/879414
# from nltk.stem.wordnet import WordNetLemmatizer