Python spacy.tokens.Token() Examples

The following are 29 code examples of spacy.tokens.Token(), collected from open-source projects; the original project and source file are listed with each example. You may also want to check out all available functions and classes of the module spacy.tokens.
Example #1
Source File: helpers.py    From errudite with GNU General Public License v2.0
def get_token_feature(t: Token, label: str) -> str:
    """Get the linguistic feature given a Spacy.Token obj and a label
    
    Arguments:
        t {Token} -- input token
        label {str} -- linguistic feature to return 
    
    Returns:
        str -- linguistic feature
    """

    if label in ['text', 'orth']:
        return t.text
    if label.lower() == 'ent':
        label = 'ent_type'
    return getattr(t, '{}_'.format(label.lower()), '') 
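A minimal usage sketch of this helper; the model name "en_core_web_sm" is an assumption and not part of the original snippet:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("Berlin is a city in Germany.")
token = doc[0]  # "Berlin"

print(get_token_feature(token, "text"))   # 'Berlin'
print(get_token_feature(token, "lemma"))  # 'Berlin'
print(get_token_feature(token, "ent"))    # 'GPE' -- 'ent' is mapped to ent_type_ above

Because the final getattr falls back to '', an unrecognized label returns an empty string rather than raising.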
Example #2
Source File: spacy_annotator.py    From errudite with GNU General Public License v2.0
def remove_stopwords(self, sentence_str: str=None, tokens: List[Token]=None, use_lemma: bool=True) -> str:
        """Function which gets a normalized string of the sentence and removes stop words
        
        Keyword Arguments:
            sentence_str {str} -- input sentence string (default: {None})
            tokens {List[Token]} -- pre-computed token list, with feature added (default: {None})
            use_lemma {bool} -- return the lemma or the text (default: {True})
        
        Returns:
            str -- the str with stopwords removed
        """
        if not tokens and sentence_str:
            #sentence_str = normalize_answer(sentence_str)
            tokens = self.model(sentence_str)
        elif not tokens:
            tokens = []
        #word_tokenize(sentence_str)
        attr = 'lemma_' if use_lemma else 'text' # what to merge
        return ' '.join([ getattr(token, attr) for token in tokens
            if not token.is_punct and token.text not in STOP_WORDS and token.lemma_ not in STOP_WORDS]) 
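The core of that filter, pulled out as a standalone sketch; it assumes the project's STOP_WORDS is (or behaves like) spaCy's English stop-word set, and the model name is again an assumption:

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("The cats are sitting on the mat.")
kept = [t.lemma_ for t in doc
        if not t.is_punct and t.text not in STOP_WORDS and t.lemma_ not in STOP_WORDS]
print(" ".join(kept))  # e.g. "cat sit mat"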
Example #3
Source File: nlp.py    From armchair-expert with MIT License
def from_token(token: Token, people: list = None) -> Optional['Pos']:
        if token.text[0] == '#':
            return Pos.HASHTAG
        elif token.text[0] == '@':
            return Pos.PROPN
        elif token.text[0] == ' ' or token.text[0] == "\n":
            return Pos.SPACE

        if token._.is_emoji:
            return Pos.EMOJI

        # Make up for shortcomings of NLP in detecting online nicknames
        if people is not None:
            if token.text in people:
                return Pos.PROPN

        if re.match(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', token.text):
            return Pos.URL

        try:
            return Pos[token.pos_]
        except KeyError:
            print("Unknown PoS: %s" % token.text)
            return Pos.X 
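The token._.is_emoji check reads a custom extension attribute that armchair-expert registers elsewhere in its pipeline; a hypothetical, minimal registration that would make the check above work looks like this (the attribute set here is illustrative only):

from spacy.tokens import Token

EMOJI_CHARS = {"😀", "🎉", "❤️"}  # illustrative set; the real project detects emoji differently
if not Token.has_extension("is_emoji"):
    Token.set_extension("is_emoji", getter=lambda t: t.text in EMOJI_CHARS)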
Example #4
Source File: extract_triples.py    From ravestate with BSD 3-Clause "New" or "Revised" License
def triple_search(triple: Triple, token: Token):
    """
    Recursively search through the dependency tree:
    look for triple values in each of the children and call this function on the child nodes.
    """
    question_word = None
    for word in token.children:
        if word.text.lower() in QuestionWord.question_words:
            question_word = QuestionWord(word)
            word = QuestionWord(word)
            if not triple.get_object():
                triple.set_object(question_word)
        elif word.dep_ in OBJECT_SET:
            triple.set_object(word)
        if word.dep_ in SUBJECT_SET:
            triple.set_subject(word)
        if isinstance(word, Token) and word.dep_ not in RECURSION_BLACKLIST:
            triple = triple_search(triple, word)
    if not triple.get_subject() and question_word:
        triple.set_subject(question_word)
    return triple 
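Triple, QuestionWord, and the OBJECT_SET / SUBJECT_SET / RECURSION_BLACKLIST constants are defined elsewhere in ravestate. As a rough, standalone illustration of the same kind of dependency-tree walk (model name is an assumption):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("Alice ate the apple.")
root = next(t for t in doc if t.dep_ == "ROOT")
subjects = [c.text for c in root.children if c.dep_ in {"nsubj", "nsubjpass"}]
objects = [c.text for c in root.children if c.dep_ in {"dobj", "obj", "attr"}]
print(root.text, subjects, objects)  # e.g. ate ['Alice'] ['apple']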
Example #5
Source File: helpers.py    From errudite with GNU General Public License v2.0
def change_matched_token_form(a_token: Token,
    a_pattern: Dict[str, str],
    b_pattern: Dict[str, str]) -> str:
    # first, deal with orth.
    if get_str_from_pattern(b_pattern):
        return get_str_from_pattern(b_pattern)
    elif 'TAG' in b_pattern and 'TAG' in a_pattern:  # deal with the tags
        # singular -> plural
        if a_pattern['TAG'] in ['NN', 'NNP'] and b_pattern['TAG'] in ['NNS', 'NNPS']:
            return pluralize(a_token.text)
        # plural -> singular
        elif b_pattern['TAG'] in ['NN', 'NNP'] and a_pattern['TAG'] in ['NNS', 'NNPS']:
            return singularize(a_token.text)
        # verb form change
        elif a_pattern['TAG'] in VBs and b_pattern['TAG'] in VBs:
            return conjugate(a_token.text, tag=b_pattern['TAG'])
    elif 'POS' in b_pattern and 'POS' in a_pattern:
        # if IS_DEBUGGING == 'change_matched_token_form':
    #    print ('unmatched token form change', a_token, b_token, a_pattern, b_pattern)
        return a_token.text
    return a_token.text 
Example #6
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_location(token: Token) -> bool:
    return token.ent_type_ in ['LOC', 'GPE'] 
Example #7
Source File: semantic_rule_detector.py    From errudite with GNU General Public License v2.0
def _gen_token_pattern(self, token: Token, label: str, use_orth: bool=False, match_op: str=None) -> Dict[str, str]:
        """generate the matcher token 
        
        Arguments:
            token {Token} -- A token
            label {str} -- provided label (orth, lemma, dep, pos, tag, etc.)
        
        Keyword Arguments:
            use_orth {bool} -- just use the orth (default: {False})
            match_op {str} -- repeat match. Can be (+, *, ?, !) (default: {None})
                
        Returns:
            Dict[str, str] -- Generated pattern that can be fed to the spaCy Matcher
        """
        if use_orth:
            label = 'lower'  # 'orth'
        if not label and match_op:  # dummy match that allows a blank to be filled in
            return {'OP': match_op, DUMMY_FLAG: True}
        if not token or not label: # just want to match this token, but no label
            return None
        pattern = {}
        for label in self._extend_label([token], label):
            feature = get_token_feature(token, label)
            if label.lower() == "tag" and feature not in VBs + WHs + NNs:
                return None
            pattern[label.upper()] = get_token_feature(token, label)
        if match_op: # match 0-N times of this token
            pattern['OP'] = match_op
        return pattern 
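The dictionaries built here (keys such as LOWER, TAG, and OP) are standard spaCy Matcher token patterns; a short sketch of how such a pattern is consumed, using the spaCy v3 Matcher.add signature (model name is an assumption):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "how"}, {"TAG": "JJ", "OP": "?"}, {"TAG": "VBZ"}]
matcher.add("RULE", [pattern])  # spaCy v3; v2 used matcher.add("RULE", None, pattern)
doc = nlp("How long is the river?")
print([doc[start:end].text for _, start, end in matcher(doc)])  # e.g. ['How long is']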
Example #8
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_year(token: Token) -> bool:
    '''
    Return True if the token looks like a year (e.g. '1990' or '1990s').
    '''
    try:
        year_str = token.text
        if year_str.endswith('s'):
            year_str = year_str[:-1]
        year = int(year_str)
        if year >= 1000 and year <= 2100:
            return True
    except ValueError:
        pass
    return False 
Example #9
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_month(token: Token) -> bool:
    MONTHS = set(['january', 'february', 'march', 'april', \
        'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'])
    return token.lemma_ in MONTHS 
Example #10
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_date(token: Token) -> bool:
    return is_year(token) or is_month(token) or token.ent_type_ in ['DATE', 'TIME'] 
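A quick check of the date helpers together (is_date builds on is_year and is_month above; the model name is an assumption):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("The war ended in August 1945.")
print([(t.text, is_date(t)) for t in doc])
# e.g. [('The', False), ('war', False), ('ended', False), ('in', False),
#       ('August', True), ('1945', True), ('.', False)]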
Example #11
Source File: text_field.py    From stog with MIT License
def __init__(self, tokens: List[Token], token_indexers: Dict[str, TokenIndexer]) -> None:
        self.tokens = tokens
        self._token_indexers = token_indexers
        self._indexed_tokens: Optional[Dict[str, TokenList]] = None
        self._indexer_name_to_indexed_token: Optional[Dict[str, List[str]]] = None

        if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
            raise ConfigurationError("TextFields must be passed Tokens. "
                                     "Found: {} with types {}.".format(tokens, [type(x) for x in tokens])) 
Example #12
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_numeric(token: Token) -> bool:
    return token.tag_ == 'CD' #or token.ent_type_ in ['PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'] 
Example #13
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_prop_noun(token: Token) -> bool:
    return token.tag_ in NNP_POS or token.ent_type_ in NNP_NERS 
Example #14
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_entity(token: Token) -> bool:
    return token.ent_type_ != ''  # ent_type_ is an empty string, not None, for tokens outside any entity
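A small sanity check for is_numeric and the corrected is_entity (the model name is an assumption):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("Three senators visited Berlin.")
print([(t.text, is_numeric(t), is_entity(t)) for t in doc])
# e.g. [('Three', True, True), ('senators', False, False), ('visited', False, False),
#       ('Berlin', False, True), ('.', False, False)]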
Example #15
Source File: discrete_feature_extractor.py    From medaCy with GNU General Public License v3.0
def _token_to_feature_dict(self, index, sentence):
        """
        Extracts features of a given token

        :param index: the index of the token in the sequence
        :param sentence: an array of tokens corresponding to a sequence
        :return: a dictionary with a feature representation of the spaCy token object.
        """

        # This should automatically gather features that are set on tokens
        # by looping over all attributes set on sentence[index] that begin with 'feature'

        features = {
            'bias': 1.0
        }

        for i in range(-self.window_size, self.window_size+1):
            # loop through our window, ignoring tokens that aren't there
            if not 0 <= index + i < len(sentence):
                continue

            token = sentence[index+i]

            # adds features from medacy pipeline
            current = {f'{i}:{feature}': token._.get(feature) for feature in self.all_custom_features}

            # adds features that are overlaid from spaCy token attributes
            for feature in self.spacy_features:
                if isinstance(getattr(token, feature), Token):
                    current[f'{i}:{feature}'] = getattr(token, feature).text
                else:
                    current[f'{i}:{feature}'] = getattr(token, feature)

            features.update(current)

        return features 
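The token._.get(feature) calls read custom extension attributes that medaCy's pipeline components register on each token; a hypothetical example of one such attribute, just to show the mechanism (the attribute name here is made up):

from spacy.tokens import Token

if not Token.has_extension("feature_is_capitalized"):
    Token.set_extension("feature_is_capitalized",
                        getter=lambda t: t.text[:1].isupper())
# afterwards token._.get("feature_is_capitalized") returns True or False for any token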
Example #16
Source File: pos_drop_feature_extractor.py    From medaCy with GNU General Public License v3.0
def _token_to_feature_dict(self, index, sentence):
        """
        Extracts features of a given token

        :param index: the index of the token in the sequence
        :param sentence: an array of tokens corresponding to a sequence
        :return: a dictionary with a feature representation of the spaCy token object.
        """

        # This should automatically gather features that are set on tokens
        # by looping over all attributes set on sentence[index] that begin with 'feature'

        target = sentence[index]
        sentence = [token for token in sentence if token.pos_ not in self.ignored_pos or token is target]

        features = {
            'bias': 1.0
        }

        for i in range(-self.window_size, self.window_size+1):
            # loop through our window, ignoring tokens that aren't there
            if not 0 <= index + i < len(sentence):
                continue

            token = sentence[index+i]

            # adds features from medacy pipeline
            current = {'%i:%s' % (i, feature): token._.get(feature) for feature in self.all_custom_features}

            # adds features that are overlaid from spaCy token attributes
            for feature in self.spacy_features:
                if isinstance(getattr(token, feature), Token):
                    current['%i:%s' % (i, feature)] = getattr(token, feature).text
                else:
                    current['%i:%s' % (i, feature)] = getattr(token, feature)

            features.update(current)

        return features 
Example #17
Source File: spacy_utlis.py    From profanity-filter with GNU General Public License v3.0
def parse(nlp: spacy.language.Language,
          text: str, language: Language = None,
          use_profanity_filter: bool = False) -> Union[Doc, Token]:
    disable = [] if use_profanity_filter else [SpacyProfanityFilterComponent.name]
    component_cfg = {}
    if use_profanity_filter:
        component_cfg[SpacyProfanityFilterComponent.name] = {
            'language': language,
        }
    return nlp(text, disable=disable, component_cfg=component_cfg) 
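SpacyProfanityFilterComponent is the project's own pipeline component, but the disable and component_cfg keywords are standard arguments of spaCy's Language.__call__; for instance (model name is an assumption):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("A sentence to parse.", disable=["ner"])  # skip named-entity recognition for this call
print(doc.ents)  # () -- empty, since ner was disabled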
Example #18
Source File: spacy_utlis.py    From profanity-filter with GNU General Public License v3.0
def make_token(nlp: spacy.language.Language, word: Union[str, Token]) -> Token:
    if hasattr(word, 'text'):
        return word
    doc = parse(nlp=nlp, text=word)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[:])
    return doc[0] 
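The retokenize/merge step is what collapses a multi-word string into a single Token; the same mechanism, standalone (model name is an assumption):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("New York")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[:])  # merge the whole Doc into one token
print(len(doc), doc[0].text)  # 1 New York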
Example #19
Source File: markov_engine.py    From armchair-expert with MIT License
def from_token(token: Token) -> 'MarkovNeighbor':
        key = token.text.lower()
        text = token.text
        if CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES) == CapitalizationMode.COMPOUND:
            compound = True
        else:
            compound = False
        pos = Pos.from_token(token)
        values = [0, 0]
        dist = [0] * (MARKOV_WINDOW_SIZE * 2 + 1)
        return MarkovNeighbor(key, text, pos, compound, values, dist) 
Example #20
Source File: triple.py    From ravestate with BSD 3-Clause "New" or "Revised" License
def set_subject(self, subject: Union[Token, QuestionWord]):
        self._subject = subject 
Example #21
Source File: markov_engine.py    From armchair-expert with MIT License
def from_token(token: Token) -> 'MarkovWord':
        if CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES) == CapitalizationMode.COMPOUND:
            compound = True
        else:
            compound = False
        return MarkovWord(token.text, Pos.from_token(token), compound=compound, neighbors={}) 
Example #22
Source File: structure.py    From armchair-expert with MIT License
def analyze(token: Token, mode: CapitalizationMode):
        pos = Pos.from_token(token)
        mode = PoSCapitalizationMode(pos, mode)
        return mode.to_embedding() 
Example #23
Source File: nlp.py    From armchair-expert with MIT License
def from_token(token: Token, compound_rules: Optional[List[str]] = None) -> 'CapitalizationMode':

        # Try to make a guess for many common patterns
        pos = Pos.from_token(token)
        if pos in [Pos.NUM, Pos.EMOJI, Pos.SYM, Pos.SPACE, Pos.EOS, Pos.HASHTAG, Pos.PUNCT, Pos.URL]:
            return CapitalizationMode.COMPOUND

        if token.text[0] == '@' or token.text[0] == '#':
            return CapitalizationMode.COMPOUND

        if compound_rules and token.text in compound_rules:  # compound_rules may be None
            return CapitalizationMode.COMPOUND

        lower_count = 0
        upper_count = 0
        upper_start = False
        for idx, c in enumerate(token.text):

            if c.isupper():
                upper_count += 1
                if upper_start:
                    upper_start = False
                if idx == 0:
                    upper_start = True
            elif c.islower():
                lower_count += 1

        if upper_start:
            return CapitalizationMode.UPPER_FIRST
        elif lower_count > 0 and upper_count == 0:
            return CapitalizationMode.LOWER_ALL
        elif upper_count > 0 and lower_count == 0:
            return CapitalizationMode.UPPER_ALL
        elif upper_count == 0 and lower_count == 0:
            return CapitalizationMode.NONE
        else:
            return CapitalizationMode.COMPOUND 
Example #24
Source File: spacy_indexer.py    From allennlp with Apache License 2.0
def tokens_to_indices(
        self, tokens: List[SpacyToken], vocabulary: Vocabulary
    ) -> Dict[str, List[numpy.ndarray]]:
        if not all(isinstance(x, SpacyToken) for x in tokens):
            raise ValueError(
                "The spacy indexer requires you to use a Tokenizer which produces SpacyTokens."
            )
        indices: List[numpy.ndarray] = [token.vector for token in tokens]
        return {"tokens": indices} 
Example #25
Source File: text_field.py    From allennlp with Apache License 2.0
def __init__(self, tokens: List[Token], token_indexers: Dict[str, TokenIndexer]) -> None:
        self.tokens = tokens
        self._token_indexers = token_indexers
        self._indexed_tokens: Optional[Dict[str, IndexedTokenList]] = None

        if not all(isinstance(x, (Token, SpacyToken)) for x in tokens):
            raise ConfigurationError(
                "TextFields must be passed Tokens. "
                "Found: {} with types {}.".format(tokens, [type(x) for x in tokens])
            ) 
Example #26
Source File: text_field.py    From magnitude with MIT License
def __init__(self, tokens, token_indexers):
        self.tokens = tokens
        self._token_indexers = token_indexers
        self._indexed_tokens = None
        self._indexer_name_to_indexed_token = None

        if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
            raise ConfigurationError(u"TextFields must be passed Tokens. "
                                     u"Found: {} with types {}.".format(tokens, [type(x) for x in tokens]))

Example #27
Source File: text_field.py    From gtos with MIT License
def __init__(self, tokens: List[Token], token_indexers: Dict[str, TokenIndexer]) -> None:
        self.tokens = tokens
        self._token_indexers = token_indexers
        self._indexed_tokens: Optional[Dict[str, TokenList]] = None
        self._indexer_name_to_indexed_token: Optional[Dict[str, List[str]]] = None

        if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
            raise ConfigurationError("TextFields must be passed Tokens. "
                                     "Found: {} with types {}.".format(tokens, [type(x) for x in tokens])) 
Example #28
Source File: question_word.py    From ravestate with BSD 3-Clause "New" or "Revised" License
def __init__(self, token: Token):
        self.text = self.question_words[token.text.lower()]
        self.lemma_ = self.question_words[token.text.lower()]
        self.pos_ = self.question_pos
        self.dep_ = token.dep_
        self.is_space = False
        self.children = list() 
Example #29
Source File: triple.py    From ravestate with BSD 3-Clause "New" or "Revised" License
def __init__(self, subject: Token = None, predicate: Token = None, object: Token = None):
        self.set_subject(subject)
        self.set_predicate(predicate)
        self.set_object(object)