Python spacy.tokens.Token() Examples

The following are 29 code examples of spacy.tokens.Token(), collected from open-source projects; the original project and source file are listed with each example. You may also want to check out all available functions and classes of the module spacy.tokens.
Example #1
Source File: helpers.py    From errudite with GNU General Public License v2.0
def get_token_feature(t: Token, label: str) -> str:
    """Get the linguistic feature given a Spacy.Token obj and a label
    
    Arguments:
        t {Token} -- input token
        label {str} -- linguistic feature to return 
    
    Returns:
        str -- linguistic feature
    """

    if label in ['text', 'orth']:
        return t.text
    if label.lower() == 'ent':
        label = 'ent_type'
    return getattr(t, '{}_'.format(label.lower()), '') 
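A minimal usage sketch of this helper; the model name "en_core_web_sm" is an assumption and not part of the original snippet:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("Berlin is a city in Germany.")
token = doc[0]  # "Berlin"

print(get_token_feature(token, "text"))   # 'Berlin'
print(get_token_feature(token, "lemma"))  # 'Berlin'
print(get_token_feature(token, "ent"))    # 'GPE' -- 'ent' is mapped to ent_type_ above

Because the final getattr falls back to '', an unrecognized label returns an empty string rather than raising.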
Example #2
Source File: spacy_annotator.py    From errudite with GNU General Public License v2.0
def remove_stopwords(self, sentence_str: str=None, tokens: List[Token]=None, use_lemma: bool=True) -> str:
        """Function which gets a normalized string of the sentence and removes stop words
        
        Keyword Arguments:
            sentence_str {str} -- input sentence string (default: {None})
            tokens {List[Token]} -- pre-computed token list, with feature added (default: {None})
            use_lemma {bool} -- return the lemma or the text (default: {True})
        
        Returns:
            str -- the str with stopwords removed
        """
        if not tokens and sentence_str:
            #sentence_str = normalize_answer(sentence_str)
            tokens = self.model(sentence_str)
        elif not tokens:
            tokens = []
        #word_tokenize(sentence_str)
        attr = 'lemma_' if use_lemma else 'text' # what to merge
        return ' '.join([ getattr(token, attr) for token in tokens
            if not token.is_punct and token.text not in STOP_WORDS and token.lemma_ not in STOP_WORDS]) 
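The core of that filter, pulled out as a standalone sketch; it assumes the project's STOP_WORDS is (or behaves like) spaCy's English stop-word set, and the model name is again an assumption:

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("The cats are sitting on the mat.")
kept = [t.lemma_ for t in doc
        if not t.is_punct and t.text not in STOP_WORDS and t.lemma_ not in STOP_WORDS]
print(" ".join(kept))  # e.g. "cat sit mat"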
Example #3
Source File: nlp.py    From armchair-expert with MIT License
def from_token(token: Token, people: list = None) -> Optional['Pos']:
        if token.text[0] == '#':
            return Pos.HASHTAG
        elif token.text[0] == '@':
            return Pos.PROPN
        elif token.text[0] == ' ' or token.text[0] == "\n":
            return Pos.SPACE

        if token._.is_emoji:
            return Pos.EMOJI

        # Make up for shortcomings of NLP in detecting online nicknames
        if people is not None:
            if token.text in people:
                return Pos.PROPN

        if re.match(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', token.text):
            return Pos.URL

        try:
            return Pos[token.pos_]
        except KeyError:
            print("Unknown PoS: %s" % token.text)
            return Pos.X 
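The token._.is_emoji check reads a custom extension attribute that armchair-expert registers elsewhere in its pipeline; a hypothetical, minimal registration that would make the check above work looks like this (the attribute set here is illustrative only):

from spacy.tokens import Token

EMOJI_CHARS = {"😀", "🎉", "❤️"}  # illustrative set; the real project detects emoji differently
if not Token.has_extension("is_emoji"):
    Token.set_extension("is_emoji", getter=lambda t: t.text in EMOJI_CHARS)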
Example #4
Source File: extract_triples.py    From ravestate with BSD 3-Clause "New" or "Revised" License
def triple_search(triple: Triple, token: Token):
    """
    Recursively search through the dependency tree:
    look for triple values in each of the children and call this function on the child nodes.
    """
    question_word = None
    for word in token.children:
        if word.text.lower() in QuestionWord.question_words:
            question_word = QuestionWord(word)
            word = QuestionWord(word)
            if not triple.get_object():
                triple.set_object(question_word)
        elif word.dep_ in OBJECT_SET:
            triple.set_object(word)
        if word.dep_ in SUBJECT_SET:
            triple.set_subject(word)
        if isinstance(word, Token) and word.dep_ not in RECURSION_BLACKLIST:
            triple = triple_search(triple, word)
    if not triple.get_subject() and question_word:
        triple.set_subject(question_word)
    return triple 
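Triple, QuestionWord, and the OBJECT_SET / SUBJECT_SET / RECURSION_BLACKLIST constants are defined elsewhere in ravestate. As a rough, standalone illustration of the same kind of dependency-tree walk (model name is an assumption):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("Alice ate the apple.")
root = next(t for t in doc if t.dep_ == "ROOT")
subjects = [c.text for c in root.children if c.dep_ in {"nsubj", "nsubjpass"}]
objects = [c.text for c in root.children if c.dep_ in {"dobj", "obj", "attr"}]
print(root.text, subjects, objects)  # e.g. ate ['Alice'] ['apple']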
Example #5
Source File: helpers.py    From errudite with GNU General Public License v2.0
def change_matched_token_form(a_token: Token,
    a_pattern: Dict[str, str],
    b_pattern: Dict[str, str]) -> str:
    # first, deal with orth.
    if get_str_from_pattern(b_pattern):
        return get_str_from_pattern(b_pattern)
    elif 'TAG' in b_pattern and 'TAG' in a_pattern:  # deal with the tags
        # singular -> plural
        if a_pattern['TAG'] in ['NN', 'NNP'] and b_pattern['TAG'] in ['NNS', 'NNPS']:
            return pluralize(a_token.text)
        # plural -> singular
        elif b_pattern['TAG'] in ['NN', 'NNP'] and a_pattern['TAG'] in ['NNS', 'NNPS']:
            return singularize(a_token.text)
        # verb form change
        elif a_pattern['TAG'] in VBs and b_pattern['TAG'] in VBs:
            return conjugate(a_token.text, tag=b_pattern['TAG'])
    elif 'POS' in b_pattern and 'POS' in a_pattern:
        # if IS_DEBUGGING == 'change_matched_token_form':
    #    print ('unmatched token form change', a_token, b_token, a_pattern, b_pattern)
        return a_token.text
    return a_token.text 
Example #6
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_location(token: Token) -> bool:
    return token.ent_type_ in ['LOC', 'GPE'] 
Example #7
Source File: semantic_rule_detector.py    From errudite with GNU General Public License v2.0
def _gen_token_pattern(self, token: Token, label: str, use_orth: bool=False, match_op: str=None) -> Dict[str, str]:
        """generate the matcher token 
        
        Arguments:
            token {Token} -- A token
            label {str} -- provided label (orth, lemma, dep, pos, tag, etc.)
        
        Keyword Arguments:
            use_orth {bool} -- just use the orth (default: {False})
            match_op {str} -- repeat match. Can be (+, *, ?, !) (default: {None})
                
        Returns:
            Dict[str, str] -- Generated pattern that can be fed to the spaCy Matcher
        """
        if use_orth:
            label = 'lower'  # 'orth'
        if not label and match_op:  # dummy match that allows a blank to be filled in
            return {'OP': match_op, DUMMY_FLAG: True}
        if not token or not label: # just want to match this token, but no label
            return None
        pattern = {}
        for label in self._extend_label([token], label):
            feature = get_token_feature(token, label)
            if label.lower() == "tag" and feature not in VBs + WHs + NNs:
                return None
            pattern[label.upper()] = get_token_feature(token, label)
        if match_op: # match 0-N times of this token
            pattern['OP'] = match_op
        return pattern 
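The dictionaries built here (keys such as LOWER, TAG, and OP) are standard spaCy Matcher token patterns; a short sketch of how such a pattern is consumed, using the spaCy v3 Matcher.add signature (model name is an assumption):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "how"}, {"TAG": "JJ", "OP": "?"}, {"TAG": "VBZ"}]
matcher.add("RULE", [pattern])  # spaCy v3; v2 used matcher.add("RULE", None, pattern)
doc = nlp("How long is the river?")
print([doc[start:end].text for _, start, end in matcher(doc)])  # e.g. ['How long is']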
Example #8
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_year(token: Token) -> bool:
    '''
    Return True if the token looks like a year (e.g. '1990' or '1990s').
    '''
    try:
        year_str = token.text
        if year_str.endswith('s'):
            year_str = year_str[:-1]
        year = int(year_str)
        if year >= 1000 and year <= 2100:
            return True
    except ValueError:
        pass
    return False 
Example #9
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_month(token: Token) -> bool:
    MONTHS = set(['january', 'february', 'march', 'april', \
        'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'])
    return token.lemma_ in MONTHS 
Example #10
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_date(token: Token) -> bool:
    return is_year(token) or is_month(token) or token.ent_type_ in ['DATE', 'TIME'] 
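A quick check of the date helpers together (is_date builds on is_year and is_month above; the model name is an assumption):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("The war ended in August 1945.")
print([(t.text, is_date(t)) for t in doc])
# e.g. [('The', False), ('war', False), ('ended', False), ('in', False),
#       ('August', True), ('1945', True), ('.', False)]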
Example #11
Source File: text_field.py    From stog with MIT License
def __init__(self, tokens: List[Token], token_indexers: Dict[str, TokenIndexer]) -> None:
        self.tokens = tokens
        self._token_indexers = token_indexers
        self._indexed_tokens: Optional[Dict[str, TokenList]] = None
        self._indexer_name_to_indexed_token: Optional[Dict[str, List[str]]] = None

        if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
            raise ConfigurationError("TextFields must be passed Tokens. "
                                     "Found: {} with types {}.".format(tokens, [type(x) for x in tokens])) 
Example #12
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_numeric(token: Token) -> bool:
    return token.tag_ == 'CD' #or token.ent_type_ in ['PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'] 
Example #13
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_prop_noun(token: Token) -> bool:
    return token.tag_ in NNP_POS or token.ent_type_ in NNP_NERS 
Example #14
Source File: qc_ner.py    From errudite with GNU General Public License v2.0
def is_entity(token: Token) -> bool:
    return token.ent_type_ != ''  # ent_type_ is an empty string, not None, for tokens outside any entity
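A small sanity check for is_numeric and the corrected is_entity (the model name is an assumption):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("Three senators visited Berlin.")
print([(t.text, is_numeric(t), is_entity(t)) for t in doc])
# e.g. [('Three', True, True), ('senators', False, False), ('visited', False, False),
#       ('Berlin', False, True), ('.', False, False)]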
Example #15
Source File: discrete_feature_extractor.py    From medaCy with GNU General Public License v3.0
def _token_to_feature_dict(self, index, sentence):
        """
        Extracts features of a given token

        :param index: the index of the token in the sequence
        :param sentence: an array of tokens corresponding to a sequence
        :return: a dictionary with a feature representation of the spaCy token object.
        """

        # This should automatically gather features that are set on tokens
        # by looping over all attributes set on sentence[index] that begin with 'feature'

        features = {
            'bias': 1.0
        }

        for i in range(-self.window_size, self.window_size+1):
            # loop through our window, ignoring tokens that aren't there
            if not 0 <= index + i < len(sentence):
                continue

            token = sentence[index+i]

            # adds features from medacy pipeline
            current = {f'{i}:{feature}': token._.get(feature) for feature in self.all_custom_features}

            # adds features that are overlaid from spaCy token attributes
            for feature in self.spacy_features:
                if isinstance(getattr(token, feature), Token):
                    current[f'{i}:{feature}'] = getattr(token, feature).text
                else:
                    current[f'{i}:{feature}'] = getattr(token, feature)

            features.update(current)

        return features 
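The token._.get(feature) calls read custom extension attributes that medaCy's pipeline components register on each token; a hypothetical example of one such attribute, just to show the mechanism (the attribute name here is made up):

from spacy.tokens import Token

if not Token.has_extension("feature_is_capitalized"):
    Token.set_extension("feature_is_capitalized",
                        getter=lambda t: t.text[:1].isupper())
# afterwards token._.get("feature_is_capitalized") returns True or False for any token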
Example #16
Source File: pos_drop_feature_extractor.py    From medaCy with GNU General Public License v3.0
def _token_to_feature_dict(self, index, sentence):
        """
        Extracts features of a given token

        :param index: the index of the token in the sequence
        :param sentence: an array of tokens corresponding to a sequence
        :return: a dictionary with a feature representation of the spaCy token object.
        """

        # This should automatically gather features that are set on tokens
        # by looping over all attributes set on sentence[index] that begin with 'feature'

        target = sentence[index]
        sentence = [token for token in sentence if token.pos_ not in self.ignored_pos or token is target]

        features = {
            'bias': 1.0
        }

        for i in range(-self.window_size, self.window_size+1):
            # loop through our window, ignoring tokens that aren't there
            if not 0 <= index + i < len(sentence):
                continue

            token = sentence[index+i]

            # adds features from medacy pipeline
            current = {'%i:%s' % (i, feature): token._.get(feature) for feature in self.all_custom_features}

            # adds features that are overlaid from spaCy token attributes
            for feature in self.spacy_features:
                if isinstance(getattr(token, feature), Token):
                    current['%i:%s' % (i, feature)] = getattr(token, feature).text
                else:
                    current['%i:%s' % (i, feature)] = getattr(token, feature)

            features.update(current)

        return features 
Example #17
Source File: spacy_utlis.py    From profanity-filter with GNU General Public License v3.0
def parse(nlp: spacy.language.Language,
          text: str, language: Language = None,
          use_profanity_filter: bool = False) -> Union[Doc, Token]:
    disable = [] if use_profanity_filter else [SpacyProfanityFilterComponent.name]
    component_cfg = {}
    if use_profanity_filter:
        component_cfg[SpacyProfanityFilterComponent.name] = {
            'language': language,
        }
    return nlp(text, disable=disable, component_cfg=component_cfg) 
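SpacyProfanityFilterComponent is the project's own pipeline component, but the disable and component_cfg keywords are standard arguments of spaCy's Language.__call__; for instance (model name is an assumption):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("A sentence to parse.", disable=["ner"])  # skip named-entity recognition for this call
print(doc.ents)  # () -- empty, since ner was disabled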
Example #18
Source File: spacy_utlis.py    From profanity-filter with GNU General Public License v3.0
def make_token(nlp: spacy.language.Language, word: Union[str, Token]) -> Token:
    if hasattr(word, 'text'):
        return word
    doc = parse(nlp=nlp, text=word)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[:])
    return doc[0] 
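The retokenize/merge step is what collapses a multi-word string into a single Token; the same mechanism, standalone (model name is an assumption):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed English pipeline
doc = nlp("New York")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[:])  # merge the whole Doc into one token
print(len(doc), doc[0].text)  # 1 New York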
Example #19
Source File: markov_engine.py    From armchair-expert with MIT License
def from_token(token: Token) -> 'MarkovNeighbor':
        key = token.text.lower()
        text = token.text
        if CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES) == CapitalizationMode.COMPOUND:
            compound = True
        else:
            compound = False
        pos = Pos.from_token(token)
        values = [0, 0]
        dist = [0] * (MARKOV_WINDOW_SIZE * 2 + 1)
        return MarkovNeighbor(key, text, pos, compound, values, dist) 
Example #20
Source File: triple.py    From ravestate with BSD 3-Clause "New" or "Revised" License
def set_subject(self, subject: Union[Token, QuestionWord]):
        self._subject = subject 
Example #21
Source File: markov_engine.py    From armchair-expert with MIT License
def from_token(token: Token) -> 'MarkovWord':
        if CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES) == CapitalizationMode.COMPOUND:
            compound = True
        else:
            compound = False
        return MarkovWord(token.text, Pos.from_token(token), compound=compound, neighbors={}) 
Example #22
Source File: structure.py    From armchair-expert with MIT License
def analyze(token: Token, mode: CapitalizationMode):
        pos = Pos.from_token(token)
        mode = PoSCapitalizationMode(pos, mode)
        return mode.to_embedding() 
Example #23
Source File: nlp.py    From armchair-expert with MIT License
def from_token(token: Token, compound_rules: Optional[List[str]] = None) -> 'CapitalizationMode':

        # Try to make a guess for many common patterns
        pos = Pos.from_token(token)
        if pos in [Pos.NUM, Pos.EMOJI, Pos.SYM, Pos.SPACE, Pos.EOS, Pos.HASHTAG, Pos.PUNCT, Pos.URL]:
            return CapitalizationMode.COMPOUND

        if token.text[0] == '@' or token.text[0] == '#':
            return CapitalizationMode.COMPOUND

        if compound_rules and token.text in compound_rules:  # compound_rules may be None
            return CapitalizationMode.COMPOUND

        lower_count = 0
        upper_count = 0
        upper_start = False
        for idx, c in enumerate(token.text):

            if c.isupper():
                upper_count += 1
                if upper_start:
                    upper_start = False
                if idx == 0:
                    upper_start = True
            elif c.islower():
                lower_count += 1

        if upper_start:
            return CapitalizationMode.UPPER_FIRST
        elif lower_count > 0 and upper_count == 0:
            return CapitalizationMode.LOWER_ALL
        elif upper_count > 0 and lower_count == 0:
            return CapitalizationMode.UPPER_ALL
        elif upper_count == 0 and lower_count == 0:
            return CapitalizationMode.NONE
        else:
            return CapitalizationMode.COMPOUND 
Example #24
Source File: spacy_indexer.py    From allennlp with Apache License 2.0
def tokens_to_indices(
        self, tokens: List[SpacyToken], vocabulary: Vocabulary
    ) -> Dict[str, List[numpy.ndarray]]:
        if not all(isinstance(x, SpacyToken) for x in tokens):
            raise ValueError(
                "The spacy indexer requires you to use a Tokenizer which produces SpacyTokens."
            )
        indices: List[numpy.ndarray] = [token.vector for token in tokens]
        return {"tokens": indices} 
Example #25
Source File: text_field.py    From allennlp with Apache License 2.0
def __init__(self, tokens: List[Token], token_indexers: Dict[str, TokenIndexer]) -> None:
        self.tokens = tokens
        self._token_indexers = token_indexers
        self._indexed_tokens: Optional[Dict[str, IndexedTokenList]] = None

        if not all(isinstance(x, (Token, SpacyToken)) for x in tokens):
            raise ConfigurationError(
                "TextFields must be passed Tokens. "
                "Found: {} with types {}.".format(tokens, [type(x) for x in tokens])
            ) 
Example #26
Source File: text_field.py    From magnitude with MIT License
def __init__(self, tokens, token_indexers):
        self.tokens = tokens
        self._token_indexers = token_indexers
        self._indexed_tokens = None
        self._indexer_name_to_indexed_token = None

        if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
            raise ConfigurationError(u"TextFields must be passed Tokens. "
                                     u"Found: {} with types {}.".format(tokens, [type(x) for x in tokens]))

Example #27
Source File: text_field.py    From gtos with MIT License
def __init__(self, tokens: List[Token], token_indexers: Dict[str, TokenIndexer]) -> None:
        self.tokens = tokens
        self._token_indexers = token_indexers
        self._indexed_tokens: Optional[Dict[str, TokenList]] = None
        self._indexer_name_to_indexed_token: Optional[Dict[str, List[str]]] = None

        if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
            raise ConfigurationError("TextFields must be passed Tokens. "
                                     "Found: {} with types {}.".format(tokens, [type(x) for x in tokens])) 
Example #28
Source File: question_word.py    From ravestate with BSD 3-Clause "New" or "Revised" License
def __init__(self, token: Token):
        self.text = self.question_words[token.text.lower()]
        self.lemma_ = self.question_words[token.text.lower()]
        self.pos_ = self.question_pos
        self.dep_ = token.dep_
        self.is_space = False
        self.children = list() 
Example #29
Source File: triple.py    From ravestate with BSD 3-Clause "New" or "Revised" License
def __init__(self, subject: Token = None, predicate: Token = None, object: Token = None):
        self.set_subject(subject)
        self.set_predicate(predicate)
        self.set_object(object)