Python spacy.tokens.Token() Examples
The following are 30 code examples of spacy.tokens.Token().
Each example notes the original project and source file it comes from.
You may also want to check out all available functions and classes of the module spacy.tokens.
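In most code, Token objects are not constructed directly; they are produced when a loaded spaCy pipeline turns text into a Doc. The short sketch below is not taken from any of the projects listed here; it assumes spaCy and the en_core_web_sm model are installed, and shows where Token objects come from along with the attributes most of the examples rely on.

import spacy
from spacy.tokens import Token

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple was founded in California in 1976.")

for token in doc:
    assert isinstance(token, Token)
    # Commonly used attributes: surface text, lemma, coarse/fine POS tags, entity label
    print(token.text, token.lemma_, token.pos_, token.tag_, token.ent_type_)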
Example #1
Source File: helpers.py From errudite with GNU General Public License v2.0 | 6 votes |
def get_token_feature(t: Token, label: str) -> str:
    """Get the linguistic feature given a Spacy.Token obj and a label

    Arguments:
        t {Token} -- input token
        label {str} -- linguistic feature to return

    Returns:
        str -- linguistic feature
    """
    if label in ['text', 'orth']:
        return t.text
    if label.lower() == 'ent':
        label = 'ent_type'
    return getattr(t, '{}_'.format(label.lower()), '')
Example #2
Source File: spacy_annotator.py From errudite with GNU General Public License v2.0 | 6 votes |
def remove_stopwords(self, sentence_str: str=None, tokens: List[Token]=None, use_lemma: bool=True) -> str:
    """Function which gets a normalized string of the sentence and removes stop words

    Keyword Arguments:
        sentence_str {str} -- input sentence string (default: {None})
        tokens {List[Token]} -- pre-computed token list, with feature added (default: {None})
        use_lemma {bool} -- return the lemma or the text (default: {True})

    Returns:
        str -- the str with stopwords removed
    """
    if not tokens and sentence_str:
        #sentence_str = normalize_answer(sentence_str)
        tokens = self.model(sentence_str)
    elif not tokens:
        tokens = []  #word_tokenize(sentence_str)
    attr = 'lemma_' if use_lemma else 'text'  # what to merge
    return ' '.join([getattr(token, attr) for token in tokens
        if not token.is_punct and
        token.text not in STOP_WORDS and
        token.lemma_ not in STOP_WORDS])
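The excerpt above depends on errudite's own model handle (self.model) and STOP_WORDS set. A hypothetical standalone variant of the same idea, using only spaCy's built-in English stop-word list and assuming en_core_web_sm is installed, might look like this:

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")

def remove_stopwords(sentence: str, use_lemma: bool = True) -> str:
    # Drop punctuation and stop words, then join the lemmas (or raw text) back together.
    tokens = nlp(sentence)
    attr = "lemma_" if use_lemma else "text"
    return " ".join(
        getattr(t, attr) for t in tokens
        if not t.is_punct and t.text.lower() not in STOP_WORDS and t.lemma_ not in STOP_WORDS
    )

print(remove_stopwords("The cats were sitting on the mat"))  # roughly: "cat sit mat"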
Example #3
Source File: nlp.py From armchair-expert with MIT License | 6 votes |
def from_token(token: Token, people: list = None) -> Optional['Pos']:
    if token.text[0] == '#':
        return Pos.HASHTAG
    elif token.text[0] == '@':
        return Pos.PROPN
    elif token.text[0] == ' ' or token.text[0] == "\n":
        return Pos.SPACE

    if token._.is_emoji:
        return Pos.EMOJI

    # Makeup for shortcomings of NLP detecting online nicknames
    if people is not None:
        if token.text in people:
            return Pos.PROPN

    if re.match(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', token.text):
        return Pos.URL

    try:
        return Pos[token.pos_]
    except KeyError:
        print("Unknown PoS: %s" % token.text)
        return Pos.X
Example #4
Source File: helpers.py From errudite with GNU General Public License v2.0 | 6 votes |
def get_token_feature(t: Token, label: str) -> str:
    """Get the linguistic feature given a Spacy.Token obj and a label

    Arguments:
        t {Token} -- input token
        label {str} -- linguistic feature to return

    Returns:
        str -- linguistic feature
    """
    if label in ['text', 'orth']:
        return t.text
    if label.lower() == 'ent':
        label = 'ent_type'
    return getattr(t, '{}_'.format(label.lower()), '')
Example #5
Source File: extract_triples.py From ravestate with BSD 3-Clause "New" or "Revised" License | 6 votes |
def triple_search(triple: Triple, token: Token):
    """
    Recursive search through the dependency tree
    looks for triple values in each of the children and calls itself with the children nodes
    """
    question_word = None
    for word in token.children:
        if word.text.lower() in QuestionWord.question_words:
            question_word = QuestionWord(word)
            word = QuestionWord(word)
            if not triple.get_object():
                triple.set_object(question_word)
        elif word.dep_ in OBJECT_SET:
            triple.set_object(word)

        if word.dep_ in SUBJECT_SET:
            triple.set_subject(word)

        if isinstance(word, Token) and word.dep_ not in RECURSION_BLACKLIST:
            triple = triple_search(triple, word)

    if not triple.get_subject() and question_word:
        triple.set_subject(question_word)

    return triple
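The recursion above works because every spaCy Token exposes its syntactic children, so the whole parse tree can be walked from a sentence root. A minimal sketch of that traversal, independent of ravestate's Triple and QuestionWord classes and assuming en_core_web_sm is installed:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Who wrote the book?")

def walk(token, depth: int = 0) -> None:
    # Print each token with its dependency label, indented by tree depth.
    print("  " * depth + f"{token.text} ({token.dep_})")
    for child in token.children:
        walk(child, depth + 1)

for sent in doc.sents:
    walk(sent.root)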
Example #6
Source File: helpers.py From errudite with GNU General Public License v2.0 | 6 votes |
def change_matched_token_form(a_token: Token, a_pattern: Dict[str, str], b_pattern: Dict[str, str]) -> str:
    # first, deal with orth.
    if get_str_from_pattern(b_pattern):
        return get_str_from_pattern(b_pattern)
    elif 'TAG' in b_pattern and 'TAG' in a_pattern:  # deal with the tags
        # singular -> plural
        if a_pattern['TAG'] in ['NN', 'NNP'] and b_pattern['TAG'] in ['NNS', 'NNPS']:
            return pluralize(a_token.text)
        # plural -> singular
        elif b_pattern['TAG'] in ['NN', 'NNP'] and a_pattern['TAG'] in ['NNS', 'NNPS']:
            return singularize(a_token.text)
        # verb form change
        elif a_pattern['TAG'] in VBs and b_pattern['TAG'] in VBs:
            return conjugate(a_token.text, tag=b_pattern['TAG'])
    elif 'POS' in b_pattern and 'POS' in a_pattern:
        # if IS_DEBUGGING == 'change_matched_token_form':
        #     print('unmachted token form change', a_token, b_token, a_pattern, b_pattern)
        return a_token.text
    return a_token.text
Example #7
Source File: qc_ner.py From errudite with GNU General Public License v2.0 | 5 votes |
def is_location(token: Token) -> bool:
    return token.ent_type_ in ['LOC', 'GPE']
Example #8
Source File: semantic_rule_detector.py From errudite with GNU General Public License v2.0 | 5 votes |
def _gen_token_pattern(self, token: Token, label: str, use_orth: bool=False, match_op: str=None) -> Dict[str, str]:
    """generate the matcher token

    Arguments:
        token {Token} -- A token
        label {str} -- provided label (orth, lemma, dep, pos, tag, etc.)

    Keyword Arguments:
        use_orth {bool} -- just use the orth (default: {False})
        match_op {str} -- repeat match. Can be (+, *, ?, !) (default: {None})

    Returns:
        Dict[str, str] -- Generated pattern that could input to the SPACY matcher
    """
    if use_orth:
        label = 'lower'  #'orth'
    if not label and match_op:  # dummy match that allow a blank to be filled in
        return {'OP': match_op, DUMMY_FLAG: True}
    if not token or not label:  # just want to match this token, but no label
        return None
    pattern = {}
    for label in self._extend_label([token], label):
        feature = get_token_feature(token, label)
        if label.lower() == "tag" and feature not in VBs + WHs + NNs:
            return None
        pattern[label.upper()] = get_token_feature(token, label)
    if match_op:  # match 0-N times of this token
        pattern['OP'] = match_op
    return pattern
Example #9
Source File: qc_ner.py From errudite with GNU General Public License v2.0 | 5 votes |
def is_year(token: Token) -> bool:
    ''' If the time is year '''
    try:
        year_str = token.text
        if year_str.endswith('s'):
            year_str = year_str[:-1]
        year = int(year_str)
        if year >= 1000 and year <= 2100:
            return True
    except ValueError:
        pass
    return False
Example #10
Source File: qc_ner.py From errudite with GNU General Public License v2.0 | 5 votes |
def is_month(token: Token) -> bool:
    MONTHS = set(['january', 'february', 'march', 'april',
        'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'])
    return token.lemma_ in MONTHS
Example #11
Source File: qc_ner.py From errudite with GNU General Public License v2.0 | 5 votes |
def is_date(token: Token) -> bool:
    return is_year(token) or is_month(token) or token.ent_type_ in ['DATE', 'TIME']
Example #12
Source File: text_field.py From stog with MIT License | 5 votes |
def __init__(self, tokens: List[Token], token_indexers: Dict[str, TokenIndexer]) -> None:
    self.tokens = tokens
    self._token_indexers = token_indexers
    self._indexed_tokens: Optional[Dict[str, TokenList]] = None
    self._indexer_name_to_indexed_token: Optional[Dict[str, List[str]]] = None

    if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
        raise ConfigurationError("TextFields must be passed Tokens. "
                                 "Found: {} with types {}.".format(tokens, [type(x) for x in tokens]))
Example #13
Source File: qc_ner.py From errudite with GNU General Public License v2.0 | 5 votes |
def is_numeric(token: Token) -> bool:
    return token.tag_ == 'CD'  #or token.ent_type_ in ['PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']
Example #14
Source File: qc_ner.py From errudite with GNU General Public License v2.0 | 5 votes |
def is_prop_noun(token: Token) -> bool:
    return token.tag_ in NNP_POS or token.ent_type_ in NNP_NERS
Example #15
Source File: qc_ner.py From errudite with GNU General Public License v2.0 | 5 votes |
def is_entity(token: Token) -> bool:
    # ent_type_ is an empty string (not None) when the token is not part of an entity
    return token.ent_type_ != ''
Example #16
Source File: discrete_feature_extractor.py From medaCy with GNU General Public License v3.0 | 5 votes |
def _token_to_feature_dict(self, index, sentence):
    """
    Extracts features of a given token

    :param index: the index of the token in the sequence
    :param sentence: an array of tokens corresponding to a sequence
    :return: a dictionary with a feature representation of the spaCy token object.
    """

    # This should automatically gather features that are set on tokens
    # by looping over all attributes set on sentence[index] that begin with 'feature'
    features = {
        'bias': 1.0
    }

    for i in range(-self.window_size, self.window_size+1):
        # loop through our window, ignoring tokens that aren't there
        if not 0 <= index + i < len(sentence):
            continue

        token = sentence[index+i]

        # adds features from medacy pipeline
        current = {f'{i}:{feature}': token._.get(feature) for feature in self.all_custom_features}

        # adds features that are overlayed from spacy token attributes
        for feature in self.spacy_features:
            if isinstance(getattr(token, feature), Token):
                current[f'{i}:{feature}'] = getattr(token, feature).text
            else:
                current[f'{i}:{feature}'] = getattr(token, feature)

        features.update(current)

    return features
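The token._.get(...) calls read custom attributes that a pipeline component registered earlier via Token.set_extension, while the spaCy attributes are read with plain getattr. A minimal sketch of the extension mechanism (the attribute name feature_is_drug is made up here, not one of medaCy's):

import spacy
from spacy.tokens import Token

# Register a writable custom attribute on all tokens (raises if registered twice).
Token.set_extension("feature_is_drug", default=False)

nlp = spacy.load("en_core_web_sm")
doc = nlp("Administer ibuprofen twice daily.")
doc[1]._.feature_is_drug = True

print(doc[1]._.get("feature_is_drug"))  # True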
Example #17
Source File: pos_drop_feature_extractor.py From medaCy with GNU General Public License v3.0 | 5 votes |
def _token_to_feature_dict(self, index, sentence):
    """
    Extracts features of a given token

    :param index: the index of the token in the sequence
    :param sentence: an array of tokens corresponding to a sequence
    :return: a dictionary with a feature representation of the spaCy token object.
    """

    # This should automatically gather features that are set on tokens
    # by looping over all attributes set on sentence[index] that begin with 'feature'
    target = sentence[index]
    sentence = [token for token in sentence if token.pos_ not in self.ignored_pos or token is target]

    features = {
        'bias': 1.0
    }

    for i in range(-self.window_size, self.window_size+1):
        # loop through our window, ignoring tokens that aren't there
        if not 0 <= index + i < len(sentence):
            continue

        token = sentence[index+i]

        # adds features from medacy pipeline
        current = {'%i:%s' % (i, feature): token._.get(feature) for feature in self.all_custom_features}

        # adds features that are overlayed from spacy token attributes
        for feature in self.spacy_features:
            if isinstance(getattr(token, feature), Token):
                current['%i:%s' % (i, feature)] = getattr(token, feature).text
            else:
                current['%i:%s' % (i, feature)] = getattr(token, feature)

        features.update(current)

    return features
Example #18
Source File: spacy_utlis.py From profanity-filter with GNU General Public License v3.0 | 5 votes |
def parse(nlp: spacy.language.Language, text: str, language: Language = None, use_profanity_filter: bool = False) -> Union[Doc, Token]:
    disable = [] if use_profanity_filter else [SpacyProfanityFilterComponent.name]
    component_cfg = {}
    if use_profanity_filter:
        component_cfg[SpacyProfanityFilterComponent.name] = {
            'language': language,
        }
    return nlp(text, disable=disable, component_cfg=component_cfg)
Example #19
Source File: spacy_utlis.py From profanity-filter with GNU General Public License v3.0 | 5 votes |
def make_token(nlp: spacy.language.Language, word: Union[str, Token]) -> Token:
    if hasattr(word, 'text'):
        return word
    doc = parse(nlp=nlp, text=word)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[:])
    return doc[0]
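A standalone sketch of the same retokenizer trick, without the project's parse() helper: merging the whole Doc collapses a multi-word string into a single Token (assuming en_core_web_sm is installed).

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("New York")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[:])  # merge the full span into one token

print(len(doc), doc[0].text)  # 1 New York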
Example #20
Source File: markov_engine.py From armchair-expert with MIT License | 5 votes |
def from_token(token: Token) -> 'MarkovNeighbor':
    key = token.text.lower()
    text = token.text
    if CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES) == CapitalizationMode.COMPOUND:
        compound = True
    else:
        compound = False
    pos = Pos.from_token(token)
    values = [0, 0]
    dist = [0] * (MARKOV_WINDOW_SIZE * 2 + 1)
    return MarkovNeighbor(key, text, pos, compound, values, dist)
Example #21
Source File: triple.py From ravestate with BSD 3-Clause "New" or "Revised" License | 5 votes |
def set_subject(self, subject: Union[Token, QuestionWord]):
    self._subject = subject
Example #22
Source File: markov_engine.py From armchair-expert with MIT License | 5 votes |
def from_token(token: Token) -> 'MarkovWord':
    if CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES) == CapitalizationMode.COMPOUND:
        compound = True
    else:
        compound = False
    return MarkovWord(token.text, Pos.from_token(token), compound=compound, neighbors={})
Example #23
Source File: structure.py From armchair-expert with MIT License | 5 votes |
def analyze(token: Token, mode: CapitalizationMode):
    pos = Pos.from_token(token)
    mode = PoSCapitalizationMode(pos, mode)
    return mode.to_embedding()
Example #24
Source File: nlp.py From armchair-expert with MIT License | 5 votes |
def from_token(token: Token, compound_rules: Optional[List[str]] = None) -> 'CapitalizationMode':
    # Try to make a guess for many common patterns
    pos = Pos.from_token(token)
    if pos in [Pos.NUM, Pos.EMOJI, Pos.SYM, Pos.SPACE, Pos.EOS, Pos.HASHTAG, Pos.PUNCT, Pos.URL]:
        return CapitalizationMode.COMPOUND

    if token.text[0] == '@' or token.text[0] == '#':
        return CapitalizationMode.COMPOUND

    if token.text in compound_rules:
        return CapitalizationMode.COMPOUND

    lower_count = 0
    upper_count = 0
    upper_start = False

    for idx, c in enumerate(token.text):
        if c.isupper():
            upper_count += 1
            if upper_start:
                upper_start = False
            if idx == 0:
                upper_start = True
        elif c.islower():
            lower_count += 1

    if upper_start:
        return CapitalizationMode.UPPER_FIRST
    elif lower_count > 0 and upper_count == 0:
        return CapitalizationMode.LOWER_ALL
    elif upper_count > 0 and lower_count == 0:
        return CapitalizationMode.UPPER_ALL
    elif upper_count == 0 and lower_count == 0:
        return CapitalizationMode.NONE
    else:
        return CapitalizationMode.COMPOUND
Example #25
Source File: spacy_indexer.py From allennlp with Apache License 2.0 | 5 votes |
def tokens_to_indices(
    self, tokens: List[SpacyToken], vocabulary: Vocabulary
) -> Dict[str, List[numpy.ndarray]]:
    if not all(isinstance(x, SpacyToken) for x in tokens):
        raise ValueError(
            "The spacy indexer requires you to use a Tokenizer which produces SpacyTokens."
        )
    indices: List[numpy.ndarray] = [token.vector for token in tokens]
    return {"tokens": indices}
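Note that token.vector is only meaningful when the loaded pipeline actually ships word vectors (for example en_core_web_md or en_core_web_lg). A quick check, assuming such a model is installed:

import spacy

nlp = spacy.load("en_core_web_md")
doc = nlp("queen")
print(doc[0].vector.shape)  # e.g. (300,)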
Example #26
Source File: text_field.py From allennlp with Apache License 2.0 | 5 votes |
def __init__(self, tokens: List[Token], token_indexers: Dict[str, TokenIndexer]) -> None:
    self.tokens = tokens
    self._token_indexers = token_indexers
    self._indexed_tokens: Optional[Dict[str, IndexedTokenList]] = None

    if not all(isinstance(x, (Token, SpacyToken)) for x in tokens):
        raise ConfigurationError(
            "TextFields must be passed Tokens. "
            "Found: {} with types {}.".format(tokens, [type(x) for x in tokens])
        )
Example #27
Source File: text_field.py From magnitude with MIT License | 5 votes |
def __init__(self, tokens, token_indexers):
    self.tokens = tokens
    self._token_indexers = token_indexers
    self._indexed_tokens = None
    self._indexer_name_to_indexed_token = None

    if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
        raise ConfigurationError(u"TextFields must be passed Tokens. "
                                 u"Found: {} with types {}.".format(tokens, [type(x) for x in tokens]))

#overrides
Example #28
Source File: text_field.py From gtos with MIT License | 5 votes |
def __init__(self, tokens: List[Token], token_indexers: Dict[str, TokenIndexer]) -> None:
    self.tokens = tokens
    self._token_indexers = token_indexers
    self._indexed_tokens: Optional[Dict[str, TokenList]] = None
    self._indexer_name_to_indexed_token: Optional[Dict[str, List[str]]] = None

    if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
        raise ConfigurationError("TextFields must be passed Tokens. "
                                 "Found: {} with types {}.".format(tokens, [type(x) for x in tokens]))
Example #29
Source File: question_word.py From ravestate with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, token: Token):
    self.text = self.question_words[token.text.lower()]
    self.lemma_ = self.question_words[token.text.lower()]
    self.pos_ = self.question_pos
    self.dep_ = token.dep_
    self.is_space = False
    self.children = list()
Example #30
Source File: triple.py From ravestate with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, subject: Token = None, predicate: Token = None, object: Token = None):
    self.set_subject(subject)
    self.set_predicate(predicate)
    self.set_object(object)