Python spacy.tokens.Span() Examples
The following are 30 code examples of spacy.tokens.Span(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out all other available functions and classes of the spacy.tokens module.
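Before the project examples, here is a minimal sketch (not taken from any of the projects below) of the two common ways to obtain a Span: slicing a Doc, and calling the Span constructor directly. The blank pipeline and the "GPE" label are illustrative choices, not requirements.

    # Minimal sketch: two common ways to create a spacy.tokens.Span.
    import spacy
    from spacy.tokens import Span

    nlp = spacy.blank("en")                  # lightweight pipeline, no model download needed
    doc = nlp("Berlin is the capital of Germany")

    span_a = doc[0:1]                        # slicing a Doc yields a Span ("Berlin")
    span_b = Span(doc, 0, 1, label="GPE")    # explicit constructor with an entity label

    print(span_a.text, span_b.label_)        # Berlin GPE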
Example #1
Source File: document.py From neuralcoref with MIT License | 6 votes |
def __new__(
    cls,
    span,
    mention_index,
    utterance_index,
    utterance_start_sent,
    speaker=None,
    gold_label=None,
    *args,
    **kwargs,
):
    # We need to override __new__, see
    # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
    obj = spacy.tokens.Span.__new__(
        cls, span.doc, span.start, span.end, *args, **kwargs
    )
    return obj
Example #2
Source File: helpers.py From errudite with GNU General Public License v2.0 | 6 votes |
def span_to_json(sentence: Span, sid: int = 0) -> List[Dict]:
    '''
    @param <Span> sentence: sentence in span type
    @return <Dict> json-serialized sentences
    '''
    if sentence is None:
        return None
    j_sentence = [{
        'idx': t.i,
        'text': t.text,
        'ner': t.ent_type_,
        'lemma': t.lemma_,
        'pos': t.pos_,
        'tag': t.tag_,
        'whitespace': t.whitespace_,
        'sid': sid
        # 'matches': []
    } for t in sentence]
    return j_sentence
Example #3
Source File: helpers.py From errudite with GNU General Public License v2.0 | 6 votes |
def gen_text_from_sent_list(sentences: List[Span]) -> str:
    '''
    Concatenate the text of a list of sentence spans, preserving each
    sentence's trailing whitespace.
    '''
    return ''.join([s.text + s[-1].whitespace_ for s in sentences])
Example #4
Source File: helpers.py From errudite with GNU General Public License v2.0 | 6 votes |
def convert_doc(doc: Union[Doc, Span, 'Target'], strict_format: str = None):
    def _strict_doc(doc):
        if not doc:
            return None
        if type(doc) == str:
            return doc
        if strict_format == 'doc':
            return doc if type(doc) == Doc else doc.as_doc()
        if strict_format == 'span':
            return doc if type(doc) == Span else doc[:]
        return doc

    def _convert(doc):
        if type(doc) == str:
            return doc
        if type(doc) == Doc or type(doc) == Span:
            return _strict_doc(doc)
        else:
            return _strict_doc(getattr(doc, 'doc', None))

    if not doc:
        return None
    if type(doc) == list:
        return [_convert(d) for d in doc]
    else:
        return _convert(doc)
Example #5
Source File: context.py From errudite with GNU General Public License v2.0 | 6 votes |
def get_sentence(self, sid: Union[int, List[int]] = 0, doc: Doc = None) -> Union[Span, List[Span]]:
    """Query a sentence in a paragraph.

    Keyword Arguments:
        sid {Union[int, List[int]]} -- the sentence id, or a list of sentence ids (default: {0})

    Returns:
        Union[Span, List[Span]] -- the sentence(s)
    """
    if doc:
        sentences = list(doc.sents)
    else:
        sentences = list(self.doc.sents)
    if type(sid) == int or type(sid) == float:
        if int(sid) >= 0 and int(sid) < len(sentences):
            return sentences[int(sid)]
    # else if it's an array of sentence ids
    sid = [int(s) for s in sid if s >= 0 and s < len(sentences)]
    if len(sid) > 0:
        filtered = [sentences[s] for s in sid]
        return filtered[0] if len(filtered) == 1 else filtered
    if sentences:
        return sentences[0]
    return None
Example #6
Source File: spacy_np_annotator.py From nlp-architect with Apache License 2.0 | 5 votes |
def get_noun_phrases(doc: Doc) -> [Span]:
    """
    Get noun phrase tags from a spacy annotated document.

    Args:
        doc (Doc): a spacy type document

    Returns:
        a list of noun phrase Span objects
    """
    assert hasattr(doc._, "noun_phrases"), "no noun_phrase attributes in document"
    return doc._.noun_phrases
Example #7
Source File: spacy_np_annotator.py From nlp-architect with Apache License 2.0 | 5 votes |
def __call__(self, doc: Doc) -> Doc:
    """
    Annotate the document with noun phrase spans
    """
    spans = []
    doc_vecs = []
    doc_chars = []
    doc_lens = []
    if len(doc) < 1:
        return doc
    for sentence in doc.sents:
        features = self._feature_extractor([t.text for t in sentence])
        if isinstance(features, tuple):
            doc_vec = features[0]
            doc_chars.append(features[1])
        else:
            doc_vec = features
        doc_vecs.append(doc_vec)
        doc_lens.append(len(doc_vec))
    doc_vectors = pad_sentences(np.asarray(doc_vecs))
    inputs = doc_vectors
    if self.char_vocab:
        max_len = doc_vectors.shape[1]
        padded_chars = np.zeros((len(doc_chars), max_len, self.model.max_word_len))
        for idx, d in enumerate(doc_chars):
            d = d[:max_len]
            padded_chars[idx, -d.shape[0]:] = d
        inputs = [inputs, padded_chars]
    np_indexes = self._infer_chunks(inputs, doc_lens)
    for s, e in np_indexes:
        np_span = Span(doc, s, e)
        spans.append(np_span)
    spans = _NPPostprocessor.process(spans)
    set_noun_phrases(doc, spans)
    return doc
Example #8
Source File: spacy_np_annotator.py From nlp-architect with Apache License 2.0 | 5 votes |
def set_noun_phrases(doc: Doc, nps: [Span]) -> None:
    """
    Set noun phrase tags

    Args:
        doc (Doc): a spacy type document
        nps ([Span]): a list of Spans
    """
    assert hasattr(doc._, "noun_phrases"), "no noun_phrase attributes in document"
    doc._.set("noun_phrases", nps)
Example #9
Source File: spacy_np_annotator.py From nlp-architect with Apache License 2.0 | 5 votes |
def process(cls, noun_phrases: [Span]) -> [Span]:
    new_phrases = []
    for phrase in noun_phrases:
        p = _NPPostprocessor._phrase_process(phrase)
        if p is not None and len(p) > 0:
            new_phrases.append(p)
    return new_phrases
Example #10
Source File: markov_engine.py From armchair-expert with MIT License | 5 votes |
def span_to_bigram(span: Span) -> list:
    grams = []
    for a_idx, a in enumerate(span):
        for b_idx, b in enumerate(span):
            dist = b_idx - a_idx
            if dist == 0:
                continue
            elif abs(dist) <= MARKOV_WINDOW_SIZE:
                grams.append([a, b, dist])
    return grams
Example #11
Source File: spacy_np_annotator.py From nlp-architect with Apache License 2.0 | 5 votes |
def _phrase_process(cls, phrase: Span) -> Span:
    last_phrase = None
    while phrase != last_phrase:
        last_phrase = phrase
        for func_args in post_processing_rules:
            pf = func_args[0]
            args = func_args[1:]
            if len(args) > 0:
                phrase = pf(phrase, *args)
            else:
                phrase = pf(phrase)
            if phrase is None:
                break
    return phrase
Example #12
Source File: volume_unit_component.py From medaCy with GNU General Public License v3.0 | 5 votes |
def __call__(self, doc):
    nlp = self.nlp
    with doc.retokenize() as retokenizer:
        # match and tag volume units
        matches = self.volume_matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=nlp.vocab.strings['volume_unit'])
            for token in span:
                token._.feature_is_volume_unit = True
            if len(span) > 1:
                retokenizer.merge(span)
            doc.ents = list(doc.ents) + [span]
    return doc
Example #13
Source File: frequency_unit_component.py From medaCy with GNU General Public License v3.0 | 5 votes |
def __call__(self, doc):
    nlp = self.nlp
    with doc.retokenize() as retokenizer:
        # match and tag frequency indicators
        matches = self.frequency_matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=nlp.vocab.strings['frequency_indicator'])
            for token in span:
                token._.feature_is_frequency_indicator = True
            if len(span) > 1:
                retokenizer.merge(span)
            doc.ents = list(doc.ents) + [span]
    return doc
Example #14
Source File: time_unit_component.py From medaCy with GNU General Public License v3.0 | 5 votes |
def __call__(self, doc):
    nlp = self.nlp
    with doc.retokenize() as retokenizer:
        # match and tag time units
        matches = self.time_matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=nlp.vocab.strings['time_unit'])
            for token in span:
                token._.feature_is_time_unit = True
            if len(span) > 1:
                retokenizer.merge(span)
            doc.ents = list(doc.ents) + [span]
    return doc
Example #15
Source File: measurement_unit_component.py From medaCy with GNU General Public License v3.0 | 5 votes |
def __call__(self, doc):
    nlp = self.nlp
    with doc.retokenize() as retokenizer:
        # match units of measurement (x/y, etc.)
        matches = self.unit_of_measurement_matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=nlp.vocab.strings['measurement_unit'])
            for token in span:
                token._.feature_is_measurement_unit = True
            if len(span) > 1:
                retokenizer.merge(span)
            doc.ents = list(doc.ents) + [span]
    return doc
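Examples #12 through #15 share the same structure: match a pattern, wrap each match in a labeled Span, flag its tokens, merge multi-token spans, and append the span to doc.ents. The stripped-down sketch below shows that shared pattern; tag_matches, unit_matcher, and the "unit" label are hypothetical placeholders, not medaCy names.

    # Generic sketch of the pattern shared by the medaCy components above.
    from spacy.tokens import Span

    def tag_matches(doc, unit_matcher, label="unit"):
        with doc.retokenize() as retokenizer:
            for match_id, start, end in unit_matcher(doc):
                span = Span(doc, start, end, label=label)
                if len(span) > 1:
                    retokenizer.merge(span)   # merges are applied when the with-block exits
                doc.ents = list(doc.ents) + [span]
        return doc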
Example #16
Source File: lexicon_component.py From medaCy with GNU General Public License v3.0 | 5 votes |
def __call__(self, doc):
    """
    Runs a document through the lexicon component. Utilizes SpaCy's PhraseMatcher
    to find spans in the doc that match the lexicon and overlays the appropriate
    label as 'feature_is_label_from_lexicon' over all tokens in the span.
    :param doc:
    :return:
    """
    logging.debug("Called Lexicon Component")
    matcher = PhraseMatcher(self.nlp.vocab, max_length=10)
    for label in self.lexicon:
        Token.set_extension('feature_is_' + label + '_from_lexicon', default=False, force=True)
        patterns = [self.nlp.make_doc(term) for term in self.lexicon[label]]
        logging.debug(patterns)
        matcher.add(label, None, *patterns)
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = Span(doc, start, end)
        logging.debug(span)
        if span is not None:
            logging.debug('Lexicon term matched: %s Label: %s'
                          % (span.text, self.nlp.vocab.strings[match_id]))
            for token in span:
                token._.set('feature_is_' + self.nlp.vocab.strings[match_id] + '_from_lexicon', True)
    return doc
Example #17
Source File: dataset_reader.py From errudite with GNU General Public License v2.0 | 5 votes |
def _compute_span_info(self, instance: Instance, spans: Span,
                       feature_list: List[str], target: str, info_idxes):
    if target not in instance.entries:
        target_name = f'prediction(model="{target}")'
        target = 'predictions'
    else:
        target_name = target
    if len(list(np.unique(feature_list))) > 2:
        return info_idxes
    span_features = [
        get_token_feature(t, feature_list[idx]).strip() for idx, t in enumerate(spans)
    ]
    if any([not s or s in ["(", ")", ","] for s in span_features]):
        return info_idxes
    if any([f not in VBs + WHs + NNs and feature_list[idx] == 'tag'
            for idx, f in enumerate(span_features)]):
        return info_idxes
    pattern = ' '.join(span_features)
    if pattern not in info_idxes[target]:
        info_idxes[target][pattern] = defaultdict(dict)
        info_idxes[target][pattern]['cover'] = defaultdict(dict)
    if target != 'predictions':
        info_idxes[target][pattern]['cover']['total'][instance.key()] = True
    predictions = instance.get_entry('predictions') or []
    for prediction in predictions:
        model = prediction.model
        if target == 'predictions':
            if model not in target_name:
                continue
        else:
            info_idxes[target][pattern]['cover'][model][instance.key()] = True
        if instance.is_incorrect(model):
            info_idxes[target][pattern][model][instance.key()] = True
    return info_idxes
Example #18
Source File: linguistic.py From errudite with GNU General Public License v2.0 | 5 votes |
def STRING(target: Union['Target', Span]) -> str:
    """Get the raw string from a given span or target.

    Parameters
    ----------
    target : Union[Target, Span]
        The target to be converted to string.

    Returns
    -------
    str
        The string.
    """
    output = ""
    try:
        if not target:
            raise DSLValueError(f"No valid input to [ STRING ]. target: {target}")
        else:
            target = convert_list(target)
            doc = convert_doc(target)[0]
            if 'label' in target[0].__class__.__name__.lower():
                output = target[0].get_label()
            elif doc:
                output = doc.text
    except DSLValueError as e:
        raise(e)
    except Exception as e:
        ex = Exception(f"Unknown exception from [ STRING ]: {e}")
        raise(ex)
    else:
        return output
Example #19
Source File: length.py From errudite with GNU General Public License v2.0 | 5 votes |
def length(docs: Union['Target', Span, List[Union['Target', Span]]]) -> int:
    """
    The length of a given span, in tokens. If the input is a list,
    take the min length of all spans in the list.

    Parameters
    ----------
    docs : Union[Target, Span, List[Union[Target, Span]]]
        The input doc(s) for computing the length.

    Returns
    -------
    int
        The length.
    """
    output = 0
    try:
        def length_(doc):
            return len(convert_doc(doc)) if doc else 0
        if docs is None:
            raise DSLValueError(f"No valid input to [ length ]. input: {docs}")
        elif type(docs) == list and len(docs) > 0:
            output = min([length_(doc) for doc in docs])
        else:
            output = length_(docs)
    except DSLValueError as e:
        raise(e)
    except Exception as e:
        ex = Exception(f"Unknown exception from [ length ]: {e}")
        raise(ex)
    else:
        return output
Example #20
Source File: helpers.py From errudite with GNU General Public License v2.0 | 5 votes |
def spans_to_json(sentences: List[Span]) -> Dict:
    '''
    @param <Span[]> sentences: sentences in span type
    @return <Dict> json-serialized sentences
    '''
    spans = []
    for sid, sentence in enumerate(sentences):
        spans += span_to_json(sentence, sid=sid)
    return spans
Example #21
Source File: mq.py From pydata2019-nlp-system with Apache License 2.0 | 5 votes |
def default(self, obj):
    if isinstance(obj, Span):
        return str(obj)
    return json.JSONEncoder.default(self, obj)
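The excerpt above only shows the default() hook of a json.JSONEncoder subclass. A minimal usage sketch follows; the class name SpanEncoder is hypothetical, since the enclosing class is not shown in the excerpt.

    # Usage sketch: plug the encoder into json.dumps via the cls argument.
    import json
    import spacy
    from spacy.tokens import Span

    class SpanEncoder(json.JSONEncoder):
        def default(self, obj):
            # Serialize spaCy Span objects as their text; defer everything else.
            if isinstance(obj, Span):
                return str(obj)
            return json.JSONEncoder.default(self, obj)

    nlp = spacy.blank("en")
    doc = nlp("New York is a city")
    payload = {"entity": Span(doc, 0, 2)}        # Span covering "New York"
    print(json.dumps(payload, cls=SpanEncoder))  # {"entity": "New York"}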
Example #22
Source File: document.py From neuralcoref with MIT License | 5 votes |
def exact_match(self, mention2):
    """ Does the Mention lowercase text match another Mention/Span lowercase text """
    return self.lower_ == mention2.lower_
Example #23
Source File: document.py From neuralcoref with MIT License | 5 votes |
def heads_agree(self, mention2):
    """ Does the root of the Mention match the root of another Mention/Span """
    # we allow same-type NEs to not match perfectly,
    # but rather one could be included in the other, e.g., "George" -> "George Bush"
    if (
        self.in_entities
        and mention2.in_entities
        and self.entity_label == mention2.entity_label
        and (
            self.root.lower_ in mention2.lower_
            or mention2.root.lower_ in self.lower_
        )
    ):
        return True
    return self.root.lower_ == mention2.root.lower_
Example #24
Source File: document.py From neuralcoref with MIT License | 5 votes |
def _get_type(self):
    """ Find the type of the Span """
    conj = ["CC", ","]
    prp = ["PRP", "PRP$"]
    proper = ["NNP", "NNPS"]
    if any(t.tag_ in conj and t.ent_type_ not in ACCEPTED_ENTS for t in self):
        mention_type = MENTION_TYPE["LIST"]
    elif self.root.tag_ in prp:
        mention_type = MENTION_TYPE["PRONOMINAL"]
    elif self.root.ent_type_ in ACCEPTED_ENTS or self.root.tag_ in proper:
        mention_type = MENTION_TYPE["PROPER"]
    else:
        mention_type = MENTION_TYPE["NOMINAL"]
    return mention_type
Example #25
Source File: document.py From neuralcoref with MIT License | 5 votes |
def __init__(
    self,
    span,
    mention_index,
    utterance_index,
    utterances_start_sent,
    speaker=None,
    gold_label=None,
):
    """
    Arguments:
        span (spaCy Span): the spaCy span from which the Mention object is created
        mention_index (int): index of the Mention in the Document
        utterance_index (int): index of the utterance of the Mention in the Document
        utterances_start_sent (int): index of the first sentence of the utterance of the Mention
            in the Document (an utterance can comprise several sentences)
        speaker (Speaker): the speaker of the mention
        gold_label (anything): a gold label associated to the Mention (for training)
    """
    self.index = mention_index
    self.utterance_index = utterance_index
    self.utterances_sent = utterances_start_sent + self._get_doc_sent_number()
    self.speaker = speaker
    self.gold_label = gold_label
    self.spans_embeddings = None
    self.words_embeddings = None
    self.features = None
    self.features_ = None
    self.spans_embeddings_ = None
    self.words_embeddings_ = None
    self.mention_type = self._get_type()
    self.propers = set(self.content_words)
    self.entity_label = self._get_entity_label()
    self.in_entities = self._get_in_entities()
Example #26
Source File: token.py From errudite with GNU General Public License v2.0 | 4 votes |
def token(docs: Union[Span, 'Target'],
          idxes: Union[int, List[int]] = None,
          pattern: Union[str, List[str]] = None) -> Union[Span, Token]:
    """
    Get a list of tokens from the target based on idxes (sub-list) and pattern.
    Note that ``idxes`` runs before ``pattern``. That is, if the idxes exist,
    the pattern filters the idxed doc tokens.

    Parameters
    ----------
    docs : Union[Target, Span]
        The doc to be queried.
    idxes : Union[int, List[int]], optional
        Retrieve the sub-list of tokens from docs, with idx(es). By default None
    pattern : Union[str, List[str]], optional
        Used to filter and get the sub-list of spans in the doc span list.
        Pattern allows linguistic annotations and automatically detects queries
        on POS tags and entity types, in ALL CAPS. For example, ``(what, which) NOUN``
        may query all docs that have ``what NOUN`` or ``which NOUN``.
        If a list, then all the patterns in a list are "OR". By default None

    Returns
    -------
    Union[Span, Token]
        The queried sub-list.
    """
    output = []
    try:
        if not docs:
            raise DSLValueError("No input to [ token ].")
        docs_ = token_idxes(docs, idxes=idxes)
        if pattern:
            output = token_pattern(docs_, pattern)
        else:
            output = docs_
    except DSLValueError as e:
        raise(e)
    except Exception as e:
        ex = Exception(f"Unknown exception from [ token ]: {e}")
        raise(ex)
    else:
        return output
Example #27
Source File: token.py From errudite with GNU General Public License v2.0 | 4 votes |
def has_pattern(docs: Union[Doc, Span, 'Target', List],
                idxes: Union[int, List[int]] = None,
                pattern: Union[str, List[str]] = None) -> bool:
    """
    Determine whether the targeted span contains a certain pattern.

    Parameters
    ----------
    docs : Union[Target, Span]
        The doc to be queried.
    idxes : Union[int, List[int]], optional
        Retrieve the sub-list of tokens from docs, with idx(es). By default None
    pattern : Union[str, List[str]], optional
        Used to filter and get the sub-list of spans in the doc span list.
        Pattern allows linguistic annotations and automatically detects queries
        on POS tags and entity types, in ALL CAPS. For example, ``(what, which) NOUN``
        may query all docs that have "what NOUN" or "which NOUN".
        If a list, then all the patterns in a list are "OR". By default None

    Returns
    -------
    bool
        Whether the span/target has the pattern or not.
    """
    output = False
    try:
        if pattern is None:
            raise DSLValueError(f"[ {pattern} ] is not a valid pattern to [ has_pattern ].")
        else:
            tokens = token(docs, idxes=idxes, pattern=pattern)
            if type(docs) == list:
                output = any([o and length(o) > 0 for o in tokens])
            else:
                output = tokens and length(tokens) > 0
    except DSLValueError as e:
        raise(e)
    except Exception as e:
        ex = Exception(f"Unknown exception from [ has_pattern ]: {e}")
        raise(ex)
    else:
        return output
Example #28
Source File: token.py From errudite with GNU General Public License v2.0 | 4 votes |
def boundary_with(docs: Union[Span, 'Target'],
                  pattern: Union[str, List[str]],
                  direction: str = 'start') -> bool:
    """
    Determine whether the targeted span contains a certain pattern at the
    beginning or the end of the doc.
    *When using the DSL parser*, this function can be called in alternative ways,
    with ``direction`` being automatically filled in: ``[starts_with|ends_with](...)``.

    Parameters
    ----------
    docs : Union[Target, Span]
        The doc to be queried.
    pattern : Union[str, List[str]]
        The same as in `has_pattern`.
    direction : str
        Either to test the "start" or the "end" of the doc.

    Returns
    -------
    bool
        Whether the span/target starts/ends with the pattern or not.
    """
    output = False
    try:
        if pattern is None:
            raise DSLValueError(f"[ {pattern} ] is not a valid pattern to [ boundary_with ].")
        pattern = convert_list(pattern)
        pattern_arr = merge_list([parse_cmd(p).gen_pattern_list() for p in pattern])
        if type(pattern_arr) in [list, tuple]:
            while type(pattern_arr[0]) in [list, tuple]:
                pattern_arr = pattern_arr[0]
        idx_length = len(pattern_arr)
        if direction == 'start':
            idxes = [0, idx_length]
        else:
            idxes = [-idx_length - 1, 0]  # to also cover the ending cmd
        output = has_pattern(docs, idxes=idxes, pattern=pattern)
    except DSLValueError as e:
        raise(e)
    except Exception as e:
        ex = Exception(f"Unknown exception from [ {direction}s_with ]: {e}")
        raise(ex)
    else:
        return output
Example #29
Source File: token.py From errudite with GNU General Public License v2.0 | 4 votes |
def token_pattern(docs: Union[Doc, Span, 'Target', List],
                  pattern: Union[str, List[str]]) -> bool:
    output = []
    try:
        global CUR_SAVED_RULE
        if not pattern:
            # special case: just return everything
            output = docs
        else:
            if not docs:
                raise DSLValueError("No given doc to [ token_pattern ].")
            docs = convert_list(convert_doc(docs, strict_format='doc'))
            pattern = convert_list(pattern)
            pattern_key = 'pattern' + '::'.join(pattern)
            if pattern_key != CUR_SAVED_RULE:
                # define a matcher only when it's not the same rule currently used.
                patterns = merge_list([parse_cmd(p).gen_pattern_list() for p in pattern])
                if patterns:
                    if 'matcher' in matcher:
                        matcher.remove('matcher')
                    matcher.add('matcher', None, *patterns)
                    CUR_SAVED_RULE = pattern_key
            returned_spans = []
            for doc in docs:
                for _, start, end in matcher(doc):
                    returned_spans.append(doc[start:end])
            if len(returned_spans) == 1:
                output = returned_spans[0]
            if not returned_spans:
                pass
                # raise DSLValueError(f"No match found for {pattern} in {docs}.")
            else:
                output = returned_spans
    except DSLValueError as e:
        raise(e)
    except Exception as e:
        ex = Exception(f"Unknown exception from [ token_pattern ]: {e}")
        raise(ex)
    else:
        return output
Example #30
Source File: get_sentence.py From errudite with GNU General Public License v2.0 | 4 votes |
def sentence(answer: 'QAAnswer', context: 'Context', shift: Union[int, List[int]] = 0) -> Span:
    """
    *Machine Comprehension only* Get the sentence that contains a given answer.
    Shift indicates if neighboring sentences should be included.

    Parameters
    ----------
    answer : QAAnswer
        The selected answer.
    context : Context
        The context target of a given instance.
        *Automatically filled in when using the DSL parser.*
    shift : Union[int, List[int]], optional
        Shift indicates if neighboring sentences should be included, by default 0.
        If ``shift == 0``, the actual sentence is returned; if ``shift == [-2, -1, 1, 2]``,
        the four sentences surrounding the answer sentence are returned.

    Returns
    -------
    Span
        The selected sentence that contains the answer.
    """
    output = None
    try:
        if not context or context.__class__.__name__ != "Context":
            raise DSLValueError(f"Cannot retrieve the sentence, due to invalid context: [ {context} ].")
        if not answer or \
            not ("Answer" in answer.__class__.__name__ or
                 (type(answer) == list and "Answer" in answer[0].__class__.__name__)):
            raise DSLValueError(f"Cannot retrieve the sentence, due to invalid answer: [ {answer} ].")
        # only getting one sentence
        if type(answer) != list and type(shift) != list:
            output = context.get_sentence(answer.sid)
        # multiple sentences: convert both into lists
        answer = convert_list(answer)
        shift = convert_list(shift)
        sids = []
        for a in answer:
            sids += [a.sid + r for r in shift]
        sids = np.unique(sids)
        output = context.get_sentence(sids)
    except DSLValueError as e:
        raise(e)
    except Exception as e:
        ex = Exception(f"Unknown exception from [ sentence ]: {e}")
        raise(ex)
    else:
        return output