Python spacy.tokens() Examples
The following are 30 code examples of the spacy.tokens module. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module spacy, or try the search function.
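Before the examples, a quick orientation: spacy.tokens exposes the Doc, Token, and Span classes that most of the snippets below operate on. The following is a minimal sketch, assuming the en_core_web_sm model is installed; the variable names are illustrative only.

import spacy
from spacy.tokens import Doc, Span, Token

nlp = spacy.load("en_core_web_sm")  # assumes the small English model is installed
doc: Doc = nlp("spaCy tokenizes text into Token objects.")

first_token: Token = doc[0]   # a single token
first_two: Span = doc[0:2]    # slicing a Doc yields a Span
print(first_token.text, first_token.lemma_, first_token.pos_)
print(first_two.text)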
Example #1
Source File: document.py From neuralcoref with MIT License | 6 votes |
def __new__(
    cls,
    span,
    mention_index,
    utterance_index,
    utterance_start_sent,
    speaker=None,
    gold_label=None,
    *args,
    **kwargs,
):
    # We need to override __new__, see
    # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
    obj = spacy.tokens.Span.__new__(
        cls, span.doc, span.start, span.end, *args, **kwargs
    )
    return obj
Example #2
Source File: spacy_annotator.py From errudite with GNU General Public License v2.0 | 6 votes |
def remove_stopwords(self,
    sentence_str: str=None,
    tokens: List[Token]=None,
    use_lemma: bool=True) -> str:
    """Function which gets a normalized string of the sentence and removes stop words

    Keyword Arguments:
        sentence_str {str} -- input sentence string (default: {None})
        tokens {List[Token]} -- pre-computed token list, with feature added (default: {None})
        use_lemma {bool} -- return the lemma or the text (default: {True})

    Returns:
        str -- the str with stopwords removed
    """
    if not tokens and sentence_str:
        #sentence_str = normalize_answer(sentence_str)
        tokens = self.model(sentence_str)
    elif not tokens:
        tokens = []  #word_tokenize(sentence_str)
    attr = 'lemma_' if use_lemma else 'text'  # what to merge
    return ' '.join([getattr(token, attr) for token in tokens
                     if not token.is_punct
                     and token.text not in STOP_WORDS
                     and token.lemma_ not in STOP_WORDS])
Example #3
Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0 | 6 votes |
def _generate_partly_censored_word(self, word: Union[str, spacy.tokens.Token], profane_word: str) -> str:
    def is_delete_or_insert(opcode):
        return opcode[0] in ('delete', 'insert')

    # noinspection PyShadowingNames
    def find_word_part(word: str, word_part: str) -> str:
        word_to_word_part_opcodes = Levenshtein.opcodes(word, word_part)
        word_part_in_word_start = (
            word_to_word_part_opcodes[0][2]
            if is_delete_or_insert(word_to_word_part_opcodes[0]) else 0)
        word_part_in_word_finish = (
            word_to_word_part_opcodes[-1][1]
            if is_delete_or_insert(word_to_word_part_opcodes[-1]) else len(word))
        return word[word_part_in_word_start:word_part_in_word_finish]

    with suppress(AttributeError):
        word = word.text
    word_part_for_censoring = find_word_part(word.lower(), profane_word)
    return regex.sub(pattern=re.escape(word_part_for_censoring),
                     repl=self._generate_fully_censored_word(word=word_part_for_censoring),
                     string=word,
                     flags=regex.IGNORECASE)
Example #4
Source File: rindex.py From semanticRetrievalMRS with MIT License | 6 votes |
def iterative_abs_save_info(debug_num=None):
    total_doc_num = init_inspect.TOTAL_NUM_DOC if debug_num is None else debug_num
    cur_count = 0
    with open(config.ABS_WIKI_FILE, 'rb') as abs_file:
        with SqliteDict(str(config.ABS_PROCESS_FOR_RINDEX_DB),
                        encode=json.dumps, decode=json.loads) as abs_rindex_db:
            for line in tqdm(abs_file, total=total_doc_num):
                item = json.loads(line)
                # print(item.keys())
                # print()
                if item['title'] in abs_rindex_db:
                    continue

                tokens, sent_offset = get_sentence_tokens(item['text'], item['charoffset'])
                poss = spacy_get_pos(tokens)
                assert len(tokens) == len(poss)
                # print(tokens)
                # print(sent_offset)

                abs_rindex_db[item['title']] = {
                    'tokens': tokens,
                    'poss': poss,
                    'sentence_offset': sent_offset
                }

                cur_count += 1
                if cur_count % 5000 == 0:
                    abs_rindex_db.commit()

            abs_rindex_db.commit()
            abs_rindex_db.close()
Example #5
Source File: rindex.py From semanticRetrievalMRS with MIT License | 6 votes |
def get_sentence_tokens(texts, charoffsets):
    whole_text = "".join(texts)
    tokens = []
    sentence_offsets = []

    start_t = 0
    end_t = 0
    for offset_list in charoffsets:
        end_t = start_t
        for start, end in offset_list:
            cur_token = whole_text[start:end]
            if len(cur_token) > 0:
                tokens.append(cur_token)
                end_t += 1
        sentence_offsets.append((start_t, end_t))
        start_t = end_t

    return tokens, sentence_offsets
Example #6
Source File: wordnet.py From gobbli with Apache License 2.0 | 6 votes |
def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
    try:
        from nltk.corpus import wordnet
        import nltk
    except ImportError:
        raise ImportError(
            "WordNet-based data augmentation requires nltk to be installed."
        )

    self.wn = wordnet

    try:
        import spacy
        from spacy.tokens import Token
    except ImportError:
        raise ImportError(
            "WordNet-based data augmentation requires spaCy and a language "
            "model to be installed (for part of speech tagging)."
        )

    if not skip_download_check:
        nltk.download("wordnet")

    self.nlp = spacy.load(spacy_model, parser=False, tagger=True, entity=False)
    Token.set_extension("replacement", default=None, force=True)
Example #7
Source File: test_spacy_featurizer.py From rasa-for-botfront with Apache License 2.0 | 6 votes |
def test_spacy_training_sample_alignment(spacy_nlp_component):
    from spacy.tokens import Doc

    m1 = Message.build(text="I have a feeling", intent="feeling")
    m2 = Message.build(text="", intent="feeling")
    m3 = Message.build(text="I am the last message", intent="feeling")
    td = TrainingData(training_examples=[m1, m2, m3])

    attribute_docs = spacy_nlp_component.docs_for_training_data(td)

    assert isinstance(attribute_docs["text"][0], Doc)
    assert isinstance(attribute_docs["text"][1], Doc)
    assert isinstance(attribute_docs["text"][2], Doc)

    assert [t.text for t in attribute_docs["text"][0]] == ["i", "have", "a", "feeling"]
    assert [t.text for t in attribute_docs["text"][1]] == []
    assert [t.text for t in attribute_docs["text"][2]] == [
        "i",
        "am",
        "the",
        "last",
        "message",
    ]
Example #8
Source File: spacy_tokenizer.py From allennlp with Apache License 2.0 | 6 votes |
def _sanitize(self, tokens: List[spacy.tokens.Token]) -> List[Token]:
    """
    Converts spaCy tokens to allennlp tokens. Is a no-op if
    keep_spacy_tokens is True
    """
    if not self._keep_spacy_tokens:
        tokens = [
            Token(
                token.text,
                token.idx,
                token.idx + len(token.text),
                token.lemma_,
                token.pos_,
                token.tag_,
                token.dep_,
                token.ent_type_,
            )
            for token in tokens
        ]
    for start_token in self._start_tokens:
        tokens.insert(0, Token(start_token, 0))
    for end_token in self._end_tokens:
        tokens.append(Token(end_token, -1))
    return tokens
Example #9
Source File: spacy_tokenizer.py From allennlp with Apache License 2.0 | 6 votes |
def __init__(
    self,
    language: str = "en_core_web_sm",
    pos_tags: bool = False,
    parse: bool = False,
    ner: bool = False,
    keep_spacy_tokens: bool = False,
    split_on_spaces: bool = False,
    start_tokens: Optional[List[str]] = None,
    end_tokens: Optional[List[str]] = None,
) -> None:
    self.spacy = get_spacy_model(language, pos_tags, parse, ner)
    if split_on_spaces:
        self.spacy.tokenizer = _WhitespaceSpacyTokenizer(self.spacy.vocab)

    self._keep_spacy_tokens = keep_spacy_tokens
    self._start_tokens = start_tokens or []
    # We reverse the tokens here because we're going to insert them with `insert(0)` later;
    # this makes sure they show up in the right order.
    self._start_tokens.reverse()
    self._end_tokens = end_tokens or []
Example #10
Source File: spacy_utils.py From rasa-for-botfront with Apache License 2.0 | 5 votes |
def process_non_content_bearing_samples(
    self, empty_samples: List[Tuple[int, Text]]
) -> List[Tuple[int, "Doc"]]:
    """Creates empty Doc-objects from zero-lengthed training samples strings."""

    from spacy.tokens import Doc

    n_docs = [
        (empty_sample[0], doc)
        for empty_sample, doc in zip(
            empty_samples, [Doc(self.nlp.vocab) for doc in empty_samples]
        )
    ]
    return n_docs
Example #11
Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0 | 5 votes |
def _replace_token(text: str, old: spacy.tokens.Token, new: str) -> str:
    return text[:old.idx] + new + text[old.idx + len(old.text):]

# noinspection PyProtectedMember
Example #12
Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0 | 5 votes |
def _save_word_with_no_profanity_inside(self, word: spacy.tokens.Token) -> None:
    if self._cache_redis is None:
        self._words_with_no_profanity_inside.add(word.text)
    else:
        self._cache_redis.sadd('_words_with_no_profanity_inside', word.text)
Example #13
Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0 | 5 votes |
def _get_censored_word(self, word: spacy.tokens.Token) -> Optional[Word]:
    if self._cache_redis is None:
        return self._censored_words.get(word.text)
    else:
        d = self._cache_redis.hgetall(word.text)
        if not d:
            return None
        uncensored, censored, original_profane_word = (
            d[b'uncensored'], d[b'censored'], d[b'original_profane_word'])
        if not original_profane_word:
            original_profane_word = None
        return Word(uncensored=uncensored, censored=censored,
                    original_profane_word=original_profane_word)
Example #14
Source File: util.py From scispacy with Apache License 2.0 | 5 votes |
def __call__(self, text):
    words = text.split(" ")
    # All tokens 'own' a subsequent space character in
    # this tokenizer. This is a technicality and probably
    # not that interesting.
    spaces = [True] * len(words)
    return Doc(self.vocab, words=words, spaces=spaces)
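Example #14 (like Examples #19 and #24-#26 below) shows only the __call__ method of a custom whitespace tokenizer; the enclosing class is not part of the snippet. The following is a minimal sketch of how such a tokenizer might be wrapped in a class and attached to a pipeline; the class name WhitespaceTokenizer is illustrative, not from the source file, and the en_core_web_sm model is assumed to be installed.

import spacy
from spacy.tokens import Doc


class WhitespaceTokenizer:
    """Illustrative wrapper: split on single spaces; each token 'owns' a trailing space."""

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)


nlp = spacy.load("en_core_web_sm")            # assumes the small English model is installed
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)  # replace the default tokenizer
doc = nlp("pre-tokenized text stays whitespace-split")
print([t.text for t in doc])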
Example #15
Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0 | 5 votes |
def _keep_only_letters_or_dictionary_word(self, language: Language,
                                          word: Union[str, spacy.tokens.Token]) -> str:
    with suppress(AttributeError):
        word = word.text
    if language is None:
        language = self.languages[0]
    if AnalysisType.DEEP in self.analyses and self._is_dictionary_word(language=language, word=word):
        return word
    else:
        return ''.join(regex.findall(r'\p{letter}', word))
Example #16
Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0 | 5 votes |
def _parse(self, language: Language, text: str,
           use_profanity_filter: bool = True) -> spacy.tokens.Doc:
    nlp = self._get_nlp(language)
    return spacy_utlis.parse(nlp=nlp, text=text, language=language,
                             use_profanity_filter=use_profanity_filter)
Example #17
Source File: nlp.py From armchair-expert with MIT License | 5 votes |
def create_nlp_instance():
    import spacy
    from spacymoji import Emoji

    nlp = spacy.load('en')
    emoji_pipe = Emoji(nlp)
    nlp.add_pipe(emoji_pipe, first=True)

    # Merge hashtag tokens which were split by spacy
    def hashtag_pipe(doc):
        merged_hashtag = False
        while True:
            for token_index, token in enumerate(doc):
                if token.text == '#':
                    if token.head is not None:
                        start_index = token.idx
                        end_index = start_index + len(token.head.text) + 1
                        if doc.merge(start_index, end_index) is not None:
                            merged_hashtag = True
                            break
            if not merged_hashtag:
                break
            merged_hashtag = False
        return doc

    nlp.add_pipe(hashtag_pipe)
    return nlp
Example #18
Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0 | 5 votes |
def _generate_fully_censored_word(self, word: Union[str, spacy.tokens.Token]) -> str:
    with suppress(AttributeError):
        word = word.text
    return len(word) * self.censor_char
Example #19
Source File: spacy_wrapper.py From supervised-oie with MIT License | 5 votes |
def __call__(self, text):
    """
    Call this tokenizer - just split based on space
    """
    words = re.split(r' +', text)  # Allow arbitrary number of spaces
    # All tokens 'own' a subsequent space character in this tokenizer
    spaces = [True] * len(words)
    return Doc(self.vocab, words=words, spaces=spaces)
Example #20
Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0 | 5 votes |
def _make_spacy_token(self, language: Language, word: str) -> spacy.tokens.Token:
    return spacy_utlis.make_token(nlp=self._get_nlp(language), word=word)
Example #21
Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0 | 5 votes |
def censor_word(self, word: Union[str, spacy.tokens.Token], language: Language = None) -> Word:
    """Returns censored word"""
    word = self._make_spacy_token(language=language, word=word)
    return self._censor_word(language=language, word=word)
Example #22
Source File: test_gold_annotator_component.py From medaCy with GNU General Public License v3.0 | 5 votes |
def test_overlays_annotations(self):
    """
    Tests that this pipeline component adds the correct labels.

    Note that this only tests that at least one instance of each label is overlayed
    because the number of tokens that receive the label varies based on the tokenizer.
    """
    sample_file = sample_dataset.data_files[0]
    txt_file_path = sample_file.txt_path
    ann_file_path = sample_file.ann_path

    with open(txt_file_path) as f:
        text = f.read()

    doc: Doc = self.nlp(text)

    doc.set_extension('file_name', default=None, force=True)
    doc._.file_name = txt_file_path
    doc.set_extension('gold_annotation_file', default=None, force=True)
    doc._.gold_annotation_file = ann_file_path

    ann = Annotations(ann_file_path)
    labels = ann.get_labels()

    gold_annotator = GoldAnnotatorOverlayer(self.nlp, list(labels))
    doc = gold_annotator(doc)

    overlayed_labels = {t._.gold_label for t in doc}
    overlayed_labels.remove('O')

    self.assertSetEqual(overlayed_labels, labels)
Example #23
Source File: wordnet.py From kb with Apache License 2.0 | 5 votes |
def __call__(self, text: str) -> List[Token]:
    spacy_doc = self.nlp(text)

    # create allennlp tokens
    normalized_tokens = [
        Token(spacy_token.text,
              pos_=self.spacy_to_wordnet_map.get(spacy_token.pos_, spacy_token.pos_),
              lemma_=spacy_token.lemma_)
        for spacy_token in spacy_doc
        if not spacy_token.is_space
    ]

    return normalized_tokens
Example #24
Source File: common.py From kb with Apache License 2.0 | 5 votes |
def __call__(self, text):
    words = text.split(' ')
    # All tokens 'own' a subsequent space character in this tokenizer
    spaces = [True] * len(words)
    return Doc(self.vocab, words=words, spaces=spaces)
Example #25
Source File: spacy_annotator.py From errudite with GNU General Public License v2.0 | 5 votes |
def __call__(self, text):
    words = text.split(' ')
    # All tokens 'own' a subsequent space character in this tokenizer
    spaces = [True] * len(words)
    return Doc(self.vocab, words=words, spaces=spaces)
Example #26
Source File: annotate_graphs.py From RL-based-Graph2Seq-for-NQG with Apache License 2.0 | 5 votes |
def __call__(self, text):
    words = text.split(' ')
    # All tokens 'own' a subsequent space character in this tokenizer
    spaces = [True] * len(words)
    return Doc(self.vocab, words=words, spaces=spaces)
Example #27
Source File: test_extract_triples.py From ravestate with BSD 3-Clause "New" or "Revised" License | 5 votes |
def spacy_model():
    nlp = spacy_nlp_en
    from spacy.tokens import Doc
    if not Doc.has_extension('triples'):
        Doc.set_extension('triples', getter=extract_triples)
    return nlp
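For context on Example #27: once a getter extension named 'triples' is registered on Doc, it is read through the underscore namespace and computed on each access. The sketch below is hypothetical usage; spacy_model and extract_triples are assumed to come from the ravestate test module and are not defined here.

# Hypothetical usage of the fixture above.
nlp = spacy_model()
doc = nlp("Alice gives Bob a book.")
print(doc._.triples)  # the getter calls extract_triples(doc) on access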
Example #28
Source File: rindex.py From semanticRetrievalMRS with MIT License | 5 votes |
def iterative_abs_save_random_batching(batch_size=10000):
    total_doc_num = init_inspect.TOTAL_NUM_DOC

    with open(config.ABS_WIKI_FILE, 'rb') as abs_file:
        lines = []
        for line in tqdm(abs_file, total=total_doc_num):
            lines.append(line)
            # if len(lines) == 100000:
            #     break

    random_per = range(len(lines))
    # random_per = np.random.permutation(len(lines))
    # random.shuffle(lines)

    # existing_title_set = set()
    batch_list = []

    with SqliteDict(str(config.ABS_PROCESS_FOR_RINDEX_DB),
                    encode=json.dumps, decode=json.loads) as abs_rindex_db:
        for index in tqdm(random_per):
            item = json.loads(lines[index])
            # print(item.keys())
            # print()
            if item['title'] in abs_rindex_db:
                continue

            tokens, sent_offset = get_sentence_tokens(item['text'], item['charoffset'])
            poss = spacy_get_pos(tokens)
            assert len(tokens) == len(poss)
            # print(tokens)
            # print(sent_offset)

            rindex_item = {
                'tokens': tokens,
                'poss': poss,
                'sentence_offset': sent_offset
            }
            batch_list.append((item['title'], rindex_item))

            if len(batch_list) == batch_size:
                for title, rindex_item in batch_list:
                    abs_rindex_db[title] = rindex_item
                abs_rindex_db.commit()
                batch_list = []

        # Commit last one
        for title, rindex_item in batch_list:
            abs_rindex_db[title] = rindex_item
        abs_rindex_db.commit()
        abs_rindex_db.close()
Example #29
Source File: rindex.py From semanticRetrievalMRS with MIT License | 5 votes |
def iterative_abs(debug_num=None):
    total_doc_num = init_inspect.TOTAL_NUM_DOC if debug_num is None else debug_num
    cur_count = 0
    with open(config.ABS_WIKI_FILE, 'rb') as abs_file:
        for line in tqdm(abs_file, total=total_doc_num):
            item = json.loads(line)
            # print(item.keys())
            # print()
            tokens, sent_offset = get_sentence_tokens(item['text'], item['charoffset'])
            poss = spacy_get_pos(tokens)
            assert len(tokens) == len(poss)
            print(tokens)
            print(sent_offset)
            # print(poss)
Example #30
Source File: science_ie_data_utils.py From sciwing with MIT License | 5 votes |
def _form_ann_line(
    idx: str,
    char_offset: Tuple[int, int, str],
    tag_name: str,
    doc: spacy.tokens.doc.Doc,
):
    """Forms an ann line that can be used to write the ANN files for CoNLL format

    Parameters
    ----------
    idx : int
        The index for the entity being written
    char_offset : int
        The start, end, tag for the line
    tag_name : str
        The tag to be used and is one of ``[Task, Process, Material]``
    doc : str
        Spacy doc to query the appropriate characters

    Returns
    -------
    str
        An ANN line that is formed.
    """
    start_offset, end_offset, entity_type = char_offset
    surface_form = doc.char_span(start_offset, end_offset).text
    start_offset = str(start_offset)
    end_offset = str(end_offset)
    ann_line = " ".join([start_offset, end_offset])
    ann_line = "\t".join([ann_line, surface_form])
    ann_line = " ".join([tag_name, ann_line])
    ann_line = "\t".join([f"T{idx}", ann_line])
    return ann_line