Python spacy.tokenizer() Examples
The following are 9 code examples of spacy.tokenizer(), drawn from open-source projects. You can go to the original project or source file by following the Source File reference above each example, or check out the other available functions and classes of the spacy module.
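Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of how a bare spaCy Tokenizer is typically constructed from a pipeline's shared vocabulary:

import spacy
from spacy.tokenizer import Tokenizer

# Build a blank English pipeline; the language choice drives tokenization rules.
nlp = spacy.blank("en")

# A Tokenizer constructed only from the vocab splits on whitespace by default.
tokenizer = Tokenizer(nlp.vocab)

doc = tokenizer("Hello, spaCy tokenizer!")
print([token.text for token in doc])  # ['Hello,', 'spaCy', 'tokenizer!']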
Example #1
Source File: newsroom_data_maker.py From TransferRL with MIT License
def run(entry):
    text = entry['text']
    summary = entry['summary']
    text = ' '.join([_.text for _ in tokenizer(remove_non_ascii(text))])
    summary = ' '.join([_.text for _ in tokenizer(remove_non_ascii(summary))])
    text = nlp(text)
    summary = nlp(summary)
    text = '\n'.join([' '.join([_.text for _ in s]) for s in text.sents])
    summary = '\n'.join([' '.join([_.text for _ in s]) for s in summary.sents])
    # run pre-processing
    line_text, pos_text, ner_text = pre_processing(text)
    line_summary, pos_summary, ner_summary = pre_processing(summary)
    entry['processed'] = {}
    entry['processed']['text'] = line_text
    entry['processed']['pos_text'] = pos_text
    entry['processed']['ner_text'] = ner_text
    entry['processed']['summary'] = line_summary
    entry['processed']['pos_summary'] = pos_summary
    entry['processed']['ner_summary'] = ner_summary
    entry['text'] = text.lower()
    entry['summary'] = summary.lower()
    return entry
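This example relies on module-level helpers (tokenizer, nlp, remove_non_ascii, pre_processing) defined elsewhere in newsroom_data_maker.py. A plausible setup for the first three, shown only as an assumption rather than the project's actual code, looks like this:

import spacy
from spacy.tokenizer import Tokenizer

# Assumed globals: a full English pipeline for sentence splitting plus a plain
# tokenizer over its vocab; the model name is a guess, not the project's choice.
nlp = spacy.load("en_core_web_sm")
tokenizer = Tokenizer(nlp.vocab)

def remove_non_ascii(text):
    # Assumed helper: keep only ASCII characters before tokenizing.
    return ''.join(ch for ch in text if ord(ch) < 128)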
Example #2
Source File: newsroom_data_maker.py From RLSeq2Seq with MIT License
def run(entry):
    text = entry['text']
    summary = entry['summary']
    text = ' '.join([_.text for _ in tokenizer(remove_non_ascii(text))])
    summary = ' '.join([_.text for _ in tokenizer(remove_non_ascii(summary))])
    text = nlp(text)
    summary = nlp(summary)
    text = '\n'.join([' '.join([_.text for _ in s]) for s in text.sents])
    summary = '\n'.join([' '.join([_.text for _ in s]) for s in summary.sents])
    # run pre-processing
    line_text, pos_text, ner_text = pre_processing(text)
    line_summary, pos_summary, ner_summary = pre_processing(summary)
    entry['processed'] = {}
    entry['processed']['text'] = line_text
    entry['processed']['pos_text'] = pos_text
    entry['processed']['ner_text'] = ner_text
    entry['processed']['summary'] = line_summary
    entry['processed']['pos_summary'] = pos_summary
    entry['processed']['ner_summary'] = ner_summary
    entry['text'] = text.lower()
    entry['summary'] = summary.lower()
    return entry
Example #3
Source File: nlp.py From bugbug with Mozilla Public License 2.0
def transform(self, data):
    tokenizer = Tokenizer(nlp.vocab)
    return np.array(
        [
            np.mean(
                [
                    self.model[w.text.lower()]
                    for w in words
                    if w.text.lower() in self.model
                ]
                or [np.zeros(self.dim)],
                axis=0,
            )
            for words in tokenizer.pipe(data)
        ]
    )
Example #4
Source File: nlp.py From bugbug with Mozilla Public License 2.0
def transform(self, data):
    tokenizer = Tokenizer(nlp.vocab)
    return np.array(
        [
            np.mean(
                [
                    self.model[w.text.lower()] * self.word2weight[w.text.lower()]
                    for w in words
                    if w.text.lower() in self.model
                ]
                or [np.zeros(self.dim)],
                axis=0,
            )
            for words in tokenizer.pipe(data)
        ]
    )
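Example #4 differs from Example #3 only in that each word vector is scaled by self.word2weight, a per-token weight. A hypothetical fit() that builds such weights from IDF scores (not shown in the snippet, and not necessarily how bugbug does it) could look like:

from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

def fit(self, data, y=None):
    # Fit TF-IDF over the raw strings; unseen tokens later fall back to the
    # maximum IDF so rare words are not silently down-weighted.
    tfidf = TfidfVectorizer()
    tfidf.fit(data)
    max_idf = max(tfidf.idf_)
    self.word2weight = defaultdict(
        lambda: max_idf,
        ((word, tfidf.idf_[idx]) for word, idx in tfidf.vocabulary_.items()),
    )
    return self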
Example #5
Source File: model_factory.py From anonymisation with Apache License 2.0
def get_empty_model(load_labels_for_training: bool) -> French:
    """
    Generate an empty NER model
    :rtype: object
    """
    # Important to set up the right language because it impacts the tokenizer,
    # sentence splitting, etc.
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    nlp.add_pipe(prevent_sentence_boundary_detection, name='prevent-sbd', first=True)
    ner = nlp.create_pipe('ner')
    # add labels
    if load_labels_for_training:
        for token_type in list(colors.keys()):
            ner.add_label(token_type)
    nlp.add_pipe(ner, last=True)
    return nlp
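prevent_sentence_boundary_detection is a custom pipeline component defined in the anonymisation project. The usual spaCy v2 pattern for such a component, given here as an illustrative sketch rather than the project's exact code, is:

def prevent_sentence_boundary_detection(doc):
    # Mark every token as not starting a sentence so the default sentence
    # segmentation never fires; boundaries stay under explicit control.
    for token in doc:
        token.is_sent_start = False
    return doc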
Example #6
Source File: preprocess_data.py From vampire with Apache License 2.0
def load_data(data_path: str, tokenize: bool = False,
              tokenizer_type: str = "just_spaces") -> List[str]:
    if tokenizer_type == "just_spaces":
        tokenizer = SpacyWordSplitter()
    elif tokenizer_type == "spacy":
        nlp = spacy.load('en')
        tokenizer = Tokenizer(nlp.vocab)
    tokenized_examples = []
    with tqdm(open(data_path, "r"), desc=f"loading {data_path}") as f:
        for line in f:
            if data_path.endswith(".jsonl") or data_path.endswith(".json"):
                example = json.loads(line)
            else:
                example = {"text": line}
            if tokenize:
                if tokenizer_type == 'just_spaces':
                    tokens = list(map(str, tokenizer.split_words(example['text'])))
                elif tokenizer_type == 'spacy':
                    tokens = list(map(str, tokenizer(example['text'])))
                text = ' '.join(tokens)
            else:
                text = example['text']
            tokenized_examples.append(text)
    return tokenized_examples
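A possible call site (the path is illustrative, not from the vampire repository):

# Load a JSON-lines corpus and let spaCy's tokenizer re-segment each document.
examples = load_data("data/train.jsonl", tokenize=True, tokenizer_type="spacy")
print(examples[0])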
Example #7
Source File: utils.py From text with BSD 3-Clause "New" or "Revised" License
def _spacy_tokenize(x, spacy):
    return [tok.text for tok in spacy.tokenizer(x)]
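Despite the parameter name, the second argument is a loaded spaCy pipeline (its .tokenizer attribute is called), not the spacy module. An illustrative call, assuming the small English model is installed:

import spacy

spacy_en = spacy.load("en_core_web_sm")
tokens = _spacy_tokenize("A sentence to split.", spacy_en)
# ['A', 'sentence', 'to', 'split', '.']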
Example #8
Source File: utils.py From text with BSD 3-Clause "New" or "Revised" License
def is_tokenizer_serializable(tokenizer, language):
    """Extend with other tokenizers which are found to not be serializable
    """
    if tokenizer == 'spacy':
        return False
    return True
Example #9
Source File: nlp.py From bugbug with Mozilla Public License 2.0
def __init__(self, *args, **kwargs):
    # Detect when the Spacy optional dependency is missing
    if not HAS_OPTIONAL_DEPENDENCIES:
        raise NotImplementedError(OPT_MSG_MISSING)

    super().__init__(tokenizer=spacy_token_lemmatizer, *args, **kwargs)
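spacy_token_lemmatizer is defined elsewhere in bugbug's nlp.py. A sketch of what such a lemma-returning tokenizer callable typically looks like, given as an assumption rather than the project's exact implementation:

import spacy

# Assumed pipeline; bugbug may load a different model.
nlp = spacy.load("en_core_web_sm")

def spacy_token_lemmatizer(text):
    # Truncate overly long inputs to stay under spaCy's max_length guard,
    # then return one lemma per token.
    if len(text) > nlp.max_length:
        text = text[: nlp.max_length - 1]
    doc = nlp(text)
    return [token.lemma_ for token in doc]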