Python spacy.tokenizer() Examples
The following are 9 code examples of spacy.tokenizer(), drawn from open-source projects. You can go to the original project or source file by following the Source File reference above each example, or check out the other available functions and classes of the spacy module.
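Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of how a bare spaCy Tokenizer is typically constructed from a pipeline's shared vocabulary:

import spacy
from spacy.tokenizer import Tokenizer

# Build a blank English pipeline; the language choice drives tokenization rules.
nlp = spacy.blank("en")

# A Tokenizer constructed only from the vocab splits on whitespace by default.
tokenizer = Tokenizer(nlp.vocab)

doc = tokenizer("Hello, spaCy tokenizer!")
print([token.text for token in doc])  # ['Hello,', 'spaCy', 'tokenizer!']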
Example #1
Source File: newsroom_data_maker.py From TransferRL with MIT License
def run(entry):
    text = entry['text']
    summary = entry['summary']
    text = ' '.join([_.text for _ in tokenizer(remove_non_ascii(text))])
    summary = ' '.join([_.text for _ in tokenizer(remove_non_ascii(summary))])
    text = nlp(text)
    summary = nlp(summary)
    text = '\n'.join([' '.join([_.text for _ in s]) for s in text.sents])
    summary = '\n'.join([' '.join([_.text for _ in s]) for s in summary.sents])
    # run pre-processing
    line_text, pos_text, ner_text = pre_processing(text)
    line_summary, pos_summary, ner_summary = pre_processing(summary)
    entry['processed'] = {}
    entry['processed']['text'] = line_text
    entry['processed']['pos_text'] = pos_text
    entry['processed']['ner_text'] = ner_text
    entry['processed']['summary'] = line_summary
    entry['processed']['pos_summary'] = pos_summary
    entry['processed']['ner_summary'] = ner_summary
    entry['text'] = text.lower()
    entry['summary'] = summary.lower()
    return entry
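This example relies on module-level helpers (tokenizer, nlp, remove_non_ascii, pre_processing) defined elsewhere in newsroom_data_maker.py. A plausible setup for the first three, shown only as an assumption rather than the project's actual code, looks like this:

import spacy
from spacy.tokenizer import Tokenizer

# Assumed globals: a full English pipeline for sentence splitting plus a plain
# tokenizer over its vocab; the model name is a guess, not the project's choice.
nlp = spacy.load("en_core_web_sm")
tokenizer = Tokenizer(nlp.vocab)

def remove_non_ascii(text):
    # Assumed helper: keep only ASCII characters before tokenizing.
    return ''.join(ch for ch in text if ord(ch) < 128)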
Example #2
Source File: newsroom_data_maker.py From RLSeq2Seq with MIT License
def run(entry):
    text = entry['text']
    summary = entry['summary']
    text = ' '.join([_.text for _ in tokenizer(remove_non_ascii(text))])
    summary = ' '.join([_.text for _ in tokenizer(remove_non_ascii(summary))])
    text = nlp(text)
    summary = nlp(summary)
    text = '\n'.join([' '.join([_.text for _ in s]) for s in text.sents])
    summary = '\n'.join([' '.join([_.text for _ in s]) for s in summary.sents])
    # run pre-processing
    line_text, pos_text, ner_text = pre_processing(text)
    line_summary, pos_summary, ner_summary = pre_processing(summary)
    entry['processed'] = {}
    entry['processed']['text'] = line_text
    entry['processed']['pos_text'] = pos_text
    entry['processed']['ner_text'] = ner_text
    entry['processed']['summary'] = line_summary
    entry['processed']['pos_summary'] = pos_summary
    entry['processed']['ner_summary'] = ner_summary
    entry['text'] = text.lower()
    entry['summary'] = summary.lower()
    return entry
Example #3
Source File: nlp.py From bugbug with Mozilla Public License 2.0
def transform(self, data):
    tokenizer = Tokenizer(nlp.vocab)
    return np.array(
        [
            np.mean(
                [
                    self.model[w.text.lower()]
                    for w in words
                    if w.text.lower() in self.model
                ]
                or [np.zeros(self.dim)],
                axis=0,
            )
            for words in tokenizer.pipe(data)
        ]
    )
Example #4
Source File: nlp.py From bugbug with Mozilla Public License 2.0
def transform(self, data):
    tokenizer = Tokenizer(nlp.vocab)
    return np.array(
        [
            np.mean(
                [
                    self.model[w.text.lower()] * self.word2weight[w.text.lower()]
                    for w in words
                    if w.text.lower() in self.model
                ]
                or [np.zeros(self.dim)],
                axis=0,
            )
            for words in tokenizer.pipe(data)
        ]
    )
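Example #4 differs from Example #3 only in that each word vector is scaled by self.word2weight, a per-token weight. A hypothetical fit() that builds such weights from IDF scores (not shown in the snippet, and not necessarily how bugbug does it) could look like:

from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

def fit(self, data, y=None):
    # Fit TF-IDF over the raw strings; unseen tokens later fall back to the
    # maximum IDF so rare words are not silently down-weighted.
    tfidf = TfidfVectorizer()
    tfidf.fit(data)
    max_idf = max(tfidf.idf_)
    self.word2weight = defaultdict(
        lambda: max_idf,
        ((word, tfidf.idf_[idx]) for word, idx in tfidf.vocabulary_.items()),
    )
    return self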
Example #5
Source File: model_factory.py From anonymisation with Apache License 2.0
def get_empty_model(load_labels_for_training: bool) -> French:
    """
    Generate an empty NER model
    :rtype: object
    """
    # Important to set up the right language because it impacts the tokenizer,
    # sentence splitting, etc.
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    nlp.add_pipe(prevent_sentence_boundary_detection, name='prevent-sbd', first=True)
    ner = nlp.create_pipe('ner')
    # add labels
    if load_labels_for_training:
        for token_type in list(colors.keys()):
            ner.add_label(token_type)
    nlp.add_pipe(ner, last=True)
    return nlp
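prevent_sentence_boundary_detection is a custom pipeline component defined in the anonymisation project. The usual spaCy v2 pattern for such a component, given here as an illustrative sketch rather than the project's exact code, is:

def prevent_sentence_boundary_detection(doc):
    # Mark every token as not starting a sentence so the default sentence
    # segmentation never fires; boundaries stay under explicit control.
    for token in doc:
        token.is_sent_start = False
    return doc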
Example #6
Source File: preprocess_data.py From vampire with Apache License 2.0
def load_data(data_path: str, tokenize: bool = False,
              tokenizer_type: str = "just_spaces") -> List[str]:
    if tokenizer_type == "just_spaces":
        tokenizer = SpacyWordSplitter()
    elif tokenizer_type == "spacy":
        nlp = spacy.load('en')
        tokenizer = Tokenizer(nlp.vocab)
    tokenized_examples = []
    with tqdm(open(data_path, "r"), desc=f"loading {data_path}") as f:
        for line in f:
            if data_path.endswith(".jsonl") or data_path.endswith(".json"):
                example = json.loads(line)
            else:
                example = {"text": line}
            if tokenize:
                if tokenizer_type == 'just_spaces':
                    tokens = list(map(str, tokenizer.split_words(example['text'])))
                elif tokenizer_type == 'spacy':
                    tokens = list(map(str, tokenizer(example['text'])))
                text = ' '.join(tokens)
            else:
                text = example['text']
            tokenized_examples.append(text)
    return tokenized_examples
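A possible call site (the path is illustrative, not from the vampire repository):

# Load a JSON-lines corpus and let spaCy's tokenizer re-segment each document.
examples = load_data("data/train.jsonl", tokenize=True, tokenizer_type="spacy")
print(examples[0])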
Example #7
Source File: utils.py From text with BSD 3-Clause "New" or "Revised" License
def _spacy_tokenize(x, spacy):
    return [tok.text for tok in spacy.tokenizer(x)]
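Despite the parameter name, the second argument is a loaded spaCy pipeline (its .tokenizer attribute is called), not the spacy module. An illustrative call, assuming the small English model is installed:

import spacy

spacy_en = spacy.load("en_core_web_sm")
tokens = _spacy_tokenize("A sentence to split.", spacy_en)
# ['A', 'sentence', 'to', 'split', '.']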
Example #8
Source File: utils.py From text with BSD 3-Clause "New" or "Revised" License
def is_tokenizer_serializable(tokenizer, language):
    """Extend with other tokenizers which are found to not be serializable
    """
    if tokenizer == 'spacy':
        return False
    return True
Example #9
Source File: nlp.py From bugbug with Mozilla Public License 2.0
def __init__(self, *args, **kwargs):
    # Detect when the Spacy optional dependency is missing
    if not HAS_OPTIONAL_DEPENDENCIES:
        raise NotImplementedError(OPT_MSG_MISSING)

    super().__init__(tokenizer=spacy_token_lemmatizer, *args, **kwargs)
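spacy_token_lemmatizer is defined elsewhere in bugbug's nlp.py. A sketch of what such a lemma-returning tokenizer callable typically looks like, given as an assumption rather than the project's exact implementation:

import spacy

# Assumed pipeline; bugbug may load a different model.
nlp = spacy.load("en_core_web_sm")

def spacy_token_lemmatizer(text):
    # Truncate overly long inputs to stay under spaCy's max_length guard,
    # then return one lemma per token.
    if len(text) > nlp.max_length:
        text = text[: nlp.max_length - 1]
    doc = nlp(text)
    return [token.lemma_ for token in doc]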