Python spacy.tokens Examples

The following are 30 code examples of the spacy.tokens module (Doc, Span and Token). You can go to the original project or source file by following the attribution line above each example. You may also want to check out all other available functions and classes of the spacy package.
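For orientation, here is a minimal sketch of the classes the spacy.tokens module provides (Doc, Span and Token). It assumes the en_core_web_sm model has been installed separately and is not taken from any of the projects below.

import spacy
from spacy.tokens import Doc, Span, Token

nlp = spacy.load("en_core_web_sm")
doc: Doc = nlp("Apple is looking at buying a U.K. startup.")

token: Token = doc[0]   # a single Token
span: Span = doc[0:2]   # a Span is a view over a slice of the Doc
print(token.text, token.pos_, token.lemma_)
print(span.text)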
Example #1
Source File: document.py    From neuralcoref with MIT License
def __new__(
        cls,
        span,
        mention_index,
        utterance_index,
        utterance_start_sent,
        speaker=None,
        gold_label=None,
        *args,
        **kwargs,
    ):
        # We need to override __new__ see http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
        obj = spacy.tokens.Span.__new__(
            cls, span.doc, span.start, span.end, *args, **kwargs
        )
        return obj 
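For context, the override above delegates to spacy.tokens.Span.__new__ because Span is a Cython extension type. A plain Span can be constructed the same way directly; a minimal sketch, assuming en_core_web_sm is installed:

import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
doc = nlp("Emma said she would come.")
span = Span(doc, 0, 1)                  # same constructor the subclass above delegates to
print(span.text, span.start, span.end)  # Emma 0 1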
Example #2
Source File: spacy_annotator.py    From errudite with GNU General Public License v2.0
def remove_stopwords(self, sentence_str: str=None, tokens: List[Token]=None, use_lemma: bool=True) -> str:
        """Function which gets a normalized string of the sentence and removes stop words
        
        Keyword Arguments:
            sentence_str {str} -- input sentence string (default: {None})
            tokens {List[Token]} -- pre-computed token list, with feature added (default: {None})
            use_lemma {bool} -- return the lemma or the text (default: {True})
        
        Returns:
            str -- the str with stopwords removed
        """
        if not tokens and sentence_str:
            #sentence_str = normalize_answer(sentence_str)
            tokens = self.model(sentence_str)
        elif not tokens:
            tokens = []
        #word_tokenize(sentence_str)
        attr = 'lemma_' if use_lemma else 'text' # what to merge
        return ' '.join([ getattr(token, attr) for token in tokens
            if not token.is_punct and token.text not in STOP_WORDS and token.lemma_ not in STOP_WORDS]) 
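A hedged, standalone variant of the same idea for trying it outside the annotator class; the STOP_WORDS import here comes from spaCy itself and may differ from the project's own list:

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")

def remove_stopwords(sentence: str, use_lemma: bool = True) -> str:
    attr = "lemma_" if use_lemma else "text"
    return " ".join(
        getattr(t, attr) for t in nlp(sentence)
        if not t.is_punct and t.text not in STOP_WORDS and t.lemma_ not in STOP_WORDS
    )

print(remove_stopwords("The cats are sitting on the mat."))  # e.g. "cat sit mat"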
Example #3
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0
def _generate_partly_censored_word(self, word: Union[str, spacy.tokens.Token], profane_word: str) -> str:
        def is_delete_or_insert(opcode):
            return opcode[0] in ('delete', 'insert')

        # noinspection PyShadowingNames
        def find_word_part(word: str, word_part: str) -> str:
            word_to_word_part_opcodes = Levenshtein.opcodes(word, word_part)
            word_part_in_word_start = (
                word_to_word_part_opcodes[0][2] if is_delete_or_insert(word_to_word_part_opcodes[0]) else 0)
            word_part_in_word_finish = (
                word_to_word_part_opcodes[-1][1] if is_delete_or_insert(word_to_word_part_opcodes[-1]) else len(word))
            return word[word_part_in_word_start:word_part_in_word_finish]

        with suppress(AttributeError):
            word = word.text

        word_part_for_censoring = find_word_part(word.lower(), profane_word)
        return regex.sub(pattern=re.escape(word_part_for_censoring),
                         repl=self._generate_fully_censored_word(word=word_part_for_censoring),
                         string=word,
                         flags=regex.IGNORECASE) 
Example #4
Source File: rindex.py    From semanticRetrievalMRS with MIT License
def iterative_abs_save_info(debug_num=None):
    total_doc_num = init_inspect.TOTAL_NUM_DOC if debug_num is None else debug_num
    cur_count = 0

    with open(config.ABS_WIKI_FILE, 'rb') as abs_file:
        with SqliteDict(str(config.ABS_PROCESS_FOR_RINDEX_DB), encode=json.dumps, decode=json.loads) as abs_rindex_db:
            for line in tqdm(abs_file, total=total_doc_num):
                item = json.loads(line)
                # print(item.keys())
                # print()
                if item['title'] in abs_rindex_db:
                    continue

                tokens, sent_offset = get_sentence_tokens(item['text'], item['charoffset'])
                poss = spacy_get_pos(tokens)
                assert len(tokens) == len(poss)
                # print(tokens)
                # print(sent_offset)
                abs_rindex_db[item['title']] = {
                    'tokens': tokens,
                    'poss': poss,
                    'sentence_offset': sent_offset
                }
                cur_count += 1

                if cur_count % 5000 == 0:
                    abs_rindex_db.commit()

            abs_rindex_db.commit()
            abs_rindex_db.close() 
Example #5
Source File: rindex.py    From semanticRetrievalMRS with MIT License
def get_sentence_tokens(texts, charoffsets):
    whole_text = "".join(texts)
    tokens = []
    sentence_offsets = []

    start_t = 0
    end_t = 0
    for offset_list in charoffsets:
        end_t = start_t
        for start, end in offset_list:
            cur_token = whole_text[start:end]
            if len(cur_token) > 0:
                tokens.append(cur_token)
                end_t += 1
        sentence_offsets.append((start_t, end_t))
        start_t = end_t
    return tokens, sentence_offsets 
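A self-contained illustration of the expected inputs and outputs (the real texts and charoffsets come from the project's preprocessed Wikipedia dump; the values here are made up but consistent with the function):

texts = ["Alan Turing was a pioneer. ", "He worked at Bletchley Park."]
charoffsets = [
    [(0, 4), (5, 11), (12, 15), (16, 17), (18, 25), (25, 26)],
    [(27, 29), (30, 36), (37, 39), (40, 49), (50, 54), (54, 55)],
]
tokens, sentence_offsets = get_sentence_tokens(texts, charoffsets)
# tokens           -> ['Alan', 'Turing', 'was', 'a', 'pioneer', '.',
#                      'He', 'worked', 'at', 'Bletchley', 'Park', '.']
# sentence_offsets -> [(0, 6), (6, 12)]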
Example #6
Source File: wordnet.py    From gobbli with Apache License 2.0
def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
        try:
            from nltk.corpus import wordnet
            import nltk
        except ImportError:
            raise ImportError(
                "WordNet-based data augmentation requires nltk to be installed."
            )

        self.wn = wordnet

        try:
            import spacy
            from spacy.tokens import Token
        except ImportError:
            raise ImportError(
                "WordNet-based data augmentation requires spaCy and a language "
                "model to be installed (for part of speech tagging)."
            )

        if not skip_download_check:
            nltk.download("wordnet")

        self.nlp = spacy.load(spacy_model, parser=False, tagger=True, entity=False)
        Token.set_extension("replacement", default=None, force=True) 
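The custom attribute registered above is then read and written through the underscore namespace. A minimal sketch, assuming an nlp pipeline has been loaded as in the constructor above:

from spacy.tokens import Token

Token.set_extension("replacement", default=None, force=True)
doc = nlp("The quick brown fox")       # nlp as loaded in __init__ above
doc[1]._.replacement = "speedy"        # write the custom attribute
print([t._.replacement for t in doc])  # [None, 'speedy', None, None]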
Example #7
Source File: test_spacy_featurizer.py    From rasa-for-botfront with Apache License 2.0
def test_spacy_training_sample_alignment(spacy_nlp_component):
    from spacy.tokens import Doc

    m1 = Message.build(text="I have a feeling", intent="feeling")
    m2 = Message.build(text="", intent="feeling")
    m3 = Message.build(text="I am the last message", intent="feeling")
    td = TrainingData(training_examples=[m1, m2, m3])

    attribute_docs = spacy_nlp_component.docs_for_training_data(td)

    assert isinstance(attribute_docs["text"][0], Doc)
    assert isinstance(attribute_docs["text"][1], Doc)
    assert isinstance(attribute_docs["text"][2], Doc)

    assert [t.text for t in attribute_docs["text"][0]] == ["i", "have", "a", "feeling"]
    assert [t.text for t in attribute_docs["text"][1]] == []
    assert [t.text for t in attribute_docs["text"][2]] == [
        "i",
        "am",
        "the",
        "last",
        "message",
    ] 
Example #8
Source File: spacy_tokenizer.py    From allennlp with Apache License 2.0
def _sanitize(self, tokens: List[spacy.tokens.Token]) -> List[Token]:
        """
        Converts spaCy tokens to allennlp tokens. Is a no-op if
        keep_spacy_tokens is True
        """
        if not self._keep_spacy_tokens:
            tokens = [
                Token(
                    token.text,
                    token.idx,
                    token.idx + len(token.text),
                    token.lemma_,
                    token.pos_,
                    token.tag_,
                    token.dep_,
                    token.ent_type_,
                )
                for token in tokens
            ]
        for start_token in self._start_tokens:
            tokens.insert(0, Token(start_token, 0))
        for end_token in self._end_tokens:
            tokens.append(Token(end_token, -1))
        return tokens 
Example #9
Source File: spacy_tokenizer.py    From allennlp with Apache License 2.0
def __init__(
        self,
        language: str = "en_core_web_sm",
        pos_tags: bool = False,
        parse: bool = False,
        ner: bool = False,
        keep_spacy_tokens: bool = False,
        split_on_spaces: bool = False,
        start_tokens: Optional[List[str]] = None,
        end_tokens: Optional[List[str]] = None,
    ) -> None:
        self.spacy = get_spacy_model(language, pos_tags, parse, ner)
        if split_on_spaces:
            self.spacy.tokenizer = _WhitespaceSpacyTokenizer(self.spacy.vocab)

        self._keep_spacy_tokens = keep_spacy_tokens
        self._start_tokens = start_tokens or []
        # We reverse the tokens here because we're going to insert them with `insert(0)` later;
        # this makes sure they show up in the right order.
        self._start_tokens.reverse()
        self._end_tokens = end_tokens or [] 
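A hedged sketch of how this tokenizer is typically used through AllenNLP's Tokenizer interface; treat the import path and constructor arguments as assumptions tied to allennlp 1.x:

from allennlp.data.tokenizers import SpacyTokenizer  # import path is an assumption

tokenizer = SpacyTokenizer(pos_tags=True)
tokens = tokenizer.tokenize("AllenNLP wraps spaCy tokens.")
print([(t.text, t.pos_) for t in tokens])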
Example #10
Source File: spacy_utils.py    From rasa-for-botfront with Apache License 2.0
def process_non_content_bearing_samples(
        self, empty_samples: List[Tuple[int, Text]]
    ) -> List[Tuple[int, "Doc"]]:
        """Creates empty Doc-objects from zero-lengthed training samples strings."""

        from spacy.tokens import Doc

        n_docs = [
            (empty_sample[0], doc)
            for empty_sample, doc in zip(
                empty_samples, [Doc(self.nlp.vocab) for doc in empty_samples]
            )
        ]
        return n_docs 
Example #11
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0
def _replace_token(text: str, old: spacy.tokens.Token, new: str) -> str:
        return text[:old.idx] + new + text[old.idx + len(old.text):]

Example #12
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0
def _save_word_with_no_profanity_inside(self, word: spacy.tokens.Token) -> None:
        if self._cache_redis is None:
            self._words_with_no_profanity_inside.add(word.text)
        else:
            self._cache_redis.sadd('_words_with_no_profanity_inside', word.text) 
Example #13
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0
def _get_censored_word(self, word: spacy.tokens.Token) -> Optional[Word]:
        if self._cache_redis is None:
            return self._censored_words.get(word.text)
        else:
            d = self._cache_redis.hgetall(word.text)
            if not d:
                return None
            uncensored, censored, original_profane_word = d[b'uncensored'], d[b'censored'], d[b'original_profane_word']
            if not original_profane_word:
                original_profane_word = None
            return Word(uncensored=uncensored, censored=censored, original_profane_word=original_profane_word) 
Example #14
Source File: util.py    From scispacy with Apache License 2.0
def __call__(self, text):
        words = text.split(" ")
        # All tokens 'own' a subsequent space character in
        # this tokenizer. This is a technicality and probably
        # not that interesting.
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces) 
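Tokenizers like this one are wired in by assigning them to nlp.tokenizer. A minimal sketch of that wiring; the class name here is illustrative, not the project's:

import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)  # replace the default tokenizer
doc = nlp("do not split-this token")
print([t.text for t in doc])                    # ['do', 'not', 'split-this', 'token']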
Example #15
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0
def _keep_only_letters_or_dictionary_word(self, language: Language, word: Union[str, spacy.tokens.Token]) -> str:
        with suppress(AttributeError):
            word = word.text
        if language is None:
            language = self.languages[0]
        if AnalysisType.DEEP in self.analyses and self._is_dictionary_word(language=language, word=word):
            return word
        else:
            return ''.join(regex.findall(r'\p{letter}', word)) 
Example #16
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0
def _parse(self,
               language: Language,
               text: str,
               use_profanity_filter: bool = True) -> spacy.tokens.Doc:
        nlp = self._get_nlp(language)
        return spacy_utlis.parse(nlp=nlp, text=text, language=language, use_profanity_filter=use_profanity_filter) 
Example #17
Source File: nlp.py    From armchair-expert with MIT License
def create_nlp_instance():
    import spacy
    from spacymoji import Emoji

    nlp = spacy.load('en')
    emoji_pipe = Emoji(nlp)
    nlp.add_pipe(emoji_pipe, first=True)

    # Merge hashtag tokens which were split by spacy
    def hashtag_pipe(doc):
        merged_hashtag = False
        while True:
            for token_index, token in enumerate(doc):
                if token.text == '#':
                    if token.head is not None:
                        start_index = token.idx
                        end_index = start_index + len(token.head.text) + 1
                        if doc.merge(start_index, end_index) is not None:
                            merged_hashtag = True
                            break
            if not merged_hashtag:
                break
            merged_hashtag = False
        return doc

    nlp.add_pipe(hashtag_pipe)
    return nlp 
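Doc.merge, used above, was removed in later spaCy releases in favour of the retokenizer context manager. A simplified sketch of the same hashtag merge with doc.retokenize(); unlike the original it merges '#' with the following token rather than with token.head, and it does not handle adjacent '#' tokens:

def merge_hashtags(doc):
    # Merges are collected and applied when the context manager exits.
    with doc.retokenize() as retokenizer:
        for token in doc[:-1]:
            if token.text == "#":
                retokenizer.merge(doc[token.i:token.i + 2])
    return doc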
Example #18
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0
def _generate_fully_censored_word(self, word: Union[str, spacy.tokens.Token]) -> str:
        with suppress(AttributeError):
            word = word.text
        return len(word) * self.censor_char 
Example #19
Source File: spacy_wrapper.py    From supervised-oie with MIT License
def __call__(self, text):
        """
        Call this tokenizer - just split based on space
        """
        words = re.split(r' +', text) # Allow arbitrary number of spaces
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces) 
Example #20
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0
def _make_spacy_token(self, language: Language, word: str) -> spacy.tokens.Token:
        return spacy_utlis.make_token(nlp=self._get_nlp(language), word=word) 
Example #21
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0
def censor_word(self, word: Union[str, spacy.tokens.Token], language: Language = None) -> Word:
        """Returns censored word"""
        word = self._make_spacy_token(language=language, word=word)
        return self._censor_word(language=language, word=word) 
Example #22
Source File: test_gold_annotator_component.py    From medaCy with GNU General Public License v3.0
def test_overlays_annotations(self):
        """
        Tests that this pipeline component adds the correct labels.
        Note that this only tests that at least one instance of each label is overlayed because the number of tokens
        that receive the label varies based on the tokenizer.
        """

        sample_file = sample_dataset.data_files[0]
        txt_file_path = sample_file.txt_path
        ann_file_path = sample_file.ann_path

        with open(txt_file_path) as f:
            text = f.read()
        doc: Doc = self.nlp(text)

        doc.set_extension('file_name', default=None, force=True)
        doc._.file_name = txt_file_path
        doc.set_extension('gold_annotation_file', default=None, force=True)
        doc._.gold_annotation_file = ann_file_path

        ann = Annotations(ann_file_path)
        labels = ann.get_labels()

        gold_annotator = GoldAnnotatorOverlayer(self.nlp, list(labels))

        doc = gold_annotator(doc)

        overlayed_labels = {t._.gold_label for t in doc}
        overlayed_labels.remove('O')

        self.assertSetEqual(overlayed_labels, labels) 
Example #23
Source File: wordnet.py    From kb with Apache License 2.0
def __call__(self, text: str) -> List[Token]:
        spacy_doc = self.nlp(text)

        # create allennlp tokens
        normalized_tokens = [
            Token(spacy_token.text,
                  pos_=self.spacy_to_wordnet_map.get(spacy_token.pos_, spacy_token.pos_),
                  lemma_=spacy_token.lemma_
            )

            for spacy_token in spacy_doc
            if not spacy_token.is_space
        ]

        return normalized_tokens 
Example #24
Source File: common.py    From kb with Apache License 2.0
def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces) 
Example #25
Source File: spacy_annotator.py    From errudite with GNU General Public License v2.0
def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces) 
Example #26
Source File: annotate_graphs.py    From RL-based-Graph2Seq-for-NQG with Apache License 2.0
def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces) 
Example #27
Source File: test_extract_triples.py    From ravestate with BSD 3-Clause "New" or "Revised" License
def spacy_model():
    nlp = spacy_nlp_en
    from spacy.tokens import Doc
    if not Doc.has_extension('triples'):
        Doc.set_extension('triples', getter=extract_triples)
    return nlp 
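Once a getter extension like this is registered, the value is computed lazily on attribute access. A minimal sketch with an illustrative getter (the real project uses extract_triples; nlp is assumed to be an already loaded pipeline such as spacy_nlp_en above):

from spacy.tokens import Doc

def count_tokens(doc):
    return len(doc)

if not Doc.has_extension("n_tokens"):
    Doc.set_extension("n_tokens", getter=count_tokens)

doc = nlp("Anna likes pizza")  # assumes a loaded pipeline
print(doc._.n_tokens)          # 3, computed by the getter on access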
Example #28
Source File: rindex.py    From semanticRetrievalMRS with MIT License
def iterative_abs_save_random_batching(batch_size=10000):
    total_doc_num = init_inspect.TOTAL_NUM_DOC

    with open(config.ABS_WIKI_FILE, 'rb') as abs_file:
        lines = []
        for line in tqdm(abs_file, total=total_doc_num):
            lines.append(line)
            # if len(lines) == 100000:
            #     break

    random_per = range(len(lines))
    # random_per = np.random.permutation(len(lines))
    # random.shuffle(lines)

    # existing_title_set = set()

    batch_list = []

    with SqliteDict(str(config.ABS_PROCESS_FOR_RINDEX_DB), encode=json.dumps, decode=json.loads) as abs_rindex_db:
        for index in tqdm(random_per):
            item = json.loads(lines[index])
            # print(item.keys())
            # print()
            if item['title'] in abs_rindex_db:
                continue

            tokens, sent_offset = get_sentence_tokens(item['text'], item['charoffset'])
            poss = spacy_get_pos(tokens)
            assert len(tokens) == len(poss)
            # print(tokens)
            # print(sent_offset)
            rindex_item = {
                'tokens': tokens,
                'poss': poss,
                'sentence_offset': sent_offset
            }

            batch_list.append((item['title'], rindex_item))

            if len(batch_list) == batch_size:
                for title, rindex_item in batch_list:
                    abs_rindex_db[title] = rindex_item
                abs_rindex_db.commit()
                batch_list = []

        # Commit last one
        for title, rindex_item in batch_list:
            abs_rindex_db[title] = rindex_item

        abs_rindex_db.commit()
        abs_rindex_db.close() 
Example #29
Source File: rindex.py    From semanticRetrievalMRS with MIT License
def iterative_abs(debug_num=None):
    total_doc_num = init_inspect.TOTAL_NUM_DOC if debug_num is None else debug_num
    cur_count = 0

    with open(config.ABS_WIKI_FILE, 'rb') as abs_file:
        for line in tqdm(abs_file, total=total_doc_num):
            item = json.loads(line)
            # print(item.keys())
            # print()
            tokens, sent_offset = get_sentence_tokens(item['text'], item['charoffset'])
            poss = spacy_get_pos(tokens)
            assert len(tokens) == len(poss)
            print(tokens)
            print(sent_offset)
            # print(poss) 
Example #30
Source File: science_ie_data_utils.py    From sciwing with MIT License
def _form_ann_line(
        idx: str,
        char_offset: Tuple[int, int, str],
        tag_name: str,
        doc: spacy.tokens.doc.Doc,
    ):
        """ Forms a ann line that can be used to write the ANN files for CoNLL format

        Parameters
        ----------
        idx : int
            The index for the entity being written
        char_offset : int
            THe start, end, tag for the line
        tag_name : str
            The tag to be used and is one of ``[Task, Process, Material]``
        doc : str
            Spacy doc to query the appropriate characters

        Returns
        -------
        str
            An ANN line that is formed.

        """
        start_offset, end_offset, entity_type = char_offset
        surface_form = doc.char_span(start_offset, end_offset).text
        start_offset = str(start_offset)
        end_offset = str(end_offset)
        ann_line = " ".join([start_offset, end_offset])
        ann_line = "\t".join([ann_line, surface_form])
        ann_line = " ".join([tag_name, ann_line])
        ann_line = "\t".join([f"T{idx}", ann_line])
        return ann_line
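For reference, the string assembled above follows the brat standoff format. A hedged illustration with made-up values (not taken from the project's tests):

# With idx="1", char_offset=(0, 4, "Task") and a doc whose first four
# characters read "CNNs", the helper returns:
#   "T1\tTask 0 4\tCNNs"
# i.e. the entity id, then the tag with start/end offsets, then the
# surface form, separated by tabs.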