Python Examples of spacy.language

Source File: spacy_parser.py From fonduer with MIT License

7 votes

def model_installed(name: str) -> bool:
        """Report whether a spaCy language model called ``name`` can be found.

        Adapted from https://github.com/explosion/spaCy/blob/master/spacy/util.py

        :param name: Model name, package name, or path to a model data directory.
        :return: ``True`` when ``name`` matches an entry of the spaCy data
            directory, when ``is_package(name)`` is truthy, or when ``name``
            is an existing filesystem path; ``False`` otherwise.
        :raises IOError: If the spaCy data path is unset or does not exist.
        """
        data_path = util.get_data_path()
        if not (data_path and data_path.exists()):
            raise IOError(f"Can't find spaCy data path: {data_path}")

        # Entry with that name directly inside the spaCy data directory.
        listed_in_data_dir = any(entry.name == name for entry in data_path.iterdir())
        # Otherwise: installed as package.
        if listed_in_data_dir or is_package(name):
            return True
        # Last resort: path to model data directory.
        return Path(name).exists()

Source File: spacy_processors_test.py From forte with Apache License 2.0

6 votes

def test_neg_spacy_processor(self):
        """Expect ``ProcessExecutionException`` when processing a document with
        the pipeline configured for ``lang`` "xx_ent_wiki_sm" and the 'ner'
        processor."""
        pipeline = Pipeline[DataPack]()
        pipeline.set_reader(StringReader())
        pipeline.add(
            SpacyProcessor(),
            config={
                "processors": 'ner',
                # Language code for the language to build the Pipeline
                "lang": "xx_ent_wiki_sm",
                "use_gpu": False,
            },
        )
        pipeline.initialize()

        document = ' '.join([
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before.",
        ])
        with self.assertRaises(ProcessExecutionException):
            pipeline.process(document)

Source File: spacy_utils.py From Rasa_NLU_Chi with Apache License 2.0

6 votes

def ensure_proper_language_model(nlp):
        # type: (Optional[Language]) -> None
        """Validate that a spaCy language model was actually loaded.

        Raises an ``Exception`` when ``nlp`` is ``None`` or when its ``path``
        attribute is ``None``; returns ``None`` otherwise."""

        if nlp is None:
            message = ("Failed to load spacy language model. "
                       "Loading the model returned 'None'.")
            raise Exception(message)

        if nlp.path is not None:
            return

        # Spacy sets the path to `None` if it did not load the model from
        # disk. In this case `nlp` is an unusable stub.
        message = ("Failed to load spacy language model for "
                   "lang '{}'. Make sure you have downloaded the "
                   "correct model (https://spacy.io/docs/usage/)."
                   "").format(nlp.lang)
        raise Exception(message)

Source File: spacy_utils.py From Rasa_NLU_Chi with Apache License 2.0

6 votes

def create(cls, cfg):
        # type: (RasaNLUModelConfig) -> SpacyNLP
        """Build a ``SpacyNLP`` component from the model configuration.

        The spaCy model name is read from the component's "model" setting;
        when that is missing or empty, ``cfg.language`` is used and written
        back into the component configuration. The loaded model is validated
        with ``cls.ensure_proper_language_model`` before the component is
        returned.
        """
        import spacy

        component_conf = cfg.for_component(cls.name, cls.defaults)

        # if no model is specified, we fall back to the language string
        model_name = component_conf.get("model")
        if not model_name:
            model_name = component_conf["model"] = cfg.language

        logger.info("Trying to load spacy model with "
                    "name '{}'".format(model_name))

        language_model = spacy.load(model_name, parser=False)
        cls.ensure_proper_language_model(language_model)
        return SpacyNLP(component_conf, language_model)

Source File: spacy_utils.py From rasa-for-botfront with Apache License 2.0

6 votes

def ensure_proper_language_model(nlp: Optional["Language"]) -> None:
        """Check that loading the spaCy language model produced a usable object.

        Raises an ``Exception`` if ``nlp`` is ``None`` or if ``nlp.path`` is
        ``None``; otherwise does nothing."""

        problem = None
        if nlp is None:
            problem = (
                "Failed to load spacy language model. "
                "Loading the model returned 'None'."
            )
        elif nlp.path is None:
            # Spacy sets the path to `None` if
            # it did not load the model from disk.
            # In this case `nlp` is an unusable stub.
            problem = (
                "Failed to load spacy language model for "
                "lang '{}'. Make sure you have downloaded the "
                "correct model (https://spacy.io/docs/usage/)."
                "".format(nlp.lang)
            )

        if problem is not None:
            raise Exception(problem)

Source File: spacy_utils.py From rasa-for-botfront with Apache License 2.0

6 votes

def create(
        cls, component_config: Dict[Text, Any], config: RasaNLUModelConfig
    ) -> "SpacyNLP":
        """Create the component from its configuration.

        ``component_config`` is merged with ``cls.defaults`` through
        ``override_defaults``. The spaCy model name comes from the merged
        "model" entry; when that is missing or empty, ``config.language`` is
        used and stored back under "model". The model returned by
        ``cls.load_model`` is validated before the component is built.
        """
        merged_config = override_defaults(cls.defaults, component_config)

        # if no model is specified, we fall back to the language string
        model_name = merged_config.get("model")
        if not model_name:
            model_name = merged_config["model"] = config.language

        logger.info(f"Trying to load spacy model with name '{model_name}'")

        language_model = cls.load_model(model_name)
        cls.ensure_proper_language_model(language_model)
        return cls(merged_config, language_model)

Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0

6 votes

def _censor(self, text: str, return_bool=False) -> Union[str, bool]:
        """Censor profane tokens in ``text`` or report whether any exist.

        The text is split per language, each part is parsed, and every token
        flagged ``is_profane`` is replaced by its ``censored`` form.

        :return: with ``return_bool=False`` the concatenation of the censored
            parts; with ``return_bool=True`` ``True`` as soon as a profane
            token is met, ``False`` if none is found.
        """
        censored_parts = []
        for language, text_part in self._split_by_language(text=text):
            censored_part = text_part
            for token in self._parse(language=language, text=text_part):
                if not token._.is_profane:
                    continue
                if return_bool:
                    # One hit is enough to answer the yes/no question.
                    return True
                censored_part = self._replace_token(text=censored_part, old=token, new=token._.censored)
            censored_parts.append(censored_part)
        return False if return_bool else ''.join(censored_parts)

Source File: spacy_component.py From profanity-filter with GNU General Public License v3.0

6 votes

def __call__(self, doc: Doc, language: Language = None, stop_on_first_profane_word: Optional[bool] = None) -> Doc:
        """Run the profanity check over ``doc`` and return the same ``doc``.

        Tokens are grouped into runs in which no token is followed by
        whitespace and no token is a space or punctuation token; each run is
        handed to ``_censor_spaceless_span``. ``language`` and
        ``stop_on_first_profane_word`` default to the values stored on the
        instance when passed as ``None``. With the stop flag set, scanning
        ends at the first span flagged ``is_profane``.
        """
        self.register_extensions(exist_ok=True)
        language = self._language if language is None else language
        if stop_on_first_profane_word is None:
            stop_on_first_profane_word = self._stop_on_first_profane_word

        start = 0
        while start < len(doc):
            end = start + 1
            # Extend the run until a separator is found on either side of the
            # boundary between doc[end - 1] and doc[end].
            while end < len(doc):
                previous = doc[end - 1]
                if previous.whitespace_ or previous.is_space or previous.is_punct:
                    break
                current = doc[end]
                if current.is_space or current.is_punct:
                    break
                end += 1
            span = self._censor_spaceless_span(doc[start:end], language=language)
            if stop_on_first_profane_word and span._.is_profane:
                break
            start += len(span)
        return doc

Source File: spacy_utils.py From rasa_nlu with Apache License 2.0

6 votes

def ensure_proper_language_model(nlp: Optional['Language']) -> None:
        """Make sure a spaCy language model object is present and usable.

        Raises an ``Exception`` when ``nlp`` is ``None`` or when ``nlp.path``
        is ``None``."""

        # Happy path first: a model object that carries a path is accepted.
        if nlp is not None and nlp.path is not None:
            return

        if nlp is None:
            raise Exception("Failed to load spacy language model. "
                            "Loading the model returned 'None'.")

        # Spacy sets the path to `None` if
        # it did not load the model from disk.
        # In this case `nlp` is an unusable stub.
        template = ("Failed to load spacy language model for "
                    "lang '{}'. Make sure you have downloaded the "
                    "correct model (https://spacy.io/docs/usage/)."
                    "")
        raise Exception(template.format(nlp.lang))

Source File: spacy_utils.py From rasa_nlu with Apache License 2.0

6 votes

def create(cls,
               component_config: Dict[Text, Any],
               config: RasaNLUModelConfig) -> 'SpacyNLP':
        """Create the component, loading the configured spaCy model.

        ``component_config`` is merged with ``cls.defaults``. When the merged
        "model" entry is missing or empty, ``config.language`` is used as the
        model name and stored back under "model". The model is loaded with the
        'parser' pipe disabled and validated before the component is built.
        """
        import spacy

        merged_config = override_defaults(cls.defaults, component_config)

        # if no model is specified, we fall back to the language string
        model_name = merged_config.get("model")
        if not model_name:
            model_name = merged_config["model"] = config.language

        logger.info("Trying to load spacy model with "
                    "name '{}'".format(model_name))

        language_model = spacy.load(model_name, disable=['parser'])
        cls.ensure_proper_language_model(language_model)
        return cls(merged_config, language_model)

Source File: spacy-fastext.py From word2vecVN with Apache License 2.0

6 votes

def load_nlp(vectors_loc, lang=None):
    """Create a pipeline object and fill its vocab with vectors from a file.

    The file at ``vectors_loc`` is read in binary mode. Its first line is a
    header with two whitespace-separated fields, the second being the vector
    width. Every following line holds a word followed by that many
    space-separated numbers, decoded as UTF-8.

    With ``lang=None`` a bare ``Language()`` is used; otherwise
    ``spacy.blank(lang)``.
    """
    # create empty language class – this is required if you're planning to
    # save the model to disk and load it back later (models always need a
    # "lang" setting). Use 'xx' for blank multi-language class.
    nlp = Language() if lang is None else spacy.blank(lang)

    with open(vectors_loc, 'rb') as vectors_file:
        _nr_row, nr_dim = vectors_file.readline().split()
        width = int(nr_dim)
        nlp.vocab.reset_vectors(width=width)
        for raw_line in vectors_file:
            # Split from the right so a word containing spaces stays intact.
            word, *components = raw_line.rstrip().decode('utf8').rsplit(' ', width)
            vector = numpy.asarray([float(value) for value in components], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    return nlp

Source File: spacy_parser.py From fonduer with MIT License

6 votes

def _load_lang_model(self) -> None:
        """Load the spaCy language model for ``self.lang`` into ``self.model``.

        If a model is not installed, download it before loading it.

        Currently supported spaCy languages

        en English (50MB)
        de German (645MB)
        fr French (1.33GB)
        es Spanish (377MB)

        Languages found in ``self.alpha_languages`` are not loaded through
        ``spacy.load``; instead the class named there is looked up in the
        ``spacy.lang.<lang>`` module and instantiated.

        :return:
        :raises ValueError: If ``self.lang`` is listed in neither
            ``self.languages`` nor ``self.alpha_languages``.
        """
        if self.lang in self.languages:
            if not SpacyParser.model_installed(self.lang):
                download(self.lang)
            model = spacy.load(self.lang)
        elif self.lang in self.alpha_languages:
            language_module = importlib.import_module(f"spacy.lang.{self.lang}")
            language_method = getattr(language_module, self.alpha_languages[self.lang])
            model = language_method()
        else:
            # Without this branch ``model`` would be unbound below and the
            # caller would see an opaque UnboundLocalError.
            raise ValueError(
                f"Unsupported spaCy language '{self.lang}': it is listed in "
                f"neither the supported languages nor the alpha languages."
            )
        self.model = model

Source File: spacy_processors.py From forte with Apache License 2.0

5 votes

def default_configs(cls):
        """
        Define the basic config structure for spaCy.

        Returns:
            The parent class's default configuration, updated with the
            'processors', 'lang' and 'use_gpu' entries set below.
        """
        config = super().default_configs()
        spacy_defaults = {
            'processors': 'tokenize, pos, lemma',
            # Language code for the language to build the Pipeline
            'lang': 'en_core_web_sm',
            'use_gpu': False,
        }
        config.update(spacy_defaults)
        return config

Source File: spacy_processors_test.py From forte with Apache License 2.0

5 votes

def setUp(self):
        """Build a tokenize-only pipeline on ``self.spacy`` and load the same
        spaCy model directly into ``self.nlp``."""
        # Language code for the language to build the Pipeline
        model_name = "en_core_web_sm"

        self.spacy = Pipeline[DataPack]()
        self.spacy.set_reader(StringReader())
        self.spacy.add(
            SpacyProcessor(),
            config={
                "processors": "tokenize",
                "lang": model_name,
                "use_gpu": False,
            },
        )
        self.spacy.initialize()

        self.nlp: Language = spacy.load(model_name)

Source File: spacy_utils.py From Rasa_NLU_Chi with Apache License 2.0

5 votes

def cache_key(cls, model_metadata):
        # type: (Metadata) -> Text
        """Return the cache key ``"<component name>-<spaCy model name>"``."""

        meta_for_component = model_metadata.for_component(cls.name)

        # Fallback, use the language name, e.g. "en",
        # as the model name if no explicit name is defined
        fallback_name = model_metadata.language
        model_name = meta_for_component.get("model", fallback_name)

        key_prefix = cls.name + "-"
        return key_prefix + model_name

Source File: spacy_parser.py From fonduer with MIT License

5 votes

def __init__(self, vocab: Vocab) -> None:
        """Initialize a custom tokenizer.

        The vocab is only stored on the instance as ``self.vocab``; nothing
        else is set up here.

        :param vocab: The vocab attribute of the respective spacy language object.
        """
        self.vocab = vocab

Source File: spacy_tokenizer.py From DeepPavlov with Apache License 2.0

5 votes

def _try_load_spacy_model(model_name: str, disable: Iterable[str] = ()):
    """Load a spaCy model by name, falling back to importing it as a module.

    ``spacy.load`` is tried first. If it raises ``OSError``, the module named
    ``model_name`` is imported and its ``load`` function is called instead.
    When the fallback fails for any reason, including returning something
    that is not a ``spacy.language.Language``, the original ``OSError`` is
    raised.
    """
    disabled_pipes = set(disable)
    try:
        return spacy.load(model_name, disable=disabled_pipes)
    except OSError as load_error:
        try:
            candidate = __import__(model_name).load(disable=disabled_pipes)
            if not isinstance(candidate, spacy.language.Language):
                raise RuntimeError(f'{model_name} is not a spacy model module')
        except Exception:
            # Report the first failure; the fallback error is secondary.
            raise load_error
        return candidate

Source File: spacy_component.py From profanity-filter with GNU General Public License v3.0

5 votes

def __init__(self, profanity_filter: 'ProfanityFilter', nlp: spacy.language.Language, language: Language = None,
                 stop_on_first_profane_word: bool = False):
        """Store the collaborators and defaults used when the component runs.

        :param profanity_filter: filter object kept on ``self._profanity_filter``
        :param nlp: spaCy pipeline kept on ``self._nlp``
        :param language: default language kept on ``self._language``
        :param stop_on_first_profane_word: default stop flag kept on
            ``self._stop_on_first_profane_word``
        """
        self._profanity_filter = profanity_filter
        self._nlp = nlp  # Used only for tokenization
        self._language = language
        self._stop_on_first_profane_word = stop_on_first_profane_word

    # noinspection PyProtectedMember

Source File: spacy_utils.py From rasa-for-botfront with Apache License 2.0

5 votes

def cache_key(
        cls, component_meta: Dict[Text, Any], model_metadata: "Metadata"
    ) -> Optional[Text]:
        """Return the cache key ``"<component name>-<spaCy model name>"``.

        The model name is the "model" entry of ``component_meta`` when
        present, otherwise ``model_metadata.language``.
        """
        # Fallback, use the language name, e.g. "en",
        # as the model name if no explicit name is defined
        fallback_name = model_metadata.language
        model_name = component_meta.get("model", fallback_name)

        key_prefix = cls.name + "-"
        return key_prefix + model_name

Source File: spacy_extractor.py From cookiecutter-spacy-fastapi with MIT License

5 votes

def __init__(
        self, nlp: Language, input_id_col: str = "id", input_text_col: str = "text"
    ):
        """Initialize the SpacyExtractor pipeline.

        nlp (spacy.language.Language): pre-loaded spacy language model
        input_id_col (str): property on each document to correlate with request
        input_text_col (str): property on each document to run the model on

        RETURNS (SpacyExtractor): The newly constructed object.
        """
        self.input_text_col = input_text_col
        self.input_id_col = input_id_col
        self.nlp = nlp

Source File: spacy_utils.py From rasa_nlu with Apache License 2.0

5 votes

def cache_key(cls,
                  component_meta: Dict[Text, Any],
                  model_metadata: 'Metadata') -> Optional[Text]:
        """Build the cache key from the component name and spaCy model name,
        joined by a dash."""

        # Fallback, use the language name, e.g. "en",
        # as the model name if no explicit name is defined
        language_fallback = model_metadata.language
        chosen_model = component_meta.get("model", language_fallback)

        component_prefix = cls.name + "-"
        return component_prefix + chosen_model

Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0

5 votes

def _detect_languages(self, text: str) -> Languages:
        """Return the languages of ``text``, restricted to ``self.languages``.

        With ``AnalysisType.MULTILINGUAL`` enabled, the language codes reported
        by the polyglot detector are used, skipping the 'un' code. When that
        analysis is disabled or nothing is detected, the first entry of
        ``self.languages`` is used instead.
        """
        fallback = OrderedSet([self.languages[0]])
        detected = fallback
        if AnalysisType.MULTILINGUAL in self.analyses:
            detector = polyglot.detect.Detector(text, quiet=True)
            codes = OrderedSet([language.code for language in detector.languages if language.code != 'un'])
            if codes:
                detected = codes
        return detected.intersection(self.languages)

Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0

5 votes

def _is_profane_word(self, language: Language, word: str) -> bool:
        """Tell whether ``word`` is in a profane word dictionary.

        With ``language=None`` every dictionary in
        ``self.profane_word_dictionaries`` is searched; otherwise only the one
        stored for ``language``.
        """
        if language is None:
            dictionaries = self.profane_word_dictionaries.values()
        else:
            dictionaries = [self.profane_word_dictionaries[language]]
        for dictionary in dictionaries:
            if word in dictionary:
                return True
        return False

Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0

5 votes

def _get_trie(self, language: Language) -> Trie:
        """Return the first available trie, trying ``language`` first and then
        the entries of ``self.languages`` in order; ``None`` if none exists."""
        # noinspection PyTypeChecker
        candidates = OrderedSet([language]) | self.languages
        for candidate in candidates:
            try:
                return self._trie[candidate]
            except KeyError:
                continue
        return None

Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0

5 votes

def _is_dictionary_word(self, language: Language, word: str) -> bool:
        """Tell whether any spell checker returned by ``_get_spells`` accepts
        ``word``; a ``UnicodeEncodeError`` during the check counts as "no"."""
        try:
            for checker in self._get_spells(language=language):
                if checker.spell(word):
                    return True
        except UnicodeEncodeError:
            return False
        return False

Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0

5 votes

def _lemmas(self, language: Language, word: Union[str, spacy.tokens.Token]) -> 'OrderedSet[str]':
        """Collect candidate base forms of ``word`` in insertion order.

        The result holds the token text, its lower-cased spaCy lemma, then
        whatever ``_stems`` and ``_normal_forms`` return for the token text.
        A falsy ``word`` yields an empty set.
        """
        forms = OrderedSet()
        if not word:
            return forms

        token = self._make_spacy_token(language=language, word=word)
        lemma = token.lemma_
        forms.add(token.text)
        # '-PRON-' is treated as a placeholder: use the lower-cased token text.
        if lemma == '-PRON-':
            lemma = token.lower_
        else:
            lemma = lemma.lower()
        forms.add(lemma)
        forms |= self._stems(language=language, word=token.text)
        forms |= self._normal_forms(language=language, word=token.text)
        return forms

Source File: profanity_filter.py From profanity-filter with GNU General Public License v3.0

5 votes

def spells(self, value: Optional[Spells]) -> None:
        """Set the spell checkers used by the deep analysis.

        Does nothing unless ``AnalysisType.DEEP`` is enabled. Otherwise the
        cache is cleared and ``value`` is stored as-is; with ``value=None`` a
        ``HunSpell`` instance is built for every language in
        ``self._languages`` from the ``.dic``/``.aff`` files in
        ``self._DATA_DIR``, skipping languages that raise ``HunSpellError``.
        If none can be built, ``AnalysisType.DEEP`` is removed from
        ``self.analyses``.
        """
        if AnalysisType.DEEP not in self.analyses:
            return

        self.clear_cache()
        if value is not None:
            self._spells = value
            return

        self._spells = {}
        for language in self._languages:
            try:
                self._spells[language] = HunSpell(self._DATA_DIR / f'{language}.dic',
                                                  self._DATA_DIR / f'{language}.aff')
            except HunSpellError:
                continue
        if not self._spells:
            self.analyses -= {AnalysisType.DEEP}

Source File: spacy_component.py From profanity-filter with GNU General Public License v3.0

5 votes

def _censor_spaceless_span(self, span: Span, language: Language) -> Span:
        """Censor one span and record the result on its tokens.

        The span is checked as a single word through
        ``self._profanity_filter.censor_word``. If it is profane, the span is
        merged into one token that receives the censored text and the
        original profane word. Otherwise every token keeps its own text as
        its ``censored`` value. The span is returned in both cases.
        """
        if len(span) > 1:
            word = str(span)
        else:
            word = span[0]
        probe = spacy_utlis.make_token(nlp=self._nlp, word=word)
        verdict = self._profanity_filter.censor_word(word=probe, language=language)

        if not verdict.is_profane:
            for token in span:
                token._.censored = token.text
            return span

        with span.doc.retokenize() as retokenizer:
            retokenizer.merge(span)
        merged_token = span[0]
        merged_token._.censored = verdict.censored
        merged_token._.original_profane_word = verdict.original_profane_word
        return span

Source File: spacy_utlis.py From profanity-filter with GNU General Public License v3.0

5 votes

def parse(nlp: spacy.language.Language,
          text: str, language: Language = None,
          use_profanity_filter: bool = False) -> Union[Doc, Token]:
    """Run ``nlp`` on ``text``, with or without the profanity filter pipe.

    With ``use_profanity_filter=True`` nothing is disabled and ``language``
    is passed to the filter component through ``component_cfg``. Otherwise
    the filter component is disabled and no component config is passed.
    """
    filter_name = SpacyProfanityFilterComponent.name
    if use_profanity_filter:
        disable = []
        component_cfg = {
            filter_name: {
                'language': language,
            },
        }
    else:
        disable = [filter_name]
        component_cfg = {}
    return nlp(text, disable=disable, component_cfg=component_cfg)

Source File: spacy_utlis.py From profanity-filter with GNU General Public License v3.0

5 votes

def make_token(nlp: spacy.language.Language, word: Union[str, Token]) -> Token:
    """Return ``word`` as a single token.

    An object that already has a ``text`` attribute is returned unchanged.
    Anything else is parsed with ``parse`` and the resulting doc is merged
    into one token, which is returned.
    """
    if not hasattr(word, 'text'):
        parsed = parse(nlp=nlp, text=word)
        with parsed.retokenize() as retokenizer:
            retokenizer.merge(parsed[:])
        word = parsed[0]
    return word

Python spacy.language() Examples