Python spacy.language() Examples

The following are 30 code examples of spacy.language(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module spacy, or try the search function.
Example #1
Source File: spacy_parser.py    From fonduer with MIT License 7 votes vote down vote up
def model_installed(name: str) -> bool:
        """Report whether a spaCy language model can be found locally.

        Adapted from https://github.com/explosion/spaCy/blob/master/spacy/util.py

        The model counts as installed when ``name`` matches an entry of the
        spaCy data directory, when ``is_package`` accepts it, or when it is an
        existing filesystem path. The checks run in that order and stop at the
        first hit.

        :param name: Model name, package name or path to a model directory.
        :return: ``True`` if one of the lookups succeeds, ``False`` otherwise.
        :raises IOError: If the spaCy data path is unset or does not exist.
        """
        data_path = util.get_data_path()
        if not (data_path and data_path.exists()):
            raise IOError(f"Can't find spaCy data path: {data_path}")

        names_in_data_dir = {entry.name for entry in data_path.iterdir()}
        return bool(
            name in names_in_data_dir  # shipped inside the data directory
            or is_package(name)  # installed as package
            or Path(name).exists()  # path to model data directory
        )
Example #2
Source File: spacy_processors_test.py    From forte with Apache License 2.0 6 votes vote down vote up
def test_neg_spacy_processor(self):
        """Processing must fail when the pipeline is built with this config.

        The pipeline is configured with the ``ner`` processor and the
        ``xx_ent_wiki_sm`` model; running it on a short document is expected
        to raise ``ProcessExecutionException``.
        """
        pipeline = Pipeline[DataPack]()
        pipeline.set_reader(StringReader())
        pipeline.add(
            SpacyProcessor(),
            config={
                "processors": 'ner',
                # Language code for the language to build the Pipeline
                "lang": "xx_ent_wiki_sm",
                "use_gpu": False,
            },
        )
        pipeline.initialize()

        document = ' '.join((
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.",
            "NLP has never been made this easy before.",
        ))
        with self.assertRaises(ProcessExecutionException):
            pipeline.process(document)
Example #3
Source File: spacy_utils.py    From Rasa_NLU_Chi with Apache License 2.0 6 votes vote down vote up
def ensure_proper_language_model(nlp):
        # type: (Optional[Language]) -> None
        """Validate that a usable spaCy language model was loaded.

        Raises an ``Exception`` when ``nlp`` is ``None`` or when its ``path``
        attribute is ``None``; returns ``None`` otherwise."""

        if nlp is None:
            message = ("Failed to load spacy language model. "
                       "Loading the model returned 'None'.")
            raise Exception(message)

        if nlp.path is not None:
            return

        # Spacy sets the path to `None` if it did not load the model
        # from disk; in that case `nlp` is an unusable stub.
        message = ("Failed to load spacy language model for "
                   "lang '{}'. Make sure you have downloaded the "
                   "correct model (https://spacy.io/docs/usage/)."
                   "".format(nlp.lang))
        raise Exception(message)
Example #4
Source File: spacy_utils.py    From Rasa_NLU_Chi with Apache License 2.0 6 votes vote down vote up
def create(cls, cfg):
        # type: (RasaNLUModelConfig) -> SpacyNLP
        """Build a ``SpacyNLP`` component from the NLU model configuration.

        The spaCy model name comes from the component's ``model`` setting.
        When that is empty, the configured language string is used instead
        and written back into the component configuration.
        """
        import spacy

        component_conf = cfg.for_component(cls.name, cls.defaults)

        model_name = component_conf.get("model")
        if not model_name:
            # if no model is specified, we fall back to the language string
            model_name = cfg.language
            component_conf["model"] = model_name

        logger.info("Trying to load spacy model with "
                    "name '{}'".format(model_name))

        loaded_model = spacy.load(model_name, parser=False)
        cls.ensure_proper_language_model(loaded_model)
        return SpacyNLP(component_conf, loaded_model)
Example #5
Source File: spacy_utils.py    From rasa-for-botfront with Apache License 2.0 6 votes vote down vote up
def ensure_proper_language_model(nlp: Optional["Language"]) -> None:
        """Verify that loading the spaCy language model produced a usable object.

        An ``Exception`` is raised if ``nlp`` is ``None`` or if ``nlp.path``
        is ``None``; otherwise the function returns without a value."""

        if nlp is None:
            raise Exception(
                "Failed to load spacy language model. "
                "Loading the model returned 'None'."
            )

        # Spacy sets the path to `None` if it did not load the model from
        # disk. In this case `nlp` is an unusable stub.
        loaded_from_disk = nlp.path is not None
        if loaded_from_disk:
            return

        raise Exception(
            "Failed to load spacy language model for "
            "lang '{}'. Make sure you have downloaded the "
            "correct model (https://spacy.io/docs/usage/)."
            "".format(nlp.lang)
        )
Example #6
Source File: spacy_utils.py    From rasa-for-botfront with Apache License 2.0 6 votes vote down vote up
def create(
        cls, component_config: Dict[Text, Any], config: RasaNLUModelConfig
    ) -> "SpacyNLP":
        """Instantiate the component with a freshly loaded spaCy model.

        The given component configuration is merged with the class defaults.
        If the merged configuration names no model, the configured language
        string is used as the model name and stored under ``"model"``.
        """
        merged_config = override_defaults(cls.defaults, component_config)

        spacy_model_name = merged_config.get("model")
        if not spacy_model_name:
            # if no model is specified, we fall back to the language string
            spacy_model_name = config.language
            merged_config["model"] = spacy_model_name

        logger.info(f"Trying to load spacy model with name '{spacy_model_name}'")

        language_model = cls.load_model(spacy_model_name)
        cls.ensure_proper_language_model(language_model)

        return cls(merged_config, language_model)
Example #7
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0 6 votes vote down vote up
def _censor(self, text: str, return_bool=False) -> Union[str, bool]:
        """Censor profane words in ``text`` or just report whether any exist.

        :return: text with any profane words censored or bool (True - text has profane words, False otherwise) if
        return_bool=True"""
        censored_parts = []
        for language, text_part in self._split_by_language(text=text):
            censored_part = text_part
            for token in self._parse(language=language, text=text_part):
                if not token._.is_profane:
                    continue
                if return_bool:
                    # The first profane token is enough to answer the question.
                    return True
                censored_part = self._replace_token(text=censored_part, old=token, new=token._.censored)
            censored_parts.append(censored_part)
        return False if return_bool else ''.join(censored_parts)
Example #8
Source File: spacy_component.py    From profanity-filter with GNU General Public License v3.0 6 votes vote down vote up
def __call__(self, doc: Doc, language: Language = None, stop_on_first_profane_word: Optional[bool] = None) -> Doc:
        """Run the profanity check over ``doc`` and return the same ``doc``.

        The document is walked left to right. Adjacent tokens are grouped into
        one slice as long as the earlier token has no trailing whitespace and
        neither token is a space or punctuation token; each slice is handed to
        ``_censor_spaceless_span``.

        :param doc: Document to process.
        :param language: Language passed on to ``_censor_spaceless_span``;
            ``self._language`` is used when ``None``.
        :param stop_on_first_profane_word: When true, stop after the first
            slice flagged as profane; ``self._stop_on_first_profane_word`` is
            used when ``None``.
        :return: The ``doc`` object that was passed in.
        """
        self.register_extensions(exist_ok=True)
        # Fall back to the values stored on the instance for omitted arguments.
        if language is None:
            language = self._language
        if stop_on_first_profane_word is None:
            stop_on_first_profane_word = self._stop_on_first_profane_word
        i = 0
        while i < len(doc):
            # Grow [i, j) while token j is glued to token j - 1: no whitespace
            # after j - 1, and neither of the two is a space/punctuation token.
            j = i + 1
            while (j < len(doc)
                   and not doc[j - 1].whitespace_ and not doc[j - 1].is_space and not doc[j - 1].is_punct
                   and not doc[j].is_space and not doc[j].is_punct):
                j += 1
            span = self._censor_spaceless_span(doc[i:j], language=language)
            if stop_on_first_profane_word and span._.is_profane:
                break
            # Advance by the length of the returned span rather than by j - i.
            # NOTE(review): presumably because the helper may retokenize the
            # doc and change the token count - confirm.
            i += len(span)
        return doc
Example #9
Source File: spacy_utils.py    From rasa_nlu with Apache License 2.0 6 votes vote down vote up
def ensure_proper_language_model(nlp: Optional['Language']) -> None:
        """Fail loudly when the spaCy language model is missing or a stub.

        Raises an ``Exception`` if ``nlp`` is ``None`` or if ``nlp.path`` is
        ``None``; does nothing otherwise."""

        problem = None
        if nlp is None:
            problem = ("Failed to load spacy language model. "
                       "Loading the model returned 'None'.")
        elif nlp.path is None:
            # Spacy sets the path to `None` if
            # it did not load the model from disk.
            # In this case `nlp` is an unusable stub.
            problem = ("Failed to load spacy language model for "
                       "lang '{}'. Make sure you have downloaded the "
                       "correct model (https://spacy.io/docs/usage/)."
                       "".format(nlp.lang))

        if problem is not None:
            raise Exception(problem)
Example #10
Source File: spacy_utils.py    From rasa_nlu with Apache License 2.0 6 votes vote down vote up
def create(cls,
               component_config: Dict[Text, Any],
               config: RasaNLUModelConfig) -> 'SpacyNLP':
        """Create the component around a spaCy model loaded without its parser.

        The given component configuration is merged with the class defaults.
        If no model name is configured, the language string of ``config`` is
        used and recorded under the ``"model"`` key.
        """
        import spacy

        settings = override_defaults(cls.defaults, component_config)

        model_name = settings.get("model")
        if not model_name:
            # if no model is specified, we fall back to the language string
            model_name = config.language
            settings["model"] = model_name

        logger.info("Trying to load spacy model with "
                    "name '{}'".format(model_name))

        language_model = spacy.load(model_name, disable=['parser'])
        cls.ensure_proper_language_model(language_model)
        return cls(settings, language_model)
Example #11
Source File: spacy-fastext.py    From word2vecVN with Apache License 2.0 6 votes vote down vote up
def load_nlp(vectors_loc, lang=None):
    """Build a pipeline object whose vocab holds the vectors in ``vectors_loc``.

    The file is read in binary mode. Its first line is a header of two
    whitespace-separated fields, the second being the vector width. Every
    following line is UTF-8 text: a word followed by that many
    space-separated float values.

    :param vectors_loc: Path of the vectors file.
    :param lang: Language code for ``spacy.blank``; a bare ``Language()`` is
        used when ``None``.
    :return: The pipeline object with the vectors added to its vocab.
    """
    # A blank language class is required if you're planning to save the
    # model to disk and load it back later (models always need a "lang"
    # setting). Use 'xx' for blank multi-language class.
    nlp = Language() if lang is None else spacy.blank(lang)

    with open(vectors_loc, 'rb') as vectors_file:
        _, width_field = vectors_file.readline().split()
        width = int(width_field)
        nlp.vocab.reset_vectors(width=width)
        for raw_line in vectors_file:
            text = raw_line.rstrip().decode('utf8')
            # Split from the right so a word containing spaces stays intact.
            word, *values = text.rsplit(' ', width)
            vector = numpy.asarray([float(value) for value in values], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    return nlp
Example #12
Source File: spacy_parser.py    From fonduer with MIT License 6 votes vote down vote up
def _load_lang_model(self) -> None:
        """Load the spaCy language model for ``self.lang`` into ``self.model``.

        If ``self.lang`` is listed in ``self.languages``, the model is
        downloaded first when it is not installed and then loaded with
        ``spacy.load``. If it is listed in ``self.alpha_languages``, the
        class named there is looked up in ``spacy.lang.<lang>`` and called
        without arguments.

        Currently supported spaCy languages

        en English (50MB)
        de German (645MB)
        fr French (1.33GB)
        es Spanish (377MB)

        :raises ValueError: If ``self.lang`` is in neither ``self.languages``
            nor ``self.alpha_languages``.
        :return:
        """
        if self.lang in self.languages:
            if not SpacyParser.model_installed(self.lang):
                download(self.lang)
            model = spacy.load(self.lang)
        elif self.lang in self.alpha_languages:
            language_module = importlib.import_module(f"spacy.lang.{self.lang}")
            language_method = getattr(language_module, self.alpha_languages[self.lang])
            model = language_method()
        else:
            # Without this branch ``model`` would be unbound below and the
            # caller would only see an opaque UnboundLocalError.
            raise ValueError(f"Unsupported spaCy language: {self.lang!r}")
        self.model = model
Example #13
Source File: spacy_processors.py    From forte with Apache License 2.0 5 votes vote down vote up
def default_configs(cls):
        """
        Extend the parent's default configuration with the spaCy settings.

        Returns:
            The parent configuration updated with the ``processors``,
            ``lang`` and ``use_gpu`` entries.
        """
        spacy_defaults = {
            'processors': 'tokenize, pos, lemma',
            # Language code for the language to build the Pipeline
            'lang': 'en_core_web_sm',
            'use_gpu': False,
        }
        config = super().default_configs()
        config.update(spacy_defaults)
        return config
Example #14
Source File: spacy_processors_test.py    From forte with Apache License 2.0 5 votes vote down vote up
def setUp(self):
        """Prepare a tokenizing pipeline and a reference spaCy model.

        ``self.spacy`` holds the initialized pipeline; ``self.nlp`` holds the
        model loaded directly through ``spacy.load`` under the same name.
        """
        model_name = "en_core_web_sm"

        self.spacy = pipeline = Pipeline[DataPack]()
        pipeline.set_reader(StringReader())
        pipeline.add(
            SpacyProcessor(),
            config={
                "processors": "tokenize",
                # Language code for the language to build the Pipeline
                "lang": model_name,
                "use_gpu": False,
            },
        )
        pipeline.initialize()

        self.nlp: Language = spacy.load(model_name)
Example #15
Source File: spacy_utils.py    From Rasa_NLU_Chi with Apache License 2.0 5 votes vote down vote up
def cache_key(cls, model_metadata):
        # type: (Metadata) -> Text
        """Return the cache key ``"<component name>-<spaCy model name>"``.

        The model name is read from this component's metadata; if no
        ``model`` entry is present, the language of the model metadata
        (e.g. "en") is used instead.
        """
        meta_for_component = model_metadata.for_component(cls.name)
        fallback_name = model_metadata.language
        model_name = meta_for_component.get("model", fallback_name)

        return cls.name + "-" + model_name
Example #16
Source File: spacy_parser.py    From fonduer with MIT License 5 votes vote down vote up
def __init__(self, vocab: Vocab) -> None:
        """Create the tokenizer and remember the vocabulary it works with.

        :param vocab: Vocabulary of the spaCy language object this tokenizer
            is used with.
        """
        self.vocab = vocab
Example #17
Source File: spacy_tokenizer.py    From DeepPavlov with Apache License 2.0 5 votes vote down vote up
def _try_load_spacy_model(model_name: str, disable: Iterable[str] = ()):
    """Load a spaCy model by name, falling back to importing it as a module.

    ``spacy.load`` is tried first. If it raises ``OSError``, the module named
    ``model_name`` is imported and its ``load`` function is called instead.
    If that fallback fails for any reason, the original ``OSError`` is
    raised.

    :param model_name: Name of the model, also used as the module name.
    :param disable: Names passed to the loader as ``disable``.
    :return: The loaded model.
    """
    disabled = set(disable)
    try:
        return spacy.load(model_name, disable=disabled)
    except OSError as load_error:
        try:
            candidate = __import__(model_name).load(disable=disabled)
            if not isinstance(candidate, spacy.language.Language):
                raise RuntimeError(f'{model_name} is not a spacy model module')
        except Exception:
            # Report the primary loading failure, not the fallback's.
            raise load_error
        return candidate
Example #18
Source File: spacy_component.py    From profanity-filter with GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, profanity_filter: 'ProfanityFilter', nlp: spacy.language.Language, language: Language = None,
                 stop_on_first_profane_word: bool = False):
        """Store the collaborators and default settings of the component.

        :param profanity_filter: Filter object kept on the instance.
        :param nlp: spaCy pipeline, used only for tokenization.
        :param language: Default language stored on the instance.
        :param stop_on_first_profane_word: Default for stopping at the first
            profane word.
        """
        self._profanity_filter = profanity_filter
        self._nlp = nlp  # Used only for tokenization
        self._language = language
        self._stop_on_first_profane_word = stop_on_first_profane_word

    # noinspection PyProtectedMember 
Example #19
Source File: spacy_utils.py    From rasa-for-botfront with Apache License 2.0 5 votes vote down vote up
def cache_key(
        cls, component_meta: Dict[Text, Any], model_metadata: "Metadata"
    ) -> Optional[Text]:
        """Return the cache key ``"<component name>-<spaCy model name>"``.

        The model name is the ``model`` entry of ``component_meta``; when that
        entry is absent, the language name of ``model_metadata`` (e.g. "en")
        is used in its place.
        """
        default_model_name = model_metadata.language
        spacy_model_name = component_meta.get("model", default_model_name)

        return "-".join((cls.name, spacy_model_name))
Example #20
Source File: spacy_extractor.py    From cookiecutter-spacy-fastapi with MIT License 5 votes vote down vote up
def __init__(
        self, nlp: Language, input_id_col: str = "id", input_text_col: str = "text"
    ):
        """Set up the SpacyExtractor pipeline.

        nlp (spacy.language.Language): pre-loaded spacy language model
        input_id_col (str): property on each document to correlate with request
        input_text_col (str): property on each document to run the model on
        """
        self.nlp = nlp
        self.input_text_col = input_text_col
        self.input_id_col = input_id_col
Example #21
Source File: spacy_utils.py    From rasa_nlu with Apache License 2.0 5 votes vote down vote up
def cache_key(cls,
                  component_meta: Dict[Text, Any],
                  model_metadata: 'Metadata') -> Optional[Text]:
        """Build the key under which the loaded spaCy model is cached.

        The key is the component name, a dash, and the spaCy model name. The
        model name defaults to the language name of ``model_metadata``
        (e.g. "en") when ``component_meta`` has no ``model`` entry.
        """
        language_name = model_metadata.language
        model_name = component_meta.get("model", language_name)
        prefix = cls.name + "-"

        return prefix + model_name
Example #22
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0 5 votes vote down vote up
def _detect_languages(self, text: str) -> Languages:
        """Return the languages detected in ``text`` that this filter is set up for.

        With multilingual analysis enabled, polyglot's detector is asked and
        its ``'un'`` codes are dropped. If it is disabled, or nothing is
        detected, the first entry of ``self.languages`` is used. The result is
        always intersected with ``self.languages``.
        """
        fallback = OrderedSet([self.languages[0]])
        detected = None
        if AnalysisType.MULTILINGUAL in self.analyses:
            detector = polyglot.detect.Detector(text, quiet=True)
            detected = OrderedSet([candidate.code for candidate in detector.languages if candidate.code != 'un'])
        chosen = detected if detected else fallback
        return chosen.intersection(self.languages)
Example #23
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0 5 votes vote down vote up
def _is_profane_word(self, language: Language, word: str) -> bool:
        """Tell whether ``word`` appears in a profane-word dictionary.

        Only the dictionary of ``language`` is consulted, unless ``language``
        is ``None``, in which case all dictionaries are searched.
        """
        if language is None:
            dictionaries = self.profane_word_dictionaries.values()
        else:
            dictionaries = [self.profane_word_dictionaries[language]]
        for dictionary in dictionaries:
            if word in dictionary:
                return True
        return False
Example #24
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0 5 votes vote down vote up
def _get_trie(self, language: Language) -> Trie:
        """Return the first trie available for ``language`` or a configured language.

        ``language`` is tried first, then the entries of ``self.languages`` in
        order. ``None`` is returned when none of them has a trie.
        """
        # noinspection PyTypeChecker
        candidates = OrderedSet([language]) | self.languages
        for candidate in candidates:
            try:
                return self._trie[candidate]
            except KeyError:
                continue
        return None
Example #25
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0 5 votes vote down vote up
def _is_dictionary_word(self, language: Language, word: str) -> bool:
        """Tell whether any spell checker for ``language`` accepts ``word``.

        A ``UnicodeEncodeError`` raised during the lookup is treated as
        "not a dictionary word".
        """
        try:
            for checker in self._get_spells(language=language):
                if checker.spell(word):
                    return True
            return False
        except UnicodeEncodeError:
            return False
Example #26
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0 5 votes vote down vote up
def _lemmas(self, language: Language, word: Union[str, spacy.tokens.Token]) -> 'OrderedSet[str]':
        """Collect the base-form candidates of ``word`` in insertion order.

        The result holds the token text, the lower-cased spaCy lemma (or the
        lower-cased token text when the lemma is ``'-PRON-'``), and whatever
        ``_stems`` and ``_normal_forms`` return for the token text. A falsy
        ``word`` yields an empty set.
        """
        forms = OrderedSet()
        if word:
            token = self._make_spacy_token(language=language, word=word)
            lemma = token.lemma_
            forms.add(token.text)
            if lemma != '-PRON-':
                forms.add(lemma.lower())
            else:
                forms.add(token.lower_)
            forms |= self._stems(language=language, word=token.text)
            forms |= self._normal_forms(language=language, word=token.text)
        return forms
Example #27
Source File: profanity_filter.py    From profanity-filter with GNU General Public License v3.0 5 votes vote down vote up
def spells(self, value: Optional[Spells]) -> None:
        """Set the spell checkers used for deep analysis.

        Nothing happens unless deep analysis is enabled. Otherwise the cache
        is cleared and ``value`` is stored. When ``value`` is ``None``, one
        ``HunSpell`` per configured language is built from the ``.dic`` /
        ``.aff`` files in the data directory, skipping languages for which
        that raises ``HunSpellError``. If none could be built, deep analysis
        is removed from ``self.analyses``.
        """
        if AnalysisType.DEEP not in self.analyses:
            return

        self.clear_cache()
        if value is not None:
            self._spells = value
            return

        self._spells = {}
        for language in self._languages:
            dictionary_file = self._DATA_DIR / f'{language}.dic'
            affix_file = self._DATA_DIR / f'{language}.aff'
            with suppress(HunSpellError):
                self._spells[language] = HunSpell(dictionary_file, affix_file)
        if not self._spells:
            self.analyses -= {AnalysisType.DEEP}
Example #28
Source File: spacy_component.py    From profanity-filter with GNU General Public License v3.0 5 votes vote down vote up
def _censor_spaceless_span(self, span: Span, language: Language) -> Span:
        """Check ``span`` as a single word and record the outcome on its tokens.

        A span of more than one token is checked by its string form, a shorter
        one by its first token. If the word is profane, the span is merged
        into one token that receives the censored text and the original
        profane word. Otherwise every token's censored text is set to its own
        text. The span passed in is returned.
        """
        word = span[0] if len(span) <= 1 else str(span)
        candidate = spacy_utlis.make_token(nlp=self._nlp, word=word)
        verdict = self._profanity_filter.censor_word(word=candidate, language=language)

        if not verdict.is_profane:
            for token in span:
                token._.censored = token.text
            return span

        with span.doc.retokenize() as retokenizer:
            retokenizer.merge(span)
        merged_token = span[0]
        merged_token._.censored = verdict.censored
        merged_token._.original_profane_word = verdict.original_profane_word
        return span
Example #29
Source File: spacy_utlis.py    From profanity-filter with GNU General Public License v3.0 5 votes vote down vote up
def parse(nlp: spacy.language.Language,
          text: str, language: Language = None,
          use_profanity_filter: bool = False) -> Union[Doc, Token]:
    """Run ``nlp`` on ``text`` with the profanity filter component on or off.

    With ``use_profanity_filter`` set, no component is disabled and
    ``language`` is passed to the profanity filter component through
    ``component_cfg``. Otherwise that component is disabled and no component
    configuration is passed.
    """
    if use_profanity_filter:
        disable = []
        component_cfg = {
            SpacyProfanityFilterComponent.name: {
                'language': language,
            },
        }
    else:
        disable = [SpacyProfanityFilterComponent.name]
        component_cfg = {}
    return nlp(text, disable=disable, component_cfg=component_cfg)
Example #30
Source File: spacy_utlis.py    From profanity-filter with GNU General Public License v3.0 5 votes vote down vote up
def make_token(nlp: spacy.language.Language, word: Union[str, Token]) -> Token:
    """Return ``word`` as a single token.

    Anything that already has a ``text`` attribute is returned unchanged.
    Other input is parsed with ``parse``, the resulting document is merged
    into one token, and that token is returned.
    """
    if not hasattr(word, 'text'):
        doc = parse(nlp=nlp, text=word)
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[:])
        word = doc[0]
    return word