Python spacy.language.Language() Examples

The following are 30 code examples of spacy.language.Language(). You can go to the original project or source file by following the links above each example, or browse all available functions and classes of the module spacy.language.
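For orientation, here is a minimal sketch of obtaining and using a Language object. It uses spacy.blank, so no trained model download is required; the sample sentence is invented for illustration.

import spacy
from spacy.language import Language

nlp: Language = spacy.blank("en")  # blank pipeline: tokenizer only, no trained components
doc = nlp("spaCy exposes every pipeline as a Language instance.")
print([token.text for token in doc])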
Example #1
Source File: import_annotations.py    From anonymisation with Apache License 2.0
def convert_to_flair_format(spacy_model: Language, data: List[Tuple[str, List[Offset]]]) -> List[str]:
    result: List[str] = list()
    for text, offsets in data:
        doc: Doc = spacy_model(text)
        # remove duplicated offsets
        offsets = normalize_offsets(offsets=offsets)
        offset_tuples = list(set([offset.to_tuple() for offset in offsets]))
        gold_annotations = GoldParse(doc, entities=offset_tuples)
        annotations: List[str] = gold_annotations.ner
        assert len(annotations) == len(doc)
        # Flair uses BIOES and Spacy BILUO
        # BILUO for Begin, Inside, Last, Unit, Out
        # BIOES for Begin, Inside, Outside, End, Single
        annotations = [a.replace('L-', 'E-') for a in annotations]
        annotations = [a.replace('U-', 'S-') for a in annotations]
        annotations = ["O" if a == "-" else a for a in annotations]  # replace unknown
        result += [f"{word} {tag}\n" for word, tag in zip(doc, annotations)]
        result.append('\n')
    return result 
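The three list passes at the end of Example #1 carry the whole BILUO-to-BIOES conversion. A small worked example with an invented annotation list shows the effect:

# hypothetical BILUO tags for a five-token document ('-' marks an unknown tag)
annotations = ["B-PER", "L-PER", "O", "U-LOC", "-"]
annotations = [a.replace('L-', 'E-') for a in annotations]
annotations = [a.replace('U-', 'S-') for a in annotations]
annotations = ["O" if a == "-" else a for a in annotations]
print(annotations)  # ['B-PER', 'E-PER', 'O', 'S-LOC', 'O']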
Example #2
Source File: file.py    From stog with MIT License
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """

    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(f"Spacy models '{spacy_model_name}' not found.  Downloading and installing.")
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options] 
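The function relies on a module-level cache that the excerpt does not show. A minimal sketch of the assumed setup (the alias and the dict reconstruction are assumptions, not the project's verbatim code):

from typing import Dict, Tuple

from spacy.language import Language

SpacyModelType = Language  # assumed alias behind the return annotation
LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool], SpacyModelType] = {}

With the cache in place, repeated calls with identical options return the same object:

nlp_a = get_spacy_model("en_core_web_sm", pos_tags=True, parse=False, ner=False)
nlp_b = get_spacy_model("en_core_web_sm", pos_tags=True, parse=False, ner=False)
assert nlp_a is nlp_b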
Example #3
Source File: spacy_processors_test.py    From forte with Apache License 2.0
def test_neg_spacy_processor(self):
        spacy = Pipeline[DataPack]()
        spacy.set_reader(StringReader())

        config = {
            "processors": 'ner',
            "lang": "xx_ent_wiki_sm",
            # Language code for the language to build the Pipeline
            "use_gpu": False
        }
        spacy.add(SpacyProcessor(), config=config)
        spacy.initialize()

        sentences = ["This tool is called Forte.",
                     "The goal of this project to help you build NLP "
                     "pipelines.",
                     "NLP has never been made this easy before."]
        document = ' '.join(sentences)
        with self.assertRaises(ProcessExecutionException):
            _ = spacy.process(document) 
Example #4
Source File: spacy_utils.py    From Rasa_NLU_Chi with Apache License 2.0
def ensure_proper_language_model(nlp):
        # type: (Optional[Language]) -> None
        """Checks if the spacy language model is properly loaded.

        Raises an exception if the model is invalid."""

        if nlp is None:
            raise Exception("Failed to load spacy language model. "
                            "Loading the model returned 'None'.")
        if nlp.path is None:
            # Spacy sets the path to `None` if
            # it did not load the model from disk.
            # In this case `nlp` is an unusable stub.
            raise Exception("Failed to load spacy language model for "
                            "lang '{}'. Make sure you have downloaded the "
                            "correct model (https://spacy.io/docs/usage/)."
                            "".format(nlp.lang)) 
Example #5
Source File: count_word_frequencies.py    From scispacy with Apache License 2.0
def count_frequencies(language_class: Language, input_path: Path):
    """
    Given a file containing single documents per line
    (for scispacy, these are Pubmed abstracts), split the text
    using a science specific tokenizer and compute word and
    document frequencies for all words.
    """
    print(f"Processing {input_path}.")
    tokenizer = combined_rule_tokenizer(language_class())
    counts = Counter()
    doc_counts = Counter()
    with open(input_path, "r") as input_file:
        for line in input_file:
            words = [t.text for t in tokenizer(line)]
            counts.update(words)
            doc_counts.update(set(words))

    return counts, doc_counts 
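A hedged usage sketch (the input file name is hypothetical). Note that the function expects the Language class itself, not an instance, because it calls language_class() when building the tokenizer:

from pathlib import Path

from spacy.lang.en import English

# "abstracts.txt" is a hypothetical input file with one document per line
counts, doc_counts = count_frequencies(English, Path("abstracts.txt"))
print(counts.most_common(5))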
Example #6
Source File: flair_generate_html_from_txt.py    From anonymisation with Apache License 2.0
def main(data_folder: str, output_folder: str, model_folder: str) -> None:
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [filename for filename in os.listdir(data_folder) if filename.endswith(".txt")]
    tagger: SequenceTagger = SequenceTagger.load(os.path.join(model_folder, 'best-model.pt'))

    for filename in tqdm(iterable=filenames, unit=" txt", desc="anonymize cases"):
        with open(os.path.join(data_folder, filename), 'r') as input_f:
            sentences = tagger.predict(sentences=input_f.readlines(),
                                       mini_batch_size=32,
                                       verbose=False,
                                       use_tokenizer=tokenizer)
            case_name = filename.split('.')[0]
            page_html = render_ner_html(sentences, colors=colors, title=case_name)

            with open(os.path.join(output_folder, case_name + ".html"), "w") as output:
                output.write(page_html) 
Example #7
Source File: spacy_utils.py    From rasa-for-botfront with Apache License 2.0
def ensure_proper_language_model(nlp: Optional["Language"]) -> None:
        """Checks if the spacy language model is properly loaded.

        Raises an exception if the model is invalid."""

        if nlp is None:
            raise Exception(
                "Failed to load spacy language model. "
                "Loading the model returned 'None'."
            )
        if nlp.path is None:
            # Spacy sets the path to `None` if
            # it did not load the model from disk.
            # In this case `nlp` is an unusable stub.
            raise Exception(
                "Failed to load spacy language model for "
                "lang '{}'. Make sure you have downloaded the "
                "correct model (https://spacy.io/docs/usage/)."
                "".format(nlp.lang)
            ) 
Example #8
Source File: word_freqs.py    From Blackstone with Apache License 2.0
def count_frequencies(language_class: Language, input_path: Path):
    """
    Given a file containing one document per line
    (in this case, sentences from the ICLR case law corpus), split the text
    using a basic whitespace tokenizer and compute word and
    document frequencies for all words.
    """
    print(f"Processing {input_path}.")
    nlp = English()
    # tokenizer = combined_rule_tokenizer(language_class())  # scispacy tokenizer, unused here
    tokenizer = Tokenizer(nlp.vocab)
    counts = Counter()
    doc_counts = Counter()
    with open(input_path, "r") as input_file:
        for line in tqdm.tqdm(input_file):
            words = [t.text for t in tokenizer(line)]
            counts.update(words)
            doc_counts.update(set(words))

    return counts, doc_counts 
Example #9
Source File: file.py    From gtos with MIT License
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """

    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(f"Spacy models '{spacy_model_name}' not found.  Downloading and installing.")
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options] 
Example #10
Source File: spacy_utils.py    From rasa_nlu with Apache License 2.0
def ensure_proper_language_model(nlp: Optional['Language']) -> None:
        """Checks if the spacy language model is properly loaded.

        Raises an exception if the model is invalid."""

        if nlp is None:
            raise Exception("Failed to load spacy language model. "
                            "Loading the model returned 'None'.")
        if nlp.path is None:
            # Spacy sets the path to `None` if
            # it did not load the model from disk.
            # In this case `nlp` is an unusable stub.
            raise Exception("Failed to load spacy language model for "
                            "lang '{}'. Make sure you have downloaded the "
                            "correct model (https://spacy.io/docs/usage/)."
                            "".format(nlp.lang)) 
Example #11
Source File: language.py    From spacy-udpipe with MIT License
def load_from_path(
    lang: str,
    path: str,
    meta: Optional[Dict] = {"description": "custom model"},
    **kwargs
) -> UDPipeLanguage:
    """Convenience function for initializing the Language class and loading
    a custom UDPipe model via the path argument.

    lang: ISO 639-1 language code or shorthand UDPipe model name.
    path: Path to the UDPipe model.
    meta: Optional meta-information about the UDPipe model.
    kwargs: Optional config parameters.
    RETURNS: The UDPipeLanguage object.
    """
    model = UDPipeModel(lang=lang, path=path, meta=meta)
    nlp = UDPipeLanguage(udpipe_model=model, meta=model._meta, **kwargs)
    return nlp 
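A hedged usage sketch (the model path is hypothetical):

nlp = load_from_path(lang="en", path="/path/to/english-ud.udpipe")  # path is hypothetical
doc = nlp("UDPipe tokenizes, tags and parses in a single pass.")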
Example #12
Source File: spacy-fastext.py    From word2vecVN with Apache License 2.0
def load_nlp(vectors_loc, lang=None):
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    return nlp 
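The reader above assumes a word2vec-style plain-text vectors file: a header line giving the row and dimension counts, then one word followed by its vector components per line. A tiny hypothetical file with two 3-dimensional vectors:

2 3
hello 0.1 0.2 0.3
world 0.4 0.5 0.6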
Example #13
Source File: language.py    From spacy-udpipe with MIT License
def load(lang: str, **kwargs) -> UDPipeLanguage:
    """Convenience function for initializing the Language class that
    mimics spacy.load.

    lang: ISO 639-1 language code or shorthand UDPipe model name.
    kwargs: Optional config parameters.
    RETURNS: The UDPipeLanguage object.
    """
    model = UDPipeModel(lang=lang, path=None, meta=None)
    nlp = UDPipeLanguage(udpipe_model=model, meta=model._meta, **kwargs)
    return nlp 
Example #14
Source File: spacy_featurizer.py    From rasa_nlu with Apache License 2.0
def ndim(spacy_nlp: 'Language') -> int:
    """Number of features used to represent a document / sentence."""
    return spacy_nlp.vocab.vectors_length 
Example #15
Source File: spacy_processors_test.py    From forte with Apache License 2.0
def setUp(self):
        self.spacy = Pipeline[DataPack]()
        self.spacy.set_reader(StringReader())

        config = {
            "processors": "tokenize",
            "lang": "en_core_web_sm",
            # Language code for the language to build the Pipeline
            "use_gpu": False
        }
        self.spacy.add(SpacyProcessor(), config=config)
        self.spacy.initialize()

        self.nlp: Language = spacy.load(config['lang']) 
Example #16
Source File: spacy_processors.py    From forte with Apache License 2.0
def default_configs(cls):
        """
        This defines a basic config structure for spaCy.
        Returns:

        """
        config = super().default_configs()
        config.update({
            'processors': 'tokenize, pos, lemma',
            'lang': 'en_core_web_sm',
            # Language code for the language to build the Pipeline
            'use_gpu': False,
        })
        return config 
Example #17
Source File: spacy_processors.py    From forte with Apache License 2.0
def __init__(self):
        super().__init__()
        self.processors: str = ""
        self.nlp: Optional[Language] = None
        self.lang_model: str = '' 
Example #18
Source File: spacy_featurizer.py    From Rasa_NLU_Chi with Apache License 2.0
def ndim(spacy_nlp):
    # type: (Language) -> int
    """Number of features used to represent a document / sentence."""
    return spacy_nlp.vocab.vectors_length
Example #19
Source File: spacy_extractor.py    From cookiecutter-spacy-fastapi with MIT License
def __init__(
        self, nlp: Language, input_id_col: str = "id", input_text_col: str = "text"
    ):
        """Initialize the SpacyExtractor pipeline.
        
        nlp (spacy.language.Language): pre-loaded spacy language model
        input_text_col (str): property on each document to run the model on
        input_id_col (str): property on each document to correlate with request

        RETURNS (SpacyExtractor): The newly constructed object.
        """
        self.nlp = nlp
        self.input_id_col = input_id_col
        self.input_text_col = input_text_col 
Example #20
Source File: spacy_utils.py    From Rasa_NLU_Chi with Apache License 2.0
def __init__(self, component_config=None, nlp=None):
        # type: (Dict[Text, Any], Language) -> None

        self.nlp = nlp
        super(SpacyNLP, self).__init__(component_config) 
Example #21
Source File: util.py    From allennlp with Apache License 2.0
def get_spacy_model(
    spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool
) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """

    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ["vectors", "textcat"]
        if not pos_tags:
            disable.append("tagger")
        if not parse:
            disable.append("parser")
        if not ner:
            disable.append("ner")
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(
                f"Spacy models '{spacy_model_name}' not found.  Downloading and installing."
            )
            spacy_download(spacy_model_name)

            # Import the downloaded model module directly and load from there
            spacy_model_module = __import__(spacy_model_name)
            spacy_model = spacy_model_module.load(disable=disable)  # type: ignore

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options] 
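Unlike Examples #2 and #9, this version does not simply retry spacy.load() after the download; it imports the freshly installed model package and calls that package's own load(). On older spaCy versions, a model downloaded within the same process may not yet be visible to spacy.load(), so importing the package directly is the safer path.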
Example #22
Source File: train_utils.py    From scispacy with Apache License 2.0
def evaluate_ner(
    nlp: Language, eval_data, dump_path: Optional[str] = None, verbose: bool = False
) -> PerClassScorer:

    scorer = PerClassScorer()
    print("Evaluating %d rows" % len(eval_data))
    for i, (text, gold_spans) in enumerate(tqdm.tqdm(eval_data)):

        # parse dev data with trained model
        doc = nlp(text)
        predicted_spans = [
            (ent.start_char, ent.end_char, ent.label_) for ent in doc.ents
        ]
        scorer(predicted_spans, gold_spans["entities"])

        if i % 1000 == 0 and i > 0:
            for name, metric in scorer.get_metric().items():
                print(f"{name}: {metric}")

    metrics = scorer.get_metric()
    if dump_path is not None:
        with open(dump_path, "a+") as dump_file:
            json.dump(metrics, dump_file)
    for name, metric in metrics.items():
        if "overall" in name or "untyped" in name or verbose:
            print(f"{name}: \t\t {metric}")

    return metrics 
Example #23
Source File: util.py    From scispacy with Apache License 2.0
def create_combined_rule_model() -> Language:
    nlp = spacy.load("en_core_web_sm")
    nlp.tokenizer = combined_rule_tokenizer(nlp)
    nlp.add_pipe(pysbd_sentencizer, first=True)
    return nlp 
Example #24
Source File: util.py    From scispacy with Apache License 2.0
def save_model(nlp: Language, output_path: str):
    nlp.to_disk(output_path) 
Example #25
Source File: skills.py    From SkillsExtractorCognitiveSearch with MIT License
def __init__(self, nlp: Language, data_path: Path = Path("data")):
        self.nlp = nlp
        self.data_path = data_path
        self.skills = self._get_skills()

        patterns = self._build_patterns(self.skills)
        extra_patterns = self._get_extra_skill_patterns()
        ruler = EntityRuler(nlp, overwrite_ents=True)
        ruler.add_patterns(itertools.chain(patterns, extra_patterns))
        if not self.nlp.has_pipe("skills_ruler"):
            self.nlp.add_pipe(ruler, name="skills_ruler") 
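_build_patterns and _get_extra_skill_patterns are not shown in the excerpt; for reference, spaCy's EntityRuler accepts pattern dicts of the following shape (the skill pattern itself is hypothetical):

ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns([
    {"label": "SKILL", "pattern": [{"LOWER": "machine"}, {"LOWER": "learning"}]},  # hypothetical pattern
])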
Example #26
Source File: flair_generate_html_from_xml.py    From anonymisation with Apache License 2.0
def main(data_folder: str, model_folder: str, top_n: int) -> None:
    print(f"keep only top {top_n} examples per file")
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [filename for filename in os.listdir(data_folder) if filename.endswith(".xml")]
    sentences: List[Sentence] = list()
    with tqdm(total=len(filenames), unit=" XML", desc="Parsing XML") as progress_bar:
        for filename in filenames:
            paragraphs: List[Paragraph] = get_paragraph_from_file(path=os.path.join(data_folder, filename),
                                                                  keep_paragraph_without_annotation=True)
            if len(paragraphs) > top_n:
                for paragraph in paragraphs[:top_n]:
                    if len(paragraph.text) > 0:
                        s = Sentence(text=paragraph.text, tokenizer=tokenizer)
                        sentences.append(s)
            progress_bar.update()
    if len(sentences) == 0:
        raise Exception("No example loaded, causes: no cases in provided path or sample size is to high")

    tagger: SequenceTagger = SequenceTagger.load(os.path.join(model_folder, 'best-model.pt'))
    _ = tagger.predict(sentences=sentences,
                       mini_batch_size=32,
                       verbose=True)

    print("prepare html")
    page_html = render_ner_html(sentences, colors=colors)
    print("write html")
    with open("sentence.html", "w") as writer:
        writer.write(page_html) 
Example #27
Source File: spacy_parser.py    From fonduer with MIT License
def __init__(self, lang: Optional[str]) -> None:
        """Initialize SpacyParser."""
        self.name = "spacy"

        self.lang = lang
        self.model: Optional[Language] = None
        if self.has_tokenizer_support():
            self._load_lang_model() 
Example #28
Source File: import_annotations.py    From anonymisation with Apache License 2.0
def prepare_flair_train_test_corpus(spacy_model: Language, data_folder: str, dev_size: float,
                                    nb_segment: Optional[int], segment: Optional[int]) -> Corpus:

    all_annotated_files: List[str] = [os.path.join(data_folder, filename)
                                      for filename in os.listdir(data_folder) if filename.endswith(".txt")]
    if nb_segment is None and segment is None:
        random.shuffle(all_annotated_files)
        nb_doc_dev_set: int = int(len(all_annotated_files) * dev_size)
        dev_file_names = all_annotated_files[0:nb_doc_dev_set]
    else:
        assert segment < nb_segment
        all_segments = np.array_split(all_annotated_files, nb_segment)
        dev_file_names = list(all_segments[segment])
        print(dev_file_names)

    train_file_names = [file for file in all_annotated_files if file not in dev_file_names]

    train_path = export_data_set_flair_format(spacy_model, train_file_names)
    dev_path = export_data_set_flair_format(spacy_model, dev_file_names)

    corpus: Corpus = ColumnCorpus(data_folder=tempfile.gettempdir(),
                                  column_format={0: 'text', 1: 'ner'},
                                  train_file=os.path.basename(train_path),
                                  dev_file=os.path.basename(dev_path),
                                  test_file=os.path.basename(dev_path))
    return corpus 
Example #29
Source File: import_annotations.py    From anonymisation with Apache License 2.0
def export_data_set_flair_format(spacy_model: Language, data_file_names: List[str]) -> str:
    data = load_content(txt_paths=data_file_names)
    data_flair_format = convert_to_flair_format(spacy_model, data)
    f = tempfile.NamedTemporaryFile(delete=False, mode="w")
    tmp_path = f.name
    f.writelines(data_flair_format)
    f.close()
    return tmp_path 
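Examples #1, #28 and this one come from the same anonymisation project and chain together: convert_to_flair_format (Example #1) turns spaCy annotations into Flair's two-column format, export_data_set_flair_format writes that output to a temporary file, and prepare_flair_train_test_corpus (Example #28) splits the annotated files into train and dev sets and wraps the exported files in a Flair ColumnCorpus.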
Example #30
Source File: language.py    From spacy-udpipe with MIT License
def __init__(
        self,
        udpipe_model: UDPipeModel,
        meta: Optional[Dict] = None,
        **kwargs
    ):
        """Initialize the Language class.

        The language is called "udpipe_en" instead of "en" in order to
        avoid any potential conflicts with spaCy's built-in languages.
        Using entry points, this enables serializing and deserializing
        the language class: a "lang": "udpipe_en" entry in meta.json will
        automatically instantiate this class if this package is available.

        udpipe_model: The loaded UDPipe model.
        meta: spaCy model metadata.
        kwargs: Optional config parameters.
        """
        self.udpipe = udpipe_model
        self.Defaults = get_defaults(lang=udpipe_model._lang)
        self.lang = f"udpipe_{udpipe_model._lang}"
        ignore_tag_map = kwargs.get("ignore_tag_map", False)
        if ignore_tag_map:
            self.Defaults.tag_map = {}  # workaround for ValueError: [E167]
        self.vocab = self.Defaults.create_vocab()
        self.tokenizer = UDPipeTokenizer(model=self.udpipe, vocab=self.vocab)
        self.pipeline = []
        self.max_length = kwargs.get("max_length", 10 ** 6)
        self._meta = self.udpipe._meta if meta is None else dict(meta)
        self._path = None
        self._optimizer = None