Python spacy.language.Language() Examples

The following are 30 code examples of spacy.language.Language(). You can go to the original project or source file by following the links above each example, or browse all available functions and classes of the module spacy.language.
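For orientation, here is a minimal sketch of obtaining and using a Language object. It uses spacy.blank, so no trained model download is required; the sample sentence is invented for illustration.

import spacy
from spacy.language import Language

nlp: Language = spacy.blank("en")  # blank pipeline: tokenizer only, no trained components
doc = nlp("spaCy exposes every pipeline as a Language instance.")
print([token.text for token in doc])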
Example #1
Source File: import_annotations.py    From anonymisation with Apache License 2.0
def convert_to_flair_format(spacy_model: Language, data: List[Tuple[str, List[Offset]]]) -> List[str]:
    result: List[str] = list()
    for text, offsets in data:
        doc: Doc = spacy_model(text)
        # remove duplicated offsets
        offsets = normalize_offsets(offsets=offsets)
        offset_tuples = list(set([offset.to_tuple() for offset in offsets]))
        gold_annotations = GoldParse(doc, entities=offset_tuples)
        annotations: List[str] = gold_annotations.ner
        assert len(annotations) == len(doc)
        # Flair uses BIOES and Spacy BILUO
        # BILUO for Begin, Inside, Last, Unit, Out
        # BIOES for Begin, Inside, Outside, End, Single
        annotations = [a.replace('L-', 'E-') for a in annotations]
        annotations = [a.replace('U-', 'S-') for a in annotations]
        annotations = ["O" if a == "-" else a for a in annotations]  # replace unknown
        result += [f"{word} {tag}\n" for word, tag in zip(doc, annotations)]
        result.append('\n')
    return result 
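The three list passes at the end of Example #1 carry the whole BILUO-to-BIOES conversion. A small worked example with an invented annotation list shows the effect:

# hypothetical BILUO tags for a five-token document ('-' marks an unknown tag)
annotations = ["B-PER", "L-PER", "O", "U-LOC", "-"]
annotations = [a.replace('L-', 'E-') for a in annotations]
annotations = [a.replace('U-', 'S-') for a in annotations]
annotations = ["O" if a == "-" else a for a in annotations]
print(annotations)  # ['B-PER', 'E-PER', 'O', 'S-LOC', 'O']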
Example #2
Source File: file.py    From stog with MIT License
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """

    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(f"Spacy models '{spacy_model_name}' not found.  Downloading and installing.")
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options] 
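The function relies on a module-level cache that the excerpt does not show. A minimal sketch of the assumed setup (the alias and the dict reconstruction are assumptions, not the project's verbatim code):

from typing import Dict, Tuple

from spacy.language import Language

SpacyModelType = Language  # assumed alias behind the return annotation
LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool], SpacyModelType] = {}

With the cache in place, repeated calls with identical options return the same object:

nlp_a = get_spacy_model("en_core_web_sm", pos_tags=True, parse=False, ner=False)
nlp_b = get_spacy_model("en_core_web_sm", pos_tags=True, parse=False, ner=False)
assert nlp_a is nlp_b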
Example #3
Source File: spacy_processors_test.py    From forte with Apache License 2.0
def test_neg_spacy_processor(self):
        spacy = Pipeline[DataPack]()
        spacy.set_reader(StringReader())

        config = {
            "processors": 'ner',
            "lang": "xx_ent_wiki_sm",
            # Language code for the language to build the Pipeline
            "use_gpu": False
        }
        spacy.add(SpacyProcessor(), config=config)
        spacy.initialize()

        sentences = ["This tool is called Forte.",
                     "The goal of this project to help you build NLP "
                     "pipelines.",
                     "NLP has never been made this easy before."]
        document = ' '.join(sentences)
        with self.assertRaises(ProcessExecutionException):
            _ = spacy.process(document) 
Example #4
Source File: spacy_utils.py    From Rasa_NLU_Chi with Apache License 2.0
def ensure_proper_language_model(nlp):
        # type: (Optional[Language]) -> None
        """Checks if the spacy language model is properly loaded.

        Raises an exception if the model is invalid."""

        if nlp is None:
            raise Exception("Failed to load spacy language model. "
                            "Loading the model returned 'None'.")
        if nlp.path is None:
            # Spacy sets the path to `None` if
            # it did not load the model from disk.
            # In this case `nlp` is an unusable stub.
            raise Exception("Failed to load spacy language model for "
                            "lang '{}'. Make sure you have downloaded the "
                            "correct model (https://spacy.io/docs/usage/)."
                            "".format(nlp.lang)) 
Example #5
Source File: count_word_frequencies.py    From scispacy with Apache License 2.0
def count_frequencies(language_class: Language, input_path: Path):
    """
    Given a file containing single documents per line
    (for scispacy, these are Pubmed abstracts), split the text
    using a science specific tokenizer and compute word and
    document frequencies for all words.
    """
    print(f"Processing {input_path}.")
    tokenizer = combined_rule_tokenizer(language_class())
    counts = Counter()
    doc_counts = Counter()
    with open(input_path, "r") as input_file:
        for line in input_file:
            words = [t.text for t in tokenizer(line)]
            counts.update(words)
            doc_counts.update(set(words))

    return counts, doc_counts 
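A hedged usage sketch (the input file name is hypothetical). Note that the function expects the Language class itself, not an instance, because it calls language_class() when building the tokenizer:

from pathlib import Path

from spacy.lang.en import English

# "abstracts.txt" is a hypothetical input file with one document per line
counts, doc_counts = count_frequencies(English, Path("abstracts.txt"))
print(counts.most_common(5))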
Example #6
Source File: flair_generate_html_from_txt.py    From anonymisation with Apache License 2.0
def main(data_folder: str, output_folder: str, model_folder: str) -> None:
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [filename for filename in os.listdir(data_folder) if filename.endswith(".txt")]
    tagger: SequenceTagger = SequenceTagger.load(os.path.join(model_folder, 'best-model.pt'))

    for filename in tqdm(iterable=filenames, unit=" txt", desc="anonymize cases"):
        with open(os.path.join(data_folder, filename), 'r') as input_f:
            sentences = tagger.predict(sentences=input_f.readlines(),
                                       mini_batch_size=32,
                                       verbose=False,
                                       use_tokenizer=tokenizer)
            case_name = filename.split('.')[0]
            page_html = render_ner_html(sentences, colors=colors, title=case_name)

            with open(os.path.join(output_folder, case_name + ".html"), "w") as output:
                output.write(page_html) 
Example #7
Source File: spacy_utils.py    From rasa-for-botfront with Apache License 2.0
def ensure_proper_language_model(nlp: Optional["Language"]) -> None:
        """Checks if the spacy language model is properly loaded.

        Raises an exception if the model is invalid."""

        if nlp is None:
            raise Exception(
                "Failed to load spacy language model. "
                "Loading the model returned 'None'."
            )
        if nlp.path is None:
            # Spacy sets the path to `None` if
            # it did not load the model from disk.
            # In this case `nlp` is an unusable stub.
            raise Exception(
                "Failed to load spacy language model for "
                "lang '{}'. Make sure you have downloaded the "
                "correct model (https://spacy.io/docs/usage/)."
                "".format(nlp.lang)
            ) 
Example #8
Source File: word_freqs.py    From Blackstone with Apache License 2.0
def count_frequencies(language_class: Language, input_path: Path):
    """
    Given a file containing one document per line
    (in this case, sentences from the ICLR case law corpus), split the text
    using a basic whitespace tokenizer and compute word and
    document frequencies for all words.
    """
    print(f"Processing {input_path}.")
    nlp = English()
    # tokenizer = combined_rule_tokenizer(language_class())  # scispacy tokenizer, unused here
    tokenizer = Tokenizer(nlp.vocab)
    counts = Counter()
    doc_counts = Counter()
    with open(input_path, "r") as input_file:
        for line in tqdm.tqdm(input_file):
            words = [t.text for t in tokenizer(line)]
            counts.update(words)
            doc_counts.update(set(words))

    return counts, doc_counts 
Example #9
Source File: file.py    From gtos with MIT License
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """

    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(f"Spacy models '{spacy_model_name}' not found.  Downloading and installing.")
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options] 
Example #10
Source File: spacy_utils.py    From rasa_nlu with Apache License 2.0
def ensure_proper_language_model(nlp: Optional['Language']) -> None:
        """Checks if the spacy language model is properly loaded.

        Raises an exception if the model is invalid."""

        if nlp is None:
            raise Exception("Failed to load spacy language model. "
                            "Loading the model returned 'None'.")
        if nlp.path is None:
            # Spacy sets the path to `None` if
            # it did not load the model from disk.
            # In this case `nlp` is an unusable stub.
            raise Exception("Failed to load spacy language model for "
                            "lang '{}'. Make sure you have downloaded the "
                            "correct model (https://spacy.io/docs/usage/)."
                            "".format(nlp.lang)) 
Example #11
Source File: language.py    From spacy-udpipe with MIT License
def load_from_path(
    lang: str,
    path: str,
    meta: Optional[Dict] = {"description": "custom model"},
    **kwargs
) -> UDPipeLanguage:
    """Convenience function for initializing the Language class and loading
    a custom UDPipe model via the path argument.

    lang: ISO 639-1 language code or shorthand UDPipe model name.
    path: Path to the UDPipe model.
    meta: Optional meta-information about the UDPipe model.
    kwargs: Optional config parameters.
    RETURNS: The UDPipeLanguage object.
    """
    model = UDPipeModel(lang=lang, path=path, meta=meta)
    nlp = UDPipeLanguage(udpipe_model=model, meta=model._meta, **kwargs)
    return nlp 
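A hedged usage sketch (the model path is hypothetical):

nlp = load_from_path(lang="en", path="/path/to/english-ud.udpipe")  # path is hypothetical
doc = nlp("UDPipe tokenizes, tags and parses in a single pass.")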
Example #12
Source File: spacy-fastext.py    From word2vecVN with Apache License 2.0
def load_nlp(vectors_loc, lang=None):
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    return nlp 
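The reader above assumes a word2vec-style plain-text vectors file: a header line giving the row and dimension counts, then one word followed by its vector components per line. A tiny hypothetical file with two 3-dimensional vectors:

2 3
hello 0.1 0.2 0.3
world 0.4 0.5 0.6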
Example #13
Source File: language.py    From spacy-udpipe with MIT License
def load(lang: str, **kwargs) -> UDPipeLanguage:
    """Convenience function for initializing the Language class that
    mimics spacy.load.

    lang: ISO 639-1 language code or shorthand UDPipe model name.
    kwargs: Optional config parameters.
    RETURNS: The UDPipeLanguage object.
    """
    model = UDPipeModel(lang=lang, path=None, meta=None)
    nlp = UDPipeLanguage(udpipe_model=model, meta=model._meta, **kwargs)
    return nlp 
Example #14
Source File: spacy_featurizer.py    From rasa_nlu with Apache License 2.0
def ndim(spacy_nlp: 'Language') -> int:
    """Number of features used to represent a document / sentence."""
    return spacy_nlp.vocab.vectors_length 
Example #15
Source File: spacy_processors_test.py    From forte with Apache License 2.0
def setUp(self):
        self.spacy = Pipeline[DataPack]()
        self.spacy.set_reader(StringReader())

        config = {
            "processors": "tokenize",
            "lang": "en_core_web_sm",
            # Language code for the language to build the Pipeline
            "use_gpu": False
        }
        self.spacy.add(SpacyProcessor(), config=config)
        self.spacy.initialize()

        self.nlp: Language = spacy.load(config['lang']) 
Example #16
Source File: spacy_processors.py    From forte with Apache License 2.0
def default_configs(cls):
        """
        This defines a basic config structure for spaCy.
        Returns:

        """
        config = super().default_configs()
        config.update({
            'processors': 'tokenize, pos, lemma',
            'lang': 'en_core_web_sm',
            # Language code for the language to build the Pipeline
            'use_gpu': False,
        })
        return config 
Example #17
Source File: spacy_processors.py    From forte with Apache License 2.0
def __init__(self):
        super().__init__()
        self.processors: str = ""
        self.nlp: Optional[Language] = None
        self.lang_model: str = '' 
Example #18
Source File: spacy_featurizer.py    From Rasa_NLU_Chi with Apache License 2.0
def ndim(spacy_nlp):
    # type: (Language) -> int
    """Number of features used to represent a document / sentence."""
    return spacy_nlp.vocab.vectors_length
Example #19
Source File: spacy_extractor.py    From cookiecutter-spacy-fastapi with MIT License
def __init__(
        self, nlp: Language, input_id_col: str = "id", input_text_col: str = "text"
    ):
        """Initialize the SpacyExtractor pipeline.
        
        nlp (spacy.language.Language): pre-loaded spacy language model
        input_text_col (str): property on each document to run the model on
        input_id_col (str): property on each document to correlate with request

        RETURNS (SpacyExtractor): The newly constructed object.
        """
        self.nlp = nlp
        self.input_id_col = input_id_col
        self.input_text_col = input_text_col 
Example #20
Source File: spacy_utils.py    From Rasa_NLU_Chi with Apache License 2.0
def __init__(self, component_config=None, nlp=None):
        # type: (Dict[Text, Any], Language) -> None

        self.nlp = nlp
        super(SpacyNLP, self).__init__(component_config) 
Example #21
Source File: util.py    From allennlp with Apache License 2.0
def get_spacy_model(
    spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool
) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """

    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ["vectors", "textcat"]
        if not pos_tags:
            disable.append("tagger")
        if not parse:
            disable.append("parser")
        if not ner:
            disable.append("ner")
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(
                f"Spacy models '{spacy_model_name}' not found.  Downloading and installing."
            )
            spacy_download(spacy_model_name)

            # Import the downloaded model module directly and load from there
            spacy_model_module = __import__(spacy_model_name)
            spacy_model = spacy_model_module.load(disable=disable)  # type: ignore

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options] 
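Unlike Examples #2 and #9, this version does not simply retry spacy.load() after the download; it imports the freshly installed model package and calls that package's own load(). On older spaCy versions, a model downloaded within the same process may not yet be visible to spacy.load(), so importing the package directly is the safer path.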
Example #22
Source File: train_utils.py    From scispacy with Apache License 2.0
def evaluate_ner(
    nlp: Language, eval_data, dump_path: Optional[str] = None, verbose: bool = False
) -> PerClassScorer:

    scorer = PerClassScorer()
    print("Evaluating %d rows" % len(eval_data))
    for i, (text, gold_spans) in enumerate(tqdm.tqdm(eval_data)):

        # parse dev data with trained model
        doc = nlp(text)
        predicted_spans = [
            (ent.start_char, ent.end_char, ent.label_) for ent in doc.ents
        ]
        scorer(predicted_spans, gold_spans["entities"])

        if i % 1000 == 0 and i > 0:
            for name, metric in scorer.get_metric().items():
                print(f"{name}: {metric}")

    metrics = scorer.get_metric()
    if dump_path is not None:
        with open(dump_path, "a+") as dump_file:
            json.dump(metrics, dump_file)
    for name, metric in metrics.items():
        if "overall" in name or "untyped" in name or verbose:
            print(f"{name}: \t\t {metric}")

    return metrics 
Example #23
Source File: util.py    From scispacy with Apache License 2.0
def create_combined_rule_model() -> Language:
    nlp = spacy.load("en_core_web_sm")
    nlp.tokenizer = combined_rule_tokenizer(nlp)
    nlp.add_pipe(pysbd_sentencizer, first=True)
    return nlp 
Example #24
Source File: util.py    From scispacy with Apache License 2.0
def save_model(nlp: Language, output_path: str):
    nlp.to_disk(output_path) 
Example #25
Source File: skills.py    From SkillsExtractorCognitiveSearch with MIT License
def __init__(self, nlp: Language, data_path: Path = Path("data")):
        self.nlp = nlp
        self.data_path = data_path
        self.skills = self._get_skills()

        patterns = self._build_patterns(self.skills)
        extra_patterns = self._get_extra_skill_patterns()
        ruler = EntityRuler(nlp, overwrite_ents=True)
        ruler.add_patterns(itertools.chain(patterns, extra_patterns))
        if not self.nlp.has_pipe("skills_ruler"):
            self.nlp.add_pipe(ruler, name="skills_ruler") 
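_build_patterns and _get_extra_skill_patterns are not shown in the excerpt; for reference, spaCy's EntityRuler accepts pattern dicts of the following shape (the skill pattern itself is hypothetical):

ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns([
    {"label": "SKILL", "pattern": [{"LOWER": "machine"}, {"LOWER": "learning"}]},  # hypothetical pattern
])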
Example #26
Source File: flair_generate_html_from_xml.py    From anonymisation with Apache License 2.0
def main(data_folder: str, model_folder: str, top_n: int) -> None:
    print(f"keep only top {top_n} examples per file")
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [filename for filename in os.listdir(data_folder) if filename.endswith(".xml")]
    sentences: List[Sentence] = list()
    with tqdm(total=len(filenames), unit=" XML", desc="Parsing XML") as progress_bar:
        for filename in filenames:
            paragraphs: List[Paragraph] = get_paragraph_from_file(path=os.path.join(data_folder, filename),
                                                                  keep_paragraph_without_annotation=True)
            if len(paragraphs) > top_n:
                for paragraph in paragraphs[:top_n]:
                    if len(paragraph.text) > 0:
                        s = Sentence(text=paragraph.text, tokenizer=tokenizer)
                        sentences.append(s)
            progress_bar.update()
    if len(sentences) == 0:
        raise Exception("No example loaded, causes: no cases in provided path or sample size is to high")

    tagger: SequenceTagger = SequenceTagger.load(os.path.join(model_folder, 'best-model.pt'))
    _ = tagger.predict(sentences=sentences,
                       mini_batch_size=32,
                       verbose=True)

    print("prepare html")
    page_html = render_ner_html(sentences, colors=colors)
    print("write html")
    with open("sentence.html", "w") as writer:
        writer.write(page_html) 
Example #27
Source File: spacy_parser.py    From fonduer with MIT License
def __init__(self, lang: Optional[str]) -> None:
        """Initialize SpacyParser."""
        self.name = "spacy"

        self.lang = lang
        self.model: Optional[Language] = None
        if self.has_tokenizer_support():
            self._load_lang_model() 
Example #28
Source File: import_annotations.py    From anonymisation with Apache License 2.0
def prepare_flair_train_test_corpus(spacy_model: Language, data_folder: str, dev_size: float,
                                    nb_segment: Optional[int], segment: Optional[int]) -> Corpus:

    all_annotated_files: List[str] = [os.path.join(data_folder, filename)
                                      for filename in os.listdir(data_folder) if filename.endswith(".txt")]
    if nb_segment is None and segment is None:
        random.shuffle(all_annotated_files)
        nb_doc_dev_set: int = int(len(all_annotated_files) * dev_size)
        dev_file_names = all_annotated_files[0:nb_doc_dev_set]
    else:
        assert segment < nb_segment
        all_segments = np.array_split(all_annotated_files, nb_segment)
        dev_file_names = list(all_segments[segment])
        print(dev_file_names)

    train_file_names = [file for file in all_annotated_files if file not in dev_file_names]

    train_path = export_data_set_flair_format(spacy_model, train_file_names)
    dev_path = export_data_set_flair_format(spacy_model, dev_file_names)

    corpus: Corpus = ColumnCorpus(data_folder=tempfile.gettempdir(),
                                  column_format={0: 'text', 1: 'ner'},
                                  train_file=os.path.basename(train_path),
                                  dev_file=os.path.basename(dev_path),
                                  test_file=os.path.basename(dev_path))
    return corpus 
Example #29
Source File: import_annotations.py    From anonymisation with Apache License 2.0
def export_data_set_flair_format(spacy_model: Language, data_file_names: List[str]) -> str:
    data = load_content(txt_paths=data_file_names)
    data_flair_format = convert_to_flair_format(spacy_model, data)
    f = tempfile.NamedTemporaryFile(delete=False, mode="w")
    tmp_path = f.name
    f.writelines(data_flair_format)
    f.close()
    return tmp_path 
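Examples #1, #28 and this one come from the same anonymisation project and chain together: convert_to_flair_format (Example #1) turns spaCy annotations into Flair's two-column format, export_data_set_flair_format writes that output to a temporary file, and prepare_flair_train_test_corpus (Example #28) splits the annotated files into train and dev sets and wraps the exported files in a Flair ColumnCorpus.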
Example #30
Source File: language.py    From spacy-udpipe with MIT License
def __init__(
        self,
        udpipe_model: UDPipeModel,
        meta: Optional[Dict] = None,
        **kwargs
    ):
        """Initialize the Language class.

        The language is called "udpipe_en" instead of "en" in order to
        avoid any potential conflicts with spaCy's built-in languages.
        Using entry points, this enables serializing and deserializing
        the language class: a "lang": "udpipe_en" entry in meta.json will
        automatically instantiate this class if this package is available.

        udpipe_model: The loaded UDPipe model.
        meta: spaCy model metadata.
        kwargs: Optional config parameters.
        """
        self.udpipe = udpipe_model
        self.Defaults = get_defaults(lang=udpipe_model._lang)
        self.lang = f"udpipe_{udpipe_model._lang}"
        ignore_tag_map = kwargs.get("ignore_tag_map", False)
        if ignore_tag_map:
            self.Defaults.tag_map = {}  # workaround for ValueError: [E167]
        self.vocab = self.Defaults.create_vocab()
        self.tokenizer = UDPipeTokenizer(model=self.udpipe, vocab=self.vocab)
        self.pipeline = []
        self.max_length = kwargs.get("max_length", 10 ** 6)
        self._meta = self.udpipe._meta if meta is None else dict(meta)
        self._path = None
        self._optimizer = None