Python spacy.language.Language() Examples
The following are 30 code examples of spacy.language.Language(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module spacy.language, or try the search function.
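The examples below all appear to target spaCy 2.x (note the use of GoldParse, add_pipe with component objects, and the EntityRuler constructor). For orientation, here is a minimal sketch of the Language class itself, assuming spaCy 2.x is installed:

# A minimal sketch, assuming spaCy 2.x (which the examples below target).
import spacy
from spacy.language import Language

# Language() builds a blank pipeline with a default Vocab; spacy.blank("en")
# does the same but with English-specific tokenization rules.
nlp = Language()
doc = nlp("The Language object turns raw text into a Doc.")
print([token.text for token in doc])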
Example #1
Source File: import_annotations.py From anonymisation with Apache License 2.0 | 6 votes |
def convert_to_flair_format(spacy_model: Language, data: List[Tuple[str, List[Offset]]]) -> List[str]:
    result: List[str] = list()
    for text, offsets in data:
        doc: Doc = spacy_model(text)
        # remove duplicated offsets
        offsets = normalize_offsets(offsets=offsets)
        offset_tuples = list(set([offset.to_tuple() for offset in offsets]))
        gold_annotations = GoldParse(doc, entities=offset_tuples)
        annotations: List[str] = gold_annotations.ner
        assert len(annotations) == len(doc)
        # Flair uses BIOES and Spacy BILUO
        # BILUO for Begin, Inside, Last, Unit, Out
        # BIOES for Begin, Inside, Outside, End, Single
        annotations = [a.replace('L-', 'E-') for a in annotations]
        annotations = [a.replace('U-', 'S-') for a in annotations]
        annotations = ["O" if a == "-" else a for a in annotations]  # replace unknown
        result += [f"{word} {tag}\n" for word, tag in zip(doc, annotations)]
        result.append('\n')
    return result
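The BILUO-to-BIOES conversion above is pure string substitution. A self-contained sketch of the same idea, with hand-written tags instead of a real GoldParse output:

# Standalone sketch of the BILUO -> BIOES tag conversion used above,
# with hand-written tags instead of a real GoldParse output.
biluo = ["B-PER", "L-PER", "U-LOC", "O", "-"]
bioes = [t.replace("L-", "E-").replace("U-", "S-") for t in biluo]
bioes = ["O" if t == "-" else t for t in bioes]  # "-" marks unknown tokens
print(bioes)  # ['B-PER', 'E-PER', 'S-LOC', 'O', 'O']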
Example #2
Source File: file.py From stog with MIT License | 6 votes |
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references
    to them, keyed by the options we used to create the spacy model, so any particular
    configuration only gets loaded once.
    """
    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(f"Spacy models '{spacy_model_name}' not found. Downloading and installing.")
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
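Because the cache is keyed on the full option tuple, repeated calls with identical options return the very same object. A hypothetical usage sketch, assuming the module-level LOADED_SPACY_MODELS dict and logger exist as in the source file:

# Hypothetical usage: identical option tuples hit the cache, so both
# calls below return the same Language instance.
nlp_a = get_spacy_model("en_core_web_sm", pos_tags=True, parse=False, ner=False)
nlp_b = get_spacy_model("en_core_web_sm", pos_tags=True, parse=False, ner=False)
assert nlp_a is nlp_b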
Example #3
Source File: spacy_processors_test.py From forte with Apache License 2.0 | 6 votes |
def test_neg_spacy_processor(self):
    spacy = Pipeline[DataPack]()
    spacy.set_reader(StringReader())
    config = {
        "processors": 'ner',
        "lang": "xx_ent_wiki_sm",  # Language code for the language to build the Pipeline
        "use_gpu": False
    }
    spacy.add(SpacyProcessor(), config=config)
    spacy.initialize()

    sentences = ["This tool is called Forte.",
                 "The goal of this project is to help you build NLP "
                 "pipelines.",
                 "NLP has never been made this easy before."]
    document = ' '.join(sentences)

    with self.assertRaises(ProcessExecutionException):
        _ = spacy.process(document)
Example #4
Source File: spacy_utils.py From Rasa_NLU_Chi with Apache License 2.0 | 6 votes |
def ensure_proper_language_model(nlp):
    # type: (Optional[Language]) -> None
    """Checks if the spacy language model is properly loaded.
    Raises an exception if the model is invalid."""

    if nlp is None:
        raise Exception("Failed to load spacy language model. "
                        "Loading the model returned 'None'.")
    if nlp.path is None:
        # Spacy sets the path to `None` if
        # it did not load the model from disk.
        # In this case `nlp` is an unusable stub.
        raise Exception("Failed to load spacy language model for "
                        "lang '{}'. Make sure you have downloaded the "
                        "correct model (https://spacy.io/docs/usage/)."
                        "".format(nlp.lang))
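A hypothetical usage sketch of the first guard clause: passing None (for example, after a failed load) raises immediately rather than producing confusing downstream errors.

# Hypothetical usage: passing None triggers the first check.
try:
    ensure_proper_language_model(None)
except Exception as error:
    print(error)  # "Failed to load spacy language model. ..."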
Example #5
Source File: count_word_frequencies.py From scispacy with Apache License 2.0 | 6 votes |
def count_frequencies(language_class: Language, input_path: Path):
    """
    Given a file containing one document per line
    (for scispacy, these are Pubmed abstracts), split the text
    using a science specific tokenizer and compute word and
    document frequencies for all words.
    """
    print(f"Processing {input_path}.")
    tokenizer = combined_rule_tokenizer(language_class())
    counts = Counter()
    doc_counts = Counter()
    for line in open(input_path, "r"):
        words = [t.text for t in tokenizer(line)]
        counts.update(words)
        doc_counts.update(set(words))
    return counts, doc_counts
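The word-versus-document frequency distinction is the interesting part: counts increments once per token occurrence, while doc_counts increments at most once per line because of the set(). A self-contained sketch, using str.split() in place of the science-specific tokenizer:

# Self-contained sketch of the word vs. document frequency logic,
# with str.split() standing in for the science-specific tokenizer.
from collections import Counter

lines = ["the cat sat on the mat", "a dog ran"]
counts, doc_counts = Counter(), Counter()
for line in lines:
    words = line.split()
    counts.update(words)           # every occurrence counts
    doc_counts.update(set(words))  # at most once per document
print(counts["the"], doc_counts["the"])  # 2 1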
Example #6
Source File: flair_generate_html_from_txt.py From anonymisation with Apache License 2.0 | 6 votes |
def main(data_folder: str, output_folder: str, model_folder: str) -> None:
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [filename for filename in os.listdir(data_folder) if filename.endswith(".txt")]
    tagger: SequenceTagger = SequenceTagger.load(os.path.join(model_folder, 'best-model.pt'))
    for filename in tqdm(iterable=filenames, unit=" txt", desc="anonymize cases"):
        with open(os.path.join(data_folder, filename), 'r') as input_f:
            sentences = tagger.predict(sentences=input_f.readlines(),
                                       mini_batch_size=32,
                                       verbose=False,
                                       use_tokenizer=tokenizer)
            case_name = filename.split('.')[0]
            page_html = render_ner_html(sentences, colors=colors, title=case_name)
            with open(os.path.join(output_folder, case_name + ".html"), "w") as output:
                output.write(page_html)
Example #7
Source File: spacy_utils.py From rasa-for-botfront with Apache License 2.0 | 6 votes |
def ensure_proper_language_model(nlp: Optional["Language"]) -> None:
    """Checks if the spacy language model is properly loaded.
    Raises an exception if the model is invalid."""

    if nlp is None:
        raise Exception(
            "Failed to load spacy language model. "
            "Loading the model returned 'None'."
        )
    if nlp.path is None:
        # Spacy sets the path to `None` if
        # it did not load the model from disk.
        # In this case `nlp` is an unusable stub.
        raise Exception(
            "Failed to load spacy language model for "
            "lang '{}'. Make sure you have downloaded the "
            "correct model (https://spacy.io/docs/usage/)."
            "".format(nlp.lang)
        )
Example #8
Source File: word_freqs.py From Blackstone with Apache License 2.0 | 6 votes |
def count_frequencies(language_class: Language, input_path: Path):
    """
    Given a file containing one document per line
    (in this case, sentences from the ICLR case law corpus), split the text
    using a science specific tokenizer and compute word and
    document frequencies for all words.
    """
    print(f"Processing {input_path}.")
    nlp = English()
    # tokenizer = combined_rule_tokenizer(language_class())
    tokenizer = Tokenizer(nlp.vocab)
    counts = Counter()
    doc_counts = Counter()
    for line in tqdm.tqdm(open(input_path, "r")):
        words = [t.text for t in tokenizer(line)]
        counts.update(words)
        doc_counts.update(set(words))
    return counts, doc_counts
Example #9
Source File: file.py From gtos with MIT License | 6 votes |
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references
    to them, keyed by the options we used to create the spacy model, so any particular
    configuration only gets loaded once.
    """
    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(f"Spacy models '{spacy_model_name}' not found. Downloading and installing.")
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
Example #10
Source File: spacy_utils.py From rasa_nlu with Apache License 2.0 | 6 votes |
def ensure_proper_language_model(nlp: Optional['Language']) -> None:
    """Checks if the spacy language model is properly loaded.
    Raises an exception if the model is invalid."""

    if nlp is None:
        raise Exception("Failed to load spacy language model. "
                        "Loading the model returned 'None'.")
    if nlp.path is None:
        # Spacy sets the path to `None` if
        # it did not load the model from disk.
        # In this case `nlp` is an unusable stub.
        raise Exception("Failed to load spacy language model for "
                        "lang '{}'. Make sure you have downloaded the "
                        "correct model (https://spacy.io/docs/usage/)."
                        "".format(nlp.lang))
Example #11
Source File: language.py From spacy-udpipe with MIT License | 6 votes |
def load_from_path(
    lang: str,
    path: str,
    meta: Optional[Dict] = {"description": "custom model"},
    **kwargs
) -> UDPipeLanguage:
    """Convenience function for initializing the Language class
    and loading a custom UDPipe model via the path argument.

    lang: ISO 639-1 language code or shorthand UDPipe model name.
    path: Path to the UDPipe model.
    meta: Optional meta-information about the UDPipe model.
    kwargs: Optional config parameters.
    RETURNS: The UDPipeLanguage object.
    """
    model = UDPipeModel(lang=lang, path=path, meta=meta)
    nlp = UDPipeLanguage(udpipe_model=model, meta=model._meta, **kwargs)
    return nlp
Example #12
Source File: spacy-fastext.py From word2vecVN with Apache License 2.0 | 6 votes |
def load_nlp(vectors_loc, lang=None):
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    return nlp
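A minimal sketch of the same vocab-vector API on a toy 3-dimensional vector, assuming spaCy 2.x; the word "apple" and the values are illustrative only:

# Minimal sketch of the vocab-vector API used above, assuming spaCy 2.x.
import numpy
import spacy

nlp = spacy.blank("en")
nlp.vocab.reset_vectors(width=3)  # toy 3-dimensional vector table
nlp.vocab.set_vector("apple", numpy.asarray([1.0, 0.0, 0.0], dtype="f"))
print(nlp.vocab["apple"].vector)  # array([1., 0., 0.], dtype=float32)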
Example #13
Source File: language.py From spacy-udpipe with MIT License | 5 votes |
def load(lang: str, **kwargs) -> UDPipeLanguage:
    """Convenience function for initializing the Language class that
    mimics spacy.load.

    lang: ISO 639-1 language code or shorthand UDPipe model name.
    kwargs: Optional config parameters.
    RETURNS: The UDPipeLanguage object.
    """
    model = UDPipeModel(lang=lang, path=None, meta=None)
    nlp = UDPipeLanguage(udpipe_model=model, meta=model._meta, **kwargs)
    return nlp
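Typical end-user usage of this loader, assuming the spacy-udpipe package is installed and the model has been downloaded:

# Hypothetical usage, assuming spacy-udpipe is installed and the
# English UDPipe model has been downloaded.
import spacy_udpipe

spacy_udpipe.download("en")    # fetch the UDPipe model once
nlp = spacy_udpipe.load("en")  # the function shown above
doc = nlp("UDPipe handles tokenization, tagging and parsing.")
print([(t.text, t.pos_) for t in doc])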
Example #14
Source File: spacy_featurizer.py From rasa_nlu with Apache License 2.0 | 5 votes |
def ndim(spacy_nlp: 'Language') -> int:
    """Number of features used to represent a document / sentence."""

    return spacy_nlp.vocab.vectors_length
Example #15
Source File: spacy_processors_test.py From forte with Apache License 2.0 | 5 votes |
def setUp(self):
    self.spacy = Pipeline[DataPack]()
    self.spacy.set_reader(StringReader())
    config = {
        "processors": "tokenize",
        "lang": "en_core_web_sm",  # Language code for the language to build the Pipeline
        "use_gpu": False
    }
    self.spacy.add(SpacyProcessor(), config=config)
    self.spacy.initialize()

    self.nlp: Language = spacy.load(config['lang'])
Example #16
Source File: spacy_processors.py From forte with Apache License 2.0 | 5 votes |
def default_configs(cls):
    """
    This defines a basic config structure for spaCy.
    Returns:
    """
    config = super().default_configs()
    config.update({
        'processors': 'tokenize, pos, lemma',
        'lang': 'en_core_web_sm',  # Language code for the language to build the Pipeline
        'use_gpu': False,
    })
    return config
Example #17
Source File: spacy_processors.py From forte with Apache License 2.0 | 5 votes |
def __init__(self):
    super().__init__()
    self.processors: str = ""
    self.nlp: Optional[Language] = None
    self.lang_model: str = ''
Example #18
Source File: spacy_featurizer.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def ndim(spacy_nlp):
    """Number of features used to represent a document / sentence."""
    # type: Language -> int

    return spacy_nlp.vocab.vectors_length
Example #19
Source File: spacy_extractor.py From cookiecutter-spacy-fastapi with MIT License | 5 votes |
def __init__(
    self, nlp: Language, input_id_col: str = "id", input_text_col: str = "text"
):
    """Initialize the SpacyExtractor pipeline.

    nlp (spacy.language.Language): pre-loaded spacy language model
    input_text_col (str): property on each document to run the model on
    input_id_col (str): property on each document to correlate with request

    RETURNS (EntityRecognizer): The newly constructed object.
    """
    self.nlp = nlp
    self.input_id_col = input_id_col
    self.input_text_col = input_text_col
Example #20
Source File: spacy_utils.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def __init__(self, component_config=None, nlp=None):
    # type: (Dict[Text, Any], Language) -> None

    self.nlp = nlp
    super(SpacyNLP, self).__init__(component_config)
Example #21
Source File: util.py From allennlp with Apache License 2.0 | 5 votes |
def get_spacy_model(
    spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool
) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references
    to them, keyed by the options we used to create the spacy model, so any particular
    configuration only gets loaded once.
    """
    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ["vectors", "textcat"]
        if not pos_tags:
            disable.append("tagger")
        if not parse:
            disable.append("parser")
        if not ner:
            disable.append("ner")
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(
                f"Spacy models '{spacy_model_name}' not found. Downloading and installing."
            )
            spacy_download(spacy_model_name)
            # Import the downloaded model module directly and load from there
            spacy_model_module = __import__(spacy_model_name)
            spacy_model = spacy_model_module.load(disable=disable)  # type: ignore
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
Example #22
Source File: train_utils.py From scispacy with Apache License 2.0 | 5 votes |
def evaluate_ner(
    nlp: Language, eval_data, dump_path: str = None, verbose: bool = False
) -> PerClassScorer:
    scorer = PerClassScorer()
    print("Evaluating %d rows" % len(eval_data))
    for i, (text, gold_spans) in enumerate(tqdm.tqdm(eval_data)):
        # parse dev data with trained model
        doc = nlp(text)
        predicted_spans = [
            (ent.start_char, ent.end_char, ent.label_) for ent in doc.ents
        ]
        scorer(predicted_spans, gold_spans["entities"])
        if i % 1000 == 0 and i > 0:
            for name, metric in scorer.get_metric().items():
                print(f"{name}: {metric}")

    metrics = scorer.get_metric()
    if dump_path is not None:
        json.dump(metrics, open(dump_path, "a+"))
    for name, metric in metrics.items():
        if "overall" in name or "untyped" in name or verbose:
            print(f"{name}: \t\t {metric}")
    return metrics
Example #23
Source File: util.py From scispacy with Apache License 2.0 | 5 votes |
def create_combined_rule_model() -> Language:
    nlp = spacy.load("en_core_web_sm")
    nlp.tokenizer = combined_rule_tokenizer(nlp)
    nlp.add_pipe(pysbd_sentencizer, first=True)
    return nlp
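The first=True flag places the custom sentence segmenter at the front of the pipeline so downstream components see its boundaries. A sketch of the same wiring, assuming spaCy 2.x and substituting the built-in sentencizer for the pysbd-based component:

# Sketch of add_pipe(..., first=True), assuming spaCy 2.x; the built-in
# sentencizer stands in for the pysbd-based component above.
import spacy

nlp = spacy.blank("en")
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer, first=True)
doc = nlp("First sentence. Second sentence.")
print([sent.text for sent in doc.sents])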
Example #24
Source File: util.py From scispacy with Apache License 2.0 | 5 votes |
def save_model(nlp: Language, output_path: str):
    nlp.to_disk(output_path)
Example #25
Source File: skills.py From SkillsExtractorCognitiveSearch with MIT License | 5 votes |
def __init__(self, nlp: Language, data_path: Path = Path("data")):
    self.nlp = nlp
    self.data_path = data_path
    self.skills = self._get_skills()
    patterns = self._build_patterns(self.skills)
    extra_patterns = self._get_extra_skill_patterns()
    ruler = EntityRuler(nlp, overwrite_ents=True)
    ruler.add_patterns(itertools.chain(patterns, extra_patterns))
    if not self.nlp.has_pipe("skills_ruler"):
        self.nlp.add_pipe(ruler, name="skills_ruler")
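A stripped-down sketch of the same EntityRuler wiring, assuming spaCy 2.x; the two patterns are hand-written illustrations, not the project's actual skill data:

# Stripped-down sketch of the EntityRuler wiring, assuming spaCy 2.x.
# The patterns are illustrative, not the project's real skill data.
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.blank("en")
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns([
    {"label": "SKILL", "pattern": [{"LOWER": "python"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "machine"}, {"LOWER": "learning"}]},
])
nlp.add_pipe(ruler, name="skills_ruler")
doc = nlp("Experience with Python and machine learning required.")
print([(ent.text, ent.label_) for ent in doc.ents])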
Example #26
Source File: flair_generate_html_from_xml.py From anonymisation with Apache License 2.0 | 5 votes |
def main(data_folder: str, model_folder: str, top_n: int) -> None:
    print(f"keep only top {top_n} examples per file")
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)
    filenames = [filename for filename in os.listdir(data_folder) if filename.endswith(".xml")]
    sentences: List[Sentence] = list()
    with tqdm(total=len(filenames), unit=" XML", desc="Parsing XML") as progress_bar:
        for filename in filenames:
            paragraphs: List[Paragraph] = get_paragraph_from_file(
                path=os.path.join(data_folder, filename),
                keep_paragraph_without_annotation=True)
            if len(paragraphs) > top_n:
                for paragraph in paragraphs[:top_n]:
                    if len(paragraph.text) > 0:
                        s = Sentence(text=paragraph.text, tokenizer=tokenizer)
                        sentences.append(s)
            progress_bar.update()
    if len(sentences) == 0:
        raise Exception("No example loaded: no cases in the provided path, or the sample size is too high")

    tagger: SequenceTagger = SequenceTagger.load(os.path.join(model_folder, 'best-model.pt'))
    _ = tagger.predict(sentences=sentences, mini_batch_size=32, verbose=True)
    print("prepare html")
    page_html = render_ner_html(sentences, colors=colors)
    print("write html")
    with open("sentence.html", "w") as writer:
        writer.write(page_html)
Example #27
Source File: spacy_parser.py From fonduer with MIT License | 5 votes |
def __init__(self, lang: Optional[str]) -> None:
    """Initialize SpacyParser."""
    self.name = "spacy"
    self.lang = lang
    self.model: Optional[Language] = None
    if self.has_tokenizer_support():
        self._load_lang_model()
Example #28
Source File: import_annotations.py From anonymisation with Apache License 2.0 | 5 votes |
def prepare_flair_train_test_corpus(spacy_model: Language,
                                    data_folder: str,
                                    dev_size: float,
                                    nb_segment: Optional[int],
                                    segment: Optional[int]) -> Corpus:
    all_annotated_files: List[str] = [os.path.join(data_folder, filename)
                                      for filename in os.listdir(data_folder)
                                      if filename.endswith(".txt")]
    if nb_segment is None and segment is None:
        random.shuffle(all_annotated_files)
        nb_doc_dev_set: int = int(len(all_annotated_files) * dev_size)
        dev_file_names = all_annotated_files[0:nb_doc_dev_set]
    else:
        assert segment < nb_segment
        all_segments = np.array_split(all_annotated_files, nb_segment)
        dev_file_names = list(all_segments[segment])
        print(dev_file_names)

    train_file_names = [file for file in all_annotated_files if file not in dev_file_names]

    train_path = export_data_set_flair_format(spacy_model, train_file_names)
    dev_path = export_data_set_flair_format(spacy_model, dev_file_names)

    corpus: Corpus = ColumnCorpus(data_folder=tempfile.gettempdir(),
                                  column_format={0: 'text', 1: 'ner'},
                                  train_file=os.path.basename(train_path),
                                  dev_file=os.path.basename(dev_path),
                                  test_file=os.path.basename(dev_path))
    return corpus
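The dev/train split in the first branch reduces to a shuffle plus a slice; a self-contained sketch with hypothetical file names:

# Self-contained sketch of the shuffle-then-slice dev/train split above,
# with hypothetical file names.
import random

files = [f"case_{i}.txt" for i in range(10)]
dev_size = 0.2
random.shuffle(files)
nb_dev = int(len(files) * dev_size)
dev_files = files[:nb_dev]
train_files = [f for f in files if f not in dev_files]
print(len(dev_files), len(train_files))  # 2 8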
Example #29
Source File: import_annotations.py From anonymisation with Apache License 2.0 | 5 votes |
def export_data_set_flair_format(spacy_model: Language, data_file_names: List[str]) -> str:
    data = load_content(txt_paths=data_file_names)
    data_flair_format = convert_to_flair_format(spacy_model, data)
    f = tempfile.NamedTemporaryFile(delete=False, mode="w")
    tmp_path = f.name
    f.writelines(data_flair_format)
    f.close()
    return tmp_path
Example #30
Source File: language.py From spacy-udpipe with MIT License | 5 votes |
def __init__(
    self, udpipe_model: UDPipeModel, meta: Optional[Dict] = None, **kwargs
):
    """Initialize the Language class.

    The language is called "udpipe_en" instead of "en" in order to
    avoid any potential conflicts with spaCy's built-in languages.
    Using entry points, this enables serializing and deserializing
    the language class and "lang": "udpipe_en" in the meta.json will
    automatically instantiate this class if this package is available.

    udpipe_model: The loaded UDPipe model.
    meta: spaCy model metadata.
    kwargs: Optional config parameters.
    """
    self.udpipe = udpipe_model
    self.Defaults = get_defaults(lang=udpipe_model._lang)
    self.lang = f"udpipe_{udpipe_model._lang}"
    ignore_tag_map = kwargs.get("ignore_tag_map", False)
    if ignore_tag_map:
        self.Defaults.tag_map = {}  # workaround for ValueError: [E167]
    self.vocab = self.Defaults.create_vocab()
    self.tokenizer = UDPipeTokenizer(model=self.udpipe, vocab=self.vocab)
    self.pipeline = []
    self.max_length = kwargs.get("max_length", 10 ** 6)
    self._meta = self.udpipe._meta if meta is None else dict(meta)
    self._path = None
    self._optimizer = None