Python spacy.load() Examples
The following are 30 code examples of spacy.load().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module spacy, or try the search function.
Example #1
Source File: tokenization_openai.py From Bert-Chinese-Text-Classification-Pytorch with MIT License | 7 votes |
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
    """Set up the BPE tokenizer from a vocabulary file and a merges file.

    Args:
        vocab_file: Path to a UTF-8 JSON file; its content becomes
            ``self.encoder`` and is inverted into ``self.decoder``.
        merges_file: Path to a UTF-8 text file of merges. The content is
            split on newlines and the first and last entries are dropped;
            each remaining line is split on whitespace into a tuple.
        special_tokens: Optional special tokens, forwarded to
            ``set_special_tokens`` and, on the fallback path, used as the
            ``never_split`` argument of ``BasicTokenizer``.
        max_len: Optional maximum length; ``int(1e12)`` is used when
            ``None``.
    """
    try:
        import ftfy
        import spacy
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.fix_text = ftfy.fix_text
    except ImportError:
        # Either optional dependency is missing: fall back to BasicTokenizer.
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True,
                                  never_split=special_tokens if special_tokens is not None else [])
        self.fix_text = None

    self.max_len = max_len if max_len is not None else int(1e12)

    # Context managers close both files deterministically; the bare
    # ``open(...)`` calls used previously leaked the handles.
    with open(vocab_file, encoding="utf-8") as vocab_handle:
        self.encoder = json.load(vocab_handle)
    self.decoder = {v: k for k, v in self.encoder.items()}

    with open(merges_file, encoding='utf-8') as merges_handle:
        # [1:-1] drops the first entry and the last entry of the split.
        merges = merges_handle.read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    # Rank of a merge == its position in the merges file.
    self.bpe_ranks = dict(zip(merges, range(len(merges))))

    self.cache = {}
    self.special_tokens = {}
    self.special_tokens_decoder = {}
    self.set_special_tokens(special_tokens)
Example #2
Source File: test.py From negspacy with MIT License | 6 votes |
def test_umls2():
    """Check Negex output on the medical sample documents.

    Each item from ``build_med_docs()`` is indexed as ``item[0]`` (the text
    fed to the pipeline) and ``item[1]`` (the expected ``(text, negex)``
    pair for each entity position).
    """
    pipeline = spacy.load("en_core_sci_sm")
    negation = Negex(
        pipeline,
        language="en_clinical_sensitive",
        ent_types=["ENTITY"],
        chunk_prefix=["no"],
    )
    pipeline.add_pipe(negation, last=True)

    for sample in build_med_docs():
        parsed = pipeline(sample[0])
        for position, entity in enumerate(parsed.ents):
            print(entity.text, entity._.negex)
            assert (entity.text, entity._.negex) == sample[1][position]


# blocked by spacy 2.1.8 issue. Adding back after spacy 2.2.
# def test_no_ner():
#     nlp = spacy.load("en_core_web_sm", disable=["ner"])
#     negex = Negex(nlp)
#     nlp.add_pipe(negex, last=True)
#     with pytest.raises(ValueError):
#         doc = nlp("this doc has not been NERed")
Example #3
Source File: algorithm.py From neuralcoref with MIT License | 6 votes |
def one_shot_coref(
    self,
    utterances,
    utterances_speakers_id=None,
    context=None,
    context_speakers_id=None,
    speakers_names=None,
):
    """Run coreference on a list of utterances with an optional context.

    The context is handed to ``self.data.set_utterances`` and the utterances
    to ``self.continuous_coref``; the result of ``self.get_clusters()`` is
    returned. The argument descriptions below restate the original
    documentation of this method.

    Args:
        utterances: iterator or list of strings, one per successive
            utterance (in a dialogue) or sentence. May be a single string
            for non-dialogue text.
        utterances_speakers_id: iterator or list with the speaker id of each
            utterance (dialogue case). If not provided, two alternating
            speakers are assumed; if its length differs from ``utterances``
            it is padded with ``None``.
        context: iterator or list of strings with additional
            utterances/sentences sent prior to ``utterances``. Coreferences
            are not computed for mentions found in ``context``; those
            mentions only serve as possible antecedents for mentions in
            ``utterances``. This reduces computation when only the last
            sentences/utterances are of interest.
        context_speakers_id: same as ``utterances_speakers_id``, for
            ``context``.
        speakers_names: dictionary of lists of acceptable speaker names
            (strings) for the speaker ids used in ``utterances_speakers_id``
            and ``context_speakers_id``.

    Returns:
        Clusters of entities with coreference resolved.
    """
    # Context first, then the utterances to resolve.
    self.data.set_utterances(context, context_speakers_id, speakers_names)
    self.continuous_coref(utterances, utterances_speakers_id, speakers_names)
    clusters = self.get_clusters()
    return clusters
Example #4
Source File: annotate.py From ConvLab with MIT License | 6 votes |
def tokenize(data, process_text=True, process_da=True, process_ref=True):
    """Re-tokenize session texts and dialog-act values in place with spaCy.

    ``data`` is a mapping of sessions; every session has a ``'log'`` list of
    turns, and each turn carries ``'text'`` and ``'dialog_act'`` entries.

    Args:
        data: mapping whose values are sessions (mutated in place).
        process_text: when true, ``turn['text']`` is replaced by its
            space-joined spaCy tokens.
        process_da: when true, the second element of every slot-value item
            in ``turn['dialog_act']`` is re-tokenized the same way.
        process_ref: when false, items whose first element is ``'Ref'`` are
            left untouched.
    """
    print('Begin tokenization:')
    print('='*50)
    nlp = spacy.load('en_core_web_sm')

    def _spaced(raw):
        # Join spaCy tokens with single spaces and trim the ends.
        return ' '.join([token.text for token in nlp(raw)]).strip()

    total = len(data)
    for done, session in enumerate(data.values(), 1):
        # Progress line every 1000 sessions.
        if done % 1000 == 0:
            print('[%d|%d]' % (done, total))
        for turn in session['log']:
            if process_text:
                turn['text'] = _spaced(turn['text'])
            if not process_da:
                continue
            for slot_values in turn['dialog_act'].values():
                for item in slot_values:
                    if item[0] == 'Ref' and not process_ref:
                        continue
                    item[1] = _spaced(item[1])

    print('=' * 50)
    print('Finish tokenization')
Example #5
Source File: science_ie_data_utils.py From sciwing with MIT License | 6 votes |
def __init__(self, folderpath: pathlib.Path, ignore_warnings=False):
    """Utilities for the ScienceIE data stored under ``folderpath``.

    More information on the dataset: https://scienceie.github.io/

    Parameters
    ----------
    folderpath : pathlib.Path
        The path where the ScienceIE dataset is stored
    ignore_warnings : bool
        If True, warnings generated by this class for inconsistencies in
        the data are ignored. Stored as ``self.ignore_warning``.
    """
    self.folderpath, self.ignore_warning = folderpath, ignore_warnings
    self.entity_types = ["Process", "Material", "Task"]
    # NOTE(review): get_file_ids() runs after the attributes above are set
    # and before the printer / spaCy model exist - keep this order.
    self.file_ids = self.get_file_ids()
    self.msg_printer = wasabi.Printer()
    self.nlp = spacy.load("en_core_web_sm")
    # Column separator used for CoNLL-style output.
    self._conll_col_sep = " "
Example #6
Source File: imdb_pytorch.py From lineflow with MIT License | 6 votes |
def build_vocab(tokens, cache='vocab.pkl', max_size=50000):
    """Build (or load from ``cache``) a token vocabulary.

    When ``cache`` exists as a file it is unpickled and returned as is;
    ``tokens`` and ``max_size`` are ignored in that case. Otherwise the
    ``max_size`` most common tokens are placed after ``PAD_TOKEN`` and
    ``UNK_TOKEN``, ``START_TOKEN`` / ``END_TOKEN`` are appended if missing,
    and the result is pickled to ``cache``.

    Args:
        tokens: iterable of hashable tokens; may be empty.
        cache: path of the pickle file used as cache.
        max_size: maximum number of distinct tokens taken from ``tokens``.

    Returns:
        ``(token_to_index, words)``: a dict mapping token to index and the
        list of tokens.
    """
    if osp.isfile(cache):
        # NOTE: unpickling executes arbitrary code - only point ``cache`` at
        # files this function wrote itself.
        with open(cache, 'rb') as f:
            token_to_index, words = pickle.load(f)
        return token_to_index, words

    counter = Counter(tokens)
    # A comprehension also works for empty ``tokens``; the former
    # ``words, _ = zip(*counter.most_common(...))`` raised ValueError there.
    frequent = [word for word, _ in counter.most_common(max_size)]
    words = [PAD_TOKEN, UNK_TOKEN] + frequent
    token_to_index = dict(zip(words, range(len(words))))
    for marker in (START_TOKEN, END_TOKEN):
        if marker not in token_to_index:
            token_to_index[marker] = len(token_to_index)
            words.append(marker)

    with open(cache, 'wb') as f:
        pickle.dump((token_to_index, words), f)
    return token_to_index, words
Example #7
Source File: sample_size_NN.py From robotreviewer with GNU General Public License v3.0 | 6 votes |
def get_X_y(df):
    """Build feature rows and binarized labels from a frame of abstracts.

    Each row of ``df`` is read through the columns ``"ab_numbers"``,
    ``"enrolled_totals"``, ``"enrolled_P1"`` and ``"enrolled_P2"``.

    Returns:
        ``(X, y_to_bin(y))`` where ``X`` collects the features of every
        row and ``y`` the labels at the numeric token indices.
    """
    nlp = spacy.load('en')  # for POS tagging
    X, y = [], []
    for _, row in df.iterrows():
        tokens, pos_tags = tokenize_abstract(row["ab_numbers"], nlp)
        tokens = replace_n_equals(tokens)

        # Map the enrolment numbers found in the row to their label names.
        label_for_number = {
            row["enrolled_totals"]: "N",
            row["enrolled_P1"]: "n1",
            row["enrolled_P2"]: "n2",
        }
        token_labels = annotate(tokens, label_for_number)
        features, numeric_positions = abstract2features(tokens, pos_tags)

        X.extend(features)
        # Only the labels of numeric tokens are kept.
        y.extend([token_labels[position] for position in numeric_positions])
    return X, y_to_bin(y)
Example #8
Source File: algorithm.py From neuralcoref with MIT License | 6 votes |
def __init__(self, model_path):
    """Load the single-mention and pair-mentions layers from ``model_path``.

    Files in ``model_path`` are visited in sorted name order. Files whose
    name starts with ``<kind>_weights`` / ``<kind>_bias`` are loaded with
    ``np.load`` and paired positionally into a list of ``(weights, bias)``
    tuples, stored as ``self.single_mention_model`` and
    ``self.pair_mentions_model``.
    """

    def _load_layers(weights_prefix, bias_prefix):
        # One sorted pass over the directory, pairing weights with biases.
        weights, biases = [], []
        for name in sorted(os.listdir(model_path)):
            if name.startswith(weights_prefix):
                weights.append(np.load(os.path.join(model_path, name)))
            if name.startswith(bias_prefix):
                biases.append(np.load(os.path.join(model_path, name)))
        return list(zip(weights, biases))

    self.single_mention_model = _load_layers(
        "single_mention_weights", "single_mention_bias"
    )
    self.pair_mentions_model = _load_layers(
        "pair_mentions_weights", "pair_mentions_bias"
    )
Example #9
Source File: utils.py From fastNLP with Apache License 2.0 | 6 votes |
def get_tokenizer(tokenize_method: str, lang='en'):
    r"""Return the tokenize function for the requested method.

    :param str tokenize_method: which tokenizer to get: ``'spacy'``,
        ``'raw'`` or ``'cn-char'``
    :param str lang: language; currently only ``'en'`` is supported
        (checked only for the ``'spacy'`` method)
    :return: the tokenize function
    :raises RuntimeError: if ``tokenize_method`` is unknown, or if
        ``'spacy'`` is requested with a language other than ``'en'``
    """
    tokenizer_dict = {
        'spacy': None,
        'raw': _raw_split,
        'cn-char': _cn_char_split,
    }
    if tokenize_method == 'spacy':
        import spacy
        # Validate before touching the GPU so a bad ``lang`` has no side effects.
        if lang != 'en':
            raise RuntimeError("Spacy only supports en right now.")
        spacy.prefer_gpu()
        en = spacy.load(lang)

        def tokenizer(x):
            # Only the spaCy tokenizer is run, not the full pipeline.
            return [w.text for w in en.tokenizer(x)]
    elif tokenize_method in tokenizer_dict:
        tokenizer = tokenizer_dict[tokenize_method]
    else:
        raise RuntimeError(f"Only support {tokenizer_dict.keys()} tokenizer.")
    return tokenizer
Example #10
Source File: run.py From fake-news-detection with MIT License | 6 votes |
def pipeline(args):
    '''
    Runs the model loop.

    Reads ``args.filename`` as CSV, fills missing values of the
    ``args.x_label`` column with the string "None", optionally
    de-duplicates on the 'content' column (``args.dedupe``) and restricts
    sources (``args.reduce``), then splits the data and runs a ``ModelLoop``.
    '''
    frame = pd.read_csv(args.filename)
    frame.loc[:, args.x_label] = frame[args.x_label].fillna("None")
    if args.dedupe:
        frame = frame.drop_duplicates(subset='content')
    if args.reduce:
        frame = restrict_sources(frame)

    features, targets = frame[args.x_label], frame[args.y_label]
    # NOTE(review): `parser` is never used below; the load is kept because it
    # still fails early when the 'en' model is missing - confirm it is needed.
    parser = spacy.load('en')
    X_train, X_test, y_train, y_test = train_test_split(features, targets)

    ModelLoop(
        X_train, X_test, y_train, y_test,
        args.models, args.iterations, args.output_dir,
        thresholds=args.thresholds,
        ks=args.ks,
        setting=args.features[0],
    ).run()
Example #11
Source File: test_spacy_udpipe.py From spacy-udpipe with MIT License | 6 votes |
def test_morph_exception() -> None:
    """Load the RO model, retrying with ``ignore_tag_map=True`` on ValueError."""
    # NOTE(review): this compares version *strings*, not parsed versions.
    assert spacy.__version__ <= SPACY_VERSION

    language = RO
    sentence = "Ce mai faci?"
    expected_meta_lang = f"udpipe_{language}"

    download(lang=language)
    try:
        nlp = load(lang=language)
        assert nlp._meta["lang"] == expected_meta_lang
        doc = nlp(sentence)
    except ValueError:
        # Same checks again, this time without the tag map.
        nlp = load(lang=language, ignore_tag_map=True)
        assert nlp._meta["lang"] == expected_meta_lang
        doc = nlp(sentence)

    assert doc
Example #12
Source File: data_loader.py From LipReading with MIT License | 6 votes |
def split_sentences(dataviews, captions):
    """Regroup caption pieces (and their frames) into sentence-sized segments.

    For every ``(frames, caps)`` pair a window ``caps[start:end]`` grows one
    piece at a time. As soon as spaCy finds at least two sentences in the
    joined window, everything but the newest piece is emitted as one segment
    (frames concatenated with ``np.concatenate``) and the window restarts at
    that newest piece.

    Returns:
        ``(new_frames, new_captions)``: one list of segments per input pair.
    """
    nlp = spacy.load('en')
    new_frames, new_captions = [], []
    for frames, caps in zip(dataviews, captions):
        segment_frames, segment_caps = [], []
        piece_count = len(caps)
        start, end = 0, 1
        while start < piece_count and end < piece_count:
            window = nlp(" ".join(caps[start:end]))
            sentence_texts = [span.string.strip() for span in window.sents]
            cut = end - 1
            # Emit only when a second sentence has started and the segment
            # before the newest piece is non-empty.
            if len(sentence_texts) >= 2 and cut > start:
                text = " ".join(caps[start:cut])
                segment_frames.append(np.concatenate(frames[start:cut]))
                print("sentence:", text)
                segment_caps.append(text)
                start = cut
            end += 1
        new_frames.append(segment_frames)
        new_captions.append(segment_caps)
    return new_frames, new_captions

# REVIEW josephz: This is a copy of `FrameCaptionDataset.parse_caption`.
Example #13
Source File: phrasemachine.py From scattertext with Apache License 2.0 | 6 votes |
def __init__(self):
    """Load the bundled NLTK tagger and sentence detector without the downloader.

    Both pickles are read from the ``data/`` directory next to the imported
    ``scattertext`` package.
    """
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer

    # Alternatives that were tried and left disabled:
    # return pkgutil.get_data('scattertext',
    #                         'data/viz/semiotic_new.html').decode('utf-8')
    # tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    # tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
    data_dir = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = data_dir + 'punkt.english.pickle'
    tagger_fn = data_dir + 'averaged_perceptron_tagger.pickle'

    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)

# http://www.nltk.org/book/ch05.html
Example #14
Source File: phrasemachine.py From scattertext with Apache License 2.0 | 6 votes |
def get_stdeng_spacy_tagger(suppress_errors=False):
    """Return the shared ``SpacyTagger`` wrapper, creating it on first use.

    The wrapper is cached in the module-level ``SPACY_WRAPPER`` only after
    the spaCy model has loaded successfully, so a failed attempt never
    leaves a half-initialised wrapper behind for later calls.

    Args:
        suppress_errors: when true, import/load failures are swallowed and
            ``None`` is returned; when false they are re-raised.

    Returns:
        The cached wrapper, or ``None`` if loading failed and
        ``suppress_errors`` is true.
    """
    global SPACY_WRAPPER
    if SPACY_WRAPPER is not None:
        return SPACY_WRAPPER
    try:
        import spacy
        wrapper = SpacyTagger()
        wrapper.spacy_object = spacy.load('en', parser=False, entity=False)
    except ImportError:
        if not suppress_errors:
            raise
    except (RuntimeError, OSError):
        ## this seems to happen if the 'en' model is not installed. it might
        ## look like this:
        # RuntimeError: Model 'en' not installed. Please run 'python -m spacy.en.download' to install latest compatible model.
        ## (newer spaCy releases raise OSError for a missing model instead)
        if not suppress_errors:
            raise
    else:
        # Publish the wrapper only once it is fully usable.
        SPACY_WRAPPER = wrapper
        return SPACY_WRAPPER
    return None
Example #15
Source File: test_construct_query.py From adam_qas with GNU General Public License v3.0 | 6 votes |
def test_construct_query(self):
    """Build a search query for questions 5..7 and store it back in the DB.

    Rows are read positionally: id, question text, question type and a
    JSON-encoded feature blob. Rows whose features decode to ``None`` are
    skipped.
    """
    sql_man = SqLiteManager()
    en_nlp_l = spacy.load(EN_MODEL_MD)

    for row in sql_man.get_questions_between(5, 7):
        qid = row[0]
        with self.subTest(qid):
            question = row[1]
            question_type = row[2]
            question_feat = json.loads(row[3])
            if question_feat is None:
                continue

            en_doc = en_nlp_l(u'' + question)
            query = construct_query(question_feat, en_doc)
            print("{0}){1} :\nQuery: {2}".format(qid, question, repr(query)))

            # The query is stored as the JSON dump of its repr().
            sql_man.update_search_query(qid, json.dumps(repr(query)))
            assert query is not None
    # sql_man.close_db()
Example #16
Source File: text2mapVec.py From Geocoding-with-Map-Vector with GNU General Public License v3.0 | 6 votes |
def buildMapVec(text):
    """
    An example wrapper function for text2mapVec(), reads in necessary collections and then runs text2mapVec().
    Feel free to modify to your preference and task objective.
    The pickle files and the database connection are closed before returning.
    :param text: to create the Map Vector from encoded as unicode.
    :return: currently only prints the vector, add 'return map_vector' or whatever you prefer.
    """
    # ``with`` closes the pickle files; the bare ``open(...)`` calls leaked them.
    with open(u"data/1x1_encode_map.pkl") as encode_file:
        ENCODING_MAP = cPickle.load(encode_file)  # the resolution of the map
    with open(u"data/1x1_outliers_map.pkl") as outliers_file:
        OUTLIERS_MAP = cPickle.load(outliers_file)  # dimensions must match the above
    nlp = spacy.load(u'en_core_web_lg')  # or spacy.load(u'en') depending on your Spacy Download (simple or full)
    # Keep a handle on the connection (not just its cursor) so it can be closed.
    connection = sqlite3.connect(u'../data/geonames.db')  # this DB can be downloaded using the GitHub link
    try:
        map_vector = text2mapvec(doc=nlp(text), mapping=ENCODING_MAP, outliers=OUTLIERS_MAP,
                                 polygon_size=1, db=connection.cursor(), exclude=u"Cairo")
    finally:
        connection.close()
    print(map_vector)

# text = u"The Giza pyramid complex is an archaeological site on the Giza Plateau, on the outskirts of Cairo, Egypt."
# buildMapVec(text)
Example #17
Source File: prepare_data.py From Hierarchical-Sentiment with MIT License | 6 votes |
def build_dataset(args):
    """Tokenize the reviews from ``args.input`` and assign random splits.

    Every record contributes ``(reviewerID, asin, tokenized reviewText,
    overall)``. The rows are shuffled and each one receives a random split
    index in ``[0, args.nb_splits - 1]``.

    Returns:
        dict with the keys ``"data"``, ``"splits"`` and ``"rows"`` (the
        column names of a data row).
    """
    print("Building dataset from : {}".format(args.input))
    print("-> Building {} random splits".format(args.nb_splits))

    nlp = spacy.load('en', create_pipeline=custom_pipeline)
    # Two independent iterators over one pass of the input: one supplies the
    # metadata, the other feeds the review text to spaCy.
    gen_a, gen_b = itertools.tee(data_generator(args.input), 2)
    records = tqdm((z for z in gen_a), desc="reading file")
    parsed_reviews = nlp.pipe((x["reviewText"] for x in gen_b), batch_size=1000000, n_threads=8)

    data = []
    for record, tok in zip(records, parsed_reviews):
        data.append((record["reviewerID"], record["asin"], tok, record["overall"]))

    print(data[0])
    shuffle(data)

    splits = [randint(0, args.nb_splits - 1) for _ in data]
    count = Counter(splits)
    print("Split distribution is the following:")
    print(count)

    return {
        "data": data,
        "splits": splits,
        "rows": ("user_id", "item_id", "review", "rating"),
    }
Example #18
Source File: data_loader.py From LipReading with MIT License | 6 votes |
def build_vocab(dataset_name, labels):
    """Build the character-to-index vocabulary of a dataset.

    The labels file ``labels`` is looked up in the raw directory of
    ``dataset_name`` and must contain a JSON list of strings, which are
    concatenated. If it cannot be read or parsed, the hardcoded ``_labels``
    are used and a warning is logged.

    The mapping starts with the entries of ``_markers2Id``; every character
    not yet present then receives the next free index.

    Returns:
        dict mapping each marker/character to its index.
    """
    raw_dir = _util.getRelRawPath(dataset_name)
    labels_path = os.path.join(raw_dir, labels)
    try:
        with open(labels_path) as label_file:
            labels = str(''.join(json.load(label_file)))
    except Exception:
        # ``Exception`` rather than a bare ``except`` keeps the best-effort
        # fallback but lets KeyboardInterrupt / SystemExit propagate.
        labels = _labels
        _getSharedLogger().warning("Could not open '%s'... \n\tUsing hardcoded labels: '%s'", labels_path, labels)

    char2idx = dict(_markers2Id.items())
    for char in labels:
        # Skip characters that already have an index: re-assigning
        # ``len(char2idx)`` to an existing key does not grow the dict, so the
        # next new character would get the very same index.
        if char not in char2idx:
            char2idx[char] = len(char2idx)
    return char2idx
Example #19
Source File: utils.py From comet-commonsense with Apache License 2.0 | 5 votes |
def load_existing_data_loader(data_loader, path):
    """Copy attributes from a saved data loader onto ``data_loader``.

    The object stored at ``path`` is read with ``torch.load``. Every
    attribute name present in both instance dictionaries is copied from the
    saved object to ``data_loader``; all other attributes stay untouched.
    """
    saved_loader = torch.load(path)
    saved_attributes = saved_loader.__dict__
    for name in data_loader.__dict__.keys():
        if name in saved_attributes:
            setattr(data_loader, name, getattr(saved_loader, name))

################################################################################
#
# Code Below taken from HuggingFace pytorch-openai-lm repository
#
################################################################################
Example #20
Source File: TermDocMatrixFactory.py From scattertext with Apache License 2.0 | 5 votes |
def get_nlp(self):
    """Return ``self._nlp``, or a freshly loaded 'en' spaCy model if it is None.

    The freshly loaded model is returned but not stored on the instance.
    """
    configured = self._nlp
    if configured is not None:
        return configured
    # spaCy is imported lazily, only when no model was supplied.
    import spacy
    return spacy.load('en')
Example #21
Source File: test.py From negspacy with MIT License | 5 votes |
def test_own_terminology():
    """With "whatever" as termination term, the second entity is not negated."""
    pipeline = spacy.load("en_core_web_sm")
    pipeline.add_pipe(Negex(pipeline, termination=["whatever"]), last=True)
    parsed = pipeline("He does not like Steve Jobs whatever he says about Barack Obama.")
    second_entity = parsed.ents[1]
    assert second_entity._.negex == False
Example #22
Source File: test_science_ie_data_utils.py From sciwing with MIT License | 5 votes |
def test_get_bilou_lines(
    self, setup_science_ie_train_data_utils, entity_type, file_id
):
    """The annotated words and the non-"O" BILOU words must match in number."""
    utils = setup_science_ie_train_data_utils

    # test whether all the annotations that you get for different file ids
    # are present as either U, B I or L tag
    annotations = utils._get_annotations_for_entity(
        file_id=file_id, entity=entity_type
    )
    bilou_lines = utils.get_bilou_lines_for_entity(
        file_id=file_id, entity=entity_type
    )
    nlp = spacy.load("en_core_web_sm")

    # Tokens of every annotation, in order.
    annotation_words = []
    for annotation in annotations:
        stripped = annotation["words"].strip()
        annotation_words.extend(tok.text for tok in nlp(stripped))

    # Words of every BILOU line whose tag (4th field) does not start with "O".
    bilou_words_without_o = []
    for bilou_line in bilou_lines:
        word, _, _, tag = bilou_line.split()
        if tag.startswith("O"):
            continue
        bilou_words_without_o.append(word)

    print(annotation_words)
    print(bilou_words_without_o)
    assert len(annotation_words) == len(bilou_words_without_o)
Example #23
Source File: test_spacy.py From docker-python with Apache License 2.0 | 5 votes |
def test_model(self):
    """The 'en' model splits the sample sentence into five tokens."""
    pipeline = spacy.load('en')
    parsed = pipeline('This is a sentence.')
    token_count = len(parsed)
    self.assertEqual(5, token_count)
Example #24
Source File: infer.py From BERT-Relation-Extraction with Apache License 2.0 | 5 votes |
def load_pickle(filename):
    """Unpickle and return the object stored in ``./data/<filename>``."""
    pickle_path = os.path.join("./data/", filename)
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)
Example #25
Source File: word_tokenizer.py From sciwing with MIT License | 5 votes |
def __init__(self, tokenizer: str = "spacy"):
    """ WordTokenizers split the text into tokens

    Parameters
    ----------
    tokenizer : str
        The type of tokenizer.

        spacy
            Tokenizer from spaCy
        nltk
            NLTK based tokenizer
        vanilla
            Tokenize words according to space
        spacy-whitespace
            Same as vanilla but implemented using custom
            white space tokenizer from spaCy

    The spaCy model is loaded only for the two spaCy-based tokenizers; for
    the others ``self.nlp`` is ``None``.
    """
    super(WordTokenizer, self).__init__()
    self.msg_printer = Printer()
    self.tokenizer = tokenizer
    self.allowed_tokenizers = ["spacy", "nltk", "vanilla", "spacy-whitespace"]
    assert self.tokenizer in self.allowed_tokenizers, AssertionError(
        f"The word tokenizer can be {self.allowed_tokenizers}"
    )

    # Keep the attribute defined for every tokenizer type.
    self.nlp = None
    # Membership test instead of `== "spacy" or "spacy-whitespace"`: the
    # bare string on the right of `or` is always truthy, so the model used
    # to be loaded even for "nltk" and "vanilla".
    if self.tokenizer in ("spacy", "spacy-whitespace"):
        self.nlp = spacy.load("en_core_web_sm")
        # Only tokenization is needed; drop the heavier pipeline components.
        self.nlp.remove_pipe("parser")
        self.nlp.remove_pipe("tagger")
        self.nlp.remove_pipe("ner")

        if self.tokenizer == "spacy-whitespace":
            self.nlp.tokenizer = CustomSpacyWhiteSpaceTokenizer(self.nlp.vocab)
Example #26
Source File: build_pretrain_corpus.py From scibert with Apache License 2.0 | 5 votes |
def _get_spacy_nlp():
    """Load the scispacy small model with the 'ner' and 'tagger' pipes disabled."""
    disabled_pipes = ['ner', 'tagger']
    # Alternative general-purpose model:
    # nlp = spacy.load('en_core_web_sm', disable=['ner', 'tagger'])
    return spacy.load('en_scispacy_core_web_sm', disable=disabled_pipes)
Example #27
Source File: install.py From QuickUMLS with MIT License | 5 votes |
def install_spacy(lang):
    """Tries to create a spacy object; if it fails, downloads the dataset

    Languages missing from ``SPACY_LANGUAGE_MAP`` are ignored after the
    initial status message.
    """
    print(f'Determining if SpaCy for language "{lang}" is installed...')
    if lang not in SPACY_LANGUAGE_MAP:
        return

    model_name = SPACY_LANGUAGE_MAP[lang]
    try:
        spacy.load(model_name)
        print(f'SpaCy is installed and avaliable for {lang}!')
    except OSError:
        # Loading failed: fetch the model instead.
        print('SpaCy is not available! Attempting to download and install...')
        spacy.cli.download(model_name)
Example #28
Source File: data_loader.py From LipReading with MIT License | 5 votes |
def __init__(self, dataset_name, split_name, vid_ids, labels='labels.json', start_end='s_e',
             threshold=0.8, fps=29.97, cap='cap', frame_type='face_lmk_seq',
             sentence_dataset=False, in_ext='.npy', out_ext='.pkl', refresh=False):
    """ Dataset that loads the video dataview and captions.

    The dataset is built with ``construct_dataset`` when ``refresh`` is set
    or the pickle directory does not exist yet; otherwise it is read back
    with ``load_dataset``.

    :param vid_ids: Directories of video ids to include in the dataset.
    :param labels: Corresponding dataset vocabulary.
    :param cap: Base filename for caption rows to load.
    :param frame_type: Frame type to use for input, also the base filename for frame rows to load.
    """
    super(FrameCaptionDataset, self).__init__()
    assert all(os.path.isdir(x) for x in vid_ids)
    assert frame_type in ('face_lmk_seq', 'face_vtx_seq')

    dataset_kind = 'sentence' if sentence_dataset else 'non-sentence'
    pickle_dir = _util.getRelPicklesPath(dataset_name, dataset_kind, split_name)

    if not refresh and os.path.isdir(pickle_dir):
        # Pickles already exist and no rebuild was requested.
        char2idx, frames, captions = load_dataset(pickle_dir, out_ext=out_ext)
    else:
        char2idx, frames, captions = FrameCaptionDataset.construct_dataset(
            dataset_name, pickle_dir, vid_ids,
            labels=labels, start_end=start_end, cap=cap, frame_type=frame_type,
            sentence_dataset=sentence_dataset, in_ext=in_ext, fps=fps, threshold=threshold)
    assert len(frames) == len(captions) > 0

    # Cache all rows.
    self.char2idx = char2idx
    self.idx2char = {index: char for char, index in char2idx.items()}
    self.frames = frames
    self.captions = captions
    self.num_elements = len(captions)

    # Cache dataview paths.
    self.frame_type = frame_type
Example #29
Source File: test.py From negspacy with MIT License | 5 votes |
def test_issue7():
    """Smoke test: the Negex pipeline processes a text without entities."""
    pipeline = spacy.load("en_core_web_sm")
    pipeline.add_pipe(Negex(pipeline), last=True)
    # NOTE(review): the ruler and the patterns are created but never added
    # to the pipeline; kept as in the original test.
    ruler = EntityRuler(pipeline)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
    doc = pipeline("fgfgdghgdh")
Example #30
Source File: sample_size_NN.py From robotreviewer with GNU General Public License v3.0 | 5 votes |
def tokenize_abstract(abstract, nlp=None):
    """Return the token texts and their ``tag_`` values for ``abstract``.

    Args:
        abstract: text handed to ``nlp``.
        nlp: callable returning an iterable of tokens with ``text`` and
            ``tag_`` attributes; the 'en' spaCy model is loaded when None.

    Returns:
        ``(tokens, POS_tags)``: two lists of equal length.
    """
    if nlp is None:
        nlp = spacy.load('en')

    # Single pass over the parsed abstract, then split the pairs.
    pairs = [(word.text, word.tag_) for word in nlp(abstract)]
    tokens = [text for text, _ in pairs]
    POS_tags = [tag for _, tag in pairs]
    return tokens, POS_tags