Python spacy.load() Examples

The following are 30 code examples of spacy.load(). You can go to the original project or source file by following the link above each example. You may also want to check out all available functions and classes of the spacy module.
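Before the project excerpts, here is a minimal sketch of the pattern they all share: load a pipeline once and reuse it for every document. The model name below is an assumption and must already be installed, e.g. with python -m spacy download en_core_web_sm.

import spacy

nlp = spacy.load("en_core_web_sm")  # returns a Language object holding the full pipeline
doc = nlp("spacy.load builds a pipeline and calling it on text produces a Doc.")
print([token.text for token in doc])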
Example #1
Source File: tokenization_openai.py    From Bert-Chinese-Text-Classification-Pytorch with MIT License
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
        try:
            import ftfy
            import spacy
            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True,
                                      never_split=special_tokens if special_tokens is not None else [])
            self.fix_text = None

        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v:k for k,v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens) 
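Note that the 'en' shortcut used above was removed in spaCy v3; on newer versions a concrete package name is loaded instead. A rough equivalent, assuming en_core_web_sm is installed:

import spacy

# On spaCy v3+, shortcut links such as 'en' no longer exist, so the package is
# loaded directly; only components present in that pipeline are disabled here.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])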
Example #2
Source File: test.py    From negspacy with MIT License
def test_umls2():
    nlp = spacy.load("en_core_sci_sm")
    negex = Negex(
        nlp, language="en_clinical_sensitive", ent_types=["ENTITY"], chunk_prefix=["no"]
    )
    nlp.add_pipe(negex, last=True)
    docs = build_med_docs()
    for d in docs:
        doc = nlp(d[0])
        for i, e in enumerate(doc.ents):
            print(e.text, e._.negex)
            assert (e.text, e._.negex) == d[1][i]


# blocked by spacy 2.1.8 issue. Adding back after spacy 2.2.
# def test_no_ner():
#     nlp = spacy.load("en_core_web_sm", disable=["ner"])
#     negex = Negex(nlp)
#     nlp.add_pipe(negex, last=True)
#     with pytest.raises(ValueError):
#         doc = nlp("this doc has not been NERed") 
Example #3
Source File: algorithm.py    From neuralcoref with MIT License
def one_shot_coref(
        self,
        utterances,
        utterances_speakers_id=None,
        context=None,
        context_speakers_id=None,
        speakers_names=None,
    ):
        """ Clear history, load a list of utterances and an optional context and run the coreference model on them

        Args:
        - `utterances` : iterator or list of strings corresponding to successive utterances (in a dialogue) or sentences.
            Can be a single string for non-dialogue text.
        - `utterances_speakers_id=None` : iterator or list of speaker ids for each utterance (in the case of a dialogue).
            - if not provided, two speakers speaking alternately are assumed.
            - if `utterances` and `utterances_speakers_id` are not of the same length, the shorter is padded with None.
        - `context=None` : iterator or list of strings corresponding to additional utterances/sentences sent prior to `utterances`. Coreferences are not computed for the mentions identified in `context`; those mentions are only used as possible antecedents for mentions in `utterances`. This reduces computation when we are only interested in resolving coreference in the last sentences/utterances.
        - `context_speakers_id=None` : same as `utterances_speakers_id` for `context`.
        - `speakers_names=None` : dictionary of lists of acceptable speaker names (strings) for each speaker_id in `utterances_speakers_id` and `context_speakers_id`
        Return:
            clusters of entities with coreference resolved
        """
        self.data.set_utterances(context, context_speakers_id, speakers_names)
        self.continuous_coref(utterances, utterances_speakers_id, speakers_names)
        return self.get_clusters() 
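Based on the docstring above, a hypothetical usage sketch; the enclosing class is assumed to be the Coref wrapper from this project's algorithm.py, and the import path is assumed from the standalone neuralcoref releases.

from neuralcoref import Coref  # assumed import path

coref = Coref()
clusters = coref.one_shot_coref(
    utterances=u"She loves him.",        # mentions in this text get resolved
    context=u"My sister has a friend.",  # earlier text, used only as antecedent candidates
)
print(clusters)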
Example #4
Source File: annotate.py    From ConvLab with MIT License
def tokenize(data, process_text=True, process_da=True, process_ref=True):
    print('Begin tokenization:')
    print('='*50)
    nlp = spacy.load('en_core_web_sm')
    cnt = 0
    for no, session in data.items():
        cnt += 1
        if cnt % 1000 == 0:
            print('[%d|%d]' % (cnt,len(data)))
        for turn in session['log']:
            if process_text:
                doc = nlp(turn['text'])
                turn['text'] = ' '.join([token.text for token in doc]).strip()
            if process_da:
                for da, svs in turn['dialog_act'].items():
                    for i in range(len(svs)):
                        if svs[i][0] == 'Ref' and not process_ref:
                            continue
                        svs[i][1] = ' '.join([token.text for token in nlp(svs[i][1])]).strip()
    print('=' * 50)
    print('Finish tokenization') 
Example #5
Source File: science_ie_data_utils.py    From sciwing with MIT License
def __init__(self, folderpath: pathlib.Path, ignore_warnings=False):
        """ Given the folderpath where the ScienceIE data is stored, this class provides various
        utilities. For more information on the dataset you can refer to https://scienceie.github.io/

        Parameters
        ----------
        folderpath : pathlib.Path
            The path where the ScienceIEDataset is stored
        ignore_warnings : bool
            If True, all the warnings generated by this class for inconsistencies in the
            data are ignored

        """
        self.folderpath = folderpath
        self.ignore_warning = ignore_warnings
        self.entity_types = ["Process", "Material", "Task"]
        self.file_ids = self.get_file_ids()
        self.msg_printer = wasabi.Printer()
        self.nlp = spacy.load("en_core_web_sm")
        self._conll_col_sep = " " 
Example #6
Source File: imdb_pytorch.py    From lineflow with MIT License
def build_vocab(tokens, cache='vocab.pkl', max_size=50000):
    if not osp.isfile(cache):
        counter = Counter(tokens)
        words, _ = zip(*counter.most_common(max_size))
        words = [PAD_TOKEN, UNK_TOKEN] + list(words)
        token_to_index = dict(zip(words, range(len(words))))
        if START_TOKEN not in token_to_index:
            token_to_index[START_TOKEN] = len(token_to_index)
            words += [START_TOKEN]
        if END_TOKEN not in token_to_index:
            token_to_index[END_TOKEN] = len(token_to_index)
            words += [END_TOKEN]
        with open(cache, 'wb') as f:
            pickle.dump((token_to_index, words), f)
    else:
        with open(cache, 'rb') as f:
            token_to_index, words = pickle.load(f)

    return token_to_index, words 
Example #7
Source File: sample_size_NN.py    From robotreviewer with GNU General Public License v3.0
def get_X_y(df):
    nlp = spacy.load('en') # for POS tagging

    X, y = [], []
    for instance in df.iterrows():
        instance = instance[1]

        abstract_tokens, POS_tags = tokenize_abstract(instance["ab_numbers"], nlp)
        abstract_tokens = replace_n_equals(abstract_tokens)

        nums_to_labels = {instance["enrolled_totals"]:"N", instance["enrolled_P1"]:"n1", instance["enrolled_P2"]:"n2"}
        cur_y = annotate(abstract_tokens, nums_to_labels)
        cur_x, numeric_token_indices = abstract2features(abstract_tokens, POS_tags)

        X.extend(cur_x)
        y.extend([cur_y[idx] for idx in numeric_token_indices])

    return X, y_to_bin(y) 
Example #8
Source File: algorithm.py    From neuralcoref with MIT License
def __init__(self, model_path):
        weights, biases = [], []
        for file in sorted(os.listdir(model_path)):
            if file.startswith("single_mention_weights"):
                w = np.load(os.path.join(model_path, file))
                weights.append(w)
            if file.startswith("single_mention_bias"):
                w = np.load(os.path.join(model_path, file))
                biases.append(w)
        self.single_mention_model = list(zip(weights, biases))
        weights, biases = [], []
        for file in sorted(os.listdir(model_path)):
            if file.startswith("pair_mentions_weights"):
                w = np.load(os.path.join(model_path, file))
                weights.append(w)
            if file.startswith("pair_mentions_bias"):
                w = np.load(os.path.join(model_path, file))
                biases.append(w)
        self.pair_mentions_model = list(zip(weights, biases)) 
Example #9
Source File: utils.py    From fastNLP with Apache License 2.0
def get_tokenizer(tokenize_method: str, lang='en'):
    r"""

    :param str tokenize_method: which tokenizer to use
    :param str lang: language; currently only 'en' is supported
    :return: a tokenize function
    """
    tokenizer_dict = {
        'spacy': None,
        'raw': _raw_split,
        'cn-char': _cn_char_split,
    }
    if tokenize_method == 'spacy':
        import spacy
        spacy.prefer_gpu()
        if lang != 'en':
            raise RuntimeError("Spacy only supports en right right.")
        en = spacy.load(lang)
        tokenizer = lambda x: [w.text for w in en.tokenizer(x)]
    elif tokenize_method in tokenizer_dict:
        tokenizer = tokenizer_dict[tokenize_method]
    else:
        raise RuntimeError(f"Only support {tokenizer_dict.keys()} tokenizer.")
    return tokenizer 
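A hypothetical usage of the helper above; an English spaCy model is assumed to be available under the name 'en', as the spacy.load(lang) call requires.

tokenize = get_tokenizer("spacy", lang="en")
print(tokenize("fastNLP wraps several tokenizers behind one interface."))
# roughly: ['fastNLP', 'wraps', 'several', 'tokenizers', 'behind', 'one', 'interface', '.']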
Example #10
Source File: run.py    From fake-news-detection with MIT License
def pipeline(args):
    '''
    Runs the model loop.
    '''
    df = pd.read_csv(args.filename)
    df.loc[:,args.x_label] = df[args.x_label].fillna("None")
    if args.dedupe:
        df = df.drop_duplicates(subset='content')
    if args.reduce:
        df = restrict_sources(df)
    X = df[args.x_label]
    y = df[args.y_label]
    parser = spacy.load('en')
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    loop = ModelLoop(X_train, X_test, y_train, y_test, args.models,
                     args.iterations, args.output_dir,
                     thresholds = args.thresholds, ks = args.ks,
                     setting=args.features[0])
    loop.run() 
Example #11
Source File: test_spacy_udpipe.py    From spacy-udpipe with MIT License
def test_morph_exception() -> None:
    assert spacy.__version__ <= SPACY_VERSION

    lang = RO
    text = "Ce mai faci?"

    download(lang=lang)

    try:
        nlp = load(lang=lang)
        assert nlp._meta["lang"] == f"udpipe_{lang}"
        doc = nlp(text)
    except ValueError:
        nlp = load(lang=lang, ignore_tag_map=True)
        assert nlp._meta["lang"] == f"udpipe_{lang}"
        doc = nlp(text)

    assert doc 
Example #12
Source File: data_loader.py    From LipReading with MIT License
def split_sentences(dataviews, captions):
    nlp = spacy.load('en')
    new_frames, new_captions = [], []
    for frames, caps in zip(dataviews, captions):
      new_fs, new_caps = [], []
      left = 0
      right = 1
      while left < len(caps) and right < len(caps):
        cap = " ".join(caps[left:right])
        doc = nlp(cap)
        sentences = [x.string.strip() for x in doc.sents]
        if len(sentences) >= 2 and right - 1 - left > 0:
          cap = " ".join(caps[left:right - 1])
          new_fs.append(np.concatenate(frames[left:right - 1]))
          print("sentence:", cap)
          new_caps.append(cap)
          left = right - 1
        right += 1
      new_frames.append(new_fs)
      new_captions.append(new_caps)
    return new_frames, new_captions

  # REVIEW josephz: This is a copy of `FrameCaptionDataset.parse_caption`. 
Example #13
Source File: phrasemachine.py    From scattertext with Apache License 2.0
def __init__(self):
		import nltk
		from nltk.tag import PerceptronTagger
		from nltk.tokenize import TreebankWordTokenizer
		#return pkgutil.get_data('scattertext',
		#                        'data/viz/semiotic_new.html').decode('utf-8')
		path = os.path.dirname(sys.modules['scattertext'].__file__)+'/data/'
		tokenizer_fn = path + 'punkt.english.pickle'
		tagger_fn = path + 'averaged_perceptron_tagger.pickle'
		#tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
		#tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
		# Load the tagger
		self.tagger = PerceptronTagger(load=False)
		self.tagger.load(tagger_fn)

		# note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
		#       Calling the TreebankWordTokenizer like this allows skipping the downloader.
		#       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
		#       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
		self.tokenize = TreebankWordTokenizer().tokenize
		self.sent_detector = nltk.data.load(tokenizer_fn)

	# http://www.nltk.org/book/ch05.html 
Example #14
Source File: phrasemachine.py    From scattertext with Apache License 2.0
def get_stdeng_spacy_tagger(suppress_errors=False):
	global SPACY_WRAPPER
	if SPACY_WRAPPER is not None:
		return SPACY_WRAPPER
	try:
		import spacy
		SPACY_WRAPPER = SpacyTagger()
		SPACY_WRAPPER.spacy_object = spacy.load('en', parser=False, entity=False)
		return SPACY_WRAPPER
	except ImportError:
		if not suppress_errors: raise
	except RuntimeError:
		## this seems to happen if the 'en' model is not installed. it might
		## look like this:
		# RuntimeError: Model 'en' not installed. Please run 'python -m spacy.en.download' to install latest compatible model.
		if not suppress_errors: raise
	return None 
Example #15
Source File: test_construct_query.py    From adam_qas with GNU General Public License v3.0
def test_construct_query(self):
        sql_man = SqLiteManager()
        en_nlp_l = spacy.load(EN_MODEL_MD)

        result = sql_man.get_questions_between(5, 7)

        for row in result:
            qid = row[0]
            with self.subTest(qid):
                question = row[1]
                question_type = row[2]
                question_feat = json.loads(row[3])

                if question_feat is not None:

                    en_doc = en_nlp_l(u'' + question)

                    query = construct_query(question_feat, en_doc)
                    print("{0}){1} :\nQuery: {2}".format(qid, question, repr(query)))
                    js_query = json.dumps(repr(query))
                    sql_man.update_search_query(qid, js_query)
                    assert query is not None
        # sql_man.close_db() 
Example #16
Source File: text2mapVec.py    From Geocoding-with-Map-Vector with GNU General Public License v3.0
def buildMapVec(text):
    """
    An example wrapper function for text2mapVec(), reads in necessary collections and then runs text2mapVec().
    Feel free to modify to your preference and task objective.
    :param text: to create the Map Vector from encoded as unicode.
    :return: currently only prints the vector, add 'return map_vector' or whatever you prefer.
    """
    ENCODING_MAP = cPickle.load(open(u"data/1x1_encode_map.pkl"))  # the resolution of the map
    OUTLIERS_MAP = cPickle.load(open(u"data/1x1_outliers_map.pkl"))  # dimensions must match the above
    nlp = spacy.load(u'en_core_web_lg')  # or spacy.load(u'en') depending on your Spacy Download (simple or full)
    conn = sqlite3.connect(u'../data/geonames.db').cursor()  # this DB can be downloaded using the GitHub link
    map_vector = text2mapvec(doc=nlp(text), mapping=ENCODING_MAP, outliers=OUTLIERS_MAP, polygon_size=1, db=conn, exclude=u"Cairo")
    print(map_vector)


# text = u"The Giza pyramid complex is an archaeological site on the Giza Plateau, on the outskirts of Cairo, Egypt."
# buildMapVec(text) 
Example #17
Source File: prepare_data.py    From Hierarchical-Sentiment with MIT License
def build_dataset(args):

    print("Building dataset from : {}".format(args.input))
    print("-> Building {} random splits".format(args.nb_splits))

    nlp = spacy.load('en', create_pipeline=custom_pipeline)
    gen_a,gen_b = itertools.tee(data_generator(args.input),2)
    data = [(z["reviewerID"],z["asin"],tok,z["overall"]) for z,tok in zip(tqdm((z for z in gen_a),desc="reading file"),nlp.pipe((x["reviewText"] for x in gen_b), batch_size=1000000, n_threads=8))]

    print(data[0])
    shuffle(data)

    splits = [randint(0,args.nb_splits-1) for _ in range(0,len(data))]
    count = Counter(splits)

    print("Split distribution is the following:")
    print(count)

    return {"data":data,"splits":splits,"rows":("user_id","item_id","review","rating")} 
Example #18
Source File: data_loader.py    From LipReading with MIT License
def build_vocab(dataset_name, labels):
  raw_dir = _util.getRelRawPath(dataset_name)
  labels_path = os.path.join(raw_dir, labels)
  try:
    with open(labels_path) as label_file:
      labels = str(''.join(json.load(label_file)))
  except:
    labels = _labels
    _getSharedLogger().warning("Could not open '%s'... \n\tUsing hardcoded labels: '%s'", labels_path, labels)

  char2idx = {}
  for k, v in _markers2Id.items():
    char2idx[k] = v
  for char in labels:
    char2idx[char] = len(char2idx)
  return char2idx 
Example #19
Source File: utils.py    From comet-commonsense with Apache License 2.0
def load_existing_data_loader(data_loader, path):
    old_data_loader = torch.load(path)
    for attr in data_loader.__dict__.keys():
        if attr not in old_data_loader.__dict__.keys():
            continue
        setattr(data_loader, attr, getattr(old_data_loader, attr))


################################################################################
#
# Code Below taken from HuggingFace pytorch-openai-lm repository
#
################################################################################ 
Example #20
Source File: TermDocMatrixFactory.py    From scattertext with Apache License 2.0
def get_nlp(self):
        nlp = self._nlp
        if nlp is None:
            import spacy
            nlp = spacy.load('en')
        return nlp 
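The getter above lazily loads spaCy only when it is first needed. A sketch of the same idea using functools.lru_cache, so the expensive spacy.load call happens at most once per process (the model name is an assumption):

import functools
import spacy

@functools.lru_cache(maxsize=None)
def get_cached_nlp(model: str = "en_core_web_sm"):
    # repeated calls with the same model name reuse the already-loaded pipeline
    return spacy.load(model)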
Example #21
Source File: test.py    From negspacy with MIT License
def test_own_terminology():
    nlp = spacy.load("en_core_web_sm")
    negex = Negex(nlp, termination=["whatever"])
    nlp.add_pipe(negex, last=True)
    doc = nlp("He does not like Steve Jobs whatever he says about Barack Obama.")
    assert doc.ents[1]._.negex == False 
Example #22
Source File: test_science_ie_data_utils.py    From sciwing with MIT License
def test_get_bilou_lines(
        self, setup_science_ie_train_data_utils, entity_type, file_id
    ):
        utils = setup_science_ie_train_data_utils

        # test whether all the annotations that you get for different file ids
        # are present as either U, B I or L tag

        annotations = utils._get_annotations_for_entity(
            file_id=file_id, entity=entity_type
        )
        bilou_lines = utils.get_bilou_lines_for_entity(
            file_id=file_id, entity=entity_type
        )
        nlp = spacy.load("en_core_web_sm")

        annotation_words = []
        for annotation in annotations:
            words = annotation["words"]
            words = words.strip()
            doc = nlp(words)
            words = [tok.text for tok in doc]
            annotation_words.extend(words)

        bilou_words_without_o = []
        for bilou_line in bilou_lines:
            word, _, _, tag = bilou_line.split()
            if not tag.startswith("O"):
                bilou_words_without_o.append(word)

        print(annotation_words)
        print(bilou_words_without_o)
        assert len(annotation_words) == len(bilou_words_without_o) 
Example #23
Source File: test_spacy.py    From docker-python with Apache License 2.0
def test_model(self):
        nlp = spacy.load('en')
        doc = nlp('This is a sentence.')
        self.assertEqual(5, len(doc)) 
Example #24
Source File: infer.py    From BERT-Relation-Extraction with Apache License 2.0
def load_pickle(filename):
    completeName = os.path.join("./data/",\
                                filename)
    with open(completeName, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    return data 
Example #25
Source File: word_tokenizer.py    From sciwing with MIT License
def __init__(self, tokenizer: str = "spacy"):
        """ WordTokenizers split the text into tokens

        Parameters
        ----------
        tokenizer : str
            The type of tokenizer.

            spacy
                Tokenizer from spacy
            nltk
                NLTK based tokenizer
            vanilla
                Tokenize words according to space
            spacy-whitespace
                Same as vanilla but implemented using custom white space tokenizer from spacy


        """
        super(WordTokenizer, self).__init__()
        self.msg_printer = Printer()
        self.tokenizer = tokenizer
        self.allowed_tokenizers = ["spacy", "nltk", "vanilla", "spacy-whitespace"]
        assert self.tokenizer in self.allowed_tokenizers, AssertionError(
            f"The word tokenizer can be {self.allowed_tokenizers}"
        )

        if self.tokenizer == "spacy" or "spacy-whitespace":
            self.nlp = spacy.load("en_core_web_sm")
            self.nlp.remove_pipe("parser")
            self.nlp.remove_pipe("tagger")
            self.nlp.remove_pipe("ner")

        if self.tokenizer == "spacy-whitespace":
            self.nlp.tokenizer = CustomSpacyWhiteSpaceTokenizer(self.nlp.vocab) 
Example #26
Source File: build_pretrain_corpus.py    From scibert with Apache License 2.0
def _get_spacy_nlp():
    nlp = spacy.load('en_scispacy_core_web_sm', disable=['ner', 'tagger'])
    # nlp = spacy.load('en_core_web_sm', disable=['ner', 'tagger'])
    return nlp 
Example #27
Source File: install.py    From QuickUMLS with MIT License
def install_spacy(lang):
    """Tries to create a spacy object; if it fails, downloads the dataset"""

    print(f'Determining if SpaCy for language "{lang}" is installed...')

    if lang in SPACY_LANGUAGE_MAP:
        try:
            spacy.load(SPACY_LANGUAGE_MAP[lang])
            print(f'SpaCy is installed and avaliable for {lang}!')
        except OSError:
            print(f'SpaCy is not available! Attempting to download and install...')
            spacy.cli.download(SPACY_LANGUAGE_MAP[lang]) 
Example #28
Source File: data_loader.py    From LipReading with MIT License
def __init__(self, dataset_name, split_name, vid_ids,
      labels='labels.json', start_end='s_e', threshold=0.8, fps=29.97,
      cap='cap', frame_type='face_lmk_seq', sentence_dataset=False,
      in_ext='.npy', out_ext='.pkl',
      refresh=False):
    """ Dataset that loads the video dataview and captions.

    :param vid_ids: Directories of video ids to include in the dataset.
    :param labels: Corresponding dataset vocabulary.
    :param cap: Base filename for caption rows to load.
    :param frame_type: Frame type to use for input, also the base filename for frame rows to load.
    """
    super(FrameCaptionDataset, self).__init__()
    assert all(os.path.isdir(x) for x in vid_ids)
    assert frame_type in ('face_lmk_seq', 'face_vtx_seq')

    pickle_dir = _util.getRelPicklesPath(dataset_name, 'sentence' if sentence_dataset else 'non-sentence', split_name)
    if refresh or not os.path.isdir(pickle_dir):
      char2idx, frames, captions = FrameCaptionDataset.construct_dataset(dataset_name, pickle_dir, vid_ids,
        labels=labels, start_end=start_end, cap=cap, frame_type=frame_type, sentence_dataset=sentence_dataset,
        in_ext=in_ext, fps=fps, threshold=threshold)
    else:
      char2idx, frames, captions = load_dataset(pickle_dir, out_ext=out_ext)
    assert len(frames) == len(captions) > 0

    # Cache all rows.
    self.char2idx = char2idx
    self.idx2char = {v: k for k, v in char2idx.items()}

    self.frames = frames
    self.captions = captions
    self.num_elements = len(captions)

    # Cache dataview paths.
    self.frame_type = frame_type 
Example #29
Source File: test.py    From negspacy with MIT License
def test_issue7():
    nlp = spacy.load("en_core_web_sm")
    negex = Negex(nlp)
    nlp.add_pipe(negex, last=True)
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
    doc = nlp("fgfgdghgdh") 
Example #30
Source File: sample_size_NN.py    From robotreviewer with GNU General Public License v3.0
def tokenize_abstract(abstract, nlp=None):
    if nlp is None:
        nlp = spacy.load('en')

    tokens, POS_tags = [], []
    ab = nlp(abstract)
    for word in ab:
        tokens.append(word.text)
        POS_tags.append(word.tag_)
        #import pdb;pdb.set_trace()
    return tokens, POS_tags
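A hypothetical call to the helper above, passing a preloaded pipeline so the model is not reloaded for every abstract (the model name is an assumption):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model name
tokens, pos_tags = tokenize_abstract("Thirty patients were randomized to placebo.", nlp)
print(list(zip(tokens, pos_tags)))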