Python ftfy.fix_text() Examples

The following are 30 code examples of ftfy.fix_text(), drawn from open-source projects. Each example lists its source file, the project it comes from, and that project's license. You may also want to check out the other functions and classes available in the ftfy module.
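
Before the project-specific examples, here is a minimal standalone sketch (not taken from any project below) of the problem ftfy.fix_text() solves: text that was encoded as UTF-8 and then decoded with the wrong codec.

import ftfy

broken = "sch\u00c3\u00b6n"       # "schön" encoded as UTF-8, then mis-decoded as Latin-1
print(ftfy.fix_text(broken))      # expected output: schön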
Example #1
Source File: utils.py    From comet-commonsense with Apache License 2.0 (8 votes)
def encode(self, texts, verbose=True):
        texts_tokens = []
        if verbose:
            for text in tqdm(texts, ncols=80, leave=False):
                text = self.nlp(text_standardize(ftfy.fix_text(text)))
                text_tokens = []
                for token in text:
                    text_tokens.extend(
                        [self.encoder.get(t, 0) for t in
                         self.bpe(token.text.lower()).split(' ')])
                texts_tokens.append(text_tokens)
        else:
            for text in texts:
                text = self.nlp(text_standardize(ftfy.fix_text(text)))
                text_tokens = []
                for token in text:
                    text_tokens.extend(
                        [self.encoder.get(t, 0) for t in
                         self.bpe(token.text.lower()).split(' ')])
                texts_tokens.append(text_tokens)
        return texts_tokens 
Example #2
Source File: tokenization_openai.py    From Bert-Chinese-Text-Classification-Pytorch with MIT License (7 votes)
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
        try:
            import ftfy
            import spacy
            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True,
                                      never_split=special_tokens if special_tokens is not None else [])
            self.fix_text = None

        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v:k for k,v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens) 
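
As a side note on the constructor above, the merges file is parsed into self.bpe_ranks, a dict that maps each BPE merge pair to its priority (earlier lines merge first). A minimal sketch with a hypothetical in-memory merges file, using io.StringIO as a stand-in for the real file on disk:

import io

fake_merges = io.StringIO("#version: 0.2\ne r</w>\nlo w\nlow er</w>\n")
lines = fake_merges.read().split('\n')[1:-1]        # drop the header line and the trailing empty string
merges = [tuple(line.split()) for line in lines]
bpe_ranks = dict(zip(merges, range(len(merges))))
print(bpe_ranks)   # expected: {('e', 'r</w>'): 0, ('lo', 'w'): 1, ('low', 'er</w>'): 2}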
Example #3
Source File: cnndm.py    From lm-human-preferences with MIT License (6 votes)
def cnndm_generator(mode, seed=0, shuffle=False, comm=None):
    # data originally from https://github.com/abisee/cnn-dailymail
    if mode == 'valid':
        mode = 'val'
    with open(gcs.download_file_cached(f'gs://lm-human-preferences/datasets/cnndm/url_lists/all_{mode}.txt', comm=comm)) as f:
        urls = [line.strip() for line in f]
    if shuffle:
        random.seed(seed)
        random.shuffle(urls)
    # if n_eval > 0:
    #     urls = urls[:n_eval]

    urls_dir = gcs.download_directory_cached(f'gs://lm-human-preferences/datasets/cnndm/cache_{mode}', comm=comm)

    for i, url in enumerate(urls):
        path = os.path.join(urls_dir, get_path_of_url(url))
        text = open(path).read()
        text = clean_up_start(text)
        text = ftfy.fix_text(text)

        text = re.sub(r"\n{3,}", "\n\n", text)
        text = text.split('@highlight')[0].strip()
        yield text
        # _, ref_sents = get_art_abs(path) 
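
A minimal sketch of the per-article cleanup inside the loop above, run on a made-up article string: ftfy repairs the encoding, runs of three or more newlines collapse to a blank line, and everything from the first @highlight marker onward is dropped.

import re
import ftfy

raw = "The fox\u00e2\u0080\u0099s story.\n\n\n\nMore text.\n@highlight\nfox jumps"
text = ftfy.fix_text(raw)
text = re.sub(r"\n{3,}", "\n\n", text)
text = text.split('@highlight')[0].strip()
print(text)
# expected output:
# The fox's story.
#
# More text.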
Example #4
Source File: autosumpdf.py    From autosum with MIT License (6 votes)
def search_citation(text, exp):
    '''Finds sentences around citations, where the regexp `exp` matches.'''

    text = text.decode('utf-8')
    lines = text.split('\n')
    text = ' '.join(lines)
    text = ' '.join(text.split())
    text = ftfy.fix_text(text)
    logging.info("Search...'{0!s}'".format(exp))

    sentences = split_sentences(text)
    regex = re.compile(exp, flags=(re.I))

    founds = set()
    for sent in sentences:
        if regex.search(sent):
            founds.add(sent)
    return founds 
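
A minimal sketch of the same flow on a made-up paragraph; a naive regex splitter stands in for the project's split_sentences() helper, which is not shown here.

import re
import ftfy

text = "Prior work exists. Doe (2019) proposed X. We extend it."
text = ftfy.fix_text(' '.join(text.split()))
sentences = re.split(r'(?<=[.!?])\s+', text)       # stand-in for split_sentences()
regex = re.compile(r'Doe \(2019\)', flags=re.I)
print({s for s in sentences if regex.search(s)})   # expected: {'Doe (2019) proposed X.'}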
Example #5
Source File: main.py    From python-examples with MIT License (6 votes)
def test(data):
    text, expected = data

    text2 = text.encode('cp437').decode('utf-8')

    text3 = unidecode(text2)
    text4 = unicodedata.normalize('NFC', text2)

    text5 = unidecode(text4)

    print('                                text:', text, '| len:', len(text))
    print('                            expected:', expected, '  | len:', len(expected))
    print('                    text == expected:', text == expected)
    print('-------------------------------------')
    print('text.encode("cp437").decode("utf-8"):', text2, '  | len:', len(text2), '| expected:', text2 == expected)
    print('                      unicode(text2):', text3, '  | len:', len(text3), '| expected:', text3 == expected)
    print('-------------------------------------')
    print(' unicodedata.normalize("NFC", text2):', text4, '  | len:', len(text4), '| expected:', text4 == expected)
    print('                      unicode(text4):', text5, '  | len:', len(text5), '| expected:', text5 == expected)
    print('-------------------------------------')
    print('                 ftfy.fix_text(text):', ftfy.fix_text(text))
    print('-------------------------------------') 
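
A minimal sketch (not from the project above) contrasting the manual round-trip in test() with ftfy: the manual repair has to know which wrong codec produced the mojibake (cp437 in the example above, Latin-1 here), while ftfy.fix_text() detects it on its own.

import ftfy

original = "café"
mojibake = original.encode('utf-8').decode('latin-1')   # "cafÃ©"
print(mojibake.encode('latin-1').decode('utf-8'))       # manual fix: needs the right codec pair
print(ftfy.fix_text(mojibake))                          # expected: café, with no codec guessing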
Example #6
Source File: text_utils.py    From openai-gpt-pytorch with MIT License (6 votes)
def encode(self, texts, verbose=True):
        texts_tokens = []
        if verbose:
            for text in tqdm(texts, ncols=80, leave=False):
                text = self.nlp(text_standardize(ftfy.fix_text(text)))
                text_tokens = []
                for token in text:
                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
                texts_tokens.append(text_tokens)
        else:
            for text in texts:
                text = self.nlp(text_standardize(ftfy.fix_text(text)))
                text_tokens = []
                for token in text:
                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
                texts_tokens.append(text_tokens)
        return texts_tokens 
Example #7
Source File: subtitle.py    From bazarr with GNU General Public License v3.0 (6 votes)
def get_modified_content(self, format="srt", debug=False):
        """
        :return: string 
        """
        if not self.mods:
            return fix_text(self.content.decode(encoding=self._guessed_encoding), **ftfy_defaults).encode(
                encoding=self._guessed_encoding)

        submods = SubtitleModifications(debug=debug)
        if submods.load(content=self.text, language=self.language):
            logger.info("Applying mods: %s", self.mods)
            submods.modify(*self.mods)
            self.mods = submods.mods_used

            content = fix_text(self.pysubs2_to_unicode(submods.f, format=format), **ftfy_defaults)\
                .encode(encoding=self._guessed_encoding)
            submods.f = None
            del submods
            return content
        return None 
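
The ftfy_defaults mapping expanded with ** above holds keyword arguments for fix_text(); its exact contents are defined elsewhere in the project. A minimal sketch with hypothetical option values (uncurl_quotes and normalization are real fix_text() parameters, but these settings are not bazarr's actual defaults):

import ftfy

ftfy_options = {"uncurl_quotes": False, "normalization": "NFC"}   # hypothetical values
line = "It\u00e2\u0080\u0099s fine."
print(ftfy.fix_text(line, **ftfy_options))   # expected: "It’s fine." (the repaired curly quote is kept)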
Example #8
Source File: tokenization_openai.py    From HPSG-Neural-Parser with MIT License (6 votes)
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)

        try:
            import ftfy
            import spacy
            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v:k for k,v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {} 
Example #9
Source File: summarize.py    From python-qutescript with BSD 2-Clause "Simplified" License (6 votes)
def summarize_text(request):
    if request.html:
        parser = HtmlParser.from_file(file_path=request.html,
                                      url=request.url,
                                      tokenizer=Tokenizer(LANGUAGE))
    else:
        parser = PlaintextParser.from_file(file_path=request.html,
                                           tokenizer=Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = [fix_text(str(s)) for s in summarizer(parser.document, SENTENCES_COUNT)]
    html = generate_html(sentences, fix_text(request.title)).render()
    request.send_html(html) 
Example #10
Source File: tokenization_openai.py    From PPLM with Apache License 2.0 (6 votes)
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
        try:
            import ftfy
            import spacy
            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True,
                                      never_split=special_tokens if special_tokens is not None else [])
            self.fix_text = None

        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v:k for k,v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens) 
Example #11
Source File: tokenization_openai.py    From KagNet with MIT License (6 votes)
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
        try:
            import ftfy
            import spacy
            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True,
                                      never_split=special_tokens if special_tokens is not None else [])
            self.fix_text = None

        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v:k for k,v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
        self.set_special_tokens(special_tokens) 
Example #12
Source File: tokenization_openai.py    From bert_on_stilts with Apache License 2.0 (6 votes)
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
        try:
            import ftfy
            import spacy
            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True,
                                      never_split=special_tokens if special_tokens is not None else [])
            self.fix_text = None

        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v:k for k,v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
        self.set_special_tokens(special_tokens) 
Example #13
Source File: tokenization_openai.py    From CCF-BDCI-Sentiment-Analysis-Baseline with Apache License 2.0 (6 votes)
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)

        try:
            import ftfy
            from spacy.lang.en import English
            _nlp = English()
            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v:k for k,v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {} 
Example #14
Source File: text_utils.py    From models with MIT License (6 votes)
def encode(self, texts, verbose=True):
        texts_tokens = []
        if verbose:
            for text in tqdm(texts, ncols=80, leave=False):
                text = self.nlp(text_standardize(ftfy.fix_text(text)))
                text_tokens = []
                for token in text:
                    text_tokens.extend(
                        [self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
                texts_tokens.append(text_tokens)
        else:
            for text in texts:
                text = self.nlp(text_standardize(ftfy.fix_text(text)))
                text_tokens = []
                for token in text:
                    text_tokens.extend(
                        [self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
                texts_tokens.append(text_tokens)
        return texts_tokens 
Example #15
Source File: tokenization_openai.py    From TextClassify with Apache License 2.0 (6 votes)
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)

        try:
            import ftfy
            from spacy.lang.en import English
            _nlp = English()
            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v:k for k,v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {} 
Example #16
Source File: tldr.py    From lm-human-preferences with MIT License (6 votes)
def tldr_generator(mode, seed=0, shuffle=False, comm=None):
    random.seed(seed)

    if mode == 'test':
        mode = 'valid' # validation set serves as training set, since we don't have access..
    assert mode in ['train', 'valid']

    with open(gcs.download_file_cached(f'gs://lm-human-preferences/tldr/{mode}-subset.json', comm=comm)) as f:
        datas = json.load(f)

    if shuffle:
        random.seed(seed)
        random.shuffle(datas)

    for data in datas:
        text = data['content']
        text = ftfy.fix_text(text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = text.strip()
        yield text 
Example #17
Source File: tokenization_openai.py    From squash-generation with MIT License (6 votes)
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
        try:
            import ftfy
            import spacy
            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True,
                                      never_split=special_tokens if special_tokens is not None else [])
            self.fix_text = None

        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v:k for k,v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens) 
Example #18
Source File: text_utils.py    From finetune-transformer-lm with MIT License (6 votes)
def encode(self, texts, verbose=True):
        texts_tokens = []
        if verbose:
            for text in tqdm(texts, ncols=80, leave=False):
                text = self.nlp(text_standardize(ftfy.fix_text(text)))
                text_tokens = []
                for token in text:
                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
                texts_tokens.append(text_tokens)
        else:
            for text in texts:
                text = self.nlp(text_standardize(ftfy.fix_text(text)))
                text_tokens = []
                for token in text:
                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
                texts_tokens.append(text_tokens)
        return texts_tokens 
Example #19
Source File: tokenization_xlm.py    From HPSG-Neural-Parser with MIT License (5 votes)
def _tokenize(self, text):
        """ Tokenize a string. """
        split_tokens = []
        if self.fix_text is None:
            # Using BERT's BasicTokenizer
            text = self.nlp.tokenize(text)
            for token in text:
                split_tokens.extend([t for t in self.bpe(token).split(' ')])
        else:
            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
                split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
        return split_tokens 
Example #20
Source File: pre_process.py    From gpt-2-tensorflow2.0 with MIT License (5 votes)
def process_text(text_files):
	print("Pre-processing the text data.....")
	file_writer = open(PROCESS_DATA_PATH, "w")
	for file_name in tqdm.tqdm(text_files):
		fr = open(file_name, 'r')
		file_writer.writelines([fix_text(line, normalization='NFKC') for line in fr.readlines()])
		fr.close()
	file_writer.close() 
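
A minimal sketch (with a made-up input line) of what the normalization='NFKC' option used above changes compared to the default NFC: compatibility characters, such as the superscript in "mc²", are folded to plain digits.

from ftfy import fix_text

line = "energy E = mc\u00b2\n"
print(repr(fix_text(line, normalization='NFKC')))   # expected: 'energy E = mc2\n'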
Example #21
Source File: EnglishTextPreprocessor.py    From NeuronBlocks with MIT License (5 votes)
def preprocess(self, string):
        if self.__unicode_fix:
            string = ftfy.fix_text(string)
        if self.__DBC2SBC:
            string = self.DBC2SBC(string)
        return string 
Example #22
Source File: tokenization_xlm.py    From TextClassify with Apache License 2.0 (5 votes)
def _tokenize(self, text):
        """ Tokenize a string. """
        split_tokens = []
        if self.fix_text is None:
            # Using BERT's BasicTokenizer
            text = self.nlp.tokenize(text)
            for token in text:
                split_tokens.extend([t for t in self.bpe(token).split(' ')])
        else:
            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
                split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
        return split_tokens 
Example #23
Source File: tokenization_openai.py    From PPLM with Apache License 2.0 (5 votes)
def set_special_tokens(self, special_tokens):
        """ Add a list of additional tokens to the encoder.
            The additional tokens are indexed starting from the last index of the
            current vocabulary in the order of the `special_tokens` list.
        """
        if not special_tokens:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
        if self.fix_text is None:
            # Using BERT's BasicTokenizer: we can update the tokenizer
            self.nlp.never_split = special_tokens
        logger.info("Special tokens {}".format(self.special_tokens)) 
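
A minimal sketch (with a hypothetical two-word vocabulary) of the indexing scheme used above: special tokens are appended after the last id of the regular vocabulary, and the decoder is simply the inverse mapping.

encoder = {"hello": 0, "world": 1}
special_tokens = ["<bos>", "<eos>"]

special = {tok: len(encoder) + i for i, tok in enumerate(special_tokens)}
special_decoder = {v: k for k, v in special.items()}
print(special)           # expected: {'<bos>': 2, '<eos>': 3}
print(special_decoder)   # expected: {2: '<bos>', 3: '<eos>'}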
Example #24
Source File: tokenization_openai.py    From PPLM with Apache License 2.0 (5 votes)
def tokenize(self, text):
        """ Tokenize a string. """
        split_tokens = []
        if self.fix_text is None:
            # Using BERT's BasicTokenizer
            text = self.nlp.tokenize(text)
            for token in text:
                split_tokens.extend([t for t in self.bpe(token).split(' ')])
        else:
            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
                split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
        return split_tokens 
Example #25
Source File: upgradedCleaner.py    From EchoBurst with MIT License (5 votes)
def jsonCleaner(jsonl, filter, writeList, count):
    """Reads and cleans the JSONL input files."""

    for jsonFile in jsonl:
        try:
            # loads the line as a dictionary
            d = json.loads(jsonFile, encoding="utf-8")
            body = d["body"]
            # deletes URLs
            body = re.sub(r'https?:\/\/.*[\r\n]*', '', body, flags=re.MULTILINE)
            # fixes any unicode encoding errors
            body = ftfy.fix_text(body)
            # deletes chosen punctuation
            body = body.translate(filter)
            body = body.lower()
            # eliminates empty, deleted or removed comments
            if re.search('[a-z]', body) \
                    and body != " removed " \
                    and body != " deleted " \
                    and "your submission has been automatically removed" not in body \
                    and d["author"] != "AutoModerator":
                # this line can be changed to include subreddits or other pieces of data in addition to the comment body
                writeList.append("{}\n".format(body))
                count += 1
            # every 500 000th line it will spit out the 10 most recent comments
            if count == 500000:
                print(writeList[-10:])
                count = 0
        # handles invalid dictionaries
        except ValueError:
            print("Decoding Error") 
Example #26
Source File: upgradedCleaner.py    From EchoBurst with MIT License (5 votes)
def textCleaner(text, filter, writeList, count):
    for line in text:
        # deletes URLs
        line = re.sub(r'https?:\/\/.*[\r\n]*', '', line, flags=re.MULTILINE)
        # fixes unicode encoding errors
        line = ftfy.fix_text(line)
        # deletes chosen punctuation
        line = line.translate(filter)
        line = line.lower()
        if re.search('[a-z]', line):
            writeList.append(line + "\n")
            count += 1
        if count == 500000:
            print(writeList[-10:])
            count = 0 
Example #27
Source File: tokenization_xlm.py    From TextClassify with Apache License 2.0 (5 votes)
def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
                 sep_token="</s>", pad_token="<pad>", cls_token="</s>",
                 mask_token="<special1>", additional_special_tokens=["<special0>",
                 "<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
                 "<special6>", "<special7>", "<special8>", "<special9>"], **kwargs):
        super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
                                           sep_token=sep_token, pad_token=pad_token,
                                           cls_token=cls_token, mask_token=mask_token,
                                           additional_special_tokens=additional_special_tokens,
                                           **kwargs)
        try:
            import ftfy
            from spacy.lang.en import English
            _nlp = English()
            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v:k for k,v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {} 
Example #28
Source File: tokenization_xlm.py    From HPSG-Neural-Parser with MIT License (5 votes)
def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
                 sep_token="</s>", pad_token="<pad>", cls_token="</s>",
                 mask_token="<special1>", additional_special_tokens=["<special0>",
                 "<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
                 "<special6>", "<special7>", "<special8>", "<special9>"], **kwargs):
        super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
                                           sep_token=sep_token, pad_token=pad_token,
                                           cls_token=cls_token, mask_token=mask_token,
                                           additional_special_tokens=additional_special_tokens,
                                           **kwargs)
        try:
            import ftfy
            import spacy
            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v:k for k,v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {} 
Example #29
Source File: tokenization_openai.py    From KagNet with MIT License (5 votes)
def tokenize(self, text):
        """ Tokenize a string. """
        split_tokens = []
        if self.fix_text is None:
            # Using BERT's BasicTokenizer
            text = self.nlp.tokenize(text)
            for token in text:
                split_tokens.extend([t for t in self.bpe(token).split(' ')])
        else:
            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
                split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
        return split_tokens 
Example #30
Source File: plugin.py    From limnoria-plugins with Do What The F*ck You Want To Public License (5 votes)
def normalize(self, q):
            q = BeautifulSoup(q)
            q = fix_text(q.text).replace(r"\'", "'").replace(r"\"", '"')
            q = re.sub("([.!?])([A-Z(])(?![.'])", r"\g<1> \g<2>", q)
            q = re.sub("([,;:)])([a-zA-Z(])", r"\g<1> \g<2>", q)
            q = " ".join(q.split())
            return q
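
A minimal sketch of normalize() on a made-up trivia string; an explicit "html.parser" is passed to BeautifulSoup here to keep the sketch self-contained (the original relies on the default parser). The regexes re-insert the space that HTML stripping removed after sentence-ending punctuation.

import re
from bs4 import BeautifulSoup
from ftfy import fix_text

q = BeautifulSoup("<b>What is H2O?Water.</b>", "html.parser")
q = fix_text(q.text).replace(r"\'", "'").replace(r"\"", '"')
q = re.sub("([.!?])([A-Z(])(?![.'])", r"\g<1> \g<2>", q)
q = re.sub("([,;:)])([a-zA-Z(])", r"\g<1> \g<2>", q)
print(" ".join(q.split()))   # expected: "What is H2O? Water."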