Python regex.IGNORECASE Examples
The following are 30
code examples of regex.IGNORECASE().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
regex
, or try the search function
.
Example #1
Source File: gpt2_bpe_utils.py From attn2d with MIT License | 6 votes |
def __init__(self, encoder, bpe_merges, errors='replace'): self.encoder = encoder self.decoder = {v:k for k,v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} try: import regex as re self.re = re except ImportError: raise ImportError('Please install regex with: pip install regex') # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = self.re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #2
Source File: regexp_tokenizer.py From justcopy-backend with MIT License | 6 votes |
def __init__(self, **kwargs): """ Args: annotators: None or empty set (only tokenizes). substitutions: if true, normalizes some token types (e.g. quotes). """ self._regexp = regex.compile( '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|' '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|' '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|' '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' % (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN, self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2, self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE, self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT, self.NON_WS), flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE ) if len(kwargs.get('annotators', {})) > 0: logger.warning('%s only tokenizes! Skipping annotators: %s' % (type(self).__name__, kwargs.get('annotators'))) self.annotators = set() self.substitutions = kwargs.get('substitutions', True)
Example #3
Source File: regexp_tokenizer.py From OpenQA with MIT License | 6 votes |
def __init__(self, **kwargs): """ Args: annotators: None or empty set (only tokenizes). substitutions: if true, normalizes some token types (e.g. quotes). """ self._regexp = regex.compile( '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|' '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|' '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|' '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' % (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN, self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2, self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE, self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT, self.NON_WS), flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE ) if len(kwargs.get('annotators', {})) > 0: logger.warning('%s only tokenizes! Skipping annotators: %s' % (type(self).__name__, kwargs.get('annotators'))) self.annotators = set() self.substitutions = kwargs.get('substitutions', True)
Example #4
Source File: gpt2_bpe_utils.py From fairseq with MIT License | 6 votes |
def __init__(self, encoder, bpe_merges, errors='replace'): self.encoder = encoder self.decoder = {v:k for k,v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} try: import regex as re self.re = re except ImportError: raise ImportError('Please install regex with: pip install regex') # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = self.re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #5
Source File: tokenization_gpt2.py From Bert-Chinese-Text-Classification-Pytorch with MIT License | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None): self.max_len = max_len if max_len is not None else int(1e12) self.encoder = json.load(open(vocab_file)) self.decoder = {v:k for k,v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_data] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") self.special_tokens = {} self.special_tokens_decoder = {} self.set_special_tokens(special_tokens)
Example #6
Source File: tokenization.py From GPT2sQA with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None): self.max_len = max_len if max_len is not None else int(1e12) self.encoder = json.load(open(vocab_file)) self.decoder = {v: k for k, v in self.encoder.items()} self.errors = errors # how to handle errors in decoding bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_data] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") self.special_tokens = {} self.special_tokens_decoder = {} self.set_special_tokens(special_tokens)
Example #7
Source File: tokenization_roberta.py From TextClassify with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>", cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs): super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, **kwargs) self.encoder = json.load(open(vocab_file, encoding="utf-8")) self.decoder = {v: k for k, v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_data] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #8
Source File: tokenization_gpt2.py From TextClassify with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>", bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs): super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) self.encoder = json.load(open(vocab_file)) self.decoder = {v:k for k,v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_data] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #9
Source File: tokenization_gpt2.py From NLP_Toolkit with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>", bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs): super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens self.encoder = json.load(open(vocab_file, encoding="utf-8")) self.decoder = {v: k for k, v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_data] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #10
Source File: regexp_tokenizer.py From FusionNet with MIT License | 6 votes |
def __init__(self, **kwargs): """ Args: annotators: None or empty set (only tokenizes). substitutions: if true, normalizes some token types (e.g. quotes). """ self._regexp = regex.compile( '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|' '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|' '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|' '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' % (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN, self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2, self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE, self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT, self.NON_WS), flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE ) if len(kwargs.get('annotators', {})) > 0: logger.warning('%s only tokenizes! Skipping annotators: %s' % (type(self).__name__, kwargs.get('annotators'))) self.annotators = set() self.substitutions = kwargs.get('substitutions', True)
Example #11
Source File: tokenization_roberta.py From CCF-BDCI-Sentiment-Analysis-Baseline with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>", cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs): super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, **kwargs) self.encoder = json.load(open(vocab_file, encoding="utf-8")) self.decoder = {v: k for k, v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_data] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #12
Source File: tokenization_gpt2.py From CCF-BDCI-Sentiment-Analysis-Baseline with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>", bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs): super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) self.encoder = json.load(open(vocab_file)) self.decoder = {v:k for k,v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_data] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #13
Source File: utils.py From chepy with GNU General Public License v3.0 | 6 votes |
def count_occurances(self, regex: str, case_sensitive: bool = False): """Counts occurances of the regex. Counts the number of times the provided string occurs. Args: regex (str): Required. Regex string to search for case_sensitive (bool, optional): If search should be case insensitive, by default False Returns: Chepy: The Chepy object. Examples: >>> Chepy("AABCDADJAKDJHKSDAJSDdaskjdhaskdjhasdkja").count_occurances("ja").output 2 """ if case_sensitive: r = re.compile(regex) else: r = re.compile(regex, re.IGNORECASE) self.state = len(r.findall(self._convert_to_str())) return self
Example #14
Source File: search.py From chepy with GNU General Public License v3.0 | 6 votes |
def search_ctf_flags(self, prefix: str, postfix: str = ".+?\{*\}"): """Search CTF style flags. This by default assumes that the flag format is similar to something like picoCTF{some_flag} as an example. Args: prefix (str): Prefix of the flag. Like `picoCTF` postfix (str, optional): Regex for the remainder of the flag. Defaults to '.+\{.+}'. Returns: Chepy: The Chepy object. Examples: >>> Chepy("tests/files/flags").read_file().search_ctf_flags("pico").get_by_index(0) picoCTF{r3source_pag3_f1ag} """ self.state = re.findall(prefix + postfix, self._convert_to_str(), re.IGNORECASE) return self
Example #15
Source File: test_pre_process_regexp_option.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 6 votes |
def test_regex_recognizes_source(self): src_list = [ ('Retail Store A', 1,), ('Big Bank AB', 1,), ('Acme Capital, Inc.', 1,), ('Lowe & Swayze', 1,), ('Big Bank & Company (004578)', 1,), ('Family Name Limited (173437)', 1,), ('Financial Services & Co. (015607)', 1,), ('Food Wholsale, Inc. (056230)', 1,), ('All Eyes Communications (018951)', 1,), ('Joe Smith Archives, LLC d/b/a Foxtrot (085292)', 2,), ] importer = self.make_importer() importer.wrap_in_wordbreaks = True for phrase, target_ct in src_list: ptrns = importer.pre_process_regexp_option([phrase]) self.assertEqual(target_ct, len(ptrns), f'"{phrase}" produced {len(ptrns)} patterns, expected {target_ct}') matches = 0 for ptrn in ptrns: rg = re.compile(ptrn, re.IGNORECASE) for _ in rg.finditer(phrase): matches += 1 self.assertEqual(target_ct, matches, f'"{phrase}" gives {matches} matches, expected {target_ct}')
Example #16
Source File: tokenization_gpt2.py From squash-generation with MIT License | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None): self.max_len = max_len if max_len is not None else int(1e12) self.encoder = json.load(open(vocab_file)) self.decoder = {v:k for k,v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_data] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") self.special_tokens = {} self.special_tokens_decoder = {} self.set_special_tokens(special_tokens)
Example #17
Source File: timezone_parser.py From dateparser with BSD 3-Clause "New" or "Revised" License | 6 votes |
def build_tz_offsets(search_regex_parts): def get_offset(tz_obj, regex, repl='', replw=''): return ( tz_obj[0], { 'regex': re.compile(re.sub(repl, replw, regex % tz_obj[0]), re.IGNORECASE), 'offset': timedelta(seconds=tz_obj[1]) } ) for tz_info in timezone_info_list: for regex in tz_info['regex_patterns']: for tz_obj in tz_info['timezones']: search_regex_parts.append(tz_obj[0]) yield get_offset(tz_obj, regex) # alternate patterns for replace, replacewith in tz_info.get('replace', []): for tz_obj in tz_info['timezones']: search_regex_parts.append(re.sub(replace, replacewith, tz_obj[0])) yield get_offset(tz_obj, regex, repl=replace, replw=replacewith)
Example #18
Source File: tokenization_gpt2.py From PPLM with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None): self.max_len = max_len if max_len is not None else int(1e12) self.encoder = json.load(open(vocab_file)) self.decoder = {v:k for k,v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_data] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") self.special_tokens = {} self.special_tokens_decoder = {} self.set_special_tokens(special_tokens)
Example #19
Source File: utils.py From chepy with GNU General Public License v3.0 | 6 votes |
def find_replace(self, pattern: str, repl: str, ignore_case=True): """Replace matched pattern with repln Args: pattern (str): Required. Pattern to search repl (str): Required. Pattern to match ignore_case (bool, optional): Case insensitive. Defaults to True. Returns: Chepy: The Chepy object. Examples: >>> Chepy("some some data").find_replace(r"some\s", "data").o "datadatadata" """ flags = 0 if ignore_case: flags = re.IGNORECASE self.state = re.sub(pattern, repl, self._convert_to_str(), flags=flags) return self
Example #20
Source File: regexp_tokenizer.py From RCZoo with MIT License | 6 votes |
def __init__(self, **kwargs): """ Args: annotators: None or empty set (only tokenizes). substitutions: if true, normalizes some token types (e.g. quotes). """ self._regexp = regex.compile( '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|' '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|' '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|' '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' % (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN, self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2, self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE, self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT, self.NON_WS), flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE ) if len(kwargs.get('annotators', {})) > 0: logger.warning('%s only tokenizes! Skipping annotators: %s' % (type(self).__name__, kwargs.get('annotators'))) self.annotators = set() self.substitutions = kwargs.get('substitutions', True)
Example #21
Source File: tokens.py From chimera with MIT License | 5 votes |
def tokenize_sentences(text: str): text = re.sub(r" no\. ent_(\d)", r" shorthand_number ent_\1", text, flags=re.IGNORECASE) return [s.replace("shorthand_number", "no.") for s in sent_tokenize(text)]
Example #22
Source File: grammar.py From estnltk with GNU General Public License v2.0 | 5 votes |
def __init__(self, pattern, flags=re.UNICODE | re.MULTILINE | re.IGNORECASE, name=None): super(IRegex, self).__init__(pattern, flags, name)
Example #23
Source File: encoder.py From gpt2-estimator with MIT License | 5 votes |
def __init__(self, encoder, bpe_merges, errors='replace'): self.encoder = encoder self.decoder = {v:k for k,v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #24
Source File: tokenization_gpt2.py From exbert with Apache License 2.0 | 5 votes |
def __init__( self, vocab_file, merges_file, errors="replace", unk_token="<|endoftext|>", bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs ): super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) self.max_len_single_sentence = ( self.max_len ) # no default special tokens - you can update this value if you add special tokens self.max_len_sentences_pair = ( self.max_len ) # no default special tokens - you can update this value if you add special tokens with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} with open(merges_file, encoding="utf-8") as merges_handle: bpe_merges = merges_handle.read().split("\n")[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_merges] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #25
Source File: tokenization_gpt2.py From KagNet with MIT License | 5 votes |
def __init__(self, vocab_file, merges_file, errors='replace', max_len=None): self.max_len = max_len if max_len is not None else int(1e12) self.encoder = json.load(open(vocab_file)) self.decoder = {v:k for k,v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_data] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #26
Source File: encoder.py From grover with Apache License 2.0 | 5 votes |
def __init__(self, encoder, bpe_merges, errors='replace'): self.encoder = {k: v + 1 for k, v in encoder.items()} self.encoder['<|padding|>'] = 0 self.padding = 0 del self.encoder['<|endoftext|>'] for special_token_type in ['domain', 'date', 'authors', 'title', 'article', 'summary']: setattr(self, f'begin_{special_token_type}', len(self.encoder)) self.encoder[f'<|begin{special_token_type}|>'] = len(self.encoder) setattr(self, f'end_{special_token_type}', len(self.encoder)) self.encoder[f'<|endof{special_token_type}|>'] = len(self.encoder) # This will be used if we want to combine short articles. self.reset_context = len(self.encoder) self.encoder['<|resetcontext|>'] = len(self.encoder) ################################## END OF SPECIAL TOKENS TO ADD self.decoder = {v: k for k, v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #27
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _generate_relative_translations(self, normalize=False): relative_translations = self.info.get('relative-type-regex', {}) relative_dictionary = OrderedDict() for key, value in relative_translations.items(): if normalize: value = list(map(normalize_unicode, value)) pattern = '|'.join(sorted(value, key=len, reverse=True)) pattern = DIGIT_GROUP_PATTERN.sub(r'?P<n>\d+', pattern) pattern = re.compile(r'^(?:{})$'.format(pattern), re.UNICODE | re.IGNORECASE) relative_dictionary[pattern] = key return relative_dictionary
Example #28
Source File: dictionary.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _construct_split_regex(self): known_words_group = "|".join(map(re.escape, self._get_sorted_words_from_cache())) if self._no_word_spacing: regex = r"^(.*?)({})(.*)$".format(known_words_group) else: regex = r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(known_words_group) self._split_regex_cache.setdefault( self._settings.registry_key, {})[self.info['name']] = \ re.compile(regex, re.UNICODE | re.IGNORECASE)
Example #29
Source File: tokenization_gpt2.py From bert_on_stilts with Apache License 2.0 | 5 votes |
def __init__(self, vocab_file, merges_file, errors='replace', max_len=None): self.max_len = max_len if max_len is not None else int(1e12) self.encoder = json.load(open(vocab_file)) self.decoder = {v:k for k,v in self.encoder.items()} self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_data] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #30
Source File: dictionary.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _construct_split_relative_regex(self): known_relative_strings_group = "|".join(self._get_sorted_relative_strings_from_cache()) if self._no_word_spacing: regex = "({})".format(known_relative_strings_group) else: regex = "(?<=(?:\\A|\\W|_))({})(?=(?:\\Z|\\W|_))".format(known_relative_strings_group) self._split_relative_regex_cache.setdefault( self._settings.registry_key, {})[self.info['name']] = \ re.compile(regex, re.UNICODE | re.IGNORECASE)