Python regex.compile() Examples
The following are 30 code examples of regex.compile(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module regex, or try the search function.
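Note that regex is a third-party module (a near drop-in superset of the standard library's re) that adds features such as Unicode property classes (\p{...}), fuzzy matching, and variable-length lookbehind, all of which appear in the examples below. As a minimal orientation sketch (not taken from any of the listed projects), regex.compile() is used exactly like re.compile():

import regex

# \p{Greek} matches characters in the Greek script; the stdlib re module
# rejects \p{...} escapes, which is why these projects import regex.
pattern = regex.compile(r"\p{Greek}+")
print(pattern.findall("alpha α and βγ"))  # ['α', 'βγ']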
Example #1
Source File: barcodes.py From umis with MIT License | 6 votes |
def exact_barcode_filter(chunk, bc1, bc2, bc3, re_string=None):
    if not re_string:
        re_string = '(.*):CELL_(?P<CB>.*):UMI_(.*)\\n(.*)\\n\\+\\n(.*)\\n'
    parser_re = re.compile(re_string)
    kept = []
    for read in chunk:
        match = parser_re.search(read).groupdict()
        cb1 = match['CB']
        if bc3:
            cb1, cb2, cb3 = cb1.split("-")
        elif bc2:
            cb1, cb2 = cb1.split("-")
        if cb1 not in bc1:
            continue
        if bc2 and cb2 not in bc2:
            continue
        if bc3 and cb3 not in bc3:
            continue
        kept.append(read)
    return kept
Example #2
Source File: sentence_splitter.py From SoMaJo with GNU General Public License v3.0 | 6 votes |
def __init__(self, is_tuple=False, language="de_CMC"):
    """Create a SentenceSplitter object. If the tokenized paragraphs
    contain token classes or extra info, set is_tuple=True.
    """
    self.is_tuple = is_tuple
    # full stop, ellipsis, exclamation and question marks
    self.sentence_ending_punct = re.compile(r"^(?:\.+|…+\.*|[!?]+)$")
    self.opening_punct = re.compile(r"^(?:['\"¿¡\p{Pi}\p{Ps}–—]|-{2,})$")
    self.closing_punct = re.compile(r"^(?:['\"\p{Pf}\p{Pe}])$")
    # International quotes: «» “” ‹› ‘’
    # German quotes: »« „“ ›‹ ‚‘
    self.problematic_quotes = set(['"'])
    if language == "de" or language == "de_CMC":
        # German opening quotes [»›] have category Pf
        # German closing quotes [“‘«‹] have category Pi
        self.problematic_quotes = set(['"', "»", "«", "›", "‹", "“", "‘"])
    self.eos_abbreviations = utils.read_abbreviation_file("eos_abbreviations.txt")
Example #3
Source File: regexp_tokenizer.py From OpenQA with MIT License | 6 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
        (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
         self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
         self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
         self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
         self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
Example #4
Source File: tokenization.py From GPT2sQA with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}

    # Should have added re.IGNORECASE so BPE merges can happen for
    # capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    self.special_tokens = {}
    self.special_tokens_decoder = {}
    self.set_special_tokens(special_tokens)
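The pre-tokenization pattern compiled here (and reused in the other GPT-2-style tokenizers below) splits raw text into contractions, runs of letters, runs of digits, runs of other symbols, and whitespace. A rough illustration of its behaviour, as a standalone sketch with regex imported directly rather than via the project's own imports:

import regex

pat = regex.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
# \p{L} (any letter) and \p{N} (any digit) are regex-module features;
# stdlib re does not understand \p{...} escapes.
print(pat.findall("Hello world! It's 2024."))
# ['Hello', ' world', '!', ' It', "'s", ' 2024', '.']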
Example #5
Source File: regexp_tokenizer.py From justcopy-backend with MIT License | 6 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
        (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
         self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
         self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
         self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
         self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
Example #6
Source File: beta_to_unicode.py From cltk with MIT License | 6 votes |
def __init__(self, pattern1=None, pattern2=None, pattern3=None):
    if pattern1 is None:
        pattern1 = UPPER
    if pattern2 is None:
        pattern2 = LOWER
    if pattern3 is None:
        pattern3 = PUNCT
    self.pattern1 = \
        [(regex.compile(beta_regex, flags=regex.VERSION1), repl) for (beta_regex, repl) in pattern1]
    self.pattern2 = \
        [(regex.compile(beta_regex, flags=regex.VERSION1), repl) for (beta_regex, repl) in pattern2]
    self.pattern3 = \
        [(regex.compile(beta_regex, flags=regex.VERSION1), repl) for (beta_regex, repl) in pattern3]
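The regex.VERSION1 flag used above opts into the module's newer behaviour, which among other things enables set operations inside character classes. A small standalone sketch of that feature (unrelated to the cltk beta-code patterns themselves):

import regex

# With VERSION1, character classes support set operations such as
# difference (--): [[a-z]--[aeiou]] means "lowercase letters except vowels".
consonants = regex.compile(r"[[a-z]--[aeiou]]+", flags=regex.VERSION1)
print(consonants.findall("hello world"))  # ['h', 'll', 'w', 'rld']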
Example #7
Source File: sentence_splitter.py From SoMaJo with GNU General Public License v3.0 | 6 votes |
def __init__(self, is_tuple=False, language="de_CMC"):
    """Create a SentenceSplitter object. If the tokenized paragraphs
    contain token classes or extra info, set is_tuple=True.
    """
    self.is_tuple = is_tuple
    # full stop, ellipsis, exclamation and question marks
    self.sentence_ending_punct = re.compile(r"^(?:\.+|…+\.*|[!?]+)$")
    self.opening_punct = re.compile(r"^(?:['\"¿¡\p{Pi}\p{Ps}–—]|-{2,})$")
    self.closing_punct = re.compile(r"^(?:['\"\p{Pf}\p{Pe}])$")
    # International quotes: «» “” ‹› ‘’
    # German quotes: »« „“ ›‹ ‚‘
    self.problematic_quotes = set(['"'])
    if language == "de" or language == "de_CMC":
        # German opening quotes [»›] have category Pf
        # German closing quotes [“‘«‹] have category Pi
        self.problematic_quotes = set(['"', "»", "«", "›", "‹", "“", "‘"])
    self.eos_abbreviations = utils.read_abbreviation_file("eos_abbreviations.txt")
Example #8
Source File: phrase2vec.py From mat2vec with MIT License | 6 votes |
def exclude_words(phrasegrams, words):
    """Given a list of words, excludes those from the keys of the phrase dictionary."""
    new_phrasergrams = {}
    words_re_list = []
    for word in words:
        we = regex.escape(word)
        words_re_list.append("^" + we + "$|^" + we + "_|_" + we + "$|_" + we + "_")
    word_reg = regex.compile(r"" + "|".join(words_re_list))
    for gram in tqdm(phrasegrams):
        valid = True
        for sub_gram in gram:
            if word_reg.search(sub_gram.decode("unicode_escape", "ignore")) is not None:
                valid = False
                break
            if not valid:
                continue
        if valid:
            new_phrasergrams[gram] = phrasegrams[gram]
    return new_phrasergrams


# Generating word grams.
Example #9
Source File: tokenization_roberta.py From TextClassify with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
             cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
    super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
                                           sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
                                           mask_token=mask_token, **kwargs)

    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}

    # Should have added re.IGNORECASE so BPE merges can happen for
    # capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #10
Source File: tokenization_gpt2.py From TextClassify with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
             bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
    super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)

    self.encoder = json.load(open(vocab_file))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}

    # Should have added re.IGNORECASE so BPE merges can happen for
    # capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #11
Source File: tokenization_gpt2.py From Bert-Chinese-Text-Classification-Pytorch with MIT License | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}

    # Should have added re.IGNORECASE so BPE merges can happen for
    # capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    self.special_tokens = {}
    self.special_tokens_decoder = {}
    self.set_special_tokens(special_tokens)
Example #12
Source File: tokenization_gpt2.py From NLP_Toolkit with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
             bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
    super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
    self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
    self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens

    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}

    # Should have added re.IGNORECASE so BPE merges can happen for
    # capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #13
Source File: hosters.py From bioconda-utils with MIT License | 6 votes |
async def get_versions(self, req, orig_version):
    exclude = set(self.exclude)
    vals = {key: val for key, val in self.vals.items() if key not in exclude}
    link_pattern = replace_named_capture_group(self.link_pattern_compiled, vals)
    link_re = re.compile(link_pattern)
    result = []
    for url in self.releases_urls:
        parser = HrefParser(link_re)
        parser.feed(await req.get_text_from_url(url))
        for match in parser.get_matches():
            match["link"] = urljoin(url, match["href"])
            match["releases_url"] = url
            match["vals"] = vals
            result.append(match)
    return result
Example #14
Source File: umis.py From umis with MIT License | 6 votes |
def umi_histogram(fastq):
    ''' Counts the number of reads for each UMI

    Expects formatted fastq files.
    '''
    annotations = detect_fastq_annotations(fastq)
    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)

    counter = collections.Counter()
    for read in read_fastq(fastq):
        match = parser_re.search(read).groupdict()
        counter[match['MB']] += 1

    for bc, count in counter.most_common():
        sys.stdout.write('{}\t{}\n'.format(bc, count))
Example #15
Source File: hosters.py From bioconda-utils with MIT License | 6 votes |
async def get_versions(self, req, orig_version):
    exclude = set(self.exclude)
    vals = {key: val for key, val in self.vals.items() if key not in exclude}
    link_pattern = replace_named_capture_group(self.link_pattern_compiled, vals)
    link_re = re.compile(link_pattern)
    result = []
    for url in self.releases_urls:
        files = await req.get_ftp_listing(url)
        for fname in files:
            match = link_re.search(fname)
            if match:
                data = match.groupdict()
                data['fn'] = fname
                data['link'] = "ftp://" + vals['host'] + fname
                data['releases_url'] = url
                result.append(data)
    return result
Example #16
Source File: pygrok.py From pygrok with MIT License | 6 votes |
def _load_search_pattern(self):
    self.type_mapper = {}
    py_regex_pattern = self.pattern
    while True:
        # Finding all types specified in the groks
        m = re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern)
        for n in m:
            self.type_mapper[n[1]] = n[2]

        # replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type})
        # with regex and regex group name
        py_regex_pattern = re.sub(r'%{(\w+):(\w+)(?::\w+)?}',
                                  lambda m: "(?P<" + m.group(2) + ">" + self.predefined_patterns[m.group(1)].regex_str + ")",
                                  py_regex_pattern)

        # replace %{pattern_name} with regex
        py_regex_pattern = re.sub(r'%{(\w+)}',
                                  lambda m: "(" + self.predefined_patterns[m.group(1)].regex_str + ")",
                                  py_regex_pattern)

        if re.search(r'%{\w+(:\w+)?}', py_regex_pattern) is None:
            break

    self.regex_obj = re.compile(py_regex_pattern)
Example #17
Source File: helpers.py From PyInquirer with MIT License | 6 votes |
def expect_regex(self, pattern):
    """Read until matches pattern or timeout."""
    # inspired by pexpect/pty_spawn and pexpect/expect.py expect_loop
    end_time = time.time() + self.timeout
    buf = ''
    prog = regex.compile(pattern)
    while (end_time - time.time()) > 0.0:
        # switch to nonblocking read
        reads, _, _ = select.select([self.fd], [], [], end_time - time.time())
        if len(reads) > 0:
            try:
                buf = remove_ansi_escape_sequences(buf + self.read())
            except EOFError:
                assert prog.match(buf) is not None, \
                    'output was:\n%s\nexpect regex pattern:\n%s' % (buf, pattern)
            if prog.match(buf):
                return True
        else:
            # do not eat up CPU when waiting for the timeout to expire
            time.sleep(self.timeout / 10)
    assert prog.match(buf) is not None, \
        'output was:\n%s\nexpect regex pattern:\n%s' % (buf, pattern)
Example #18
Source File: helpers.py From talon with Apache License 2.0 | 6 votes |
def contains_sender_names(sender):
    '''Return a function that searches for the sender's name or a part of it.

    >>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
    >>> feature("Sergey Obukhov")
    1
    >>> feature("BR, Sergey N.")
    1
    >>> feature("Sergey")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("Serobnic")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
    1
    '''
    names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
                                        for e in extract_names(sender)]))
    names = names or sender
    if names != '':
        return binary_regex_search(re.compile(names))
    return lambda s: 0
Example #19
Source File: helpers_test.py From talon with Apache License 2.0 | 5 votes |
def test_binary_regex_search():
    eq_(1, h.binary_regex_search(re.compile("12"))("12"))
    eq_(0, h.binary_regex_search(re.compile("12"))("34"))
Example #20
Source File: strings.py From ibis with Apache License 2.0 | 5 votes |
def execute_string_like_series_string(op, data, pattern, escape, **kwargs):
    new_pattern = re.compile(sql_like_to_regex(pattern, escape=escape))
    return data.map(
        lambda x, pattern=new_pattern: pattern.search(x) is not None
    )
Example #21
Source File: client.py From ibis with Apache License 2.0 | 5 votes |
def list_tables(self, like: str = None, database: str = None) -> list:
    """List all tables inside given or current database.

    Parameters
    ----------
    like : str, optional
    database : str, optional

    Returns
    -------
    list
    """
    _database = None

    if not self.db_name == database:
        _database = self.db_name
        self.set_database(database)

    tables = self.con.get_tables()

    if _database:
        self.set_database(_database)

    if like is None:
        return tables
    pattern = re.compile(like)
    return list(filter(lambda t: pattern.findall(t), tables))
Example #22
Source File: genrelease.py From ibis with Apache License 2.0 | 5 votes |
def iter_release_notes(repo, from_ref, to_ref, default_role):
    """Yield release notes from `from_ref` to `to_ref`."""
    pattern = re.compile(
        r'^(?:{})\s+#(\d+)\s+from'.format('|'.join(GITHUB_CLOSE_KEYWORDS)),
        flags=re.MULTILINE | re.IGNORECASE,
    )
    for commit in commits_between(
        repo, from_ref, to_ref, options=pygit2.GIT_SORT_TOPOLOGICAL
    ):
        message = commit.message.strip()
        subject, *lines = map(str.strip, message.splitlines())
        tag, *rest = subject.split(':', 1)
        tag = tag.lower()
        lineitem = ''.join(rest) or subject
        role = KEYWORD_MAP.get(tag, default_role)
        modifier = ' major' if role == 'bug' else ''

        try:
            issue_number, *_ = pattern.findall(message)
        except ValueError:
            issue_number = '-'

        yield "* :{role}:`{issue_number}{modifier}` {lineitem}".format(
            role=role,
            issue_number=issue_number,
            modifier=modifier,
            lineitem=lineitem.strip(),
        )
Example #23
Source File: encoder.py From GPT2 with MIT License | 5 votes |
def __init__(self, encoder, bpe_merges, errors='replace'):
    self.encoder = encoder
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}

    # Should have added re.IGNORECASE so BPE merges can happen for
    # capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #24
Source File: spelling_checker.py From kaggle-HomeDepot with MIT License | 5 votes |
def autocorrect_query(self, query):
    # correct bigram
    bigram_corrector = {}
    words_bigram = self.get_valid_bigram_words(query.split(" "))
    for word in words_bigram:
        corrected_word = self.spelling_checker.correct(word)
        if word != corrected_word:
            bigram_corrector[word] = corrected_word
    for k, v in bigram_corrector.items():
        pattern = regex.compile(r"(?<=\W|^)%s(?=\W|$)" % k)
        query = regex.sub(pattern, v, query)

    # correct unigram
    corrected_query = []
    for word in query.split(" "):
        if len(word) < self.min_len:
            corrected_query.append(word)
        elif self.exclude_stopwords and word in config.STOP_WORDS:
            corrected_query.append(word)
        elif self.skip_digit and len(re.findall(re.compile(r"\d+"), word)):
            corrected_query.append(word)
        else:
            corrected_word = self.spelling_checker.correct(word)
            if len(corrected_word) < self.min_len:
                corrected_query.append(word)
            else:
                corrected_query.append(corrected_word)
    return " ".join(corrected_query)
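The bigram replacement above relies on a lookbehind whose alternatives have different widths, (?<=\W|^), which the standard re module rejects but regex accepts. A minimal sketch of that difference, independent of the kaggle-HomeDepot code:

import re
import regex

boundary_pattern = r"(?<=\W|^)cat(?=\W|$)"
try:
    re.compile(boundary_pattern)
except re.error as exc:
    print("stdlib re:", exc)  # look-behind requires fixed-width pattern
print(regex.sub(boundary_pattern, "dog", "cat, catalog"))  # dog, catalog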
Example #25
Source File: search.py From PyDev.Debugger with Eclipse Public License 1.0 | 5 votes |
def __init__(self, regexp, flags=0, maxLength=None):
    """
    @type  regexp: str
    @param regexp: Regular expression string.

    @type  flags: int
    @param flags: Regular expression flags.

    @type  maxLength: int
    @param maxLength: Maximum expected length of the strings matched by this
        regular expression.

        This value will be used to calculate the required buffer size when
        doing buffered searches.

        Ideally it should be an exact value, but in some cases it's not
        possible to calculate so an upper limit should be given instead.

        If that's not possible either, C{None} should be used. That will
        cause an exception to be raised if this pattern is used in a
        buffered search.
    """
    self.pattern = regexp
    self.flags = flags
    self.regexp = re.compile(regexp, flags)
    self.maxLength = maxLength
Example #26
Source File: postprocessing.py From ocr-process-service with MIT License | 5 votes |
def remove_non_numeric_date(text):
    regex = re.compile(r'[^\d\/]+')
    return regex.sub('', text)
Example #27
Source File: postprocessing.py From ocr-process-service with MIT License | 5 votes |
def remove_non_numeric_currency(text):
    regex = re.compile(r'[^\d\,\.]+')
    temp = regex.sub('', text)
    return temp.replace(',', ".")
Example #28
Source File: postprocessing.py From ocr-process-service with MIT License | 5 votes |
def remove_non_numeric(text):
    non_decimal = re.compile(r'[^\d]+')
    return non_decimal.sub('', text)
Example #29
Source File: postprocessing.py From ocr-process-service with MIT License | 5 votes |
def approximate_match(word_re, lines, fuzziness='e<=1'):
    logger = logging.getLogger(__name__)
    logger.debug('Looking for %s with fuzziness: %s' % (word_re, fuzziness))
    best_partial_matches = []
    search = re.compile(
        ur'(' + word_re + '){' + fuzziness + '}',
        flags=re.BESTMATCH | re.IGNORECASE).search
    for m in ifilter(None, imap(search, lines)):
        logger.debug('%s %s' % (m.span(), m[0]))
        best_partial_matches.append(m[0])
    return best_partial_matches
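The {e<=1} suffix is the regex module's fuzzy-matching syntax: the preceding group may match with at most one error (an insertion, deletion, or substitution), and the BESTMATCH flag asks for the match with the fewest errors rather than the first acceptable one. A self-contained sketch of the same idea, not taken from the OCR project:

import regex

# Allow at most one edit when looking for "invoice" in noisy OCR output;
# BESTMATCH prefers the candidate with the fewest errors.
m = regex.search(r"(?:invoice){e<=1}", "Invoce total: 42.00",
                 flags=regex.BESTMATCH | regex.IGNORECASE)
if m:
    print(m[0], m.fuzzy_counts)  # fuzzy_counts = (substitutions, insertions, deletions)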
Example #30
Source File: simple_tokenizer.py From FusionNet with MIT License | 5 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
    """
    self._regexp = regex.compile(
        '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()