Python regex.findall() Examples
The following are 30 code examples of regex.findall(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module regex, or try the search function.
Example #1
Source File: featuretable.py From panphon with MIT License | 6 votes |
def compile_regex_from_str(self, pat):
    """Given a string describing features masks for a sequence of segments,
    return a compiled regex matching the corresponding strings.

    Args:
        ft_str (str): feature masks, each enclosed in square brackets, in
            which the features are delimited by any standard delimiter.

    Returns:
        Pattern: regular expression pattern equivalent to `ft_str`
    """
    s2n = {'-': -1, '0': 0, '+': 1}
    seg_res = []
    for mat in re.findall(r'\[[^]]+\]+', pat):
        ft_mask = {k: s2n[v] for (v, k) in re.findall(r'([+-])(\w+)', mat)}
        segs = self.all_segs_matching_fts(ft_mask)
        seg_res.append('({})'.format('|'.join(segs)))
    regexp = ''.join(seg_res)
    return re.compile(regexp)
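As an illustration (the feature names below are invented, not taken from panphon), the two findall calls behave roughly like this:

>>> import re
>>> re.findall(r'\[[^]]+\]+', '[+syl -cons][+voi +son]')
['[+syl -cons]', '[+voi +son]']
>>> re.findall(r'([+-])(\w+)', '[+syl -cons]')
[('+', 'syl'), ('-', 'cons')]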
Example #2
Source File: extractors.py From chepy with GNU General Public License v3.0 | 6 votes |
def extract_strings(self, length: int = 4):
    """Extract strings from state

    Args:
        length (int, optional): Min length of string. Defaults to 4.

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy("tests/files/hello").load_file().extract_strings().o
        [
            b'__PAGEZERO',
            b'__TEXT',
            b'__text',
            b'__TEXT',
            b'__stubs',
            b'__TEXT',
            ...
        ]
    """
    pattern = b"[^\x00-\x1F\x7F-\xFF]{" + str(length).encode() + b",}"
    self.state = re.findall(pattern, self._convert_to_bytes())
    return self
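A rough sketch of what the byte pattern matches, namely runs of at least `length` printable ASCII bytes (the input bytes here are invented):

>>> import re
>>> re.findall(b"[^\x00-\x1F\x7F-\xFF]{4,}", b"\x00hello\x01world!\xff")
[b'hello', b'world!']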
Example #3
Source File: test_teachers.py From ParlAI with MIT License | 6 votes |
def _test_display_output(self, image_mode):
    """
    Test display data output with given image_mode.
    """
    with testing_utils.tempdir() as tmpdir:
        data_path = tmpdir
        os.makedirs(os.path.join(data_path, 'ImageTeacher'))
        opt = {
            'task': 'integration_tests:ImageTeacher',
            'datapath': data_path,
            'image_mode': image_mode,
            'display_verbose': True,
        }
        output = testing_utils.display_data(opt)
        train_labels = re.findall(r"\[labels\].*\n", output[0])
        valid_labels = re.findall(r"\[eval_labels\].*\n", output[1])
        test_labels = re.findall(r"\[eval_labels\].*\n", output[2])

        for i, lbls in enumerate([train_labels, valid_labels, test_labels]):
            self.assertGreater(len(lbls), 0, 'DisplayData failed')
            self.assertEqual(len(lbls), len(set(lbls)), output[i])
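The findall calls simply grab each label line, including its trailing newline, from the rendered output; for example, on an invented display string:

>>> import re
>>> re.findall(r"\[labels\].*\n", "[text]: hi there\n[labels]: hello\n")
['[labels]: hello\n']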
Example #4
Source File: parse.py From olapy with GNU Lesser General Public License v2.1 | 6 votes |
def get_nested_select(self):
    """Get tuples groups in query like ::

        Select {
            ([Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 19,2010],
             [Geography].[Geography].[Continent].[Europe],
             [Measures].[Amount]),

            ([Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 17,2010],
             [Geography].[Geography].[Continent].[Europe],
             [Measures].[Amount])
        }

    out :

        ['[Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 19,2010],\
          [Geography].[Geography].[Continent].[Europe],[Measures].[Amount]',
         '[Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 17,2010],\
          [Geography].[Geography].[Continent].[Europe],[Measures].[Amount]']

    :return: All groups as list of strings.
    """
    return regex.findall(r"\(([^()]+)\)", self.mdx_query)
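A minimal sketch of the same extraction on a shortened, made-up MDX query:

>>> import regex
>>> mdx = "SELECT { ([Time].[2010], [Measures].[Amount]), ([Time].[2011], [Measures].[Amount]) }"
>>> regex.findall(r"\(([^()]+)\)", mdx)
['[Time].[2010], [Measures].[Amount]', '[Time].[2011], [Measures].[Amount]']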
Example #5
Source File: scorer.py From nmt-chatbot with GNU General Public License v3.0 | 6 votes |
def ascii_emoticons(index, question, answer):
    global valid_emoticon

    valid_emoticon = False

    # Disabled
    if score_settings['ascii_emoticon_modifier_value'] is None:
        return 0

    # Split by words (tokens)
    tokens = answer.split()

    # Calculate emoticon score
    score = [1 if len(token) > 1 and
             len(re.findall('[^a-zA-Z0-9]', token)) / len(token) >
             score_settings['ascii_emoticon_non_char_to_all_chars_ratio']
             else 0 for token in tokens]
    score = sum([1 if (index > 0 and score[index - 1] == 0 and value == 1) or (index == 0 and value == 1)
                 else 0 for index, value in enumerate(score)]) * score_settings['ascii_emoticon_modifier_value']

    if score:
        valid_emoticon = True

    return score

# Check if sentence includes 'unk' token
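The inner findall measures the share of non-alphanumeric characters per token, which is what marks a token as an ASCII emoticon; for instance:

>>> import re
>>> len(re.findall('[^a-zA-Z0-9]', ':-)')) / len(':-)')
1.0
>>> len(re.findall('[^a-zA-Z0-9]', "isn't")) / len("isn't")
0.2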
Example #6
Source File: utils.py From open-syllabus-project with Apache License 2.0 | 6 votes |
def tokenize_field(value):
    """
    Extract normalized tokens from a field.

    Args:
        value (str): The field value.

    Returns:
        list: The cleaned tokens.
    """
    # Extract tokens.
    tokens = regex.findall('\p{L}{2,}', value.lower())

    # Remove articles.
    tokens = [t for t in tokens if t not in [
        'a', 'an', 'the', 'and',
    ]]

    return tokens
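`\p{L}` is a Unicode letter class supported by the third-party regex module (the standard library re does not support it), so the call keeps runs of two or more letters in any script. A small illustration on an invented input:

>>> import regex
>>> regex.findall(r'\p{L}{2,}', 'an étude in c minor')
['an', 'étude', 'in', 'minor']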
Example #7
Source File: tokenization.py From language with Apache License 2.0 | 6 votes |
def tokenize(self, text):
    bpe_tokens = []
    list_starts, str_starts = [], []
    basic_tokens = text if isinstance(text, list) else [text]
    for i, basic_token in enumerate(basic_tokens):
        num_subtokens = 0
        basic_token = basic_token if (i == 0 or not isinstance(text, list)) else (
            ' ' + basic_token)
        for token in re.findall(self.bpe.pat, basic_token):
            token = ''.join(self.bpe.byte_encoder[b] for b in token.encode('utf-8'))
            sub_tokens = [bpe_token for bpe_token in self.bpe.bpe(token).split(' ')]
            bpe_tokens.extend(sub_tokens)
            str_starts += [True] + [False] * (len(sub_tokens) - 1)
            num_subtokens += len(sub_tokens)
        list_starts += [True] + [False] * (num_subtokens - 1)
    word_starts = list_starts if isinstance(text, list) else str_starts
    assert len(bpe_tokens) == len(word_starts)
    return bpe_tokens, word_starts
Example #8
Source File: language.py From chepy with GNU General Public License v3.0 | 6 votes |
def unicode_chrs_by_lang(self, lang: str):
    """Detect language specific characters

    Detect characters from various Unicode code point ids. Example of
    languages are Common, Arabic, Armenian, Bengali, Bopomofo, Braille,
    Buhid, Canadian_Aboriginal, Cherokee, Cyrillic, Devanagari, Ethiopic,
    Georgian, Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew,
    Hiragana, Inherited, Kannada, Katakana, Khmer, Lao, Latin, Limbu,
    Malayalam, Mongolian, Myanmar, Ogham, Oriya, Runic, Sinhala, Syriac,
    Tagalog, Tagbanwa, TaiLe, Tamil, Telugu, Thaana, Thai, Tibetan, Yi,
    but other code points should work also.

    Args:
        lang (str): Required. A string value identifying the language.

    Returns:
        Chepy: The Chepy object.
    """
    self.state = re.findall(r"\p{" + lang + "}", self._convert_to_str())
    return self
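Note that `\p{...}` script properties are a feature of the third-party regex module, not the standard library re, so the `re` name in this snippet has to be bound to regex for the call to work. A minimal illustration using regex directly:

>>> import regex
>>> regex.findall(r"\p{Greek}", "abc αβγ 123")
['α', 'β', 'γ']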
Example #9
Source File: utils.py From chepy with GNU General Public License v3.0 | 6 votes |
def count_occurances(self, regex: str, case_sensitive: bool = False):
    """Counts occurances of the regex.

    Counts the number of times the provided string occurs.

    Args:
        regex (str): Required. Regex string to search for
        case_sensitive (bool, optional): If search should be case insensitive, by default False

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy("AABCDADJAKDJHKSDAJSDdaskjdhaskdjhasdkja").count_occurances("ja").output
        2
    """
    if case_sensitive:
        r = re.compile(regex)
    else:
        r = re.compile(regex, re.IGNORECASE)
    self.state = len(r.findall(self._convert_to_str()))
    return self
Example #10
Source File: search.py From chepy with GNU General Public License v3.0 | 6 votes |
def search_ctf_flags(self, prefix: str, postfix: str = ".+?\{*\}"):
    """Search CTF style flags.

    This by default assumes that the flag format is similar to
    something like picoCTF{some_flag} as an example.

    Args:
        prefix (str): Prefix of the flag. Like `picoCTF`
        postfix (str, optional): Regex for the remainder of the flag.
            Defaults to '.+\{.+}'.

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy("tests/files/flags").read_file().search_ctf_flags("pico").get_by_index(0)
        picoCTF{r3source_pag3_f1ag}
    """
    self.state = re.findall(prefix + postfix, self._convert_to_str(), re.IGNORECASE)
    return self
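With the default postfix, the call reduces to a case-insensitive findall along these lines (the flag string here is invented):

>>> import re
>>> re.findall(r"pico.+?\{*\}", "see picoCTF{example_flag} here", re.IGNORECASE)
['picoCTF{example_flag}']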
Example #11
Source File: pygrok.py From pygrok with MIT License | 6 votes |
def _load_search_pattern(self):
    self.type_mapper = {}
    py_regex_pattern = self.pattern
    while True:
        # Finding all types specified in the groks
        m = re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern)
        for n in m:
            self.type_mapper[n[1]] = n[2]

        # replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type}
        # with regex and regex group name
        py_regex_pattern = re.sub(r'%{(\w+):(\w+)(?::\w+)?}',
                                  lambda m: "(?P<" + m.group(2) + ">" + self.predefined_patterns[m.group(1)].regex_str + ")",
                                  py_regex_pattern)

        # replace %{pattern_name} with regex
        py_regex_pattern = re.sub(r'%{(\w+)}',
                                  lambda m: "(" + self.predefined_patterns[m.group(1)].regex_str + ")",
                                  py_regex_pattern)

        if re.search('%{\w+(:\w+)?}', py_regex_pattern) is None:
            break

    self.regex_obj = re.compile(py_regex_pattern)
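The first findall only picks up grok fields that carry an explicit type annotation; a small sketch with made-up patterns:

>>> import re
>>> re.findall(r'%{(\w+):(\w+):(\w+)}', '%{NUMBER:bytes:int} %{WORD:verb}')
[('NUMBER', 'bytes', 'int')]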
Example #12
Source File: encoder.py From Few-Shot-NLG with MIT License | 6 votes |
def encode(self, text):
    bpe_tokens = []
    bpe_token_original = []
    for token in re.findall(self.pat, text):
        # print (token)
        token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))

        ### handle oov
        for bpe_token in self.bpe(token).split(' '):
            if bpe_token in self.encoder:
                bpe_tokens.append(self.encoder[bpe_token])
            else:
                bpe_tokens.append(self.encoder["empty"])

        # bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        bpe_token_original.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
    return bpe_tokens, bpe_token_original
Example #13
Source File: tokenization_roberta.py From HPSG-Neural-Parser with MIT License | 5 votes |
def _tokenize(self, text):
    """ Tokenize a string. """
    bpe_tokens = []
    for token in re.findall(self.pat, text):
        if sys.version_info[0] == 2:
            token = ''.join(self.byte_encoder[ord(b)] for b in token)
        else:
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
        bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
    return bpe_tokens
Example #14
Source File: tokenization_gpt2.py From exbert with Apache License 2.0 | 5 votes |
def _tokenize(self, text):
    """ Tokenize a string. """
    bpe_tokens = []
    for token in re.findall(self.pat, text):
        token = "".join(
            self.byte_encoder[b] for b in token.encode("utf-8")
        )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
        bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
    return bpe_tokens
Example #15
Source File: புணர்ச்சி.py From pytamil with MIT License | 5 votes |
def __init__(self, txt):
    val = regex.findall(r'(.*)\+(.*)\=(.*)', txt)
    self.நிலைமொழி = val[0][0].strip()
    self.வருமொழி = val[0][1].strip()
    self.தொடர்மொழி = val[0][2].strip()

    self.நிலைமொழி_regex = _convert_to_regex(self.நிலைமொழி)
    self.வருமொழி_regex = _convert_to_regex(self.வருமொழி)
    self.வாக்கியம் = txt
Example #16
Source File: புணர்ச்சி.py From pytamil with MIT License | 5 votes |
def _get_regex_chars(charslist):
    p = ''
    for c in charslist:
        p = p + c.strip() + '|'
    p = p[:-1]  # remove trailing '|' symbol
    return p

# def matchவிதிகள்(pattern, பதம்):
#     # tokenize
#     tokens = re.findall(r'\((.*?)\)', pattern)
#     # print(tokens)
#     regexpat = ""
#     # expand macros and convert to regex patterns
#     for token in tokens:
#         if token in எழுத்து.எழுத்துக்கள்.keys():
#             expanded = எழுத்து.எழுத்துக்கள்[token]  # macro expansion eg. expand "உயிர்" to "[அ, ஆ, இ, ஈ, உ, ஊ, எ, ஏ, ஐ, ஒ, ஓ, ஔ"
#             chars = _get_chars(expanded)  # convert "அ, இ, உ, எ, ஒ" to "அ|இ|உ|எ|ஒ"
#             regexpat = regexpat + "[" + chars + "]"
#         elif token == "...":
#             regexpat = regexpat + ".*"
#         else:
#             chars = _get_chars(token.split(","))
#             regexpat = regexpat + "[" + chars + "]"  # convert "அ, இ, உ, எ, ஒ" to "அ|இ|உ|எ|ஒ"
#     regexpat = regexpat + '$'
#     # print(regexpat)
#     # match regex
#     matchval = re.match(regexpat, எழுத்து.உயிர்மெய்விரி(பதம்))
#     return matchval
Example #17
Source File: புணர்ச்சி.py From pytamil with MIT License | 5 votes |
def _convert_to_regex(pattern):
    # tokenize
    tokens = regex.findall(r'\((.*?)\)', pattern)
    # print(tokens)
    regexpat = ""
    # expand macros and convert to regex patterns
    for token in tokens:
        if token in எழுத்து._எழுத்துக்கள்.keys():
            expanded = எழுத்து._எழுத்துக்கள்[token]  # macro expansion eg. expand "உயிர்" to "[அ, ஆ, இ, ஈ, உ, ஊ, எ, ஏ, ஐ, ஒ, ஓ, ஔ"
            chars = _get_regex_chars(expanded)  # convert "அ, இ, உ, எ, ஒ" to "அ|இ|உ|எ|ஒ"
            regexpat = regexpat + "(" + chars + ")"
        elif token == "...":
            regexpat = regexpat + ".*"
        elif token == 'தனிக்குறில்':
            regexpat = regexpat + "..(அ|இ|உ|எ|ஒ)"
        else:
            chars = _get_regex_chars(token.split(","))
            regexpat = regexpat + "(" + chars + ")"  # convert "அ, இ, உ, எ, ஒ" to "அ|இ|உ|எ|ஒ"
    # regexpat = regexpat + '$'
    # print(regexpat)
    return regexpat

## convert "அ, இ, உ, எ, ஒ" to "அ|இ|உ|எ|ஒ"
Example #18
Source File: extractors.py From chepy with GNU General Public License v3.0 | 5 votes |
def extract_rsa_private(self):
    """Extract RSA private key

    Returns:
        Chepy: The Chepy object.
    """
    self.state = re.findall(
        r"-----BEGIN RSA PRIVATE KEY-----", self._convert_to_str()
    )
    return self
Example #19
Source File: tokenization_ctrl.py From exbert with Apache License 2.0 | 5 votes |
def _tokenize(self, text):
    """ Tokenize a string. """
    split_tokens = []

    words = re.findall(r"\S+\n?", text)

    for token in words:
        split_tokens.extend([t for t in self.bpe(token).split(" ")])
    return split_tokens
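The pattern `\S+\n?` splits on whitespace but keeps a trailing newline attached to the word before it, e.g.:

>>> import re
>>> re.findall(r"\S+\n?", "Hello world\nsecond line")
['Hello', 'world\n', 'second', 'line']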
Example #20
Source File: tokenization_gpt2.py From bert_on_stilts with Apache License 2.0 | 5 votes |
def encode(self, text):
    bpe_tokens = []
    for token in re.findall(self.pat, text):
        token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
        bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
    if len(bpe_tokens) > self.max_len:
        raise ValueError(
            "Token indices sequence length is longer than the specified maximum "
            " sequence length for this OpenAI GPT-2 model ({} > {}). Running this"
            " sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len)
        )
    return bpe_tokens
Example #21
Source File: feature_engineering.py From coling2018_fake-news-challenge with Apache License 2.0 | 5 votes |
def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()
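A quick example of the effect: punctuation is dropped and contractions get split (the input string is invented):

>>> import re
>>> " ".join(re.findall(r'\w+', "Hello, World! It's 2018.", flags=re.UNICODE)).lower()
'hello world it s 2018'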
Example #22
Source File: regexps_field_detection.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 5 votes |
def get_numbers_from_str(self) -> Tuple[List[int], List[int]]:
    str_parts = self.selected_columns_str.split(':')
    if len(str_parts) != 2:
        raise RuntimeError('Selected columns field should contain at least two parts ' +
                           'separated by ":" ("A: B" or "2, 1: 1" or "B, C: D, E" ...)')
    col_indices = ([], [],)  # type: Tuple[List[int], List[int]]

    for i in range(2):
        for col_str in re.findall(r'\d+', str_parts[i]):
            col_index = int(col_str) - 1
            col_indices[i].append(col_index)
        if not col_indices[i]:
            search_str = str_parts[i].lower()
            start_order = ord('a')
            for col_str in re.findall(r'[a-z]{1,1}', search_str):
                col_index = ord(col_str) - start_order
                col_indices[i].append(col_index)

    errors = []
    if not col_indices[0]:
        errors.append('Selected columns field: left part of ":" (value columns) should ' +
                      'contain at least one number or letter')
    if len(col_indices[0]) > 2:
        errors.append('Selected columns field: left part of ":" (value columns) should ' +
                      'contain one or two values')
    if not col_indices[1]:
        errors.append('Selected columns field: right part of ":" (search columns) should ' +
                      'contain at least one number or letter')
    if errors:
        raise RuntimeError('\n'.join(errors))

    return col_indices
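The two findall calls accept either numeric or single-letter column references; roughly:

>>> import re
>>> re.findall(r'\d+', '2, 1')
['2', '1']
>>> re.findall(r'[a-z]{1,1}', 'b, c')
['b', 'c']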
Example #23
Source File: search.py From chepy with GNU General Public License v3.0 | 5 votes |
def search_twilio_key(self):
    """Search for Twilio api key

    Returns:
        Chepy: The Chepy object.
    """
    self.state = re.findall("SK[a-z0-9]{32}", self._convert_to_str())
    return self
Example #24
Source File: search.py From chepy with GNU General Public License v3.0 | 5 votes |
def search_private_key(self):
    """Search various private key headers

    Returns:
        Chepy: The Chepy object.
    """
    self.state = re.findall(
        "-----BEGIN (RSA|OPENSSH|DSA|EC) PRIVATE KEY-----", self._convert_to_str()
    )
    return self
Example #25
Source File: search.py From chepy with GNU General Public License v3.0 | 5 votes |
def search_slack_webhook(self):
    """Search slack webhook

    Returns:
        Chepy: The Chepy object.
    """
    self.state = re.findall(
        "https://hooks\.slack\.com/services/T[a-zA-Z0-9_]{8}/B[a-zA-Z0-9_]{8}/[a-zA-Z0-9_]{24}",
        self._convert_to_str(),
    )
    return self
Example #26
Source File: search.py From chepy with GNU General Public License v3.0 | 5 votes |
def search_slack_tokens(self):
    """Search slack tokens

    Returns:
        Chepy: The Chepy object.

    Examples:
        >>> Chepy("tests/files/flags").read_file().search_slack_tokens().get_by_index(0)
        xoxp...859
    """
    self.state = re.findall(
        "(xox[p|b|o|a]-[0-9]{12}-[0-9]{12}-[0-9]{12}-[a-z0-9]{32})",
        self._convert_to_str(),
    )
    return self
Example #27
Source File: extractors.py From chepy with GNU General Public License v3.0 | 5 votes |
def extract_base64(self, min: int = 20):
    """Extract base64 encoded strings

    Args:
        min (int, optional): Minimum length to match. Defaults to 20.

    Returns:
        Chepy: The Chepy object.
    """
    # use the `min` argument as the minimum run length (the original hard-coded 20 here)
    found = re.findall("[a-zA-Z0-9+/=]{%s,}" % str(min), self._convert_to_str())
    if len(found) > 1:  # pragma: no cover
        self.state = found
    else:
        self.state = found[0]
    return self
Example #28
Source File: extractors.py From chepy with GNU General Public License v3.0 | 5 votes |
def extract_jwt_token(self):
    """Extract JWT token

    Returns:
        Chepy: The Chepy object.
    """
    self.state = re.findall(
        r"ey[A-Za-z0-9_-]*\.[A-Za-z0-9._-]*|ey[A-Za-z0-9_\/+-]*\.[A-Za-z0-9._\/+-]*",
        self._convert_to_str(),
    )
    return self
Example #29
Source File: extractors.py From chepy with GNU General Public License v3.0 | 5 votes |
def extract_dsa_private(self):
    """Extract DSA private key

    Returns:
        Chepy: The Chepy object.
    """
    self.state = re.findall(
        r"-----BEGIN DSA PRIVATE KEY-----", self._convert_to_str()
    )
    return self
Example #30
Source File: extractors.py From chepy with GNU General Public License v3.0 | 5 votes |
def extract_paypal_bt(self):
    """Extract Paypal braintree access token

    Returns:
        Chepy: The Chepy object.
    """
    self.state = re.findall(
        r"access_token\$production\$[0-9a-z]{16}\$[0-9a-f]{32}",
        self._convert_to_str(),
    )
    return self