Python regex.findall() Examples

The following are 30 code examples of regex.findall(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the regex module.
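
Before the project examples, here is a minimal sketch of what findall() returns. It assumes the third-party regex module (installed with pip install regex), which is a drop-in superset of the standard re module; the first two calls behave the same with re, while the \p{...} class in the last call is regex-only.

import regex

text = "cat 12, dog 7, bird 42"

# Without capture groups, findall() returns every non-overlapping match as a string.
print(regex.findall(r"\d+", text))          # ['12', '7', '42']

# With capture groups, it returns one tuple of groups per match.
print(regex.findall(r"(\w+) (\d+)", text))  # [('cat', '12'), ('dog', '7'), ('bird', '42')]

# Unlike re, regex supports Unicode property classes such as \p{L} (any letter).
print(regex.findall(r"\p{L}+", text))       # ['cat', 'dog', 'bird']
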
Example #1
Source File: featuretable.py    From panphon with MIT License
def compile_regex_from_str(self, pat):
        """Given a string describing features masks for a sequence of segments,
        return a compiled regex matching the corresponding strings.

        Args:
            pat (str): feature masks, each enclosed in square brackets, in
            which the features are delimited by any standard delimiter.

        Returns:
           Pattern: regular expression pattern equivalent to `pat`
        """
        s2n = {'-': -1, '0': 0, '+': 1}
        seg_res = []
        for mat in re.findall(r'\[[^]]+\]+', pat):
            ft_mask = {k: s2n[v] for (v, k) in re.findall(r'([+-])(\w+)', mat)}
            segs = self.all_segs_matching_fts(ft_mask)
            seg_res.append('({})'.format('|'.join(segs)))
        regexp = ''.join(seg_res)
        return re.compile(regexp) 
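
To make the two findall() passes above concrete, here is a small sketch on a made-up feature-mask string; the segment lookup (all_segs_matching_fts) is left out because it depends on the loaded feature table.

import re

pat = "[+syl -cons][-syl +nas]"          # hypothetical input

# First pass: pull out each bracketed feature mask.
masks = re.findall(r'\[[^]]+\]+', pat)
print(masks)                              # ['[+syl -cons]', '[-syl +nas]']

# Second pass: turn each mask into a {feature: value} dict.
s2n = {'-': -1, '0': 0, '+': 1}
for mat in masks:
    print({k: s2n[v] for (v, k) in re.findall(r'([+-])(\w+)', mat)})
    # {'syl': 1, 'cons': -1} then {'syl': -1, 'nas': 1}
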
Example #2
Source File: extractors.py    From chepy with GNU General Public License v3.0
def extract_strings(self, length: int = 4):
        """Extract strings from state
        
        Args:
            length (int, optional): Min length of string. Defaults to 4.
        
        Returns:
            Chepy: The Chepy object. 

        Examples:
            >>> Chepy("tests/files/hello").load_file().extract_strings().o
            [
                b'__PAGEZERO',
                b'__TEXT',
                b'__text',
                b'__TEXT',
                b'__stubs',
                b'__TEXT',
                ...
            ]
        """
        pattern = b"[^\x00-\x1F\x7F-\xFF]{" + str(length).encode() + b",}"
        self.state = re.findall(pattern, self._convert_to_bytes())
        return self 
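
The byte pattern above keeps runs of printable ASCII that are at least length characters long, much like the Unix strings utility. A quick sketch on made-up bytes:

import re

data = b"\x00\x01__TEXT\x00\x7f__stubs\x02ab\x03"   # hypothetical binary data
length = 4
pattern = b"[^\x00-\x1F\x7F-\xFF]{" + str(length).encode() + b",}"
print(re.findall(pattern, data))                     # [b'__TEXT', b'__stubs']
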
Example #3
Source File: test_teachers.py    From ParlAI with MIT License
def _test_display_output(self, image_mode):
        """
        Test display data output with given image_mode.
        """
        with testing_utils.tempdir() as tmpdir:
            data_path = tmpdir
            os.makedirs(os.path.join(data_path, 'ImageTeacher'))

            opt = {
                'task': 'integration_tests:ImageTeacher',
                'datapath': data_path,
                'image_mode': image_mode,
                'display_verbose': True,
            }
            output = testing_utils.display_data(opt)
            train_labels = re.findall(r"\[labels\].*\n", output[0])
            valid_labels = re.findall(r"\[eval_labels\].*\n", output[1])
            test_labels = re.findall(r"\[eval_labels\].*\n", output[2])

            for i, lbls in enumerate([train_labels, valid_labels, test_labels]):
                self.assertGreater(len(lbls), 0, 'DisplayData failed')
                self.assertEqual(len(lbls), len(set(lbls)), output[i]) 
Example #4
Source File: parse.py    From olapy with GNU Lesser General Public License v2.1
def get_nested_select(self):
        """Get tuples groups in query like ::

                Select {
                    ([Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 19,2010],
                    [Geography].[Geography].[Continent].[Europe],
                    [Measures].[Amount]),

                    ([Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 17,2010],
                    [Geography].[Geography].[Continent].[Europe],
                    [Measures].[Amount])
                    }

                out :
                    ['[Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 19,2010],\
                    [Geography].[Geography].[Continent].[Europe],[Measures].[Amount]',

                    '[Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 17,2010],\
                    [Geography].[Geography].[Continent].[Europe],[Measures].[Amount]']

        :return: All groups as list of strings.
        """
        return regex.findall(r"\(([^()]+)\)", self.mdx_query) 
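
A short illustration of that parenthesis pattern on a cut-down, hypothetical MDX query; [^()]+ keeps each match inside a single pair of parentheses, so every tuple group comes back as one string:

import regex

mdx_query = """SELECT {
    ([Time].[2010], [Measures].[Amount]),
    ([Time].[2011], [Measures].[Amount])
}"""
print(regex.findall(r"\(([^()]+)\)", mdx_query))
# ['[Time].[2010], [Measures].[Amount]', '[Time].[2011], [Measures].[Amount]']
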
Example #5
Source File: scorer.py    From nmt-chatbot with GNU General Public License v3.0
def ascii_emoticons(index, question, answer):
    global valid_emoticon

    valid_emoticon = False

    # Disabled
    if score_settings['ascii_emoticon_modifier_value'] is None:
        return 0

    # Split by words (tokens)
    tokens = answer.split()

    # Calculate emoticon score
    score = [1 if len(token) > 1 and len(re.findall('[^a-zA-Z0-9]', token)) / len(token) > score_settings['ascii_emoticon_non_char_to_all_chars_ratio'] else 0 for token in tokens]
    score = sum([1 if (index > 0 and score[index - 1] == 0 and value == 1) or (index == 0 and value == 1) else 0 for index, value in enumerate(score)]) * score_settings['ascii_emoticon_modifier_value']

    if score:
        valid_emoticon = True

    return score

# Check if sentence includes 'unk' token 
Example #6
Source File: utils.py    From open-syllabus-project with Apache License 2.0
def tokenize_field(value):

    """
    Extract normalized tokens from a field.

    Args:
        value (str): The field value.

    Returns:
        list: The cleaned tokens.
    """

    # Extract tokens.
    tokens = regex.findall(r'\p{L}{2,}', value.lower())

    # Remove articles.
    tokens = [t for t in tokens if t not in [
        'a', 'an', 'the', 'and',
    ]]

    return tokens 
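
The \p{L}{2,} pattern matches runs of two or more Unicode letters and only works with the third-party regex module (the standard re module has no \p{...} support). A small sketch:

import regex

value = "The Théorie of 3 Words"
tokens = regex.findall(r'\p{L}{2,}', value.lower())
print(tokens)   # ['the', 'théorie', 'of', 'words']
# Dropping the articles listed above leaves ['théorie', 'of', 'words'].
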
Example #7
Source File: tokenization.py    From language with Apache License 2.0
def tokenize(self, text):
    bpe_tokens = []
    list_starts, str_starts = [], []
    basic_tokens = text if isinstance(text, list) else [text]
    for i, basic_token in enumerate(basic_tokens):
      num_subtokens = 0
      basic_token = basic_token if (i == 0 or not isinstance(text, list)) else (
          ' ' + basic_token)
      for token in re.findall(self.bpe.pat, basic_token):
        token = ''.join(self.bpe.byte_encoder[b] for b in token.encode('utf-8'))
        sub_tokens = [bpe_token for bpe_token in self.bpe.bpe(token).split(' ')]
        bpe_tokens.extend(sub_tokens)
        str_starts += [True] + [False] * (len(sub_tokens) - 1)
        num_subtokens += len(sub_tokens)
      list_starts += [True] + [False] * (num_subtokens - 1)
    word_starts = list_starts if isinstance(text, list) else str_starts
    assert len(bpe_tokens) == len(word_starts)
    return bpe_tokens, word_starts 
Example #8
Source File: language.py    From chepy with GNU General Public License v3.0
def unicode_chrs_by_lang(self, lang: str):
        """Detect language specific characters
        
        Detect characters from various Unicode code point ids. Examples 
        of languages are Common, Arabic, Armenian, Bengali, Bopomofo, Braille, 
        Buhid, Canadian_Aboriginal, Cherokee, Cyrillic, Devanagari, Ethiopic, 
        Georgian, Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew, 
        Hiragana, Inherited, Kannada, Katakana, Khmer, Lao, Latin, Limbu, 
        Malayalam, Mongolian, Myanmar, Ogham, Oriya, Runic, Sinhala, Syriac, 
        Tagalog, Tagbanwa, TaiLe, Tamil, Telugu, Thaana, Thai, Tibetan, Yi, 
        but other code points should work also.
        
        Args:
            lang (str): Required. A string value identifying the language. 
        
        Returns:
            Chepy: The Chepy object.
        """
        self.state = re.findall(r"\p{" + lang + "}", self._convert_to_str())
        return self 
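
For the \p{...} script classes above to work, re must resolve to the third-party regex module (for example via import regex as re), since the standard library does not support them. An illustrative sketch on a made-up mixed-script string:

import regex

text = "abc αβγ где"
print(regex.findall(r"\p{Greek}", text))      # ['α', 'β', 'γ']
print(regex.findall(r"\p{Cyrillic}", text))   # ['г', 'д', 'е']
print(regex.findall(r"\p{Latin}", text))      # ['a', 'b', 'c']
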
Example #9
Source File: utils.py    From chepy with GNU General Public License v3.0
def count_occurances(self, regex: str, case_sensitive: bool = False):
        """Counts occurances of the regex.

        Counts the number of times the provided string occurs.

        Args:
            regex (str): Required. Regex string to search for
            case_sensitive (bool, optional): Whether the search should be case sensitive. Defaults to False.

        Returns:
            Chepy: The Chepy object.

        Examples:
            >>> Chepy("AABCDADJAKDJHKSDAJSDdaskjdhaskdjhasdkja").count_occurances("ja").output
            2
        """
        if case_sensitive:
            r = re.compile(regex)
        else:
            r = re.compile(regex, re.IGNORECASE)
        self.state = len(r.findall(self._convert_to_str()))
        return self 
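
Outside the Chepy wrapper, the counting logic boils down to compiling the pattern (with re.IGNORECASE unless case_sensitive is set) and taking the length of findall():

import re

data = "AABCDADJAKDJHKSDAJSDdaskjdhaskdjhasdkja"
r = re.compile("ja", re.IGNORECASE)   # case_sensitive=False branch
print(len(r.findall(data)))           # 2 -- matches 'JA' and the trailing 'ja'
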
Example #10
Source File: search.py    From chepy with GNU General Public License v3.0
def search_ctf_flags(self, prefix: str, postfix: str = ".+?\{*\}"):
        """Search CTF style flags. 

        This by default assumes that the flag format is 
        something like picoCTF{some_flag}. 
        
        Args:
            prefix (str): Prefix of the flag. Like `picoCTF`
            postfix (str, optional): Regex for the remainder of the flag. 
                Defaults to `.+?\{*\}`.
        
        Returns:
            Chepy: The Chepy object. 

        Examples:
            >>> Chepy("tests/files/flags").read_file().search_ctf_flags("pico").get_by_index(0)
            picoCTF{r3source_pag3_f1ag}
        """
        self.state = re.findall(prefix + postfix, self._convert_to_str(), re.IGNORECASE)
        return self 
Example #11
Source File: pygrok.py    From pygrok with MIT License
def _load_search_pattern(self):
        self.type_mapper = {}
        py_regex_pattern = self.pattern
        while True:
            # Finding all types specified in the groks
            m = re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern)
            for n in m:
                self.type_mapper[n[1]] = n[2]
            #replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type})
            # with regex and regex group name

            py_regex_pattern = re.sub(r'%{(\w+):(\w+)(?::\w+)?}',
                lambda m: "(?P<" + m.group(2) + ">" + self.predefined_patterns[m.group(1)].regex_str + ")",
                py_regex_pattern)

            #replace %{pattern_name} with regex
            py_regex_pattern = re.sub(r'%{(\w+)}',
                lambda m: "(" + self.predefined_patterns[m.group(1)].regex_str + ")",
                py_regex_pattern)

            if re.search(r'%{\w+(:\w+)?}', py_regex_pattern) is None:
                break

        self.regex_obj = re.compile(py_regex_pattern) 
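
The three-group findall() at the top of that loop only picks up grok placeholders that carry a type. A hypothetical pattern string makes the difference visible:

import re

py_regex_pattern = "%{NUMBER:bytes:int} %{WORD:method} %{IP:client}"   # made-up grok pattern
print(re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern))
# [('NUMBER', 'bytes', 'int')] -- only the typed placeholder has all three parts
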
Example #12
Source File: encoder.py    From Few-Shot-NLG with MIT License
def encode(self, text):
        bpe_tokens = []
        bpe_token_original = []
        for token in re.findall(self.pat, text):
            # print (token)
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            ### handle oov
            for bpe_token in self.bpe(token).split(' '):
                if bpe_token in self.encoder:
                    bpe_tokens.append(self.encoder[bpe_token])
                else:
                    bpe_tokens.append(self.encoder["empty"])


            # bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
            bpe_token_original.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens, bpe_token_original 
Example #13
Source File: tokenization_roberta.py    From HPSG-Neural-Parser with MIT License
def _tokenize(self, text):
        """ Tokenize a string. """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            if sys.version_info[0] == 2:
                token = ''.join(self.byte_encoder[ord(b)] for b in token)
            else:
                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens 
Example #14
Source File: tokenization_gpt2.py    From exbert with Apache License 2.0
def _tokenize(self, text):
        """ Tokenize a string. """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens 
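
In these GPT-2 style tokenizers, re is typically the third-party regex module imported under the name re, and self.pat is roughly the pre-tokenization pattern below; treat the exact pattern as an assumption for illustration rather than the library's constant:

import regex as re

pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
print(re.findall(pat, "Hello world, it's 2024!"))
# ['Hello', ' world', ',', ' it', "'s", ' 2024', '!']
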
Example #15
Source File: புணர்ச்சி.py    From pytamil with MIT License
def __init__(self, txt):
        val = regex.findall(r'(.*)\+(.*)\=(.*)',txt)
        self.நிலைமொழி = val[0][0].strip()
        self.வருமொழி = val[0][1].strip()
        self.தொடர்மொழி = val[0][2].strip()
        self.நிலைமொழி_regex = _convert_to_regex(self.நிலைமொழி)
        self.வருமொழி_regex = _convert_to_regex(self.வருமொழி)
        self.வாக்கியம் = txt 
Example #16
Source File: புணர்ச்சி.py    From pytamil with MIT License
def _get_regex_chars(charslist):
    p=''
    for c in charslist:
        p=p + c.strip() +'|'

    p = p[:-1] # remove trailing '|' symbol
    return p


# def matchவிதிகள்(pattern, பதம்):
#     # tokenize
#     tokens = re.findall(r'\((.*?)\)', pattern)
#     # print(tokens)
#     regexpat = ""

#     # expand macros and convert to regex patterns
#     for token in tokens:
#         if token in எழுத்து.எழுத்துக்கள்.keys():
#             expanded = எழுத்து.எழுத்துக்கள்[token] # macro expansion e.g. expand "உயிர்" to "அ, ஆ, இ, ஈ, உ, ஊ, எ, ஏ, ஐ, ஒ, ஓ, ஔ"
#             chars = _get_chars(expanded)  # convert "அ, இ, உ, எ, ஒ" to "அ|இ|உ|எ|ஒ"
#             regexpat = regexpat + "[" + chars + "]"
#         elif token == "...":
#             regexpat = regexpat + ".*"
#         else :
#             chars = _get_chars(token.split(","))
#             regexpat = regexpat + "[" + chars + "]" # convert "அ, இ, உ, எ, ஒ" t0 "அ|இ|உ|எ|ஒ"

#     regexpat = regexpat + '$'
#     # print(regexpat)

#     # match regex
#     matchval = re.match(regexpat,எழுத்து.உயிர்மெய்விரி(பதம்))

#     return matchval 
Example #17
Source File: புணர்ச்சி.py    From pytamil with MIT License
def _convert_to_regex(pattern):
    # tokenize
    tokens = regex.findall(r'\((.*?)\)', pattern)
    # print(tokens)
    regexpat = ""

    # expand macros and convert to regex patterns
    for token in tokens:
        if token in எழுத்து._எழுத்துக்கள்.keys():
            expanded = எழுத்து._எழுத்துக்கள்[token] # macro expansion e.g. expand "உயிர்" to "அ, ஆ, இ, ஈ, உ, ஊ, எ, ஏ, ஐ, ஒ, ஓ, ஔ"
            chars = _get_regex_chars(expanded)  # convert "அ, இ, உ, எ, ஒ" to "அ|இ|உ|எ|ஒ"
            regexpat = regexpat + "(" + chars + ")"
        elif token == "...":
            regexpat = regexpat + ".*"
        elif token == 'தனிக்குறில்':
            regexpat = regexpat + "..(அ|இ|உ|எ|ஒ)"
        else :
            chars = _get_regex_chars(token.split(","))
            regexpat = regexpat + "(" + chars + ")" # convert "அ, இ, உ, எ, ஒ" t0 "அ|இ|உ|எ|ஒ"

    # regexpat = regexpat + '$'
    # print(regexpat)

    return regexpat

    ## convert "அ, இ, உ, எ, ஒ" to "அ|இ|உ|எ|ஒ" 
Example #18
Source File: extractors.py    From chepy with GNU General Public License v3.0
def extract_rsa_private(self):
        """Extract RSA private key

        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall(
            r"-----BEGIN RSA PRIVATE KEY-----", self._convert_to_str()
        )
        return self 
Example #19
Source File: tokenization_ctrl.py    From exbert with Apache License 2.0
def _tokenize(self, text):
        """ Tokenize a string.
        """
        split_tokens = []

        words = re.findall(r"\S+\n?", text)

        for token in words:
            split_tokens.extend([t for t in self.bpe(token).split(" ")])
        return split_tokens 
Example #20
Source File: tokenization_gpt2.py    From bert_on_stilts with Apache License 2.0
def encode(self, text):
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        if len(bpe_tokens) > self.max_len:
            raise ValueError(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this OpenAI GPT-2 model ({} > {}). Running this"
                " sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len)
            )
        return bpe_tokens 
Example #21
Source File: feature_engineering.py    From coling2018_fake-news-challenge with Apache License 2.0
def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric

    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower() 
Example #22
Source File: regexps_field_detection.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def get_numbers_from_str(self) -> Tuple[List[int], List[int]]:
        str_parts = self.selected_columns_str.split(':')
        if len(str_parts) != 2:
            raise RuntimeError('Selected columns field should contain exactly two parts ' +
                               'separated by ":" ("A: B" or "2, 1: 1" or "B, C: D, E" ...)')
        col_indices = ([], [],)  # type: Tuple[List[int], List[int]]
        for i in range(2):
            for col_str in re.findall(r'\d+', str_parts[i]):
                col_index = int(col_str) - 1
                col_indices[i].append(col_index)
            if not col_indices[i]:
                search_str = str_parts[i].lower()
                start_order = ord('a')
                for col_str in re.findall(r'[a-z]{1,1}', search_str):
                    col_index = ord(col_str) - start_order
                    col_indices[i].append(col_index)

        errors = []
        if not col_indices[0]:
            errors.append('Selected columns field: left part of ":" (value columns) should ' +
                          'contain at least one number or letter')
        if len(col_indices[0]) > 2:
            errors.append('Selected columns field: left part of ":" (value columns) should ' +
                          'contain one or two values')
        if not col_indices[1]:
            errors.append('Selected columns field: right part of ":" (search columns) should ' +
                          'contain at least one number or letter')
        if errors:
            raise RuntimeError('\n'.join(errors))

        return col_indices 
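
The two findall() passes above first look for digit column indices and, only if none are found, fall back to single letters mapped a=0, b=1, and so on. A short sketch on one half of a hypothetical column spec:

import re

part = "B, C"                                  # one half of a spec like "B, C: D"
print(re.findall(r'\d+', part))                # [] -- no numeric column indices
letters = re.findall(r'[a-z]{1,1}', part.lower())
print([ord(c) - ord('a') for c in letters])    # [1, 2] -- columns B and C
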
Example #23
Source File: search.py    From chepy with GNU General Public License v3.0
def search_twilio_key(self):
        """Search for Twilio api key
        
        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall("SK[a-z0-9]{32}", self._convert_to_str())
        return self 
Example #24
Source File: search.py    From chepy with GNU General Public License v3.0
def search_private_key(self):
        """Search varios private key headers
        
        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall(
            "-----BEGIN (RSA|OPENSSH|DSA|EC) PRIVATE KEY-----", self._convert_to_str()
        )
        return self 
Example #25
Source File: search.py    From chepy with GNU General Public License v3.0
def search_slack_webhook(self):
        """Search slack webhook
        
        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall(
            "https://hooks\.slack\.com/services/T[a-zA-Z0-9_]{8}/B[a-zA-Z0-9_]{8}/[a-zA-Z0-9_]{24}",
            self._convert_to_str(),
        )
        return self 
Example #26
Source File: search.py    From chepy with GNU General Public License v3.0
def search_slack_tokens(self):
        """Search slack tokens
        
        Returns:
            Chepy: The Chepy object. 

        Examples:
            >>> Chepy("tests/files/flags").read_file().search_slack_tokens().get_by_index(0)
            xoxp...859
        """
        self.state = re.findall(
            "(xox[p|b|o|a]-[0-9]{12}-[0-9]{12}-[0-9]{12}-[a-z0-9]{32})",
            self._convert_to_str(),
        )
        return self 
Example #27
Source File: extractors.py    From chepy with GNU General Public License v3.0
def extract_base64(self, min: int = 20):
        """Extract base64 encoded strings
        
        Args:
            min (int, optional): Minimum length to match. Defaults to 20.
        
        Returns:
            Chepy: The Chepy object. 
        """
        found = re.findall("[a-zA-Z0-9+/=]{%s,}" % str(20), self._convert_to_str())
        if len(found) > 1: # pragma: no cover
            self.state = found
        else:
            self.state = found[0]
        return self 
Example #28
Source File: extractors.py    From chepy with GNU General Public License v3.0
def extract_jwt_token(self):
        """Extract JWT token

        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall(
            r"ey[A-Za-z0-9_-]*\.[A-Za-z0-9._-]*|ey[A-Za-z0-9_\/+-]*\.[A-Za-z0-9._\/+-]*",
            self._convert_to_str(),
        )
        return self 
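
An illustrative run of that JWT pattern on a fabricated token embedded in surrounding text:

import re

text = "token=eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjMifQ.abc123 rest"   # fabricated example token
print(re.findall(
    r"ey[A-Za-z0-9_-]*\.[A-Za-z0-9._-]*|ey[A-Za-z0-9_\/+-]*\.[A-Za-z0-9._\/+-]*",
    text,
))
# ['eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjMifQ.abc123']
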
Example #29
Source File: extractors.py    From chepy with GNU General Public License v3.0
def extract_dsa_private(self):
        """Extract DSA private key

        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall(
            r"-----BEGIN DSA PRIVATE KEY-----", self._convert_to_str()
        )
        return self 
Example #30
Source File: extractors.py    From chepy with GNU General Public License v3.0
def extract_paypal_bt(self):
        """Extract Paypal braintree access token

        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall(
            r"access_token\$production\$[0-9a-z]{16}\$[0-9a-f]{32}",
            self._convert_to_str(),
        )
        return self