Python regex.findall() Examples

The following are 30 code examples of regex.findall(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the regex module.
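
Before the project examples, here is a minimal sketch of what findall() returns. It assumes the third-party regex module (installed with pip install regex), which is a drop-in superset of the standard re module; the first two calls behave the same with re, while the \p{...} class in the last call is regex-only.

import regex

text = "cat 12, dog 7, bird 42"

# Without capture groups, findall() returns every non-overlapping match as a string.
print(regex.findall(r"\d+", text))          # ['12', '7', '42']

# With capture groups, it returns one tuple of groups per match.
print(regex.findall(r"(\w+) (\d+)", text))  # [('cat', '12'), ('dog', '7'), ('bird', '42')]

# Unlike re, regex supports Unicode property classes such as \p{L} (any letter).
print(regex.findall(r"\p{L}+", text))       # ['cat', 'dog', 'bird']
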
Example #1
Source File: featuretable.py    From panphon with MIT License
def compile_regex_from_str(self, pat):
        """Given a string describing features masks for a sequence of segments,
        return a compiled regex matching the corresponding strings.

        Args:
            pat (str): feature masks, each enclosed in square brackets, in
            which the features are delimited by any standard delimiter.

        Returns:
           Pattern: regular expression pattern equivalent to `pat`
        """
        s2n = {'-': -1, '0': 0, '+': 1}
        seg_res = []
        for mat in re.findall(r'\[[^]]+\]+', pat):
            ft_mask = {k: s2n[v] for (v, k) in re.findall(r'([+-])(\w+)', mat)}
            segs = self.all_segs_matching_fts(ft_mask)
            seg_res.append('({})'.format('|'.join(segs)))
        regexp = ''.join(seg_res)
        return re.compile(regexp) 
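
To make the two findall() passes above concrete, here is a small sketch on a made-up feature-mask string; the segment lookup (all_segs_matching_fts) is left out because it depends on the loaded feature table.

import re

pat = "[+syl -cons][-syl +nas]"          # hypothetical input

# First pass: pull out each bracketed feature mask.
masks = re.findall(r'\[[^]]+\]+', pat)
print(masks)                              # ['[+syl -cons]', '[-syl +nas]']

# Second pass: turn each mask into a {feature: value} dict.
s2n = {'-': -1, '0': 0, '+': 1}
for mat in masks:
    print({k: s2n[v] for (v, k) in re.findall(r'([+-])(\w+)', mat)})
    # {'syl': 1, 'cons': -1} then {'syl': -1, 'nas': 1}
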
Example #2
Source File: extractors.py    From chepy with GNU General Public License v3.0
def extract_strings(self, length: int = 4):
        """Extract strings from state
        
        Args:
            length (int, optional): Min length of string. Defaults to 4.
        
        Returns:
            Chepy: The Chepy object. 

        Examples:
            >>> Chepy("tests/files/hello").load_file().extract_strings().o
            [
                b'__PAGEZERO',
                b'__TEXT',
                b'__text',
                b'__TEXT',
                b'__stubs',
                b'__TEXT',
                ...
            ]
        """
        pattern = b"[^\x00-\x1F\x7F-\xFF]{" + str(length).encode() + b",}"
        self.state = re.findall(pattern, self._convert_to_bytes())
        return self 
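
The byte pattern above keeps runs of printable ASCII that are at least length characters long, much like the Unix strings utility. A quick sketch on made-up bytes:

import re

data = b"\x00\x01__TEXT\x00\x7f__stubs\x02ab\x03"   # hypothetical binary data
length = 4
pattern = b"[^\x00-\x1F\x7F-\xFF]{" + str(length).encode() + b",}"
print(re.findall(pattern, data))                     # [b'__TEXT', b'__stubs']
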
Example #3
Source File: test_teachers.py    From ParlAI with MIT License
def _test_display_output(self, image_mode):
        """
        Test display data output with given image_mode.
        """
        with testing_utils.tempdir() as tmpdir:
            data_path = tmpdir
            os.makedirs(os.path.join(data_path, 'ImageTeacher'))

            opt = {
                'task': 'integration_tests:ImageTeacher',
                'datapath': data_path,
                'image_mode': image_mode,
                'display_verbose': True,
            }
            output = testing_utils.display_data(opt)
            train_labels = re.findall(r"\[labels\].*\n", output[0])
            valid_labels = re.findall(r"\[eval_labels\].*\n", output[1])
            test_labels = re.findall(r"\[eval_labels\].*\n", output[2])

            for i, lbls in enumerate([train_labels, valid_labels, test_labels]):
                self.assertGreater(len(lbls), 0, 'DisplayData failed')
                self.assertEqual(len(lbls), len(set(lbls)), output[i]) 
Example #4
Source File: parse.py    From olapy with GNU Lesser General Public License v2.1
def get_nested_select(self):
        """Get tuples groups in query like ::

                Select {
                    ([Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 19,2010],
                    [Geography].[Geography].[Continent].[Europe],
                    [Measures].[Amount]),

                    ([Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 17,2010],
                    [Geography].[Geography].[Continent].[Europe],
                    [Measures].[Amount])
                    }

                out :
                    ['[Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 19,2010],\
                    [Geography].[Geography].[Continent].[Europe],[Measures].[Amount]',

                    '[Time].[Time].[Day].[2010].[Q2 2010].[May 2010].[May 17,2010],\
                    [Geography].[Geography].[Continent].[Europe],[Measures].[Amount]']

        :return: All groups as list of strings.
        """
        return regex.findall(r"\(([^()]+)\)", self.mdx_query) 
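
A short illustration of that parenthesis pattern on a cut-down, hypothetical MDX query; [^()]+ keeps each match inside a single pair of parentheses, so every tuple group comes back as one string:

import regex

mdx_query = """SELECT {
    ([Time].[2010], [Measures].[Amount]),
    ([Time].[2011], [Measures].[Amount])
}"""
print(regex.findall(r"\(([^()]+)\)", mdx_query))
# ['[Time].[2010], [Measures].[Amount]', '[Time].[2011], [Measures].[Amount]']
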
Example #5
Source File: scorer.py    From nmt-chatbot with GNU General Public License v3.0
def ascii_emoticons(index, question, answer):
    global valid_emoticon

    valid_emoticon = False

    # Disabled
    if score_settings['ascii_emoticon_modifier_value'] is None:
        return 0

    # Split by words (tokens)
    tokens = answer.split()

    # Calculate emoticon score
    score = [1 if len(token) > 1 and len(re.findall('[^a-zA-Z0-9]', token)) / len(token) > score_settings['ascii_emoticon_non_char_to_all_chars_ratio'] else 0 for token in tokens]
    score = sum([1 if (index > 0 and score[index - 1] == 0 and value == 1) or (index == 0 and value == 1) else 0 for index, value in enumerate(score)]) * score_settings['ascii_emoticon_modifier_value']

    if score:
        valid_emoticon = True

    return score

# Check if sentence includes 'unk' token 
Example #6
Source File: utils.py    From open-syllabus-project with Apache License 2.0
def tokenize_field(value):

    """
    Extract normalized tokens from a field.

    Args:
        value (str): The field value.

    Returns:
        list: The cleaned tokens.
    """

    # Extract tokens.
    tokens = regex.findall(r'\p{L}{2,}', value.lower())

    # Remove articles.
    tokens = [t for t in tokens if t not in [
        'a', 'an', 'the', 'and',
    ]]

    return tokens 
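
The \p{L}{2,} pattern matches runs of two or more Unicode letters and only works with the third-party regex module (the standard re module has no \p{...} support). A small sketch:

import regex

value = "The Théorie of 3 Words"
tokens = regex.findall(r'\p{L}{2,}', value.lower())
print(tokens)   # ['the', 'théorie', 'of', 'words']
# Dropping the articles listed above leaves ['théorie', 'of', 'words'].
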
Example #7
Source File: tokenization.py    From language with Apache License 2.0
def tokenize(self, text):
    bpe_tokens = []
    list_starts, str_starts = [], []
    basic_tokens = text if isinstance(text, list) else [text]
    for i, basic_token in enumerate(basic_tokens):
      num_subtokens = 0
      basic_token = basic_token if (i == 0 or not isinstance(text, list)) else (
          ' ' + basic_token)
      for token in re.findall(self.bpe.pat, basic_token):
        token = ''.join(self.bpe.byte_encoder[b] for b in token.encode('utf-8'))
        sub_tokens = [bpe_token for bpe_token in self.bpe.bpe(token).split(' ')]
        bpe_tokens.extend(sub_tokens)
        str_starts += [True] + [False] * (len(sub_tokens) - 1)
        num_subtokens += len(sub_tokens)
      list_starts += [True] + [False] * (num_subtokens - 1)
    word_starts = list_starts if isinstance(text, list) else str_starts
    assert len(bpe_tokens) == len(word_starts)
    return bpe_tokens, word_starts 
Example #8
Source File: language.py    From chepy with GNU General Public License v3.0
def unicode_chrs_by_lang(self, lang: str):
        """Detect language specific characters
        
        Detect characters from various Unicode code point ids. Examples 
        of languages are Common, Arabic, Armenian, Bengali, Bopomofo, Braille, 
        Buhid, Canadian_Aboriginal, Cherokee, Cyrillic, Devanagari, Ethiopic, 
        Georgian, Greek, Gujarati, Gurmukhi, Han, Hangul, Hanunoo, Hebrew, 
        Hiragana, Inherited, Kannada, Katakana, Khmer, Lao, Latin, Limbu, 
        Malayalam, Mongolian, Myanmar, Ogham, Oriya, Runic, Sinhala, Syriac, 
        Tagalog, Tagbanwa, TaiLe, Tamil, Telugu, Thaana, Thai, Tibetan, Yi, 
        but other code points should work also.
        
        Args:
            lang (str): Required. A string value identifying the language. 
        
        Returns:
            Chepy: The Chepy object.
        """
        self.state = re.findall(r"\p{" + lang + "}", self._convert_to_str())
        return self 
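
For the \p{...} script classes above to work, re must resolve to the third-party regex module (for example via import regex as re), since the standard library does not support them. An illustrative sketch on a made-up mixed-script string:

import regex

text = "abc αβγ где"
print(regex.findall(r"\p{Greek}", text))      # ['α', 'β', 'γ']
print(regex.findall(r"\p{Cyrillic}", text))   # ['г', 'д', 'е']
print(regex.findall(r"\p{Latin}", text))      # ['a', 'b', 'c']
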
Example #9
Source File: utils.py    From chepy with GNU General Public License v3.0
def count_occurances(self, regex: str, case_sensitive: bool = False):
        """Counts occurances of the regex.

        Counts the number of times the provided string occurs.

        Args:
            regex (str): Required. Regex string to search for
            case_sensitive (bool, optional): Whether the search should be case sensitive. Defaults to False.

        Returns:
            Chepy: The Chepy object.

        Examples:
            >>> Chepy("AABCDADJAKDJHKSDAJSDdaskjdhaskdjhasdkja").count_occurances("ja").output
            2
        """
        if case_sensitive:
            r = re.compile(regex)
        else:
            r = re.compile(regex, re.IGNORECASE)
        self.state = len(r.findall(self._convert_to_str()))
        return self 
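
Outside the Chepy wrapper, the counting logic boils down to compiling the pattern (with re.IGNORECASE unless case_sensitive is set) and taking the length of findall():

import re

data = "AABCDADJAKDJHKSDAJSDdaskjdhaskdjhasdkja"
r = re.compile("ja", re.IGNORECASE)   # case_sensitive=False branch
print(len(r.findall(data)))           # 2 -- matches 'JA' and the trailing 'ja'
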
Example #10
Source File: search.py    From chepy with GNU General Public License v3.0
def search_ctf_flags(self, prefix: str, postfix: str = ".+?\{*\}"):
        """Search CTF style flags. 

        This by default assumes that the flag format is 
        something like picoCTF{some_flag}. 
        
        Args:
            prefix (str): Prefix of the flag. Like `picoCTF`
            postfix (str, optional): Regex for the remainder of the flag. 
                Defaults to `.+?\{*\}`.
        
        Returns:
            Chepy: The Chepy object. 

        Examples:
            >>> Chepy("tests/files/flags").read_file().search_ctf_flags("pico").get_by_index(0)
            picoCTF{r3source_pag3_f1ag}
        """
        self.state = re.findall(prefix + postfix, self._convert_to_str(), re.IGNORECASE)
        return self 
Example #11
Source File: pygrok.py    From pygrok with MIT License
def _load_search_pattern(self):
        self.type_mapper = {}
        py_regex_pattern = self.pattern
        while True:
            # Finding all types specified in the groks
            m = re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern)
            for n in m:
                self.type_mapper[n[1]] = n[2]
            #replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type})
            # with regex and regex group name

            py_regex_pattern = re.sub(r'%{(\w+):(\w+)(?::\w+)?}',
                lambda m: "(?P<" + m.group(2) + ">" + self.predefined_patterns[m.group(1)].regex_str + ")",
                py_regex_pattern)

            #replace %{pattern_name} with regex
            py_regex_pattern = re.sub(r'%{(\w+)}',
                lambda m: "(" + self.predefined_patterns[m.group(1)].regex_str + ")",
                py_regex_pattern)

            if re.search(r'%{\w+(:\w+)?}', py_regex_pattern) is None:
                break

        self.regex_obj = re.compile(py_regex_pattern) 
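
The three-group findall() at the top of that loop only picks up grok placeholders that carry a type. A hypothetical pattern string makes the difference visible:

import re

py_regex_pattern = "%{NUMBER:bytes:int} %{WORD:method} %{IP:client}"   # made-up grok pattern
print(re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern))
# [('NUMBER', 'bytes', 'int')] -- only the typed placeholder has all three parts
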
Example #12
Source File: encoder.py    From Few-Shot-NLG with MIT License
def encode(self, text):
        bpe_tokens = []
        bpe_token_original = []
        for token in re.findall(self.pat, text):
            # print (token)
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            ### handle oov
            for bpe_token in self.bpe(token).split(' '):
                if bpe_token in self.encoder:
                    bpe_tokens.append(self.encoder[bpe_token])
                else:
                    bpe_tokens.append(self.encoder["empty"])


            # bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
            bpe_token_original.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens, bpe_token_original 
Example #13
Source File: tokenization_roberta.py    From HPSG-Neural-Parser with MIT License
def _tokenize(self, text):
        """ Tokenize a string. """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            if sys.version_info[0] == 2:
                token = ''.join(self.byte_encoder[ord(b)] for b in token)
            else:
                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens 
Example #14
Source File: tokenization_gpt2.py    From exbert with Apache License 2.0
def _tokenize(self, text):
        """ Tokenize a string. """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens 
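
In these GPT-2 style tokenizers, re is typically the third-party regex module imported under the name re, and self.pat is roughly the pre-tokenization pattern below; treat the exact pattern as an assumption for illustration rather than the library's constant:

import regex as re

pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
print(re.findall(pat, "Hello world, it's 2024!"))
# ['Hello', ' world', ',', ' it', "'s", ' 2024', '!']
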
Example #15
Source File: புணர்ச்சி.py    From pytamil with MIT License
def __init__(self, txt):
        val = regex.findall(r'(.*)\+(.*)\=(.*)',txt)
        self.நிலைமொழி = val[0][0].strip()
        self.வருமொழி = val[0][1].strip()
        self.தொடர்மொழி = val[0][2].strip()
        self.நிலைமொழி_regex = _convert_to_regex(self.நிலைமொழி)
        self.வருமொழி_regex = _convert_to_regex(self.வருமொழி)
        self.வாக்கியம் = txt 
Example #16
Source File: புணர்ச்சி.py    From pytamil with MIT License
def _get_regex_chars(charslist):
    p=''
    for c in charslist:
        p=p + c.strip() +'|'

    p = p[:-1] # remove trailing '|' symbol
    return p


# def matchவிதிகள்(pattern, பதம்):
#     # tokenize
#     tokens = re.findall(r'\((.*?)\)', pattern)
#     # print(tokens)
#     regexpat = ""

#     # expand macros and convert to regex patterns
#     for token in tokens:
#         if token in எழுத்து.எழுத்துக்கள்.keys():
#             expanded = எழுத்து.எழுத்துக்கள்[token] # macro expansion e.g. expand "உயிர்" to "அ, ஆ, இ, ஈ, உ, ஊ, எ, ஏ, ஐ, ஒ, ஓ, ஔ"
#             chars = _get_chars(expanded)  # convert "அ, இ, உ, எ, ஒ" to "அ|இ|உ|எ|ஒ"
#             regexpat = regexpat + "[" + chars + "]"
#         elif token == "...":
#             regexpat = regexpat + ".*"
#         else :
#             chars = _get_chars(token.split(","))
#             regexpat = regexpat + "[" + chars + "]" # convert "அ, இ, உ, எ, ஒ" t0 "அ|இ|உ|எ|ஒ"

#     regexpat = regexpat + '$'
#     # print(regexpat)

#     # match regex
#     matchval = re.match(regexpat,எழுத்து.உயிர்மெய்விரி(பதம்))

#     return matchval 
Example #17
Source File: புணர்ச்சி.py    From pytamil with MIT License
def _convert_to_regex(pattern):
    # tokenize
    tokens = regex.findall(r'\((.*?)\)', pattern)
    # print(tokens)
    regexpat = ""

    # expand macros and convert to regex patterns
    for token in tokens:
        if token in எழுத்து._எழுத்துக்கள்.keys():
            expanded = எழுத்து._எழுத்துக்கள்[token] # macro expansion e.g. expand "உயிர்" to "அ, ஆ, இ, ஈ, உ, ஊ, எ, ஏ, ஐ, ஒ, ஓ, ஔ"
            chars = _get_regex_chars(expanded)  # convert "அ, இ, உ, எ, ஒ" to "அ|இ|உ|எ|ஒ"
            regexpat = regexpat + "(" + chars + ")"
        elif token == "...":
            regexpat = regexpat + ".*"
        elif token == 'தனிக்குறில்':
            regexpat = regexpat + "..(அ|இ|உ|எ|ஒ)"
        else :
            chars = _get_regex_chars(token.split(","))
            regexpat = regexpat + "(" + chars + ")" # convert "அ, இ, உ, எ, ஒ" t0 "அ|இ|உ|எ|ஒ"

    # regexpat = regexpat + '$'
    # print(regexpat)

    return regexpat

    ## convert "அ, இ, உ, எ, ஒ" to "அ|இ|உ|எ|ஒ" 
Example #18
Source File: extractors.py    From chepy with GNU General Public License v3.0
def extract_rsa_private(self):
        """Extract RSA private key

        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall(
            r"-----BEGIN RSA PRIVATE KEY-----", self._convert_to_str()
        )
        return self 
Example #19
Source File: tokenization_ctrl.py    From exbert with Apache License 2.0
def _tokenize(self, text):
        """ Tokenize a string.
        """
        split_tokens = []

        words = re.findall(r"\S+\n?", text)

        for token in words:
            split_tokens.extend([t for t in self.bpe(token).split(" ")])
        return split_tokens 
Example #20
Source File: tokenization_gpt2.py    From bert_on_stilts with Apache License 2.0
def encode(self, text):
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        if len(bpe_tokens) > self.max_len:
            raise ValueError(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this OpenAI GPT-2 model ({} > {}). Running this"
                " sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len)
            )
        return bpe_tokens 
Example #21
Source File: feature_engineering.py    From coling2018_fake-news-challenge with Apache License 2.0
def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric

    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower() 
Example #22
Source File: regexps_field_detection.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def get_numbers_from_str(self) -> Tuple[List[int], List[int]]:
        str_parts = self.selected_columns_str.split(':')
        if len(str_parts) != 2:
            raise RuntimeError('Selected columns field should contain exactly two parts ' +
                               'separated by ":" ("A: B" or "2, 1: 1" or "B, C: D, E" ...)')
        col_indices = ([], [],)  # type: Tuple[List[int], List[int]]
        for i in range(2):
            for col_str in re.findall(r'\d+', str_parts[i]):
                col_index = int(col_str) - 1
                col_indices[i].append(col_index)
            if not col_indices[i]:
                search_str = str_parts[i].lower()
                start_order = ord('a')
                for col_str in re.findall(r'[a-z]{1,1}', search_str):
                    col_index = ord(col_str) - start_order
                    col_indices[i].append(col_index)

        errors = []
        if not col_indices[0]:
            errors.append('Selected columns field: left part of ":" (value columns) should ' +
                          'contain at least one number or letter')
        if len(col_indices[0]) > 2:
            errors.append('Selected columns field: left part of ":" (value columns) should ' +
                          'contain one or two values')
        if not col_indices[1]:
            errors.append('Selected columns field: right part of ":" (search columns) should ' +
                          'contain at least one number or letter')
        if errors:
            raise RuntimeError('\n'.join(errors))

        return col_indices 
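
The two findall() passes above first look for digit column indices and, only if none are found, fall back to single letters mapped a=0, b=1, and so on. A short sketch on one half of a hypothetical column spec:

import re

part = "B, C"                                  # one half of a spec like "B, C: D"
print(re.findall(r'\d+', part))                # [] -- no numeric column indices
letters = re.findall(r'[a-z]{1,1}', part.lower())
print([ord(c) - ord('a') for c in letters])    # [1, 2] -- columns B and C
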
Example #23
Source File: search.py    From chepy with GNU General Public License v3.0
def search_twilio_key(self):
        """Search for Twilio api key
        
        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall("SK[a-z0-9]{32}", self._convert_to_str())
        return self 
Example #24
Source File: search.py    From chepy with GNU General Public License v3.0
def search_private_key(self):
        """Search varios private key headers
        
        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall(
            "-----BEGIN (RSA|OPENSSH|DSA|EC) PRIVATE KEY-----", self._convert_to_str()
        )
        return self 
Example #25
Source File: search.py    From chepy with GNU General Public License v3.0
def search_slack_webhook(self):
        """Search slack webhook
        
        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall(
            "https://hooks\.slack\.com/services/T[a-zA-Z0-9_]{8}/B[a-zA-Z0-9_]{8}/[a-zA-Z0-9_]{24}",
            self._convert_to_str(),
        )
        return self 
Example #26
Source File: search.py    From chepy with GNU General Public License v3.0
def search_slack_tokens(self):
        """Search slack tokens
        
        Returns:
            Chepy: The Chepy object. 

        Examples:
            >>> Chepy("tests/files/flags").read_file().search_slack_tokens().get_by_index(0)
            xoxp...859
        """
        self.state = re.findall(
            "(xox[p|b|o|a]-[0-9]{12}-[0-9]{12}-[0-9]{12}-[a-z0-9]{32})",
            self._convert_to_str(),
        )
        return self 
Example #27
Source File: extractors.py    From chepy with GNU General Public License v3.0
def extract_base64(self, min: int = 20):
        """Extract base64 encoded strings
        
        Args:
            min (int, optional): Minimum length to match. Defaults to 20.
        
        Returns:
            Chepy: The Chepy object. 
        """
        found = re.findall("[a-zA-Z0-9+/=]{%s,}" % str(20), self._convert_to_str())
        if len(found) > 1: # pragma: no cover
            self.state = found
        else:
            self.state = found[0]
        return self 
Example #28
Source File: extractors.py    From chepy with GNU General Public License v3.0
def extract_jwt_token(self):
        """Extract JWT token

        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall(
            r"ey[A-Za-z0-9_-]*\.[A-Za-z0-9._-]*|ey[A-Za-z0-9_\/+-]*\.[A-Za-z0-9._\/+-]*",
            self._convert_to_str(),
        )
        return self 
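
An illustrative run of that JWT pattern on a fabricated token embedded in surrounding text:

import re

text = "token=eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjMifQ.abc123 rest"   # fabricated example token
print(re.findall(
    r"ey[A-Za-z0-9_-]*\.[A-Za-z0-9._-]*|ey[A-Za-z0-9_\/+-]*\.[A-Za-z0-9._\/+-]*",
    text,
))
# ['eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjMifQ.abc123']
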
Example #29
Source File: extractors.py    From chepy with GNU General Public License v3.0
def extract_dsa_private(self):
        """Extract DSA private key

        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall(
            r"-----BEGIN DSA PRIVATE KEY-----", self._convert_to_str()
        )
        return self 
Example #30
Source File: extractors.py    From chepy with GNU General Public License v3.0
def extract_paypal_bt(self):
        """Extract Paypal braintree access token

        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall(
            r"access_token\$production\$[0-9a-z]{16}\$[0-9a-f]{32}",
            self._convert_to_str(),
        )
        return self