Python re.UNICODE Examples
The following are 30 code examples of the re.UNICODE flag, taken from real open-source projects.
You can go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the re module.
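Before the individual examples, here is a minimal sketch (not taken from any of the projects below) of what the flag actually changes: under Python 2, re.UNICODE makes \w, \d, \s and the word-boundary escapes follow Unicode character properties instead of ASCII; under Python 3 this is already the default for str patterns, so the flag mostly appears in code that still supports Python 2.

import re

text = u"naïve café 42"

# With re.UNICODE, \w matches accented letters as word characters.
print(re.findall(r"\w+", text, re.UNICODE))   # ['naïve', 'café', '42']

# Under Python 2, the same pattern without the flag splits on the accented
# characters; under Python 3, str patterns behave this way by default.
print(re.findall(r"\w+", text))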
Example #1
Source File: magic_check_fn.py From recipes-py with Apache License 2.0 | 10 votes |
def render_re(regex):
    """Renders a repr()-style value for a compiled regular expression."""
    actual_flags = []
    if regex.flags:
        flags = [
            (re.IGNORECASE, 'IGNORECASE'),
            (re.LOCALE, 'LOCALE'),
            (re.UNICODE, 'UNICODE'),
            (re.MULTILINE, 'MULTILINE'),
            (re.DOTALL, 'DOTALL'),
            (re.VERBOSE, 'VERBOSE'),
        ]
        for val, name in flags:
            if regex.flags & val:
                actual_flags.append(name)
    if actual_flags:
        return 're.compile(%r, %s)' % (regex.pattern, '|'.join(actual_flags))
    else:
        return 're.compile(%r)' % regex.pattern
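A quick usage sketch for render_re (illustrative only, not part of the original file); the exact output depends on which flags the compiled pattern carries:

import re

pat = re.compile(r'\w+', re.IGNORECASE | re.UNICODE)
print(render_re(pat))   # e.g. re.compile('\\w+', IGNORECASE|UNICODE)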
Example #2
Source File: urlresolvers.py From GTDWeb with GNU General Public License v2.0 | 6 votes |
def regex(self):
    """
    Returns a compiled regular expression, depending upon the activated
    language-code.
    """
    language_code = get_language()
    if language_code not in self._regex_dict:
        if isinstance(self._regex, six.string_types):
            regex = self._regex
        else:
            regex = force_text(self._regex)
        try:
            compiled_regex = re.compile(regex, re.UNICODE)
        except re.error as e:
            raise ImproperlyConfigured(
                '"%s" is not a valid regular expression: %s' %
                (regex, six.text_type(e)))
        self._regex_dict[language_code] = compiled_regex
    return self._regex_dict[language_code]
Example #3
Source File: gitdm.py From grimoirelab-sortinghat with GNU General Public License v3.0 | 6 votes |
def __parse_domain_to_employer_line(self, raw_domain, raw_org):
    """Parse domain to employer lines"""

    d = re.match(self.DOMAIN_REGEX, raw_domain, re.UNICODE)
    if not d:
        cause = "invalid domain format: '%s'" % raw_domain
        raise InvalidFormatError(cause=cause)

    dom = d.group('domain').strip()

    o = re.match(self.ORGANIZATION_REGEX, raw_org, re.UNICODE)
    if not o:
        cause = "invalid organization format: '%s'" % raw_org
        raise InvalidFormatError(cause=cause)

    org = o.group('organization').strip()

    org = self.__encode(org)
    dom = self.__encode(dom)

    return org, dom
Example #4
Source File: inlinepatterns.py From lambda-packs with MIT License | 6 votes |
def __init__(self, pattern, markdown_instance=None):
    """Create an instance of an inline pattern.

    Keyword arguments:

    * pattern: A regular expression that matches a pattern

    """
    self.pattern = pattern
    self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
                                  re.DOTALL | re.UNICODE)

    # Api for Markdown to pass safe_mode into instance
    self.safe_mode = False
    if markdown_instance:
        self.markdown = markdown_instance
Example #5
Source File: test_re.py From ironpython2 with Apache License 2.0 | 6 votes |
def test_bug_6561(self):
    # '\d' should match characters in Unicode category 'Nd'
    # (Number, Decimal Digit), but not those in 'Nl' (Number,
    # Letter) or 'No' (Number, Other).
    decimal_digits = [
        unichr(0x0037),  # '\N{DIGIT SEVEN}', category 'Nd'
        unichr(0x0e58),  # '\N{THAI DIGIT SIX}', category 'Nd'
        unichr(0xff10),  # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
        ]
    for x in decimal_digits:
        self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x)

    not_decimal_digits = [
        unichr(0x2165),  # '\N{ROMAN NUMERAL SIX}', category 'Nl'
        unichr(0x3039),  # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
        unichr(0x2082),  # '\N{SUBSCRIPT TWO}', category 'No'
        unichr(0x32b4),  # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
        ]
    for x in not_decimal_digits:
        self.assertIsNone(re.match('^\d$', x, re.UNICODE))
Example #6
Source File: test_re.py From jawfish with MIT License | 6 votes |
def test_ascii_and_unicode_flag(self):
    # String patterns
    for flags in (0, re.UNICODE):
        pat = re.compile('\xc0', flags | re.IGNORECASE)
        self.assertNotEqual(pat.match('\xe0'), None)
        pat = re.compile('\w', flags)
        self.assertNotEqual(pat.match('\xe0'), None)
    pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
    self.assertEqual(pat.match('\xe0'), None)
    pat = re.compile('(?a)\xc0', re.IGNORECASE)
    self.assertEqual(pat.match('\xe0'), None)
    pat = re.compile('\w', re.ASCII)
    self.assertEqual(pat.match('\xe0'), None)
    pat = re.compile('(?a)\w')
    self.assertEqual(pat.match('\xe0'), None)
    # Bytes patterns
    for flags in (0, re.ASCII):
        pat = re.compile(b'\xc0', re.IGNORECASE)
        self.assertEqual(pat.match(b'\xe0'), None)
        pat = re.compile(b'\w')
        self.assertEqual(pat.match(b'\xe0'), None)
    # Incompatibilities
    self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
    self.assertRaises(ValueError, re.compile, b'(?u)\w')
    self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
    self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
    self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
    self.assertRaises(ValueError, re.compile, '(?au)\w')
Example #7
Source File: lexer.py From pyhcl with Mozilla Public License 2.0 | 6 votes |
def __init__(self, export_comments=None):
    if export_comments is not None:
        if export_comments == 'LINE':
            self.can_export_comments = ['COMMENT']
        elif export_comments == 'MULTILINE':
            self.can_export_comments = ['MULTICOMMENT']
        elif export_comments == 'ALL':
            self.can_export_comments = ['COMMENT', 'MULTICOMMENT']
        else:
            raise ValueError(
                'Only `LINE`, `MULTILINE` and `ALL` value are allowed for '
                '`export_comments`. given: `%s`.' % export_comments
            )

    self.lex = lex.lex(
        module=self,
        debug=False,
        reflags=(re.UNICODE | re.MULTILINE),
        errorlog=lex.NullLogger(),
    )
Example #8
Source File: marshal.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def unmarshal(self, filename):
    """
    Unmarshals (loads from a plain text file) the tagger model. For
    safety, this operation is intended to be performed only on newly
    created taggers (i.e., without any previous model).

    @param filename: Name of the file from which the model will be read.
    @type filename: C{string}
    """
    handler = file(filename, "r")

    pattern = re.compile(r'^(.+):(.+?)$', re.UNICODE)
    for line in handler.readlines():
        m = re.match(pattern, line)
        text, tag = m.groups()
        self._model[text] = tag

    handler.close()
Example #9
Source File: marshal.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def unmarshal(self, filename):
    """
    Unmarshals (loads from a plain text file) the tagger model. For
    safety, this operation is intended to be performed only on newly
    created taggers (i.e., without any previous model).

    @param filename: Name of the file from which the model will be read.
    @type filename: C{string}
    """
    handler = file(filename, "r")
    lines = handler.readlines()

    # will fail if "length " and "minlength " are not present
    self._length = int(lines[0].split("length ")[1])
    self._minlength = int(lines[1].split("minlength ")[1])

    pattern = re.compile(r'^(.+):(.+?)$', re.UNICODE)
    for line in lines[2:]:
        m = re.match(pattern, line)
        text, tag = m.groups()
        self._model[text] = tag

    handler.close()
Example #10
Source File: _stdlib.py From bugatsinho.github.io with GNU General Public License v3.0 | 6 votes |
def expandvars(path):
    """
    Args:
        path (pathlike): A path to expand
    Returns:
        `fsnative`

    Like :func:`python:os.path.expandvars` but supports unicode under
    Windows + Python 2 and always returns a `fsnative`.
    """
    path = path2fsn(path)

    def repl_func(match):
        return environ.get(match.group(1), match.group(0))

    path = re.compile(r"\$(\w+)", flags=re.UNICODE).sub(repl_func, path)
    if os.name == "nt":
        path = re.sub(r"%([^%]+)%", repl_func, path)
    return re.sub(r"\$\{([^\}]+)\}", repl_func, path)
Example #11
Source File: base.py From jbox with MIT License | 6 votes |
def __init__(self, **kwargs):
    """Construct a LONGTEXT.

    :param charset: Optional, a column-level character set for this string
      value.  Takes precedence to 'ascii' or 'unicode' short-hand.

    :param collation: Optional, a column-level collation for this string
      value.  Takes precedence to 'binary' short-hand.

    :param ascii: Defaults to False: short-hand for the ``latin1``
      character set, generates ASCII in schema.

    :param unicode: Defaults to False: short-hand for the ``ucs2``
      character set, generates UNICODE in schema.

    :param national: Optional. If true, use the server's configured
      national character set.

    :param binary: Defaults to False: short-hand, pick the binary
      collation type that matches the column's character set.  Generates
      BINARY in schema.  This does not affect the type of data stored,
      only the collation of character data.

    """
    super(LONGTEXT, self).__init__(**kwargs)
Example #12
Source File: base.py From jbox with MIT License | 6 votes |
def __init__(self, **kwargs):
    """Construct a MEDIUMTEXT.

    :param charset: Optional, a column-level character set for this string
      value.  Takes precedence to 'ascii' or 'unicode' short-hand.

    :param collation: Optional, a column-level collation for this string
      value.  Takes precedence to 'binary' short-hand.

    :param ascii: Defaults to False: short-hand for the ``latin1``
      character set, generates ASCII in schema.

    :param unicode: Defaults to False: short-hand for the ``ucs2``
      character set, generates UNICODE in schema.

    :param national: Optional. If true, use the server's configured
      national character set.

    :param binary: Defaults to False: short-hand, pick the binary
      collation type that matches the column's character set.  Generates
      BINARY in schema.  This does not affect the type of data stored,
      only the collation of character data.

    """
    super(MEDIUMTEXT, self).__init__(**kwargs)
Example #13
Source File: base.py From jbox with MIT License | 6 votes |
def __init__(self, **kwargs):
    """Construct a TINYTEXT.

    :param charset: Optional, a column-level character set for this string
      value.  Takes precedence to 'ascii' or 'unicode' short-hand.

    :param collation: Optional, a column-level collation for this string
      value.  Takes precedence to 'binary' short-hand.

    :param ascii: Defaults to False: short-hand for the ``latin1``
      character set, generates ASCII in schema.

    :param unicode: Defaults to False: short-hand for the ``ucs2``
      character set, generates UNICODE in schema.

    :param national: Optional. If true, use the server's configured
      national character set.

    :param binary: Defaults to False: short-hand, pick the binary
      collation type that matches the column's character set.  Generates
      BINARY in schema.  This does not affect the type of data stored,
      only the collation of character data.

    """
    super(TINYTEXT, self).__init__(**kwargs)
Example #14
Source File: regex.py From recruit with Apache License 2.0 | 6 votes |
def str_flags_to_int(str_flags):
    flags = 0
    if "i" in str_flags:
        flags |= re.IGNORECASE
    if "l" in str_flags:
        flags |= re.LOCALE
    if "m" in str_flags:
        flags |= re.MULTILINE
    if "s" in str_flags:
        flags |= re.DOTALL
    if "u" in str_flags:
        flags |= re.UNICODE
    if "x" in str_flags:
        flags |= re.VERBOSE

    return flags
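A hedged usage sketch, assuming str_flags_to_int from the example above; each letter maps to the corresponding inline-flag character of Python's regex syntax:

import re

flags = str_flags_to_int("iu")
assert flags == (re.IGNORECASE | re.UNICODE)
pattern = re.compile(r"\w+", flags)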
Example #15
Source File: hnd_ft.py From DeepLearn with MIT License | 5 votes |
def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()
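An illustrative call, assuming clean from the example above; punctuation disappears because \w+ only captures word characters:

print(clean("Hello, World!  Foo_bar 42"))   # -> 'hello world foo_bar 42'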
Example #16
Source File: alinea_lexer.py From DuraLex with GNU Affero General Public License v3.0 | 5 votes |
def skip_to_next_word(tokens, i):
    return skip_tokens(tokens, i, lambda t: not re.compile('[\wà]+', re.IGNORECASE | re.UNICODE).match(t))
Example #17
Source File: utils.py From ivre with GNU General Public License v3.0 | 5 votes |
def regexp2pattern(string):
    """This function takes a regexp or a string and returns a pattern and
    some flags, suitable for use with re.compile(), combined with another
    pattern before. Useful, for example, if you want to create a regexp
    like '^ *Set-Cookie: *[name]=[value]' where name and value are
    regexp.

    """
    if isinstance(string, REGEXP_T):
        flags = string.flags
        string = string.pattern
        patterns = (('^', '$', '.*')
                    if isinstance(string, str) else
                    (b'^', b'$', b'.*'))
        if string.startswith(patterns[0]):
            string = string[1:]
        # elif string.startswith('('):
        #     raise ValueError("Regexp starting with a group are not "
        #                      "(yet) supported")
        else:
            string = patterns[2] + string
        if string.endswith(patterns[1]):
            string = string[:-1]
        # elif string.endswith(')'):
        #     raise ValueError("Regexp ending with a group are not "
        #                      "(yet) supported")
        else:
            string += patterns[2]
        return string, flags
    return re.escape(string), re.UNICODE if isinstance(string, str) else 0
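A minimal sketch of how the returned pattern and flags can be embedded into a larger expression, following the docstring's Set-Cookie example. It assumes regexp2pattern and the REGEXP_T constant from the surrounding ivre module; the name and value regexps are made up for illustration:

import re

name_pat, name_flags = regexp2pattern(re.compile('^sessionid$'))
value_pat, value_flags = regexp2pattern(re.compile('^[0-9a-f]+$'))

# The anchors have been stripped, so the fragments compose cleanly.
cookie_re = re.compile('^ *Set-Cookie: *%s=%s' % (name_pat, value_pat),
                       name_flags | value_flags)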
Example #18
Source File: base.py From jbox with MIT License | 5 votes |
def _re_compile(regex):
    """Compile a string to regex, I and UNICODE."""

    return re.compile(regex, re.I | re.UNICODE)
Example #19
Source File: base.py From jbox with MIT License | 5 votes |
def _extend_string(self, type_, defaults, spec):
    """Extend a string-type declaration with standard SQL CHARACTER SET /
    COLLATE annotations and MySQL specific extensions.

    """

    def attr(name):
        return getattr(type_, name, defaults.get(name))

    if attr('charset'):
        charset = 'CHARACTER SET %s' % attr('charset')
    elif attr('ascii'):
        charset = 'ASCII'
    elif attr('unicode'):
        charset = 'UNICODE'
    else:
        charset = None

    if attr('collation'):
        collation = 'COLLATE %s' % type_.collation
    elif attr('binary'):
        collation = 'BINARY'
    else:
        collation = None

    if attr('national'):
        # NATIONAL (aka NCHAR/NVARCHAR) trumps charsets.
        return ' '.join([c for c in ('NATIONAL', spec, collation)
                         if c is not None])
    return ' '.join([c for c in (spec, charset, collation)
                     if c is not None])
Example #20
Source File: helpers.py From poetry with MIT License | 5 votes |
def escape_name(name):
    """Escaped wheel name as specified in :pep:`427#escaping-and-unicode`."""
    return re.sub(r"[^\w\d.]+", "_", name, flags=re.UNICODE)
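An illustrative call, assuming escape_name from the example above; any run of characters outside the allowed set collapses to a single underscore:

print(escape_name("my-package.name"))   # -> 'my_package.name'
print(escape_name("poetry core"))       # -> 'poetry_core'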
Example #21
Source File: test_re.py From ironpython2 with Apache License 2.0 | 5 votes |
def test_special_escapes(self):
    self.assertEqual(re.search(r"\b(b.)\b", "abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B", "abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"\b(b.)\b", "abcd abc bcd bx", re.LOCALE).group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B", "abc bcd bc abxd", re.LOCALE).group(1), "bx")
    if have_unicode:
        self.assertEqual(re.search(r"\b(b.)\b", "abcd abc bcd bx", re.UNICODE).group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B", "abc bcd bc abxd", re.UNICODE).group(1), "bx")
    self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
    self.assertEqual(re.search(r"\b(b.)\b", u"abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B", u"abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M))
    self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a").group(0), "1aa! a")
    self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a", re.LOCALE).group(0), "1aa! a")
    if have_unicode:
        self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a", re.UNICODE).group(0), "1aa! a")
Example #22
Source File: test_re.py From ironpython2 with Apache License 2.0 | 5 votes |
def test_bigcharset(self):
    self.assertEqual(re.match(u(r"([\u2222\u2223])"),
                              unichr(0x2222)).group(1), unichr(0x2222))
    self.assertEqual(re.match(u(r"([\u2222\u2223])"),
                              unichr(0x2222), re.UNICODE).group(1), unichr(0x2222))
    r = u'[%s]' % u''.join(map(unichr, range(256, 2**16, 255)))
    self.assertEqual(re.match(r, unichr(0xff01), re.UNICODE).group(), unichr(0xff01))
Example #23
Source File: test_re.py From ironpython2 with Apache License 2.0 | 5 votes |
def test_getlower(self):
    import _sre
    self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
    if have_unicode:
        self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))

    self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
Example #24
Source File: base.py From jbox with MIT License | 5 votes |
def __init__(self, length=None, **kw):
    """Construct a TEXT.

    :param length: Optional, if provided the server may optimize storage
      by substituting the smallest TEXT type sufficient to store
      ``length`` characters.

    :param charset: Optional, a column-level character set for this string
      value.  Takes precedence to 'ascii' or 'unicode' short-hand.

    :param collation: Optional, a column-level collation for this string
      value.  Takes precedence to 'binary' short-hand.

    :param ascii: Defaults to False: short-hand for the ``latin1``
      character set, generates ASCII in schema.

    :param unicode: Defaults to False: short-hand for the ``ucs2``
      character set, generates UNICODE in schema.

    :param national: Optional. If true, use the server's configured
      national character set.

    :param binary: Defaults to False: short-hand, pick the binary
      collation type that matches the column's character set.  Generates
      BINARY in schema.  This does not affect the type of data stored,
      only the collation of character data.

    """
    super(TEXT, self).__init__(length=length, **kw)
Example #25
Source File: vqaEval.py From block.bootstrap.pytorch with BSD 3-Clause "New" or "Revised" License | 5 votes |
def processPunctuation(self, inText):
    outText = inText
    for p in self.punct:
        if (p + ' ' in inText or ' ' + p in inText) or (re.search(self.commaStrip, inText) != None):
            outText = outText.replace(p, '')
        else:
            outText = outText.replace(p, ' ')
    # Note: re.UNICODE ends up as the `count` argument of Pattern.sub() here
    # (compiled patterns take no flags argument), so it merely caps the number
    # of substitutions at re.UNICODE == 32.
    outText = self.periodStrip.sub("", outText, re.UNICODE)
    return outText
Example #26
Source File: video_process.py From Auto_Record_Matsuri with MIT License | 5 votes |
def remove_emoji(self):
    emoji_pattern = re.compile(
        u'(\U0001F1F2\U0001F1F4)|'        # Macau flag
        u'([\U0001F1E6-\U0001F1FF]{2})|'  # flags
        u'([\U0001F600-\U0001F64F])'      # emoticons
        "+", flags=re.UNICODE)
    self.filename = emoji_pattern.sub('', self.filename)
Example #27
Source File: helper.py From calibre-web with GNU General Public License v3.0 | 5 votes |
def get_valid_filename(value, replace_whitespace=True):
    """
    Returns the given string converted to a string that can be used for a clean
    filename. Limits num characters to 128 max.
    """
    if value[-1:] == u'.':
        value = value[:-1] + u'_'
    value = value.replace("/", "_").replace(":", "_").strip('\0')
    if use_unidecode:
        value = (unidecode.unidecode(value)).strip()
    else:
        value = value.replace(u'§', u'SS')
        value = value.replace(u'ß', u'ss')
        value = unicodedata.normalize('NFKD', value)
        re_slugify = re.compile(r'[\W\s-]', re.UNICODE)
        if isinstance(value, str):  # Python3 str, Python2 unicode
            value = re_slugify.sub('', value).strip()
        else:
            value = unicode(re_slugify.sub('', value).strip())
    if replace_whitespace:
        # *+:\"/<>? are replaced by _
        value = re.sub(r'[\*\+:\\\"/<>\?]+', u'_', value, flags=re.U)
        # pipe has to be replaced with comma
        value = re.sub(r'[\|]+', u',', value, flags=re.U)
    value = value[:128]
    if not value:
        raise ValueError("Filename cannot be empty")
    if sys.version_info.major == 3:
        return value
    else:
        return value.decode('utf-8')
Example #28
Source File: searchsupport.py From codimension with GNU General Public License v3.0 | 5 votes |
def __fillInMatch(self, match, content, name, lineNumber, customMessage=None):
    """Fills in the match fields from the content"""
    # Form the regexp corresponding to a single word search
    line = content[lineNumber - 1]
    if customMessage:
        match.text = customMessage
    else:
        match.text = line.strip()

    if name:
        regexpText = re.escape(name)
        regexpText = "\\b%s\\b" % regexpText
        flags = re.UNICODE
        searchRegexp = re.compile(regexpText, flags)
        contains = searchRegexp.search(line)
        match.start = contains.start()
        match.finish = contains.end()
    else:
        match.start = 0
        match.finish = len(line)
    match.tooltip = self.__buildTooltip(content, lineNumber - 1, len(content),
                                        match.start, match.finish)
    self.__extractDocstring(content)
Example #29
Source File: base.py From fishroom with GNU General Public License v3.0 | 5 votes |
def match_nickname_content(self, content: str) -> Tuple[str, str]:
    m = re.match(
        r'^\[(?P<nick>.+?)\] (?P<content>.*)', content,
        flags=re.UNICODE,
    )
    return (m.group('nick'), m.group('content')) if m else (None, None)
Example #30
Source File: squad_evaluation.py From FARM with Apache License 2.0 | 5 votes |
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
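An illustrative call, assuming normalize_answer from the example above (the original file imports the string module that remove_punc relies on):

print(normalize_answer("The  Eiffel Tower!"))   # -> 'eiffel tower'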