Python re.UNICODE Examples
The following are 30 code examples of the re.UNICODE flag, taken from real open-source projects.
You can go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the re module.
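Before the individual examples, here is a minimal sketch (not taken from any of the projects below) of what the flag actually changes: under Python 2, re.UNICODE makes \w, \d, \s and the word-boundary escapes follow Unicode character properties instead of ASCII; under Python 3 this is already the default for str patterns, so the flag mostly appears in code that still supports Python 2.

import re

text = u"naïve café 42"

# With re.UNICODE, \w matches accented letters as word characters.
print(re.findall(r"\w+", text, re.UNICODE))   # ['naïve', 'café', '42']

# Under Python 2, the same pattern without the flag splits on the accented
# characters; under Python 3, str patterns behave this way by default.
print(re.findall(r"\w+", text))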
Example #1
Source File: magic_check_fn.py From recipes-py with Apache License 2.0 | 10 votes |
def render_re(regex):
    """Renders a repr()-style value for a compiled regular expression."""
    actual_flags = []
    if regex.flags:
        flags = [
            (re.IGNORECASE, 'IGNORECASE'),
            (re.LOCALE, 'LOCALE'),
            (re.UNICODE, 'UNICODE'),
            (re.MULTILINE, 'MULTILINE'),
            (re.DOTALL, 'DOTALL'),
            (re.VERBOSE, 'VERBOSE'),
        ]
        for val, name in flags:
            if regex.flags & val:
                actual_flags.append(name)
    if actual_flags:
        return 're.compile(%r, %s)' % (regex.pattern, '|'.join(actual_flags))
    else:
        return 're.compile(%r)' % regex.pattern
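A quick usage sketch for render_re (illustrative only, not part of the original file); the exact output depends on which flags the compiled pattern carries:

import re

pat = re.compile(r'\w+', re.IGNORECASE | re.UNICODE)
print(render_re(pat))   # e.g. re.compile('\\w+', IGNORECASE|UNICODE)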
Example #2
Source File: urlresolvers.py From GTDWeb with GNU General Public License v2.0 | 6 votes |
def regex(self):
    """
    Returns a compiled regular expression, depending upon the activated
    language-code.
    """
    language_code = get_language()
    if language_code not in self._regex_dict:
        if isinstance(self._regex, six.string_types):
            regex = self._regex
        else:
            regex = force_text(self._regex)
        try:
            compiled_regex = re.compile(regex, re.UNICODE)
        except re.error as e:
            raise ImproperlyConfigured(
                '"%s" is not a valid regular expression: %s' %
                (regex, six.text_type(e)))
        self._regex_dict[language_code] = compiled_regex
    return self._regex_dict[language_code]
Example #3
Source File: gitdm.py From grimoirelab-sortinghat with GNU General Public License v3.0 | 6 votes |
def __parse_domain_to_employer_line(self, raw_domain, raw_org):
    """Parse domain to employer lines"""

    d = re.match(self.DOMAIN_REGEX, raw_domain, re.UNICODE)
    if not d:
        cause = "invalid domain format: '%s'" % raw_domain
        raise InvalidFormatError(cause=cause)

    dom = d.group('domain').strip()

    o = re.match(self.ORGANIZATION_REGEX, raw_org, re.UNICODE)
    if not o:
        cause = "invalid organization format: '%s'" % raw_org
        raise InvalidFormatError(cause=cause)

    org = o.group('organization').strip()

    org = self.__encode(org)
    dom = self.__encode(dom)

    return org, dom
Example #4
Source File: inlinepatterns.py From lambda-packs with MIT License | 6 votes |
def __init__(self, pattern, markdown_instance=None):
    """Create an instance of an inline pattern.

    Keyword arguments:

    * pattern: A regular expression that matches a pattern

    """
    self.pattern = pattern
    self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
                                  re.DOTALL | re.UNICODE)

    # Api for Markdown to pass safe_mode into instance
    self.safe_mode = False
    if markdown_instance:
        self.markdown = markdown_instance
Example #5
Source File: test_re.py From ironpython2 with Apache License 2.0 | 6 votes |
def test_bug_6561(self):
    # '\d' should match characters in Unicode category 'Nd'
    # (Number, Decimal Digit), but not those in 'Nl' (Number,
    # Letter) or 'No' (Number, Other).
    decimal_digits = [
        unichr(0x0037),  # '\N{DIGIT SEVEN}', category 'Nd'
        unichr(0x0e58),  # '\N{THAI DIGIT SIX}', category 'Nd'
        unichr(0xff10),  # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
        ]
    for x in decimal_digits:
        self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x)

    not_decimal_digits = [
        unichr(0x2165),  # '\N{ROMAN NUMERAL SIX}', category 'Nl'
        unichr(0x3039),  # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
        unichr(0x2082),  # '\N{SUBSCRIPT TWO}', category 'No'
        unichr(0x32b4),  # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
        ]
    for x in not_decimal_digits:
        self.assertIsNone(re.match('^\d$', x, re.UNICODE))
Example #6
Source File: test_re.py From jawfish with MIT License | 6 votes |
def test_ascii_and_unicode_flag(self):
    # String patterns
    for flags in (0, re.UNICODE):
        pat = re.compile('\xc0', flags | re.IGNORECASE)
        self.assertNotEqual(pat.match('\xe0'), None)
        pat = re.compile('\w', flags)
        self.assertNotEqual(pat.match('\xe0'), None)
    pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
    self.assertEqual(pat.match('\xe0'), None)
    pat = re.compile('(?a)\xc0', re.IGNORECASE)
    self.assertEqual(pat.match('\xe0'), None)
    pat = re.compile('\w', re.ASCII)
    self.assertEqual(pat.match('\xe0'), None)
    pat = re.compile('(?a)\w')
    self.assertEqual(pat.match('\xe0'), None)
    # Bytes patterns
    for flags in (0, re.ASCII):
        pat = re.compile(b'\xc0', re.IGNORECASE)
        self.assertEqual(pat.match(b'\xe0'), None)
        pat = re.compile(b'\w')
        self.assertEqual(pat.match(b'\xe0'), None)
    # Incompatibilities
    self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
    self.assertRaises(ValueError, re.compile, b'(?u)\w')
    self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
    self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
    self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
    self.assertRaises(ValueError, re.compile, '(?au)\w')
Example #7
Source File: lexer.py From pyhcl with Mozilla Public License 2.0 | 6 votes |
def __init__(self, export_comments=None):
    if export_comments is not None:
        if export_comments == 'LINE':
            self.can_export_comments = ['COMMENT']
        elif export_comments == 'MULTILINE':
            self.can_export_comments = ['MULTICOMMENT']
        elif export_comments == 'ALL':
            self.can_export_comments = ['COMMENT', 'MULTICOMMENT']
        else:
            raise ValueError(
                'Only `LINE`, `MULTILINE` and `ALL` value are allowed for '
                '`export_comments`. given: `%s`.' % export_comments
            )

    self.lex = lex.lex(
        module=self,
        debug=False,
        reflags=(re.UNICODE | re.MULTILINE),
        errorlog=lex.NullLogger(),
    )
Example #8
Source File: marshal.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def unmarshal(self, filename):
    """
    Unmarshals (loads from a plain text file) the tagger model. For
    safety, this operation is intended to be performed only on newly
    created taggers (i.e., without any previous model).

    @param filename: Name of the file from which the model will be read.
    @type filename: C{string}
    """
    handler = file(filename, "r")

    pattern = re.compile(r'^(.+):(.+?)$', re.UNICODE)
    for line in handler.readlines():
        m = re.match(pattern, line)
        text, tag = m.groups()
        self._model[text] = tag

    handler.close()
Example #9
Source File: marshal.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def unmarshal(self, filename):
    """
    Unmarshals (loads from a plain text file) the tagger model. For
    safety, this operation is intended to be performed only on newly
    created taggers (i.e., without any previous model).

    @param filename: Name of the file from which the model will be read.
    @type filename: C{string}
    """
    handler = file(filename, "r")
    lines = handler.readlines()

    # will fail if "length " and "minlength " are not present
    self._length = int(lines[0].split("length ")[1])
    self._minlength = int(lines[1].split("minlength ")[1])

    pattern = re.compile(r'^(.+):(.+?)$', re.UNICODE)
    for line in lines[2:]:
        m = re.match(pattern, line)
        text, tag = m.groups()
        self._model[text] = tag

    handler.close()
Example #10
Source File: _stdlib.py From bugatsinho.github.io with GNU General Public License v3.0 | 6 votes |
def expandvars(path):
    """
    Args:
        path (pathlike): A path to expand
    Returns:
        `fsnative`

    Like :func:`python:os.path.expandvars` but supports unicode under
    Windows + Python 2 and always returns a `fsnative`.
    """
    path = path2fsn(path)

    def repl_func(match):
        return environ.get(match.group(1), match.group(0))

    path = re.compile(r"\$(\w+)", flags=re.UNICODE).sub(repl_func, path)
    if os.name == "nt":
        path = re.sub(r"%([^%]+)%", repl_func, path)
    return re.sub(r"\$\{([^\}]+)\}", repl_func, path)
Example #11
Source File: base.py From jbox with MIT License | 6 votes |
def __init__(self, **kwargs):
    """Construct a LONGTEXT.

    :param charset: Optional, a column-level character set for this string
      value.  Takes precedence to 'ascii' or 'unicode' short-hand.

    :param collation: Optional, a column-level collation for this string
      value.  Takes precedence to 'binary' short-hand.

    :param ascii: Defaults to False: short-hand for the ``latin1``
      character set, generates ASCII in schema.

    :param unicode: Defaults to False: short-hand for the ``ucs2``
      character set, generates UNICODE in schema.

    :param national: Optional. If true, use the server's configured
      national character set.

    :param binary: Defaults to False: short-hand, pick the binary
      collation type that matches the column's character set.  Generates
      BINARY in schema.  This does not affect the type of data stored,
      only the collation of character data.

    """
    super(LONGTEXT, self).__init__(**kwargs)
Example #12
Source File: base.py From jbox with MIT License | 6 votes |
def __init__(self, **kwargs):
    """Construct a MEDIUMTEXT.

    :param charset: Optional, a column-level character set for this string
      value.  Takes precedence to 'ascii' or 'unicode' short-hand.

    :param collation: Optional, a column-level collation for this string
      value.  Takes precedence to 'binary' short-hand.

    :param ascii: Defaults to False: short-hand for the ``latin1``
      character set, generates ASCII in schema.

    :param unicode: Defaults to False: short-hand for the ``ucs2``
      character set, generates UNICODE in schema.

    :param national: Optional. If true, use the server's configured
      national character set.

    :param binary: Defaults to False: short-hand, pick the binary
      collation type that matches the column's character set.  Generates
      BINARY in schema.  This does not affect the type of data stored,
      only the collation of character data.

    """
    super(MEDIUMTEXT, self).__init__(**kwargs)
Example #13
Source File: base.py From jbox with MIT License | 6 votes |
def __init__(self, **kwargs):
    """Construct a TINYTEXT.

    :param charset: Optional, a column-level character set for this string
      value.  Takes precedence to 'ascii' or 'unicode' short-hand.

    :param collation: Optional, a column-level collation for this string
      value.  Takes precedence to 'binary' short-hand.

    :param ascii: Defaults to False: short-hand for the ``latin1``
      character set, generates ASCII in schema.

    :param unicode: Defaults to False: short-hand for the ``ucs2``
      character set, generates UNICODE in schema.

    :param national: Optional. If true, use the server's configured
      national character set.

    :param binary: Defaults to False: short-hand, pick the binary
      collation type that matches the column's character set.  Generates
      BINARY in schema.  This does not affect the type of data stored,
      only the collation of character data.

    """
    super(TINYTEXT, self).__init__(**kwargs)
Example #14
Source File: regex.py From recruit with Apache License 2.0 | 6 votes |
def str_flags_to_int(str_flags):
    flags = 0
    if "i" in str_flags:
        flags |= re.IGNORECASE
    if "l" in str_flags:
        flags |= re.LOCALE
    if "m" in str_flags:
        flags |= re.MULTILINE
    if "s" in str_flags:
        flags |= re.DOTALL
    if "u" in str_flags:
        flags |= re.UNICODE
    if "x" in str_flags:
        flags |= re.VERBOSE

    return flags
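A hedged usage sketch, assuming str_flags_to_int from the example above; each letter maps to the corresponding inline-flag character of Python's regex syntax:

import re

flags = str_flags_to_int("iu")
assert flags == (re.IGNORECASE | re.UNICODE)
pattern = re.compile(r"\w+", flags)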
Example #15
Source File: hnd_ft.py From DeepLearn with MIT License | 5 votes |
def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()
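An illustrative call, assuming clean from the example above; punctuation disappears because \w+ only captures word characters:

print(clean("Hello, World!  Foo_bar 42"))   # -> 'hello world foo_bar 42'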
Example #16
Source File: alinea_lexer.py From DuraLex with GNU Affero General Public License v3.0 | 5 votes |
def skip_to_next_word(tokens, i):
    return skip_tokens(tokens, i, lambda t: not re.compile('[\wà]+', re.IGNORECASE | re.UNICODE).match(t))
Example #17
Source File: utils.py From ivre with GNU General Public License v3.0 | 5 votes |
def regexp2pattern(string):
    """This function takes a regexp or a string and returns a pattern and
    some flags, suitable for use with re.compile(), combined with another
    pattern before. Useful, for example, if you want to create a regexp
    like '^ *Set-Cookie: *[name]=[value]' where name and value are
    regexp.

    """
    if isinstance(string, REGEXP_T):
        flags = string.flags
        string = string.pattern
        patterns = (('^', '$', '.*')
                    if isinstance(string, str) else
                    (b'^', b'$', b'.*'))
        if string.startswith(patterns[0]):
            string = string[1:]
        # elif string.startswith('('):
        #     raise ValueError("Regexp starting with a group are not "
        #                      "(yet) supported")
        else:
            string = patterns[2] + string
        if string.endswith(patterns[1]):
            string = string[:-1]
        # elif string.endswith(')'):
        #     raise ValueError("Regexp ending with a group are not "
        #                      "(yet) supported")
        else:
            string += patterns[2]
        return string, flags
    return re.escape(string), re.UNICODE if isinstance(string, str) else 0
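A minimal sketch of how the returned pattern and flags can be embedded into a larger expression, following the docstring's Set-Cookie example. It assumes regexp2pattern and the REGEXP_T constant from the surrounding ivre module; the name and value regexps are made up for illustration:

import re

name_pat, name_flags = regexp2pattern(re.compile('^sessionid$'))
value_pat, value_flags = regexp2pattern(re.compile('^[0-9a-f]+$'))

# The anchors have been stripped, so the fragments compose cleanly.
cookie_re = re.compile('^ *Set-Cookie: *%s=%s' % (name_pat, value_pat),
                       name_flags | value_flags)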
Example #18
Source File: base.py From jbox with MIT License | 5 votes |
def _re_compile(regex):
    """Compile a string to regex, I and UNICODE."""

    return re.compile(regex, re.I | re.UNICODE)
Example #19
Source File: base.py From jbox with MIT License | 5 votes |
def _extend_string(self, type_, defaults, spec):
    """Extend a string-type declaration with standard SQL CHARACTER SET /
    COLLATE annotations and MySQL specific extensions.

    """

    def attr(name):
        return getattr(type_, name, defaults.get(name))

    if attr('charset'):
        charset = 'CHARACTER SET %s' % attr('charset')
    elif attr('ascii'):
        charset = 'ASCII'
    elif attr('unicode'):
        charset = 'UNICODE'
    else:
        charset = None

    if attr('collation'):
        collation = 'COLLATE %s' % type_.collation
    elif attr('binary'):
        collation = 'BINARY'
    else:
        collation = None

    if attr('national'):
        # NATIONAL (aka NCHAR/NVARCHAR) trumps charsets.
        return ' '.join([c for c in ('NATIONAL', spec, collation)
                         if c is not None])
    return ' '.join([c for c in (spec, charset, collation)
                     if c is not None])
Example #20
Source File: helpers.py From poetry with MIT License | 5 votes |
def escape_name(name):
    """Escaped wheel name as specified in :pep:`427#escaping-and-unicode`."""
    return re.sub(r"[^\w\d.]+", "_", name, flags=re.UNICODE)
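An illustrative call, assuming escape_name from the example above; any run of characters outside the allowed set collapses to a single underscore:

print(escape_name("my-package.name"))   # -> 'my_package.name'
print(escape_name("poetry core"))       # -> 'poetry_core'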
Example #21
Source File: test_re.py From ironpython2 with Apache License 2.0 | 5 votes |
def test_special_escapes(self):
    self.assertEqual(re.search(r"\b(b.)\b", "abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B", "abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"\b(b.)\b", "abcd abc bcd bx", re.LOCALE).group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B", "abc bcd bc abxd", re.LOCALE).group(1), "bx")
    if have_unicode:
        self.assertEqual(re.search(r"\b(b.)\b", "abcd abc bcd bx", re.UNICODE).group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B", "abc bcd bc abxd", re.UNICODE).group(1), "bx")
    self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
    self.assertEqual(re.search(r"\b(b.)\b", u"abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B", u"abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M))
    self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a").group(0), "1aa! a")
    self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a", re.LOCALE).group(0), "1aa! a")
    if have_unicode:
        self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a", re.UNICODE).group(0), "1aa! a")
Example #22
Source File: test_re.py From ironpython2 with Apache License 2.0 | 5 votes |
def test_bigcharset(self):
    self.assertEqual(re.match(u(r"([\u2222\u2223])"),
                              unichr(0x2222)).group(1), unichr(0x2222))
    self.assertEqual(re.match(u(r"([\u2222\u2223])"),
                              unichr(0x2222), re.UNICODE).group(1), unichr(0x2222))
    r = u'[%s]' % u''.join(map(unichr, range(256, 2**16, 255)))
    self.assertEqual(re.match(r, unichr(0xff01), re.UNICODE).group(), unichr(0xff01))
Example #23
Source File: test_re.py From ironpython2 with Apache License 2.0 | 5 votes |
def test_getlower(self):
    import _sre
    self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
    if have_unicode:
        self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))

    self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
Example #24
Source File: base.py From jbox with MIT License | 5 votes |
def __init__(self, length=None, **kw):
    """Construct a TEXT.

    :param length: Optional, if provided the server may optimize storage
      by substituting the smallest TEXT type sufficient to store
      ``length`` characters.

    :param charset: Optional, a column-level character set for this string
      value.  Takes precedence to 'ascii' or 'unicode' short-hand.

    :param collation: Optional, a column-level collation for this string
      value.  Takes precedence to 'binary' short-hand.

    :param ascii: Defaults to False: short-hand for the ``latin1``
      character set, generates ASCII in schema.

    :param unicode: Defaults to False: short-hand for the ``ucs2``
      character set, generates UNICODE in schema.

    :param national: Optional. If true, use the server's configured
      national character set.

    :param binary: Defaults to False: short-hand, pick the binary
      collation type that matches the column's character set.  Generates
      BINARY in schema.  This does not affect the type of data stored,
      only the collation of character data.

    """
    super(TEXT, self).__init__(length=length, **kw)
Example #25
Source File: vqaEval.py From block.bootstrap.pytorch with BSD 3-Clause "New" or "Revised" License | 5 votes |
def processPunctuation(self, inText):
    outText = inText
    for p in self.punct:
        if (p + ' ' in inText or ' ' + p in inText) or (re.search(self.commaStrip, inText) != None):
            outText = outText.replace(p, '')
        else:
            outText = outText.replace(p, ' ')
    # Note: re.UNICODE ends up as the `count` argument of Pattern.sub() here
    # (compiled patterns take no flags argument), so it merely caps the number
    # of substitutions at re.UNICODE == 32.
    outText = self.periodStrip.sub("", outText, re.UNICODE)
    return outText
Example #26
Source File: video_process.py From Auto_Record_Matsuri with MIT License | 5 votes |
def remove_emoji(self):
    emoji_pattern = re.compile(
        u'(\U0001F1F2\U0001F1F4)|'        # Macau flag
        u'([\U0001F1E6-\U0001F1FF]{2})|'  # flags
        u'([\U0001F600-\U0001F64F])'      # emoticons
        "+", flags=re.UNICODE)
    self.filename = emoji_pattern.sub('', self.filename)
Example #27
Source File: helper.py From calibre-web with GNU General Public License v3.0 | 5 votes |
def get_valid_filename(value, replace_whitespace=True):
    """
    Returns the given string converted to a string that can be used for a clean
    filename. Limits num characters to 128 max.
    """
    if value[-1:] == u'.':
        value = value[:-1] + u'_'
    value = value.replace("/", "_").replace(":", "_").strip('\0')
    if use_unidecode:
        value = (unidecode.unidecode(value)).strip()
    else:
        value = value.replace(u'§', u'SS')
        value = value.replace(u'ß', u'ss')
        value = unicodedata.normalize('NFKD', value)
        re_slugify = re.compile(r'[\W\s-]', re.UNICODE)
        if isinstance(value, str):  # Python3 str, Python2 unicode
            value = re_slugify.sub('', value).strip()
        else:
            value = unicode(re_slugify.sub('', value).strip())
    if replace_whitespace:
        # *+:\"/<>? are replaced by _
        value = re.sub(r'[\*\+:\\\"/<>\?]+', u'_', value, flags=re.U)
        # pipe has to be replaced with comma
        value = re.sub(r'[\|]+', u',', value, flags=re.U)
    value = value[:128]
    if not value:
        raise ValueError("Filename cannot be empty")
    if sys.version_info.major == 3:
        return value
    else:
        return value.decode('utf-8')
Example #28
Source File: searchsupport.py From codimension with GNU General Public License v3.0 | 5 votes |
def __fillInMatch(self, match, content, name, lineNumber, customMessage=None):
    """Fills in the match fields from the content"""
    # Form the regexp corresponding to a single word search
    line = content[lineNumber - 1]
    if customMessage:
        match.text = customMessage
    else:
        match.text = line.strip()

    if name:
        regexpText = re.escape(name)
        regexpText = "\\b%s\\b" % regexpText
        flags = re.UNICODE
        searchRegexp = re.compile(regexpText, flags)
        contains = searchRegexp.search(line)
        match.start = contains.start()
        match.finish = contains.end()
    else:
        match.start = 0
        match.finish = len(line)
    match.tooltip = self.__buildTooltip(content, lineNumber - 1, len(content),
                                        match.start, match.finish)
    self.__extractDocstring(content)
Example #29
Source File: base.py From fishroom with GNU General Public License v3.0 | 5 votes |
def match_nickname_content(self, content: str) -> Tuple[str, str]:
    m = re.match(
        r'^\[(?P<nick>.+?)\] (?P<content>.*)', content,
        flags=re.UNICODE,
    )
    return (m.group('nick'), m.group('content')) if m else (None, None)
Example #30
Source File: squad_evaluation.py From FARM with Apache License 2.0 | 5 votes |
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
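An illustrative call, assuming normalize_answer from the example above (the original file imports the string module that remove_punc relies on):

print(normalize_answer("The  Eiffel Tower!"))   # -> 'eiffel tower'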