Python unicodedata.decomposition() Examples
The following are 7 code examples of unicodedata.decomposition(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module unicodedata, or try the search function.
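For reference, unicodedata.decomposition(chr) returns the character's decomposition mapping from the Unicode Character Database as a space-separated string of hexadecimal code points, optionally preceded by a tag such as <compat> or <wide>, and returns an empty string when the character has no decomposition. A minimal illustration (not taken from any of the projects below):

import unicodedata

# Canonical decomposition: LATIN SMALL LETTER E WITH ACUTE -> 'e' + combining acute accent
print(unicodedata.decomposition('\u00e9'))   # '0065 0301'

# Compatibility decompositions carry a tag, e.g. the 'fi' ligature
print(unicodedata.decomposition('\ufb01'))   # '<compat> 0066 0069'

# Characters without a decomposition yield an empty string
print(unicodedata.decomposition('\u4e2d'))   # ''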
Example #1
Source File: test_regressions.py From ironpython2 with Apache License 2.0 | 6 votes |
def test_ipy2_gh357(self):
    """https://github.com/IronLanguages/ironpython2/issues/357"""
    import unicodedata

    if is_cli:
        self.assertEqual(unicodedata.name(u'\u4e2d'), '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>')
    else:
        self.assertEqual(unicodedata.name(u'\u4e2d'), 'CJK UNIFIED IDEOGRAPH-4E2D')

    self.assertRaises(ValueError, unicodedata.decimal, u'\u4e2d')
    self.assertEqual(unicodedata.decimal(u'\u4e2d', 0), 0)
    self.assertRaises(ValueError, unicodedata.digit, u'\u4e2d')
    self.assertEqual(unicodedata.digit(u'\u4e2d', 0), 0)
    self.assertRaises(ValueError, unicodedata.numeric, u'\u4e2d')
    self.assertEqual(unicodedata.numeric(u'\u4e2d', 0), 0)
    self.assertEqual(unicodedata.category(u'\u4e2d'), 'Lo')
    self.assertEqual(unicodedata.bidirectional(u'\u4e2d'), 'L')
    self.assertEqual(unicodedata.combining(u'\u4e2d'), 0)
    self.assertEqual(unicodedata.east_asian_width(u'\u4e2d'), 'W')
    self.assertEqual(unicodedata.mirrored(u'\u4e2d'), 0)
    self.assertEqual(unicodedata.decomposition(u'\u4e2d'), '')
Example #2
Source File: test_regressions.py From ironpython3 with Apache License 2.0 | 6 votes |
def test_ipy2_gh357(self):
    """https://github.com/IronLanguages/ironpython2/issues/357"""
    import unicodedata

    if is_cli:
        self.assertEqual(unicodedata.name(u'\u4e2d'), '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>')
    else:
        self.assertEqual(unicodedata.name(u'\u4e2d'), 'CJK UNIFIED IDEOGRAPH-4E2D')

    self.assertRaises(ValueError, unicodedata.decimal, u'\u4e2d')
    self.assertEqual(unicodedata.decimal(u'\u4e2d', 0), 0)
    self.assertRaises(ValueError, unicodedata.digit, u'\u4e2d')
    self.assertEqual(unicodedata.digit(u'\u4e2d', 0), 0)
    self.assertRaises(ValueError, unicodedata.numeric, u'\u4e2d')
    self.assertEqual(unicodedata.numeric(u'\u4e2d', 0), 0)
    self.assertEqual(unicodedata.category(u'\u4e2d'), 'Lo')
    self.assertEqual(unicodedata.bidirectional(u'\u4e2d'), 'L')
    self.assertEqual(unicodedata.combining(u'\u4e2d'), 0)
    self.assertEqual(unicodedata.east_asian_width(u'\u4e2d'), 'W')
    self.assertEqual(unicodedata.mirrored(u'\u4e2d'), 0)
    self.assertEqual(unicodedata.decomposition(u'\u4e2d'), '')
Example #3
Source File: test_urlparse.py From android_universal with MIT License | 6 votes |
def test_urlsplit_normalization(self):
    # Certain characters should never occur in the netloc,
    # including under normalization.
    # Ensure that ALL of them are detected and cause an error
    illegal_chars = '/:#?@'
    hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
    denorm_chars = [
        c for c in map(chr, range(128, sys.maxunicode))
        if (hex_chars & set(unicodedata.decomposition(c).split()))
        and c not in illegal_chars
    ]
    # Sanity check that we found at least one such character
    self.assertIn('\u2100', denorm_chars)
    self.assertIn('\uFF03', denorm_chars)

    for scheme in ["http", "https", "ftp"]:
        for c in denorm_chars:
            url = "{}://netloc{}false.netloc/path".format(scheme, c)
            with self.subTest(url=url, char='{:04X}'.format(ord(c))):
                with self.assertRaises(ValueError):
                    urllib.parse.urlsplit(url)
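The test works because unicodedata.decomposition() reports the decomposition as hexadecimal code points: intersecting those tokens with the hex values of the illegal netloc characters finds every character that could normalize into one of them under NFKC. A small standalone illustration of that filter (not part of the test file):

import unicodedata

illegal_chars = '/:#?@'
hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}

# U+FF03 FULLWIDTH NUMBER SIGN decomposes to '#' (U+0023), an illegal netloc character
print(unicodedata.decomposition('\uFF03'))    # '<wide> 0023'

# U+2100 ACCOUNT OF decomposes to 'a/c', which contains '/' (U+002F)
print(unicodedata.decomposition('\u2100'))    # '<compat> 0061 002F 0063'

# The set intersection used by the test flags both characters
for c in ('\uFF03', '\u2100'):
    print(bool(hex_chars & set(unicodedata.decomposition(c).split())))   # True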
Example #4
Source File: test_urlparse.py From ironpython3 with Apache License 2.0 | 5 votes |
def test_urlsplit_normalization(self):
    # Certain characters should never occur in the netloc,
    # including under normalization.
    # Ensure that ALL of them are detected and cause an error
    illegal_chars = '/:#?@'
    hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
    # https://github.com/IronLanguages/ironpython3/issues/252
    maxunicode = 0xffff if sys.implementation.name == "ironpython" else sys.maxunicode
    denorm_chars = [
        c for c in map(chr, range(128, maxunicode))
        if (hex_chars & set(unicodedata.decomposition(c).split()))
        and c not in illegal_chars
    ]
    # Sanity check that we found at least one such character
    self.assertIn('\u2100', denorm_chars)
    self.assertIn('\uFF03', denorm_chars)

    # https://github.com/IronLanguages/ironpython3/issues/614
    is_mono = False
    mono_issue_chars = ("\ufe13", "\ufe16", "\ufe5f")
    if sys.implementation.name == "ironpython":
        import clr
        is_mono = clr.IsMono

    for scheme in ["http", "https", "ftp"]:
        for c in denorm_chars:
            url = "{}://netloc{}false.netloc/path".format(scheme, c)
            with self.subTest(url=url, char='{:04X}'.format(ord(c))):
                if is_mono and c in mono_issue_chars:
                    urllib.parse.urlsplit(url) # ensure we fail if this ever gets fixed
                    continue
                with self.assertRaises(ValueError):
                    urllib.parse.urlsplit(url)
Example #5
Source File: normalize_anth.py From acl-anthology with Apache License 2.0 | 5 votes |
def clean_unicode(s):
    s = s.replace("\u00ad", "")   # soft hyphen
    s = s.replace("\u2010", "-")  # hyphen

    # Some sources encode an i with an accent above using dotless i,
    # which must be converted to normal i
    s = list(s)
    for i in range(len(s) - 1):
        # bug: we should only be looking for accents above, not
        # below
        if s[i] == "ı" and unicodedata.category(s[i + 1]) == "Mn":
            s[i] = "i"
    s = "".join(s)

    # Selectively apply compatibility decomposition.
    # This converts, e.g., ﬁ to fi and ： to :, but not ² to 2.
    # Unsure: … to ...
    # More classes could be added here.
    def decompose(c):
        d = unicodedata.decomposition(c)
        if d and d.split(None, 1)[0] in ["<compat>", "<wide>", "<narrow>", "<noBreak>"]:
            return unicodedata.normalize("NFKD", c)
        else:
            return c
    s = "".join(map(decompose, s))

    # Convert combining characters when possible
    s = unicodedata.normalize("NFC", s)

    return s
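Assuming unicodedata has been imported at module level (as it is in the original normalize_anth.py), the function can be exercised as follows; the inputs are illustrative rather than taken from the Anthology data:

import unicodedata  # clean_unicode relies on a module-level import

print(clean_unicode("ﬁnding"))            # the 'fi' ligature is a <compat> decomposition -> 'finding'
print(clean_unicode("soft\u00adhyphen"))  # the soft hyphen is removed -> 'softhyphen'
print(clean_unicode("e\u0301clair"))      # combining acute recomposes under NFC -> 'éclair'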
Example #6
Source File: text_07.py From Modern-Python-Standard-Library-Cookbook with MIT License | 5 votes |
def __missing__(self, key):
    ch = self.get(key)
    if ch is not None:
        return ch

    de = unicodedata.decomposition(chr(key))
    if de:
        try:
            ch = int(de.split(None, 1)[0], 16)
        except (IndexError, ValueError):
            ch = key
    else:
        ch = key

    self[key] = ch
    return ch
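Only the __missing__ method is shown above. A plausible way to use it, assuming it belongs to a dict subclass handed to str.translate() (the surrounding class is not part of the snippet, so the class name below is hypothetical):

import unicodedata

class BaseCharTable(dict):
    # Hypothetical wrapper: maps a code point to the first code point of its decomposition.
    def __missing__(self, key):
        ch = self.get(key)
        if ch is not None:
            return ch
        de = unicodedata.decomposition(chr(key))
        if de:
            try:
                # the first field of a canonical decomposition is the base character
                ch = int(de.split(None, 1)[0], 16)
            except (IndexError, ValueError):
                ch = key
        else:
            ch = key
        self[key] = ch
        return ch

# str.translate() looks up each code point in the mapping; missing keys trigger __missing__,
# so accented characters fall back to their unaccented base character.
print("cliché café".translate(BaseCharTable()))   # 'cliche cafe'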
Example #7
Source File: test_urlparse.py From ironpython2 with Apache License 2.0 | 4 votes |
def test_urlsplit_normalization(self):
    # Certain characters should never occur in the netloc,
    # including under normalization.
    # Ensure that ALL of them are detected and cause an error
    illegal_chars = u'/:#?@'
    hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
    denorm_chars = [
        c for c in map(unichr, range(128, sys.maxunicode))
        if (hex_chars & set(unicodedata.decomposition(c).split()))
        and c not in illegal_chars
    ]
    # Sanity check that we found at least one such character
    self.assertIn(u'\u2100', denorm_chars)
    self.assertIn(u'\uFF03', denorm_chars)

    # https://github.com/IronLanguages/ironpython3/issues/614
    is_mono = False
    mono_issue_chars = (u"\ufe13", u"\ufe16", u"\ufe5f")
    if sys.platform == "cli":
        import clr
        is_mono = clr.IsMono

    # bpo-36742: Verify port separators are ignored when they
    # existed prior to decomposition
    urlparse.urlsplit(u'http://\u30d5\u309a:80')
    with self.assertRaises(ValueError):
        urlparse.urlsplit(u'http://\u30d5\u309a\ufe1380')
        if is_mono: raise ValueError
    if is_mono:
        urlparse.urlsplit(u'http://\u30d5\u309a\ufe1380') # ensure we fail if this ever gets fixed

    for scheme in [u"http", u"https", u"ftp"]:
        for netloc in [u"netloc{}false.netloc", u"n{}user@netloc"]:
            for c in denorm_chars:
                url = u"{}://{}/path".format(scheme, netloc.format(c))
                if test_support.verbose:
                    print "Checking %r" % url
                if is_mono and c in mono_issue_chars:
                    urlparse.urlsplit(url) # ensure we fail if this ever gets fixed
                    continue
                with self.assertRaises(ValueError):
                    urlparse.urlsplit(url)

    # check error message: invalid netloc must be formated with repr()
    # to get an ASCII error message
    with self.assertRaises(ValueError) as cm:
        urlparse.urlsplit(u'http://example.com\uFF03@bing.com')
    self.assertEqual(str(cm.exception),
                     "netloc u'example.com\\uff03@bing.com' contains invalid characters "
                     "under NFKC normalization")
    self.assertIsInstance(cm.exception.args[0], str)