Python unicodedata.decomposition() Examples

The following are 7 code examples of unicodedata.decomposition(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the unicodedata module, or try the search function.
Example #1
Source File: test_regressions.py    From ironpython2 with Apache License 2.0 6 votes vote down vote up
def test_ipy2_gh357(self):
        """https://github.com/IronLanguages/ironpython2/issues/357"""

        import unicodedata

        ch = u'\u4e2d'

        # IronPython reports the Unihan range label for CJK ideographs,
        # CPython derives a per-code-point name.
        if is_cli:
            self.assertEqual(unicodedata.name(ch), '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>')
        else:
            self.assertEqual(unicodedata.name(ch), 'CJK UNIFIED IDEOGRAPH-4E2D')

        # U+4E2D has no numeric value of any kind: each single-argument
        # lookup raises ValueError, and each two-argument form returns
        # the supplied default instead.
        for lookup in (unicodedata.decimal, unicodedata.digit, unicodedata.numeric):
            self.assertRaises(ValueError, lookup, ch)
            self.assertEqual(lookup(ch, 0), 0)

        # Remaining character properties of U+4E2D.
        self.assertEqual(unicodedata.category(ch), 'Lo')
        self.assertEqual(unicodedata.bidirectional(ch), 'L')
        self.assertEqual(unicodedata.combining(ch), 0)
        self.assertEqual(unicodedata.east_asian_width(ch), 'W')
        self.assertEqual(unicodedata.mirrored(ch), 0)
        self.assertEqual(unicodedata.decomposition(ch), '')
Example #2
Source File: test_regressions.py    From ironpython3 with Apache License 2.0 6 votes vote down vote up
def test_ipy2_gh357(self):
        """https://github.com/IronLanguages/ironpython2/issues/357"""

        import unicodedata

        zhong = u'\u4e2d'

        # name() differs by implementation: IronPython returns the
        # Unihan range label, CPython the generated per-character name.
        expected_name = ('<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>'
                         if is_cli else 'CJK UNIFIED IDEOGRAPH-4E2D')
        self.assertEqual(unicodedata.name(zhong), expected_name)

        # No decimal/digit/numeric value exists for this ideograph, so
        # every lookup raises without a default and echoes one when given.
        for numeric_lookup in (unicodedata.decimal, unicodedata.digit,
                               unicodedata.numeric):
            self.assertRaises(ValueError, numeric_lookup, zhong)
            self.assertEqual(numeric_lookup(zhong, 0), 0)

        # Character classification properties of U+4E2D.
        for prop, expected in ((unicodedata.category, 'Lo'),
                               (unicodedata.bidirectional, 'L'),
                               (unicodedata.combining, 0),
                               (unicodedata.east_asian_width, 'W'),
                               (unicodedata.mirrored, 0),
                               (unicodedata.decomposition, '')):
            self.assertEqual(prop(zhong), expected)
Example #3
Source File: test_urlparse.py    From android_universal with MIT License 6 votes vote down vote up
def test_urlsplit_normalization(self):
        # Characters that must never appear in a netloc, even when they
        # only surface after NFKC normalization; urlsplit has to detect
        # every one of them and raise.
        illegal_chars = '/:#?@'
        illegal_hex = {'{:04X}'.format(ord(ch)) for ch in illegal_chars}

        def decomposes_to_illegal(ch):
            # Decomposition mappings are space-separated hex code points.
            fields = set(unicodedata.decomposition(ch).split())
            return bool(illegal_hex & fields) and ch not in illegal_chars

        denorm_chars = [ch for ch in map(chr, range(128, sys.maxunicode))
                        if decomposes_to_illegal(ch)]

        # Sanity check that we found at least one such character.
        self.assertIn('\u2100', denorm_chars)
        self.assertIn('\uFF03', denorm_chars)

        for scheme in ("http", "https", "ftp"):
            for ch in denorm_chars:
                url = "{}://netloc{}false.netloc/path".format(scheme, ch)
                with self.subTest(url=url, char='{:04X}'.format(ord(ch))):
                    with self.assertRaises(ValueError):
                        urllib.parse.urlsplit(url)
Example #4
Source File: test_urlparse.py    From ironpython3 with Apache License 2.0 5 votes vote down vote up
def test_urlsplit_normalization(self):
        # Characters that must never appear in a netloc, even when they
        # only surface after NFKC normalization; urlsplit has to detect
        # every one of them and raise.
        illegal_chars = '/:#?@'
        illegal_hex = {'{:04X}'.format(ord(ch)) for ch in illegal_chars}

        # IronPython cannot map code points above the BMP, so cap the
        # scan there: https://github.com/IronLanguages/ironpython3/issues/252
        maxunicode = 0xffff if sys.implementation.name == "ironpython" else sys.maxunicode

        def decomposes_to_illegal(ch):
            # Decomposition mappings are space-separated hex code points.
            fields = set(unicodedata.decomposition(ch).split())
            return bool(illegal_hex & fields) and ch not in illegal_chars

        denorm_chars = [ch for ch in map(chr, range(128, maxunicode))
                        if decomposes_to_illegal(ch)]

        # Sanity check that we found at least one such character.
        self.assertIn('\u2100', denorm_chars)
        self.assertIn('\uFF03', denorm_chars)

        # Mono does not reject these characters, see
        # https://github.com/IronLanguages/ironpython3/issues/614
        is_mono = False
        mono_issue_chars = ("\ufe13", "\ufe16", "\ufe5f")
        if sys.implementation.name == "ironpython":
            import clr
            is_mono = clr.IsMono

        for scheme in ("http", "https", "ftp"):
            for ch in denorm_chars:
                url = "{}://netloc{}false.netloc/path".format(scheme, ch)
                with self.subTest(url=url, char='{:04X}'.format(ord(ch))):
                    if is_mono and ch in mono_issue_chars:
                        # Known-broken on Mono: urlsplit must currently
                        # succeed; this fails if that ever gets fixed.
                        urllib.parse.urlsplit(url)
                        continue
                    with self.assertRaises(ValueError):
                        urllib.parse.urlsplit(url)
Example #5
Source File: normalize_anth.py    From acl-anthology with Apache License 2.0 5 votes vote down vote up
def clean_unicode(s):
    """Return *s* with common Unicode encoding artifacts normalized.

    - removes soft hyphens (U+00AD) and replaces U+2010 HYPHEN with
      the ASCII hyphen-minus
    - rewrites dotless i (U+0131) followed by a combining mark *above*
      as a regular 'i' (some sources encode an accented i this way)
    - selectively applies compatibility decomposition for a small set
      of tags, so e.g. the "fi" ligature becomes "fi" while characters
      like superscript "2" are left untouched
    - recomposes combining sequences via NFC where possible
    """
    s = s.replace("\u00ad", "")  # soft hyphen
    s = s.replace("\u2010", "-")  # hyphen

    # Some sources encode an i with an accent above using dotless i,
    # which must be converted to normal i.  Only marks rendered above
    # the base imply a dotted i, so require canonical combining class
    # 230 (Above); marks below, e.g. U+0323 COMBINING DOT BELOW
    # (class 220), leave the dotless i intact.
    chars = list(s)
    for i in range(len(chars) - 1):
        mark = chars[i + 1]
        if (chars[i] == "ı"
                and unicodedata.category(mark) == "Mn"
                and unicodedata.combining(mark) == 230):
            chars[i] = "i"
    s = "".join(chars)

    # Selectively apply compatibility decomposition.
    # This converts, e.g., fi to fi and : to :, but not ² to 2.
    # Unsure: … to ...
    # More classes could be added here.
    def decompose(c):
        d = unicodedata.decomposition(c)
        if d and d.split(None, 1)[0] in ["<compat>", "<wide>", "<narrow>", "<noBreak>"]:
            return unicodedata.normalize("NFKD", c)
        else:
            return c

    s = "".join(map(decompose, s))

    # Convert combining characters when possible
    s = unicodedata.normalize("NFC", s)

    return s
Example #6
Source File: text_07.py    From Modern-Python-Standard-Library-Cookbook with MIT License 5 votes vote down vote up
def __missing__(self, key):
        ch = self.get(key)
        if ch is not None:
            return ch
        de = unicodedata.decomposition(chr(key))
        if de:
            try:
                ch = int(de.split(None, 1)[0], 16)
            except (IndexError, ValueError):
                ch = key
        else:
            ch = key
        self[key] = ch
        return ch 
Example #7
Source File: test_urlparse.py    From ironpython2 with Apache License 2.0 4 votes vote down vote up
def test_urlsplit_normalization(self):
        # Certain characters should never occur in the netloc,
        # including under normalization.
        # Ensure that ALL of them are detected and cause an error
        illegal_chars = u'/:#?@'
        hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
        # Every non-ASCII character whose Unicode decomposition mapping
        # mentions one of the illegal characters (decompositions are
        # space-separated 4-digit hex code points, e.g. '0023').
        denorm_chars = [
            c for c in map(unichr, range(128, sys.maxunicode))
            if (hex_chars & set(unicodedata.decomposition(c).split()))
            and c not in illegal_chars
        ]
        # Sanity check that we found at least one such character
        self.assertIn(u'\u2100', denorm_chars)
        self.assertIn(u'\uFF03', denorm_chars)

        # https://github.com/IronLanguages/ironpython3/issues/614
        # On Mono, urlsplit fails to reject these three characters, so
        # they are special-cased throughout the rest of the test.
        is_mono = False
        mono_issue_chars = (u"\ufe13", u"\ufe16", u"\ufe5f")
        if sys.platform == "cli":
            import clr
            is_mono = clr.IsMono

        # bpo-36742: Verify port separators are ignored when they
        # existed prior to decomposition
        urlparse.urlsplit(u'http://\u30d5\u309a:80')
        with self.assertRaises(ValueError):
            urlparse.urlsplit(u'http://\u30d5\u309a\ufe1380')
            # On Mono the call above does NOT raise (issue 614 above),
            # so raise manually to satisfy assertRaises; on other
            # platforms this line is never reached.
            if is_mono: raise ValueError
        # Re-run outside assertRaises on Mono so the suite starts
        # failing if the Mono bug is ever fixed.
        if is_mono: urlparse.urlsplit(u'http://\u30d5\u309a\ufe1380') # ensure we fail if this ever gets fixed

        for scheme in [u"http", u"https", u"ftp"]:
            for netloc in [u"netloc{}false.netloc", u"n{}user@netloc"]:
                for c in denorm_chars:
                    url = u"{}://{}/path".format(scheme, netloc.format(c))
                    if test_support.verbose:
                        print "Checking %r" % url
                    if is_mono and c in mono_issue_chars:
                        urlparse.urlsplit(url) # ensure we fail if this ever gets fixed
                        continue
                    with self.assertRaises(ValueError):
                        urlparse.urlsplit(url)

        # check error message: invalid netloc must be formated with repr()
        # to get an ASCII error message
        with self.assertRaises(ValueError) as cm:
            urlparse.urlsplit(u'http://example.com\uFF03@bing.com')
        self.assertEqual(str(cm.exception),
                         "netloc u'example.com\\uff03@bing.com' contains invalid characters "
                         "under NFKC normalization")
        self.assertIsInstance(cm.exception.args[0], str)