Python unicodedata.combining() Examples
The following are 30 code examples of unicodedata.combining().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module
unicodedata, or try the search function.
Example #1
Source File: text.py From bioforum with MIT License | 6 votes |
def _text_chars(self, length, truncate, text, truncate_len):
    """Truncate a string after a certain number of chars."""
    visible = 0      # characters counted so far, excluding combining marks
    cut_at = None    # string index where a truncated result would stop
    for pos, ch in enumerate(text):
        # Combining characters occupy no column, so they are free.
        if unicodedata.combining(ch):
            continue
        visible += 1
        if cut_at is None and visible > truncate_len:
            cut_at = pos
        if visible > length:
            # Over budget: cut and append the truncation suffix.
            return self.add_truncation_text(text[:cut_at or 0], truncate)
    # The whole text fits; return it untouched.
    return text
Example #2
Source File: STFIWF.py From 2016CCF-sougou with Apache License 2.0 | 6 votes |
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    # NOTE: the old early return (`if normalized == s: return s`) skipped
    # stripping whenever the input was *already* in NFKD form -- e.g.
    # 'e' + U+0301 combining acute -- leaving its accents in place.
    # Always filter out combining marks instead.
    normalized = unicodedata.normalize('NFKD', s)
    return ''.join(c for c in normalized if not unicodedata.combining(c))
Example #3
Source File: cpp_lint.py From Deep-Exemplar-based-Colorization with MIT License | 6 votes |
def GetLineWidth(line):
    """Determines the width of the line in column positions.

    Args:
      line: A string, which may be a Unicode string.

    Returns:
      The width of the line in column positions, accounting for Unicode
      combining characters and wide characters.
    """
    # `unicode` only exists on Python 2; on Python 3 every str is already
    # unicode text.  The old `isinstance(line, unicode)` check raised
    # NameError on Python 3.
    text_type = str if str is not bytes else unicode
    if isinstance(line, text_type):
        width = 0
        for uc in unicodedata.normalize('NFC', line):
            if unicodedata.east_asian_width(uc) in ('W', 'F'):
                # Wide / fullwidth East Asian chars take two columns.
                width += 2
            elif not unicodedata.combining(uc):
                width += 1
        return width
    # Byte strings: fall back to the raw length.
    return len(line)
Example #4
Source File: albert_tokenization.py From bert-for-tf2 with MIT License | 6 votes |
def preprocess_text(inputs, remove_space=True, lower=False):
    """preprocess data by removing extra space and normalize data."""
    # Collapse runs of whitespace (and strip the ends) when requested.
    outputs = " ".join(inputs.strip().split()) if remove_space else inputs
    if six.PY2 and isinstance(outputs, str):
        # Python 2 byte strings must become text before normalization.
        try:
            outputs = six.ensure_text(outputs, "utf-8")
        except UnicodeDecodeError:
            outputs = six.ensure_text(outputs, "latin-1")
    # Decompose accents, then drop the combining marks.
    outputs = unicodedata.normalize("NFKD", outputs)
    outputs = "".join(c for c in outputs if not unicodedata.combining(c))
    if lower:
        outputs = outputs.lower()
    return outputs
Example #5
Source File: text.py From GTDWeb with GNU General Public License v2.0 | 6 votes |
def chars(self, num, truncate=None, html=False):
    """
    Returns the text truncated to be no longer than the specified number
    of characters. Takes an optional argument of what should be used to
    notify that the string has been truncated, defaulting to a
    translatable string of an ellipsis (...).
    """
    length = int(num)
    text = unicodedata.normalize('NFC', self._wrapped)

    # The truncation suffix consumes part of the character budget;
    # combining marks occupy no column and so do not count.
    truncate_len = length
    for ch in self.add_truncation_text('', truncate):
        if unicodedata.combining(ch):
            continue
        truncate_len -= 1
        if not truncate_len:
            break

    if html:
        return self._truncate_html(length, truncate, text, truncate_len, False)
    return self._text_chars(length, truncate, text, truncate_len)
Example #6
Source File: text.py From GTDWeb with GNU General Public License v2.0 | 6 votes |
def _text_chars(self, length, truncate, text, truncate_len):
    """
    Truncates a string after a certain number of chars.
    """
    chars_seen = 0
    truncation_point = None
    for index, char in enumerate(text):
        # Only non-combining characters contribute to the length.
        if not unicodedata.combining(char):
            chars_seen += 1
            if truncation_point is None and chars_seen > truncate_len:
                truncation_point = index
            if chars_seen > length:
                # String is too long: truncate and add the suffix.
                return self.add_truncation_text(
                    text[:truncation_point or 0], truncate)
    # No truncation was necessary.
    return text
Example #7
Source File: test_regressions.py From ironpython2 with Apache License 2.0 | 6 votes |
def test_ipy2_gh357(self):
    """https://github.com/IronLanguages/ironpython2/issues/357"""
    import unicodedata
    # IronPython reports a range name for CJK ideographs; CPython reports
    # the per-codepoint synthesized name.
    if is_cli:
        self.assertEqual(unicodedata.name(u'\u4e2d'), '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>')
    else:
        self.assertEqual(unicodedata.name(u'\u4e2d'), 'CJK UNIFIED IDEOGRAPH-4E2D')
    # U+4E2D has no decimal/digit/numeric value: raises without a default,
    # returns the default when one is supplied.
    self.assertRaises(ValueError, unicodedata.decimal, u'\u4e2d')
    self.assertEqual(unicodedata.decimal(u'\u4e2d', 0), 0)
    self.assertRaises(ValueError, unicodedata.digit, u'\u4e2d')
    self.assertEqual(unicodedata.digit(u'\u4e2d', 0), 0)
    self.assertRaises(ValueError, unicodedata.numeric, u'\u4e2d')
    self.assertEqual(unicodedata.numeric(u'\u4e2d', 0), 0)
    # Basic character properties of U+4E2D (a wide CJK letter).
    self.assertEqual(unicodedata.category(u'\u4e2d'), 'Lo')
    self.assertEqual(unicodedata.bidirectional(u'\u4e2d'), 'L')
    self.assertEqual(unicodedata.combining(u'\u4e2d'), 0)
    self.assertEqual(unicodedata.east_asian_width(u'\u4e2d'), 'W')
    self.assertEqual(unicodedata.mirrored(u'\u4e2d'), 0)
    self.assertEqual(unicodedata.decomposition(u'\u4e2d'), '')
Example #8
Source File: text.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    Parameters
    ----------
    s : string
        The string to strip

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    # NOTE: the old early return (`if normalized == s: return s`) missed
    # inputs already in NFKD form (base letter + combining accent), so
    # their accents were never stripped.  Filter unconditionally.
    normalized = unicodedata.normalize('NFKD', s)
    return ''.join(c for c in normalized if not unicodedata.combining(c))
Example #9
Source File: prepro_utils.py From XLnet-gen with MIT License | 6 votes |
def preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False):
    """Normalize whitespace and quote style; optionally strip accents and case."""
    if remove_space:
        outputs = ' '.join(inputs.strip().split())
    else:
        outputs = inputs
    # Convert LaTeX-style double quotes to plain double quotes.
    outputs = outputs.replace("``", '"').replace("''", '"')

    if six.PY2 and isinstance(outputs, str):
        # Python 2 byte strings must be decoded before normalization.
        outputs = outputs.decode('utf-8')

    if not keep_accents:
        # NFKD-decompose, then discard the combining accent marks.
        decomposed = unicodedata.normalize('NFKD', outputs)
        outputs = ''.join(c for c in decomposed if not unicodedata.combining(c))
    if lower:
        outputs = outputs.lower()

    return outputs
Example #10
Source File: text.py From bioforum with MIT License | 6 votes |
def chars(self, num, truncate=None, html=False):
    """
    Return the text truncated to be no longer than the specified number
    of characters.

    `truncate` specifies what should be used to notify that the string has
    been truncated, defaulting to a translatable string of an ellipsis
    (...).
    """
    self._setup()
    length = int(num)
    text = unicodedata.normalize('NFC', self._wrapped)

    # Visible size of the truncation suffix (combining marks are free);
    # subtract it from the budget, never going below zero.
    visible_suffix = sum(
        1 for char in self.add_truncation_text('', truncate)
        if not unicodedata.combining(char))
    truncate_len = max(length - visible_suffix, 0)

    if html:
        return self._truncate_html(length, truncate, text, truncate_len, False)
    return self._text_chars(length, truncate, text, truncate_len)
Example #11
Source File: texttable.py From deepWordBug with Apache License 2.0 | 5 votes |
def len(iterable):
    """Redefining len here so it will be able to work with non-ASCII
    characters.

    Combining marks count as 0 columns, wide/fullwidth East Asian chars
    as 2, everything else as 1.
    """
    if isinstance(iterable, bytes_type) or isinstance(iterable, unicode_type):
        unicode_data = obj2unicode(iterable)
        if hasattr(unicodedata, 'east_asian_width'):
            w = unicodedata.east_asian_width
            # BUG FIX: the previous one-liner parsed as
            #   (w(c) in 'WF' and 2 or 0) if combining(c) else 1
            # because the conditional expression binds looser than
            # `and`/`or`, so every non-combining wide char counted as 1.
            return sum(0 if unicodedata.combining(c)
                       else (2 if w(c) in 'WF' else 1)
                       for c in unicode_data)
        else:
            return unicode_data.__len__()
    else:
        return iterable.__len__()
Example #12
Source File: core.py From pex with Apache License 2.0 | 5 votes |
def check_initial_combiner(label):
    """Reject labels whose first codepoint is a combining mark (category M*)."""
    first_category = unicodedata.category(label[0])
    if first_category[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True
Example #13
Source File: core.py From pex with Apache License 2.0 | 5 votes |
def _combining_class(cp):
    """Return the canonical combining class of codepoint *cp* (an int)."""
    klass = unicodedata.combining(unichr(cp))
    if klass == 0:
        # combining() returns 0 both for class-0 characters and for
        # codepoints unknown to unicodedata; use name() to tell them apart.
        if not unicodedata.name(unichr(cp)):
            raise ValueError("Unknown character in unicodedata")
    return klass
Example #14
Source File: __init__.py From deepWordBug with Apache License 2.0 | 5 votes |
def column_indices(text):
    """Indices of Unicode string `text` when skipping combining characters.

    >>> from docutils.utils import column_indices
    >>> column_indices(u'A t̆ab̆lĕ')
    [0, 1, 2, 4, 5, 7, 8]
    """
    # TODO: account for asian wide chars here instead of using dummy
    # replacements in the tableparser?
    combining = set(find_combining_chars(text))
    return [i for i in range(len(text)) if i not in combining]
Example #15
Source File: core.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def check_initial_combiner(label):
    """Fail if *label* starts with a combining character (category M*)."""
    if unicodedata.category(label[0]).startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
Example #16
Source File: core.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def _combining_class(cp):
    """Canonical combining class of codepoint *cp*, validating the codepoint."""
    ch = unichr(cp)
    v = unicodedata.combining(ch)
    # Class 0 is ambiguous: it is also what combining() reports for
    # characters unicodedata does not know about.
    if not v and not unicodedata.name(ch):
        raise ValueError("Unknown character in unicodedata")
    return v
Example #17
Source File: plugin.py From limnoria-plugins with Do What The F*ck You Want To Public License | 5 votes |
def removeAccents(self, text):
    """Strip diacritics: NFKD-decompose, then drop the combining marks."""
    decomposed = unicodedata.normalize("NFKD", str(text))
    return "".join(c for c in decomposed if not unicodedata.combining(c))
Example #18
Source File: mnemonic.py From torba with MIT License | 5 votes |
def normalize_text(seed):
    """Lowercase, strip accents, squeeze whitespace, and remove spaces
    wedged between CJK characters."""
    seed = unicodedata.normalize('NFKD', seed).lower()
    # remove accents
    seed = u''.join(c for c in seed if not unicodedata.combining(c))
    # normalize whitespaces
    seed = u' '.join(seed.split())
    # remove whitespaces between CJK
    kept = []
    for i, ch in enumerate(seed):
        if (ch in string.whitespace
                and is_cjk(seed[i-1])
                and is_cjk(seed[i+1])):
            continue
        kept.append(ch)
    return u''.join(kept)
Example #19
Source File: core.py From chinese-support-redux with GNU General Public License v3.0 | 5 votes |
def check_initial_combiner(label):
    """Ensure the label does not open with a combining mark (category M*)."""
    if unicodedata.category(label[0])[0] != 'M':
        return True
    raise IDNAError('Label begins with an illegal combining character')
Example #20
Source File: core.py From chinese-support-redux with GNU General Public License v3.0 | 5 votes |
def _combining_class(cp):
    """Combining class of codepoint *cp*; reject codepoints unicodedata
    does not know."""
    character = unichr(cp)
    combining_class = unicodedata.combining(character)
    if combining_class == 0 and not unicodedata.name(character):
        # combining() gives 0 for unknown codepoints too, so verify
        # the character actually has a name.
        raise ValueError("Unknown character in unicodedata")
    return combining_class
Example #21
Source File: utils.py From verejne.digital with Apache License 2.0 | 5 votes |
def remove_accents(s):
    """Return *s* with accents removed (NFKD decomposition, then drop
    combining marks)."""
    decomposed = unicodedata.normalize('NFKD', s)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
Example #22
Source File: utils.py From verejne.digital with Apache License 2.0 | 5 votes |
def remove_accents(s):
    """Strip diacritics from *s* via NFKD decomposition."""
    # Decompose so accents become separate combining codepoints, then
    # keep only the non-combining ones.
    kept = [c for c in unicodedata.normalize('NFKD', s)
            if not unicodedata.combining(c)]
    return u''.join(kept)
Example #23
Source File: __init__.py From deepWordBug with Apache License 2.0 | 5 votes |
def column_width(text):
    """Return the column width of text.

    Correct ``len(text)`` for wide East Asian and combining Unicode chars.
    """
    # Python 2 byte strings: no per-char width data, use raw length.
    if isinstance(text, str) and sys.version_info < (3, 0):
        return len(text)
    try:
        width = sum(east_asian_widths[unicodedata.east_asian_width(c)]
                    for c in text)
    except AttributeError:  # east_asian_width() New in version 2.4.
        width = len(text)
    # Combining chars were counted above but occupy no column.
    return width - len(find_combining_chars(text))
Example #24
Source File: core.py From oss-ftp with MIT License | 5 votes |
def check_initial_combiner(label):
    """Raise if the label's first character is a combining mark (M*)."""
    if unicodedata.category(label[0])[:1] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True
Example #25
Source File: core.py From oss-ftp with MIT License | 5 votes |
def _combining_class(cp):
    """Canonical combining class of the codepoint *cp* (an int)."""
    return unicodedata.combining(unichr(cp))
Example #26
Source File: core.py From wow-addon-updater with GNU General Public License v3.0 | 5 votes |
def check_initial_combiner(label):
    """Verify the label does not begin with a combining character."""
    category = unicodedata.category(label[0])
    if category[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True
Example #27
Source File: core.py From wow-addon-updater with GNU General Public License v3.0 | 5 votes |
def _combining_class(cp):
    """Combining class of codepoint *cp* (0 for non-combining chars)."""
    character = unichr(cp)
    return unicodedata.combining(character)
Example #28
Source File: core.py From kahoot-hack with GNU General Public License v3.0 | 5 votes |
def _combining_class(cp):
    # Look up the canonical combining class for the integer codepoint.
    ch = unichr(cp)
    return unicodedata.combining(ch)
Example #29
Source File: core.py From kahoot-hack with GNU General Public License v3.0 | 5 votes |
def check_initial_combiner(label):
    """True unless the label starts with a combining mark (category M*)."""
    if 'M' == unicodedata.category(label[0])[0]:
        raise IDNAError('Label begins with an illegal combining character')
    return True
Example #30
Source File: core.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def _combining_class(cp):
    """Combining class of *cp*; raise ValueError for unknown codepoints."""
    v = unicodedata.combining(unichr(cp))
    if v:
        return v
    # Class 0 could also mean "unknown codepoint" -- check via name().
    if not unicodedata.name(unichr(cp)):
        raise ValueError("Unknown character in unicodedata")
    return v