Python unicodedata.combining() Examples
The following are 30 code examples of unicodedata.combining().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module
unicodedata, or try the search function.
Example #1
Source File: text.py From bioforum with MIT License | 6 votes |
def _text_chars(self, length, truncate, text, truncate_len):
    """Truncate a string after a certain number of chars."""
    visible = 0      # characters counted so far, excluding combining marks
    cut_at = None    # string index where a truncated result would stop
    for pos, ch in enumerate(text):
        # Combining characters occupy no column, so they are free.
        if unicodedata.combining(ch):
            continue
        visible += 1
        if cut_at is None and visible > truncate_len:
            cut_at = pos
        if visible > length:
            # Over budget: cut and append the truncation suffix.
            return self.add_truncation_text(text[:cut_at or 0], truncate)
    # The whole text fits; return it untouched.
    return text
Example #2
Source File: STFIWF.py From 2016CCF-sougou with Apache License 2.0 | 6 votes |
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    # NOTE: the old early return (`if normalized == s: return s`) skipped
    # stripping whenever the input was *already* in NFKD form -- e.g.
    # 'e' + U+0301 combining acute -- leaving its accents in place.
    # Always filter out combining marks instead.
    normalized = unicodedata.normalize('NFKD', s)
    return ''.join(c for c in normalized if not unicodedata.combining(c))
Example #3
Source File: cpp_lint.py From Deep-Exemplar-based-Colorization with MIT License | 6 votes |
def GetLineWidth(line):
    """Determines the width of the line in column positions.

    Args:
      line: A string, which may be a Unicode string.

    Returns:
      The width of the line in column positions, accounting for Unicode
      combining characters and wide characters.
    """
    # `unicode` only exists on Python 2; on Python 3 every str is already
    # unicode text.  The old `isinstance(line, unicode)` check raised
    # NameError on Python 3.
    text_type = str if str is not bytes else unicode
    if isinstance(line, text_type):
        width = 0
        for uc in unicodedata.normalize('NFC', line):
            if unicodedata.east_asian_width(uc) in ('W', 'F'):
                # Wide / fullwidth East Asian chars take two columns.
                width += 2
            elif not unicodedata.combining(uc):
                width += 1
        return width
    # Byte strings: fall back to the raw length.
    return len(line)
Example #4
Source File: albert_tokenization.py From bert-for-tf2 with MIT License | 6 votes |
def preprocess_text(inputs, remove_space=True, lower=False):
    """preprocess data by removing extra space and normalize data."""
    # Collapse runs of whitespace (and strip the ends) when requested.
    outputs = " ".join(inputs.strip().split()) if remove_space else inputs
    if six.PY2 and isinstance(outputs, str):
        # Python 2 byte strings must become text before normalization.
        try:
            outputs = six.ensure_text(outputs, "utf-8")
        except UnicodeDecodeError:
            outputs = six.ensure_text(outputs, "latin-1")
    # Decompose accents, then drop the combining marks.
    outputs = unicodedata.normalize("NFKD", outputs)
    outputs = "".join(c for c in outputs if not unicodedata.combining(c))
    if lower:
        outputs = outputs.lower()
    return outputs
Example #5
Source File: text.py From GTDWeb with GNU General Public License v2.0 | 6 votes |
def chars(self, num, truncate=None, html=False):
    """
    Returns the text truncated to be no longer than the specified number
    of characters. Takes an optional argument of what should be used to
    notify that the string has been truncated, defaulting to a
    translatable string of an ellipsis (...).
    """
    length = int(num)
    text = unicodedata.normalize('NFC', self._wrapped)

    # The truncation suffix consumes part of the character budget;
    # combining marks occupy no column and so do not count.
    truncate_len = length
    for ch in self.add_truncation_text('', truncate):
        if unicodedata.combining(ch):
            continue
        truncate_len -= 1
        if not truncate_len:
            break

    if html:
        return self._truncate_html(length, truncate, text, truncate_len, False)
    return self._text_chars(length, truncate, text, truncate_len)
Example #6
Source File: text.py From GTDWeb with GNU General Public License v2.0 | 6 votes |
def _text_chars(self, length, truncate, text, truncate_len):
    """
    Truncates a string after a certain number of chars.
    """
    chars_seen = 0
    truncation_point = None
    for index, char in enumerate(text):
        # Only non-combining characters contribute to the length.
        if not unicodedata.combining(char):
            chars_seen += 1
            if truncation_point is None and chars_seen > truncate_len:
                truncation_point = index
            if chars_seen > length:
                # String is too long: truncate and add the suffix.
                return self.add_truncation_text(
                    text[:truncation_point or 0], truncate)
    # No truncation was necessary.
    return text
Example #7
Source File: test_regressions.py From ironpython2 with Apache License 2.0 | 6 votes |
def test_ipy2_gh357(self):
    """https://github.com/IronLanguages/ironpython2/issues/357"""
    import unicodedata
    # IronPython reports a range name for CJK ideographs; CPython reports
    # the per-codepoint synthesized name.
    if is_cli:
        self.assertEqual(unicodedata.name(u'\u4e2d'), '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>')
    else:
        self.assertEqual(unicodedata.name(u'\u4e2d'), 'CJK UNIFIED IDEOGRAPH-4E2D')
    # U+4E2D has no decimal/digit/numeric value: raises without a default,
    # returns the default when one is supplied.
    self.assertRaises(ValueError, unicodedata.decimal, u'\u4e2d')
    self.assertEqual(unicodedata.decimal(u'\u4e2d', 0), 0)
    self.assertRaises(ValueError, unicodedata.digit, u'\u4e2d')
    self.assertEqual(unicodedata.digit(u'\u4e2d', 0), 0)
    self.assertRaises(ValueError, unicodedata.numeric, u'\u4e2d')
    self.assertEqual(unicodedata.numeric(u'\u4e2d', 0), 0)
    # Basic character properties of U+4E2D (a wide CJK letter).
    self.assertEqual(unicodedata.category(u'\u4e2d'), 'Lo')
    self.assertEqual(unicodedata.bidirectional(u'\u4e2d'), 'L')
    self.assertEqual(unicodedata.combining(u'\u4e2d'), 0)
    self.assertEqual(unicodedata.east_asian_width(u'\u4e2d'), 'W')
    self.assertEqual(unicodedata.mirrored(u'\u4e2d'), 0)
    self.assertEqual(unicodedata.decomposition(u'\u4e2d'), '')
Example #8
Source File: text.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    Parameters
    ----------
    s : string
        The string to strip

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    # NOTE: the old early return (`if normalized == s: return s`) missed
    # inputs already in NFKD form (base letter + combining accent), so
    # their accents were never stripped.  Filter unconditionally.
    normalized = unicodedata.normalize('NFKD', s)
    return ''.join(c for c in normalized if not unicodedata.combining(c))
Example #9
Source File: prepro_utils.py From XLnet-gen with MIT License | 6 votes |
def preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False):
    """Normalize whitespace and quote style; optionally strip accents and case."""
    if remove_space:
        outputs = ' '.join(inputs.strip().split())
    else:
        outputs = inputs
    # Convert LaTeX-style double quotes to plain double quotes.
    outputs = outputs.replace("``", '"').replace("''", '"')

    if six.PY2 and isinstance(outputs, str):
        # Python 2 byte strings must be decoded before normalization.
        outputs = outputs.decode('utf-8')

    if not keep_accents:
        # NFKD-decompose, then discard the combining accent marks.
        decomposed = unicodedata.normalize('NFKD', outputs)
        outputs = ''.join(c for c in decomposed if not unicodedata.combining(c))
    if lower:
        outputs = outputs.lower()

    return outputs
Example #10
Source File: text.py From bioforum with MIT License | 6 votes |
def chars(self, num, truncate=None, html=False):
    """
    Return the text truncated to be no longer than the specified number
    of characters.

    `truncate` specifies what should be used to notify that the string has
    been truncated, defaulting to a translatable string of an ellipsis
    (...).
    """
    self._setup()
    length = int(num)
    text = unicodedata.normalize('NFC', self._wrapped)

    # Visible size of the truncation suffix (combining marks are free);
    # subtract it from the budget, never going below zero.
    visible_suffix = sum(
        1 for char in self.add_truncation_text('', truncate)
        if not unicodedata.combining(char))
    truncate_len = max(length - visible_suffix, 0)

    if html:
        return self._truncate_html(length, truncate, text, truncate_len, False)
    return self._text_chars(length, truncate, text, truncate_len)
Example #11
Source File: texttable.py From deepWordBug with Apache License 2.0 | 5 votes |
def len(iterable):
    """Redefining len here so it will be able to work with non-ASCII
    characters.

    Combining marks count as 0 columns, wide/fullwidth East Asian chars
    as 2, everything else as 1.
    """
    if isinstance(iterable, bytes_type) or isinstance(iterable, unicode_type):
        unicode_data = obj2unicode(iterable)
        if hasattr(unicodedata, 'east_asian_width'):
            w = unicodedata.east_asian_width
            # BUG FIX: the previous one-liner parsed as
            #   (w(c) in 'WF' and 2 or 0) if combining(c) else 1
            # because the conditional expression binds looser than
            # `and`/`or`, so every non-combining wide char counted as 1.
            return sum(0 if unicodedata.combining(c)
                       else (2 if w(c) in 'WF' else 1)
                       for c in unicode_data)
        else:
            return unicode_data.__len__()
    else:
        return iterable.__len__()
Example #12
Source File: core.py From pex with Apache License 2.0 | 5 votes |
def check_initial_combiner(label):
    """Reject labels whose first codepoint is a combining mark (category M*)."""
    first_category = unicodedata.category(label[0])
    if first_category[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True
Example #13
Source File: core.py From pex with Apache License 2.0 | 5 votes |
def _combining_class(cp):
    """Return the canonical combining class of codepoint *cp* (an int)."""
    klass = unicodedata.combining(unichr(cp))
    if klass == 0:
        # combining() returns 0 both for class-0 characters and for
        # codepoints unknown to unicodedata; use name() to tell them apart.
        if not unicodedata.name(unichr(cp)):
            raise ValueError("Unknown character in unicodedata")
    return klass
Example #14
Source File: __init__.py From deepWordBug with Apache License 2.0 | 5 votes |
def column_indices(text):
    """Indices of Unicode string `text` when skipping combining characters.

    >>> from docutils.utils import column_indices
    >>> column_indices(u'A t̆ab̆lĕ')
    [0, 1, 2, 4, 5, 7, 8]
    """
    # TODO: account for asian wide chars here instead of using dummy
    # replacements in the tableparser?
    combining = set(find_combining_chars(text))
    return [i for i in range(len(text)) if i not in combining]
Example #15
Source File: core.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def check_initial_combiner(label):
    """Fail if *label* starts with a combining character (category M*)."""
    if unicodedata.category(label[0]).startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
Example #16
Source File: core.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def _combining_class(cp):
    """Canonical combining class of codepoint *cp*, validating the codepoint."""
    ch = unichr(cp)
    v = unicodedata.combining(ch)
    # Class 0 is ambiguous: it is also what combining() reports for
    # characters unicodedata does not know about.
    if not v and not unicodedata.name(ch):
        raise ValueError("Unknown character in unicodedata")
    return v
Example #17
Source File: plugin.py From limnoria-plugins with Do What The F*ck You Want To Public License | 5 votes |
def removeAccents(self, text):
    """Strip diacritics: NFKD-decompose, then drop the combining marks."""
    decomposed = unicodedata.normalize("NFKD", str(text))
    return "".join(c for c in decomposed if not unicodedata.combining(c))
Example #18
Source File: mnemonic.py From torba with MIT License | 5 votes |
def normalize_text(seed):
    """Lowercase, strip accents, squeeze whitespace, and remove spaces
    wedged between CJK characters."""
    seed = unicodedata.normalize('NFKD', seed).lower()
    # remove accents
    seed = u''.join(c for c in seed if not unicodedata.combining(c))
    # normalize whitespaces
    seed = u' '.join(seed.split())
    # remove whitespaces between CJK
    kept = []
    for i, ch in enumerate(seed):
        if (ch in string.whitespace
                and is_cjk(seed[i-1])
                and is_cjk(seed[i+1])):
            continue
        kept.append(ch)
    return u''.join(kept)
Example #19
Source File: core.py From chinese-support-redux with GNU General Public License v3.0 | 5 votes |
def check_initial_combiner(label):
    """Ensure the label does not open with a combining mark (category M*)."""
    if unicodedata.category(label[0])[0] != 'M':
        return True
    raise IDNAError('Label begins with an illegal combining character')
Example #20
Source File: core.py From chinese-support-redux with GNU General Public License v3.0 | 5 votes |
def _combining_class(cp):
    """Combining class of codepoint *cp*; reject codepoints unicodedata
    does not know."""
    character = unichr(cp)
    combining_class = unicodedata.combining(character)
    if combining_class == 0 and not unicodedata.name(character):
        # combining() gives 0 for unknown codepoints too, so verify
        # the character actually has a name.
        raise ValueError("Unknown character in unicodedata")
    return combining_class
Example #21
Source File: utils.py From verejne.digital with Apache License 2.0 | 5 votes |
def remove_accents(s):
    """Return *s* with accents removed (NFKD decomposition, then drop
    combining marks)."""
    decomposed = unicodedata.normalize('NFKD', s)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
Example #22
Source File: utils.py From verejne.digital with Apache License 2.0 | 5 votes |
def remove_accents(s):
    """Strip diacritics from *s* via NFKD decomposition."""
    # Decompose so accents become separate combining codepoints, then
    # keep only the non-combining ones.
    kept = [c for c in unicodedata.normalize('NFKD', s)
            if not unicodedata.combining(c)]
    return u''.join(kept)
Example #23
Source File: __init__.py From deepWordBug with Apache License 2.0 | 5 votes |
def column_width(text):
    """Return the column width of text.

    Correct ``len(text)`` for wide East Asian and combining Unicode chars.
    """
    # Python 2 byte strings: no per-char width data, use raw length.
    if isinstance(text, str) and sys.version_info < (3, 0):
        return len(text)
    try:
        width = sum(east_asian_widths[unicodedata.east_asian_width(c)]
                    for c in text)
    except AttributeError:  # east_asian_width() New in version 2.4.
        width = len(text)
    # Combining chars were counted above but occupy no column.
    return width - len(find_combining_chars(text))
Example #24
Source File: core.py From oss-ftp with MIT License | 5 votes |
def check_initial_combiner(label):
    """Raise if the label's first character is a combining mark (M*)."""
    if unicodedata.category(label[0])[:1] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True
Example #25
Source File: core.py From oss-ftp with MIT License | 5 votes |
def _combining_class(cp):
    """Canonical combining class of the codepoint *cp* (an int)."""
    return unicodedata.combining(unichr(cp))
Example #26
Source File: core.py From wow-addon-updater with GNU General Public License v3.0 | 5 votes |
def check_initial_combiner(label):
    """Verify the label does not begin with a combining character."""
    category = unicodedata.category(label[0])
    if category[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True
Example #27
Source File: core.py From wow-addon-updater with GNU General Public License v3.0 | 5 votes |
def _combining_class(cp):
    """Combining class of codepoint *cp* (0 for non-combining chars)."""
    character = unichr(cp)
    return unicodedata.combining(character)
Example #28
Source File: core.py From kahoot-hack with GNU General Public License v3.0 | 5 votes |
def _combining_class(cp):
    # Look up the canonical combining class for the integer codepoint.
    ch = unichr(cp)
    return unicodedata.combining(ch)
Example #29
Source File: core.py From kahoot-hack with GNU General Public License v3.0 | 5 votes |
def check_initial_combiner(label):
    """True unless the label starts with a combining mark (category M*)."""
    if 'M' == unicodedata.category(label[0])[0]:
        raise IDNAError('Label begins with an illegal combining character')
    return True
Example #30
Source File: core.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def _combining_class(cp):
    """Combining class of *cp*; raise ValueError for unknown codepoints."""
    v = unicodedata.combining(unichr(cp))
    if v:
        return v
    # Class 0 could also mean "unknown codepoint" -- check via name().
    if not unicodedata.name(unichr(cp)):
        raise ValueError("Unknown character in unicodedata")
    return v