Python unicodedata.normalize() Examples
The following are 30 code examples of unicodedata.normalize(), drawn from open-source projects. Each example notes its original source file, project, and license.
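Before the project examples, a minimal standalone sketch (not taken from any project below) of what the four normalization forms do:

import unicodedata

s = 'caf\u00e9'                              # 'café' with precomposed é (U+00E9)
d = unicodedata.normalize('NFD', s)          # decomposed: 'e' + combining acute
print(len(s), len(d))                        # 4 5
print(unicodedata.normalize('NFC', d) == s)  # True: recomposition round-trips
# NFKC/NFKD additionally apply compatibility mappings, e.g. the 'fi' ligature:
print(unicodedata.normalize('NFKC', '\ufb01'))  # 'fi'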
Example #1
Source File: tokenizer.py From keras-bert with MIT License

def _tokenize(self, text):
    if not self._cased:
        text = unicodedata.normalize('NFD', text)
        text = ''.join([ch for ch in text if unicodedata.category(ch) != 'Mn'])
        text = text.lower()
    spaced = ''
    for ch in text:
        if self._is_punctuation(ch) or self._is_cjk_character(ch):
            spaced += ' ' + ch + ' '
        elif self._is_space(ch):
            spaced += ' '
        elif ord(ch) == 0 or ord(ch) == 0xfffd or self._is_control(ch):
            continue
        else:
            spaced += ch
    tokens = []
    for word in spaced.strip().split():
        tokens += self._word_piece_tokenize(word)
    return tokens
Example #2
Source File: static.py From cherrypy with BSD 3-Clause "New" or "Revised" License

def _make_content_disposition(disposition, file_name):
    """Create HTTP header for downloading a file with a UTF-8 filename.

    This function implements the recommendations of :rfc:`6266#appendix-D`.
    See this and related answers: https://stackoverflow.com/a/8996249/2173868.
    """
    # The normalization form used here is NFKC: the composed form with
    # compatibility equivalence. It first applies the compatibility
    # decomposition, then the canonical composition. Compatibility-equivalent
    # characters should display the same way, be treated the same way by
    # applications (e.g. when alphabetizing names or searching), and may be
    # substituted for each other.
    # See: https://en.wikipedia.org/wiki/Unicode_equivalence.
    ascii_name = (
        unicodedata.normalize('NFKC', file_name).
        encode('ascii', errors='ignore').decode()
    )
    header = '{}; filename="{}"'.format(disposition, ascii_name)
    if ascii_name != file_name:
        quoted_name = urllib.parse.quote(file_name)
        header += '; filename*=UTF-8\'\'{}'.format(quoted_name)
    return header
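A quick usage sketch for the function above (hypothetical calls; the output shown is what the code computes, with NFKC-then-ASCII folding dropping the accented characters):

print(_make_content_disposition('attachment', 'report.pdf'))
# attachment; filename="report.pdf"
print(_make_content_disposition('attachment', 'résumé.pdf'))
# attachment; filename="rsum.pdf"; filename*=UTF-8''r%C3%A9sum%C3%A9.pdf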
Example #3
Source File: wsgi.py From vergeml with MIT License

def secure_filename(filename):
    if isinstance(filename, str):
        from unicodedata import normalize
        filename = normalize('NFKD', filename).encode('ascii', 'ignore')
        filename = filename.decode('ascii')
    for sep in os.path.sep, os.path.altsep:
        if sep:
            filename = filename.replace(sep, ' ')
    filename = str(_filename_ascii_strip_re.sub('', '_'.join(
        filename.split()))).strip('._')

    # on nt a couple of special files are present in each folder. We
    # have to ensure that the target file is not such a filename. In
    # this case we prepend an underline
    if os.name == 'nt' and filename and \
            filename.split('.')[0].upper() in _windows_device_files:
        filename = '_' + filename
    return filename
Example #4
Source File: connector.py From dataiku-contrib with Apache License 2.0

def _byteify(data, ignore_dicts=False):
    # if this is a unicode string, return its string representation
    if isinstance(data, unicode):
        return unicodedata.normalize('NFKD', data).encode('ascii', 'ignore')
    # if this is a list of values, return list of byteified values
    if isinstance(data, list):
        return [_byteify(item, ignore_dicts=True) for item in data]
    # if this is a dictionary, return dictionary of byteified keys and values
    # but only if we haven't already byteified it
    if isinstance(data, dict) and not ignore_dicts:
        return {
            _byteify(key, ignore_dicts=True): _byteify(value, ignore_dicts=True)
            for key, value in data.iteritems()
        }
    # if it's anything else, return it in its original form
    return data
Example #5
Source File: ncm2.py From ncm2 with MIT License

def strdisplaywidth(self, s):
    def get_char_display_width(unicode_str):
        r = unicodedata.east_asian_width(unicode_str)
        if r == "F":    # Fullwidth
            return 1
        elif r == "H":  # Half-width
            return 1
        elif r == "W":  # Wide
            return 2
        elif r == "Na": # Narrow
            return 1
        elif r == "A":  # Ambiguous; treated as narrow here
            return 1
        elif r == "N":  # Neutral
            return 1
        else:
            return 1

    s = unicodedata.normalize('NFC', s)
    w = 0
    for c in s:
        w += get_char_display_width(c)
    return w
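A standalone variant of the same width-counting technique (a sketch, not part of ncm2; unlike the code above it counts Fullwidth as two cells, which matches typical terminal rendering):

import unicodedata

def display_width(s):
    # Normalize to NFC first so a base character plus combining mark
    # is counted as a single cell.
    s = unicodedata.normalize('NFC', s)
    return sum(2 if unicodedata.east_asian_width(c) in ('W', 'F') else 1
               for c in s)

print(display_width('abc'))    # 3
print(display_width('日本語'))  # 6: each CJK character is two cells wide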
Example #6
Source File: workflow.py From Quiver-alfred with MIT License

def fold_to_ascii(self, text):
    """Convert non-ASCII characters to closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``

    """
    if isascii(text):
        return text
    text = ''.join([ASCII_REPLACEMENTS.get(c, c) for c in text])
    return unicode(unicodedata.normalize('NFKD', text).encode('ascii', 'ignore'))
Example #7
Source File: text.py From pyspelling with MIT License

def setup(self):
    """Setup."""

    self.normalize = self.config['normalize'].upper()
    self.convert_encoding = self.config['convert_encoding'].lower()
    self.errors = self.config['errors'].lower()
    if self.convert_encoding:
        self.convert_encoding = codecs.lookup(
            filters.PYTHON_ENCODING_NAMES.get(self.default_encoding, self.default_encoding).lower()
        ).name

        # Don't generate content with BOMs
        if (
            self.convert_encoding.startswith(('utf-32', 'utf-16')) and
            not self.convert_encoding.endswith(('le', 'be'))
        ):
            self.convert_encoding += '-le'

        if self.convert_encoding == 'utf-8-sig':
            self.convert_encoding = 'utf-8'
Example #8
Source File: connector.py From dataiku-contrib with Apache License 2.0

def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                  partition_id=None, records_limit=-1):
    query_date = datetime.datetime.now()
    rows = self.list_epics()
    if len(rows) == 0:
        logging.info("No epics.")
    else:
        nb = 0
        for row in rows:
            if 0 <= records_limit <= nb:
                logging.info("Reached records_limit (%i), stopping." % records_limit)
                return
            encoded_row = {}
            encoded_row["query_date"] = query_date
            for key in row:
                val = row[key]
                if isinstance(val, unicode):
                    val = unicodedata.normalize('NFKD', val).encode('ascii', 'ignore')
                encoded_row[str(key)] = val
            yield encoded_row
            nb += 1
Example #9
Source File: util.py From gist-alfred with MIT License

def unicodify(s, encoding='utf-8', norm=None):
    """Ensure string is Unicode.

    .. versionadded:: 1.31

    Decode encoded strings using ``encoding`` and normalise Unicode
    to form ``norm`` if specified.

    Args:
        s (str): String to decode. May also be Unicode.
        encoding (str, optional): Encoding to use on bytestrings.
        norm (None, optional): Normalisation form to apply to Unicode string.

    Returns:
        unicode: Decoded, optionally normalised, Unicode string.

    """
    if not isinstance(s, unicode):
        s = unicode(s, encoding)

    if norm:
        from unicodedata import normalize
        s = normalize(norm, s)

    return s
Example #10
Source File: utils.py From plugin.video.emby with GNU General Public License v3.0

def normalize_string(text):
    ''' For theme media, do not modify unless modified in TV Tunes.
        Remove dots from the last character as windows can not have
        directories with dots at the end
    '''
    text = text.replace(":", "")
    text = text.replace("/", "-")
    text = text.replace("\\", "-")
    text = text.replace("<", "")
    text = text.replace(">", "")
    text = text.replace("*", "")
    text = text.replace("?", "")
    text = text.replace('|', "")
    text = text.strip()
    text = text.rstrip('.')
    text = unicodedata.normalize('NFKD', unicode(text, 'utf-8')).encode('ascii', 'ignore')

    return text
Example #11
Source File: tokenization.py From BERT-for-Chinese-Question-Answering with Apache License 2.0

def _run_strip_accents(self, text):
    """Strips accents from a piece of text."""
    # This function removes nonspacing marks from the text.
    # Normalization matters for any program that needs to handle Unicode
    # text consistently, especially when processing strings from user
    # input whose encoding you cannot control.
    # normalize() standardizes the text; the first argument selects the
    # normalization form. NFD decomposes characters into sequences of
    # combining characters.
    text = unicodedata.normalize("NFD", text)
    output = []
    for char in text:
        # category() returns the character's Unicode category
        cat = unicodedata.category(char)
        if cat == "Mn":
            # Mark, Nonspacing: a nonspacing character that modifies the
            # base character.
            # https://www.fileformat.info/info/unicode/category/Mn/list.htm
            continue
        output.append(char)
    return "".join(output)
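The same NFD-then-drop-Mn technique as a standalone function (a sketch, not taken from the BERT codebase):

import unicodedata

def strip_accents(text):
    # Decompose, then drop nonspacing combining marks (category 'Mn').
    return ''.join(c for c in unicodedata.normalize('NFD', text)
                   if unicodedata.category(c) != 'Mn')

print(strip_accents('Şişli café'))  # 'Sisli cafe'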
Example #12
Source File: workflow.py From gist-alfred with MIT License

def fold_to_ascii(self, text):
    """Convert non-ASCII characters to closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``

    """
    if isascii(text):
        return text
    text = ''.join([ASCII_REPLACEMENTS.get(c, c) for c in text])
    return unicode(unicodedata.normalize('NFKD', text).encode('ascii', 'ignore'))
Example #13
Source File: word2vecReaderUtils.py From word2vec-twitter with MIT License

def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode
    string or utf8 encoded bytestring.

    Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    norm = unicodedata.normalize("NFD", text)
    result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", result)
Example #14
Source File: gftools-fix-ascii-fontmetadata.py From gftools with Apache License 2.0

def normalizestr(string):
    """ Converts special characters like copyright,
        trademark signs to ascii name """
    # print("input: '{}'".format(string))
    input_string = string
    for mark, ascii_repl in unicode_marks(string):
        string = string.replace(mark, ascii_repl)

    rv = []
    # for c in unicodedata.normalize('NFKC', smart_text(string)):
    for c in unicodedata.normalize('NFKC', string):
        # cat = unicodedata.category(c)[0]
        # if cat in 'LN' or c in ok:
        rv.append(c)

    new = ''.join(rv).strip()
    result = unidecode(new)
    if result != input_string:
        print("Fixed string: '{}'".format(result))

    return result
Example #15
Source File: workflow.py From wechat-alfred-workflow with MIT License

def fold_to_ascii(self, text):
    """Convert non-ASCII characters to closest ASCII equivalent.

    .. versionadded:: 1.3

    .. note:: This only works for a subset of European languages.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``

    """
    if isascii(text):
        return text
    text = ''.join([ASCII_REPLACEMENTS.get(c, c) for c in text])
    return unicode(unicodedata.normalize('NFKD', text).encode('ascii', 'ignore'))
Example #16
Source File: strings.py From recruit with Apache License 2.0

def normalize(self, form):
    """
    Return the Unicode normal form for the strings in the Series/Index.
    For more information on the forms, see :func:`unicodedata.normalize`.

    Parameters
    ----------
    form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
        Unicode form

    Returns
    -------
    normalized : Series/Index of objects
    """
    import unicodedata
    f = lambda x: unicodedata.normalize(form, compat.u_safe(x))
    result = _na_map(f, self._parent)
    return self._wrap_result(result)
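A usage sketch for the accessor above (assuming pandas is installed; it is exposed as Series.str.normalize):

import pandas as pd

s = pd.Series(['caf\u00e9', '\ufb01le'])  # 'café', 'ﬁle' with the fi ligature
print(s.str.normalize('NFKC').tolist())   # ['café', 'file']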
Example #17
Source File: util.py From wechat-alfred-workflow with MIT License

def unicodify(s, encoding='utf-8', norm=None):
    """Ensure string is Unicode.

    .. versionadded:: 1.31

    Decode encoded strings using ``encoding`` and normalise Unicode
    to form ``norm`` if specified.

    Args:
        s (str): String to decode. May also be Unicode.
        encoding (str, optional): Encoding to use on bytestrings.
        norm (None, optional): Normalisation form to apply to Unicode string.

    Returns:
        unicode: Decoded, optionally normalised, Unicode string.

    """
    if not isinstance(s, unicode):
        s = unicode(s, encoding)

    if norm:
        from unicodedata import normalize
        s = normalize(norm, s)

    return s
Example #18
Source File: names.py From yamdwe with BSD 3-Clause "New" or "Revised" License

def clean_id(name, preserve_case=False):
    """ Return a 'clean' dokuwiki-compliant name. Based on the cleanID() PHP
    function in inc/pageutils.php

    Ignores both slashes and colons as valid namespace choices (to convert
    slashes to colons, call make_dokuwiki_pagename)
    """
    main, ext = os.path.splitext(name)
    # remove accents
    try:
        decomposed = unicodedata.normalize("NFKD", main)
        no_accent = ''.join(c for c in decomposed if ord(c) < 0x7f)
    except TypeError:
        no_accent = main  # name was plaintext to begin with
    # recombine without any other characters
    result = (re.sub(r'[^\w/:-]+', '_', no_accent) + ext)
    if not preserve_case:
        result = result.lower()
    while "__" in result:
        result = result.replace("__", "_")  # this is a hack, unsure why regex doesn't catch it
    return result
Example #19
Source File: clean.py From cleanco with MIT License

def remove_accents(t):
    """based on https://stackoverflow.com/a/51230541"""
    nfkd_form = unicodedata.normalize('NFKD', t.casefold())
    return ''.join(
        NON_NFKD_MAP[c] if c in NON_NFKD_MAP else c
        for part in nfkd_form for c in part
        if unicodedata.category(part) != 'Mn'
    )
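A self-contained variant without the project's NON_NFKD_MAP table (a sketch; that map exists to handle characters such as 'ø' that NFKD cannot decompose):

import unicodedata

def remove_accents_basic(t):
    # Casefold, decompose, then drop nonspacing combining marks.
    nfkd = unicodedata.normalize('NFKD', t.casefold())
    return ''.join(c for c in nfkd if unicodedata.category(c) != 'Mn')

print(remove_accents_basic('Ångström Ltd.'))  # 'angstrom ltd.'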
Example #20
Source File: clean.py From cleanco with MIT License

def normalize_terms(terms):
    "normalize terms"
    return (strip_punct(remove_accents(t)) for t in terms)
Example #21
Source File: terminalwriter.py From py with MIT License

def get_line_width(text):
    text = unicodedata.normalize('NFC', text)
    return sum(char_width.get(unicodedata.east_asian_width(c), 1) for c in text)


# XXX unify with _escaped func below
Example #22
Source File: common.py From vulscan with MIT License

def normalizeUnicode(value):
    """
    Does an ASCII normalization of unicode strings
    Reference: http://www.peterbe.com/plog/unicode-to-ascii

    >>> normalizeUnicode(u'\u0161u\u0107uraj')
    'sucuraj'
    """
    return unicodedata.normalize('NFKD', value).encode('ascii', 'ignore') if isinstance(value, unicode) else value
Example #23
Source File: pipedriveapi.py From dataiku-contrib with Apache License 2.0

def slugify(s, lower=True):
    """
    Creates a slug (ascii) for a given unicode string.
    If the unidecode package is available, an ascii transliteration is done.
    """
    normalized = unicodedata.normalize("NFD", s)
    cleaned = ''.join([c for c in normalized if unicodedata.category(c) != 'Mn'])
    slugified_ascii = re.sub(r"[^A-Za-z0-9_-]", '_', cleaned)
    if unidecode_available:
        slugified_ascii = re.sub(r"[^A-Za-z0-9_-]", '_', unidecode(cleaned))
    slugified_ascii = re.sub(r"_{2,}", '_', slugified_ascii)
    if lower:
        slugified_ascii = slugified_ascii.lower()
    ### If you prefer to work with a unicode slug, use instead the following:
    # slugified_unicode = u""
    # for c in cleaned:
    #     cat = unicodedata.category(c)
    #     if cat.startswith("L") or cat.startswith("N"):
    #         slugified_unicode += c
    #     else:
    #         slugified_unicode += "_"
    return slugified_ascii
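An illustrative call (output computed assuming unidecode is not installed, i.e. unidecode_available is False, so only the NFD mark-stripping path runs):

print(slugify('Café & Crème 2024'))  # 'cafe_creme_2024'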
Example #24
Source File: workflow.py From Quiver-alfred with MIT License

def decode(self, text, encoding=None, normalization=None):
    """Return ``text`` as normalised unicode.

    If ``encoding`` and/or ``normalization`` is ``None``, the
    ``input_encoding`` and ``normalization`` parameters passed to
    :class:`Workflow` are used.

    :param text: string
    :type text: encoded or Unicode string. If ``text`` is already a
        Unicode string, it will only be normalised.
    :param encoding: The text encoding to use to decode ``text`` to
        Unicode.
    :type encoding: ``unicode`` or ``None``
    :param normalization: The normalisation form to apply to ``text``.
    :type normalization: ``unicode`` or ``None``
    :returns: decoded and normalised ``unicode``

    :class:`Workflow` uses "NFC" normalisation by default. This is the
    standard for Python and will work well with data from the web (via
    :mod:`~workflow.web` or :mod:`json`).

    OS X, on the other hand, uses "NFD" normalisation (nearly), so data
    coming from the system (e.g. via :mod:`subprocess` or
    :func:`os.listdir`/:mod:`os.path`) may not match. You should either
    normalise this data, too, or change the default normalisation used by
    :class:`Workflow`.
    """
    encoding = encoding or self._input_encoding
    normalization = normalization or self._normalizsation
    if not isinstance(text, unicode):
        text = unicode(text, encoding)
    return unicodedata.normalize(normalization, text)
Example #25
Source File: web.py From Quiver-alfred with MIT License

def text(self):
    """Unicode-decoded content of response body.

    If no encoding can be determined from HTTP headers or the content
    itself, the encoded response body will be returned instead.

    :returns: Body of HTTP response
    :rtype: :class:`unicode` or :class:`str`

    """
    if self.encoding:
        return unicodedata.normalize('NFC', unicode(self.content, self.encoding))
    return self.content
Example #26
Source File: notify.py From Quiver-alfred with MIT License

def uni(s):
    """Coerce `s` to normalised Unicode."""
    ustr = s.decode('utf-8')
    return normalize('NFD', ustr)
Example #27
Source File: core.py From core with MIT License

def uts46_remap(domain, std3_rules=True, transitional=False):
    """Re-map the characters in the string according to UTS46 processing."""
    from .uts46data import uts46data
    output = u""
    try:
        for pos, char in enumerate(domain):
            code_point = ord(char)
            uts46row = uts46data[code_point if code_point < 256 else
                bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
            status = uts46row[1]
            replacement = uts46row[2] if len(uts46row) == 3 else None
            if (status == "V" or
                    (status == "D" and not transitional) or
                    (status == "3" and std3_rules and replacement is None)):
                output += char
            elif replacement is not None and (status == "M" or
                    (status == "3" and std3_rules) or
                    (status == "D" and transitional)):
                output += replacement
            elif status != "I":
                raise IndexError()
        return unicodedata.normalize("NFC", output)
    except IndexError:
        raise InvalidCodepoint(
            "Codepoint {0} not allowed at position {1} in {2}".format(
                _unot(code_point), pos + 1, repr(domain)))
Example #28
Source File: core.py From core with MIT License

def check_nfc(label):
    if unicodedata.normalize('NFC', label) != label:
        raise IDNAError('Label must be in Normalization Form C')
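A short demonstration of the check (standalone sketch; since Python 3.8, unicodedata.is_normalized performs the same test without building the normalized copy):

import unicodedata

label = 'cafe\u0301'  # 'é' written as 'e' + combining acute: NFD, not NFC
print(unicodedata.normalize('NFC', label) == label)  # False -> would raise
print(unicodedata.is_normalized('NFC', label))       # False (Python 3.8+)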
Example #29
Source File: helpers.py From gitlab-tools with GNU General Public License v3.0

def slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)
Example #30
Source File: sutils.py From plugin.video.sosac.ph with GNU General Public License v2.0

def encode(string):
    return unicodedata.normalize('NFKD', string.decode('utf-8')).encode('ascii', 'ignore')