Python unicodedata.normalize() Examples

The following are 30 code examples of unicodedata.normalize(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module unicodedata , or try the search function .
Example #1
Source File: tokenizer.py    From keras-bert with MIT License 8 votes vote down vote up
def _tokenize(self, text):
        if not self._cased:
            text = unicodedata.normalize('NFD', text)
            text = ''.join([ch for ch in text if unicodedata.category(ch) != 'Mn'])
            text = text.lower()
        spaced = ''
        for ch in text:
            if self._is_punctuation(ch) or self._is_cjk_character(ch):
                spaced += ' ' + ch + ' '
            elif self._is_space(ch):
                spaced += ' '
            elif ord(ch) == 0 or ord(ch) == 0xfffd or self._is_control(ch):
                continue
            else:
                spaced += ch
        tokens = []
        for word in spaced.strip().split():
            tokens += self._word_piece_tokenize(word)
        return tokens 
Example #2
Source File: static.py    From cherrypy with BSD 3-Clause "New" or "Revised" License 7 votes vote down vote up
def _make_content_disposition(disposition, file_name):
    """Create HTTP header for downloading a file with a UTF-8 filename.

    This function implements the recommendations of :rfc:`6266#appendix-D`.
    See this and related answers: https://stackoverflow.com/a/8996249/2173868.
    """
    # As normalization algorithm for `unicodedata` is used composed form (NFC
    # and NFKC) with compatibility equivalence criteria (NFK), so "NFKC" is the
    # one. It first applies the compatibility decomposition, followed by the
    # canonical composition. Should be displayed in the same manner, should be
    # treated in the same way by applications such as alphabetizing names or
    # searching, and may be substituted for each other.
    # See: https://en.wikipedia.org/wiki/Unicode_equivalence.
    ascii_name = (
        unicodedata.normalize('NFKC', file_name).
        encode('ascii', errors='ignore').decode()
    )
    header = '{}; filename="{}"'.format(disposition, ascii_name)
    if ascii_name != file_name:
        quoted_name = urllib.parse.quote(file_name)
        header += '; filename*=UTF-8\'\'{}'.format(quoted_name)
    return header 
Example #3
Source File: wsgi.py    From vergeml with MIT License 6 votes vote down vote up
def secure_filename(filename):
    if isinstance(filename, str):
        from unicodedata import normalize
        filename = normalize('NFKD', filename).encode('ascii', 'ignore')
        filename = filename.decode('ascii')
    for sep in os.path.sep, os.path.altsep:
        if sep:
            filename = filename.replace(sep, ' ')
    filename = str(_filename_ascii_strip_re.sub('', '_'.join(
                   filename.split()))).strip('._')

    # on nt a couple of special files are present in each folder.  We
    # have to ensure that the target file is not such a filename.  In
    # this case we prepend an underline
    if os.name == 'nt' and filename and \
       filename.split('.')[0].upper() in _windows_device_files:
        filename = '_' + filename

    return filename 
Example #4
Source File: connector.py    From dataiku-contrib with Apache License 2.0 6 votes vote down vote up
def _byteify(data, ignore_dicts = False):
    # if this is a unicode string, return its string representation
    if isinstance(data, unicode):
        return unicodedata.normalize('NFKD', data).encode('ascii','ignore')
    # if this is a list of values, return list of byteified values
    if isinstance(data, list):
        return [ _byteify(item, ignore_dicts=True) for item in data ]
    # if this is a dictionary, return dictionary of byteified keys and values
    # but only if we haven't already byteified it
    if isinstance(data, dict) and not ignore_dicts:
        return {
            _byteify(key, ignore_dicts=True): _byteify(value, ignore_dicts=True)
            for key, value in data.iteritems()
        }
    # if it's anything else, return it in its original form
    return data 
Example #5
Source File: ncm2.py    From ncm2 with MIT License 6 votes vote down vote up
def strdisplaywidth(self, s):
        def get_char_display_width(unicode_str):
            r = unicodedata.east_asian_width(unicode_str)
            if r == "F":  # Fullwidth
                return 1
            elif r == "H":  # Half-width
                return 1
            elif r == "W":  # Wide
                return 2
            elif r == "Na":  # Narrow
                return 1
            elif r == "A":  # Ambiguous, go with 2
                return 1
            elif r == "N":  # Neutral
                return 1
            else:
                return 1

        s = unicodedata.normalize('NFC', s)
        w = 0
        for c in s:
            w += get_char_display_width(c)
        return w 
Example #6
Source File: workflow.py    From Quiver-alfred with MIT License 6 votes vote down vote up
def fold_to_ascii(self, text):
        """Convert non-ASCII characters to closest ASCII equivalent.

        .. versionadded:: 1.3

        .. note:: This only works for a subset of European languages.

        :param text: text to convert
        :type text: ``unicode``
        :returns: text containing only ASCII characters
        :rtype: ``unicode``

        """
        if isascii(text):
            return text
        text = ''.join([ASCII_REPLACEMENTS.get(c, c) for c in text])
        return unicode(unicodedata.normalize('NFKD',
                       text).encode('ascii', 'ignore')) 
Example #7
Source File: text.py    From pyspelling with MIT License 6 votes vote down vote up
def setup(self):
        """Setup."""

        self.normalize = self.config['normalize'].upper()
        self.convert_encoding = self.config['convert_encoding'].lower()
        self.errors = self.config['errors'].lower()

        if self.convert_encoding:
            self.convert_encoding = codecs.lookup(
                filters.PYTHON_ENCODING_NAMES.get(self.default_encoding, self.default_encoding).lower()
            ).name

            # Don't generate content with BOMs
            if (
                self.convert_encoding.startswith(('utf-32', 'utf-16')) and
                not self.convert_encoding.endswith(('le', 'be'))
            ):
                self.convert_encoding += '-le'

            if self.convert_encoding == 'utf-8-sig':
                self.convert_encoding = 'utf-8' 
Example #8
Source File: connector.py    From dataiku-contrib with Apache License 2.0 6 votes vote down vote up
def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                            partition_id=None, records_limit = -1):
        query_date = datetime.datetime.now()

        rows = self.list_epics()
        if len(rows) == 0:
            logging.info("Not epics.")
        else:
            nb = 0
            for row in rows:
                if 0 <= records_limit <= nb:
                    logging.info("Reached records_limit (%i), stopping." % records_limit)
                    return
                
                encoded_row = {}
                encoded_row["query_date"] = query_date
                for key in row:
                    val = row[key]
                    if isinstance(val, unicode):
                        val = unicodedata.normalize('NFKD', val).encode('ascii','ignore')                                             
                    encoded_row[str(key)] = val
                    
                yield encoded_row
                nb += 1 
Example #9
Source File: util.py    From gist-alfred with MIT License 6 votes vote down vote up
def unicodify(s, encoding='utf-8', norm=None):
    """Ensure string is Unicode.

    .. versionadded:: 1.31

    Decode encoded strings using ``encoding`` and normalise Unicode
    to form ``norm`` if specified.

    Args:
        s (str): String to decode. May also be Unicode.
        encoding (str, optional): Encoding to use on bytestrings.
        norm (None, optional): Normalisation form to apply to Unicode string.

    Returns:
        unicode: Decoded, optionally normalised, Unicode string.

    """
    if not isinstance(s, unicode):
        s = unicode(s, encoding)

    if norm:
        from unicodedata import normalize
        s = normalize(norm, s)

    return s 
Example #10
Source File: utils.py    From plugin.video.emby with GNU General Public License v3.0 6 votes vote down vote up
def normalize_string(text):

    ''' For theme media, do not modify unless modified in TV Tunes.
        Remove dots from the last character as windows can not have directories
        with dots at the end
    '''
    text = text.replace(":", "")
    text = text.replace("/", "-")
    text = text.replace("\\", "-")
    text = text.replace("<", "")
    text = text.replace(">", "")
    text = text.replace("*", "")
    text = text.replace("?", "")
    text = text.replace('|', "")
    text = text.strip()

    text = text.rstrip('.')
    text = unicodedata.normalize('NFKD', unicode(text, 'utf-8')).encode('ascii', 'ignore')

    return text 
Example #11
Source File: tokenization.py    From BERT-for-Chinese-Question-Answering with Apache License 2.0 6 votes vote down vote up
def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # 这个函数去除掉text中的非间距字符

        # 标准化对于任何需要以一致的方式处理Unicode文本的程序都是非常重要的。
        # 当处理来自用户输入的字符串而你很难去控制编码的时候尤其如此。
        # normalize() 将文本标准化,第一个参数指定字符串标准化的方式,NFD表示字符应该分解为多个组合字符表示
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            # category() 返回字符在UNICODE里分类的类型
            cat = unicodedata.category(char)
            if cat == "Mn":
                #  Mark, Nonspacing 指示字符是非间距字符,这指示基字符的修改。
                # https://www.fileformat.info/info/unicode/category/Mn/list.htm
                continue
            output.append(char)
        return "".join(output) 
Example #12
Source File: workflow.py    From gist-alfred with MIT License 6 votes vote down vote up
def fold_to_ascii(self, text):
        """Convert non-ASCII characters to closest ASCII equivalent.

        .. versionadded:: 1.3

        .. note:: This only works for a subset of European languages.

        :param text: text to convert
        :type text: ``unicode``
        :returns: text containing only ASCII characters
        :rtype: ``unicode``

        """
        if isascii(text):
            return text
        text = ''.join([ASCII_REPLACEMENTS.get(c, c) for c in text])
        return unicode(unicodedata.normalize('NFKD',
                       text).encode('ascii', 'ignore')) 
Example #13
Source File: word2vecReaderUtils.py    From word2vec-twitter with MIT License 6 votes vote down vote up
def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring.

    Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    norm = unicodedata.normalize("NFD", text)
    result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", result) 
Example #14
Source File: gftools-fix-ascii-fontmetadata.py    From gftools with Apache License 2.0 6 votes vote down vote up
def normalizestr(string):
    """ Converts special characters like copyright,
        trademark signs to ascii name """
    # print("input: '{}'".format(string))
    input_string = string
    for mark, ascii_repl in unicode_marks(string):
        string = string.replace(mark, ascii_repl)

    rv = []
#    for c in unicodedata.normalize('NFKC', smart_text(string)):
    for c in unicodedata.normalize('NFKC', string):
        # cat = unicodedata.category(c)[0]
        # if cat in 'LN' or c in ok:
        rv.append(c)

    new = ''.join(rv).strip()
    result = unidecode(new)
    if result != input_string:
        print("Fixed string: '{}'".format(result))
    return result 
Example #15
Source File: workflow.py    From wechat-alfred-workflow with MIT License 6 votes vote down vote up
def fold_to_ascii(self, text):
        """Convert non-ASCII characters to closest ASCII equivalent.

        .. versionadded:: 1.3

        .. note:: This only works for a subset of European languages.

        :param text: text to convert
        :type text: ``unicode``
        :returns: text containing only ASCII characters
        :rtype: ``unicode``

        """
        if isascii(text):
            return text
        text = ''.join([ASCII_REPLACEMENTS.get(c, c) for c in text])
        return unicode(unicodedata.normalize('NFKD',
                       text).encode('ascii', 'ignore')) 
Example #16
Source File: strings.py    From recruit with Apache License 2.0 6 votes vote down vote up
def normalize(self, form):
        """
        Return the Unicode normal form for the strings in the Series/Index.
        For more information on the forms, see the
        :func:`unicodedata.normalize`.

        Parameters
        ----------
        form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
            Unicode form

        Returns
        -------
        normalized : Series/Index of objects
        """
        import unicodedata
        f = lambda x: unicodedata.normalize(form, compat.u_safe(x))
        result = _na_map(f, self._parent)
        return self._wrap_result(result) 
Example #17
Source File: util.py    From wechat-alfred-workflow with MIT License 6 votes vote down vote up
def unicodify(s, encoding='utf-8', norm=None):
    """Ensure string is Unicode.

    .. versionadded:: 1.31

    Decode encoded strings using ``encoding`` and normalise Unicode
    to form ``norm`` if specified.

    Args:
        s (str): String to decode. May also be Unicode.
        encoding (str, optional): Encoding to use on bytestrings.
        norm (None, optional): Normalisation form to apply to Unicode string.

    Returns:
        unicode: Decoded, optionally normalised, Unicode string.

    """
    if not isinstance(s, unicode):
        s = unicode(s, encoding)

    if norm:
        from unicodedata import normalize
        s = normalize(norm, s)

    return s 
Example #18
Source File: names.py    From yamdwe with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def clean_id(name, preserve_case=False):
    """
    Return a 'clean' dokuwiki-compliant name. Based on the cleanID() PHP function in inc/pageutils.php

    Ignores both slashes and colons as valid namespace choices (to convert slashes to colons,
    call make_dokuwiki_pagename)
    """
    main,ext = os.path.splitext(name)

    # remove accents
    try:
        decomposed = unicodedata.normalize("NFKD", main)
        no_accent = ''.join(c for c in decomposed if ord(c)<0x7f)
    except TypeError:
        no_accent = main # name was plaintext to begin with

    # recombine without any other characters
    result = (re.sub(r'[^\w/:-]+', '_', no_accent) + ext)
    if not preserve_case:
        result = result.lower()
    while "__" in result:
        result = result.replace("__", "_") # this is a hack, unsure why regex doesn't catch it
    return result 
Example #19
Source File: clean.py    From cleanco with MIT License 5 votes vote down vote up
def remove_accents(t):
    """based on https://stackoverflow.com/a/51230541"""
    nfkd_form = unicodedata.normalize('NFKD', t.casefold())
    return ''.join(
        NON_NFKD_MAP[c]
            if c in NON_NFKD_MAP
        else c
            for part in nfkd_form for c in part
            if unicodedata.category(part) != 'Mn'
        ) 
Example #20
Source File: clean.py    From cleanco with MIT License 5 votes vote down vote up
def normalize_terms(terms):
    "normalize terms"
    return (strip_punct(remove_accents(t)) for t in terms) 
Example #21
Source File: terminalwriter.py    From py with MIT License 5 votes vote down vote up
def get_line_width(text):
    text = unicodedata.normalize('NFC', text)
    return sum(char_width.get(unicodedata.east_asian_width(c), 1) for c in text)


# XXX unify with _escaped func below 
Example #22
Source File: common.py    From vulscan with MIT License 5 votes vote down vote up
def normalizeUnicode(value):
    """
    Does an ASCII normalization of unicode strings
    Reference: http://www.peterbe.com/plog/unicode-to-ascii

    >>> normalizeUnicode(u'\u0161u\u0107uraj')
    'sucuraj'
    """

    return unicodedata.normalize('NFKD', value).encode('ascii', 'ignore') if isinstance(value, unicode) else value 
Example #23
Source File: pipedriveapi.py    From dataiku-contrib with Apache License 2.0 5 votes vote down vote up
def slugify(s, lower=True):
    """
    Creates a slug (ascii) for a given unicode string.
    If the unidecode package is available, an ascii transliteration is done.
    """

    normalized =  unicodedata.normalize("NFD", s)
    cleaned = ''.join([c for c in normalized if unicodedata.category(c) != 'Mn'])
    slugified_ascii =  re.sub(r"[^A-Za-z0-9_-]", '_', cleaned)

    if unidecode_available:
        slugified_ascii = re.sub(r"[^A-Za-z0-9_-]", '_', unidecode(cleaned))

    slugified_ascii = re.sub(r"_{2,}", '_', slugified_ascii)

    if lower:
        slugified_ascii = slugified_ascii.lower()

    ### If you prefer to work with a unicode slug, use instead the following:
    # slugified_unicode = u""
    # for c in cleaned:
    #   cat = unicodedata.category(c)
    #   if cat.startswith("L") or cat.startswith("N"):
    #       slugified_unicode += c
    #   else:
    #       slugified_unicode += "_"

    return slugified_ascii 
Example #24
Source File: workflow.py    From Quiver-alfred with MIT License 5 votes vote down vote up
def decode(self, text, encoding=None, normalization=None):
        """Return ``text`` as normalised unicode.

        If ``encoding`` and/or ``normalization`` is ``None``, the
        ``input_encoding``and ``normalization`` parameters passed to
        :class:`Workflow` are used.

        :param text: string
        :type text: encoded or Unicode string. If ``text`` is already a
            Unicode string, it will only be normalised.
        :param encoding: The text encoding to use to decode ``text`` to
            Unicode.
        :type encoding: ``unicode`` or ``None``
        :param normalization: The nomalisation form to apply to ``text``.
        :type normalization: ``unicode`` or ``None``
        :returns: decoded and normalised ``unicode``

        :class:`Workflow` uses "NFC" normalisation by default. This is the
        standard for Python and will work well with data from the web (via
        :mod:`~workflow.web` or :mod:`json`).

        OS X, on the other hand, uses "NFD" normalisation (nearly), so data
        coming from the system (e.g. via :mod:`subprocess` or
        :func:`os.listdir`/:mod:`os.path`) may not match. You should either
        normalise this data, too, or change the default normalisation used by
        :class:`Workflow`.

        """

        encoding = encoding or self._input_encoding
        normalization = normalization or self._normalizsation
        if not isinstance(text, unicode):
            text = unicode(text, encoding)
        return unicodedata.normalize(normalization, text) 
Example #25
Source File: web.py    From Quiver-alfred with MIT License 5 votes vote down vote up
def text(self):
        """Unicode-decoded content of response body.

        If no encoding can be determined from HTTP headers or the content
        itself, the encoded response body will be returned instead.

        :returns: Body of HTTP response
        :rtype: :class:`unicode` or :class:`str`

        """
        if self.encoding:
            return unicodedata.normalize('NFC', unicode(self.content,
                                                        self.encoding))
        return self.content 
Example #26
Source File: notify.py    From Quiver-alfred with MIT License 5 votes vote down vote up
def uni(s):
        """Coerce `s` to normalised Unicode."""
        ustr = s.decode('utf-8')
        return normalize('NFD', ustr) 
Example #27
Source File: core.py    From core with MIT License 5 votes vote down vote up
def uts46_remap(domain, std3_rules=True, transitional=False):
    """Re-map the characters in the string according to UTS46 processing."""
    from .uts46data import uts46data
    output = u""
    try:
        for pos, char in enumerate(domain):
            code_point = ord(char)
            uts46row = uts46data[code_point if code_point < 256 else
                bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
            status = uts46row[1]
            replacement = uts46row[2] if len(uts46row) == 3 else None
            if (status == "V" or
                    (status == "D" and not transitional) or
                    (status == "3" and std3_rules and replacement is None)):
                output += char
            elif replacement is not None and (status == "M" or
                    (status == "3" and std3_rules) or
                    (status == "D" and transitional)):
                output += replacement
            elif status != "I":
                raise IndexError()
        return unicodedata.normalize("NFC", output)
    except IndexError:
        raise InvalidCodepoint(
            "Codepoint {0} not allowed at position {1} in {2}".format(
            _unot(code_point), pos + 1, repr(domain))) 
Example #28
Source File: core.py    From core with MIT License 5 votes vote down vote up
def check_nfc(label):

    if unicodedata.normalize('NFC', label) != label:
        raise IDNAError('Label must be in Normalization Form C') 
Example #29
Source File: helpers.py    From gitlab-tools with GNU General Public License v3.0 5 votes vote down vote up
def slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
    value = re.sub('[^\w\s-]', '', value).strip().lower()
    return re.sub('[-\s]+', '-', value) 
Example #30
Source File: sutils.py    From plugin.video.sosac.ph with GNU General Public License v2.0 5 votes vote down vote up
def encode(string):
        return unicodedata.normalize('NFKD', string.decode('utf-8')).encode('ascii', 'ignore')