Python Examples of unicodedata.normalize

Source File: tokenizer.py From keras-bert with MIT License

8 votes

def _tokenize(self, text):
        if not self._cased:
            text = unicodedata.normalize('NFD', text)
            text = ''.join([ch for ch in text if unicodedata.category(ch) != 'Mn'])
            text = text.lower()
        spaced = ''
        for ch in text:
            if self._is_punctuation(ch) or self._is_cjk_character(ch):
                spaced += ' ' + ch + ' '
            elif self._is_space(ch):
                spaced += ' '
            elif ord(ch) == 0 or ord(ch) == 0xfffd or self._is_control(ch):
                continue
            else:
                spaced += ch
        tokens = []
        for word in spaced.strip().split():
            tokens += self._word_piece_tokenize(word)
        return tokens

Source File: static.py From cherrypy with BSD 3-Clause "New" or "Revised" License

7 votes

def _make_content_disposition(disposition, file_name):
    """Create HTTP header for downloading a file with a UTF-8 filename.

    This function implements the recommendations of :rfc:`6266#appendix-D`.
    See this and related answers: https://stackoverflow.com/a/8996249/2173868.
    """
    # As normalization algorithm for `unicodedata` is used composed form (NFC
    # and NFKC) with compatibility equivalence criteria (NFK), so "NFKC" is the
    # one. It first applies the compatibility decomposition, followed by the
    # canonical composition. Should be displayed in the same manner, should be
    # treated in the same way by applications such as alphabetizing names or
    # searching, and may be substituted for each other.
    # See: https://en.wikipedia.org/wiki/Unicode_equivalence.
    ascii_name = (
        unicodedata.normalize('NFKC', file_name).
        encode('ascii', errors='ignore').decode()
    )
    header = '{}; filename="{}"'.format(disposition, ascii_name)
    if ascii_name != file_name:
        quoted_name = urllib.parse.quote(file_name)
        header += '; filename*=UTF-8\'\'{}'.format(quoted_name)
    return header

Source File: wsgi.py From vergeml with MIT License

6 votes

def secure_filename(filename):
    """Return a sanitised version of ``filename``.

    Text input is NFKD-decomposed and reduced to ASCII. Path separators are
    replaced by spaces, whitespace runs are joined with underscores,
    everything matched by ``_filename_ascii_strip_re`` is removed, and
    leading/trailing dots and underscores are stripped. On Windows a name
    whose first dot-separated part is listed in ``_windows_device_files``
    gets an underscore prefix.

    :param filename: the file name to sanitise
    :returns: the sanitised name (may be empty)
    """
    if isinstance(filename, str):
        from unicodedata import normalize
        ascii_bytes = normalize('NFKD', filename).encode('ascii', 'ignore')
        filename = ascii_bytes.decode('ascii')

    for separator in (os.path.sep, os.path.altsep):
        if separator:
            filename = filename.replace(separator, ' ')

    underscored = '_'.join(filename.split())
    filename = str(_filename_ascii_strip_re.sub('', underscored)).strip('._')

    # on nt a couple of special files are present in each folder.  We
    # have to ensure that the target file is not such a filename.  In
    # this case we prepend an underline
    if os.name == 'nt' and filename:
        stem = filename.split('.')[0]
        if stem.upper() in _windows_device_files:
            filename = '_' + filename

    return filename

Source File: connector.py From dataiku-contrib with Apache License 2.0

6 votes

def _byteify(data, ignore_dicts = False):
    # if this is a unicode string, return its string representation
    if isinstance(data, unicode):
        return unicodedata.normalize('NFKD', data).encode('ascii','ignore')
    # if this is a list of values, return list of byteified values
    if isinstance(data, list):
        return [ _byteify(item, ignore_dicts=True) for item in data ]
    # if this is a dictionary, return dictionary of byteified keys and values
    # but only if we haven't already byteified it
    if isinstance(data, dict) and not ignore_dicts:
        return {
            _byteify(key, ignore_dicts=True): _byteify(value, ignore_dicts=True)
            for key, value in data.iteritems()
        }
    # if it's anything else, return it in its original form
    return data

Source File: ncm2.py From ncm2 with MIT License

6 votes

def strdisplaywidth(self, s):
        """Return the number of display columns occupied by ``s``.

        ``s`` is NFC-normalised first. Each character then counts as two
        columns when its East Asian Width property is Wide ("W") or
        Fullwidth ("F"), and as one column otherwise.

        :param s: unicode string to measure
        :returns: total width in columns
        """
        def get_char_display_width(unicode_str):
            # Wide and Fullwidth characters take two cells. Halfwidth,
            # Narrow, Neutral and Ambiguous characters are counted as one.
            if unicodedata.east_asian_width(unicode_str) in ("F", "W"):
                return 2
            return 1

        s = unicodedata.normalize('NFC', s)
        return sum(get_char_display_width(c) for c in s)

Source File: workflow.py From Quiver-alfred with MIT License

6 votes

def fold_to_ascii(self, text):
        """Convert non-ASCII characters to closest ASCII equivalent.

        Text that ``isascii`` accepts is returned untouched. Otherwise each
        character is looked up in ``ASCII_REPLACEMENTS``, the result is
        NFKD-normalised, and anything still outside ASCII is dropped.

        .. versionadded:: 1.3

        .. note:: This only works for a subset of European languages.

        :param text: text to convert
        :type text: ``unicode``
        :returns: text containing only ASCII characters
        :rtype: ``unicode``

        """
        if isascii(text):
            return text
        replaced = ''.join(ASCII_REPLACEMENTS.get(char, char) for char in text)
        ascii_bytes = unicodedata.normalize('NFKD', replaced).encode(
            'ascii', 'ignore')
        return unicode(ascii_bytes)

Source File: text.py From pyspelling with MIT License

6 votes

def setup(self):
        """Setup."""

        self.normalize = self.config['normalize'].upper()
        self.convert_encoding = self.config['convert_encoding'].lower()
        self.errors = self.config['errors'].lower()

        if self.convert_encoding:
            self.convert_encoding = codecs.lookup(
                filters.PYTHON_ENCODING_NAMES.get(self.default_encoding, self.default_encoding).lower()
            ).name

            # Don't generate content with BOMs
            if (
                self.convert_encoding.startswith(('utf-32', 'utf-16')) and
                not self.convert_encoding.endswith(('le', 'be'))
            ):
                self.convert_encoding += '-le'

            if self.convert_encoding == 'utf-8-sig':
                self.convert_encoding = 'utf-8'

Source File: connector.py From dataiku-contrib with Apache License 2.0

6 votes

def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                            partition_id=None, records_limit = -1):
        """Yield one dictionary per row returned by ``self.list_epics()``.

        Each yielded dictionary carries a ``"query_date"`` entry (the time
        this generator started running) plus every key of the source row,
        converted with ``str``. Unicode values are NFKD-normalised and
        encoded to ASCII, dropping characters that cannot be encoded.

        Iteration stops once ``records_limit`` rows have been produced; a
        negative ``records_limit`` means no limit. The other parameters are
        not used here.
        """
        query_date = datetime.datetime.now()

        rows = self.list_epics()
        if len(rows) == 0:
            logging.info("Not epics.")
            return

        for nb, row in enumerate(rows):
            if 0 <= records_limit <= nb:
                logging.info("Reached records_limit (%i), stopping." % records_limit)
                return

            encoded_row = {"query_date": query_date}
            for key in row:
                val = row[key]
                if isinstance(val, unicode):
                    val = unicodedata.normalize('NFKD', val).encode('ascii', 'ignore')
                encoded_row[str(key)] = val

            yield encoded_row

Source File: util.py From gist-alfred with MIT License

6 votes

def unicodify(s, encoding='utf-8', norm=None):
    """Ensure string is Unicode.

    .. versionadded:: 1.31

    Anything that is not already Unicode is decoded with ``encoding``.
    The result is normalised to form ``norm`` when one is given.

    Args:
        s (str): String to decode. May also be Unicode.
        encoding (str, optional): Encoding to use on bytestrings.
        norm (None, optional): Normalisation form to apply to Unicode string.

    Returns:
        unicode: Decoded, optionally normalised, Unicode string.

    """
    decoded = s if isinstance(s, unicode) else unicode(s, encoding)
    if not norm:
        return decoded

    import unicodedata
    return unicodedata.normalize(norm, decoded)

Source File: utils.py From plugin.video.emby with GNU General Public License v3.0

6 votes

def normalize_string(text):
    ''' For theme media, do not modify unless modified in TV Tunes.

        Drops the characters ``: < > * ? |``, turns both kinds of slash
        into ``-``, trims surrounding whitespace and trailing dots (Windows
        cannot have directories with dots at the end), then decodes the
        UTF-8 input and reduces it to ASCII via NFKD normalisation.
    '''
    substitutions = (
        (":", ""),
        ("/", "-"),
        ("\\", "-"),
        ("<", ""),
        (">", ""),
        ("*", ""),
        ("?", ""),
        ('|', ""),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    text = text.strip().rstrip('.')
    decoded = unicode(text, 'utf-8')
    return unicodedata.normalize('NFKD', decoded).encode('ascii', 'ignore')

Source File: tokenization.py From BERT-for-Chinese-Question-Answering with Apache License 2.0

6 votes

def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # 这个函数去除掉text中的非间距字符

        # 标准化对于任何需要以一致的方式处理Unicode文本的程序都是非常重要的。
        # 当处理来自用户输入的字符串而你很难去控制编码的时候尤其如此。
        # normalize() 将文本标准化,第一个参数指定字符串标准化的方式,NFD表示字符应该分解为多个组合字符表示
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            # category() 返回字符在UNICODE里分类的类型
            cat = unicodedata.category(char)
            if cat == "Mn":
                #  Mark, Nonspacing 指示字符是非间距字符，这指示基字符的修改。
                # https://www.fileformat.info/info/unicode/category/Mn/list.htm
                continue
            output.append(char)
        return "".join(output)

Source File: workflow.py From gist-alfred with MIT License

6 votes

def fold_to_ascii(self, text):
        """Convert non-ASCII characters to closest ASCII equivalent.

        Text that ``isascii`` accepts is returned untouched. Otherwise each
        character is looked up in ``ASCII_REPLACEMENTS``, the result is
        NFKD-normalised, and anything still outside ASCII is dropped.

        .. versionadded:: 1.3

        .. note:: This only works for a subset of European languages.

        :param text: text to convert
        :type text: ``unicode``
        :returns: text containing only ASCII characters
        :rtype: ``unicode``

        """
        if isascii(text):
            return text
        replaced = ''.join(ASCII_REPLACEMENTS.get(char, char) for char in text)
        ascii_bytes = unicodedata.normalize('NFKD', replaced).encode(
            'ascii', 'ignore')
        return unicode(ascii_bytes)

Source File: word2vecReaderUtils.py From word2vec-twitter with MIT License

6 votes

def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring.

    Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')

    # Decompose, drop the nonspacing marks, then recompose what is left.
    decomposed = unicodedata.normalize("NFD", text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return unicodedata.normalize("NFC", u('').join(kept))

Source File: gftools-fix-ascii-fontmetadata.py From gftools with Apache License 2.0

6 votes

def normalizestr(string):
    """ Converts special characters like copyright,
        trademark signs to ascii name.

        Every (mark, replacement) pair yielded by ``unicode_marks`` is
        substituted first; the result is NFKC-normalised, stripped and passed
        through ``unidecode``. A message is printed when the returned string
        differs from the input. """
    original = string
    for mark, ascii_repl in unicode_marks(string):
        string = string.replace(mark, ascii_repl)

    composed = unicodedata.normalize('NFKC', string).strip()
    result = unidecode(composed)
    if result != original:
        print("Fixed string: '{}'".format(result))
    return result

Source File: workflow.py From wechat-alfred-workflow with MIT License

6 votes

def fold_to_ascii(self, text):
        """Convert non-ASCII characters to closest ASCII equivalent.

        Text that ``isascii`` accepts is returned untouched. Otherwise each
        character is looked up in ``ASCII_REPLACEMENTS``, the result is
        NFKD-normalised, and anything still outside ASCII is dropped.

        .. versionadded:: 1.3

        .. note:: This only works for a subset of European languages.

        :param text: text to convert
        :type text: ``unicode``
        :returns: text containing only ASCII characters
        :rtype: ``unicode``

        """
        if isascii(text):
            return text
        replaced = ''.join(ASCII_REPLACEMENTS.get(char, char) for char in text)
        ascii_bytes = unicodedata.normalize('NFKD', replaced).encode(
            'ascii', 'ignore')
        return unicode(ascii_bytes)

Source File: strings.py From recruit with Apache License 2.0

6 votes

def normalize(self, form):
        """
        Return the Unicode normal form for the strings in the Series/Index.
        For more information on the forms, see the
        :func:`unicodedata.normalize`.

        Parameters
        ----------
        form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
            Unicode form

        Returns
        -------
        normalized : Series/Index of objects
        """
        import unicodedata

        def _normalize_one(value):
            # Each value goes through compat.u_safe before normalisation.
            return unicodedata.normalize(form, compat.u_safe(value))

        normalized = _na_map(_normalize_one, self._parent)
        return self._wrap_result(normalized)

Source File: util.py From wechat-alfred-workflow with MIT License

6 votes

def unicodify(s, encoding='utf-8', norm=None):
    """Ensure string is Unicode.

    .. versionadded:: 1.31

    Anything that is not already Unicode is decoded with ``encoding``.
    The result is normalised to form ``norm`` when one is given.

    Args:
        s (str): String to decode. May also be Unicode.
        encoding (str, optional): Encoding to use on bytestrings.
        norm (None, optional): Normalisation form to apply to Unicode string.

    Returns:
        unicode: Decoded, optionally normalised, Unicode string.

    """
    decoded = s if isinstance(s, unicode) else unicode(s, encoding)
    if not norm:
        return decoded

    import unicodedata
    return unicodedata.normalize(norm, decoded)

Source File: names.py From yamdwe with BSD 3-Clause "New" or "Revised" License

6 votes

def clean_id(name, preserve_case=False):
    """
    Return a 'clean' dokuwiki-compliant name. Based on the cleanID() PHP function in inc/pageutils.php

    Ignores both slashes and colons as valid namespace choices (to convert slashes to colons,
    call make_dokuwiki_pagename)

    The extension (as split off by os.path.splitext) is appended unchanged
    apart from lower-casing; only the part before it is stripped of accents.
    """
    main, ext = os.path.splitext(name)

    # Remove accents: decompose, then keep only code points below 0x7f.
    try:
        decomposed = unicodedata.normalize("NFKD", main)
        no_accent = ''.join(c for c in decomposed if ord(c) < 0x7f)
    except TypeError:
        no_accent = main  # name was plaintext to begin with

    # Every run of characters other than word characters, '/', ':' and '-'
    # becomes a single underscore.
    result = re.sub(r'[^\w/:-]+', '_', no_accent) + ext
    if not preserve_case:
        result = result.lower()

    # Underscores are word characters, so the substitution above can leave
    # several in a row (e.g. "a_ b" -> "a__b"); collapse them here.
    return re.sub(r'_{2,}', '_', result)

Source File: clean.py From cleanco with MIT License

5 votes

def remove_accents(t):
    """based on https://stackoverflow.com/a/51230541

    Case-folds ``t``, NFKD-decomposes it, drops nonspacing marks (category
    ``Mn``) and replaces any remaining character that has an entry in
    ``NON_NFKD_MAP``.
    """
    decomposed = unicodedata.normalize('NFKD', t.casefold())
    kept = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            continue
        kept.append(NON_NFKD_MAP[char] if char in NON_NFKD_MAP else char)
    return ''.join(kept)

Source File: clean.py From cleanco with MIT License

5 votes

def normalize_terms(terms):
    """Lazily yield each term with accents removed and punctuation stripped."""
    def _clean(term):
        return strip_punct(remove_accents(term))

    return (_clean(term) for term in terms)

Source File: terminalwriter.py From py with MIT License

5 votes

def get_line_width(text):
    """Return the display width of ``text`` after NFC normalisation.

    Each character contributes the ``char_width`` entry for its East Asian
    Width class, or 1 when the class has no entry.
    """
    normalized = unicodedata.normalize('NFC', text)
    total = 0
    for char in normalized:
        total += char_width.get(unicodedata.east_asian_width(char), 1)
    return total


# XXX unify with _escaped func below

Source File: common.py From vulscan with MIT License

5 votes

def normalizeUnicode(value):
    """
    Does an ASCII normalization of unicode strings
    Reference: http://www.peterbe.com/plog/unicode-to-ascii

    Values that are not unicode strings are returned unchanged.

    >>> normalizeUnicode(u'\u0161u\u0107uraj')
    'sucuraj'
    """

    if not isinstance(value, unicode):
        return value
    return unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')

Source File: pipedriveapi.py From dataiku-contrib with Apache License 2.0

5 votes

def slugify(s, lower=True):
    """
    Creates a slug (ascii) for a given unicode string.
    If the unidecode package is available, an ascii transliteration is done.

    The string is NFD-decomposed and stripped of nonspacing marks; every
    character outside ``A-Za-z0-9_-`` then becomes an underscore, runs of
    underscores are collapsed, and the result is lower-cased unless
    ``lower`` is false.
    """
    decomposed = unicodedata.normalize("NFD", s)
    cleaned = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

    # Transliterate first when unidecode is present, otherwise slug the
    # accent-stripped text directly.
    source = unidecode(cleaned) if unidecode_available else cleaned
    slugified_ascii = re.sub(r"[^A-Za-z0-9_-]", '_', source)
    slugified_ascii = re.sub(r"_{2,}", '_', slugified_ascii)

    # A unicode slug could be built instead by keeping every character of
    # ``cleaned`` whose category starts with "L" or "N" and replacing the
    # others with "_".

    return slugified_ascii.lower() if lower else slugified_ascii

Source File: workflow.py From Quiver-alfred with MIT License

5 votes

def decode(self, text, encoding=None, normalization=None):
        """Return ``text`` as normalised unicode.

        When ``encoding`` or ``normalization`` is not given, the values
        stored on the instance (``self._input_encoding`` and
        ``self._normalizsation``) are used instead.

        :param text: string
        :type text: encoded or Unicode string. If ``text`` is already a
            Unicode string, it will only be normalised.
        :param encoding: The text encoding to use to decode ``text`` to
            Unicode.
        :type encoding: ``unicode`` or ``None``
        :param normalization: The normalisation form to apply to ``text``.
        :type normalization: ``unicode`` or ``None``
        :returns: decoded and normalised ``unicode``

        :class:`Workflow` uses "NFC" normalisation by default. This is the
        standard for Python and will work well with data from the web (via
        :mod:`~workflow.web` or :mod:`json`).

        OS X, on the other hand, uses "NFD" normalisation (nearly), so data
        coming from the system (e.g. via :mod:`subprocess` or
        :func:`os.listdir`/:mod:`os.path`) may not match. You should either
        normalise this data, too, or change the default normalisation used by
        :class:`Workflow`.

        """
        if not encoding:
            encoding = self._input_encoding
        if not normalization:
            normalization = self._normalizsation

        decoded = text if isinstance(text, unicode) else unicode(text, encoding)
        return unicodedata.normalize(normalization, decoded)

Source File: web.py From Quiver-alfred with MIT License

5 votes

def text(self):
        """Unicode-decoded content of response body.

        If no encoding can be determined from HTTP headers or the content
        itself, the encoded response body will be returned instead.
        Decoded text is normalised to NFC.

        :returns: Body of HTTP response
        :rtype: :class:`unicode` or :class:`str`

        """
        if not self.encoding:
            return self.content
        decoded = unicode(self.content, self.encoding)
        return unicodedata.normalize('NFC', decoded)

Source File: notify.py From Quiver-alfred with MIT License

5 votes

def uni(s):
        """Decode UTF-8 bytes `s` and return the text in NFD form."""
        return normalize('NFD', s.decode('utf-8'))

Source File: core.py From core with MIT License

5 votes

def uts46_remap(domain, std3_rules=True, transitional=False):
    """Re-map the characters in the string according to UTS46 processing.

    Each character of ``domain`` is looked up in ``uts46data`` and, depending
    on the row's status code and the two flags, is kept, replaced by the
    row's replacement string, or dropped. The assembled string is returned
    in NFC form.

    :param domain: string to re-map
    :param std3_rules: affects how rows with status "3" are handled
    :param transitional: affects how rows with status "D" are handled
    :returns: the re-mapped, NFC-normalised string
    :raises InvalidCodepoint: when a character is not accepted by any of the
        branches below, or when the table lookup raises ``IndexError``
    """
    from .uts46data import uts46data
    output = u""
    try:
        for pos, char in enumerate(domain):
            code_point = ord(char)
            # Code points below 256 index the table directly; for the rest
            # the row is located with a bisect search (the entry just before
            # the insertion point of (code_point, "Z")).
            uts46row = uts46data[code_point if code_point < 256 else
                bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
            status = uts46row[1]
            # The replacement is only present on rows of length 3.
            replacement = uts46row[2] if len(uts46row) == 3 else None
            # NOTE(review): status letters presumably follow the UTS #46
            # mapping table (valid / mapped / deviation / ignored /
            # disallowed_STD3) - confirm against uts46data.
            # Keep the character as-is.
            if (status == "V" or
                    (status == "D" and not transitional) or
                    (status == "3" and std3_rules and replacement is None)):
                output += char
            # Substitute the row's replacement string.
            elif replacement is not None and (status == "M" or
                    (status == "3" and std3_rules) or
                    (status == "D" and transitional)):
                output += replacement
            # Status "I" falls through and contributes nothing to the output;
            # anything else is rejected via the IndexError handler below.
            elif status != "I":
                raise IndexError()
        return unicodedata.normalize("NFC", output)
    except IndexError:
        # Reports the offending code point with its 1-based position.
        raise InvalidCodepoint(
            "Codepoint {0} not allowed at position {1} in {2}".format(
            _unot(code_point), pos + 1, repr(domain)))

Source File: core.py From core with MIT License

5 votes

def check_nfc(label):
    """Raise ``IDNAError`` unless ``label`` is already in NFC form."""
    normalized = unicodedata.normalize('NFC', label)
    if normalized == label:
        return
    raise IDNAError('Label must be in Normalization Form C')

Source File: helpers.py From gitlab-tools with GNU General Public License v3.0

5 votes

def slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.

    The value is NFKD-decomposed and reduced to ASCII first, so accented
    letters keep their base letter. Characters other than word characters,
    whitespace and hyphens are removed, and every run of whitespace and
    hyphens becomes a single hyphen.

    :param value: text to slugify
    :returns: the slug as text
    """
    ascii_bytes = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
    # Decode back to text: the str patterns below cannot be applied to bytes.
    value = ascii_bytes.decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)

Source File: sutils.py From plugin.video.sosac.ph with GNU General Public License v2.0

5 votes

def encode(string):
        """Decode UTF-8 ``string`` and return it reduced to ASCII bytes.

        The decoded text is NFKD-normalised and characters that cannot be
        encoded as ASCII are dropped.
        """
        decoded = string.decode('utf-8')
        return unicodedata.normalize('NFKD', decoded).encode('ascii', 'ignore')

Python unicodedata.normalize() Examples