Python unicodedata.category() Examples

The following are 30 code examples of unicodedata.category(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module unicodedata, or try the search function.
Example #1
Source File: tokenizer.py    From keras-bert with MIT License 8 votes vote down vote up
def _tokenize(self, text):
        """Split `text` into WordPiece tokens.

        For uncased models the text is first accent-stripped (NFD
        decomposition, dropping Mn combining marks) and lowercased.  The
        text is then re-spaced so that punctuation and CJK characters
        become isolated tokens, whitespace collapses, and NUL / U+FFFD /
        control characters are discarded, before WordPiece runs per word.
        """
        if not self._cased:
            # NFD splits accents into combining marks (category Mn); drop them.
            decomposed = unicodedata.normalize('NFD', text)
            text = ''.join(ch for ch in decomposed
                           if unicodedata.category(ch) != 'Mn').lower()
        pieces = []
        for ch in text:
            if self._is_punctuation(ch) or self._is_cjk_character(ch):
                # Surround with spaces so each becomes its own word.
                pieces.append(' ' + ch + ' ')
            elif self._is_space(ch):
                pieces.append(' ')
            elif ord(ch) == 0 or ord(ch) == 0xfffd or self._is_control(ch):
                # Silently drop NUL, the replacement char and controls.
                continue
            else:
                pieces.append(ch)
        tokens = []
        for word in ''.join(pieces).strip().split():
            tokens.extend(self._word_piece_tokenize(word))
        return tokens
Example #2
Source File: bert_wrapper.py    From UDPipe-Future with Mozilla Public License 2.0 6 votes vote down vote up
def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False

### modeling.py 
Example #3
Source File: tokenization.py    From UDA_pytorch with Apache License 2.0 6 votes vote down vote up
def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""

        korean = "%s-%s%s-%s" % (chr(0xac00), chr(0xd7a3),
                                chr(0x3131), chr(0x3163))
        if re.search("[%s]+" % korean, text):
            return "".join(
                substr if re.search("^[%s]+$" % korean, substr)
                else self._run_strip_accents(substr)
                for substr in re.findall("[%s]+|[^%s]+" % (korean, korean), text)
            )

        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output) 
Example #4
Source File: DipperUtil.py    From dipper with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_hgnc_id_from_symbol(gene_symbol):
        """
        Get HGNC curie from symbol using monarch and mygene services
        :param gene_symbol:
        :return:
        """
        monarch_url = 'https://solr.monarchinitiative.org/solr/search/select'
        # Query restricted to human (NCBITaxon:9606) gene documents.
        params = DipperUtil._get_solr_weight_settings()
        params["q"] = "{0} \"{0}\"".format(gene_symbol)
        params["fq"] = ["taxon:\"NCBITaxon:9606\"", "category:\"gene\""]
        gene_id = None
        try:
            payload = requests.get(monarch_url, params=params).json()
            # Take the top-ranked document when anything was found.
            if payload['response']['numFound'] > 0:
                gene_id = payload['response']['docs'][0]['id']
        except requests.ConnectionError:
            # Best effort: report the failure and fall through to None.
            print("error fetching {0}".format(monarch_url))

        return gene_id
Example #5
Source File: tokenization.py    From BERT-for-Chinese-Question-Answering with Apache License 2.0 6 votes vote down vote up
def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # 这个函数去除掉text中的非间距字符

        # 标准化对于任何需要以一致的方式处理Unicode文本的程序都是非常重要的。
        # 当处理来自用户输入的字符串而你很难去控制编码的时候尤其如此。
        # normalize() 将文本标准化,第一个参数指定字符串标准化的方式,NFD表示字符应该分解为多个组合字符表示
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            # category() 返回字符在UNICODE里分类的类型
            cat = unicodedata.category(char)
            if cat == "Mn":
                #  Mark, Nonspacing 指示字符是非间距字符,这指示基字符的修改。
                # https://www.fileformat.info/info/unicode/category/Mn/list.htm
                continue
            output.append(char)
        return "".join(output) 
Example #6
Source File: base.py    From uxy with MIT License 6 votes vote down vote up
def encode_field(s):
  """Encode string `s` as a field, quoting and escaping when needed."""
  # Empty string converts to "".
  if s == "":
    return '""  '
  # The field must be quoted when it contains a quote, a space, or any
  # Unicode control character (general category C*).
  needs_quoting = '"' in s or ' ' in s or any(
      unicodedata.category(c)[0] == "C" for c in s)
  if not needs_quoting:
    return s
  # Build the quoted form, substituting known escape sequences.
  parts = ['"']
  for c in s:
    parts.append(ESCAPE_SEQUENCES2[c] if c in ESCAPE_SEQUENCES2 else c)
  parts.append('"')
  return ''.join(parts)
Example #7
Source File: gftools-fix-ascii-fontmetadata.py    From gftools with Apache License 2.0 6 votes vote down vote up
def normalizestr(string):
    """ Converts special characters like copyright,
        trademark signs to ascii name """
    original = string
    # First swap known unicode marks (copyright, trademark, ...) for
    # their ASCII replacements.
    for mark, ascii_repl in unicode_marks(string):
        string = string.replace(mark, ascii_repl)

    # NFKC-normalize, trim, then transliterate whatever non-ASCII remains.
    new = ''.join(c for c in unicodedata.normalize('NFKC', string)).strip()
    result = unidecode(new)
    if result != original:
        print("Fixed string: '{}'".format(result))
    return result
Example #8
Source File: word2vecReaderUtils.py    From word2vec-twitter with MIT License 6 votes vote down vote up
def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring.

    Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    if not isinstance(text, unicode):
        # Byte strings are assumed to be UTF-8, with strict error handling.
        text = text.decode('utf8')
    # Decompose, drop the nonspacing marks, then recompose to NFC.
    decomposed = unicodedata.normalize("NFD", text)
    stripped = u('').join(ch for ch in decomposed
                          if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", stripped)
Example #9
Source File: base.py    From pyfiscal with MIT License 6 votes vote down vote up
def remove_accents(self, text):
    """ Normalise (normalize) unicode data in Python to remove umlauts, accents etc.

    Rule 10 - When special characters appear as part of the name, paternal surname and maternal surname,
    they must be excluded for the calculation of the homonym and the verification digit.
    The characters will be interpreted, yes and only if, they are individually within the name,
    paternal surname and maternal surname. Examples:

    Roberto O’farril Carballo OACR-661121
    Rubén D’angelo Fargo DAFR-710108
    Luz Ma. Fernández Juárez FEJL-830120
    """
    try:
        # Python 2 only: coerce byte strings to unicode.
        text = unicode(text, 'utf-8')
    except (TypeError, NameError):  # unicode is a default on python 3
        pass
    # Decompose accented characters, then strip everything non-ASCII.
    decomposed = unicodedata.normalize('NFD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode("utf-8")
    return str(ascii_only)
Example #10
Source File: strings_utils.py    From ludwig with Apache License 2.0 5 votes vote down vote up
def strip_accents(s):
    """Return `s` with accents removed (NFD-decompose, drop Mn marks)."""
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(c for c in decomposed
                   if unicodedata.category(c) != 'Mn')
Example #11
Source File: base.py    From uxy with MIT License 5 votes vote down vote up
def decode_field(s):
  """Decode a field: sanitize control chars, then unquote and unescape."""
  # Replace control characters by question marks.
  s = "".join((c if unicodedata.category(c)[0] != "C" else '?') for c in s)
  if not (s.startswith('"') and s.endswith('"')):
    return s
  # Quoted field: strip the surrounding quotes, expand escape sequences.
  inner = s[1:-1]
  result = ""
  i = 0
  length = len(inner)
  while i < length:
    c = inner[i]
    if c != '\\':
      result += c
      i += 1
      continue
    # Backslash at end of field, or followed by an unknown escape char,
    # decodes to a question mark.
    if i + 1 >= length:
      result += "?"
      i += 1
      continue
    if inner[i + 1] not in ESCAPE_SEQUENCES1:
      result += "?"
      i += 2
      continue
    result += ESCAPE_SEQUENCES1[inner[i + 1]]
    i += 2
  return result


# Convert arbitrary string into a uxy field.
Example #12
Source File: tokenization.py    From crosentgec with GNU General Public License v3.0 5 votes vote down vote up
def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False 
Example #13
Source File: tokenization.py    From HGL-pytorch with MIT License 5 votes vote down vote up
def _is_punctuation(char):
  """Checks whether `chars` is a punctuation character."""
  cp = ord(char)
  # We treat all non-letter/number ASCII as punctuation.
  # Characters such as "^", "$", and "`" are not in the Unicode
  # Punctuation class but we treat them as punctuation anyways, for
  # consistency.
  if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
      (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
    return True
  cat = unicodedata.category(char)
  if cat.startswith("P"):
    return True
  return False 
Example #14
Source File: tokenization.py    From HGL-pytorch with MIT License 5 votes vote down vote up
def _is_control(char):
  """Checks whether `chars` is a control character."""
  # These are technically control characters but we count them as whitespace
  # characters.
  if char == "\t" or char == "\n" or char == "\r":
    return False
  cat = unicodedata.category(char)
  if cat.startswith("C"):
    return True
  return False 
Example #15
Source File: wordembedding_utils.py    From FARM with Apache License 2.0 5 votes vote down vote up
def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False 
Example #16
Source File: tokenization.py    From crosentgec with GNU General Public License v3.0 5 votes vote down vote up
def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False 
Example #17
Source File: card.py    From Penny-Dreadful-Tools with GNU General Public License v3.0 5 votes vote down vote up
def unaccent(s: str) -> str:
    """Strip accents: NFD-decompose `s` and drop nonspacing (Mn) marks."""
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(filter(lambda c: unicodedata.category(c) != 'Mn',
                          decomposed))
Example #18
Source File: Utils.py    From qgis-cartodb with GNU General Public License v2.0 5 votes vote down vote up
def stripAccents(text):
    """Strips accent to text"""
    # NFD decomposition separates accents out as Mn combining marks.
    kept = [c for c in unicodedata.normalize('NFD', text)
            if unicodedata.category(c) != 'Mn']
    return ''.join(kept)
Example #19
Source File: pipedriveapi.py    From dataiku-contrib with Apache License 2.0 5 votes vote down vote up
def slugify(s, lower=True):
    """
    Creates a slug (ascii) for a given unicode string.
    If the unidecode package is available, an ascii transliteration is done.
    """
    # Strip combining marks after NFD decomposition (removes accents).
    decomposed = unicodedata.normalize("NFD", s)
    cleaned = ''.join(c for c in decomposed
                      if unicodedata.category(c) != 'Mn')

    # Replace everything outside [A-Za-z0-9_-] with underscores,
    # transliterating to ASCII first when unidecode is available.
    source = unidecode(cleaned) if unidecode_available else cleaned
    slugified_ascii = re.sub(r"[^A-Za-z0-9_-]", '_', source)

    # Collapse runs of underscores into a single one.
    slugified_ascii = re.sub(r"_{2,}", '_', slugified_ascii)

    if lower:
        slugified_ascii = slugified_ascii.lower()

    ### If you prefer to work with a unicode slug, keep only characters
    # whose category starts with "L" or "N" and replace the rest with "_".

    return slugified_ascii
Example #20
Source File: clean.py    From cleanco with MIT License 5 votes vote down vote up
def remove_accents(t):
    """based on https://stackoverflow.com/a/51230541"""
    # Casefold, NFKD-decompose, then drop nonspacing marks; characters
    # that NFKD cannot decompose are mapped through NON_NFKD_MAP.
    nfkd_form = unicodedata.normalize('NFKD', t.casefold())
    chars = []
    for part in nfkd_form:
        if unicodedata.category(part) == 'Mn':
            continue
        for c in part:
            chars.append(NON_NFKD_MAP[c] if c in NON_NFKD_MAP else c)
    return ''.join(chars)
Example #21
Source File: tokenization.py    From BERT-for-Chinese-Question-Answering with Apache License 2.0 5 votes vote down vote up
def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False 
Example #22
Source File: tokenization.py    From BERT-for-Chinese-Question-Answering with Apache License 2.0 5 votes vote down vote up
def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False 
Example #23
Source File: tokenization.py    From BERT-for-Chinese-Question-Answering with Apache License 2.0 5 votes vote down vote up
def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False 
Example #24
Source File: core.py    From ServerlessCrawler-VancouverRealState with MIT License 5 votes vote down vote up
def check_initial_combiner(label):
    """Raise IDNAError if `label` starts with a combining mark (M*)."""
    if unicodedata.category(label[0]).startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
Example #25
Source File: core.py    From ServerlessCrawler-VancouverRealState with MIT License 5 votes vote down vote up
def check_initial_combiner(label):
    """Return True unless `label` starts with a combining character."""
    first_cat = unicodedata.category(label[0])
    if first_cat[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True
Example #26
Source File: core.py    From ServerlessCrawler-VancouverRealState with MIT License 5 votes vote down vote up
def check_initial_combiner(label):
    """Raise IDNAError when the label's first char is in a Mark category."""
    initial = label[0]
    if unicodedata.category(initial)[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True
Example #27
Source File: bert_wrapper.py    From UDPipe-Future with Mozilla Public License 2.0 5 votes vote down vote up
def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False 
Example #28
Source File: bert_wrapper.py    From UDPipe-Future with Mozilla Public License 2.0 5 votes vote down vote up
def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False 
Example #29
Source File: bert_wrapper.py    From UDPipe-Future with Mozilla Public License 2.0 5 votes vote down vote up
def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output) 
Example #30
Source File: core.py    From core with MIT License 5 votes vote down vote up
def check_initial_combiner(label):
    """Validate that the label does not begin with a combining mark."""
    category_of_first = unicodedata.category(label[0])
    if category_of_first.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True