Python remove accents

Source File: linkedin2username.py From linkedin2username with MIT License

10 votes

def remove_accents(raw_text):
    """Swap common lowercase accented characters for plain English letters.

    The aim is to brute force login mechanisms, mostly against
    English-language systems, where user accounts tend to be created
    without accented characters. Each accented letter handled here is
    therefore replaced by its closest standard English spelling.

    Only the lowercase characters listed in the table below are replaced;
    every other character is returned unchanged. Note that the sharp s
    expands to the two letters ``ss``.
    """
    # Each entry pairs a character class of accented forms with its
    # plain replacement.
    substitutions = (
        (u"[àáâãäå]", 'a'),
        (u"[èéêë]", 'e'),
        (u"[ìíîï]", 'i'),
        (u"[òóôõö]", 'o'),
        (u"[ùúûü]", 'u'),
        (u"[ýÿ]", 'y'),
        (u"[ß]", 'ss'),
        (u"[ñ]", 'n'),
    )

    cleaned = raw_text
    for accented_class, plain in substitutions:
        cleaned = re.sub(accented_class, plain, cleaned)
    return cleaned

Source File: util.py From urduhack with MIT License

7 votes

def remove_accents(text: str) -> str:
    """
    Drop every Unicode combining character (diacritical mark) from ``text``.

    The text is not normalized first, so only marks that are already stored
    as separate combining code points are removed; all other characters are
    kept in their original order.

    Args:
        text (str): Urdu text
    Returns:
        str: ``text`` without its combining characters
    Examples:
        >>> from urduhack.preprocessing import remove_accents
        >>> text = "دالتِ عظمیٰ درخواست"
        >>> remove_accents(text)
        'دالت عظمی درخواست'
    """
    kept = []
    for char in text:
        # A non-zero combining class marks a diacritic; skip it.
        if unicodedata.combining(char):
            continue
        kept.append(char)
    return ''.join(kept)

Source File: base.py From pyfiscal with MIT License

6 votes

def remove_accents(self, text):
    """Normalise unicode data to remove umlauts, accents and other non-ASCII characters.

    Rule 10 - When special characters appear as part of the name, paternal
    surname and maternal surname, they must be excluded for the calculation
    of the homonym and the verification digit. The characters will be
    interpreted if, and only if, they stand individually within the name,
    paternal surname and maternal surname. Examples:

        Roberto O’farril Carballo OACR-661121
        Rubén D’angelo Fargo DAFR-710108
        Luz Ma. Fernández Juárez FEJL-830120

    Args:
        text: The text to clean, either a text string or UTF-8 encoded
            bytes.

    Returns:
        ``text`` as a ``str`` in which accented letters are reduced to
        their base letter and any remaining non-ASCII character is dropped.

    Raises:
        UnicodeDecodeError: If ``text`` is bytes that are not valid UTF-8.
    """
    # Byte strings must be decoded before they can be normalised. On
    # Python 2 ``bytes`` is ``str``, so this also covers native strings.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # NFD splits each accented letter into base letter + combining mark;
    # the ASCII round trip then discards the marks and any other
    # character that has no ASCII representation.
    decomposed = unicodedata.normalize('NFD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    return str(ascii_only)

Source File: text_manipulation.py From text-mining-class with MIT License

6 votes

def remove_accents(text):
    """Return ``text`` with accented characters replaced by their base form.

    The text is first decomposed with the NFKD unicode normalization scheme,
    which splits an accented character into its base character followed by
    combining characters (also known as diacritical marks). The combining
    characters are then filtered out of the resulting sequence.

    Comments: this is a very naive implementation that only uses basic
    operations on the sequence of unicode characters.

    A more efficient approach that works only for languages that use the
    latin alphabet would use batch conversion to ASCII characters as done in:

        sklearn.feature_extraction.text.strip_accents_ascii

    """
    decomposed = unicodedata.normalize('NFKD', text)
    # Base characters have a combining class of zero.
    base_chars = (char for char in decomposed if unicodedata.combining(char) == 0)
    return "".join(base_chars)

Source File: tv_grab_fetch.py From tvgrabpyAPI with GNU General Public License v3.0

6 votes

def remove_accents(self, name):
        """Replace a fixed set of lowercase accented vowels with plain letters.

        Every character listed below is substituted by the plain letter of
        its group; the ``@`` sign is likewise turned into ``a``. Characters
        that are not listed are left untouched.
        """
        # Plain letter -> the exact patterns that are rewritten to it.
        plain_to_accented = (
            ('a', ('á', 'à', 'ä', 'â', 'ã', '@')),
            ('e', ('é', 'è', 'ë', 'ê')),
            ('i', ('í', 'ì', 'ï', 'î')),
            ('o', ('ó', 'ò', 'ö', 'ô', 'õ')),
            ('u', ('ú', 'ù', 'ü', 'û')),
            ('y', ('ý', 'ÿ')),
        )
        for plain, accented_forms in plain_to_accented:
            for accented in accented_forms:
                name = re.sub(accented, plain, name)
        return name
    # end remove_accents()

Source File: compat.py From razzy-spinner with GNU General Public License v3.0

5 votes

def remove_accents(text):
    """Return ``text`` without its non-spacing marks (accents).

    Byte strings are decoded as ASCII first. The text is then decomposed
    with NFKD normalization and every character whose unicode category is
    ``Mn`` (Mark, nonspacing) is dropped.
    """
    if isinstance(text, bytes):
        text = text.decode('ascii')

    decomposed = unicodedata.normalize('NFKD', text)
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)

# Select the best transliteration method:

Source File: clean.py From cleanco with MIT License

5 votes

def remove_accents(t):
    """Case-fold ``t`` and strip its accents.

    Based on https://stackoverflow.com/a/51230541

    The case-folded text is decomposed with NFKD normalization and every
    character of unicode category ``Mn`` (Mark, nonspacing) is dropped.
    Each remaining character that is a key of ``NON_NFKD_MAP`` is replaced
    by the mapped value; all others are kept as they are.
    """
    # NOTE(review): NON_NFKD_MAP is defined elsewhere in the module --
    # presumably it covers characters NFKD cannot decompose; confirm there.
    decomposed = unicodedata.normalize('NFKD', t.casefold())
    pieces = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            continue
        pieces.append(NON_NFKD_MAP[char] if char in NON_NFKD_MAP else char)
    return ''.join(pieces)

Source File: plugin.py From limnoria-plugins with Do What The F*ck You Want To Public License

5 votes

def removeAccents(self, text):
        """Return ``str(text)`` with its combining characters removed.

        The value is converted to a string, decomposed with NFKD
        normalization, and every combining character (accent mark) in the
        decomposed form is dropped.
        """
        decomposed = unicodedata.normalize("NFKD", str(text))
        return "".join(ch for ch in decomposed if unicodedata.combining(ch) == 0)

Source File: normalize.py From script.elementum.burst with Do What The F*ck You Want To Public License

5 votes

def remove_accents(string):
    """
        Strip the accents from a string by reducing it to ASCII

        Input that is not ``unicode`` is first passed through
        ``normalize_string``. The text is then NFKD-normalized, encoded to
        ASCII with unencodable characters ignored, and stripped of
        surrounding whitespace.
    :param string: string to remove accents
    :type string: str or unicode
    :return: the ASCII-encoded result, or the (converted) input text when
        the ASCII result compares equal to the empty string
    """
    # NOTE(review): normalize_string is defined elsewhere -- presumably it
    # converts the input to unicode; confirm against its definition.
    text = string if isinstance(string, unicode) else normalize_string(string)

    ascii_form = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').strip()
    if ascii_form == u'':
        # Nothing survived the ASCII conversion; hand back the text itself.
        return text
    return ascii_form

Source File: simpletokenizer.py From Poetry-Tools with MIT License

5 votes

def remove_accents(string):
    """
    Downgrades accented characters in a string to their base character by
    decomposing it (NFKD) and discarding the unicode combining characters
    """

    base_chars = []
    for char in unicodedata.normalize('NFKD', string):
        if not unicodedata.combining(char):
            base_chars.append(char)
    return u"".join(base_chars)

Source File: search_util.py From python-compat-runtime with Apache License 2.0

5 votes

def RemoveAccents(text):
  """Removes the combining characters already present in a string.

  Values that are not strings are returned unchanged. Byte strings are
  decoded as UTF-8 first. The text is not normalized, so accents that are
  part of a precomposed character are left in place.
  """
  if isinstance(text, basestring):
    if isinstance(text, str):
      text = text.decode('utf-8')
    return u''.join(c for c in text if unicodedata.combining(c) == 0)
  return text

Source File: search_util.py From python-compat-runtime with Apache License 2.0

5 votes

def RemoveAccentsNfkd(text):
  """Removes accents from a string after NFKD normalization.

  Values that are not strings are returned unchanged. Byte strings are
  decoded as UTF-8 first. The text is decomposed with NFKD and the
  resulting combining characters are dropped.
  """
  if not isinstance(text, basestring):
    return text
  unicode_text = text.decode('utf-8') if isinstance(text, str) else text
  decomposed = unicodedata.normalize('NFKD', unicode_text)
  base_chars = [c for c in decomposed if unicodedata.combining(c) == 0]
  return u''.join(base_chars)

Source File: compat.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International

4 votes

def remove_accents(text):
    """Strip non-spacing marks (accents) from ``text``.

    Byte strings are decoded as ASCII before processing. The text is
    NFKD-normalized and only the characters whose unicode category is not
    ``Mn`` (Mark, nonspacing) are kept.
    """
    if isinstance(text, bytes):
        text = text.decode('ascii')

    def is_kept(char):
        return unicodedata.category(char) != 'Mn'

    return ''.join(filter(is_kept, unicodedata.normalize('NFKD', text)))


# Select the best transliteration method: