Python unidecode.unidecode() Examples

The following are 30 code examples of unidecode.unidecode(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module unidecode, or try the search function.
Example #1
Source File: models.py    From arguman.org with GNU Affero General Public License v3.0 6 votes vote down vote up
def save(self, *args, **kwargs):
        """
        Persist the contention, generating a unique slug on first save.

        Pass ``skip_date_update=True`` as a keyword to leave
        ``date_modification`` untouched.
        """
        if not self.slug:
            base = slugify(unidecode(self.title))
            taken = Contention.objects.filter(slug=base).exists()
            # Disambiguate with a random hex suffix when the slug is taken.
            self.slug = "%s-%s" % (base, uuid4().hex) if taken else base

        skip_date = kwargs.pop('skip_date_update', False)
        if not skip_date:
            self.date_modification = datetime.now()

        return super(Contention, self).save(*args, **kwargs)
Example #2
Source File: utils.py    From dissemin with GNU Affero General Public License v3.0 6 votes vote down vote up
def remove_diacritics(s):
    """
    Strip diacritics from a string via the `unidecode` package.

    Bytes inputs (and any non-``str`` value) are returned untouched;
    only ``str`` values are transliterated.

    >>> remove_diacritics('aéèï')
    'aeei'
    >>> remove_diacritics('aéè'.encode('utf-8'))
    b'a\\xc3\\xa9\\xc3\\xa8'
    """
    if not isinstance(s, str):
        # Bytes pass through unchanged.
        return s
    # Normalize the typographic apostrophe first (see issue #305);
    # unidecode alone does not handle it the way we want.
    return unidecode(s.replace("’", "'"))
Example #3
Source File: make_video_analysis.py    From edx2bigquery with GNU General Public License v2.0 6 votes vote down vote up
def findVideoLength(dataset, youtube_id, api_key=None):
    '''
    Look up the duration (in seconds) of a YouTube video.

    dataset: identifier included only in the failure message for context
    youtube_id: YouTube video id (coerced to ASCII via unidecode first)
    api_key: optional API key forwarded to the stats helper
    Returns the length in seconds, or 0 on any failure.
    NOTE: Python 2 code (print statements).
    '''
    try:
        # Coerce the id to ASCII; bail out with 0 if that fails.
        youtube_id = unidecode(youtube_id)
    except Exception as err:
        print "youtube_id is not ascii?  ytid=", youtube_id
        return 0
    try:
        assert youtube_id is not None, "[analyze videos] youtube id does not exist"
        # YOUTUBE_PARTS selects which metadata sections the API returns.
        content, stats = get_youtube_api_stats(youtube_id=youtube_id, api_key=api_key, part=YOUTUBE_PARTS)
        # ISO-8601 duration string (e.g. "PT1H2M3S") -> dict -> total seconds.
        durationDict = parseISOduration(content['duration'].encode("ascii","ignore"))
        length = getTotalTimeSecs(durationDict)
        print "[analyze videos] totalTime for youtube video %s is %s sec" % (youtube_id, length)
    # NOTE(review): Exception already covers AssertionError, so the
    # tuple is redundant; any lookup/parsing failure yields length 0.
    except (AssertionError, Exception) as err:
        print "Failed to lookup video length for %s!  Error=%s, data=%s" % (youtube_id, err, dataset)
        length = 0
    return length

#----------------------------------------------------------------------------- 
Example #4
Source File: invoice_template.py    From invoice2data with MIT License 6 votes vote down vote up
def prepare_input(self, extracted_str):
        """
        Apply the template's configured text transformations to the raw
        extracted string and return the optimized result.
        """
        optimized_str = extracted_str

        # Collapse runs of spaces when requested.
        if self.options["remove_whitespace"]:
            optimized_str = re.sub(" +", "", optimized_str)

        # Transliterate accented characters to plain ASCII.
        if self.options["remove_accents"]:
            optimized_str = unidecode(optimized_str)

        # Case-fold to lowercase.
        if self.options["lowercase"]:
            optimized_str = optimized_str.lower()

        # Apply the template's (search, replacement) pairs in order.
        for pair in self.options["replace"]:
            assert len(pair) == 2, "A replace should be a list of 2 items"
            optimized_str = optimized_str.replace(pair[0], pair[1])

        return optimized_str
Example #5
Source File: downloadRecipes.py    From extract_recipe with Apache License 2.0 6 votes vote down vote up
def get_url_markdown(baseurl,start,increment):
  # Fetch `baseurl` and convert the HTML payload to plain-ASCII markdown.
  # `start`/`increment` are only used by the disabled wget fallback kept
  # below as a string block.  NOTE: Python 2 code (urllib2, unicode).
  '''
  opener = urllib2.build_opener()
  opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
  try:
    j = opener.open(baseurl)
  except:
    return None
  data = j.read()
  '''
  # Plain urlopen; alternative fetch strategies are kept (disabled) in
  # the surrounding triple-quoted blocks.
  urlHandler = urllib2.urlopen(baseurl)
  data = urlHandler.read()
  '''
  os.system('wget -O temp' + str(start)+"_"+str(increment) + ' ' + baseurl)
  data = open('temp' + str(start)+"_"+str(increment),'rU').read()
  '''
  # html2text: drop links/images; huge body_width disables line wrapping.
  h = html2text.HTML2Text()
  h.ignore_links = True
  h.ignore_images = True
  h.body_width = 10000
  # Decode bytes (ignoring errors), ASCII-fold, convert to markdown,
  # then ASCII-fold the result again.
  data = h.handle(unidecode(unicode(data,errors='ignore')))
  return unidecode(data)
Example #6
Source File: gftools-fix-ascii-fontmetadata.py    From gftools with Apache License 2.0 6 votes vote down vote up
def normalizestr(string):
    """Replace special marks (copyright, trademark, ...) with their ASCII
    names, NFKC-normalize, and transliterate the result to plain ASCII.

    Prints a notice whenever the output differs from the input.
    """
    original = string
    # Swap each known unicode mark for its ASCII replacement text.
    for mark, ascii_repl in unicode_marks(string):
        string = string.replace(mark, ascii_repl)

    # Normalize, trim surrounding whitespace, then ASCII-fold.
    normalized = unicodedata.normalize('NFKC', string).strip()
    result = unidecode(normalized)

    if result != original:
        print("Fixed string: '{}'".format(result))
    return result
Example #7
Source File: MdownloadRecipes.py    From extract_recipe with Apache License 2.0 6 votes vote down vote up
def get_url_markdown(baseurl,start,increment):
  # Like the sibling downloader, but wraps the whole fetch/convert in a
  # catch-all so any failure returns None instead of raising.
  # NOTE: Python 2 code (urllib2, unicode builtin).
  try:
    '''
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
    try:
      j = opener.open(baseurl)
    except:
      return None
    data = j.read()
    '''
    # Plain urlopen; alternative fetch strategies kept (disabled) in the
    # surrounding triple-quoted blocks.
    urlHandler = urllib2.urlopen(baseurl)
    data = urlHandler.read()
    '''
    os.system('wget -O temp' + str(start)+"_"+str(increment) + ' ' + baseurl)
    data = open('temp' + str(start)+"_"+str(increment),'rU').read()
    '''
    # html2text: drop links/images; huge body_width disables wrapping.
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.body_width = 10000
    data = h.handle(unidecode(unicode(data,errors='ignore')))
    return unidecode(data)
  # NOTE(review): bare except silently swallows every error, including
  # KeyboardInterrupt; deliberate best-effort behavior here.
  except:
    return None
Example #8
Source File: nlu_client.py    From idunn with Apache License 2.0 6 votes vote down vote up
def fuzzy_match(cls, query, response):
        """Check whether ``response`` plausibly matches ``query``.

        >>> NLU_Helper.fuzzy_match("bastille", "Beuzeville-la-Bastille")
        False
        >>> NLU_Helper.fuzzy_match("paris 20", "Paris 20e Arrondissement")
        True
        >>> NLU_Helper.fuzzy_match("av victor hugo paris", "Avenue Victor Hugo")
        True
        """
        q = unidecode(query.strip()).lower()
        r = unidecode(response).lower()
        # Accept when the response starts with the whole query.
        if r.startswith(q):
            return True
        # Otherwise accept when the number of query characters missing
        # from the response is small compared to the query length
        # (Counter subtraction keeps only positive counts).
        missing = sum((Counter(r) - Counter(q)).values())
        return missing < len(q)
Example #9
Source File: utils.py    From pycon with MIT License 5 votes vote down vote up
def dict_to_xml(dict: XMLDict):
    """Recursively convert a mapping into a flat list of lxml elements.

    Falsy values are skipped entirely.  Nested dicts/lists become child
    elements named after their key; scalar values are stringified,
    ASCII-folded, latin-1 encoded, and split into tags by `_split_tags`.
    """
    tags: List[etree._Element] = []

    for key, value in dict.items():
        if not value:
            # Skip empty/falsy entries.
            continue

        if isinstance(value, (Dict, List)):
            # Treat a single dict as a one-element list of children.
            items = value if isinstance(value, List) else [value]
            for item in items:
                element = etree.Element(key)
                for child in dict_to_xml(item):
                    element.append(child)
                tags.append(element)
        else:
            if isinstance(value, (int, float, Decimal)):
                value = str(value)
            # ASCII-fold, then encode for the latin-1 XML consumer.
            encoded = unidecode.unidecode(value).encode("latin_1")
            tags.extend(_split_tags(key, encoded))

    return tags
Example #10
Source File: downloadRecipes.py    From extract_recipe with Apache License 2.0 5 votes vote down vote up
def get_url_markdown(baseurl):
  # Download `baseurl` with a browser-like User-Agent and convert the
  # HTML payload into wide, ASCII-only markdown text.
  # NOTE: Python 2 code (urllib2, unicode builtin).
  opener = urllib2.build_opener()
  opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
  j = opener.open(baseurl)
  data = j.read()

  # html2text: drop links/images; huge body_width disables line wrapping.
  h = html2text.HTML2Text()
  h.ignore_links = True
  h.ignore_images = True
  h.body_width = 10000
  # Decode bytes (ignoring errors), ASCII-fold, convert, fold again.
  data = h.handle(unidecode(unicode(data,errors='ignore')))
  return unidecode(data)
Example #11
Source File: MdownloadRecipes.py    From extract_recipe with Apache License 2.0 5 votes vote down vote up
def worker(start,increment):
  # Thread worker: processes every `increment`-th line of the recipe
  # dump starting at offset `start`, resuming from a per-worker index
  # file.  NOTE: Python 2 code (print statement).
  logger = logging.getLogger('worker'+str(start)+"_"+str(increment))      
  """thread worker function"""
  print 'Worker: %s/%s' % (start,increment)
  # The index file's last line records the highest fileNum already
  # processed, so a restarted worker can skip completed work.
  indexFile = 'recipes/index'+str(start)+"_"+str(increment)+'.txt'
  lastLine = ""
  if os.path.isfile(indexFile):
      with open(indexFile,'rb') as f:
          for line in f:
              lastLine = line
      lastfileNum = int(lastLine.split()[0])
  else:
      lastfileNum = -1

  fileNum = 0
  t = time.time()
  with open('recipeitems-latest.json','rb') as f:
    for line in f:
      fileNum = fileNum + 1
      # Only handle the lines assigned to this worker.
      if fileNum % increment == start:
        # Shard output into folders of ~500 recipes each.
        folderSave = str(int(fileNum/500))
        if not os.path.exists('recipes/' + folderSave):
            os.makedirs('recipes/' + folderSave)

        # Skip anything already recorded in the index file.
        if fileNum>lastfileNum:
          recipe = json.loads(line)
          logger.info(str(fileNum) + "\t" + recipe['url'] + '\t' + recipe['name'])
          t=time.time()
          recipeMD = get_url_markdown(recipe['url'],start,increment)
          logger.info('%s seconds' % str(round(time.time()-t,1)))
          if recipeMD is not None:
            # Save the markdown, then append a success entry to the index.
            with open('recipes/' + folderSave + '/' + str(fileNum) + '.md','wb') as g:
              g.write(recipeMD)
            #os.system('bzip2 ' + 'recipes/' + folderSave + '/' + str(fileNum) + '.md')        
            with open(indexFile,'a') as g:
              g.write(str(fileNum) + "\t" + recipe['url'] + '\t' + unidecode(recipe['name']) + '\n')
          else:
            # Record the failure so the URL is not retried on resume.
            with open(indexFile,'a') as g:
              g.write(str(fileNum) + "\t" + recipe['url'] + '\t' + 'None' + '\n')      
  return 
Example #12
Source File: match.py    From osm-wikidata with GNU General Public License v3.0 5 votes vote down vote up
def tidy_name(n):
    """Normalize a lowercase name for fuzzy matching.

    Expects to be passed a name already in lowercase; ASCII-folds it and
    applies a fixed list of abbreviation/spelling substitutions.
    """
    n = unidecode(n).strip().rstrip("'")

    # Substitutions applied in this exact order.
    substitutions = [
        (' no. ', ' number '),
        ('saint ', 'st '),
        ('mount ', 'mt '),
        (' mountain', ' mtn'),
        (' county', ' co'),
        (' church of england ', ' ce '),
        (' cofe ', ' ce '),
        (' c of e ', ' ce '),
        (' @ ', ' at '),
        (' roman catholic ', ' rc '),
        (' catholic ', ' rc '),
        (' preparatory school', ' prep school'),
        (' incorporated', ' inc'),
        (' cooperative', ' coop'),
        (' co-operative', ' coop'),
        (' hotel and country club', ' hotel'),
        (' hotel and spa', ' hotel'),
        (' missionary baptist', ' baptist'),
    ]
    for old, new in substitutions:
        n = n.replace(old, new)

    # Drop a possessive suffix.
    if n.endswith("'s"):
        n = n[:-2]

    # Strip plural endings (module-level regex), then fold doubled "s".
    n = re_plural.sub('', n)
    n = n.replace('ss', 's')

    # Prefer British spellings for comparison.
    return n.replace('center', 'centre').replace('theater', 'theatre')
Example #13
Source File: process.py    From matscholar with MIT License 5 votes vote down vote up
def remove_accent(txt):
        """
        Transliterate accented characters in a string to plain ASCII.

        Length-1 strings are returned unchanged because unidecode
        mishandles some single symbols (e.g. the angstrom sign).
        :param txt: input text
        :return: de-accented text
        """
        if len(txt) <= 1:
            return txt
        return unidecode.unidecode(txt)
Example #14
Source File: cleaners.py    From libfaceid with MIT License 5 votes vote down vote up
def convert_to_ascii(text):
  """Transliterate arbitrary unicode text to its closest ASCII form."""
  return unidecode(text)
Example #15
Source File: particles.py    From tossi with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __repr__(self):
            # Prefer an ASCII rendering when unidecode is available;
            # otherwise fall back to the repr of the tolerant form.
            try:
                from unidecode import unidecode
            except ImportError:
                unidecode = None
            tolerance = self.tolerance()
            if unidecode is None:
                return '<Particle: %r>' % tolerance
            return '<Particle: %s>' % unidecode(tolerance)
Example #16
Source File: utils.py    From pclpy with MIT License 5 votes vote down vote up
def clean_doxygen(doxygen):
    """Strip doxygen markup tokens from a comment string and ASCII-fold it."""
    # Tokens to delete (or, for "*\n", collapse to a bare newline),
    # applied in this exact order.
    for token, substitute in (
        ("/** ", ""),
        ("* ", ""),
        ("\n*/", ""),
        ("*\n", "\n"),
        ("{", ""),
        ("}", ""),
        ("<b>", ""),
        ("</b>", ""),
    ):
        doxygen = doxygen.replace(token, substitute)
    return unidecode(doxygen)
Example #17
Source File: extras.py    From clist with Apache License 2.0 5 votes vote down vote up
def slug(value):
    """Return a URL slug for ``value`` after ASCII transliteration."""
    return slugify(unidecode(value))
Example #18
Source File: models.py    From Politikon with GNU General Public License v2.0 5 votes vote down vote up
def get_relative_url(self):
        """Build the event's relative URL from its id and slugified title."""
        title_slug = slugify(unidecode(self.title))
        return '/event/%(id)d-%(title)s' % {'id': self.id, 'title': title_slug}
Example #19
Source File: pubmed_oa_parser.py    From pubmed_parser with MIT License 5 votes vote down vote up
def table_to_df(table_text):
    """Parse a PubMed OA table XML string into columns and row values.

    Parameters
    ----------
    table_text: str
        An XML string of a table parsed from PubMed OA.

    Return
    ------
    columns, row_values: tuple (list, list)
        ``columns`` is the list of column names; ``row_values`` is the
        list of per-row value lists.  Returns ``(None, None)`` when the
        table body contains no rows.
    """
    tree = etree.fromstring(table_text)

    # Header cells -> ASCII column names.
    columns = [
        unidecode(stringify_children(cell))
        for tr in tree.xpath("thead/tr")
        for cell in tr.getchildren()
    ]

    row_values = []
    len_rows = []
    for tr in tree.findall("tbody/tr"):
        cells = tr.xpath("td")
        len_rows.append(len(cells))
        row_values.append([unidecode(stringify_children(c)) for c in cells])

    if not len_rows:
        return None, None

    # Keep only rows of the modal length (drops malformed rows).
    modal_len = max(set(len_rows), key=len_rows.count)
    row_values = [row for row in row_values if len(row) == modal_len]
    return columns, row_values
Example #20
Source File: plugin.py    From limnoria-plugins with Do What The F*ck You Want To Public License 5 votes vote down vote up
def clean(self, text):
            """ASCII-fold and strip the text down for answer comparison."""
            text = unidecode(text)
            if len(text) <= 2:
                # Very short answers: keep alphanumerics only.
                return re.sub("[^a-zA-Z0-9]+", "", text)
            # Longer answers: drop punctuation, then leading articles,
            # then all remaining spaces.
            text = re.sub("[^a-zA-Z0-9 ]+", "", text)
            return re.sub("^a |^an |^the |^or ", "", text).replace(" ", "")
Example #21
Source File: banphrase.py    From pajbot with MIT License 5 votes vote down vote up
def format_message(self, message):
        """Return ``message`` normalized per this banphrase's options.

        Lowercases unless matching is case-sensitive; optionally
        ASCII-folds and trims when ``remove_accents`` is set.
        """
        # Deliberately an identity check against False, matching the
        # original's `is False` semantics.
        formatted = message if self.case_sensitive is not False else message.lower()
        if self.remove_accents:
            formatted = unidecode(formatted).strip()
        return formatted
Example #22
Source File: cleaners.py    From vae_tacotron2 with MIT License 5 votes vote down vote up
def convert_to_ascii(text):
  """Map unicode input to the nearest ASCII equivalent via unidecode."""
  return unidecode(text)
Example #23
Source File: reports_to_gml.py    From arguman.org with GNU Affero General Public License v3.0 5 votes vote down vote up
def create_conjunction_graph(self):
        """Build a co-occurrence graph of fallacy reports.

        For every premise, fallacy types reported together become edges
        between their normalized names; the result is written to
        ``conjunction.gml``.
        """
        fallacy_map = {
            unidecode(key): value for (key, value) in get_fallacy_types()
        }
        for contention in Contention.objects.all():
            for premise in contention.premises.all():
                # Non-empty fallacy types on this premise, mapped to
                # their canonical values.
                reported = premise.reports.values_list('fallacy_type',
                                                       flat=True)
                fallacies_set = {
                    fallacy_map[unidecode(name)] for name in reported if name
                }
                # Connect every distinct pair reported together.
                for fallacy in fallacies_set:
                    edges = [
                        (unidecode(self.normalize(fallacy)),
                         unidecode(self.normalize(other)))
                        for other in fallacies_set
                        if other != fallacy
                    ]
                    graph.add_edges_from(edges)

        nx.write_gml(graph, 'conjunction.gml')
Example #24
Source File: premise_tags.py    From arguman.org with GNU Affero General Public License v3.0 5 votes vote down vote up
def parse_markdown_tabs(text):
    """Turn <h1>-delimited sections of rendered markdown into tab links
    plus tab content divs; text without headings passes through as-is."""
    start, end = '<h1>', '</h1>'
    tab_template = '<div class="tab-content" id="%(slug)s">%(content)s</div>'
    title_template = '<a class="tab-title" href="#%(slug)s">%(name)s</a>'

    if start not in text:
        # No headings: nothing to tabify.
        return text

    titles, tabs = [], []
    # Each chunk after a <h1> holds "title</h1>content".
    for section in text.split(start)[1:]:
        title, content = section.split(end)
        slug = slugify(unidecode(title))
        titles.append(title_template % {'name': title, 'slug': slug})
        tabs.append(tab_template % {'content': content, 'slug': slug})

    return '\n'.join(titles + tabs)
Example #25
Source File: models.py    From arguman.org with GNU Affero General Public License v3.0 5 votes vote down vote up
def save(self, *args, **kwargs):
        """Generate the slug from the channel text on first save."""
        if not self.slug:
            self.slug = slugify(unidecode(self.text))
        return super(Channel, self).save(*args, **kwargs)
Example #26
Source File: models.py    From arguman.org with GNU Affero General Public License v3.0 5 votes vote down vote up
def save(self, *args, **kwargs):
        """Generate a slug unique within the noun's language on first save."""
        if not self.slug:
            base = slugify(unidecode(self.text))
            clash = Noun.objects.filter(slug=base,
                                        language=self.language).exists()
            # Append a random hex suffix when another noun took the slug.
            self.slug = "%s-%s" % (base, uuid4().hex) if clash else base
        return super(Noun, self).save(*args, **kwargs)
Example #27
Source File: build.py    From ParlAI with MIT License 5 votes vote down vote up
def create_fb_format(data, dpath):
    """Write fbdialog-format train/valid/test files from paired lines.

    ``data`` is a flat list where each even index holds a context (x)
    and the following odd index holds its response (y).  Pairs are
    routed to valid when ``i % 500 == 0``, to test when ``i % 500 == 2``,
    and to train otherwise.  Pairs where either side cleans down to an
    empty string are dropped.

    :param data: flat list of alternating context/response strings
    :param dpath: output directory for train.txt / valid.txt / test.txt
    """
    # Context managers guarantee the files are closed even if cleaning
    # raises; the original closed them manually only on success.
    with open(os.path.join(dpath, 'train.txt'), 'w') as fw1, open(
        os.path.join(dpath, 'valid.txt'), 'w'
    ) as fw2, open(os.path.join(dpath, 'test.txt'), 'w') as fw3:
        for i in range(0, len(data) - 1, 2):
            # Route a small, deterministic slice of pairs to valid/test.
            if (i % 500) == 0:
                fout = fw2
            elif (i % 500) == 2:
                fout = fw3
            else:
                fout = fw1

            x = _clean_fb_line(data[i])
            y = _clean_fb_line(data[i + 1])

            if x and y:
                s = 'text:' + x + '\tlabels:' + y + '\tepisode_done:True'
                fout.write('{} \n'.format(s))


def _clean_fb_line(raw):
    """Normalize one utterance: trim spaces, escape tabs and pipes,
    strip emoji, ASCII-fold, split punctuation, collapse whitespace."""
    line = raw.rstrip(' ').lstrip(' ').replace('\t', ' ')
    line = line.replace('|', ' __PIPE__ ')
    line = ''.join(map(replace_emoji, line))
    line = split_punctuation(unidecode.unidecode(line))
    return ' '.join(line.split())
Example #28
Source File: engine.py    From marvin-python-toolbox with Apache License 2.0 5 votes vote down vote up
def _slugify(text, delim='_'):
    """Slugify ``text``: split on punctuation (module-level ``_punct_re``),
    ASCII-fold each chunk, and join the words with ``delim``."""
    words = []
    for chunk in _punct_re.split(text.lower()):
        words.extend(unidecode(chunk).split())
    return six.u(delim.join(words))
Example #29
Source File: cleaners.py    From tn2-wg with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def convert_to_ascii(text):
  """Reduce international text to an ASCII approximation (unidecode)."""
  return unidecode(text)
Example #30
Source File: transliterate.py    From textkit with MIT License 5 votes vote down vote up
def transliterate(file):
    '''Convert international text to ascii.

    Reads the whole input, decodes it via chardet's guessed encoding
    when the content is bytes, then emits the unidecode'd, ASCII-encoded
    result through ``output``.
    '''
    content = ''.join(file.readlines())
    try:
        # bytes -> str using the detected encoding.
        content = content.decode(chardet.detect(content)['encoding'])
    except AttributeError:
        # Strings do not have a decode method in python 3.
        pass
    # Fixed: the call was wrapped in a pointless throwaway list literal
    # (leftover comprehension syntax); invoke it directly.
    output(unidecode(content).encode('ascii', 'ignore'))