Python unidecode.unidecode() Examples

The following are 30 code examples of unidecode.unidecode(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module unidecode, or try the search function.
Example #1
Source File: models.py    From arguman.org with GNU Affero General Public License v3.0 6 votes vote down vote up
def save(self, *args, **kwargs):
        """
        Persist the contention, generating a unique slug on first save.

        Pass ``skip_date_update=True`` as a keyword to leave
        ``date_modification`` untouched.
        """
        if not self.slug:
            base = slugify(unidecode(self.title))
            taken = Contention.objects.filter(slug=base).exists()
            # Disambiguate with a random hex suffix when the slug is taken.
            self.slug = "%s-%s" % (base, uuid4().hex) if taken else base

        skip_date = kwargs.pop('skip_date_update', False)
        if not skip_date:
            self.date_modification = datetime.now()

        return super(Contention, self).save(*args, **kwargs)
Example #2
Source File: utils.py    From dissemin with GNU Affero General Public License v3.0 6 votes vote down vote up
def remove_diacritics(s):
    """
    Strip diacritics from a string via the `unidecode` package.

    Bytes inputs (and any non-``str`` value) are returned untouched;
    only ``str`` values are transliterated.

    >>> remove_diacritics('aéèï')
    'aeei'
    >>> remove_diacritics('aéè'.encode('utf-8'))
    b'a\\xc3\\xa9\\xc3\\xa8'
    """
    if not isinstance(s, str):
        # Bytes pass through unchanged.
        return s
    # Normalize the typographic apostrophe first (see issue #305);
    # unidecode alone does not handle it the way we want.
    return unidecode(s.replace("’", "'"))
Example #3
Source File: make_video_analysis.py    From edx2bigquery with GNU General Public License v2.0 6 votes vote down vote up
def findVideoLength(dataset, youtube_id, api_key=None):
    '''
    Look up the duration (in seconds) of a YouTube video.

    dataset: identifier included only in the failure message for context
    youtube_id: YouTube video id (coerced to ASCII via unidecode first)
    api_key: optional API key forwarded to the stats helper
    Returns the length in seconds, or 0 on any failure.
    NOTE: Python 2 code (print statements).
    '''
    try:
        # Coerce the id to ASCII; bail out with 0 if that fails.
        youtube_id = unidecode(youtube_id)
    except Exception as err:
        print "youtube_id is not ascii?  ytid=", youtube_id
        return 0
    try:
        assert youtube_id is not None, "[analyze videos] youtube id does not exist"
        # YOUTUBE_PARTS selects which metadata sections the API returns.
        content, stats = get_youtube_api_stats(youtube_id=youtube_id, api_key=api_key, part=YOUTUBE_PARTS)
        # ISO-8601 duration string (e.g. "PT1H2M3S") -> dict -> total seconds.
        durationDict = parseISOduration(content['duration'].encode("ascii","ignore"))
        length = getTotalTimeSecs(durationDict)
        print "[analyze videos] totalTime for youtube video %s is %s sec" % (youtube_id, length)
    # NOTE(review): Exception already covers AssertionError, so the
    # tuple is redundant; any lookup/parsing failure yields length 0.
    except (AssertionError, Exception) as err:
        print "Failed to lookup video length for %s!  Error=%s, data=%s" % (youtube_id, err, dataset)
        length = 0
    return length

#----------------------------------------------------------------------------- 
Example #4
Source File: invoice_template.py    From invoice2data with MIT License 6 votes vote down vote up
def prepare_input(self, extracted_str):
        """
        Apply the template's configured text transformations to the raw
        extracted string and return the optimized result.
        """
        optimized_str = extracted_str

        # Collapse runs of spaces when requested.
        if self.options["remove_whitespace"]:
            optimized_str = re.sub(" +", "", optimized_str)

        # Transliterate accented characters to plain ASCII.
        if self.options["remove_accents"]:
            optimized_str = unidecode(optimized_str)

        # Case-fold to lowercase.
        if self.options["lowercase"]:
            optimized_str = optimized_str.lower()

        # Apply the template's (search, replacement) pairs in order.
        for pair in self.options["replace"]:
            assert len(pair) == 2, "A replace should be a list of 2 items"
            optimized_str = optimized_str.replace(pair[0], pair[1])

        return optimized_str
Example #5
Source File: downloadRecipes.py    From extract_recipe with Apache License 2.0 6 votes vote down vote up
def get_url_markdown(baseurl,start,increment):
  # Fetch `baseurl` and convert the HTML payload to plain-ASCII markdown.
  # `start`/`increment` are only used by the disabled wget fallback kept
  # below as a string block.  NOTE: Python 2 code (urllib2, unicode).
  '''
  opener = urllib2.build_opener()
  opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
  try:
    j = opener.open(baseurl)
  except:
    return None
  data = j.read()
  '''
  # Plain urlopen; alternative fetch strategies are kept (disabled) in
  # the surrounding triple-quoted blocks.
  urlHandler = urllib2.urlopen(baseurl)
  data = urlHandler.read()
  '''
  os.system('wget -O temp' + str(start)+"_"+str(increment) + ' ' + baseurl)
  data = open('temp' + str(start)+"_"+str(increment),'rU').read()
  '''
  # html2text: drop links/images; huge body_width disables line wrapping.
  h = html2text.HTML2Text()
  h.ignore_links = True
  h.ignore_images = True
  h.body_width = 10000
  # Decode bytes (ignoring errors), ASCII-fold, convert to markdown,
  # then ASCII-fold the result again.
  data = h.handle(unidecode(unicode(data,errors='ignore')))
  return unidecode(data)
Example #6
Source File: gftools-fix-ascii-fontmetadata.py    From gftools with Apache License 2.0 6 votes vote down vote up
def normalizestr(string):
    """Replace special marks (copyright, trademark, ...) with their ASCII
    names, NFKC-normalize, and transliterate the result to plain ASCII.

    Prints a notice whenever the output differs from the input.
    """
    original = string
    # Swap each known unicode mark for its ASCII replacement text.
    for mark, ascii_repl in unicode_marks(string):
        string = string.replace(mark, ascii_repl)

    # Normalize, trim surrounding whitespace, then ASCII-fold.
    normalized = unicodedata.normalize('NFKC', string).strip()
    result = unidecode(normalized)

    if result != original:
        print("Fixed string: '{}'".format(result))
    return result
Example #7
Source File: MdownloadRecipes.py    From extract_recipe with Apache License 2.0 6 votes vote down vote up
def get_url_markdown(baseurl,start,increment):
  # Like the sibling downloader, but wraps the whole fetch/convert in a
  # catch-all so any failure returns None instead of raising.
  # NOTE: Python 2 code (urllib2, unicode builtin).
  try:
    '''
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
    try:
      j = opener.open(baseurl)
    except:
      return None
    data = j.read()
    '''
    # Plain urlopen; alternative fetch strategies kept (disabled) in the
    # surrounding triple-quoted blocks.
    urlHandler = urllib2.urlopen(baseurl)
    data = urlHandler.read()
    '''
    os.system('wget -O temp' + str(start)+"_"+str(increment) + ' ' + baseurl)
    data = open('temp' + str(start)+"_"+str(increment),'rU').read()
    '''
    # html2text: drop links/images; huge body_width disables wrapping.
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.body_width = 10000
    data = h.handle(unidecode(unicode(data,errors='ignore')))
    return unidecode(data)
  # NOTE(review): bare except silently swallows every error, including
  # KeyboardInterrupt; deliberate best-effort behavior here.
  except:
    return None
Example #8
Source File: nlu_client.py    From idunn with Apache License 2.0 6 votes vote down vote up
def fuzzy_match(cls, query, response):
        """Check whether ``response`` plausibly matches ``query``.

        >>> NLU_Helper.fuzzy_match("bastille", "Beuzeville-la-Bastille")
        False
        >>> NLU_Helper.fuzzy_match("paris 20", "Paris 20e Arrondissement")
        True
        >>> NLU_Helper.fuzzy_match("av victor hugo paris", "Avenue Victor Hugo")
        True
        """
        q = unidecode(query.strip()).lower()
        r = unidecode(response).lower()
        # Accept when the response starts with the whole query.
        if r.startswith(q):
            return True
        # Otherwise accept when the number of query characters missing
        # from the response is small compared to the query length
        # (Counter subtraction keeps only positive counts).
        missing = sum((Counter(r) - Counter(q)).values())
        return missing < len(q)
Example #9
Source File: utils.py    From pycon with MIT License 5 votes vote down vote up
def dict_to_xml(dict: XMLDict):
    """Recursively convert a mapping into a flat list of lxml elements.

    Falsy values are skipped entirely.  Nested dicts/lists become child
    elements named after their key; scalar values are stringified,
    ASCII-folded, latin-1 encoded, and split into tags by `_split_tags`.
    """
    tags: List[etree._Element] = []

    for key, value in dict.items():
        if not value:
            # Skip empty/falsy entries.
            continue

        if isinstance(value, (Dict, List)):
            # Treat a single dict as a one-element list of children.
            items = value if isinstance(value, List) else [value]
            for item in items:
                element = etree.Element(key)
                for child in dict_to_xml(item):
                    element.append(child)
                tags.append(element)
        else:
            if isinstance(value, (int, float, Decimal)):
                value = str(value)
            # ASCII-fold, then encode for the latin-1 XML consumer.
            encoded = unidecode.unidecode(value).encode("latin_1")
            tags.extend(_split_tags(key, encoded))

    return tags
Example #10
Source File: downloadRecipes.py    From extract_recipe with Apache License 2.0 5 votes vote down vote up
def get_url_markdown(baseurl):
  # Download `baseurl` with a browser-like User-Agent and convert the
  # HTML payload into wide, ASCII-only markdown text.
  # NOTE: Python 2 code (urllib2, unicode builtin).
  opener = urllib2.build_opener()
  opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
  j = opener.open(baseurl)
  data = j.read()

  # html2text: drop links/images; huge body_width disables line wrapping.
  h = html2text.HTML2Text()
  h.ignore_links = True
  h.ignore_images = True
  h.body_width = 10000
  # Decode bytes (ignoring errors), ASCII-fold, convert, fold again.
  data = h.handle(unidecode(unicode(data,errors='ignore')))
  return unidecode(data)
Example #11
Source File: MdownloadRecipes.py    From extract_recipe with Apache License 2.0 5 votes vote down vote up
def worker(start,increment):
  # Thread worker: processes every `increment`-th line of the recipe
  # dump starting at offset `start`, resuming from a per-worker index
  # file.  NOTE: Python 2 code (print statement).
  logger = logging.getLogger('worker'+str(start)+"_"+str(increment))      
  """thread worker function"""
  print 'Worker: %s/%s' % (start,increment)
  # The index file's last line records the highest fileNum already
  # processed, so a restarted worker can skip completed work.
  indexFile = 'recipes/index'+str(start)+"_"+str(increment)+'.txt'
  lastLine = ""
  if os.path.isfile(indexFile):
      with open(indexFile,'rb') as f:
          for line in f:
              lastLine = line
      lastfileNum = int(lastLine.split()[0])
  else:
      lastfileNum = -1

  fileNum = 0
  t = time.time()
  with open('recipeitems-latest.json','rb') as f:
    for line in f:
      fileNum = fileNum + 1
      # Only handle the lines assigned to this worker.
      if fileNum % increment == start:
        # Shard output into folders of ~500 recipes each.
        folderSave = str(int(fileNum/500))
        if not os.path.exists('recipes/' + folderSave):
            os.makedirs('recipes/' + folderSave)

        # Skip anything already recorded in the index file.
        if fileNum>lastfileNum:
          recipe = json.loads(line)
          logger.info(str(fileNum) + "\t" + recipe['url'] + '\t' + recipe['name'])
          t=time.time()
          recipeMD = get_url_markdown(recipe['url'],start,increment)
          logger.info('%s seconds' % str(round(time.time()-t,1)))
          if recipeMD is not None:
            # Save the markdown, then append a success entry to the index.
            with open('recipes/' + folderSave + '/' + str(fileNum) + '.md','wb') as g:
              g.write(recipeMD)
            #os.system('bzip2 ' + 'recipes/' + folderSave + '/' + str(fileNum) + '.md')        
            with open(indexFile,'a') as g:
              g.write(str(fileNum) + "\t" + recipe['url'] + '\t' + unidecode(recipe['name']) + '\n')
          else:
            # Record the failure so the URL is not retried on resume.
            with open(indexFile,'a') as g:
              g.write(str(fileNum) + "\t" + recipe['url'] + '\t' + 'None' + '\n')      
  return 
Example #12
Source File: match.py    From osm-wikidata with GNU General Public License v3.0 5 votes vote down vote up
def tidy_name(n):
    """Normalize a lowercase name for fuzzy matching.

    Expects to be passed a name already in lowercase; ASCII-folds it and
    applies a fixed list of abbreviation/spelling substitutions.
    """
    n = unidecode(n).strip().rstrip("'")

    # Substitutions applied in this exact order.
    substitutions = [
        (' no. ', ' number '),
        ('saint ', 'st '),
        ('mount ', 'mt '),
        (' mountain', ' mtn'),
        (' county', ' co'),
        (' church of england ', ' ce '),
        (' cofe ', ' ce '),
        (' c of e ', ' ce '),
        (' @ ', ' at '),
        (' roman catholic ', ' rc '),
        (' catholic ', ' rc '),
        (' preparatory school', ' prep school'),
        (' incorporated', ' inc'),
        (' cooperative', ' coop'),
        (' co-operative', ' coop'),
        (' hotel and country club', ' hotel'),
        (' hotel and spa', ' hotel'),
        (' missionary baptist', ' baptist'),
    ]
    for old, new in substitutions:
        n = n.replace(old, new)

    # Drop a possessive suffix.
    if n.endswith("'s"):
        n = n[:-2]

    # Strip plural endings (module-level regex), then fold doubled "s".
    n = re_plural.sub('', n)
    n = n.replace('ss', 's')

    # Prefer British spellings for comparison.
    return n.replace('center', 'centre').replace('theater', 'theatre')
Example #13
Source File: process.py    From matscholar with MIT License 5 votes vote down vote up
def remove_accent(txt):
        """
        Transliterate accented characters in a string to plain ASCII.

        Length-1 strings are returned unchanged because unidecode
        mishandles some single symbols (e.g. the angstrom sign).
        :param txt: input text
        :return: de-accented text
        """
        if len(txt) <= 1:
            return txt
        return unidecode.unidecode(txt)
Example #14
Source File: cleaners.py    From libfaceid with MIT License 5 votes vote down vote up
def convert_to_ascii(text):
  """Transliterate arbitrary unicode text to its closest ASCII form."""
  return unidecode(text)
Example #15
Source File: particles.py    From tossi with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __repr__(self):
            # Prefer an ASCII rendering when unidecode is available;
            # otherwise fall back to the repr of the tolerant form.
            try:
                from unidecode import unidecode
            except ImportError:
                unidecode = None
            tolerance = self.tolerance()
            if unidecode is None:
                return '<Particle: %r>' % tolerance
            return '<Particle: %s>' % unidecode(tolerance)
Example #16
Source File: utils.py    From pclpy with MIT License 5 votes vote down vote up
def clean_doxygen(doxygen):
    """Strip doxygen markup tokens from a comment string and ASCII-fold it."""
    # Tokens to delete (or, for "*\n", collapse to a bare newline),
    # applied in this exact order.
    for token, substitute in (
        ("/** ", ""),
        ("* ", ""),
        ("\n*/", ""),
        ("*\n", "\n"),
        ("{", ""),
        ("}", ""),
        ("<b>", ""),
        ("</b>", ""),
    ):
        doxygen = doxygen.replace(token, substitute)
    return unidecode(doxygen)
Example #17
Source File: extras.py    From clist with Apache License 2.0 5 votes vote down vote up
def slug(value):
    """Return a URL slug for ``value`` after ASCII transliteration."""
    return slugify(unidecode(value))
Example #18
Source File: models.py    From Politikon with GNU General Public License v2.0 5 votes vote down vote up
def get_relative_url(self):
        """Build the event's relative URL from its id and slugified title."""
        title_slug = slugify(unidecode(self.title))
        return '/event/%(id)d-%(title)s' % {'id': self.id, 'title': title_slug}
Example #19
Source File: pubmed_oa_parser.py    From pubmed_parser with MIT License 5 votes vote down vote up
def table_to_df(table_text):
    """Parse a PubMed OA table XML string into columns and row values.

    Parameters
    ----------
    table_text: str
        An XML string of a table parsed from PubMed OA.

    Return
    ------
    columns, row_values: tuple (list, list)
        ``columns`` is the list of column names; ``row_values`` is the
        list of per-row value lists.  Returns ``(None, None)`` when the
        table body contains no rows.
    """
    tree = etree.fromstring(table_text)

    # Header cells -> ASCII column names.
    columns = [
        unidecode(stringify_children(cell))
        for tr in tree.xpath("thead/tr")
        for cell in tr.getchildren()
    ]

    row_values = []
    len_rows = []
    for tr in tree.findall("tbody/tr"):
        cells = tr.xpath("td")
        len_rows.append(len(cells))
        row_values.append([unidecode(stringify_children(c)) for c in cells])

    if not len_rows:
        return None, None

    # Keep only rows of the modal length (drops malformed rows).
    modal_len = max(set(len_rows), key=len_rows.count)
    row_values = [row for row in row_values if len(row) == modal_len]
    return columns, row_values
Example #20
Source File: plugin.py    From limnoria-plugins with Do What The F*ck You Want To Public License 5 votes vote down vote up
def clean(self, text):
            """ASCII-fold and strip the text down for answer comparison."""
            text = unidecode(text)
            if len(text) <= 2:
                # Very short answers: keep alphanumerics only.
                return re.sub("[^a-zA-Z0-9]+", "", text)
            # Longer answers: drop punctuation, then leading articles,
            # then all remaining spaces.
            text = re.sub("[^a-zA-Z0-9 ]+", "", text)
            return re.sub("^a |^an |^the |^or ", "", text).replace(" ", "")
Example #21
Source File: banphrase.py    From pajbot with MIT License 5 votes vote down vote up
def format_message(self, message):
        """Return ``message`` normalized per this banphrase's options.

        Lowercases unless matching is case-sensitive; optionally
        ASCII-folds and trims when ``remove_accents`` is set.
        """
        # Deliberately an identity check against False, matching the
        # original's `is False` semantics.
        formatted = message if self.case_sensitive is not False else message.lower()
        if self.remove_accents:
            formatted = unidecode(formatted).strip()
        return formatted
Example #22
Source File: cleaners.py    From vae_tacotron2 with MIT License 5 votes vote down vote up
def convert_to_ascii(text):
  """Map unicode input to the nearest ASCII equivalent via unidecode."""
  return unidecode(text)
Example #23
Source File: reports_to_gml.py    From arguman.org with GNU Affero General Public License v3.0 5 votes vote down vote up
def create_conjunction_graph(self):
        """Build a co-occurrence graph of fallacy reports.

        For every premise, fallacy types reported together become edges
        between their normalized names; the result is written to
        ``conjunction.gml``.
        """
        fallacy_map = {
            unidecode(key): value for (key, value) in get_fallacy_types()
        }
        for contention in Contention.objects.all():
            for premise in contention.premises.all():
                # Non-empty fallacy types on this premise, mapped to
                # their canonical values.
                reported = premise.reports.values_list('fallacy_type',
                                                       flat=True)
                fallacies_set = {
                    fallacy_map[unidecode(name)] for name in reported if name
                }
                # Connect every distinct pair reported together.
                for fallacy in fallacies_set:
                    edges = [
                        (unidecode(self.normalize(fallacy)),
                         unidecode(self.normalize(other)))
                        for other in fallacies_set
                        if other != fallacy
                    ]
                    graph.add_edges_from(edges)

        nx.write_gml(graph, 'conjunction.gml')
Example #24
Source File: premise_tags.py    From arguman.org with GNU Affero General Public License v3.0 5 votes vote down vote up
def parse_markdown_tabs(text):
    """Turn <h1>-delimited sections of rendered markdown into tab links
    plus tab content divs; text without headings passes through as-is."""
    start, end = '<h1>', '</h1>'
    tab_template = '<div class="tab-content" id="%(slug)s">%(content)s</div>'
    title_template = '<a class="tab-title" href="#%(slug)s">%(name)s</a>'

    if start not in text:
        # No headings: nothing to tabify.
        return text

    titles, tabs = [], []
    # Each chunk after a <h1> holds "title</h1>content".
    for section in text.split(start)[1:]:
        title, content = section.split(end)
        slug = slugify(unidecode(title))
        titles.append(title_template % {'name': title, 'slug': slug})
        tabs.append(tab_template % {'content': content, 'slug': slug})

    return '\n'.join(titles + tabs)
Example #25
Source File: models.py    From arguman.org with GNU Affero General Public License v3.0 5 votes vote down vote up
def save(self, *args, **kwargs):
        """Generate the slug from the channel text on first save."""
        if not self.slug:
            self.slug = slugify(unidecode(self.text))
        return super(Channel, self).save(*args, **kwargs)
Example #26
Source File: models.py    From arguman.org with GNU Affero General Public License v3.0 5 votes vote down vote up
def save(self, *args, **kwargs):
        """Generate a slug unique within the noun's language on first save."""
        if not self.slug:
            base = slugify(unidecode(self.text))
            clash = Noun.objects.filter(slug=base,
                                        language=self.language).exists()
            # Append a random hex suffix when another noun took the slug.
            self.slug = "%s-%s" % (base, uuid4().hex) if clash else base
        return super(Noun, self).save(*args, **kwargs)
Example #27
Source File: build.py    From ParlAI with MIT License 5 votes vote down vote up
def create_fb_format(data, dpath):
    """Write fbdialog-format train/valid/test files from paired lines.

    ``data`` is a flat list where each even index holds a context (x)
    and the following odd index holds its response (y).  Pairs are
    routed to valid when ``i % 500 == 0``, to test when ``i % 500 == 2``,
    and to train otherwise.  Pairs where either side cleans down to an
    empty string are dropped.

    :param data: flat list of alternating context/response strings
    :param dpath: output directory for train.txt / valid.txt / test.txt
    """
    # Context managers guarantee the files are closed even if cleaning
    # raises; the original closed them manually only on success.
    with open(os.path.join(dpath, 'train.txt'), 'w') as fw1, open(
        os.path.join(dpath, 'valid.txt'), 'w'
    ) as fw2, open(os.path.join(dpath, 'test.txt'), 'w') as fw3:
        for i in range(0, len(data) - 1, 2):
            # Route a small, deterministic slice of pairs to valid/test.
            if (i % 500) == 0:
                fout = fw2
            elif (i % 500) == 2:
                fout = fw3
            else:
                fout = fw1

            x = _clean_fb_line(data[i])
            y = _clean_fb_line(data[i + 1])

            if x and y:
                s = 'text:' + x + '\tlabels:' + y + '\tepisode_done:True'
                fout.write('{} \n'.format(s))


def _clean_fb_line(raw):
    """Normalize one utterance: trim spaces, escape tabs and pipes,
    strip emoji, ASCII-fold, split punctuation, collapse whitespace."""
    line = raw.rstrip(' ').lstrip(' ').replace('\t', ' ')
    line = line.replace('|', ' __PIPE__ ')
    line = ''.join(map(replace_emoji, line))
    line = split_punctuation(unidecode.unidecode(line))
    return ' '.join(line.split())
Example #28
Source File: engine.py    From marvin-python-toolbox with Apache License 2.0 5 votes vote down vote up
def _slugify(text, delim='_'):
    """Slugify ``text``: split on punctuation (module-level ``_punct_re``),
    ASCII-fold each chunk, and join the words with ``delim``."""
    words = []
    for chunk in _punct_re.split(text.lower()):
        words.extend(unidecode(chunk).split())
    return six.u(delim.join(words))
Example #29
Source File: cleaners.py    From tn2-wg with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def convert_to_ascii(text):
  """Reduce international text to an ASCII approximation (unidecode)."""
  return unidecode(text)
Example #30
Source File: transliterate.py    From textkit with MIT License 5 votes vote down vote up
def transliterate(file):
    '''Convert international text to ascii.

    Reads the whole input, decodes it via chardet's guessed encoding
    when the content is bytes, then emits the unidecode'd, ASCII-encoded
    result through ``output``.
    '''
    content = ''.join(file.readlines())
    try:
        # bytes -> str using the detected encoding.
        content = content.decode(chardet.detect(content)['encoding'])
    except AttributeError:
        # Strings do not have a decode method in python 3.
        pass
    # Fixed: the call was wrapped in a pointless throwaway list literal
    # (leftover comprehension syntax); invoke it directly.
    output(unidecode(content).encode('ascii', 'ignore'))