Python unidecode.unidecode() Examples
The following are 30 code examples of unidecode.unidecode(), collected from open-source projects. The source file, project, and license are noted above each example.
Example #1
Source File: models.py From arguman.org with GNU Affero General Public License v3.0

def save(self, *args, **kwargs):
    """
    - Make unique slug if it is not given.
    """
    if not self.slug:
        slug = slugify(unidecode(self.title))
        duplications = Contention.objects.filter(slug=slug)
        if duplications.exists():
            self.slug = "%s-%s" % (slug, uuid4().hex)
        else:
            self.slug = slug
    if not kwargs.pop('skip_date_update', False):
        self.date_modification = datetime.now()
    return super(Contention, self).save(*args, **kwargs)
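Note that slugify alone silently drops characters it cannot map to ASCII, which is why the model passes the title through unidecode first. A quick illustration, assuming Django's slugify:

from django.utils.text import slugify
from unidecode import unidecode

print(slugify("Москва"))             # '' -- Cyrillic is dropped entirely
print(slugify(unidecode("Москва")))  # 'moskva'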
Example #2
Source File: utils.py From dissemin with GNU Affero General Public License v3.0

def remove_diacritics(s):
    """
    Removes diacritics using the `unidecode` package.

    :param s: a str or bytes string
    :returns: if bytes: the same string. if str: the unidecoded string.

    >>> remove_diacritics('aéèï')
    'aeei'

    >>> remove_diacritics('aéè'.encode('utf-8'))
    b'a\\xc3\\xa9\\xc3\\xa8'
    """
    if isinstance(s, str):
        # for issue #305
        # because I have no idea what the general solution for this would be
        s = s.replace("’", "'")
        return unidecode(s)
    else:
        return s
Example #3
Source File: make_video_analysis.py From edx2bigquery with GNU General Public License v2.0

def findVideoLength(dataset, youtube_id, api_key=None):
    ''' Handle video length lookup '''
    try:
        youtube_id = unidecode(youtube_id)
    except Exception as err:
        print "youtube_id is not ascii? ytid=", youtube_id
        return 0
    try:
        assert youtube_id is not None, "[analyze videos] youtube id does not exist"
        content, stats = get_youtube_api_stats(youtube_id=youtube_id, api_key=api_key, part=YOUTUBE_PARTS)
        durationDict = parseISOduration(content['duration'].encode("ascii", "ignore"))
        length = getTotalTimeSecs(durationDict)
        print "[analyze videos] totalTime for youtube video %s is %s sec" % (youtube_id, length)
    except (AssertionError, Exception) as err:
        print "Failed to lookup video length for %s! Error=%s, data=%s" % (youtube_id, err, dataset)
        length = 0
    return length

#-----------------------------------------------------------------------------
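The parseISOduration and getTotalTimeSecs helpers are defined elsewhere in edx2bigquery and are not shown here. A minimal sketch of equivalent logic (the function name and regex below are hypothetical), assuming the YouTube Data API's ISO 8601 durations of the form PT#H#M#S:

import re

def iso_duration_to_seconds(duration):
    # Hypothetical stand-in for parseISOduration + getTotalTimeSecs
    m = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', duration)
    hours, minutes, seconds = (int(g) if g else 0 for g in m.groups())
    return hours * 3600 + minutes * 60 + seconds

print(iso_duration_to_seconds('PT1H2M10S'))  # 3730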
Example #4
Source File: invoice_template.py From invoice2data with MIT License

def prepare_input(self, extracted_str):
    """
    Input raw string and do transformations, as set in template file.
    """
    # Remove whitespace
    if self.options["remove_whitespace"]:
        optimized_str = re.sub(" +", "", extracted_str)
    else:
        optimized_str = extracted_str

    # Remove accents
    if self.options["remove_accents"]:
        optimized_str = unidecode(optimized_str)

    # Convert to lower case
    if self.options["lowercase"]:
        optimized_str = optimized_str.lower()

    # Specific replacements
    for replace in self.options["replace"]:
        assert len(replace) == 2, "A replace should be a list of 2 items"
        optimized_str = optimized_str.replace(replace[0], replace[1])

    return optimized_str
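A minimal trace of the same pipeline outside the class, assuming a template with remove_whitespace, remove_accents, and lowercase all enabled (the sample string is invented):

import re
from unidecode import unidecode

extracted_str = "Révision  Café  Nº 42"
optimized_str = re.sub(" +", "", extracted_str)  # removes every space, not just runs
optimized_str = unidecode(optimized_str)         # 'RevisionCafeNo42'
print(optimized_str.lower())                     # 'revisioncafeno42'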
Example #5
Source File: downloadRecipes.py From extract_recipe with Apache License 2.0

def get_url_markdown(baseurl, start, increment):
    '''
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
    try:
        j = opener.open(baseurl)
    except:
        return None
    data = j.read()
    '''
    urlHandler = urllib2.urlopen(baseurl)
    data = urlHandler.read()
    '''
    os.system('wget -O temp' + str(start)+"_"+str(increment) + ' ' + baseurl)
    data = open('temp' + str(start)+"_"+str(increment),'rU').read()
    '''
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.body_width = 10000
    data = h.handle(unidecode(unicode(data, errors='ignore')))
    return unidecode(data)
Example #6
Source File: gftools-fix-ascii-fontmetadata.py From gftools with Apache License 2.0

def normalizestr(string):
    """ Converts special characters like copyright, trademark signs to ascii name """
    # print("input: '{}'".format(string))
    input_string = string
    for mark, ascii_repl in unicode_marks(string):
        string = string.replace(mark, ascii_repl)

    rv = []
    # for c in unicodedata.normalize('NFKC', smart_text(string)):
    for c in unicodedata.normalize('NFKC', string):
        # cat = unicodedata.category(c)[0]
        # if cat in 'LN' or c in ok:
        rv.append(c)

    new = ''.join(rv).strip()
    result = unidecode(new)
    if result != input_string:
        print("Fixed string: '{}'".format(result))
    return result
Example #7
Source File: MdownloadRecipes.py From extract_recipe with Apache License 2.0

def get_url_markdown(baseurl, start, increment):
    try:
        '''
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
        try:
            j = opener.open(baseurl)
        except:
            return None
        data = j.read()
        '''
        urlHandler = urllib2.urlopen(baseurl)
        data = urlHandler.read()
        '''
        os.system('wget -O temp' + str(start)+"_"+str(increment) + ' ' + baseurl)
        data = open('temp' + str(start)+"_"+str(increment),'rU').read()
        '''
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        h.body_width = 10000
        data = h.handle(unidecode(unicode(data, errors='ignore')))
        return unidecode(data)
    except:
        return None
Example #8
Source File: nlu_client.py From idunn with Apache License 2.0

def fuzzy_match(cls, query, response):
    """ Does the response match the query reasonably well?

    >>> NLU_Helper.fuzzy_match("bastille", "Beuzeville-la-Bastille")
    False
    >>> NLU_Helper.fuzzy_match("paris 20", "Paris 20e Arrondissement")
    True
    >>> NLU_Helper.fuzzy_match("av victor hugo paris", "Avenue Victor Hugo")
    True
    """
    q = unidecode(query.strip()).lower()
    r = unidecode(response).lower()
    if r[: len(q)] == q:
        # Response starts with query
        return True
    if sum((Counter(r) - Counter(q)).values()) < len(q):
        # Number of missing chars to match the response is low
        # compared to the query length
        return True
    return False
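The second check counts how many characters of the response are left uncovered by the query. For the last doctest above:

from collections import Counter

q = "av victor hugo paris"
r = "avenue victor hugo"
print(sum((Counter(r) - Counter(q)).values()))  # 4, well below len(q) == 20, so it matches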
Example #9
Source File: utils.py From pycon with MIT License

def dict_to_xml(dict: XMLDict):
    tags: List[etree._Element] = []

    for key, value in dict.items():
        # skip empty value
        if not value:
            continue

        if isinstance(value, (Dict, List)):
            if not isinstance(value, List):
                value = [value]

            for item in value:
                tag = etree.Element(key)
                for subtag in dict_to_xml(item):
                    tag.append(subtag)
                tags.append(tag)
        else:
            if isinstance(value, (int, float, Decimal)):
                value = str(value)

            value = unidecode.unidecode(value).encode("latin_1")

            for tag in _split_tags(key, value):
                tags.append(tag)

    return tags
Example #10
Source File: downloadRecipes.py From extract_recipe with Apache License 2.0

def get_url_markdown(baseurl):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0')]
    j = opener.open(baseurl)
    data = j.read()
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.body_width = 10000
    data = h.handle(unidecode(unicode(data, errors='ignore')))
    return unidecode(data)
Example #11
Source File: MdownloadRecipes.py From extract_recipe with Apache License 2.0

def worker(start, increment):
    logger = logging.getLogger('worker'+str(start)+"_"+str(increment))
    """thread worker function"""
    print 'Worker: %s/%s' % (start, increment)
    indexFile = 'recipes/index'+str(start)+"_"+str(increment)+'.txt'
    lastLine = ""
    if os.path.isfile(indexFile):
        with open(indexFile, 'rb') as f:
            for line in f:
                lastLine = line
        lastfileNum = int(lastLine.split()[0])
    else:
        lastfileNum = -1
    fileNum = 0
    t = time.time()
    with open('recipeitems-latest.json', 'rb') as f:
        for line in f:
            fileNum = fileNum + 1
            if fileNum % increment == start:
                folderSave = str(int(fileNum/500))
                if not os.path.exists('recipes/' + folderSave):
                    os.makedirs('recipes/' + folderSave)
                if fileNum > lastfileNum:
                    recipe = json.loads(line)
                    logger.info(str(fileNum) + "\t" + recipe['url'] + '\t' + recipe['name'])
                    t = time.time()
                    recipeMD = get_url_markdown(recipe['url'], start, increment)
                    logger.info('%s seconds' % str(round(time.time()-t, 1)))
                    if recipeMD is not None:
                        with open('recipes/' + folderSave + '/' + str(fileNum) + '.md', 'wb') as g:
                            g.write(recipeMD)
                        #os.system('bzip2 ' + 'recipes/' + folderSave + '/' + str(fileNum) + '.md')
                        with open(indexFile, 'a') as g:
                            g.write(str(fileNum) + "\t" + recipe['url'] + '\t' + unidecode(recipe['name']) + '\n')
                    else:
                        with open(indexFile, 'a') as g:
                            g.write(str(fileNum) + "\t" + recipe['url'] + '\t' + 'None' + '\n')
    return
Example #12
Source File: match.py From osm-wikidata with GNU General Public License v3.0

def tidy_name(n):
    # expects to be passed a name in lowercase
    n = unidecode(n).strip().rstrip("'")
    n = n.replace(' no. ', ' number ')
    n = n.replace('saint ', 'st ')
    n = n.replace('mount ', 'mt ')
    n = n.replace(' mountain', ' mtn')
    n = n.replace(' county', ' co')
    n = n.replace(' church of england ', ' ce ')
    n = n.replace(' cofe ', ' ce ')
    n = n.replace(' c of e ', ' ce ')
    n = n.replace(' @ ', ' at ')
    n = n.replace(' roman catholic ', ' rc ')
    n = n.replace(' catholic ', ' rc ')
    n = n.replace(' preparatory school', ' prep school')
    n = n.replace(' incorporated', ' inc')
    n = n.replace(' cooperative', ' coop')
    n = n.replace(' co-operative', ' coop')
    n = n.replace(' hotel and country club', ' hotel')
    n = n.replace(' hotel and spa', ' hotel')
    n = n.replace(' missionary baptist', ' baptist')
    if n.endswith("'s"):
        n = n[:-2]
    n = re_plural.sub('', n)
    n = n.replace('ss', 's')
    n = n.replace('center', 'centre').replace('theater', 'theatre')
    return n
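For example, assuming re_plural (defined elsewhere in match.py) leaves this input unchanged, a hypothetical call collapses a long name into a compact comparable form:

print(tidy_name("mount saint mary church of england school"))
# 'mt st mary ce school'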
Example #13
Source File: process.py From matscholar with MIT License

def remove_accent(txt):
    """
    Removes accents from a string
    :param txt: input text
    :return: de-accented text
    """
    # there is a problem with angstrom sometimes, so ignoring length 1 strings
    return unidecode.unidecode(txt) if len(txt) > 1 else txt
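The length guard matters because unidecode would otherwise fold the angstrom sign into a plain letter:

print(remove_accent("Å"))     # 'Å' -- single characters pass through untouched
print(remove_accent("café"))  # 'cafe'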
Example #14
Source File: cleaners.py From libfaceid with MIT License

def convert_to_ascii(text):
    return unidecode(text)
Example #15
Source File: particles.py From tossi with BSD 3-Clause "New" or "Revised" License

def __repr__(self):
    try:
        from unidecode import unidecode
    except ImportError:
        return '<Particle: %r>' % self.tolerance()
    else:
        return '<Particle: %s>' % unidecode(self.tolerance())
Example #16
Source File: utils.py From pclpy with MIT License

def clean_doxygen(doxygen):
    replace = [
        ("/** ", ""),
        ("* ", ""),
        ("\n*/", ""),
        ("*\n", "\n"),
        ("{", ""),
        ("}", ""),
        ("<b>", ""),
        ("</b>", ""),
    ]
    for k, v in replace:
        doxygen = doxygen.replace(k, v)
    doxygen = unidecode(doxygen)
    return doxygen
Example #17
Source File: extras.py From clist with Apache License 2.0

def slug(value):
    return slugify(unidecode(value))
Example #18
Source File: models.py From Politikon with GNU General Public License v2.0

def get_relative_url(self):
    return '/event/%(id)d-%(title)s' % {'id': self.id, 'title': slugify(unidecode(self.title))}
Example #19
Source File: pubmed_oa_parser.py From pubmed_parser with MIT License

def table_to_df(table_text):
    """
    Transform an input table XML string into a list of column names
    and a list of row values.

    Parameters
    ----------
    table_text: str
        An XML string of a table parsed from PubMed OA

    Return
    ------
    columns, row_values: tuple (list, list)
        ``columns`` is a list of column names of the table,
        ``row_values`` is a list of lists of values in the table
    """
    table_tree = etree.fromstring(table_text)
    columns = []
    for tr in table_tree.xpath("thead/tr"):
        for c in tr.getchildren():
            columns.append(unidecode(stringify_children(c)))

    row_values = []
    len_rows = []
    for tr in table_tree.findall("tbody/tr"):
        es = tr.xpath("td")
        row_value = [unidecode(stringify_children(e)) for e in es]
        len_rows.append(len(es))
        row_values.append(row_value)

    if len(len_rows) >= 1:
        len_row = max(set(len_rows), key=len_rows.count)
        row_values = [r for r in row_values if len(r) == len_row]  # remove rows with a different length
        return columns, row_values
    else:
        return None, None
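A small usage sketch with a toy table (the XML below is invented, not real PubMed output, and assumes pubmed_parser's stringify_children helper is importable alongside the function):

table_text = """<table>
  <thead><tr><td>Element</td><td>Tc (K)</td></tr></thead>
  <tbody>
    <tr><td>Nb</td><td>9.2</td></tr>
    <tr><td>Pb</td><td>7.2</td></tr>
  </tbody>
</table>"""
columns, row_values = table_to_df(table_text)
print(columns)     # should print ['Element', 'Tc (K)']
print(row_values)  # should print [['Nb', '9.2'], ['Pb', '7.2']]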
Example #20
Source File: plugin.py From limnoria-plugins with Do What The F*ck You Want To Public License

def clean(self, text):
    text = unidecode(text)
    if len(text) > 2:
        text = re.sub("[^a-zA-Z0-9 ]+", "", text)
        text = re.sub("^a |^an |^the |^or ", "", text).replace(" ", "")
    else:
        text = re.sub("[^a-zA-Z0-9]+", "", text)
    return text
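A standalone trace of the method body (self omitted; assumes the plugin lowercases answers before cleaning):

import re
from unidecode import unidecode

text = unidecode("the béatles!")           # 'the beatles!'
text = re.sub("[^a-zA-Z0-9 ]+", "", text)  # 'the beatles'
text = re.sub("^a |^an |^the |^or ", "", text).replace(" ", "")
print(text)                                # 'beatles'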
Example #21
Source File: banphrase.py From pajbot with MIT License

def format_message(self, message):
    if self.case_sensitive is False:
        message = message.lower()
    if self.remove_accents:
        message = unidecode(message).strip()
    return message
Example #22
Source File: cleaners.py From vae_tacotron2 with MIT License

def convert_to_ascii(text):
    return unidecode(text)
Example #23
Source File: reports_to_gml.py From arguman.org with GNU Affero General Public License v3.0

def create_conjunction_graph(self):
    fallacy_map = {
        unidecode(key): value
        for (key, value) in get_fallacy_types()
    }
    for contention in Contention.objects.all():
        for premise in contention.premises.all():
            fallacies = filter(None, premise.reports.values_list(
                'fallacy_type', flat=True))
            fallacies = [
                fallacy_map[unidecode(_f)]
                for _f in fallacies
            ]
            fallacies_set = set(fallacies)
            for fallacy in fallacies_set:
                graph.add_edges_from([
                    (unidecode(self.normalize(fallacy)),
                     unidecode(self.normalize(_f)))
                    for _f in fallacies_set
                    if _f != fallacy
                ])
    nx.write_gml(graph, 'conjunction.gml')
Example #24
Source File: premise_tags.py From arguman.org with GNU Affero General Public License v3.0

def parse_markdown_tabs(text):
    start, end = '<h1>', '</h1>'
    tab_template = '<div class="tab-content" id="%(slug)s">%(content)s</div>'
    title_template = '<a class="tab-title" href="#%(slug)s">%(name)s</a>'
    if start not in text:
        return text
    titles = []
    tabs = []
    for tab in text.split(start)[1:]:
        title, content = tab.split(end)
        slug = slugify(unidecode(title))
        titles.append(title_template % {
            'name': title,
            'slug': slug
        })
        tabs.append(tab_template % {
            'content': content,
            'slug': slug
        })
    return '\n'.join(titles + tabs)
Example #25
Source File: models.py From arguman.org with GNU Affero General Public License v3.0

def save(self, *args, **kwargs):
    if not self.slug:
        slug = slugify(unidecode(self.text))
        self.slug = slug
    return super(Channel, self).save(*args, **kwargs)
Example #26
Source File: models.py From arguman.org with GNU Affero General Public License v3.0

def save(self, *args, **kwargs):
    if not self.slug:
        slug = slugify(unidecode(self.text))
        duplications = Noun.objects.filter(slug=slug, language=self.language)
        if duplications.exists():
            self.slug = "%s-%s" % (slug, uuid4().hex)
        else:
            self.slug = slug
    return super(Noun, self).save(*args, **kwargs)
Example #27
Source File: build.py From ParlAI with MIT License

def create_fb_format(data, dpath):
    fw1 = open(os.path.join(dpath, 'train.txt'), 'w')
    fw2 = open(os.path.join(dpath, 'valid.txt'), 'w')
    fw3 = open(os.path.join(dpath, 'test.txt'), 'w')
    for i in range(0, len(data) - 1, 2):
        fout = fw1
        if (i % 500) == 0:
            fout = fw2
        elif (i % 500) == 2:
            fout = fw3
        use = True
        x = data[i].rstrip(' ').lstrip(' ').replace('\t', ' ')
        y = data[i + 1].rstrip(' ').lstrip(' ').replace('\t', ' ')
        x = x.replace('|', ' __PIPE__ ')
        y = y.replace('|', ' __PIPE__ ')
        x = ''.join(list(map(replace_emoji, x)))
        y = ''.join(list(map(replace_emoji, y)))
        x = split_punctuation(unidecode.unidecode(x))
        y = split_punctuation(unidecode.unidecode(y))
        x = ' '.join(x.split())
        y = ' '.join(y.split())
        if len(x) < 1 or len(y) < 1:
            use = False
        if use:
            s = 'text:' + x + '\tlabels:' + y + '\tepisode_done:True'
            fout.write('{} \n'.format(s))
    fw1.close()
    fw2.close()
    fw3.close()
Example #28
Source File: engine.py From marvin-python-toolbox with Apache License 2.0

def _slugify(text, delim='_'):
    result = []
    for word in _punct_re.split(text.lower()):
        result.extend(unidecode(word).split())
    return six.u(delim.join(result))
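_punct_re is defined elsewhere in engine.py. Assuming the classic Flask slugify-snippet pattern (the regex below is an assumption, not marvin's actual definition), the filter behaves like this:

import re

_punct_re = re.compile(r'[\t !"#$%&\'()*\-/<=>?@\[\\\]^_`{|},.]+')  # assumed definition
print(_slugify("Déjà Vu, Café!"))  # 'deja_vu_cafe'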
Example #29
Source File: cleaners.py From tn2-wg with BSD 3-Clause "New" or "Revised" License

def convert_to_ascii(text):
    return unidecode(text)
Example #30
Source File: transliterate.py From textkit with MIT License

def transliterate(file):
    '''Convert international text to ascii.'''
    content = ''.join(file.readlines())
    try:
        content = content.decode(chardet.detect(content)['encoding'])
    except AttributeError:
        # Strings do not have a decode method in python 3.
        pass
    [output(unidecode(content).encode('ascii', 'ignore'))]