Python Levenshtein.distance() Examples

The following are 29 code examples of Levenshtein.distance(), each taken from an open-source project. The source file and project are noted above each example. You may also want to check out the other available functions and classes of the Levenshtein module.
Example #1
Source File: decoder.py, from the training project (Apache License 2.0)

def wer(self, s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (the Levenshtein package only accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]

    return Lev.distance(''.join(w1), ''.join(w2))
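
Because the Levenshtein package operates on strings, this function maps each distinct word to a single character and compares the resulting strings, so edits are counted in words rather than characters. A minimal standalone sketch of that trick (the name word_error_count is ours for illustration; it assumes the python-Levenshtein package is installed):

import Levenshtein as Lev

def word_error_count(s1, s2):
    vocab = set(s1.split() + s2.split())
    word2char = dict(zip(vocab, range(len(vocab))))
    w1 = ''.join(chr(word2char[w]) for w in s1.split())
    w2 = ''.join(chr(word2char[w]) for w in s2.split())
    return Lev.distance(w1, w2)

print(word_error_count("the cat sat", "the cat sat down"))  # 1 (one insertion)
print(word_error_count("see the dog", "see a dog"))         # 1 (one substitution)

Note that a Word Error Rate in the usual sense is this edit count divided by the number of words in the reference sentence.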
Example #2
Source File: decoder.py, from the LipReading project (MIT License)

def wer(self, s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (the Levenshtein package only accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]

    return Lev.distance(''.join(w1), ''.join(w2))
Example #3
Source File: edit_distance.py, from the tensorflow_end2end_speech_recognition project (MIT License)

def compute_per(ref, hyp, normalize=True):
    """Compute Phone Error Rate.
    Args:
        ref (list): phones in the reference transcript
        hyp (list): phones in the predicted transcript
        normalize (bool, optional): if True, divide by the length of ref
    Returns:
        per (float): Phone Error Rate between ref and hyp
    """
    # Build mapping of phone to index
    phone_set = set(ref + hyp)
    phone2char = dict(zip(phone_set, range(len(phone_set))))

    # Map phones to a single char array
    # NOTE: the Levenshtein package only accepts strings
    phones_ref = [chr(phone2char[p]) for p in ref]
    phones_hyp = [chr(phone2char[p]) for p in hyp]

    per = lev.distance(''.join(phones_ref), ''.join(phones_hyp))
    if normalize:
        per /= len(ref)
    return per
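
A hypothetical call, reusing compute_per as defined above with a toy phone inventory:

ref = ['AH', 'B', 'K']
hyp = ['AH', 'P', 'K']
print(compute_per(ref, hyp, normalize=True))  # 1 substitution / 3 reference phones ≈ 0.333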
Example #4
Source File: edit_distance.py, from the tensorflow_end2end_speech_recognition project (MIT License)

def compute_edit_distance(session, labels_true_st, labels_pred_st):
    """Compute edit distance per mini-batch.
    Args:
        session: TensorFlow session
        labels_true_st: A `SparseTensor` of ground truth
        labels_pred_st: A `SparseTensor` of prediction
    Returns:
        edit_distances: list of edit distance of each utterance
    """
    indices, values, dense_shape = labels_true_st
    labels_true_pl = tf.SparseTensor(indices, values, dense_shape)
    indices, values, dense_shape = labels_pred_st
    labels_pred_pl = tf.SparseTensor(indices, values, dense_shape)
    edit_op = tf.edit_distance(labels_pred_pl, labels_true_pl, normalize=True)
    edit_distances = session.run(edit_op)
    return edit_distances
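
tf.edit_distance computes Levenshtein distance directly on SparseTensor label sequences, so no string conversion is needed, and normalize=True divides each distance by the length of the ground-truth sequence. A tiny self-contained sketch in the same TF1 session style as above (under TF2 the same op exists, but the session would have to come from tf.compat.v1):

import tensorflow as tf

# ground truth "1 2" vs. prediction "1 3", as a batch of one utterance
truth = tf.SparseTensor(indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
hyp = tf.SparseTensor(indices=[[0, 0], [0, 1]], values=[1, 3], dense_shape=[1, 2])
edit_op = tf.edit_distance(hyp, truth, normalize=True)

with tf.Session() as sess:
    print(sess.run(edit_op))  # [0.5]: one substitution over two true labels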
Example #5
Source File: BotDigger.py, from the BotDigger project (GNU General Public License v3.0)

def distanceDomain(domain, DomainDict, ccTldDict, tldDict):
    similarDomain = ""
    minDistance = sys.maxint
    level = domain.split(".")
    if len(level) <= 1:
        return ("not a domain", sys.maxint)
    (domain2LD, domain3LD, domain2LDs, domain3LDs) = extractLevelDomain(domain, ccTldDict, tldDict)
    for popularDomain in DomainDict:
        distance = Levenshtein.distance(domain2LD.decode('utf-8'), popularDomain.decode('utf-8'))
        if distance < minDistance:
            minDistance = distance
            similarDomain = popularDomain
    # debug
    # sys.stdout.write("subdomain: %s, similarDomain: %s, minDistance: %d\n" % (subdomain, similarDomain, minDistance))
    if len(similarDomain) > 0:
        return (similarDomain, minDistance / float(len(similarDomain)))
    else:
        return (domain2LD, 0)

# check whether a domain contains invalid TLD
Example #6
Source File: test_string_distances.py, from the dirty_cat project (BSD 3-Clause "New" or "Revised" License)

def _random_common_char_pairs(n_pairs=50, seed=1):
    """
    Return string pairs with a common char at random positions, in order to
    distinguish different thresholds for matching characters in the Jaro
    distance.
    """
    # Make strings with random length and a common char at index 0
    rng = np.random.RandomState(seed=seed)
    list1 = ['a' + 'b' * rng.randint(2, 20) for k in range(n_pairs)]
    list2 = ['a' + 'c' * rng.randint(2, 20) for k in range(n_pairs)]
    # Shuffle the strings
    list1 = [''.join(rng.choice(list(s), size=len(s), replace=False))
             for s in list1]
    list2 = [''.join(rng.choice(list(s), size=len(s), replace=False))
             for s in list2]
    pairs = zip(list1, list2)
    return pairs

# TODO: some factorization of what is common for distances;
# check results for same examples on all distances
Example #7
Source File: decoder.py, from the ngraph-python project (Apache License 2.0)

def wer(self, s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = {ss: ii for ii, ss in enumerate(b)}

    # map the words to a char array (the Levenshtein package only accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]

    return Lev.distance(''.join(w1), ''.join(w2))
Example #8
Source File: metrics.py, from the end2end-asr-pytorch project (MIT License)

def calculate_wer(s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (the Levenshtein package only accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]

    return Lev.distance(''.join(w1), ''.join(w2))
Example #9
Source File: generate_accuracy_report.py, from the namsel project (MIT License)

def _get_compare_data(tif_txt_pair):
    tif = tif_txt_pair[0]
    txt = tif_txt_pair[1]
    if tif[:-4] == txt[:-4]:  # This should always be true
        # ocr = run_main(tif, conf=Config(path='/home/zr/letters/conf/443cf9ec-76c7-44bc-95ad-593138d2d5fc.conf'), text=True)
        # ocr = run_main(tif, conf=Config(segmenter='stochastic', recognizer='hmm', break_width=3.6), text=True)
        ocr = run_main(tif, text=True)
        # ocr = run_all_confs_for_page(tif, text=True)
        ocr = ocr.strip()
        txt = open(txt, 'r').read()
        txt = _normalize_input(txt)
        edit_dist = L.distance(txt, ocr)
        edit_ratio = L.ratio(txt, ocr)
        html = _make_html_diff(txt, ocr)
        data = {'edit_distance': edit_dist,
                'edit_ratio': edit_ratio,
                'filename': os.path.basename(tif),
                'html': html}
        return data
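
The report combines L.distance (an absolute edit count) with L.ratio (a normalized similarity in [0, 1]); the two behave differently, as this standalone check shows (assuming the python-Levenshtein package, imported as L to match the code above):

import Levenshtein as L

a, b = "kitten", "sitting"
print(L.distance(a, b))  # 3 edits
print(L.ratio(a, b))     # ~0.615; ratio weights substitutions differently from distance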
Example #10
Source File: base.py, from the patter project (MIT License)

def wer(self, s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (the Levenshtein package only accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]

    return Lev.distance(''.join(w1), ''.join(w2))
Example #11
Source File: edit_distance.py, from the neural_sp project (Apache License 2.0)

def compute_cer(ref, hyp, normalize=False):
    """Compute Character Error Rate.
    Args:
        ref (str): a sentence without spaces
        hyp (str): a sentence without spaces
        normalize (bool, optional): if True, divide by the length of ref
    Returns:
        cer (float): Character Error Rate between ref and hyp
    """
    import Levenshtein as lev  # TODO(hirofumi): install
    cer = lev.distance(hyp, ref)
    if normalize:
        cer /= len(list(ref))
    return cer * 100
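
A hypothetical call, reusing compute_cer as defined above:

print(compute_cer(ref='hello', hyp='helo', normalize=True))  # 1 deletion / 5 chars * 100 = 20.0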
Example #12
Source File: decoder.py, from the inference project (Apache License 2.0)

def wer(s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (the Levenshtein package only accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]

    return Lev.distance(''.join(w1), ''.join(w2))
Example #13
Source File: decoder.py, from the pytorch-nlp project (MIT License)

def wer(self, s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (the Levenshtein package only accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]

    return Lev.distance(''.join(w1), ''.join(w2))
Example #14
Source File: metrics.py, from the KoSpeech project (Apache License 2.0)

def metric(self, s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (the Levenshtein package only accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]

    return Lev.distance(''.join(w1), ''.join(w2))
Example #15
Source File: metrics.py, from the KoSpeech project (Apache License 2.0)

def metric(self, s1, s2):
    """
    Computes the Character Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to characters.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    s1 = s1.replace(' ', '')
    s2 = s2.replace(' ', '')

    dist = Lev.distance(s2, s1)
    length = len(s1.replace(' ', ''))

    return dist, length
Example #16
Source File: metrics.py, from the KoSpeech project (Apache License 2.0)

def _get_distance(self, targets, y_hats):
    """
    Provides total character distance between targets & y_hats
    Args:
        targets (torch.Tensor): set of ground truth
        y_hats (torch.Tensor): predicted y values (y_hat) by the model
    Returns:
        total_dist, total_length
        - **total_dist**: total distance between targets & y_hats
        - **total_length**: total length of targets sequence
    """
    total_dist = 0
    total_length = 0

    for (target, y_hat) in zip(targets, y_hats):
        s1 = label_to_string(target, self.id2char, self.eos_id)
        s2 = label_to_string(y_hat, self.id2char, self.eos_id)
        dist, length = self.metric(s1, s2)
        total_dist += dist
        total_length += length

    return total_dist, total_length
Example #17
Source File: poetics.py, from the Poetry-Tools project (MIT License)

def guess_metre(tokenized_poem):
    """
    Guess a poem's metre via Levenshtein distance from candidates
    """
    joined_lines = [''.join(line) for line in scanscion(tokenized_poem) if line]
    line_lengths = [len(line) for line in joined_lines]
    num_lines = len(joined_lines)

    metres = []
    for line in joined_lines:
        metres.append(levenshtein(line, POSSIBLE_METRES))

    guessed_metre = max(zip((metres.count(item) for item in set(metres)), set(metres)))[1]

    return joined_lines, num_lines, line_lengths, guessed_metre
Example #18
Source File: poetics.py, from the Poetry-Tools project (MIT License)

def levenshtein(string, candidates):
    """
    Compare a string's Levenshtein distance to each candidate in a dictionary.
    Returns the name of the closest match
    """
    distances = defaultdict(int)
    num_lines = len(string)

    for k, v in candidates.items():
        expanded = False
        # Expand the length of each candidate to match the length of the compared string
        if len(v) != len(string):
            v = (v * (num_lines // len(v) + 1))[:num_lines]
            expanded = True

        edit_distance = distance(string, v)

        # If we expanded the candidate, then it is a worse match than what we have already
        if edit_distance in distances and expanded:
            continue

        distances[edit_distance] = k

    return distances[min(distances)]
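
The tiling step above stretches a short metre pattern to the length of the scanned line before measuring distance. A self-contained illustration of just that expansion (the pattern and line values here are hypothetical):

pattern = '01'       # a two-syllable stress pattern
line = '0101101'     # a scanned line, 7 syllables
expanded = (pattern * (len(line) // len(pattern) + 1))[:len(line)]
print(expanded)      # 0101010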
Example #19
Source File: trainer.py, from the pytorch-asr project (GNU General Public License v3.0)

def validate(self, data_loader):
    "validate with label error rate by the edit distance between hyps and refs"
    self.model.eval()
    with torch.no_grad():
        N, D = 0, 0
        t = tqdm(enumerate(data_loader), total=len(data_loader), desc="validating", ncols=params.NCOLS)
        for i, (data) in t:
            hyps, refs = self.unit_validate(data)
            # calculate ler
            N += self.edit_distance(refs, hyps)
            D += sum(len(r) for r in refs)
            ler = N * 100. / D
            t.set_description(f"validating (LER: {ler:.2f} %)")
            t.refresh()
        logger.info(f"validating at epoch {self.epoch:03d}: LER {ler:.2f} %")

        title = f"validate"
        x = self.epoch - 1 + i / len(data_loader)
        if logger.visdom is not None:
            opts = {
                'xlabel': 'epoch',
                'ylabel': 'LER',
            }
            logger.visdom.add_point(title=title, x=x, y=ler, **opts)
        if logger.tensorboard is not None:
            logger.tensorboard.add_scalars(title, self.global_step, {
                'LER': ler,
            })
Example #20
Source File: align.py, from the hgraph2graph project (MIT License)

def align(xy_tuple):
    x, y = xy_tuple
    xmol, ymol = Chem.MolFromSmiles(x), Chem.MolFromSmiles(y)
    x = Chem.MolToSmiles(xmol, isomericSmiles=False)
    xmol = Chem.MolFromSmiles(x)

    xleaf = get_leaves(xmol)
    yleaf = get_leaves(ymol)

    best_i, best_j = 0, 0
    best = 1000000
    for i in xleaf:
        for j in yleaf:
            new_x = Chem.MolToSmiles(xmol, rootedAtAtom=i, isomericSmiles=False)
            new_y = Chem.MolToSmiles(ymol, rootedAtAtom=j, isomericSmiles=False)
            le = min(len(new_x), len(new_y)) // 2
            dist = Levenshtein.distance(new_x[:le], new_y[:le])
            if dist < best:
                best_i, best_j = i, j
                best = dist

    return Chem.MolToSmiles(xmol, rootedAtAtom=best_i, isomericSmiles=False), \
           Chem.MolToSmiles(ymol, rootedAtAtom=best_j, isomericSmiles=False)
Example #21
Source File: similarity.py, from the DeepFMPO project (MIT License)

def calculateDistance(smi1, smi2):
    return 1 - ETA * Levenshtein.distance(smi1, smi2)

# Calculate the MCS Tanimoto similarity between two molecules
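
ETA is a constant defined elsewhere in the project; the formula rescales a raw edit distance between SMILES strings into a similarity score. A self-contained sketch with an assumed value for ETA:

import Levenshtein

ETA = 0.1  # assumed value for illustration only; the real constant lives in the project

def calculateDistance(smi1, smi2):
    return 1 - ETA * Levenshtein.distance(smi1, smi2)

print(calculateDistance('CCO', 'CCN'))  # 1 - 0.1 * 1 = 0.9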
Example #22
Source File: distance_text_or_vec.py, from the nlp_xiaojiang project (MIT License)

def edit_levenshtein(str1, str2):
    return Leven.distance(str1, str2)
Example #23
Source File: distance_text_or_vec.py, from the nlp_xiaojiang project (MIT License)

def wmd_distance(model, sent1_cut_list, sent2_cut_list):  # WMD distance
    # model.init_sims(replace=True)
    distance = model.wmdistance(sent1_cut_list, sent2_cut_list)
    return distance

# def HamMings_Levenshtein(str1, str2):
#     sim = Leven.hamming(str1, str2)
#     return sim
Example #24
Source File: distance.py, from the text2vec project (Apache License 2.0)

def wmd_distance(model, sent1_cut_list, sent2_cut_list):
    """
    WMD (Word Mover's Distance)
    :param model: gensim word2vec model
    :param sent1_cut_list: tokenized words of the first sentence
    :param sent2_cut_list: tokenized words of the second sentence
    :return: distance
    """
    distance = model.wmdistance(sent1_cut_list, sent2_cut_list)
    return distance
Example #25
Source File: distance.py, from the text2vec project (Apache License 2.0)

def edit_distance(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        import Levenshtein
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
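
The two branches do not produce identical values: the Levenshtein branch normalizes the edit count by the longer string, while the difflib fallback uses SequenceMatcher's own ratio. A self-contained comparison:

from difflib import SequenceMatcher
import Levenshtein

s1, s2 = 'kitten', 'sitting'
print(Levenshtein.distance(s1, s2) / float(max(len(s1), len(s2))))  # 3/7 ≈ 0.429
print(1. - SequenceMatcher(lambda x: x == ' ', s1, s2).ratio())     # ≈ 0.385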
Example #26
Source File: align_wordlists.py, from the panphon project (MIT License)

def dogol_leven_dist(_, a, b):
    return Levenshtein.distance(dist.map_to_dogol_prime(a),
                                dist.map_to_dogol_prime(b))
Example #27
Source File: align_wordlists.py, from the panphon project (MIT License)

def levenshtein_dist(_, a, b):
    return Levenshtein.distance(a, b)
Example #28
Source File: utils.py, from the Particle-Cloud-Framework project (Apache License 2.0)

def similar_strings(given_str, search_list=[]):
    """
    Returns a list of similar strings to given_str from an iterable of
    potentially similar strings, search_list.
    """
    threshold = ceil(len(given_str) / 2.5)
    similar = [
        st for st in search_list
        if distance(given_str.lower(), st.lower()) <= threshold
    ]
    return similar
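
A hypothetical call, reusing similar_strings as defined above (here distance is Levenshtein.distance and ceil is math.ceil): for a 5-character query the threshold works out to ceil(5 / 2.5) = 2 edits.

print(similar_strings('color', ['colour', 'couleur', 'collar']))
# ['colour', 'collar']: each is within 2 edits of 'color', while 'couleur' is 3 edits away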
Example #29
Source File: train.py, from the pytorch-asr project (GNU General Public License v3.0)

def edit_distance(self, refs, hyps):
    assert len(refs) == len(hyps)
    n = 0
    for ref, hyp in zip(refs, hyps):
        r = [chr(c) for c in ref]
        h = [chr(c) for c in hyp]
        n += Lev.distance(''.join(r), ''.join(h))
    return n
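
The same map-to-characters trick used for words works for integer label sequences: each label id becomes one character, so the package's string distance counts label-level edits. A self-contained check with toy label sequences:

import Levenshtein as Lev

refs = [[1, 2, 3], [4, 5]]
hyps = [[1, 2, 4], [4, 5]]
n = 0
for ref, hyp in zip(refs, hyps):
    n += Lev.distance(''.join(chr(c) for c in ref), ''.join(chr(c) for c in hyp))
print(n)  # 1: a single substitution in the first pair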