Python Levenshtein.distance() Examples

The following are 30 code examples of Levenshtein.distance(), collected from open-source projects. You can follow the links above each example to the original project and source file. You may also want to check out all available functions and classes of the Levenshtein module.
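For orientation, here is a minimal sketch of the function itself (this assumes the python-Levenshtein package is installed, e.g. via pip install python-Levenshtein):

import Levenshtein

# Number of single-character insertions, deletions, and substitutions
# needed to turn one string into the other
print(Levenshtein.distance('kitten', 'sitting'))  # 3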
Example #1
Source File: decoder.py    From training with Apache License 2.0
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
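A short usage sketch of the word-to-character trick above, with hypothetical sentences (each distinct word becomes one character, so character-level edit distance counts word-level edits):

import Levenshtein as Lev

s1, s2 = 'the cat sat', 'the cat sit down'
b = set(s1.split() + s2.split())
word2char = dict(zip(b, range(len(b))))
w1 = ''.join(chr(word2char[w]) for w in s1.split())
w2 = ''.join(chr(word2char[w]) for w in s2.split())
print(Lev.distance(w1, w2))  # 2: substitute 'sat' -> 'sit', insert 'down'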
Example #2
Source File: decoder.py    From LipReading with MIT License
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #3
Source File: decoder.py    From LipReading with MIT License
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #4
Source File: edit_distance.py    From tensorflow_end2end_speech_recognition with MIT License
def compute_per(ref, hyp, normalize=True):
    """Compute Phone Error Rate.
    Args:
        ref (list): phones in the reference transcript
        hyp (list): phones in the predicted transcript
        normalize (bool, optional): if True, divide by the length of ref
    Returns:
        per (float): Phone Error Rate between ref and hyp
    """
    # Build mapping of phone to index
    phone_set = set(ref + hyp)
    phone2char = dict(zip(phone_set, range(len(phone_set))))

    # Map phones to a single char array
    # NOTE: the Levenshtein package only accepts strings
    phones_ref = [chr(phone2char[p]) for p in ref]
    phones_hyp = [chr(phone2char[p]) for p in hyp]

    per = lev.distance(''.join(phones_ref), ''.join(phones_hyp))
    if normalize:
        per /= len(ref)
    return per 
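A usage sketch with hypothetical phone lists (assuming the module imports Levenshtein as lev, which compute_per relies on):

# one substitution ('ae' -> 'a') over a reference of three phones
print(compute_per(['k', 'ae', 't'], ['k', 'a', 't']))  # 0.333...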
Example #5
Source File: edit_distance.py    From tensorflow_end2end_speech_recognition with MIT License
def compute_edit_distance(session, labels_true_st, labels_pred_st):
    """Compute edit distance per mini-batch.
    Args:
        session:
        labels_true_st: A `SparseTensor` of ground truth
        labels_pred_st: A `SparseTensor` of prediction
    Returns:
        edit_distances: list of edit distances, one per utterance
    """
    indices, values, dense_shape = labels_true_st
    labels_true_pl = tf.SparseTensor(indices, values, dense_shape)
    indices, values, dense_shape = labels_pred_st
    labels_pred_pl = tf.SparseTensor(indices, values, dense_shape)

    edit_op = tf.edit_distance(labels_pred_pl, labels_true_pl, normalize=True)
    edit_distances = session.run(edit_op)

    return edit_distances 
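A minimal usage sketch, assuming a TensorFlow 1.x-style session (on TF2 this would go through tf.compat.v1); the (indices, values, dense_shape) tuples below encode one utterance each:

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

# labels [0, 1, 2] vs [0, 1, 3]: one substitution, normalized by truth length 3
labels_true_st = ([[0, 0], [0, 1], [0, 2]], [0, 1, 2], [1, 3])
labels_pred_st = ([[0, 0], [0, 1], [0, 2]], [0, 1, 3], [1, 3])

with tf.Session() as sess:
    print(compute_edit_distance(sess, labels_true_st, labels_pred_st))  # [0.333...]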
Example #6
Source File: BotDigger.py    From BotDigger with GNU General Public License v3.0
def distanceDomain(domain, DomainDict, ccTldDict, tldDict):
    similarDomain = ""
    minDistance = sys.maxsize  # sys.maxint no longer exists in Python 3
    level = domain.split(".")
    if len(level) <= 1:
        return ("not a domain", sys.maxsize)
    (domain2LD, domain3LD, domain2LDs, domain3LDs) = extractLevelDomain(domain, ccTldDict, tldDict)
    for popularDomain in DomainDict:
        # in Python 3 these are already str, so no .decode() is needed
        distance = Levenshtein.distance(domain2LD, popularDomain)
        if distance < minDistance:
            minDistance = distance
            similarDomain = popularDomain
    if len(similarDomain) > 0:
        return (similarDomain, minDistance / float(len(similarDomain)))
    else:
        return (domain2LD, 0)

# check whether a domain contains invalid TLD 
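The heart of the check is a length-normalized Levenshtein distance between a candidate domain and each popular domain; a standalone sketch with hypothetical strings (extractLevelDomain and the TLD dictionaries are BotDigger's own helpers):

import Levenshtein

popular = 'google'
candidate = 'goog1e'
print(Levenshtein.distance(candidate, popular) / float(len(popular)))  # 0.1666...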
Example #7
Source File: test_string_distances.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def _random_common_char_pairs(n_pairs=50, seed=1):
    """
    Return string pairs with a common char at random positions, in order to
    distinguish different thresholds for matching characters in Jaro
    distance.
    """
    # Make strings with random length and common char at index 0
    rng = np.random.RandomState(seed=seed)
    list1 = ['a' + 'b' * rng.randint(2, 20) for k in range(n_pairs)]
    list2 = ['a' + 'c' * rng.randint(2, 20) for k in range(n_pairs)]
    # Shuffle strings
    list1 = [''.join(rng.choice(
        list(s), size=len(s), replace=False)) for s in list1]
    list2 = [''.join(rng.choice(
        list(s), size=len(s), replace=False)) for s in list2]
    pairs = zip(list1, list2)
    return pairs


# TODO: some factorization of what is common for distances;
# check results for same examples on all distances 
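A usage sketch of the helper (it assumes numpy is imported as np in the test module, as the rng calls require; Levenshtein.jaro is used here just to illustrate one of the distances under test):

import Levenshtein

for s1, s2 in _random_common_char_pairs(n_pairs=3):
    print(s1, s2, Levenshtein.jaro(s1, s2))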
Example #8
Source File: decoder.py    From ngraph-python with Apache License 2.0
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = {ss: ii for ii, ss in enumerate(b)}

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #9
Source File: metrics.py    From end2end-asr-pytorch with MIT License
def calculate_wer(s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """

    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (the Levenshtein package only
    # accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]

    return Lev.distance(''.join(w1), ''.join(w2)) 
Example #10
Source File: generate_accuracy_report.py    From namsel with MIT License
def _get_compare_data(tif_txt_pair):
    tif = tif_txt_pair[0]
    txt = tif_txt_pair[1]
    if tif[:-4] != txt[:-4]:  # the stems should always match
        raise ValueError('tif and txt filenames do not match: %s, %s' % (tif, txt))
    ocr = run_main(tif, text=True)
    ocr = ocr.strip()
    with open(txt, 'r') as f:
        txt = _normalize_input(f.read())
    edit_dist = L.distance(txt, ocr)
    edit_ratio = L.ratio(txt, ocr)
    html = _make_html_diff(txt, ocr)
    data = {'edit_distance': edit_dist,
            'edit_ratio': edit_ratio,
            'filename': os.path.basename(tif),
            'html': html
            }
    return data
Example #11
Source File: base.py    From patter with MIT License
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #12
Source File: edit_distance.py    From neural_sp with Apache License 2.0
def compute_cer(ref, hyp, normalize=False):
    """Compute Character Error Rate.

    Args:
        ref (str): a sentence without spaces
        hyp (str): a sentence without spaces
        normalize (bool, optional): if True, divide by the length of ref
    Returns:
        cer (float): Character Error Rate between ref and hyp

    """
    import Levenshtein as lev  # imported lazily; requires the python-Levenshtein package
    cer = lev.distance(hyp, ref)
    if normalize:
        cer /= len(ref)
    return cer * 100 
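A usage sketch with hypothetical strings:

# one edit over a reference of five characters
print(compute_cer('abcde', 'abde', normalize=True))  # 20.0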
Example #13
Source File: decoder.py    From inference with Apache License 2.0
def wer(s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #14
Source File: decoder.py    From pytorch-nlp with MIT License
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #15
Source File: metrics.py    From KoSpeech with Apache License 2.0
def metric(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.

        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #16
Source File: metrics.py    From KoSpeech with Apache License 2.0
def metric(self, s1, s2):
        """
        Computes the Character Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to characters.

        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """
        s1 = s1.replace(' ', '')
        s2 = s2.replace(' ', '')

        dist = Lev.distance(s2, s1)
        length = len(s1)  # spaces were already removed above


        return dist, length 
Example #17
Source File: metrics.py    From KoSpeech with Apache License 2.0
def _get_distance(self, targets, y_hats):
        """
        Provides total character distance between targets & y_hats

        Args:
            targets (torch.Tensor): set of ground truth
            y_hats (torch.Tensor): predicted y values (y_hat) by the model

        Returns: total_dist, total_length
            - **total_dist**: total distance between targets & y_hats
            - **total_length**: total length of targets sequence
        """
        total_dist = 0
        total_length = 0

        for (target, y_hat) in zip(targets, y_hats):
            s1 = label_to_string(target, self.id2char, self.eos_id)
            s2 = label_to_string(y_hat, self.id2char, self.eos_id)

            dist, length = self.metric(s1, s2)

            total_dist += dist
            total_length += length

        return total_dist, total_length 
Example #18
Source File: poetics.py    From Poetry-Tools with MIT License
def guess_metre(tokenized_poem):
    """
    Guess a poem's metre via Levenshtein distance from candidates
    """

    joined_lines = [''.join(line) for line in scanscion(tokenized_poem) if line]
    line_lengths = [len(line) for line in joined_lines]
    num_lines = len(joined_lines)

    metres = []
    for line in joined_lines:
        metres.append(levenshtein(line, POSSIBLE_METRES))

    # The metre matched most often across lines is taken as the poem's metre
    guessed_metre = max(zip((metres.count(item) for item in set(metres)), set(metres)))[1]

    return joined_lines, num_lines, line_lengths, guessed_metre 
Example #19
Source File: poetics.py    From Poetry-Tools with MIT License
def levenshtein(string, candidates):
    """
    Compare a string's Levenshtein distance to each candidate in a dictionary. 
    Returns the name of the closest match
    """

    distances = {}
    num_lines = len(string)

    for k, v in candidates.items():
        expanded = False
        # Expands the length of each candidate to match the length of the compared string
        if len(v) != len(string):
            v = (v * (num_lines // len(v) + 1))[:num_lines]
            expanded = True

        edit_distance = distance(string, v)

        # If we expanded the candidate, it is a worse match than an
        # unexpanded candidate already recorded at the same distance
        if edit_distance in distances and expanded:
            continue

        distances[edit_distance] = k

    return distances[min(distances)] 
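A usage sketch with a hypothetical candidate dictionary (in Poetry-Tools the POSSIBLE_METRES constant plays this role, mapping metre names to stress patterns):

candidates = {'iambic': '01', 'trochaic': '10'}
# '01' expands to '0101' and matches exactly, so 'iambic' wins
print(levenshtein('0101', candidates))  # iambic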
Example #20
Source File: trainer.py    From pytorch-asr with GNU General Public License v3.0
def validate(self, data_loader):
        "validate with label error rate by the edit distance between hyps and refs"
        self.model.eval()
        with torch.no_grad():
            N, D = 0, 0
            t = tqdm(enumerate(data_loader), total=len(data_loader), desc="validating", ncols=params.NCOLS)
            for i, data in t:
                hyps, refs = self.unit_validate(data)
                # calculate ler
                N += self.edit_distance(refs, hyps)
                D += sum(len(r) for r in refs)
                ler = N * 100. / D
                t.set_description(f"validating (LER: {ler:.2f} %)")
                t.refresh()
            logger.info(f"validating at epoch {self.epoch:03d}: LER {ler:.2f} %")

            title = "validate"
            x = self.epoch - 1 + i / len(data_loader)
            if logger.visdom is not None:
                opts = { 'xlabel': 'epoch', 'ylabel': 'LER', }
                logger.visdom.add_point(title=title, x=x, y=ler, **opts)
            if logger.tensorboard is not None:
                logger.tensorboard.add_scalars(title, self.global_step, { 'LER': ler, }) 
Example #21
Source File: align.py    From hgraph2graph with MIT License
def align(xy_tuple):
    x, y = xy_tuple
    xmol, ymol = Chem.MolFromSmiles(x), Chem.MolFromSmiles(y)
    x = Chem.MolToSmiles(xmol, isomericSmiles=False)
    xmol = Chem.MolFromSmiles(x)

    xleaf = get_leaves(xmol)
    yleaf = get_leaves(ymol)

    best_i, best_j = 0, 0
    best = float('inf')  # larger than any expected distance
    for i in xleaf:
        for j in yleaf:
            new_x = Chem.MolToSmiles(xmol, rootedAtAtom=i, isomericSmiles=False)
            new_y = Chem.MolToSmiles(ymol, rootedAtAtom=j, isomericSmiles=False)
            le = min(len(new_x), len(new_y)) // 2
            dist = Levenshtein.distance(new_x[:le], new_y[:le])
            if dist < best:
                best_i, best_j = i, j
                best = dist

    return Chem.MolToSmiles(xmol, rootedAtAtom=best_i, isomericSmiles=False), Chem.MolToSmiles(ymol, rootedAtAtom=best_j, isomericSmiles=False) 
Example #22
Source File: similarity.py    From DeepFMPO with MIT License
def calculateDistance(smi1,smi2): 
    return 1 - ETA * Levenshtein.distance(smi1, smi2)


# Calculate the MCS Tanimoto similarity between two molecules 
Example #23
Source File: distance_text_or_vec.py    From nlp_xiaojiang with MIT License
def edit_levenshtein(str1, str2):
    return Leven.distance(str1, str2) 
Example #24
Source File: distance_text_or_vec.py    From nlp_xiaojiang with MIT License
def wmd_distance(model, sent1_cut_list, sent2_cut_list):  # WMD (Word Mover's Distance)
    # model.init_sims(replace=True)
    distance = model.wmdistance(sent1_cut_list, sent2_cut_list)
    return distance


# def HamMings_Levenshtein(str1, str2):
#     sim = Leven.hamming(str1, str2)
#     return sim 
Example #25
Source File: distance.py    From text2vec with Apache License 2.0
def wmd_distance(model, sent1_cut_list, sent2_cut_list):  # WMD (Word Mover's Distance)
    """
    Word Mover's Distance between two tokenized sentences.
    :param model: gensim word2vec model
    :param sent1_cut_list: tokens of the first sentence
    :param sent2_cut_list: tokens of the second sentence
    :return: the WMD distance (float)
    """
    distance = model.wmdistance(sent1_cut_list, sent2_cut_list)
    return distance 
Example #26
Source File: distance.py    From text2vec with Apache License 2.0
def edit_distance(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        import Levenshtein
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except ImportError:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d 
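A usage sketch; the same call works whether python-Levenshtein is installed or the difflib fallback kicks in (note the two branches return slightly different values for the same inputs):

from difflib import SequenceMatcher  # needed by the fallback branch

print(edit_distance('kitten', 'sitting'))  # 3/7 ~ 0.4286 with Levenshtein installed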
Example #27
Source File: align_wordlists.py    From panphon with MIT License
def dogol_leven_dist(_, a, b):
    return Levenshtein.distance(dist.map_to_dogol_prime(a),
                                dist.map_to_dogol_prime(b)) 
Example #28
Source File: align_wordlists.py    From panphon with MIT License
def levenshtein_dist(_, a, b):
    return Levenshtein.distance(a, b) 
Example #29
Source File: utils.py    From Particle-Cloud-Framework with Apache License 2.0
def similar_strings(given_str, search_list=()):  # a tuple default avoids a mutable default argument
    """ Returns a list of similar strings to given_str from an iterable of potentially
        similar strings, search_list.
    """
    threshold = ceil(len(given_str) / 2.5)
    similar = [
        st for st in search_list if distance(given_str.lower(), st.lower()) <= threshold
    ]
    return similar 
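A usage sketch (the function assumes from math import ceil and from Levenshtein import distance at module level):

# threshold = ceil(5 / 2.5) = 2; distance('color', 'colour') = 1, so it passes
print(similar_strings('color', ['colour', 'red']))  # ['colour']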
Example #30
Source File: train.py    From pytorch-asr with GNU General Public License v3.0
def edit_distance(self, refs, hyps):
        assert len(refs) == len(hyps)
        n = 0
        for ref, hyp in zip(refs, hyps):
            r = [chr(c) for c in ref]
            h = [chr(c) for c in hyp]
            n += Lev.distance(''.join(r), ''.join(h))
        return n
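A quick check of the mapping the method relies on, with hypothetical integer label sequences (each label becomes a one-character string so Lev.distance can compare whole sequences):

import Levenshtein as Lev

ref, hyp = [1, 2, 3], [1, 2, 4]
r = ''.join(chr(c) for c in ref)
h = ''.join(chr(c) for c in hyp)
print(Lev.distance(r, h))  # 1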