Python Levenshtein.distance() Examples

The following are 30 code examples of Levenshtein.distance(), collected from open-source projects. You can follow the links above each example to the original project and source file. You may also want to check out all available functions and classes of the Levenshtein module.
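For orientation, here is a minimal sketch of the function itself (this assumes the python-Levenshtein package is installed, e.g. via pip install python-Levenshtein):

import Levenshtein

# Number of single-character insertions, deletions, and substitutions
# needed to turn one string into the other
print(Levenshtein.distance('kitten', 'sitting'))  # 3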
Example #1
Source File: decoder.py    From training with Apache License 2.0
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
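A short usage sketch of the word-to-character trick above, with hypothetical sentences (each distinct word becomes one character, so character-level edit distance counts word-level edits):

import Levenshtein as Lev

s1, s2 = 'the cat sat', 'the cat sit down'
b = set(s1.split() + s2.split())
word2char = dict(zip(b, range(len(b))))
w1 = ''.join(chr(word2char[w]) for w in s1.split())
w2 = ''.join(chr(word2char[w]) for w in s2.split())
print(Lev.distance(w1, w2))  # 2: substitute 'sat' -> 'sit', insert 'down'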
Example #2
Source File: decoder.py    From LipReading with MIT License
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #3
Source File: decoder.py    From LipReading with MIT License
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #4
Source File: edit_distance.py    From tensorflow_end2end_speech_recognition with MIT License
def compute_per(ref, hyp, normalize=True):
    """Compute Phone Error Rate.
    Args:
        ref (list): phones in the reference transcript
        hyp (list): phones in the predicted transcript
        normalize (bool, optional): if True, divide by the length of ref
    Returns:
        per (float): Phone Error Rate between ref and hyp
    """
    # Build mapping of phone to index
    phone_set = set(ref + hyp)
    phone2char = dict(zip(phone_set, range(len(phone_set))))

    # Map phones to a single char array
    # NOTE: the Levenshtein package only accepts strings
    phones_ref = [chr(phone2char[p]) for p in ref]
    phones_hyp = [chr(phone2char[p]) for p in hyp]

    per = lev.distance(''.join(phones_ref), ''.join(phones_hyp))
    if normalize:
        per /= len(ref)
    return per 
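A usage sketch with hypothetical phone lists (assuming the module imports Levenshtein as lev, which compute_per relies on):

# one substitution ('ae' -> 'a') over a reference of three phones
print(compute_per(['k', 'ae', 't'], ['k', 'a', 't']))  # 0.333...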
Example #5
Source File: edit_distance.py    From tensorflow_end2end_speech_recognition with MIT License
def compute_edit_distance(session, labels_true_st, labels_pred_st):
    """Compute edit distance per mini-batch.
    Args:
        session:
        labels_true_st: A `SparseTensor` of ground truth
        labels_pred_st: A `SparseTensor` of prediction
    Returns:
        edit_distances: list of edit distances, one per utterance
    """
    indices, values, dense_shape = labels_true_st
    labels_true_pl = tf.SparseTensor(indices, values, dense_shape)
    indices, values, dense_shape = labels_pred_st
    labels_pred_pl = tf.SparseTensor(indices, values, dense_shape)

    edit_op = tf.edit_distance(labels_pred_pl, labels_true_pl, normalize=True)
    edit_distances = session.run(edit_op)

    return edit_distances 
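A minimal usage sketch, assuming a TensorFlow 1.x-style session (on TF2 this would go through tf.compat.v1); the (indices, values, dense_shape) tuples below encode one utterance each:

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

# labels [0, 1, 2] vs [0, 1, 3]: one substitution, normalized by truth length 3
labels_true_st = ([[0, 0], [0, 1], [0, 2]], [0, 1, 2], [1, 3])
labels_pred_st = ([[0, 0], [0, 1], [0, 2]], [0, 1, 3], [1, 3])

with tf.Session() as sess:
    print(compute_edit_distance(sess, labels_true_st, labels_pred_st))  # [0.333...]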
Example #6
Source File: BotDigger.py    From BotDigger with GNU General Public License v3.0
def distanceDomain(domain, DomainDict, ccTldDict, tldDict):
    similarDomain = ""
    minDistance = sys.maxsize  # sys.maxint no longer exists in Python 3
    level = domain.split(".")
    if len(level) <= 1:
        return ("not a domain", sys.maxsize)
    (domain2LD, domain3LD, domain2LDs, domain3LDs) = extractLevelDomain(domain, ccTldDict, tldDict)
    for popularDomain in DomainDict:
        # in Python 3 these are already str, so no .decode() is needed
        distance = Levenshtein.distance(domain2LD, popularDomain)
        if distance < minDistance:
            minDistance = distance
            similarDomain = popularDomain
    if len(similarDomain) > 0:
        return (similarDomain, minDistance / float(len(similarDomain)))
    else:
        return (domain2LD, 0)

# check whether a domain contains invalid TLD 
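The heart of the check is a length-normalized Levenshtein distance between a candidate domain and each popular domain; a standalone sketch with hypothetical strings (extractLevelDomain and the TLD dictionaries are BotDigger's own helpers):

import Levenshtein

popular = 'google'
candidate = 'goog1e'
print(Levenshtein.distance(candidate, popular) / float(len(popular)))  # 0.1666...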
Example #7
Source File: test_string_distances.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def _random_common_char_pairs(n_pairs=50, seed=1):
    """
    Return string pairs with a common char at random positions, in order to
    distinguish different thresholds for matching characters in Jaro
    distance.
    """
    # Make strings with random length and common char at index 0
    rng = np.random.RandomState(seed=seed)
    list1 = ['a' + 'b' * rng.randint(2, 20) for k in range(n_pairs)]
    list2 = ['a' + 'c' * rng.randint(2, 20) for k in range(n_pairs)]
    # Shuffle strings
    list1 = [''.join(rng.choice(
        list(s), size=len(s), replace=False)) for s in list1]
    list2 = [''.join(rng.choice(
        list(s), size=len(s), replace=False)) for s in list2]
    pairs = zip(list1, list2)
    return pairs


# TODO: some factorization of what is common for distances;
# check results for same examples on all distances 
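A usage sketch of the helper (it assumes numpy is imported as np in the test module, as the rng calls require; Levenshtein.jaro is used here just to illustrate one of the distances under test):

import Levenshtein

for s1, s2 in _random_common_char_pairs(n_pairs=3):
    print(s1, s2, Levenshtein.jaro(s1, s2))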
Example #8
Source File: decoder.py    From ngraph-python with Apache License 2.0
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = {ss: ii for ii, ss in enumerate(b)}

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #9
Source File: metrics.py    From end2end-asr-pytorch with MIT License
def calculate_wer(s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """

    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (the Levenshtein package only
    # accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]

    return Lev.distance(''.join(w1), ''.join(w2)) 
Example #10
Source File: generate_accuracy_report.py    From namsel with MIT License
def _get_compare_data(tif_txt_pair):
    tif = tif_txt_pair[0]
    txt = tif_txt_pair[1]
    if tif[:-4] != txt[:-4]:  # the stems should always match
        raise ValueError('tif and txt filenames do not match: %s, %s' % (tif, txt))
    ocr = run_main(tif, text=True)
    ocr = ocr.strip()
    with open(txt, 'r') as f:
        txt = _normalize_input(f.read())
    edit_dist = L.distance(txt, ocr)
    edit_ratio = L.ratio(txt, ocr)
    html = _make_html_diff(txt, ocr)
    data = {'edit_distance': edit_dist,
            'edit_ratio': edit_ratio,
            'filename': os.path.basename(tif),
            'html': html
            }
    return data
Example #11
Source File: base.py    From patter with MIT License
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #12
Source File: edit_distance.py    From neural_sp with Apache License 2.0
def compute_cer(ref, hyp, normalize=False):
    """Compute Character Error Rate.

    Args:
        ref (str): a sentence without spaces
        hyp (str): a sentence without spaces
        normalize (bool, optional): if True, divide by the length of ref
    Returns:
        cer (float): Character Error Rate between ref and hyp

    """
    import Levenshtein as lev  # imported lazily; requires the python-Levenshtein package
    cer = lev.distance(hyp, ref)
    if normalize:
        cer /= len(ref)
    return cer * 100 
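A usage sketch with hypothetical strings:

# one edit over a reference of five characters
print(compute_cer('abcde', 'abde', normalize=True))  # 20.0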
Example #13
Source File: decoder.py    From inference with Apache License 2.0
def wer(s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #14
Source File: decoder.py    From pytorch-nlp with MIT License
def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #15
Source File: metrics.py    From KoSpeech with Apache License 2.0
def metric(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.

        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance(''.join(w1), ''.join(w2)) 
Example #16
Source File: metrics.py    From KoSpeech with Apache License 2.0
def metric(self, s1, s2):
        """
        Computes the Character Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to characters.

        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """
        s1 = s1.replace(' ', '')
        s2 = s2.replace(' ', '')

        dist = Lev.distance(s2, s1)
        length = len(s1)  # spaces were already removed above


        return dist, length 
Example #17
Source File: metrics.py    From KoSpeech with Apache License 2.0
def _get_distance(self, targets, y_hats):
        """
        Provides total character distance between targets & y_hats

        Args:
            targets (torch.Tensor): set of ground truth
            y_hats (torch.Tensor): predicted y values (y_hat) by the model

        Returns: total_dist, total_length
            - **total_dist**: total distance between targets & y_hats
            - **total_length**: total length of targets sequence
        """
        total_dist = 0
        total_length = 0

        for (target, y_hat) in zip(targets, y_hats):
            s1 = label_to_string(target, self.id2char, self.eos_id)
            s2 = label_to_string(y_hat, self.id2char, self.eos_id)

            dist, length = self.metric(s1, s2)

            total_dist += dist
            total_length += length

        return total_dist, total_length 
Example #18
Source File: poetics.py    From Poetry-Tools with MIT License
def guess_metre(tokenized_poem):
    """
    Guess a poem's metre via Levenshtein distance from candidates
    """

    joined_lines = [''.join(line) for line in scanscion(tokenized_poem) if line]
    line_lengths = [len(line) for line in joined_lines]
    num_lines = len(joined_lines)

    metres = []
    for line in joined_lines:
        metres.append(levenshtein(line, POSSIBLE_METRES))

    # The metre matched most often across lines is taken as the poem's metre
    guessed_metre = max(zip((metres.count(item) for item in set(metres)), set(metres)))[1]

    return joined_lines, num_lines, line_lengths, guessed_metre 
Example #19
Source File: poetics.py    From Poetry-Tools with MIT License
def levenshtein(string, candidates):
    """
    Compare a string's Levenshtein distance to each candidate in a dictionary. 
    Returns the name of the closest match
    """

    distances = {}
    num_lines = len(string)

    for k, v in candidates.items():
        expanded = False
        # Expands the length of each candidate to match the length of the compared string
        if len(v) != len(string):
            v = (v * (num_lines // len(v) + 1))[:num_lines]
            expanded = True

        edit_distance = distance(string, v)

        # If we expanded the candidate, it is a worse match than an
        # unexpanded candidate already recorded at the same distance
        if edit_distance in distances and expanded:
            continue

        distances[edit_distance] = k

    return distances[min(distances)] 
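A usage sketch with a hypothetical candidate dictionary (in Poetry-Tools the POSSIBLE_METRES constant plays this role, mapping metre names to stress patterns):

candidates = {'iambic': '01', 'trochaic': '10'}
# '01' expands to '0101' and matches exactly, so 'iambic' wins
print(levenshtein('0101', candidates))  # iambic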
Example #20
Source File: trainer.py    From pytorch-asr with GNU General Public License v3.0
def validate(self, data_loader):
        "validate with label error rate by the edit distance between hyps and refs"
        self.model.eval()
        with torch.no_grad():
            N, D = 0, 0
            t = tqdm(enumerate(data_loader), total=len(data_loader), desc="validating", ncols=params.NCOLS)
            for i, data in t:
                hyps, refs = self.unit_validate(data)
                # calculate ler
                N += self.edit_distance(refs, hyps)
                D += sum(len(r) for r in refs)
                ler = N * 100. / D
                t.set_description(f"validating (LER: {ler:.2f} %)")
                t.refresh()
            logger.info(f"validating at epoch {self.epoch:03d}: LER {ler:.2f} %")

            title = "validate"
            x = self.epoch - 1 + i / len(data_loader)
            if logger.visdom is not None:
                opts = { 'xlabel': 'epoch', 'ylabel': 'LER', }
                logger.visdom.add_point(title=title, x=x, y=ler, **opts)
            if logger.tensorboard is not None:
                logger.tensorboard.add_scalars(title, self.global_step, { 'LER': ler, }) 
Example #21
Source File: align.py    From hgraph2graph with MIT License
def align(xy_tuple):
    x, y = xy_tuple
    xmol, ymol = Chem.MolFromSmiles(x), Chem.MolFromSmiles(y)
    x = Chem.MolToSmiles(xmol, isomericSmiles=False)
    xmol = Chem.MolFromSmiles(x)

    xleaf = get_leaves(xmol)
    yleaf = get_leaves(ymol)

    best_i, best_j = 0, 0
    best = float('inf')  # larger than any expected distance
    for i in xleaf:
        for j in yleaf:
            new_x = Chem.MolToSmiles(xmol, rootedAtAtom=i, isomericSmiles=False)
            new_y = Chem.MolToSmiles(ymol, rootedAtAtom=j, isomericSmiles=False)
            le = min(len(new_x), len(new_y)) // 2
            dist = Levenshtein.distance(new_x[:le], new_y[:le])
            if dist < best:
                best_i, best_j = i, j
                best = dist

    return Chem.MolToSmiles(xmol, rootedAtAtom=best_i, isomericSmiles=False), Chem.MolToSmiles(ymol, rootedAtAtom=best_j, isomericSmiles=False) 
Example #22
Source File: similarity.py    From DeepFMPO with MIT License
def calculateDistance(smi1,smi2): 
    return 1 - ETA * Levenshtein.distance(smi1, smi2)


# Calculate the MCS Tanimoto similarity between two molecules 
Example #23
Source File: distance_text_or_vec.py    From nlp_xiaojiang with MIT License
def edit_levenshtein(str1, str2):
    return Leven.distance(str1, str2) 
Example #24
Source File: distance_text_or_vec.py    From nlp_xiaojiang with MIT License
def wmd_distance(model, sent1_cut_list, sent2_cut_list):  # WMD (Word Mover's Distance)
    # model.init_sims(replace=True)
    distance = model.wmdistance(sent1_cut_list, sent2_cut_list)
    return distance


# def HamMings_Levenshtein(str1, str2):
#     sim = Leven.hamming(str1, str2)
#     return sim 
Example #25
Source File: distance.py    From text2vec with Apache License 2.0
def wmd_distance(model, sent1_cut_list, sent2_cut_list):  # WMD (Word Mover's Distance)
    """
    Word Mover's Distance between two tokenized sentences.
    :param model: gensim word2vec model
    :param sent1_cut_list: tokens of the first sentence
    :param sent2_cut_list: tokens of the second sentence
    :return: the WMD distance (float)
    """
    distance = model.wmdistance(sent1_cut_list, sent2_cut_list)
    return distance 
Example #26
Source File: distance.py    From text2vec with Apache License 2.0
def edit_distance(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        import Levenshtein
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except ImportError:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d 
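A usage sketch; the same call works whether python-Levenshtein is installed or the difflib fallback kicks in (note the two branches return slightly different values for the same inputs):

from difflib import SequenceMatcher  # needed by the fallback branch

print(edit_distance('kitten', 'sitting'))  # 3/7 ~ 0.4286 with Levenshtein installed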
Example #27
Source File: align_wordlists.py    From panphon with MIT License
def dogol_leven_dist(_, a, b):
    return Levenshtein.distance(dist.map_to_dogol_prime(a),
                                dist.map_to_dogol_prime(b)) 
Example #28
Source File: align_wordlists.py    From panphon with MIT License
def levenshtein_dist(_, a, b):
    return Levenshtein.distance(a, b) 
Example #29
Source File: utils.py    From Particle-Cloud-Framework with Apache License 2.0
def similar_strings(given_str, search_list=()):  # a tuple default avoids a mutable default argument
    """ Returns a list of similar strings to given_str from an iterable of potentially
        similar strings, search_list.
    """
    threshold = ceil(len(given_str) / 2.5)
    similar = [
        st for st in search_list if distance(given_str.lower(), st.lower()) <= threshold
    ]
    return similar 
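A usage sketch (the function assumes from math import ceil and from Levenshtein import distance at module level):

# threshold = ceil(5 / 2.5) = 2; distance('color', 'colour') = 1, so it passes
print(similar_strings('color', ['colour', 'red']))  # ['colour']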
Example #30
Source File: train.py    From pytorch-asr with GNU General Public License v3.0
def edit_distance(self, refs, hyps):
        assert len(refs) == len(hyps)
        n = 0
        for ref, hyp in zip(refs, hyps):
            r = [chr(c) for c in ref]
            h = [chr(c) for c in hyp]
            n += Lev.distance(''.join(r), ''.join(h))
        return n
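A quick check of the mapping the method relies on, with hypothetical integer label sequences (each label becomes a one-character string so Lev.distance can compare whole sequences):

import Levenshtein as Lev

ref, hyp = [1, 2, 3], [1, 2, 4]
r = ''.join(chr(c) for c in ref)
h = ''.join(chr(c) for c in hyp)
print(Lev.distance(r, h))  # 1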