Python Levenshtein.editops() Examples

The following are 7 code examples of Levenshtein.editops(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module Levenshtein, or try the search function.
Example #1
Source File: measures.py    From jiwer with Apache License 2.0 6 votes vote down vote up
def _get_operation_counts(
    source_string: str, destination_string: str
) -> Tuple[int, int, int, int]:
    """
    Count the edit operations reported by ``Levenshtein.editops`` for turning
    the source string into the destination string, and derive the hit count.

    A hit is a source character that is neither deleted nor substituted, so
    hits = len(source_string) - (substitutions + deletions).

    :param source_string: the string to be transformed
    :param destination_string: the string the source is transformed into
    :return: a tuple of #hits, #substitutions, #deletions, #insertions
    """
    # One pass over the operations, bucketed by their name (first tuple item).
    tally = {"replace": 0, "delete": 0, "insert": 0}
    for operation in Levenshtein.editops(source_string, destination_string):
        kind = operation[0]
        if kind in tally:
            tally[kind] += 1

    substitutions = tally["replace"]
    deletions = tally["delete"]
    insertions = tally["insert"]
    hits = len(source_string) - (substitutions + deletions)

    return hits, substitutions, deletions, insertions
Example #2
Source File: error_analysis.py    From stanza-old with Apache License 2.0 6 votes vote down vote up
def print_error_analysis():
    """Print a visualization for evaluation instances whose prediction is wrong.

    Loads the 'eval' output of the run directory named in the options, keeps
    every (input, prediction, gold output) triple where the prediction differs
    from the gold output, and hands each one, together with the edit
    operations from gold to prediction, to ``print_visualization``.

    When ``options.max_examples`` is positive and smaller than the number of
    errors, that many errors are sampled at random without replacement;
    otherwise all errors are shown in order. With ``options.html`` set, the
    output is wrapped in a minimal HTML document.
    """
    options = config.options(read=True)
    output = get_output(options.run_dir, 'eval')

    # Collect only the mispredicted instances.
    errors = []
    for inst, pred in zip(output.data, output.predictions):
        if inst['output'] != pred:
            errors.append((inst['input'], pred, inst['output']))

    num_errors = len(errors)
    if 0 < options.max_examples < num_errors:
        indices = np.random.choice(np.arange(num_errors), size=options.max_examples, replace=False)
    else:
        indices = range(num_errors)

    as_html = options.html
    if as_html:
        print('<!DOCTYPE html>')
        print('<html><head><title>Error analysis</title><meta charset="utf-8" /></head><body>')

    for i in indices:
        inp, pred, gold = (unicode(field).strip() for field in errors[i])
        print_visualization(inp, pred, gold, lev.editops(gold, pred), html=as_html)

    if as_html:
        print('</body></html>')
Example #3
Source File: error_analysis.py    From stanza-old with Apache License 2.0 6 votes vote down vote up
def print_visualization(input_seq, pred_output_seq,
                        gold_output_seq, editops, html=False):
    """Print the input, predicted and gold sequences with edited positions highlighted.

    ``editops`` is iterated as (operation, gold index, prediction index)
    triples. The prediction indices are highlighted in the input (cyan) and in
    the prediction (red); the gold indices are highlighted in the gold
    sequence (yellow). With ``html=True`` the three lines are wrapped in a
    ``<p>`` element and separated by ``<br />`` tags.
    """
    # Single pass over editops; the operation name itself is not needed.
    positions = [(gold_idx, pred_idx) for _, gold_idx, pred_idx in editops]
    gold_highlights = [gold_idx for gold_idx, _ in positions]
    pred_highlights = [pred_idx for _, pred_idx in positions]

    input_seq = highlight(input_seq, pred_highlights, 'cyan', html=html)
    pred_output_seq = highlight(pred_output_seq, pred_highlights, 'red', html=html)
    gold_output_seq = highlight(gold_output_seq, gold_highlights, 'yellow', html=html)

    if html:
        print('<p>')
    line_break = u' <br />' if html else u''

    uprint(input_seq + line_break)
    uprint(pred_output_seq + line_break)
    uprint(gold_output_seq)

    if html:
        print('</p>')
    print('')
Example #4
Source File: transcription_error.py    From patter with MIT License 5 votes vote down vote up
def _get_word_errors(s1, s2):
    """Count word-level delete/insert/replace operations between two strings.

    Each distinct whitespace-separated word is mapped to a single character so
    that the character-level ``Lev.editops`` can be applied to word sequences.

    Returns a dict with the keys "delete", "insert" and "replace".
    """
    words1 = s1.split()
    words2 = s2.split()

    # Give every distinct word its own code point.
    vocabulary = set(words1 + words2)
    word2char = {word: code for code, word in enumerate(vocabulary)}

    encoded1 = ''.join([chr(word2char[word]) for word in words1])
    encoded2 = ''.join([chr(word2char[word]) for word in words2])

    errors = dict.fromkeys(("delete", "insert", "replace"), 0)
    for op in Lev.editops(encoded1, encoded2):
        errors[op[0]] += 1
    return errors
Example #5
Source File: transcription_error.py    From patter with MIT License 5 votes vote down vote up
def _get_char_errors(s1, s2):
    """Count character-level delete/insert/replace operations between two strings.

    Space characters are removed from both strings before they are compared
    with ``Lev.editops``.

    Returns a dict with the keys "delete", "insert" and "replace".
    """
    compact1 = s1.replace(' ', '')
    compact2 = s2.replace(' ', '')

    errors = dict.fromkeys(("delete", "insert", "replace"), 0)
    for op in Lev.editops(compact1, compact2):
        errors[op[0]] += 1
    return errors
Example #6
Source File: test_align.py    From circleseq with GNU Affero General Public License v3.0 5 votes vote down vote up
def main():
    """Print distance, edit operations and matching blocks for two hard-coded sequences."""

    # Alternative input pair (disabled):
    # reference = 'GCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAAC'
    # query = 'GAGTCGAGCAGAAGAAGAANGG'

    reference = 'AATGTGTGTCTGCTGGAAGCTCCTATTCTTCCGCCATTTTCCAGTCCTCCAGAAGTTTCCTGATGGTCCATGTCTGAATTAGACACCCCTCTTCTTTGTTCCAGTTGCACCTGTAATTCTTCAGCATAGTACTTCTTAAACTGTTTTTAA'
    query = 'TTTNCTGATGGTCCATGTCTGTTACTC'

    print(l.distance(reference, query))
    print(l.editops(reference, query))

    # matching_blocks takes the edit operations plus both original strings.
    blocks = l.matching_blocks(l.editops(reference, query), reference, query)
    print(blocks)
Example #7
Source File: analyse.py    From avsr-tf1 with GNU General Public License v3.0 4 votes vote down vote up
def compute_uer_confusion_matrix(predictions_dict, labels_dict, unit_dict):
    """Accumulate a unit-level confusion matrix and plot where edits occur.

    For every entry of ``labels_dict`` the matching prediction is looked up in
    ``predictions_dict`` under the same key, both are joined into strings, and
    the edit operations that turn the prediction into the label are computed.

    The matrix has one row per unit and ``vocab_size + 2`` columns: the first
    ``vocab_size`` columns count matches and replacements, column
    ``vocab_size`` counts deletions and column ``vocab_size + 1`` counts
    insertions. Prediction positions untouched by a delete/replace are counted
    on the diagonal.

    The matrix is not returned; the only visible output is the call to
    ``plot_edit_ops_histogram`` with the relative source positions of all
    edits in predictions of at least 40 units.

    :param predictions_dict: maps a sample key to its predicted unit sequence
    :param labels_dict: maps a sample key to its reference unit sequence
    :param unit_dict: maps unit ids to unit symbols; entries whose symbol is
        'GO', 'EOS', 'MASK' or 'END' are ignored
    :raises Exception: if an edit operation has an unknown name
    :raises KeyError: if a sample key is missing from ``predictions_dict`` or a
        unit symbol is missing from the filtered ``unit_dict``
    """
    slim_dict = {key: val for key, val in unit_dict.items() if val not in ['GO', 'EOS', 'MASK', 'END']}
    vocab_size = len(slim_dict)
    invdict = {v: k for k, v in slim_dict.items()}
    # NOTE(review): the `- 1` offsets below assume the remaining unit ids are
    # the integers 1..vocab_size — confirm against the unit dictionary.

    deletion_col = vocab_size
    insertion_col = vocab_size + 1
    conf_matrix = np.zeros(shape=(vocab_size, vocab_size + 2))  # plus deletions, insertions
    edit_ops_indices = []

    for sample_id, label in labels_dict.items():
        label_str = ''.join(_strip_extra_chars(label))
        prediction_str = ''.join(_strip_extra_chars(predictions_dict[sample_id]))
        edit_ops = Levenshtein.editops(prediction_str, label_str)

        prediction_len = len(prediction_str)
        # Relative edit positions are only recorded for predictions of 40+ units.
        record_positions = prediction_len >= 40

        # A set keeps the "was this position edited?" check below O(1).
        edited_positions = set()
        for opname, src_pos, dst_pos in edit_ops:
            if record_positions:
                # store all errors in the source (prediction) string
                edit_ops_indices.append(src_pos / prediction_len)

            if opname == 'delete':
                source_unit = prediction_str[src_pos]
                mat_col = deletion_col
                edited_positions.add(src_pos)
            elif opname == 'insert':
                # the inserted unit does not exist in the source string
                source_unit = label_str[dst_pos]
                mat_col = insertion_col
            elif opname == 'replace':
                source_unit = prediction_str[src_pos]
                mat_col = invdict[label_str[dst_pos]] - 1
                edited_positions.add(src_pos)
            else:
                raise Exception('unknown opname {}'.format(opname))

            conf_matrix[invdict[source_unit] - 1, mat_col] += 1

        # Every prediction position that was neither deleted nor replaced is a
        # correct match and lands on the diagonal.
        for idx, symbol in enumerate(prediction_str):
            if idx not in edited_positions:
                mat_pos = invdict[symbol] - 1
                conf_matrix[mat_pos, mat_pos] += 1

    # plot_confusion_matrix(conf_matrix, invdict)
    plot_edit_ops_histogram(edit_ops_indices)