Python Examples of editdistance.eval

Source File: rule.py From YaYaGen with BSD 2-Clause "Simplified" License

7 votes

def __is_wrong_permission(self, permission):
        """
        Return True if the permission contains a typo

        A permission counts as a typo when it is not in
        ``YaraRule.PERMISSION_SET`` but is within an edit distance of 1 to 3
        of some member of that set.  Results are remembered on the class:
        typos are stored in ``YaraRule.WRONG_PERMISSION_DICT`` (typo ->
        matched standard permission) and permissions with no close match are
        added to ``YaraRule.CACHE_NNSTD_PERMISSION_DICT``.
        """
        if permission in YaraRule.PERMISSION_SET:
            return False
        if permission in YaraRule.WRONG_PERMISSION_DICT:
            return True
        if permission in YaraRule.CACHE_NNSTD_PERMISSION_DICT:
            return False
        for standard_perm in YaraRule.PERMISSION_SET:
            distance = editdistance.eval(permission, standard_perm)
            if 0 < distance <= 3:
                YaraRule.WRONG_PERMISSION_DICT[permission] = standard_perm
                return True
        # Cache as non-standard only once every candidate has been rejected;
        # caching inside the loop would also record permissions that a later
        # candidate identifies as typos.
        YaraRule.CACHE_NNSTD_PERMISSION_DICT.add(permission)
        return False

Source File: image_ocr.py From DeepLearning_Wavelet-LSTM with MIT License

6 votes

def show_edit_distance(self, num):
        """Print the mean and length-normalized mean edit distance over `num` samples.

        Batches are pulled from ``self.text_img_gen`` and decoded with
        ``decode_batch(self.test_func, ...)`` until `num` samples have been
        compared against their ``'source_str'`` entries.
        """
        total_ed = 0.0
        total_norm_ed = 0.0
        remaining = num
        while remaining > 0:
            batch = next(self.text_img_gen)[0]
            # Never score more samples than are still needed.
            take = min(batch['the_input'].shape[0], remaining)
            decoded = decode_batch(self.test_func, batch['the_input'][0:take])
            for idx in range(take):
                dist = float(editdistance.eval(decoded[idx], batch['source_str'][idx]))
                total_ed += dist
                total_norm_ed += dist / len(batch['source_str'][idx])
            remaining -= take
        avg_norm_ed = total_norm_ed / num
        avg_ed = total_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, avg_ed, avg_norm_ed))

Source File: e2e_st.py From espnet with Apache License 2.0

6 votes

def encode(self, x):
        """Run the encoder on a single utterance in evaluation mode.

        :param ndarray x: input acoustic feature (T, D)
        :return: encoder outputs with the leading batch axis squeezed out
        :rtype: torch.Tensor
        """
        self.eval()
        # NOTE(review): the length is taken before frame subsampling -- confirm
        # this is what ``self.enc`` expects.
        n_frames = x.shape[0]

        # keep every ``self.subsample[0]``-th frame
        feats = x[:: self.subsample[0], :]

        # place the features on the same device/dtype as the model parameters
        ref_param = next(self.parameters())
        feats_t = torch.as_tensor(feats, device=ref_param.device, dtype=ref_param.dtype)

        # add a batch axis of size 1 so the encoder sees a one-utterance batch
        batch = feats_t.contiguous().unsqueeze(0)

        encoded, _, _ = self.enc(batch, [n_frames])
        return encoded.squeeze(0)

Source File: e2e_asr.py From espnet with Apache License 2.0

6 votes

def enhance(self, xs):
        """Forward only in the frontend stage.

        The module is switched to evaluation mode for the duration of the call
        and its previous training mode is restored afterwards, even if the
        frontend raises.

        :param ndarray xs: input acoustic feature (T, C, F)
        :return: enhanced feature and mask (both converted with
            ``.cpu().numpy()``), plus the input lengths measured before
            frame subsampling
        :rtype: tuple
        :raises RuntimeError: if ``self.frontend`` is None
        """
        if self.frontend is None:
            raise RuntimeError("Frontend does't exist")
        prev = self.training
        self.eval()
        try:
            ilens = np.fromiter((xx.shape[0] for xx in xs), dtype=np.int64)

            # subsample frame
            xs = [xx[:: self.subsample[0], :] for xx in xs]
            xs = [to_device(self, to_torch_tensor(xx).float()) for xx in xs]
            xs_pad = pad_list(xs, 0.0)
            # the second output of the frontend is not used here
            enhanced, _, mask = self.frontend(xs_pad, ilens)
            return enhanced.cpu().numpy(), mask.cpu().numpy(), ilens
        finally:
            # restore training mode on both the success and the error path
            if prev:
                self.train()

Source File: evaluation.py From neutralizing-bias with MIT License

6 votes

def inference_metrics(model, src, tgt, config):
    """Decode the dataset, compute metrics, and return them with joined texts.

    Decoding is delegated to ``decode_dataset`` and scoring to
    ``get_metrics``, using a classifier loaded from
    ``config['eval']['classifier_path']``.

    Returns a 5-tuple ``(metrics, inputs, preds, ground_truths, auxs)`` where
    each of the last four is a list of space-joined sequences.
    """
    eval_cfg = config['eval']
    decoded = decode_dataset(
        model, src, tgt, config, k=eval_cfg['precision_recall_k'])
    inputs, preds, top_k_preds, ground_truths, auxs, raw_srcs = decoded

    classifier = models.TextClassifier.from_pickle(eval_cfg['classifier_path'])

    # Metrics are computed on the token sequences, before they are joined.
    metrics = get_metrics(
        raw_srcs, preds, ground_truths,
        top_k_preds=top_k_preds, classifier=classifier)

    joined = [
        [' '.join(seq) for seq in group]
        for group in (inputs, preds, ground_truths, auxs)
    ]
    return (metrics,) + tuple(joined)

Source File: image_ocr.py From pCVR with Apache License 2.0

6 votes

def show_edit_distance(self, num):
        """Report edit-distance statistics for `num` decoded samples.

        Samples come from ``self.text_img_gen``; each decoded result is
        compared with the matching ``'source_str'`` entry and the mean edit
        distance and mean length-normalized edit distance are printed.
        """
        ed_sum = 0.0
        norm_ed_sum = 0.0
        todo = num
        while todo > 0:
            word_batch = next(self.text_img_gen)[0]
            # cap the batch so that exactly `num` samples are scored overall
            batch_n = min(word_batch['the_input'].shape[0], todo)
            hyps = decode_batch(self.test_func, word_batch['the_input'][0:batch_n])
            for k in range(batch_n):
                dist = editdistance.eval(hyps[k], word_batch['source_str'][k])
                ed_sum += float(dist)
                norm_ed_sum += float(dist) / len(word_batch['source_str'][k])
            todo -= batch_n
        norm_mean = norm_ed_sum / num
        plain_mean = ed_sum / num
        report = ('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
                  % (num, plain_mean, norm_mean))
        print(report)

Source File: e2e_asr_common.py From espnet with Apache License 2.0

6 votes

def calculate_cer(self, seqs_hat, seqs_true):
        """Calculate the character error rate over a set of sentences.

        Spaces are removed from both texts before comparison; the result is
        the total edit distance divided by the total reference length.

        :param list seqs_hat: prediction
        :param list seqs_true: reference
        :return: total character edits divided by total reference characters
        :rtype: float
        """
        total_edits = 0
        total_ref_chars = 0
        for idx, hyp_text in enumerate(seqs_hat):
            ref_text = seqs_true[idx]
            hyp = hyp_text.replace(" ", "")
            ref = ref_text.replace(" ", "")
            total_edits += editdistance.eval(hyp, ref)
            total_ref_chars += len(ref)
        return float(total_edits) / total_ref_chars

Source File: e2e_asr_common.py From espnet with Apache License 2.0

6 votes

def calculate_wer(self, seqs_hat, seqs_true):
        """Calculate the word error rate over a set of sentences.

        Each text is split on whitespace; the result is the total word-level
        edit distance divided by the total number of reference words.

        :param list seqs_hat: prediction
        :param list seqs_true: reference
        :return: total word edits divided by total reference words
        :rtype: float
        """
        edits_total = 0
        ref_words_total = 0
        for pos, hyp_text in enumerate(seqs_hat):
            ref_text = seqs_true[pos]
            hyp_tokens = hyp_text.split()
            ref_tokens = ref_text.split()
            edits_total += editdistance.eval(hyp_tokens, ref_tokens)
            ref_words_total += len(ref_tokens)
        return float(edits_total) / ref_words_total

Source File: vocab.py From ancient-text-restoration with Apache License 2.0

6 votes

def edit_distance_batch(hyp, tar, tar_len, eos_idx):
  """Return the mean length-normalized edit distance over a batch.

  Each hypothesis row is truncated at its first `eos_idx`, or used whole when
  it contains none.  Each target row is truncated at its first `eos_idx`;
  when it contains none, the caller-supplied `tar_len[i]` is used.

  Args:
    hyp: 2-D array of hypothesis ids, one row per sample.
    tar: 2-D array of target ids, one row per sample.
    tar_len: per-sample target lengths; entries are overwritten in place
      with the EOS position when the target row contains `eos_idx`.
    eos_idx: id marking the end of a sequence.

  Returns:
    np.float32: the per-sample edit distance divided by the target length,
    averaged over the batch.
  """
  cer = 0.
  bs = hyp.shape[0]

  for i in range(bs):
    # np.argmax cannot signal "not found" (it returns 0), so look up the
    # EOS positions explicitly and fall back when there are none.
    hyp_eos = np.flatnonzero(hyp[i] == eos_idx)
    hyp_len = hyp_eos[0] if hyp_eos.size > 0 else hyp[i].size

    tar_eos = np.flatnonzero(tar[i] == eos_idx)
    if tar_eos.size > 0:
      tar_len[i] = tar_eos[0]

    cer += editdistance.eval(hyp[i, :hyp_len], tar[i, :tar_len[i]]) / float(tar_len[i])
  return np.float32(cer / bs)

Source File: distance.py From ABXpy with MIT License

6 votes

def edit_distance(x, y):
    """Levenshtein distance between two arrays.

    The "feature" dimension is along the columns and the "time" dimension
    along the lines of arrays x and y.

    Returns 0 when both arrays are empty, ``np.inf`` when exactly one of
    them is empty, and the ``editdistance.eval`` result otherwise.
    """
    def as_nested_tuple(a):
        # Recursively convert an array into (nested) tuples so that it can
        # be passed to editdistance; non-iterable leaves are returned as-is.
        try:
            return tuple(as_nested_tuple(item) for item in a)
        except TypeError:
            return a

    len_x = x.shape[0]
    len_y = y.shape[0]

    if len_x > 0 and len_y > 0:
        # neither input is empty
        return editdistance.eval(as_nested_tuple(x), as_nested_tuple(y))
    if len_x == len_y:
        # both inputs are empty
        return 0
    # exactly one input is empty
    return np.inf

Source File: comparison.py From ws-backend-community with GNU General Public License v3.0

6 votes

def compare_strings_by_edit_distance(first=None, second=None):
        """
        Compute the edit distance between two strings.
        :param first: The first string to compare.
        :param second: The second string to compare.
        :return: The value returned by editdistance.eval for the two
        arguments.
        """
        distance = editdistance.eval(first, second)
        return distance

    # Class Methods

    # Public Methods

    # Protected Methods

    # Private Methods

    # Properties

    # Representation and Comparison

Source File: e2e_asr_common.py From espnet with Apache License 2.0

6 votes

def calculate_wer(self, seqs_hat, seqs_true):
        """Calculate sentence-level WER score for transducer model.

        Args:
            seqs_hat: iterable of predicted texts; each item is split on
                whitespace with ``.split()``
            seqs_true: reference texts, indexed in step with `seqs_hat`

        Returns:
            (float): total word edit distance divided by the total number
                of reference words

        """
        edit_count = 0
        ref_word_count = 0

        for n, hyp_text in enumerate(seqs_hat):
            ref_text = seqs_true[n]
            hyp_words = hyp_text.split()
            ref_words = ref_text.split()

            edit_count += editdistance.eval(hyp_words, ref_words)
            ref_word_count += len(ref_words)

        return float(edit_count) / ref_word_count

Source File: image_ocr.py From DeepLearning_Wavelet-LSTM with MIT License

6 votes

def show_edit_distance(self, num):
        """Score `num` decoded samples and print their edit-distance averages.

        Each batch from ``self.text_img_gen`` is decoded through
        ``decode_batch`` and compared entry by entry with ``'source_str'``.
        """
        sum_ed = 0.0
        sum_norm_ed = 0.0
        pending = num
        while pending > 0:
            batch = next(self.text_img_gen)[0]
            n_take = min(batch['the_input'].shape[0], pending)
            outputs = decode_batch(self.test_func, batch['the_input'][0:n_take])
            for k in range(n_take):
                ed_val = editdistance.eval(outputs[k], batch['source_str'][k])
                sum_ed += float(ed_val)
                # normalize by the length of the reference string
                sum_norm_ed += float(ed_val) / len(batch['source_str'][k])
            pending -= n_take
        sum_norm_ed = sum_norm_ed / num
        sum_ed = sum_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, sum_ed, sum_norm_ed))

Source File: text_similarity_transformers.py From driverlessai-recipes with Apache License 2.0

6 votes

def transform(self, X: dt.Frame):
        """Return the word-level edit distance between the first two columns.

        Each value is converted with ``str``, lower-cased and split on
        whitespace before the two token lists are compared.

        :param X: frame whose first two columns hold the texts to compare
        :return: numpy array with one edit distance per row; ``-1`` marks a
            row whose distance could not be computed
        """
        import editdistance
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                tokens1 = str(text1).lower().split()
                tokens2 = str(text2_arr[ind]).lower().split()
                output.append(editdistance.eval(tokens1, tokens2))
            except Exception:
                # Best effort per row, but do not swallow KeyboardInterrupt
                # or SystemExit as a bare ``except`` would.
                output.append(-1)
        return np.array(output)

Source File: image_ocr.py From DeepLearning_Wavelet-LSTM with MIT License

6 votes

def show_edit_distance(self, num):
        """Print the mean and length-normalized mean edit distance over `num` samples.

        Batches are pulled from ``self.text_img_gen`` and decoded with
        ``decode_batch(self.test_func, ...)`` until `num` samples have been
        compared against their ``'source_str'`` entries.
        """
        total_ed = 0.0
        total_norm_ed = 0.0
        remaining = num
        while remaining > 0:
            batch = next(self.text_img_gen)[0]
            # Never score more samples than are still needed.
            take = min(batch['the_input'].shape[0], remaining)
            decoded = decode_batch(self.test_func, batch['the_input'][0:take])
            for idx in range(take):
                dist = float(editdistance.eval(decoded[idx], batch['source_str'][idx]))
                total_ed += dist
                total_norm_ed += dist / len(batch['source_str'][idx])
            remaining -= take
        avg_norm_ed = total_norm_ed / num
        avg_ed = total_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, avg_ed, avg_norm_ed))

Source File: image_ocr.py From DeepLearning_Wavelet-LSTM with MIT License

6 votes

def show_edit_distance(self, num):
        """Report edit-distance statistics for `num` decoded samples.

        Samples come from ``self.text_img_gen``; each decoded result is
        compared with the matching ``'source_str'`` entry and the mean edit
        distance and mean length-normalized edit distance are printed.
        """
        ed_sum = 0.0
        norm_ed_sum = 0.0
        todo = num
        while todo > 0:
            word_batch = next(self.text_img_gen)[0]
            # cap the batch so that exactly `num` samples are scored overall
            batch_n = min(word_batch['the_input'].shape[0], todo)
            hyps = decode_batch(self.test_func, word_batch['the_input'][0:batch_n])
            for k in range(batch_n):
                dist = editdistance.eval(hyps[k], word_batch['source_str'][k])
                ed_sum += float(dist)
                norm_ed_sum += float(dist) / len(word_batch['source_str'][k])
            todo -= batch_n
        norm_mean = norm_ed_sum / num
        plain_mean = ed_sum / num
        report = ('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
                  % (num, plain_mean, norm_mean))
        print(report)

Source File: image_ocr.py From DeepLearning_Wavelet-LSTM with MIT License

6 votes

def show_edit_distance(self, num):
        """Score `num` decoded samples and print their edit-distance averages.

        Each batch from ``self.text_img_gen`` is decoded through
        ``decode_batch`` and compared entry by entry with ``'source_str'``.
        """
        sum_ed = 0.0
        sum_norm_ed = 0.0
        pending = num
        while pending > 0:
            batch = next(self.text_img_gen)[0]
            n_take = min(batch['the_input'].shape[0], pending)
            outputs = decode_batch(self.test_func, batch['the_input'][0:n_take])
            for k in range(n_take):
                ed_val = editdistance.eval(outputs[k], batch['source_str'][k])
                sum_ed += float(ed_val)
                # normalize by the length of the reference string
                sum_norm_ed += float(ed_val) / len(batch['source_str'][k])
            pending -= n_take
        sum_norm_ed = sum_norm_ed / num
        sum_ed = sum_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, sum_ed, sum_norm_ed))

Source File: BKTree.py From CTCDecoder with MIT License

6 votes

def _query(self, node, txt, tolerance):
		"""Collect every word in the subtree at `node` within `tolerance` of `txt`.

		`node` is either None or an indexable pair: `node[0]` is the stored
		word and `node[1]` maps edge distances to child nodes.  Returns a list
		of matching words (empty for a None node).
		"""
		# an empty (sub)tree has no matches
		if node is None:
			return []

		# distance between the word stored at this node and the query
		dist = ed.eval(node[0], txt)

		# the node itself matches when it is within tolerance
		matches = [node[0]] if dist <= tolerance else []

		# only descend along edges whose label lies in [dist - tol, dist + tol]
		lower = dist - tolerance
		upper = dist + tolerance
		for edge, child in node[1].items():
			if lower <= edge <= upper:
				matches.extend(self._query(child, txt, tolerance))

		return matches

Source File: image_ocr.py From DeepLearning_Wavelet-LSTM with MIT License

6 votes

def show_edit_distance(self, num):
        """Print the mean and length-normalized mean edit distance over `num` samples.

        Batches are pulled from ``self.text_img_gen`` and decoded with
        ``decode_batch(self.test_func, ...)`` until `num` samples have been
        compared against their ``'source_str'`` entries.
        """
        total_ed = 0.0
        total_norm_ed = 0.0
        remaining = num
        while remaining > 0:
            batch = next(self.text_img_gen)[0]
            # Never score more samples than are still needed.
            take = min(batch['the_input'].shape[0], remaining)
            decoded = decode_batch(self.test_func, batch['the_input'][0:take])
            for idx in range(take):
                dist = float(editdistance.eval(decoded[idx], batch['source_str'][idx]))
                total_ed += dist
                total_norm_ed += dist / len(batch['source_str'][idx])
            remaining -= take
        avg_norm_ed = total_norm_ed / num
        avg_ed = total_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, avg_ed, avg_norm_ed))

Source File: test.py From crnn-pytorch with BSD 2-Clause "Simplified" License

6 votes

def main(data_path, abc, seq_proj, backend, snapshot, input_size, gpu, visualize):
    """Load a model snapshot, evaluate it on a dataset and print the results.

    `gpu` is exported as ``CUDA_VISIBLE_DEVICES``; CUDA is enabled whenever
    it is a non-empty string.  `input_size` and `seq_proj` are strings of
    integers separated by ``'x'``.  When `data_path` is None a ``TestDataset``
    built from `abc` is used instead of a ``TextDataset``.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    # Compare by value: ``is not ''`` tests object identity, which is not a
    # reliable way to compare strings and emits a SyntaxWarning on
    # Python 3.8+.
    cuda = gpu != ''

    input_size = [int(x) for x in input_size.split('x')]
    transform = Compose([
        Rotation(),
        Resize(size=(input_size[0], input_size[1]))
    ])
    if data_path is not None:
        data = TextDataset(data_path=data_path, mode="test", transform=transform)
    else:
        data = TestDataset(transform=transform, abc=abc)
    seq_proj = [int(x) for x in seq_proj.split('x')]
    net = load_model(data.get_abc(), seq_proj, backend, snapshot, cuda).eval()
    acc, avg_ed = test(net, data, data.get_abc(), cuda, visualize)
    print("Accuracy: {}".format(acc))
    print("Edit distance: {}".format(avg_ed))

Source File: utils.py From Decoupled-attention-network with MIT License

6 votes

def add_iter(self, output, out_length, label_length, labels):
        """Accumulate character- and word-level statistics for one batch.

        Predictions are obtained from ``self.de.decode(output, out_length)``.
        For each prediction the character edit distance, the word edit
        distance (words are separated by ``'|'``), the label lengths and the
        exact-match count are added to the running totals on ``self``.

        Note: when ``self.case_sensitive`` is false, the entries of `labels`
        are lower-cased in place.
        """
        self.total_samples += label_length.size()[0]
        prdt_texts, _ = self.de.decode(output, out_length)
        for i in range(len(prdt_texts)):
            if not self.case_sensitive:
                prdt_texts[i] = prdt_texts[i].lower()
                labels[i] = labels[i].lower()
            label_tokens = labels[i].split('|')
            prdt_tokens = prdt_texts[i].split('|')
            # Give each distinct word an integer id in order of first
            # appearance (labels first); a dict keeps the lookup O(1).
            word_ids = {}
            for w in label_tokens + prdt_tokens:
                word_ids.setdefault(w, len(word_ids))
            l_words = [word_ids[w] for w in label_tokens]
            p_words = [word_ids[w] for w in prdt_tokens]
            self.distance_C += ed.eval(labels[i], prdt_texts[i])
            self.distance_W += ed.eval(l_words, p_words)
            self.total_C += len(labels[i])
            self.total_W += len(l_words)
            if labels[i] == prdt_texts[i]:
                self.correct += 1

Source File: image_ocr.py From DeepLearning_Wavelet-LSTM with MIT License

6 votes

def show_edit_distance(self, num):
        """Report edit-distance statistics for `num` decoded samples.

        Samples come from ``self.text_img_gen``; each decoded result is
        compared with the matching ``'source_str'`` entry and the mean edit
        distance and mean length-normalized edit distance are printed.
        """
        ed_sum = 0.0
        norm_ed_sum = 0.0
        todo = num
        while todo > 0:
            word_batch = next(self.text_img_gen)[0]
            # cap the batch so that exactly `num` samples are scored overall
            batch_n = min(word_batch['the_input'].shape[0], todo)
            hyps = decode_batch(self.test_func, word_batch['the_input'][0:batch_n])
            for k in range(batch_n):
                dist = editdistance.eval(hyps[k], word_batch['source_str'][k])
                ed_sum += float(dist)
                norm_ed_sum += float(dist) / len(word_batch['source_str'][k])
            todo -= batch_n
        norm_mean = norm_ed_sum / num
        plain_mean = ed_sum / num
        report = ('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
                  % (num, plain_mean, norm_mean))
        print(report)

Source File: util.py From AfterQC with MIT License

6 votes

def editDistance(s1, s2):
    """Return the Levenshtein edit distance between sequences `s1` and `s2`.

    The ``editdistance`` module is used when ``EDIT_DISTANCE_MODULE_EXISTS``
    is true, the ctypes implementation when ``EDIT_DISTANCE_CTYPES_LOADED``
    is true, and otherwise a pure-Python dynamic-programming table.
    """
    # check if editdistance module loaded
    if EDIT_DISTANCE_MODULE_EXISTS:
        return editdistance.eval(s1, s2)
    elif EDIT_DISTANCE_CTYPES_LOADED:
        return ed_ctypes.edit_distance(s1, len(s1), s2, len(s2))

    m = len(s1) + 1
    n = len(s2) + 1

    # tbl[i][j] is the distance between s1[:i] and s2[:j].
    # ``range`` is used so that the fallback runs on Python 2 and Python 3.
    tbl = [[0] * n for _ in range(m)]
    for i in range(m):
        tbl[i][0] = i
    for j in range(n):
        tbl[0][j] = j
    for i in range(1, m):
        for j in range(1, n):
            cost = 0 if s1[i-1] == s2[j-1] else 1
            tbl[i][j] = min(tbl[i][j-1]+1, tbl[i-1][j]+1, tbl[i-1][j-1]+cost)

    # Index the final cell explicitly instead of relying on the values the
    # loop variables happen to hold after the loops.
    return tbl[m - 1][n - 1]

Source File: node_utils.py From gtos with MIT License

6 votes

def get_frames(self, lemma):
        """
        Return the candidate frames for a lemma.

        If the lemma is in ``self.frequent_senseless_nodes`` or has no entry
        in ``self.lemma_frame_map``, return the single element list [lemma].
        Otherwise return the mapped frames sorted in descending order of the
        key (edit distance between the lemma and the frame with a trailing
        ``-NN`` removed, negated ``NN`` or 0 when the frame has no such
        suffix).
        """
        if lemma in self.frequent_senseless_nodes:
            return [lemma]
        if lemma not in self.lemma_frame_map:
            return [lemma]

        def sort_key(frame):
            # primary: distance of the suffix-stripped frame to the lemma
            stem = re.sub(r'-\d\d$', '', frame)
            # secondary: negated two-digit sense number, if present
            sense = -int(frame[-2:]) if re.search(r'-\d\d$', frame) else 0
            return editdistance.eval(stem, lemma), sense

        return sorted(self.lemma_frame_map[lemma], key=sort_key, reverse=True)

Source File: MorseDecoder.py From LSTM_morse with MIT License

5 votes

def validate(model, loader):
    """Run the model over the validation set and print error statistics.

    Batches are read from `loader` until ``loader.hasNext()`` is false; each
    recognized text is compared with the ground truth in ``batch.gtTexts``.

    Returns a tuple ``(charErrorRate, wordAccuracy)``.  If nothing was
    counted, the rates cannot be computed and the initial values
    ``(float('inf'), 0)`` are returned.
    """
    print('Validate NN')
    loader.validationSet()
    #loader.trainSet()
    charErrorRate = float('inf')
    numCharErr = 0
    numCharTotal = 0
    numWordOK = 0
    numWordTotal = 0
    wordAccuracy = 0
    while loader.hasNext():
        iterInfo = loader.getIteratorInfo()
        print('Batch:', iterInfo[0],'/', iterInfo[1])
        batch = loader.getNext()
        (recognized, probability) = model.inferBatch(batch)
        print(recognized, probability)

        print('Ground truth -> Recognized')
        for i, text in enumerate(recognized):
            numWordOK += 1 if batch.gtTexts[i] == text else 0
            numWordTotal += 1
            dist = editdistance.eval(text, batch.gtTexts[i])
            numCharErr += dist
            numCharTotal += len(batch.gtTexts[i])
            print('[OK]' if dist==0 else '[ERR:%d]' % dist,'"' + batch.gtTexts[i] + '"', '->', '"' + text + '"')

    # print validation result

    try:
        charErrorRate = numCharErr / numCharTotal
        wordAccuracy = numWordOK / numWordTotal
        print('Character error rate: {:4.1f}%. Word accuracy: {:4.1f}%.'.format(charErrorRate*100.0, wordAccuracy*100.0))
        print('numCharTotal:{} numWordTotal:{}'.format(numCharTotal,numWordTotal))
    except ZeroDivisionError:
        # No characters/words were counted: keep the initial rates and only
        # report the totals.  Other exceptions are allowed to propagate.
        print('numCharTotal:{} numWordTotal:{}'.format(numCharTotal,numWordTotal))
    return charErrorRate, wordAccuracy

Source File: evaluation.py From neutralizing-bias with MIT License

5 votes

def get_edit_distance(hypotheses, reference):
    """Return the summed pairwise edit distance divided by ``len(hypotheses)``.

    Hypotheses and references are paired with ``zip``, so pairing stops at
    the shorter of the two.
    """
    total = sum(
        editdistance.eval(hyp, ref) for hyp, ref in zip(hypotheses, reference)
    )
    return total * 1.0 / len(hypotheses)

Source File: main.py From SimpleHTR with MIT License

5 votes

def validate(model, loader):
	"""Run the model over the validation set and return the character error rate.

	Batches are read from `loader` until ``loader.hasNext()`` is false. Each
	recognized text is compared with ``batch.gtTexts``; per-sample results and
	the final character error rate and word accuracy are printed.
	"""
	print('Validate NN')
	loader.validationSet()
	char_errors = 0
	char_count = 0
	words_ok = 0
	word_count = 0
	while loader.hasNext():
		progress = loader.getIteratorInfo()
		print('Batch:', progress[0],'/', progress[1])
		batch = loader.getNext()
		(recognized, _) = model.inferBatch(batch)

		print('Ground truth -> Recognized')
		for i in range(len(recognized)):
			truth = batch.gtTexts[i]
			guess = recognized[i]
			if truth == guess:
				words_ok += 1
			word_count += 1
			dist = editdistance.eval(guess, truth)
			char_errors += dist
			char_count += len(truth)
			tag = '[OK]' if dist==0 else '[ERR:%d]' % dist
			print(tag,'"' + truth + '"', '->', '"' + guess + '"')

	# print validation result
	charErrorRate = char_errors / char_count
	wordAccuracy = words_ok / word_count
	print('Character error rate: %f%%. Word accuracy: %f%%.' % (charErrorRate*100.0, wordAccuracy*100.0))
	return charErrorRate

Source File: callbacks.py From LipNet with MIT License

5 votes

def get_mean_character_error_rate(self, data):
        """Delegate to ``self.get_mean_tuples`` with ``editdistance.eval``.

        The normalizer passed along is the mean length of the second element
        of each pair in `data`.
        """
        second_lengths = [len(pair[1]) for pair in data]
        mean_individual_length = np.mean(second_lengths)
        return self.get_mean_tuples(data, mean_individual_length, editdistance.eval)

Source File: distance.py From panphon with MIT License

5 votes

def fast_levenshtein_distance_div_maxlen(self, source, target):
        """Levenshtein distance divided by maxlen

        Args:
            source (unicode): source word
            target (unicode): target word

        Returns:
            float: minimum number of Levenshtein edits required to get from
                   `source` to `target` divided by the length of the longest
                   of these arguments; 0.0 when both arguments are empty
        """
        maxlen = max(len(source), len(target))
        if maxlen == 0:
            # Two empty sequences are identical; avoid dividing by zero.
            return 0.0
        # float() keeps this a true division on Python 2 as well.
        return float(editdistance.eval(source, target)) / maxlen

Source File: distance.py From panphon with MIT License

5 votes

def fast_levenshtein_distance(self, source, target):
        """Wrapper around ``editdistance.eval`` that returns a plain int

        Args:
            source (unicode): source word
            target (unicode): target word

        Returns:
            int: the result of ``editdistance.eval(source, target)``
                 converted with ``int``
        """
        distance = editdistance.eval(source, target)
        return int(distance)

Python editdistance.eval() Examples