Python nltk.translate.bleu_score.corpus_bleu() Examples
The following are 30 code examples of nltk.translate.bleu_score.corpus_bleu(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk.translate.bleu_score, or try the search function.
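Before diving in, a minimal sketch of the input shapes corpus_bleu() expects may be useful: hypotheses is a flat list of tokenized sentences, while references is a list of reference lists, one (possibly multi-reference) list per hypothesis. The sentences below are made up for illustration.

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# One hypothesis per source sentence; each hypothesis is a list of tokens.
hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat'],
              ['a', 'dog', 'ran', 'in', 'the', 'park']]

# One list of references per hypothesis; multiple references are allowed.
references = [[['the', 'cat', 'is', 'on', 'the', 'mat'],
               ['there', 'is', 'a', 'cat', 'on', 'the', 'mat']],
              [['a', 'dog', 'was', 'running', 'in', 'the', 'park']]]

# The default weights (0.25, 0.25, 0.25, 0.25) give cumulative BLEU-4;
# smoothing avoids zero scores when higher-order n-grams do not match.
score = corpus_bleu(references, hypotheses,
                    smoothing_function=SmoothingFunction().method1)
print('corpus BLEU: %.4f' % score)

Many of the examples below follow exactly this pattern, differing mainly in how they tokenize and in which SmoothingFunction method they choose.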
Example #1
Source File: test_bleu.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def test_corpus_bleu_with_bad_sentence(self):
    hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
    ref = str(
        "Their tasks include changing a pump on the faulty stokehold ."
        "Likewise , two species that are very similar in morphology "
        "were distinguished using genetics ."
    )
    references = [[ref.split()]]
    hypotheses = [hyp.split()]
    try:
        # Check that the warning is raised since no. of 2-grams < 0.
        with self.assertWarns(UserWarning):
            # Verify that the BLEU output is undesired since no. of 2-grams < 0.
            self.assertAlmostEqual(
                corpus_bleu(references, hypotheses), 0.0, places=4
            )
    except AttributeError:
        # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
        self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4)
Example #2
Source File: eval_utils.py From tf-var-attention with MIT License | 6 votes |
def calculate_bleu_scores(references, hypotheses):
    """
    Calculates BLEU 1-4 scores based on NLTK functionality

    Args:
        references: List of reference sentences
        hypotheses: List of generated sentences

    Returns:
        bleu_1, bleu_2, bleu_3, bleu_4: BLEU scores
    """
    bleu_1 = np.round(100 * corpus_bleu(references, hypotheses, weights=(1.0, 0., 0., 0.)), decimals=2)
    bleu_2 = np.round(100 * corpus_bleu(references, hypotheses, weights=(0.50, 0.50, 0., 0.)), decimals=2)
    bleu_3 = np.round(100 * corpus_bleu(references, hypotheses, weights=(0.34, 0.33, 0.33, 0.)), decimals=2)
    bleu_4 = np.round(100 * corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)), decimals=2)
    return bleu_1, bleu_2, bleu_3, bleu_4
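A hedged usage sketch for the helper above, assuming numpy is imported as np and the inputs are already tokenized, with references nested one level deeper than hypotheses as corpus_bleu requires; the data is made up:

refs = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]  # one reference list per hypothesis
hyps = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
b1, b2, b3, b4 = calculate_bleu_scores(refs, hyps)   # scores scaled to 0-100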
Example #3
Source File: seq2seq_chainerio.py From pfio with MIT License | 6 votes |
def forward(self, trainer):
    with chainer.no_backprop_mode():
        references = []
        hypotheses = []
        for i in range(0, len(self.test_data), self.batch):
            sources, targets = zip(*self.test_data[i:i + self.batch])
            references.extend([[t.tolist()] for t in targets])
            sources = [
                chainer.dataset.to_device(self.device, x) for x in sources]
            ys = [y.tolist()
                  for y in self.model.translate(sources, self.max_length)]
            hypotheses.extend(ys)

    bleu = bleu_score.corpus_bleu(
        references, hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1)
    chainer.report({self.key: bleu})
Example #4
Source File: seq2seq.py From convolutional_seq2seq with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __call__(self, trainer):
    print('## Calculate BLEU')
    with chainer.no_backprop_mode():
        with chainer.using_config('train', False):
            references = []
            hypotheses = []
            for i in range(0, len(self.test_data), self.batch):
                sources, targets = zip(*self.test_data[i:i + self.batch])
                references.extend([[t.tolist()] for t in targets])
                sources = [
                    chainer.dataset.to_device(self.device, x) for x in sources]
                ys = [y.tolist()
                      for y in self.model.translate(sources, self.max_length)]
                hypotheses.extend(ys)

    bleu = bleu_score.corpus_bleu(
        references, hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1) * 100
    print('BLEU:', bleu)
    reporter.report({self.key: bleu})
Example #5
Source File: evaluators.py From NeuralDialog-LaRL with Apache License 2.0 | 6 votes |
def get_report(self):
    tokenize = lambda x: x.split()
    print('Generate report for {} samples'.format(len(self.hyps)))
    refs, hyps = [], []
    tp, fp, fn = 0, 0, 0
    for label, hyp in zip(self.labels, self.hyps):
        ref_tokens = [BOS] + tokenize(label.replace(SYS, '').replace(USR, '').strip()) + [EOS]
        hyp_tokens = [BOS] + tokenize(hyp.replace(SYS, '').replace(USR, '').strip()) + [EOS]
        refs.append([ref_tokens])
        hyps.append(hyp_tokens)
        ref_entities = self._parse_entities(ref_tokens)
        hyp_entities = self._parse_entities(hyp_tokens)
        tpp, fpp, fnn = self._get_tp_fp_fn(ref_entities, hyp_entities)
        tp += tpp
        fp += fpp
        fn += fnn

    # bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
    bleu = BLEUScorer().score(hyps, refs)
    prec, rec, f1 = self._get_prec_recall(tp, fp, fn)
    report = "\nBLEU score {}\nEntity precision {:.4f} recall {:.4f} and f1 {:.4f}\n".format(bleu, prec, rec, f1)
    return report, bleu, prec, rec, f1
Example #6
Source File: evaluators.py From NeuralDialog-LaRL with Apache License 2.0 | 6 votes |
def get_report(self):
    tokenize = get_tokenize()
    print('Generate report for {} samples'.format(len(self.hyps)))
    refs, hyps = [], []
    for label, hyp in zip(self.labels, self.hyps):
        # label = label.replace(EOS, '')
        # hyp = hyp.replace(EOS, '')
        # ref_tokens = tokenize(label)[1:]
        # hyp_tokens = tokenize(hyp)[1:]
        ref_tokens = tokenize(label)
        hyp_tokens = tokenize(hyp)
        refs.append([ref_tokens])
        hyps.append(hyp_tokens)
    bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
    report = '\n===== BLEU = %f =====\n' % (bleu,)
    return '\n===== REPORT FOR DATASET {} ====={}'.format(self.data_name, report)
Example #7
Source File: seq2seq.py From chainer with MIT License | 6 votes |
def __call__(self, trainer):
    with chainer.no_backprop_mode():
        references = []
        hypotheses = []
        for i in range(0, len(self.test_data), self.batch):
            sources, targets = zip(*self.test_data[i:i + self.batch])
            references.extend([[t.tolist()] for t in targets])
            sources = [
                chainer.dataset.to_device(self.device, x) for x in sources]
            ys = [y.tolist()
                  for y in self.model.translate(sources, self.max_length)]
            hypotheses.extend(ys)

    bleu = bleu_score.corpus_bleu(
        references, hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1)
    reporter.report({self.key: bleu})
Example #8
Source File: seq2seq.py From chainer with MIT License | 6 votes |
def __call__(self, trainer):
    device = self.device

    with chainer.no_backprop_mode():
        references = []
        hypotheses = []
        for i in range(0, len(self.test_data), self.batch):
            sources, targets = zip(*self.test_data[i:i + self.batch])
            references.extend([[t.tolist()] for t in targets])
            sources = [device.send(x) for x in sources]
            ys = [y.tolist()
                  for y in self.model.translate(sources, self.max_length)]
            hypotheses.extend(ys)

    bleu = bleu_score.corpus_bleu(
        references, hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1)
    chainer.report({self.key: bleu})
Example #9
Source File: utils.py From nonauto-nmt with BSD 3-Clause "New" or "Revised" License | 6 votes |
def computeGroupBLEU(outputs, targets, tokenizer=None, bra=10, maxmaxlen=80):
    if tokenizer is None:
        tokenizer = revtok.tokenize

    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    maxlens = max([len(t) for t in targets])
    print(maxlens)
    maxlens = min([maxlens, maxmaxlen])
    nums = int(np.ceil(maxlens / bra))
    outputs_buckets = [[] for _ in range(nums)]
    targets_buckets = [[] for _ in range(nums)]
    for o, t in zip(outputs, targets):
        idx = len(o) // bra
        if idx >= len(outputs_buckets):
            idx = -1
        outputs_buckets[idx] += [o]
        targets_buckets[idx] += [t]

    for k in range(nums):
        # Note: emulate_multibleu only exists in older NLTK releases;
        # it was removed in later versions of corpus_bleu.
        print(corpus_bleu([[t] for t in targets_buckets[k]],
                          [o for o in outputs_buckets[k]],
                          emulate_multibleu=True))


# load the dataset + reversible tokenization
Example #10
Source File: bleu.py From funcom with GNU General Public License v3.0 | 6 votes |
def bleu_so_far(refs, preds):
    Ba = corpus_bleu(refs, preds)
    B1 = corpus_bleu(refs, preds, weights=(1, 0, 0, 0))
    B2 = corpus_bleu(refs, preds, weights=(0, 1, 0, 0))
    B3 = corpus_bleu(refs, preds, weights=(0, 0, 1, 0))
    B4 = corpus_bleu(refs, preds, weights=(0, 0, 0, 1))

    Ba = round(Ba * 100, 2)
    B1 = round(B1 * 100, 2)
    B2 = round(B2 * 100, 2)
    B3 = round(B3 * 100, 2)
    B4 = round(B4 * 100, 2)

    ret = ''
    ret += ('for %s functions\n' % (len(preds)))
    ret += ('Ba %s\n' % (Ba))
    ret += ('B1 %s\n' % (B1))
    ret += ('B2 %s\n' % (B2))
    ret += ('B3 %s\n' % (B3))
    ret += ('B4 %s\n' % (B4))
    return ret
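Note the one-hot weight vectors above: B1 through B4 are individual n-gram precisions (each order scored on its own), while Ba with the default weights is the usual cumulative BLEU-4 geometric mean. A small hedged sketch of the difference, with invented tokens:

from nltk.translate.bleu_score import corpus_bleu

refs = [[['the', 'quick', 'brown', 'fox', 'jumps']]]
preds = [['the', 'quick', 'brown', 'fox', 'leaps']]

cumulative = corpus_bleu(refs, preds)                        # geometric mean over 1- to 4-grams
bigram_only = corpus_bleu(refs, preds, weights=(0, 1, 0, 0)) # 2-gram precision alone
print(bleu_so_far(refs, preds))                              # helper above; numbers illustrative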
Example #11
Source File: data_utils.py From dgm_latent_bow with MIT License | 6 votes |
def quora_read(file_path, bleu_baseline=False):
    """Read the quora dataset"""
    print("Reading quora raw data .. ")
    print("  data path: %s" % file_path)
    with open(file_path) as fd:
        lines = fd.readlines()
    sentence_sets = []
    for l in tqdm(lines):
        p0, p1 = l[:-1].lower().split("\t")
        sentence_sets.append([word_tokenize(p0), word_tokenize(p1)])

    if bleu_baseline:
        print("calculating bleu ... ")
        hypothesis = [s[0] for s in sentence_sets]
        references = [s[1:] for s in sentence_sets]
        bleu = corpus_bleu(references, hypothesis)
        print("bleu on the training set: %.4f" % bleu)
    return sentence_sets
Example #12
Source File: helper.py From DiPS with Apache License 2.0 | 6 votes |
def bleu_scorer(ref, hyp, script='default'):
    refsend = []
    for i in range(len(ref)):
        refsi = []
        for j in range(len(ref[i])):
            refsi.append(ref[i][j].split())
        refsend.append(refsi)

    gensend = []
    for i in range(len(hyp)):
        gensend.append(hyp[i].split())

    if script == 'nltk':
        metrics = corpus_bleu(refsend, gensend)
        return [metrics]

    metrics = compute_bleu(refsend, gensend)
    return metrics
Example #13
Source File: bleu.py From FocusSeq2Seq with MIT License | 6 votes |
def oracle_bleu(hyp_list, ref, n_process=4):
    assert len(set([len(h) for h in hyp_list])) == 1

    all_hyp_sentence_bleu_list = [get_sent_bleu_list(hyp, ref, n_process=n_process)
                                  for hyp in hyp_list]

    if n_process > len(hyp_list[0]):
        n_process = len(hyp_list[0])

    with Pool(n_process) as pool:
        max_hyp_index_list = list(tqdm(pool.imap(np.argmax,
                                                 zip(*all_hyp_sentence_bleu_list)),
                                       total=len(all_hyp_sentence_bleu_list)))

    best_hyp_list = []
    for i, max_hyp_index in enumerate(max_hyp_index_list):
        best_hyp = hyp_list[max_hyp_index][i]
        best_hyp_list.append(best_hyp)

    return corpus_bleu([[r] for r in ref], best_hyp_list,
                       smoothing_function=cm.method2)
Example #14
Source File: evaluators.py From ConvLab with MIT License | 6 votes |
def get_report(self):
    tokenize = lambda x: x.split()
    print('Generate report for {} samples'.format(len(self.hyps)))
    refs, hyps = [], []
    tp, fp, fn = 0, 0, 0
    for label, hyp in zip(self.labels, self.hyps):
        ref_tokens = [BOS] + tokenize(label.replace(SYS, '').replace(USR, '').strip()) + [EOS]
        hyp_tokens = [BOS] + tokenize(hyp.replace(SYS, '').replace(USR, '').strip()) + [EOS]
        refs.append([ref_tokens])
        hyps.append(hyp_tokens)
        ref_entities = self._parse_entities(ref_tokens)
        hyp_entities = self._parse_entities(hyp_tokens)
        tpp, fpp, fnn = self._get_tp_fp_fn(ref_entities, hyp_entities)
        tp += tpp
        fp += fpp
        fn += fnn

    # bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
    bleu = BLEUScorer().score(hyps, refs)
    prec, rec, f1 = self._get_prec_recall(tp, fp, fn)
    report = "\nBLEU score {}\nEntity precision {:.4f} recall {:.4f} and f1 {:.4f}\n".format(bleu, prec, rec, f1)
    return report, bleu, prec, rec, f1
Example #15
Source File: evaluators.py From ConvLab with MIT License | 6 votes |
def get_report(self):
    tokenize = get_tokenize()
    print('Generate report for {} samples'.format(len(self.hyps)))
    refs, hyps = [], []
    for label, hyp in zip(self.labels, self.hyps):
        # label = label.replace(EOS, '')
        # hyp = hyp.replace(EOS, '')
        # ref_tokens = tokenize(label)[1:]
        # hyp_tokens = tokenize(hyp)[1:]
        ref_tokens = tokenize(label)
        hyp_tokens = tokenize(hyp)
        refs.append([ref_tokens])
        hyps.append(hyp_tokens)
    bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
    report = '\n===== BLEU = %f =====\n' % (bleu,)
    return '\n===== REPORT FOR DATASET {} ====={}'.format(self.data_name, report)
Example #16
Source File: metrics.py From summarus with Apache License 2.0 | 6 votes |
def calc_metrics(refs, hyps, language, metric="all", meteor_jar=None):
    metrics = dict()
    metrics["count"] = len(hyps)
    metrics["ref_example"] = refs[-1]
    metrics["hyp_example"] = hyps[-1]
    # The original used `r is not list`, which compares against the list type
    # and is always true; the intent is to wrap bare references in a list.
    many_refs = [r if isinstance(r, list) else [r] for r in refs]
    if metric in ("bleu", "all"):
        metrics["bleu"] = corpus_bleu(many_refs, hyps)
    if metric in ("rouge", "all"):
        rouge = Rouge()
        scores = rouge.get_scores(hyps, refs, avg=True)
        metrics.update(scores)
    if metric in ("meteor", "all") and meteor_jar is not None and os.path.exists(meteor_jar):
        meteor = Meteor(meteor_jar, language=language)
        metrics["meteor"] = meteor.compute_score(hyps, many_refs)
    if metric in ("duplicate_ngrams", "all"):
        metrics["duplicate_ngrams"] = dict()
        metrics["duplicate_ngrams"].update(calc_duplicate_n_grams_rate(hyps))
    return metrics
Example #17
Source File: bleu.py From FocusSeq2Seq with MIT License | 5 votes |
def avg_bleu(hyp_list, ref):
    return corpus_bleu([[r] for r in ref * len(hyp_list)],
                       flatten(hyp_list),
                       smoothing_function=cm.method2)
Example #18
Source File: test_bleu.py From cotk with Apache License 2.0 | 5 votes |
def test_bleu_bug():
    ref = [[[1, 3], [3], [4]]]
    gen = [[1]]
    with pytest.raises(ZeroDivisionError):
        corpus_bleu(ref, gen, smoothing_function=SmoothingFunction().method3)
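This cotk test pins down a real edge case: with method3 smoothing, corpus_bleu() can raise ZeroDivisionError when a hypothesis is a single token. A hedged sketch of the guard that callers typically add (Examples #23 and #24 below wrap the call the same way):

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def safe_corpus_bleu(refs, gens):
    # Fall back to 0.0 on the known single-token smoothing bug.
    try:
        return corpus_bleu(refs, gens,
                           smoothing_function=SmoothingFunction().method3)
    except ZeroDivisionError:
        return 0.0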
Example #19
Source File: evaluate.py From tatk with Apache License 2.0 | 5 votes |
def get_bleu4(dialog_acts, golden_utts, gen_utts):
    das2utts = {}
    for das, utt, gen in zip(dialog_acts, golden_utts, gen_utts):
        utt = utt.lower()
        gen = gen.lower()
        for da, svs in das.items():
            domain, act = da.split('-')
            if act == 'Request' or domain == 'general':
                continue
            else:
                for s, v in sorted(svs, key=lambda x: x[0]):
                    if s == 'Internet' or s == 'Parking' or s == 'none' or v == 'none':
                        continue
                    else:
                        v = v.lower()
                        if (' ' + v in utt) or (v + ' ' in utt):
                            utt = utt.replace(v, '{}-{}'.format(da, s), 1)
                        if (' ' + v in gen) or (v + ' ' in gen):
                            gen = gen.replace(v, '{}-{}'.format(da, s), 1)
        hash_key = ''
        for da in sorted(das.keys()):
            for s, v in sorted(das[da], key=lambda x: x[0]):
                hash_key += da + '-' + s + ';'
        das2utts.setdefault(hash_key, {'refs': [], 'gens': []})
        das2utts[hash_key]['refs'].append(utt)
        das2utts[hash_key]['gens'].append(gen)
    # pprint(das2utts)
    refs, gens = [], []
    for das in das2utts.keys():
        for gen in das2utts[das]['gens']:
            refs.append([s.split() for s in das2utts[das]['refs']])
            gens.append(gen.split())
    bleu = corpus_bleu(refs, gens, weights=(0.25, 0.25, 0.25, 0.25),
                       smoothing_function=SmoothingFunction().method1)
    return bleu
Example #20
Source File: evaluate.py From tatk with Apache License 2.0 | 5 votes |
def get_bleu4(dialog_acts, golden_utts, gen_utts):
    das2utts = {}
    for das, utt, gen in zip(dialog_acts, golden_utts, gen_utts):
        utt = utt.lower()
        gen = gen.lower()
        for da, svs in das.items():
            if da == 'request' or da == 'nooffer':
                continue
            else:
                for s, v in sorted(svs, key=lambda x: x[0]):
                    if s == 'none' or v == 'none':
                        continue
                    else:
                        v = v.lower()
                        if (' ' + v in utt) or (v + ' ' in utt):
                            utt = utt.replace(v, '{}-{}'.format(da, s), 1)
                        if (' ' + v in gen) or (v + ' ' in gen):
                            gen = gen.replace(v, '{}-{}'.format(da, s), 1)
        hash_key = ''
        for da in sorted(das.keys()):
            for s, v in sorted(das[da], key=lambda x: x[0]):
                hash_key += da + '-' + s + ';'
        das2utts.setdefault(hash_key, {'refs': [], 'gens': []})
        das2utts[hash_key]['refs'].append(utt)
        das2utts[hash_key]['gens'].append(gen)
    # pprint(das2utts)
    refs, gens = [], []
    for das in das2utts.keys():
        for gen in das2utts[das]['gens']:
            refs.append([s.split() for s in das2utts[das]['refs']])
            gens.append(gen.split())
    bleu = corpus_bleu(refs, gens, weights=(0.25, 0.25, 0.25, 0.25),
                       smoothing_function=SmoothingFunction().method1)
    return bleu
Example #21
Source File: test_bleu.py From cotk with Apache License 2.0 | 5 votes |
def get_bleu(self, dataloader, input, reference_key, gen_key):
    refs = []
    gens = []
    for gen_sen, resp_sen in zip(input[gen_key], input[reference_key]):
        gen_sen_processed = dataloader.trim_in_ids(gen_sen)
        resp_sen_processed = dataloader.trim_in_ids(resp_sen[1:])
        refs.append([resp_sen_processed])
        gens.append(gen_sen_processed)
    gens = replace_unk(gens)
    return corpus_bleu(refs, gens, smoothing_function=SmoothingFunction().method3)
Example #22
Source File: test_bleu.py From cotk with Apache License 2.0 | 5 votes |
def get_bleu(self, dataloader, input, reference_key, gen_key):
    refs = []
    gens = []
    for i in range(len(input[reference_key])):
        for resp_sen, gen_sen in zip(input[reference_key][i], input[gen_key][i]):
            gen_sen_processed = dataloader.trim_in_ids(gen_sen)
            resp_sen_processed = dataloader.trim_in_ids(resp_sen)
            gens.append(gen_sen_processed)
            refs.append([resp_sen_processed[1:]])
    gens = replace_unk(gens)
    return corpus_bleu(refs, gens, smoothing_function=SmoothingFunction().method3)
Example #23
Source File: bleu.py From cotk with Apache License 2.0 | 5 votes |
def close(self) -> Dict[str, Any]:
    '''Return a dict which contains

    * **bleu**: bleu value.
    * **bleu hashvalue**: hash value for bleu metric, same hash value stands
      for same evaluation settings.
    '''
    result = super().close()
    if (not self.hyps) or (not self.refs):
        raise RuntimeError("The metric has not been forwarded data correctly.")

    if self.tokenizer:
        self._do_tokenize()

    if "unk" in self.dataloader.get_special_tokens_mapping():
        self.hyps = replace_unk(self.hyps, self.dataloader.get_special_tokens_mapping()["unk"])

    try:
        weights = np.ones(self.ngram) / self.ngram
        result.update({
            "bleu": corpus_bleu(self.refs, self.hyps, weights=weights,
                                smoothing_function=SmoothingFunction().method3),
            "bleu hashvalue": self._hashvalue()})
    except ZeroDivisionError:
        if not self.ignore_smoothing_error:
            raise ZeroDivisionError(
                "Bleu smoothing divided by zero. This is a known bug of corpus_bleu, "
                "usually caused when there is only one sample and the sample length is 1."
            ) from None
        result.update({"bleu": 0, "bleu hashvalue": self._hashvalue()})
    return result
Example #24
Source File: bleu.py From cotk with Apache License 2.0 | 5 votes |
def close(self) -> Dict[str, Any]:
    '''Return a dict which contains

    * **bleu**: bleu value.
    * **bleu hashvalue**: hash value for bleu metric, same hash value stands
      for same evaluation settings.
    '''
    result = super().close()
    if (not self.hyps) or (not self.refs):
        raise RuntimeError("The metric has not been forwarded data correctly.")

    self.hyps = replace_unk(self.hyps, self.dataloader.unk_id)
    self._hash_unordered_list(self.refs)

    try:
        result.update({
            "bleu": corpus_bleu(self.refs, self.hyps,
                                smoothing_function=SmoothingFunction().method3),
            "bleu hashvalue": self._hashvalue()})
    except ZeroDivisionError:
        if not self.ignore_smoothing_error:
            raise ZeroDivisionError(
                "Bleu smoothing divided by zero. This is a known bug of corpus_bleu, "
                "usually caused when there is only one sample and the sample length is 1."
            )
        result.update({"bleu": 0, "bleu hashvalue": self._hashvalue()})
    return result
Example #25
Source File: evaluators.py From NeuralDialog-ZSDG with Apache License 2.0 | 5 votes |
def get_report(self, include_error=False):
    reports = []
    tokenize = get_tokenize()

    for domain, labels in self.domain_labels.items():
        predictions = self.domain_hyps[domain]
        self.logger.info("Generate report for {} for {} samples".format(domain, len(predictions)))
        refs, hyps = [], []

        # find entity precision, recall and f1
        tp, fp, fn = 0.0, 0.0, 0.0

        for label, hyp in zip(labels, predictions):
            label = label.replace(EOS, '').replace(BOS, '')
            hyp = hyp.replace(EOS, '').replace(BOS, '')
            ref_tokens = tokenize(label)[2:]
            hyp_tokens = tokenize(hyp)[2:]
            refs.append([ref_tokens])
            hyps.append(hyp_tokens)

            label_ents = self.pred_ents(label, tokenize, None)
            hyp_ents = self.pred_ents(hyp, tokenize, None)
            # hyp_ents = list(set(hyp_ents))

            ttpp, ffpp, ffnn = self._get_tp_fp_fn(label_ents, hyp_ents)
            tp += ttpp
            fp += ffpp
            fn += ffnn

        ent_precision, ent_recall, ent_f1 = self._get_prec_recall(tp, fp, fn)

        # compute corpus level scores
        bleu = bleu_score.corpus_bleu(refs, hyps,
                                      smoothing_function=SmoothingFunction().method1)
        report = "\nDomain: %s BLEU %f\n Entity precision %f recall %f and f1 %f\n" \
                 % (domain, bleu, ent_precision, ent_recall, ent_f1)
        reports.append(report)

    return "\n==== REPORT===={report}".format(report="========".join(reports))
Example #26
Source File: metrics.py From dstc8-meta-dialog with MIT License | 5 votes |
def overlap_metrics(self, references, predictions):
    # Note: the original source passed only three weights for bleu4
    # ([0.25, 0.25, 0.25]); a true BLEU-4 needs four.
    for k, v in dict(bleu1=[1.],
                     bleu2=[0.5, 0.5],
                     bleu3=[0.33, 0.33, 0.33],
                     bleu4=[0.25, 0.25, 0.25, 0.25]).items():
        yield k, corpus_bleu(references, predictions, weights=v)
Example #27
Source File: bleu.py From DeepPavlov with Apache License 2.0 | 5 votes |
def bleu(y_true, y_predicted):
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y_p.lower().split() for y_p in y_predicted])
Example #28
Source File: bleu.py From DeepPavlov with Apache License 2.0 | 5 votes |
def per_item_bleu(y_true, y_predicted):
    y_predicted = itertools.chain(*y_predicted)
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y_p.lower().split() for y_p in y_predicted])
Example #29
Source File: bleu.py From DeepPavlov with Apache License 2.0 | 5 votes |
def per_item_dialog_bleu(y_true, y_predicted):
    y_true = (y['text'] for dialog in y_true for y in dialog)
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y.lower().split() for y_p in y_predicted for y in y_p])
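The three DeepPavlov helpers above all take raw strings and tokenize by lowercasing and whitespace-splitting. A brief hedged usage sketch for the simplest one, with invented sentences:

y_true = ['The cat is on the mat .']
y_predicted = ['the cat sat on the mat .']
print(bleu(y_true, y_predicted))  # corpus BLEU over the lowercased, split pair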
Example #30
Source File: test_bleu.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def test_corpus_bleu(self):
    ref_file = find('models/wmt15_eval/ref.ru')
    hyp_file = find('models/wmt15_eval/google.ru')
    mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

    # Reads the BLEU scores from the `mteval-13a.output` file.
    # The order of the list corresponds to the order of the ngrams.
    with open(mteval_output_file, 'r') as mteval_fin:
        # The numbers are located in the last 2nd line of the file.
        # The first and 2nd item in the list are the score and system names.
        # Materialize the scores: a bare map object would be exhausted by
        # the first loop below, leaving the second loop with nothing.
        mteval_bleu_scores = list(map(float, mteval_fin.readlines()[-2].split()[1:-1]))

    with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
        with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
            # Whitespace tokenize the file.
            # Note: split() automatically strip().
            hypothesis = list(map(lambda x: x.split(), hyp_fin))
            # Note that the corpus_bleu input is list of list of references.
            references = list(map(lambda x: [x.split()], ref_fin))
            # Without smoothing.
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(
                    references, hypothesis, weights=(1.0 / i,) * i
                )
                # Check that the BLEU scores difference is less than 0.005 .
                # Note: This is an approximate comparison; as much as
                # +/- 0.01 BLEU might be "statistically significant",
                # the actual translation quality might not be.
                assert abs(mteval_bleu - nltk_bleu) < 0.005

            # With the same smoothing method used in mteval-v13a.pl
            chencherry = SmoothingFunction()
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(
                    references,
                    hypothesis,
                    weights=(1.0 / i,) * i,
                    smoothing_function=chencherry.method3,
                )
                assert abs(mteval_bleu - nltk_bleu) < 0.005