Python nltk.translate.bleu_score.corpus_bleu() Examples

The following are 30 code examples of nltk.translate.bleu_score.corpus_bleu(), collected from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the nltk.translate.bleu_score module.
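Before the project examples, here is a minimal self-contained sketch of the calling convention every snippet below relies on: corpus_bleu() takes a list of reference *lists* (each hypothesis may have several references) and a flat list of hypotheses, all pre-tokenized. The sentences are made up for illustration:

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']],    # one list of references per hypothesis
              [['there', 'is', 'a', 'cat', 'on', 'the', 'mat']]]
hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat'],
              ['a', 'cat', 'is', 'on', 'the', 'mat']]

score = corpus_bleu(
    references, hypotheses,
    weights=(0.25, 0.25, 0.25, 0.25),                # cumulative BLEU-4, the default
    smoothing_function=SmoothingFunction().method1,  # a common choice for short corpora
)
print(score)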
Example #1
Source File: test_bleu.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def test_corpus_bleu_with_bad_sentence(self):
        hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
        ref = (
            "Their tasks include changing a pump on the faulty stokehold . "
            "Likewise , two species that are very similar in morphology "
            "were distinguished using genetics ."
        )
        references = [[ref.split()]]
        hypotheses = [hyp.split()]
        try:  # Check that the warning is raised since the no. of matching 2-grams is 0.
            with self.assertWarns(UserWarning):
                # Verify that the BLEU output is ~0 since the no. of matching 2-grams is 0.
                self.assertAlmostEqual(
                    corpus_bleu(references, hypotheses), 0.0, places=4
                )
        except AttributeError:  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
            self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4) 
Example #2
Source File: eval_utils.py    From tf-var-attention with MIT License
import numpy as np
from nltk.translate.bleu_score import corpus_bleu


def calculate_bleu_scores(references, hypotheses):
    """
    Calculates BLEU 1-4 scores based on NLTK functionality

    Args:
        references: List of reference sentences
        hypotheses: List of generated sentences

    Returns:
        bleu_1, bleu_2, bleu_3, bleu_4: BLEU scores

    """
    bleu_1 = np.round(100 * corpus_bleu(references, hypotheses, weights=(1.0, 0., 0., 0.)), decimals=2)
    bleu_2 = np.round(100 * corpus_bleu(references, hypotheses, weights=(0.50, 0.50, 0., 0.)), decimals=2)
    bleu_3 = np.round(100 * corpus_bleu(references, hypotheses, weights=(0.34, 0.33, 0.33, 0.)), decimals=2)
    bleu_4 = np.round(100 * corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)), decimals=2)
    return bleu_1, bleu_2, bleu_3, bleu_4 
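A usage sketch for the function above, with made-up tokenized data; the returned values are on a 0-100 scale because of the `100 *` rescaling inside the function:

refs = [[['the', 'cat', 'sat', 'on', 'the', 'mat']],
        [['there', 'is', 'a', 'cat', 'here']]]
hyps = [['the', 'cat', 'sat', 'on', 'the', 'mat'],
        ['a', 'cat', 'is', 'here']]
bleu_1, bleu_2, bleu_3, bleu_4 = calculate_bleu_scores(refs, hyps)
print(bleu_1, bleu_2, bleu_3, bleu_4)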
Example #3
Source File: seq2seq_chainerio.py    From pfio with MIT License
def forward(self, trainer):
        with chainer.no_backprop_mode():
            references = []
            hypotheses = []
            for i in range(0, len(self.test_data), self.batch):
                sources, targets = zip(*self.test_data[i:i + self.batch])
                references.extend([[t.tolist()] for t in targets])

                sources = [
                    chainer.dataset.to_device(self.device, x) for x in sources]
                ys = [y.tolist()
                      for y in self.model.translate(sources, self.max_length)]
                hypotheses.extend(ys)

        bleu = bleu_score.corpus_bleu(
            references, hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1)
        chainer.report({self.key: bleu}) 
Example #4
Source File: seq2seq.py    From convolutional_seq2seq with BSD 3-Clause "New" or "Revised" License
def __call__(self, trainer):
        print('## Calculate BLEU')
        with chainer.no_backprop_mode():
            with chainer.using_config('train', False):
                references = []
                hypotheses = []
                for i in range(0, len(self.test_data), self.batch):
                    sources, targets = zip(*self.test_data[i:i + self.batch])
                    references.extend([[t.tolist()] for t in targets])

                    sources = [
                        chainer.dataset.to_device(self.device, x) for x in sources]
                    ys = [y.tolist()
                          for y in self.model.translate(sources, self.max_length)]
                    hypotheses.extend(ys)

        bleu = bleu_score.corpus_bleu(
            references, hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1) * 100
        print('BLEU:', bleu)
        reporter.report({self.key: bleu}) 
Example #5
Source File: evaluators.py    From NeuralDialog-LaRL with Apache License 2.0
def get_report(self):
        tokenize = lambda x: x.split()
        print('Generate report for {} samples'.format(len(self.hyps)))
        refs, hyps = [], []
        tp, fp, fn = 0, 0, 0
        for label, hyp in zip(self.labels, self.hyps):
            ref_tokens = [BOS] + tokenize(label.replace(SYS, '').replace(USR, '').strip()) + [EOS]
            hyp_tokens = [BOS] + tokenize(hyp.replace(SYS, '').replace(USR, '').strip()) + [EOS]
            refs.append([ref_tokens])
            hyps.append(hyp_tokens)

            ref_entities = self._parse_entities(ref_tokens)
            hyp_entities = self._parse_entities(hyp_tokens)
            tpp, fpp, fnn = self._get_tp_fp_fn(ref_entities, hyp_entities)
            tp += tpp
            fp += fpp
            fn += fnn

        # bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
        bleu = BLEUScorer().score(hyps, refs)
        prec, rec, f1 = self._get_prec_recall(tp, fp, fn)
        report = "\nBLEU score {}\nEntity precision {:.4f} recall {:.4f} and f1 {:.4f}\n".format(bleu, prec, rec, f1)
        return report, bleu, prec, rec, f1 
Example #6
Source File: evaluators.py    From NeuralDialog-LaRL with Apache License 2.0
def get_report(self):
        tokenize = get_tokenize()
        print('Generate report for {} samples'.format(len(self.hyps)))
        refs, hyps = [], []
        for label, hyp in zip(self.labels, self.hyps):
            # label = label.replace(EOS, '')
            # hyp = hyp.replace(EOS, '')
            # ref_tokens = tokenize(label)[1:]
            # hyp_tokens = tokenize(hyp)[1:]
            ref_tokens = tokenize(label)
            hyp_tokens = tokenize(hyp)
            refs.append([ref_tokens])
            hyps.append(hyp_tokens)
        bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
        report = '\n===== BLEU = %f =====\n' % (bleu,)
        return '\n===== REPORT FOR DATASET {} ====={}'.format(self.data_name, report) 
Example #7
Source File: seq2seq.py    From chainer with MIT License
def __call__(self, trainer):
        with chainer.no_backprop_mode():
            references = []
            hypotheses = []
            for i in range(0, len(self.test_data), self.batch):
                sources, targets = zip(*self.test_data[i:i + self.batch])
                references.extend([[t.tolist()] for t in targets])

                sources = [
                    chainer.dataset.to_device(self.device, x) for x in sources]
                ys = [y.tolist()
                      for y in self.model.translate(sources, self.max_length)]
                hypotheses.extend(ys)

        bleu = bleu_score.corpus_bleu(
            references, hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1)
        reporter.report({self.key: bleu}) 
Example #8
Source File: seq2seq.py    From chainer with MIT License
def __call__(self, trainer):
        device = self.device

        with chainer.no_backprop_mode():
            references = []
            hypotheses = []
            for i in range(0, len(self.test_data), self.batch):
                sources, targets = zip(*self.test_data[i:i + self.batch])
                references.extend([[t.tolist()] for t in targets])

                sources = [device.send(x) for x in sources]
                ys = [y.tolist()
                      for y in self.model.translate(sources, self.max_length)]
                hypotheses.extend(ys)

        bleu = bleu_score.corpus_bleu(
            references, hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1)
        chainer.report({self.key: bleu}) 
Example #9
Source File: utils.py    From nonauto-nmt with BSD 3-Clause "New" or "Revised" License
def computeGroupBLEU(outputs, targets, tokenizer=None, bra=10, maxmaxlen=80):
    if tokenizer is None:
        tokenizer = revtok.tokenize

    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    maxlens = max([len(t) for t in targets])
    print(maxlens)
    maxlens = min([maxlens, maxmaxlen])
    nums = int(np.ceil(maxlens / bra))
    outputs_buckets = [[] for _ in range(nums)]
    targets_buckets = [[] for _ in range(nums)]
    for o, t in zip(outputs, targets):
        idx = len(o) // bra
        if idx >= len(outputs_buckets):
            idx = -1
        outputs_buckets[idx] += [o]
        targets_buckets[idx] += [t]

    for k in range(nums):
        print(corpus_bleu([[t] for t in targets_buckets[k]], outputs_buckets[k],
                          emulate_multibleu=True))  # emulate_multibleu is only available in older NLTK releases


Example #10
Source File: bleu.py    From funcom with GNU General Public License v3.0
from nltk.translate.bleu_score import corpus_bleu


def bleu_so_far(refs, preds):
    Ba = corpus_bleu(refs, preds)
    B1 = corpus_bleu(refs, preds, weights=(1,0,0,0))
    B2 = corpus_bleu(refs, preds, weights=(0,1,0,0))
    B3 = corpus_bleu(refs, preds, weights=(0,0,1,0))
    B4 = corpus_bleu(refs, preds, weights=(0,0,0,1))

    Ba = round(Ba * 100, 2)
    B1 = round(B1 * 100, 2)
    B2 = round(B2 * 100, 2)
    B3 = round(B3 * 100, 2)
    B4 = round(B4 * 100, 2)

    ret = ''
    ret += ('for %s functions\n' % (len(preds)))
    ret += ('Ba %s\n' % (Ba))
    ret += ('B1 %s\n' % (B1))
    ret += ('B2 %s\n' % (B2))
    ret += ('B3 %s\n' % (B3))
    ret += ('B4 %s\n' % (B4))
    
    return ret 
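The one-hot weight vectors above give the individual n-gram precisions B1-B4, while Ba uses the default cumulative BLEU-4 weights. A small self-contained sketch of the distinction, with toy data:

from nltk.translate.bleu_score import corpus_bleu

refs = [[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']]]
preds = [['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']]

b2_individual = corpus_bleu(refs, preds, weights=(0, 1, 0, 0))              # 2-gram precision alone
ba_cumulative = corpus_bleu(refs, preds, weights=(0.25, 0.25, 0.25, 0.25))  # geometric mean over orders 1-4
print(b2_individual, ba_cumulative)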
Example #11
Source File: data_utils.py    From dgm_latent_bow with MIT License
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm


def quora_read(file_path, bleu_baseline=False):
  """Read the quora dataset"""
  print("Reading quora raw data .. ")
  print("  data path: %s" % file_path)
  with open(file_path) as fd:
    lines = fd.readlines()
  sentence_sets = []
  for l in tqdm(lines):
    p0, p1 = l[:-1].lower().split("\t")
    sentence_sets.append([word_tokenize(p0), word_tokenize(p1)])

  if bleu_baseline:
    print("calculating bleu ... ")
    hypothesis = [s[0] for s in sentence_sets]
    references = [s[1:] for s in sentence_sets]
    bleu = corpus_bleu(references, hypothesis)
    print("bleu on the training set: %.4f" % bleu)
  return sentence_sets 
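A usage sketch for quora_read(); the file name and contents are hypothetical, and word_tokenize requires NLTK's punkt tokenizer data to be downloaded. With such a tiny file the baseline BLEU is near zero and NLTK will warn about missing higher-order n-gram overlaps:

with open("quora_sample.txt", "w") as fd:   # one tab-separated paraphrase pair per line
    fd.write("how do i learn python\twhat is the best way to learn python\n")
    fd.write("is the earth flat\tis our planet flat\n")

pairs = quora_read("quora_sample.txt", bleu_baseline=True)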
Example #12
Source File: helper.py    From DiPS with Apache License 2.0
def bleu_scorer(ref, hyp, script='default'):
    refsend = []
    for i in range(len(ref)):
        refsi = []
        for j in range(len(ref[i])):
            refsi.append(ref[i][j].split())
        refsend.append(refsi)

    gensend = []
    for i in range(len(hyp)):
        gensend.append(hyp[i].split())

    if script == 'nltk':
        metrics = corpus_bleu(refsend, gensend)
        return [metrics]

    metrics = compute_bleu(refsend, gensend)  # project-local BLEU implementation used by default
    return metrics
Example #13
Source File: bleu.py    From FocusSeq2Seq with MIT License
def oracle_bleu(hyp_list, ref, n_process=4):

    # every candidate list must contain the same number of sentences
    assert len(set(len(h) for h in hyp_list)) == 1

    all_hyp_sentence_bleu_list = [get_sent_bleu_list(hyp, ref, n_process=n_process)
                                  for hyp in hyp_list]

    if n_process > len(hyp_list[0]):
        n_process = len(hyp_list[0])

    with Pool(n_process) as pool:
        max_hyp_index_list = list(tqdm(pool.imap(np.argmax, zip(*all_hyp_sentence_bleu_list)),
                                       total=len(hyp_list[0])))  # one argmax per sentence

    best_hyp_list = []
    for i, max_hyp_index in enumerate(max_hyp_index_list):
        best_hyp = hyp_list[max_hyp_index][i]
        best_hyp_list.append(best_hyp)

    return corpus_bleu([[r] for r in ref], best_hyp_list, smoothing_function=cm.method2) 
Example #14
Source File: evaluators.py    From ConvLab with MIT License
def get_report(self):
        tokenize = lambda x: x.split()
        print('Generate report for {} samples'.format(len(self.hyps)))
        refs, hyps = [], []
        tp, fp, fn = 0, 0, 0
        for label, hyp in zip(self.labels, self.hyps):
            ref_tokens = [BOS] + tokenize(label.replace(SYS, '').replace(USR, '').strip()) + [EOS]
            hyp_tokens = [BOS] + tokenize(hyp.replace(SYS, '').replace(USR, '').strip()) + [EOS]
            refs.append([ref_tokens])
            hyps.append(hyp_tokens)

            ref_entities = self._parse_entities(ref_tokens)
            hyp_entities = self._parse_entities(hyp_tokens)
            tpp, fpp, fnn = self._get_tp_fp_fn(ref_entities, hyp_entities)
            tp += tpp
            fp += fpp
            fn += fnn

        # bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
        bleu = BLEUScorer().score(hyps, refs)
        prec, rec, f1 = self._get_prec_recall(tp, fp, fn)
        report = "\nBLEU score {}\nEntity precision {:.4f} recall {:.4f} and f1 {:.4f}\n".format(bleu, prec, rec, f1)
        return report, bleu, prec, rec, f1 
Example #15
Source File: evaluators.py    From ConvLab with MIT License
def get_report(self):
        tokenize = get_tokenize()
        print('Generate report for {} samples'.format(len(self.hyps)))
        refs, hyps = [], []
        for label, hyp in zip(self.labels, self.hyps):
            # label = label.replace(EOS, '')
            # hyp = hyp.replace(EOS, '')
            # ref_tokens = tokenize(label)[1:]
            # hyp_tokens = tokenize(hyp)[1:]
            ref_tokens = tokenize(label)
            hyp_tokens = tokenize(hyp)
            refs.append([ref_tokens])
            hyps.append(hyp_tokens)
        bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
        report = '\n===== BLEU = %f =====\n' % (bleu,)
        return '\n===== REPORT FOR DATASET {} ====={}'.format(self.data_name, report) 
Example #16
Source File: metrics.py    From summarus with Apache License 2.0
def calc_metrics(refs, hyps, language, metric="all", meteor_jar=None):
    metrics = dict()
    metrics["count"] = len(hyps)
    metrics["ref_example"] = refs[-1]
    metrics["hyp_example"] = hyps[-1]
    many_refs = [[r] if not isinstance(r, list) else r for r in refs]  # wrap bare references in a list
    if metric in ("bleu", "all"):
        metrics["bleu"] = corpus_bleu(many_refs, hyps)
    if metric in ("rouge", "all"):
        rouge = Rouge()
        scores = rouge.get_scores(hyps, refs, avg=True)
        metrics.update(scores)
    if metric in ("meteor", "all") and meteor_jar is not None and os.path.exists(meteor_jar):
        meteor = Meteor(meteor_jar, language=language)
        metrics["meteor"] = meteor.compute_score(hyps, many_refs)
    if metric in ("duplicate_ngrams", "all"):
        metrics["duplicate_ngrams"] = dict()
        metrics["duplicate_ngrams"].update(calc_duplicate_n_grams_rate(hyps))
    return metrics 
Example #17
Source File: bleu.py    From FocusSeq2Seq with MIT License
def avg_bleu(hyp_list, ref):
    # `cm` is presumably a module-level SmoothingFunction() instance in the original
    # file, and `flatten` a list-flattening helper.
    return corpus_bleu([[r] for r in ref * len(hyp_list)], flatten(hyp_list),
                       smoothing_function=cm.method2)
Example #18
Source File: test_bleu.py    From cotk with Apache License 2.0
import pytest
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction


def test_bleu_bug():
	# a one-token hypothesis has no 2-grams; method3 smoothing then divides by zero
	ref = [[[1, 3], [3], [4]]]
	gen = [[1]]
	with pytest.raises(ZeroDivisionError):
		corpus_bleu(ref, gen, smoothing_function=SmoothingFunction().method3)
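A minimal sketch of guarding against this failure mode, mirroring the fallback used by the cotk metric classes in Examples #23 and #24 below (on NLTK versions where the bug does not trigger, the try body simply succeeds):

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

ref = [[[1, 3], [3], [4]]]
gen = [[1]]
try:
    bleu = corpus_bleu(ref, gen, smoothing_function=SmoothingFunction().method3)
except ZeroDivisionError:
    bleu = 0.0  # fall back to zero rather than crash
print(bleu)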
Example #19
Source File: evaluate.py    From tatk with Apache License 2.0
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction


def get_bleu4(dialog_acts, golden_utts, gen_utts):
    das2utts = {}
    for das, utt, gen in zip(dialog_acts, golden_utts, gen_utts):
        utt = utt.lower()
        gen = gen.lower()
        for da, svs in das.items():
            domain, act = da.split('-')
            if act == 'Request' or domain == 'general':
                continue
            else:
                for s, v in sorted(svs, key=lambda x: x[0]):
                    if s == 'Internet' or s == 'Parking' or s == 'none' or v == 'none':
                        continue
                    else:
                        v = v.lower()
                        if (' ' + v in utt) or (v + ' ' in utt):
                            utt = utt.replace(v, '{}-{}'.format(da, s), 1)
                        if (' ' + v in gen) or (v + ' ' in gen):
                            gen = gen.replace(v, '{}-{}'.format(da, s), 1)
        hash_key = ''
        for da in sorted(das.keys()):
            for s, v in sorted(das[da], key=lambda x: x[0]):
                hash_key += da + '-' + s + ';'
        das2utts.setdefault(hash_key, {'refs': [], 'gens': []})
        das2utts[hash_key]['refs'].append(utt)
        das2utts[hash_key]['gens'].append(gen)
    # pprint(das2utts)
    refs, gens = [], []
    for das in das2utts.keys():
        for gen in das2utts[das]['gens']:
            refs.append([s.split() for s in das2utts[das]['refs']])
            gens.append(gen.split())
    bleu = corpus_bleu(refs, gens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=SmoothingFunction().method1)
    return bleu 
Example #20
Source File: evaluate.py    From tatk with Apache License 2.0
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction


def get_bleu4(dialog_acts, golden_utts, gen_utts):
    das2utts = {}
    for das, utt, gen in zip(dialog_acts, golden_utts, gen_utts):
        utt = utt.lower()
        gen = gen.lower()
        for da, svs in das.items():
            if da == 'request' or da == 'nooffer':
                continue
            else:
                for s, v in sorted(svs, key=lambda x: x[0]):
                    if s == 'none' or v == 'none':
                        continue
                    else:
                        v = v.lower()
                        if (' ' + v in utt) or (v + ' ' in utt):
                            utt = utt.replace(v, '{}-{}'.format(da, s), 1)
                        if (' ' + v in gen) or (v + ' ' in gen):
                            gen = gen.replace(v, '{}-{}'.format(da, s), 1)
        hash_key = ''
        for da in sorted(das.keys()):
            for s, v in sorted(das[da], key=lambda x: x[0]):
                hash_key += da + '-' + s + ';'
        das2utts.setdefault(hash_key, {'refs': [], 'gens': []})
        das2utts[hash_key]['refs'].append(utt)
        das2utts[hash_key]['gens'].append(gen)
    # pprint(das2utts)
    refs, gens = [], []
    for das in das2utts.keys():
        for gen in das2utts[das]['gens']:
            refs.append([s.split() for s in das2utts[das]['refs']])
            gens.append(gen.split())
    bleu = corpus_bleu(refs, gens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=SmoothingFunction().method1)
    return bleu 
Example #21
Source File: test_bleu.py    From cotk with Apache License 2.0
def get_bleu(self, dataloader, input, reference_key, gen_key):
		refs = []
		gens = []
		for gen_sen, resp_sen in zip(input[gen_key], input[reference_key]):
			gen_sen_processed = dataloader.trim_in_ids(gen_sen)
			resp_sen_processed = dataloader.trim_in_ids(resp_sen[1:])
			refs.append([resp_sen_processed])
			gens.append(gen_sen_processed)
		gens = replace_unk(gens)
		return corpus_bleu(refs, gens, smoothing_function=SmoothingFunction().method3) 
Example #22
Source File: test_bleu.py    From cotk with Apache License 2.0
def get_bleu(self, dataloader, input, reference_key, gen_key):
		refs = []
		gens = []
		for i in range(len(input[reference_key])):
			for resp_sen, gen_sen in zip(input[reference_key][i], input[gen_key][i]):
				gen_sen_processed = dataloader.trim_in_ids(gen_sen)
				resp_sen_processed = dataloader.trim_in_ids(resp_sen)
				gens.append(gen_sen_processed)
				refs.append([resp_sen_processed[1:]])
		gens = replace_unk(gens)
		return corpus_bleu(refs, gens, smoothing_function=SmoothingFunction().method3) 
Example #23
Source File: bleu.py    From cotk with Apache License 2.0
def close(self) -> Dict[str, Any]:
		'''Return a dict which contains

			* **bleu**: bleu value.
			* **bleu hashvalue**: hash value for bleu metric, same hash value stands
			  for same evaluation settings.
		'''
		result = super().close()
		if (not self.hyps) or (not self.refs):
			raise RuntimeError("The metric has not been forwarded data correctly.")

		if self.tokenizer:
			self._do_tokenize()

		if "unk" in self.dataloader.get_special_tokens_mapping():
			self.hyps = replace_unk(self.hyps, self.dataloader.get_special_tokens_mapping()["unk"])
		try:
			weights = np.ones(self.ngram) / self.ngram
			result.update({"bleu": \
				corpus_bleu(self.refs, self.hyps, weights=weights, smoothing_function=SmoothingFunction().method3), \
				"bleu hashvalue": self._hashvalue()})
		except ZeroDivisionError:
			if not self.ignore_smoothing_error:
				raise ZeroDivisionError(
					"Bleu smoothing divided by zero. This is a known bug of corpus_bleu, "
					"usually caused when there is only one sample and the sample length is 1.") from None
			result.update({"bleu": \
					0, \
					"bleu hashvalue": self._hashvalue()})
		return result 
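The `weights = np.ones(self.ngram) / self.ngram` line above generalizes the usual (0.25, 0.25, 0.25, 0.25) tuple to any maximum n-gram order. A standalone sketch of the same idea, with toy data:

import numpy as np
from nltk.translate.bleu_score import corpus_bleu

refs = [[['a', 'b', 'c', 'd']]]
hyps = [['a', 'b', 'c', 'd']]
for n in (1, 2, 3, 4):
    weights = np.ones(n) / n   # uniform weights over orders 1..n
    print(n, corpus_bleu(refs, hyps, weights=weights))   # 1.0 for a perfect match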
Example #24
Source File: bleu.py    From cotk with Apache License 2.0
def close(self) -> Dict[str, Any]:
		'''Return a dict which contains

			* **bleu**: bleu value.
			* **bleu hashvalue**: hash value for bleu metric, same hash value stands
			  for same evaluation settings.
		'''
		result = super().close()
		if (not self.hyps) or (not self.refs):
			raise RuntimeError("The metric has not been forwarded data correctly.")
		self.hyps = replace_unk(self.hyps, self.dataloader.unk_id)

		self._hash_unordered_list(self.refs)

		try:
			result.update({"bleu": \
				corpus_bleu(self.refs, self.hyps, smoothing_function=SmoothingFunction().method3), \
				"bleu hashvalue": self._hashvalue()})
		except ZeroDivisionError:
			if not self.ignore_smoothing_error:
				raise ZeroDivisionError(
					"Bleu smoothing divided by zero. This is a known bug of corpus_bleu, "
					"usually caused when there is only one sample and the sample length is 1.")
			result.update({"bleu": \
					0, \
					"bleu hashvalue": self._hashvalue()})
		return result 
Example #25
Source File: evaluators.py    From NeuralDialog-ZSDG with Apache License 2.0
def get_report(self, include_error=False):
        reports = []
        tokenize = get_tokenize()

        for domain, labels in self.domain_labels.items():
            predictions = self.domain_hyps[domain]
            self.logger.info("Generate report for {} for {} samples".format(domain, len(predictions)))
            refs, hyps = [], []

            # find entity precision, recall and f1
            tp, fp, fn = 0.0, 0.0, 0.0

            for label, hyp in zip(labels, predictions):
                label = label.replace(EOS, '').replace(BOS, '')
                hyp = hyp.replace(EOS, '').replace(BOS, '')
                ref_tokens = tokenize(label)[2:]
                hyp_tokens = tokenize(hyp)[2:]

                refs.append([ref_tokens])
                hyps.append(hyp_tokens)

                label_ents = self.pred_ents(label, tokenize, None)
                hyp_ents = self.pred_ents(hyp, tokenize, None)
                # hyp_ents = list(set(hyp_ents))

                ttpp, ffpp, ffnn = self._get_tp_fp_fn(label_ents, hyp_ents)
                tp += ttpp
                fp += ffpp
                fn += ffnn

            ent_precision, ent_recall, ent_f1 = self._get_prec_recall(tp, fp, fn)

            # compute corpus level scores
            bleu = bleu_score.corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
            report = "\nDomain: %s BLEU %f\n Entity precision %f recall %f and f1 %f\n" \
                     % (domain, bleu, ent_precision, ent_recall, ent_f1)
            reports.append(report)

        return "\n==== REPORT===={report}".format(report="========".join(reports)) 
Example #26
Source File: metrics.py    From dstc8-meta-dialog with MIT License
def overlap_metrics(self, references, predictions):
    for k, v in dict(bleu1=[1.],
                     bleu2=[0.5, 0.5],
                     bleu3=[0.33, 0.33, 0.33],
                     bleu4=[0.25, 0.25, 0.25, 0.25]).items():
      yield k, corpus_bleu(references, predictions, weights=v) 
Example #27
Source File: bleu.py    From DeepPavlov with Apache License 2.0
from nltk.translate.bleu_score import corpus_bleu


def bleu(y_true, y_predicted):
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y_p.lower().split() for y_p in y_predicted]) 
Example #28
Source File: bleu.py    From DeepPavlov with Apache License 2.0
import itertools

from nltk.translate.bleu_score import corpus_bleu


def per_item_bleu(y_true, y_predicted):
    y_predicted = itertools.chain(*y_predicted)
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y_p.lower().split() for y_p in y_predicted]) 
Example #29
Source File: bleu.py    From DeepPavlov with Apache License 2.0
from nltk.translate.bleu_score import corpus_bleu


def per_item_dialog_bleu(y_true, y_predicted):
    y_true = (y['text'] for dialog in y_true for y in dialog)
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y.lower().split() for y_p in y_predicted for y in y_p]) 
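These three DeepPavlov wrappers all lowercase and whitespace-tokenize raw strings before scoring. A quick usage sketch for the plain bleu() wrapper from Example #27:

print(bleu(["The cat sat on the mat"], ["the cat sat on the mat"]))  # identical after lowercasing -> 1.0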
Example #30
Source File: test_bleu.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def test_corpus_bleu(self):
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Reads the BLEU scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The numbers are located in the second-to-last line of the file.
            # The first and 2nd items in the list are the score and system names.
            # Materialize the scores as a list so they can be iterated twice below.
            mteval_bleu_scores = list(map(float, mteval_fin.readlines()[-2].split()[1:-1]))

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace tokenize the file.
                # Note: split() automatically strips.
                hypothesis = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_bleu input is list of list of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(
                        references, hypothesis, weights=(1.0 / i,) * i
                    )
                    # Check that the BLEU score difference is less than 0.005.
                    # Note: This is an approximate comparison; as much as
                    #       +/- 0.01 BLEU might be "statistically significant",
                    #       the actual translation quality might not be.
                    assert abs(mteval_bleu - nltk_bleu) < 0.005

                # With the same smoothing method used in mteval-v13a.pl
                chencherry = SmoothingFunction()
                for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(
                        references,
                        hypothesis,
                        weights=(1.0 / i,) * i,
                        smoothing_function=chencherry.method3,
                    )
                    assert abs(mteval_bleu - nltk_bleu) < 0.005