Python nltk.translate.bleu_score.corpus_bleu() Examples
The following are 30 code examples of nltk.translate.bleu_score.corpus_bleu(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk.translate.bleu_score, or try the search function.
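Before diving in, a minimal sketch of the input shapes corpus_bleu() expects may be useful: hypotheses is a flat list of tokenized sentences, while references is a list of reference lists, one (possibly multi-reference) list per hypothesis. The sentences below are made up for illustration.

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# One hypothesis per source sentence; each hypothesis is a list of tokens.
hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat'],
              ['a', 'dog', 'ran', 'in', 'the', 'park']]

# One list of references per hypothesis; multiple references are allowed.
references = [[['the', 'cat', 'is', 'on', 'the', 'mat'],
               ['there', 'is', 'a', 'cat', 'on', 'the', 'mat']],
              [['a', 'dog', 'was', 'running', 'in', 'the', 'park']]]

# The default weights (0.25, 0.25, 0.25, 0.25) give cumulative BLEU-4;
# smoothing avoids zero scores when higher-order n-grams do not match.
score = corpus_bleu(references, hypotheses,
                    smoothing_function=SmoothingFunction().method1)
print('corpus BLEU: %.4f' % score)

Many of the examples below follow exactly this pattern, differing mainly in how they tokenize and in which SmoothingFunction method they choose.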
Example #1
Source File: test_bleu.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def test_corpus_bleu_with_bad_sentence(self):
    hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
    ref = str(
        "Their tasks include changing a pump on the faulty stokehold ."
        "Likewise , two species that are very similar in morphology "
        "were distinguished using genetics ."
    )
    references = [[ref.split()]]
    hypotheses = [hyp.split()]
    try:
        # Check that the warning is raised since no. of 2-grams < 0.
        with self.assertWarns(UserWarning):
            # Verify that the BLEU output is undesired since no. of 2-grams < 0.
            self.assertAlmostEqual(
                corpus_bleu(references, hypotheses), 0.0, places=4
            )
    except AttributeError:
        # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
        self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4)
Example #2
Source File: eval_utils.py From tf-var-attention with MIT License | 6 votes |
def calculate_bleu_scores(references, hypotheses):
    """
    Calculates BLEU 1-4 scores based on NLTK functionality

    Args:
        references: List of reference sentences
        hypotheses: List of generated sentences

    Returns:
        bleu_1, bleu_2, bleu_3, bleu_4: BLEU scores
    """
    bleu_1 = np.round(100 * corpus_bleu(references, hypotheses, weights=(1.0, 0., 0., 0.)), decimals=2)
    bleu_2 = np.round(100 * corpus_bleu(references, hypotheses, weights=(0.50, 0.50, 0., 0.)), decimals=2)
    bleu_3 = np.round(100 * corpus_bleu(references, hypotheses, weights=(0.34, 0.33, 0.33, 0.)), decimals=2)
    bleu_4 = np.round(100 * corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)), decimals=2)
    return bleu_1, bleu_2, bleu_3, bleu_4
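A hedged usage sketch for the helper above, assuming numpy is imported as np and the inputs are already tokenized, with references nested one level deeper than hypotheses as corpus_bleu requires; the data is made up:

refs = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]  # one reference list per hypothesis
hyps = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
b1, b2, b3, b4 = calculate_bleu_scores(refs, hyps)   # scores scaled to 0-100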
Example #3
Source File: seq2seq_chainerio.py From pfio with MIT License | 6 votes |
def forward(self, trainer):
    with chainer.no_backprop_mode():
        references = []
        hypotheses = []
        for i in range(0, len(self.test_data), self.batch):
            sources, targets = zip(*self.test_data[i:i + self.batch])
            references.extend([[t.tolist()] for t in targets])
            sources = [
                chainer.dataset.to_device(self.device, x) for x in sources]
            ys = [y.tolist()
                  for y in self.model.translate(sources, self.max_length)]
            hypotheses.extend(ys)

    bleu = bleu_score.corpus_bleu(
        references, hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1)
    chainer.report({self.key: bleu})
Example #4
Source File: seq2seq.py From convolutional_seq2seq with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __call__(self, trainer):
    print('## Calculate BLEU')
    with chainer.no_backprop_mode():
        with chainer.using_config('train', False):
            references = []
            hypotheses = []
            for i in range(0, len(self.test_data), self.batch):
                sources, targets = zip(*self.test_data[i:i + self.batch])
                references.extend([[t.tolist()] for t in targets])
                sources = [
                    chainer.dataset.to_device(self.device, x) for x in sources]
                ys = [y.tolist()
                      for y in self.model.translate(sources, self.max_length)]
                hypotheses.extend(ys)

    bleu = bleu_score.corpus_bleu(
        references, hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1) * 100
    print('BLEU:', bleu)
    reporter.report({self.key: bleu})
Example #5
Source File: evaluators.py From NeuralDialog-LaRL with Apache License 2.0 | 6 votes |
def get_report(self):
    tokenize = lambda x: x.split()
    print('Generate report for {} samples'.format(len(self.hyps)))
    refs, hyps = [], []
    tp, fp, fn = 0, 0, 0
    for label, hyp in zip(self.labels, self.hyps):
        ref_tokens = [BOS] + tokenize(label.replace(SYS, '').replace(USR, '').strip()) + [EOS]
        hyp_tokens = [BOS] + tokenize(hyp.replace(SYS, '').replace(USR, '').strip()) + [EOS]
        refs.append([ref_tokens])
        hyps.append(hyp_tokens)
        ref_entities = self._parse_entities(ref_tokens)
        hyp_entities = self._parse_entities(hyp_tokens)
        tpp, fpp, fnn = self._get_tp_fp_fn(ref_entities, hyp_entities)
        tp += tpp
        fp += fpp
        fn += fnn

    # bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
    bleu = BLEUScorer().score(hyps, refs)
    prec, rec, f1 = self._get_prec_recall(tp, fp, fn)
    report = "\nBLEU score {}\nEntity precision {:.4f} recall {:.4f} and f1 {:.4f}\n".format(bleu, prec, rec, f1)
    return report, bleu, prec, rec, f1
Example #6
Source File: evaluators.py From NeuralDialog-LaRL with Apache License 2.0 | 6 votes |
def get_report(self):
    tokenize = get_tokenize()
    print('Generate report for {} samples'.format(len(self.hyps)))
    refs, hyps = [], []
    for label, hyp in zip(self.labels, self.hyps):
        # label = label.replace(EOS, '')
        # hyp = hyp.replace(EOS, '')
        # ref_tokens = tokenize(label)[1:]
        # hyp_tokens = tokenize(hyp)[1:]
        ref_tokens = tokenize(label)
        hyp_tokens = tokenize(hyp)
        refs.append([ref_tokens])
        hyps.append(hyp_tokens)
    bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
    report = '\n===== BLEU = %f =====\n' % (bleu,)
    return '\n===== REPORT FOR DATASET {} ====={}'.format(self.data_name, report)
Example #7
Source File: seq2seq.py From chainer with MIT License | 6 votes |
def __call__(self, trainer):
    with chainer.no_backprop_mode():
        references = []
        hypotheses = []
        for i in range(0, len(self.test_data), self.batch):
            sources, targets = zip(*self.test_data[i:i + self.batch])
            references.extend([[t.tolist()] for t in targets])
            sources = [
                chainer.dataset.to_device(self.device, x) for x in sources]
            ys = [y.tolist()
                  for y in self.model.translate(sources, self.max_length)]
            hypotheses.extend(ys)

    bleu = bleu_score.corpus_bleu(
        references, hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1)
    reporter.report({self.key: bleu})
Example #8
Source File: seq2seq.py From chainer with MIT License | 6 votes |
def __call__(self, trainer):
    device = self.device

    with chainer.no_backprop_mode():
        references = []
        hypotheses = []
        for i in range(0, len(self.test_data), self.batch):
            sources, targets = zip(*self.test_data[i:i + self.batch])
            references.extend([[t.tolist()] for t in targets])
            sources = [device.send(x) for x in sources]
            ys = [y.tolist()
                  for y in self.model.translate(sources, self.max_length)]
            hypotheses.extend(ys)

    bleu = bleu_score.corpus_bleu(
        references, hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1)
    chainer.report({self.key: bleu})
Example #9
Source File: utils.py From nonauto-nmt with BSD 3-Clause "New" or "Revised" License | 6 votes |
def computeGroupBLEU(outputs, targets, tokenizer=None, bra=10, maxmaxlen=80):
    if tokenizer is None:
        tokenizer = revtok.tokenize

    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    maxlens = max([len(t) for t in targets])
    print(maxlens)
    maxlens = min([maxlens, maxmaxlen])
    nums = int(np.ceil(maxlens / bra))
    outputs_buckets = [[] for _ in range(nums)]
    targets_buckets = [[] for _ in range(nums)]
    for o, t in zip(outputs, targets):
        idx = len(o) // bra
        if idx >= len(outputs_buckets):
            idx = -1
        outputs_buckets[idx] += [o]
        targets_buckets[idx] += [t]

    for k in range(nums):
        # Note: emulate_multibleu only exists in older NLTK releases;
        # it was removed in later versions of corpus_bleu.
        print(corpus_bleu([[t] for t in targets_buckets[k]],
                          [o for o in outputs_buckets[k]],
                          emulate_multibleu=True))


# load the dataset + reversible tokenization
Example #10
Source File: bleu.py From funcom with GNU General Public License v3.0 | 6 votes |
def bleu_so_far(refs, preds):
    Ba = corpus_bleu(refs, preds)
    B1 = corpus_bleu(refs, preds, weights=(1, 0, 0, 0))
    B2 = corpus_bleu(refs, preds, weights=(0, 1, 0, 0))
    B3 = corpus_bleu(refs, preds, weights=(0, 0, 1, 0))
    B4 = corpus_bleu(refs, preds, weights=(0, 0, 0, 1))

    Ba = round(Ba * 100, 2)
    B1 = round(B1 * 100, 2)
    B2 = round(B2 * 100, 2)
    B3 = round(B3 * 100, 2)
    B4 = round(B4 * 100, 2)

    ret = ''
    ret += ('for %s functions\n' % (len(preds)))
    ret += ('Ba %s\n' % (Ba))
    ret += ('B1 %s\n' % (B1))
    ret += ('B2 %s\n' % (B2))
    ret += ('B3 %s\n' % (B3))
    ret += ('B4 %s\n' % (B4))
    return ret
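Note the one-hot weight vectors above: B1 through B4 are individual n-gram precisions (each order scored on its own), while Ba with the default weights is the usual cumulative BLEU-4 geometric mean. A small hedged sketch of the difference, with invented tokens:

from nltk.translate.bleu_score import corpus_bleu

refs = [[['the', 'quick', 'brown', 'fox', 'jumps']]]
preds = [['the', 'quick', 'brown', 'fox', 'leaps']]

cumulative = corpus_bleu(refs, preds)                        # geometric mean over 1- to 4-grams
bigram_only = corpus_bleu(refs, preds, weights=(0, 1, 0, 0)) # 2-gram precision alone
print(bleu_so_far(refs, preds))                              # helper above; numbers illustrative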
Example #11
Source File: data_utils.py From dgm_latent_bow with MIT License | 6 votes |
def quora_read(file_path, bleu_baseline=False):
    """Read the quora dataset"""
    print("Reading quora raw data .. ")
    print("  data path: %s" % file_path)
    with open(file_path) as fd:
        lines = fd.readlines()
    sentence_sets = []
    for l in tqdm(lines):
        p0, p1 = l[:-1].lower().split("\t")
        sentence_sets.append([word_tokenize(p0), word_tokenize(p1)])

    if bleu_baseline:
        print("calculating bleu ... ")
        hypothesis = [s[0] for s in sentence_sets]
        references = [s[1:] for s in sentence_sets]
        bleu = corpus_bleu(references, hypothesis)
        print("bleu on the training set: %.4f" % bleu)
    return sentence_sets
Example #12
Source File: helper.py From DiPS with Apache License 2.0 | 6 votes |
def bleu_scorer(ref, hyp, script='default'):
    refsend = []
    for i in range(len(ref)):
        refsi = []
        for j in range(len(ref[i])):
            refsi.append(ref[i][j].split())
        refsend.append(refsi)

    gensend = []
    for i in range(len(hyp)):
        gensend.append(hyp[i].split())

    if script == 'nltk':
        metrics = corpus_bleu(refsend, gensend)
        return [metrics]

    metrics = compute_bleu(refsend, gensend)
    return metrics
Example #13
Source File: bleu.py From FocusSeq2Seq with MIT License | 6 votes |
def oracle_bleu(hyp_list, ref, n_process=4):
    assert len(set([len(h) for h in hyp_list])) == 1

    all_hyp_sentence_bleu_list = [get_sent_bleu_list(hyp, ref, n_process=n_process)
                                  for hyp in hyp_list]

    if n_process > len(hyp_list[0]):
        n_process = len(hyp_list[0])

    with Pool(n_process) as pool:
        max_hyp_index_list = list(tqdm(pool.imap(np.argmax,
                                                 zip(*all_hyp_sentence_bleu_list)),
                                       total=len(all_hyp_sentence_bleu_list)))

    best_hyp_list = []
    for i, max_hyp_index in enumerate(max_hyp_index_list):
        best_hyp = hyp_list[max_hyp_index][i]
        best_hyp_list.append(best_hyp)

    return corpus_bleu([[r] for r in ref], best_hyp_list,
                       smoothing_function=cm.method2)
Example #14
Source File: evaluators.py From ConvLab with MIT License | 6 votes |
def get_report(self):
    tokenize = lambda x: x.split()
    print('Generate report for {} samples'.format(len(self.hyps)))
    refs, hyps = [], []
    tp, fp, fn = 0, 0, 0
    for label, hyp in zip(self.labels, self.hyps):
        ref_tokens = [BOS] + tokenize(label.replace(SYS, '').replace(USR, '').strip()) + [EOS]
        hyp_tokens = [BOS] + tokenize(hyp.replace(SYS, '').replace(USR, '').strip()) + [EOS]
        refs.append([ref_tokens])
        hyps.append(hyp_tokens)
        ref_entities = self._parse_entities(ref_tokens)
        hyp_entities = self._parse_entities(hyp_tokens)
        tpp, fpp, fnn = self._get_tp_fp_fn(ref_entities, hyp_entities)
        tp += tpp
        fp += fpp
        fn += fnn

    # bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
    bleu = BLEUScorer().score(hyps, refs)
    prec, rec, f1 = self._get_prec_recall(tp, fp, fn)
    report = "\nBLEU score {}\nEntity precision {:.4f} recall {:.4f} and f1 {:.4f}\n".format(bleu, prec, rec, f1)
    return report, bleu, prec, rec, f1
Example #15
Source File: evaluators.py From ConvLab with MIT License | 6 votes |
def get_report(self):
    tokenize = get_tokenize()
    print('Generate report for {} samples'.format(len(self.hyps)))
    refs, hyps = [], []
    for label, hyp in zip(self.labels, self.hyps):
        # label = label.replace(EOS, '')
        # hyp = hyp.replace(EOS, '')
        # ref_tokens = tokenize(label)[1:]
        # hyp_tokens = tokenize(hyp)[1:]
        ref_tokens = tokenize(label)
        hyp_tokens = tokenize(hyp)
        refs.append([ref_tokens])
        hyps.append(hyp_tokens)
    bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
    report = '\n===== BLEU = %f =====\n' % (bleu,)
    return '\n===== REPORT FOR DATASET {} ====={}'.format(self.data_name, report)
Example #16
Source File: metrics.py From summarus with Apache License 2.0 | 6 votes |
def calc_metrics(refs, hyps, language, metric="all", meteor_jar=None):
    metrics = dict()
    metrics["count"] = len(hyps)
    metrics["ref_example"] = refs[-1]
    metrics["hyp_example"] = hyps[-1]
    # The original used `r is not list`, which compares against the list type
    # and is always true; the intent is to wrap bare references in a list.
    many_refs = [r if isinstance(r, list) else [r] for r in refs]
    if metric in ("bleu", "all"):
        metrics["bleu"] = corpus_bleu(many_refs, hyps)
    if metric in ("rouge", "all"):
        rouge = Rouge()
        scores = rouge.get_scores(hyps, refs, avg=True)
        metrics.update(scores)
    if metric in ("meteor", "all") and meteor_jar is not None and os.path.exists(meteor_jar):
        meteor = Meteor(meteor_jar, language=language)
        metrics["meteor"] = meteor.compute_score(hyps, many_refs)
    if metric in ("duplicate_ngrams", "all"):
        metrics["duplicate_ngrams"] = dict()
        metrics["duplicate_ngrams"].update(calc_duplicate_n_grams_rate(hyps))
    return metrics
Example #17
Source File: bleu.py From FocusSeq2Seq with MIT License | 5 votes |
def avg_bleu(hyp_list, ref):
    return corpus_bleu([[r] for r in ref * len(hyp_list)],
                       flatten(hyp_list),
                       smoothing_function=cm.method2)
Example #18
Source File: test_bleu.py From cotk with Apache License 2.0 | 5 votes |
def test_bleu_bug():
    ref = [[[1, 3], [3], [4]]]
    gen = [[1]]
    with pytest.raises(ZeroDivisionError):
        corpus_bleu(ref, gen, smoothing_function=SmoothingFunction().method3)
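This cotk test pins down a real edge case: with method3 smoothing, corpus_bleu() can raise ZeroDivisionError when a hypothesis is a single token. A hedged sketch of the guard that callers typically add (Examples #23 and #24 below wrap the call the same way):

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def safe_corpus_bleu(refs, gens):
    # Fall back to 0.0 on the known single-token smoothing bug.
    try:
        return corpus_bleu(refs, gens,
                           smoothing_function=SmoothingFunction().method3)
    except ZeroDivisionError:
        return 0.0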
Example #19
Source File: evaluate.py From tatk with Apache License 2.0 | 5 votes |
def get_bleu4(dialog_acts, golden_utts, gen_utts):
    das2utts = {}
    for das, utt, gen in zip(dialog_acts, golden_utts, gen_utts):
        utt = utt.lower()
        gen = gen.lower()
        for da, svs in das.items():
            domain, act = da.split('-')
            if act == 'Request' or domain == 'general':
                continue
            else:
                for s, v in sorted(svs, key=lambda x: x[0]):
                    if s == 'Internet' or s == 'Parking' or s == 'none' or v == 'none':
                        continue
                    else:
                        v = v.lower()
                        if (' ' + v in utt) or (v + ' ' in utt):
                            utt = utt.replace(v, '{}-{}'.format(da, s), 1)
                        if (' ' + v in gen) or (v + ' ' in gen):
                            gen = gen.replace(v, '{}-{}'.format(da, s), 1)
        hash_key = ''
        for da in sorted(das.keys()):
            for s, v in sorted(das[da], key=lambda x: x[0]):
                hash_key += da + '-' + s + ';'
        das2utts.setdefault(hash_key, {'refs': [], 'gens': []})
        das2utts[hash_key]['refs'].append(utt)
        das2utts[hash_key]['gens'].append(gen)
    # pprint(das2utts)
    refs, gens = [], []
    for das in das2utts.keys():
        for gen in das2utts[das]['gens']:
            refs.append([s.split() for s in das2utts[das]['refs']])
            gens.append(gen.split())
    bleu = corpus_bleu(refs, gens, weights=(0.25, 0.25, 0.25, 0.25),
                       smoothing_function=SmoothingFunction().method1)
    return bleu
Example #20
Source File: evaluate.py From tatk with Apache License 2.0 | 5 votes |
def get_bleu4(dialog_acts, golden_utts, gen_utts):
    das2utts = {}
    for das, utt, gen in zip(dialog_acts, golden_utts, gen_utts):
        utt = utt.lower()
        gen = gen.lower()
        for da, svs in das.items():
            if da == 'request' or da == 'nooffer':
                continue
            else:
                for s, v in sorted(svs, key=lambda x: x[0]):
                    if s == 'none' or v == 'none':
                        continue
                    else:
                        v = v.lower()
                        if (' ' + v in utt) or (v + ' ' in utt):
                            utt = utt.replace(v, '{}-{}'.format(da, s), 1)
                        if (' ' + v in gen) or (v + ' ' in gen):
                            gen = gen.replace(v, '{}-{}'.format(da, s), 1)
        hash_key = ''
        for da in sorted(das.keys()):
            for s, v in sorted(das[da], key=lambda x: x[0]):
                hash_key += da + '-' + s + ';'
        das2utts.setdefault(hash_key, {'refs': [], 'gens': []})
        das2utts[hash_key]['refs'].append(utt)
        das2utts[hash_key]['gens'].append(gen)
    # pprint(das2utts)
    refs, gens = [], []
    for das in das2utts.keys():
        for gen in das2utts[das]['gens']:
            refs.append([s.split() for s in das2utts[das]['refs']])
            gens.append(gen.split())
    bleu = corpus_bleu(refs, gens, weights=(0.25, 0.25, 0.25, 0.25),
                       smoothing_function=SmoothingFunction().method1)
    return bleu
Example #21
Source File: test_bleu.py From cotk with Apache License 2.0 | 5 votes |
def get_bleu(self, dataloader, input, reference_key, gen_key):
    refs = []
    gens = []
    for gen_sen, resp_sen in zip(input[gen_key], input[reference_key]):
        gen_sen_processed = dataloader.trim_in_ids(gen_sen)
        resp_sen_processed = dataloader.trim_in_ids(resp_sen[1:])
        refs.append([resp_sen_processed])
        gens.append(gen_sen_processed)
    gens = replace_unk(gens)
    return corpus_bleu(refs, gens, smoothing_function=SmoothingFunction().method3)
Example #22
Source File: test_bleu.py From cotk with Apache License 2.0 | 5 votes |
def get_bleu(self, dataloader, input, reference_key, gen_key):
    refs = []
    gens = []
    for i in range(len(input[reference_key])):
        for resp_sen, gen_sen in zip(input[reference_key][i], input[gen_key][i]):
            gen_sen_processed = dataloader.trim_in_ids(gen_sen)
            resp_sen_processed = dataloader.trim_in_ids(resp_sen)
            gens.append(gen_sen_processed)
            refs.append([resp_sen_processed[1:]])
    gens = replace_unk(gens)
    return corpus_bleu(refs, gens, smoothing_function=SmoothingFunction().method3)
Example #23
Source File: bleu.py From cotk with Apache License 2.0 | 5 votes |
def close(self) -> Dict[str, Any]:
    '''Return a dict which contains

    * **bleu**: bleu value.
    * **bleu hashvalue**: hash value for bleu metric, same hash value stands
      for same evaluation settings.
    '''
    result = super().close()
    if (not self.hyps) or (not self.refs):
        raise RuntimeError("The metric has not been forwarded data correctly.")

    if self.tokenizer:
        self._do_tokenize()

    if "unk" in self.dataloader.get_special_tokens_mapping():
        self.hyps = replace_unk(self.hyps, self.dataloader.get_special_tokens_mapping()["unk"])

    try:
        weights = np.ones(self.ngram) / self.ngram
        result.update({
            "bleu": corpus_bleu(self.refs, self.hyps, weights=weights,
                                smoothing_function=SmoothingFunction().method3),
            "bleu hashvalue": self._hashvalue()})
    except ZeroDivisionError:
        if not self.ignore_smoothing_error:
            raise ZeroDivisionError(
                "Bleu smoothing divided by zero. This is a known bug of corpus_bleu, "
                "usually caused when there is only one sample and the sample length is 1."
            ) from None
        result.update({"bleu": 0, "bleu hashvalue": self._hashvalue()})
    return result
Example #24
Source File: bleu.py From cotk with Apache License 2.0 | 5 votes |
def close(self) -> Dict[str, Any]:
    '''Return a dict which contains

    * **bleu**: bleu value.
    * **bleu hashvalue**: hash value for bleu metric, same hash value stands
      for same evaluation settings.
    '''
    result = super().close()
    if (not self.hyps) or (not self.refs):
        raise RuntimeError("The metric has not been forwarded data correctly.")

    self.hyps = replace_unk(self.hyps, self.dataloader.unk_id)
    self._hash_unordered_list(self.refs)

    try:
        result.update({
            "bleu": corpus_bleu(self.refs, self.hyps,
                                smoothing_function=SmoothingFunction().method3),
            "bleu hashvalue": self._hashvalue()})
    except ZeroDivisionError:
        if not self.ignore_smoothing_error:
            raise ZeroDivisionError(
                "Bleu smoothing divided by zero. This is a known bug of corpus_bleu, "
                "usually caused when there is only one sample and the sample length is 1."
            )
        result.update({"bleu": 0, "bleu hashvalue": self._hashvalue()})
    return result
Example #25
Source File: evaluators.py From NeuralDialog-ZSDG with Apache License 2.0 | 5 votes |
def get_report(self, include_error=False):
    reports = []
    tokenize = get_tokenize()

    for domain, labels in self.domain_labels.items():
        predictions = self.domain_hyps[domain]
        self.logger.info("Generate report for {} for {} samples".format(domain, len(predictions)))
        refs, hyps = [], []

        # find entity precision, recall and f1
        tp, fp, fn = 0.0, 0.0, 0.0

        for label, hyp in zip(labels, predictions):
            label = label.replace(EOS, '').replace(BOS, '')
            hyp = hyp.replace(EOS, '').replace(BOS, '')
            ref_tokens = tokenize(label)[2:]
            hyp_tokens = tokenize(hyp)[2:]
            refs.append([ref_tokens])
            hyps.append(hyp_tokens)

            label_ents = self.pred_ents(label, tokenize, None)
            hyp_ents = self.pred_ents(hyp, tokenize, None)
            # hyp_ents = list(set(hyp_ents))

            ttpp, ffpp, ffnn = self._get_tp_fp_fn(label_ents, hyp_ents)
            tp += ttpp
            fp += ffpp
            fn += ffnn

        ent_precision, ent_recall, ent_f1 = self._get_prec_recall(tp, fp, fn)

        # compute corpus level scores
        bleu = bleu_score.corpus_bleu(refs, hyps,
                                      smoothing_function=SmoothingFunction().method1)
        report = "\nDomain: %s BLEU %f\n Entity precision %f recall %f and f1 %f\n" \
                 % (domain, bleu, ent_precision, ent_recall, ent_f1)
        reports.append(report)

    return "\n==== REPORT===={report}".format(report="========".join(reports))
Example #26
Source File: metrics.py From dstc8-meta-dialog with MIT License | 5 votes |
def overlap_metrics(self, references, predictions):
    # Note: the original source passed only three weights for bleu4
    # ([0.25, 0.25, 0.25]); a true BLEU-4 needs four.
    for k, v in dict(bleu1=[1.],
                     bleu2=[0.5, 0.5],
                     bleu3=[0.33, 0.33, 0.33],
                     bleu4=[0.25, 0.25, 0.25, 0.25]).items():
        yield k, corpus_bleu(references, predictions, weights=v)
Example #27
Source File: bleu.py From DeepPavlov with Apache License 2.0 | 5 votes |
def bleu(y_true, y_predicted):
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y_p.lower().split() for y_p in y_predicted])
Example #28
Source File: bleu.py From DeepPavlov with Apache License 2.0 | 5 votes |
def per_item_bleu(y_true, y_predicted):
    y_predicted = itertools.chain(*y_predicted)
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y_p.lower().split() for y_p in y_predicted])
Example #29
Source File: bleu.py From DeepPavlov with Apache License 2.0 | 5 votes |
def per_item_dialog_bleu(y_true, y_predicted):
    y_true = (y['text'] for dialog in y_true for y in dialog)
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y.lower().split() for y_p in y_predicted for y in y_p])
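The three DeepPavlov helpers above all take raw strings and tokenize by lowercasing and whitespace-splitting. A brief hedged usage sketch for the simplest one, with invented sentences:

y_true = ['The cat is on the mat .']
y_predicted = ['the cat sat on the mat .']
print(bleu(y_true, y_predicted))  # corpus BLEU over the lowercased, split pair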
Example #30
Source File: test_bleu.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def test_corpus_bleu(self):
    ref_file = find('models/wmt15_eval/ref.ru')
    hyp_file = find('models/wmt15_eval/google.ru')
    mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

    # Reads the BLEU scores from the `mteval-13a.output` file.
    # The order of the list corresponds to the order of the ngrams.
    with open(mteval_output_file, 'r') as mteval_fin:
        # The numbers are located in the last 2nd line of the file.
        # The first and 2nd item in the list are the score and system names.
        # Materialize the scores: a bare map object would be exhausted by
        # the first loop below, leaving the second loop with nothing.
        mteval_bleu_scores = list(map(float, mteval_fin.readlines()[-2].split()[1:-1]))

    with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
        with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
            # Whitespace tokenize the file.
            # Note: split() automatically strip().
            hypothesis = list(map(lambda x: x.split(), hyp_fin))
            # Note that the corpus_bleu input is list of list of references.
            references = list(map(lambda x: [x.split()], ref_fin))
            # Without smoothing.
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(
                    references, hypothesis, weights=(1.0 / i,) * i
                )
                # Check that the BLEU scores difference is less than 0.005 .
                # Note: This is an approximate comparison; as much as
                # +/- 0.01 BLEU might be "statistically significant",
                # the actual translation quality might not be.
                assert abs(mteval_bleu - nltk_bleu) < 0.005

            # With the same smoothing method used in mteval-v13a.pl
            chencherry = SmoothingFunction()
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(
                    references,
                    hypothesis,
                    weights=(1.0 / i,) * i,
                    smoothing_function=chencherry.method3,
                )
                assert abs(mteval_bleu - nltk_bleu) < 0.005