Python nltk.translate.bleu_score.sentence_bleu() Examples
The following are 30 code examples of nltk.translate.bleu_score.sentence_bleu(), collected from open-source projects and ranked by community votes. Each example notes its original project, source file, and license. You may also want to check out the other available functions and classes of the nltk.translate.bleu_score module.
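Before the examples, here is a minimal sketch of the call pattern most of them share (the sentences are illustrative placeholders, not taken from any of the projects below): sentence_bleu() takes a list of tokenized reference sentences plus one tokenized hypothesis, and a SmoothingFunction is usually passed so that short hypotheses with no higher-order n-gram matches do not collapse the score to zero.

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# References are a list of token lists; the hypothesis is a single token list.
references = ['the cat sat on the mat'.split()]
hypothesis = 'the cat is on the mat'.split()

# Default weights are uniform up to 4-grams; method1 smoothing adds a small
# epsilon to zero n-gram counts so partial matches still score above zero.
score = sentence_bleu(references, hypothesis,
                      weights=(0.25, 0.25, 0.25, 0.25),
                      smoothing_function=SmoothingFunction().method1)
print(round(score, 4))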
Example #1
Source File: metrics.py From ParlAI with MIT License | 8 votes |
def compute(guess: str, answers: List[str], k: int = 4) -> Optional['BleuMetric']:
    """
    Compute approximate BLEU score between guess and a set of answers.
    """
    if nltkbleu is None:
        # bleu library not installed, just return a default value
        return None
    # Warning: BLEU calculation *should* include proper tokenization and
    # punctuation etc. We're using the normalize_answer for everything though,
    # so we're over-estimating our BLEU scores.  Also note that NLTK's bleu is
    # going to be slower than fairseq's (which is written in C), but fairseq's
    # requires that everything be in arrays of ints (i.e. as tensors). NLTK's
    # works with strings, which is better suited for this module.
    weights = [1 / k for _ in range(k)]
    score = nltkbleu.sentence_bleu(
        [normalize_answer(a).split(" ") for a in answers],
        normalize_answer(guess).split(" "),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1,
        weights=weights,
    )
    return BleuMetric(score)
Example #2
Source File: model.py From DeepNews with Apache License 2.0 | 6 votes |
def blue_score_text(self, y_actual, y_predicated):
    # check lengths are equal
    assert len(y_actual) == len(y_predicated)
    # list of headlines .. each headline is a list of words
    no_of_news = len(y_actual)
    blue_score = 0.0
    for i in range(no_of_news):
        reference = y_actual[i]
        hypothesis = y_predicated[i]
        # Avoid ZeroDivisionError in bleu score
        # default weights
        weights = (0.25, 0.25, 0.25, 0.25)
        min_len_present = min(len(reference), len(hypothesis))
        if min_len_present == 0:
            continue
        if min_len_present < 4:
            weights = [1.0 / min_len_present, ] * min_len_present
        blue_score = blue_score + sentence_bleu([reference], hypothesis, weights=weights)
    return blue_score / float(no_of_news)
Example #3
Source File: metrics.py From deepAPI with MIT License | 6 votes |
def sim_bleu(self, hyps, ref):
    """
    :param ref - a list of tokens of the reference
    :param hyps - a list of tokens of the hypothesis

    :return maxbleu - recall bleu
    :return avgbleu - precision bleu
    """
    scores = []
    for hyp in hyps:
        try:
            # scores.append(sentence_bleu([ref], hyp, smoothing_function=SmoothingFunction().method7,
            #                             weights=[1. / 4, 1. / 4, 1. / 4, 1. / 4]))
            scores.append(smoothed_bleu(list(bleu_stats(hyp, ref))))
        except:
            scores.append(0.0)
    return np.max(scores), np.mean(scores)
Example #4
Source File: utils.py From quick-nlp with MIT License | 6 votes |
def print_batch(learner: Learner, modeldata: ModelData, input_field, output_field, num_batches=1,
                num_sentences=-1, is_test=False, num_beams=1, weights=None, smoothing_function=None):
    predictions, targets, inputs = learner.predict_with_targs_and_inputs(is_test=is_test, num_beams=num_beams)
    weights = (1 / 3., 1 / 3., 1 / 3.) if weights is None else weights
    smoothing_function = SmoothingFunction().method1 if smoothing_function is None else smoothing_function
    blue_scores = []
    for batch_num, (input, target, prediction) in enumerate(zip(inputs, targets, predictions)):
        inputs_str: BatchBeamTokens = modeldata.itos(input, input_field)
        predictions_str: BatchBeamTokens = modeldata.itos(prediction, output_field)
        targets_str: BatchBeamTokens = modeldata.itos(target, output_field)
        for index, (inp, targ, pred) in enumerate(zip(inputs_str, targets_str, predictions_str)):
            blue_score = sentence_bleu([targ], pred, smoothing_function=smoothing_function, weights=weights)
            print(
                f'batch: {batch_num} sample : {index}\ninput: {" ".join(inp)}\ntarget: '
                f'{" ".join(targ)}\nprediction: {" ".join(pred)}\nbleu: {blue_score}\n\n')
            blue_scores.append(blue_score)
            if 0 < num_sentences <= index - 1:
                break
        if 0 < num_batches <= batch_num - 1:
            break
    print(f'mean bleu score: {np.mean(blue_scores)}')
Example #5
Source File: precision_recall.py From cotk with Apache License 2.0 | 6 votes |
def _score(self, gen: List[int], reference: List[int]) -> float:
    '''Return a BLEU score \in [0, 1] to calculate BLEU-ngram precision and recall.

    Arguments:
        gen (list): list of generated word ids.
        reference (list): list of word ids of a reference.

    Here is an Example:

        >>> gen = [4, 5]
        >>> reference = [5, 6]
        >>> self._score(gen, reference)
        0.150  # assume self.weights = [0.25, 0.25, 0.25, 0.25]
    '''
    gen = self._replace_unk(gen)
    return sentence_bleu([reference], gen, self.weights, SmoothingFunction().method1)
Example #6
Source File: test_bleu.py From cotk with Apache License 2.0 | 6 votes |
def get_bleu(self, dataloader, input, reference_key, gen_key):
    refs = []
    gens = []
    for gen_sen, resp_sen in zip(input[gen_key], input[reference_key]):
        gen_sen_processed = dataloader.trim_in_ids(gen_sen)
        resp_sen_processed = dataloader.trim_in_ids(resp_sen[1:])
        refs.append(resp_sen_processed)
        gens.append(gen_sen_processed)
    gens = replace_unk(gens)
    bleu_irl_bw, bleu_irl_fw = [], []
    for i in range(len(gens)):
        bleu_irl_fw.append(sentence_bleu(refs, gens[i], smoothing_function=SmoothingFunction().method1))
    for i in range(len(refs)):
        bleu_irl_bw.append(sentence_bleu(gens, refs[i], smoothing_function=SmoothingFunction().method1))
    fw_bleu = (1.0 * sum(bleu_irl_fw) / len(bleu_irl_fw))
    bw_bleu = (1.0 * sum(bleu_irl_bw) / len(bleu_irl_bw))
    return 2.0 * bw_bleu * fw_bleu / (fw_bleu + bw_bleu)
Example #7
Source File: utils.py From lang2program with Apache License 2.0 | 6 votes |
def bleu(reference, predict):
    """Compute sentence-level bleu score.

    Args:
        reference (list[str])
        predict (list[str])
    """
    from nltk.translate import bleu_score

    if len(predict) == 0:
        if len(reference) == 0:
            return 1.0
        else:
            return 0.0
    # TODO(kelvin): is this quite right?
    # use a maximum of 4-grams. If 4-grams aren't present, use only lower n-grams.
    n = min(4, len(reference), len(predict))
    weights = tuple([1. / n] * n)  # uniform weight on n-gram precisions
    return bleu_score.sentence_bleu([reference], predict, weights)
Example #8
Source File: utils.py From lang2program with Apache License 2.0 | 6 votes |
def bleu(reference, predict):
    """Compute sentence-level bleu score.

    Args:
        reference (list[str])
        predict (list[str])
    """
    from nltk.translate import bleu_score

    if len(predict) == 0:
        if len(reference) == 0:
            return 1.0
        else:
            return 0.0
    # TODO(kelvin): is this quite right?
    # use a maximum of 4-grams. If 4-grams aren't present, use only lower n-grams.
    n = min(4, len(reference), len(predict))
    weights = tuple([1. / n] * n)  # uniform weight on n-gram precisions
    return bleu_score.sentence_bleu([reference], predict, weights)
Example #9
Source File: metric.py From MultiTurnDialogZoo with MIT License | 6 votes |
def cal_BLEU_nltk(refer, candidate, ngram=1):
    '''
    SmoothingFunction refer to https://github.com/PaddlePaddle/models/blob/a72760dff8574fe2cb8b803e01b44624db3f3eff/PaddleNLP/Research/IJCAI2019-MMPMS/mmpms/utils/metrics.py
    '''
    smoothie = SmoothingFunction().method7
    if ngram == 1:
        weight = (1, 0, 0, 0)
    elif ngram == 2:
        weight = (0.5, 0.5, 0, 0)
    elif ngram == 3:
        weight = (0.33, 0.33, 0.33, 0)
    elif ngram == 4:
        weight = (0.25, 0.25, 0.25, 0.25)
    return sentence_bleu(refer, candidate, weights=weight, smoothing_function=smoothie)


# BLEU of nlg-eval
Example #10
Source File: bleu_metrics.py From dialog-eval with MIT License | 6 votes |
def update_metrics(self, resp, gt, source):
    '''
    Params:
      :resp: Response word list.
      :gt: Ground truth word list.
      :source: Source word list.
    '''
    try:
        self.metrics['bleu-1'].append(
            bleu_score.sentence_bleu([gt], resp, weights=(1, 0, 0, 0),
                                     smoothing_function=self.smoothing))
        self.metrics['bleu-2'].append(
            bleu_score.sentence_bleu([gt], resp, weights=(0.5, 0.5, 0, 0),
                                     smoothing_function=self.smoothing))
        self.metrics['bleu-3'].append(
            bleu_score.sentence_bleu([gt], resp, weights=(0.33, 0.33, 0.33, 0),
                                     smoothing_function=self.smoothing))
        self.metrics['bleu-4'].append(
            bleu_score.sentence_bleu([gt], resp, weights=(0.25, 0.25, 0.25, 0.25),
                                     smoothing_function=self.smoothing))
    except (KeyError, ZeroDivisionError):
        self.metrics['bleu-1'].append(0)
        self.metrics['bleu-2'].append(0)
        self.metrics['bleu-3'].append(0)
        self.metrics['bleu-4'].append(0)
Example #11
Source File: metrics.py From KBRD with MIT License | 6 votes |
def _bleu(guess, answers):
    """Compute approximate BLEU score between guess and a set of answers."""
    if nltkbleu is None:
        # bleu library not installed, just return a default value
        return None
    # Warning: BLEU calculation *should* include proper tokenization and
    # punctuation etc. We're using the normalize_answer for everything though,
    # so we're over-estimating our BLEU scores.  Also note that NLTK's bleu is
    # going to be slower than fairseq's (which is written in C), but fairseq's
    # requires that everything be in arrays of ints (i.e. as tensors). NLTK's
    # works with strings, which is better suited for this module.
    return nltkbleu.sentence_bleu(
        [normalize_answer(a).split(" ") for a in answers],
        normalize_answer(guess).split(" "),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1,
    )
Example #12
Source File: metrics.py From neural_chat with MIT License | 6 votes |
def _bleu(guess, answers):
    """Compute approximate BLEU score between guess and a set of answers."""
    if nltkbleu is None:
        # bleu library not installed, just return a default value
        return None
    # Warning: BLEU calculation *should* include proper tokenization and
    # punctuation etc. We're using the normalize_answer for everything though,
    # so we're over-estimating our BLEU scores.  Also note that NLTK's bleu is
    # going to be slower than fairseq's (which is written in C), but fairseq's
    # requires that everything be in arrays of ints (i.e. as tensors). NLTK's
    # works with strings, which is better suited for this module.
    return nltkbleu.sentence_bleu(
        [normalize_answer(a).split(" ") for a in answers],
        normalize_answer(guess).split(" "),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1,
    )
Example #13
Source File: bleu.py From dialogbot with Apache License 2.0 | 6 votes |
def bleu(answer_file, standard_answer_file):
    rf_answer = open(answer_file, 'r', encoding="utf-8")
    rf_standard_answer = open(standard_answer_file, 'r', encoding="utf-8")
    answer_lines = rf_answer.readlines()
    standard_answer_lines = rf_standard_answer.readlines()
    # compute score
    scores = []
    for i in range(len(answer_lines)):
        candidate = list(answer_lines[i].strip())
        each_score = 0
        for j in range(10):
            references = []
            standard_answer_line = standard_answer_lines[i * 11 + j].strip().split('\t')
            references.append(list(standard_answer_line[0].strip()))
            standard_score = standard_answer_line[1]
            bleu_score = sentence_bleu(references, candidate, weights=(0.35, 0.45, 0.1, 0.1),
                                       smoothing_function=SmoothingFunction().method1)
            each_score = bleu_score * float(standard_score) + each_score
        scores.append(each_score / 10)
    rf_answer.close()
    rf_standard_answer.close()
    score_final = sum(scores) / float(len(answer_lines))
    precision_score = round(score_final, 6)
    return precision_score
Example #14
Source File: evaluator.py From tranX with Apache License 2.0 | 5 votes |
def get_sentence_bleu(self, example, hyp):
    return sentence_bleu([tokenize_for_bleu_eval(example.meta['example_dict']['snippet'])],
                         tokenize_for_bleu_eval(hyp.decanonical_code),
                         smoothing_function=SmoothingFunction().method3)
Example #15
Source File: utils.py From Deep-Reinforcement-Learning-Hands-On with MIT License | 5 votes |
def calc_bleu_many(cand_seq, ref_sequences):
    sf = bleu_score.SmoothingFunction()
    return bleu_score.sentence_bleu(ref_sequences, cand_seq,
                                    smoothing_function=sf.method1,
                                    weights=(0.5, 0.5))
Example #16
Source File: test_bleu.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def test_reference_or_hypothesis_shorter_than_fourgrams(self):
    # Test case where the length of reference or hypothesis
    # is shorter than 4.
    references = ['let it go'.split()]
    hypothesis = 'let go it'.split()
    # Checks that the score for this hypothesis and reference pair is 0.0
    # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
    self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
    # Checks that the warning has been raised.
    try:
        self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
    except AttributeError:
        pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
Example #17
Source File: test_bleu.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def test_empty_references_and_hypothesis(self):
    # Test case where both references and hypothesis are empty.
    references = [[]]
    hypothesis = []
    assert sentence_bleu(references, hypothesis) == 0
Example #18
Source File: test_bleu.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def test_empty_hypothesis(self):
    # Test case where the hypothesis is empty.
    references = ['The candidate has no alignment to any of the references'.split()]
    hypothesis = []
    assert sentence_bleu(references, hypothesis) == 0
Example #19
Source File: test_bleu.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def test_case_where_n_is_bigger_than_hypothesis_length(self):
    # Test BLEU to nth order of n-grams, where n > len(hypothesis).
    references = ['John loves Mary ?'.split()]
    hypothesis = 'John loves Mary'.split()
    n = len(hypothesis) + 1
    weights = [1.0 / n] * n  # Uniform weights.
    # Since no n-grams matches were found the result should be zero
    # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
    self.assertAlmostEqual(
        sentence_bleu(references, hypothesis, weights), 0.0, places=4
    )
    # Checks that the warning has been raised because len(hypothesis) < 4.
    try:
        self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
    except AttributeError:
        pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.

    # Test case where n > len(hypothesis) but so is n > len(reference), and
    # it's a special case where reference == hypothesis.
    references = ['John loves Mary'.split()]
    hypothesis = 'John loves Mary'.split()
    # Since no 4-grams matches were found the result should be zero
    # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
    self.assertAlmostEqual(
        sentence_bleu(references, hypothesis, weights), 0.0, places=4
    )
Example #20
Source File: test_bleu.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def test_partial_matches_hypothesis_longer_than_reference(self):
    references = ['John loves Mary'.split()]
    hypothesis = 'John loves Mary who loves Mike'.split()
    # Since no 4-grams matches were found the result should be zero
    # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
    self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
    # Checks that the warning has been raised because len(reference) < 4.
    try:
        self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
    except AttributeError:
        pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.


# @unittest.skip("Skipping fringe cases for BLEU.")
Example #21
Source File: test_bleu.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def test_full_matches(self):
    # Test case where there's 100% matches
    references = ['John loves Mary'.split()]
    hypothesis = 'John loves Mary'.split()
    # Test BLEU to nth order of n-grams, where n is len(hypothesis).
    for n in range(1, len(hypothesis)):
        weights = [1.0 / n] * n  # Uniform weights.
        assert sentence_bleu(references, hypothesis, weights) == 1.0
Example #22
Source File: test_bleu.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def test_zero_matches(self):
    # Test case where there's 0 matches
    references = ['The candidate has no alignment to any of the references'.split()]
    hypothesis = 'John loves Mary'.split()
    # Test BLEU to nth order of n-grams, where n is len(hypothesis).
    for n in range(1, len(hypothesis)):
        weights = [1.0 / n] * n  # Uniform weights.
        assert sentence_bleu(references, hypothesis, weights) == 0
Example #23
Source File: matcher.py From supervised-oie with MIT License | 5 votes |
def bleuMatch(ref, ex, ignoreStopwords, ignoreCase):
    sRef = ref.bow()
    sEx = ex.bow()
    bleu = sentence_bleu(references=[sRef.split(' ')], hypothesis=sEx.split(' '))
    return bleu > Matcher.BLEU_THRESHOLD
Example #24
Source File: bleu.py From DeepPavlov with Apache License 2.0 | 5 votes |
def bleu_advanced(y_true: List[Any], y_predicted: List[Any],
                  weights: Tuple = (1,), smoothing_function=SMOOTH.method1,
                  auto_reweigh=False, penalty=True) -> float:
    """Calculate BLEU score

    Parameters:
        y_true: list of reference tokens
        y_predicted: list of query tokens
        weights: n-gram weights
        smoothing_function: SmoothingFunction
        auto_reweigh: Option to re-normalize the weights uniformly
        penalty: either enable brevity penalty or not

    Return:
        BLEU score
    """
    bleu_measure = sentence_bleu([y_true], y_predicted, weights, smoothing_function, auto_reweigh)

    hyp_len = len(y_predicted)
    hyp_lengths = hyp_len
    ref_lengths = closest_ref_length([y_true], hyp_len)

    bpenalty = brevity_penalty(ref_lengths, hyp_lengths)

    if penalty is True or bpenalty == 0:
        return bleu_measure

    return bleu_measure / bpenalty
Example #25
Source File: utils.py From quick-nlp with MIT License | 5 votes |
def print_dialogue_batch(learner: Learner, modeldata: ModelData, input_field, output_field, num_batches=1,
                         num_sentences=-1, is_test=False, num_beams=1, smoothing_function=None, weights=None):
    weights = (1 / 3., 1 / 3., 1 / 3.) if weights is None else weights
    smoothing_function = SmoothingFunction().method1 if smoothing_function is None else smoothing_function
    predictions, targets, inputs = learner.predict_with_targs_and_inputs(is_test=is_test, num_beams=num_beams)
    blue_scores = []
    for batch_num, (input, target, prediction) in enumerate(zip(inputs, targets, predictions)):
        input = np.transpose(input, [1, 2, 0])  # transpose number of utterances to beams [sl, bs, nb]
        inputs_str: BatchBeamTokens = modeldata.itos(input, input_field)
        inputs_str: List[str] = ["\n".join(conv) for conv in inputs_str]
        predictions_str: BatchBeamTokens = modeldata.itos(prediction, output_field)
        targets_str: BatchBeamTokens = modeldata.itos(target, output_field)
        for index, (inp, targ, pred) in enumerate(zip(inputs_str, targets_str, predictions_str)):
            if targ[0].split() == pred[0].split()[1:]:
                blue_score = 1
            else:
                blue_score = sentence_bleu([targ[0].split()], pred[0].split()[1:],
                                           smoothing_function=smoothing_function, weights=weights)
            print(
                f'BATCH: {batch_num} SAMPLE : {index}\nINPUT:\n{"".join(inp)}\nTARGET:\n'
                f'{"".join(targ)}\nPREDICTION:\n{"".join(pred)}\nbleu: {blue_score}\n\n')
            blue_scores.append(blue_score)
            if 0 < num_sentences <= index - 1:
                break
        if 0 < num_batches <= batch_num - 1:
            break
    print(f'bleu score: mean: {np.mean(blue_scores)}, std: {np.std(blue_scores)}')
Example #26
Source File: bleu.py From dialogbot with Apache License 2.0 | 5 votes |
def bleu_score(candidate, reference):
    score = sentence_bleu(
        [list(reference)], list(candidate),
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=SmoothingFunction().method1)
    return score
Example #27
Source File: metrics.py From quick-nlp with MIT License | 5 votes |
def bleu_score(preds, targs, stoi=None):
    sf = SmoothingFunction().method1
    preds = torch.max(preds, dim=-1)[1][:-1]
    bleus = []
    for res in zip(to_np(targs), to_np(preds)):
        # pick a weight vector no longer than the reference sequence
        if len(res[0]) > 2:
            bleu = sentence_bleu([res[0]], res[1], smoothing_function=sf, weights=(1 / 3., 1 / 3., 1 / 3.))
        elif len(res[0]) == 2:
            bleu = sentence_bleu([res[0]], res[1], smoothing_function=sf, weights=(0.5, 0.5))
        else:
            bleu = sentence_bleu([res[0]], res[1], smoothing_function=sf, weights=(1.0,))
        bleus.append(bleu)
    return np.mean(bleus)
Example #28
Source File: process_samples.py From texar with Apache License 2.0 | 5 votes |
def sample_from_hamming_distance_payoff_distribution(args):
    src_sents = read_corpus(args.src, 'src')
    tgt_sents = read_corpus(args.tgt, 'src')  # do not read in <s> and </s>
    f_out = open(args.output, 'w')
    vocab = torch.load(args.vocab)
    tgt_vocab = vocab.tgt

    payoff_prob, Z_qs = generate_hamming_distance_payoff_distribution(max(len(sent) for sent in tgt_sents),
                                                                      vocab_size=len(vocab.tgt),
                                                                      tau=args.temp)

    for src_sent, tgt_sent in zip(src_sents, tgt_sents):
        tgt_samples = []  # make sure the ground truth y* is in the samples
        tgt_sent_len = len(tgt_sent) - 3  # remove <s> and </s> and ending period .
        tgt_ref_tokens = tgt_sent[1:-1]
        bleu_scores = []

        # sample edit distances
        e_samples = np.random.choice(range(tgt_sent_len + 1), p=payoff_prob[tgt_sent_len],
                                     size=args.sample_size, replace=True)

        for i, e in enumerate(e_samples):
            if e > 0:
                # sample a new tgt_sent $y$
                old_word_pos = np.random.choice(range(1, tgt_sent_len + 1), size=e, replace=False)
                new_words = [vocab.tgt.id2word[wid] for wid in np.random.randint(3, len(vocab.tgt), size=e)]
                new_tgt_sent = list(tgt_sent)
                for pos, word in zip(old_word_pos, new_words):
                    new_tgt_sent[pos] = word

                bleu_score = sentence_bleu([tgt_ref_tokens], new_tgt_sent[1:-1])
                bleu_scores.append(bleu_score)
            else:
                new_tgt_sent = list(tgt_sent)
                bleu_scores.append(1.)

            # print('y: %s' % ' '.join(new_tgt_sent))
            tgt_samples.append(new_tgt_sent)
Example #29
Source File: model.py From RLSeq2Seq with MIT License | 5 votes |
def reward_function(self, reference, summary, measure='rouge_l/f_score'):
    """Calculate the reward between the reference and summary.

    Args:
        reference: A list of ids representing the ground-truth data
        summary: A list of ids representing the model generated data

    Returns:
        A single value representing the evaluation value for reference and summary
    """
    if 'rouge' in measure:
        return rouge([summary], [reference])[measure]
    else:
        return sentence_bleu([reference.split()], summary.split(), weights=(0.25, 0.25, 0.25, 0.25))
Example #30
Source File: bleu.py From cotk with Apache License 2.0 | 5 votes |
def _sentence_bleu(ele):
    '''Auxiliary function for computing sentence bleu:

    Arguments:
        ele (tuple): A tuple (`reference sentences`, `a hypothesis sentence`).

    Returns:
        * int: **sentence-bleu** value.
    '''
    return sentence_bleu(ele[0], ele[1], weights=ele[2], smoothing_function=SmoothingFunction().method1)