Python rouge.Rouge() Examples

The following are 18 code examples of rouge.Rouge(), drawn from open-source projects. The source file and originating project are listed above each example. You may also want to check out all available functions and classes of the rouge module.
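Before the project-specific examples, here is a minimal sketch of the typical rouge.Rouge() workflow. The hypothesis and reference strings below are made up for illustration; the score dictionary layout matches the one used throughout the examples.

from rouge import Rouge

rouge = Rouge()
hypothesis = "the cat was found under the bed"
reference = "the cat was under the bed"

# Sentence-level scores: a list with one dict per pair, keyed by
# 'rouge-1', 'rouge-2' and 'rouge-l', each holding 'f', 'p' and 'r'.
scores = rouge.get_scores(hypothesis, reference)
print(scores[0]['rouge-l']['f'])

# Corpus-level averages over parallel lists of hypotheses and references.
avg_scores = rouge.get_scores([hypothesis], [reference], avg=True)
print(avg_scores['rouge-2']['f'])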
Example #1
Source File: train_and_eval.py    From structured-neural-summarization with MIT License
def compute_rouge(predictions, targets):
    predictions = [" ".join(prediction).lower() for prediction in predictions]
    predictions = [prediction if prediction else "EMPTY" for prediction in predictions]
    targets = [" ".join(target).lower() for target in targets]
    targets = [target if target else "EMPTY" for target in targets]
    rouge = Rouge()
    scores = rouge.get_scores(hyps=predictions, refs=targets, avg=True)
    return scores['rouge-2']['f'] 
Example #2
Source File: metrics.py    From summarus with Apache License 2.0
def calc_metrics(refs, hyps, language, metric="all", meteor_jar=None):
    metrics = dict()
    metrics["count"] = len(hyps)
    metrics["ref_example"] = refs[-1]
    metrics["hyp_example"] = hyps[-1]
    many_refs = [[r] if not isinstance(r, list) else r for r in refs]
    if metric in ("bleu", "all"):
        metrics["bleu"] = corpus_bleu(many_refs, hyps)
    if metric in ("rouge", "all"):
        rouge = Rouge()
        scores = rouge.get_scores(hyps, refs, avg=True)
        metrics.update(scores)
    if metric in ("meteor", "all") and meteor_jar is not None and os.path.exists(meteor_jar):
        meteor = Meteor(meteor_jar, language=language)
        metrics["meteor"] = meteor.compute_score(hyps, many_refs)
    if metric in ("duplicate_ngrams", "all"):
        metrics["duplicate_ngrams"] = dict()
        metrics["duplicate_ngrams"].update(calc_duplicate_n_grams_rate(hyps))
    return metrics 
Example #3
Source File: interpolation_helper.py    From texar with Apache License 2.0
def calc_reward(refs, hypo, unk_id, metric):
    """
    calculate the reward given hypo and refs and will return
    bleu score if metric is 'bleu' or return
    sum of (Rouge-1, Rouge-2, Rouge-L) if metric is 'rouge'
    """
    if len(hypo) == 0 or len(refs[0]) == 0:
        return 0.

    for i in range(len(hypo)):
        assert isinstance(hypo[i], int)
        if hypo[i] == unk_id:
            hypo[i] = -1

    if metric == 'bleu':
        return 0.01 * sentence_bleu(
            references=refs, hypothesis=hypo, smooth=True)
    else:
        ref_str = ' '.join([str(word) for word in refs[0]])
        hypo_str = ' '.join([str(word) for word in hypo])
        rouge_scores = \
            rouge.get_scores(hyps=[hypo_str], refs=[ref_str], avg=True)
        return sum([value['f'] for key, value in rouge_scores.items()]) 
Example #4
Source File: metrics.py    From neural_chat with MIT License
def _rouge(guess, answers):
    """Compute ROUGE score between guess and *any* answers. Return the best."""
    global rouge
    if rouge is None:
        return None, None, None
    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l'], max_n=2)
    try:
        scores = [
            evaluator.get_scores(normalize_answer(guess), normalize_answer(a))
            for a in answers
        ]
    except LookupError:
        warn_once(
            'ROUGE requires nltk punkt tokenizer. Please run '
            '`python -c "import nltk; nltk.download(\'punkt\')"`'
        )
        rouge = None
        return None, None, None

    scores_rouge1 = [score['rouge-1']['r'] for score in scores]
    scores_rouge2 = [score['rouge-2']['r'] for score in scores]
    scores_rougeL = [score['rouge-l']['r'] for score in scores]
    return max(scores_rouge1), max(scores_rouge2), max(scores_rougeL) 
Example #5
Source File: rouge.py    From neuralmonkey with BSD 3-Clause "New" or "Revised" License
def __init__(
            self, rouge_type: str,
            name: str = "ROUGE") -> None:
        check_argument_types()
        super().__init__(name)

        if rouge_type.lower() not in ["1", "2", "l"]:
            raise ValueError(
                ("Invalid type of rouge metric '{}', "
                 "must be '1', '2' or 'L'").format(rouge_type))

        self.rouge_type = rouge_type.lower()
        self.rouge = rouge.Rouge() 
Example #6
Source File: build_oracle.py    From summarus with Apache License 2.0
def build_oracle_records(records, nrows=None, lower=True):
    references = []
    predictions = []
    rouge = Rouge()
    new_records = []

    for i, record in enumerate(records):
        if nrows is not None and i >= nrows:
            break

        summary = record["summary"]
        summary = summary if not lower else summary.lower()
        references.append(summary)

        text = record["text"]
        calc_score = lambda x, y: calc_single_score(x, y, rouge)
        predicted_summary, sentences, oracle_indices = build_oracle_summary_greedy(text, summary, calc_score=calc_score)
        predictions.append(predicted_summary)
        oracle_indices = [1 if i in oracle_indices else 0 for i in range(len(sentences))]

        new_record = copy.copy(record)
        new_record["sentences"] = sentences
        new_record["oracle"] = oracle_indices
        new_records.append(new_record)

    print_metrics(references, predictions)
    return new_records 
Example #7
Source File: metric.py    From MultiTurnDialogZoo with MIT License
def cal_ROUGE(refer, candidate):
    if len(candidate) == 0:
        candidate = ['<unk>']
    elif len(candidate) == 1:
        candidate.append('<unk>')
    if len(refer) == 0:
        refer = ['<unk>']
    elif len(refer) == 1:
        refer.append('<unk>')
    rouge = Rouge()
    scores = rouge.get_scores(' '.join(candidate), ' '.join(refer))
    return scores[0]['rouge-2']['f'] 
Example #8
Source File: test_basic.py    From rouge with Apache License 2.0
def setUp(self):
        self.hyp_path = './tests/hyp.txt'
        self.ref_path = './tests/ref.txt'

        self.data_path = './tests/data.json'
        with open(self.data_path) as f:
            self.data = json.load(f)

        self.rouge = rouge.Rouge()
        self.files_rouge = rouge.FilesRouge() 
Example #9
Source File: rouge.py    From vizseq with MIT License
def _get_sent_rouge(
        hypothesis: List[str], references: List[List[str]], rouge_type: str,
        extra_args: Optional[Dict[str, str]] = None
) -> List[float]:
    assert rouge_type in {'rouge-1', 'rouge-2', 'rouge-l'}
    _rouge_type = 'rouge-l' if rouge_type == 'rouge-l' else 'rouge-n'
    _max_n = 1 if rouge_type == 'rouge-1' else 2
    joint_references = [list(r) for r in zip(*references)]
    scores = _rouge.Rouge(
        metrics=[_rouge_type], max_n=_max_n, apply_avg=False
    ).get_scores(hypothesis, joint_references)
    return [s[STATS_TYPE][0] for s in scores[rouge_type]] 
Example #10
Source File: controller.py    From dgm_latent_bow with MIT License
def __init__(self, config):
    """Initialization from the configuration"""
    self.mode = config.controller_mode
    self.model_name = config.model_name
    self.model_name_version = config.model_name + "_" + config.model_version
    self.start_epoch = config.start_epoch
    self.num_epoch = config.num_epoch
    self.write_output = config.write_output
    self.batch_size = config.batch_size
    self.print_interval = config.train_print_interval
    self.gpu_id = config.gpu_id
    self.drop_out = config.drop_out
    self.dec_start_id = config.dec_start_id
    self.dec_end_id = config.dec_end_id
    self.model_path = config.model_path
    self.output_path = config.output_path
    self.random_seed = config.random_seed
    self.bow_pred_method = config.bow_pred_method
    self.train_log = TrainingLog(config)
    self.id2word = None
    self.target_metrics = config.target_metrics
    self.lm_load_path = config.lm_load_path
    self.rouge_evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l'], max_n=2)
    self.save_ckpt = config.save_ckpt
    self.eval_metrics_list = config.eval_metrics_list
    self.log_metrics = config.log_metrics
    self.gumbel_samples = config.gumbel_samples
    self.is_gumbel = config.is_gumbel
    return 
Example #11
Source File: utils.py    From fastNLP with Apache License 2.0
def rouge_all(hyps, refer):
    rouge = Rouge()
    score = rouge.get_scores(hyps, refer)[0]
    # mean_score = np.mean([score["rouge-1"]["f"], score["rouge-2"]["f"], score["rouge-l"]["f"]])
    return score 
Example #12
Source File: utils.py    From fastNLP with Apache License 2.0
def rouge_eval(hyps, refer):
    rouge = Rouge()
    # print(hyps)
    # print(refer)
    # print(rouge.get_scores(hyps, refer))
    try:
        score = rouge.get_scores(hyps, refer)[0]
        mean_score = np.mean([score["rouge-1"]["f"], score["rouge-2"]["f"], score["rouge-l"]["f"]])
    except Exception:
        mean_score = 0.0
    return mean_score 
Example #13
Source File: Metric.py    From fastNLP with Apache License 2.0
def get_metric(self, reset=True):
        logger.info("[INFO] Hyps and Refer number is %d, %d", len(self.hyps), len(self.refers))
        if len(self.hyps) == 0 or len(self.refers) == 0:
            logger.error("During testing, no hyps or refers were selected!")
            return
        rouge = Rouge()
        scores_all = rouge.get_scores(self.hyps, self.refers, avg=True)
        if reset:
            self.hyps = []
            self.refers = []
        logger.info(scores_all)
        return scores_all 
Example #14
Source File: rouge_calculated.py    From Pointer-Generator with MIT License
def rouge(sys, ref):
    rouge = Rouge()
    return rouge.get_scores(sys, ref, avg=True) 
Example #15
Source File: evaluate.py    From Counterfactual-StoryRW with MIT License
def eval_rouge(instances: List[CFRInstance]):
    references = []
    hypotheses = []

    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                            max_n=4,
                            limit_length=True,
                            length_limit=100,
                            length_limit_type='words',
                            apply_avg=True,
                            apply_best=False,
                            alpha=0.5,  # Default F1_score
                            weight_factor=1.2,
                            stemming=True)

    by_instance = []
    for instance in instances:
        _r = [_clean_text(g) for g in instance.gold_cf_endings]
        _h = _clean_text(instance.predicted_ending)
        references.append(_r)
        hypotheses.append(_h)
        try:
            by_instance.append(evaluator.get_scores(_h, _r))
        except Exception:
            by_instance.append({})

    scores = evaluator.get_scores(hypotheses, references)
    return {'rouge_all' : scores,
            'rouge_by_instance': by_instance
            } 
Example #16
Source File: compute_rouge.py    From nlp-recipes with MIT License
def compute_rouge_python(cand, ref, is_input_files=False, language="en"):
    """
    Computes ROUGE scores using the python package (https://pypi.org/project/py-rouge/).

    Args:
        cand (list or str): If `is_input_files` is `False`, `cand` is a list of strings
            containing predicted summaries. If `is_input_files` is `True`, `cand` is the path
            to the file containing the predicted summaries.
        ref (list or str): If `is_input_files` is `False`, `ref` is a list of strings
            containing reference summaries. If `is_input_files` is `True`, `ref` is the path
            to the file containing the reference summaries.
        is_input_files (bool, optional): If True, inputs are file names. Otherwise, inputs are
            lists of predicted and reference summaries. Defaults to False.
        language (str, optional): Language of the input text. Supported values are "en" and
            "hi". Defaults to "en".

    Returns:
        dict: Dictionary of ROUGE scores.

    """
    supported_languages = ["en", "hi"]
    if language not in supported_languages:
        raise Exception(
            "Language {0} is not supported. Supported languages are: {1}.".format(
                language, supported_languages
            )
        )

    if is_input_files:
        candidates = [line.strip() for line in open(cand, encoding="utf-8")]
        references = [line.strip() for line in open(ref, encoding="utf-8")]
    else:
        candidates = cand
        references = ref

    print("Number of candidates: {}".format(len(candidates)))
    print("Number of references: {}".format(len(references)))
    assert len(candidates) == len(references)

    if language == "en":
        evaluator = Rouge(
            metrics=["rouge-n", "rouge-l"], max_n=2, limit_length=False, apply_avg=True
        )
    else:
        evaluator = RougeExt(
            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=False,
            apply_avg=True,
            language=language,
        )

    scores = evaluator.get_scores(candidates, [[it] for it in references])

    return scores 
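A brief usage sketch based on the docstring above; the candidate and reference summaries are invented, and the printed keys assume the averaged py-rouge score layout shown in the other examples:

candidates = ["the cat was found under the bed"]
references = ["the cat was under the bed"]

# is_input_files defaults to False, so plain lists of strings can be passed directly.
scores = compute_rouge_python(candidates, references)
print(scores["rouge-1"]["f"], scores["rouge-2"]["f"], scores["rouge-l"]["f"])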
Example #17
Source File: metrics.py    From ParlAI with MIT License
def compute_many(
        guess: str, answers: List[str]
    ) -> Tuple[
        Optional['RougeMetric'], Optional['RougeMetric'], Optional['RougeMetric']
    ]:
        """
        Compute ROUGE score between guess and *any* answer.

        Done with compute_many due to increased efficiency.

        :return: (rouge-1, rouge-2, rouge-L)
        """
        # possible global initialization
        global rouge
        if rouge is None:
            return None, None, None
        if RougeMetric._evaluator is None:
            RougeMetric._evaluator = rouge.Rouge(
                metrics=['rouge-n', 'rouge-l'], max_n=2
            )
        try:
            scores = [
                RougeMetric._evaluator.get_scores(
                    normalize_answer(guess), normalize_answer(a)
                )
                for a in answers
            ]
        except LookupError:
            warn_once(
                'ROUGE requires nltk punkt tokenizer. Please run '
                '`python -c "import nltk; nltk.download(\'punkt\')"`'
            )
            return None, None, None

        scores_rouge1 = max(score['rouge-1']['r'] for score in scores)
        scores_rouge2 = max(score['rouge-2']['r'] for score in scores)
        scores_rougeL = max(score['rouge-l']['r'] for score in scores)
        return (
            RougeMetric(scores_rouge1),
            RougeMetric(scores_rouge2),
            RougeMetric(scores_rougeL),
        ) 
Example #18
Source File: rouge_evaluator.py    From structured-neural-summarization with MIT License
def run(args):
    references_file = args['REFERENCES_FILE']
    predictions_file = args['PREDICTIONS_FILE']
    file_type = args['--format'] or 'jsonl'
    case_sensitive = args.get('--case_sensitive', False)

    if file_type != 'textfolder':
        references = extract_sentences(references_file, file_type, case_sensitive)
        predictions = extract_sentences(predictions_file, file_type, case_sensitive)
    elif file_type == 'textfolder':
        references = extract_sentences_from_folder(references_file, case_sensitive)
        predictions = extract_sentences_from_folder(predictions_file, case_sensitive)

    assert len(references) == len(predictions), 'References and predictions are not of the same length: reference: %s, predictions: %s' % (len(references), len(predictions))

    if not args['--use-rouge155']:
        from rouge import Rouge
        rouge = Rouge()
        scores = rouge.get_scores(hyps=predictions, refs=references, avg=True)
        print(scores)
    else:
        import pyrouge
        with tempfile.TemporaryDirectory() as data_dir:
            # First convert to single files
            ref_dir = os.path.join(data_dir, 'references')
            os.makedirs(ref_dir)

            dec_dir = os.path.join(data_dir, 'decoded')
            os.makedirs(dec_dir)

            for i, (decoded, reference) in enumerate(zip(predictions, references)):
                with open(os.path.join(ref_dir, '%06d_reference.txt' % i), 'w') as f:
                    f.write(reference.replace('.', '.\n'))
                with open(os.path.join(dec_dir, '%06d_decoded.txt' % i), 'w') as f:
                    f.write(decoded.replace('.', '.\n'))

            r = pyrouge.Rouge155()
            r.model_filename_pattern = '#ID#_reference.txt'
            r.system_filename_pattern = r'(\d+)_decoded.txt'
            r.model_dir = ref_dir
            r.system_dir = dec_dir
            logging.getLogger('global').setLevel(logging.WARNING)  # silence pyrouge logging
            rouge_results = r.convert_and_evaluate()
            results_dict = r.output_to_dict(rouge_results)
            print(results_dict)
            print()
            log_str = ""
            for x in ["1","2","l"]:
                log_str += "\nROUGE-%s:\n" % x
                for y in ["f_score", "recall", "precision"]:
                    key = "rouge_%s_%s" % (x,y)
                    key_cb = key + "_cb"
                    key_ce = key + "_ce"
                    val = results_dict[key]
                    val_cb = results_dict[key_cb]
                    val_ce = results_dict[key_ce]
                    log_str += "%s: %.4f with confidence interval (%.4f, %.4f)\n" % (key, val, val_cb, val_ce)
            print(log_str)