Python rouge.Rouge() Examples

The following are 18 code examples of rouge.Rouge(), drawn from open-source projects. The source file and originating project are listed above each example. You may also want to check out all available functions and classes of the rouge module.
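Before the project-specific examples, here is a minimal sketch of the typical rouge.Rouge() workflow. The hypothesis and reference strings below are made up for illustration; the score dictionary layout matches the one used throughout the examples.

from rouge import Rouge

rouge = Rouge()
hypothesis = "the cat was found under the bed"
reference = "the cat was under the bed"

# Sentence-level scores: a list with one dict per pair, keyed by
# 'rouge-1', 'rouge-2' and 'rouge-l', each holding 'f', 'p' and 'r'.
scores = rouge.get_scores(hypothesis, reference)
print(scores[0]['rouge-l']['f'])

# Corpus-level averages over parallel lists of hypotheses and references.
avg_scores = rouge.get_scores([hypothesis], [reference], avg=True)
print(avg_scores['rouge-2']['f'])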
Example #1
Source File: train_and_eval.py    From structured-neural-summarization with MIT License
def compute_rouge(predictions, targets):
    predictions = [" ".join(prediction).lower() for prediction in predictions]
    predictions = [prediction if prediction else "EMPTY" for prediction in predictions]
    targets = [" ".join(target).lower() for target in targets]
    targets = [target if target else "EMPTY" for target in targets]
    rouge = Rouge()
    scores = rouge.get_scores(hyps=predictions, refs=targets, avg=True)
    return scores['rouge-2']['f'] 
Example #2
Source File: metrics.py    From summarus with Apache License 2.0
def calc_metrics(refs, hyps, language, metric="all", meteor_jar=None):
    metrics = dict()
    metrics["count"] = len(hyps)
    metrics["ref_example"] = refs[-1]
    metrics["hyp_example"] = hyps[-1]
    many_refs = [[r] if not isinstance(r, list) else r for r in refs]
    if metric in ("bleu", "all"):
        metrics["bleu"] = corpus_bleu(many_refs, hyps)
    if metric in ("rouge", "all"):
        rouge = Rouge()
        scores = rouge.get_scores(hyps, refs, avg=True)
        metrics.update(scores)
    if metric in ("meteor", "all") and meteor_jar is not None and os.path.exists(meteor_jar):
        meteor = Meteor(meteor_jar, language=language)
        metrics["meteor"] = meteor.compute_score(hyps, many_refs)
    if metric in ("duplicate_ngrams", "all"):
        metrics["duplicate_ngrams"] = dict()
        metrics["duplicate_ngrams"].update(calc_duplicate_n_grams_rate(hyps))
    return metrics 
Example #3
Source File: interpolation_helper.py    From texar with Apache License 2.0
def calc_reward(refs, hypo, unk_id, metric):
    """
    calculate the reward given hypo and refs and will return
    bleu score if metric is 'bleu' or return
    sum of (Rouge-1, Rouge-2, Rouge-L) if metric is 'rouge'
    """
    if len(hypo) == 0 or len(refs[0]) == 0:
        return 0.

    for i in range(len(hypo)):
        assert isinstance(hypo[i], int)
        if hypo[i] == unk_id:
            hypo[i] = -1

    if metric == 'bleu':
        return 0.01 * sentence_bleu(
            references=refs, hypothesis=hypo, smooth=True)
    else:
        ref_str = ' '.join([str(word) for word in refs[0]])
        hypo_str = ' '.join([str(word) for word in hypo])
        rouge_scores = \
            rouge.get_scores(hyps=[hypo_str], refs=[ref_str], avg=True)
        return sum([value['f'] for key, value in rouge_scores.items()]) 
Example #4
Source File: metrics.py    From neural_chat with MIT License
def _rouge(guess, answers):
    """Compute ROUGE score between guess and *any* answers. Return the best."""
    global rouge
    if rouge is None:
        return None, None, None
    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l'], max_n=2)
    try:
        scores = [
            evaluator.get_scores(normalize_answer(guess), normalize_answer(a))
            for a in answers
        ]
    except LookupError:
        warn_once(
            'ROUGE requires nltk punkt tokenizer. Please run '
            '`python -c "import nltk; nltk.download(\'punkt\')"`'
        )
        rouge = None
        return None, None, None

    scores_rouge1 = [score['rouge-1']['r'] for score in scores]
    scores_rouge2 = [score['rouge-2']['r'] for score in scores]
    scores_rougeL = [score['rouge-l']['r'] for score in scores]
    return max(scores_rouge1), max(scores_rouge2), max(scores_rougeL) 
Example #5
Source File: rouge.py    From neuralmonkey with BSD 3-Clause "New" or "Revised" License
def __init__(
            self, rouge_type: str,
            name: str = "ROUGE") -> None:
        check_argument_types()
        super().__init__(name)

        if rouge_type.lower() not in ["1", "2", "l"]:
            raise ValueError(
                ("Invalid type of rouge metric '{}', "
                 "must be '1', '2' or 'L'").format(rouge_type))

        self.rouge_type = rouge_type.lower()
        self.rouge = rouge.Rouge() 
Example #6
Source File: build_oracle.py    From summarus with Apache License 2.0
def build_oracle_records(records, nrows=None, lower=True):
    references = []
    predictions = []
    rouge = Rouge()
    new_records = []

    for i, record in enumerate(records):
        if nrows is not None and i >= nrows:
            break

        summary = record["summary"]
        summary = summary if not lower else summary.lower()
        references.append(summary)

        text = record["text"]
        calc_score = lambda x, y: calc_single_score(x, y, rouge)
        predicted_summary, sentences, oracle_indices = build_oracle_summary_greedy(text, summary, calc_score=calc_score)
        predictions.append(predicted_summary)
        oracle_indices = [1 if i in oracle_indices else 0 for i in range(len(sentences))]

        new_record = copy.copy(record)
        new_record["sentences"] = sentences
        new_record["oracle"] = oracle_indices
        new_records.append(new_record)

    print_metrics(references, predictions)
    return new_records 
Example #7
Source File: metric.py    From MultiTurnDialogZoo with MIT License
def cal_ROUGE(refer, candidate):
    if len(candidate) == 0:
        candidate = ['<unk>']
    elif len(candidate) == 1:
        candidate.append('<unk>')
    if len(refer) == 0:
        refer = ['<unk>']
    elif len(refer) == 1:
        refer.append('<unk>')
    rouge = Rouge()
    scores = rouge.get_scores(' '.join(candidate), ' '.join(refer))
    return scores[0]['rouge-2']['f'] 
Example #8
Source File: test_basic.py    From rouge with Apache License 2.0
def setUp(self):
        self.hyp_path = './tests/hyp.txt'
        self.ref_path = './tests/ref.txt'

        self.data_path = './tests/data.json'
        with open(self.data_path) as f:
            self.data = json.load(f)

        self.rouge = rouge.Rouge()
        self.files_rouge = rouge.FilesRouge() 
Example #9
Source File: rouge.py    From vizseq with MIT License
def _get_sent_rouge(
        hypothesis: List[str], references: List[List[str]], rouge_type: str,
        extra_args: Optional[Dict[str, str]] = None
) -> List[float]:
    assert rouge_type in {'rouge-1', 'rouge-2', 'rouge-l'}
    _rouge_type = 'rouge-l' if rouge_type == 'rouge-l' else 'rouge-n'
    _max_n = 1 if rouge_type == 'rouge-1' else 2
    joint_references = [list(r) for r in zip(*references)]
    scores = _rouge.Rouge(
        metrics=[_rouge_type], max_n=_max_n, apply_avg=False
    ).get_scores(hypothesis, joint_references)
    return [s[STATS_TYPE][0] for s in scores[rouge_type]] 
Example #10
Source File: controller.py    From dgm_latent_bow with MIT License
def __init__(self, config):
    """Initialization from the configuration"""
    self.mode = config.controller_mode
    self.model_name = config.model_name
    self.model_name_version = config.model_name + "_" + config.model_version
    self.start_epoch = config.start_epoch
    self.num_epoch = config.num_epoch
    self.write_output = config.write_output
    self.batch_size = config.batch_size
    self.print_interval = config.train_print_interval
    self.gpu_id = config.gpu_id
    self.drop_out = config.drop_out
    self.dec_start_id = config.dec_start_id
    self.dec_end_id = config.dec_end_id
    self.model_path = config.model_path
    self.output_path = config.output_path
    self.random_seed = config.random_seed
    self.bow_pred_method = config.bow_pred_method
    self.train_log = TrainingLog(config)
    self.id2word = None
    self.target_metrics = config.target_metrics
    self.lm_load_path = config.lm_load_path
    self.rouge_evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l'], max_n=2)
    self.save_ckpt = config.save_ckpt
    self.eval_metrics_list = config.eval_metrics_list
    self.log_metrics = config.log_metrics
    self.gumbel_samples = config.gumbel_samples
    self.is_gumbel = config.is_gumbel
    return 
Example #11
Source File: utils.py    From fastNLP with Apache License 2.0
def rouge_all(hyps, refer):
    rouge = Rouge()
    score = rouge.get_scores(hyps, refer)[0]
    # mean_score = np.mean([score["rouge-1"]["f"], score["rouge-2"]["f"], score["rouge-l"]["f"]])
    return score 
Example #12
Source File: utils.py    From fastNLP with Apache License 2.0
def rouge_eval(hyps, refer):
    rouge = Rouge()
    # print(hyps)
    # print(refer)
    # print(rouge.get_scores(hyps, refer))
    try:
        score = rouge.get_scores(hyps, refer)[0]
        mean_score = np.mean([score["rouge-1"]["f"], score["rouge-2"]["f"], score["rouge-l"]["f"]])
    except Exception:
        mean_score = 0.0
    return mean_score 
Example #13
Source File: Metric.py    From fastNLP with Apache License 2.0
def get_metric(self, reset=True):
        logger.info("[INFO] Hyps and Refer number is %d, %d", len(self.hyps), len(self.refers))
        if len(self.hyps) == 0 or len(self.refers) == 0:
            logger.error("During testing, no hyps or refers were selected!")
            return
        rouge = Rouge()
        scores_all = rouge.get_scores(self.hyps, self.refers, avg=True)
        if reset:
            self.hyps = []
            self.refers = []
        logger.info(scores_all)
        return scores_all 
Example #14
Source File: rouge_calculated.py    From Pointer-Generator with MIT License
def rouge(sys, ref):
    rouge = Rouge()
    return rouge.get_scores(sys, ref, avg=True) 
Example #15
Source File: evaluate.py    From Counterfactual-StoryRW with MIT License
def eval_rouge(instances: List[CFRInstance]):
    references = []
    hypotheses = []

    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                            max_n=4,
                            limit_length=True,
                            length_limit=100,
                            length_limit_type='words',
                            apply_avg=True,
                            apply_best=False,
                            alpha=0.5,  # Default F1_score
                            weight_factor=1.2,
                            stemming=True)

    by_instance = []
    for instance in instances:
        _r = [_clean_text(g) for g in instance.gold_cf_endings]
        _h = _clean_text(instance.predicted_ending)
        references.append(_r)
        hypotheses.append(_h)
        try:
            by_instance.append(evaluator.get_scores(_h, _r))
        except Exception:
            by_instance.append({})

    scores = evaluator.get_scores(hypotheses, references)
    return {'rouge_all' : scores,
            'rouge_by_instance': by_instance
            } 
Example #16
Source File: compute_rouge.py    From nlp-recipes with MIT License
def compute_rouge_python(cand, ref, is_input_files=False, language="en"):
    """
    Computes ROUGE scores using the python package (https://pypi.org/project/py-rouge/).

    Args:
        cand (list or str): If `is_input_files` is `False`, `cand` is a list of strings
            containing predicted summaries. If `is_input_files` is `True`, `cand` is the path
            to the file containing the predicted summaries.
        ref (list or str): If `is_input_files` is `False`, `ref` is a list of strings
            containing reference summaries. If `is_input_files` is `True`, `ref` is the path
            to the file containing the reference summaries.
        is_input_files (bool, optional): If True, inputs are file names. Otherwise, inputs are
            lists of predicted and reference summaries. Defaults to False.
        language (str, optional): Language of the input text. Supported values are "en" and
            "hi". Defaults to "en".

    Returns:
        dict: Dictionary of ROUGE scores.

    """
    supported_languages = ["en", "hi"]
    if language not in supported_languages:
        raise Exception(
            "Language {0} is not supported. Supported languages are: {1}.".format(
                language, supported_languages
            )
        )

    if is_input_files:
        candidates = [line.strip() for line in open(cand, encoding="utf-8")]
        references = [line.strip() for line in open(ref, encoding="utf-8")]
    else:
        candidates = cand
        references = ref

    print("Number of candidates: {}".format(len(candidates)))
    print("Number of references: {}".format(len(references)))
    assert len(candidates) == len(references)

    if language == "en":
        evaluator = Rouge(
            metrics=["rouge-n", "rouge-l"], max_n=2, limit_length=False, apply_avg=True
        )
    else:
        evaluator = RougeExt(
            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=False,
            apply_avg=True,
            language=language,
        )

    scores = evaluator.get_scores(candidates, [[it] for it in references])

    return scores 
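A brief usage sketch based on the docstring above; the candidate and reference summaries are invented, and the printed keys assume the averaged py-rouge score layout shown in the other examples:

candidates = ["the cat was found under the bed"]
references = ["the cat was under the bed"]

# is_input_files defaults to False, so plain lists of strings can be passed directly.
scores = compute_rouge_python(candidates, references)
print(scores["rouge-1"]["f"], scores["rouge-2"]["f"], scores["rouge-l"]["f"])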
Example #17
Source File: metrics.py    From ParlAI with MIT License
def compute_many(
        guess: str, answers: List[str]
    ) -> Tuple[
        Optional['RougeMetric'], Optional['RougeMetric'], Optional['RougeMetric']
    ]:
        """
        Compute ROUGE score between guess and *any* answer.

        Done with compute_many due to increased efficiency.

        :return: (rouge-1, rouge-2, rouge-L)
        """
        # possible global initialization
        global rouge
        if rouge is None:
            return None, None, None
        if RougeMetric._evaluator is None:
            RougeMetric._evaluator = rouge.Rouge(
                metrics=['rouge-n', 'rouge-l'], max_n=2
            )
        try:
            scores = [
                RougeMetric._evaluator.get_scores(
                    normalize_answer(guess), normalize_answer(a)
                )
                for a in answers
            ]
        except LookupError:
            warn_once(
                'ROUGE requires nltk punkt tokenizer. Please run '
                '`python -c "import nltk; nltk.download(\'punkt\')"`'
            )
            return None, None, None

        scores_rouge1 = max(score['rouge-1']['r'] for score in scores)
        scores_rouge2 = max(score['rouge-2']['r'] for score in scores)
        scores_rougeL = max(score['rouge-l']['r'] for score in scores)
        return (
            RougeMetric(scores_rouge1),
            RougeMetric(scores_rouge2),
            RougeMetric(scores_rougeL),
        ) 
Example #18
Source File: rouge_evaluator.py    From structured-neural-summarization with MIT License
def run(args):
    references_file = args['REFERENCES_FILE']
    predictions_file = args['PREDICTIONS_FILE']
    file_type = args['--format'] or 'jsonl'
    case_sensitive = args.get('--case_sensitive', False)

    if file_type != 'textfolder':
        references = extract_sentences(references_file, file_type, case_sensitive)
        predictions = extract_sentences(predictions_file, file_type, case_sensitive)
    elif file_type == 'textfolder':
        references = extract_sentences_from_folder(references_file, case_sensitive)
        predictions = extract_sentences_from_folder(predictions_file, case_sensitive)

    assert len(references) == len(predictions), 'References and predictions are not of the same length: reference: %s, predictions: %s' % (len(references), len(predictions))

    if not args['--use-rouge155']:
        from rouge import Rouge
        rouge = Rouge()
        scores = rouge.get_scores(hyps=predictions, refs=references, avg=True)
        print(scores)
    else:
        import pyrouge
        with tempfile.TemporaryDirectory() as data_dir:
            # First convert to single files
            ref_dir = os.path.join(data_dir, 'references')
            os.makedirs(ref_dir)

            dec_dir = os.path.join(data_dir, 'decoded')
            os.makedirs(dec_dir)

            for i, (decoded, reference) in enumerate(zip(predictions, references)):
                with open(os.path.join(ref_dir, '%06d_reference.txt' % i), 'w') as f:
                    f.write(reference.replace('.', '.\n'))
                with open(os.path.join(dec_dir, '%06d_decoded.txt' % i), 'w') as f:
                    f.write(decoded.replace('.', '.\n'))

            r = pyrouge.Rouge155()
            r.model_filename_pattern = '#ID#_reference.txt'
            r.system_filename_pattern = r'(\d+)_decoded.txt'
            r.model_dir = ref_dir
            r.system_dir = dec_dir
            logging.getLogger('global').setLevel(logging.WARNING)  # silence pyrouge logging
            rouge_results = r.convert_and_evaluate()
            results_dict = r.output_to_dict(rouge_results)
            print(results_dict)
            print()
            log_str = ""
            for x in ["1","2","l"]:
                log_str += "\nROUGE-%s:\n" % x
                for y in ["f_score", "recall", "precision"]:
                    key = "rouge_%s_%s" % (x,y)
                    key_cb = key + "_cb"
                    key_ce = key + "_ce"
                    val = results_dict[key]
                    val_cb = results_dict[key_cb]
                    val_ce = results_dict[key_ce]
                    log_str += "%s: %.4f with confidence interval (%.4f, %.4f)\n" % (key, val, val_cb, val_ce)
            print(log_str)