Evaluation/automatic_metrics.py

import json
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score
from rouge import Rouge
from bert_score import score
import numpy as np

def get_bleu_score(question, reference, hypothesis, task):
    reference, hypothesis = (
        reference.replace("\n", " ").split(),
        hypothesis.replace("\n", " ").split(),
    )

    bleu1 = sentence_bleu([reference], hypothesis, weights=(1, 0, 0, 0))
    bleu4 = sentence_bleu([reference], hypothesis, weights=(0, 0, 0, 1))
    return bleu1, bleu4


def get_rouge_score(question, reference, hypothesis, task, metric="r"):
    rouge = Rouge()
    rouge_ = rouge.get_scores(hyps=[hypothesis], refs=[reference])[0]
    return (
        rouge_["rouge-1"][metric],
        rouge_["rouge-2"][metric],
        rouge_["rouge-l"][metric],
    )


def get_meteor_score(question, reference, hypothesis, task):
    reference, hypothesis = (
        reference.replace("\n", " ").split(),
        hypothesis.replace("\n", " ").split(),
    )
    meteor = single_meteor_score(set(reference), set(hypothesis))
    return float(meteor)


def get_bertscore(question, reference, hypothesis, task):

    bertscore = score([reference], [hypothesis], lang="EN")
    return float(bertscore[1])


def get_exact_match(question, reference, hypothesis, task):
    count = len(reference)
    if type(hypothesis) is str:
        try:
            hypothesis = eval(hypothesis)
            assert isinstance(hypothesis, dict)
        except Exception as e:
            return 0, count
    
    exact_score_count = 0
    for key in reference:
        if key in hypothesis and hypothesis[key] == reference[key]:
            exact_score_count += 1
    return exact_score_count, count

def get_partial_match(question, reference, hypothesis, task):
    count = len(reference)
    if isinstance(hypothesis, str):
        try:
            hypothesis = eval(hypothesis)
            assert isinstance(hypothesis, dict)
        except Exception as e:
            return 0, count

    partial_score_count = 0
    for key in reference:
        if key in hypothesis:
            true_set = set(reference[key].split())
            pred_set = set(hypothesis[key].split())
            partial_score_count += int(len(true_set.intersection(pred_set)) > 0)
    return partial_score_count, count