import csv
import logging
import os

import pandas as pd

import src.envs as envs
import src.backend.util as util
from src.backend.model_operations import SummaryGenerator, EvaluationModel

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')


class Evaluator:
    """A class to evaluate summaries generated by a language model.

    Attributes:
        model (str): The name or path of the model.
        revision (str): The model revision.
        precision (str): The precision setting of the model.
        batch_size (int): Batch size for processing.
        device (str): The device to run the model on.
        no_cache (bool): Flag to disable caching.
        limit (int): Limit on the number of items to process.
        write_out (bool): Whether to write results to a file.
        output_base_path (str): Base path for output files.
        summary_generator (SummaryGenerator): Instance for generating summaries.
        eval_model (EvaluationModel): Instance for evaluating summaries.
    """

    def __init__(self, model, revision, precision, batch_size, device, no_cache,
                 limit, write_out=True, output_base_path='logs'):
        """Initializes the Evaluator with the given model and settings.

        Args:
            model (str): The name or path of the model.
            revision (str): The model revision.
            precision (str): The precision setting of the model.
            batch_size (int): Batch size for processing.
            device (str): The device to run the model on.
            no_cache (bool): Flag to disable caching.
            limit (int): Limit on the number of items to process.
            write_out (bool): Whether to write results to a file.
            output_base_path (str): Base path for output files.
        """
        self.model = model
        self.revision = revision
        self.precision = precision
        self.batch_size = batch_size
        self.device = device
        self.no_cache = no_cache
        self.limit = limit
        self.write_out = write_out
        self.output_base_path = output_base_path
        try:
            self.summary_generator = SummaryGenerator(model, revision)
            self.eval_model = EvaluationModel(envs.HEM_PATH)
        except Exception as e:
            logging.error(f"Error initializing Evaluator: {e}")
            raise

    def evaluate(self):
        """Performs the evaluation process by generating summaries and computing metrics.

        Returns:
            dict: A dictionary containing evaluation results.
""" try: df = pd.read_csv(envs.DATASET_PATH) # print(envs.DATASET_PATH) # print(df.shape) # print(df.iloc[-1]) self.generated_summaries_df = self.summary_generator.generate_summaries(df, save_path=f"generation_results/{self.model}.csv") avg_summary_len = self.summary_generator.avg_length answer_rate = self.summary_generator.answer_rate self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination( self.generated_summaries_df) factual_consistency_rate = self.eval_model.compute_factual_consistency_rate() hallucination_rate = self.eval_model.hallucination_rate results = util.format_results(model_name=self.model, revision=self.revision, precision=self.precision, factual_consistency_rate=factual_consistency_rate, hallucination_rate=hallucination_rate, answer_rate=answer_rate, avg_summary_len=avg_summary_len) return results except FileNotFoundError: logging.error(f"File not found: {envs.DATASET_PATH}") raise except Exception as e: logging.error(f"Error during evaluation: {e}") raise def write_results(self): print('Updating result files') leaderboard_path = os.getcwd() # the path of leaderboard folder print(leaderboard_path) working_path = os.path.join(leaderboard_path, 'Hallucination Leaderboard Results') if not os.path.exists(working_path): logging.error(f"Need to first download the results from google drive to the learderboard folder") raise source_summary_df = self.generated_summaries_df[["source", "summary"]] #update leaderboard_summaries.csv #first remove previous results for the current model existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), encoding='utf-8') mask = existing_df['model'] == self.model existing_df = existing_df[~mask] print(existing_df.shape) summary_doc = set(existing_df['model'].values.tolist()) print(summary_doc) # get new result leaderboard_summaries_df = source_summary_df leaderboard_summaries_df.insert(2, "model", [self.model]*leaderboard_summaries_df.shape[0]) leaderboard_summaries_df.to_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), mode='a', index=False, header=False) print('leaderboard_summaries.csv has been updated') # update leaderboard_summaries_with_scores.csv # BUG: get error when opening the file existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'), encoding='utf-8', sep=",", quotechar='"', quoting=2) print(existing_df.shape) score_doc = set(existing_df['model'].values.tolist()) print(score_doc) mask = existing_df['model'] == self.model existing_df = existing_df[~mask] # get new result leaderboard_summaries_with_scores_df = pd.DataFrame.from_dict(self.eval_results) leaderboard_summaries_with_scores_df.insert(3, "model", [self.model]*leaderboard_summaries_with_scores_df.shape[0]) leaderboard_summaries_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'), mode='a', index=False, header=False) print('leaderboard_summaries_with_scores.csv has been updated')