import json
import os
import logging
from datetime import datetime

import src.envs as envs
from src.backend.manage_requests import EvalRequest
from src.backend.evaluate_model import Evaluator

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
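# Quiet the OpenAI client's verbose INFO-level logs.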
logging.getLogger("openai").setLevel(logging.WARNING)


def run_evaluation(eval_request: EvalRequest, batch_size, device,
                   local_dir: str, results_repo: str, no_cache=True,
                   limit=None, need_check=True, write_results=False):
    """
    Run the evaluation for a given model and upload the results.

    Args:
        eval_request (EvalRequest): The evaluation request object containing model details.
        batch_size (int): Batch size for processing.
        device (str): The device to run the evaluation on.
        local_dir (str): Local directory path for saving results.
        results_repo (str): Repository ID where results will be uploaded.
        no_cache (bool): Whether to disable caching.
        limit (int, optional): Limit on the number of items to process. Use with caution.
        need_check (bool): If False, upload the results file to results_repo immediately.
        write_results (bool): Whether to write out detailed results and update the leaderboard dataset.

    Returns:
        dict: A dictionary containing evaluation results.
    """
    if limit:
        logging.warning("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

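    # Model IDs of the form "org/name" become nested directories under local_dir.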
    output_folder = os.path.join(local_dir, *eval_request.model.split("/"))
    # Result caching is disabled for now. If re-enabled, the file contents must
    # be read before parsing (the original json.loads call parsed the path string):
    # if os.path.exists(output_folder):
    #     f_name = sorted(os.listdir(output_folder))[-1]
    #     print(f"Loading results from {os.path.join(output_folder, f_name)}")
    #     with open(os.path.join(output_folder, f_name)) as f:
    #         results = json.load(f)
    #     dumped = json.dumps(results, indent=2)
    #     logging.info(dumped)
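    # Run the evaluation; any failure is logged and re-raised so the caller
    # can handle the failed request.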
    try:
        evaluator = Evaluator(eval_request.model, eval_request.revision,
                              eval_request.precision, batch_size, device,
                              no_cache, limit, write_out=True,
                              output_base_path='logs')
        results = evaluator.evaluate()
        if write_results:
            evaluator.write_results()
            # upload leaderboard_summaries.csv to HF
            envs.API.upload_file(
                path_or_fileobj=envs.LEADERBOARD_DATASET_PATH,
                path_in_repo=envs.LEADERBOARD_DATASET_PATH.split('/')[-1],
                repo_id=envs.LEADERBOARD_DATASET_REPO,
                repo_type="dataset",
                commit_message=f"Update results for {eval_request.model}"
            )
            logging.info(f"Leaderboard result dataset has been updated to {envs.LEADERBOARD_DATASET_PATH}/{envs.LEADERBOARD_DATASET_PATH.split('/')[-1]}")

    except Exception as e:
        logging.error(f"Error during evaluation: {e}")
        raise

    dumped = json.dumps(results, indent=2)
    logging.info(dumped)

    # Compute the timestamp once so the local file name, the log message, and
    # the path in the repo all match.
    timestamp = datetime.now()
    output_path = os.path.join(output_folder, f"results_{timestamp}.json")
    os.makedirs(output_folder, exist_ok=True)
    with open(output_path, "w") as f:
        f.write(dumped)
    logging.info(f"Results have been saved to {output_path}")

    if not need_check:
        path_in_repo = f"{eval_request.model}/results_{timestamp}.json"
        logging.info(f"Path in the repo: {path_in_repo}")
        envs.API.upload_file(
            path_or_fileobj=output_path,
            path_in_repo=path_in_repo,
            repo_id=results_repo,
            repo_type="dataset",
        )

    return results
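

# Illustrative usage sketch, not part of the original module. The EvalRequest
# constructor arguments below are assumptions inferred from the attributes
# run_evaluation reads (model, revision, precision); adjust to the real class.
if __name__ == "__main__":
    request = EvalRequest(model="org/model-name",   # hypothetical model ID
                          revision="main",
                          precision="float16")
    run_evaluation(request, batch_size=1, device="cuda",
                   local_dir="./eval-results",        # hypothetical local path
                   results_repo="org/results-dataset",  # hypothetical repo ID
                   limit=10,          # small limit: smoke test only
                   need_check=True,   # review results before any upload
                   write_results=False)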