Spaces: Running on CPU Upgrade
Minseok Bae committed • Commit f0b90cf • 1 Parent(s): c3e9147
Edited README and added reproducibility functionality in main_backend.py

Files changed:
- main_backend.py +81 -49
- src/display/about.py +55 -4
main_backend.py CHANGED

@@ -1,3 +1,4 @@
+import argparse
 import logging
 import pprint
 
@@ -22,55 +23,86 @@ snapshot_download(repo_id=envs.QUEUE_REPO, revision="main",
                   local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
 
-def run_auto_eval():
-    [... previous body of run_auto_eval(); the removed lines are not rendered in this view ...]
+def run_auto_eval(args):
+    if not args.reproduce:
+        current_pending_status = [PENDING_STATUS]
+
+        manage_requests.check_completed_evals(
+            api=envs.API,
+            checked_status=RUNNING_STATUS,
+            completed_status=FINISHED_STATUS,
+            failed_status=FAILED_STATUS,
+            hf_repo=envs.QUEUE_REPO,
+            local_dir=envs.EVAL_REQUESTS_PATH_BACKEND,
+            hf_repo_results=envs.RESULTS_REPO,
+            local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
+        )
+        logging.info("Checked completed evals")
+        eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
+                                                          hf_repo=envs.QUEUE_REPO,
+                                                          local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
+        logging.info("Got eval requests")
+        eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
+        logging.info("Sorted eval requests")
+
+        print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
+
+        if len(eval_requests) == 0:
+            print("No eval requests found. Exiting.")
+            return
+
+        eval_request = eval_requests[0]
+        pp.pprint(eval_request)
+
+        manage_requests.set_eval_request(
+            api=envs.API,
+            eval_request=eval_request,
+            new_status=RUNNING_STATUS,
+            hf_repo=envs.QUEUE_REPO,
+            local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
+        )
+        logging.info("Set eval request to running, now running eval")
+
+        run_eval_suite.run_evaluation(
+            eval_request=eval_request,
+            local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
+            results_repo=envs.RESULTS_REPO,
+            batch_size=1,
+            device=envs.DEVICE,
+            no_cache=True,
+        )
+        logging.info("Eval finished, now setting status to finished")
+    else:
+        eval_request = manage_requests.EvalRequest(
+            model=args.model,
+            status=PENDING_STATUS,
+            precision=args.precision
+        )
+        pp.pprint(eval_request)
+        logging.info("Running reproducibility eval")
+
+        run_eval_suite.run_evaluation(
+            eval_request=eval_request,
+            local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
+            results_repo=envs.RESULTS_REPO,
+            batch_size=1,
+            device=envs.DEVICE,
+        )
+        logging.info("Reproducibility eval finished")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Run auto evaluation with optional reproducibility feature")
+
+    # Optional arguments
+    parser.add_argument("--reproduce", type=bool, default=False, help="Reproduce the evaluation results")
+    parser.add_argument("--model", type=str, default=None, help="Your Model ID")
+    parser.add_argument("--precision", type=str, default="float16", help="Precision of your model")
+
+    args = parser.parse_args()
+
+    run_auto_eval(args)
 
 
 if __name__ == "__main__":
-    [... removed line not rendered in this view ...]
+    main()
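The new entry point exposes three optional flags: `--reproduce`, `--model`, and `--precision`. One subtlety in the version shown above is that `argparse` with `type=bool` converts any non-empty string to `True`, so an explicit `--reproduce False` still enables the reproducibility branch; only omitting the flag keeps the default of `False`. The sketch below is a minimal, self-contained illustration of that behavior; it only mirrors the three `add_argument` calls from the diff and imports nothing else from the Space.

```python
import argparse

# Mirror of the three optional arguments added in this commit (illustration only;
# the real script passes the parsed args into run_auto_eval).
parser = argparse.ArgumentParser(description="Run auto evaluation with optional reproducibility feature")
parser.add_argument("--reproduce", type=bool, default=False, help="Reproduce the evaluation results")
parser.add_argument("--model", type=str, default=None, help="Your Model ID")
parser.add_argument("--precision", type=str, default="float16", help="Precision of your model")

# bool("True") and bool("False") are both True, so any explicit value enables --reproduce.
for argv in ([], ["--reproduce", "True"], ["--reproduce", "False"]):
    args = parser.parse_args(argv)
    print(argv or "<no flags>", "->", args.reproduce)
```

If stricter parsing were wanted, `action="store_true"` is the usual alternative; the sketch simply reflects the code as committed.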
src/display/about.py CHANGED

@@ -16,7 +16,6 @@ class Tasks(Enum):
     answer_rate = Task("answer_rate", "answer_rate", "Answer Rate")
     average_summary_length = Task("average_summary_length",
                                   "average_summary_length", "Average Summary Length")
-    # error_rate = Task("error_rate", "error_rate", "Error Rate")
 
 
 # Your leaderboard name
@@ -44,12 +43,65 @@ We generate summaries for each of these documents using submitted LLMs and compu
 ## Understand each metric
 - Hallucination Rate: The percentage of summaries that have a hallucination score below 0.5
 - Factual Consistency Rate: (1 - Hallucination Rate) * 100 (%)
-- Answer Rate: The percentage of summaries that are non-empty.
+- Answer Rate: The percentage of summaries that are non-empty. An empty summary means the model either refused to generate a response or threw an error (e.g. because it judged the document to contain inappropriate content).
 - Average Summary Length: The average number of words in the generated summaries
 
+## Note on non-Hugging Face models
+The H2EM leaderboard currently includes models, such as GPT variants, that are not available on the Hugging Face model hub. We ran the evaluations for these models ourselves and uploaded the results to the leaderboard.
+If you would like to submit a model that is not available on the Hugging Face model hub, please contact us at [email protected].
+
 ## Reproducibility
-[... previous content of this section; the removed line is not rendered in this view ...]
+### For models not available on the Hugging Face model hub:
+You can access the generated summaries we used to evaluate models [here](https://github.com/vectara/hallucination-leaderboard), in the file named "leaderboard_summaries.csv".
+In the same GitHub repository, you can also find the prompt we used to generate the summaries under the "Prompt Used" section of the README file.
+For the models that are not available on the Hugging Face model hub, we also provide information on how we ran the evaluations under the "API Integration Details" section.
+
+### For models available on the Hugging Face model hub:
+To reproduce the results for your model, follow the steps below:
+- 1) Clone the repository
+```bash
+git lfs install
+git clone https://huggingface.co/spaces/vectara/leaderboard
+```
+- 2) Install the requirements
+```bash
+pip install -r requirements.txt
+```
+- 3) Set up your Hugging Face token
+```bash
+export HF_TOKEN=your_token
+```
+- 4) Run the evaluation script
+```bash
+python main_backend.py --model your_model_id --precision float16
+```
+- 5) Wait for the evaluation to finish (it may take a while)
 
+Once the evaluation is finished, a file named "results.json" will be generated in the folder "eval-results-bk/your_model_id".
+The result file has the following JSON format:
+```json
+{
+    "config": {
+        "model_dtype": precision,
+        "model_name": your_model_id,
+        "model_sha": "main"
+    },
+    "results": {
+        "hallucination_rate": {
+            "hallucination_rate":
+        },
+        "factual_consistency_rate": {
+            "factual_consistency_rate":
+        },
+        "answer_rate": {
+            "answer_rate":
+        },
+        "average_summary_length": {
+            "average_summary_length":
+        }
+    }
+}
+```
 """
 
 EVALUATION_QUEUE_TEXT = """
@@ -79,7 +131,6 @@ When we add extra information about models to the leaderboard, it will be automa
 ## In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
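The "Understand each metric" entries in the diff above define each metric in prose. As a quick illustration of how those definitions fit together, here is a small self-contained sketch; the hallucination scores and summaries are made-up placeholder values, and the leaderboard's own scoring model is not part of this snippet.

```python
# Hypothetical per-summary hallucination scores and generated summaries
# (placeholder values; the leaderboard computes scores with its own evaluation model).
scores = [0.12, 0.85, 0.43, 0.91, 0.67]
summaries = ["short summary a", "", "short summary c", "short summary d", ""]

# Hallucination Rate: percentage of summaries with a hallucination score below 0.5
hallucination_rate = 100 * sum(score < 0.5 for score in scores) / len(scores)

# Factual Consistency Rate: the complement, i.e. (1 - hallucination rate) * 100 (%)
factual_consistency_rate = 100 - hallucination_rate

# Answer Rate: percentage of summaries that are non-empty
answer_rate = 100 * sum(bool(s.strip()) for s in summaries) / len(summaries)

# Average Summary Length: average number of words in the generated summaries
average_summary_length = sum(len(s.split()) for s in summaries) / len(summaries)

print(f"hallucination_rate={hallucination_rate:.1f}%")
print(f"factual_consistency_rate={factual_consistency_rate:.1f}%")
print(f"answer_rate={answer_rate:.1f}%")
print(f"average_summary_length={average_summary_length:.1f} words")
```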
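The reproducibility steps in the diff end with a "results.json" written under "eval-results-bk/your_model_id". Assuming the nested layout shown in the README excerpt (the exact path and the presence of all four metric keys are assumptions, not something this commit guarantees), a minimal loader could look like this:

```python
import json
from pathlib import Path

# Path taken from the README text above; "your_model_id" is a placeholder.
results_path = Path("eval-results-bk") / "your_model_id" / "results.json"

with results_path.open() as f:
    data = json.load(f)

print("model:", data["config"]["model_name"], "| dtype:", data["config"]["model_dtype"])

# Each metric value sits under a key of the same name, per the format shown above.
for metric in ("hallucination_rate", "factual_consistency_rate",
               "answer_rate", "average_summary_length"):
    print(f"{metric}: {data['results'][metric][metric]}")
```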