Prepped for QA test
Files changed:
- app.py +0 -2
- custom_tasks.py +2 -74
- heq_task.py +111 -0
- main_backend_harness.py +0 -78
- main_backend_lighteval.py +2 -2
- requirements.txt +2 -2
- scripts/fix_harness_import.py +0 -11
- src/about.py +2 -2
- src/backend/run_eval_suite_harness.py +0 -57
- src/envs.py +3 -3
- src/logging.py +6 -6
- src/populate.py +0 -56
app.py (CHANGED)

@@ -11,8 +11,6 @@ from src.logging import LOGGER, read_logs
 sys.stdout = LOGGER
 sys.stderr = LOGGER
 
-#subprocess.run(["python", "scripts/fix_harness_import.py"])
-
 def launch_backend():
     _ = subprocess.run(["python", "main_backend_lighteval.py"])
 
custom_tasks.py (CHANGED)

@@ -6,84 +6,12 @@ This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then im
 
 Author:
 """
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
-
-## EVAL WITH NO SUBSET ##
-# This is how you create a simple tasks (like hellaswag) which has one single subset
-# attached to it, and one evaluation possible.
-task = LightevalTaskConfig(
-    name="myothertask",
-    prompt_function="prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
-    suite=["community"],
-    hf_repo="",
-    hf_subset="default",
-    hf_avail_splits=[],
-    evaluation_splits=[],
-    few_shots_split="",
-    few_shots_select="",
-    metric=[""],
-)
-
-## EVALS WITH SUBSET
-# This is how you create a subset task (like MMLU), which has several subset
-# each being its own evaluation task.
-
-# fmt: off
-SAMPLE_SUBSETS = [] # list of all the subsets to use for this eval
-# fmt: on
-
-
-class CustomSubsetTask(LightevalTaskConfig):
-    def __init__(
-        self,
-        name,
-        hf_subset,
-    ):
-        super().__init__(
-            name=name,
-            hf_subset=hf_subset,
-            prompt_function="prompt_fn", # must be defined in the file
-            hf_repo="",
-            metric=[""],
-            hf_avail_splits=[],
-            evaluation_splits=[],
-            few_shots_split="",
-            few_shots_select="",
-            suite=["community"],
-            generation_size=-1,
-            stop_sequence=None,
-            output_regex=None,
-            frozen=False,
-        )
-
-
-## DEFINE YOUR PROMPT FUNCTIONS
-# Define as many as you need for your different tasks
-def prompt_fn(line, task_name: str = None):
-    """Defines how to go from a dataset line to a doc object.
-    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
-    about what this function should do in the README.
-    """
-    return Doc(
-        task_name=task_name,
-        query="",
-        choices="",
-        gold_index=0,
-        instruction="",
-    )
-
-
-## STORE YOUR EVALS
-SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
-_TASKS = SUBSET_TASKS + [task]
+from heq_task import *
 
 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in _TASKS]
+TASKS_TABLE = [task.as_dict() for task in [heq_task]]
 
 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)
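The rewritten custom_tasks.py simply re-exports everything from heq_task.py and builds TASKS_TABLE from the single heq_task config. A minimal sanity check of that module logic (illustrative; it assumes you run it from the Space root with lighteval and Levenshtein installed):

    # Hypothetical check, run next to custom_tasks.py and heq_task.py.
    import custom_tasks

    # TASKS_TABLE is built from [heq_task] only, so exactly one entry is expected.
    assert len(custom_tasks.TASKS_TABLE) == 1
    print(custom_tasks.TASKS_TABLE[0]["name"])  # expected: "heq-qa-tlnls"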
heq_task.py (ADDED)

@@ -0,0 +1,111 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+
+def get_tokens(s):
+    if not s:
+        return []
+    return normalize_answer(s).split()
+
+ARTICLES_REGEX = re.compile(r"\b(a|an|the)\b", re.UNICODE)
+def normalize_answer(s):
+    def remove_articles(text):
+        return ARTICLES_REGEX.sub(" ", text)
+    def white_space_fix(text):
+        return " ".join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return "".join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s.replace('<pad>', '').replace('</s>', '').strip()))))
+
+def compute_f1(a_gold, a_pred):
+    gold_toks = get_tokens(a_gold)
+    pred_toks = get_tokens(a_pred)
+    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
+    num_same = sum(common.values())
+    if len(gold_toks) == 0 or len(pred_toks) == 0:
+        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+        return int(gold_toks == pred_toks)
+    if num_same == 0:
+        return 0
+    precision = 1.0 * num_same / len(pred_toks)
+    recall = 1.0 * num_same / len(gold_toks)
+    f1 = (2 * precision * recall) / (precision + recall)
+    return f1
+
+def normalized_edit_similarity(p1, p2):
+    return 1-distance(p1, p2)/ max(len(p1), len(p2))
+
+def compute_token_edit(a_gold, a_pred):
+    gold_toks = get_tokens(a_gold)
+    pred_toks = get_tokens(a_pred)
+    if len(gold_toks) == 0 or len(pred_toks) == 0:
+        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+        return int(gold_toks == pred_toks)
+    num_same = sum([max([normalized_edit_similarity(gold_t, pred_t) for pred_t in pred_toks]) for gold_t in gold_toks])
+    if num_same == 0:
+        return 0
+    precision = 1.0 * num_same / len(pred_toks)
+    recall = 1.0 * num_same / len(gold_toks)
+    f1 = (2 * precision * recall) / (precision + recall)
+    return f1
+
+def tlnls(a_gold, a_pred):
+    digit_count = sum(1 for char in a_pred if char.isdigit())
+    if digit_count < len(a_pred) / 2:
+        return compute_token_edit(a_gold, a_pred)
+    else:
+        return compute_f1(a_gold, a_pred)
+
+def heq_eval_fn(golds: list[str], predictions: list[str]):
+    if len(predictions) > 1:
+        raise ValueError("Predictions should have one item")
+    return max([tlnls(x, predictions[0]) for x in golds])
+
+heq_tlnls_metric = CorpusLevelMetric(
+    metric="heq_tlnls",
+    higher_is_better=True,
+    category=MetricCategory.GENERATIVE,
+    use_case=MetricUseCase.ACCURACY,
+    corpus_level_fn=np.mean,
+    sample_level_fn=heq_eval_fn
+)
+
+def heq_prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    """
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"],
+        choices=line["response"],
+        gold_index=list(range(line["response"])),
+        instruction="",
+    )
+
+## EVAL WITH NO SUBSET ##
+# This is how you create a simple tasks (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+heq_task = LightevalTaskConfig(
+    name="heq-qa-tlnls",
+    prompt_function="heq_prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["custom"],
+    hf_repo="dicta-hebrew-llm-leaderboard/tests",
+    hf_subset="default",
+    hf_avail_splits=["heq"],
+    evaluation_splits=["heq"],
+    metric=[heq_tlnls_metric],
+    stop_sequence=['\n']
+)
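A note on how the new metric behaves: tlnls sends a mostly-numeric prediction through plain token F1 (compute_f1) and everything else through a token-level edit-similarity F1 (compute_token_edit), and heq_eval_fn keeps the best score over all gold answers for a single prediction. A small sketch with made-up answer strings, assuming heq_task.py is importable and the Levenshtein package is installed:

    from heq_task import tlnls, heq_eval_fn

    gold = "בשנת 1948"   # hypothetical gold answer
    pred = "בשנת 1949"   # hypothetical model output, year off by one

    # Fewer than half of pred's characters are digits, so this takes the
    # token-edit path; the near-miss on the year still earns partial credit (0.875).
    print(tlnls(gold, pred))

    # A prediction that is mostly digits is scored with exact-token F1 instead.
    print(tlnls("1948", "1948"))  # 1.0

    # One prediction, several acceptable golds: the max score is kept.
    print(heq_eval_fn(golds=[gold, "1948"], predictions=[pred]))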
main_backend_harness.py (DELETED)

@@ -1,78 +0,0 @@
-import logging
-import pprint
-
-from huggingface_hub import snapshot_download
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-
-from backend.run_eval_suite_harness import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
-from src.backend.sort_queue import sort_models_by_priority
-
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
-from src.about import Tasks, NUM_FEWSHOT
-TASKS_HARNESS = [task.value.benchmark for task in Tasks]
-
-logging.basicConfig(level=logging.ERROR)
-pp = pprint.PrettyPrinter(width=80)
-
-PENDING_STATUS = "PENDING"
-RUNNING_STATUS = "RUNNING"
-FINISHED_STATUS = "FINISHED"
-FAILED_STATUS = "FAILED"
-
-snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-
-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
-
-    # pull the eval dataset from the hub and parse any eval requests
-    # check completed evals and set them to finished
-    check_completed_evals(
-        api=API,
-        checked_status=RUNNING_STATUS,
-        completed_status=FINISHED_STATUS,
-        failed_status=FAILED_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-        hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
-    )
-
-    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-    # Sort the evals by priority (first submitted first run)
-    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-
-    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-    if len(eval_requests) == 0:
-        return
-
-    eval_request = eval_requests[0]
-    pp.pprint(eval_request)
-
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-
-    run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_HARNESS,
-        num_fewshot=NUM_FEWSHOT,
-        local_dir=EVAL_RESULTS_PATH_BACKEND,
-        results_repo=RESULTS_REPO,
-        batch_size=1,
-        device=DEVICE,
-        no_cache=True,
-        limit=LIMIT
-    )
-
-
-if __name__ == "__main__":
-    run_auto_eval()
main_backend_lighteval.py (CHANGED)

@@ -63,9 +63,9 @@ def run_auto_eval():
     # This needs to be done
     #instance_size, instance_type = get_instance_for_model(eval_request)
     # For GPU
-
+    instance_size, instance_type = "small", "g4dn.xlarge"
     # For CPU
-    instance_size, instance_type = "medium", "c6i"
+    # instance_size, instance_type = "medium", "c6i"
 
     run_evaluation(
         eval_request=eval_request,
requirements.txt (CHANGED)

@@ -13,7 +13,7 @@ requests==2.28.2
 tqdm==4.65.0
 transformers
 tokenizers>=0.15.0
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
 git+https://github.com/huggingface/lighteval.git#egg=lighteval
 accelerate==0.24.1
-sentencepiece
+sentencepiece
+Levenshtein
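Levenshtein is the new runtime dependency backing normalized_edit_similarity in heq_task.py. A quick sketch of what it provides (assuming pip install Levenshtein):

    from Levenshtein import distance

    # Raw edit distance between two tokens.
    print(distance("1948", "1949"))  # 1

    # heq_task.py turns this into a 0..1 similarity by normalizing with the longer token.
    p1, p2 = "1948", "1949"
    print(1 - distance(p1, p2) / max(len(p1), len(p2)))  # 0.75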
scripts/fix_harness_import.py (DELETED)

@@ -1,11 +0,0 @@
-"""This file should be used after pip install -r requirements.
-It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
-It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
-"""
-import os
-
-import lm_eval
-
-if __name__ == "__main__":
-    lm_eval_path = lm_eval.__path__[0]
-    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/about.py (CHANGED)

@@ -20,5 +20,5 @@ NUM_FEWSHOT = 0 # Change with your few shot
 TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
-TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-
+# TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
+TASKS_LIGHTEVAL = "custom|heq-qa-tlnls|0|0"
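For reference, lighteval task strings follow a suite|task|num_fewshot|truncate_fewshot pattern (the last field, as used in lighteval's examples, toggles automatically reducing few-shot examples when the prompt gets too long), so the new value runs the custom HeQ task zero-shot. An illustrative unpacking:

    # Illustrative: how the task string above decomposes.
    task_string = "custom|heq-qa-tlnls|0|0"
    suite, task_name, num_fewshot, truncate_fewshot = task_string.split("|")
    print(suite, task_name, num_fewshot, truncate_fewshot)  # custom heq-qa-tlnls 0 0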
src/backend/run_eval_suite_harness.py (DELETED)

@@ -1,57 +0,0 @@
-import json
-import os
-import logging
-from datetime import datetime
-
-from lm_eval import tasks, evaluator, utils
-
-from src.envs import RESULTS_REPO, API
-from src.backend.manage_requests import EvalRequest
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-
-def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
-    if limit:
-        print(
-            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
-        )
-
-    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
-
-    print(f"Selected Tasks: {task_names}")
-
-    results = evaluator.simple_evaluate(
-        model="hf-causal-experimental", # "hf-causal"
-        model_args=eval_request.get_model_args(),
-        tasks=task_names,
-        num_fewshot=num_fewshot,
-        batch_size=batch_size,
-        device=device,
-        no_cache=no_cache,
-        limit=limit,
-        write_out=True,
-        output_base_path="logs"
-    )
-
-    results["config"]["model_dtype"] = eval_request.precision
-    results["config"]["model_name"] = eval_request.model
-    results["config"]["model_sha"] = eval_request.revision
-
-    dumped = json.dumps(results, indent=2)
-    print(dumped)
-
-    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
-    os.makedirs(os.path.dirname(output_path), exist_ok=True)
-    with open(output_path, "w") as f:
-        f.write(dumped)
-
-    print(evaluator.make_table(results))
-
-    API.upload_file(
-        path_or_fileobj=output_path,
-        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
-        repo_id=results_repo,
-        repo_type="dataset",
-    )
-
-    return results
src/envs.py (CHANGED)

@@ -6,19 +6,19 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("TOKEN") # A read/write token for your org
 
-OWNER = "
+OWNER = "dicta-hebrew-llm-leaderboard" # Change to your org - don't forget to create a results and request file
 
 # For harness evaluations
 DEVICE = "cpu" # "cuda:0" if you add compute, for harness evaluations
 LIMIT = 20 # !!!! Should be None for actual evaluations!!!
 
 # For lighteval evaluations
-ACCELERATOR = "
+ACCELERATOR = "cuda:0"
 REGION = "us-east-1"
 VENDOR = "aws"
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard-backend"
+# REPO_ID = f"{OWNER}/leaderboard-backend"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
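With OWNER filled in, the derived repo ids resolve to the org's own queue and results datasets; a small illustration of the f-strings above:

    OWNER = "dicta-hebrew-llm-leaderboard"
    QUEUE_REPO = f"{OWNER}/requests"     # dicta-hebrew-llm-leaderboard/requests
    RESULTS_REPO = f"{OWNER}/results"    # dicta-hebrew-llm-leaderboard/results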
src/logging.py (CHANGED)

@@ -19,12 +19,12 @@ class Logger:
 
 def read_logs():
     sys.stdout.flush()
-
-
-
-
-
-
+    API.upload_file(
+        path_or_fileobj="output.log",
+        path_in_repo="demo-backend.log",
+        repo_id="demo-leaderboard-backend/logs",
+        repo_type="dataset",
+    )
 
     with open("output.log", "r") as f:
         return f.read()
src/populate.py (DELETED)

@@ -1,56 +0,0 @@
-import json
-import os
-
-import pandas as pd
-
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
-
-
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
-
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
-    return raw_data, df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]