import argparse

import gradio as gr
import pandas as pd


def model_hyperlink(model_name, link):
    # Render the repo name as a markdown link so it is clickable in the dataframe.
    return f"[{model_name}]({link})"


def load_leaderboard_table_csv(filename, add_hyperlink=True):
    # Read the leaderboard CSV into a list of dicts keyed by column name.
    with open(filename) as f:
        lines = f.readlines()
    heads = [v.strip() for v in lines[0].split(",")]
    rows = []
    for i in range(1, len(lines)):
        row = [v.strip() for v in lines[i].split(",")]
        item = {}
        for h, v in zip(heads, row):
            if "Score" in h:
                item[h] = float(v)
            elif h not in ("Model", "Params (B)", "Repo", "Quantization", "Link"):
                item[h] = int(v)
            else:
                item[h] = v
        if add_hyperlink:
            item["Repo"] = model_hyperlink(item["Repo"], item["Link"])
        rows.append(item)
    return rows


def get_arena_table(model_table_df):
    # sort by final score, best model first
    model_table_df = model_table_df.sort_values(by=["Final Score"], ascending=False)
    values = []
    for i in range(len(model_table_df)):
        row = []
        model_key = model_table_df.index[i]
        # rank
        row.append(i + 1)
        # model display name and metadata
        row.append(model_table_df.loc[model_key, "Model"])
        row.append(model_table_df.loc[model_key, "Params (B)"])
        row.append(model_table_df.loc[model_key, "Repo"])
        row.append(model_table_df.loc[model_key, "Quantization"])
        # scores
        row.append(model_table_df.loc[model_key, "Final Score"])
        row.append(model_table_df.loc[model_key, "Strict Prompt Score"])
        row.append(model_table_df.loc[model_key, "Strict Inst Score"])
        row.append(model_table_df.loc[model_key, "Loose Prompt Score"])
        row.append(model_table_df.loc[model_key, "Loose Inst Score"])
        values.append(row)
    return values


def build_leaderboard_tab(leaderboard_table_file, show_plot=False):
    if leaderboard_table_file:
        data = load_leaderboard_table_csv(leaderboard_table_file)
        model_table_df = pd.DataFrame(data)

        md_head = """
# 🏆 IFEval Leaderboard
"""
        gr.Markdown(md_head, elem_id="leaderboard_markdown")
        with gr.Tabs() as tabs:
            # arena table
            arena_table_vals = get_arena_table(model_table_df)
            with gr.Tab("IFEval", id=0):
                md = (
                    "Leaderboard for various Large Language Models measured with the IFEval benchmark.\n\n"
                    "[IFEval](https://github.com/google-research/google-research/tree/master/instruction_following_eval) "
                    "is a straightforward and easy-to-reproduce evaluation benchmark. It focuses on a set of "
                    '"verifiable instructions" such as "write in more than 400 words" and '
                    '"mention the keyword of AI at least 3 times". We identified 25 types of those verifiable '
                    "instructions and constructed around 500 prompts, with each prompt containing one or more "
                    "verifiable instructions.\n\n"
                    "Tests were run with `lm-evaluation-harness`. Raw results can be found in the `results` directory.\n\n"
                    "Made by [Kristian Polso](https://polso.info)\n\n"
                    "**Changelog**\n\n"
                    "8.6.2024 - Fixed CapybaraHermes and AlphaMonarch results; they were using the wrong prompt template"
                )
                gr.Markdown(md, elem_id="leaderboard_markdown")
                gr.Dataframe(
                    headers=[
                        "Rank",
                        "Model",
                        "Params (B)",
                        "Repo",
                        "Quantization",
                        "Final Score",
                        "Strict Prompt Score",
                        "Strict Inst Score",
                        "Loose Prompt Score",
                        "Loose Inst Score",
                    ],
                    datatype=[
                        "number",
                        "str",
                        "number",
                        "markdown",
                        "str",
                        "number",
                        "number",
                        "number",
                        "number",
                        "number",
                    ],
                    value=arena_table_vals,
                    elem_id="arena_leaderboard_dataframe",
                    height=700,
                    column_widths=[50, 160, 60, 230, 100, 90, 90, 90, 90, 90],
                    wrap=True,
                )
    else:
        pass


def build_demo(leaderboard_table_file):
    text_size = gr.themes.sizes.text_lg
    with gr.Blocks(
        title="IFEval Leaderboard",
        theme=gr.themes.Base(text_size=text_size),
    ) as demo:
        build_leaderboard_tab(leaderboard_table_file, show_plot=True)
    return demo


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--IFEval_file", type=str, default="./IFEval.csv")
    args = parser.parse_args()

    demo = build_demo(args.IFEval_file)
    demo.launch(share=args.share)
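
# ---------------------------------------------------------------------------
# Notes (comments only, not executed): a sketch of the CSV layout this script
# expects, inferred from the column names read in load_leaderboard_table_csv.
# The data row below is purely illustrative, not real benchmark results.
#
#   Model,Params (B),Repo,Quantization,Link,Final Score,Strict Prompt Score,Strict Inst Score,Loose Prompt Score,Loose Inst Score
#   ExampleModel,7,example-org/example-model,Q4_K_M,https://example.com,0.50,0.50,0.50,0.50,0.50
#
# Typical invocation (the script filename here is an assumption):
#   python app.py --IFEval_file ./IFEval.csv --share
# ---------------------------------------------------------------------------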