anson-huang committed • Commit a0327b4 • 1 Parent(s): 79f9e39

update

Browse files:
- app.py +119 -111
- src/about.py +8 -5
- src/display/utils.py +10 -10
- src/leaderboard/read_evals.py +34 -34
app.py
CHANGED
@@ -9,7 +9,7 @@ from src.about import (
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
+    DETECTOR_BENCHMARKS_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css
@@ -63,28 +63,28 @@ def init_leaderboard(dataframe):
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
+        # select_columns=SelectColumns(
+        #     default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+        #     cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+        #     label="Select Columns to Display:",
+        # ),
+        search_columns=[AutoEvalColumn.model.name],
+        # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        # filter_columns=[
+        #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+        #     ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+        #     ColumnFilter(
+        #         AutoEvalColumn.params.name,
+        #         type="slider",
+        #         min=0.01,
+        #         max=150,
+        #         label="Select the number of parameters (B)",
+        #     ),
+        #     ColumnFilter(
+        #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+        #     ),
+        # ],
+        # bool_checkboxgroup_label="Hide models",
         interactive=False,
     )

@@ -95,98 +95,106 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("π Detector Leaderboard", elem_id="detector-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)

-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
+        with gr.TabItem("π Detector Playground ", elem_id="detector-playground-tab-table", id=1):
             with gr.Row():
                 with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
+                    # print(LEADERBOARD_DF.keys())
+                    gr.Dropdown(LEADERBOARD_DF['Model'].tolist())
+                    gr.Image()
+                    gr.Button("Submit")
+
+        with gr.TabItem("π About", elem_id="detector-benchmark-tab-table", id=2):
+            gr.Markdown(DETECTOR_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        # with gr.TabItem("🚀 Submit here! ", elem_id="detector-benchmark-tab-table", id=3):
+        #     with gr.Column():
+        #         with gr.Row():
+        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+        #         with gr.Column():
+        #             with gr.Accordion(
+        #                 f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     finished_eval_table = gr.components.Dataframe(
+        #                         value=finished_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #             with gr.Accordion(
+        #                 f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     running_eval_table = gr.components.Dataframe(
+        #                         value=running_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+
+        #             with gr.Accordion(
+        #                 f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     pending_eval_table = gr.components.Dataframe(
+        #                         value=pending_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #     with gr.Row():
+        #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+        #     with gr.Row():
+        #         with gr.Column():
+        #             model_name_textbox = gr.Textbox(label="Model name")
+        #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+        #             model_type = gr.Dropdown(
+        #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+        #                 label="Model type",
+        #                 multiselect=False,
+        #                 value=None,
+        #                 interactive=True,
+        #             )
+
+        #         with gr.Column():
+        #             precision = gr.Dropdown(
+        #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
+        #                 label="Precision",
+        #                 multiselect=False,
+        #                 value="float16",
+        #                 interactive=True,
+        #             )
+        #             weight_type = gr.Dropdown(
+        #                 choices=[i.value.name for i in WeightType],
+        #                 label="Weights type",
+        #                 multiselect=False,
+        #                 value="Original",
+        #                 interactive=True,
+        #             )
+        #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+        #     submit_button = gr.Button("Submit Eval")
+        #     submission_result = gr.Markdown()
+        #     submit_button.click(
+        #         add_new_eval,
+        #         [
+        #             model_name_textbox,
+        #             base_model_name_textbox,
+        #             revision_name_textbox,
+        #             precision,
+        #             weight_type,
+        #             model_type,
+        #         ],
+        #         submission_result,
+        #     )

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
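For orientation, here is a minimal, self-contained sketch of the three-tab layout the new app.py builds. It is not the Space's code: a plain gr.Dataframe stands in for the gradio_leaderboard Leaderboard component, and the small DataFrame is made up for illustration.

import gradio as gr
import pandas as pd

# Stand-in for LEADERBOARD_DF loaded from evaluation results
LEADERBOARD_DF = pd.DataFrame(
    {"Model": ["detector-a", "detector-b"], "Average ⬆️": [0.91, 0.87]}
)

with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("Detector Leaderboard", id=0):
            # In the Space this is init_leaderboard(LEADERBOARD_DF)
            gr.Dataframe(value=LEADERBOARD_DF, interactive=False)
        with gr.TabItem("Detector Playground", id=1):
            with gr.Row():
                with gr.Column():
                    gr.Dropdown(LEADERBOARD_DF["Model"].tolist(), label="Detector")
                    gr.Image(label="Image to test")
                    gr.Button("Submit")  # no click handler wired up yet, as in the commit
        with gr.TabItem("About", id=2):
            gr.Markdown("About text goes here.")

if __name__ == "__main__":
    demo.launch()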
src/about.py
CHANGED
@@ -12,8 +12,11 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("miragenews", "acc", "MiRAGeNews")
+    task1 = Task("genimage", "acc", "GenImage")
+    # task2 = Task("cnn_det", "acc_norm", "CNN Detection")
+    # task3 = Task("forgery_net", "acc_norm", "Forgery Net")
+    # task4 = Task("deepfake_det", "acc_norm", "Deepfake Detection")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,15 +24,15 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title"> 🕵️ AI-Generated Image Detector Benchmark</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+AI-Generated Image Detector Benchmark is a platform for evaluating the performance of existing detectors on various data sources and tasks. We collected X images from Y generators with 2 different tasks: Full Image Generation Detection and Partial Image Manipulation Detection.
 """

 # Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
+DETECTOR_BENCHMARKS_TEXT = f"""
 ## How it works

 ## Reproducibility
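A short sketch of how the Tasks enum defined here is consumed elsewhere in the Space. The Task dataclass below mirrors the standard leaderboard template (fields benchmark, metric, col_name); treat the field names and the sample scores as assumptions for illustration.

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # key of the task in the results JSON
    metric: str      # key of the metric inside that task's results
    col_name: str    # column header shown on the leaderboard

class Tasks(Enum):
    task0 = Task("miragenews", "acc", "MiRAGeNews")
    task1 = Task("genimage", "acc", "GenImage")

# read_evals.py iterates the enum to turn per-benchmark scores into leaderboard columns
results = {"miragenews": 0.92, "genimage": 0.88}   # made-up numbers
row = {t.value.col_name: results[t.value.benchmark] for t in Tasks}
print(row)  # {'MiRAGeNews': 0.92, 'GenImage': 0.88}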
src/display/utils.py
CHANGED
@@ -23,22 +23,22 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
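A sketch of what the auto_eval_column_dict + make_dataclass pattern produces after this commit, when only the model, average, and task columns remain. The ColumnContent field order follows the standard leaderboard template and is an assumption here; the task columns are hard-coded for the example.

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                   # display name of the column
    type: str                   # "str", "markdown", "number", "bool"
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
    ["task0", ColumnContent, ColumnContent("MiRAGeNews", "number", True)],
    ["task1", ColumnContent, ColumnContent("GenImage", "number", True)],
]

# Each entry becomes a class attribute whose default is a ColumnContent instance,
# so the rest of the code can refer to columns as AutoEvalColumn.model.name, etc.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.model.name)    # "Model"
print(AutoEvalColumn.average.name)  # "Average ⬆️"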
src/leaderboard/read_evals.py
CHANGED
@@ -20,17 +20,17 @@ class EvalResult:
     full_model: str # org/model (path on hub)
     org: str
     model: str
-    revision: str # commit hash, "" if main
+    # revision: str # commit hash, "" if main
     results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = "" # submission date of request file
-    still_on_hub: bool = False
+    # precision: Precision = Precision.Unknown
+    # model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+    # weight_type: WeightType = WeightType.Original # Original or Adapter
+    # architecture: str = "Unknown"
+    # license: str = "?"
+    # likes: int = 0
+    # num_params: int = 0
+    # date: str = "" # submission date of request file
+    # still_on_hub: bool = False

     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -41,7 +41,7 @@ class EvalResult:
         config = data.get("config")

         # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
+        # precision = Precision.from_str(config.get("model_dtype"))

         # Get model and org
         org_and_model = config.get("model_name", config.get("model_args", None))
@@ -50,11 +50,11 @@ class EvalResult:
         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
+            result_key = f"{model}"
         else:
             org = org_and_model[0]
             model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
+            result_key = f"{org}_{model}"
         full_model = "/".join(org_and_model)

         still_on_hub, _, model_config = is_model_on_hub(
@@ -85,15 +85,15 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
+            # precision=precision,
+            # revision= config.get("model_sha", ""),
+            # still_on_hub=still_on_hub,
+            # architecture=architecture
         )

     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+        request_file = get_request_file_for_model(requests_path, self.full_model)

         try:
             with open(request_file, "r") as f:
@@ -112,27 +112,25 @@ class EvalResult:
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
+            # AutoEvalColumn.precision.name: self.precision.value.name,
+            # AutoEvalColumn.model_type.name: self.model_type.value.name,
+            # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            # AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
+            # AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            # AutoEvalColumn.license.name: self.license,
+            # AutoEvalColumn.likes.name: self.likes,
+            # AutoEvalColumn.params.name: self.num_params,
+            # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
-
         for task in Tasks:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
         return data_dict


-def get_request_file_for_model(requests_path, model_name, precision):
+def get_request_file_for_model(requests_path, model_name):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
         requests_path,
@@ -148,7 +146,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
             req_content = json.load(f)
             if (
                 req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
+                # and req_content["precision"] == precision.split(".")[-1]
             ):
                 request_file = tmp_request_file
     return request_file
@@ -176,7 +174,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
+        # eval_result.update_with_request_file(requests_path)

         # Store results of same eval together
         eval_name = eval_result.eval_name
@@ -187,10 +185,12 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:

     results = []
     for v in eval_results.values():
+        # print(v)
         try:
             v.to_dict() # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
+            # print(v)
             continue
-
+    # print("RES", results)
     return results
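To make the effect of these changes concrete, here is a standalone sketch of the trimmed-down parsing flow: no precision or request-file handling, only the model name, per-task scores, and their average. The sample JSON content and metric layout are invented for illustration; only the key names read from "config" mirror init_from_json_file.

import json

# Hypothetical results payload, shaped like config/results keys read above
sample = json.loads("""
{
  "config": {"model_name": "org-name/detector-a"},
  "results": {"miragenews": {"acc": 0.92}, "genimage": {"acc": 0.88}}
}
""")

config = sample.get("config")
org_and_model = config.get("model_name", config.get("model_args", None)).split("/", 1)

if len(org_and_model) == 1:
    org, model = None, org_and_model[0]
    result_key = f"{model}"
else:
    org, model = org_and_model
    result_key = f"{org}_{model}"
full_model = "/".join(org_and_model)

# Per-benchmark scores, then the simple average used for the "Average ⬆️" column
benchmarks = ["miragenews", "genimage"]
results = {b: sample["results"][b]["acc"] for b in benchmarks}
average = sum(v for v in results.values() if v is not None) / len(benchmarks)
print(result_key, full_model, results, average)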