lrl-modelcloud committed • Commit 4ddfe5a • Parent: 8148b4d

Update README.md

README.md CHANGED
## lm-eval benchmark

```
|                 Tasks                 |Version|Filter|n-shot|  Metric  |   |Value |   |Stderr|
|---------------------------------------|------:|------|-----:|----------|---|-----:|---|-----:|
|arc_challenge                          |      1|none  |     0|acc       |↑  |0.5171|±  |0.0146|
|                                       |       |none  |     0|acc_norm  |↑  |0.5290|±  |0.0146|
|arc_easy                               |      1|none  |     0|acc       |↑  |0.8068|±  |0.0081|
|                                       |       |none  |     0|acc_norm  |↑  |0.7837|±  |0.0084|
|boolq                                  |      2|none  |     0|acc       |↑  |0.8232|±  |0.0067|
|hellaswag                              |      1|none  |     0|acc       |↑  |0.5787|±  |0.0049|
|                                       |       |none  |     0|acc_norm  |↑  |0.7765|±  |0.0042|
|lambada_openai                         |      1|none  |     0|acc       |↑  |0.7091|±  |0.0063|
|                                       |       |none  |     0|perplexity|↓  |3.6297|±  |0.0805|
|mmlu                                   |      1|none  |      |acc       |↑  |0.6421|±  |0.0039|
| - humanities                          |      1|none  |      |acc       |↑  |0.5932|±  |0.0069|
|  - formal_logic                       |      0|none  |     0|acc       |↑  |0.4206|±  |0.0442|
|  - high_school_european_history       |      0|none  |     0|acc       |↑  |0.7030|±  |0.0357|
|  - high_school_us_history             |      0|none  |     0|acc       |↑  |0.8039|±  |0.0279|
|  - high_school_world_history          |      0|none  |     0|acc       |↑  |0.8228|±  |0.0249|
|  - international_law                  |      0|none  |     0|acc       |↑  |0.7686|±  |0.0385|
|  - jurisprudence                      |      0|none  |     0|acc       |↑  |0.7685|±  |0.0408|
|  - logical_fallacies                  |      0|none  |     0|acc       |↑  |0.7914|±  |0.0319|
|  - moral_disputes                     |      0|none  |     0|acc       |↑  |0.7110|±  |0.0244|
|  - moral_scenarios                    |      0|none  |     0|acc       |↑  |0.4536|±  |0.0167|
|  - philosophy                         |      0|none  |     0|acc       |↑  |0.6913|±  |0.0262|
|  - prehistory                         |      0|none  |     0|acc       |↑  |0.7037|±  |0.0254|
|  - professional_law                   |      0|none  |     0|acc       |↑  |0.4739|±  |0.0128|
|  - world_religions                    |      0|none  |     0|acc       |↑  |0.7953|±  |0.0309|
| - other                               |      1|none  |      |acc       |↑  |0.7036|±  |0.0079|
|  - business_ethics                    |      0|none  |     0|acc       |↑  |0.6400|±  |0.0482|
|  - clinical_knowledge                 |      0|none  |     0|acc       |↑  |0.7094|±  |0.0279|
|  - college_medicine                   |      0|none  |     0|acc       |↑  |0.6358|±  |0.0367|
|  - global_facts                       |      0|none  |     0|acc       |↑  |0.3400|±  |0.0476|
|  - human_aging                        |      0|none  |     0|acc       |↑  |0.6457|±  |0.0321|
|  - management                         |      0|none  |     0|acc       |↑  |0.8544|±  |0.0349|
|  - marketing                          |      0|none  |     0|acc       |↑  |0.8761|±  |0.0216|
|  - medical_genetics                   |      0|none  |     0|acc       |↑  |0.7300|±  |0.0446|
|  - miscellaneous                      |      0|none  |     0|acc       |↑  |0.8148|±  |0.0139|
|  - nutrition                          |      0|none  |     0|acc       |↑  |0.7092|±  |0.0260|
|  - professional_accounting            |      0|none  |     0|acc       |↑  |0.5071|±  |0.0298|
|  - professional_medicine              |      0|none  |     0|acc       |↑  |0.7316|±  |0.0269|
|  - virology                           |      0|none  |     0|acc       |↑  |0.5000|±  |0.0389|
| - social sciences                     |      1|none  |      |acc       |↑  |0.7390|±  |0.0077|
|  - econometrics                       |      0|none  |     0|acc       |↑  |0.4561|±  |0.0469|
|  - high_school_geography              |      0|none  |     0|acc       |↑  |0.8333|±  |0.0266|
|  - high_school_government_and_politics|      0|none  |     0|acc       |↑  |0.8808|±  |0.0234|
|  - high_school_macroeconomics         |      0|none  |     0|acc       |↑  |0.6231|±  |0.0246|
|  - high_school_microeconomics         |      0|none  |     0|acc       |↑  |0.7437|±  |0.0284|
|  - high_school_psychology             |      0|none  |     0|acc       |↑  |0.8404|±  |0.0157|
|  - human_sexuality                    |      0|none  |     0|acc       |↑  |0.7481|±  |0.0381|
|  - professional_psychology            |      0|none  |     0|acc       |↑  |0.6814|±  |0.0189|
|  - public_relations                   |      0|none  |     0|acc       |↑  |0.6455|±  |0.0458|
|  - security_studies                   |      0|none  |     0|acc       |↑  |0.7143|±  |0.0289|
|  - sociology                          |      0|none  |     0|acc       |↑  |0.8259|±  |0.0268|
|  - us_foreign_policy                  |      0|none  |     0|acc       |↑  |0.8200|±  |0.0386|
| - stem                                |      1|none  |      |acc       |↑  |0.5601|±  |0.0085|
|  - abstract_algebra                   |      0|none  |     0|acc       |↑  |0.3500|±  |0.0479|
|  - anatomy                            |      0|none  |     0|acc       |↑  |0.6370|±  |0.0415|
|  - astronomy                          |      0|none  |     0|acc       |↑  |0.7566|±  |0.0349|
|  - college_biology                    |      0|none  |     0|acc       |↑  |0.7639|±  |0.0355|
|  - college_chemistry                  |      0|none  |     0|acc       |↑  |0.4800|±  |0.0502|
|  - college_computer_science           |      0|none  |     0|acc       |↑  |0.5000|±  |0.0503|
|  - college_mathematics                |      0|none  |     0|acc       |↑  |0.3200|±  |0.0469|
|  - college_physics                    |      0|none  |     0|acc       |↑  |0.4020|±  |0.0488|
|  - computer_security                  |      0|none  |     0|acc       |↑  |0.7600|±  |0.0429|
|  - conceptual_physics                 |      0|none  |     0|acc       |↑  |0.5574|±  |0.0325|
|  - electrical_engineering             |      0|none  |     0|acc       |↑  |0.6345|±  |0.0401|
|  - elementary_mathematics             |      0|none  |     0|acc       |↑  |0.4921|±  |0.0257|
|  - high_school_biology                |      0|none  |     0|acc       |↑  |0.7710|±  |0.0239|
|  - high_school_chemistry              |      0|none  |     0|acc       |↑  |0.5665|±  |0.0349|
|  - high_school_computer_science       |      0|none  |     0|acc       |↑  |0.7000|±  |0.0461|
|  - high_school_mathematics            |      0|none  |     0|acc       |↑  |0.4074|±  |0.0300|
|  - high_school_physics                |      0|none  |     0|acc       |↑  |0.4172|±  |0.0403|
|  - high_school_statistics             |      0|none  |     0|acc       |↑  |0.5278|±  |0.0340|
|  - machine_learning                   |      0|none  |     0|acc       |↑  |0.4732|±  |0.0474|
|openbookqa                             |      1|none  |     0|acc       |↑  |0.3360|±  |0.0211|
|                                       |       |none  |     0|acc_norm  |↑  |0.4220|±  |0.0221|
|piqa                                   |      1|none  |     0|acc       |↑  |0.7943|±  |0.0094|
|                                       |       |none  |     0|acc_norm  |↑  |0.7965|±  |0.0094|
|rte                                    |      1|none  |     0|acc       |↑  |0.6968|±  |0.0277|
|truthfulqa_mc1                         |      2|none  |     0|acc       |↑  |0.3439|±  |0.0166|
|winogrande                             |      1|none  |     0|acc       |↑  |0.7364|±  |0.0124|

|      Groups      |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
|------------------|------:|------|------|------|---|-----:|---|-----:|
|mmlu              |      1|none  |      |acc   |↑  |0.6421|±  |0.0039|
| - humanities     |      1|none  |      |acc   |↑  |0.5932|±  |0.0069|
| - other          |      1|none  |      |acc   |↑  |0.7036|±  |0.0079|
| - social sciences|      1|none  |      |acc   |↑  |0.7390|±  |0.0077|
| - stem           |      1|none  |      |acc   |↑  |0.5601|±  |0.0085|
```
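Tables in this format are produced by EleutherAI's lm-evaluation-harness. Below is a minimal reproduction sketch using its Python API; `MODEL_ID` is a placeholder (substitute this repository's actual model id), and settings such as `batch_size` are assumptions, not values confirmed by this README.

```python
# Minimal sketch of rerunning the benchmark above with EleutherAI's
# lm-evaluation-harness (pip install lm-eval). MODEL_ID is a placeholder;
# substitute this repository's actual model id.
import lm_eval

MODEL_ID = "your-org/your-model"  # placeholder, not confirmed by this README

results = lm_eval.simple_evaluate(
    model="hf",                        # Hugging Face transformers backend
    model_args=f"pretrained={MODEL_ID}",
    tasks=[
        "arc_challenge", "arc_easy", "boolq", "hellaswag",
        "lambada_openai", "mmlu", "openbookqa", "piqa",
        "rte", "truthfulqa_mc1", "winogrande",
    ],
    num_fewshot=0,                     # matches the n-shot column above
    batch_size=8,                      # assumption; tune for your hardware
)

# Per-task metrics (acc, acc_norm, perplexity, ...) live under results["results"].
for task, metrics in results["results"].items():
    print(task, metrics)
```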