pszemraj committed
Commit 18191c0
Parent: 8d2d40e

Model save
README.md ADDED
@@ -0,0 +1,69 @@
+ ---
+ license: apache-2.0
+ base_model: deepmind/language-perceiver
+ tags:
+ - generated_from_trainer
+ metrics:
+ - f1
+ model-index:
+ - name: language-perceiver-goodreads-bookgenres-Book_cls-8e
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # language-perceiver-goodreads-bookgenres-Book_cls-8e
+
+ This model is a fine-tuned version of [deepmind/language-perceiver](https://huggingface.co/deepmind/language-perceiver) on an unknown dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.2832
+ - F1: 0.5108
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 2e-05
+ - train_batch_size: 32
+ - eval_batch_size: 32
+ - seed: 42
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 128
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - num_epochs: 8.0
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | F1     |
+ |:-------------:|:-----:|:----:|:---------------:|:------:|
+ | 0.3059        | 1.0   | 62   | 0.2893          | 0.3263 |
+ | 0.2879        | 2.0   | 124  | 0.2795          | 0.4290 |
+ | 0.2729        | 3.0   | 186  | 0.2730          | 0.4356 |
+ | 0.2606        | 4.0   | 248  | 0.2722          | 0.4590 |
+ | 0.2433        | 5.0   | 310  | 0.2747          | 0.4775 |
+ | 0.227         | 6.0   | 372  | 0.2777          | 0.4976 |
+ | 0.207         | 7.0   | 434  | 0.2814          | 0.5088 |
+ | 0.1969        | 8.0   | 496  | 0.2832          | 0.5108 |
+
+
+ ### Framework versions
+
+ - Transformers 4.33.3
+ - Pytorch 2.2.0.dev20231001+cu121
+ - Datasets 2.14.5
+ - Tokenizers 0.13.3
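
The hyperparameter list in the card above maps directly onto `transformers` `TrainingArguments`. Below is a hedged reconstruction, not the author's actual training script: `output_dir` and `evaluation_strategy` are assumptions; the rest is copied from the card, and the listed Adam betas/epsilon are the library defaults, so they need no explicit arguments.

```python
# Hedged reconstruction of the run configuration implied by the README
# hyperparameters. output_dir and evaluation_strategy are assumptions.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="language-perceiver-goodreads-bookgenres-Book_cls-8e",  # assumption
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    seed=42,
    gradient_accumulation_steps=4,  # 32 * 4 = total_train_batch_size of 128
    lr_scheduler_type="linear",
    num_train_epochs=8.0,
    evaluation_strategy="epoch",    # assumption: the results table logs one eval per epoch
)
```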
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "[MASK]": 128000
+ }
all_results.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "epoch": 4.97,
+   "eval_f1": 0.7095034823635139,
+   "eval_loss": 0.20497334003448486,
+   "eval_runtime": 16.7618,
+   "eval_samples": 989,
+   "eval_samples_per_second": 59.003,
+   "eval_steps_per_second": 3.699,
+   "train_loss": 0.20427038223762822,
+   "train_runtime": 2213.5462,
+   "train_samples": 7914,
+   "train_samples_per_second": 17.876,
+   "train_steps_per_second": 0.278
+ }
config.json ADDED
@@ -0,0 +1,86 @@
+ {
+   "_label_trainable_num_channels": 1024,
+   "_name_or_path": "deepmind/language-perceiver",
+   "architectures": [
+     "PerceiverForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "audio_samples_per_frame": 1920,
+   "cross_attention_shape_for_attention": "kv",
+   "cross_attention_widening_factor": 1,
+   "d_latents": 1280,
+   "d_model": 768,
+   "finetuning_task": "text-classification",
+   "hidden_act": "gelu",
+   "id2label": {
+     "0": "History & Politics",
+     "1": "Health & Medicine",
+     "2": "Mystery & Thriller",
+     "3": "Arts & Design",
+     "4": "Self-Help & Wellness",
+     "5": "Sports & Recreation",
+     "6": "Non-Fiction",
+     "7": "Science Fiction & Fantasy",
+     "8": "Countries & Geography",
+     "9": "Other",
+     "10": "Nature & Environment",
+     "11": "Business & Finance",
+     "12": "Romance",
+     "13": "Philosophy & Religion",
+     "14": "Literature & Fiction",
+     "15": "Science & Technology",
+     "16": "Children & Young Adult",
+     "17": "Food & Cooking"
+   },
+   "image_size": 56,
+   "initializer_range": 0.02,
+   "label2id": {
+     "Arts & Design": 3,
+     "Business & Finance": 11,
+     "Children & Young Adult": 16,
+     "Countries & Geography": 8,
+     "Food & Cooking": 17,
+     "Health & Medicine": 1,
+     "History & Politics": 0,
+     "Literature & Fiction": 14,
+     "Mystery & Thriller": 2,
+     "Nature & Environment": 10,
+     "Non-Fiction": 6,
+     "Other": 9,
+     "Philosophy & Religion": 13,
+     "Romance": 12,
+     "Science & Technology": 15,
+     "Science Fiction & Fantasy": 7,
+     "Self-Help & Wellness": 4,
+     "Sports & Recreation": 5
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 2048,
+   "model_type": "perceiver",
+   "num_blocks": 1,
+   "num_cross_attention_heads": 8,
+   "num_frames": 16,
+   "num_latents": 256,
+   "num_self_attends_per_block": 26,
+   "num_self_attention_heads": 8,
+   "output_num_channels": 512,
+   "output_shape": [
+     1,
+     16,
+     224,
+     224
+   ],
+   "problem_type": "multi_label_classification",
+   "qk_channels": 256,
+   "samples_per_patch": 16,
+   "self_attention_widening_factor": 1,
+   "torch_dtype": "float32",
+   "train_size": [
+     368,
+     496
+   ],
+   "transformers_version": "4.33.3",
+   "use_query_residual": true,
+   "v_channels": 1280,
+   "vocab_size": 262
+ }
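
The config sets `problem_type` to `multi_label_classification`, so the head is trained with a per-label sigmoid rather than a softmax, and several genres can fire at once. A minimal inference sketch under that assumption, using the `id2label` map above; the repo id is inferred from the commit author and model name, so adjust it if the checkpoint lives elsewhere:

```python
# Minimal multi-label inference sketch; the repo id below is an assumption.
import torch
from transformers import PerceiverTokenizer, PerceiverForSequenceClassification

repo_id = "pszemraj/language-perceiver-goodreads-bookgenres-Book_cls-8e"  # assumed path
tokenizer = PerceiverTokenizer.from_pretrained(repo_id)
model = PerceiverForSequenceClassification.from_pretrained(repo_id)
model.eval()

text = "A detective races to stop a serial killer before he strikes again."
enc = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")

with torch.no_grad():
    logits = model(inputs=enc["input_ids"], attention_mask=enc["attention_mask"]).logits

# Multi-label head: sigmoid each of the 18 logits and threshold independently.
probs = torch.sigmoid(logits)[0]
genres = [model.config.id2label[i] for i, p in enumerate(probs.tolist()) if p > 0.5]
print(genres)  # threshold 0.5 is only a starting point; tune it on validation data
```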
eval_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "epoch": 4.97,
+   "eval_f1": 0.7095034823635139,
+   "eval_loss": 0.20497334003448486,
+   "eval_runtime": 16.7618,
+   "eval_samples": 989,
+   "eval_samples_per_second": 59.003,
+   "eval_steps_per_second": 3.699
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9312728a4bb10a39a19e2e83c6166b9dc3ec22deee8d403c62ad974aaa094294
+ size 824536032
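
These three lines are a Git LFS pointer, not the weights themselves: `oid` is the SHA-256 of the real file and `size` its byte count (~825 MB). `huggingface_hub` resolves the pointer transparently; a small sketch, with the repo id again an assumption:

```python
# Fetch the actual weights behind the LFS pointer (repo id is an assumption).
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="pszemraj/language-perceiver-goodreads-bookgenres-Book_cls-8e",
    filename="model.safetensors",
)
print(path)  # local cache path to the 824,536,032-byte file
```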
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ffc07d47a48c3dcf3716053c4708ce476a72b2c1f5acf526b395d55c4bdaafb1
+ size 567671666
special_tokens_map.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "bos_token": {
+     "content": "[BOS]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[EOS]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fefb02b667a6c5c2fe27602d28e5fb3428f66ab89c7d6f388e7c8d44a02d0336
+ size 760289
spm.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+ size 2464616
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,53 @@
+ {
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "[BOS]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": {
+     "__type": "AddedToken",
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "[EOS]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "model_max_length": 2048,
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "__type": "AddedToken",
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "tokenizer_class": "PerceiverTokenizer"
+ }
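
`PerceiverTokenizer` is byte-level: the config's `vocab_size` of 262 is 256 UTF-8 byte values plus the six special tokens defined above, and `model_max_length` is 2048 bytes. A quick sanity check:

```python
# Byte-level tokenization check: one id per UTF-8 byte (plus special tokens).
from transformers import PerceiverTokenizer

tok = PerceiverTokenizer.from_pretrained("deepmind/language-perceiver")
print(tok.vocab_size)   # 262 = 256 byte values + 6 special tokens
ids = tok("héllo").input_ids
print(len(ids))         # 'é' is two UTF-8 bytes, so there are more ids than characters
```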
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 4.97,
+   "train_loss": 0.20427038223762822,
+   "train_runtime": 2213.5462,
+   "train_samples": 7914,
+   "train_samples_per_second": 17.876,
+   "train_steps_per_second": 0.278
+ }
trainer_state.json ADDED
@@ -0,0 +1,439 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 4.96969696969697,
+   "eval_steps": 500,
+   "global_step": 615,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.08,
+       "learning_rate": 1.9674796747967483e-05,
+       "loss": 0.5781,
+       "step": 10
+     },
+     {
+       "epoch": 0.16,
+       "learning_rate": 1.934959349593496e-05,
+       "loss": 0.4087,
+       "step": 20
+     },
+     {
+       "epoch": 0.24,
+       "learning_rate": 1.902439024390244e-05,
+       "loss": 0.3448,
+       "step": 30
+     },
+     {
+       "epoch": 0.32,
+       "learning_rate": 1.869918699186992e-05,
+       "loss": 0.322,
+       "step": 40
+     },
+     {
+       "epoch": 0.4,
+       "learning_rate": 1.83739837398374e-05,
+       "loss": 0.3095,
+       "step": 50
+     },
+     {
+       "epoch": 0.48,
+       "learning_rate": 1.804878048780488e-05,
+       "loss": 0.3056,
+       "step": 60
+     },
+     {
+       "epoch": 0.57,
+       "learning_rate": 1.772357723577236e-05,
+       "loss": 0.2888,
+       "step": 70
+     },
+     {
+       "epoch": 0.65,
+       "learning_rate": 1.739837398373984e-05,
+       "loss": 0.2753,
+       "step": 80
+     },
+     {
+       "epoch": 0.73,
+       "learning_rate": 1.7073170731707317e-05,
+       "loss": 0.2737,
+       "step": 90
+     },
+     {
+       "epoch": 0.81,
+       "learning_rate": 1.6747967479674798e-05,
+       "loss": 0.2633,
+       "step": 100
+     },
+     {
+       "epoch": 0.89,
+       "learning_rate": 1.642276422764228e-05,
+       "loss": 0.2583,
+       "step": 110
+     },
+     {
+       "epoch": 0.97,
+       "learning_rate": 1.6097560975609757e-05,
+       "loss": 0.2518,
+       "step": 120
+     },
+     {
+       "epoch": 0.99,
+       "eval_f1": 0.5708365708365708,
+       "eval_loss": 0.23673325777053833,
+       "eval_runtime": 17.0069,
+       "eval_samples_per_second": 58.153,
+       "eval_steps_per_second": 3.646,
+       "step": 123
+     },
+     {
+       "epoch": 1.05,
+       "learning_rate": 1.5772357723577235e-05,
+       "loss": 0.2427,
+       "step": 130
+     },
+     {
+       "epoch": 1.13,
+       "learning_rate": 1.5447154471544717e-05,
+       "loss": 0.2416,
+       "step": 140
+     },
+     {
+       "epoch": 1.21,
+       "learning_rate": 1.5121951219512196e-05,
+       "loss": 0.2285,
+       "step": 150
+     },
+     {
+       "epoch": 1.29,
+       "learning_rate": 1.4796747967479676e-05,
+       "loss": 0.2263,
+       "step": 160
+     },
+     {
+       "epoch": 1.37,
+       "learning_rate": 1.4471544715447157e-05,
+       "loss": 0.2241,
+       "step": 170
+     },
+     {
+       "epoch": 1.45,
+       "learning_rate": 1.4146341463414635e-05,
+       "loss": 0.2254,
+       "step": 180
+     },
+     {
+       "epoch": 1.54,
+       "learning_rate": 1.3821138211382115e-05,
+       "loss": 0.2213,
+       "step": 190
+     },
+     {
+       "epoch": 1.62,
+       "learning_rate": 1.3495934959349594e-05,
+       "loss": 0.2155,
+       "step": 200
+     },
+     {
+       "epoch": 1.7,
+       "learning_rate": 1.3170731707317076e-05,
+       "loss": 0.2201,
+       "step": 210
+     },
+     {
+       "epoch": 1.78,
+       "learning_rate": 1.2845528455284555e-05,
+       "loss": 0.2167,
+       "step": 220
+     },
+     {
+       "epoch": 1.86,
+       "learning_rate": 1.2520325203252033e-05,
+       "loss": 0.215,
+       "step": 230
+     },
+     {
+       "epoch": 1.94,
+       "learning_rate": 1.2195121951219513e-05,
+       "loss": 0.2111,
+       "step": 240
+     },
+     {
+       "epoch": 2.0,
+       "eval_f1": 0.6588750913075239,
+       "eval_loss": 0.2085004448890686,
+       "eval_runtime": 17.0139,
+       "eval_samples_per_second": 58.129,
+       "eval_steps_per_second": 3.644,
+       "step": 247
+     },
+     {
+       "epoch": 2.02,
+       "learning_rate": 1.1869918699186992e-05,
+       "loss": 0.1933,
+       "step": 250
+     },
+     {
+       "epoch": 2.1,
+       "learning_rate": 1.1544715447154474e-05,
+       "loss": 0.1915,
+       "step": 260
+     },
+     {
+       "epoch": 2.18,
+       "learning_rate": 1.1219512195121953e-05,
+       "loss": 0.187,
+       "step": 270
+     },
+     {
+       "epoch": 2.26,
+       "learning_rate": 1.0894308943089431e-05,
+       "loss": 0.1847,
+       "step": 280
+     },
+     {
+       "epoch": 2.34,
+       "learning_rate": 1.0569105691056911e-05,
+       "loss": 0.1868,
+       "step": 290
+     },
+     {
+       "epoch": 2.42,
+       "learning_rate": 1.024390243902439e-05,
+       "loss": 0.1835,
+       "step": 300
+     },
+     {
+       "epoch": 2.51,
+       "learning_rate": 9.91869918699187e-06,
+       "loss": 0.1813,
+       "step": 310
+     },
+     {
+       "epoch": 2.59,
+       "learning_rate": 9.59349593495935e-06,
+       "loss": 0.1819,
+       "step": 320
+     },
+     {
+       "epoch": 2.67,
+       "learning_rate": 9.268292682926831e-06,
+       "loss": 0.1746,
+       "step": 330
+     },
+     {
+       "epoch": 2.75,
+       "learning_rate": 8.94308943089431e-06,
+       "loss": 0.181,
+       "step": 340
+     },
+     {
+       "epoch": 2.83,
+       "learning_rate": 8.617886178861789e-06,
+       "loss": 0.1844,
+       "step": 350
+     },
+     {
+       "epoch": 2.91,
+       "learning_rate": 8.292682926829268e-06,
+       "loss": 0.169,
+       "step": 360
+     },
+     {
+       "epoch": 2.99,
+       "learning_rate": 7.967479674796748e-06,
+       "loss": 0.1833,
+       "step": 370
+     },
+     {
+       "epoch": 3.0,
+       "eval_f1": 0.6790123456790124,
+       "eval_loss": 0.2064265012741089,
+       "eval_runtime": 16.9971,
+       "eval_samples_per_second": 58.186,
+       "eval_steps_per_second": 3.648,
+       "step": 371
+     },
+     {
+       "epoch": 3.07,
+       "learning_rate": 7.64227642276423e-06,
+       "loss": 0.1625,
+       "step": 380
+     },
+     {
+       "epoch": 3.15,
+       "learning_rate": 7.317073170731707e-06,
+       "loss": 0.1617,
+       "step": 390
+     },
+     {
+       "epoch": 3.23,
+       "learning_rate": 6.991869918699188e-06,
+       "loss": 0.1582,
+       "step": 400
+     },
+     {
+       "epoch": 3.31,
+       "learning_rate": 6.666666666666667e-06,
+       "loss": 0.1584,
+       "step": 410
+     },
+     {
+       "epoch": 3.39,
+       "learning_rate": 6.341463414634147e-06,
+       "loss": 0.1512,
+       "step": 420
+     },
+     {
+       "epoch": 3.47,
+       "learning_rate": 6.016260162601627e-06,
+       "loss": 0.1574,
+       "step": 430
+     },
+     {
+       "epoch": 3.56,
+       "learning_rate": 5.691056910569106e-06,
+       "loss": 0.1564,
+       "step": 440
+     },
+     {
+       "epoch": 3.64,
+       "learning_rate": 5.365853658536586e-06,
+       "loss": 0.1584,
+       "step": 450
+     },
+     {
+       "epoch": 3.72,
+       "learning_rate": 5.040650406504065e-06,
+       "loss": 0.1495,
+       "step": 460
+     },
+     {
+       "epoch": 3.8,
+       "learning_rate": 4.715447154471545e-06,
+       "loss": 0.1591,
+       "step": 470
+     },
+     {
+       "epoch": 3.88,
+       "learning_rate": 4.390243902439025e-06,
+       "loss": 0.1564,
+       "step": 480
+     },
+     {
+       "epoch": 3.96,
+       "learning_rate": 4.0650406504065046e-06,
+       "loss": 0.1485,
+       "step": 490
+     },
+     {
+       "epoch": 4.0,
+       "eval_f1": 0.7020743104627308,
+       "eval_loss": 0.20207864046096802,
+       "eval_runtime": 17.0075,
+       "eval_samples_per_second": 58.151,
+       "eval_steps_per_second": 3.645,
+       "step": 495
+     },
+     {
+       "epoch": 4.04,
+       "learning_rate": 3.7398373983739838e-06,
+       "loss": 0.1477,
+       "step": 500
+     },
+     {
+       "epoch": 4.12,
+       "learning_rate": 3.414634146341464e-06,
+       "loss": 0.1406,
+       "step": 510
+     },
+     {
+       "epoch": 4.2,
+       "learning_rate": 3.0894308943089435e-06,
+       "loss": 0.1383,
+       "step": 520
+     },
+     {
+       "epoch": 4.28,
+       "learning_rate": 2.764227642276423e-06,
+       "loss": 0.1374,
+       "step": 530
+     },
+     {
+       "epoch": 4.36,
+       "learning_rate": 2.4390243902439027e-06,
+       "loss": 0.1282,
+       "step": 540
+     },
+     {
+       "epoch": 4.44,
+       "learning_rate": 2.1138211382113824e-06,
+       "loss": 0.1338,
+       "step": 550
+     },
+     {
+       "epoch": 4.53,
+       "learning_rate": 1.788617886178862e-06,
+       "loss": 0.1397,
+       "step": 560
+     },
+     {
+       "epoch": 4.61,
+       "learning_rate": 1.4634146341463414e-06,
+       "loss": 0.143,
+       "step": 570
+     },
+     {
+       "epoch": 4.69,
+       "learning_rate": 1.1382113821138213e-06,
+       "loss": 0.1392,
+       "step": 580
+     },
+     {
+       "epoch": 4.77,
+       "learning_rate": 8.130081300813009e-07,
+       "loss": 0.1374,
+       "step": 590
+     },
+     {
+       "epoch": 4.85,
+       "learning_rate": 4.878048780487805e-07,
+       "loss": 0.1401,
+       "step": 600
+     },
+     {
+       "epoch": 4.93,
+       "learning_rate": 1.6260162601626018e-07,
+       "loss": 0.1382,
+       "step": 610
+     },
+     {
+       "epoch": 4.97,
+       "eval_f1": 0.7095034823635139,
+       "eval_loss": 0.20497334003448486,
+       "eval_runtime": 17.0074,
+       "eval_samples_per_second": 58.151,
+       "eval_steps_per_second": 3.645,
+       "step": 615
+     },
+     {
+       "epoch": 4.97,
+       "step": 615,
+       "total_flos": 3.666068136773222e+16,
+       "train_loss": 0.20427038223762822,
+       "train_runtime": 2213.5462,
+       "train_samples_per_second": 17.876,
+       "train_steps_per_second": 0.278
+     }
+   ],
+   "logging_steps": 10,
+   "max_steps": 615,
+   "num_train_epochs": 5,
+   "save_steps": 500,
+   "total_flos": 3.666068136773222e+16,
+   "trial_name": null,
+   "trial_params": null
+ }
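
`log_history` interleaves a training record every 10 steps (carrying `loss` and `learning_rate`) with one evaluation record per epoch (carrying `eval_*` keys). A small sketch for extracting the eval curve from a local copy of this file:

```python
# Pull the per-epoch eval metrics out of trainer_state.json (assumes a local copy).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_loss" in entry:  # evaluation records; training records only carry "loss"
        print(f"epoch {entry['epoch']:.2f}  step {entry['step']:4d}  "
              f"loss {entry['eval_loss']:.4f}  f1 {entry['eval_f1']:.4f}")
```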
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:05febb53989d3609718ff89cfe07adccc4fe13af016c2a02e54484aa6e188756
+ size 4600
vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.txt ADDED
The diff for this file is too large to render. See raw diff