KaushalB commited on
Commit
b41ee08
1 Parent(s): 9b6b5cf

Upload 12 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ state.db filter=lfs diff=lfs merge=lfs -text
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.7354497354497355,
4
+ "eval_loss": 0.8083848357200623,
5
+ "eval_runtime": 1.8368,
6
+ "eval_samples_per_second": 102.896,
7
+ "eval_steps_per_second": 6.533,
8
+ "total_flos": 2.681093741830963e+18,
9
+ "train_loss": 0.29336747460895113,
10
+ "train_runtime": 795.2059,
11
+ "train_samples_per_second": 42.681,
12
+ "train_steps_per_second": 1.358
13
+ }
config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch32-224-in21k",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "afro",
13
+ "1": "classical",
14
+ "10": "reggae",
15
+ "11": "rock",
16
+ "2": "country",
17
+ "3": "disco",
18
+ "4": "electro",
19
+ "5": "jazz",
20
+ "6": "latin",
21
+ "7": "metal",
22
+ "8": "pop",
23
+ "9": "rap"
24
+ },
25
+ "image_size": 224,
26
+ "initializer_range": 0.02,
27
+ "intermediate_size": 3072,
28
+ "label2id": {
29
+ "afro": "0",
30
+ "classical": "1",
31
+ "country": "2",
32
+ "disco": "3",
33
+ "electro": "4",
34
+ "jazz": "5",
35
+ "latin": "6",
36
+ "metal": "7",
37
+ "pop": "8",
38
+ "rap": "9",
39
+ "reggae": "10",
40
+ "rock": "11"
41
+ },
42
+ "layer_norm_eps": 1e-12,
43
+ "model_type": "vit",
44
+ "num_attention_heads": 12,
45
+ "num_channels": 3,
46
+ "num_hidden_layers": 12,
47
+ "patch_size": 32,
48
+ "problem_type": "single_label_classification",
49
+ "qkv_bias": true,
50
+ "torch_dtype": "float32",
51
+ "transformers_version": "4.39.3"
52
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.7354497354497355,
4
+ "eval_loss": 0.8083848357200623,
5
+ "eval_runtime": 1.8368,
6
+ "eval_samples_per_second": 102.896,
7
+ "eval_steps_per_second": 6.533
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8725ef095ab642a6726508ccb349c58f1cf4b687bfbe4891c0c80de6b55bf34
3
+ size 349881056
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0ea76f7d71a48349ddcc05f83ddbf8f8f117f78a49c05dd6c38d1e6db47a9d1
3
+ size 699882938
preprocessor_config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_valid_processor_keys": [
3
+ "images",
4
+ "do_resize",
5
+ "size",
6
+ "resample",
7
+ "do_rescale",
8
+ "rescale_factor",
9
+ "do_normalize",
10
+ "image_mean",
11
+ "image_std",
12
+ "return_tensors",
13
+ "data_format",
14
+ "input_data_format"
15
+ ],
16
+ "do_normalize": true,
17
+ "do_rescale": true,
18
+ "do_resize": true,
19
+ "image_mean": [
20
+ 0.5,
21
+ 0.5,
22
+ 0.5
23
+ ],
24
+ "image_processor_type": "ViTImageProcessor",
25
+ "image_std": [
26
+ 0.5,
27
+ 0.5,
28
+ 0.5
29
+ ],
30
+ "resample": 2,
31
+ "rescale_factor": 0.00392156862745098,
32
+ "size": {
33
+ "height": 224,
34
+ "width": 224
35
+ }
36
+ }
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3bd3cafcd141485c5526689e7070ba65dab1e4639fbae44141ae41439003c1f
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9022742c9d6746d087ef37f0ee553659edde91d2108327185e4b66eabe4bca26
3
+ size 1064
state.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd10d354f3aa4aec0dae3940ee299145d3d33a6cd74f1f01704018e2903f2f3b
3
+ size 349940282
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "total_flos": 2.681093741830963e+18,
4
+ "train_loss": 0.29336747460895113,
5
+ "train_runtime": 795.2059,
6
+ "train_samples_per_second": 42.681,
7
+ "train_steps_per_second": 1.358
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1758 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8083848357200623,
3
+ "best_model_checkpoint": "./kaggle/working/eGTZANplus/checkpoint-220",
4
+ "epoch": 20.0,
5
+ "eval_steps": 10,
6
+ "global_step": 1080,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.19,
13
+ "grad_norm": 1.4207828044891357,
14
+ "learning_rate": 0.00019814814814814814,
15
+ "loss": 2.4003,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.19,
20
+ "eval_accuracy": 0.19576719576719576,
21
+ "eval_loss": 2.282846689224243,
22
+ "eval_runtime": 1.8415,
23
+ "eval_samples_per_second": 102.636,
24
+ "eval_steps_per_second": 6.517,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.37,
29
+ "grad_norm": 1.2310184240341187,
30
+ "learning_rate": 0.0001962962962962963,
31
+ "loss": 2.1703,
32
+ "step": 20
33
+ },
34
+ {
35
+ "epoch": 0.37,
36
+ "eval_accuracy": 0.35978835978835977,
37
+ "eval_loss": 1.9852432012557983,
38
+ "eval_runtime": 1.793,
39
+ "eval_samples_per_second": 105.41,
40
+ "eval_steps_per_second": 6.693,
41
+ "step": 20
42
+ },
43
+ {
44
+ "epoch": 0.56,
45
+ "grad_norm": 2.0627870559692383,
46
+ "learning_rate": 0.00019444444444444446,
47
+ "loss": 1.9696,
48
+ "step": 30
49
+ },
50
+ {
51
+ "epoch": 0.56,
52
+ "eval_accuracy": 0.3915343915343915,
53
+ "eval_loss": 1.8232808113098145,
54
+ "eval_runtime": 1.786,
55
+ "eval_samples_per_second": 105.821,
56
+ "eval_steps_per_second": 6.719,
57
+ "step": 30
58
+ },
59
+ {
60
+ "epoch": 0.74,
61
+ "grad_norm": 1.5207180976867676,
62
+ "learning_rate": 0.0001925925925925926,
63
+ "loss": 1.8051,
64
+ "step": 40
65
+ },
66
+ {
67
+ "epoch": 0.74,
68
+ "eval_accuracy": 0.48677248677248675,
69
+ "eval_loss": 1.6591798067092896,
70
+ "eval_runtime": 1.7501,
71
+ "eval_samples_per_second": 107.997,
72
+ "eval_steps_per_second": 6.857,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 0.93,
77
+ "grad_norm": 2.221734046936035,
78
+ "learning_rate": 0.00019074074074074075,
79
+ "loss": 1.6692,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.93,
84
+ "eval_accuracy": 0.582010582010582,
85
+ "eval_loss": 1.5287415981292725,
86
+ "eval_runtime": 1.7993,
87
+ "eval_samples_per_second": 105.039,
88
+ "eval_steps_per_second": 6.669,
89
+ "step": 50
90
+ },
91
+ {
92
+ "epoch": 1.11,
93
+ "grad_norm": 1.5369292497634888,
94
+ "learning_rate": 0.0001890740740740741,
95
+ "loss": 1.5283,
96
+ "step": 60
97
+ },
98
+ {
99
+ "epoch": 1.11,
100
+ "eval_accuracy": 0.5608465608465608,
101
+ "eval_loss": 1.4252889156341553,
102
+ "eval_runtime": 1.7582,
103
+ "eval_samples_per_second": 107.493,
104
+ "eval_steps_per_second": 6.825,
105
+ "step": 60
106
+ },
107
+ {
108
+ "epoch": 1.3,
109
+ "grad_norm": 1.9959373474121094,
110
+ "learning_rate": 0.00018722222222222222,
111
+ "loss": 1.3981,
112
+ "step": 70
113
+ },
114
+ {
115
+ "epoch": 1.3,
116
+ "eval_accuracy": 0.5925925925925926,
117
+ "eval_loss": 1.3883891105651855,
118
+ "eval_runtime": 1.7749,
119
+ "eval_samples_per_second": 106.485,
120
+ "eval_steps_per_second": 6.761,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 1.48,
125
+ "grad_norm": 2.101576805114746,
126
+ "learning_rate": 0.00018537037037037038,
127
+ "loss": 1.3047,
128
+ "step": 80
129
+ },
130
+ {
131
+ "epoch": 1.48,
132
+ "eval_accuracy": 0.5767195767195767,
133
+ "eval_loss": 1.356843113899231,
134
+ "eval_runtime": 1.7875,
135
+ "eval_samples_per_second": 105.735,
136
+ "eval_steps_per_second": 6.713,
137
+ "step": 80
138
+ },
139
+ {
140
+ "epoch": 1.67,
141
+ "grad_norm": 1.9240992069244385,
142
+ "learning_rate": 0.00018351851851851854,
143
+ "loss": 1.1325,
144
+ "step": 90
145
+ },
146
+ {
147
+ "epoch": 1.67,
148
+ "eval_accuracy": 0.6349206349206349,
149
+ "eval_loss": 1.2104465961456299,
150
+ "eval_runtime": 1.7741,
151
+ "eval_samples_per_second": 106.533,
152
+ "eval_steps_per_second": 6.764,
153
+ "step": 90
154
+ },
155
+ {
156
+ "epoch": 1.85,
157
+ "grad_norm": 1.6294556856155396,
158
+ "learning_rate": 0.00018166666666666667,
159
+ "loss": 1.2004,
160
+ "step": 100
161
+ },
162
+ {
163
+ "epoch": 1.85,
164
+ "eval_accuracy": 0.6137566137566137,
165
+ "eval_loss": 1.263272762298584,
166
+ "eval_runtime": 1.8419,
167
+ "eval_samples_per_second": 102.609,
168
+ "eval_steps_per_second": 6.515,
169
+ "step": 100
170
+ },
171
+ {
172
+ "epoch": 2.04,
173
+ "grad_norm": 4.842734336853027,
174
+ "learning_rate": 0.0001798148148148148,
175
+ "loss": 1.0475,
176
+ "step": 110
177
+ },
178
+ {
179
+ "epoch": 2.04,
180
+ "eval_accuracy": 0.5555555555555556,
181
+ "eval_loss": 1.3616496324539185,
182
+ "eval_runtime": 1.7824,
183
+ "eval_samples_per_second": 106.036,
184
+ "eval_steps_per_second": 6.732,
185
+ "step": 110
186
+ },
187
+ {
188
+ "epoch": 2.22,
189
+ "grad_norm": 1.8519538640975952,
190
+ "learning_rate": 0.00017796296296296296,
191
+ "loss": 0.9801,
192
+ "step": 120
193
+ },
194
+ {
195
+ "epoch": 2.22,
196
+ "eval_accuracy": 0.671957671957672,
197
+ "eval_loss": 1.1471754312515259,
198
+ "eval_runtime": 1.796,
199
+ "eval_samples_per_second": 105.234,
200
+ "eval_steps_per_second": 6.682,
201
+ "step": 120
202
+ },
203
+ {
204
+ "epoch": 2.41,
205
+ "grad_norm": 3.018026351928711,
206
+ "learning_rate": 0.00017611111111111112,
207
+ "loss": 0.862,
208
+ "step": 130
209
+ },
210
+ {
211
+ "epoch": 2.41,
212
+ "eval_accuracy": 0.6984126984126984,
213
+ "eval_loss": 1.0452642440795898,
214
+ "eval_runtime": 1.7578,
215
+ "eval_samples_per_second": 107.521,
216
+ "eval_steps_per_second": 6.827,
217
+ "step": 130
218
+ },
219
+ {
220
+ "epoch": 2.59,
221
+ "grad_norm": 2.8672127723693848,
222
+ "learning_rate": 0.00017425925925925928,
223
+ "loss": 0.8905,
224
+ "step": 140
225
+ },
226
+ {
227
+ "epoch": 2.59,
228
+ "eval_accuracy": 0.6825396825396826,
229
+ "eval_loss": 0.9718140363693237,
230
+ "eval_runtime": 1.8323,
231
+ "eval_samples_per_second": 103.148,
232
+ "eval_steps_per_second": 6.549,
233
+ "step": 140
234
+ },
235
+ {
236
+ "epoch": 2.78,
237
+ "grad_norm": 3.5106003284454346,
238
+ "learning_rate": 0.00017240740740740742,
239
+ "loss": 0.7839,
240
+ "step": 150
241
+ },
242
+ {
243
+ "epoch": 2.78,
244
+ "eval_accuracy": 0.6666666666666666,
245
+ "eval_loss": 1.0531541109085083,
246
+ "eval_runtime": 1.7655,
247
+ "eval_samples_per_second": 107.049,
248
+ "eval_steps_per_second": 6.797,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 2.96,
253
+ "grad_norm": 2.7532589435577393,
254
+ "learning_rate": 0.00017055555555555555,
255
+ "loss": 0.8304,
256
+ "step": 160
257
+ },
258
+ {
259
+ "epoch": 2.96,
260
+ "eval_accuracy": 0.6878306878306878,
261
+ "eval_loss": 0.96842360496521,
262
+ "eval_runtime": 1.8371,
263
+ "eval_samples_per_second": 102.881,
264
+ "eval_steps_per_second": 6.532,
265
+ "step": 160
266
+ },
267
+ {
268
+ "epoch": 3.15,
269
+ "grad_norm": 2.1222331523895264,
270
+ "learning_rate": 0.0001687037037037037,
271
+ "loss": 0.883,
272
+ "step": 170
273
+ },
274
+ {
275
+ "epoch": 3.15,
276
+ "eval_accuracy": 0.6931216931216931,
277
+ "eval_loss": 0.9298208951950073,
278
+ "eval_runtime": 1.7867,
279
+ "eval_samples_per_second": 105.782,
280
+ "eval_steps_per_second": 6.716,
281
+ "step": 170
282
+ },
283
+ {
284
+ "epoch": 3.33,
285
+ "grad_norm": 2.5858914852142334,
286
+ "learning_rate": 0.00016685185185185187,
287
+ "loss": 0.5714,
288
+ "step": 180
289
+ },
290
+ {
291
+ "epoch": 3.33,
292
+ "eval_accuracy": 0.6772486772486772,
293
+ "eval_loss": 0.9491019248962402,
294
+ "eval_runtime": 1.7856,
295
+ "eval_samples_per_second": 105.846,
296
+ "eval_steps_per_second": 6.72,
297
+ "step": 180
298
+ },
299
+ {
300
+ "epoch": 3.52,
301
+ "grad_norm": 1.7296024560928345,
302
+ "learning_rate": 0.000165,
303
+ "loss": 0.5209,
304
+ "step": 190
305
+ },
306
+ {
307
+ "epoch": 3.52,
308
+ "eval_accuracy": 0.6984126984126984,
309
+ "eval_loss": 0.914806604385376,
310
+ "eval_runtime": 1.7453,
311
+ "eval_samples_per_second": 108.289,
312
+ "eval_steps_per_second": 6.875,
313
+ "step": 190
314
+ },
315
+ {
316
+ "epoch": 3.7,
317
+ "grad_norm": 4.235101699829102,
318
+ "learning_rate": 0.00016314814814814816,
319
+ "loss": 0.5404,
320
+ "step": 200
321
+ },
322
+ {
323
+ "epoch": 3.7,
324
+ "eval_accuracy": 0.671957671957672,
325
+ "eval_loss": 1.0290465354919434,
326
+ "eval_runtime": 1.8123,
327
+ "eval_samples_per_second": 104.285,
328
+ "eval_steps_per_second": 6.621,
329
+ "step": 200
330
+ },
331
+ {
332
+ "epoch": 3.89,
333
+ "grad_norm": 3.8817615509033203,
334
+ "learning_rate": 0.0001612962962962963,
335
+ "loss": 0.6133,
336
+ "step": 210
337
+ },
338
+ {
339
+ "epoch": 3.89,
340
+ "eval_accuracy": 0.7142857142857143,
341
+ "eval_loss": 0.9116460680961609,
342
+ "eval_runtime": 1.7735,
343
+ "eval_samples_per_second": 106.57,
344
+ "eval_steps_per_second": 6.766,
345
+ "step": 210
346
+ },
347
+ {
348
+ "epoch": 4.07,
349
+ "grad_norm": 1.743445634841919,
350
+ "learning_rate": 0.00015944444444444445,
351
+ "loss": 0.4347,
352
+ "step": 220
353
+ },
354
+ {
355
+ "epoch": 4.07,
356
+ "eval_accuracy": 0.7354497354497355,
357
+ "eval_loss": 0.8083848357200623,
358
+ "eval_runtime": 1.8193,
359
+ "eval_samples_per_second": 103.884,
360
+ "eval_steps_per_second": 6.596,
361
+ "step": 220
362
+ },
363
+ {
364
+ "epoch": 4.26,
365
+ "grad_norm": 1.8867310285568237,
366
+ "learning_rate": 0.0001575925925925926,
367
+ "loss": 0.3659,
368
+ "step": 230
369
+ },
370
+ {
371
+ "epoch": 4.26,
372
+ "eval_accuracy": 0.7142857142857143,
373
+ "eval_loss": 0.890904426574707,
374
+ "eval_runtime": 1.7392,
375
+ "eval_samples_per_second": 108.672,
376
+ "eval_steps_per_second": 6.9,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 4.44,
381
+ "grad_norm": 2.56878399848938,
382
+ "learning_rate": 0.00015574074074074074,
383
+ "loss": 0.4439,
384
+ "step": 240
385
+ },
386
+ {
387
+ "epoch": 4.44,
388
+ "eval_accuracy": 0.6825396825396826,
389
+ "eval_loss": 0.9554860591888428,
390
+ "eval_runtime": 1.7559,
391
+ "eval_samples_per_second": 107.64,
392
+ "eval_steps_per_second": 6.834,
393
+ "step": 240
394
+ },
395
+ {
396
+ "epoch": 4.63,
397
+ "grad_norm": 1.9487425088882446,
398
+ "learning_rate": 0.0001538888888888889,
399
+ "loss": 0.3335,
400
+ "step": 250
401
+ },
402
+ {
403
+ "epoch": 4.63,
404
+ "eval_accuracy": 0.708994708994709,
405
+ "eval_loss": 0.931969404220581,
406
+ "eval_runtime": 1.8636,
407
+ "eval_samples_per_second": 101.417,
408
+ "eval_steps_per_second": 6.439,
409
+ "step": 250
410
+ },
411
+ {
412
+ "epoch": 4.81,
413
+ "grad_norm": 2.4911906719207764,
414
+ "learning_rate": 0.00015203703703703703,
415
+ "loss": 0.3695,
416
+ "step": 260
417
+ },
418
+ {
419
+ "epoch": 4.81,
420
+ "eval_accuracy": 0.7037037037037037,
421
+ "eval_loss": 0.9643996357917786,
422
+ "eval_runtime": 1.743,
423
+ "eval_samples_per_second": 108.437,
424
+ "eval_steps_per_second": 6.885,
425
+ "step": 260
426
+ },
427
+ {
428
+ "epoch": 5.0,
429
+ "grad_norm": 0.4799601137638092,
430
+ "learning_rate": 0.0001501851851851852,
431
+ "loss": 0.3018,
432
+ "step": 270
433
+ },
434
+ {
435
+ "epoch": 5.0,
436
+ "eval_accuracy": 0.6455026455026455,
437
+ "eval_loss": 1.1127641201019287,
438
+ "eval_runtime": 1.8057,
439
+ "eval_samples_per_second": 104.667,
440
+ "eval_steps_per_second": 6.646,
441
+ "step": 270
442
+ },
443
+ {
444
+ "epoch": 5.19,
445
+ "grad_norm": 0.8545930981636047,
446
+ "learning_rate": 0.00014833333333333335,
447
+ "loss": 0.2418,
448
+ "step": 280
449
+ },
450
+ {
451
+ "epoch": 5.19,
452
+ "eval_accuracy": 0.7301587301587301,
453
+ "eval_loss": 0.8752605319023132,
454
+ "eval_runtime": 1.7714,
455
+ "eval_samples_per_second": 106.698,
456
+ "eval_steps_per_second": 6.774,
457
+ "step": 280
458
+ },
459
+ {
460
+ "epoch": 5.37,
461
+ "grad_norm": 2.0490822792053223,
462
+ "learning_rate": 0.00014648148148148148,
463
+ "loss": 0.2305,
464
+ "step": 290
465
+ },
466
+ {
467
+ "epoch": 5.37,
468
+ "eval_accuracy": 0.7142857142857143,
469
+ "eval_loss": 0.9517038464546204,
470
+ "eval_runtime": 1.7422,
471
+ "eval_samples_per_second": 108.483,
472
+ "eval_steps_per_second": 6.888,
473
+ "step": 290
474
+ },
475
+ {
476
+ "epoch": 5.56,
477
+ "grad_norm": 1.5348315238952637,
478
+ "learning_rate": 0.00014462962962962962,
479
+ "loss": 0.238,
480
+ "step": 300
481
+ },
482
+ {
483
+ "epoch": 5.56,
484
+ "eval_accuracy": 0.7248677248677249,
485
+ "eval_loss": 0.9478802680969238,
486
+ "eval_runtime": 1.7999,
487
+ "eval_samples_per_second": 105.006,
488
+ "eval_steps_per_second": 6.667,
489
+ "step": 300
490
+ },
491
+ {
492
+ "epoch": 5.74,
493
+ "grad_norm": 2.6169273853302,
494
+ "learning_rate": 0.00014277777777777778,
495
+ "loss": 0.2099,
496
+ "step": 310
497
+ },
498
+ {
499
+ "epoch": 5.74,
500
+ "eval_accuracy": 0.671957671957672,
501
+ "eval_loss": 1.103389024734497,
502
+ "eval_runtime": 1.8453,
503
+ "eval_samples_per_second": 102.42,
504
+ "eval_steps_per_second": 6.503,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 5.93,
509
+ "grad_norm": 2.5781023502349854,
510
+ "learning_rate": 0.00014092592592592594,
511
+ "loss": 0.2284,
512
+ "step": 320
513
+ },
514
+ {
515
+ "epoch": 5.93,
516
+ "eval_accuracy": 0.6825396825396826,
517
+ "eval_loss": 1.031624674797058,
518
+ "eval_runtime": 1.7579,
519
+ "eval_samples_per_second": 107.517,
520
+ "eval_steps_per_second": 6.826,
521
+ "step": 320
522
+ },
523
+ {
524
+ "epoch": 6.11,
525
+ "grad_norm": 3.042239189147949,
526
+ "learning_rate": 0.0001390740740740741,
527
+ "loss": 0.1694,
528
+ "step": 330
529
+ },
530
+ {
531
+ "epoch": 6.11,
532
+ "eval_accuracy": 0.6613756613756614,
533
+ "eval_loss": 1.1174468994140625,
534
+ "eval_runtime": 1.7854,
535
+ "eval_samples_per_second": 105.856,
536
+ "eval_steps_per_second": 6.721,
537
+ "step": 330
538
+ },
539
+ {
540
+ "epoch": 6.3,
541
+ "grad_norm": 0.8211657404899597,
542
+ "learning_rate": 0.00013722222222222223,
543
+ "loss": 0.1715,
544
+ "step": 340
545
+ },
546
+ {
547
+ "epoch": 6.3,
548
+ "eval_accuracy": 0.6772486772486772,
549
+ "eval_loss": 1.1067023277282715,
550
+ "eval_runtime": 1.8157,
551
+ "eval_samples_per_second": 104.091,
552
+ "eval_steps_per_second": 6.609,
553
+ "step": 340
554
+ },
555
+ {
556
+ "epoch": 6.48,
557
+ "grad_norm": 1.9425742626190186,
558
+ "learning_rate": 0.00013537037037037036,
559
+ "loss": 0.123,
560
+ "step": 350
561
+ },
562
+ {
563
+ "epoch": 6.48,
564
+ "eval_accuracy": 0.7142857142857143,
565
+ "eval_loss": 1.0037899017333984,
566
+ "eval_runtime": 1.786,
567
+ "eval_samples_per_second": 105.821,
568
+ "eval_steps_per_second": 6.719,
569
+ "step": 350
570
+ },
571
+ {
572
+ "epoch": 6.67,
573
+ "grad_norm": 2.7061989307403564,
574
+ "learning_rate": 0.00013351851851851852,
575
+ "loss": 0.1297,
576
+ "step": 360
577
+ },
578
+ {
579
+ "epoch": 6.67,
580
+ "eval_accuracy": 0.6772486772486772,
581
+ "eval_loss": 1.1142699718475342,
582
+ "eval_runtime": 1.7368,
583
+ "eval_samples_per_second": 108.818,
584
+ "eval_steps_per_second": 6.909,
585
+ "step": 360
586
+ },
587
+ {
588
+ "epoch": 6.85,
589
+ "grad_norm": 2.478459358215332,
590
+ "learning_rate": 0.00013166666666666668,
591
+ "loss": 0.2191,
592
+ "step": 370
593
+ },
594
+ {
595
+ "epoch": 6.85,
596
+ "eval_accuracy": 0.7354497354497355,
597
+ "eval_loss": 0.9896882176399231,
598
+ "eval_runtime": 1.7802,
599
+ "eval_samples_per_second": 106.167,
600
+ "eval_steps_per_second": 6.741,
601
+ "step": 370
602
+ },
603
+ {
604
+ "epoch": 7.04,
605
+ "grad_norm": 1.6921576261520386,
606
+ "learning_rate": 0.0001298148148148148,
607
+ "loss": 0.1206,
608
+ "step": 380
609
+ },
610
+ {
611
+ "epoch": 7.04,
612
+ "eval_accuracy": 0.7407407407407407,
613
+ "eval_loss": 0.962655782699585,
614
+ "eval_runtime": 1.7667,
615
+ "eval_samples_per_second": 106.982,
616
+ "eval_steps_per_second": 6.793,
617
+ "step": 380
618
+ },
619
+ {
620
+ "epoch": 7.22,
621
+ "grad_norm": 0.8060858845710754,
622
+ "learning_rate": 0.00012796296296296297,
623
+ "loss": 0.1071,
624
+ "step": 390
625
+ },
626
+ {
627
+ "epoch": 7.22,
628
+ "eval_accuracy": 0.7513227513227513,
629
+ "eval_loss": 1.0495431423187256,
630
+ "eval_runtime": 1.7473,
631
+ "eval_samples_per_second": 108.168,
632
+ "eval_steps_per_second": 6.868,
633
+ "step": 390
634
+ },
635
+ {
636
+ "epoch": 7.41,
637
+ "grad_norm": 0.38671812415122986,
638
+ "learning_rate": 0.0001261111111111111,
639
+ "loss": 0.1102,
640
+ "step": 400
641
+ },
642
+ {
643
+ "epoch": 7.41,
644
+ "eval_accuracy": 0.7301587301587301,
645
+ "eval_loss": 1.0441887378692627,
646
+ "eval_runtime": 1.7747,
647
+ "eval_samples_per_second": 106.496,
648
+ "eval_steps_per_second": 6.762,
649
+ "step": 400
650
+ },
651
+ {
652
+ "epoch": 7.59,
653
+ "grad_norm": 1.2801034450531006,
654
+ "learning_rate": 0.0001242592592592593,
655
+ "loss": 0.1269,
656
+ "step": 410
657
+ },
658
+ {
659
+ "epoch": 7.59,
660
+ "eval_accuracy": 0.7407407407407407,
661
+ "eval_loss": 1.0281165838241577,
662
+ "eval_runtime": 1.811,
663
+ "eval_samples_per_second": 104.363,
664
+ "eval_steps_per_second": 6.626,
665
+ "step": 410
666
+ },
667
+ {
668
+ "epoch": 7.78,
669
+ "grad_norm": 0.92644864320755,
670
+ "learning_rate": 0.00012240740740740742,
671
+ "loss": 0.0694,
672
+ "step": 420
673
+ },
674
+ {
675
+ "epoch": 7.78,
676
+ "eval_accuracy": 0.7354497354497355,
677
+ "eval_loss": 1.0361741781234741,
678
+ "eval_runtime": 1.7423,
679
+ "eval_samples_per_second": 108.479,
680
+ "eval_steps_per_second": 6.888,
681
+ "step": 420
682
+ },
683
+ {
684
+ "epoch": 7.96,
685
+ "grad_norm": 0.8203582167625427,
686
+ "learning_rate": 0.00012055555555555555,
687
+ "loss": 0.0548,
688
+ "step": 430
689
+ },
690
+ {
691
+ "epoch": 7.96,
692
+ "eval_accuracy": 0.746031746031746,
693
+ "eval_loss": 1.071204423904419,
694
+ "eval_runtime": 1.7384,
695
+ "eval_samples_per_second": 108.723,
696
+ "eval_steps_per_second": 6.903,
697
+ "step": 430
698
+ },
699
+ {
700
+ "epoch": 8.15,
701
+ "grad_norm": 4.490820407867432,
702
+ "learning_rate": 0.00011870370370370371,
703
+ "loss": 0.062,
704
+ "step": 440
705
+ },
706
+ {
707
+ "epoch": 8.15,
708
+ "eval_accuracy": 0.7301587301587301,
709
+ "eval_loss": 1.035632610321045,
710
+ "eval_runtime": 1.8141,
711
+ "eval_samples_per_second": 104.182,
712
+ "eval_steps_per_second": 6.615,
713
+ "step": 440
714
+ },
715
+ {
716
+ "epoch": 8.33,
717
+ "grad_norm": 1.979749321937561,
718
+ "learning_rate": 0.00011685185185185186,
719
+ "loss": 0.0542,
720
+ "step": 450
721
+ },
722
+ {
723
+ "epoch": 8.33,
724
+ "eval_accuracy": 0.6984126984126984,
725
+ "eval_loss": 1.2573037147521973,
726
+ "eval_runtime": 1.7824,
727
+ "eval_samples_per_second": 106.034,
728
+ "eval_steps_per_second": 6.732,
729
+ "step": 450
730
+ },
731
+ {
732
+ "epoch": 8.52,
733
+ "grad_norm": 4.157647609710693,
734
+ "learning_rate": 0.00011499999999999999,
735
+ "loss": 0.0823,
736
+ "step": 460
737
+ },
738
+ {
739
+ "epoch": 8.52,
740
+ "eval_accuracy": 0.7195767195767195,
741
+ "eval_loss": 1.1037700176239014,
742
+ "eval_runtime": 1.7489,
743
+ "eval_samples_per_second": 108.066,
744
+ "eval_steps_per_second": 6.861,
745
+ "step": 460
746
+ },
747
+ {
748
+ "epoch": 8.7,
749
+ "grad_norm": 0.08767159283161163,
750
+ "learning_rate": 0.00011314814814814816,
751
+ "loss": 0.1354,
752
+ "step": 470
753
+ },
754
+ {
755
+ "epoch": 8.7,
756
+ "eval_accuracy": 0.7407407407407407,
757
+ "eval_loss": 1.0803223848342896,
758
+ "eval_runtime": 1.7889,
759
+ "eval_samples_per_second": 105.654,
760
+ "eval_steps_per_second": 6.708,
761
+ "step": 470
762
+ },
763
+ {
764
+ "epoch": 8.89,
765
+ "grad_norm": 0.6974061131477356,
766
+ "learning_rate": 0.0001112962962962963,
767
+ "loss": 0.0798,
768
+ "step": 480
769
+ },
770
+ {
771
+ "epoch": 8.89,
772
+ "eval_accuracy": 0.671957671957672,
773
+ "eval_loss": 1.2207469940185547,
774
+ "eval_runtime": 1.7456,
775
+ "eval_samples_per_second": 108.27,
776
+ "eval_steps_per_second": 6.874,
777
+ "step": 480
778
+ },
779
+ {
780
+ "epoch": 9.07,
781
+ "grad_norm": 2.0027213096618652,
782
+ "learning_rate": 0.00010944444444444445,
783
+ "loss": 0.0963,
784
+ "step": 490
785
+ },
786
+ {
787
+ "epoch": 9.07,
788
+ "eval_accuracy": 0.656084656084656,
789
+ "eval_loss": 1.337466835975647,
790
+ "eval_runtime": 1.7654,
791
+ "eval_samples_per_second": 107.06,
792
+ "eval_steps_per_second": 6.797,
793
+ "step": 490
794
+ },
795
+ {
796
+ "epoch": 9.26,
797
+ "grad_norm": 0.14471650123596191,
798
+ "learning_rate": 0.0001075925925925926,
799
+ "loss": 0.0557,
800
+ "step": 500
801
+ },
802
+ {
803
+ "epoch": 9.26,
804
+ "eval_accuracy": 0.6984126984126984,
805
+ "eval_loss": 1.2044044733047485,
806
+ "eval_runtime": 1.9948,
807
+ "eval_samples_per_second": 94.745,
808
+ "eval_steps_per_second": 6.016,
809
+ "step": 500
810
+ },
811
+ {
812
+ "epoch": 9.44,
813
+ "grad_norm": 0.07393530756235123,
814
+ "learning_rate": 0.00010574074074074075,
815
+ "loss": 0.0491,
816
+ "step": 510
817
+ },
818
+ {
819
+ "epoch": 9.44,
820
+ "eval_accuracy": 0.7248677248677249,
821
+ "eval_loss": 1.18802809715271,
822
+ "eval_runtime": 1.8204,
823
+ "eval_samples_per_second": 103.822,
824
+ "eval_steps_per_second": 6.592,
825
+ "step": 510
826
+ },
827
+ {
828
+ "epoch": 9.63,
829
+ "grad_norm": 0.12744389474391937,
830
+ "learning_rate": 0.0001038888888888889,
831
+ "loss": 0.0502,
832
+ "step": 520
833
+ },
834
+ {
835
+ "epoch": 9.63,
836
+ "eval_accuracy": 0.746031746031746,
837
+ "eval_loss": 1.098527193069458,
838
+ "eval_runtime": 1.7601,
839
+ "eval_samples_per_second": 107.378,
840
+ "eval_steps_per_second": 6.818,
841
+ "step": 520
842
+ },
843
+ {
844
+ "epoch": 9.81,
845
+ "grad_norm": 0.07471567392349243,
846
+ "learning_rate": 0.00010203703703703704,
847
+ "loss": 0.0396,
848
+ "step": 530
849
+ },
850
+ {
851
+ "epoch": 9.81,
852
+ "eval_accuracy": 0.708994708994709,
853
+ "eval_loss": 1.214396595954895,
854
+ "eval_runtime": 1.7884,
855
+ "eval_samples_per_second": 105.68,
856
+ "eval_steps_per_second": 6.71,
857
+ "step": 530
858
+ },
859
+ {
860
+ "epoch": 10.0,
861
+ "grad_norm": 0.16710619628429413,
862
+ "learning_rate": 0.00010018518518518518,
863
+ "loss": 0.0717,
864
+ "step": 540
865
+ },
866
+ {
867
+ "epoch": 10.0,
868
+ "eval_accuracy": 0.7037037037037037,
869
+ "eval_loss": 1.2163357734680176,
870
+ "eval_runtime": 1.7401,
871
+ "eval_samples_per_second": 108.615,
872
+ "eval_steps_per_second": 6.896,
873
+ "step": 540
874
+ },
875
+ {
876
+ "epoch": 10.19,
877
+ "grad_norm": 0.07553374022245407,
878
+ "learning_rate": 9.833333333333333e-05,
879
+ "loss": 0.0279,
880
+ "step": 550
881
+ },
882
+ {
883
+ "epoch": 10.19,
884
+ "eval_accuracy": 0.7142857142857143,
885
+ "eval_loss": 1.119241714477539,
886
+ "eval_runtime": 1.766,
887
+ "eval_samples_per_second": 107.023,
888
+ "eval_steps_per_second": 6.795,
889
+ "step": 550
890
+ },
891
+ {
892
+ "epoch": 10.37,
893
+ "grad_norm": 0.07353632897138596,
894
+ "learning_rate": 9.648148148148149e-05,
895
+ "loss": 0.0329,
896
+ "step": 560
897
+ },
898
+ {
899
+ "epoch": 10.37,
900
+ "eval_accuracy": 0.7354497354497355,
901
+ "eval_loss": 1.1961112022399902,
902
+ "eval_runtime": 1.8216,
903
+ "eval_samples_per_second": 103.758,
904
+ "eval_steps_per_second": 6.588,
905
+ "step": 560
906
+ },
907
+ {
908
+ "epoch": 10.56,
909
+ "grad_norm": 0.5441647171974182,
910
+ "learning_rate": 9.462962962962963e-05,
911
+ "loss": 0.028,
912
+ "step": 570
913
+ },
914
+ {
915
+ "epoch": 10.56,
916
+ "eval_accuracy": 0.6984126984126984,
917
+ "eval_loss": 1.1282387971878052,
918
+ "eval_runtime": 1.7883,
919
+ "eval_samples_per_second": 105.689,
920
+ "eval_steps_per_second": 6.71,
921
+ "step": 570
922
+ },
923
+ {
924
+ "epoch": 10.74,
925
+ "grad_norm": 0.07243653386831284,
926
+ "learning_rate": 9.277777777777778e-05,
927
+ "loss": 0.0373,
928
+ "step": 580
929
+ },
930
+ {
931
+ "epoch": 10.74,
932
+ "eval_accuracy": 0.7195767195767195,
933
+ "eval_loss": 1.0716224908828735,
934
+ "eval_runtime": 1.736,
935
+ "eval_samples_per_second": 108.873,
936
+ "eval_steps_per_second": 6.913,
937
+ "step": 580
938
+ },
939
+ {
940
+ "epoch": 10.93,
941
+ "grad_norm": 0.04851379618048668,
942
+ "learning_rate": 9.092592592592593e-05,
943
+ "loss": 0.0368,
944
+ "step": 590
945
+ },
946
+ {
947
+ "epoch": 10.93,
948
+ "eval_accuracy": 0.7142857142857143,
949
+ "eval_loss": 1.1750774383544922,
950
+ "eval_runtime": 1.7848,
951
+ "eval_samples_per_second": 105.895,
952
+ "eval_steps_per_second": 6.723,
953
+ "step": 590
954
+ },
955
+ {
956
+ "epoch": 11.11,
957
+ "grad_norm": 0.05160636082291603,
958
+ "learning_rate": 8.907407407407407e-05,
959
+ "loss": 0.0485,
960
+ "step": 600
961
+ },
962
+ {
963
+ "epoch": 11.11,
964
+ "eval_accuracy": 0.7354497354497355,
965
+ "eval_loss": 1.0984432697296143,
966
+ "eval_runtime": 1.7772,
967
+ "eval_samples_per_second": 106.345,
968
+ "eval_steps_per_second": 6.752,
969
+ "step": 600
970
+ },
971
+ {
972
+ "epoch": 11.3,
973
+ "grad_norm": 0.054380565881729126,
974
+ "learning_rate": 8.722222222222223e-05,
975
+ "loss": 0.0234,
976
+ "step": 610
977
+ },
978
+ {
979
+ "epoch": 11.3,
980
+ "eval_accuracy": 0.7619047619047619,
981
+ "eval_loss": 1.0418734550476074,
982
+ "eval_runtime": 1.7977,
983
+ "eval_samples_per_second": 105.132,
984
+ "eval_steps_per_second": 6.675,
985
+ "step": 610
986
+ },
987
+ {
988
+ "epoch": 11.48,
989
+ "grad_norm": 0.32195061445236206,
990
+ "learning_rate": 8.537037037037038e-05,
991
+ "loss": 0.028,
992
+ "step": 620
993
+ },
994
+ {
995
+ "epoch": 11.48,
996
+ "eval_accuracy": 0.7566137566137566,
997
+ "eval_loss": 1.0536975860595703,
998
+ "eval_runtime": 1.7586,
999
+ "eval_samples_per_second": 107.47,
1000
+ "eval_steps_per_second": 6.823,
1001
+ "step": 620
1002
+ },
1003
+ {
1004
+ "epoch": 11.67,
1005
+ "grad_norm": 1.8460614681243896,
1006
+ "learning_rate": 8.351851851851852e-05,
1007
+ "loss": 0.0237,
1008
+ "step": 630
1009
+ },
1010
+ {
1011
+ "epoch": 11.67,
1012
+ "eval_accuracy": 0.746031746031746,
1013
+ "eval_loss": 1.0571786165237427,
1014
+ "eval_runtime": 1.7901,
1015
+ "eval_samples_per_second": 105.578,
1016
+ "eval_steps_per_second": 6.703,
1017
+ "step": 630
1018
+ },
1019
+ {
1020
+ "epoch": 11.85,
1021
+ "grad_norm": 1.7614848613739014,
1022
+ "learning_rate": 8.166666666666667e-05,
1023
+ "loss": 0.0198,
1024
+ "step": 640
1025
+ },
1026
+ {
1027
+ "epoch": 11.85,
1028
+ "eval_accuracy": 0.746031746031746,
1029
+ "eval_loss": 1.0192136764526367,
1030
+ "eval_runtime": 1.7683,
1031
+ "eval_samples_per_second": 106.885,
1032
+ "eval_steps_per_second": 6.786,
1033
+ "step": 640
1034
+ },
1035
+ {
1036
+ "epoch": 12.04,
1037
+ "grad_norm": 0.22871683537960052,
1038
+ "learning_rate": 7.981481481481481e-05,
1039
+ "loss": 0.02,
1040
+ "step": 650
1041
+ },
1042
+ {
1043
+ "epoch": 12.04,
1044
+ "eval_accuracy": 0.7195767195767195,
1045
+ "eval_loss": 1.244175672531128,
1046
+ "eval_runtime": 1.8603,
1047
+ "eval_samples_per_second": 101.595,
1048
+ "eval_steps_per_second": 6.45,
1049
+ "step": 650
1050
+ },
1051
+ {
1052
+ "epoch": 12.22,
1053
+ "grad_norm": 0.03752712532877922,
1054
+ "learning_rate": 7.796296296296297e-05,
1055
+ "loss": 0.0216,
1056
+ "step": 660
1057
+ },
1058
+ {
1059
+ "epoch": 12.22,
1060
+ "eval_accuracy": 0.7407407407407407,
1061
+ "eval_loss": 1.1395213603973389,
1062
+ "eval_runtime": 1.7992,
1063
+ "eval_samples_per_second": 105.048,
1064
+ "eval_steps_per_second": 6.67,
1065
+ "step": 660
1066
+ },
1067
+ {
1068
+ "epoch": 12.41,
1069
+ "grad_norm": 0.09251418709754944,
1070
+ "learning_rate": 7.61111111111111e-05,
1071
+ "loss": 0.0309,
1072
+ "step": 670
1073
+ },
1074
+ {
1075
+ "epoch": 12.41,
1076
+ "eval_accuracy": 0.7354497354497355,
1077
+ "eval_loss": 1.1767151355743408,
1078
+ "eval_runtime": 1.8204,
1079
+ "eval_samples_per_second": 103.823,
1080
+ "eval_steps_per_second": 6.592,
1081
+ "step": 670
1082
+ },
1083
+ {
1084
+ "epoch": 12.59,
1085
+ "grad_norm": 0.03858701139688492,
1086
+ "learning_rate": 7.425925925925927e-05,
1087
+ "loss": 0.0315,
1088
+ "step": 680
1089
+ },
1090
+ {
1091
+ "epoch": 12.59,
1092
+ "eval_accuracy": 0.7248677248677249,
1093
+ "eval_loss": 1.1881897449493408,
1094
+ "eval_runtime": 1.7853,
1095
+ "eval_samples_per_second": 105.862,
1096
+ "eval_steps_per_second": 6.721,
1097
+ "step": 680
1098
+ },
1099
+ {
1100
+ "epoch": 12.78,
1101
+ "grad_norm": 0.2986956536769867,
1102
+ "learning_rate": 7.240740740740741e-05,
1103
+ "loss": 0.017,
1104
+ "step": 690
1105
+ },
1106
+ {
1107
+ "epoch": 12.78,
1108
+ "eval_accuracy": 0.7354497354497355,
1109
+ "eval_loss": 1.1652072668075562,
1110
+ "eval_runtime": 1.8006,
1111
+ "eval_samples_per_second": 104.965,
1112
+ "eval_steps_per_second": 6.664,
1113
+ "step": 690
1114
+ },
1115
+ {
1116
+ "epoch": 12.96,
1117
+ "grad_norm": 0.23789283633232117,
1118
+ "learning_rate": 7.055555555555556e-05,
1119
+ "loss": 0.02,
1120
+ "step": 700
1121
+ },
1122
+ {
1123
+ "epoch": 12.96,
1124
+ "eval_accuracy": 0.7619047619047619,
1125
+ "eval_loss": 1.1011323928833008,
1126
+ "eval_runtime": 1.7393,
1127
+ "eval_samples_per_second": 108.665,
1128
+ "eval_steps_per_second": 6.899,
1129
+ "step": 700
1130
+ },
1131
+ {
1132
+ "epoch": 13.15,
1133
+ "grad_norm": 0.0361974723637104,
1134
+ "learning_rate": 6.87037037037037e-05,
1135
+ "loss": 0.0174,
1136
+ "step": 710
1137
+ },
1138
+ {
1139
+ "epoch": 13.15,
1140
+ "eval_accuracy": 0.7354497354497355,
1141
+ "eval_loss": 1.092558741569519,
1142
+ "eval_runtime": 1.8005,
1143
+ "eval_samples_per_second": 104.97,
1144
+ "eval_steps_per_second": 6.665,
1145
+ "step": 710
1146
+ },
1147
+ {
1148
+ "epoch": 13.33,
1149
+ "grad_norm": 0.04739515110850334,
1150
+ "learning_rate": 6.685185185185185e-05,
1151
+ "loss": 0.012,
1152
+ "step": 720
1153
+ },
1154
+ {
1155
+ "epoch": 13.33,
1156
+ "eval_accuracy": 0.746031746031746,
1157
+ "eval_loss": 1.0852241516113281,
1158
+ "eval_runtime": 1.787,
1159
+ "eval_samples_per_second": 105.766,
1160
+ "eval_steps_per_second": 6.715,
1161
+ "step": 720
1162
+ },
1163
+ {
1164
+ "epoch": 13.52,
1165
+ "grad_norm": 0.035977743566036224,
1166
+ "learning_rate": 6.500000000000001e-05,
1167
+ "loss": 0.0296,
1168
+ "step": 730
1169
+ },
1170
+ {
1171
+ "epoch": 13.52,
1172
+ "eval_accuracy": 0.7513227513227513,
1173
+ "eval_loss": 1.0534002780914307,
1174
+ "eval_runtime": 1.7706,
1175
+ "eval_samples_per_second": 106.746,
1176
+ "eval_steps_per_second": 6.778,
1177
+ "step": 730
1178
+ },
1179
+ {
1180
+ "epoch": 13.7,
1181
+ "grad_norm": 0.3354228436946869,
1182
+ "learning_rate": 6.314814814814815e-05,
1183
+ "loss": 0.0142,
1184
+ "step": 740
1185
+ },
1186
+ {
1187
+ "epoch": 13.7,
1188
+ "eval_accuracy": 0.746031746031746,
1189
+ "eval_loss": 1.0607830286026,
1190
+ "eval_runtime": 1.8039,
1191
+ "eval_samples_per_second": 104.775,
1192
+ "eval_steps_per_second": 6.652,
1193
+ "step": 740
1194
+ },
1195
+ {
1196
+ "epoch": 13.89,
1197
+ "grad_norm": 0.031177503988146782,
1198
+ "learning_rate": 6.12962962962963e-05,
1199
+ "loss": 0.0199,
1200
+ "step": 750
1201
+ },
1202
+ {
1203
+ "epoch": 13.89,
1204
+ "eval_accuracy": 0.746031746031746,
1205
+ "eval_loss": 1.0850036144256592,
1206
+ "eval_runtime": 1.7472,
1207
+ "eval_samples_per_second": 108.174,
1208
+ "eval_steps_per_second": 6.868,
1209
+ "step": 750
1210
+ },
1211
+ {
1212
+ "epoch": 14.07,
1213
+ "grad_norm": 0.2141834944486618,
1214
+ "learning_rate": 5.9444444444444445e-05,
1215
+ "loss": 0.0169,
1216
+ "step": 760
1217
+ },
1218
+ {
1219
+ "epoch": 14.07,
1220
+ "eval_accuracy": 0.7566137566137566,
1221
+ "eval_loss": 1.0736693143844604,
1222
+ "eval_runtime": 1.7821,
1223
+ "eval_samples_per_second": 106.054,
1224
+ "eval_steps_per_second": 6.734,
1225
+ "step": 760
1226
+ },
1227
+ {
1228
+ "epoch": 14.26,
1229
+ "grad_norm": 0.028399189934134483,
1230
+ "learning_rate": 5.75925925925926e-05,
1231
+ "loss": 0.0139,
1232
+ "step": 770
1233
+ },
1234
+ {
1235
+ "epoch": 14.26,
1236
+ "eval_accuracy": 0.7566137566137566,
1237
+ "eval_loss": 1.0717233419418335,
1238
+ "eval_runtime": 1.8135,
1239
+ "eval_samples_per_second": 104.221,
1240
+ "eval_steps_per_second": 6.617,
1241
+ "step": 770
1242
+ },
1243
+ {
1244
+ "epoch": 14.44,
1245
+ "grad_norm": 0.03289506584405899,
1246
+ "learning_rate": 5.574074074074075e-05,
1247
+ "loss": 0.0173,
1248
+ "step": 780
1249
+ },
1250
+ {
1251
+ "epoch": 14.44,
1252
+ "eval_accuracy": 0.7566137566137566,
1253
+ "eval_loss": 1.0707134008407593,
1254
+ "eval_runtime": 1.7856,
1255
+ "eval_samples_per_second": 105.849,
1256
+ "eval_steps_per_second": 6.721,
1257
+ "step": 780
1258
+ },
1259
+ {
1260
+ "epoch": 14.63,
1261
+ "grad_norm": 0.032911308109760284,
1262
+ "learning_rate": 5.388888888888889e-05,
1263
+ "loss": 0.0101,
1264
+ "step": 790
1265
+ },
1266
+ {
1267
+ "epoch": 14.63,
1268
+ "eval_accuracy": 0.7566137566137566,
1269
+ "eval_loss": 1.070402979850769,
1270
+ "eval_runtime": 1.7933,
1271
+ "eval_samples_per_second": 105.391,
1272
+ "eval_steps_per_second": 6.691,
1273
+ "step": 790
1274
+ },
1275
+ {
1276
+ "epoch": 14.81,
1277
+ "grad_norm": 0.43361806869506836,
1278
+ "learning_rate": 5.203703703703704e-05,
1279
+ "loss": 0.0286,
1280
+ "step": 800
1281
+ },
1282
+ {
1283
+ "epoch": 14.81,
1284
+ "eval_accuracy": 0.7671957671957672,
1285
+ "eval_loss": 1.0845017433166504,
1286
+ "eval_runtime": 1.7994,
1287
+ "eval_samples_per_second": 105.033,
1288
+ "eval_steps_per_second": 6.669,
1289
+ "step": 800
1290
+ },
1291
+ {
1292
+ "epoch": 15.0,
1293
+ "grad_norm": 0.05939367786049843,
1294
+ "learning_rate": 5.018518518518519e-05,
1295
+ "loss": 0.0135,
1296
+ "step": 810
1297
+ },
1298
+ {
1299
+ "epoch": 15.0,
1300
+ "eval_accuracy": 0.7513227513227513,
1301
+ "eval_loss": 1.0972745418548584,
1302
+ "eval_runtime": 1.7785,
1303
+ "eval_samples_per_second": 106.271,
1304
+ "eval_steps_per_second": 6.747,
1305
+ "step": 810
1306
+ },
1307
+ {
1308
+ "epoch": 15.19,
1309
+ "grad_norm": 0.030746394768357277,
1310
+ "learning_rate": 4.8333333333333334e-05,
1311
+ "loss": 0.0129,
1312
+ "step": 820
1313
+ },
1314
+ {
1315
+ "epoch": 15.19,
1316
+ "eval_accuracy": 0.7513227513227513,
1317
+ "eval_loss": 1.0909744501113892,
1318
+ "eval_runtime": 1.7304,
1319
+ "eval_samples_per_second": 109.222,
1320
+ "eval_steps_per_second": 6.935,
1321
+ "step": 820
1322
+ },
1323
+ {
1324
+ "epoch": 15.37,
1325
+ "grad_norm": 0.026390748098492622,
1326
+ "learning_rate": 4.648148148148148e-05,
1327
+ "loss": 0.0117,
1328
+ "step": 830
1329
+ },
1330
+ {
1331
+ "epoch": 15.37,
1332
+ "eval_accuracy": 0.7671957671957672,
1333
+ "eval_loss": 1.0890551805496216,
1334
+ "eval_runtime": 1.8164,
1335
+ "eval_samples_per_second": 104.051,
1336
+ "eval_steps_per_second": 6.606,
1337
+ "step": 830
1338
+ },
1339
+ {
1340
+ "epoch": 15.56,
1341
+ "grad_norm": 0.028341053053736687,
1342
+ "learning_rate": 4.462962962962963e-05,
1343
+ "loss": 0.014,
1344
+ "step": 840
1345
+ },
1346
+ {
1347
+ "epoch": 15.56,
1348
+ "eval_accuracy": 0.7566137566137566,
1349
+ "eval_loss": 1.0884122848510742,
1350
+ "eval_runtime": 1.8336,
1351
+ "eval_samples_per_second": 103.079,
1352
+ "eval_steps_per_second": 6.545,
1353
+ "step": 840
1354
+ },
1355
+ {
1356
+ "epoch": 15.74,
1357
+ "grad_norm": 0.027172435075044632,
1358
+ "learning_rate": 4.277777777777778e-05,
1359
+ "loss": 0.0093,
1360
+ "step": 850
1361
+ },
1362
+ {
1363
+ "epoch": 15.74,
1364
+ "eval_accuracy": 0.7513227513227513,
1365
+ "eval_loss": 1.0879539251327515,
1366
+ "eval_runtime": 1.7368,
1367
+ "eval_samples_per_second": 108.818,
1368
+ "eval_steps_per_second": 6.909,
1369
+ "step": 850
1370
+ },
1371
+ {
1372
+ "epoch": 15.93,
1373
+ "grad_norm": 0.4558853209018707,
1374
+ "learning_rate": 4.092592592592593e-05,
1375
+ "loss": 0.0264,
1376
+ "step": 860
1377
+ },
1378
+ {
1379
+ "epoch": 15.93,
1380
+ "eval_accuracy": 0.7566137566137566,
1381
+ "eval_loss": 1.0861279964447021,
1382
+ "eval_runtime": 1.8295,
1383
+ "eval_samples_per_second": 103.306,
1384
+ "eval_steps_per_second": 6.559,
1385
+ "step": 860
1386
+ },
1387
+ {
1388
+ "epoch": 16.11,
1389
+ "grad_norm": 0.023086287081241608,
1390
+ "learning_rate": 3.9074074074074076e-05,
1391
+ "loss": 0.0117,
1392
+ "step": 870
1393
+ },
1394
+ {
1395
+ "epoch": 16.11,
1396
+ "eval_accuracy": 0.7513227513227513,
1397
+ "eval_loss": 1.0812128782272339,
1398
+ "eval_runtime": 1.783,
1399
+ "eval_samples_per_second": 106.0,
1400
+ "eval_steps_per_second": 6.73,
1401
+ "step": 870
1402
+ },
1403
+ {
1404
+ "epoch": 16.3,
1405
+ "grad_norm": 0.16555258631706238,
1406
+ "learning_rate": 3.722222222222222e-05,
1407
+ "loss": 0.0131,
1408
+ "step": 880
1409
+ },
1410
+ {
1411
+ "epoch": 16.3,
1412
+ "eval_accuracy": 0.7513227513227513,
1413
+ "eval_loss": 1.084083080291748,
1414
+ "eval_runtime": 1.7979,
1415
+ "eval_samples_per_second": 105.125,
1416
+ "eval_steps_per_second": 6.675,
1417
+ "step": 880
1418
+ },
1419
+ {
1420
+ "epoch": 16.48,
1421
+ "grad_norm": 0.1985342651605606,
1422
+ "learning_rate": 3.537037037037037e-05,
1423
+ "loss": 0.0107,
1424
+ "step": 890
1425
+ },
1426
+ {
1427
+ "epoch": 16.48,
1428
+ "eval_accuracy": 0.7513227513227513,
1429
+ "eval_loss": 1.0908081531524658,
1430
+ "eval_runtime": 1.8371,
1431
+ "eval_samples_per_second": 102.877,
1432
+ "eval_steps_per_second": 6.532,
1433
+ "step": 890
1434
+ },
1435
+ {
1436
+ "epoch": 16.67,
1437
+ "grad_norm": 0.023619532585144043,
1438
+ "learning_rate": 3.351851851851852e-05,
1439
+ "loss": 0.0253,
1440
+ "step": 900
1441
+ },
1442
+ {
1443
+ "epoch": 16.67,
1444
+ "eval_accuracy": 0.7566137566137566,
1445
+ "eval_loss": 1.0818437337875366,
1446
+ "eval_runtime": 1.8128,
1447
+ "eval_samples_per_second": 104.258,
1448
+ "eval_steps_per_second": 6.62,
1449
+ "step": 900
1450
+ },
1451
+ {
1452
+ "epoch": 16.85,
1453
+ "grad_norm": 0.031866107136011124,
1454
+ "learning_rate": 3.1666666666666666e-05,
1455
+ "loss": 0.0113,
1456
+ "step": 910
1457
+ },
1458
+ {
1459
+ "epoch": 16.85,
1460
+ "eval_accuracy": 0.7671957671957672,
1461
+ "eval_loss": 1.0804176330566406,
1462
+ "eval_runtime": 1.7557,
1463
+ "eval_samples_per_second": 107.647,
1464
+ "eval_steps_per_second": 6.835,
1465
+ "step": 910
1466
+ },
1467
+ {
1468
+ "epoch": 17.04,
1469
+ "grad_norm": 0.027054764330387115,
1470
+ "learning_rate": 2.981481481481482e-05,
1471
+ "loss": 0.0117,
1472
+ "step": 920
1473
+ },
1474
+ {
1475
+ "epoch": 17.04,
1476
+ "eval_accuracy": 0.7671957671957672,
1477
+ "eval_loss": 1.0813896656036377,
1478
+ "eval_runtime": 1.8358,
1479
+ "eval_samples_per_second": 102.952,
1480
+ "eval_steps_per_second": 6.537,
1481
+ "step": 920
1482
+ },
1483
+ {
1484
+ "epoch": 17.22,
1485
+ "grad_norm": 0.025050414726138115,
1486
+ "learning_rate": 2.7962962962962965e-05,
1487
+ "loss": 0.0158,
1488
+ "step": 930
1489
+ },
1490
+ {
1491
+ "epoch": 17.22,
1492
+ "eval_accuracy": 0.7566137566137566,
1493
+ "eval_loss": 1.0813225507736206,
1494
+ "eval_runtime": 1.7643,
1495
+ "eval_samples_per_second": 107.126,
1496
+ "eval_steps_per_second": 6.802,
1497
+ "step": 930
1498
+ },
1499
+ {
1500
+ "epoch": 17.41,
1501
+ "grad_norm": 0.024830004200339317,
1502
+ "learning_rate": 2.6111111111111114e-05,
1503
+ "loss": 0.011,
1504
+ "step": 940
1505
+ },
1506
+ {
1507
+ "epoch": 17.41,
1508
+ "eval_accuracy": 0.7671957671957672,
1509
+ "eval_loss": 1.080676794052124,
1510
+ "eval_runtime": 1.759,
1511
+ "eval_samples_per_second": 107.45,
1512
+ "eval_steps_per_second": 6.822,
1513
+ "step": 940
1514
+ },
1515
+ {
1516
+ "epoch": 17.59,
1517
+ "grad_norm": 0.024760620668530464,
1518
+ "learning_rate": 2.425925925925926e-05,
1519
+ "loss": 0.0137,
1520
+ "step": 950
1521
+ },
1522
+ {
1523
+ "epoch": 17.59,
1524
+ "eval_accuracy": 0.7671957671957672,
1525
+ "eval_loss": 1.0803221464157104,
1526
+ "eval_runtime": 1.7971,
1527
+ "eval_samples_per_second": 105.168,
1528
+ "eval_steps_per_second": 6.677,
1529
+ "step": 950
1530
+ },
1531
+ {
1532
+ "epoch": 17.78,
1533
+ "grad_norm": 0.025229470804333687,
1534
+ "learning_rate": 2.240740740740741e-05,
1535
+ "loss": 0.0112,
1536
+ "step": 960
1537
+ },
1538
+ {
1539
+ "epoch": 17.78,
1540
+ "eval_accuracy": 0.7619047619047619,
1541
+ "eval_loss": 1.0807117223739624,
1542
+ "eval_runtime": 1.7675,
1543
+ "eval_samples_per_second": 106.93,
1544
+ "eval_steps_per_second": 6.789,
1545
+ "step": 960
1546
+ },
1547
+ {
1548
+ "epoch": 17.96,
1549
+ "grad_norm": 0.02313585951924324,
1550
+ "learning_rate": 2.0555555555555555e-05,
1551
+ "loss": 0.0172,
1552
+ "step": 970
1553
+ },
1554
+ {
1555
+ "epoch": 17.96,
1556
+ "eval_accuracy": 0.7566137566137566,
1557
+ "eval_loss": 1.0821946859359741,
1558
+ "eval_runtime": 1.8179,
1559
+ "eval_samples_per_second": 103.964,
1560
+ "eval_steps_per_second": 6.601,
1561
+ "step": 970
1562
+ },
1563
+ {
1564
+ "epoch": 18.15,
1565
+ "grad_norm": 0.024956317618489265,
1566
+ "learning_rate": 1.8703703703703704e-05,
1567
+ "loss": 0.0132,
1568
+ "step": 980
1569
+ },
1570
+ {
1571
+ "epoch": 18.15,
1572
+ "eval_accuracy": 0.7619047619047619,
1573
+ "eval_loss": 1.0860090255737305,
1574
+ "eval_runtime": 1.7729,
1575
+ "eval_samples_per_second": 106.607,
1576
+ "eval_steps_per_second": 6.769,
1577
+ "step": 980
1578
+ },
1579
+ {
1580
+ "epoch": 18.33,
1581
+ "grad_norm": 0.02182234823703766,
1582
+ "learning_rate": 1.6851851851851853e-05,
1583
+ "loss": 0.0127,
1584
+ "step": 990
1585
+ },
1586
+ {
1587
+ "epoch": 18.33,
1588
+ "eval_accuracy": 0.7619047619047619,
1589
+ "eval_loss": 1.0875723361968994,
1590
+ "eval_runtime": 1.7863,
1591
+ "eval_samples_per_second": 105.804,
1592
+ "eval_steps_per_second": 6.718,
1593
+ "step": 990
1594
+ },
1595
+ {
1596
+ "epoch": 18.52,
1597
+ "grad_norm": 0.024420464411377907,
1598
+ "learning_rate": 1.5e-05,
1599
+ "loss": 0.0152,
1600
+ "step": 1000
1601
+ },
1602
+ {
1603
+ "epoch": 18.52,
1604
+ "eval_accuracy": 0.7619047619047619,
1605
+ "eval_loss": 1.0873754024505615,
1606
+ "eval_runtime": 1.7723,
1607
+ "eval_samples_per_second": 106.644,
1608
+ "eval_steps_per_second": 6.771,
1609
+ "step": 1000
1610
+ },
1611
+ {
1612
+ "epoch": 18.7,
1613
+ "grad_norm": 0.18311668932437897,
1614
+ "learning_rate": 1.3148148148148148e-05,
1615
+ "loss": 0.0096,
1616
+ "step": 1010
1617
+ },
1618
+ {
1619
+ "epoch": 18.7,
1620
+ "eval_accuracy": 0.7619047619047619,
1621
+ "eval_loss": 1.088024377822876,
1622
+ "eval_runtime": 1.8979,
1623
+ "eval_samples_per_second": 99.583,
1624
+ "eval_steps_per_second": 6.323,
1625
+ "step": 1010
1626
+ },
1627
+ {
1628
+ "epoch": 18.89,
1629
+ "grad_norm": 0.023139068856835365,
1630
+ "learning_rate": 1.1296296296296297e-05,
1631
+ "loss": 0.0107,
1632
+ "step": 1020
1633
+ },
1634
+ {
1635
+ "epoch": 18.89,
1636
+ "eval_accuracy": 0.7619047619047619,
1637
+ "eval_loss": 1.08987557888031,
1638
+ "eval_runtime": 1.8132,
1639
+ "eval_samples_per_second": 104.237,
1640
+ "eval_steps_per_second": 6.618,
1641
+ "step": 1020
1642
+ },
1643
+ {
1644
+ "epoch": 19.07,
1645
+ "grad_norm": 0.024323537945747375,
1646
+ "learning_rate": 9.444444444444445e-06,
1647
+ "loss": 0.0124,
1648
+ "step": 1030
1649
+ },
1650
+ {
1651
+ "epoch": 19.07,
1652
+ "eval_accuracy": 0.7619047619047619,
1653
+ "eval_loss": 1.0899451971054077,
1654
+ "eval_runtime": 1.7841,
1655
+ "eval_samples_per_second": 105.934,
1656
+ "eval_steps_per_second": 6.726,
1657
+ "step": 1030
1658
+ },
1659
+ {
1660
+ "epoch": 19.26,
1661
+ "grad_norm": 0.20473988354206085,
1662
+ "learning_rate": 7.592592592592593e-06,
1663
+ "loss": 0.0187,
1664
+ "step": 1040
1665
+ },
1666
+ {
1667
+ "epoch": 19.26,
1668
+ "eval_accuracy": 0.7619047619047619,
1669
+ "eval_loss": 1.0915277004241943,
1670
+ "eval_runtime": 1.7828,
1671
+ "eval_samples_per_second": 106.015,
1672
+ "eval_steps_per_second": 6.731,
1673
+ "step": 1040
1674
+ },
1675
+ {
1676
+ "epoch": 19.44,
1677
+ "grad_norm": 0.021954894065856934,
1678
+ "learning_rate": 5.740740740740741e-06,
1679
+ "loss": 0.0159,
1680
+ "step": 1050
1681
+ },
1682
+ {
1683
+ "epoch": 19.44,
1684
+ "eval_accuracy": 0.7619047619047619,
1685
+ "eval_loss": 1.0916674137115479,
1686
+ "eval_runtime": 1.7554,
1687
+ "eval_samples_per_second": 107.665,
1688
+ "eval_steps_per_second": 6.836,
1689
+ "step": 1050
1690
+ },
1691
+ {
1692
+ "epoch": 19.63,
1693
+ "grad_norm": 0.02447775937616825,
1694
+ "learning_rate": 3.888888888888889e-06,
1695
+ "loss": 0.0107,
1696
+ "step": 1060
1697
+ },
1698
+ {
1699
+ "epoch": 19.63,
1700
+ "eval_accuracy": 0.7619047619047619,
1701
+ "eval_loss": 1.091030240058899,
1702
+ "eval_runtime": 1.7566,
1703
+ "eval_samples_per_second": 107.597,
1704
+ "eval_steps_per_second": 6.832,
1705
+ "step": 1060
1706
+ },
1707
+ {
1708
+ "epoch": 19.81,
1709
+ "grad_norm": 0.02190612629055977,
1710
+ "learning_rate": 2.0370370370370375e-06,
1711
+ "loss": 0.0105,
1712
+ "step": 1070
1713
+ },
1714
+ {
1715
+ "epoch": 19.81,
1716
+ "eval_accuracy": 0.7619047619047619,
1717
+ "eval_loss": 1.0911825895309448,
1718
+ "eval_runtime": 1.7879,
1719
+ "eval_samples_per_second": 105.71,
1720
+ "eval_steps_per_second": 6.712,
1721
+ "step": 1070
1722
+ },
1723
+ {
1724
+ "epoch": 20.0,
1725
+ "grad_norm": 0.0392175130546093,
1726
+ "learning_rate": 1.851851851851852e-07,
1727
+ "loss": 0.0076,
1728
+ "step": 1080
1729
+ },
1730
+ {
1731
+ "epoch": 20.0,
1732
+ "eval_accuracy": 0.7619047619047619,
1733
+ "eval_loss": 1.0909953117370605,
1734
+ "eval_runtime": 1.7898,
1735
+ "eval_samples_per_second": 105.596,
1736
+ "eval_steps_per_second": 6.704,
1737
+ "step": 1080
1738
+ },
1739
+ {
1740
+ "epoch": 20.0,
1741
+ "step": 1080,
1742
+ "total_flos": 2.681093741830963e+18,
1743
+ "train_loss": 0.29336747460895113,
1744
+ "train_runtime": 795.2059,
1745
+ "train_samples_per_second": 42.681,
1746
+ "train_steps_per_second": 1.358
1747
+ }
1748
+ ],
1749
+ "logging_steps": 10,
1750
+ "max_steps": 1080,
1751
+ "num_input_tokens_seen": 0,
1752
+ "num_train_epochs": 20,
1753
+ "save_steps": 10,
1754
+ "total_flos": 2.681093741830963e+18,
1755
+ "train_batch_size": 32,
1756
+ "trial_name": null,
1757
+ "trial_params": null
1758
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e86513f4eb3161b380f4bcf5175d22ee998a8883647c9efa0fbb1a8ca0fbded
3
+ size 4920