indiejoseph commited on
Commit
6c71261
1 Parent(s): 8bfabe6

End of training

Browse files
Files changed (4) hide show
  1. README.md +3 -0
  2. all_results.json +6 -6
  3. train_results.json +6 -6
  4. trainer_state.json +162 -690
README.md CHANGED
@@ -1,4 +1,7 @@
1
  ---
 
 
 
2
  license: mit
3
  base_model: facebook/mbart-large-50
4
  tags:
 
1
  ---
2
+ language:
3
+ - zh
4
+ - yue
5
  license: mit
6
  base_model: facebook/mbart-large-50
7
  tags:
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 5.0,
3
- "train_loss": 0.003351212132286996,
4
- "train_runtime": 2726.0601,
5
- "train_samples": 220940,
6
- "train_samples_per_second": 405.237,
7
- "train_steps_per_second": 25.328
8
  }
 
1
  {
2
+ "epoch": 4.0,
3
+ "train_loss": 0.23955335338535488,
4
+ "train_runtime": 7304.5803,
5
+ "train_samples": 100042,
6
+ "train_samples_per_second": 54.783,
7
+ "train_steps_per_second": 3.424
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 5.0,
3
- "train_loss": 0.003351212132286996,
4
- "train_runtime": 2726.0601,
5
- "train_samples": 220940,
6
- "train_samples_per_second": 405.237,
7
- "train_steps_per_second": 25.328
8
  }
 
1
  {
2
+ "epoch": 4.0,
3
+ "train_loss": 0.23955335338535488,
4
+ "train_runtime": 7304.5803,
5
+ "train_samples": 100042,
6
+ "train_samples_per_second": 54.783,
7
+ "train_steps_per_second": 3.424
8
  }
trainer_state.json CHANGED
@@ -1,856 +1,328 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 69045,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.02,
13
- "learning_rate": 4.9820769063654144e-05,
14
- "loss": 1.7116,
15
  "step": 500
16
  },
17
  {
18
- "epoch": 0.04,
19
- "learning_rate": 4.9639727713809836e-05,
20
- "loss": 0.8933,
21
  "step": 1000
22
  },
23
  {
24
- "epoch": 0.05,
25
- "learning_rate": 4.9458686363965534e-05,
26
- "loss": 0.7493,
27
  "step": 1500
28
  },
29
  {
30
- "epoch": 0.07,
31
- "learning_rate": 4.927800709682092e-05,
32
- "loss": 0.718,
33
  "step": 2000
34
  },
35
  {
36
- "epoch": 0.09,
37
- "learning_rate": 4.9096965746976617e-05,
38
- "loss": 0.6011,
39
  "step": 2500
40
  },
41
  {
42
- "epoch": 0.11,
43
- "learning_rate": 4.891592439713231e-05,
44
- "loss": 0.5649,
45
  "step": 3000
46
  },
47
  {
48
- "epoch": 0.13,
49
- "learning_rate": 4.8734883047288006e-05,
50
- "loss": 0.5612,
51
  "step": 3500
52
  },
53
  {
54
- "epoch": 0.14,
55
- "learning_rate": 4.85538416974437e-05,
56
- "loss": 0.5292,
57
  "step": 4000
58
  },
59
  {
60
- "epoch": 0.16,
61
- "learning_rate": 4.8372800347599396e-05,
62
- "loss": 0.4878,
63
  "step": 4500
64
  },
65
  {
66
- "epoch": 0.18,
67
- "learning_rate": 4.819175899775509e-05,
68
- "loss": 0.5319,
69
  "step": 5000
70
  },
71
  {
72
- "epoch": 0.2,
73
- "learning_rate": 4.8010717647910786e-05,
74
- "loss": 0.4588,
75
  "step": 5500
76
  },
77
  {
78
- "epoch": 0.22,
79
- "learning_rate": 4.782967629806648e-05,
80
- "loss": 0.4249,
81
  "step": 6000
82
  },
83
  {
84
- "epoch": 0.24,
85
- "learning_rate": 4.7648634948222176e-05,
86
- "loss": 0.4334,
87
  "step": 6500
88
  },
89
  {
90
- "epoch": 0.25,
91
- "learning_rate": 4.7467593598377874e-05,
92
- "loss": 0.4356,
93
  "step": 7000
94
  },
95
  {
96
- "epoch": 0.27,
97
- "learning_rate": 4.7286552248533566e-05,
98
- "loss": 0.4183,
99
  "step": 7500
100
  },
101
  {
102
- "epoch": 0.29,
103
- "learning_rate": 4.7105510898689264e-05,
104
- "loss": 0.3894,
105
  "step": 8000
106
  },
107
  {
108
- "epoch": 0.31,
109
- "learning_rate": 4.6924469548844955e-05,
110
- "loss": 0.3652,
111
  "step": 8500
112
  },
113
  {
114
- "epoch": 0.33,
115
- "learning_rate": 4.6743428199000654e-05,
116
- "loss": 0.3811,
117
  "step": 9000
118
  },
119
  {
120
- "epoch": 0.34,
121
- "learning_rate": 4.6562386849156345e-05,
122
- "loss": 0.3613,
123
  "step": 9500
124
  },
125
  {
126
- "epoch": 0.36,
127
- "learning_rate": 4.6381707582011736e-05,
128
- "loss": 0.3654,
129
  "step": 10000
130
  },
131
  {
132
- "epoch": 0.38,
133
- "learning_rate": 4.620066623216743e-05,
134
- "loss": 0.3379,
135
  "step": 10500
136
  },
137
  {
138
- "epoch": 0.4,
139
- "learning_rate": 4.6019624882323126e-05,
140
- "loss": 0.3438,
141
  "step": 11000
142
  },
143
  {
144
- "epoch": 0.42,
145
- "learning_rate": 4.583858353247882e-05,
146
- "loss": 0.3448,
147
  "step": 11500
148
  },
149
  {
150
- "epoch": 0.43,
151
- "learning_rate": 4.5657542182634516e-05,
152
- "loss": 0.3283,
153
  "step": 12000
154
  },
155
  {
156
- "epoch": 0.45,
157
- "learning_rate": 4.5476500832790214e-05,
158
- "loss": 0.3169,
159
  "step": 12500
160
  },
161
  {
162
- "epoch": 0.47,
163
- "learning_rate": 4.5295459482945906e-05,
164
- "loss": 0.3153,
165
  "step": 13000
166
  },
167
  {
168
- "epoch": 0.49,
169
- "learning_rate": 4.5114418133101604e-05,
170
- "loss": 0.3148,
171
  "step": 13500
172
  },
173
  {
174
- "epoch": 0.51,
175
- "learning_rate": 4.4933376783257296e-05,
176
- "loss": 0.3144,
177
  "step": 14000
178
  },
179
  {
180
- "epoch": 0.53,
181
- "learning_rate": 4.4752335433412994e-05,
182
- "loss": 0.3019,
183
  "step": 14500
184
  },
185
  {
186
- "epoch": 0.54,
187
- "learning_rate": 4.4571294083568686e-05,
188
- "loss": 0.3035,
189
  "step": 15000
190
  },
191
  {
192
- "epoch": 0.56,
193
- "learning_rate": 4.4390252733724384e-05,
194
- "loss": 0.2955,
195
  "step": 15500
196
  },
197
  {
198
- "epoch": 0.58,
199
- "learning_rate": 4.4209211383880075e-05,
200
- "loss": 0.2821,
201
  "step": 16000
202
  },
203
  {
204
- "epoch": 0.6,
205
- "learning_rate": 4.4028170034035774e-05,
206
- "loss": 0.2722,
207
  "step": 16500
208
  },
209
  {
210
- "epoch": 0.62,
211
- "learning_rate": 4.384712868419147e-05,
212
- "loss": 0.2771,
213
  "step": 17000
214
  },
215
  {
216
- "epoch": 0.63,
217
- "learning_rate": 4.3666449417046856e-05,
218
- "loss": 0.2767,
219
  "step": 17500
220
  },
221
  {
222
- "epoch": 0.65,
223
- "learning_rate": 4.348540806720255e-05,
224
- "loss": 0.2724,
225
  "step": 18000
226
  },
227
  {
228
- "epoch": 0.67,
229
- "learning_rate": 4.3304366717358246e-05,
230
- "loss": 0.2537,
231
  "step": 18500
232
  },
233
  {
234
- "epoch": 0.69,
235
- "learning_rate": 4.3123325367513944e-05,
236
- "loss": 0.2591,
237
  "step": 19000
238
  },
239
  {
240
- "epoch": 0.71,
241
- "learning_rate": 4.2942284017669636e-05,
242
- "loss": 0.2573,
243
  "step": 19500
244
  },
245
  {
246
- "epoch": 0.72,
247
- "learning_rate": 4.2761242667825334e-05,
248
- "loss": 0.2586,
249
  "step": 20000
250
  },
251
  {
252
- "epoch": 0.74,
253
- "learning_rate": 4.2580201317981026e-05,
254
- "loss": 0.2514,
255
  "step": 20500
256
  },
257
  {
258
- "epoch": 0.76,
259
- "learning_rate": 4.2399159968136724e-05,
260
- "loss": 0.2403,
261
  "step": 21000
262
  },
263
  {
264
- "epoch": 0.78,
265
- "learning_rate": 4.2218118618292416e-05,
266
- "loss": 0.2443,
267
  "step": 21500
268
  },
269
  {
270
- "epoch": 0.8,
271
- "learning_rate": 4.203780143384749e-05,
272
- "loss": 0.2393,
273
  "step": 22000
274
  },
275
  {
276
- "epoch": 0.81,
277
- "learning_rate": 4.185676008400319e-05,
278
- "loss": 0.2301,
279
  "step": 22500
280
  },
281
  {
282
- "epoch": 0.83,
283
- "learning_rate": 4.167571873415889e-05,
284
- "loss": 0.2324,
285
  "step": 23000
286
  },
287
  {
288
- "epoch": 0.85,
289
- "learning_rate": 4.149467738431458e-05,
290
- "loss": 0.2353,
291
  "step": 23500
292
  },
293
  {
294
- "epoch": 0.87,
295
- "learning_rate": 4.131363603447028e-05,
296
- "loss": 0.2248,
297
  "step": 24000
298
  },
299
  {
300
- "epoch": 0.89,
301
- "learning_rate": 4.113259468462597e-05,
302
- "loss": 0.2207,
303
  "step": 24500
304
  },
305
  {
306
- "epoch": 0.91,
307
- "learning_rate": 4.095155333478167e-05,
308
- "loss": 0.2178,
309
  "step": 25000
310
  },
311
  {
312
- "epoch": 0.92,
313
- "learning_rate": 4.077051198493736e-05,
314
- "loss": 0.2048,
315
- "step": 25500
316
- },
317
- {
318
- "epoch": 0.94,
319
- "learning_rate": 4.0589832717792745e-05,
320
- "loss": 0.2163,
321
- "step": 26000
322
- },
323
- {
324
- "epoch": 0.96,
325
- "learning_rate": 4.040879136794844e-05,
326
- "loss": 0.2139,
327
- "step": 26500
328
- },
329
- {
330
- "epoch": 0.98,
331
- "learning_rate": 4.0227750018104134e-05,
332
- "loss": 0.2135,
333
- "step": 27000
334
- },
335
- {
336
- "epoch": 1.0,
337
- "learning_rate": 4.004670866825983e-05,
338
- "loss": 0.1986,
339
- "step": 27500
340
- },
341
- {
342
- "epoch": 1.01,
343
- "learning_rate": 3.986566731841553e-05,
344
- "loss": 0.1597,
345
- "step": 28000
346
- },
347
- {
348
- "epoch": 1.03,
349
- "learning_rate": 3.968462596857123e-05,
350
- "loss": 0.154,
351
- "step": 28500
352
- },
353
- {
354
- "epoch": 1.05,
355
- "learning_rate": 3.950358461872692e-05,
356
- "loss": 0.1381,
357
- "step": 29000
358
- },
359
- {
360
- "epoch": 1.07,
361
- "learning_rate": 3.932326743428199e-05,
362
- "loss": 0.1525,
363
- "step": 29500
364
- },
365
- {
366
- "epoch": 1.09,
367
- "learning_rate": 3.914222608443769e-05,
368
- "loss": 0.143,
369
- "step": 30000
370
- },
371
- {
372
- "epoch": 1.1,
373
- "learning_rate": 3.896118473459338e-05,
374
- "loss": 0.1422,
375
- "step": 30500
376
- },
377
- {
378
- "epoch": 1.12,
379
- "learning_rate": 3.878014338474908e-05,
380
- "loss": 0.1348,
381
- "step": 31000
382
- },
383
- {
384
- "epoch": 1.14,
385
- "learning_rate": 3.859910203490477e-05,
386
- "loss": 0.1421,
387
- "step": 31500
388
- },
389
- {
390
- "epoch": 1.16,
391
- "learning_rate": 3.841806068506047e-05,
392
- "loss": 0.1438,
393
- "step": 32000
394
- },
395
- {
396
- "epoch": 1.18,
397
- "learning_rate": 3.823701933521617e-05,
398
- "loss": 0.1375,
399
- "step": 32500
400
- },
401
- {
402
- "epoch": 1.19,
403
- "learning_rate": 3.805597798537186e-05,
404
- "loss": 0.1418,
405
- "step": 33000
406
- },
407
- {
408
- "epoch": 1.21,
409
- "learning_rate": 3.787493663552756e-05,
410
- "loss": 0.1336,
411
- "step": 33500
412
- },
413
- {
414
- "epoch": 1.23,
415
- "learning_rate": 3.769425736838294e-05,
416
- "loss": 0.1339,
417
- "step": 34000
418
- },
419
- {
420
- "epoch": 1.25,
421
- "learning_rate": 3.751321601853864e-05,
422
- "loss": 0.1383,
423
- "step": 34500
424
- },
425
- {
426
- "epoch": 1.27,
427
- "learning_rate": 3.733217466869433e-05,
428
- "loss": 0.1298,
429
- "step": 35000
430
- },
431
- {
432
- "epoch": 1.29,
433
- "learning_rate": 3.715113331885003e-05,
434
- "loss": 0.1314,
435
- "step": 35500
436
- },
437
- {
438
- "epoch": 1.3,
439
- "learning_rate": 3.697009196900572e-05,
440
- "loss": 0.1249,
441
- "step": 36000
442
- },
443
- {
444
- "epoch": 1.32,
445
- "learning_rate": 3.678905061916142e-05,
446
- "loss": 0.1289,
447
- "step": 36500
448
- },
449
- {
450
- "epoch": 1.34,
451
- "learning_rate": 3.660800926931711e-05,
452
- "loss": 0.1289,
453
- "step": 37000
454
- },
455
- {
456
- "epoch": 1.36,
457
- "learning_rate": 3.642696791947281e-05,
458
- "loss": 0.1266,
459
- "step": 37500
460
- },
461
- {
462
- "epoch": 1.38,
463
- "learning_rate": 3.62459265696285e-05,
464
- "loss": 0.1201,
465
- "step": 38000
466
- },
467
- {
468
- "epoch": 1.39,
469
- "learning_rate": 3.606524730248389e-05,
470
- "loss": 0.1224,
471
- "step": 38500
472
- },
473
- {
474
- "epoch": 1.41,
475
- "learning_rate": 3.5884205952639583e-05,
476
- "loss": 0.1191,
477
- "step": 39000
478
- },
479
- {
480
- "epoch": 1.43,
481
- "learning_rate": 3.570316460279528e-05,
482
- "loss": 0.1157,
483
- "step": 39500
484
- },
485
- {
486
- "epoch": 1.45,
487
- "learning_rate": 3.552212325295098e-05,
488
- "loss": 0.1167,
489
- "step": 40000
490
- },
491
- {
492
- "epoch": 1.47,
493
- "learning_rate": 3.534144398580636e-05,
494
- "loss": 0.119,
495
- "step": 40500
496
- },
497
- {
498
- "epoch": 1.48,
499
- "learning_rate": 3.5160402635962056e-05,
500
- "loss": 0.1092,
501
- "step": 41000
502
- },
503
- {
504
- "epoch": 1.5,
505
- "learning_rate": 3.4979361286117754e-05,
506
- "loss": 0.1142,
507
- "step": 41500
508
- },
509
- {
510
- "epoch": 1.52,
511
- "learning_rate": 3.479831993627345e-05,
512
- "loss": 0.1156,
513
- "step": 42000
514
- },
515
- {
516
- "epoch": 1.54,
517
- "learning_rate": 3.4617278586429144e-05,
518
- "loss": 0.1122,
519
- "step": 42500
520
- },
521
- {
522
- "epoch": 1.56,
523
- "learning_rate": 3.443623723658484e-05,
524
- "loss": 0.1104,
525
- "step": 43000
526
- },
527
- {
528
- "epoch": 1.58,
529
- "learning_rate": 3.4255195886740534e-05,
530
- "loss": 0.1076,
531
- "step": 43500
532
- },
533
- {
534
- "epoch": 1.59,
535
- "learning_rate": 3.407415453689623e-05,
536
- "loss": 0.1096,
537
- "step": 44000
538
- },
539
- {
540
- "epoch": 1.61,
541
- "learning_rate": 3.3893113187051924e-05,
542
- "loss": 0.1094,
543
- "step": 44500
544
- },
545
- {
546
- "epoch": 1.63,
547
- "learning_rate": 3.371243391990731e-05,
548
- "loss": 0.105,
549
- "step": 45000
550
- },
551
- {
552
- "epoch": 1.65,
553
- "learning_rate": 3.3531392570063e-05,
554
- "loss": 0.1059,
555
- "step": 45500
556
- },
557
- {
558
- "epoch": 1.67,
559
- "learning_rate": 3.33503512202187e-05,
560
- "loss": 0.1012,
561
- "step": 46000
562
- },
563
- {
564
- "epoch": 1.68,
565
- "learning_rate": 3.3169309870374396e-05,
566
- "loss": 0.0991,
567
- "step": 46500
568
- },
569
- {
570
- "epoch": 1.7,
571
- "learning_rate": 3.2988268520530094e-05,
572
- "loss": 0.0978,
573
- "step": 47000
574
- },
575
- {
576
- "epoch": 1.72,
577
- "learning_rate": 3.2807227170685786e-05,
578
- "loss": 0.1003,
579
- "step": 47500
580
- },
581
- {
582
- "epoch": 1.74,
583
- "learning_rate": 3.2626185820841484e-05,
584
- "loss": 0.1,
585
- "step": 48000
586
- },
587
- {
588
- "epoch": 1.76,
589
- "learning_rate": 3.244550655369686e-05,
590
- "loss": 0.0976,
591
- "step": 48500
592
- },
593
- {
594
- "epoch": 1.77,
595
- "learning_rate": 3.226446520385256e-05,
596
- "loss": 0.0961,
597
- "step": 49000
598
- },
599
- {
600
- "epoch": 1.79,
601
- "learning_rate": 3.208342385400825e-05,
602
- "loss": 0.1011,
603
- "step": 49500
604
- },
605
- {
606
- "epoch": 1.81,
607
- "learning_rate": 3.190238250416395e-05,
608
- "loss": 0.0902,
609
- "step": 50000
610
- },
611
- {
612
- "epoch": 1.83,
613
- "learning_rate": 3.1721703237019334e-05,
614
- "loss": 0.0936,
615
- "step": 50500
616
- },
617
- {
618
- "epoch": 1.85,
619
- "learning_rate": 3.154066188717503e-05,
620
- "loss": 0.0903,
621
- "step": 51000
622
- },
623
- {
624
- "epoch": 1.86,
625
- "learning_rate": 3.135962053733073e-05,
626
- "loss": 0.082,
627
- "step": 51500
628
- },
629
- {
630
- "epoch": 1.88,
631
- "learning_rate": 3.1178941270186115e-05,
632
- "loss": 0.0841,
633
- "step": 52000
634
- },
635
- {
636
- "epoch": 1.9,
637
- "learning_rate": 3.0997899920341806e-05,
638
- "loss": 0.0898,
639
- "step": 52500
640
- },
641
- {
642
- "epoch": 1.92,
643
- "learning_rate": 3.0816858570497505e-05,
644
- "loss": 0.085,
645
- "step": 53000
646
- },
647
- {
648
- "epoch": 1.94,
649
- "learning_rate": 3.06358172206532e-05,
650
- "loss": 0.0807,
651
- "step": 53500
652
- },
653
- {
654
- "epoch": 1.96,
655
- "learning_rate": 3.0454775870808895e-05,
656
- "loss": 0.0886,
657
- "step": 54000
658
- },
659
- {
660
- "epoch": 1.97,
661
- "learning_rate": 3.0273734520964593e-05,
662
- "loss": 0.0818,
663
- "step": 54500
664
- },
665
- {
666
- "epoch": 1.99,
667
- "learning_rate": 3.0092693171120284e-05,
668
- "loss": 0.0839,
669
- "step": 55000
670
- },
671
- {
672
- "epoch": 2.01,
673
- "learning_rate": 2.9911651821275983e-05,
674
- "loss": 0.0581,
675
- "step": 55500
676
- },
677
- {
678
- "epoch": 2.03,
679
- "learning_rate": 2.9730610471431674e-05,
680
- "loss": 0.0504,
681
- "step": 56000
682
- },
683
- {
684
- "epoch": 2.05,
685
- "learning_rate": 2.9549569121587373e-05,
686
- "loss": 0.0486,
687
- "step": 56500
688
- },
689
- {
690
- "epoch": 2.06,
691
- "learning_rate": 2.9368527771743064e-05,
692
- "loss": 0.0463,
693
- "step": 57000
694
- },
695
- {
696
- "epoch": 2.08,
697
- "learning_rate": 2.9187486421898762e-05,
698
- "loss": 0.0545,
699
- "step": 57500
700
- },
701
- {
702
- "epoch": 2.1,
703
- "learning_rate": 2.900644507205446e-05,
704
- "loss": 0.0482,
705
- "step": 58000
706
- },
707
- {
708
- "epoch": 2.12,
709
- "learning_rate": 2.8825403722210152e-05,
710
- "loss": 0.0431,
711
- "step": 58500
712
- },
713
- {
714
- "epoch": 2.14,
715
- "learning_rate": 2.864436237236585e-05,
716
- "loss": 0.0494,
717
- "step": 59000
718
- },
719
- {
720
- "epoch": 2.15,
721
- "learning_rate": 2.8463683105221235e-05,
722
- "loss": 0.0522,
723
- "step": 59500
724
- },
725
- {
726
- "epoch": 4.34,
727
- "learning_rate": 6.565283510753857e-06,
728
- "loss": 0.0392,
729
- "step": 60000
730
- },
731
- {
732
- "epoch": 4.38,
733
- "learning_rate": 6.203200811065248e-06,
734
- "loss": 0.0329,
735
- "step": 60500
736
- },
737
- {
738
- "epoch": 4.42,
739
- "learning_rate": 5.841118111376639e-06,
740
- "loss": 0.0306,
741
- "step": 61000
742
- },
743
- {
744
- "epoch": 4.45,
745
- "learning_rate": 5.479759577087408e-06,
746
- "loss": 0.0281,
747
- "step": 61500
748
- },
749
- {
750
- "epoch": 4.49,
751
- "learning_rate": 5.117676877398798e-06,
752
- "loss": 0.0275,
753
- "step": 62000
754
- },
755
- {
756
- "epoch": 4.53,
757
- "learning_rate": 4.755594177710189e-06,
758
- "loss": 0.0238,
759
- "step": 62500
760
- },
761
- {
762
- "epoch": 4.56,
763
- "learning_rate": 4.394235643420958e-06,
764
- "loss": 0.0253,
765
- "step": 63000
766
- },
767
- {
768
- "epoch": 4.6,
769
- "learning_rate": 4.032152943732348e-06,
770
- "loss": 0.0222,
771
- "step": 63500
772
- },
773
- {
774
- "epoch": 4.63,
775
- "learning_rate": 3.67007024404374e-06,
776
- "loss": 0.0234,
777
- "step": 64000
778
- },
779
- {
780
- "epoch": 4.67,
781
- "learning_rate": 3.307987544355131e-06,
782
- "loss": 0.0214,
783
- "step": 64500
784
- },
785
- {
786
- "epoch": 4.71,
787
- "learning_rate": 2.945904844666522e-06,
788
- "loss": 0.0217,
789
- "step": 65000
790
- },
791
- {
792
- "epoch": 4.74,
793
- "learning_rate": 2.583822144977913e-06,
794
- "loss": 0.022,
795
- "step": 65500
796
- },
797
- {
798
- "epoch": 4.78,
799
- "learning_rate": 2.221739445289304e-06,
800
- "loss": 0.0213,
801
- "step": 66000
802
- },
803
- {
804
- "epoch": 4.82,
805
- "learning_rate": 1.8596567456006953e-06,
806
- "loss": 0.0216,
807
- "step": 66500
808
- },
809
- {
810
- "epoch": 4.85,
811
- "learning_rate": 1.4975740459120864e-06,
812
- "loss": 0.0209,
813
- "step": 67000
814
- },
815
- {
816
- "epoch": 4.89,
817
- "learning_rate": 1.1362155116228547e-06,
818
- "loss": 0.0193,
819
- "step": 67500
820
- },
821
- {
822
- "epoch": 4.92,
823
- "learning_rate": 7.748569773336231e-07,
824
- "loss": 0.0199,
825
- "step": 68000
826
- },
827
- {
828
- "epoch": 4.96,
829
- "learning_rate": 4.1349844304439136e-07,
830
- "loss": 0.0183,
831
- "step": 68500
832
- },
833
- {
834
- "epoch": 5.0,
835
- "learning_rate": 5.141574335578247e-08,
836
- "loss": 0.0212,
837
- "step": 69000
838
- },
839
- {
840
- "epoch": 5.0,
841
- "step": 69045,
842
- "total_flos": 8.175381920769638e+16,
843
- "train_loss": 0.003351212132286996,
844
- "train_runtime": 2726.0601,
845
- "train_samples_per_second": 405.237,
846
- "train_steps_per_second": 25.328
847
  }
848
  ],
849
  "logging_steps": 500,
850
- "max_steps": 69045,
851
- "num_train_epochs": 5,
852
  "save_steps": 500,
853
- "total_flos": 8.175381920769638e+16,
854
  "trial_name": null,
855
  "trial_params": null
856
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.0,
5
  "eval_steps": 500,
6
+ "global_step": 25012,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.08,
13
+ "learning_rate": 4.9018471133855755e-05,
14
+ "loss": 1.9521,
15
  "step": 500
16
  },
17
  {
18
+ "epoch": 0.16,
19
+ "learning_rate": 4.801895090356629e-05,
20
+ "loss": 0.6834,
21
  "step": 1000
22
  },
23
  {
24
+ "epoch": 0.24,
25
+ "learning_rate": 4.701943067327683e-05,
26
+ "loss": 0.55,
27
  "step": 1500
28
  },
29
  {
30
+ "epoch": 0.32,
31
+ "learning_rate": 4.6019910442987366e-05,
32
+ "loss": 0.4952,
33
  "step": 2000
34
  },
35
  {
36
+ "epoch": 0.4,
37
+ "learning_rate": 4.502039021269791e-05,
38
+ "loss": 0.4475,
39
  "step": 2500
40
  },
41
  {
42
+ "epoch": 0.48,
43
+ "learning_rate": 4.402086998240845e-05,
44
+ "loss": 0.4157,
45
  "step": 3000
46
  },
47
  {
48
+ "epoch": 0.56,
49
+ "learning_rate": 4.3021349752118984e-05,
50
+ "loss": 0.3926,
51
  "step": 3500
52
  },
53
  {
54
+ "epoch": 0.64,
55
+ "learning_rate": 4.2021829521829525e-05,
56
+ "loss": 0.3788,
57
  "step": 4000
58
  },
59
  {
60
+ "epoch": 0.72,
61
+ "learning_rate": 4.102230929154006e-05,
62
+ "loss": 0.4176,
63
  "step": 4500
64
  },
65
  {
66
+ "epoch": 0.8,
67
+ "learning_rate": 4.00227890612506e-05,
68
+ "loss": 0.3467,
69
  "step": 5000
70
  },
71
  {
72
+ "epoch": 0.88,
73
+ "learning_rate": 3.902326883096114e-05,
74
+ "loss": 0.3491,
75
  "step": 5500
76
  },
77
  {
78
+ "epoch": 0.96,
79
+ "learning_rate": 3.802374860067168e-05,
80
+ "loss": 0.321,
81
  "step": 6000
82
  },
83
  {
84
+ "epoch": 1.04,
85
+ "learning_rate": 3.702422837038222e-05,
86
+ "loss": 0.2809,
87
  "step": 6500
88
  },
89
  {
90
+ "epoch": 1.12,
91
+ "learning_rate": 3.6024708140092754e-05,
92
+ "loss": 0.2329,
93
  "step": 7000
94
  },
95
  {
96
+ "epoch": 1.2,
97
+ "learning_rate": 3.5025187909803295e-05,
98
+ "loss": 0.2385,
99
  "step": 7500
100
  },
101
  {
102
+ "epoch": 1.28,
103
+ "learning_rate": 3.402566767951384e-05,
104
+ "loss": 0.2367,
105
  "step": 8000
106
  },
107
  {
108
+ "epoch": 1.36,
109
+ "learning_rate": 3.302614744922438e-05,
110
+ "loss": 0.232,
111
  "step": 8500
112
  },
113
  {
114
+ "epoch": 1.44,
115
+ "learning_rate": 3.202662721893491e-05,
116
+ "loss": 0.2325,
117
  "step": 9000
118
  },
119
  {
120
+ "epoch": 1.52,
121
+ "learning_rate": 3.102710698864545e-05,
122
+ "loss": 0.229,
123
  "step": 9500
124
  },
125
  {
126
+ "epoch": 1.6,
127
+ "learning_rate": 3.0027586758355993e-05,
128
+ "loss": 0.2213,
129
  "step": 10000
130
  },
131
  {
132
+ "epoch": 1.68,
133
+ "learning_rate": 2.9028066528066527e-05,
134
+ "loss": 0.2153,
135
  "step": 10500
136
  },
137
  {
138
+ "epoch": 1.76,
139
+ "learning_rate": 2.802854629777707e-05,
140
+ "loss": 0.2163,
141
  "step": 11000
142
  },
143
  {
144
+ "epoch": 1.84,
145
+ "learning_rate": 2.7029026067487607e-05,
146
+ "loss": 0.2152,
147
  "step": 11500
148
  },
149
  {
150
+ "epoch": 1.92,
151
+ "learning_rate": 2.602950583719815e-05,
152
+ "loss": 0.2061,
153
  "step": 12000
154
  },
155
  {
156
+ "epoch": 2.0,
157
+ "learning_rate": 2.5029985606908683e-05,
158
+ "loss": 0.205,
159
  "step": 12500
160
  },
161
  {
162
+ "epoch": 2.08,
163
+ "learning_rate": 2.4030465376619225e-05,
164
+ "loss": 0.1256,
165
  "step": 13000
166
  },
167
  {
168
+ "epoch": 2.16,
169
+ "learning_rate": 2.3030945146329763e-05,
170
+ "loss": 0.1247,
171
  "step": 13500
172
  },
173
  {
174
+ "epoch": 2.24,
175
+ "learning_rate": 2.20314249160403e-05,
176
+ "loss": 0.1249,
177
  "step": 14000
178
  },
179
  {
180
+ "epoch": 2.32,
181
+ "learning_rate": 2.1033903726211422e-05,
182
+ "loss": 0.1263,
183
  "step": 14500
184
  },
185
  {
186
+ "epoch": 2.4,
187
+ "learning_rate": 2.0034383495921957e-05,
188
+ "loss": 0.1283,
189
  "step": 15000
190
  },
191
  {
192
+ "epoch": 2.48,
193
+ "learning_rate": 1.9034863265632495e-05,
194
+ "loss": 0.1279,
195
  "step": 15500
196
  },
197
  {
198
+ "epoch": 2.56,
199
+ "learning_rate": 1.8035343035343037e-05,
200
+ "loss": 0.1247,
201
  "step": 16000
202
  },
203
  {
204
+ "epoch": 2.64,
205
+ "learning_rate": 1.7035822805053575e-05,
206
+ "loss": 0.1205,
207
  "step": 16500
208
  },
209
  {
210
+ "epoch": 2.72,
211
+ "learning_rate": 1.6036302574764113e-05,
212
+ "loss": 0.1227,
213
  "step": 17000
214
  },
215
  {
216
+ "epoch": 2.8,
217
+ "learning_rate": 1.5036782344474653e-05,
218
+ "loss": 0.1173,
219
  "step": 17500
220
  },
221
  {
222
+ "epoch": 2.88,
223
+ "learning_rate": 1.4037262114185193e-05,
224
+ "loss": 0.1211,
225
  "step": 18000
226
  },
227
  {
228
+ "epoch": 2.96,
229
+ "learning_rate": 1.3037741883895729e-05,
230
+ "loss": 0.1209,
231
  "step": 18500
232
  },
233
  {
234
+ "epoch": 3.04,
235
+ "learning_rate": 1.203822165360627e-05,
236
+ "loss": 0.0903,
237
  "step": 19000
238
  },
239
  {
240
+ "epoch": 3.12,
241
+ "learning_rate": 1.1038701423316809e-05,
242
+ "loss": 0.0639,
243
  "step": 19500
244
  },
245
  {
246
+ "epoch": 3.2,
247
+ "learning_rate": 1.0039181193027347e-05,
248
+ "loss": 0.0616,
249
  "step": 20000
250
  },
251
  {
252
+ "epoch": 3.28,
253
+ "learning_rate": 9.041660003198465e-06,
254
+ "loss": 0.0578,
255
  "step": 20500
256
  },
257
  {
258
+ "epoch": 3.36,
259
+ "learning_rate": 8.042139772909003e-06,
260
+ "loss": 0.0595,
261
  "step": 21000
262
  },
263
  {
264
+ "epoch": 3.44,
265
+ "learning_rate": 7.042619542619543e-06,
266
+ "loss": 0.0564,
267
  "step": 21500
268
  },
269
  {
270
+ "epoch": 3.52,
271
+ "learning_rate": 6.043099312330082e-06,
272
+ "loss": 0.0594,
273
  "step": 22000
274
  },
275
  {
276
+ "epoch": 3.6,
277
+ "learning_rate": 5.043579082040621e-06,
278
+ "loss": 0.0566,
279
  "step": 22500
280
  },
281
  {
282
+ "epoch": 3.68,
283
+ "learning_rate": 4.04405885175116e-06,
284
+ "loss": 0.0582,
285
  "step": 23000
286
  },
287
  {
288
+ "epoch": 3.76,
289
+ "learning_rate": 3.0445386214616987e-06,
290
+ "loss": 0.0572,
291
  "step": 23500
292
  },
293
  {
294
+ "epoch": 3.84,
295
+ "learning_rate": 2.0450183911722376e-06,
296
+ "loss": 0.0561,
297
  "step": 24000
298
  },
299
  {
300
+ "epoch": 3.92,
301
+ "learning_rate": 1.0454981608827763e-06,
302
+ "loss": 0.0538,
303
  "step": 24500
304
  },
305
  {
306
+ "epoch": 4.0,
307
+ "learning_rate": 4.797697105389413e-08,
308
+ "loss": 0.0546,
309
  "step": 25000
310
  },
311
  {
312
+ "epoch": 4.0,
313
+ "step": 25012,
314
+ "total_flos": 5.323473007568486e+16,
315
+ "train_loss": 0.23955335338535488,
316
+ "train_runtime": 7304.5803,
317
+ "train_samples_per_second": 54.783,
318
+ "train_steps_per_second": 3.424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  }
320
  ],
321
  "logging_steps": 500,
322
+ "max_steps": 25012,
323
+ "num_train_epochs": 4,
324
  "save_steps": 500,
325
+ "total_flos": 5.323473007568486e+16,
326
  "trial_name": null,
327
  "trial_params": null
328
  }