{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 1000, "global_step": 1590, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018867924528301886, "grad_norm": 2.9446594971470272, "learning_rate": 6.289308176100629e-06, "loss": 2.3241, "step": 10 }, { "epoch": 0.03773584905660377, "grad_norm": 3.2052443029455557, "learning_rate": 1.2578616352201259e-05, "loss": 2.2496, "step": 20 }, { "epoch": 0.05660377358490566, "grad_norm": 2.593311686421377, "learning_rate": 1.8867924528301888e-05, "loss": 2.2836, "step": 30 }, { "epoch": 0.07547169811320754, "grad_norm": 2.8033026461409043, "learning_rate": 2.5157232704402517e-05, "loss": 2.2567, "step": 40 }, { "epoch": 0.09433962264150944, "grad_norm": 3.0647958985618127, "learning_rate": 3.144654088050314e-05, "loss": 2.2749, "step": 50 }, { "epoch": 0.11320754716981132, "grad_norm": 2.7224942436893023, "learning_rate": 3.7735849056603776e-05, "loss": 2.3334, "step": 60 }, { "epoch": 0.1320754716981132, "grad_norm": 2.507797193629058, "learning_rate": 4.402515723270441e-05, "loss": 2.3378, "step": 70 }, { "epoch": 0.1509433962264151, "grad_norm": 2.791876220429587, "learning_rate": 5.0314465408805034e-05, "loss": 2.3462, "step": 80 }, { "epoch": 0.16981132075471697, "grad_norm": 2.8934882752144877, "learning_rate": 5.660377358490566e-05, "loss": 2.3832, "step": 90 }, { "epoch": 0.18867924528301888, "grad_norm": 2.74130339104755, "learning_rate": 6.289308176100629e-05, "loss": 2.3658, "step": 100 }, { "epoch": 0.20754716981132076, "grad_norm": 2.464301918797891, "learning_rate": 6.918238993710691e-05, "loss": 2.3894, "step": 110 }, { "epoch": 0.22641509433962265, "grad_norm": 3.5787748721128176, "learning_rate": 7.547169811320755e-05, "loss": 2.4635, "step": 120 }, { "epoch": 0.24528301886792453, "grad_norm": 3.8614586522023586, "learning_rate": 8.176100628930818e-05, "loss": 2.4099, "step": 130 }, { "epoch": 0.2641509433962264, "grad_norm": 2.624798812422503, "learning_rate": 8.805031446540882e-05, "loss": 2.4141, "step": 140 }, { "epoch": 0.2830188679245283, "grad_norm": 3.4083833226002174, "learning_rate": 9.433962264150944e-05, "loss": 2.4505, "step": 150 }, { "epoch": 0.3018867924528302, "grad_norm": 2.4164498878680254, "learning_rate": 9.999987950741765e-05, "loss": 2.4853, "step": 160 }, { "epoch": 0.32075471698113206, "grad_norm": 4.2037868049637, "learning_rate": 9.9985421100216e-05, "loss": 2.529, "step": 170 }, { "epoch": 0.33962264150943394, "grad_norm": 10.425711730519438, "learning_rate": 9.99468721610658e-05, "loss": 2.5123, "step": 180 }, { "epoch": 0.3584905660377358, "grad_norm": 6.075608387062913, "learning_rate": 9.988425126867315e-05, "loss": 2.5137, "step": 190 }, { "epoch": 0.37735849056603776, "grad_norm": 5.979582059920921, "learning_rate": 9.979758860325019e-05, "loss": 2.4818, "step": 200 }, { "epoch": 0.39622641509433965, "grad_norm": 300.5526680134449, "learning_rate": 9.968692593196944e-05, "loss": 2.5084, "step": 210 }, { "epoch": 0.41509433962264153, "grad_norm": 2.3641585810185437, "learning_rate": 9.955231658883432e-05, "loss": 2.4667, "step": 220 }, { "epoch": 0.4339622641509434, "grad_norm": 2.399558237267707, "learning_rate": 9.93938254489746e-05, "loss": 2.4815, "step": 230 }, { "epoch": 0.4528301886792453, "grad_norm": 2.291187744959764, "learning_rate": 9.921152889737984e-05, "loss": 2.465, "step": 240 }, { "epoch": 0.4716981132075472, "grad_norm": 2.2425372020480685, "learning_rate": 
9.900551479208552e-05, "loss": 2.4827, "step": 250 }, { "epoch": 0.49056603773584906, "grad_norm": 2.106996905280666, "learning_rate": 9.877588242182975e-05, "loss": 2.5077, "step": 260 }, { "epoch": 0.5094339622641509, "grad_norm": 2.56597906125238, "learning_rate": 9.852274245820096e-05, "loss": 2.5812, "step": 270 }, { "epoch": 0.5283018867924528, "grad_norm": 2.1161401839810323, "learning_rate": 9.824621690229965e-05, "loss": 2.5047, "step": 280 }, { "epoch": 0.5471698113207547, "grad_norm": 2.9746454428316467, "learning_rate": 9.79464390259397e-05, "loss": 2.4985, "step": 290 }, { "epoch": 0.5660377358490566, "grad_norm": 2.1237673830934156, "learning_rate": 9.762355330741796e-05, "loss": 2.4943, "step": 300 }, { "epoch": 0.5849056603773585, "grad_norm": 1.8440846284987655, "learning_rate": 9.727771536188275e-05, "loss": 2.4536, "step": 310 }, { "epoch": 0.6037735849056604, "grad_norm": 3.3815527986620526, "learning_rate": 9.690909186633492e-05, "loss": 2.4837, "step": 320 }, { "epoch": 0.6226415094339622, "grad_norm": 2.7797010587604953, "learning_rate": 9.651786047929773e-05, "loss": 2.5074, "step": 330 }, { "epoch": 0.6415094339622641, "grad_norm": 2.0947283835947794, "learning_rate": 9.610420975519408e-05, "loss": 2.441, "step": 340 }, { "epoch": 0.660377358490566, "grad_norm": 1.9288902952601223, "learning_rate": 9.566833905347245e-05, "loss": 2.4885, "step": 350 }, { "epoch": 0.6792452830188679, "grad_norm": 2.004635564736395, "learning_rate": 9.521045844252552e-05, "loss": 2.4342, "step": 360 }, { "epoch": 0.6981132075471698, "grad_norm": 1.6511867070394874, "learning_rate": 9.473078859844728e-05, "loss": 2.4425, "step": 370 }, { "epoch": 0.7169811320754716, "grad_norm": 1.4598720970043289, "learning_rate": 9.422956069867807e-05, "loss": 2.4567, "step": 380 }, { "epoch": 0.7358490566037735, "grad_norm": 1.5295808219144331, "learning_rate": 9.370701631058829e-05, "loss": 2.4636, "step": 390 }, { "epoch": 0.7547169811320755, "grad_norm": 1.606602994374719, "learning_rate": 9.316340727505468e-05, "loss": 2.4707, "step": 400 }, { "epoch": 0.7735849056603774, "grad_norm": 1.5773231811089237, "learning_rate": 9.259899558508543e-05, "loss": 2.4242, "step": 410 }, { "epoch": 0.7924528301886793, "grad_norm": 1.5694593702673683, "learning_rate": 9.201405325955221e-05, "loss": 2.4754, "step": 420 }, { "epoch": 0.8113207547169812, "grad_norm": 1.4513304920200845, "learning_rate": 9.14088622120905e-05, "loss": 2.4735, "step": 430 }, { "epoch": 0.8301886792452831, "grad_norm": 2.2119679560211436, "learning_rate": 9.078371411523084e-05, "loss": 2.4511, "step": 440 }, { "epoch": 0.8490566037735849, "grad_norm": 1.4837853314532448, "learning_rate": 9.013891025982704e-05, "loss": 2.4627, "step": 450 }, { "epoch": 0.8679245283018868, "grad_norm": 1.548323059472257, "learning_rate": 8.947476140984856e-05, "loss": 2.4804, "step": 460 }, { "epoch": 0.8867924528301887, "grad_norm": 1.7369189464037587, "learning_rate": 8.879158765260767e-05, "loss": 2.4872, "step": 470 }, { "epoch": 0.9056603773584906, "grad_norm": 1.4222000085980089, "learning_rate": 8.808971824449275e-05, "loss": 2.4847, "step": 480 }, { "epoch": 0.9245283018867925, "grad_norm": 1.39169720237414, "learning_rate": 8.736949145228295e-05, "loss": 2.4873, "step": 490 }, { "epoch": 0.9433962264150944, "grad_norm": 1.5495461414725966, "learning_rate": 8.66312543901201e-05, "loss": 2.4738, "step": 500 }, { "epoch": 0.9622641509433962, "grad_norm": 1.5689856394055257, "learning_rate": 8.587536285221656e-05, "loss": 2.4211, "step": 510 }, { 
"epoch": 0.9811320754716981, "grad_norm": 1.559462761559426, "learning_rate": 8.510218114137992e-05, "loss": 2.4183, "step": 520 }, { "epoch": 1.0, "grad_norm": 1.38445361325361, "learning_rate": 8.43120818934367e-05, "loss": 2.459, "step": 530 }, { "epoch": 1.0188679245283019, "grad_norm": 1.8042327175721304, "learning_rate": 8.350544589764016e-05, "loss": 1.8838, "step": 540 }, { "epoch": 1.0377358490566038, "grad_norm": 1.8176496290402602, "learning_rate": 8.268266191314848e-05, "loss": 1.8624, "step": 550 }, { "epoch": 1.0566037735849056, "grad_norm": 1.8868344352432986, "learning_rate": 8.184412648166183e-05, "loss": 1.8182, "step": 560 }, { "epoch": 1.0754716981132075, "grad_norm": 1.7299260995769612, "learning_rate": 8.099024373630854e-05, "loss": 1.8391, "step": 570 }, { "epoch": 1.0943396226415094, "grad_norm": 1.9113984544679725, "learning_rate": 8.01214252068728e-05, "loss": 1.8545, "step": 580 }, { "epoch": 1.1132075471698113, "grad_norm": 1.794174287705714, "learning_rate": 7.923808962145734e-05, "loss": 1.8367, "step": 590 }, { "epoch": 1.1320754716981132, "grad_norm": 1.5751797225379325, "learning_rate": 7.83406627046769e-05, "loss": 1.8149, "step": 600 }, { "epoch": 1.150943396226415, "grad_norm": 1.9105350922209694, "learning_rate": 7.742957697247984e-05, "loss": 1.8061, "step": 610 }, { "epoch": 1.169811320754717, "grad_norm": 1.7630498555967447, "learning_rate": 7.650527152369647e-05, "loss": 1.8411, "step": 620 }, { "epoch": 1.1886792452830188, "grad_norm": 1.5261816105997068, "learning_rate": 7.556819182841497e-05, "loss": 1.8264, "step": 630 }, { "epoch": 1.2075471698113207, "grad_norm": 1.9369411893196908, "learning_rate": 7.461878951328653e-05, "loss": 1.8954, "step": 640 }, { "epoch": 1.2264150943396226, "grad_norm": 1.7688000917923798, "learning_rate": 7.365752214386321e-05, "loss": 1.8346, "step": 650 }, { "epoch": 1.2452830188679245, "grad_norm": 1.6569058541238642, "learning_rate": 7.268485300407393e-05, "loss": 1.8805, "step": 660 }, { "epoch": 1.2641509433962264, "grad_norm": 1.6708545601020437, "learning_rate": 7.17012508729441e-05, "loss": 1.7728, "step": 670 }, { "epoch": 1.2830188679245282, "grad_norm": 1.652310201967167, "learning_rate": 7.070718979866702e-05, "loss": 1.8718, "step": 680 }, { "epoch": 1.3018867924528301, "grad_norm": 1.9899020380799617, "learning_rate": 6.970314887013584e-05, "loss": 1.8535, "step": 690 }, { "epoch": 1.320754716981132, "grad_norm": 1.643783798160392, "learning_rate": 6.868961198604611e-05, "loss": 1.8344, "step": 700 }, { "epoch": 1.3396226415094339, "grad_norm": 1.8435538882684133, "learning_rate": 6.766706762168022e-05, "loss": 1.8759, "step": 710 }, { "epoch": 1.3584905660377358, "grad_norm": 1.6989197917459231, "learning_rate": 6.663600859348616e-05, "loss": 1.7973, "step": 720 }, { "epoch": 1.3773584905660377, "grad_norm": 1.6640164364452317, "learning_rate": 6.55969318215641e-05, "loss": 1.8101, "step": 730 }, { "epoch": 1.3962264150943398, "grad_norm": 1.663705205393152, "learning_rate": 6.455033809017512e-05, "loss": 1.8574, "step": 740 }, { "epoch": 1.4150943396226414, "grad_norm": 1.524574911562225, "learning_rate": 6.34967318063877e-05, "loss": 1.8194, "step": 750 }, { "epoch": 1.4339622641509435, "grad_norm": 1.638744038935454, "learning_rate": 6.24366207569781e-05, "loss": 1.8557, "step": 760 }, { "epoch": 1.4528301886792452, "grad_norm": 1.5905792259719815, "learning_rate": 6.137051586370194e-05, "loss": 1.8403, "step": 770 }, { "epoch": 1.4716981132075473, "grad_norm": 1.4115389229640394, "learning_rate": 
6.029893093705492e-05, "loss": 1.86, "step": 780 }, { "epoch": 1.490566037735849, "grad_norm": 1.5664716217022607, "learning_rate": 5.9222382428641174e-05, "loss": 1.8223, "step": 790 }, { "epoch": 1.509433962264151, "grad_norm": 1.3426007079954652, "learning_rate": 5.814138918226887e-05, "loss": 1.7957, "step": 800 }, { "epoch": 1.5283018867924527, "grad_norm": 1.4496928054044773, "learning_rate": 5.7056472183892806e-05, "loss": 1.8542, "step": 810 }, { "epoch": 1.5471698113207548, "grad_norm": 1.7249530177698127, "learning_rate": 5.5968154310524614e-05, "loss": 1.8043, "step": 820 }, { "epoch": 1.5660377358490565, "grad_norm": 1.4451712049547103, "learning_rate": 5.487696007823161e-05, "loss": 1.7981, "step": 830 }, { "epoch": 1.5849056603773586, "grad_norm": 1.5035729769726907, "learning_rate": 5.378341538934566e-05, "loss": 1.8313, "step": 840 }, { "epoch": 1.6037735849056602, "grad_norm": 1.3823097737594126, "learning_rate": 5.268804727900391e-05, "loss": 1.8476, "step": 850 }, { "epoch": 1.6226415094339623, "grad_norm": 1.41439773210909, "learning_rate": 5.159138366114358e-05, "loss": 1.7863, "step": 860 }, { "epoch": 1.641509433962264, "grad_norm": 1.513162165314957, "learning_rate": 5.049395307407329e-05, "loss": 1.8363, "step": 870 }, { "epoch": 1.6603773584905661, "grad_norm": 1.5375457880909025, "learning_rate": 4.9396284425743326e-05, "loss": 1.8004, "step": 880 }, { "epoch": 1.6792452830188678, "grad_norm": 1.5695919072614308, "learning_rate": 4.829890673883792e-05, "loss": 1.818, "step": 890 }, { "epoch": 1.6981132075471699, "grad_norm": 1.3666688643802247, "learning_rate": 4.7202348895812035e-05, "loss": 1.7885, "step": 900 }, { "epoch": 1.7169811320754715, "grad_norm": 1.6027481528500458, "learning_rate": 4.610713938399601e-05, "loss": 1.7906, "step": 910 }, { "epoch": 1.7358490566037736, "grad_norm": 1.3930291385793376, "learning_rate": 4.5013806040890294e-05, "loss": 1.7858, "step": 920 }, { "epoch": 1.7547169811320755, "grad_norm": 1.4293209085375194, "learning_rate": 4.392287579977374e-05, "loss": 1.7796, "step": 930 }, { "epoch": 1.7735849056603774, "grad_norm": 1.5151788900532224, "learning_rate": 4.2834874435747305e-05, "loss": 1.7666, "step": 940 }, { "epoch": 1.7924528301886793, "grad_norm": 1.5253274784864974, "learning_rate": 4.1750326312336254e-05, "loss": 1.7516, "step": 950 }, { "epoch": 1.8113207547169812, "grad_norm": 1.3957421524480444, "learning_rate": 4.066975412877255e-05, "loss": 1.7904, "step": 960 }, { "epoch": 1.830188679245283, "grad_norm": 1.399046653332325, "learning_rate": 3.959367866807926e-05, "loss": 1.7605, "step": 970 }, { "epoch": 1.849056603773585, "grad_norm": 1.48580398039922, "learning_rate": 3.852261854607866e-05, "loss": 1.8169, "step": 980 }, { "epoch": 1.8679245283018868, "grad_norm": 1.4703556780094864, "learning_rate": 3.7457089961444636e-05, "loss": 1.7652, "step": 990 }, { "epoch": 1.8867924528301887, "grad_norm": 1.4196287584590106, "learning_rate": 3.6397606446920294e-05, "loss": 1.75, "step": 1000 }, { "epoch": 1.8867924528301887, "eval_loss": 2.2884254455566406, "eval_runtime": 165.0682, "eval_samples_per_second": 11.413, "eval_steps_per_second": 2.853, "step": 1000 }, { "epoch": 1.9056603773584906, "grad_norm": 1.442346199206303, "learning_rate": 3.534467862182008e-05, "loss": 1.7847, "step": 1010 }, { "epoch": 1.9245283018867925, "grad_norm": 1.3835916856247392, "learning_rate": 3.4298813945936295e-05, "loss": 1.7737, "step": 1020 }, { "epoch": 1.9433962264150944, "grad_norm": 1.3821884730018883, "learning_rate": 
3.3260516474968285e-05, "loss": 1.7281, "step": 1030 }, { "epoch": 1.9622641509433962, "grad_norm": 1.3924722724907153, "learning_rate": 3.223028661759211e-05, "loss": 1.7924, "step": 1040 }, { "epoch": 1.9811320754716981, "grad_norm": 1.3388702147690976, "learning_rate": 3.12086208942881e-05, "loss": 1.7397, "step": 1050 }, { "epoch": 2.0, "grad_norm": 1.4015243388990968, "learning_rate": 3.019601169804216e-05, "loss": 1.6932, "step": 1060 }, { "epoch": 2.018867924528302, "grad_norm": 1.7480746986263314, "learning_rate": 2.919294705703647e-05, "loss": 0.6881, "step": 1070 }, { "epoch": 2.0377358490566038, "grad_norm": 1.7026666847000977, "learning_rate": 2.819991039944363e-05, "loss": 0.6078, "step": 1080 }, { "epoch": 2.056603773584906, "grad_norm": 1.7917514233908862, "learning_rate": 2.7217380320437978e-05, "loss": 0.6092, "step": 1090 }, { "epoch": 2.0754716981132075, "grad_norm": 1.6723597171494868, "learning_rate": 2.624583035153609e-05, "loss": 0.585, "step": 1100 }, { "epoch": 2.0943396226415096, "grad_norm": 1.63904815463906, "learning_rate": 2.5285728732377613e-05, "loss": 0.577, "step": 1110 }, { "epoch": 2.1132075471698113, "grad_norm": 1.6791437732786112, "learning_rate": 2.4337538185056762e-05, "loss": 0.551, "step": 1120 }, { "epoch": 2.1320754716981134, "grad_norm": 1.6076545037137666, "learning_rate": 2.3401715691112746e-05, "loss": 0.556, "step": 1130 }, { "epoch": 2.150943396226415, "grad_norm": 1.726665027733004, "learning_rate": 2.247871227128709e-05, "loss": 0.5711, "step": 1140 }, { "epoch": 2.169811320754717, "grad_norm": 1.6490156416373818, "learning_rate": 2.1568972768153556e-05, "loss": 0.5601, "step": 1150 }, { "epoch": 2.188679245283019, "grad_norm": 1.7210537816210676, "learning_rate": 2.067293563172581e-05, "loss": 0.5609, "step": 1160 }, { "epoch": 2.207547169811321, "grad_norm": 1.6521402147978896, "learning_rate": 1.9791032708145963e-05, "loss": 0.5417, "step": 1170 }, { "epoch": 2.2264150943396226, "grad_norm": 1.7020323862071838, "learning_rate": 1.8923689031555697e-05, "loss": 0.5635, "step": 1180 }, { "epoch": 2.2452830188679247, "grad_norm": 1.5791599921066155, "learning_rate": 1.807132261925073e-05, "loss": 0.5371, "step": 1190 }, { "epoch": 2.2641509433962264, "grad_norm": 1.6370275383685373, "learning_rate": 1.7234344270216713e-05, "loss": 0.5459, "step": 1200 }, { "epoch": 2.2830188679245285, "grad_norm": 1.649807184686461, "learning_rate": 1.6413157367144354e-05, "loss": 0.5608, "step": 1210 }, { "epoch": 2.30188679245283, "grad_norm": 1.7662002841569535, "learning_rate": 1.5608157682018505e-05, "loss": 0.5613, "step": 1220 }, { "epoch": 2.3207547169811322, "grad_norm": 1.641520954901167, "learning_rate": 1.4819733185375534e-05, "loss": 0.537, "step": 1230 }, { "epoch": 2.339622641509434, "grad_norm": 1.6680780951150302, "learning_rate": 1.4048263859320344e-05, "loss": 0.5425, "step": 1240 }, { "epoch": 2.358490566037736, "grad_norm": 1.5858289559337815, "learning_rate": 1.3294121514393637e-05, "loss": 0.5289, "step": 1250 }, { "epoch": 2.3773584905660377, "grad_norm": 1.609281814988441, "learning_rate": 1.2557669610377399e-05, "loss": 0.5155, "step": 1260 }, { "epoch": 2.3962264150943398, "grad_norm": 1.6108061713809745, "learning_rate": 1.1839263081124946e-05, "loss": 0.5214, "step": 1270 }, { "epoch": 2.4150943396226414, "grad_norm": 1.5364583247125485, "learning_rate": 1.113924816350026e-05, "loss": 0.5326, "step": 1280 }, { "epoch": 2.4339622641509435, "grad_norm": 1.523827370861251, "learning_rate": 1.04579622305086e-05, "loss": 0.5218, 
"step": 1290 }, { "epoch": 2.452830188679245, "grad_norm": 1.6969638639614046, "learning_rate": 9.795733628699333e-06, "loss": 0.5341, "step": 1300 }, { "epoch": 2.4716981132075473, "grad_norm": 1.502222163556516, "learning_rate": 9.152881519918787e-06, "loss": 0.5102, "step": 1310 }, { "epoch": 2.490566037735849, "grad_norm": 1.6251186914379474, "learning_rate": 8.529715727489912e-06, "loss": 0.5113, "step": 1320 }, { "epoch": 2.509433962264151, "grad_norm": 1.641634385361185, "learning_rate": 7.926536586892591e-06, "loss": 0.51, "step": 1330 }, { "epoch": 2.5283018867924527, "grad_norm": 1.564996479749529, "learning_rate": 7.3436348010165025e-06, "loss": 0.5075, "step": 1340 }, { "epoch": 2.547169811320755, "grad_norm": 1.5204914266086813, "learning_rate": 6.781291300056647e-06, "loss": 0.5111, "step": 1350 }, { "epoch": 2.5660377358490565, "grad_norm": 1.5204438359613908, "learning_rate": 6.239777106118605e-06, "loss": 0.501, "step": 1360 }, { "epoch": 2.5849056603773586, "grad_norm": 1.6153170323469739, "learning_rate": 5.719353202599209e-06, "loss": 0.5065, "step": 1370 }, { "epoch": 2.6037735849056602, "grad_norm": 1.532440501266883, "learning_rate": 5.220270408405198e-06, "loss": 0.5268, "step": 1380 }, { "epoch": 2.6226415094339623, "grad_norm": 1.5295028060682831, "learning_rate": 4.7427692570708445e-06, "loss": 0.5225, "step": 1390 }, { "epoch": 2.641509433962264, "grad_norm": 1.5576876729006885, "learning_rate": 4.287079880832478e-06, "loss": 0.5094, "step": 1400 }, { "epoch": 2.660377358490566, "grad_norm": 1.535240889295645, "learning_rate": 3.853421899715992e-06, "loss": 0.4991, "step": 1410 }, { "epoch": 2.6792452830188678, "grad_norm": 1.5668838039374533, "learning_rate": 3.44200431569075e-06, "loss": 0.5011, "step": 1420 }, { "epoch": 2.69811320754717, "grad_norm": 1.6597779325377704, "learning_rate": 3.053025411940802e-06, "loss": 0.4954, "step": 1430 }, { "epoch": 2.7169811320754715, "grad_norm": 1.5562079580978392, "learning_rate": 2.6866726573021026e-06, "loss": 0.5054, "step": 1440 }, { "epoch": 2.7358490566037736, "grad_norm": 1.5996686204830912, "learning_rate": 2.3431226159116637e-06, "loss": 0.5154, "step": 1450 }, { "epoch": 2.7547169811320753, "grad_norm": 1.6603987931741782, "learning_rate": 2.022540862112282e-06, "loss": 0.5029, "step": 1460 }, { "epoch": 2.7735849056603774, "grad_norm": 1.4442160081367916, "learning_rate": 1.725081900653791e-06, "loss": 0.5147, "step": 1470 }, { "epoch": 2.7924528301886795, "grad_norm": 1.5601472307077258, "learning_rate": 1.4508890922293018e-06, "loss": 0.4882, "step": 1480 }, { "epoch": 2.811320754716981, "grad_norm": 1.6882814081660615, "learning_rate": 1.2000945843823551e-06, "loss": 0.4909, "step": 1490 }, { "epoch": 2.830188679245283, "grad_norm": 1.5897926116142052, "learning_rate": 9.728192478182574e-07, "loss": 0.485, "step": 1500 }, { "epoch": 2.849056603773585, "grad_norm": 1.480162495765326, "learning_rate": 7.691726181503267e-07, "loss": 0.4985, "step": 1510 }, { "epoch": 2.867924528301887, "grad_norm": 1.5161543246256077, "learning_rate": 5.892528431090393e-07, "loss": 0.4816, "step": 1520 }, { "epoch": 2.8867924528301887, "grad_norm": 1.5434464499844907, "learning_rate": 4.331466352396396e-07, "loss": 0.4955, "step": 1530 }, { "epoch": 2.9056603773584904, "grad_norm": 1.5292680330833108, "learning_rate": 3.009292301109412e-07, "loss": 0.5018, "step": 1540 }, { "epoch": 2.9245283018867925, "grad_norm": 1.501995031518757, "learning_rate": 1.9266435005540483e-07, "loss": 0.5011, "step": 1550 }, { "epoch": 
2.9433962264150946, "grad_norm": 1.5344813758662075, "learning_rate": 1.0840417345814313e-07, "loss": 0.5141, "step": 1560 }, { "epoch": 2.9622641509433962, "grad_norm": 1.5204098865333115, "learning_rate": 4.818930960945878e-08, "loss": 0.4904, "step": 1570 }, { "epoch": 2.981132075471698, "grad_norm": 1.5256874098586901, "learning_rate": 1.2048779133150279e-08, "loss": 0.4746, "step": 1580 }, { "epoch": 3.0, "grad_norm": 1.4804382321073322, "learning_rate": 0.0, "loss": 0.5039, "step": 1590 }, { "epoch": 3.0, "step": 1590, "total_flos": 83202240675840.0, "train_loss": 1.594443890733539, "train_runtime": 15602.0514, "train_samples_per_second": 3.26, "train_steps_per_second": 0.102 } ], "logging_steps": 10, "max_steps": 1590, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 83202240675840.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }
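
The state above follows the Hugging Face `Trainer` checkpoint format: `log_history` holds one entry per logging step (with `loss`, `learning_rate`, `grad_norm`, `step`), plus periodic `eval_loss` entries and a final training summary. Below is a minimal sketch of how such a file could be inspected; it assumes the state is saved as `trainer_state.json` in the current directory (the path is illustrative, not part of the original log).

```python
# Minimal sketch: load a Trainer state file and summarize the logged losses.
# Assumes the JSON above is stored as "trainer_state.json" (illustrative path).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training log entries carry a "loss" key; eval entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"logged training points: {len(train_logs)}")
print(f"final train loss (step {train_logs[-1]['step']}): {train_logs[-1]['loss']}")
for e in eval_logs:
    print(f"eval at step {e['step']}: eval_loss = {e['eval_loss']}")
```

Running this against the state shown would report the per-step training loss dropping from roughly 2.3 in epoch 1 to about 0.5 by epoch 3, and the single evaluation at step 1000 with `eval_loss` ≈ 2.29, which is the same information recorded in `log_history`.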