diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,165508 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 5910, + "global_step": 23638, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.2304763516371946e-05, + "grad_norm": 0.8915818929672241, + "learning_rate": 0.001, + "loss": 2.2708, + "step": 1 + }, + { + "epoch": 8.460952703274389e-05, + "grad_norm": 0.4329769015312195, + "learning_rate": 0.001, + "loss": 2.483, + "step": 2 + }, + { + "epoch": 0.00012691429054911582, + "grad_norm": 0.45692744851112366, + "learning_rate": 0.001, + "loss": 3.5579, + "step": 3 + }, + { + "epoch": 0.00016921905406548778, + "grad_norm": 0.40001946687698364, + "learning_rate": 0.001, + "loss": 2.4916, + "step": 4 + }, + { + "epoch": 0.00021152381758185972, + "grad_norm": 0.530651867389679, + "learning_rate": 0.001, + "loss": 1.7408, + "step": 5 + }, + { + "epoch": 0.00025382858109823165, + "grad_norm": 1.0012280941009521, + "learning_rate": 0.001, + "loss": 2.3514, + "step": 6 + }, + { + "epoch": 0.0002961333446146036, + "grad_norm": 0.3786524534225464, + "learning_rate": 0.001, + "loss": 2.6244, + "step": 7 + }, + { + "epoch": 0.00033843810813097557, + "grad_norm": 0.7540691494941711, + "learning_rate": 0.001, + "loss": 2.4326, + "step": 8 + }, + { + "epoch": 0.0003807428716473475, + "grad_norm": 0.44959110021591187, + "learning_rate": 0.001, + "loss": 2.1347, + "step": 9 + }, + { + "epoch": 0.00042304763516371943, + "grad_norm": 0.3813895881175995, + "learning_rate": 0.001, + "loss": 2.6791, + "step": 10 + }, + { + "epoch": 0.00046535239868009137, + "grad_norm": 0.42967715859413147, + "learning_rate": 0.001, + "loss": 1.9963, + "step": 11 + }, + { + "epoch": 0.0005076571621964633, + "grad_norm": 0.8154281377792358, + "learning_rate": 0.001, + "loss": 3.3512, + "step": 12 + }, + { + "epoch": 0.0005499619257128352, + "grad_norm": 0.4104110896587372, + "learning_rate": 0.001, + "loss": 1.9401, + "step": 13 + }, + { + "epoch": 0.0005922666892292072, + "grad_norm": 0.536148190498352, + "learning_rate": 0.001, + "loss": 2.0377, + "step": 14 + }, + { + "epoch": 0.0006345714527455792, + "grad_norm": 0.3982076644897461, + "learning_rate": 0.001, + "loss": 2.6555, + "step": 15 + }, + { + "epoch": 0.0006768762162619511, + "grad_norm": 0.41156458854675293, + "learning_rate": 0.001, + "loss": 2.0036, + "step": 16 + }, + { + "epoch": 0.0007191809797783231, + "grad_norm": 0.5118016004562378, + "learning_rate": 0.001, + "loss": 2.4408, + "step": 17 + }, + { + "epoch": 0.000761485743294695, + "grad_norm": 0.44030845165252686, + "learning_rate": 0.001, + "loss": 3.1642, + "step": 18 + }, + { + "epoch": 0.0008037905068110669, + "grad_norm": 0.3349159061908722, + "learning_rate": 0.001, + "loss": 1.947, + "step": 19 + }, + { + "epoch": 0.0008460952703274389, + "grad_norm": 0.44204267859458923, + "learning_rate": 0.001, + "loss": 2.3666, + "step": 20 + }, + { + "epoch": 0.0008884000338438108, + "grad_norm": 0.4936717748641968, + "learning_rate": 0.001, + "loss": 3.2801, + "step": 21 + }, + { + "epoch": 0.0009307047973601827, + "grad_norm": 0.48639440536499023, + "learning_rate": 0.001, + "loss": 2.9606, + "step": 22 + }, + { + "epoch": 0.0009730095608765547, + "grad_norm": 0.3897702991962433, + "learning_rate": 0.001, + "loss": 2.0694, + "step": 23 + }, + { + "epoch": 0.0010153143243929266, + "grad_norm": 0.47153130173683167, + "learning_rate": 0.001, + "loss": 1.8757, + "step": 24 + }, + { + "epoch": 0.0010576190879092986, + "grad_norm": 0.7353273034095764, + "learning_rate": 0.001, + "loss": 3.3964, + "step": 25 + }, + { + "epoch": 0.0010999238514256705, + "grad_norm": 0.4378684461116791, + "learning_rate": 0.001, + "loss": 3.3866, + "step": 26 + }, + { + "epoch": 0.0011422286149420425, + "grad_norm": 0.5399907231330872, + "learning_rate": 0.001, + "loss": 3.5229, + "step": 27 + }, + { + "epoch": 0.0011845333784584143, + "grad_norm": 0.4380228519439697, + "learning_rate": 0.001, + "loss": 3.3026, + "step": 28 + }, + { + "epoch": 0.0012268381419747864, + "grad_norm": 0.5237672924995422, + "learning_rate": 0.001, + "loss": 2.7964, + "step": 29 + }, + { + "epoch": 0.0012691429054911584, + "grad_norm": 0.5567352771759033, + "learning_rate": 0.001, + "loss": 2.2648, + "step": 30 + }, + { + "epoch": 0.0013114476690075302, + "grad_norm": 0.4474937915802002, + "learning_rate": 0.001, + "loss": 2.6151, + "step": 31 + }, + { + "epoch": 0.0013537524325239023, + "grad_norm": 0.5182416439056396, + "learning_rate": 0.001, + "loss": 3.4561, + "step": 32 + }, + { + "epoch": 0.001396057196040274, + "grad_norm": 0.4204736649990082, + "learning_rate": 0.001, + "loss": 3.0416, + "step": 33 + }, + { + "epoch": 0.0014383619595566461, + "grad_norm": 0.45871949195861816, + "learning_rate": 0.001, + "loss": 2.2159, + "step": 34 + }, + { + "epoch": 0.001480666723073018, + "grad_norm": 0.4581148028373718, + "learning_rate": 0.001, + "loss": 2.0835, + "step": 35 + }, + { + "epoch": 0.00152297148658939, + "grad_norm": 0.41536086797714233, + "learning_rate": 0.001, + "loss": 2.5392, + "step": 36 + }, + { + "epoch": 0.0015652762501057618, + "grad_norm": 0.48674535751342773, + "learning_rate": 0.001, + "loss": 2.6758, + "step": 37 + }, + { + "epoch": 0.0016075810136221339, + "grad_norm": 0.8809964656829834, + "learning_rate": 0.001, + "loss": 2.8721, + "step": 38 + }, + { + "epoch": 0.0016498857771385057, + "grad_norm": 0.6416422724723816, + "learning_rate": 0.001, + "loss": 2.2551, + "step": 39 + }, + { + "epoch": 0.0016921905406548777, + "grad_norm": 0.8650534152984619, + "learning_rate": 0.001, + "loss": 2.4497, + "step": 40 + }, + { + "epoch": 0.0017344953041712498, + "grad_norm": 0.42286792397499084, + "learning_rate": 0.001, + "loss": 2.813, + "step": 41 + }, + { + "epoch": 0.0017768000676876216, + "grad_norm": 0.7012924551963806, + "learning_rate": 0.001, + "loss": 3.2002, + "step": 42 + }, + { + "epoch": 0.0018191048312039936, + "grad_norm": 0.6307465434074402, + "learning_rate": 0.001, + "loss": 2.1075, + "step": 43 + }, + { + "epoch": 0.0018614095947203655, + "grad_norm": 0.3289887309074402, + "learning_rate": 0.001, + "loss": 1.9497, + "step": 44 + }, + { + "epoch": 0.0019037143582367375, + "grad_norm": 0.5121589303016663, + "learning_rate": 0.001, + "loss": 2.9249, + "step": 45 + }, + { + "epoch": 0.0019460191217531093, + "grad_norm": 0.545623242855072, + "learning_rate": 0.001, + "loss": 2.364, + "step": 46 + }, + { + "epoch": 0.001988323885269481, + "grad_norm": 0.427653431892395, + "learning_rate": 0.001, + "loss": 2.1296, + "step": 47 + }, + { + "epoch": 0.002030628648785853, + "grad_norm": 0.4887496531009674, + "learning_rate": 0.001, + "loss": 2.9931, + "step": 48 + }, + { + "epoch": 0.0020729334123022252, + "grad_norm": 0.5210931897163391, + "learning_rate": 0.001, + "loss": 2.232, + "step": 49 + }, + { + "epoch": 0.0021152381758185973, + "grad_norm": 0.4477267861366272, + "learning_rate": 0.001, + "loss": 2.4165, + "step": 50 + }, + { + "epoch": 0.0021575429393349693, + "grad_norm": 1.6815810203552246, + "learning_rate": 0.001, + "loss": 2.0357, + "step": 51 + }, + { + "epoch": 0.002199847702851341, + "grad_norm": 0.3510444164276123, + "learning_rate": 0.001, + "loss": 2.9298, + "step": 52 + }, + { + "epoch": 0.002242152466367713, + "grad_norm": 0.42857974767684937, + "learning_rate": 0.001, + "loss": 2.8284, + "step": 53 + }, + { + "epoch": 0.002284457229884085, + "grad_norm": 0.6068593263626099, + "learning_rate": 0.001, + "loss": 3.6971, + "step": 54 + }, + { + "epoch": 0.002326761993400457, + "grad_norm": 0.46816545724868774, + "learning_rate": 0.001, + "loss": 2.0175, + "step": 55 + }, + { + "epoch": 0.0023690667569168287, + "grad_norm": 0.44248470664024353, + "learning_rate": 0.001, + "loss": 2.1154, + "step": 56 + }, + { + "epoch": 0.0024113715204332007, + "grad_norm": 0.4355832636356354, + "learning_rate": 0.001, + "loss": 2.5438, + "step": 57 + }, + { + "epoch": 0.0024536762839495727, + "grad_norm": 0.39491739869117737, + "learning_rate": 0.001, + "loss": 2.5997, + "step": 58 + }, + { + "epoch": 0.002495981047465945, + "grad_norm": 23.964534759521484, + "learning_rate": 0.001, + "loss": 2.3666, + "step": 59 + }, + { + "epoch": 0.002538285810982317, + "grad_norm": 0.5575146079063416, + "learning_rate": 0.001, + "loss": 2.6327, + "step": 60 + }, + { + "epoch": 0.0025805905744986884, + "grad_norm": 0.44521743059158325, + "learning_rate": 0.001, + "loss": 3.0179, + "step": 61 + }, + { + "epoch": 0.0026228953380150605, + "grad_norm": 0.891205370426178, + "learning_rate": 0.001, + "loss": 1.837, + "step": 62 + }, + { + "epoch": 0.0026652001015314325, + "grad_norm": 0.33089524507522583, + "learning_rate": 0.001, + "loss": 2.9327, + "step": 63 + }, + { + "epoch": 0.0027075048650478046, + "grad_norm": 0.3559476435184479, + "learning_rate": 0.001, + "loss": 2.6628, + "step": 64 + }, + { + "epoch": 0.002749809628564176, + "grad_norm": 0.4208162724971771, + "learning_rate": 0.001, + "loss": 2.1463, + "step": 65 + }, + { + "epoch": 0.002792114392080548, + "grad_norm": 0.48232489824295044, + "learning_rate": 0.001, + "loss": 2.1562, + "step": 66 + }, + { + "epoch": 0.0028344191555969202, + "grad_norm": 0.44821107387542725, + "learning_rate": 0.001, + "loss": 2.3234, + "step": 67 + }, + { + "epoch": 0.0028767239191132923, + "grad_norm": 0.42845988273620605, + "learning_rate": 0.001, + "loss": 2.1317, + "step": 68 + }, + { + "epoch": 0.002919028682629664, + "grad_norm": 0.6322043538093567, + "learning_rate": 0.001, + "loss": 1.8463, + "step": 69 + }, + { + "epoch": 0.002961333446146036, + "grad_norm": 0.4234910011291504, + "learning_rate": 0.001, + "loss": 2.8342, + "step": 70 + }, + { + "epoch": 0.003003638209662408, + "grad_norm": 0.4582638740539551, + "learning_rate": 0.001, + "loss": 2.1037, + "step": 71 + }, + { + "epoch": 0.00304594297317878, + "grad_norm": 0.41287997364997864, + "learning_rate": 0.001, + "loss": 2.4761, + "step": 72 + }, + { + "epoch": 0.003088247736695152, + "grad_norm": 0.6007514595985413, + "learning_rate": 0.001, + "loss": 2.221, + "step": 73 + }, + { + "epoch": 0.0031305525002115237, + "grad_norm": 0.6128891110420227, + "learning_rate": 0.001, + "loss": 2.847, + "step": 74 + }, + { + "epoch": 0.0031728572637278957, + "grad_norm": 0.713722288608551, + "learning_rate": 0.001, + "loss": 2.3765, + "step": 75 + }, + { + "epoch": 0.0032151620272442677, + "grad_norm": 0.5172343254089355, + "learning_rate": 0.001, + "loss": 2.3498, + "step": 76 + }, + { + "epoch": 0.00325746679076064, + "grad_norm": 0.44072654843330383, + "learning_rate": 0.001, + "loss": 2.6489, + "step": 77 + }, + { + "epoch": 0.0032997715542770114, + "grad_norm": 0.4492231011390686, + "learning_rate": 0.001, + "loss": 2.786, + "step": 78 + }, + { + "epoch": 0.0033420763177933834, + "grad_norm": 0.38566169142723083, + "learning_rate": 0.001, + "loss": 2.5199, + "step": 79 + }, + { + "epoch": 0.0033843810813097555, + "grad_norm": 0.44315439462661743, + "learning_rate": 0.001, + "loss": 3.2877, + "step": 80 + }, + { + "epoch": 0.0034266858448261275, + "grad_norm": 0.4293360710144043, + "learning_rate": 0.001, + "loss": 3.0169, + "step": 81 + }, + { + "epoch": 0.0034689906083424996, + "grad_norm": 0.4374128580093384, + "learning_rate": 0.001, + "loss": 1.8099, + "step": 82 + }, + { + "epoch": 0.003511295371858871, + "grad_norm": 0.3986782133579254, + "learning_rate": 0.001, + "loss": 1.953, + "step": 83 + }, + { + "epoch": 0.003553600135375243, + "grad_norm": 0.4710613787174225, + "learning_rate": 0.001, + "loss": 2.3669, + "step": 84 + }, + { + "epoch": 0.0035959048988916152, + "grad_norm": 0.8579991459846497, + "learning_rate": 0.001, + "loss": 1.8707, + "step": 85 + }, + { + "epoch": 0.0036382096624079873, + "grad_norm": 0.5034980177879333, + "learning_rate": 0.001, + "loss": 3.2651, + "step": 86 + }, + { + "epoch": 0.003680514425924359, + "grad_norm": 0.8276948928833008, + "learning_rate": 0.001, + "loss": 2.6157, + "step": 87 + }, + { + "epoch": 0.003722819189440731, + "grad_norm": 0.4035736322402954, + "learning_rate": 0.001, + "loss": 2.4765, + "step": 88 + }, + { + "epoch": 0.003765123952957103, + "grad_norm": 0.6690497994422913, + "learning_rate": 0.001, + "loss": 3.0138, + "step": 89 + }, + { + "epoch": 0.003807428716473475, + "grad_norm": 0.3947993814945221, + "learning_rate": 0.001, + "loss": 2.2625, + "step": 90 + }, + { + "epoch": 0.003849733479989847, + "grad_norm": 0.7430382966995239, + "learning_rate": 0.001, + "loss": 1.8479, + "step": 91 + }, + { + "epoch": 0.0038920382435062187, + "grad_norm": 0.4126277267932892, + "learning_rate": 0.001, + "loss": 1.9788, + "step": 92 + }, + { + "epoch": 0.003934343007022591, + "grad_norm": 0.39753755927085876, + "learning_rate": 0.001, + "loss": 2.1964, + "step": 93 + }, + { + "epoch": 0.003976647770538962, + "grad_norm": 0.4967736601829529, + "learning_rate": 0.001, + "loss": 3.2626, + "step": 94 + }, + { + "epoch": 0.004018952534055335, + "grad_norm": 0.3497101664543152, + "learning_rate": 0.001, + "loss": 2.0893, + "step": 95 + }, + { + "epoch": 0.004061257297571706, + "grad_norm": 0.3811435401439667, + "learning_rate": 0.001, + "loss": 2.0541, + "step": 96 + }, + { + "epoch": 0.004103562061088079, + "grad_norm": 0.43603935837745667, + "learning_rate": 0.001, + "loss": 2.3686, + "step": 97 + }, + { + "epoch": 0.0041458668246044505, + "grad_norm": 0.7152244448661804, + "learning_rate": 0.001, + "loss": 2.3025, + "step": 98 + }, + { + "epoch": 0.004188171588120822, + "grad_norm": 0.43975409865379333, + "learning_rate": 0.001, + "loss": 2.2561, + "step": 99 + }, + { + "epoch": 0.0042304763516371946, + "grad_norm": 0.41654467582702637, + "learning_rate": 0.001, + "loss": 2.5949, + "step": 100 + }, + { + "epoch": 0.004272781115153566, + "grad_norm": 0.5210958123207092, + "learning_rate": 0.001, + "loss": 2.3369, + "step": 101 + }, + { + "epoch": 0.004315085878669939, + "grad_norm": 0.35408666729927063, + "learning_rate": 0.001, + "loss": 2.4174, + "step": 102 + }, + { + "epoch": 0.00435739064218631, + "grad_norm": 0.38895490765571594, + "learning_rate": 0.001, + "loss": 2.6246, + "step": 103 + }, + { + "epoch": 0.004399695405702682, + "grad_norm": 0.34460920095443726, + "learning_rate": 0.001, + "loss": 1.9521, + "step": 104 + }, + { + "epoch": 0.004442000169219054, + "grad_norm": 0.3262885510921478, + "learning_rate": 0.001, + "loss": 2.4733, + "step": 105 + }, + { + "epoch": 0.004484304932735426, + "grad_norm": 1.2628955841064453, + "learning_rate": 0.001, + "loss": 3.0509, + "step": 106 + }, + { + "epoch": 0.0045266096962517976, + "grad_norm": 0.42988988757133484, + "learning_rate": 0.001, + "loss": 2.5068, + "step": 107 + }, + { + "epoch": 0.00456891445976817, + "grad_norm": 0.42915594577789307, + "learning_rate": 0.001, + "loss": 2.6055, + "step": 108 + }, + { + "epoch": 0.004611219223284542, + "grad_norm": 0.5522227883338928, + "learning_rate": 0.001, + "loss": 2.6447, + "step": 109 + }, + { + "epoch": 0.004653523986800914, + "grad_norm": 0.3777415156364441, + "learning_rate": 0.001, + "loss": 2.4052, + "step": 110 + }, + { + "epoch": 0.004695828750317286, + "grad_norm": 0.5317783951759338, + "learning_rate": 0.001, + "loss": 2.5688, + "step": 111 + }, + { + "epoch": 0.004738133513833657, + "grad_norm": 0.4777368903160095, + "learning_rate": 0.001, + "loss": 2.3497, + "step": 112 + }, + { + "epoch": 0.00478043827735003, + "grad_norm": 1.6108906269073486, + "learning_rate": 0.001, + "loss": 2.4328, + "step": 113 + }, + { + "epoch": 0.004822743040866401, + "grad_norm": 0.3473729193210602, + "learning_rate": 0.001, + "loss": 2.3263, + "step": 114 + }, + { + "epoch": 0.004865047804382774, + "grad_norm": 0.43898317217826843, + "learning_rate": 0.001, + "loss": 3.2201, + "step": 115 + }, + { + "epoch": 0.0049073525678991455, + "grad_norm": 0.6918124556541443, + "learning_rate": 0.001, + "loss": 3.625, + "step": 116 + }, + { + "epoch": 0.004949657331415517, + "grad_norm": 9.253972053527832, + "learning_rate": 0.001, + "loss": 2.8214, + "step": 117 + }, + { + "epoch": 0.00499196209493189, + "grad_norm": 0.8446986079216003, + "learning_rate": 0.001, + "loss": 2.1835, + "step": 118 + }, + { + "epoch": 0.005034266858448261, + "grad_norm": 2.1206488609313965, + "learning_rate": 0.001, + "loss": 2.2383, + "step": 119 + }, + { + "epoch": 0.005076571621964634, + "grad_norm": 0.4840485453605652, + "learning_rate": 0.001, + "loss": 2.4211, + "step": 120 + }, + { + "epoch": 0.005118876385481005, + "grad_norm": 0.39015278220176697, + "learning_rate": 0.001, + "loss": 2.548, + "step": 121 + }, + { + "epoch": 0.005161181148997377, + "grad_norm": 0.3702382743358612, + "learning_rate": 0.001, + "loss": 2.992, + "step": 122 + }, + { + "epoch": 0.005203485912513749, + "grad_norm": 0.4635108709335327, + "learning_rate": 0.001, + "loss": 2.5117, + "step": 123 + }, + { + "epoch": 0.005245790676030121, + "grad_norm": 0.34755992889404297, + "learning_rate": 0.001, + "loss": 2.3573, + "step": 124 + }, + { + "epoch": 0.0052880954395464926, + "grad_norm": 0.5302094221115112, + "learning_rate": 0.001, + "loss": 2.4309, + "step": 125 + }, + { + "epoch": 0.005330400203062865, + "grad_norm": 0.562074601650238, + "learning_rate": 0.001, + "loss": 2.0844, + "step": 126 + }, + { + "epoch": 0.005372704966579237, + "grad_norm": 0.42684710025787354, + "learning_rate": 0.001, + "loss": 2.7916, + "step": 127 + }, + { + "epoch": 0.005415009730095609, + "grad_norm": 0.5416858792304993, + "learning_rate": 0.001, + "loss": 2.5613, + "step": 128 + }, + { + "epoch": 0.005457314493611981, + "grad_norm": 0.4300224483013153, + "learning_rate": 0.001, + "loss": 3.3811, + "step": 129 + }, + { + "epoch": 0.005499619257128352, + "grad_norm": 0.46849384903907776, + "learning_rate": 0.001, + "loss": 2.5776, + "step": 130 + }, + { + "epoch": 0.005541924020644725, + "grad_norm": 0.46816426515579224, + "learning_rate": 0.001, + "loss": 2.3562, + "step": 131 + }, + { + "epoch": 0.005584228784161096, + "grad_norm": 0.4470241069793701, + "learning_rate": 0.001, + "loss": 2.3042, + "step": 132 + }, + { + "epoch": 0.005626533547677469, + "grad_norm": 0.7884194254875183, + "learning_rate": 0.001, + "loss": 2.8377, + "step": 133 + }, + { + "epoch": 0.0056688383111938405, + "grad_norm": 0.6185025572776794, + "learning_rate": 0.001, + "loss": 2.3049, + "step": 134 + }, + { + "epoch": 0.005711143074710212, + "grad_norm": 1.048901915550232, + "learning_rate": 0.001, + "loss": 2.1582, + "step": 135 + }, + { + "epoch": 0.005753447838226585, + "grad_norm": 0.47046688199043274, + "learning_rate": 0.001, + "loss": 3.4005, + "step": 136 + }, + { + "epoch": 0.005795752601742956, + "grad_norm": 0.435097873210907, + "learning_rate": 0.001, + "loss": 2.4624, + "step": 137 + }, + { + "epoch": 0.005838057365259328, + "grad_norm": 0.41452470421791077, + "learning_rate": 0.001, + "loss": 2.7871, + "step": 138 + }, + { + "epoch": 0.0058803621287757, + "grad_norm": 0.8315361142158508, + "learning_rate": 0.001, + "loss": 2.007, + "step": 139 + }, + { + "epoch": 0.005922666892292072, + "grad_norm": 1.842594027519226, + "learning_rate": 0.001, + "loss": 2.3946, + "step": 140 + }, + { + "epoch": 0.005964971655808444, + "grad_norm": 0.6004571318626404, + "learning_rate": 0.001, + "loss": 2.3669, + "step": 141 + }, + { + "epoch": 0.006007276419324816, + "grad_norm": 0.534443199634552, + "learning_rate": 0.001, + "loss": 2.5264, + "step": 142 + }, + { + "epoch": 0.0060495811828411876, + "grad_norm": 0.4516150951385498, + "learning_rate": 0.001, + "loss": 2.3313, + "step": 143 + }, + { + "epoch": 0.00609188594635756, + "grad_norm": 0.4618544280529022, + "learning_rate": 0.001, + "loss": 2.4899, + "step": 144 + }, + { + "epoch": 0.006134190709873932, + "grad_norm": 0.7234168648719788, + "learning_rate": 0.001, + "loss": 2.6252, + "step": 145 + }, + { + "epoch": 0.006176495473390304, + "grad_norm": 0.5123864412307739, + "learning_rate": 0.001, + "loss": 2.9488, + "step": 146 + }, + { + "epoch": 0.006218800236906676, + "grad_norm": 0.5958865284919739, + "learning_rate": 0.001, + "loss": 3.4051, + "step": 147 + }, + { + "epoch": 0.006261105000423047, + "grad_norm": 0.5423961877822876, + "learning_rate": 0.001, + "loss": 4.0919, + "step": 148 + }, + { + "epoch": 0.00630340976393942, + "grad_norm": 1.3664882183074951, + "learning_rate": 0.001, + "loss": 2.0117, + "step": 149 + }, + { + "epoch": 0.006345714527455791, + "grad_norm": 0.35952356457710266, + "learning_rate": 0.001, + "loss": 2.8003, + "step": 150 + }, + { + "epoch": 0.006388019290972164, + "grad_norm": 0.43103012442588806, + "learning_rate": 0.001, + "loss": 2.4838, + "step": 151 + }, + { + "epoch": 0.0064303240544885355, + "grad_norm": 0.6071809530258179, + "learning_rate": 0.001, + "loss": 2.6764, + "step": 152 + }, + { + "epoch": 0.006472628818004907, + "grad_norm": 0.38204053044319153, + "learning_rate": 0.001, + "loss": 2.2442, + "step": 153 + }, + { + "epoch": 0.00651493358152128, + "grad_norm": 0.301871120929718, + "learning_rate": 0.001, + "loss": 1.855, + "step": 154 + }, + { + "epoch": 0.006557238345037651, + "grad_norm": 0.4744417667388916, + "learning_rate": 0.001, + "loss": 2.5978, + "step": 155 + }, + { + "epoch": 0.006599543108554023, + "grad_norm": 0.5253042578697205, + "learning_rate": 0.001, + "loss": 1.9776, + "step": 156 + }, + { + "epoch": 0.006641847872070395, + "grad_norm": 0.611546516418457, + "learning_rate": 0.001, + "loss": 2.3661, + "step": 157 + }, + { + "epoch": 0.006684152635586767, + "grad_norm": 0.40985289216041565, + "learning_rate": 0.001, + "loss": 2.6935, + "step": 158 + }, + { + "epoch": 0.006726457399103139, + "grad_norm": 0.561894953250885, + "learning_rate": 0.001, + "loss": 2.7189, + "step": 159 + }, + { + "epoch": 0.006768762162619511, + "grad_norm": 0.7344470620155334, + "learning_rate": 0.001, + "loss": 2.0996, + "step": 160 + }, + { + "epoch": 0.0068110669261358826, + "grad_norm": 0.6005984544754028, + "learning_rate": 0.001, + "loss": 2.9467, + "step": 161 + }, + { + "epoch": 0.006853371689652255, + "grad_norm": 1.274715542793274, + "learning_rate": 0.001, + "loss": 3.6441, + "step": 162 + }, + { + "epoch": 0.006895676453168627, + "grad_norm": 0.44184061884880066, + "learning_rate": 0.001, + "loss": 2.4605, + "step": 163 + }, + { + "epoch": 0.006937981216684999, + "grad_norm": 0.4935331642627716, + "learning_rate": 0.001, + "loss": 3.2329, + "step": 164 + }, + { + "epoch": 0.006980285980201371, + "grad_norm": 1.1876336336135864, + "learning_rate": 0.001, + "loss": 1.9255, + "step": 165 + }, + { + "epoch": 0.007022590743717742, + "grad_norm": 0.3959547281265259, + "learning_rate": 0.001, + "loss": 2.0602, + "step": 166 + }, + { + "epoch": 0.007064895507234115, + "grad_norm": 0.6868635416030884, + "learning_rate": 0.001, + "loss": 3.2132, + "step": 167 + }, + { + "epoch": 0.007107200270750486, + "grad_norm": 1.7616485357284546, + "learning_rate": 0.001, + "loss": 2.568, + "step": 168 + }, + { + "epoch": 0.007149505034266858, + "grad_norm": 0.7364703416824341, + "learning_rate": 0.001, + "loss": 3.4709, + "step": 169 + }, + { + "epoch": 0.0071918097977832305, + "grad_norm": 0.3650343716144562, + "learning_rate": 0.001, + "loss": 2.5879, + "step": 170 + }, + { + "epoch": 0.007234114561299602, + "grad_norm": 0.5033729076385498, + "learning_rate": 0.001, + "loss": 2.3781, + "step": 171 + }, + { + "epoch": 0.007276419324815975, + "grad_norm": 7.086312294006348, + "learning_rate": 0.001, + "loss": 3.8916, + "step": 172 + }, + { + "epoch": 0.007318724088332346, + "grad_norm": 0.6279006600379944, + "learning_rate": 0.001, + "loss": 2.9944, + "step": 173 + }, + { + "epoch": 0.007361028851848718, + "grad_norm": 0.4906074106693268, + "learning_rate": 0.001, + "loss": 3.5258, + "step": 174 + }, + { + "epoch": 0.00740333361536509, + "grad_norm": 0.3576771318912506, + "learning_rate": 0.001, + "loss": 1.7132, + "step": 175 + }, + { + "epoch": 0.007445638378881462, + "grad_norm": 0.39911970496177673, + "learning_rate": 0.001, + "loss": 3.7563, + "step": 176 + }, + { + "epoch": 0.007487943142397834, + "grad_norm": 2.3106889724731445, + "learning_rate": 0.001, + "loss": 3.3329, + "step": 177 + }, + { + "epoch": 0.007530247905914206, + "grad_norm": 1.1456013917922974, + "learning_rate": 0.001, + "loss": 1.9892, + "step": 178 + }, + { + "epoch": 0.007572552669430578, + "grad_norm": 0.457940012216568, + "learning_rate": 0.001, + "loss": 2.1332, + "step": 179 + }, + { + "epoch": 0.00761485743294695, + "grad_norm": 0.44287726283073425, + "learning_rate": 0.001, + "loss": 2.6956, + "step": 180 + }, + { + "epoch": 0.007657162196463322, + "grad_norm": 0.4767186641693115, + "learning_rate": 0.001, + "loss": 2.0452, + "step": 181 + }, + { + "epoch": 0.007699466959979694, + "grad_norm": 26.49453353881836, + "learning_rate": 0.001, + "loss": 2.0203, + "step": 182 + }, + { + "epoch": 0.007741771723496066, + "grad_norm": 0.6047090888023376, + "learning_rate": 0.001, + "loss": 2.731, + "step": 183 + }, + { + "epoch": 0.007784076487012437, + "grad_norm": 0.8945805430412292, + "learning_rate": 0.001, + "loss": 2.3029, + "step": 184 + }, + { + "epoch": 0.00782638125052881, + "grad_norm": 0.8593363761901855, + "learning_rate": 0.001, + "loss": 2.5228, + "step": 185 + }, + { + "epoch": 0.007868686014045181, + "grad_norm": 2.482999801635742, + "learning_rate": 0.001, + "loss": 3.0185, + "step": 186 + }, + { + "epoch": 0.007910990777561553, + "grad_norm": 0.7577206492424011, + "learning_rate": 0.001, + "loss": 2.948, + "step": 187 + }, + { + "epoch": 0.007953295541077925, + "grad_norm": 2.415463924407959, + "learning_rate": 0.001, + "loss": 3.2312, + "step": 188 + }, + { + "epoch": 0.007995600304594298, + "grad_norm": 0.9507613182067871, + "learning_rate": 0.001, + "loss": 2.5924, + "step": 189 + }, + { + "epoch": 0.00803790506811067, + "grad_norm": 0.7477337121963501, + "learning_rate": 0.001, + "loss": 3.3504, + "step": 190 + }, + { + "epoch": 0.008080209831627041, + "grad_norm": 1.864358901977539, + "learning_rate": 0.001, + "loss": 2.9727, + "step": 191 + }, + { + "epoch": 0.008122514595143413, + "grad_norm": 1.1556525230407715, + "learning_rate": 0.001, + "loss": 2.8587, + "step": 192 + }, + { + "epoch": 0.008164819358659784, + "grad_norm": 1.382063388824463, + "learning_rate": 0.001, + "loss": 2.4826, + "step": 193 + }, + { + "epoch": 0.008207124122176158, + "grad_norm": 0.43554991483688354, + "learning_rate": 0.001, + "loss": 2.5445, + "step": 194 + }, + { + "epoch": 0.00824942888569253, + "grad_norm": 0.4753492772579193, + "learning_rate": 0.001, + "loss": 3.262, + "step": 195 + }, + { + "epoch": 0.008291733649208901, + "grad_norm": 1.1016758680343628, + "learning_rate": 0.001, + "loss": 2.1801, + "step": 196 + }, + { + "epoch": 0.008334038412725273, + "grad_norm": 0.6898307800292969, + "learning_rate": 0.001, + "loss": 2.3774, + "step": 197 + }, + { + "epoch": 0.008376343176241644, + "grad_norm": 4.887700080871582, + "learning_rate": 0.001, + "loss": 3.0435, + "step": 198 + }, + { + "epoch": 0.008418647939758018, + "grad_norm": 1.1673232316970825, + "learning_rate": 0.001, + "loss": 2.4091, + "step": 199 + }, + { + "epoch": 0.008460952703274389, + "grad_norm": 0.5722813010215759, + "learning_rate": 0.001, + "loss": 2.1268, + "step": 200 + }, + { + "epoch": 0.00850325746679076, + "grad_norm": 0.45928850769996643, + "learning_rate": 0.001, + "loss": 2.4795, + "step": 201 + }, + { + "epoch": 0.008545562230307132, + "grad_norm": 0.429862380027771, + "learning_rate": 0.001, + "loss": 2.8722, + "step": 202 + }, + { + "epoch": 0.008587866993823504, + "grad_norm": 0.45352330803871155, + "learning_rate": 0.001, + "loss": 2.2689, + "step": 203 + }, + { + "epoch": 0.008630171757339877, + "grad_norm": 0.4726592004299164, + "learning_rate": 0.001, + "loss": 3.3355, + "step": 204 + }, + { + "epoch": 0.008672476520856249, + "grad_norm": 0.44824856519699097, + "learning_rate": 0.001, + "loss": 2.5926, + "step": 205 + }, + { + "epoch": 0.00871478128437262, + "grad_norm": 0.39711594581604004, + "learning_rate": 0.001, + "loss": 2.2368, + "step": 206 + }, + { + "epoch": 0.008757086047888992, + "grad_norm": 0.47843319177627563, + "learning_rate": 0.001, + "loss": 2.983, + "step": 207 + }, + { + "epoch": 0.008799390811405364, + "grad_norm": 0.33170491456985474, + "learning_rate": 0.001, + "loss": 2.1638, + "step": 208 + }, + { + "epoch": 0.008841695574921735, + "grad_norm": 0.4759407937526703, + "learning_rate": 0.001, + "loss": 2.4175, + "step": 209 + }, + { + "epoch": 0.008884000338438109, + "grad_norm": 0.5208505392074585, + "learning_rate": 0.001, + "loss": 3.6394, + "step": 210 + }, + { + "epoch": 0.00892630510195448, + "grad_norm": 0.5411098599433899, + "learning_rate": 0.001, + "loss": 3.212, + "step": 211 + }, + { + "epoch": 0.008968609865470852, + "grad_norm": 0.4449928104877472, + "learning_rate": 0.001, + "loss": 2.8441, + "step": 212 + }, + { + "epoch": 0.009010914628987223, + "grad_norm": 0.7975568175315857, + "learning_rate": 0.001, + "loss": 2.4782, + "step": 213 + }, + { + "epoch": 0.009053219392503595, + "grad_norm": 1.3413524627685547, + "learning_rate": 0.001, + "loss": 1.8429, + "step": 214 + }, + { + "epoch": 0.009095524156019968, + "grad_norm": 1.463435411453247, + "learning_rate": 0.001, + "loss": 2.138, + "step": 215 + }, + { + "epoch": 0.00913782891953634, + "grad_norm": 0.5830798745155334, + "learning_rate": 0.001, + "loss": 2.7381, + "step": 216 + }, + { + "epoch": 0.009180133683052712, + "grad_norm": 1.1858150959014893, + "learning_rate": 0.001, + "loss": 2.9576, + "step": 217 + }, + { + "epoch": 0.009222438446569083, + "grad_norm": 0.5822926163673401, + "learning_rate": 0.001, + "loss": 2.9898, + "step": 218 + }, + { + "epoch": 0.009264743210085455, + "grad_norm": 0.38207611441612244, + "learning_rate": 0.001, + "loss": 2.5692, + "step": 219 + }, + { + "epoch": 0.009307047973601828, + "grad_norm": 0.3924141228199005, + "learning_rate": 0.001, + "loss": 2.1596, + "step": 220 + }, + { + "epoch": 0.0093493527371182, + "grad_norm": 0.3632948398590088, + "learning_rate": 0.001, + "loss": 1.9336, + "step": 221 + }, + { + "epoch": 0.009391657500634571, + "grad_norm": 0.6469441652297974, + "learning_rate": 0.001, + "loss": 3.3174, + "step": 222 + }, + { + "epoch": 0.009433962264150943, + "grad_norm": 0.4201594293117523, + "learning_rate": 0.001, + "loss": 2.3425, + "step": 223 + }, + { + "epoch": 0.009476267027667315, + "grad_norm": 0.4190143942832947, + "learning_rate": 0.001, + "loss": 2.306, + "step": 224 + }, + { + "epoch": 0.009518571791183688, + "grad_norm": 0.7149160504341125, + "learning_rate": 0.001, + "loss": 2.82, + "step": 225 + }, + { + "epoch": 0.00956087655470006, + "grad_norm": 0.6239504814147949, + "learning_rate": 0.001, + "loss": 2.1841, + "step": 226 + }, + { + "epoch": 0.009603181318216431, + "grad_norm": 0.5916045904159546, + "learning_rate": 0.001, + "loss": 2.1635, + "step": 227 + }, + { + "epoch": 0.009645486081732803, + "grad_norm": 0.4300955832004547, + "learning_rate": 0.001, + "loss": 2.3918, + "step": 228 + }, + { + "epoch": 0.009687790845249174, + "grad_norm": 0.7572110891342163, + "learning_rate": 0.001, + "loss": 2.1596, + "step": 229 + }, + { + "epoch": 0.009730095608765548, + "grad_norm": 0.4874956011772156, + "learning_rate": 0.001, + "loss": 3.3631, + "step": 230 + }, + { + "epoch": 0.00977240037228192, + "grad_norm": 0.43173378705978394, + "learning_rate": 0.001, + "loss": 2.446, + "step": 231 + }, + { + "epoch": 0.009814705135798291, + "grad_norm": 0.6573549509048462, + "learning_rate": 0.001, + "loss": 3.0706, + "step": 232 + }, + { + "epoch": 0.009857009899314663, + "grad_norm": 0.42250698804855347, + "learning_rate": 0.001, + "loss": 2.9873, + "step": 233 + }, + { + "epoch": 0.009899314662831034, + "grad_norm": 3.47033953666687, + "learning_rate": 0.001, + "loss": 2.4825, + "step": 234 + }, + { + "epoch": 0.009941619426347408, + "grad_norm": 0.39495372772216797, + "learning_rate": 0.001, + "loss": 2.5977, + "step": 235 + }, + { + "epoch": 0.00998392418986378, + "grad_norm": 0.5418169498443604, + "learning_rate": 0.001, + "loss": 2.0364, + "step": 236 + }, + { + "epoch": 0.01002622895338015, + "grad_norm": 0.4567147195339203, + "learning_rate": 0.001, + "loss": 2.9259, + "step": 237 + }, + { + "epoch": 0.010068533716896522, + "grad_norm": 0.8719536066055298, + "learning_rate": 0.001, + "loss": 2.8369, + "step": 238 + }, + { + "epoch": 0.010110838480412894, + "grad_norm": 0.44655323028564453, + "learning_rate": 0.001, + "loss": 2.4415, + "step": 239 + }, + { + "epoch": 0.010153143243929267, + "grad_norm": 0.5208900570869446, + "learning_rate": 0.001, + "loss": 2.4879, + "step": 240 + }, + { + "epoch": 0.010195448007445639, + "grad_norm": 0.3061939775943756, + "learning_rate": 0.001, + "loss": 1.9186, + "step": 241 + }, + { + "epoch": 0.01023775277096201, + "grad_norm": 0.48679229617118835, + "learning_rate": 0.001, + "loss": 2.2632, + "step": 242 + }, + { + "epoch": 0.010280057534478382, + "grad_norm": 0.5101296305656433, + "learning_rate": 0.001, + "loss": 3.1684, + "step": 243 + }, + { + "epoch": 0.010322362297994754, + "grad_norm": 1.0889201164245605, + "learning_rate": 0.001, + "loss": 2.4285, + "step": 244 + }, + { + "epoch": 0.010364667061511125, + "grad_norm": 0.4110303819179535, + "learning_rate": 0.001, + "loss": 2.9342, + "step": 245 + }, + { + "epoch": 0.010406971825027499, + "grad_norm": 0.528121292591095, + "learning_rate": 0.001, + "loss": 3.7985, + "step": 246 + }, + { + "epoch": 0.01044927658854387, + "grad_norm": 2.397808074951172, + "learning_rate": 0.001, + "loss": 3.0319, + "step": 247 + }, + { + "epoch": 0.010491581352060242, + "grad_norm": 1.5704766511917114, + "learning_rate": 0.001, + "loss": 2.6339, + "step": 248 + }, + { + "epoch": 0.010533886115576614, + "grad_norm": 2.5879034996032715, + "learning_rate": 0.001, + "loss": 2.4711, + "step": 249 + }, + { + "epoch": 0.010576190879092985, + "grad_norm": 0.5719565153121948, + "learning_rate": 0.001, + "loss": 2.0952, + "step": 250 + }, + { + "epoch": 0.010618495642609358, + "grad_norm": 0.48833173513412476, + "learning_rate": 0.001, + "loss": 1.9318, + "step": 251 + }, + { + "epoch": 0.01066080040612573, + "grad_norm": 0.4771743416786194, + "learning_rate": 0.001, + "loss": 2.4739, + "step": 252 + }, + { + "epoch": 0.010703105169642102, + "grad_norm": 0.6914018392562866, + "learning_rate": 0.001, + "loss": 3.0228, + "step": 253 + }, + { + "epoch": 0.010745409933158473, + "grad_norm": 0.655381441116333, + "learning_rate": 0.001, + "loss": 2.2491, + "step": 254 + }, + { + "epoch": 0.010787714696674845, + "grad_norm": 0.47917628288269043, + "learning_rate": 0.001, + "loss": 3.1589, + "step": 255 + }, + { + "epoch": 0.010830019460191218, + "grad_norm": 0.5288345217704773, + "learning_rate": 0.001, + "loss": 2.5891, + "step": 256 + }, + { + "epoch": 0.01087232422370759, + "grad_norm": 0.7679047584533691, + "learning_rate": 0.001, + "loss": 2.1992, + "step": 257 + }, + { + "epoch": 0.010914628987223961, + "grad_norm": 0.4910162091255188, + "learning_rate": 0.001, + "loss": 2.5994, + "step": 258 + }, + { + "epoch": 0.010956933750740333, + "grad_norm": 0.40495947003364563, + "learning_rate": 0.001, + "loss": 2.7402, + "step": 259 + }, + { + "epoch": 0.010999238514256705, + "grad_norm": 0.32625123858451843, + "learning_rate": 0.001, + "loss": 2.6868, + "step": 260 + }, + { + "epoch": 0.011041543277773078, + "grad_norm": 0.4822738468647003, + "learning_rate": 0.001, + "loss": 2.6367, + "step": 261 + }, + { + "epoch": 0.01108384804128945, + "grad_norm": 0.5915836095809937, + "learning_rate": 0.001, + "loss": 2.3544, + "step": 262 + }, + { + "epoch": 0.011126152804805821, + "grad_norm": 0.32710862159729004, + "learning_rate": 0.001, + "loss": 2.3255, + "step": 263 + }, + { + "epoch": 0.011168457568322193, + "grad_norm": 0.4875565767288208, + "learning_rate": 0.001, + "loss": 2.5211, + "step": 264 + }, + { + "epoch": 0.011210762331838564, + "grad_norm": 1.2803752422332764, + "learning_rate": 0.001, + "loss": 2.407, + "step": 265 + }, + { + "epoch": 0.011253067095354938, + "grad_norm": 0.7037876844406128, + "learning_rate": 0.001, + "loss": 3.2321, + "step": 266 + }, + { + "epoch": 0.01129537185887131, + "grad_norm": 0.36865493655204773, + "learning_rate": 0.001, + "loss": 2.4842, + "step": 267 + }, + { + "epoch": 0.011337676622387681, + "grad_norm": 0.4892154037952423, + "learning_rate": 0.001, + "loss": 3.5454, + "step": 268 + }, + { + "epoch": 0.011379981385904053, + "grad_norm": 0.5366520285606384, + "learning_rate": 0.001, + "loss": 2.1276, + "step": 269 + }, + { + "epoch": 0.011422286149420424, + "grad_norm": 0.47337809205055237, + "learning_rate": 0.001, + "loss": 3.3193, + "step": 270 + }, + { + "epoch": 0.011464590912936798, + "grad_norm": 0.4499555230140686, + "learning_rate": 0.001, + "loss": 3.5789, + "step": 271 + }, + { + "epoch": 0.01150689567645317, + "grad_norm": 0.3636587858200073, + "learning_rate": 0.001, + "loss": 2.3532, + "step": 272 + }, + { + "epoch": 0.01154920043996954, + "grad_norm": 0.5930542349815369, + "learning_rate": 0.001, + "loss": 2.3694, + "step": 273 + }, + { + "epoch": 0.011591505203485912, + "grad_norm": 0.7394574284553528, + "learning_rate": 0.001, + "loss": 3.1853, + "step": 274 + }, + { + "epoch": 0.011633809967002284, + "grad_norm": 0.6907839775085449, + "learning_rate": 0.001, + "loss": 2.5174, + "step": 275 + }, + { + "epoch": 0.011676114730518656, + "grad_norm": 1.15705406665802, + "learning_rate": 0.001, + "loss": 2.315, + "step": 276 + }, + { + "epoch": 0.011718419494035029, + "grad_norm": 0.5256633758544922, + "learning_rate": 0.001, + "loss": 2.429, + "step": 277 + }, + { + "epoch": 0.0117607242575514, + "grad_norm": 0.9328014850616455, + "learning_rate": 0.001, + "loss": 2.4643, + "step": 278 + }, + { + "epoch": 0.011803029021067772, + "grad_norm": 0.5655497312545776, + "learning_rate": 0.001, + "loss": 3.344, + "step": 279 + }, + { + "epoch": 0.011845333784584144, + "grad_norm": 0.6000591516494751, + "learning_rate": 0.001, + "loss": 3.2783, + "step": 280 + }, + { + "epoch": 0.011887638548100515, + "grad_norm": 1.3328900337219238, + "learning_rate": 0.001, + "loss": 4.1927, + "step": 281 + }, + { + "epoch": 0.011929943311616889, + "grad_norm": 2.8590199947357178, + "learning_rate": 0.001, + "loss": 2.3905, + "step": 282 + }, + { + "epoch": 0.01197224807513326, + "grad_norm": 0.7419810891151428, + "learning_rate": 0.001, + "loss": 2.6781, + "step": 283 + }, + { + "epoch": 0.012014552838649632, + "grad_norm": 0.6407875418663025, + "learning_rate": 0.001, + "loss": 2.5239, + "step": 284 + }, + { + "epoch": 0.012056857602166004, + "grad_norm": 0.7808129191398621, + "learning_rate": 0.001, + "loss": 2.41, + "step": 285 + }, + { + "epoch": 0.012099162365682375, + "grad_norm": 0.5127401947975159, + "learning_rate": 0.001, + "loss": 2.5634, + "step": 286 + }, + { + "epoch": 0.012141467129198748, + "grad_norm": 0.7659389972686768, + "learning_rate": 0.001, + "loss": 2.6071, + "step": 287 + }, + { + "epoch": 0.01218377189271512, + "grad_norm": 0.4947212040424347, + "learning_rate": 0.001, + "loss": 2.8672, + "step": 288 + }, + { + "epoch": 0.012226076656231492, + "grad_norm": 0.7338204979896545, + "learning_rate": 0.001, + "loss": 2.9333, + "step": 289 + }, + { + "epoch": 0.012268381419747863, + "grad_norm": 0.5832387804985046, + "learning_rate": 0.001, + "loss": 3.106, + "step": 290 + }, + { + "epoch": 0.012310686183264235, + "grad_norm": 0.3840453028678894, + "learning_rate": 0.001, + "loss": 1.9948, + "step": 291 + }, + { + "epoch": 0.012352990946780608, + "grad_norm": 0.6502733826637268, + "learning_rate": 0.001, + "loss": 2.5404, + "step": 292 + }, + { + "epoch": 0.01239529571029698, + "grad_norm": 0.4887555241584778, + "learning_rate": 0.001, + "loss": 2.4991, + "step": 293 + }, + { + "epoch": 0.012437600473813351, + "grad_norm": 0.9977436661720276, + "learning_rate": 0.001, + "loss": 2.0015, + "step": 294 + }, + { + "epoch": 0.012479905237329723, + "grad_norm": 0.6219228506088257, + "learning_rate": 0.001, + "loss": 2.7717, + "step": 295 + }, + { + "epoch": 0.012522210000846095, + "grad_norm": 0.4836757481098175, + "learning_rate": 0.001, + "loss": 2.207, + "step": 296 + }, + { + "epoch": 0.012564514764362468, + "grad_norm": 0.43116042017936707, + "learning_rate": 0.001, + "loss": 2.7609, + "step": 297 + }, + { + "epoch": 0.01260681952787884, + "grad_norm": 0.990545928478241, + "learning_rate": 0.001, + "loss": 2.7015, + "step": 298 + }, + { + "epoch": 0.012649124291395211, + "grad_norm": 0.39893725514411926, + "learning_rate": 0.001, + "loss": 1.6939, + "step": 299 + }, + { + "epoch": 0.012691429054911583, + "grad_norm": 0.6461228132247925, + "learning_rate": 0.001, + "loss": 2.8863, + "step": 300 + }, + { + "epoch": 0.012733733818427954, + "grad_norm": 1.7560275793075562, + "learning_rate": 0.001, + "loss": 2.7424, + "step": 301 + }, + { + "epoch": 0.012776038581944328, + "grad_norm": 0.40187498927116394, + "learning_rate": 0.001, + "loss": 2.6793, + "step": 302 + }, + { + "epoch": 0.0128183433454607, + "grad_norm": 1.5104907751083374, + "learning_rate": 0.001, + "loss": 3.1125, + "step": 303 + }, + { + "epoch": 0.012860648108977071, + "grad_norm": 0.4994620382785797, + "learning_rate": 0.001, + "loss": 2.9088, + "step": 304 + }, + { + "epoch": 0.012902952872493443, + "grad_norm": 0.4981285035610199, + "learning_rate": 0.001, + "loss": 2.1621, + "step": 305 + }, + { + "epoch": 0.012945257636009814, + "grad_norm": 0.5565193295478821, + "learning_rate": 0.001, + "loss": 2.0941, + "step": 306 + }, + { + "epoch": 0.012987562399526186, + "grad_norm": 0.5400714874267578, + "learning_rate": 0.001, + "loss": 2.6684, + "step": 307 + }, + { + "epoch": 0.01302986716304256, + "grad_norm": 0.6254021525382996, + "learning_rate": 0.001, + "loss": 2.0742, + "step": 308 + }, + { + "epoch": 0.01307217192655893, + "grad_norm": 0.6768165230751038, + "learning_rate": 0.001, + "loss": 2.6983, + "step": 309 + }, + { + "epoch": 0.013114476690075302, + "grad_norm": 0.4339453876018524, + "learning_rate": 0.001, + "loss": 2.8612, + "step": 310 + }, + { + "epoch": 0.013156781453591674, + "grad_norm": 0.5363678336143494, + "learning_rate": 0.001, + "loss": 2.3609, + "step": 311 + }, + { + "epoch": 0.013199086217108046, + "grad_norm": 0.5207034945487976, + "learning_rate": 0.001, + "loss": 2.3486, + "step": 312 + }, + { + "epoch": 0.013241390980624419, + "grad_norm": 0.6771058440208435, + "learning_rate": 0.001, + "loss": 1.5712, + "step": 313 + }, + { + "epoch": 0.01328369574414079, + "grad_norm": 0.5648268461227417, + "learning_rate": 0.001, + "loss": 3.9228, + "step": 314 + }, + { + "epoch": 0.013326000507657162, + "grad_norm": 0.464399516582489, + "learning_rate": 0.001, + "loss": 2.0737, + "step": 315 + }, + { + "epoch": 0.013368305271173534, + "grad_norm": 0.47292885184288025, + "learning_rate": 0.001, + "loss": 2.1631, + "step": 316 + }, + { + "epoch": 0.013410610034689905, + "grad_norm": 0.5086976885795593, + "learning_rate": 0.001, + "loss": 2.0718, + "step": 317 + }, + { + "epoch": 0.013452914798206279, + "grad_norm": 0.9637677669525146, + "learning_rate": 0.001, + "loss": 2.6751, + "step": 318 + }, + { + "epoch": 0.01349521956172265, + "grad_norm": 0.9571125507354736, + "learning_rate": 0.001, + "loss": 2.147, + "step": 319 + }, + { + "epoch": 0.013537524325239022, + "grad_norm": 0.37801995873451233, + "learning_rate": 0.001, + "loss": 1.9653, + "step": 320 + }, + { + "epoch": 0.013579829088755394, + "grad_norm": 0.5243141651153564, + "learning_rate": 0.001, + "loss": 2.2679, + "step": 321 + }, + { + "epoch": 0.013622133852271765, + "grad_norm": 0.6378395557403564, + "learning_rate": 0.001, + "loss": 1.8723, + "step": 322 + }, + { + "epoch": 0.013664438615788138, + "grad_norm": 0.45354440808296204, + "learning_rate": 0.001, + "loss": 2.2213, + "step": 323 + }, + { + "epoch": 0.01370674337930451, + "grad_norm": 0.976237952709198, + "learning_rate": 0.001, + "loss": 2.6927, + "step": 324 + }, + { + "epoch": 0.013749048142820882, + "grad_norm": 0.4840424358844757, + "learning_rate": 0.001, + "loss": 2.3498, + "step": 325 + }, + { + "epoch": 0.013791352906337253, + "grad_norm": 0.5081096291542053, + "learning_rate": 0.001, + "loss": 2.9997, + "step": 326 + }, + { + "epoch": 0.013833657669853625, + "grad_norm": 0.5560334324836731, + "learning_rate": 0.001, + "loss": 3.1096, + "step": 327 + }, + { + "epoch": 0.013875962433369998, + "grad_norm": 0.41798263788223267, + "learning_rate": 0.001, + "loss": 2.0079, + "step": 328 + }, + { + "epoch": 0.01391826719688637, + "grad_norm": 0.40997782349586487, + "learning_rate": 0.001, + "loss": 1.4621, + "step": 329 + }, + { + "epoch": 0.013960571960402741, + "grad_norm": 0.4821615517139435, + "learning_rate": 0.001, + "loss": 2.2829, + "step": 330 + }, + { + "epoch": 0.014002876723919113, + "grad_norm": 0.332916796207428, + "learning_rate": 0.001, + "loss": 1.9644, + "step": 331 + }, + { + "epoch": 0.014045181487435485, + "grad_norm": 0.49843454360961914, + "learning_rate": 0.001, + "loss": 3.7125, + "step": 332 + }, + { + "epoch": 0.014087486250951858, + "grad_norm": 0.6166870594024658, + "learning_rate": 0.001, + "loss": 1.9159, + "step": 333 + }, + { + "epoch": 0.01412979101446823, + "grad_norm": 0.6194549202919006, + "learning_rate": 0.001, + "loss": 2.4323, + "step": 334 + }, + { + "epoch": 0.014172095777984601, + "grad_norm": 0.5313667058944702, + "learning_rate": 0.001, + "loss": 2.9331, + "step": 335 + }, + { + "epoch": 0.014214400541500973, + "grad_norm": 0.3673984408378601, + "learning_rate": 0.001, + "loss": 2.2797, + "step": 336 + }, + { + "epoch": 0.014256705305017344, + "grad_norm": 0.4663550853729248, + "learning_rate": 0.001, + "loss": 3.1726, + "step": 337 + }, + { + "epoch": 0.014299010068533716, + "grad_norm": 1.455588459968567, + "learning_rate": 0.001, + "loss": 1.7567, + "step": 338 + }, + { + "epoch": 0.01434131483205009, + "grad_norm": 1.234546184539795, + "learning_rate": 0.001, + "loss": 2.1583, + "step": 339 + }, + { + "epoch": 0.014383619595566461, + "grad_norm": 2.307666301727295, + "learning_rate": 0.001, + "loss": 2.2571, + "step": 340 + }, + { + "epoch": 0.014425924359082833, + "grad_norm": 2.126053810119629, + "learning_rate": 0.001, + "loss": 2.0934, + "step": 341 + }, + { + "epoch": 0.014468229122599204, + "grad_norm": 0.5785415768623352, + "learning_rate": 0.001, + "loss": 2.4343, + "step": 342 + }, + { + "epoch": 0.014510533886115576, + "grad_norm": 0.5483373999595642, + "learning_rate": 0.001, + "loss": 1.878, + "step": 343 + }, + { + "epoch": 0.01455283864963195, + "grad_norm": 0.7590854167938232, + "learning_rate": 0.001, + "loss": 3.4764, + "step": 344 + }, + { + "epoch": 0.01459514341314832, + "grad_norm": 1.6678118705749512, + "learning_rate": 0.001, + "loss": 1.8089, + "step": 345 + }, + { + "epoch": 0.014637448176664692, + "grad_norm": 0.5652883052825928, + "learning_rate": 0.001, + "loss": 4.2131, + "step": 346 + }, + { + "epoch": 0.014679752940181064, + "grad_norm": 0.5262782573699951, + "learning_rate": 0.001, + "loss": 3.3063, + "step": 347 + }, + { + "epoch": 0.014722057703697436, + "grad_norm": 0.5519827604293823, + "learning_rate": 0.001, + "loss": 2.9829, + "step": 348 + }, + { + "epoch": 0.014764362467213809, + "grad_norm": 0.4387628436088562, + "learning_rate": 0.001, + "loss": 2.4457, + "step": 349 + }, + { + "epoch": 0.01480666723073018, + "grad_norm": 0.5566309690475464, + "learning_rate": 0.001, + "loss": 2.4278, + "step": 350 + }, + { + "epoch": 0.014848971994246552, + "grad_norm": 0.3900257647037506, + "learning_rate": 0.001, + "loss": 2.6352, + "step": 351 + }, + { + "epoch": 0.014891276757762924, + "grad_norm": 0.4080370366573334, + "learning_rate": 0.001, + "loss": 2.1755, + "step": 352 + }, + { + "epoch": 0.014933581521279295, + "grad_norm": 0.6504301428794861, + "learning_rate": 0.001, + "loss": 2.5511, + "step": 353 + }, + { + "epoch": 0.014975886284795669, + "grad_norm": 0.9981370568275452, + "learning_rate": 0.001, + "loss": 2.4473, + "step": 354 + }, + { + "epoch": 0.01501819104831204, + "grad_norm": 1.0626752376556396, + "learning_rate": 0.001, + "loss": 2.398, + "step": 355 + }, + { + "epoch": 0.015060495811828412, + "grad_norm": 0.487619549036026, + "learning_rate": 0.001, + "loss": 2.8647, + "step": 356 + }, + { + "epoch": 0.015102800575344784, + "grad_norm": 0.47419092059135437, + "learning_rate": 0.001, + "loss": 2.0267, + "step": 357 + }, + { + "epoch": 0.015145105338861155, + "grad_norm": 0.6504938006401062, + "learning_rate": 0.001, + "loss": 2.5824, + "step": 358 + }, + { + "epoch": 0.015187410102377528, + "grad_norm": 0.5113835334777832, + "learning_rate": 0.001, + "loss": 3.355, + "step": 359 + }, + { + "epoch": 0.0152297148658939, + "grad_norm": 0.38182953000068665, + "learning_rate": 0.001, + "loss": 2.3596, + "step": 360 + }, + { + "epoch": 0.015272019629410272, + "grad_norm": 0.5668423771858215, + "learning_rate": 0.001, + "loss": 2.5567, + "step": 361 + }, + { + "epoch": 0.015314324392926643, + "grad_norm": 0.6734775304794312, + "learning_rate": 0.001, + "loss": 1.5854, + "step": 362 + }, + { + "epoch": 0.015356629156443015, + "grad_norm": 3.227863073348999, + "learning_rate": 0.001, + "loss": 2.8558, + "step": 363 + }, + { + "epoch": 0.015398933919959388, + "grad_norm": 1.0969175100326538, + "learning_rate": 0.001, + "loss": 2.2552, + "step": 364 + }, + { + "epoch": 0.01544123868347576, + "grad_norm": 0.8865289092063904, + "learning_rate": 0.001, + "loss": 2.6505, + "step": 365 + }, + { + "epoch": 0.015483543446992131, + "grad_norm": 0.506199836730957, + "learning_rate": 0.001, + "loss": 2.7035, + "step": 366 + }, + { + "epoch": 0.015525848210508503, + "grad_norm": 0.4529375731945038, + "learning_rate": 0.001, + "loss": 3.187, + "step": 367 + }, + { + "epoch": 0.015568152974024875, + "grad_norm": 0.49568095803260803, + "learning_rate": 0.001, + "loss": 2.9369, + "step": 368 + }, + { + "epoch": 0.015610457737541246, + "grad_norm": 3.762782096862793, + "learning_rate": 0.001, + "loss": 3.8394, + "step": 369 + }, + { + "epoch": 0.01565276250105762, + "grad_norm": 1.1418230533599854, + "learning_rate": 0.001, + "loss": 2.9679, + "step": 370 + }, + { + "epoch": 0.01569506726457399, + "grad_norm": 1.2121808528900146, + "learning_rate": 0.001, + "loss": 3.15, + "step": 371 + }, + { + "epoch": 0.015737372028090363, + "grad_norm": 0.39264196157455444, + "learning_rate": 0.001, + "loss": 2.4917, + "step": 372 + }, + { + "epoch": 0.015779676791606734, + "grad_norm": 0.44038763642311096, + "learning_rate": 0.001, + "loss": 3.1301, + "step": 373 + }, + { + "epoch": 0.015821981555123106, + "grad_norm": 0.5781055688858032, + "learning_rate": 0.001, + "loss": 2.0814, + "step": 374 + }, + { + "epoch": 0.015864286318639478, + "grad_norm": 0.4739765226840973, + "learning_rate": 0.001, + "loss": 3.3641, + "step": 375 + }, + { + "epoch": 0.01590659108215585, + "grad_norm": 0.45817050337791443, + "learning_rate": 0.001, + "loss": 2.6101, + "step": 376 + }, + { + "epoch": 0.015948895845672224, + "grad_norm": 0.45967790484428406, + "learning_rate": 0.001, + "loss": 2.9429, + "step": 377 + }, + { + "epoch": 0.015991200609188596, + "grad_norm": 0.512673556804657, + "learning_rate": 0.001, + "loss": 2.7608, + "step": 378 + }, + { + "epoch": 0.016033505372704968, + "grad_norm": 0.32649508118629456, + "learning_rate": 0.001, + "loss": 2.9877, + "step": 379 + }, + { + "epoch": 0.01607581013622134, + "grad_norm": 0.45146769285202026, + "learning_rate": 0.001, + "loss": 3.3331, + "step": 380 + }, + { + "epoch": 0.01611811489973771, + "grad_norm": 0.7151511907577515, + "learning_rate": 0.001, + "loss": 2.1693, + "step": 381 + }, + { + "epoch": 0.016160419663254082, + "grad_norm": 0.4157434403896332, + "learning_rate": 0.001, + "loss": 2.2174, + "step": 382 + }, + { + "epoch": 0.016202724426770454, + "grad_norm": 0.4276852607727051, + "learning_rate": 0.001, + "loss": 2.6254, + "step": 383 + }, + { + "epoch": 0.016245029190286826, + "grad_norm": 1.40143620967865, + "learning_rate": 0.001, + "loss": 3.6675, + "step": 384 + }, + { + "epoch": 0.016287333953803197, + "grad_norm": 2.487680673599243, + "learning_rate": 0.001, + "loss": 1.7132, + "step": 385 + }, + { + "epoch": 0.01632963871731957, + "grad_norm": 0.7442037463188171, + "learning_rate": 0.001, + "loss": 2.906, + "step": 386 + }, + { + "epoch": 0.01637194348083594, + "grad_norm": 0.5867713093757629, + "learning_rate": 0.001, + "loss": 2.4791, + "step": 387 + }, + { + "epoch": 0.016414248244352315, + "grad_norm": 3.043729066848755, + "learning_rate": 0.001, + "loss": 3.2289, + "step": 388 + }, + { + "epoch": 0.016456553007868687, + "grad_norm": 1.29078209400177, + "learning_rate": 0.001, + "loss": 3.2969, + "step": 389 + }, + { + "epoch": 0.01649885777138506, + "grad_norm": 0.46137046813964844, + "learning_rate": 0.001, + "loss": 2.3551, + "step": 390 + }, + { + "epoch": 0.01654116253490143, + "grad_norm": 6.1766510009765625, + "learning_rate": 0.001, + "loss": 2.9738, + "step": 391 + }, + { + "epoch": 0.016583467298417802, + "grad_norm": 0.5667681694030762, + "learning_rate": 0.001, + "loss": 2.3875, + "step": 392 + }, + { + "epoch": 0.016625772061934174, + "grad_norm": 0.6917624473571777, + "learning_rate": 0.001, + "loss": 2.6381, + "step": 393 + }, + { + "epoch": 0.016668076825450545, + "grad_norm": 0.7266809344291687, + "learning_rate": 0.001, + "loss": 1.8559, + "step": 394 + }, + { + "epoch": 0.016710381588966917, + "grad_norm": 2.0730323791503906, + "learning_rate": 0.001, + "loss": 2.3098, + "step": 395 + }, + { + "epoch": 0.01675268635248329, + "grad_norm": 0.48305729031562805, + "learning_rate": 0.001, + "loss": 2.1544, + "step": 396 + }, + { + "epoch": 0.01679499111599966, + "grad_norm": 0.7128873467445374, + "learning_rate": 0.001, + "loss": 2.3416, + "step": 397 + }, + { + "epoch": 0.016837295879516035, + "grad_norm": 2.950924873352051, + "learning_rate": 0.001, + "loss": 3.7996, + "step": 398 + }, + { + "epoch": 0.016879600643032407, + "grad_norm": 0.8031635284423828, + "learning_rate": 0.001, + "loss": 2.3264, + "step": 399 + }, + { + "epoch": 0.016921905406548778, + "grad_norm": 0.6791091561317444, + "learning_rate": 0.001, + "loss": 2.3023, + "step": 400 + }, + { + "epoch": 0.01696421017006515, + "grad_norm": 0.445100873708725, + "learning_rate": 0.001, + "loss": 2.1986, + "step": 401 + }, + { + "epoch": 0.01700651493358152, + "grad_norm": 0.43192756175994873, + "learning_rate": 0.001, + "loss": 2.4621, + "step": 402 + }, + { + "epoch": 0.017048819697097893, + "grad_norm": 0.6217234134674072, + "learning_rate": 0.001, + "loss": 2.6163, + "step": 403 + }, + { + "epoch": 0.017091124460614265, + "grad_norm": 7.2630791664123535, + "learning_rate": 0.001, + "loss": 2.2638, + "step": 404 + }, + { + "epoch": 0.017133429224130636, + "grad_norm": 0.48048147559165955, + "learning_rate": 0.001, + "loss": 2.6667, + "step": 405 + }, + { + "epoch": 0.017175733987647008, + "grad_norm": 1.1334668397903442, + "learning_rate": 0.001, + "loss": 3.1583, + "step": 406 + }, + { + "epoch": 0.01721803875116338, + "grad_norm": 0.3903160095214844, + "learning_rate": 0.001, + "loss": 3.0678, + "step": 407 + }, + { + "epoch": 0.017260343514679755, + "grad_norm": 0.8138815760612488, + "learning_rate": 0.001, + "loss": 2.5374, + "step": 408 + }, + { + "epoch": 0.017302648278196126, + "grad_norm": 0.8442886471748352, + "learning_rate": 0.001, + "loss": 2.5721, + "step": 409 + }, + { + "epoch": 0.017344953041712498, + "grad_norm": 0.6825190186500549, + "learning_rate": 0.001, + "loss": 2.7989, + "step": 410 + }, + { + "epoch": 0.01738725780522887, + "grad_norm": 0.625694215297699, + "learning_rate": 0.001, + "loss": 3.3783, + "step": 411 + }, + { + "epoch": 0.01742956256874524, + "grad_norm": 0.5650992393493652, + "learning_rate": 0.001, + "loss": 3.2683, + "step": 412 + }, + { + "epoch": 0.017471867332261613, + "grad_norm": 0.5415240526199341, + "learning_rate": 0.001, + "loss": 3.2396, + "step": 413 + }, + { + "epoch": 0.017514172095777984, + "grad_norm": 1.5903420448303223, + "learning_rate": 0.001, + "loss": 3.1021, + "step": 414 + }, + { + "epoch": 0.017556476859294356, + "grad_norm": 0.4088042676448822, + "learning_rate": 0.001, + "loss": 2.5221, + "step": 415 + }, + { + "epoch": 0.017598781622810727, + "grad_norm": 0.367471307516098, + "learning_rate": 0.001, + "loss": 2.0654, + "step": 416 + }, + { + "epoch": 0.0176410863863271, + "grad_norm": 0.6564489603042603, + "learning_rate": 0.001, + "loss": 2.2583, + "step": 417 + }, + { + "epoch": 0.01768339114984347, + "grad_norm": 0.681338369846344, + "learning_rate": 0.001, + "loss": 2.0275, + "step": 418 + }, + { + "epoch": 0.017725695913359846, + "grad_norm": 0.36002209782600403, + "learning_rate": 0.001, + "loss": 2.3529, + "step": 419 + }, + { + "epoch": 0.017768000676876217, + "grad_norm": 0.8276366591453552, + "learning_rate": 0.001, + "loss": 1.9392, + "step": 420 + }, + { + "epoch": 0.01781030544039259, + "grad_norm": 0.6230726838111877, + "learning_rate": 0.001, + "loss": 2.0964, + "step": 421 + }, + { + "epoch": 0.01785261020390896, + "grad_norm": 0.5634045004844666, + "learning_rate": 0.001, + "loss": 2.8694, + "step": 422 + }, + { + "epoch": 0.017894914967425332, + "grad_norm": 0.6966080665588379, + "learning_rate": 0.001, + "loss": 3.0085, + "step": 423 + }, + { + "epoch": 0.017937219730941704, + "grad_norm": 1.007291316986084, + "learning_rate": 0.001, + "loss": 2.2582, + "step": 424 + }, + { + "epoch": 0.017979524494458075, + "grad_norm": 1.6382334232330322, + "learning_rate": 0.001, + "loss": 3.448, + "step": 425 + }, + { + "epoch": 0.018021829257974447, + "grad_norm": 0.46437203884124756, + "learning_rate": 0.001, + "loss": 2.4852, + "step": 426 + }, + { + "epoch": 0.01806413402149082, + "grad_norm": 0.6570785641670227, + "learning_rate": 0.001, + "loss": 2.6239, + "step": 427 + }, + { + "epoch": 0.01810643878500719, + "grad_norm": 0.501592755317688, + "learning_rate": 0.001, + "loss": 2.7384, + "step": 428 + }, + { + "epoch": 0.018148743548523565, + "grad_norm": 0.44482702016830444, + "learning_rate": 0.001, + "loss": 2.3664, + "step": 429 + }, + { + "epoch": 0.018191048312039937, + "grad_norm": 0.4519723951816559, + "learning_rate": 0.001, + "loss": 3.4909, + "step": 430 + }, + { + "epoch": 0.01823335307555631, + "grad_norm": 0.37969911098480225, + "learning_rate": 0.001, + "loss": 2.9331, + "step": 431 + }, + { + "epoch": 0.01827565783907268, + "grad_norm": 0.35430458188056946, + "learning_rate": 0.001, + "loss": 1.9049, + "step": 432 + }, + { + "epoch": 0.01831796260258905, + "grad_norm": 0.3929635286331177, + "learning_rate": 0.001, + "loss": 2.3463, + "step": 433 + }, + { + "epoch": 0.018360267366105423, + "grad_norm": 0.6036363244056702, + "learning_rate": 0.001, + "loss": 2.2359, + "step": 434 + }, + { + "epoch": 0.018402572129621795, + "grad_norm": 0.9288585186004639, + "learning_rate": 0.001, + "loss": 2.1569, + "step": 435 + }, + { + "epoch": 0.018444876893138167, + "grad_norm": 0.5024868249893188, + "learning_rate": 0.001, + "loss": 2.3324, + "step": 436 + }, + { + "epoch": 0.018487181656654538, + "grad_norm": 0.3713514804840088, + "learning_rate": 0.001, + "loss": 1.9664, + "step": 437 + }, + { + "epoch": 0.01852948642017091, + "grad_norm": 0.3861111104488373, + "learning_rate": 0.001, + "loss": 1.807, + "step": 438 + }, + { + "epoch": 0.018571791183687285, + "grad_norm": 0.42646774649620056, + "learning_rate": 0.001, + "loss": 2.1702, + "step": 439 + }, + { + "epoch": 0.018614095947203656, + "grad_norm": 1.0648521184921265, + "learning_rate": 0.001, + "loss": 2.8879, + "step": 440 + }, + { + "epoch": 0.018656400710720028, + "grad_norm": 0.5311859846115112, + "learning_rate": 0.001, + "loss": 2.2553, + "step": 441 + }, + { + "epoch": 0.0186987054742364, + "grad_norm": 0.4035789966583252, + "learning_rate": 0.001, + "loss": 2.5759, + "step": 442 + }, + { + "epoch": 0.01874101023775277, + "grad_norm": 0.4712333083152771, + "learning_rate": 0.001, + "loss": 3.1349, + "step": 443 + }, + { + "epoch": 0.018783315001269143, + "grad_norm": 0.3229919373989105, + "learning_rate": 0.001, + "loss": 2.1652, + "step": 444 + }, + { + "epoch": 0.018825619764785514, + "grad_norm": 0.3765084147453308, + "learning_rate": 0.001, + "loss": 2.9515, + "step": 445 + }, + { + "epoch": 0.018867924528301886, + "grad_norm": 0.33263400197029114, + "learning_rate": 0.001, + "loss": 2.7202, + "step": 446 + }, + { + "epoch": 0.018910229291818258, + "grad_norm": 1.0643253326416016, + "learning_rate": 0.001, + "loss": 2.129, + "step": 447 + }, + { + "epoch": 0.01895253405533463, + "grad_norm": 0.5070205926895142, + "learning_rate": 0.001, + "loss": 2.2968, + "step": 448 + }, + { + "epoch": 0.018994838818851, + "grad_norm": 0.8920692801475525, + "learning_rate": 0.001, + "loss": 2.2572, + "step": 449 + }, + { + "epoch": 0.019037143582367376, + "grad_norm": 0.5254008173942566, + "learning_rate": 0.001, + "loss": 2.6386, + "step": 450 + }, + { + "epoch": 0.019079448345883748, + "grad_norm": 0.7540894746780396, + "learning_rate": 0.001, + "loss": 3.8766, + "step": 451 + }, + { + "epoch": 0.01912175310940012, + "grad_norm": 3.530369281768799, + "learning_rate": 0.001, + "loss": 2.6652, + "step": 452 + }, + { + "epoch": 0.01916405787291649, + "grad_norm": 0.385513573884964, + "learning_rate": 0.001, + "loss": 2.0282, + "step": 453 + }, + { + "epoch": 0.019206362636432862, + "grad_norm": 0.3878650367259979, + "learning_rate": 0.001, + "loss": 2.3184, + "step": 454 + }, + { + "epoch": 0.019248667399949234, + "grad_norm": 0.47612032294273376, + "learning_rate": 0.001, + "loss": 2.2334, + "step": 455 + }, + { + "epoch": 0.019290972163465606, + "grad_norm": 0.4401743412017822, + "learning_rate": 0.001, + "loss": 1.9957, + "step": 456 + }, + { + "epoch": 0.019333276926981977, + "grad_norm": 0.4817490875720978, + "learning_rate": 0.001, + "loss": 2.1504, + "step": 457 + }, + { + "epoch": 0.01937558169049835, + "grad_norm": 0.461186021566391, + "learning_rate": 0.001, + "loss": 2.0759, + "step": 458 + }, + { + "epoch": 0.01941788645401472, + "grad_norm": 0.572381317615509, + "learning_rate": 0.001, + "loss": 2.7651, + "step": 459 + }, + { + "epoch": 0.019460191217531096, + "grad_norm": 0.4275853931903839, + "learning_rate": 0.001, + "loss": 2.5997, + "step": 460 + }, + { + "epoch": 0.019502495981047467, + "grad_norm": 0.4098559021949768, + "learning_rate": 0.001, + "loss": 2.641, + "step": 461 + }, + { + "epoch": 0.01954480074456384, + "grad_norm": 0.4118167757987976, + "learning_rate": 0.001, + "loss": 1.7914, + "step": 462 + }, + { + "epoch": 0.01958710550808021, + "grad_norm": 0.46635180711746216, + "learning_rate": 0.001, + "loss": 3.0353, + "step": 463 + }, + { + "epoch": 0.019629410271596582, + "grad_norm": 1.0905091762542725, + "learning_rate": 0.001, + "loss": 2.4585, + "step": 464 + }, + { + "epoch": 0.019671715035112954, + "grad_norm": 0.42049458622932434, + "learning_rate": 0.001, + "loss": 2.4426, + "step": 465 + }, + { + "epoch": 0.019714019798629325, + "grad_norm": 0.5391716361045837, + "learning_rate": 0.001, + "loss": 2.2722, + "step": 466 + }, + { + "epoch": 0.019756324562145697, + "grad_norm": 1.7512651681900024, + "learning_rate": 0.001, + "loss": 1.9385, + "step": 467 + }, + { + "epoch": 0.01979862932566207, + "grad_norm": 0.33571651577949524, + "learning_rate": 0.001, + "loss": 2.7864, + "step": 468 + }, + { + "epoch": 0.01984093408917844, + "grad_norm": 0.4144052267074585, + "learning_rate": 0.001, + "loss": 2.1114, + "step": 469 + }, + { + "epoch": 0.019883238852694815, + "grad_norm": 0.39613720774650574, + "learning_rate": 0.001, + "loss": 2.5951, + "step": 470 + }, + { + "epoch": 0.019925543616211187, + "grad_norm": 0.5358018279075623, + "learning_rate": 0.001, + "loss": 3.1943, + "step": 471 + }, + { + "epoch": 0.01996784837972756, + "grad_norm": 0.5452474355697632, + "learning_rate": 0.001, + "loss": 2.9164, + "step": 472 + }, + { + "epoch": 0.02001015314324393, + "grad_norm": 0.3648831248283386, + "learning_rate": 0.001, + "loss": 2.3703, + "step": 473 + }, + { + "epoch": 0.0200524579067603, + "grad_norm": 0.3464433252811432, + "learning_rate": 0.001, + "loss": 2.7931, + "step": 474 + }, + { + "epoch": 0.020094762670276673, + "grad_norm": 1.0535080432891846, + "learning_rate": 0.001, + "loss": 2.5576, + "step": 475 + }, + { + "epoch": 0.020137067433793045, + "grad_norm": 0.2924720346927643, + "learning_rate": 0.001, + "loss": 2.6184, + "step": 476 + }, + { + "epoch": 0.020179372197309416, + "grad_norm": 0.388220876455307, + "learning_rate": 0.001, + "loss": 1.9705, + "step": 477 + }, + { + "epoch": 0.020221676960825788, + "grad_norm": 1.577642560005188, + "learning_rate": 0.001, + "loss": 2.0881, + "step": 478 + }, + { + "epoch": 0.02026398172434216, + "grad_norm": 0.795059323310852, + "learning_rate": 0.001, + "loss": 2.5844, + "step": 479 + }, + { + "epoch": 0.020306286487858535, + "grad_norm": 0.7602189183235168, + "learning_rate": 0.001, + "loss": 2.3127, + "step": 480 + }, + { + "epoch": 0.020348591251374906, + "grad_norm": 0.44378194212913513, + "learning_rate": 0.001, + "loss": 2.1581, + "step": 481 + }, + { + "epoch": 0.020390896014891278, + "grad_norm": 0.5162745714187622, + "learning_rate": 0.001, + "loss": 2.8813, + "step": 482 + }, + { + "epoch": 0.02043320077840765, + "grad_norm": 0.39076024293899536, + "learning_rate": 0.001, + "loss": 2.1318, + "step": 483 + }, + { + "epoch": 0.02047550554192402, + "grad_norm": 0.3666277825832367, + "learning_rate": 0.001, + "loss": 2.2807, + "step": 484 + }, + { + "epoch": 0.020517810305440393, + "grad_norm": 0.5351219177246094, + "learning_rate": 0.001, + "loss": 2.2821, + "step": 485 + }, + { + "epoch": 0.020560115068956764, + "grad_norm": 0.3654989004135132, + "learning_rate": 0.001, + "loss": 2.062, + "step": 486 + }, + { + "epoch": 0.020602419832473136, + "grad_norm": 0.3425626754760742, + "learning_rate": 0.001, + "loss": 1.9772, + "step": 487 + }, + { + "epoch": 0.020644724595989507, + "grad_norm": 6.2479166984558105, + "learning_rate": 0.001, + "loss": 1.9888, + "step": 488 + }, + { + "epoch": 0.02068702935950588, + "grad_norm": 0.381218820810318, + "learning_rate": 0.001, + "loss": 2.1181, + "step": 489 + }, + { + "epoch": 0.02072933412302225, + "grad_norm": 0.37633004784584045, + "learning_rate": 0.001, + "loss": 1.9825, + "step": 490 + }, + { + "epoch": 0.020771638886538626, + "grad_norm": 0.5049352049827576, + "learning_rate": 0.001, + "loss": 2.3675, + "step": 491 + }, + { + "epoch": 0.020813943650054997, + "grad_norm": 0.3255545496940613, + "learning_rate": 0.001, + "loss": 1.4964, + "step": 492 + }, + { + "epoch": 0.02085624841357137, + "grad_norm": 0.43696948885917664, + "learning_rate": 0.001, + "loss": 2.4114, + "step": 493 + }, + { + "epoch": 0.02089855317708774, + "grad_norm": 0.37952399253845215, + "learning_rate": 0.001, + "loss": 1.6125, + "step": 494 + }, + { + "epoch": 0.020940857940604112, + "grad_norm": 0.41570955514907837, + "learning_rate": 0.001, + "loss": 3.0489, + "step": 495 + }, + { + "epoch": 0.020983162704120484, + "grad_norm": 0.46523749828338623, + "learning_rate": 0.001, + "loss": 2.5591, + "step": 496 + }, + { + "epoch": 0.021025467467636855, + "grad_norm": 0.8702875971794128, + "learning_rate": 0.001, + "loss": 2.686, + "step": 497 + }, + { + "epoch": 0.021067772231153227, + "grad_norm": 2.071568012237549, + "learning_rate": 0.001, + "loss": 2.4348, + "step": 498 + }, + { + "epoch": 0.0211100769946696, + "grad_norm": 0.8501281142234802, + "learning_rate": 0.001, + "loss": 1.8937, + "step": 499 + }, + { + "epoch": 0.02115238175818597, + "grad_norm": 22.378358840942383, + "learning_rate": 0.001, + "loss": 2.139, + "step": 500 + }, + { + "epoch": 0.021194686521702345, + "grad_norm": 56.75684356689453, + "learning_rate": 0.001, + "loss": 3.3663, + "step": 501 + }, + { + "epoch": 0.021236991285218717, + "grad_norm": 0.6316158771514893, + "learning_rate": 0.001, + "loss": 2.0616, + "step": 502 + }, + { + "epoch": 0.02127929604873509, + "grad_norm": 0.6567540764808655, + "learning_rate": 0.001, + "loss": 3.1767, + "step": 503 + }, + { + "epoch": 0.02132160081225146, + "grad_norm": 3.0959877967834473, + "learning_rate": 0.001, + "loss": 3.7518, + "step": 504 + }, + { + "epoch": 0.02136390557576783, + "grad_norm": 0.4583059549331665, + "learning_rate": 0.001, + "loss": 2.7923, + "step": 505 + }, + { + "epoch": 0.021406210339284203, + "grad_norm": 0.7765421271324158, + "learning_rate": 0.001, + "loss": 3.0666, + "step": 506 + }, + { + "epoch": 0.021448515102800575, + "grad_norm": 0.5920457243919373, + "learning_rate": 0.001, + "loss": 2.7377, + "step": 507 + }, + { + "epoch": 0.021490819866316947, + "grad_norm": 0.5121840834617615, + "learning_rate": 0.001, + "loss": 2.056, + "step": 508 + }, + { + "epoch": 0.021533124629833318, + "grad_norm": 0.3588508367538452, + "learning_rate": 0.001, + "loss": 2.4136, + "step": 509 + }, + { + "epoch": 0.02157542939334969, + "grad_norm": 0.5358933806419373, + "learning_rate": 0.001, + "loss": 2.65, + "step": 510 + }, + { + "epoch": 0.021617734156866065, + "grad_norm": 0.45659172534942627, + "learning_rate": 0.001, + "loss": 3.4225, + "step": 511 + }, + { + "epoch": 0.021660038920382436, + "grad_norm": 0.4603763520717621, + "learning_rate": 0.001, + "loss": 2.3112, + "step": 512 + }, + { + "epoch": 0.021702343683898808, + "grad_norm": 0.4111917316913605, + "learning_rate": 0.001, + "loss": 2.4298, + "step": 513 + }, + { + "epoch": 0.02174464844741518, + "grad_norm": 0.3395131826400757, + "learning_rate": 0.001, + "loss": 2.235, + "step": 514 + }, + { + "epoch": 0.02178695321093155, + "grad_norm": 0.4213927388191223, + "learning_rate": 0.001, + "loss": 2.3075, + "step": 515 + }, + { + "epoch": 0.021829257974447923, + "grad_norm": 0.6425489187240601, + "learning_rate": 0.001, + "loss": 3.7484, + "step": 516 + }, + { + "epoch": 0.021871562737964294, + "grad_norm": 0.7505455613136292, + "learning_rate": 0.001, + "loss": 1.9084, + "step": 517 + }, + { + "epoch": 0.021913867501480666, + "grad_norm": 0.35921424627304077, + "learning_rate": 0.001, + "loss": 2.0054, + "step": 518 + }, + { + "epoch": 0.021956172264997038, + "grad_norm": 0.31863313913345337, + "learning_rate": 0.001, + "loss": 2.5181, + "step": 519 + }, + { + "epoch": 0.02199847702851341, + "grad_norm": 0.34935393929481506, + "learning_rate": 0.001, + "loss": 2.407, + "step": 520 + }, + { + "epoch": 0.02204078179202978, + "grad_norm": 0.34255489706993103, + "learning_rate": 0.001, + "loss": 2.0617, + "step": 521 + }, + { + "epoch": 0.022083086555546156, + "grad_norm": 0.3920708894729614, + "learning_rate": 0.001, + "loss": 2.9539, + "step": 522 + }, + { + "epoch": 0.022125391319062528, + "grad_norm": 0.3370257318019867, + "learning_rate": 0.001, + "loss": 1.8662, + "step": 523 + }, + { + "epoch": 0.0221676960825789, + "grad_norm": 0.4249412417411804, + "learning_rate": 0.001, + "loss": 1.8416, + "step": 524 + }, + { + "epoch": 0.02221000084609527, + "grad_norm": 0.35336267948150635, + "learning_rate": 0.001, + "loss": 2.1076, + "step": 525 + }, + { + "epoch": 0.022252305609611642, + "grad_norm": 0.3903971016407013, + "learning_rate": 0.001, + "loss": 2.2353, + "step": 526 + }, + { + "epoch": 0.022294610373128014, + "grad_norm": 0.3893096446990967, + "learning_rate": 0.001, + "loss": 2.6471, + "step": 527 + }, + { + "epoch": 0.022336915136644386, + "grad_norm": 0.3965369462966919, + "learning_rate": 0.001, + "loss": 2.8242, + "step": 528 + }, + { + "epoch": 0.022379219900160757, + "grad_norm": 0.5321797728538513, + "learning_rate": 0.001, + "loss": 2.8859, + "step": 529 + }, + { + "epoch": 0.02242152466367713, + "grad_norm": 0.4475747346878052, + "learning_rate": 0.001, + "loss": 3.02, + "step": 530 + }, + { + "epoch": 0.0224638294271935, + "grad_norm": 1.6769299507141113, + "learning_rate": 0.001, + "loss": 3.5111, + "step": 531 + }, + { + "epoch": 0.022506134190709876, + "grad_norm": 0.40092411637306213, + "learning_rate": 0.001, + "loss": 1.5887, + "step": 532 + }, + { + "epoch": 0.022548438954226247, + "grad_norm": 1.476195216178894, + "learning_rate": 0.001, + "loss": 2.5106, + "step": 533 + }, + { + "epoch": 0.02259074371774262, + "grad_norm": 1.381837248802185, + "learning_rate": 0.001, + "loss": 1.9169, + "step": 534 + }, + { + "epoch": 0.02263304848125899, + "grad_norm": 0.8170017004013062, + "learning_rate": 0.001, + "loss": 1.8514, + "step": 535 + }, + { + "epoch": 0.022675353244775362, + "grad_norm": 0.6372575163841248, + "learning_rate": 0.001, + "loss": 2.8408, + "step": 536 + }, + { + "epoch": 0.022717658008291734, + "grad_norm": 0.5463494658470154, + "learning_rate": 0.001, + "loss": 2.3254, + "step": 537 + }, + { + "epoch": 0.022759962771808105, + "grad_norm": 0.5595137476921082, + "learning_rate": 0.001, + "loss": 3.0805, + "step": 538 + }, + { + "epoch": 0.022802267535324477, + "grad_norm": 0.7195737361907959, + "learning_rate": 0.001, + "loss": 2.4751, + "step": 539 + }, + { + "epoch": 0.02284457229884085, + "grad_norm": 0.3829958438873291, + "learning_rate": 0.001, + "loss": 2.0523, + "step": 540 + }, + { + "epoch": 0.02288687706235722, + "grad_norm": 1.4294495582580566, + "learning_rate": 0.001, + "loss": 2.5157, + "step": 541 + }, + { + "epoch": 0.022929181825873595, + "grad_norm": 0.5000084638595581, + "learning_rate": 0.001, + "loss": 2.1641, + "step": 542 + }, + { + "epoch": 0.022971486589389967, + "grad_norm": 0.8625162243843079, + "learning_rate": 0.001, + "loss": 3.6346, + "step": 543 + }, + { + "epoch": 0.02301379135290634, + "grad_norm": 0.5158907771110535, + "learning_rate": 0.001, + "loss": 2.6786, + "step": 544 + }, + { + "epoch": 0.02305609611642271, + "grad_norm": 1.403883457183838, + "learning_rate": 0.001, + "loss": 2.3591, + "step": 545 + }, + { + "epoch": 0.02309840087993908, + "grad_norm": 1.1025398969650269, + "learning_rate": 0.001, + "loss": 4.0568, + "step": 546 + }, + { + "epoch": 0.023140705643455453, + "grad_norm": 0.7397220730781555, + "learning_rate": 0.001, + "loss": 2.6037, + "step": 547 + }, + { + "epoch": 0.023183010406971825, + "grad_norm": 1.0023000240325928, + "learning_rate": 0.001, + "loss": 2.5716, + "step": 548 + }, + { + "epoch": 0.023225315170488196, + "grad_norm": 0.9285343885421753, + "learning_rate": 0.001, + "loss": 2.373, + "step": 549 + }, + { + "epoch": 0.023267619934004568, + "grad_norm": 0.5404331088066101, + "learning_rate": 0.001, + "loss": 3.0031, + "step": 550 + }, + { + "epoch": 0.02330992469752094, + "grad_norm": 0.6443062424659729, + "learning_rate": 0.001, + "loss": 2.2552, + "step": 551 + }, + { + "epoch": 0.02335222946103731, + "grad_norm": 0.4823873043060303, + "learning_rate": 0.001, + "loss": 3.3305, + "step": 552 + }, + { + "epoch": 0.023394534224553686, + "grad_norm": 0.37984201312065125, + "learning_rate": 0.001, + "loss": 2.3388, + "step": 553 + }, + { + "epoch": 0.023436838988070058, + "grad_norm": 2.326221466064453, + "learning_rate": 0.001, + "loss": 2.7738, + "step": 554 + }, + { + "epoch": 0.02347914375158643, + "grad_norm": 0.5138685703277588, + "learning_rate": 0.001, + "loss": 2.2619, + "step": 555 + }, + { + "epoch": 0.0235214485151028, + "grad_norm": 1.178184151649475, + "learning_rate": 0.001, + "loss": 2.28, + "step": 556 + }, + { + "epoch": 0.023563753278619173, + "grad_norm": 0.4544781446456909, + "learning_rate": 0.001, + "loss": 2.7337, + "step": 557 + }, + { + "epoch": 0.023606058042135544, + "grad_norm": 0.589658260345459, + "learning_rate": 0.001, + "loss": 3.867, + "step": 558 + }, + { + "epoch": 0.023648362805651916, + "grad_norm": 0.7427922487258911, + "learning_rate": 0.001, + "loss": 2.7503, + "step": 559 + }, + { + "epoch": 0.023690667569168287, + "grad_norm": 0.3987663686275482, + "learning_rate": 0.001, + "loss": 1.9502, + "step": 560 + }, + { + "epoch": 0.02373297233268466, + "grad_norm": 0.45180320739746094, + "learning_rate": 0.001, + "loss": 2.6439, + "step": 561 + }, + { + "epoch": 0.02377527709620103, + "grad_norm": 0.677946150302887, + "learning_rate": 0.001, + "loss": 2.3775, + "step": 562 + }, + { + "epoch": 0.023817581859717406, + "grad_norm": 0.6633649468421936, + "learning_rate": 0.001, + "loss": 3.2441, + "step": 563 + }, + { + "epoch": 0.023859886623233777, + "grad_norm": 0.44053682684898376, + "learning_rate": 0.001, + "loss": 2.1561, + "step": 564 + }, + { + "epoch": 0.02390219138675015, + "grad_norm": 0.47400352358818054, + "learning_rate": 0.001, + "loss": 2.6964, + "step": 565 + }, + { + "epoch": 0.02394449615026652, + "grad_norm": 0.394731730222702, + "learning_rate": 0.001, + "loss": 2.7092, + "step": 566 + }, + { + "epoch": 0.023986800913782892, + "grad_norm": 0.34284764528274536, + "learning_rate": 0.001, + "loss": 2.6047, + "step": 567 + }, + { + "epoch": 0.024029105677299264, + "grad_norm": 0.8231601119041443, + "learning_rate": 0.001, + "loss": 2.5821, + "step": 568 + }, + { + "epoch": 0.024071410440815635, + "grad_norm": 0.47505271434783936, + "learning_rate": 0.001, + "loss": 2.4732, + "step": 569 + }, + { + "epoch": 0.024113715204332007, + "grad_norm": 0.40913763642311096, + "learning_rate": 0.001, + "loss": 2.8477, + "step": 570 + }, + { + "epoch": 0.02415601996784838, + "grad_norm": 0.3416093587875366, + "learning_rate": 0.001, + "loss": 2.3009, + "step": 571 + }, + { + "epoch": 0.02419832473136475, + "grad_norm": 0.3416256010532379, + "learning_rate": 0.001, + "loss": 2.4531, + "step": 572 + }, + { + "epoch": 0.024240629494881125, + "grad_norm": 0.355182409286499, + "learning_rate": 0.001, + "loss": 2.9286, + "step": 573 + }, + { + "epoch": 0.024282934258397497, + "grad_norm": 0.5212019085884094, + "learning_rate": 0.001, + "loss": 2.0053, + "step": 574 + }, + { + "epoch": 0.02432523902191387, + "grad_norm": 0.4017347991466522, + "learning_rate": 0.001, + "loss": 3.4724, + "step": 575 + }, + { + "epoch": 0.02436754378543024, + "grad_norm": 0.34895554184913635, + "learning_rate": 0.001, + "loss": 2.7992, + "step": 576 + }, + { + "epoch": 0.024409848548946612, + "grad_norm": 0.3911406993865967, + "learning_rate": 0.001, + "loss": 2.4038, + "step": 577 + }, + { + "epoch": 0.024452153312462983, + "grad_norm": 0.38320183753967285, + "learning_rate": 0.001, + "loss": 3.2096, + "step": 578 + }, + { + "epoch": 0.024494458075979355, + "grad_norm": 0.4434884786605835, + "learning_rate": 0.001, + "loss": 2.6099, + "step": 579 + }, + { + "epoch": 0.024536762839495727, + "grad_norm": 0.34895849227905273, + "learning_rate": 0.001, + "loss": 2.2875, + "step": 580 + }, + { + "epoch": 0.024579067603012098, + "grad_norm": 0.38560599088668823, + "learning_rate": 0.001, + "loss": 2.8941, + "step": 581 + }, + { + "epoch": 0.02462137236652847, + "grad_norm": 1.215502381324768, + "learning_rate": 0.001, + "loss": 3.2749, + "step": 582 + }, + { + "epoch": 0.02466367713004484, + "grad_norm": 0.38922634720802307, + "learning_rate": 0.001, + "loss": 3.0296, + "step": 583 + }, + { + "epoch": 0.024705981893561216, + "grad_norm": 0.39207783341407776, + "learning_rate": 0.001, + "loss": 2.1744, + "step": 584 + }, + { + "epoch": 0.024748286657077588, + "grad_norm": 0.4072285294532776, + "learning_rate": 0.001, + "loss": 3.4887, + "step": 585 + }, + { + "epoch": 0.02479059142059396, + "grad_norm": 0.4496638774871826, + "learning_rate": 0.001, + "loss": 2.5288, + "step": 586 + }, + { + "epoch": 0.02483289618411033, + "grad_norm": 0.6576820015907288, + "learning_rate": 0.001, + "loss": 2.7665, + "step": 587 + }, + { + "epoch": 0.024875200947626703, + "grad_norm": 0.32960543036460876, + "learning_rate": 0.001, + "loss": 2.256, + "step": 588 + }, + { + "epoch": 0.024917505711143074, + "grad_norm": 0.44357311725616455, + "learning_rate": 0.001, + "loss": 3.0581, + "step": 589 + }, + { + "epoch": 0.024959810474659446, + "grad_norm": 0.5914167165756226, + "learning_rate": 0.001, + "loss": 2.8782, + "step": 590 + }, + { + "epoch": 0.025002115238175818, + "grad_norm": 0.6511535048484802, + "learning_rate": 0.001, + "loss": 3.7855, + "step": 591 + }, + { + "epoch": 0.02504442000169219, + "grad_norm": 0.42436838150024414, + "learning_rate": 0.001, + "loss": 1.9582, + "step": 592 + }, + { + "epoch": 0.02508672476520856, + "grad_norm": 0.38704079389572144, + "learning_rate": 0.001, + "loss": 3.0496, + "step": 593 + }, + { + "epoch": 0.025129029528724936, + "grad_norm": 13.88198184967041, + "learning_rate": 0.001, + "loss": 2.714, + "step": 594 + }, + { + "epoch": 0.025171334292241308, + "grad_norm": 0.7542757987976074, + "learning_rate": 0.001, + "loss": 2.4161, + "step": 595 + }, + { + "epoch": 0.02521363905575768, + "grad_norm": 0.475299209356308, + "learning_rate": 0.001, + "loss": 2.1213, + "step": 596 + }, + { + "epoch": 0.02525594381927405, + "grad_norm": 0.599808931350708, + "learning_rate": 0.001, + "loss": 2.5554, + "step": 597 + }, + { + "epoch": 0.025298248582790422, + "grad_norm": 0.45108288526535034, + "learning_rate": 0.001, + "loss": 2.0113, + "step": 598 + }, + { + "epoch": 0.025340553346306794, + "grad_norm": 0.471336305141449, + "learning_rate": 0.001, + "loss": 2.5406, + "step": 599 + }, + { + "epoch": 0.025382858109823166, + "grad_norm": 0.5292516946792603, + "learning_rate": 0.001, + "loss": 2.4529, + "step": 600 + }, + { + "epoch": 0.025425162873339537, + "grad_norm": 0.7199414968490601, + "learning_rate": 0.001, + "loss": 2.9323, + "step": 601 + }, + { + "epoch": 0.02546746763685591, + "grad_norm": 0.48330041766166687, + "learning_rate": 0.001, + "loss": 2.0881, + "step": 602 + }, + { + "epoch": 0.02550977240037228, + "grad_norm": 0.3391055464744568, + "learning_rate": 0.001, + "loss": 1.897, + "step": 603 + }, + { + "epoch": 0.025552077163888656, + "grad_norm": 0.4490862190723419, + "learning_rate": 0.001, + "loss": 2.6384, + "step": 604 + }, + { + "epoch": 0.025594381927405027, + "grad_norm": 0.7531641125679016, + "learning_rate": 0.001, + "loss": 3.3676, + "step": 605 + }, + { + "epoch": 0.0256366866909214, + "grad_norm": 0.3735019266605377, + "learning_rate": 0.001, + "loss": 2.1751, + "step": 606 + }, + { + "epoch": 0.02567899145443777, + "grad_norm": 0.5445840954780579, + "learning_rate": 0.001, + "loss": 2.6547, + "step": 607 + }, + { + "epoch": 0.025721296217954142, + "grad_norm": 0.4098416566848755, + "learning_rate": 0.001, + "loss": 2.2004, + "step": 608 + }, + { + "epoch": 0.025763600981470514, + "grad_norm": 1.7501581907272339, + "learning_rate": 0.001, + "loss": 3.0225, + "step": 609 + }, + { + "epoch": 0.025805905744986885, + "grad_norm": 0.3755500912666321, + "learning_rate": 0.001, + "loss": 3.3205, + "step": 610 + }, + { + "epoch": 0.025848210508503257, + "grad_norm": 0.37451812624931335, + "learning_rate": 0.001, + "loss": 2.1556, + "step": 611 + }, + { + "epoch": 0.02589051527201963, + "grad_norm": 0.3815903663635254, + "learning_rate": 0.001, + "loss": 2.7517, + "step": 612 + }, + { + "epoch": 0.025932820035536, + "grad_norm": 0.327603280544281, + "learning_rate": 0.001, + "loss": 2.0281, + "step": 613 + }, + { + "epoch": 0.02597512479905237, + "grad_norm": 0.40311387181282043, + "learning_rate": 0.001, + "loss": 2.5982, + "step": 614 + }, + { + "epoch": 0.026017429562568747, + "grad_norm": 0.412945032119751, + "learning_rate": 0.001, + "loss": 2.4766, + "step": 615 + }, + { + "epoch": 0.02605973432608512, + "grad_norm": 0.7048538327217102, + "learning_rate": 0.001, + "loss": 2.8419, + "step": 616 + }, + { + "epoch": 0.02610203908960149, + "grad_norm": 0.3429849147796631, + "learning_rate": 0.001, + "loss": 2.3492, + "step": 617 + }, + { + "epoch": 0.02614434385311786, + "grad_norm": 0.39611124992370605, + "learning_rate": 0.001, + "loss": 2.307, + "step": 618 + }, + { + "epoch": 0.026186648616634233, + "grad_norm": 0.8335956931114197, + "learning_rate": 0.001, + "loss": 1.8694, + "step": 619 + }, + { + "epoch": 0.026228953380150605, + "grad_norm": 0.4806004762649536, + "learning_rate": 0.001, + "loss": 4.0374, + "step": 620 + }, + { + "epoch": 0.026271258143666976, + "grad_norm": 0.5001326203346252, + "learning_rate": 0.001, + "loss": 2.9919, + "step": 621 + }, + { + "epoch": 0.026313562907183348, + "grad_norm": 0.32230761647224426, + "learning_rate": 0.001, + "loss": 1.7651, + "step": 622 + }, + { + "epoch": 0.02635586767069972, + "grad_norm": 0.351005494594574, + "learning_rate": 0.001, + "loss": 2.1334, + "step": 623 + }, + { + "epoch": 0.02639817243421609, + "grad_norm": 0.3018239438533783, + "learning_rate": 0.001, + "loss": 2.0244, + "step": 624 + }, + { + "epoch": 0.026440477197732466, + "grad_norm": 0.4218401610851288, + "learning_rate": 0.001, + "loss": 2.0751, + "step": 625 + }, + { + "epoch": 0.026482781961248838, + "grad_norm": 0.4317582845687866, + "learning_rate": 0.001, + "loss": 3.2093, + "step": 626 + }, + { + "epoch": 0.02652508672476521, + "grad_norm": 0.604067325592041, + "learning_rate": 0.001, + "loss": 1.9582, + "step": 627 + }, + { + "epoch": 0.02656739148828158, + "grad_norm": 0.7366513013839722, + "learning_rate": 0.001, + "loss": 2.2498, + "step": 628 + }, + { + "epoch": 0.026609696251797953, + "grad_norm": 0.4882150888442993, + "learning_rate": 0.001, + "loss": 2.1566, + "step": 629 + }, + { + "epoch": 0.026652001015314324, + "grad_norm": 0.4116131663322449, + "learning_rate": 0.001, + "loss": 2.4123, + "step": 630 + }, + { + "epoch": 0.026694305778830696, + "grad_norm": 0.3892541527748108, + "learning_rate": 0.001, + "loss": 2.4224, + "step": 631 + }, + { + "epoch": 0.026736610542347067, + "grad_norm": 1.0133605003356934, + "learning_rate": 0.001, + "loss": 2.1201, + "step": 632 + }, + { + "epoch": 0.02677891530586344, + "grad_norm": 0.4348303973674774, + "learning_rate": 0.001, + "loss": 2.1447, + "step": 633 + }, + { + "epoch": 0.02682122006937981, + "grad_norm": 0.5337515473365784, + "learning_rate": 0.001, + "loss": 2.2251, + "step": 634 + }, + { + "epoch": 0.026863524832896186, + "grad_norm": 0.696890652179718, + "learning_rate": 0.001, + "loss": 2.4125, + "step": 635 + }, + { + "epoch": 0.026905829596412557, + "grad_norm": 0.4275985360145569, + "learning_rate": 0.001, + "loss": 2.661, + "step": 636 + }, + { + "epoch": 0.02694813435992893, + "grad_norm": 0.6423033475875854, + "learning_rate": 0.001, + "loss": 2.4138, + "step": 637 + }, + { + "epoch": 0.0269904391234453, + "grad_norm": 0.39524486660957336, + "learning_rate": 0.001, + "loss": 2.1282, + "step": 638 + }, + { + "epoch": 0.027032743886961672, + "grad_norm": 0.37805479764938354, + "learning_rate": 0.001, + "loss": 2.4532, + "step": 639 + }, + { + "epoch": 0.027075048650478044, + "grad_norm": 0.38140320777893066, + "learning_rate": 0.001, + "loss": 2.911, + "step": 640 + }, + { + "epoch": 0.027117353413994415, + "grad_norm": 0.33297136425971985, + "learning_rate": 0.001, + "loss": 1.9244, + "step": 641 + }, + { + "epoch": 0.027159658177510787, + "grad_norm": 0.5293309688568115, + "learning_rate": 0.001, + "loss": 2.1008, + "step": 642 + }, + { + "epoch": 0.02720196294102716, + "grad_norm": 0.7821404933929443, + "learning_rate": 0.001, + "loss": 2.258, + "step": 643 + }, + { + "epoch": 0.02724426770454353, + "grad_norm": 0.3198223412036896, + "learning_rate": 0.001, + "loss": 2.4415, + "step": 644 + }, + { + "epoch": 0.027286572468059902, + "grad_norm": 0.38337278366088867, + "learning_rate": 0.001, + "loss": 3.3935, + "step": 645 + }, + { + "epoch": 0.027328877231576277, + "grad_norm": 0.34149906039237976, + "learning_rate": 0.001, + "loss": 2.4166, + "step": 646 + }, + { + "epoch": 0.02737118199509265, + "grad_norm": 0.3562192916870117, + "learning_rate": 0.001, + "loss": 1.8025, + "step": 647 + }, + { + "epoch": 0.02741348675860902, + "grad_norm": 0.42415452003479004, + "learning_rate": 0.001, + "loss": 4.2058, + "step": 648 + }, + { + "epoch": 0.027455791522125392, + "grad_norm": 0.5879824757575989, + "learning_rate": 0.001, + "loss": 1.9211, + "step": 649 + }, + { + "epoch": 0.027498096285641763, + "grad_norm": 0.27752256393432617, + "learning_rate": 0.001, + "loss": 2.2914, + "step": 650 + }, + { + "epoch": 0.027540401049158135, + "grad_norm": 0.399371474981308, + "learning_rate": 0.001, + "loss": 2.4115, + "step": 651 + }, + { + "epoch": 0.027582705812674507, + "grad_norm": 0.3457944691181183, + "learning_rate": 0.001, + "loss": 3.1942, + "step": 652 + }, + { + "epoch": 0.027625010576190878, + "grad_norm": 0.30785703659057617, + "learning_rate": 0.001, + "loss": 2.3755, + "step": 653 + }, + { + "epoch": 0.02766731533970725, + "grad_norm": 2.2121496200561523, + "learning_rate": 0.001, + "loss": 3.7683, + "step": 654 + }, + { + "epoch": 0.02770962010322362, + "grad_norm": 0.5387346148490906, + "learning_rate": 0.001, + "loss": 3.003, + "step": 655 + }, + { + "epoch": 0.027751924866739996, + "grad_norm": 0.3397753834724426, + "learning_rate": 0.001, + "loss": 2.1727, + "step": 656 + }, + { + "epoch": 0.027794229630256368, + "grad_norm": 0.5060691237449646, + "learning_rate": 0.001, + "loss": 1.9395, + "step": 657 + }, + { + "epoch": 0.02783653439377274, + "grad_norm": 0.8157616853713989, + "learning_rate": 0.001, + "loss": 2.0495, + "step": 658 + }, + { + "epoch": 0.02787883915728911, + "grad_norm": 0.2972865402698517, + "learning_rate": 0.001, + "loss": 2.1643, + "step": 659 + }, + { + "epoch": 0.027921143920805483, + "grad_norm": 0.3544447124004364, + "learning_rate": 0.001, + "loss": 1.8471, + "step": 660 + }, + { + "epoch": 0.027963448684321855, + "grad_norm": 0.4713522791862488, + "learning_rate": 0.001, + "loss": 2.3587, + "step": 661 + }, + { + "epoch": 0.028005753447838226, + "grad_norm": 0.6973052024841309, + "learning_rate": 0.001, + "loss": 2.1898, + "step": 662 + }, + { + "epoch": 0.028048058211354598, + "grad_norm": 0.30767449736595154, + "learning_rate": 0.001, + "loss": 1.8533, + "step": 663 + }, + { + "epoch": 0.02809036297487097, + "grad_norm": 0.4573628902435303, + "learning_rate": 0.001, + "loss": 2.3476, + "step": 664 + }, + { + "epoch": 0.02813266773838734, + "grad_norm": 0.3506143391132355, + "learning_rate": 0.001, + "loss": 2.35, + "step": 665 + }, + { + "epoch": 0.028174972501903716, + "grad_norm": 0.3929474353790283, + "learning_rate": 0.001, + "loss": 2.7286, + "step": 666 + }, + { + "epoch": 0.028217277265420088, + "grad_norm": 0.5329930782318115, + "learning_rate": 0.001, + "loss": 2.3529, + "step": 667 + }, + { + "epoch": 0.02825958202893646, + "grad_norm": 0.41064202785491943, + "learning_rate": 0.001, + "loss": 2.9625, + "step": 668 + }, + { + "epoch": 0.02830188679245283, + "grad_norm": 13.079408645629883, + "learning_rate": 0.001, + "loss": 1.974, + "step": 669 + }, + { + "epoch": 0.028344191555969202, + "grad_norm": 0.3811262845993042, + "learning_rate": 0.001, + "loss": 3.0237, + "step": 670 + }, + { + "epoch": 0.028386496319485574, + "grad_norm": 0.48839476704597473, + "learning_rate": 0.001, + "loss": 2.8337, + "step": 671 + }, + { + "epoch": 0.028428801083001946, + "grad_norm": 0.6662212014198303, + "learning_rate": 0.001, + "loss": 2.888, + "step": 672 + }, + { + "epoch": 0.028471105846518317, + "grad_norm": 0.3799011707305908, + "learning_rate": 0.001, + "loss": 3.3166, + "step": 673 + }, + { + "epoch": 0.02851341061003469, + "grad_norm": 0.4989493191242218, + "learning_rate": 0.001, + "loss": 2.6335, + "step": 674 + }, + { + "epoch": 0.02855571537355106, + "grad_norm": 0.3663240075111389, + "learning_rate": 0.001, + "loss": 2.1542, + "step": 675 + }, + { + "epoch": 0.028598020137067432, + "grad_norm": 0.3251192271709442, + "learning_rate": 0.001, + "loss": 2.1317, + "step": 676 + }, + { + "epoch": 0.028640324900583807, + "grad_norm": 0.36757683753967285, + "learning_rate": 0.001, + "loss": 2.2327, + "step": 677 + }, + { + "epoch": 0.02868262966410018, + "grad_norm": 0.34744536876678467, + "learning_rate": 0.001, + "loss": 2.2255, + "step": 678 + }, + { + "epoch": 0.02872493442761655, + "grad_norm": 0.6469220519065857, + "learning_rate": 0.001, + "loss": 2.9052, + "step": 679 + }, + { + "epoch": 0.028767239191132922, + "grad_norm": 0.3472583591938019, + "learning_rate": 0.001, + "loss": 2.1404, + "step": 680 + }, + { + "epoch": 0.028809543954649294, + "grad_norm": 0.3611651062965393, + "learning_rate": 0.001, + "loss": 2.7106, + "step": 681 + }, + { + "epoch": 0.028851848718165665, + "grad_norm": 1.0853683948516846, + "learning_rate": 0.001, + "loss": 2.0889, + "step": 682 + }, + { + "epoch": 0.028894153481682037, + "grad_norm": 0.5070203542709351, + "learning_rate": 0.001, + "loss": 2.1454, + "step": 683 + }, + { + "epoch": 0.02893645824519841, + "grad_norm": 1.9084913730621338, + "learning_rate": 0.001, + "loss": 2.6574, + "step": 684 + }, + { + "epoch": 0.02897876300871478, + "grad_norm": 0.4699901044368744, + "learning_rate": 0.001, + "loss": 2.2962, + "step": 685 + }, + { + "epoch": 0.02902106777223115, + "grad_norm": 0.5868147611618042, + "learning_rate": 0.001, + "loss": 1.9462, + "step": 686 + }, + { + "epoch": 0.029063372535747527, + "grad_norm": 0.4815481901168823, + "learning_rate": 0.001, + "loss": 2.3967, + "step": 687 + }, + { + "epoch": 0.0291056772992639, + "grad_norm": 1.597350001335144, + "learning_rate": 0.001, + "loss": 2.6351, + "step": 688 + }, + { + "epoch": 0.02914798206278027, + "grad_norm": 0.3565700352191925, + "learning_rate": 0.001, + "loss": 2.237, + "step": 689 + }, + { + "epoch": 0.02919028682629664, + "grad_norm": 0.362229585647583, + "learning_rate": 0.001, + "loss": 2.5553, + "step": 690 + }, + { + "epoch": 0.029232591589813013, + "grad_norm": 0.3516606390476227, + "learning_rate": 0.001, + "loss": 2.0371, + "step": 691 + }, + { + "epoch": 0.029274896353329385, + "grad_norm": 0.3809257447719574, + "learning_rate": 0.001, + "loss": 2.0524, + "step": 692 + }, + { + "epoch": 0.029317201116845756, + "grad_norm": 0.3532337248325348, + "learning_rate": 0.001, + "loss": 2.1929, + "step": 693 + }, + { + "epoch": 0.029359505880362128, + "grad_norm": 0.5409374833106995, + "learning_rate": 0.001, + "loss": 2.7353, + "step": 694 + }, + { + "epoch": 0.0294018106438785, + "grad_norm": 0.3616213798522949, + "learning_rate": 0.001, + "loss": 1.8789, + "step": 695 + }, + { + "epoch": 0.02944411540739487, + "grad_norm": 0.31754282116889954, + "learning_rate": 0.001, + "loss": 2.617, + "step": 696 + }, + { + "epoch": 0.029486420170911246, + "grad_norm": 0.4599725604057312, + "learning_rate": 0.001, + "loss": 2.535, + "step": 697 + }, + { + "epoch": 0.029528724934427618, + "grad_norm": 1.1604878902435303, + "learning_rate": 0.001, + "loss": 2.6166, + "step": 698 + }, + { + "epoch": 0.02957102969794399, + "grad_norm": 0.3580074906349182, + "learning_rate": 0.001, + "loss": 2.5489, + "step": 699 + }, + { + "epoch": 0.02961333446146036, + "grad_norm": 0.3962548077106476, + "learning_rate": 0.001, + "loss": 1.9984, + "step": 700 + }, + { + "epoch": 0.029655639224976733, + "grad_norm": 0.342292845249176, + "learning_rate": 0.001, + "loss": 2.9691, + "step": 701 + }, + { + "epoch": 0.029697943988493104, + "grad_norm": 0.4738942086696625, + "learning_rate": 0.001, + "loss": 2.2661, + "step": 702 + }, + { + "epoch": 0.029740248752009476, + "grad_norm": 0.4526740610599518, + "learning_rate": 0.001, + "loss": 2.084, + "step": 703 + }, + { + "epoch": 0.029782553515525848, + "grad_norm": 0.37444889545440674, + "learning_rate": 0.001, + "loss": 2.6041, + "step": 704 + }, + { + "epoch": 0.02982485827904222, + "grad_norm": 0.3665851950645447, + "learning_rate": 0.001, + "loss": 2.6698, + "step": 705 + }, + { + "epoch": 0.02986716304255859, + "grad_norm": 0.3661838471889496, + "learning_rate": 0.001, + "loss": 2.8261, + "step": 706 + }, + { + "epoch": 0.029909467806074962, + "grad_norm": 9.27115249633789, + "learning_rate": 0.001, + "loss": 2.6095, + "step": 707 + }, + { + "epoch": 0.029951772569591337, + "grad_norm": 0.7346689701080322, + "learning_rate": 0.001, + "loss": 2.503, + "step": 708 + }, + { + "epoch": 0.02999407733310771, + "grad_norm": 45.74531555175781, + "learning_rate": 0.001, + "loss": 2.5024, + "step": 709 + }, + { + "epoch": 0.03003638209662408, + "grad_norm": 0.5024700164794922, + "learning_rate": 0.001, + "loss": 2.0991, + "step": 710 + }, + { + "epoch": 0.030078686860140452, + "grad_norm": 0.44415217638015747, + "learning_rate": 0.001, + "loss": 3.5278, + "step": 711 + }, + { + "epoch": 0.030120991623656824, + "grad_norm": 0.3363064229488373, + "learning_rate": 0.001, + "loss": 2.9116, + "step": 712 + }, + { + "epoch": 0.030163296387173195, + "grad_norm": 0.4409061670303345, + "learning_rate": 0.001, + "loss": 2.5084, + "step": 713 + }, + { + "epoch": 0.030205601150689567, + "grad_norm": 0.9301056265830994, + "learning_rate": 0.001, + "loss": 3.2272, + "step": 714 + }, + { + "epoch": 0.03024790591420594, + "grad_norm": 0.3383950889110565, + "learning_rate": 0.001, + "loss": 2.6067, + "step": 715 + }, + { + "epoch": 0.03029021067772231, + "grad_norm": 2.455920934677124, + "learning_rate": 0.001, + "loss": 3.4956, + "step": 716 + }, + { + "epoch": 0.030332515441238682, + "grad_norm": 0.31969428062438965, + "learning_rate": 0.001, + "loss": 2.7223, + "step": 717 + }, + { + "epoch": 0.030374820204755057, + "grad_norm": 0.4326886534690857, + "learning_rate": 0.001, + "loss": 2.7745, + "step": 718 + }, + { + "epoch": 0.03041712496827143, + "grad_norm": 0.2943534553050995, + "learning_rate": 0.001, + "loss": 1.881, + "step": 719 + }, + { + "epoch": 0.0304594297317878, + "grad_norm": 0.31899815797805786, + "learning_rate": 0.001, + "loss": 1.6605, + "step": 720 + }, + { + "epoch": 0.030501734495304172, + "grad_norm": 1.0850050449371338, + "learning_rate": 0.001, + "loss": 2.5878, + "step": 721 + }, + { + "epoch": 0.030544039258820543, + "grad_norm": 0.4966890215873718, + "learning_rate": 0.001, + "loss": 3.5874, + "step": 722 + }, + { + "epoch": 0.030586344022336915, + "grad_norm": 0.44982099533081055, + "learning_rate": 0.001, + "loss": 2.1516, + "step": 723 + }, + { + "epoch": 0.030628648785853287, + "grad_norm": 0.5142163038253784, + "learning_rate": 0.001, + "loss": 2.5352, + "step": 724 + }, + { + "epoch": 0.030670953549369658, + "grad_norm": 0.4751744270324707, + "learning_rate": 0.001, + "loss": 2.1874, + "step": 725 + }, + { + "epoch": 0.03071325831288603, + "grad_norm": 0.5114376544952393, + "learning_rate": 0.001, + "loss": 2.2583, + "step": 726 + }, + { + "epoch": 0.0307555630764024, + "grad_norm": 0.4771519601345062, + "learning_rate": 0.001, + "loss": 2.4049, + "step": 727 + }, + { + "epoch": 0.030797867839918776, + "grad_norm": 0.3221670389175415, + "learning_rate": 0.001, + "loss": 1.871, + "step": 728 + }, + { + "epoch": 0.030840172603435148, + "grad_norm": 0.6503878235816956, + "learning_rate": 0.001, + "loss": 2.5809, + "step": 729 + }, + { + "epoch": 0.03088247736695152, + "grad_norm": 0.6677407622337341, + "learning_rate": 0.001, + "loss": 3.2025, + "step": 730 + }, + { + "epoch": 0.03092478213046789, + "grad_norm": 0.46367889642715454, + "learning_rate": 0.001, + "loss": 2.6524, + "step": 731 + }, + { + "epoch": 0.030967086893984263, + "grad_norm": 0.5044416189193726, + "learning_rate": 0.001, + "loss": 2.579, + "step": 732 + }, + { + "epoch": 0.031009391657500635, + "grad_norm": 1.4938032627105713, + "learning_rate": 0.001, + "loss": 2.0139, + "step": 733 + }, + { + "epoch": 0.031051696421017006, + "grad_norm": 0.6672400236129761, + "learning_rate": 0.001, + "loss": 2.7173, + "step": 734 + }, + { + "epoch": 0.031094001184533378, + "grad_norm": 1.8434799909591675, + "learning_rate": 0.001, + "loss": 2.4786, + "step": 735 + }, + { + "epoch": 0.03113630594804975, + "grad_norm": 0.4245956540107727, + "learning_rate": 0.001, + "loss": 2.2883, + "step": 736 + }, + { + "epoch": 0.03117861071156612, + "grad_norm": 0.6225207448005676, + "learning_rate": 0.001, + "loss": 2.9485, + "step": 737 + }, + { + "epoch": 0.031220915475082493, + "grad_norm": 1.0561782121658325, + "learning_rate": 0.001, + "loss": 2.7129, + "step": 738 + }, + { + "epoch": 0.031263220238598864, + "grad_norm": 0.46243247389793396, + "learning_rate": 0.001, + "loss": 2.3125, + "step": 739 + }, + { + "epoch": 0.03130552500211524, + "grad_norm": 0.8395326137542725, + "learning_rate": 0.001, + "loss": 2.0963, + "step": 740 + }, + { + "epoch": 0.03134782976563161, + "grad_norm": 0.49101972579956055, + "learning_rate": 0.001, + "loss": 2.9097, + "step": 741 + }, + { + "epoch": 0.03139013452914798, + "grad_norm": 1.3258086442947388, + "learning_rate": 0.001, + "loss": 2.8042, + "step": 742 + }, + { + "epoch": 0.03143243929266435, + "grad_norm": 3.282763719558716, + "learning_rate": 0.001, + "loss": 2.2387, + "step": 743 + }, + { + "epoch": 0.031474744056180726, + "grad_norm": 1.5577962398529053, + "learning_rate": 0.001, + "loss": 3.241, + "step": 744 + }, + { + "epoch": 0.0315170488196971, + "grad_norm": 0.40975457429885864, + "learning_rate": 0.001, + "loss": 2.8257, + "step": 745 + }, + { + "epoch": 0.03155935358321347, + "grad_norm": 0.9148398041725159, + "learning_rate": 0.001, + "loss": 3.0109, + "step": 746 + }, + { + "epoch": 0.031601658346729844, + "grad_norm": 0.45257365703582764, + "learning_rate": 0.001, + "loss": 2.0356, + "step": 747 + }, + { + "epoch": 0.03164396311024621, + "grad_norm": 1.465282917022705, + "learning_rate": 0.001, + "loss": 2.6872, + "step": 748 + }, + { + "epoch": 0.03168626787376259, + "grad_norm": 0.41279497742652893, + "learning_rate": 0.001, + "loss": 2.452, + "step": 749 + }, + { + "epoch": 0.031728572637278955, + "grad_norm": 0.48798638582229614, + "learning_rate": 0.001, + "loss": 2.6751, + "step": 750 + }, + { + "epoch": 0.03177087740079533, + "grad_norm": 7.742496967315674, + "learning_rate": 0.001, + "loss": 1.8392, + "step": 751 + }, + { + "epoch": 0.0318131821643117, + "grad_norm": 0.8577462434768677, + "learning_rate": 0.001, + "loss": 2.1078, + "step": 752 + }, + { + "epoch": 0.031855486927828074, + "grad_norm": 0.454903781414032, + "learning_rate": 0.001, + "loss": 2.7662, + "step": 753 + }, + { + "epoch": 0.03189779169134445, + "grad_norm": 0.7064248919487, + "learning_rate": 0.001, + "loss": 2.8522, + "step": 754 + }, + { + "epoch": 0.03194009645486082, + "grad_norm": 2.0568008422851562, + "learning_rate": 0.001, + "loss": 2.7169, + "step": 755 + }, + { + "epoch": 0.03198240121837719, + "grad_norm": 0.5723927617073059, + "learning_rate": 0.001, + "loss": 2.5269, + "step": 756 + }, + { + "epoch": 0.03202470598189356, + "grad_norm": 0.37855952978134155, + "learning_rate": 0.001, + "loss": 1.8934, + "step": 757 + }, + { + "epoch": 0.032067010745409935, + "grad_norm": 0.6945350170135498, + "learning_rate": 0.001, + "loss": 2.7209, + "step": 758 + }, + { + "epoch": 0.0321093155089263, + "grad_norm": 0.45397865772247314, + "learning_rate": 0.001, + "loss": 2.3489, + "step": 759 + }, + { + "epoch": 0.03215162027244268, + "grad_norm": 0.36017417907714844, + "learning_rate": 0.001, + "loss": 1.7156, + "step": 760 + }, + { + "epoch": 0.032193925035959046, + "grad_norm": 0.5965908765792847, + "learning_rate": 0.001, + "loss": 2.2643, + "step": 761 + }, + { + "epoch": 0.03223622979947542, + "grad_norm": 0.4590226709842682, + "learning_rate": 0.001, + "loss": 2.5763, + "step": 762 + }, + { + "epoch": 0.03227853456299179, + "grad_norm": 1.0792022943496704, + "learning_rate": 0.001, + "loss": 2.0132, + "step": 763 + }, + { + "epoch": 0.032320839326508165, + "grad_norm": 0.4761914908885956, + "learning_rate": 0.001, + "loss": 1.9762, + "step": 764 + }, + { + "epoch": 0.03236314409002454, + "grad_norm": 0.5108850598335266, + "learning_rate": 0.001, + "loss": 2.6186, + "step": 765 + }, + { + "epoch": 0.03240544885354091, + "grad_norm": 0.47809892892837524, + "learning_rate": 0.001, + "loss": 2.1778, + "step": 766 + }, + { + "epoch": 0.03244775361705728, + "grad_norm": 1.9831953048706055, + "learning_rate": 0.001, + "loss": 3.3741, + "step": 767 + }, + { + "epoch": 0.03249005838057365, + "grad_norm": 1.5907195806503296, + "learning_rate": 0.001, + "loss": 2.9347, + "step": 768 + }, + { + "epoch": 0.032532363144090026, + "grad_norm": 0.43444883823394775, + "learning_rate": 0.001, + "loss": 1.9747, + "step": 769 + }, + { + "epoch": 0.032574667907606394, + "grad_norm": 0.4542893171310425, + "learning_rate": 0.001, + "loss": 2.4121, + "step": 770 + }, + { + "epoch": 0.03261697267112277, + "grad_norm": 0.41902869939804077, + "learning_rate": 0.001, + "loss": 2.1587, + "step": 771 + }, + { + "epoch": 0.03265927743463914, + "grad_norm": 5.790309906005859, + "learning_rate": 0.001, + "loss": 2.8927, + "step": 772 + }, + { + "epoch": 0.03270158219815551, + "grad_norm": 0.6397790908813477, + "learning_rate": 0.001, + "loss": 2.8784, + "step": 773 + }, + { + "epoch": 0.03274388696167188, + "grad_norm": 1.084983468055725, + "learning_rate": 0.001, + "loss": 3.2401, + "step": 774 + }, + { + "epoch": 0.032786191725188256, + "grad_norm": 0.4556136429309845, + "learning_rate": 0.001, + "loss": 2.6491, + "step": 775 + }, + { + "epoch": 0.03282849648870463, + "grad_norm": 1.9562443494796753, + "learning_rate": 0.001, + "loss": 3.3097, + "step": 776 + }, + { + "epoch": 0.032870801252221, + "grad_norm": 0.3327184319496155, + "learning_rate": 0.001, + "loss": 2.3438, + "step": 777 + }, + { + "epoch": 0.032913106015737374, + "grad_norm": 0.29970988631248474, + "learning_rate": 0.001, + "loss": 1.7802, + "step": 778 + }, + { + "epoch": 0.03295541077925374, + "grad_norm": 3.424224615097046, + "learning_rate": 0.001, + "loss": 3.0777, + "step": 779 + }, + { + "epoch": 0.03299771554277012, + "grad_norm": 0.7678899168968201, + "learning_rate": 0.001, + "loss": 3.1746, + "step": 780 + }, + { + "epoch": 0.033040020306286486, + "grad_norm": 11.198646545410156, + "learning_rate": 0.001, + "loss": 4.0651, + "step": 781 + }, + { + "epoch": 0.03308232506980286, + "grad_norm": 0.5121546387672424, + "learning_rate": 0.001, + "loss": 2.7573, + "step": 782 + }, + { + "epoch": 0.03312462983331923, + "grad_norm": 0.449379563331604, + "learning_rate": 0.001, + "loss": 2.4772, + "step": 783 + }, + { + "epoch": 0.033166934596835604, + "grad_norm": 1.0964363813400269, + "learning_rate": 0.001, + "loss": 2.0905, + "step": 784 + }, + { + "epoch": 0.03320923936035198, + "grad_norm": 60.26259994506836, + "learning_rate": 0.001, + "loss": 2.596, + "step": 785 + }, + { + "epoch": 0.03325154412386835, + "grad_norm": 0.7746313810348511, + "learning_rate": 0.001, + "loss": 3.0838, + "step": 786 + }, + { + "epoch": 0.03329384888738472, + "grad_norm": 0.37214627861976624, + "learning_rate": 0.001, + "loss": 2.0005, + "step": 787 + }, + { + "epoch": 0.03333615365090109, + "grad_norm": 0.42488759756088257, + "learning_rate": 0.001, + "loss": 2.6669, + "step": 788 + }, + { + "epoch": 0.033378458414417465, + "grad_norm": 0.41743898391723633, + "learning_rate": 0.001, + "loss": 2.7365, + "step": 789 + }, + { + "epoch": 0.033420763177933833, + "grad_norm": 0.3555799424648285, + "learning_rate": 0.001, + "loss": 2.8075, + "step": 790 + }, + { + "epoch": 0.03346306794145021, + "grad_norm": 0.4383140504360199, + "learning_rate": 0.001, + "loss": 2.7863, + "step": 791 + }, + { + "epoch": 0.03350537270496658, + "grad_norm": 0.38277480006217957, + "learning_rate": 0.001, + "loss": 1.9109, + "step": 792 + }, + { + "epoch": 0.03354767746848295, + "grad_norm": 0.3921913802623749, + "learning_rate": 0.001, + "loss": 2.5242, + "step": 793 + }, + { + "epoch": 0.03358998223199932, + "grad_norm": 0.6960230469703674, + "learning_rate": 0.001, + "loss": 2.7836, + "step": 794 + }, + { + "epoch": 0.033632286995515695, + "grad_norm": 0.38849663734436035, + "learning_rate": 0.001, + "loss": 2.4461, + "step": 795 + }, + { + "epoch": 0.03367459175903207, + "grad_norm": 0.37054547667503357, + "learning_rate": 0.001, + "loss": 2.7659, + "step": 796 + }, + { + "epoch": 0.03371689652254844, + "grad_norm": 0.3550194501876831, + "learning_rate": 0.001, + "loss": 2.3983, + "step": 797 + }, + { + "epoch": 0.03375920128606481, + "grad_norm": 0.48828840255737305, + "learning_rate": 0.001, + "loss": 2.2206, + "step": 798 + }, + { + "epoch": 0.03380150604958118, + "grad_norm": 1.516994833946228, + "learning_rate": 0.001, + "loss": 1.6824, + "step": 799 + }, + { + "epoch": 0.033843810813097557, + "grad_norm": 0.5133375525474548, + "learning_rate": 0.001, + "loss": 2.385, + "step": 800 + }, + { + "epoch": 0.033886115576613925, + "grad_norm": 1.752416729927063, + "learning_rate": 0.001, + "loss": 2.2985, + "step": 801 + }, + { + "epoch": 0.0339284203401303, + "grad_norm": 0.3678469657897949, + "learning_rate": 0.001, + "loss": 2.3368, + "step": 802 + }, + { + "epoch": 0.03397072510364667, + "grad_norm": 0.5483198165893555, + "learning_rate": 0.001, + "loss": 2.8708, + "step": 803 + }, + { + "epoch": 0.03401302986716304, + "grad_norm": 0.7874423265457153, + "learning_rate": 0.001, + "loss": 2.8457, + "step": 804 + }, + { + "epoch": 0.03405533463067941, + "grad_norm": 1.40812087059021, + "learning_rate": 0.001, + "loss": 2.5861, + "step": 805 + }, + { + "epoch": 0.034097639394195786, + "grad_norm": 1.082838535308838, + "learning_rate": 0.001, + "loss": 1.8972, + "step": 806 + }, + { + "epoch": 0.03413994415771216, + "grad_norm": 1.259365200996399, + "learning_rate": 0.001, + "loss": 3.0101, + "step": 807 + }, + { + "epoch": 0.03418224892122853, + "grad_norm": 2.4589128494262695, + "learning_rate": 0.001, + "loss": 2.2503, + "step": 808 + }, + { + "epoch": 0.034224553684744904, + "grad_norm": 0.39903524518013, + "learning_rate": 0.001, + "loss": 3.306, + "step": 809 + }, + { + "epoch": 0.03426685844826127, + "grad_norm": 1.4496986865997314, + "learning_rate": 0.001, + "loss": 2.5615, + "step": 810 + }, + { + "epoch": 0.03430916321177765, + "grad_norm": 1.3750876188278198, + "learning_rate": 0.001, + "loss": 3.1478, + "step": 811 + }, + { + "epoch": 0.034351467975294016, + "grad_norm": 0.6277172565460205, + "learning_rate": 0.001, + "loss": 3.0823, + "step": 812 + }, + { + "epoch": 0.03439377273881039, + "grad_norm": 1.379515290260315, + "learning_rate": 0.001, + "loss": 2.5736, + "step": 813 + }, + { + "epoch": 0.03443607750232676, + "grad_norm": 0.5735925436019897, + "learning_rate": 0.001, + "loss": 2.4369, + "step": 814 + }, + { + "epoch": 0.034478382265843134, + "grad_norm": 0.3423369228839874, + "learning_rate": 0.001, + "loss": 2.274, + "step": 815 + }, + { + "epoch": 0.03452068702935951, + "grad_norm": 1.0366361141204834, + "learning_rate": 0.001, + "loss": 2.4531, + "step": 816 + }, + { + "epoch": 0.03456299179287588, + "grad_norm": 0.40095871686935425, + "learning_rate": 0.001, + "loss": 1.8635, + "step": 817 + }, + { + "epoch": 0.03460529655639225, + "grad_norm": 0.44527915120124817, + "learning_rate": 0.001, + "loss": 2.0943, + "step": 818 + }, + { + "epoch": 0.03464760131990862, + "grad_norm": 0.6049659848213196, + "learning_rate": 0.001, + "loss": 1.9373, + "step": 819 + }, + { + "epoch": 0.034689906083424996, + "grad_norm": 0.3163706064224243, + "learning_rate": 0.001, + "loss": 2.163, + "step": 820 + }, + { + "epoch": 0.034732210846941364, + "grad_norm": 0.7788311243057251, + "learning_rate": 0.001, + "loss": 2.3464, + "step": 821 + }, + { + "epoch": 0.03477451561045774, + "grad_norm": 0.46176034212112427, + "learning_rate": 0.001, + "loss": 3.2508, + "step": 822 + }, + { + "epoch": 0.03481682037397411, + "grad_norm": 0.3383532464504242, + "learning_rate": 0.001, + "loss": 1.9256, + "step": 823 + }, + { + "epoch": 0.03485912513749048, + "grad_norm": 0.4552175998687744, + "learning_rate": 0.001, + "loss": 2.0759, + "step": 824 + }, + { + "epoch": 0.03490142990100685, + "grad_norm": 1.0574527978897095, + "learning_rate": 0.001, + "loss": 2.2718, + "step": 825 + }, + { + "epoch": 0.034943734664523225, + "grad_norm": 0.3664693236351013, + "learning_rate": 0.001, + "loss": 2.5472, + "step": 826 + }, + { + "epoch": 0.0349860394280396, + "grad_norm": 1.357131838798523, + "learning_rate": 0.001, + "loss": 2.9158, + "step": 827 + }, + { + "epoch": 0.03502834419155597, + "grad_norm": 0.5285866260528564, + "learning_rate": 0.001, + "loss": 2.1292, + "step": 828 + }, + { + "epoch": 0.035070648955072344, + "grad_norm": 0.3371189534664154, + "learning_rate": 0.001, + "loss": 2.5495, + "step": 829 + }, + { + "epoch": 0.03511295371858871, + "grad_norm": 0.33508920669555664, + "learning_rate": 0.001, + "loss": 1.6964, + "step": 830 + }, + { + "epoch": 0.03515525848210509, + "grad_norm": 4.2542643547058105, + "learning_rate": 0.001, + "loss": 2.4585, + "step": 831 + }, + { + "epoch": 0.035197563245621455, + "grad_norm": 0.29248476028442383, + "learning_rate": 0.001, + "loss": 2.0451, + "step": 832 + }, + { + "epoch": 0.03523986800913783, + "grad_norm": 0.2993159592151642, + "learning_rate": 0.001, + "loss": 2.3614, + "step": 833 + }, + { + "epoch": 0.0352821727726542, + "grad_norm": 0.34885692596435547, + "learning_rate": 0.001, + "loss": 2.5053, + "step": 834 + }, + { + "epoch": 0.03532447753617057, + "grad_norm": 0.3291875422000885, + "learning_rate": 0.001, + "loss": 2.3983, + "step": 835 + }, + { + "epoch": 0.03536678229968694, + "grad_norm": 0.34914126992225647, + "learning_rate": 0.001, + "loss": 2.0887, + "step": 836 + }, + { + "epoch": 0.035409087063203316, + "grad_norm": 0.38519594073295593, + "learning_rate": 0.001, + "loss": 2.7274, + "step": 837 + }, + { + "epoch": 0.03545139182671969, + "grad_norm": 5.37352991104126, + "learning_rate": 0.001, + "loss": 2.5118, + "step": 838 + }, + { + "epoch": 0.03549369659023606, + "grad_norm": 0.6151306629180908, + "learning_rate": 0.001, + "loss": 2.654, + "step": 839 + }, + { + "epoch": 0.035536001353752435, + "grad_norm": 0.504226803779602, + "learning_rate": 0.001, + "loss": 2.674, + "step": 840 + }, + { + "epoch": 0.0355783061172688, + "grad_norm": 0.3421787619590759, + "learning_rate": 0.001, + "loss": 2.526, + "step": 841 + }, + { + "epoch": 0.03562061088078518, + "grad_norm": 0.3042963445186615, + "learning_rate": 0.001, + "loss": 3.2472, + "step": 842 + }, + { + "epoch": 0.035662915644301546, + "grad_norm": 0.3438011705875397, + "learning_rate": 0.001, + "loss": 2.2514, + "step": 843 + }, + { + "epoch": 0.03570522040781792, + "grad_norm": 0.3311542570590973, + "learning_rate": 0.001, + "loss": 2.3859, + "step": 844 + }, + { + "epoch": 0.03574752517133429, + "grad_norm": 0.5320329666137695, + "learning_rate": 0.001, + "loss": 1.6051, + "step": 845 + }, + { + "epoch": 0.035789829934850664, + "grad_norm": 0.3768901526927948, + "learning_rate": 0.001, + "loss": 2.8369, + "step": 846 + }, + { + "epoch": 0.03583213469836704, + "grad_norm": 1.0556464195251465, + "learning_rate": 0.001, + "loss": 2.1928, + "step": 847 + }, + { + "epoch": 0.03587443946188341, + "grad_norm": 0.40488889813423157, + "learning_rate": 0.001, + "loss": 1.6987, + "step": 848 + }, + { + "epoch": 0.03591674422539978, + "grad_norm": 2.706275224685669, + "learning_rate": 0.001, + "loss": 2.6072, + "step": 849 + }, + { + "epoch": 0.03595904898891615, + "grad_norm": 0.8498132824897766, + "learning_rate": 0.001, + "loss": 3.2416, + "step": 850 + }, + { + "epoch": 0.036001353752432526, + "grad_norm": 0.38959065079689026, + "learning_rate": 0.001, + "loss": 2.3458, + "step": 851 + }, + { + "epoch": 0.036043658515948894, + "grad_norm": 0.41819003224372864, + "learning_rate": 0.001, + "loss": 2.1264, + "step": 852 + }, + { + "epoch": 0.03608596327946527, + "grad_norm": 0.48070141673088074, + "learning_rate": 0.001, + "loss": 2.6504, + "step": 853 + }, + { + "epoch": 0.03612826804298164, + "grad_norm": 0.5976713299751282, + "learning_rate": 0.001, + "loss": 3.0266, + "step": 854 + }, + { + "epoch": 0.03617057280649801, + "grad_norm": 0.3724801242351532, + "learning_rate": 0.001, + "loss": 3.0966, + "step": 855 + }, + { + "epoch": 0.03621287757001438, + "grad_norm": 0.35969069600105286, + "learning_rate": 0.001, + "loss": 3.1912, + "step": 856 + }, + { + "epoch": 0.036255182333530755, + "grad_norm": 4.334183692932129, + "learning_rate": 0.001, + "loss": 3.7079, + "step": 857 + }, + { + "epoch": 0.03629748709704713, + "grad_norm": 1.282253384590149, + "learning_rate": 0.001, + "loss": 2.2486, + "step": 858 + }, + { + "epoch": 0.0363397918605635, + "grad_norm": 0.44701820611953735, + "learning_rate": 0.001, + "loss": 2.8982, + "step": 859 + }, + { + "epoch": 0.036382096624079874, + "grad_norm": 0.37352272868156433, + "learning_rate": 0.001, + "loss": 3.1632, + "step": 860 + }, + { + "epoch": 0.03642440138759624, + "grad_norm": 0.33013972640037537, + "learning_rate": 0.001, + "loss": 2.0044, + "step": 861 + }, + { + "epoch": 0.03646670615111262, + "grad_norm": 0.37367743253707886, + "learning_rate": 0.001, + "loss": 3.3361, + "step": 862 + }, + { + "epoch": 0.036509010914628985, + "grad_norm": 0.5262033343315125, + "learning_rate": 0.001, + "loss": 4.0203, + "step": 863 + }, + { + "epoch": 0.03655131567814536, + "grad_norm": 0.8196743726730347, + "learning_rate": 0.001, + "loss": 1.9911, + "step": 864 + }, + { + "epoch": 0.03659362044166173, + "grad_norm": 1.1237256526947021, + "learning_rate": 0.001, + "loss": 3.1662, + "step": 865 + }, + { + "epoch": 0.0366359252051781, + "grad_norm": 2.1621487140655518, + "learning_rate": 0.001, + "loss": 1.6604, + "step": 866 + }, + { + "epoch": 0.03667822996869447, + "grad_norm": 0.5566956400871277, + "learning_rate": 0.001, + "loss": 2.1015, + "step": 867 + }, + { + "epoch": 0.03672053473221085, + "grad_norm": 0.35224634408950806, + "learning_rate": 0.001, + "loss": 2.6242, + "step": 868 + }, + { + "epoch": 0.03676283949572722, + "grad_norm": 0.33982589840888977, + "learning_rate": 0.001, + "loss": 2.6346, + "step": 869 + }, + { + "epoch": 0.03680514425924359, + "grad_norm": 0.9707451462745667, + "learning_rate": 0.001, + "loss": 2.7817, + "step": 870 + }, + { + "epoch": 0.036847449022759965, + "grad_norm": 0.32144203782081604, + "learning_rate": 0.001, + "loss": 1.9405, + "step": 871 + }, + { + "epoch": 0.03688975378627633, + "grad_norm": 0.31683671474456787, + "learning_rate": 0.001, + "loss": 2.9229, + "step": 872 + }, + { + "epoch": 0.03693205854979271, + "grad_norm": 0.36165180802345276, + "learning_rate": 0.001, + "loss": 3.1064, + "step": 873 + }, + { + "epoch": 0.036974363313309076, + "grad_norm": 0.311084508895874, + "learning_rate": 0.001, + "loss": 2.1846, + "step": 874 + }, + { + "epoch": 0.03701666807682545, + "grad_norm": 0.3572933077812195, + "learning_rate": 0.001, + "loss": 2.8808, + "step": 875 + }, + { + "epoch": 0.03705897284034182, + "grad_norm": 1.0521661043167114, + "learning_rate": 0.001, + "loss": 2.187, + "step": 876 + }, + { + "epoch": 0.037101277603858195, + "grad_norm": 0.41594183444976807, + "learning_rate": 0.001, + "loss": 2.4503, + "step": 877 + }, + { + "epoch": 0.03714358236737457, + "grad_norm": 1.126168131828308, + "learning_rate": 0.001, + "loss": 1.9875, + "step": 878 + }, + { + "epoch": 0.03718588713089094, + "grad_norm": 0.3451598882675171, + "learning_rate": 0.001, + "loss": 2.6444, + "step": 879 + }, + { + "epoch": 0.03722819189440731, + "grad_norm": 0.7697860598564148, + "learning_rate": 0.001, + "loss": 2.1816, + "step": 880 + }, + { + "epoch": 0.03727049665792368, + "grad_norm": 0.7067566514015198, + "learning_rate": 0.001, + "loss": 2.5282, + "step": 881 + }, + { + "epoch": 0.037312801421440056, + "grad_norm": 0.4576680064201355, + "learning_rate": 0.001, + "loss": 2.2394, + "step": 882 + }, + { + "epoch": 0.037355106184956424, + "grad_norm": 0.47805696725845337, + "learning_rate": 0.001, + "loss": 2.2489, + "step": 883 + }, + { + "epoch": 0.0373974109484728, + "grad_norm": 18.085206985473633, + "learning_rate": 0.001, + "loss": 3.4886, + "step": 884 + }, + { + "epoch": 0.03743971571198917, + "grad_norm": 0.35168933868408203, + "learning_rate": 0.001, + "loss": 2.0473, + "step": 885 + }, + { + "epoch": 0.03748202047550554, + "grad_norm": 0.4354495108127594, + "learning_rate": 0.001, + "loss": 2.42, + "step": 886 + }, + { + "epoch": 0.03752432523902191, + "grad_norm": 0.4347662329673767, + "learning_rate": 0.001, + "loss": 2.7022, + "step": 887 + }, + { + "epoch": 0.037566630002538286, + "grad_norm": 2.830768346786499, + "learning_rate": 0.001, + "loss": 2.2454, + "step": 888 + }, + { + "epoch": 0.03760893476605466, + "grad_norm": 0.7625808119773865, + "learning_rate": 0.001, + "loss": 2.8905, + "step": 889 + }, + { + "epoch": 0.03765123952957103, + "grad_norm": 0.4779190719127655, + "learning_rate": 0.001, + "loss": 2.4054, + "step": 890 + }, + { + "epoch": 0.037693544293087404, + "grad_norm": 0.3955821692943573, + "learning_rate": 0.001, + "loss": 2.4408, + "step": 891 + }, + { + "epoch": 0.03773584905660377, + "grad_norm": 1.995564579963684, + "learning_rate": 0.001, + "loss": 1.9623, + "step": 892 + }, + { + "epoch": 0.03777815382012015, + "grad_norm": 1.702072024345398, + "learning_rate": 0.001, + "loss": 2.2569, + "step": 893 + }, + { + "epoch": 0.037820458583636515, + "grad_norm": 0.8360947370529175, + "learning_rate": 0.001, + "loss": 2.1785, + "step": 894 + }, + { + "epoch": 0.03786276334715289, + "grad_norm": 0.4631440043449402, + "learning_rate": 0.001, + "loss": 3.6477, + "step": 895 + }, + { + "epoch": 0.03790506811066926, + "grad_norm": 0.4883536100387573, + "learning_rate": 0.001, + "loss": 2.1303, + "step": 896 + }, + { + "epoch": 0.037947372874185634, + "grad_norm": 1.4146642684936523, + "learning_rate": 0.001, + "loss": 2.8125, + "step": 897 + }, + { + "epoch": 0.037989677637702, + "grad_norm": 33.48441696166992, + "learning_rate": 0.001, + "loss": 2.3088, + "step": 898 + }, + { + "epoch": 0.03803198240121838, + "grad_norm": 1.0009665489196777, + "learning_rate": 0.001, + "loss": 3.2976, + "step": 899 + }, + { + "epoch": 0.03807428716473475, + "grad_norm": 0.6075432896614075, + "learning_rate": 0.001, + "loss": 3.0491, + "step": 900 + }, + { + "epoch": 0.03811659192825112, + "grad_norm": 0.6014549136161804, + "learning_rate": 0.001, + "loss": 2.6115, + "step": 901 + }, + { + "epoch": 0.038158896691767495, + "grad_norm": 4.742966175079346, + "learning_rate": 0.001, + "loss": 3.1309, + "step": 902 + }, + { + "epoch": 0.03820120145528386, + "grad_norm": 1.2808791399002075, + "learning_rate": 0.001, + "loss": 2.7616, + "step": 903 + }, + { + "epoch": 0.03824350621880024, + "grad_norm": 0.6155859231948853, + "learning_rate": 0.001, + "loss": 3.0479, + "step": 904 + }, + { + "epoch": 0.038285810982316607, + "grad_norm": 0.4951868951320648, + "learning_rate": 0.001, + "loss": 2.4222, + "step": 905 + }, + { + "epoch": 0.03832811574583298, + "grad_norm": 3.6468491554260254, + "learning_rate": 0.001, + "loss": 2.7734, + "step": 906 + }, + { + "epoch": 0.03837042050934935, + "grad_norm": 0.6409814953804016, + "learning_rate": 0.001, + "loss": 2.7391, + "step": 907 + }, + { + "epoch": 0.038412725272865725, + "grad_norm": 1.037394642829895, + "learning_rate": 0.001, + "loss": 3.3225, + "step": 908 + }, + { + "epoch": 0.0384550300363821, + "grad_norm": 0.7472317218780518, + "learning_rate": 0.001, + "loss": 4.0019, + "step": 909 + }, + { + "epoch": 0.03849733479989847, + "grad_norm": 0.5227680802345276, + "learning_rate": 0.001, + "loss": 2.0158, + "step": 910 + }, + { + "epoch": 0.03853963956341484, + "grad_norm": 0.5070827603340149, + "learning_rate": 0.001, + "loss": 2.1145, + "step": 911 + }, + { + "epoch": 0.03858194432693121, + "grad_norm": 1.9357705116271973, + "learning_rate": 0.001, + "loss": 2.4728, + "step": 912 + }, + { + "epoch": 0.038624249090447586, + "grad_norm": 3.635646104812622, + "learning_rate": 0.001, + "loss": 2.7857, + "step": 913 + }, + { + "epoch": 0.038666553853963954, + "grad_norm": 0.9932013154029846, + "learning_rate": 0.001, + "loss": 2.1222, + "step": 914 + }, + { + "epoch": 0.03870885861748033, + "grad_norm": 1.2177910804748535, + "learning_rate": 0.001, + "loss": 2.5434, + "step": 915 + }, + { + "epoch": 0.0387511633809967, + "grad_norm": 0.6072919964790344, + "learning_rate": 0.001, + "loss": 2.2818, + "step": 916 + }, + { + "epoch": 0.03879346814451307, + "grad_norm": 1.0168566703796387, + "learning_rate": 0.001, + "loss": 3.5772, + "step": 917 + }, + { + "epoch": 0.03883577290802944, + "grad_norm": 12.94654655456543, + "learning_rate": 0.001, + "loss": 3.1531, + "step": 918 + }, + { + "epoch": 0.038878077671545816, + "grad_norm": 0.9608505964279175, + "learning_rate": 0.001, + "loss": 4.1271, + "step": 919 + }, + { + "epoch": 0.03892038243506219, + "grad_norm": 0.9271876215934753, + "learning_rate": 0.001, + "loss": 3.0042, + "step": 920 + }, + { + "epoch": 0.03896268719857856, + "grad_norm": 0.7582124471664429, + "learning_rate": 0.001, + "loss": 2.4802, + "step": 921 + }, + { + "epoch": 0.039004991962094934, + "grad_norm": 3.8157336711883545, + "learning_rate": 0.001, + "loss": 4.0794, + "step": 922 + }, + { + "epoch": 0.0390472967256113, + "grad_norm": 0.4885561764240265, + "learning_rate": 0.001, + "loss": 3.57, + "step": 923 + }, + { + "epoch": 0.03908960148912768, + "grad_norm": 0.5436772704124451, + "learning_rate": 0.001, + "loss": 2.4098, + "step": 924 + }, + { + "epoch": 0.039131906252644046, + "grad_norm": 0.5220960378646851, + "learning_rate": 0.001, + "loss": 2.5934, + "step": 925 + }, + { + "epoch": 0.03917421101616042, + "grad_norm": 0.4250609278678894, + "learning_rate": 0.001, + "loss": 2.1337, + "step": 926 + }, + { + "epoch": 0.03921651577967679, + "grad_norm": 0.45072197914123535, + "learning_rate": 0.001, + "loss": 2.2017, + "step": 927 + }, + { + "epoch": 0.039258820543193164, + "grad_norm": 0.5734637379646301, + "learning_rate": 0.001, + "loss": 3.231, + "step": 928 + }, + { + "epoch": 0.03930112530670953, + "grad_norm": 2.7498257160186768, + "learning_rate": 0.001, + "loss": 2.0577, + "step": 929 + }, + { + "epoch": 0.03934343007022591, + "grad_norm": 1.0663455724716187, + "learning_rate": 0.001, + "loss": 2.2796, + "step": 930 + }, + { + "epoch": 0.03938573483374228, + "grad_norm": 0.4455733895301819, + "learning_rate": 0.001, + "loss": 2.7131, + "step": 931 + }, + { + "epoch": 0.03942803959725865, + "grad_norm": 10.824166297912598, + "learning_rate": 0.001, + "loss": 3.0632, + "step": 932 + }, + { + "epoch": 0.039470344360775025, + "grad_norm": 0.9566563367843628, + "learning_rate": 0.001, + "loss": 2.4747, + "step": 933 + }, + { + "epoch": 0.039512649124291394, + "grad_norm": 0.694932222366333, + "learning_rate": 0.001, + "loss": 3.1625, + "step": 934 + }, + { + "epoch": 0.03955495388780777, + "grad_norm": 2.0650901794433594, + "learning_rate": 0.001, + "loss": 2.1686, + "step": 935 + }, + { + "epoch": 0.03959725865132414, + "grad_norm": 0.47802698612213135, + "learning_rate": 0.001, + "loss": 3.7587, + "step": 936 + }, + { + "epoch": 0.03963956341484051, + "grad_norm": 1.5605882406234741, + "learning_rate": 0.001, + "loss": 2.1477, + "step": 937 + }, + { + "epoch": 0.03968186817835688, + "grad_norm": 3.1594226360321045, + "learning_rate": 0.001, + "loss": 2.8316, + "step": 938 + }, + { + "epoch": 0.039724172941873255, + "grad_norm": 1.2251648902893066, + "learning_rate": 0.001, + "loss": 3.0162, + "step": 939 + }, + { + "epoch": 0.03976647770538963, + "grad_norm": 0.4009862542152405, + "learning_rate": 0.001, + "loss": 2.5269, + "step": 940 + }, + { + "epoch": 0.039808782468906, + "grad_norm": 0.426503449678421, + "learning_rate": 0.001, + "loss": 2.6667, + "step": 941 + }, + { + "epoch": 0.03985108723242237, + "grad_norm": 0.5413312315940857, + "learning_rate": 0.001, + "loss": 2.9823, + "step": 942 + }, + { + "epoch": 0.03989339199593874, + "grad_norm": 0.4892127811908722, + "learning_rate": 0.001, + "loss": 1.9651, + "step": 943 + }, + { + "epoch": 0.03993569675945512, + "grad_norm": 0.47731220722198486, + "learning_rate": 0.001, + "loss": 1.7261, + "step": 944 + }, + { + "epoch": 0.039978001522971485, + "grad_norm": 0.44389474391937256, + "learning_rate": 0.001, + "loss": 2.0669, + "step": 945 + }, + { + "epoch": 0.04002030628648786, + "grad_norm": 0.6225860714912415, + "learning_rate": 0.001, + "loss": 2.501, + "step": 946 + }, + { + "epoch": 0.04006261105000423, + "grad_norm": 0.5539319515228271, + "learning_rate": 0.001, + "loss": 2.2267, + "step": 947 + }, + { + "epoch": 0.0401049158135206, + "grad_norm": 0.34385305643081665, + "learning_rate": 0.001, + "loss": 2.3557, + "step": 948 + }, + { + "epoch": 0.04014722057703697, + "grad_norm": 0.41491398215293884, + "learning_rate": 0.001, + "loss": 2.2824, + "step": 949 + }, + { + "epoch": 0.040189525340553346, + "grad_norm": 0.8437483906745911, + "learning_rate": 0.001, + "loss": 3.2014, + "step": 950 + }, + { + "epoch": 0.04023183010406972, + "grad_norm": 0.3619152307510376, + "learning_rate": 0.001, + "loss": 2.7257, + "step": 951 + }, + { + "epoch": 0.04027413486758609, + "grad_norm": 0.8494651317596436, + "learning_rate": 0.001, + "loss": 2.5558, + "step": 952 + }, + { + "epoch": 0.040316439631102464, + "grad_norm": 1.8171448707580566, + "learning_rate": 0.001, + "loss": 3.2215, + "step": 953 + }, + { + "epoch": 0.04035874439461883, + "grad_norm": 1.0728228092193604, + "learning_rate": 0.001, + "loss": 3.5815, + "step": 954 + }, + { + "epoch": 0.04040104915813521, + "grad_norm": 3.6907944679260254, + "learning_rate": 0.001, + "loss": 3.3357, + "step": 955 + }, + { + "epoch": 0.040443353921651576, + "grad_norm": 0.7766746282577515, + "learning_rate": 0.001, + "loss": 3.0304, + "step": 956 + }, + { + "epoch": 0.04048565868516795, + "grad_norm": 1.010837197303772, + "learning_rate": 0.001, + "loss": 2.4535, + "step": 957 + }, + { + "epoch": 0.04052796344868432, + "grad_norm": 0.6098260879516602, + "learning_rate": 0.001, + "loss": 3.0176, + "step": 958 + }, + { + "epoch": 0.040570268212200694, + "grad_norm": 1.3032490015029907, + "learning_rate": 0.001, + "loss": 2.7713, + "step": 959 + }, + { + "epoch": 0.04061257297571707, + "grad_norm": 6.024738788604736, + "learning_rate": 0.001, + "loss": 2.4266, + "step": 960 + }, + { + "epoch": 0.04065487773923344, + "grad_norm": 0.8786823749542236, + "learning_rate": 0.001, + "loss": 2.61, + "step": 961 + }, + { + "epoch": 0.04069718250274981, + "grad_norm": 0.5354873538017273, + "learning_rate": 0.001, + "loss": 2.3966, + "step": 962 + }, + { + "epoch": 0.04073948726626618, + "grad_norm": 0.8180838823318481, + "learning_rate": 0.001, + "loss": 3.0533, + "step": 963 + }, + { + "epoch": 0.040781792029782556, + "grad_norm": 0.6282289028167725, + "learning_rate": 0.001, + "loss": 3.1841, + "step": 964 + }, + { + "epoch": 0.040824096793298924, + "grad_norm": 0.783220648765564, + "learning_rate": 0.001, + "loss": 2.6245, + "step": 965 + }, + { + "epoch": 0.0408664015568153, + "grad_norm": 0.8866094946861267, + "learning_rate": 0.001, + "loss": 3.3324, + "step": 966 + }, + { + "epoch": 0.04090870632033167, + "grad_norm": 0.9526342153549194, + "learning_rate": 0.001, + "loss": 2.7561, + "step": 967 + }, + { + "epoch": 0.04095101108384804, + "grad_norm": 0.7338128685951233, + "learning_rate": 0.001, + "loss": 3.8083, + "step": 968 + }, + { + "epoch": 0.04099331584736441, + "grad_norm": 0.7096972465515137, + "learning_rate": 0.001, + "loss": 2.2787, + "step": 969 + }, + { + "epoch": 0.041035620610880785, + "grad_norm": 0.459326833486557, + "learning_rate": 0.001, + "loss": 2.6205, + "step": 970 + }, + { + "epoch": 0.04107792537439716, + "grad_norm": 0.5549113154411316, + "learning_rate": 0.001, + "loss": 3.9624, + "step": 971 + }, + { + "epoch": 0.04112023013791353, + "grad_norm": 0.5863713026046753, + "learning_rate": 0.001, + "loss": 2.6585, + "step": 972 + }, + { + "epoch": 0.041162534901429904, + "grad_norm": 0.435687780380249, + "learning_rate": 0.001, + "loss": 3.875, + "step": 973 + }, + { + "epoch": 0.04120483966494627, + "grad_norm": 0.43353310227394104, + "learning_rate": 0.001, + "loss": 3.5305, + "step": 974 + }, + { + "epoch": 0.04124714442846265, + "grad_norm": 0.3369213342666626, + "learning_rate": 0.001, + "loss": 3.0293, + "step": 975 + }, + { + "epoch": 0.041289449191979015, + "grad_norm": 0.37502485513687134, + "learning_rate": 0.001, + "loss": 2.6536, + "step": 976 + }, + { + "epoch": 0.04133175395549539, + "grad_norm": 3.277416229248047, + "learning_rate": 0.001, + "loss": 2.3418, + "step": 977 + }, + { + "epoch": 0.04137405871901176, + "grad_norm": 0.30327439308166504, + "learning_rate": 0.001, + "loss": 2.6483, + "step": 978 + }, + { + "epoch": 0.04141636348252813, + "grad_norm": 0.3673608601093292, + "learning_rate": 0.001, + "loss": 2.884, + "step": 979 + }, + { + "epoch": 0.0414586682460445, + "grad_norm": 0.49414071440696716, + "learning_rate": 0.001, + "loss": 2.3366, + "step": 980 + }, + { + "epoch": 0.041500973009560876, + "grad_norm": 2.8874402046203613, + "learning_rate": 0.001, + "loss": 2.0754, + "step": 981 + }, + { + "epoch": 0.04154327777307725, + "grad_norm": 0.5763828754425049, + "learning_rate": 0.001, + "loss": 3.3845, + "step": 982 + }, + { + "epoch": 0.04158558253659362, + "grad_norm": 0.5612279772758484, + "learning_rate": 0.001, + "loss": 2.315, + "step": 983 + }, + { + "epoch": 0.041627887300109995, + "grad_norm": 1.2401258945465088, + "learning_rate": 0.001, + "loss": 3.435, + "step": 984 + }, + { + "epoch": 0.04167019206362636, + "grad_norm": 0.9611271023750305, + "learning_rate": 0.001, + "loss": 2.7392, + "step": 985 + }, + { + "epoch": 0.04171249682714274, + "grad_norm": 0.4988934397697449, + "learning_rate": 0.001, + "loss": 2.2294, + "step": 986 + }, + { + "epoch": 0.041754801590659106, + "grad_norm": 1.0643762350082397, + "learning_rate": 0.001, + "loss": 2.4758, + "step": 987 + }, + { + "epoch": 0.04179710635417548, + "grad_norm": 0.3355838656425476, + "learning_rate": 0.001, + "loss": 2.1455, + "step": 988 + }, + { + "epoch": 0.04183941111769185, + "grad_norm": 17.996070861816406, + "learning_rate": 0.001, + "loss": 3.2964, + "step": 989 + }, + { + "epoch": 0.041881715881208224, + "grad_norm": 4.5193610191345215, + "learning_rate": 0.001, + "loss": 2.0008, + "step": 990 + }, + { + "epoch": 0.0419240206447246, + "grad_norm": 0.4277860224246979, + "learning_rate": 0.001, + "loss": 3.1608, + "step": 991 + }, + { + "epoch": 0.04196632540824097, + "grad_norm": 0.47893789410591125, + "learning_rate": 0.001, + "loss": 2.6076, + "step": 992 + }, + { + "epoch": 0.04200863017175734, + "grad_norm": 0.5489811301231384, + "learning_rate": 0.001, + "loss": 2.1761, + "step": 993 + }, + { + "epoch": 0.04205093493527371, + "grad_norm": 0.5169845819473267, + "learning_rate": 0.001, + "loss": 2.3756, + "step": 994 + }, + { + "epoch": 0.042093239698790086, + "grad_norm": 1.397851586341858, + "learning_rate": 0.001, + "loss": 3.6065, + "step": 995 + }, + { + "epoch": 0.042135544462306454, + "grad_norm": 0.43019843101501465, + "learning_rate": 0.001, + "loss": 2.4526, + "step": 996 + }, + { + "epoch": 0.04217784922582283, + "grad_norm": 0.45522528886795044, + "learning_rate": 0.001, + "loss": 1.9483, + "step": 997 + }, + { + "epoch": 0.0422201539893392, + "grad_norm": 0.405224472284317, + "learning_rate": 0.001, + "loss": 2.6085, + "step": 998 + }, + { + "epoch": 0.04226245875285557, + "grad_norm": 0.6403412818908691, + "learning_rate": 0.001, + "loss": 3.2959, + "step": 999 + }, + { + "epoch": 0.04230476351637194, + "grad_norm": 0.4327405095100403, + "learning_rate": 0.001, + "loss": 2.4858, + "step": 1000 + }, + { + "epoch": 0.042347068279888316, + "grad_norm": 1.4853562116622925, + "learning_rate": 0.001, + "loss": 2.0621, + "step": 1001 + }, + { + "epoch": 0.04238937304340469, + "grad_norm": 0.42528292536735535, + "learning_rate": 0.001, + "loss": 2.4629, + "step": 1002 + }, + { + "epoch": 0.04243167780692106, + "grad_norm": 0.30595675110816956, + "learning_rate": 0.001, + "loss": 2.1818, + "step": 1003 + }, + { + "epoch": 0.042473982570437434, + "grad_norm": 0.4422406554222107, + "learning_rate": 0.001, + "loss": 1.949, + "step": 1004 + }, + { + "epoch": 0.0425162873339538, + "grad_norm": 0.3010110557079315, + "learning_rate": 0.001, + "loss": 2.3222, + "step": 1005 + }, + { + "epoch": 0.04255859209747018, + "grad_norm": 12.802899360656738, + "learning_rate": 0.001, + "loss": 2.388, + "step": 1006 + }, + { + "epoch": 0.042600896860986545, + "grad_norm": 0.438760906457901, + "learning_rate": 0.001, + "loss": 2.7597, + "step": 1007 + }, + { + "epoch": 0.04264320162450292, + "grad_norm": 0.9350125789642334, + "learning_rate": 0.001, + "loss": 2.3453, + "step": 1008 + }, + { + "epoch": 0.04268550638801929, + "grad_norm": 0.4918479919433594, + "learning_rate": 0.001, + "loss": 2.1911, + "step": 1009 + }, + { + "epoch": 0.04272781115153566, + "grad_norm": 0.47994211316108704, + "learning_rate": 0.001, + "loss": 2.7534, + "step": 1010 + }, + { + "epoch": 0.04277011591505203, + "grad_norm": 0.34078550338745117, + "learning_rate": 0.001, + "loss": 2.1767, + "step": 1011 + }, + { + "epoch": 0.04281242067856841, + "grad_norm": 0.6441370844841003, + "learning_rate": 0.001, + "loss": 2.1742, + "step": 1012 + }, + { + "epoch": 0.04285472544208478, + "grad_norm": 1.00253427028656, + "learning_rate": 0.001, + "loss": 2.6111, + "step": 1013 + }, + { + "epoch": 0.04289703020560115, + "grad_norm": 0.42675742506980896, + "learning_rate": 0.001, + "loss": 3.7831, + "step": 1014 + }, + { + "epoch": 0.042939334969117525, + "grad_norm": 0.5508003830909729, + "learning_rate": 0.001, + "loss": 2.6718, + "step": 1015 + }, + { + "epoch": 0.04298163973263389, + "grad_norm": 0.5830764174461365, + "learning_rate": 0.001, + "loss": 3.2418, + "step": 1016 + }, + { + "epoch": 0.04302394449615027, + "grad_norm": 0.5782317519187927, + "learning_rate": 0.001, + "loss": 2.5268, + "step": 1017 + }, + { + "epoch": 0.043066249259666636, + "grad_norm": 0.39523276686668396, + "learning_rate": 0.001, + "loss": 2.3105, + "step": 1018 + }, + { + "epoch": 0.04310855402318301, + "grad_norm": 0.5414249300956726, + "learning_rate": 0.001, + "loss": 2.5811, + "step": 1019 + }, + { + "epoch": 0.04315085878669938, + "grad_norm": 0.5326202511787415, + "learning_rate": 0.001, + "loss": 2.5664, + "step": 1020 + }, + { + "epoch": 0.043193163550215755, + "grad_norm": 0.3412122428417206, + "learning_rate": 0.001, + "loss": 2.6221, + "step": 1021 + }, + { + "epoch": 0.04323546831373213, + "grad_norm": 0.3936321437358856, + "learning_rate": 0.001, + "loss": 2.1314, + "step": 1022 + }, + { + "epoch": 0.0432777730772485, + "grad_norm": 1.0190694332122803, + "learning_rate": 0.001, + "loss": 2.6642, + "step": 1023 + }, + { + "epoch": 0.04332007784076487, + "grad_norm": 2.318169116973877, + "learning_rate": 0.001, + "loss": 3.3849, + "step": 1024 + }, + { + "epoch": 0.04336238260428124, + "grad_norm": 0.29711923003196716, + "learning_rate": 0.001, + "loss": 2.2662, + "step": 1025 + }, + { + "epoch": 0.043404687367797616, + "grad_norm": 0.4354766011238098, + "learning_rate": 0.001, + "loss": 2.5792, + "step": 1026 + }, + { + "epoch": 0.043446992131313984, + "grad_norm": 0.3557169735431671, + "learning_rate": 0.001, + "loss": 3.1672, + "step": 1027 + }, + { + "epoch": 0.04348929689483036, + "grad_norm": 0.48224884271621704, + "learning_rate": 0.001, + "loss": 2.0798, + "step": 1028 + }, + { + "epoch": 0.04353160165834673, + "grad_norm": 0.3722802698612213, + "learning_rate": 0.001, + "loss": 2.6505, + "step": 1029 + }, + { + "epoch": 0.0435739064218631, + "grad_norm": 0.40177568793296814, + "learning_rate": 0.001, + "loss": 2.6731, + "step": 1030 + }, + { + "epoch": 0.04361621118537947, + "grad_norm": 0.34859174489974976, + "learning_rate": 0.001, + "loss": 2.0711, + "step": 1031 + }, + { + "epoch": 0.043658515948895846, + "grad_norm": 0.35582149028778076, + "learning_rate": 0.001, + "loss": 2.5389, + "step": 1032 + }, + { + "epoch": 0.04370082071241222, + "grad_norm": 0.35008880496025085, + "learning_rate": 0.001, + "loss": 2.0354, + "step": 1033 + }, + { + "epoch": 0.04374312547592859, + "grad_norm": 0.7123366594314575, + "learning_rate": 0.001, + "loss": 2.6122, + "step": 1034 + }, + { + "epoch": 0.043785430239444964, + "grad_norm": 0.6655288338661194, + "learning_rate": 0.001, + "loss": 2.3297, + "step": 1035 + }, + { + "epoch": 0.04382773500296133, + "grad_norm": 2.140918493270874, + "learning_rate": 0.001, + "loss": 2.2363, + "step": 1036 + }, + { + "epoch": 0.04387003976647771, + "grad_norm": 1.8651494979858398, + "learning_rate": 0.001, + "loss": 3.1583, + "step": 1037 + }, + { + "epoch": 0.043912344529994075, + "grad_norm": 0.5514461994171143, + "learning_rate": 0.001, + "loss": 3.5841, + "step": 1038 + }, + { + "epoch": 0.04395464929351045, + "grad_norm": 0.48088452219963074, + "learning_rate": 0.001, + "loss": 2.3557, + "step": 1039 + }, + { + "epoch": 0.04399695405702682, + "grad_norm": 1.1052600145339966, + "learning_rate": 0.001, + "loss": 2.5092, + "step": 1040 + }, + { + "epoch": 0.044039258820543194, + "grad_norm": 0.4182952344417572, + "learning_rate": 0.001, + "loss": 3.016, + "step": 1041 + }, + { + "epoch": 0.04408156358405956, + "grad_norm": 0.30450159311294556, + "learning_rate": 0.001, + "loss": 2.5603, + "step": 1042 + }, + { + "epoch": 0.04412386834757594, + "grad_norm": 0.2754423916339874, + "learning_rate": 0.001, + "loss": 1.9932, + "step": 1043 + }, + { + "epoch": 0.04416617311109231, + "grad_norm": 2.453713893890381, + "learning_rate": 0.001, + "loss": 3.1293, + "step": 1044 + }, + { + "epoch": 0.04420847787460868, + "grad_norm": 0.606838583946228, + "learning_rate": 0.001, + "loss": 3.1959, + "step": 1045 + }, + { + "epoch": 0.044250782638125055, + "grad_norm": 0.580777645111084, + "learning_rate": 0.001, + "loss": 2.0586, + "step": 1046 + }, + { + "epoch": 0.04429308740164142, + "grad_norm": 5.384679317474365, + "learning_rate": 0.001, + "loss": 2.4947, + "step": 1047 + }, + { + "epoch": 0.0443353921651578, + "grad_norm": 0.3704220652580261, + "learning_rate": 0.001, + "loss": 2.4293, + "step": 1048 + }, + { + "epoch": 0.04437769692867417, + "grad_norm": 0.3065285086631775, + "learning_rate": 0.001, + "loss": 2.7632, + "step": 1049 + }, + { + "epoch": 0.04442000169219054, + "grad_norm": 0.3605315089225769, + "learning_rate": 0.001, + "loss": 2.3277, + "step": 1050 + }, + { + "epoch": 0.04446230645570691, + "grad_norm": 0.46234673261642456, + "learning_rate": 0.001, + "loss": 2.1531, + "step": 1051 + }, + { + "epoch": 0.044504611219223285, + "grad_norm": 0.43919745087623596, + "learning_rate": 0.001, + "loss": 2.3706, + "step": 1052 + }, + { + "epoch": 0.04454691598273966, + "grad_norm": 0.37155017256736755, + "learning_rate": 0.001, + "loss": 3.1104, + "step": 1053 + }, + { + "epoch": 0.04458922074625603, + "grad_norm": 0.31615880131721497, + "learning_rate": 0.001, + "loss": 3.1019, + "step": 1054 + }, + { + "epoch": 0.0446315255097724, + "grad_norm": 1.206450343132019, + "learning_rate": 0.001, + "loss": 3.4134, + "step": 1055 + }, + { + "epoch": 0.04467383027328877, + "grad_norm": 0.6831628084182739, + "learning_rate": 0.001, + "loss": 2.5709, + "step": 1056 + }, + { + "epoch": 0.044716135036805146, + "grad_norm": 0.4207130968570709, + "learning_rate": 0.001, + "loss": 2.2903, + "step": 1057 + }, + { + "epoch": 0.044758439800321514, + "grad_norm": 0.3084408938884735, + "learning_rate": 0.001, + "loss": 2.8131, + "step": 1058 + }, + { + "epoch": 0.04480074456383789, + "grad_norm": 0.6155023574829102, + "learning_rate": 0.001, + "loss": 2.3744, + "step": 1059 + }, + { + "epoch": 0.04484304932735426, + "grad_norm": 0.6356950998306274, + "learning_rate": 0.001, + "loss": 2.5759, + "step": 1060 + }, + { + "epoch": 0.04488535409087063, + "grad_norm": 0.41177189350128174, + "learning_rate": 0.001, + "loss": 3.0073, + "step": 1061 + }, + { + "epoch": 0.044927658854387, + "grad_norm": 0.30051976442337036, + "learning_rate": 0.001, + "loss": 2.8997, + "step": 1062 + }, + { + "epoch": 0.044969963617903376, + "grad_norm": 0.28739258646965027, + "learning_rate": 0.001, + "loss": 1.8987, + "step": 1063 + }, + { + "epoch": 0.04501226838141975, + "grad_norm": 0.950710117816925, + "learning_rate": 0.001, + "loss": 2.3666, + "step": 1064 + }, + { + "epoch": 0.04505457314493612, + "grad_norm": 0.5854257941246033, + "learning_rate": 0.001, + "loss": 2.323, + "step": 1065 + }, + { + "epoch": 0.045096877908452494, + "grad_norm": 0.3358246088027954, + "learning_rate": 0.001, + "loss": 2.5187, + "step": 1066 + }, + { + "epoch": 0.04513918267196886, + "grad_norm": 1.6332861185073853, + "learning_rate": 0.001, + "loss": 2.3758, + "step": 1067 + }, + { + "epoch": 0.04518148743548524, + "grad_norm": 1.7713162899017334, + "learning_rate": 0.001, + "loss": 2.3417, + "step": 1068 + }, + { + "epoch": 0.045223792199001606, + "grad_norm": 0.3794574439525604, + "learning_rate": 0.001, + "loss": 2.0666, + "step": 1069 + }, + { + "epoch": 0.04526609696251798, + "grad_norm": 0.2956782281398773, + "learning_rate": 0.001, + "loss": 2.8441, + "step": 1070 + }, + { + "epoch": 0.04530840172603435, + "grad_norm": 0.2897772192955017, + "learning_rate": 0.001, + "loss": 2.3111, + "step": 1071 + }, + { + "epoch": 0.045350706489550724, + "grad_norm": 4.40648078918457, + "learning_rate": 0.001, + "loss": 2.0011, + "step": 1072 + }, + { + "epoch": 0.04539301125306709, + "grad_norm": 7.493185520172119, + "learning_rate": 0.001, + "loss": 3.1528, + "step": 1073 + }, + { + "epoch": 0.04543531601658347, + "grad_norm": 0.34881994128227234, + "learning_rate": 0.001, + "loss": 2.0333, + "step": 1074 + }, + { + "epoch": 0.04547762078009984, + "grad_norm": 0.9052633047103882, + "learning_rate": 0.001, + "loss": 2.4499, + "step": 1075 + }, + { + "epoch": 0.04551992554361621, + "grad_norm": 0.7866429090499878, + "learning_rate": 0.001, + "loss": 2.5423, + "step": 1076 + }, + { + "epoch": 0.045562230307132585, + "grad_norm": 0.8085727691650391, + "learning_rate": 0.001, + "loss": 2.8811, + "step": 1077 + }, + { + "epoch": 0.045604535070648954, + "grad_norm": 0.517099142074585, + "learning_rate": 0.001, + "loss": 2.1502, + "step": 1078 + }, + { + "epoch": 0.04564683983416533, + "grad_norm": 0.5327157378196716, + "learning_rate": 0.001, + "loss": 3.1626, + "step": 1079 + }, + { + "epoch": 0.0456891445976817, + "grad_norm": 0.28011730313301086, + "learning_rate": 0.001, + "loss": 1.6698, + "step": 1080 + }, + { + "epoch": 0.04573144936119807, + "grad_norm": 2.291832208633423, + "learning_rate": 0.001, + "loss": 2.2235, + "step": 1081 + }, + { + "epoch": 0.04577375412471444, + "grad_norm": 0.5037325024604797, + "learning_rate": 0.001, + "loss": 2.2568, + "step": 1082 + }, + { + "epoch": 0.045816058888230815, + "grad_norm": 0.7182853817939758, + "learning_rate": 0.001, + "loss": 2.6144, + "step": 1083 + }, + { + "epoch": 0.04585836365174719, + "grad_norm": 0.3494111895561218, + "learning_rate": 0.001, + "loss": 3.3147, + "step": 1084 + }, + { + "epoch": 0.04590066841526356, + "grad_norm": 0.35580259561538696, + "learning_rate": 0.001, + "loss": 3.1962, + "step": 1085 + }, + { + "epoch": 0.04594297317877993, + "grad_norm": 0.5036289691925049, + "learning_rate": 0.001, + "loss": 2.1262, + "step": 1086 + }, + { + "epoch": 0.0459852779422963, + "grad_norm": 0.5229755640029907, + "learning_rate": 0.001, + "loss": 1.7954, + "step": 1087 + }, + { + "epoch": 0.04602758270581268, + "grad_norm": 0.5959165096282959, + "learning_rate": 0.001, + "loss": 3.3961, + "step": 1088 + }, + { + "epoch": 0.046069887469329045, + "grad_norm": 0.43716666102409363, + "learning_rate": 0.001, + "loss": 2.0928, + "step": 1089 + }, + { + "epoch": 0.04611219223284542, + "grad_norm": 0.38793283700942993, + "learning_rate": 0.001, + "loss": 2.1611, + "step": 1090 + }, + { + "epoch": 0.04615449699636179, + "grad_norm": 0.3964957892894745, + "learning_rate": 0.001, + "loss": 2.4315, + "step": 1091 + }, + { + "epoch": 0.04619680175987816, + "grad_norm": 0.7259196043014526, + "learning_rate": 0.001, + "loss": 2.5655, + "step": 1092 + }, + { + "epoch": 0.04623910652339453, + "grad_norm": 20.704673767089844, + "learning_rate": 0.001, + "loss": 2.1808, + "step": 1093 + }, + { + "epoch": 0.046281411286910906, + "grad_norm": 1.8599133491516113, + "learning_rate": 0.001, + "loss": 3.0483, + "step": 1094 + }, + { + "epoch": 0.04632371605042728, + "grad_norm": 0.3958137035369873, + "learning_rate": 0.001, + "loss": 2.8665, + "step": 1095 + }, + { + "epoch": 0.04636602081394365, + "grad_norm": 0.46354708075523376, + "learning_rate": 0.001, + "loss": 2.1897, + "step": 1096 + }, + { + "epoch": 0.046408325577460025, + "grad_norm": 0.5653846263885498, + "learning_rate": 0.001, + "loss": 2.2344, + "step": 1097 + }, + { + "epoch": 0.04645063034097639, + "grad_norm": 0.3511918783187866, + "learning_rate": 0.001, + "loss": 1.8777, + "step": 1098 + }, + { + "epoch": 0.04649293510449277, + "grad_norm": 0.4230622351169586, + "learning_rate": 0.001, + "loss": 2.2688, + "step": 1099 + }, + { + "epoch": 0.046535239868009136, + "grad_norm": 0.5649506449699402, + "learning_rate": 0.001, + "loss": 1.8348, + "step": 1100 + }, + { + "epoch": 0.04657754463152551, + "grad_norm": 0.3206387162208557, + "learning_rate": 0.001, + "loss": 2.2039, + "step": 1101 + }, + { + "epoch": 0.04661984939504188, + "grad_norm": 0.3505328297615051, + "learning_rate": 0.001, + "loss": 2.8421, + "step": 1102 + }, + { + "epoch": 0.046662154158558254, + "grad_norm": 0.8784352540969849, + "learning_rate": 0.001, + "loss": 3.6394, + "step": 1103 + }, + { + "epoch": 0.04670445892207462, + "grad_norm": 0.3867740035057068, + "learning_rate": 0.001, + "loss": 2.2488, + "step": 1104 + }, + { + "epoch": 0.046746763685591, + "grad_norm": 0.3754059374332428, + "learning_rate": 0.001, + "loss": 2.4647, + "step": 1105 + }, + { + "epoch": 0.04678906844910737, + "grad_norm": 0.3429076671600342, + "learning_rate": 0.001, + "loss": 2.658, + "step": 1106 + }, + { + "epoch": 0.04683137321262374, + "grad_norm": 0.35917940735816956, + "learning_rate": 0.001, + "loss": 2.8441, + "step": 1107 + }, + { + "epoch": 0.046873677976140116, + "grad_norm": 0.4225330948829651, + "learning_rate": 0.001, + "loss": 3.4419, + "step": 1108 + }, + { + "epoch": 0.046915982739656484, + "grad_norm": 0.2866974174976349, + "learning_rate": 0.001, + "loss": 1.7171, + "step": 1109 + }, + { + "epoch": 0.04695828750317286, + "grad_norm": 0.655272364616394, + "learning_rate": 0.001, + "loss": 2.1575, + "step": 1110 + }, + { + "epoch": 0.04700059226668923, + "grad_norm": 0.32389822602272034, + "learning_rate": 0.001, + "loss": 2.9878, + "step": 1111 + }, + { + "epoch": 0.0470428970302056, + "grad_norm": 0.33824819326400757, + "learning_rate": 0.001, + "loss": 2.6459, + "step": 1112 + }, + { + "epoch": 0.04708520179372197, + "grad_norm": 0.2966809570789337, + "learning_rate": 0.001, + "loss": 2.6308, + "step": 1113 + }, + { + "epoch": 0.047127506557238345, + "grad_norm": 0.3179897665977478, + "learning_rate": 0.001, + "loss": 2.525, + "step": 1114 + }, + { + "epoch": 0.04716981132075472, + "grad_norm": 1.4565528631210327, + "learning_rate": 0.001, + "loss": 2.2061, + "step": 1115 + }, + { + "epoch": 0.04721211608427109, + "grad_norm": 0.4614794850349426, + "learning_rate": 0.001, + "loss": 3.2028, + "step": 1116 + }, + { + "epoch": 0.047254420847787464, + "grad_norm": 0.3581281900405884, + "learning_rate": 0.001, + "loss": 2.3384, + "step": 1117 + }, + { + "epoch": 0.04729672561130383, + "grad_norm": 0.3326326310634613, + "learning_rate": 0.001, + "loss": 4.0398, + "step": 1118 + }, + { + "epoch": 0.04733903037482021, + "grad_norm": 0.3846263289451599, + "learning_rate": 0.001, + "loss": 2.437, + "step": 1119 + }, + { + "epoch": 0.047381335138336575, + "grad_norm": 0.3471648395061493, + "learning_rate": 0.001, + "loss": 2.8839, + "step": 1120 + }, + { + "epoch": 0.04742363990185295, + "grad_norm": 0.8793920874595642, + "learning_rate": 0.001, + "loss": 2.2851, + "step": 1121 + }, + { + "epoch": 0.04746594466536932, + "grad_norm": 0.4244942367076874, + "learning_rate": 0.001, + "loss": 2.1305, + "step": 1122 + }, + { + "epoch": 0.04750824942888569, + "grad_norm": 0.272441565990448, + "learning_rate": 0.001, + "loss": 2.3609, + "step": 1123 + }, + { + "epoch": 0.04755055419240206, + "grad_norm": 0.5901132225990295, + "learning_rate": 0.001, + "loss": 2.5462, + "step": 1124 + }, + { + "epoch": 0.047592858955918436, + "grad_norm": 1.15731680393219, + "learning_rate": 0.001, + "loss": 2.0628, + "step": 1125 + }, + { + "epoch": 0.04763516371943481, + "grad_norm": 0.35768887400627136, + "learning_rate": 0.001, + "loss": 2.415, + "step": 1126 + }, + { + "epoch": 0.04767746848295118, + "grad_norm": 0.5913015604019165, + "learning_rate": 0.001, + "loss": 2.6869, + "step": 1127 + }, + { + "epoch": 0.047719773246467555, + "grad_norm": 0.7240321040153503, + "learning_rate": 0.001, + "loss": 2.7932, + "step": 1128 + }, + { + "epoch": 0.04776207800998392, + "grad_norm": 0.29713061451911926, + "learning_rate": 0.001, + "loss": 1.7194, + "step": 1129 + }, + { + "epoch": 0.0478043827735003, + "grad_norm": 0.30949392914772034, + "learning_rate": 0.001, + "loss": 2.3119, + "step": 1130 + }, + { + "epoch": 0.047846687537016666, + "grad_norm": 0.7915238738059998, + "learning_rate": 0.001, + "loss": 2.2115, + "step": 1131 + }, + { + "epoch": 0.04788899230053304, + "grad_norm": 0.33516794443130493, + "learning_rate": 0.001, + "loss": 2.4067, + "step": 1132 + }, + { + "epoch": 0.04793129706404941, + "grad_norm": 0.4222777783870697, + "learning_rate": 0.001, + "loss": 2.3102, + "step": 1133 + }, + { + "epoch": 0.047973601827565784, + "grad_norm": 0.3512939512729645, + "learning_rate": 0.001, + "loss": 4.4249, + "step": 1134 + }, + { + "epoch": 0.04801590659108215, + "grad_norm": 0.7207156419754028, + "learning_rate": 0.001, + "loss": 2.2423, + "step": 1135 + }, + { + "epoch": 0.04805821135459853, + "grad_norm": 0.3659185767173767, + "learning_rate": 0.001, + "loss": 2.3806, + "step": 1136 + }, + { + "epoch": 0.0481005161181149, + "grad_norm": 0.30134308338165283, + "learning_rate": 0.001, + "loss": 2.3715, + "step": 1137 + }, + { + "epoch": 0.04814282088163127, + "grad_norm": 0.5804815888404846, + "learning_rate": 0.001, + "loss": 4.0196, + "step": 1138 + }, + { + "epoch": 0.048185125645147646, + "grad_norm": 0.2848808169364929, + "learning_rate": 0.001, + "loss": 2.0329, + "step": 1139 + }, + { + "epoch": 0.048227430408664014, + "grad_norm": 0.5376827716827393, + "learning_rate": 0.001, + "loss": 2.5122, + "step": 1140 + }, + { + "epoch": 0.04826973517218039, + "grad_norm": 0.393577516078949, + "learning_rate": 0.001, + "loss": 3.0434, + "step": 1141 + }, + { + "epoch": 0.04831203993569676, + "grad_norm": 1.210608959197998, + "learning_rate": 0.001, + "loss": 2.1217, + "step": 1142 + }, + { + "epoch": 0.04835434469921313, + "grad_norm": 0.9917371273040771, + "learning_rate": 0.001, + "loss": 4.2548, + "step": 1143 + }, + { + "epoch": 0.0483966494627295, + "grad_norm": 4.173092365264893, + "learning_rate": 0.001, + "loss": 2.1519, + "step": 1144 + }, + { + "epoch": 0.048438954226245876, + "grad_norm": 0.4106460213661194, + "learning_rate": 0.001, + "loss": 3.6214, + "step": 1145 + }, + { + "epoch": 0.04848125898976225, + "grad_norm": 1.5029926300048828, + "learning_rate": 0.001, + "loss": 3.2724, + "step": 1146 + }, + { + "epoch": 0.04852356375327862, + "grad_norm": 0.5879022479057312, + "learning_rate": 0.001, + "loss": 1.8398, + "step": 1147 + }, + { + "epoch": 0.048565868516794994, + "grad_norm": 0.36776503920555115, + "learning_rate": 0.001, + "loss": 2.2025, + "step": 1148 + }, + { + "epoch": 0.04860817328031136, + "grad_norm": 0.9697319269180298, + "learning_rate": 0.001, + "loss": 2.4534, + "step": 1149 + }, + { + "epoch": 0.04865047804382774, + "grad_norm": 0.640369176864624, + "learning_rate": 0.001, + "loss": 4.1821, + "step": 1150 + }, + { + "epoch": 0.048692782807344105, + "grad_norm": 0.5271949768066406, + "learning_rate": 0.001, + "loss": 2.5097, + "step": 1151 + }, + { + "epoch": 0.04873508757086048, + "grad_norm": 0.498152494430542, + "learning_rate": 0.001, + "loss": 2.097, + "step": 1152 + }, + { + "epoch": 0.04877739233437685, + "grad_norm": 0.45384323596954346, + "learning_rate": 0.001, + "loss": 2.8774, + "step": 1153 + }, + { + "epoch": 0.048819697097893223, + "grad_norm": 0.6459994912147522, + "learning_rate": 0.001, + "loss": 2.267, + "step": 1154 + }, + { + "epoch": 0.04886200186140959, + "grad_norm": 0.8564043045043945, + "learning_rate": 0.001, + "loss": 2.7848, + "step": 1155 + }, + { + "epoch": 0.04890430662492597, + "grad_norm": 0.33357423543930054, + "learning_rate": 0.001, + "loss": 2.5382, + "step": 1156 + }, + { + "epoch": 0.04894661138844234, + "grad_norm": 1.739807367324829, + "learning_rate": 0.001, + "loss": 2.3483, + "step": 1157 + }, + { + "epoch": 0.04898891615195871, + "grad_norm": 0.6629720330238342, + "learning_rate": 0.001, + "loss": 2.345, + "step": 1158 + }, + { + "epoch": 0.049031220915475085, + "grad_norm": 0.42855751514434814, + "learning_rate": 0.001, + "loss": 2.7333, + "step": 1159 + }, + { + "epoch": 0.04907352567899145, + "grad_norm": 0.5585654973983765, + "learning_rate": 0.001, + "loss": 2.7187, + "step": 1160 + }, + { + "epoch": 0.04911583044250783, + "grad_norm": 1.3134695291519165, + "learning_rate": 0.001, + "loss": 2.2319, + "step": 1161 + }, + { + "epoch": 0.049158135206024196, + "grad_norm": 16.80616569519043, + "learning_rate": 0.001, + "loss": 3.13, + "step": 1162 + }, + { + "epoch": 0.04920043996954057, + "grad_norm": 0.4731372594833374, + "learning_rate": 0.001, + "loss": 2.9317, + "step": 1163 + }, + { + "epoch": 0.04924274473305694, + "grad_norm": 0.7382914423942566, + "learning_rate": 0.001, + "loss": 3.251, + "step": 1164 + }, + { + "epoch": 0.049285049496573315, + "grad_norm": 1.2850393056869507, + "learning_rate": 0.001, + "loss": 2.4139, + "step": 1165 + }, + { + "epoch": 0.04932735426008968, + "grad_norm": 0.45804908871650696, + "learning_rate": 0.001, + "loss": 2.1216, + "step": 1166 + }, + { + "epoch": 0.04936965902360606, + "grad_norm": 0.41841673851013184, + "learning_rate": 0.001, + "loss": 2.3593, + "step": 1167 + }, + { + "epoch": 0.04941196378712243, + "grad_norm": 0.3579320013523102, + "learning_rate": 0.001, + "loss": 2.733, + "step": 1168 + }, + { + "epoch": 0.0494542685506388, + "grad_norm": 0.460112988948822, + "learning_rate": 0.001, + "loss": 1.9814, + "step": 1169 + }, + { + "epoch": 0.049496573314155176, + "grad_norm": 0.6798352599143982, + "learning_rate": 0.001, + "loss": 2.1811, + "step": 1170 + }, + { + "epoch": 0.049538878077671544, + "grad_norm": 13.641300201416016, + "learning_rate": 0.001, + "loss": 2.4548, + "step": 1171 + }, + { + "epoch": 0.04958118284118792, + "grad_norm": 1.915924310684204, + "learning_rate": 0.001, + "loss": 2.3664, + "step": 1172 + }, + { + "epoch": 0.04962348760470429, + "grad_norm": 0.34269511699676514, + "learning_rate": 0.001, + "loss": 2.571, + "step": 1173 + }, + { + "epoch": 0.04966579236822066, + "grad_norm": 1.606256127357483, + "learning_rate": 0.001, + "loss": 1.7933, + "step": 1174 + }, + { + "epoch": 0.04970809713173703, + "grad_norm": 0.6439388394355774, + "learning_rate": 0.001, + "loss": 2.7446, + "step": 1175 + }, + { + "epoch": 0.049750401895253406, + "grad_norm": 0.554309606552124, + "learning_rate": 0.001, + "loss": 2.9953, + "step": 1176 + }, + { + "epoch": 0.04979270665876978, + "grad_norm": 0.36114075779914856, + "learning_rate": 0.001, + "loss": 2.1207, + "step": 1177 + }, + { + "epoch": 0.04983501142228615, + "grad_norm": 0.36802929639816284, + "learning_rate": 0.001, + "loss": 2.7589, + "step": 1178 + }, + { + "epoch": 0.049877316185802524, + "grad_norm": 0.38982877135276794, + "learning_rate": 0.001, + "loss": 2.7034, + "step": 1179 + }, + { + "epoch": 0.04991962094931889, + "grad_norm": 0.46610337495803833, + "learning_rate": 0.001, + "loss": 2.2733, + "step": 1180 + }, + { + "epoch": 0.04996192571283527, + "grad_norm": 1.6383532285690308, + "learning_rate": 0.001, + "loss": 2.6442, + "step": 1181 + }, + { + "epoch": 0.050004230476351635, + "grad_norm": 0.3796882927417755, + "learning_rate": 0.001, + "loss": 2.8773, + "step": 1182 + }, + { + "epoch": 0.05004653523986801, + "grad_norm": 0.3811698257923126, + "learning_rate": 0.001, + "loss": 2.2252, + "step": 1183 + }, + { + "epoch": 0.05008884000338438, + "grad_norm": 0.6414541602134705, + "learning_rate": 0.001, + "loss": 2.8175, + "step": 1184 + }, + { + "epoch": 0.050131144766900754, + "grad_norm": 0.8355805277824402, + "learning_rate": 0.001, + "loss": 2.3944, + "step": 1185 + }, + { + "epoch": 0.05017344953041712, + "grad_norm": 1.5542443990707397, + "learning_rate": 0.001, + "loss": 2.544, + "step": 1186 + }, + { + "epoch": 0.0502157542939335, + "grad_norm": 1.7193444967269897, + "learning_rate": 0.001, + "loss": 2.0804, + "step": 1187 + }, + { + "epoch": 0.05025805905744987, + "grad_norm": 1.0862737894058228, + "learning_rate": 0.001, + "loss": 2.5055, + "step": 1188 + }, + { + "epoch": 0.05030036382096624, + "grad_norm": 0.4850480258464813, + "learning_rate": 0.001, + "loss": 3.1904, + "step": 1189 + }, + { + "epoch": 0.050342668584482615, + "grad_norm": 0.5566414594650269, + "learning_rate": 0.001, + "loss": 1.9222, + "step": 1190 + }, + { + "epoch": 0.05038497334799898, + "grad_norm": 7.770880699157715, + "learning_rate": 0.001, + "loss": 3.1213, + "step": 1191 + }, + { + "epoch": 0.05042727811151536, + "grad_norm": 0.9301654100418091, + "learning_rate": 0.001, + "loss": 2.1625, + "step": 1192 + }, + { + "epoch": 0.05046958287503173, + "grad_norm": 0.28750085830688477, + "learning_rate": 0.001, + "loss": 2.0768, + "step": 1193 + }, + { + "epoch": 0.0505118876385481, + "grad_norm": 0.33357667922973633, + "learning_rate": 0.001, + "loss": 1.2852, + "step": 1194 + }, + { + "epoch": 0.05055419240206447, + "grad_norm": 0.4574757218360901, + "learning_rate": 0.001, + "loss": 1.9372, + "step": 1195 + }, + { + "epoch": 0.050596497165580845, + "grad_norm": 2.3129217624664307, + "learning_rate": 0.001, + "loss": 3.4428, + "step": 1196 + }, + { + "epoch": 0.05063880192909721, + "grad_norm": 11.059386253356934, + "learning_rate": 0.001, + "loss": 2.063, + "step": 1197 + }, + { + "epoch": 0.05068110669261359, + "grad_norm": 0.5615996718406677, + "learning_rate": 0.001, + "loss": 2.9518, + "step": 1198 + }, + { + "epoch": 0.05072341145612996, + "grad_norm": 1.5756652355194092, + "learning_rate": 0.001, + "loss": 2.7033, + "step": 1199 + }, + { + "epoch": 0.05076571621964633, + "grad_norm": 0.5411480069160461, + "learning_rate": 0.001, + "loss": 2.3017, + "step": 1200 + }, + { + "epoch": 0.050808020983162706, + "grad_norm": 0.41354984045028687, + "learning_rate": 0.001, + "loss": 3.1848, + "step": 1201 + }, + { + "epoch": 0.050850325746679075, + "grad_norm": 1.6077523231506348, + "learning_rate": 0.001, + "loss": 3.4731, + "step": 1202 + }, + { + "epoch": 0.05089263051019545, + "grad_norm": 0.8017259240150452, + "learning_rate": 0.001, + "loss": 1.9944, + "step": 1203 + }, + { + "epoch": 0.05093493527371182, + "grad_norm": 1.3641940355300903, + "learning_rate": 0.001, + "loss": 3.2556, + "step": 1204 + }, + { + "epoch": 0.05097724003722819, + "grad_norm": 0.9652554392814636, + "learning_rate": 0.001, + "loss": 3.2552, + "step": 1205 + }, + { + "epoch": 0.05101954480074456, + "grad_norm": 0.44206491112709045, + "learning_rate": 0.001, + "loss": 2.1191, + "step": 1206 + }, + { + "epoch": 0.051061849564260936, + "grad_norm": 0.7324578762054443, + "learning_rate": 0.001, + "loss": 3.5191, + "step": 1207 + }, + { + "epoch": 0.05110415432777731, + "grad_norm": 1.9216930866241455, + "learning_rate": 0.001, + "loss": 3.5364, + "step": 1208 + }, + { + "epoch": 0.05114645909129368, + "grad_norm": 0.8697239756584167, + "learning_rate": 0.001, + "loss": 2.362, + "step": 1209 + }, + { + "epoch": 0.051188763854810054, + "grad_norm": 0.6998451948165894, + "learning_rate": 0.001, + "loss": 2.7623, + "step": 1210 + }, + { + "epoch": 0.05123106861832642, + "grad_norm": 0.9253522753715515, + "learning_rate": 0.001, + "loss": 3.4004, + "step": 1211 + }, + { + "epoch": 0.0512733733818428, + "grad_norm": 0.9361282587051392, + "learning_rate": 0.001, + "loss": 3.1784, + "step": 1212 + }, + { + "epoch": 0.051315678145359166, + "grad_norm": 0.38276004791259766, + "learning_rate": 0.001, + "loss": 2.448, + "step": 1213 + }, + { + "epoch": 0.05135798290887554, + "grad_norm": 0.5177932977676392, + "learning_rate": 0.001, + "loss": 3.3615, + "step": 1214 + }, + { + "epoch": 0.05140028767239191, + "grad_norm": 1.2303301095962524, + "learning_rate": 0.001, + "loss": 2.7344, + "step": 1215 + }, + { + "epoch": 0.051442592435908284, + "grad_norm": 25.516271591186523, + "learning_rate": 0.001, + "loss": 2.5343, + "step": 1216 + }, + { + "epoch": 0.05148489719942465, + "grad_norm": 0.7592980861663818, + "learning_rate": 0.001, + "loss": 3.7762, + "step": 1217 + }, + { + "epoch": 0.05152720196294103, + "grad_norm": 0.9871262311935425, + "learning_rate": 0.001, + "loss": 2.7732, + "step": 1218 + }, + { + "epoch": 0.0515695067264574, + "grad_norm": 0.6601408123970032, + "learning_rate": 0.001, + "loss": 2.7274, + "step": 1219 + }, + { + "epoch": 0.05161181148997377, + "grad_norm": 0.8926149606704712, + "learning_rate": 0.001, + "loss": 3.5338, + "step": 1220 + }, + { + "epoch": 0.051654116253490145, + "grad_norm": 0.6787442564964294, + "learning_rate": 0.001, + "loss": 2.4065, + "step": 1221 + }, + { + "epoch": 0.051696421017006514, + "grad_norm": 0.8640452027320862, + "learning_rate": 0.001, + "loss": 2.5657, + "step": 1222 + }, + { + "epoch": 0.05173872578052289, + "grad_norm": 1.1135860681533813, + "learning_rate": 0.001, + "loss": 2.0519, + "step": 1223 + }, + { + "epoch": 0.05178103054403926, + "grad_norm": 0.481350302696228, + "learning_rate": 0.001, + "loss": 2.7644, + "step": 1224 + }, + { + "epoch": 0.05182333530755563, + "grad_norm": 0.4704929292201996, + "learning_rate": 0.001, + "loss": 2.403, + "step": 1225 + }, + { + "epoch": 0.051865640071072, + "grad_norm": 6.178280353546143, + "learning_rate": 0.001, + "loss": 2.2048, + "step": 1226 + }, + { + "epoch": 0.051907944834588375, + "grad_norm": 0.5791310667991638, + "learning_rate": 0.001, + "loss": 3.0232, + "step": 1227 + }, + { + "epoch": 0.05195024959810474, + "grad_norm": 2.9677908420562744, + "learning_rate": 0.001, + "loss": 3.8193, + "step": 1228 + }, + { + "epoch": 0.05199255436162112, + "grad_norm": 1.5064809322357178, + "learning_rate": 0.001, + "loss": 3.0974, + "step": 1229 + }, + { + "epoch": 0.05203485912513749, + "grad_norm": 0.3698960542678833, + "learning_rate": 0.001, + "loss": 3.0401, + "step": 1230 + }, + { + "epoch": 0.05207716388865386, + "grad_norm": 0.5180457830429077, + "learning_rate": 0.001, + "loss": 3.2249, + "step": 1231 + }, + { + "epoch": 0.05211946865217024, + "grad_norm": 0.3810598850250244, + "learning_rate": 0.001, + "loss": 2.5986, + "step": 1232 + }, + { + "epoch": 0.052161773415686605, + "grad_norm": 0.33539965748786926, + "learning_rate": 0.001, + "loss": 3.7299, + "step": 1233 + }, + { + "epoch": 0.05220407817920298, + "grad_norm": 0.5705146789550781, + "learning_rate": 0.001, + "loss": 3.6431, + "step": 1234 + }, + { + "epoch": 0.05224638294271935, + "grad_norm": 0.4359552562236786, + "learning_rate": 0.001, + "loss": 3.0, + "step": 1235 + }, + { + "epoch": 0.05228868770623572, + "grad_norm": 0.9745590686798096, + "learning_rate": 0.001, + "loss": 2.4393, + "step": 1236 + }, + { + "epoch": 0.05233099246975209, + "grad_norm": 0.30867111682891846, + "learning_rate": 0.001, + "loss": 2.227, + "step": 1237 + }, + { + "epoch": 0.052373297233268466, + "grad_norm": 0.335039883852005, + "learning_rate": 0.001, + "loss": 2.2415, + "step": 1238 + }, + { + "epoch": 0.05241560199678484, + "grad_norm": 0.6804436445236206, + "learning_rate": 0.001, + "loss": 2.7192, + "step": 1239 + }, + { + "epoch": 0.05245790676030121, + "grad_norm": 0.41609200835227966, + "learning_rate": 0.001, + "loss": 3.3984, + "step": 1240 + }, + { + "epoch": 0.052500211523817585, + "grad_norm": 0.3207823932170868, + "learning_rate": 0.001, + "loss": 2.5688, + "step": 1241 + }, + { + "epoch": 0.05254251628733395, + "grad_norm": 0.33397310972213745, + "learning_rate": 0.001, + "loss": 2.5408, + "step": 1242 + }, + { + "epoch": 0.05258482105085033, + "grad_norm": 0.38078972697257996, + "learning_rate": 0.001, + "loss": 2.1729, + "step": 1243 + }, + { + "epoch": 0.052627125814366696, + "grad_norm": 0.34104421734809875, + "learning_rate": 0.001, + "loss": 3.0408, + "step": 1244 + }, + { + "epoch": 0.05266943057788307, + "grad_norm": 0.32771986722946167, + "learning_rate": 0.001, + "loss": 2.7228, + "step": 1245 + }, + { + "epoch": 0.05271173534139944, + "grad_norm": 0.8869002461433411, + "learning_rate": 0.001, + "loss": 3.8551, + "step": 1246 + }, + { + "epoch": 0.052754040104915814, + "grad_norm": 0.2952006459236145, + "learning_rate": 0.001, + "loss": 2.3158, + "step": 1247 + }, + { + "epoch": 0.05279634486843218, + "grad_norm": 0.8608886003494263, + "learning_rate": 0.001, + "loss": 2.2941, + "step": 1248 + }, + { + "epoch": 0.05283864963194856, + "grad_norm": 0.5748282074928284, + "learning_rate": 0.001, + "loss": 2.9898, + "step": 1249 + }, + { + "epoch": 0.05288095439546493, + "grad_norm": 0.6410783529281616, + "learning_rate": 0.001, + "loss": 1.971, + "step": 1250 + }, + { + "epoch": 0.0529232591589813, + "grad_norm": 1.4829449653625488, + "learning_rate": 0.001, + "loss": 3.4999, + "step": 1251 + }, + { + "epoch": 0.052965563922497676, + "grad_norm": 1.1416040658950806, + "learning_rate": 0.001, + "loss": 2.4466, + "step": 1252 + }, + { + "epoch": 0.053007868686014044, + "grad_norm": 1.1560173034667969, + "learning_rate": 0.001, + "loss": 2.3162, + "step": 1253 + }, + { + "epoch": 0.05305017344953042, + "grad_norm": 1.0749090909957886, + "learning_rate": 0.001, + "loss": 3.186, + "step": 1254 + }, + { + "epoch": 0.05309247821304679, + "grad_norm": 0.3261003792285919, + "learning_rate": 0.001, + "loss": 2.3392, + "step": 1255 + }, + { + "epoch": 0.05313478297656316, + "grad_norm": 0.4178333282470703, + "learning_rate": 0.001, + "loss": 1.8991, + "step": 1256 + }, + { + "epoch": 0.05317708774007953, + "grad_norm": 0.3390306234359741, + "learning_rate": 0.001, + "loss": 2.6713, + "step": 1257 + }, + { + "epoch": 0.053219392503595905, + "grad_norm": 0.46924909949302673, + "learning_rate": 0.001, + "loss": 3.4529, + "step": 1258 + }, + { + "epoch": 0.053261697267112273, + "grad_norm": 0.45819172263145447, + "learning_rate": 0.001, + "loss": 2.3935, + "step": 1259 + }, + { + "epoch": 0.05330400203062865, + "grad_norm": 0.36325275897979736, + "learning_rate": 0.001, + "loss": 1.8038, + "step": 1260 + }, + { + "epoch": 0.053346306794145024, + "grad_norm": 0.4765758216381073, + "learning_rate": 0.001, + "loss": 2.941, + "step": 1261 + }, + { + "epoch": 0.05338861155766139, + "grad_norm": 0.5178735256195068, + "learning_rate": 0.001, + "loss": 2.8948, + "step": 1262 + }, + { + "epoch": 0.05343091632117777, + "grad_norm": 0.3561374545097351, + "learning_rate": 0.001, + "loss": 3.3945, + "step": 1263 + }, + { + "epoch": 0.053473221084694135, + "grad_norm": 0.29234689474105835, + "learning_rate": 0.001, + "loss": 1.8964, + "step": 1264 + }, + { + "epoch": 0.05351552584821051, + "grad_norm": 1.8673244714736938, + "learning_rate": 0.001, + "loss": 2.2186, + "step": 1265 + }, + { + "epoch": 0.05355783061172688, + "grad_norm": 1.1225836277008057, + "learning_rate": 0.001, + "loss": 1.8668, + "step": 1266 + }, + { + "epoch": 0.05360013537524325, + "grad_norm": 0.8716728091239929, + "learning_rate": 0.001, + "loss": 2.8729, + "step": 1267 + }, + { + "epoch": 0.05364244013875962, + "grad_norm": 2.1635050773620605, + "learning_rate": 0.001, + "loss": 2.645, + "step": 1268 + }, + { + "epoch": 0.053684744902275996, + "grad_norm": 0.38754305243492126, + "learning_rate": 0.001, + "loss": 3.3657, + "step": 1269 + }, + { + "epoch": 0.05372704966579237, + "grad_norm": 0.35894379019737244, + "learning_rate": 0.001, + "loss": 1.9849, + "step": 1270 + }, + { + "epoch": 0.05376935442930874, + "grad_norm": 0.45211154222488403, + "learning_rate": 0.001, + "loss": 3.4621, + "step": 1271 + }, + { + "epoch": 0.053811659192825115, + "grad_norm": 1.4661130905151367, + "learning_rate": 0.001, + "loss": 2.2278, + "step": 1272 + }, + { + "epoch": 0.05385396395634148, + "grad_norm": 0.3701130151748657, + "learning_rate": 0.001, + "loss": 1.8248, + "step": 1273 + }, + { + "epoch": 0.05389626871985786, + "grad_norm": 0.3744787871837616, + "learning_rate": 0.001, + "loss": 2.5166, + "step": 1274 + }, + { + "epoch": 0.053938573483374226, + "grad_norm": 0.6254411935806274, + "learning_rate": 0.001, + "loss": 2.1034, + "step": 1275 + }, + { + "epoch": 0.0539808782468906, + "grad_norm": 0.5083110928535461, + "learning_rate": 0.001, + "loss": 2.5448, + "step": 1276 + }, + { + "epoch": 0.05402318301040697, + "grad_norm": 0.4066379964351654, + "learning_rate": 0.001, + "loss": 2.726, + "step": 1277 + }, + { + "epoch": 0.054065487773923344, + "grad_norm": 0.326869398355484, + "learning_rate": 0.001, + "loss": 2.1421, + "step": 1278 + }, + { + "epoch": 0.05410779253743971, + "grad_norm": 0.51470547914505, + "learning_rate": 0.001, + "loss": 3.0139, + "step": 1279 + }, + { + "epoch": 0.05415009730095609, + "grad_norm": 1.5484551191329956, + "learning_rate": 0.001, + "loss": 2.947, + "step": 1280 + }, + { + "epoch": 0.05419240206447246, + "grad_norm": 0.4610329270362854, + "learning_rate": 0.001, + "loss": 2.0814, + "step": 1281 + }, + { + "epoch": 0.05423470682798883, + "grad_norm": 1.545684576034546, + "learning_rate": 0.001, + "loss": 4.4303, + "step": 1282 + }, + { + "epoch": 0.054277011591505206, + "grad_norm": 1.627662181854248, + "learning_rate": 0.001, + "loss": 2.4313, + "step": 1283 + }, + { + "epoch": 0.054319316355021574, + "grad_norm": 0.37047141790390015, + "learning_rate": 0.001, + "loss": 2.1884, + "step": 1284 + }, + { + "epoch": 0.05436162111853795, + "grad_norm": 0.46785968542099, + "learning_rate": 0.001, + "loss": 3.1745, + "step": 1285 + }, + { + "epoch": 0.05440392588205432, + "grad_norm": 0.3685908615589142, + "learning_rate": 0.001, + "loss": 2.1659, + "step": 1286 + }, + { + "epoch": 0.05444623064557069, + "grad_norm": 0.8433362245559692, + "learning_rate": 0.001, + "loss": 2.7851, + "step": 1287 + }, + { + "epoch": 0.05448853540908706, + "grad_norm": 1.0135656595230103, + "learning_rate": 0.001, + "loss": 2.2148, + "step": 1288 + }, + { + "epoch": 0.054530840172603436, + "grad_norm": 0.9696979522705078, + "learning_rate": 0.001, + "loss": 2.5305, + "step": 1289 + }, + { + "epoch": 0.054573144936119804, + "grad_norm": 0.38324859738349915, + "learning_rate": 0.001, + "loss": 2.2144, + "step": 1290 + }, + { + "epoch": 0.05461544969963618, + "grad_norm": 0.4518929421901703, + "learning_rate": 0.001, + "loss": 2.0715, + "step": 1291 + }, + { + "epoch": 0.054657754463152554, + "grad_norm": 0.29709556698799133, + "learning_rate": 0.001, + "loss": 1.878, + "step": 1292 + }, + { + "epoch": 0.05470005922666892, + "grad_norm": 0.41647666692733765, + "learning_rate": 0.001, + "loss": 3.2047, + "step": 1293 + }, + { + "epoch": 0.0547423639901853, + "grad_norm": 0.40319982171058655, + "learning_rate": 0.001, + "loss": 2.8011, + "step": 1294 + }, + { + "epoch": 0.054784668753701665, + "grad_norm": 1.4222816228866577, + "learning_rate": 0.001, + "loss": 2.4923, + "step": 1295 + }, + { + "epoch": 0.05482697351721804, + "grad_norm": 0.6435485482215881, + "learning_rate": 0.001, + "loss": 2.7776, + "step": 1296 + }, + { + "epoch": 0.05486927828073441, + "grad_norm": 0.3867262899875641, + "learning_rate": 0.001, + "loss": 2.9806, + "step": 1297 + }, + { + "epoch": 0.054911583044250784, + "grad_norm": 0.32293111085891724, + "learning_rate": 0.001, + "loss": 2.2782, + "step": 1298 + }, + { + "epoch": 0.05495388780776715, + "grad_norm": 1.7177414894104004, + "learning_rate": 0.001, + "loss": 2.5676, + "step": 1299 + }, + { + "epoch": 0.05499619257128353, + "grad_norm": 4.7346272468566895, + "learning_rate": 0.001, + "loss": 2.7508, + "step": 1300 + }, + { + "epoch": 0.0550384973347999, + "grad_norm": 3.0460281372070312, + "learning_rate": 0.001, + "loss": 2.946, + "step": 1301 + }, + { + "epoch": 0.05508080209831627, + "grad_norm": 3.609449863433838, + "learning_rate": 0.001, + "loss": 2.9066, + "step": 1302 + }, + { + "epoch": 0.055123106861832645, + "grad_norm": 0.6072207689285278, + "learning_rate": 0.001, + "loss": 2.9887, + "step": 1303 + }, + { + "epoch": 0.05516541162534901, + "grad_norm": 0.3145444691181183, + "learning_rate": 0.001, + "loss": 2.0579, + "step": 1304 + }, + { + "epoch": 0.05520771638886539, + "grad_norm": 0.4488667845726013, + "learning_rate": 0.001, + "loss": 3.0136, + "step": 1305 + }, + { + "epoch": 0.055250021152381756, + "grad_norm": 2.8112523555755615, + "learning_rate": 0.001, + "loss": 2.0162, + "step": 1306 + }, + { + "epoch": 0.05529232591589813, + "grad_norm": 0.3468235731124878, + "learning_rate": 0.001, + "loss": 2.5913, + "step": 1307 + }, + { + "epoch": 0.0553346306794145, + "grad_norm": 0.32054612040519714, + "learning_rate": 0.001, + "loss": 2.9339, + "step": 1308 + }, + { + "epoch": 0.055376935442930875, + "grad_norm": 1.1245250701904297, + "learning_rate": 0.001, + "loss": 2.1141, + "step": 1309 + }, + { + "epoch": 0.05541924020644724, + "grad_norm": 0.8368834257125854, + "learning_rate": 0.001, + "loss": 2.7699, + "step": 1310 + }, + { + "epoch": 0.05546154496996362, + "grad_norm": 0.5362788438796997, + "learning_rate": 0.001, + "loss": 3.1062, + "step": 1311 + }, + { + "epoch": 0.05550384973347999, + "grad_norm": 0.3507515490055084, + "learning_rate": 0.001, + "loss": 2.5279, + "step": 1312 + }, + { + "epoch": 0.05554615449699636, + "grad_norm": 0.4399697184562683, + "learning_rate": 0.001, + "loss": 2.6847, + "step": 1313 + }, + { + "epoch": 0.055588459260512736, + "grad_norm": 0.693962812423706, + "learning_rate": 0.001, + "loss": 1.86, + "step": 1314 + }, + { + "epoch": 0.055630764024029104, + "grad_norm": 0.5763394236564636, + "learning_rate": 0.001, + "loss": 2.8907, + "step": 1315 + }, + { + "epoch": 0.05567306878754548, + "grad_norm": 0.3348402678966522, + "learning_rate": 0.001, + "loss": 2.6682, + "step": 1316 + }, + { + "epoch": 0.05571537355106185, + "grad_norm": 2.0695788860321045, + "learning_rate": 0.001, + "loss": 2.2931, + "step": 1317 + }, + { + "epoch": 0.05575767831457822, + "grad_norm": 1.0537198781967163, + "learning_rate": 0.001, + "loss": 2.7975, + "step": 1318 + }, + { + "epoch": 0.05579998307809459, + "grad_norm": 0.38374683260917664, + "learning_rate": 0.001, + "loss": 3.1988, + "step": 1319 + }, + { + "epoch": 0.055842287841610966, + "grad_norm": 16.446990966796875, + "learning_rate": 0.001, + "loss": 2.2754, + "step": 1320 + }, + { + "epoch": 0.055884592605127334, + "grad_norm": 0.378416508436203, + "learning_rate": 0.001, + "loss": 3.6752, + "step": 1321 + }, + { + "epoch": 0.05592689736864371, + "grad_norm": 3.8668839931488037, + "learning_rate": 0.001, + "loss": 3.5075, + "step": 1322 + }, + { + "epoch": 0.055969202132160084, + "grad_norm": 0.40051063895225525, + "learning_rate": 0.001, + "loss": 1.935, + "step": 1323 + }, + { + "epoch": 0.05601150689567645, + "grad_norm": 0.3199704587459564, + "learning_rate": 0.001, + "loss": 2.8692, + "step": 1324 + }, + { + "epoch": 0.05605381165919283, + "grad_norm": 0.39655637741088867, + "learning_rate": 0.001, + "loss": 2.1795, + "step": 1325 + }, + { + "epoch": 0.056096116422709195, + "grad_norm": 0.32414257526397705, + "learning_rate": 0.001, + "loss": 2.3473, + "step": 1326 + }, + { + "epoch": 0.05613842118622557, + "grad_norm": 0.30689844489097595, + "learning_rate": 0.001, + "loss": 1.9908, + "step": 1327 + }, + { + "epoch": 0.05618072594974194, + "grad_norm": 0.3849036693572998, + "learning_rate": 0.001, + "loss": 2.2868, + "step": 1328 + }, + { + "epoch": 0.056223030713258314, + "grad_norm": 0.7097468376159668, + "learning_rate": 0.001, + "loss": 2.9331, + "step": 1329 + }, + { + "epoch": 0.05626533547677468, + "grad_norm": 0.3276157081127167, + "learning_rate": 0.001, + "loss": 2.3295, + "step": 1330 + }, + { + "epoch": 0.05630764024029106, + "grad_norm": 0.30598339438438416, + "learning_rate": 0.001, + "loss": 2.8663, + "step": 1331 + }, + { + "epoch": 0.05634994500380743, + "grad_norm": 5.709721565246582, + "learning_rate": 0.001, + "loss": 2.2997, + "step": 1332 + }, + { + "epoch": 0.0563922497673238, + "grad_norm": 1.1615700721740723, + "learning_rate": 0.001, + "loss": 2.2019, + "step": 1333 + }, + { + "epoch": 0.056434554530840175, + "grad_norm": 0.8189941644668579, + "learning_rate": 0.001, + "loss": 1.9522, + "step": 1334 + }, + { + "epoch": 0.05647685929435654, + "grad_norm": 0.4313448965549469, + "learning_rate": 0.001, + "loss": 2.1067, + "step": 1335 + }, + { + "epoch": 0.05651916405787292, + "grad_norm": 0.38705217838287354, + "learning_rate": 0.001, + "loss": 2.9772, + "step": 1336 + }, + { + "epoch": 0.05656146882138929, + "grad_norm": 0.4954543709754944, + "learning_rate": 0.001, + "loss": 2.298, + "step": 1337 + }, + { + "epoch": 0.05660377358490566, + "grad_norm": 0.5692545771598816, + "learning_rate": 0.001, + "loss": 2.3607, + "step": 1338 + }, + { + "epoch": 0.05664607834842203, + "grad_norm": 1.1173979043960571, + "learning_rate": 0.001, + "loss": 2.9379, + "step": 1339 + }, + { + "epoch": 0.056688383111938405, + "grad_norm": 2.1702587604522705, + "learning_rate": 0.001, + "loss": 2.4834, + "step": 1340 + }, + { + "epoch": 0.05673068787545477, + "grad_norm": 3.090153694152832, + "learning_rate": 0.001, + "loss": 1.9018, + "step": 1341 + }, + { + "epoch": 0.05677299263897115, + "grad_norm": 0.7734284996986389, + "learning_rate": 0.001, + "loss": 2.3851, + "step": 1342 + }, + { + "epoch": 0.05681529740248752, + "grad_norm": 0.2983352243900299, + "learning_rate": 0.001, + "loss": 2.5298, + "step": 1343 + }, + { + "epoch": 0.05685760216600389, + "grad_norm": 0.8630567193031311, + "learning_rate": 0.001, + "loss": 3.0804, + "step": 1344 + }, + { + "epoch": 0.056899906929520266, + "grad_norm": 0.3336477279663086, + "learning_rate": 0.001, + "loss": 2.2696, + "step": 1345 + }, + { + "epoch": 0.056942211693036635, + "grad_norm": 0.30996400117874146, + "learning_rate": 0.001, + "loss": 2.3269, + "step": 1346 + }, + { + "epoch": 0.05698451645655301, + "grad_norm": 0.38334864377975464, + "learning_rate": 0.001, + "loss": 2.4821, + "step": 1347 + }, + { + "epoch": 0.05702682122006938, + "grad_norm": 0.29219967126846313, + "learning_rate": 0.001, + "loss": 2.192, + "step": 1348 + }, + { + "epoch": 0.05706912598358575, + "grad_norm": 1.8894317150115967, + "learning_rate": 0.001, + "loss": 2.2862, + "step": 1349 + }, + { + "epoch": 0.05711143074710212, + "grad_norm": 3.3716726303100586, + "learning_rate": 0.001, + "loss": 2.0416, + "step": 1350 + }, + { + "epoch": 0.057153735510618496, + "grad_norm": 0.3227848708629608, + "learning_rate": 0.001, + "loss": 2.7895, + "step": 1351 + }, + { + "epoch": 0.057196040274134864, + "grad_norm": 0.31334635615348816, + "learning_rate": 0.001, + "loss": 2.7555, + "step": 1352 + }, + { + "epoch": 0.05723834503765124, + "grad_norm": 0.32888343930244446, + "learning_rate": 0.001, + "loss": 2.6685, + "step": 1353 + }, + { + "epoch": 0.057280649801167614, + "grad_norm": 1.3726975917816162, + "learning_rate": 0.001, + "loss": 2.4858, + "step": 1354 + }, + { + "epoch": 0.05732295456468398, + "grad_norm": 0.296441912651062, + "learning_rate": 0.001, + "loss": 2.4981, + "step": 1355 + }, + { + "epoch": 0.05736525932820036, + "grad_norm": 0.40023407340049744, + "learning_rate": 0.001, + "loss": 2.3818, + "step": 1356 + }, + { + "epoch": 0.057407564091716726, + "grad_norm": 0.23572492599487305, + "learning_rate": 0.001, + "loss": 1.7868, + "step": 1357 + }, + { + "epoch": 0.0574498688552331, + "grad_norm": 0.3053801357746124, + "learning_rate": 0.001, + "loss": 2.115, + "step": 1358 + }, + { + "epoch": 0.05749217361874947, + "grad_norm": 0.4453347623348236, + "learning_rate": 0.001, + "loss": 2.3336, + "step": 1359 + }, + { + "epoch": 0.057534478382265844, + "grad_norm": 0.29795461893081665, + "learning_rate": 0.001, + "loss": 2.3624, + "step": 1360 + }, + { + "epoch": 0.05757678314578221, + "grad_norm": 0.3462240993976593, + "learning_rate": 0.001, + "loss": 2.4219, + "step": 1361 + }, + { + "epoch": 0.05761908790929859, + "grad_norm": 0.7424085140228271, + "learning_rate": 0.001, + "loss": 2.0359, + "step": 1362 + }, + { + "epoch": 0.05766139267281496, + "grad_norm": 0.33206912875175476, + "learning_rate": 0.001, + "loss": 2.3683, + "step": 1363 + }, + { + "epoch": 0.05770369743633133, + "grad_norm": 0.5105282068252563, + "learning_rate": 0.001, + "loss": 2.5832, + "step": 1364 + }, + { + "epoch": 0.057746002199847705, + "grad_norm": 0.29361414909362793, + "learning_rate": 0.001, + "loss": 1.5788, + "step": 1365 + }, + { + "epoch": 0.057788306963364074, + "grad_norm": 0.7224523425102234, + "learning_rate": 0.001, + "loss": 1.6817, + "step": 1366 + }, + { + "epoch": 0.05783061172688045, + "grad_norm": 0.33219602704048157, + "learning_rate": 0.001, + "loss": 2.8996, + "step": 1367 + }, + { + "epoch": 0.05787291649039682, + "grad_norm": 0.6146803498268127, + "learning_rate": 0.001, + "loss": 2.8432, + "step": 1368 + }, + { + "epoch": 0.05791522125391319, + "grad_norm": 0.4107002019882202, + "learning_rate": 0.001, + "loss": 2.1509, + "step": 1369 + }, + { + "epoch": 0.05795752601742956, + "grad_norm": 0.7845795750617981, + "learning_rate": 0.001, + "loss": 2.5862, + "step": 1370 + }, + { + "epoch": 0.057999830780945935, + "grad_norm": 0.2706119120121002, + "learning_rate": 0.001, + "loss": 2.4746, + "step": 1371 + }, + { + "epoch": 0.0580421355444623, + "grad_norm": 0.26624995470046997, + "learning_rate": 0.001, + "loss": 2.1325, + "step": 1372 + }, + { + "epoch": 0.05808444030797868, + "grad_norm": 1.2387701272964478, + "learning_rate": 0.001, + "loss": 3.0806, + "step": 1373 + }, + { + "epoch": 0.05812674507149505, + "grad_norm": 0.30188238620758057, + "learning_rate": 0.001, + "loss": 2.6137, + "step": 1374 + }, + { + "epoch": 0.05816904983501142, + "grad_norm": 0.47407734394073486, + "learning_rate": 0.001, + "loss": 2.949, + "step": 1375 + }, + { + "epoch": 0.0582113545985278, + "grad_norm": 13.492250442504883, + "learning_rate": 0.001, + "loss": 2.3764, + "step": 1376 + }, + { + "epoch": 0.058253659362044165, + "grad_norm": 1.4784618616104126, + "learning_rate": 0.001, + "loss": 2.6514, + "step": 1377 + }, + { + "epoch": 0.05829596412556054, + "grad_norm": 0.5186235904693604, + "learning_rate": 0.001, + "loss": 2.5127, + "step": 1378 + }, + { + "epoch": 0.05833826888907691, + "grad_norm": 0.33871689438819885, + "learning_rate": 0.001, + "loss": 1.9724, + "step": 1379 + }, + { + "epoch": 0.05838057365259328, + "grad_norm": 0.6185868382453918, + "learning_rate": 0.001, + "loss": 2.2309, + "step": 1380 + }, + { + "epoch": 0.05842287841610965, + "grad_norm": 0.3206985890865326, + "learning_rate": 0.001, + "loss": 2.2691, + "step": 1381 + }, + { + "epoch": 0.058465183179626026, + "grad_norm": 0.34144043922424316, + "learning_rate": 0.001, + "loss": 2.1018, + "step": 1382 + }, + { + "epoch": 0.058507487943142394, + "grad_norm": 2.2303543090820312, + "learning_rate": 0.001, + "loss": 2.0798, + "step": 1383 + }, + { + "epoch": 0.05854979270665877, + "grad_norm": 0.36561813950538635, + "learning_rate": 0.001, + "loss": 2.8894, + "step": 1384 + }, + { + "epoch": 0.058592097470175145, + "grad_norm": 0.5910583138465881, + "learning_rate": 0.001, + "loss": 2.6603, + "step": 1385 + }, + { + "epoch": 0.05863440223369151, + "grad_norm": 0.3511381149291992, + "learning_rate": 0.001, + "loss": 3.2612, + "step": 1386 + }, + { + "epoch": 0.05867670699720789, + "grad_norm": 1.0417660474777222, + "learning_rate": 0.001, + "loss": 2.0464, + "step": 1387 + }, + { + "epoch": 0.058719011760724256, + "grad_norm": 0.3064709007740021, + "learning_rate": 0.001, + "loss": 1.7901, + "step": 1388 + }, + { + "epoch": 0.05876131652424063, + "grad_norm": 0.6421626210212708, + "learning_rate": 0.001, + "loss": 2.4181, + "step": 1389 + }, + { + "epoch": 0.058803621287757, + "grad_norm": 0.5801759958267212, + "learning_rate": 0.001, + "loss": 1.9541, + "step": 1390 + }, + { + "epoch": 0.058845926051273374, + "grad_norm": 0.32130327820777893, + "learning_rate": 0.001, + "loss": 2.5807, + "step": 1391 + }, + { + "epoch": 0.05888823081478974, + "grad_norm": 0.31312790513038635, + "learning_rate": 0.001, + "loss": 3.0668, + "step": 1392 + }, + { + "epoch": 0.05893053557830612, + "grad_norm": 0.5199546813964844, + "learning_rate": 0.001, + "loss": 2.9197, + "step": 1393 + }, + { + "epoch": 0.05897284034182249, + "grad_norm": 0.45629438757896423, + "learning_rate": 0.001, + "loss": 1.9592, + "step": 1394 + }, + { + "epoch": 0.05901514510533886, + "grad_norm": 0.2890334129333496, + "learning_rate": 0.001, + "loss": 2.716, + "step": 1395 + }, + { + "epoch": 0.059057449868855236, + "grad_norm": 0.47239354252815247, + "learning_rate": 0.001, + "loss": 2.2804, + "step": 1396 + }, + { + "epoch": 0.059099754632371604, + "grad_norm": 0.989652693271637, + "learning_rate": 0.001, + "loss": 2.0827, + "step": 1397 + }, + { + "epoch": 0.05914205939588798, + "grad_norm": 0.40251368284225464, + "learning_rate": 0.001, + "loss": 1.9777, + "step": 1398 + }, + { + "epoch": 0.05918436415940435, + "grad_norm": 0.2854582965373993, + "learning_rate": 0.001, + "loss": 1.8564, + "step": 1399 + }, + { + "epoch": 0.05922666892292072, + "grad_norm": 0.4429548382759094, + "learning_rate": 0.001, + "loss": 3.2216, + "step": 1400 + }, + { + "epoch": 0.05926897368643709, + "grad_norm": 0.3628709316253662, + "learning_rate": 0.001, + "loss": 1.898, + "step": 1401 + }, + { + "epoch": 0.059311278449953465, + "grad_norm": 0.37840116024017334, + "learning_rate": 0.001, + "loss": 3.3639, + "step": 1402 + }, + { + "epoch": 0.059353583213469834, + "grad_norm": 1.245588779449463, + "learning_rate": 0.001, + "loss": 3.1546, + "step": 1403 + }, + { + "epoch": 0.05939588797698621, + "grad_norm": 0.304123193025589, + "learning_rate": 0.001, + "loss": 3.7022, + "step": 1404 + }, + { + "epoch": 0.059438192740502584, + "grad_norm": 0.4658697843551636, + "learning_rate": 0.001, + "loss": 2.8618, + "step": 1405 + }, + { + "epoch": 0.05948049750401895, + "grad_norm": 0.3281252682209015, + "learning_rate": 0.001, + "loss": 2.5643, + "step": 1406 + }, + { + "epoch": 0.05952280226753533, + "grad_norm": 0.28439781069755554, + "learning_rate": 0.001, + "loss": 3.376, + "step": 1407 + }, + { + "epoch": 0.059565107031051695, + "grad_norm": 0.6219123601913452, + "learning_rate": 0.001, + "loss": 4.5267, + "step": 1408 + }, + { + "epoch": 0.05960741179456807, + "grad_norm": 0.30366063117980957, + "learning_rate": 0.001, + "loss": 2.8077, + "step": 1409 + }, + { + "epoch": 0.05964971655808444, + "grad_norm": 0.2643471658229828, + "learning_rate": 0.001, + "loss": 2.5821, + "step": 1410 + }, + { + "epoch": 0.05969202132160081, + "grad_norm": 0.5798410177230835, + "learning_rate": 0.001, + "loss": 2.406, + "step": 1411 + }, + { + "epoch": 0.05973432608511718, + "grad_norm": 0.2849283218383789, + "learning_rate": 0.001, + "loss": 1.9369, + "step": 1412 + }, + { + "epoch": 0.059776630848633557, + "grad_norm": 0.35845881700515747, + "learning_rate": 0.001, + "loss": 2.6967, + "step": 1413 + }, + { + "epoch": 0.059818935612149925, + "grad_norm": 1.3240238428115845, + "learning_rate": 0.001, + "loss": 3.4985, + "step": 1414 + }, + { + "epoch": 0.0598612403756663, + "grad_norm": 0.2925819754600525, + "learning_rate": 0.001, + "loss": 2.6216, + "step": 1415 + }, + { + "epoch": 0.059903545139182675, + "grad_norm": 0.8406017422676086, + "learning_rate": 0.001, + "loss": 2.1828, + "step": 1416 + }, + { + "epoch": 0.05994584990269904, + "grad_norm": 0.344137966632843, + "learning_rate": 0.001, + "loss": 2.3818, + "step": 1417 + }, + { + "epoch": 0.05998815466621542, + "grad_norm": 0.3151387870311737, + "learning_rate": 0.001, + "loss": 2.53, + "step": 1418 + }, + { + "epoch": 0.060030459429731786, + "grad_norm": 0.3660660982131958, + "learning_rate": 0.001, + "loss": 2.4767, + "step": 1419 + }, + { + "epoch": 0.06007276419324816, + "grad_norm": 3.185765266418457, + "learning_rate": 0.001, + "loss": 1.7922, + "step": 1420 + }, + { + "epoch": 0.06011506895676453, + "grad_norm": 0.4721246659755707, + "learning_rate": 0.001, + "loss": 2.1482, + "step": 1421 + }, + { + "epoch": 0.060157373720280904, + "grad_norm": 0.281838983297348, + "learning_rate": 0.001, + "loss": 2.1035, + "step": 1422 + }, + { + "epoch": 0.06019967848379727, + "grad_norm": 0.5046917796134949, + "learning_rate": 0.001, + "loss": 2.3257, + "step": 1423 + }, + { + "epoch": 0.06024198324731365, + "grad_norm": 0.36857858300209045, + "learning_rate": 0.001, + "loss": 2.5842, + "step": 1424 + }, + { + "epoch": 0.06028428801083002, + "grad_norm": 0.6424447894096375, + "learning_rate": 0.001, + "loss": 2.5619, + "step": 1425 + }, + { + "epoch": 0.06032659277434639, + "grad_norm": 0.3190594017505646, + "learning_rate": 0.001, + "loss": 2.6482, + "step": 1426 + }, + { + "epoch": 0.060368897537862766, + "grad_norm": 0.3126913905143738, + "learning_rate": 0.001, + "loss": 3.0516, + "step": 1427 + }, + { + "epoch": 0.060411202301379134, + "grad_norm": 0.41238340735435486, + "learning_rate": 0.001, + "loss": 3.3109, + "step": 1428 + }, + { + "epoch": 0.06045350706489551, + "grad_norm": 0.44219493865966797, + "learning_rate": 0.001, + "loss": 3.6026, + "step": 1429 + }, + { + "epoch": 0.06049581182841188, + "grad_norm": 0.32209640741348267, + "learning_rate": 0.001, + "loss": 2.6428, + "step": 1430 + }, + { + "epoch": 0.06053811659192825, + "grad_norm": 0.39168989658355713, + "learning_rate": 0.001, + "loss": 3.429, + "step": 1431 + }, + { + "epoch": 0.06058042135544462, + "grad_norm": 0.27055805921554565, + "learning_rate": 0.001, + "loss": 3.8608, + "step": 1432 + }, + { + "epoch": 0.060622726118960996, + "grad_norm": 0.49823957681655884, + "learning_rate": 0.001, + "loss": 3.4317, + "step": 1433 + }, + { + "epoch": 0.060665030882477364, + "grad_norm": 0.32492002844810486, + "learning_rate": 0.001, + "loss": 2.4563, + "step": 1434 + }, + { + "epoch": 0.06070733564599374, + "grad_norm": 0.34951508045196533, + "learning_rate": 0.001, + "loss": 3.2943, + "step": 1435 + }, + { + "epoch": 0.060749640409510114, + "grad_norm": 0.4263649880886078, + "learning_rate": 0.001, + "loss": 2.6012, + "step": 1436 + }, + { + "epoch": 0.06079194517302648, + "grad_norm": 1.0415583848953247, + "learning_rate": 0.001, + "loss": 2.7861, + "step": 1437 + }, + { + "epoch": 0.06083424993654286, + "grad_norm": 0.331173837184906, + "learning_rate": 0.001, + "loss": 2.4296, + "step": 1438 + }, + { + "epoch": 0.060876554700059225, + "grad_norm": 0.88392573595047, + "learning_rate": 0.001, + "loss": 2.3683, + "step": 1439 + }, + { + "epoch": 0.0609188594635756, + "grad_norm": 1.8237016201019287, + "learning_rate": 0.001, + "loss": 2.9384, + "step": 1440 + }, + { + "epoch": 0.06096116422709197, + "grad_norm": 0.40476563572883606, + "learning_rate": 0.001, + "loss": 3.0706, + "step": 1441 + }, + { + "epoch": 0.061003468990608344, + "grad_norm": 2.381662607192993, + "learning_rate": 0.001, + "loss": 2.1478, + "step": 1442 + }, + { + "epoch": 0.06104577375412471, + "grad_norm": 0.3319908082485199, + "learning_rate": 0.001, + "loss": 3.0221, + "step": 1443 + }, + { + "epoch": 0.06108807851764109, + "grad_norm": 1.0969241857528687, + "learning_rate": 0.001, + "loss": 2.5479, + "step": 1444 + }, + { + "epoch": 0.061130383281157455, + "grad_norm": 0.510847806930542, + "learning_rate": 0.001, + "loss": 1.8381, + "step": 1445 + }, + { + "epoch": 0.06117268804467383, + "grad_norm": 0.5893344879150391, + "learning_rate": 0.001, + "loss": 2.661, + "step": 1446 + }, + { + "epoch": 0.061214992808190205, + "grad_norm": 0.36038708686828613, + "learning_rate": 0.001, + "loss": 3.1335, + "step": 1447 + }, + { + "epoch": 0.06125729757170657, + "grad_norm": 0.3727112412452698, + "learning_rate": 0.001, + "loss": 3.2217, + "step": 1448 + }, + { + "epoch": 0.06129960233522295, + "grad_norm": 0.31264758110046387, + "learning_rate": 0.001, + "loss": 2.0928, + "step": 1449 + }, + { + "epoch": 0.061341907098739316, + "grad_norm": 0.3144207298755646, + "learning_rate": 0.001, + "loss": 2.5247, + "step": 1450 + }, + { + "epoch": 0.06138421186225569, + "grad_norm": 0.5916879773139954, + "learning_rate": 0.001, + "loss": 3.4771, + "step": 1451 + }, + { + "epoch": 0.06142651662577206, + "grad_norm": 0.34228265285491943, + "learning_rate": 0.001, + "loss": 1.9271, + "step": 1452 + }, + { + "epoch": 0.061468821389288435, + "grad_norm": 0.31869953870773315, + "learning_rate": 0.001, + "loss": 1.9089, + "step": 1453 + }, + { + "epoch": 0.0615111261528048, + "grad_norm": 0.2902565598487854, + "learning_rate": 0.001, + "loss": 1.9665, + "step": 1454 + }, + { + "epoch": 0.06155343091632118, + "grad_norm": 0.8417028188705444, + "learning_rate": 0.001, + "loss": 2.1853, + "step": 1455 + }, + { + "epoch": 0.06159573567983755, + "grad_norm": 0.29936543107032776, + "learning_rate": 0.001, + "loss": 2.2901, + "step": 1456 + }, + { + "epoch": 0.06163804044335392, + "grad_norm": 1.0548635721206665, + "learning_rate": 0.001, + "loss": 3.0753, + "step": 1457 + }, + { + "epoch": 0.061680345206870296, + "grad_norm": 0.5437618494033813, + "learning_rate": 0.001, + "loss": 2.3487, + "step": 1458 + }, + { + "epoch": 0.061722649970386664, + "grad_norm": 0.35127514600753784, + "learning_rate": 0.001, + "loss": 2.761, + "step": 1459 + }, + { + "epoch": 0.06176495473390304, + "grad_norm": 0.3495043218135834, + "learning_rate": 0.001, + "loss": 2.1056, + "step": 1460 + }, + { + "epoch": 0.06180725949741941, + "grad_norm": 0.3617175817489624, + "learning_rate": 0.001, + "loss": 2.226, + "step": 1461 + }, + { + "epoch": 0.06184956426093578, + "grad_norm": 2.682561159133911, + "learning_rate": 0.001, + "loss": 2.2108, + "step": 1462 + }, + { + "epoch": 0.06189186902445215, + "grad_norm": 0.6340761184692383, + "learning_rate": 0.001, + "loss": 3.0965, + "step": 1463 + }, + { + "epoch": 0.061934173787968526, + "grad_norm": 0.43284621834754944, + "learning_rate": 0.001, + "loss": 3.3314, + "step": 1464 + }, + { + "epoch": 0.061976478551484894, + "grad_norm": 0.9071413278579712, + "learning_rate": 0.001, + "loss": 1.9998, + "step": 1465 + }, + { + "epoch": 0.06201878331500127, + "grad_norm": 0.32309386134147644, + "learning_rate": 0.001, + "loss": 1.6943, + "step": 1466 + }, + { + "epoch": 0.062061088078517644, + "grad_norm": 1.5545170307159424, + "learning_rate": 0.001, + "loss": 2.5362, + "step": 1467 + }, + { + "epoch": 0.06210339284203401, + "grad_norm": 0.4889759123325348, + "learning_rate": 0.001, + "loss": 2.4579, + "step": 1468 + }, + { + "epoch": 0.06214569760555039, + "grad_norm": 0.3919735252857208, + "learning_rate": 0.001, + "loss": 2.5643, + "step": 1469 + }, + { + "epoch": 0.062188002369066755, + "grad_norm": 0.8115111589431763, + "learning_rate": 0.001, + "loss": 2.6917, + "step": 1470 + }, + { + "epoch": 0.06223030713258313, + "grad_norm": 1.6906379461288452, + "learning_rate": 0.001, + "loss": 1.7673, + "step": 1471 + }, + { + "epoch": 0.0622726118960995, + "grad_norm": 0.33348721265792847, + "learning_rate": 0.001, + "loss": 1.9239, + "step": 1472 + }, + { + "epoch": 0.062314916659615874, + "grad_norm": 0.5403802990913391, + "learning_rate": 0.001, + "loss": 2.6563, + "step": 1473 + }, + { + "epoch": 0.06235722142313224, + "grad_norm": 0.6966021060943604, + "learning_rate": 0.001, + "loss": 1.9165, + "step": 1474 + }, + { + "epoch": 0.06239952618664862, + "grad_norm": 0.720427930355072, + "learning_rate": 0.001, + "loss": 2.4165, + "step": 1475 + }, + { + "epoch": 0.062441830950164985, + "grad_norm": 0.617408275604248, + "learning_rate": 0.001, + "loss": 3.6807, + "step": 1476 + }, + { + "epoch": 0.06248413571368136, + "grad_norm": 0.5452759265899658, + "learning_rate": 0.001, + "loss": 3.4443, + "step": 1477 + }, + { + "epoch": 0.06252644047719773, + "grad_norm": 0.4260646402835846, + "learning_rate": 0.001, + "loss": 3.0329, + "step": 1478 + }, + { + "epoch": 0.06256874524071411, + "grad_norm": 0.3713223338127136, + "learning_rate": 0.001, + "loss": 3.1337, + "step": 1479 + }, + { + "epoch": 0.06261105000423048, + "grad_norm": 0.4795995056629181, + "learning_rate": 0.001, + "loss": 2.6778, + "step": 1480 + }, + { + "epoch": 0.06265335476774685, + "grad_norm": 1.03213369846344, + "learning_rate": 0.001, + "loss": 2.7868, + "step": 1481 + }, + { + "epoch": 0.06269565953126321, + "grad_norm": 0.7824976444244385, + "learning_rate": 0.001, + "loss": 2.982, + "step": 1482 + }, + { + "epoch": 0.0627379642947796, + "grad_norm": 1.2638541460037231, + "learning_rate": 0.001, + "loss": 3.0771, + "step": 1483 + }, + { + "epoch": 0.06278026905829596, + "grad_norm": 1.2298089265823364, + "learning_rate": 0.001, + "loss": 2.0408, + "step": 1484 + }, + { + "epoch": 0.06282257382181233, + "grad_norm": 0.8441680073738098, + "learning_rate": 0.001, + "loss": 2.4082, + "step": 1485 + }, + { + "epoch": 0.0628648785853287, + "grad_norm": 0.34452885389328003, + "learning_rate": 0.001, + "loss": 2.2197, + "step": 1486 + }, + { + "epoch": 0.06290718334884508, + "grad_norm": 0.39541852474212646, + "learning_rate": 0.001, + "loss": 2.9002, + "step": 1487 + }, + { + "epoch": 0.06294948811236145, + "grad_norm": 0.49740156531333923, + "learning_rate": 0.001, + "loss": 2.4369, + "step": 1488 + }, + { + "epoch": 0.06299179287587782, + "grad_norm": 1.0628348588943481, + "learning_rate": 0.001, + "loss": 2.9808, + "step": 1489 + }, + { + "epoch": 0.0630340976393942, + "grad_norm": 0.3743760883808136, + "learning_rate": 0.001, + "loss": 2.8068, + "step": 1490 + }, + { + "epoch": 0.06307640240291057, + "grad_norm": 0.6638221740722656, + "learning_rate": 0.001, + "loss": 2.0364, + "step": 1491 + }, + { + "epoch": 0.06311870716642694, + "grad_norm": 0.34382185339927673, + "learning_rate": 0.001, + "loss": 2.6025, + "step": 1492 + }, + { + "epoch": 0.0631610119299433, + "grad_norm": 0.378578782081604, + "learning_rate": 0.001, + "loss": 2.762, + "step": 1493 + }, + { + "epoch": 0.06320331669345969, + "grad_norm": 0.38604044914245605, + "learning_rate": 0.001, + "loss": 2.0935, + "step": 1494 + }, + { + "epoch": 0.06324562145697606, + "grad_norm": 1.129136562347412, + "learning_rate": 0.001, + "loss": 2.8483, + "step": 1495 + }, + { + "epoch": 0.06328792622049242, + "grad_norm": 0.30852529406547546, + "learning_rate": 0.001, + "loss": 2.3716, + "step": 1496 + }, + { + "epoch": 0.0633302309840088, + "grad_norm": 2.0638482570648193, + "learning_rate": 0.001, + "loss": 2.6199, + "step": 1497 + }, + { + "epoch": 0.06337253574752517, + "grad_norm": 0.25943711400032043, + "learning_rate": 0.001, + "loss": 2.2946, + "step": 1498 + }, + { + "epoch": 0.06341484051104154, + "grad_norm": 0.31075453758239746, + "learning_rate": 0.001, + "loss": 2.1379, + "step": 1499 + }, + { + "epoch": 0.06345714527455791, + "grad_norm": 0.34476733207702637, + "learning_rate": 0.001, + "loss": 3.0886, + "step": 1500 + }, + { + "epoch": 0.06349945003807429, + "grad_norm": 0.3541569709777832, + "learning_rate": 0.001, + "loss": 2.0473, + "step": 1501 + }, + { + "epoch": 0.06354175480159066, + "grad_norm": 0.6460628509521484, + "learning_rate": 0.001, + "loss": 2.4141, + "step": 1502 + }, + { + "epoch": 0.06358405956510703, + "grad_norm": 0.30732765793800354, + "learning_rate": 0.001, + "loss": 2.4192, + "step": 1503 + }, + { + "epoch": 0.0636263643286234, + "grad_norm": 0.3573780655860901, + "learning_rate": 0.001, + "loss": 3.1885, + "step": 1504 + }, + { + "epoch": 0.06366866909213978, + "grad_norm": 0.26040375232696533, + "learning_rate": 0.001, + "loss": 1.8493, + "step": 1505 + }, + { + "epoch": 0.06371097385565615, + "grad_norm": 0.2959687411785126, + "learning_rate": 0.001, + "loss": 2.4594, + "step": 1506 + }, + { + "epoch": 0.06375327861917252, + "grad_norm": 0.3453410267829895, + "learning_rate": 0.001, + "loss": 3.036, + "step": 1507 + }, + { + "epoch": 0.0637955833826889, + "grad_norm": 0.35461366176605225, + "learning_rate": 0.001, + "loss": 3.1924, + "step": 1508 + }, + { + "epoch": 0.06383788814620527, + "grad_norm": 0.34828194975852966, + "learning_rate": 0.001, + "loss": 2.1851, + "step": 1509 + }, + { + "epoch": 0.06388019290972163, + "grad_norm": 2.283674955368042, + "learning_rate": 0.001, + "loss": 2.278, + "step": 1510 + }, + { + "epoch": 0.063922497673238, + "grad_norm": 0.3838748335838318, + "learning_rate": 0.001, + "loss": 2.2081, + "step": 1511 + }, + { + "epoch": 0.06396480243675438, + "grad_norm": 0.2536729574203491, + "learning_rate": 0.001, + "loss": 1.9966, + "step": 1512 + }, + { + "epoch": 0.06400710720027075, + "grad_norm": 3.2230656147003174, + "learning_rate": 0.001, + "loss": 2.9543, + "step": 1513 + }, + { + "epoch": 0.06404941196378712, + "grad_norm": 0.2882685661315918, + "learning_rate": 0.001, + "loss": 2.0072, + "step": 1514 + }, + { + "epoch": 0.06409171672730349, + "grad_norm": 0.34128835797309875, + "learning_rate": 0.001, + "loss": 2.4368, + "step": 1515 + }, + { + "epoch": 0.06413402149081987, + "grad_norm": 0.326494425535202, + "learning_rate": 0.001, + "loss": 2.5767, + "step": 1516 + }, + { + "epoch": 0.06417632625433624, + "grad_norm": 0.4839684069156647, + "learning_rate": 0.001, + "loss": 2.0286, + "step": 1517 + }, + { + "epoch": 0.0642186310178526, + "grad_norm": 0.43224430084228516, + "learning_rate": 0.001, + "loss": 3.4441, + "step": 1518 + }, + { + "epoch": 0.06426093578136899, + "grad_norm": 0.41439104080200195, + "learning_rate": 0.001, + "loss": 2.9004, + "step": 1519 + }, + { + "epoch": 0.06430324054488536, + "grad_norm": 0.35360193252563477, + "learning_rate": 0.001, + "loss": 2.659, + "step": 1520 + }, + { + "epoch": 0.06434554530840172, + "grad_norm": 0.25587019324302673, + "learning_rate": 0.001, + "loss": 2.0744, + "step": 1521 + }, + { + "epoch": 0.06438785007191809, + "grad_norm": 0.2605937719345093, + "learning_rate": 0.001, + "loss": 2.4958, + "step": 1522 + }, + { + "epoch": 0.06443015483543447, + "grad_norm": 0.26113244891166687, + "learning_rate": 0.001, + "loss": 2.1607, + "step": 1523 + }, + { + "epoch": 0.06447245959895084, + "grad_norm": 0.3792099356651306, + "learning_rate": 0.001, + "loss": 2.3428, + "step": 1524 + }, + { + "epoch": 0.06451476436246721, + "grad_norm": 0.44862857460975647, + "learning_rate": 0.001, + "loss": 2.1672, + "step": 1525 + }, + { + "epoch": 0.06455706912598358, + "grad_norm": 0.2584918737411499, + "learning_rate": 0.001, + "loss": 2.2928, + "step": 1526 + }, + { + "epoch": 0.06459937388949996, + "grad_norm": 0.2768254280090332, + "learning_rate": 0.001, + "loss": 1.6838, + "step": 1527 + }, + { + "epoch": 0.06464167865301633, + "grad_norm": 0.30833518505096436, + "learning_rate": 0.001, + "loss": 1.8115, + "step": 1528 + }, + { + "epoch": 0.0646839834165327, + "grad_norm": 0.8284555077552795, + "learning_rate": 0.001, + "loss": 2.5659, + "step": 1529 + }, + { + "epoch": 0.06472628818004908, + "grad_norm": 0.5380930304527283, + "learning_rate": 0.001, + "loss": 2.6105, + "step": 1530 + }, + { + "epoch": 0.06476859294356545, + "grad_norm": 0.5618930459022522, + "learning_rate": 0.001, + "loss": 2.2628, + "step": 1531 + }, + { + "epoch": 0.06481089770708182, + "grad_norm": 0.27346742153167725, + "learning_rate": 0.001, + "loss": 2.1463, + "step": 1532 + }, + { + "epoch": 0.06485320247059818, + "grad_norm": 0.30352386832237244, + "learning_rate": 0.001, + "loss": 3.6644, + "step": 1533 + }, + { + "epoch": 0.06489550723411457, + "grad_norm": 0.31220725178718567, + "learning_rate": 0.001, + "loss": 2.6265, + "step": 1534 + }, + { + "epoch": 0.06493781199763093, + "grad_norm": 0.27084285020828247, + "learning_rate": 0.001, + "loss": 2.3142, + "step": 1535 + }, + { + "epoch": 0.0649801167611473, + "grad_norm": 0.5739223957061768, + "learning_rate": 0.001, + "loss": 2.2806, + "step": 1536 + }, + { + "epoch": 0.06502242152466367, + "grad_norm": 0.7535676956176758, + "learning_rate": 0.001, + "loss": 2.3798, + "step": 1537 + }, + { + "epoch": 0.06506472628818005, + "grad_norm": 0.27977368235588074, + "learning_rate": 0.001, + "loss": 2.7091, + "step": 1538 + }, + { + "epoch": 0.06510703105169642, + "grad_norm": 0.6584524512290955, + "learning_rate": 0.001, + "loss": 2.9503, + "step": 1539 + }, + { + "epoch": 0.06514933581521279, + "grad_norm": 0.34777507185935974, + "learning_rate": 0.001, + "loss": 2.8689, + "step": 1540 + }, + { + "epoch": 0.06519164057872917, + "grad_norm": 0.5156388282775879, + "learning_rate": 0.001, + "loss": 2.4906, + "step": 1541 + }, + { + "epoch": 0.06523394534224554, + "grad_norm": 0.28216788172721863, + "learning_rate": 0.001, + "loss": 1.8898, + "step": 1542 + }, + { + "epoch": 0.06527625010576191, + "grad_norm": 1.3255605697631836, + "learning_rate": 0.001, + "loss": 1.7489, + "step": 1543 + }, + { + "epoch": 0.06531855486927828, + "grad_norm": 0.28111153841018677, + "learning_rate": 0.001, + "loss": 2.2489, + "step": 1544 + }, + { + "epoch": 0.06536085963279466, + "grad_norm": 0.2950698435306549, + "learning_rate": 0.001, + "loss": 1.6588, + "step": 1545 + }, + { + "epoch": 0.06540316439631103, + "grad_norm": 0.2627398371696472, + "learning_rate": 0.001, + "loss": 2.2555, + "step": 1546 + }, + { + "epoch": 0.0654454691598274, + "grad_norm": 0.545285701751709, + "learning_rate": 0.001, + "loss": 2.238, + "step": 1547 + }, + { + "epoch": 0.06548777392334376, + "grad_norm": 0.2821153402328491, + "learning_rate": 0.001, + "loss": 2.5748, + "step": 1548 + }, + { + "epoch": 0.06553007868686014, + "grad_norm": 0.32898885011672974, + "learning_rate": 0.001, + "loss": 2.6032, + "step": 1549 + }, + { + "epoch": 0.06557238345037651, + "grad_norm": 0.6069597601890564, + "learning_rate": 0.001, + "loss": 2.3282, + "step": 1550 + }, + { + "epoch": 0.06561468821389288, + "grad_norm": 0.33791178464889526, + "learning_rate": 0.001, + "loss": 2.3444, + "step": 1551 + }, + { + "epoch": 0.06565699297740926, + "grad_norm": 0.630935788154602, + "learning_rate": 0.001, + "loss": 2.574, + "step": 1552 + }, + { + "epoch": 0.06569929774092563, + "grad_norm": 0.35226690769195557, + "learning_rate": 0.001, + "loss": 2.3944, + "step": 1553 + }, + { + "epoch": 0.065741602504442, + "grad_norm": 0.254507452249527, + "learning_rate": 0.001, + "loss": 2.4492, + "step": 1554 + }, + { + "epoch": 0.06578390726795837, + "grad_norm": 1.1583757400512695, + "learning_rate": 0.001, + "loss": 2.0208, + "step": 1555 + }, + { + "epoch": 0.06582621203147475, + "grad_norm": 0.27121299505233765, + "learning_rate": 0.001, + "loss": 2.2889, + "step": 1556 + }, + { + "epoch": 0.06586851679499112, + "grad_norm": 0.6531930565834045, + "learning_rate": 0.001, + "loss": 3.3624, + "step": 1557 + }, + { + "epoch": 0.06591082155850748, + "grad_norm": 0.42395731806755066, + "learning_rate": 0.001, + "loss": 2.3959, + "step": 1558 + }, + { + "epoch": 0.06595312632202387, + "grad_norm": 0.3243398368358612, + "learning_rate": 0.001, + "loss": 2.0876, + "step": 1559 + }, + { + "epoch": 0.06599543108554023, + "grad_norm": 0.3197888135910034, + "learning_rate": 0.001, + "loss": 2.2143, + "step": 1560 + }, + { + "epoch": 0.0660377358490566, + "grad_norm": 0.33776190876960754, + "learning_rate": 0.001, + "loss": 2.6401, + "step": 1561 + }, + { + "epoch": 0.06608004061257297, + "grad_norm": 0.34774449467658997, + "learning_rate": 0.001, + "loss": 2.5507, + "step": 1562 + }, + { + "epoch": 0.06612234537608935, + "grad_norm": 0.5661643147468567, + "learning_rate": 0.001, + "loss": 2.9336, + "step": 1563 + }, + { + "epoch": 0.06616465013960572, + "grad_norm": 0.3465413749217987, + "learning_rate": 0.001, + "loss": 1.8256, + "step": 1564 + }, + { + "epoch": 0.06620695490312209, + "grad_norm": 0.3140278458595276, + "learning_rate": 0.001, + "loss": 2.3683, + "step": 1565 + }, + { + "epoch": 0.06624925966663846, + "grad_norm": 0.6140979528427124, + "learning_rate": 0.001, + "loss": 2.4981, + "step": 1566 + }, + { + "epoch": 0.06629156443015484, + "grad_norm": 0.36587056517601013, + "learning_rate": 0.001, + "loss": 2.1976, + "step": 1567 + }, + { + "epoch": 0.06633386919367121, + "grad_norm": 0.2791414260864258, + "learning_rate": 0.001, + "loss": 2.1657, + "step": 1568 + }, + { + "epoch": 0.06637617395718758, + "grad_norm": 0.974829375743866, + "learning_rate": 0.001, + "loss": 2.604, + "step": 1569 + }, + { + "epoch": 0.06641847872070396, + "grad_norm": 0.3969157934188843, + "learning_rate": 0.001, + "loss": 3.1704, + "step": 1570 + }, + { + "epoch": 0.06646078348422033, + "grad_norm": 0.31570157408714294, + "learning_rate": 0.001, + "loss": 2.6029, + "step": 1571 + }, + { + "epoch": 0.0665030882477367, + "grad_norm": 0.5296083688735962, + "learning_rate": 0.001, + "loss": 2.375, + "step": 1572 + }, + { + "epoch": 0.06654539301125306, + "grad_norm": 0.8200310468673706, + "learning_rate": 0.001, + "loss": 1.9998, + "step": 1573 + }, + { + "epoch": 0.06658769777476944, + "grad_norm": 0.3001185953617096, + "learning_rate": 0.001, + "loss": 2.5948, + "step": 1574 + }, + { + "epoch": 0.06663000253828581, + "grad_norm": 0.7405486106872559, + "learning_rate": 0.001, + "loss": 3.2701, + "step": 1575 + }, + { + "epoch": 0.06667230730180218, + "grad_norm": 1.4987202882766724, + "learning_rate": 0.001, + "loss": 3.5723, + "step": 1576 + }, + { + "epoch": 0.06671461206531855, + "grad_norm": 0.7644462585449219, + "learning_rate": 0.001, + "loss": 2.1558, + "step": 1577 + }, + { + "epoch": 0.06675691682883493, + "grad_norm": 0.3527809977531433, + "learning_rate": 0.001, + "loss": 2.5948, + "step": 1578 + }, + { + "epoch": 0.0667992215923513, + "grad_norm": 1.6607075929641724, + "learning_rate": 0.001, + "loss": 2.2302, + "step": 1579 + }, + { + "epoch": 0.06684152635586767, + "grad_norm": 0.42727428674697876, + "learning_rate": 0.001, + "loss": 2.4521, + "step": 1580 + }, + { + "epoch": 0.06688383111938405, + "grad_norm": 0.5329534411430359, + "learning_rate": 0.001, + "loss": 2.3643, + "step": 1581 + }, + { + "epoch": 0.06692613588290042, + "grad_norm": 0.32826146483421326, + "learning_rate": 0.001, + "loss": 3.578, + "step": 1582 + }, + { + "epoch": 0.06696844064641679, + "grad_norm": 0.5298377275466919, + "learning_rate": 0.001, + "loss": 2.7983, + "step": 1583 + }, + { + "epoch": 0.06701074540993315, + "grad_norm": 0.3430579900741577, + "learning_rate": 0.001, + "loss": 1.9132, + "step": 1584 + }, + { + "epoch": 0.06705305017344954, + "grad_norm": 0.31380248069763184, + "learning_rate": 0.001, + "loss": 2.7339, + "step": 1585 + }, + { + "epoch": 0.0670953549369659, + "grad_norm": 0.4660479724407196, + "learning_rate": 0.001, + "loss": 2.183, + "step": 1586 + }, + { + "epoch": 0.06713765970048227, + "grad_norm": 1.5428110361099243, + "learning_rate": 0.001, + "loss": 2.3808, + "step": 1587 + }, + { + "epoch": 0.06717996446399864, + "grad_norm": 1.2535374164581299, + "learning_rate": 0.001, + "loss": 2.1735, + "step": 1588 + }, + { + "epoch": 0.06722226922751502, + "grad_norm": 0.438996285200119, + "learning_rate": 0.001, + "loss": 2.3571, + "step": 1589 + }, + { + "epoch": 0.06726457399103139, + "grad_norm": 0.3860979378223419, + "learning_rate": 0.001, + "loss": 3.1402, + "step": 1590 + }, + { + "epoch": 0.06730687875454776, + "grad_norm": 0.32492074370384216, + "learning_rate": 0.001, + "loss": 2.5053, + "step": 1591 + }, + { + "epoch": 0.06734918351806414, + "grad_norm": 0.4713243246078491, + "learning_rate": 0.001, + "loss": 3.3592, + "step": 1592 + }, + { + "epoch": 0.06739148828158051, + "grad_norm": 0.30050158500671387, + "learning_rate": 0.001, + "loss": 2.1694, + "step": 1593 + }, + { + "epoch": 0.06743379304509688, + "grad_norm": 0.34924885630607605, + "learning_rate": 0.001, + "loss": 1.9499, + "step": 1594 + }, + { + "epoch": 0.06747609780861324, + "grad_norm": 0.4567183256149292, + "learning_rate": 0.001, + "loss": 2.2653, + "step": 1595 + }, + { + "epoch": 0.06751840257212963, + "grad_norm": 4.099420070648193, + "learning_rate": 0.001, + "loss": 2.9475, + "step": 1596 + }, + { + "epoch": 0.067560707335646, + "grad_norm": 0.3680780827999115, + "learning_rate": 0.001, + "loss": 2.3772, + "step": 1597 + }, + { + "epoch": 0.06760301209916236, + "grad_norm": 3.2965519428253174, + "learning_rate": 0.001, + "loss": 1.8148, + "step": 1598 + }, + { + "epoch": 0.06764531686267873, + "grad_norm": 0.4864022433757782, + "learning_rate": 0.001, + "loss": 2.3886, + "step": 1599 + }, + { + "epoch": 0.06768762162619511, + "grad_norm": 0.5703839063644409, + "learning_rate": 0.001, + "loss": 2.1813, + "step": 1600 + }, + { + "epoch": 0.06772992638971148, + "grad_norm": 0.28068217635154724, + "learning_rate": 0.001, + "loss": 2.4237, + "step": 1601 + }, + { + "epoch": 0.06777223115322785, + "grad_norm": 0.27317923307418823, + "learning_rate": 0.001, + "loss": 2.2093, + "step": 1602 + }, + { + "epoch": 0.06781453591674423, + "grad_norm": 0.3868054151535034, + "learning_rate": 0.001, + "loss": 2.4154, + "step": 1603 + }, + { + "epoch": 0.0678568406802606, + "grad_norm": 0.36695969104766846, + "learning_rate": 0.001, + "loss": 2.3344, + "step": 1604 + }, + { + "epoch": 0.06789914544377697, + "grad_norm": 0.37811872363090515, + "learning_rate": 0.001, + "loss": 1.7114, + "step": 1605 + }, + { + "epoch": 0.06794145020729334, + "grad_norm": 0.3852631449699402, + "learning_rate": 0.001, + "loss": 3.0529, + "step": 1606 + }, + { + "epoch": 0.06798375497080972, + "grad_norm": 0.9019683003425598, + "learning_rate": 0.001, + "loss": 2.0123, + "step": 1607 + }, + { + "epoch": 0.06802605973432609, + "grad_norm": 1.809870958328247, + "learning_rate": 0.001, + "loss": 2.5859, + "step": 1608 + }, + { + "epoch": 0.06806836449784245, + "grad_norm": 0.31933197379112244, + "learning_rate": 0.001, + "loss": 2.0505, + "step": 1609 + }, + { + "epoch": 0.06811066926135882, + "grad_norm": 0.37392449378967285, + "learning_rate": 0.001, + "loss": 3.3611, + "step": 1610 + }, + { + "epoch": 0.0681529740248752, + "grad_norm": 0.3455560803413391, + "learning_rate": 0.001, + "loss": 2.3174, + "step": 1611 + }, + { + "epoch": 0.06819527878839157, + "grad_norm": 0.5688546299934387, + "learning_rate": 0.001, + "loss": 2.2655, + "step": 1612 + }, + { + "epoch": 0.06823758355190794, + "grad_norm": 0.6909453868865967, + "learning_rate": 0.001, + "loss": 1.8641, + "step": 1613 + }, + { + "epoch": 0.06827988831542432, + "grad_norm": 0.31672239303588867, + "learning_rate": 0.001, + "loss": 1.9912, + "step": 1614 + }, + { + "epoch": 0.06832219307894069, + "grad_norm": 0.8048245310783386, + "learning_rate": 0.001, + "loss": 2.0888, + "step": 1615 + }, + { + "epoch": 0.06836449784245706, + "grad_norm": 0.28497985005378723, + "learning_rate": 0.001, + "loss": 2.5377, + "step": 1616 + }, + { + "epoch": 0.06840680260597343, + "grad_norm": 0.8506115674972534, + "learning_rate": 0.001, + "loss": 2.3879, + "step": 1617 + }, + { + "epoch": 0.06844910736948981, + "grad_norm": 1.2071772813796997, + "learning_rate": 0.001, + "loss": 2.7813, + "step": 1618 + }, + { + "epoch": 0.06849141213300618, + "grad_norm": 0.37529999017715454, + "learning_rate": 0.001, + "loss": 2.6804, + "step": 1619 + }, + { + "epoch": 0.06853371689652255, + "grad_norm": 0.5811559557914734, + "learning_rate": 0.001, + "loss": 2.3751, + "step": 1620 + }, + { + "epoch": 0.06857602166003893, + "grad_norm": 1.8456511497497559, + "learning_rate": 0.001, + "loss": 2.3837, + "step": 1621 + }, + { + "epoch": 0.0686183264235553, + "grad_norm": 0.7621955871582031, + "learning_rate": 0.001, + "loss": 2.696, + "step": 1622 + }, + { + "epoch": 0.06866063118707166, + "grad_norm": 0.8707615733146667, + "learning_rate": 0.001, + "loss": 2.1565, + "step": 1623 + }, + { + "epoch": 0.06870293595058803, + "grad_norm": 0.6202014684677124, + "learning_rate": 0.001, + "loss": 1.9679, + "step": 1624 + }, + { + "epoch": 0.06874524071410441, + "grad_norm": 0.38461774587631226, + "learning_rate": 0.001, + "loss": 2.1663, + "step": 1625 + }, + { + "epoch": 0.06878754547762078, + "grad_norm": 0.3248027265071869, + "learning_rate": 0.001, + "loss": 2.6196, + "step": 1626 + }, + { + "epoch": 0.06882985024113715, + "grad_norm": 0.5045840740203857, + "learning_rate": 0.001, + "loss": 4.0164, + "step": 1627 + }, + { + "epoch": 0.06887215500465352, + "grad_norm": 0.31521186232566833, + "learning_rate": 0.001, + "loss": 1.885, + "step": 1628 + }, + { + "epoch": 0.0689144597681699, + "grad_norm": 0.3150392770767212, + "learning_rate": 0.001, + "loss": 2.2383, + "step": 1629 + }, + { + "epoch": 0.06895676453168627, + "grad_norm": 0.45550379157066345, + "learning_rate": 0.001, + "loss": 2.2817, + "step": 1630 + }, + { + "epoch": 0.06899906929520264, + "grad_norm": 0.30408960580825806, + "learning_rate": 0.001, + "loss": 2.213, + "step": 1631 + }, + { + "epoch": 0.06904137405871902, + "grad_norm": 0.36207836866378784, + "learning_rate": 0.001, + "loss": 2.593, + "step": 1632 + }, + { + "epoch": 0.06908367882223539, + "grad_norm": 0.37449923157691956, + "learning_rate": 0.001, + "loss": 2.0836, + "step": 1633 + }, + { + "epoch": 0.06912598358575175, + "grad_norm": 0.29214954376220703, + "learning_rate": 0.001, + "loss": 2.2552, + "step": 1634 + }, + { + "epoch": 0.06916828834926812, + "grad_norm": 0.8513506650924683, + "learning_rate": 0.001, + "loss": 1.975, + "step": 1635 + }, + { + "epoch": 0.0692105931127845, + "grad_norm": 0.5883151888847351, + "learning_rate": 0.001, + "loss": 3.0927, + "step": 1636 + }, + { + "epoch": 0.06925289787630087, + "grad_norm": 0.350691020488739, + "learning_rate": 0.001, + "loss": 2.0645, + "step": 1637 + }, + { + "epoch": 0.06929520263981724, + "grad_norm": 0.8605084419250488, + "learning_rate": 0.001, + "loss": 1.8086, + "step": 1638 + }, + { + "epoch": 0.06933750740333361, + "grad_norm": 1.2016950845718384, + "learning_rate": 0.001, + "loss": 1.9914, + "step": 1639 + }, + { + "epoch": 0.06937981216684999, + "grad_norm": 0.2884148955345154, + "learning_rate": 0.001, + "loss": 2.1365, + "step": 1640 + }, + { + "epoch": 0.06942211693036636, + "grad_norm": 0.337169349193573, + "learning_rate": 0.001, + "loss": 2.0075, + "step": 1641 + }, + { + "epoch": 0.06946442169388273, + "grad_norm": 0.6522495150566101, + "learning_rate": 0.001, + "loss": 3.0513, + "step": 1642 + }, + { + "epoch": 0.06950672645739911, + "grad_norm": 0.44939300417900085, + "learning_rate": 0.001, + "loss": 2.8989, + "step": 1643 + }, + { + "epoch": 0.06954903122091548, + "grad_norm": 0.30578479170799255, + "learning_rate": 0.001, + "loss": 3.2959, + "step": 1644 + }, + { + "epoch": 0.06959133598443185, + "grad_norm": 45.22916793823242, + "learning_rate": 0.001, + "loss": 2.4688, + "step": 1645 + }, + { + "epoch": 0.06963364074794821, + "grad_norm": 0.2697926461696625, + "learning_rate": 0.001, + "loss": 3.0138, + "step": 1646 + }, + { + "epoch": 0.0696759455114646, + "grad_norm": 0.32543253898620605, + "learning_rate": 0.001, + "loss": 2.884, + "step": 1647 + }, + { + "epoch": 0.06971825027498096, + "grad_norm": 1.5244373083114624, + "learning_rate": 0.001, + "loss": 2.605, + "step": 1648 + }, + { + "epoch": 0.06976055503849733, + "grad_norm": 0.2659820318222046, + "learning_rate": 0.001, + "loss": 2.1016, + "step": 1649 + }, + { + "epoch": 0.0698028598020137, + "grad_norm": 0.30054983496665955, + "learning_rate": 0.001, + "loss": 2.4681, + "step": 1650 + }, + { + "epoch": 0.06984516456553008, + "grad_norm": 0.3234836161136627, + "learning_rate": 0.001, + "loss": 1.9912, + "step": 1651 + }, + { + "epoch": 0.06988746932904645, + "grad_norm": 0.27745023369789124, + "learning_rate": 0.001, + "loss": 2.2214, + "step": 1652 + }, + { + "epoch": 0.06992977409256282, + "grad_norm": 0.6211910247802734, + "learning_rate": 0.001, + "loss": 2.1174, + "step": 1653 + }, + { + "epoch": 0.0699720788560792, + "grad_norm": 6.255242824554443, + "learning_rate": 0.001, + "loss": 2.5594, + "step": 1654 + }, + { + "epoch": 0.07001438361959557, + "grad_norm": 0.3245622217655182, + "learning_rate": 0.001, + "loss": 1.9634, + "step": 1655 + }, + { + "epoch": 0.07005668838311194, + "grad_norm": 0.4616556167602539, + "learning_rate": 0.001, + "loss": 2.2543, + "step": 1656 + }, + { + "epoch": 0.0700989931466283, + "grad_norm": 0.2637854516506195, + "learning_rate": 0.001, + "loss": 2.29, + "step": 1657 + }, + { + "epoch": 0.07014129791014469, + "grad_norm": 0.9866844415664673, + "learning_rate": 0.001, + "loss": 2.8994, + "step": 1658 + }, + { + "epoch": 0.07018360267366106, + "grad_norm": 0.7608810663223267, + "learning_rate": 0.001, + "loss": 2.2228, + "step": 1659 + }, + { + "epoch": 0.07022590743717742, + "grad_norm": 0.31685671210289, + "learning_rate": 0.001, + "loss": 2.2006, + "step": 1660 + }, + { + "epoch": 0.07026821220069379, + "grad_norm": 1.3476111888885498, + "learning_rate": 0.001, + "loss": 3.0012, + "step": 1661 + }, + { + "epoch": 0.07031051696421017, + "grad_norm": 5.591330051422119, + "learning_rate": 0.001, + "loss": 2.1398, + "step": 1662 + }, + { + "epoch": 0.07035282172772654, + "grad_norm": 0.6403492093086243, + "learning_rate": 0.001, + "loss": 3.7632, + "step": 1663 + }, + { + "epoch": 0.07039512649124291, + "grad_norm": 0.4311082661151886, + "learning_rate": 0.001, + "loss": 2.0528, + "step": 1664 + }, + { + "epoch": 0.07043743125475929, + "grad_norm": 0.9901355504989624, + "learning_rate": 0.001, + "loss": 2.5533, + "step": 1665 + }, + { + "epoch": 0.07047973601827566, + "grad_norm": 2.0582873821258545, + "learning_rate": 0.001, + "loss": 3.0359, + "step": 1666 + }, + { + "epoch": 0.07052204078179203, + "grad_norm": 0.428562194108963, + "learning_rate": 0.001, + "loss": 3.0728, + "step": 1667 + }, + { + "epoch": 0.0705643455453084, + "grad_norm": 0.2768668532371521, + "learning_rate": 0.001, + "loss": 1.7863, + "step": 1668 + }, + { + "epoch": 0.07060665030882478, + "grad_norm": 0.593871533870697, + "learning_rate": 0.001, + "loss": 2.8978, + "step": 1669 + }, + { + "epoch": 0.07064895507234115, + "grad_norm": 0.41889962553977966, + "learning_rate": 0.001, + "loss": 2.3901, + "step": 1670 + }, + { + "epoch": 0.07069125983585751, + "grad_norm": 0.8739703297615051, + "learning_rate": 0.001, + "loss": 2.9089, + "step": 1671 + }, + { + "epoch": 0.07073356459937388, + "grad_norm": 0.3771756589412689, + "learning_rate": 0.001, + "loss": 2.338, + "step": 1672 + }, + { + "epoch": 0.07077586936289026, + "grad_norm": 0.29926592111587524, + "learning_rate": 0.001, + "loss": 3.1151, + "step": 1673 + }, + { + "epoch": 0.07081817412640663, + "grad_norm": 0.43917933106422424, + "learning_rate": 0.001, + "loss": 2.3074, + "step": 1674 + }, + { + "epoch": 0.070860478889923, + "grad_norm": 4.316728591918945, + "learning_rate": 0.001, + "loss": 2.0206, + "step": 1675 + }, + { + "epoch": 0.07090278365343938, + "grad_norm": 0.5438099503517151, + "learning_rate": 0.001, + "loss": 2.57, + "step": 1676 + }, + { + "epoch": 0.07094508841695575, + "grad_norm": 0.4939629137516022, + "learning_rate": 0.001, + "loss": 2.4069, + "step": 1677 + }, + { + "epoch": 0.07098739318047212, + "grad_norm": 0.4457460343837738, + "learning_rate": 0.001, + "loss": 2.7809, + "step": 1678 + }, + { + "epoch": 0.07102969794398849, + "grad_norm": 0.45648688077926636, + "learning_rate": 0.001, + "loss": 2.7326, + "step": 1679 + }, + { + "epoch": 0.07107200270750487, + "grad_norm": 0.3010053038597107, + "learning_rate": 0.001, + "loss": 2.2902, + "step": 1680 + }, + { + "epoch": 0.07111430747102124, + "grad_norm": 0.42142006754875183, + "learning_rate": 0.001, + "loss": 1.9107, + "step": 1681 + }, + { + "epoch": 0.0711566122345376, + "grad_norm": 0.3928402364253998, + "learning_rate": 0.001, + "loss": 3.0004, + "step": 1682 + }, + { + "epoch": 0.07119891699805399, + "grad_norm": 0.28071898221969604, + "learning_rate": 0.001, + "loss": 2.289, + "step": 1683 + }, + { + "epoch": 0.07124122176157036, + "grad_norm": 0.3247803747653961, + "learning_rate": 0.001, + "loss": 2.2083, + "step": 1684 + }, + { + "epoch": 0.07128352652508672, + "grad_norm": 0.5170036554336548, + "learning_rate": 0.001, + "loss": 2.1761, + "step": 1685 + }, + { + "epoch": 0.07132583128860309, + "grad_norm": 1.5233675241470337, + "learning_rate": 0.001, + "loss": 2.3741, + "step": 1686 + }, + { + "epoch": 0.07136813605211947, + "grad_norm": 0.33674490451812744, + "learning_rate": 0.001, + "loss": 2.2987, + "step": 1687 + }, + { + "epoch": 0.07141044081563584, + "grad_norm": 3.1619656085968018, + "learning_rate": 0.001, + "loss": 1.9981, + "step": 1688 + }, + { + "epoch": 0.07145274557915221, + "grad_norm": 0.3328041732311249, + "learning_rate": 0.001, + "loss": 3.4872, + "step": 1689 + }, + { + "epoch": 0.07149505034266858, + "grad_norm": 0.3385929465293884, + "learning_rate": 0.001, + "loss": 2.4917, + "step": 1690 + }, + { + "epoch": 0.07153735510618496, + "grad_norm": 0.3292979300022125, + "learning_rate": 0.001, + "loss": 1.7646, + "step": 1691 + }, + { + "epoch": 0.07157965986970133, + "grad_norm": 9.468561172485352, + "learning_rate": 0.001, + "loss": 3.2305, + "step": 1692 + }, + { + "epoch": 0.0716219646332177, + "grad_norm": 0.339837908744812, + "learning_rate": 0.001, + "loss": 4.1912, + "step": 1693 + }, + { + "epoch": 0.07166426939673408, + "grad_norm": 0.7992456555366516, + "learning_rate": 0.001, + "loss": 1.7365, + "step": 1694 + }, + { + "epoch": 0.07170657416025045, + "grad_norm": 0.3533322513103485, + "learning_rate": 0.001, + "loss": 1.6018, + "step": 1695 + }, + { + "epoch": 0.07174887892376682, + "grad_norm": 0.4883526563644409, + "learning_rate": 0.001, + "loss": 2.5816, + "step": 1696 + }, + { + "epoch": 0.07179118368728318, + "grad_norm": 0.3907789885997772, + "learning_rate": 0.001, + "loss": 1.906, + "step": 1697 + }, + { + "epoch": 0.07183348845079957, + "grad_norm": 0.5203762054443359, + "learning_rate": 0.001, + "loss": 2.2438, + "step": 1698 + }, + { + "epoch": 0.07187579321431593, + "grad_norm": 1.2718244791030884, + "learning_rate": 0.001, + "loss": 2.1247, + "step": 1699 + }, + { + "epoch": 0.0719180979778323, + "grad_norm": 0.3142852485179901, + "learning_rate": 0.001, + "loss": 2.0239, + "step": 1700 + }, + { + "epoch": 0.07196040274134867, + "grad_norm": 0.4279821515083313, + "learning_rate": 0.001, + "loss": 2.2428, + "step": 1701 + }, + { + "epoch": 0.07200270750486505, + "grad_norm": 0.6683199405670166, + "learning_rate": 0.001, + "loss": 2.4512, + "step": 1702 + }, + { + "epoch": 0.07204501226838142, + "grad_norm": 2.1548545360565186, + "learning_rate": 0.001, + "loss": 2.6076, + "step": 1703 + }, + { + "epoch": 0.07208731703189779, + "grad_norm": 0.5022167563438416, + "learning_rate": 0.001, + "loss": 2.6147, + "step": 1704 + }, + { + "epoch": 0.07212962179541417, + "grad_norm": 4.610467910766602, + "learning_rate": 0.001, + "loss": 2.8257, + "step": 1705 + }, + { + "epoch": 0.07217192655893054, + "grad_norm": 0.35856351256370544, + "learning_rate": 0.001, + "loss": 2.0853, + "step": 1706 + }, + { + "epoch": 0.0722142313224469, + "grad_norm": 0.45485618710517883, + "learning_rate": 0.001, + "loss": 2.5217, + "step": 1707 + }, + { + "epoch": 0.07225653608596327, + "grad_norm": 0.3743135333061218, + "learning_rate": 0.001, + "loss": 2.2795, + "step": 1708 + }, + { + "epoch": 0.07229884084947966, + "grad_norm": 0.286580890417099, + "learning_rate": 0.001, + "loss": 2.0651, + "step": 1709 + }, + { + "epoch": 0.07234114561299602, + "grad_norm": 0.574466347694397, + "learning_rate": 0.001, + "loss": 2.1261, + "step": 1710 + }, + { + "epoch": 0.07238345037651239, + "grad_norm": 0.35179415345191956, + "learning_rate": 0.001, + "loss": 2.6743, + "step": 1711 + }, + { + "epoch": 0.07242575514002876, + "grad_norm": 0.3282983601093292, + "learning_rate": 0.001, + "loss": 2.7438, + "step": 1712 + }, + { + "epoch": 0.07246805990354514, + "grad_norm": 0.41283783316612244, + "learning_rate": 0.001, + "loss": 2.4672, + "step": 1713 + }, + { + "epoch": 0.07251036466706151, + "grad_norm": 0.4331008791923523, + "learning_rate": 0.001, + "loss": 2.4509, + "step": 1714 + }, + { + "epoch": 0.07255266943057788, + "grad_norm": 0.4134253263473511, + "learning_rate": 0.001, + "loss": 2.7329, + "step": 1715 + }, + { + "epoch": 0.07259497419409426, + "grad_norm": 0.3073793649673462, + "learning_rate": 0.001, + "loss": 2.2086, + "step": 1716 + }, + { + "epoch": 0.07263727895761063, + "grad_norm": 0.3595843017101288, + "learning_rate": 0.001, + "loss": 2.2972, + "step": 1717 + }, + { + "epoch": 0.072679583721127, + "grad_norm": 1.8754022121429443, + "learning_rate": 0.001, + "loss": 2.6782, + "step": 1718 + }, + { + "epoch": 0.07272188848464337, + "grad_norm": 0.2694728374481201, + "learning_rate": 0.001, + "loss": 2.5075, + "step": 1719 + }, + { + "epoch": 0.07276419324815975, + "grad_norm": 0.266966849565506, + "learning_rate": 0.001, + "loss": 2.7838, + "step": 1720 + }, + { + "epoch": 0.07280649801167612, + "grad_norm": 0.6708036661148071, + "learning_rate": 0.001, + "loss": 2.5704, + "step": 1721 + }, + { + "epoch": 0.07284880277519248, + "grad_norm": 0.32971417903900146, + "learning_rate": 0.001, + "loss": 2.1911, + "step": 1722 + }, + { + "epoch": 0.07289110753870885, + "grad_norm": 0.5026300549507141, + "learning_rate": 0.001, + "loss": 2.1365, + "step": 1723 + }, + { + "epoch": 0.07293341230222523, + "grad_norm": 0.6025309562683105, + "learning_rate": 0.001, + "loss": 2.7638, + "step": 1724 + }, + { + "epoch": 0.0729757170657416, + "grad_norm": 0.35378944873809814, + "learning_rate": 0.001, + "loss": 2.5012, + "step": 1725 + }, + { + "epoch": 0.07301802182925797, + "grad_norm": 0.2955608665943146, + "learning_rate": 0.001, + "loss": 3.3105, + "step": 1726 + }, + { + "epoch": 0.07306032659277435, + "grad_norm": 1.2950598001480103, + "learning_rate": 0.001, + "loss": 1.4412, + "step": 1727 + }, + { + "epoch": 0.07310263135629072, + "grad_norm": 0.34854191541671753, + "learning_rate": 0.001, + "loss": 2.1458, + "step": 1728 + }, + { + "epoch": 0.07314493611980709, + "grad_norm": 0.2604203224182129, + "learning_rate": 0.001, + "loss": 2.5409, + "step": 1729 + }, + { + "epoch": 0.07318724088332346, + "grad_norm": 0.28911593556404114, + "learning_rate": 0.001, + "loss": 2.3813, + "step": 1730 + }, + { + "epoch": 0.07322954564683984, + "grad_norm": 0.4041121304035187, + "learning_rate": 0.001, + "loss": 2.1355, + "step": 1731 + }, + { + "epoch": 0.0732718504103562, + "grad_norm": 3.5361766815185547, + "learning_rate": 0.001, + "loss": 2.6676, + "step": 1732 + }, + { + "epoch": 0.07331415517387257, + "grad_norm": 0.38924169540405273, + "learning_rate": 0.001, + "loss": 2.1344, + "step": 1733 + }, + { + "epoch": 0.07335645993738894, + "grad_norm": 0.36835405230522156, + "learning_rate": 0.001, + "loss": 2.3473, + "step": 1734 + }, + { + "epoch": 0.07339876470090533, + "grad_norm": 2.3047337532043457, + "learning_rate": 0.001, + "loss": 2.1468, + "step": 1735 + }, + { + "epoch": 0.0734410694644217, + "grad_norm": 0.47445395588874817, + "learning_rate": 0.001, + "loss": 2.6338, + "step": 1736 + }, + { + "epoch": 0.07348337422793806, + "grad_norm": 0.5361562967300415, + "learning_rate": 0.001, + "loss": 2.8039, + "step": 1737 + }, + { + "epoch": 0.07352567899145444, + "grad_norm": 0.362362802028656, + "learning_rate": 0.001, + "loss": 1.9508, + "step": 1738 + }, + { + "epoch": 0.07356798375497081, + "grad_norm": 1.4122487306594849, + "learning_rate": 0.001, + "loss": 2.8017, + "step": 1739 + }, + { + "epoch": 0.07361028851848718, + "grad_norm": 0.5565959215164185, + "learning_rate": 0.001, + "loss": 2.2053, + "step": 1740 + }, + { + "epoch": 0.07365259328200355, + "grad_norm": 0.3117307722568512, + "learning_rate": 0.001, + "loss": 2.1428, + "step": 1741 + }, + { + "epoch": 0.07369489804551993, + "grad_norm": 0.4470238983631134, + "learning_rate": 0.001, + "loss": 3.3104, + "step": 1742 + }, + { + "epoch": 0.0737372028090363, + "grad_norm": 0.31840646266937256, + "learning_rate": 0.001, + "loss": 3.3071, + "step": 1743 + }, + { + "epoch": 0.07377950757255267, + "grad_norm": 0.29003921151161194, + "learning_rate": 0.001, + "loss": 1.8832, + "step": 1744 + }, + { + "epoch": 0.07382181233606905, + "grad_norm": 0.2651999890804291, + "learning_rate": 0.001, + "loss": 2.1535, + "step": 1745 + }, + { + "epoch": 0.07386411709958542, + "grad_norm": 0.34940871596336365, + "learning_rate": 0.001, + "loss": 2.5327, + "step": 1746 + }, + { + "epoch": 0.07390642186310178, + "grad_norm": 0.6055050492286682, + "learning_rate": 0.001, + "loss": 2.8376, + "step": 1747 + }, + { + "epoch": 0.07394872662661815, + "grad_norm": 0.372227281332016, + "learning_rate": 0.001, + "loss": 2.1704, + "step": 1748 + }, + { + "epoch": 0.07399103139013453, + "grad_norm": 0.31508174538612366, + "learning_rate": 0.001, + "loss": 2.3462, + "step": 1749 + }, + { + "epoch": 0.0740333361536509, + "grad_norm": 0.40334755182266235, + "learning_rate": 0.001, + "loss": 2.2007, + "step": 1750 + }, + { + "epoch": 0.07407564091716727, + "grad_norm": 0.23412242531776428, + "learning_rate": 0.001, + "loss": 1.9882, + "step": 1751 + }, + { + "epoch": 0.07411794568068364, + "grad_norm": 0.47561314702033997, + "learning_rate": 0.001, + "loss": 2.3195, + "step": 1752 + }, + { + "epoch": 0.07416025044420002, + "grad_norm": 0.3126150965690613, + "learning_rate": 0.001, + "loss": 3.2531, + "step": 1753 + }, + { + "epoch": 0.07420255520771639, + "grad_norm": 0.6425087451934814, + "learning_rate": 0.001, + "loss": 1.549, + "step": 1754 + }, + { + "epoch": 0.07424485997123276, + "grad_norm": 0.3294941484928131, + "learning_rate": 0.001, + "loss": 3.2919, + "step": 1755 + }, + { + "epoch": 0.07428716473474914, + "grad_norm": 0.32722482085227966, + "learning_rate": 0.001, + "loss": 2.3305, + "step": 1756 + }, + { + "epoch": 0.07432946949826551, + "grad_norm": 0.30937889218330383, + "learning_rate": 0.001, + "loss": 1.9746, + "step": 1757 + }, + { + "epoch": 0.07437177426178188, + "grad_norm": 1.4851760864257812, + "learning_rate": 0.001, + "loss": 2.4941, + "step": 1758 + }, + { + "epoch": 0.07441407902529824, + "grad_norm": 0.2669662833213806, + "learning_rate": 0.001, + "loss": 2.0385, + "step": 1759 + }, + { + "epoch": 0.07445638378881463, + "grad_norm": 0.44635874032974243, + "learning_rate": 0.001, + "loss": 2.5292, + "step": 1760 + }, + { + "epoch": 0.074498688552331, + "grad_norm": 0.28977322578430176, + "learning_rate": 0.001, + "loss": 2.7165, + "step": 1761 + }, + { + "epoch": 0.07454099331584736, + "grad_norm": 0.25588658452033997, + "learning_rate": 0.001, + "loss": 2.1741, + "step": 1762 + }, + { + "epoch": 0.07458329807936373, + "grad_norm": 0.28512755036354065, + "learning_rate": 0.001, + "loss": 2.8152, + "step": 1763 + }, + { + "epoch": 0.07462560284288011, + "grad_norm": 0.8499886393547058, + "learning_rate": 0.001, + "loss": 2.7018, + "step": 1764 + }, + { + "epoch": 0.07466790760639648, + "grad_norm": 0.5321717262268066, + "learning_rate": 0.001, + "loss": 2.3566, + "step": 1765 + }, + { + "epoch": 0.07471021236991285, + "grad_norm": 0.3150945007801056, + "learning_rate": 0.001, + "loss": 2.2778, + "step": 1766 + }, + { + "epoch": 0.07475251713342923, + "grad_norm": 1.8098750114440918, + "learning_rate": 0.001, + "loss": 2.2745, + "step": 1767 + }, + { + "epoch": 0.0747948218969456, + "grad_norm": 0.2686639726161957, + "learning_rate": 0.001, + "loss": 3.0812, + "step": 1768 + }, + { + "epoch": 0.07483712666046197, + "grad_norm": 0.3387921154499054, + "learning_rate": 0.001, + "loss": 2.8251, + "step": 1769 + }, + { + "epoch": 0.07487943142397833, + "grad_norm": 0.5623756647109985, + "learning_rate": 0.001, + "loss": 2.115, + "step": 1770 + }, + { + "epoch": 0.07492173618749472, + "grad_norm": 0.2617107927799225, + "learning_rate": 0.001, + "loss": 2.468, + "step": 1771 + }, + { + "epoch": 0.07496404095101109, + "grad_norm": 0.6408830285072327, + "learning_rate": 0.001, + "loss": 2.4926, + "step": 1772 + }, + { + "epoch": 0.07500634571452745, + "grad_norm": 0.38034066557884216, + "learning_rate": 0.001, + "loss": 2.867, + "step": 1773 + }, + { + "epoch": 0.07504865047804382, + "grad_norm": 0.39489755034446716, + "learning_rate": 0.001, + "loss": 3.4283, + "step": 1774 + }, + { + "epoch": 0.0750909552415602, + "grad_norm": 2.249427318572998, + "learning_rate": 0.001, + "loss": 2.2082, + "step": 1775 + }, + { + "epoch": 0.07513326000507657, + "grad_norm": 4.038248062133789, + "learning_rate": 0.001, + "loss": 3.7159, + "step": 1776 + }, + { + "epoch": 0.07517556476859294, + "grad_norm": 0.28461596369743347, + "learning_rate": 0.001, + "loss": 2.4496, + "step": 1777 + }, + { + "epoch": 0.07521786953210932, + "grad_norm": 0.3818615674972534, + "learning_rate": 0.001, + "loss": 3.797, + "step": 1778 + }, + { + "epoch": 0.07526017429562569, + "grad_norm": 0.3321216404438019, + "learning_rate": 0.001, + "loss": 2.7859, + "step": 1779 + }, + { + "epoch": 0.07530247905914206, + "grad_norm": 0.394345223903656, + "learning_rate": 0.001, + "loss": 2.7472, + "step": 1780 + }, + { + "epoch": 0.07534478382265843, + "grad_norm": 0.32292380928993225, + "learning_rate": 0.001, + "loss": 2.2914, + "step": 1781 + }, + { + "epoch": 0.07538708858617481, + "grad_norm": 0.24881044030189514, + "learning_rate": 0.001, + "loss": 1.5237, + "step": 1782 + }, + { + "epoch": 0.07542939334969118, + "grad_norm": 0.2507427930831909, + "learning_rate": 0.001, + "loss": 2.339, + "step": 1783 + }, + { + "epoch": 0.07547169811320754, + "grad_norm": 0.340909868478775, + "learning_rate": 0.001, + "loss": 3.373, + "step": 1784 + }, + { + "epoch": 0.07551400287672391, + "grad_norm": 0.32472652196884155, + "learning_rate": 0.001, + "loss": 2.0487, + "step": 1785 + }, + { + "epoch": 0.0755563076402403, + "grad_norm": 0.3497520983219147, + "learning_rate": 0.001, + "loss": 3.3913, + "step": 1786 + }, + { + "epoch": 0.07559861240375666, + "grad_norm": 1.4078129529953003, + "learning_rate": 0.001, + "loss": 2.1573, + "step": 1787 + }, + { + "epoch": 0.07564091716727303, + "grad_norm": 0.5234339237213135, + "learning_rate": 0.001, + "loss": 3.5763, + "step": 1788 + }, + { + "epoch": 0.07568322193078941, + "grad_norm": 0.32184237241744995, + "learning_rate": 0.001, + "loss": 2.3567, + "step": 1789 + }, + { + "epoch": 0.07572552669430578, + "grad_norm": 0.32312852144241333, + "learning_rate": 0.001, + "loss": 1.9989, + "step": 1790 + }, + { + "epoch": 0.07576783145782215, + "grad_norm": 0.47937512397766113, + "learning_rate": 0.001, + "loss": 3.2448, + "step": 1791 + }, + { + "epoch": 0.07581013622133852, + "grad_norm": 0.6073363423347473, + "learning_rate": 0.001, + "loss": 2.1826, + "step": 1792 + }, + { + "epoch": 0.0758524409848549, + "grad_norm": 0.24768002331256866, + "learning_rate": 0.001, + "loss": 2.2067, + "step": 1793 + }, + { + "epoch": 0.07589474574837127, + "grad_norm": 0.7116523385047913, + "learning_rate": 0.001, + "loss": 2.6321, + "step": 1794 + }, + { + "epoch": 0.07593705051188764, + "grad_norm": 0.3589756488800049, + "learning_rate": 0.001, + "loss": 3.1839, + "step": 1795 + }, + { + "epoch": 0.075979355275404, + "grad_norm": 0.3133421540260315, + "learning_rate": 0.001, + "loss": 2.097, + "step": 1796 + }, + { + "epoch": 0.07602166003892039, + "grad_norm": 0.28984275460243225, + "learning_rate": 0.001, + "loss": 3.0274, + "step": 1797 + }, + { + "epoch": 0.07606396480243675, + "grad_norm": 0.3593312203884125, + "learning_rate": 0.001, + "loss": 2.0219, + "step": 1798 + }, + { + "epoch": 0.07610626956595312, + "grad_norm": 0.4126046299934387, + "learning_rate": 0.001, + "loss": 1.9161, + "step": 1799 + }, + { + "epoch": 0.0761485743294695, + "grad_norm": 0.7341791987419128, + "learning_rate": 0.001, + "loss": 2.4186, + "step": 1800 + }, + { + "epoch": 0.07619087909298587, + "grad_norm": 0.27145180106163025, + "learning_rate": 0.001, + "loss": 1.9538, + "step": 1801 + }, + { + "epoch": 0.07623318385650224, + "grad_norm": 0.5113809108734131, + "learning_rate": 0.001, + "loss": 2.2766, + "step": 1802 + }, + { + "epoch": 0.07627548862001861, + "grad_norm": 0.27664855122566223, + "learning_rate": 0.001, + "loss": 2.3126, + "step": 1803 + }, + { + "epoch": 0.07631779338353499, + "grad_norm": 0.23161768913269043, + "learning_rate": 0.001, + "loss": 1.7487, + "step": 1804 + }, + { + "epoch": 0.07636009814705136, + "grad_norm": 1.3314400911331177, + "learning_rate": 0.001, + "loss": 2.0632, + "step": 1805 + }, + { + "epoch": 0.07640240291056773, + "grad_norm": 0.3629261553287506, + "learning_rate": 0.001, + "loss": 2.7344, + "step": 1806 + }, + { + "epoch": 0.07644470767408411, + "grad_norm": 0.2677980363368988, + "learning_rate": 0.001, + "loss": 1.9242, + "step": 1807 + }, + { + "epoch": 0.07648701243760048, + "grad_norm": 0.26410120725631714, + "learning_rate": 0.001, + "loss": 2.2882, + "step": 1808 + }, + { + "epoch": 0.07652931720111684, + "grad_norm": 1.109229326248169, + "learning_rate": 0.001, + "loss": 2.2232, + "step": 1809 + }, + { + "epoch": 0.07657162196463321, + "grad_norm": 0.533427357673645, + "learning_rate": 0.001, + "loss": 2.5592, + "step": 1810 + }, + { + "epoch": 0.0766139267281496, + "grad_norm": 0.32029929757118225, + "learning_rate": 0.001, + "loss": 2.2293, + "step": 1811 + }, + { + "epoch": 0.07665623149166596, + "grad_norm": 0.3942636251449585, + "learning_rate": 0.001, + "loss": 2.5888, + "step": 1812 + }, + { + "epoch": 0.07669853625518233, + "grad_norm": 0.28876644372940063, + "learning_rate": 0.001, + "loss": 2.1704, + "step": 1813 + }, + { + "epoch": 0.0767408410186987, + "grad_norm": 0.30077075958251953, + "learning_rate": 0.001, + "loss": 3.1774, + "step": 1814 + }, + { + "epoch": 0.07678314578221508, + "grad_norm": 0.6673346161842346, + "learning_rate": 0.001, + "loss": 2.055, + "step": 1815 + }, + { + "epoch": 0.07682545054573145, + "grad_norm": 0.2500164806842804, + "learning_rate": 0.001, + "loss": 2.6555, + "step": 1816 + }, + { + "epoch": 0.07686775530924782, + "grad_norm": 0.2818383574485779, + "learning_rate": 0.001, + "loss": 2.6336, + "step": 1817 + }, + { + "epoch": 0.0769100600727642, + "grad_norm": 0.4379955232143402, + "learning_rate": 0.001, + "loss": 1.9602, + "step": 1818 + }, + { + "epoch": 0.07695236483628057, + "grad_norm": 0.27242425084114075, + "learning_rate": 0.001, + "loss": 2.0259, + "step": 1819 + }, + { + "epoch": 0.07699466959979694, + "grad_norm": 0.2450658231973648, + "learning_rate": 0.001, + "loss": 2.2079, + "step": 1820 + }, + { + "epoch": 0.0770369743633133, + "grad_norm": 0.3834247291088104, + "learning_rate": 0.001, + "loss": 1.8011, + "step": 1821 + }, + { + "epoch": 0.07707927912682969, + "grad_norm": 0.3078935742378235, + "learning_rate": 0.001, + "loss": 2.2373, + "step": 1822 + }, + { + "epoch": 0.07712158389034605, + "grad_norm": 0.31686580181121826, + "learning_rate": 0.001, + "loss": 2.4674, + "step": 1823 + }, + { + "epoch": 0.07716388865386242, + "grad_norm": 0.28138643503189087, + "learning_rate": 0.001, + "loss": 1.9822, + "step": 1824 + }, + { + "epoch": 0.07720619341737879, + "grad_norm": 0.6008328795433044, + "learning_rate": 0.001, + "loss": 2.6718, + "step": 1825 + }, + { + "epoch": 0.07724849818089517, + "grad_norm": 0.3610805869102478, + "learning_rate": 0.001, + "loss": 3.121, + "step": 1826 + }, + { + "epoch": 0.07729080294441154, + "grad_norm": 0.44791126251220703, + "learning_rate": 0.001, + "loss": 2.4597, + "step": 1827 + }, + { + "epoch": 0.07733310770792791, + "grad_norm": 0.6357278823852539, + "learning_rate": 0.001, + "loss": 2.6391, + "step": 1828 + }, + { + "epoch": 0.07737541247144429, + "grad_norm": 0.47252053022384644, + "learning_rate": 0.001, + "loss": 2.4176, + "step": 1829 + }, + { + "epoch": 0.07741771723496066, + "grad_norm": 0.3132396340370178, + "learning_rate": 0.001, + "loss": 2.105, + "step": 1830 + }, + { + "epoch": 0.07746002199847703, + "grad_norm": 0.2682265341281891, + "learning_rate": 0.001, + "loss": 2.9813, + "step": 1831 + }, + { + "epoch": 0.0775023267619934, + "grad_norm": 0.44940343499183655, + "learning_rate": 0.001, + "loss": 1.8856, + "step": 1832 + }, + { + "epoch": 0.07754463152550978, + "grad_norm": 0.2868255376815796, + "learning_rate": 0.001, + "loss": 2.9346, + "step": 1833 + }, + { + "epoch": 0.07758693628902615, + "grad_norm": 0.3130701780319214, + "learning_rate": 0.001, + "loss": 2.0693, + "step": 1834 + }, + { + "epoch": 0.07762924105254251, + "grad_norm": 0.3155618906021118, + "learning_rate": 0.001, + "loss": 2.8602, + "step": 1835 + }, + { + "epoch": 0.07767154581605888, + "grad_norm": 0.3293514847755432, + "learning_rate": 0.001, + "loss": 2.2591, + "step": 1836 + }, + { + "epoch": 0.07771385057957526, + "grad_norm": 0.2850898504257202, + "learning_rate": 0.001, + "loss": 2.0444, + "step": 1837 + }, + { + "epoch": 0.07775615534309163, + "grad_norm": 0.8841787576675415, + "learning_rate": 0.001, + "loss": 2.7438, + "step": 1838 + }, + { + "epoch": 0.077798460106608, + "grad_norm": 0.4035090506076813, + "learning_rate": 0.001, + "loss": 2.5178, + "step": 1839 + }, + { + "epoch": 0.07784076487012438, + "grad_norm": 0.2477097362279892, + "learning_rate": 0.001, + "loss": 2.2061, + "step": 1840 + }, + { + "epoch": 0.07788306963364075, + "grad_norm": 0.32553189992904663, + "learning_rate": 0.001, + "loss": 2.4148, + "step": 1841 + }, + { + "epoch": 0.07792537439715712, + "grad_norm": 0.3204468786716461, + "learning_rate": 0.001, + "loss": 3.0719, + "step": 1842 + }, + { + "epoch": 0.07796767916067349, + "grad_norm": 0.30373045802116394, + "learning_rate": 0.001, + "loss": 2.8824, + "step": 1843 + }, + { + "epoch": 0.07800998392418987, + "grad_norm": 2.558303117752075, + "learning_rate": 0.001, + "loss": 3.2411, + "step": 1844 + }, + { + "epoch": 0.07805228868770624, + "grad_norm": 0.3434048295021057, + "learning_rate": 0.001, + "loss": 2.8942, + "step": 1845 + }, + { + "epoch": 0.0780945934512226, + "grad_norm": 0.30965694785118103, + "learning_rate": 0.001, + "loss": 3.0761, + "step": 1846 + }, + { + "epoch": 0.07813689821473897, + "grad_norm": 0.3516073226928711, + "learning_rate": 0.001, + "loss": 2.8424, + "step": 1847 + }, + { + "epoch": 0.07817920297825535, + "grad_norm": 0.3779698312282562, + "learning_rate": 0.001, + "loss": 2.8813, + "step": 1848 + }, + { + "epoch": 0.07822150774177172, + "grad_norm": 0.30136117339134216, + "learning_rate": 0.001, + "loss": 1.8542, + "step": 1849 + }, + { + "epoch": 0.07826381250528809, + "grad_norm": 0.25570622086524963, + "learning_rate": 0.001, + "loss": 2.0118, + "step": 1850 + }, + { + "epoch": 0.07830611726880447, + "grad_norm": 0.45675748586654663, + "learning_rate": 0.001, + "loss": 2.1994, + "step": 1851 + }, + { + "epoch": 0.07834842203232084, + "grad_norm": 0.5405208468437195, + "learning_rate": 0.001, + "loss": 2.8306, + "step": 1852 + }, + { + "epoch": 0.07839072679583721, + "grad_norm": 0.28319215774536133, + "learning_rate": 0.001, + "loss": 2.2087, + "step": 1853 + }, + { + "epoch": 0.07843303155935358, + "grad_norm": 0.5920930504798889, + "learning_rate": 0.001, + "loss": 2.1219, + "step": 1854 + }, + { + "epoch": 0.07847533632286996, + "grad_norm": 0.6683090329170227, + "learning_rate": 0.001, + "loss": 2.048, + "step": 1855 + }, + { + "epoch": 0.07851764108638633, + "grad_norm": 0.28934597969055176, + "learning_rate": 0.001, + "loss": 2.4109, + "step": 1856 + }, + { + "epoch": 0.0785599458499027, + "grad_norm": 1.1490424871444702, + "learning_rate": 0.001, + "loss": 2.8251, + "step": 1857 + }, + { + "epoch": 0.07860225061341906, + "grad_norm": 0.5859717130661011, + "learning_rate": 0.001, + "loss": 2.2749, + "step": 1858 + }, + { + "epoch": 0.07864455537693545, + "grad_norm": 0.614921510219574, + "learning_rate": 0.001, + "loss": 2.7844, + "step": 1859 + }, + { + "epoch": 0.07868686014045181, + "grad_norm": 0.3503010869026184, + "learning_rate": 0.001, + "loss": 2.4979, + "step": 1860 + }, + { + "epoch": 0.07872916490396818, + "grad_norm": 0.42067447304725647, + "learning_rate": 0.001, + "loss": 2.9724, + "step": 1861 + }, + { + "epoch": 0.07877146966748456, + "grad_norm": 0.45878085494041443, + "learning_rate": 0.001, + "loss": 1.98, + "step": 1862 + }, + { + "epoch": 0.07881377443100093, + "grad_norm": 1.4618866443634033, + "learning_rate": 0.001, + "loss": 1.9614, + "step": 1863 + }, + { + "epoch": 0.0788560791945173, + "grad_norm": 0.33975115418434143, + "learning_rate": 0.001, + "loss": 2.7163, + "step": 1864 + }, + { + "epoch": 0.07889838395803367, + "grad_norm": 27.44309425354004, + "learning_rate": 0.001, + "loss": 3.4932, + "step": 1865 + }, + { + "epoch": 0.07894068872155005, + "grad_norm": 0.4054664373397827, + "learning_rate": 0.001, + "loss": 2.5312, + "step": 1866 + }, + { + "epoch": 0.07898299348506642, + "grad_norm": 0.3030809462070465, + "learning_rate": 0.001, + "loss": 2.5558, + "step": 1867 + }, + { + "epoch": 0.07902529824858279, + "grad_norm": 0.40705251693725586, + "learning_rate": 0.001, + "loss": 3.12, + "step": 1868 + }, + { + "epoch": 0.07906760301209917, + "grad_norm": 16.77686882019043, + "learning_rate": 0.001, + "loss": 3.2163, + "step": 1869 + }, + { + "epoch": 0.07910990777561554, + "grad_norm": 1.4358597993850708, + "learning_rate": 0.001, + "loss": 3.0532, + "step": 1870 + }, + { + "epoch": 0.0791522125391319, + "grad_norm": 0.3366057872772217, + "learning_rate": 0.001, + "loss": 2.595, + "step": 1871 + }, + { + "epoch": 0.07919451730264827, + "grad_norm": 0.3896106779575348, + "learning_rate": 0.001, + "loss": 2.9058, + "step": 1872 + }, + { + "epoch": 0.07923682206616466, + "grad_norm": 0.34666091203689575, + "learning_rate": 0.001, + "loss": 1.9996, + "step": 1873 + }, + { + "epoch": 0.07927912682968102, + "grad_norm": 0.34630462527275085, + "learning_rate": 0.001, + "loss": 2.6526, + "step": 1874 + }, + { + "epoch": 0.07932143159319739, + "grad_norm": 0.43369296193122864, + "learning_rate": 0.001, + "loss": 3.2299, + "step": 1875 + }, + { + "epoch": 0.07936373635671376, + "grad_norm": 0.323746919631958, + "learning_rate": 0.001, + "loss": 2.6891, + "step": 1876 + }, + { + "epoch": 0.07940604112023014, + "grad_norm": 3.8398473262786865, + "learning_rate": 0.001, + "loss": 4.0965, + "step": 1877 + }, + { + "epoch": 0.07944834588374651, + "grad_norm": 0.5422288775444031, + "learning_rate": 0.001, + "loss": 2.9482, + "step": 1878 + }, + { + "epoch": 0.07949065064726288, + "grad_norm": 0.701977014541626, + "learning_rate": 0.001, + "loss": 2.8954, + "step": 1879 + }, + { + "epoch": 0.07953295541077926, + "grad_norm": 0.7025458216667175, + "learning_rate": 0.001, + "loss": 2.7868, + "step": 1880 + }, + { + "epoch": 0.07957526017429563, + "grad_norm": 0.8017560243606567, + "learning_rate": 0.001, + "loss": 1.9359, + "step": 1881 + }, + { + "epoch": 0.079617564937812, + "grad_norm": 0.45160236954689026, + "learning_rate": 0.001, + "loss": 2.2321, + "step": 1882 + }, + { + "epoch": 0.07965986970132836, + "grad_norm": 0.4654178023338318, + "learning_rate": 0.001, + "loss": 3.4246, + "step": 1883 + }, + { + "epoch": 0.07970217446484475, + "grad_norm": 0.3444591164588928, + "learning_rate": 0.001, + "loss": 2.8039, + "step": 1884 + }, + { + "epoch": 0.07974447922836111, + "grad_norm": 1.3339020013809204, + "learning_rate": 0.001, + "loss": 3.6179, + "step": 1885 + }, + { + "epoch": 0.07978678399187748, + "grad_norm": 0.23645636439323425, + "learning_rate": 0.001, + "loss": 1.7768, + "step": 1886 + }, + { + "epoch": 0.07982908875539385, + "grad_norm": 0.3817785680294037, + "learning_rate": 0.001, + "loss": 2.7453, + "step": 1887 + }, + { + "epoch": 0.07987139351891023, + "grad_norm": 0.41465362906455994, + "learning_rate": 0.001, + "loss": 3.0675, + "step": 1888 + }, + { + "epoch": 0.0799136982824266, + "grad_norm": 0.37037500739097595, + "learning_rate": 0.001, + "loss": 2.3173, + "step": 1889 + }, + { + "epoch": 0.07995600304594297, + "grad_norm": 0.31565260887145996, + "learning_rate": 0.001, + "loss": 2.4285, + "step": 1890 + }, + { + "epoch": 0.07999830780945935, + "grad_norm": 6.897204399108887, + "learning_rate": 0.001, + "loss": 2.5709, + "step": 1891 + }, + { + "epoch": 0.08004061257297572, + "grad_norm": 1.544342279434204, + "learning_rate": 0.001, + "loss": 2.1807, + "step": 1892 + }, + { + "epoch": 0.08008291733649209, + "grad_norm": 1.1234506368637085, + "learning_rate": 0.001, + "loss": 2.368, + "step": 1893 + }, + { + "epoch": 0.08012522210000846, + "grad_norm": 0.2884044051170349, + "learning_rate": 0.001, + "loss": 1.9447, + "step": 1894 + }, + { + "epoch": 0.08016752686352484, + "grad_norm": 0.38709041476249695, + "learning_rate": 0.001, + "loss": 2.1735, + "step": 1895 + }, + { + "epoch": 0.0802098316270412, + "grad_norm": 41.802032470703125, + "learning_rate": 0.001, + "loss": 1.8604, + "step": 1896 + }, + { + "epoch": 0.08025213639055757, + "grad_norm": 0.44042840600013733, + "learning_rate": 0.001, + "loss": 1.6504, + "step": 1897 + }, + { + "epoch": 0.08029444115407394, + "grad_norm": 1.7440584897994995, + "learning_rate": 0.001, + "loss": 2.5165, + "step": 1898 + }, + { + "epoch": 0.08033674591759032, + "grad_norm": 1.517181396484375, + "learning_rate": 0.001, + "loss": 2.1762, + "step": 1899 + }, + { + "epoch": 0.08037905068110669, + "grad_norm": 0.5725038051605225, + "learning_rate": 0.001, + "loss": 2.5455, + "step": 1900 + }, + { + "epoch": 0.08042135544462306, + "grad_norm": 0.45639267563819885, + "learning_rate": 0.001, + "loss": 2.379, + "step": 1901 + }, + { + "epoch": 0.08046366020813944, + "grad_norm": 1.4812275171279907, + "learning_rate": 0.001, + "loss": 2.4805, + "step": 1902 + }, + { + "epoch": 0.08050596497165581, + "grad_norm": 0.3487553000450134, + "learning_rate": 0.001, + "loss": 1.9803, + "step": 1903 + }, + { + "epoch": 0.08054826973517218, + "grad_norm": 0.36132708191871643, + "learning_rate": 0.001, + "loss": 2.0808, + "step": 1904 + }, + { + "epoch": 0.08059057449868855, + "grad_norm": 0.3573780357837677, + "learning_rate": 0.001, + "loss": 1.9821, + "step": 1905 + }, + { + "epoch": 0.08063287926220493, + "grad_norm": 2.551344156265259, + "learning_rate": 0.001, + "loss": 2.4851, + "step": 1906 + }, + { + "epoch": 0.0806751840257213, + "grad_norm": 1.2544894218444824, + "learning_rate": 0.001, + "loss": 2.0348, + "step": 1907 + }, + { + "epoch": 0.08071748878923767, + "grad_norm": 0.3534647524356842, + "learning_rate": 0.001, + "loss": 4.1648, + "step": 1908 + }, + { + "epoch": 0.08075979355275403, + "grad_norm": 0.5673227310180664, + "learning_rate": 0.001, + "loss": 2.953, + "step": 1909 + }, + { + "epoch": 0.08080209831627042, + "grad_norm": 5.431023120880127, + "learning_rate": 0.001, + "loss": 2.5128, + "step": 1910 + }, + { + "epoch": 0.08084440307978678, + "grad_norm": 15.899325370788574, + "learning_rate": 0.001, + "loss": 2.9833, + "step": 1911 + }, + { + "epoch": 0.08088670784330315, + "grad_norm": 0.5933341383934021, + "learning_rate": 0.001, + "loss": 2.0336, + "step": 1912 + }, + { + "epoch": 0.08092901260681953, + "grad_norm": 0.572313129901886, + "learning_rate": 0.001, + "loss": 2.8636, + "step": 1913 + }, + { + "epoch": 0.0809713173703359, + "grad_norm": 2.6062185764312744, + "learning_rate": 0.001, + "loss": 2.3718, + "step": 1914 + }, + { + "epoch": 0.08101362213385227, + "grad_norm": 0.769607424736023, + "learning_rate": 0.001, + "loss": 2.6415, + "step": 1915 + }, + { + "epoch": 0.08105592689736864, + "grad_norm": 0.4920905828475952, + "learning_rate": 0.001, + "loss": 2.1552, + "step": 1916 + }, + { + "epoch": 0.08109823166088502, + "grad_norm": 0.5874459147453308, + "learning_rate": 0.001, + "loss": 2.376, + "step": 1917 + }, + { + "epoch": 0.08114053642440139, + "grad_norm": 0.48925819993019104, + "learning_rate": 0.001, + "loss": 1.7734, + "step": 1918 + }, + { + "epoch": 0.08118284118791776, + "grad_norm": 0.7119834423065186, + "learning_rate": 0.001, + "loss": 2.6034, + "step": 1919 + }, + { + "epoch": 0.08122514595143414, + "grad_norm": 3.461428165435791, + "learning_rate": 0.001, + "loss": 2.6797, + "step": 1920 + }, + { + "epoch": 0.0812674507149505, + "grad_norm": 0.7845181822776794, + "learning_rate": 0.001, + "loss": 4.2376, + "step": 1921 + }, + { + "epoch": 0.08130975547846687, + "grad_norm": 0.5165002942085266, + "learning_rate": 0.001, + "loss": 3.3205, + "step": 1922 + }, + { + "epoch": 0.08135206024198324, + "grad_norm": 1.0320379734039307, + "learning_rate": 0.001, + "loss": 2.7671, + "step": 1923 + }, + { + "epoch": 0.08139436500549962, + "grad_norm": 7.306519031524658, + "learning_rate": 0.001, + "loss": 3.0783, + "step": 1924 + }, + { + "epoch": 0.08143666976901599, + "grad_norm": 0.51955646276474, + "learning_rate": 0.001, + "loss": 2.2582, + "step": 1925 + }, + { + "epoch": 0.08147897453253236, + "grad_norm": 0.405947744846344, + "learning_rate": 0.001, + "loss": 3.2719, + "step": 1926 + }, + { + "epoch": 0.08152127929604873, + "grad_norm": 2.2772488594055176, + "learning_rate": 0.001, + "loss": 2.8051, + "step": 1927 + }, + { + "epoch": 0.08156358405956511, + "grad_norm": 0.4286925494670868, + "learning_rate": 0.001, + "loss": 2.4886, + "step": 1928 + }, + { + "epoch": 0.08160588882308148, + "grad_norm": 0.3987908661365509, + "learning_rate": 0.001, + "loss": 2.1242, + "step": 1929 + }, + { + "epoch": 0.08164819358659785, + "grad_norm": 0.42939138412475586, + "learning_rate": 0.001, + "loss": 3.0869, + "step": 1930 + }, + { + "epoch": 0.08169049835011423, + "grad_norm": 0.7904039025306702, + "learning_rate": 0.001, + "loss": 2.6297, + "step": 1931 + }, + { + "epoch": 0.0817328031136306, + "grad_norm": 0.2795187532901764, + "learning_rate": 0.001, + "loss": 1.8094, + "step": 1932 + }, + { + "epoch": 0.08177510787714697, + "grad_norm": 0.7740998268127441, + "learning_rate": 0.001, + "loss": 2.6684, + "step": 1933 + }, + { + "epoch": 0.08181741264066333, + "grad_norm": 0.5167595148086548, + "learning_rate": 0.001, + "loss": 2.8719, + "step": 1934 + }, + { + "epoch": 0.08185971740417972, + "grad_norm": 0.354657381772995, + "learning_rate": 0.001, + "loss": 2.6624, + "step": 1935 + }, + { + "epoch": 0.08190202216769608, + "grad_norm": 0.35921430587768555, + "learning_rate": 0.001, + "loss": 2.4581, + "step": 1936 + }, + { + "epoch": 0.08194432693121245, + "grad_norm": 0.6091867089271545, + "learning_rate": 0.001, + "loss": 2.978, + "step": 1937 + }, + { + "epoch": 0.08198663169472882, + "grad_norm": 0.3409866988658905, + "learning_rate": 0.001, + "loss": 2.5926, + "step": 1938 + }, + { + "epoch": 0.0820289364582452, + "grad_norm": 0.5260420441627502, + "learning_rate": 0.001, + "loss": 3.2647, + "step": 1939 + }, + { + "epoch": 0.08207124122176157, + "grad_norm": 0.2958446741104126, + "learning_rate": 0.001, + "loss": 2.509, + "step": 1940 + }, + { + "epoch": 0.08211354598527794, + "grad_norm": 0.2943398058414459, + "learning_rate": 0.001, + "loss": 2.0493, + "step": 1941 + }, + { + "epoch": 0.08215585074879432, + "grad_norm": 0.3319268226623535, + "learning_rate": 0.001, + "loss": 2.3304, + "step": 1942 + }, + { + "epoch": 0.08219815551231069, + "grad_norm": 0.5134836435317993, + "learning_rate": 0.001, + "loss": 2.693, + "step": 1943 + }, + { + "epoch": 0.08224046027582706, + "grad_norm": 0.3512554168701172, + "learning_rate": 0.001, + "loss": 3.3824, + "step": 1944 + }, + { + "epoch": 0.08228276503934343, + "grad_norm": 1.8263537883758545, + "learning_rate": 0.001, + "loss": 2.3778, + "step": 1945 + }, + { + "epoch": 0.08232506980285981, + "grad_norm": 0.3825913667678833, + "learning_rate": 0.001, + "loss": 2.2277, + "step": 1946 + }, + { + "epoch": 0.08236737456637618, + "grad_norm": 0.3037860691547394, + "learning_rate": 0.001, + "loss": 2.3199, + "step": 1947 + }, + { + "epoch": 0.08240967932989254, + "grad_norm": 0.4019972085952759, + "learning_rate": 0.001, + "loss": 2.3972, + "step": 1948 + }, + { + "epoch": 0.08245198409340891, + "grad_norm": 1.4788627624511719, + "learning_rate": 0.001, + "loss": 2.4204, + "step": 1949 + }, + { + "epoch": 0.0824942888569253, + "grad_norm": 0.5321111083030701, + "learning_rate": 0.001, + "loss": 2.8231, + "step": 1950 + }, + { + "epoch": 0.08253659362044166, + "grad_norm": 0.5626177191734314, + "learning_rate": 0.001, + "loss": 3.8786, + "step": 1951 + }, + { + "epoch": 0.08257889838395803, + "grad_norm": 0.5114405751228333, + "learning_rate": 0.001, + "loss": 2.941, + "step": 1952 + }, + { + "epoch": 0.08262120314747441, + "grad_norm": 0.48595815896987915, + "learning_rate": 0.001, + "loss": 2.6246, + "step": 1953 + }, + { + "epoch": 0.08266350791099078, + "grad_norm": 0.28523850440979004, + "learning_rate": 0.001, + "loss": 2.7918, + "step": 1954 + }, + { + "epoch": 0.08270581267450715, + "grad_norm": 0.7027163505554199, + "learning_rate": 0.001, + "loss": 2.7862, + "step": 1955 + }, + { + "epoch": 0.08274811743802352, + "grad_norm": 0.2511918544769287, + "learning_rate": 0.001, + "loss": 1.811, + "step": 1956 + }, + { + "epoch": 0.0827904222015399, + "grad_norm": 0.2575523555278778, + "learning_rate": 0.001, + "loss": 3.1751, + "step": 1957 + }, + { + "epoch": 0.08283272696505627, + "grad_norm": 0.3014431297779083, + "learning_rate": 0.001, + "loss": 2.4499, + "step": 1958 + }, + { + "epoch": 0.08287503172857263, + "grad_norm": 0.24527455866336823, + "learning_rate": 0.001, + "loss": 2.0457, + "step": 1959 + }, + { + "epoch": 0.082917336492089, + "grad_norm": 0.312063992023468, + "learning_rate": 0.001, + "loss": 2.063, + "step": 1960 + }, + { + "epoch": 0.08295964125560538, + "grad_norm": 1.2686574459075928, + "learning_rate": 0.001, + "loss": 3.0543, + "step": 1961 + }, + { + "epoch": 0.08300194601912175, + "grad_norm": 0.3444327712059021, + "learning_rate": 0.001, + "loss": 2.5528, + "step": 1962 + }, + { + "epoch": 0.08304425078263812, + "grad_norm": 0.5791609287261963, + "learning_rate": 0.001, + "loss": 3.4888, + "step": 1963 + }, + { + "epoch": 0.0830865555461545, + "grad_norm": 1.6436971426010132, + "learning_rate": 0.001, + "loss": 2.6672, + "step": 1964 + }, + { + "epoch": 0.08312886030967087, + "grad_norm": 0.25417083501815796, + "learning_rate": 0.001, + "loss": 2.1457, + "step": 1965 + }, + { + "epoch": 0.08317116507318724, + "grad_norm": 0.3741660714149475, + "learning_rate": 0.001, + "loss": 1.8138, + "step": 1966 + }, + { + "epoch": 0.08321346983670361, + "grad_norm": 0.3511454164981842, + "learning_rate": 0.001, + "loss": 1.9674, + "step": 1967 + }, + { + "epoch": 0.08325577460021999, + "grad_norm": 0.3857306241989136, + "learning_rate": 0.001, + "loss": 3.0311, + "step": 1968 + }, + { + "epoch": 0.08329807936373636, + "grad_norm": 0.6367977261543274, + "learning_rate": 0.001, + "loss": 2.3158, + "step": 1969 + }, + { + "epoch": 0.08334038412725273, + "grad_norm": 0.30183205008506775, + "learning_rate": 0.001, + "loss": 2.4227, + "step": 1970 + }, + { + "epoch": 0.0833826888907691, + "grad_norm": 0.3112057149410248, + "learning_rate": 0.001, + "loss": 2.2976, + "step": 1971 + }, + { + "epoch": 0.08342499365428548, + "grad_norm": 0.3398885726928711, + "learning_rate": 0.001, + "loss": 2.5514, + "step": 1972 + }, + { + "epoch": 0.08346729841780184, + "grad_norm": 0.5639985799789429, + "learning_rate": 0.001, + "loss": 3.3706, + "step": 1973 + }, + { + "epoch": 0.08350960318131821, + "grad_norm": 0.2720838785171509, + "learning_rate": 0.001, + "loss": 2.0734, + "step": 1974 + }, + { + "epoch": 0.0835519079448346, + "grad_norm": 0.3382203280925751, + "learning_rate": 0.001, + "loss": 2.1143, + "step": 1975 + }, + { + "epoch": 0.08359421270835096, + "grad_norm": 0.9592907428741455, + "learning_rate": 0.001, + "loss": 1.722, + "step": 1976 + }, + { + "epoch": 0.08363651747186733, + "grad_norm": 0.4691549241542816, + "learning_rate": 0.001, + "loss": 3.0425, + "step": 1977 + }, + { + "epoch": 0.0836788222353837, + "grad_norm": 0.4217078983783722, + "learning_rate": 0.001, + "loss": 2.8026, + "step": 1978 + }, + { + "epoch": 0.08372112699890008, + "grad_norm": 0.9768161773681641, + "learning_rate": 0.001, + "loss": 3.152, + "step": 1979 + }, + { + "epoch": 0.08376343176241645, + "grad_norm": 0.4018516540527344, + "learning_rate": 0.001, + "loss": 1.9038, + "step": 1980 + }, + { + "epoch": 0.08380573652593282, + "grad_norm": 0.3423294126987457, + "learning_rate": 0.001, + "loss": 3.554, + "step": 1981 + }, + { + "epoch": 0.0838480412894492, + "grad_norm": 4.468573093414307, + "learning_rate": 0.001, + "loss": 1.7724, + "step": 1982 + }, + { + "epoch": 0.08389034605296557, + "grad_norm": 2.0579144954681396, + "learning_rate": 0.001, + "loss": 2.9118, + "step": 1983 + }, + { + "epoch": 0.08393265081648194, + "grad_norm": 0.3974030911922455, + "learning_rate": 0.001, + "loss": 1.8146, + "step": 1984 + }, + { + "epoch": 0.0839749555799983, + "grad_norm": 2.071157693862915, + "learning_rate": 0.001, + "loss": 3.3286, + "step": 1985 + }, + { + "epoch": 0.08401726034351469, + "grad_norm": 0.8951895236968994, + "learning_rate": 0.001, + "loss": 2.1807, + "step": 1986 + }, + { + "epoch": 0.08405956510703105, + "grad_norm": 0.3309991955757141, + "learning_rate": 0.001, + "loss": 2.9089, + "step": 1987 + }, + { + "epoch": 0.08410186987054742, + "grad_norm": 0.3316001892089844, + "learning_rate": 0.001, + "loss": 2.4278, + "step": 1988 + }, + { + "epoch": 0.08414417463406379, + "grad_norm": 0.4614253342151642, + "learning_rate": 0.001, + "loss": 1.8007, + "step": 1989 + }, + { + "epoch": 0.08418647939758017, + "grad_norm": 0.4860163629055023, + "learning_rate": 0.001, + "loss": 2.394, + "step": 1990 + }, + { + "epoch": 0.08422878416109654, + "grad_norm": 0.30347874760627747, + "learning_rate": 0.001, + "loss": 2.4783, + "step": 1991 + }, + { + "epoch": 0.08427108892461291, + "grad_norm": 0.3198263943195343, + "learning_rate": 0.001, + "loss": 2.4848, + "step": 1992 + }, + { + "epoch": 0.08431339368812929, + "grad_norm": 6.526590824127197, + "learning_rate": 0.001, + "loss": 2.6791, + "step": 1993 + }, + { + "epoch": 0.08435569845164566, + "grad_norm": 0.4787849485874176, + "learning_rate": 0.001, + "loss": 3.7781, + "step": 1994 + }, + { + "epoch": 0.08439800321516203, + "grad_norm": 0.34780532121658325, + "learning_rate": 0.001, + "loss": 2.3788, + "step": 1995 + }, + { + "epoch": 0.0844403079786784, + "grad_norm": 0.30479609966278076, + "learning_rate": 0.001, + "loss": 2.1992, + "step": 1996 + }, + { + "epoch": 0.08448261274219478, + "grad_norm": 1.0337961912155151, + "learning_rate": 0.001, + "loss": 2.4048, + "step": 1997 + }, + { + "epoch": 0.08452491750571114, + "grad_norm": 0.5477790832519531, + "learning_rate": 0.001, + "loss": 2.2986, + "step": 1998 + }, + { + "epoch": 0.08456722226922751, + "grad_norm": 0.7585234045982361, + "learning_rate": 0.001, + "loss": 2.7247, + "step": 1999 + }, + { + "epoch": 0.08460952703274388, + "grad_norm": 12.078346252441406, + "learning_rate": 0.001, + "loss": 1.9748, + "step": 2000 + }, + { + "epoch": 0.08465183179626026, + "grad_norm": 0.26165372133255005, + "learning_rate": 0.001, + "loss": 2.2865, + "step": 2001 + }, + { + "epoch": 0.08469413655977663, + "grad_norm": 0.21518190205097198, + "learning_rate": 0.001, + "loss": 1.9516, + "step": 2002 + }, + { + "epoch": 0.084736441323293, + "grad_norm": 1.5820046663284302, + "learning_rate": 0.001, + "loss": 2.2892, + "step": 2003 + }, + { + "epoch": 0.08477874608680938, + "grad_norm": 0.3572734296321869, + "learning_rate": 0.001, + "loss": 3.4287, + "step": 2004 + }, + { + "epoch": 0.08482105085032575, + "grad_norm": 3.6497509479522705, + "learning_rate": 0.001, + "loss": 3.2188, + "step": 2005 + }, + { + "epoch": 0.08486335561384212, + "grad_norm": 0.38023465871810913, + "learning_rate": 0.001, + "loss": 2.0518, + "step": 2006 + }, + { + "epoch": 0.08490566037735849, + "grad_norm": 0.24291670322418213, + "learning_rate": 0.001, + "loss": 2.0684, + "step": 2007 + }, + { + "epoch": 0.08494796514087487, + "grad_norm": 0.3124212622642517, + "learning_rate": 0.001, + "loss": 2.3747, + "step": 2008 + }, + { + "epoch": 0.08499026990439124, + "grad_norm": 1.313674807548523, + "learning_rate": 0.001, + "loss": 1.7892, + "step": 2009 + }, + { + "epoch": 0.0850325746679076, + "grad_norm": 0.5036904215812683, + "learning_rate": 0.001, + "loss": 2.5027, + "step": 2010 + }, + { + "epoch": 0.08507487943142397, + "grad_norm": 0.4840260148048401, + "learning_rate": 0.001, + "loss": 2.3056, + "step": 2011 + }, + { + "epoch": 0.08511718419494035, + "grad_norm": 0.48675820231437683, + "learning_rate": 0.001, + "loss": 2.6466, + "step": 2012 + }, + { + "epoch": 0.08515948895845672, + "grad_norm": 6.131993770599365, + "learning_rate": 0.001, + "loss": 2.6512, + "step": 2013 + }, + { + "epoch": 0.08520179372197309, + "grad_norm": 0.37686774134635925, + "learning_rate": 0.001, + "loss": 3.5639, + "step": 2014 + }, + { + "epoch": 0.08524409848548947, + "grad_norm": 0.6002863645553589, + "learning_rate": 0.001, + "loss": 3.1445, + "step": 2015 + }, + { + "epoch": 0.08528640324900584, + "grad_norm": 0.46876654028892517, + "learning_rate": 0.001, + "loss": 2.5458, + "step": 2016 + }, + { + "epoch": 0.08532870801252221, + "grad_norm": 0.38201484084129333, + "learning_rate": 0.001, + "loss": 2.5112, + "step": 2017 + }, + { + "epoch": 0.08537101277603858, + "grad_norm": 0.31709811091423035, + "learning_rate": 0.001, + "loss": 1.8442, + "step": 2018 + }, + { + "epoch": 0.08541331753955496, + "grad_norm": 3.5930492877960205, + "learning_rate": 0.001, + "loss": 2.654, + "step": 2019 + }, + { + "epoch": 0.08545562230307133, + "grad_norm": 0.48756277561187744, + "learning_rate": 0.001, + "loss": 2.5555, + "step": 2020 + }, + { + "epoch": 0.0854979270665877, + "grad_norm": 0.3280019164085388, + "learning_rate": 0.001, + "loss": 3.0868, + "step": 2021 + }, + { + "epoch": 0.08554023183010406, + "grad_norm": 3.57021164894104, + "learning_rate": 0.001, + "loss": 2.7273, + "step": 2022 + }, + { + "epoch": 0.08558253659362045, + "grad_norm": 2.2349965572357178, + "learning_rate": 0.001, + "loss": 3.3427, + "step": 2023 + }, + { + "epoch": 0.08562484135713681, + "grad_norm": 0.333631694316864, + "learning_rate": 0.001, + "loss": 2.8953, + "step": 2024 + }, + { + "epoch": 0.08566714612065318, + "grad_norm": 0.31778064370155334, + "learning_rate": 0.001, + "loss": 2.4624, + "step": 2025 + }, + { + "epoch": 0.08570945088416956, + "grad_norm": 0.3415403664112091, + "learning_rate": 0.001, + "loss": 2.1366, + "step": 2026 + }, + { + "epoch": 0.08575175564768593, + "grad_norm": 3.173630952835083, + "learning_rate": 0.001, + "loss": 2.8478, + "step": 2027 + }, + { + "epoch": 0.0857940604112023, + "grad_norm": 0.27932387590408325, + "learning_rate": 0.001, + "loss": 2.1341, + "step": 2028 + }, + { + "epoch": 0.08583636517471867, + "grad_norm": 0.4122964143753052, + "learning_rate": 0.001, + "loss": 2.9483, + "step": 2029 + }, + { + "epoch": 0.08587866993823505, + "grad_norm": 2.471703052520752, + "learning_rate": 0.001, + "loss": 3.5857, + "step": 2030 + }, + { + "epoch": 0.08592097470175142, + "grad_norm": 0.3559750020503998, + "learning_rate": 0.001, + "loss": 3.2138, + "step": 2031 + }, + { + "epoch": 0.08596327946526779, + "grad_norm": 0.4111155867576599, + "learning_rate": 0.001, + "loss": 2.3594, + "step": 2032 + }, + { + "epoch": 0.08600558422878415, + "grad_norm": 0.2714095115661621, + "learning_rate": 0.001, + "loss": 2.0511, + "step": 2033 + }, + { + "epoch": 0.08604788899230054, + "grad_norm": 0.313900887966156, + "learning_rate": 0.001, + "loss": 2.274, + "step": 2034 + }, + { + "epoch": 0.0860901937558169, + "grad_norm": 0.2859230637550354, + "learning_rate": 0.001, + "loss": 2.1988, + "step": 2035 + }, + { + "epoch": 0.08613249851933327, + "grad_norm": 0.323250949382782, + "learning_rate": 0.001, + "loss": 2.0196, + "step": 2036 + }, + { + "epoch": 0.08617480328284965, + "grad_norm": 0.2705003321170807, + "learning_rate": 0.001, + "loss": 1.9217, + "step": 2037 + }, + { + "epoch": 0.08621710804636602, + "grad_norm": 0.25685760378837585, + "learning_rate": 0.001, + "loss": 1.9932, + "step": 2038 + }, + { + "epoch": 0.08625941280988239, + "grad_norm": 0.2940180003643036, + "learning_rate": 0.001, + "loss": 3.1947, + "step": 2039 + }, + { + "epoch": 0.08630171757339876, + "grad_norm": 0.23758481442928314, + "learning_rate": 0.001, + "loss": 2.1624, + "step": 2040 + }, + { + "epoch": 0.08634402233691514, + "grad_norm": 0.2667909264564514, + "learning_rate": 0.001, + "loss": 2.5975, + "step": 2041 + }, + { + "epoch": 0.08638632710043151, + "grad_norm": 0.23966266214847565, + "learning_rate": 0.001, + "loss": 2.7902, + "step": 2042 + }, + { + "epoch": 0.08642863186394788, + "grad_norm": 0.3106350898742676, + "learning_rate": 0.001, + "loss": 1.6657, + "step": 2043 + }, + { + "epoch": 0.08647093662746426, + "grad_norm": 0.25842878222465515, + "learning_rate": 0.001, + "loss": 1.9604, + "step": 2044 + }, + { + "epoch": 0.08651324139098063, + "grad_norm": 0.6965135931968689, + "learning_rate": 0.001, + "loss": 2.3328, + "step": 2045 + }, + { + "epoch": 0.086555546154497, + "grad_norm": 0.30815958976745605, + "learning_rate": 0.001, + "loss": 1.7124, + "step": 2046 + }, + { + "epoch": 0.08659785091801336, + "grad_norm": 0.32658496499061584, + "learning_rate": 0.001, + "loss": 3.9796, + "step": 2047 + }, + { + "epoch": 0.08664015568152975, + "grad_norm": 0.25080788135528564, + "learning_rate": 0.001, + "loss": 2.4049, + "step": 2048 + }, + { + "epoch": 0.08668246044504611, + "grad_norm": 0.31165841221809387, + "learning_rate": 0.001, + "loss": 2.0898, + "step": 2049 + }, + { + "epoch": 0.08672476520856248, + "grad_norm": 1.0614737272262573, + "learning_rate": 0.001, + "loss": 1.7963, + "step": 2050 + }, + { + "epoch": 0.08676706997207885, + "grad_norm": 0.44003593921661377, + "learning_rate": 0.001, + "loss": 3.2018, + "step": 2051 + }, + { + "epoch": 0.08680937473559523, + "grad_norm": 0.44519808888435364, + "learning_rate": 0.001, + "loss": 2.6672, + "step": 2052 + }, + { + "epoch": 0.0868516794991116, + "grad_norm": 1.5413126945495605, + "learning_rate": 0.001, + "loss": 1.6514, + "step": 2053 + }, + { + "epoch": 0.08689398426262797, + "grad_norm": 1.6876548528671265, + "learning_rate": 0.001, + "loss": 2.9417, + "step": 2054 + }, + { + "epoch": 0.08693628902614435, + "grad_norm": 0.26920443773269653, + "learning_rate": 0.001, + "loss": 2.1858, + "step": 2055 + }, + { + "epoch": 0.08697859378966072, + "grad_norm": 0.4876200556755066, + "learning_rate": 0.001, + "loss": 2.9616, + "step": 2056 + }, + { + "epoch": 0.08702089855317709, + "grad_norm": 0.32026639580726624, + "learning_rate": 0.001, + "loss": 2.398, + "step": 2057 + }, + { + "epoch": 0.08706320331669345, + "grad_norm": 0.34720298647880554, + "learning_rate": 0.001, + "loss": 2.5501, + "step": 2058 + }, + { + "epoch": 0.08710550808020984, + "grad_norm": 0.3236474394798279, + "learning_rate": 0.001, + "loss": 1.9898, + "step": 2059 + }, + { + "epoch": 0.0871478128437262, + "grad_norm": 0.4026321470737457, + "learning_rate": 0.001, + "loss": 2.4051, + "step": 2060 + }, + { + "epoch": 0.08719011760724257, + "grad_norm": 0.9391548037528992, + "learning_rate": 0.001, + "loss": 2.2811, + "step": 2061 + }, + { + "epoch": 0.08723242237075894, + "grad_norm": 0.29237544536590576, + "learning_rate": 0.001, + "loss": 2.1954, + "step": 2062 + }, + { + "epoch": 0.08727472713427532, + "grad_norm": 0.26706722378730774, + "learning_rate": 0.001, + "loss": 2.0118, + "step": 2063 + }, + { + "epoch": 0.08731703189779169, + "grad_norm": 0.6832917928695679, + "learning_rate": 0.001, + "loss": 2.0665, + "step": 2064 + }, + { + "epoch": 0.08735933666130806, + "grad_norm": 3.363959312438965, + "learning_rate": 0.001, + "loss": 2.8937, + "step": 2065 + }, + { + "epoch": 0.08740164142482444, + "grad_norm": 0.6167427897453308, + "learning_rate": 0.001, + "loss": 2.4164, + "step": 2066 + }, + { + "epoch": 0.08744394618834081, + "grad_norm": 0.40449997782707214, + "learning_rate": 0.001, + "loss": 2.503, + "step": 2067 + }, + { + "epoch": 0.08748625095185718, + "grad_norm": 0.77544105052948, + "learning_rate": 0.001, + "loss": 3.1617, + "step": 2068 + }, + { + "epoch": 0.08752855571537355, + "grad_norm": 0.6678179502487183, + "learning_rate": 0.001, + "loss": 2.4034, + "step": 2069 + }, + { + "epoch": 0.08757086047888993, + "grad_norm": 0.46908748149871826, + "learning_rate": 0.001, + "loss": 2.4388, + "step": 2070 + }, + { + "epoch": 0.0876131652424063, + "grad_norm": 0.3415198028087616, + "learning_rate": 0.001, + "loss": 2.1139, + "step": 2071 + }, + { + "epoch": 0.08765547000592266, + "grad_norm": 0.3351193070411682, + "learning_rate": 0.001, + "loss": 2.3951, + "step": 2072 + }, + { + "epoch": 0.08769777476943903, + "grad_norm": 0.27987435460090637, + "learning_rate": 0.001, + "loss": 2.0981, + "step": 2073 + }, + { + "epoch": 0.08774007953295541, + "grad_norm": 1.4388349056243896, + "learning_rate": 0.001, + "loss": 1.9596, + "step": 2074 + }, + { + "epoch": 0.08778238429647178, + "grad_norm": 2.4498531818389893, + "learning_rate": 0.001, + "loss": 2.667, + "step": 2075 + }, + { + "epoch": 0.08782468905998815, + "grad_norm": 0.259954035282135, + "learning_rate": 0.001, + "loss": 2.7094, + "step": 2076 + }, + { + "epoch": 0.08786699382350453, + "grad_norm": 0.24030877649784088, + "learning_rate": 0.001, + "loss": 2.7154, + "step": 2077 + }, + { + "epoch": 0.0879092985870209, + "grad_norm": 4.155751705169678, + "learning_rate": 0.001, + "loss": 1.8017, + "step": 2078 + }, + { + "epoch": 0.08795160335053727, + "grad_norm": 0.30022862553596497, + "learning_rate": 0.001, + "loss": 2.6318, + "step": 2079 + }, + { + "epoch": 0.08799390811405364, + "grad_norm": 4.097969055175781, + "learning_rate": 0.001, + "loss": 2.7386, + "step": 2080 + }, + { + "epoch": 0.08803621287757002, + "grad_norm": 0.8870471715927124, + "learning_rate": 0.001, + "loss": 2.6592, + "step": 2081 + }, + { + "epoch": 0.08807851764108639, + "grad_norm": 0.27012670040130615, + "learning_rate": 0.001, + "loss": 2.5062, + "step": 2082 + }, + { + "epoch": 0.08812082240460276, + "grad_norm": 0.28547555208206177, + "learning_rate": 0.001, + "loss": 2.3858, + "step": 2083 + }, + { + "epoch": 0.08816312716811912, + "grad_norm": 0.34155458211898804, + "learning_rate": 0.001, + "loss": 3.2342, + "step": 2084 + }, + { + "epoch": 0.0882054319316355, + "grad_norm": 0.26611489057540894, + "learning_rate": 0.001, + "loss": 2.0016, + "step": 2085 + }, + { + "epoch": 0.08824773669515187, + "grad_norm": 0.35170432925224304, + "learning_rate": 0.001, + "loss": 2.7927, + "step": 2086 + }, + { + "epoch": 0.08829004145866824, + "grad_norm": 0.6946236491203308, + "learning_rate": 0.001, + "loss": 1.982, + "step": 2087 + }, + { + "epoch": 0.08833234622218462, + "grad_norm": 0.8609383702278137, + "learning_rate": 0.001, + "loss": 1.992, + "step": 2088 + }, + { + "epoch": 0.08837465098570099, + "grad_norm": 0.2939896583557129, + "learning_rate": 0.001, + "loss": 2.2666, + "step": 2089 + }, + { + "epoch": 0.08841695574921736, + "grad_norm": 0.26682353019714355, + "learning_rate": 0.001, + "loss": 2.5193, + "step": 2090 + }, + { + "epoch": 0.08845926051273373, + "grad_norm": 0.9872138500213623, + "learning_rate": 0.001, + "loss": 2.6831, + "step": 2091 + }, + { + "epoch": 0.08850156527625011, + "grad_norm": 0.9422023296356201, + "learning_rate": 0.001, + "loss": 3.6744, + "step": 2092 + }, + { + "epoch": 0.08854387003976648, + "grad_norm": 0.4777209758758545, + "learning_rate": 0.001, + "loss": 3.7379, + "step": 2093 + }, + { + "epoch": 0.08858617480328285, + "grad_norm": 104.47978210449219, + "learning_rate": 0.001, + "loss": 1.8196, + "step": 2094 + }, + { + "epoch": 0.08862847956679921, + "grad_norm": 17.303163528442383, + "learning_rate": 0.001, + "loss": 3.1286, + "step": 2095 + }, + { + "epoch": 0.0886707843303156, + "grad_norm": 0.3178851008415222, + "learning_rate": 0.001, + "loss": 3.5087, + "step": 2096 + }, + { + "epoch": 0.08871308909383196, + "grad_norm": 0.3228803873062134, + "learning_rate": 0.001, + "loss": 2.0294, + "step": 2097 + }, + { + "epoch": 0.08875539385734833, + "grad_norm": 0.6258364319801331, + "learning_rate": 0.001, + "loss": 2.5332, + "step": 2098 + }, + { + "epoch": 0.08879769862086472, + "grad_norm": 0.7965203523635864, + "learning_rate": 0.001, + "loss": 2.7295, + "step": 2099 + }, + { + "epoch": 0.08884000338438108, + "grad_norm": 0.4302770793437958, + "learning_rate": 0.001, + "loss": 2.1668, + "step": 2100 + }, + { + "epoch": 0.08888230814789745, + "grad_norm": 0.5029553771018982, + "learning_rate": 0.001, + "loss": 2.9496, + "step": 2101 + }, + { + "epoch": 0.08892461291141382, + "grad_norm": 0.3942144215106964, + "learning_rate": 0.001, + "loss": 2.313, + "step": 2102 + }, + { + "epoch": 0.0889669176749302, + "grad_norm": 0.43013280630111694, + "learning_rate": 0.001, + "loss": 1.9479, + "step": 2103 + }, + { + "epoch": 0.08900922243844657, + "grad_norm": 1.0882439613342285, + "learning_rate": 0.001, + "loss": 2.8823, + "step": 2104 + }, + { + "epoch": 0.08905152720196294, + "grad_norm": 0.3308344781398773, + "learning_rate": 0.001, + "loss": 2.302, + "step": 2105 + }, + { + "epoch": 0.08909383196547932, + "grad_norm": 0.29911816120147705, + "learning_rate": 0.001, + "loss": 3.0091, + "step": 2106 + }, + { + "epoch": 0.08913613672899569, + "grad_norm": 1.754325270652771, + "learning_rate": 0.001, + "loss": 3.21, + "step": 2107 + }, + { + "epoch": 0.08917844149251206, + "grad_norm": 2.3082916736602783, + "learning_rate": 0.001, + "loss": 2.3856, + "step": 2108 + }, + { + "epoch": 0.08922074625602842, + "grad_norm": 0.6127236485481262, + "learning_rate": 0.001, + "loss": 2.2969, + "step": 2109 + }, + { + "epoch": 0.0892630510195448, + "grad_norm": 0.41946130990982056, + "learning_rate": 0.001, + "loss": 2.8269, + "step": 2110 + }, + { + "epoch": 0.08930535578306117, + "grad_norm": 6.0178327560424805, + "learning_rate": 0.001, + "loss": 2.7265, + "step": 2111 + }, + { + "epoch": 0.08934766054657754, + "grad_norm": 0.38182544708251953, + "learning_rate": 0.001, + "loss": 3.1223, + "step": 2112 + }, + { + "epoch": 0.08938996531009391, + "grad_norm": 1.5331692695617676, + "learning_rate": 0.001, + "loss": 2.4643, + "step": 2113 + }, + { + "epoch": 0.08943227007361029, + "grad_norm": 0.29090458154678345, + "learning_rate": 0.001, + "loss": 1.958, + "step": 2114 + }, + { + "epoch": 0.08947457483712666, + "grad_norm": 0.3298831284046173, + "learning_rate": 0.001, + "loss": 2.4861, + "step": 2115 + }, + { + "epoch": 0.08951687960064303, + "grad_norm": 0.9520841240882874, + "learning_rate": 0.001, + "loss": 2.2431, + "step": 2116 + }, + { + "epoch": 0.08955918436415941, + "grad_norm": 0.3201279938220978, + "learning_rate": 0.001, + "loss": 1.9121, + "step": 2117 + }, + { + "epoch": 0.08960148912767578, + "grad_norm": 0.3153676390647888, + "learning_rate": 0.001, + "loss": 2.2245, + "step": 2118 + }, + { + "epoch": 0.08964379389119215, + "grad_norm": 0.425874263048172, + "learning_rate": 0.001, + "loss": 3.1083, + "step": 2119 + }, + { + "epoch": 0.08968609865470852, + "grad_norm": 0.6781908869743347, + "learning_rate": 0.001, + "loss": 3.0335, + "step": 2120 + }, + { + "epoch": 0.0897284034182249, + "grad_norm": 0.38351890444755554, + "learning_rate": 0.001, + "loss": 3.6367, + "step": 2121 + }, + { + "epoch": 0.08977070818174127, + "grad_norm": 0.26305243372917175, + "learning_rate": 0.001, + "loss": 1.9731, + "step": 2122 + }, + { + "epoch": 0.08981301294525763, + "grad_norm": 6.707549571990967, + "learning_rate": 0.001, + "loss": 2.2013, + "step": 2123 + }, + { + "epoch": 0.089855317708774, + "grad_norm": 0.31812310218811035, + "learning_rate": 0.001, + "loss": 1.9723, + "step": 2124 + }, + { + "epoch": 0.08989762247229038, + "grad_norm": 0.30152562260627747, + "learning_rate": 0.001, + "loss": 2.6349, + "step": 2125 + }, + { + "epoch": 0.08993992723580675, + "grad_norm": 0.33307674527168274, + "learning_rate": 0.001, + "loss": 2.295, + "step": 2126 + }, + { + "epoch": 0.08998223199932312, + "grad_norm": 0.26768437027931213, + "learning_rate": 0.001, + "loss": 3.2849, + "step": 2127 + }, + { + "epoch": 0.0900245367628395, + "grad_norm": 0.39628762006759644, + "learning_rate": 0.001, + "loss": 3.1855, + "step": 2128 + }, + { + "epoch": 0.09006684152635587, + "grad_norm": 0.3016011416912079, + "learning_rate": 0.001, + "loss": 3.3723, + "step": 2129 + }, + { + "epoch": 0.09010914628987224, + "grad_norm": 0.6254957318305969, + "learning_rate": 0.001, + "loss": 4.1078, + "step": 2130 + }, + { + "epoch": 0.0901514510533886, + "grad_norm": 0.23875102400779724, + "learning_rate": 0.001, + "loss": 1.7937, + "step": 2131 + }, + { + "epoch": 0.09019375581690499, + "grad_norm": 0.2940853536128998, + "learning_rate": 0.001, + "loss": 2.0315, + "step": 2132 + }, + { + "epoch": 0.09023606058042136, + "grad_norm": 2.5391368865966797, + "learning_rate": 0.001, + "loss": 2.5625, + "step": 2133 + }, + { + "epoch": 0.09027836534393772, + "grad_norm": 5.3973388671875, + "learning_rate": 0.001, + "loss": 1.8511, + "step": 2134 + }, + { + "epoch": 0.09032067010745409, + "grad_norm": 0.25178593397140503, + "learning_rate": 0.001, + "loss": 2.3371, + "step": 2135 + }, + { + "epoch": 0.09036297487097047, + "grad_norm": 0.31066349148750305, + "learning_rate": 0.001, + "loss": 2.7473, + "step": 2136 + }, + { + "epoch": 0.09040527963448684, + "grad_norm": 0.42618098855018616, + "learning_rate": 0.001, + "loss": 3.0467, + "step": 2137 + }, + { + "epoch": 0.09044758439800321, + "grad_norm": 0.29961666464805603, + "learning_rate": 0.001, + "loss": 2.192, + "step": 2138 + }, + { + "epoch": 0.0904898891615196, + "grad_norm": 0.31987568736076355, + "learning_rate": 0.001, + "loss": 2.7925, + "step": 2139 + }, + { + "epoch": 0.09053219392503596, + "grad_norm": 0.309462308883667, + "learning_rate": 0.001, + "loss": 2.0587, + "step": 2140 + }, + { + "epoch": 0.09057449868855233, + "grad_norm": 0.3607901930809021, + "learning_rate": 0.001, + "loss": 2.1792, + "step": 2141 + }, + { + "epoch": 0.0906168034520687, + "grad_norm": 0.281773179769516, + "learning_rate": 0.001, + "loss": 1.8093, + "step": 2142 + }, + { + "epoch": 0.09065910821558508, + "grad_norm": 0.5456468462944031, + "learning_rate": 0.001, + "loss": 3.7337, + "step": 2143 + }, + { + "epoch": 0.09070141297910145, + "grad_norm": 0.25825467705726624, + "learning_rate": 0.001, + "loss": 1.9948, + "step": 2144 + }, + { + "epoch": 0.09074371774261782, + "grad_norm": 0.9076284766197205, + "learning_rate": 0.001, + "loss": 2.3712, + "step": 2145 + }, + { + "epoch": 0.09078602250613418, + "grad_norm": 0.41249004006385803, + "learning_rate": 0.001, + "loss": 2.3084, + "step": 2146 + }, + { + "epoch": 0.09082832726965057, + "grad_norm": 1.1865334510803223, + "learning_rate": 0.001, + "loss": 3.0776, + "step": 2147 + }, + { + "epoch": 0.09087063203316693, + "grad_norm": 0.3349679410457611, + "learning_rate": 0.001, + "loss": 2.3653, + "step": 2148 + }, + { + "epoch": 0.0909129367966833, + "grad_norm": 0.4010538160800934, + "learning_rate": 0.001, + "loss": 2.3073, + "step": 2149 + }, + { + "epoch": 0.09095524156019968, + "grad_norm": 0.38532817363739014, + "learning_rate": 0.001, + "loss": 2.9012, + "step": 2150 + }, + { + "epoch": 0.09099754632371605, + "grad_norm": 0.33620357513427734, + "learning_rate": 0.001, + "loss": 2.9892, + "step": 2151 + }, + { + "epoch": 0.09103985108723242, + "grad_norm": 5.379061222076416, + "learning_rate": 0.001, + "loss": 2.203, + "step": 2152 + }, + { + "epoch": 0.09108215585074879, + "grad_norm": 8.182388305664062, + "learning_rate": 0.001, + "loss": 3.635, + "step": 2153 + }, + { + "epoch": 0.09112446061426517, + "grad_norm": 0.23541434109210968, + "learning_rate": 0.001, + "loss": 2.343, + "step": 2154 + }, + { + "epoch": 0.09116676537778154, + "grad_norm": 0.27592089772224426, + "learning_rate": 0.001, + "loss": 2.7213, + "step": 2155 + }, + { + "epoch": 0.09120907014129791, + "grad_norm": 0.30198514461517334, + "learning_rate": 0.001, + "loss": 3.0081, + "step": 2156 + }, + { + "epoch": 0.09125137490481428, + "grad_norm": 0.26055535674095154, + "learning_rate": 0.001, + "loss": 2.4518, + "step": 2157 + }, + { + "epoch": 0.09129367966833066, + "grad_norm": 0.2565707564353943, + "learning_rate": 0.001, + "loss": 2.2259, + "step": 2158 + }, + { + "epoch": 0.09133598443184703, + "grad_norm": 0.2991856634616852, + "learning_rate": 0.001, + "loss": 2.2095, + "step": 2159 + }, + { + "epoch": 0.0913782891953634, + "grad_norm": 0.6168603897094727, + "learning_rate": 0.001, + "loss": 2.6562, + "step": 2160 + }, + { + "epoch": 0.09142059395887978, + "grad_norm": 1.3440738916397095, + "learning_rate": 0.001, + "loss": 2.2589, + "step": 2161 + }, + { + "epoch": 0.09146289872239614, + "grad_norm": 0.2948399782180786, + "learning_rate": 0.001, + "loss": 2.4666, + "step": 2162 + }, + { + "epoch": 0.09150520348591251, + "grad_norm": 0.3604365885257721, + "learning_rate": 0.001, + "loss": 2.1146, + "step": 2163 + }, + { + "epoch": 0.09154750824942888, + "grad_norm": 0.33632999658584595, + "learning_rate": 0.001, + "loss": 2.7935, + "step": 2164 + }, + { + "epoch": 0.09158981301294526, + "grad_norm": 0.49976199865341187, + "learning_rate": 0.001, + "loss": 2.0044, + "step": 2165 + }, + { + "epoch": 0.09163211777646163, + "grad_norm": 3.4067957401275635, + "learning_rate": 0.001, + "loss": 2.8011, + "step": 2166 + }, + { + "epoch": 0.091674422539978, + "grad_norm": 0.3377350866794586, + "learning_rate": 0.001, + "loss": 3.0546, + "step": 2167 + }, + { + "epoch": 0.09171672730349438, + "grad_norm": 0.3675040304660797, + "learning_rate": 0.001, + "loss": 2.6498, + "step": 2168 + }, + { + "epoch": 0.09175903206701075, + "grad_norm": 0.30007094144821167, + "learning_rate": 0.001, + "loss": 3.8178, + "step": 2169 + }, + { + "epoch": 0.09180133683052712, + "grad_norm": 0.27063828706741333, + "learning_rate": 0.001, + "loss": 3.6035, + "step": 2170 + }, + { + "epoch": 0.09184364159404348, + "grad_norm": 0.23841188848018646, + "learning_rate": 0.001, + "loss": 1.9008, + "step": 2171 + }, + { + "epoch": 0.09188594635755987, + "grad_norm": 1.4631080627441406, + "learning_rate": 0.001, + "loss": 2.8047, + "step": 2172 + }, + { + "epoch": 0.09192825112107623, + "grad_norm": 0.26148828864097595, + "learning_rate": 0.001, + "loss": 1.7367, + "step": 2173 + }, + { + "epoch": 0.0919705558845926, + "grad_norm": 3.1010782718658447, + "learning_rate": 0.001, + "loss": 2.1477, + "step": 2174 + }, + { + "epoch": 0.09201286064810897, + "grad_norm": 1.398153305053711, + "learning_rate": 0.001, + "loss": 2.5751, + "step": 2175 + }, + { + "epoch": 0.09205516541162535, + "grad_norm": 1.0668129920959473, + "learning_rate": 0.001, + "loss": 2.3153, + "step": 2176 + }, + { + "epoch": 0.09209747017514172, + "grad_norm": 0.2531263828277588, + "learning_rate": 0.001, + "loss": 3.1233, + "step": 2177 + }, + { + "epoch": 0.09213977493865809, + "grad_norm": 0.26637500524520874, + "learning_rate": 0.001, + "loss": 3.5828, + "step": 2178 + }, + { + "epoch": 0.09218207970217447, + "grad_norm": 0.2811260223388672, + "learning_rate": 0.001, + "loss": 2.9139, + "step": 2179 + }, + { + "epoch": 0.09222438446569084, + "grad_norm": 0.39359262585639954, + "learning_rate": 0.001, + "loss": 3.4906, + "step": 2180 + }, + { + "epoch": 0.09226668922920721, + "grad_norm": 0.3012666702270508, + "learning_rate": 0.001, + "loss": 1.9615, + "step": 2181 + }, + { + "epoch": 0.09230899399272358, + "grad_norm": 0.5200200080871582, + "learning_rate": 0.001, + "loss": 2.8205, + "step": 2182 + }, + { + "epoch": 0.09235129875623996, + "grad_norm": 0.3044484257698059, + "learning_rate": 0.001, + "loss": 2.9967, + "step": 2183 + }, + { + "epoch": 0.09239360351975633, + "grad_norm": 0.3814965784549713, + "learning_rate": 0.001, + "loss": 2.1881, + "step": 2184 + }, + { + "epoch": 0.0924359082832727, + "grad_norm": 0.3739076554775238, + "learning_rate": 0.001, + "loss": 2.8411, + "step": 2185 + }, + { + "epoch": 0.09247821304678906, + "grad_norm": 0.3986753225326538, + "learning_rate": 0.001, + "loss": 2.6639, + "step": 2186 + }, + { + "epoch": 0.09252051781030544, + "grad_norm": 0.35847851634025574, + "learning_rate": 0.001, + "loss": 3.6791, + "step": 2187 + }, + { + "epoch": 0.09256282257382181, + "grad_norm": 0.3173019289970398, + "learning_rate": 0.001, + "loss": 2.2659, + "step": 2188 + }, + { + "epoch": 0.09260512733733818, + "grad_norm": 0.25434383749961853, + "learning_rate": 0.001, + "loss": 1.8084, + "step": 2189 + }, + { + "epoch": 0.09264743210085456, + "grad_norm": 0.2687484323978424, + "learning_rate": 0.001, + "loss": 2.9027, + "step": 2190 + }, + { + "epoch": 0.09268973686437093, + "grad_norm": 0.29788804054260254, + "learning_rate": 0.001, + "loss": 3.081, + "step": 2191 + }, + { + "epoch": 0.0927320416278873, + "grad_norm": 2.9321951866149902, + "learning_rate": 0.001, + "loss": 4.4009, + "step": 2192 + }, + { + "epoch": 0.09277434639140367, + "grad_norm": 0.29507976770401, + "learning_rate": 0.001, + "loss": 2.3275, + "step": 2193 + }, + { + "epoch": 0.09281665115492005, + "grad_norm": 0.29394271969795227, + "learning_rate": 0.001, + "loss": 2.6158, + "step": 2194 + }, + { + "epoch": 0.09285895591843642, + "grad_norm": 1.7109583616256714, + "learning_rate": 0.001, + "loss": 2.0686, + "step": 2195 + }, + { + "epoch": 0.09290126068195279, + "grad_norm": 0.28951358795166016, + "learning_rate": 0.001, + "loss": 2.8997, + "step": 2196 + }, + { + "epoch": 0.09294356544546915, + "grad_norm": 0.49977269768714905, + "learning_rate": 0.001, + "loss": 2.8217, + "step": 2197 + }, + { + "epoch": 0.09298587020898554, + "grad_norm": 0.35717108845710754, + "learning_rate": 0.001, + "loss": 2.0279, + "step": 2198 + }, + { + "epoch": 0.0930281749725019, + "grad_norm": 0.22373569011688232, + "learning_rate": 0.001, + "loss": 1.7682, + "step": 2199 + }, + { + "epoch": 0.09307047973601827, + "grad_norm": 0.25105729699134827, + "learning_rate": 0.001, + "loss": 2.2288, + "step": 2200 + }, + { + "epoch": 0.09311278449953465, + "grad_norm": 0.24033679068088531, + "learning_rate": 0.001, + "loss": 2.2573, + "step": 2201 + }, + { + "epoch": 0.09315508926305102, + "grad_norm": 0.3561880588531494, + "learning_rate": 0.001, + "loss": 3.0528, + "step": 2202 + }, + { + "epoch": 0.09319739402656739, + "grad_norm": 0.26465436816215515, + "learning_rate": 0.001, + "loss": 3.0083, + "step": 2203 + }, + { + "epoch": 0.09323969879008376, + "grad_norm": 1.4494379758834839, + "learning_rate": 0.001, + "loss": 2.5106, + "step": 2204 + }, + { + "epoch": 0.09328200355360014, + "grad_norm": 0.28080251812934875, + "learning_rate": 0.001, + "loss": 2.9088, + "step": 2205 + }, + { + "epoch": 0.09332430831711651, + "grad_norm": 0.6943084597587585, + "learning_rate": 0.001, + "loss": 2.1316, + "step": 2206 + }, + { + "epoch": 0.09336661308063288, + "grad_norm": 0.2942868173122406, + "learning_rate": 0.001, + "loss": 2.6873, + "step": 2207 + }, + { + "epoch": 0.09340891784414924, + "grad_norm": 0.4104631245136261, + "learning_rate": 0.001, + "loss": 2.3427, + "step": 2208 + }, + { + "epoch": 0.09345122260766563, + "grad_norm": 0.22013962268829346, + "learning_rate": 0.001, + "loss": 1.803, + "step": 2209 + }, + { + "epoch": 0.093493527371182, + "grad_norm": 0.24783039093017578, + "learning_rate": 0.001, + "loss": 2.1961, + "step": 2210 + }, + { + "epoch": 0.09353583213469836, + "grad_norm": 0.34850063920021057, + "learning_rate": 0.001, + "loss": 2.5243, + "step": 2211 + }, + { + "epoch": 0.09357813689821474, + "grad_norm": 0.35803404450416565, + "learning_rate": 0.001, + "loss": 2.6485, + "step": 2212 + }, + { + "epoch": 0.09362044166173111, + "grad_norm": 4.5896124839782715, + "learning_rate": 0.001, + "loss": 2.4434, + "step": 2213 + }, + { + "epoch": 0.09366274642524748, + "grad_norm": 0.34777218103408813, + "learning_rate": 0.001, + "loss": 2.0321, + "step": 2214 + }, + { + "epoch": 0.09370505118876385, + "grad_norm": 0.3306042551994324, + "learning_rate": 0.001, + "loss": 2.4836, + "step": 2215 + }, + { + "epoch": 0.09374735595228023, + "grad_norm": 0.7642030715942383, + "learning_rate": 0.001, + "loss": 2.1946, + "step": 2216 + }, + { + "epoch": 0.0937896607157966, + "grad_norm": 0.37406593561172485, + "learning_rate": 0.001, + "loss": 3.6855, + "step": 2217 + }, + { + "epoch": 0.09383196547931297, + "grad_norm": 0.6254957318305969, + "learning_rate": 0.001, + "loss": 3.1232, + "step": 2218 + }, + { + "epoch": 0.09387427024282934, + "grad_norm": 0.31491386890411377, + "learning_rate": 0.001, + "loss": 2.757, + "step": 2219 + }, + { + "epoch": 0.09391657500634572, + "grad_norm": 0.37096840143203735, + "learning_rate": 0.001, + "loss": 2.4591, + "step": 2220 + }, + { + "epoch": 0.09395887976986209, + "grad_norm": 0.30345040559768677, + "learning_rate": 0.001, + "loss": 2.2993, + "step": 2221 + }, + { + "epoch": 0.09400118453337845, + "grad_norm": 0.8886005282402039, + "learning_rate": 0.001, + "loss": 2.9874, + "step": 2222 + }, + { + "epoch": 0.09404348929689484, + "grad_norm": 0.5755462646484375, + "learning_rate": 0.001, + "loss": 2.5191, + "step": 2223 + }, + { + "epoch": 0.0940857940604112, + "grad_norm": 0.28378450870513916, + "learning_rate": 0.001, + "loss": 2.1818, + "step": 2224 + }, + { + "epoch": 0.09412809882392757, + "grad_norm": 0.4646637737751007, + "learning_rate": 0.001, + "loss": 2.155, + "step": 2225 + }, + { + "epoch": 0.09417040358744394, + "grad_norm": 0.28239914774894714, + "learning_rate": 0.001, + "loss": 3.0008, + "step": 2226 + }, + { + "epoch": 0.09421270835096032, + "grad_norm": 0.3244176506996155, + "learning_rate": 0.001, + "loss": 2.2789, + "step": 2227 + }, + { + "epoch": 0.09425501311447669, + "grad_norm": 0.25214463472366333, + "learning_rate": 0.001, + "loss": 1.8638, + "step": 2228 + }, + { + "epoch": 0.09429731787799306, + "grad_norm": 0.29388561844825745, + "learning_rate": 0.001, + "loss": 2.6425, + "step": 2229 + }, + { + "epoch": 0.09433962264150944, + "grad_norm": 0.7651423811912537, + "learning_rate": 0.001, + "loss": 2.5595, + "step": 2230 + }, + { + "epoch": 0.09438192740502581, + "grad_norm": 0.7599115967750549, + "learning_rate": 0.001, + "loss": 2.6142, + "step": 2231 + }, + { + "epoch": 0.09442423216854218, + "grad_norm": 0.28916943073272705, + "learning_rate": 0.001, + "loss": 1.9283, + "step": 2232 + }, + { + "epoch": 0.09446653693205855, + "grad_norm": 0.2711436152458191, + "learning_rate": 0.001, + "loss": 2.0485, + "step": 2233 + }, + { + "epoch": 0.09450884169557493, + "grad_norm": 0.3689520061016083, + "learning_rate": 0.001, + "loss": 2.4108, + "step": 2234 + }, + { + "epoch": 0.0945511464590913, + "grad_norm": 0.3966144621372223, + "learning_rate": 0.001, + "loss": 2.4453, + "step": 2235 + }, + { + "epoch": 0.09459345122260766, + "grad_norm": 0.8278962969779968, + "learning_rate": 0.001, + "loss": 2.281, + "step": 2236 + }, + { + "epoch": 0.09463575598612403, + "grad_norm": 0.29549941420555115, + "learning_rate": 0.001, + "loss": 1.8836, + "step": 2237 + }, + { + "epoch": 0.09467806074964041, + "grad_norm": 0.563795268535614, + "learning_rate": 0.001, + "loss": 3.243, + "step": 2238 + }, + { + "epoch": 0.09472036551315678, + "grad_norm": 0.2910638749599457, + "learning_rate": 0.001, + "loss": 2.7178, + "step": 2239 + }, + { + "epoch": 0.09476267027667315, + "grad_norm": 1.0017096996307373, + "learning_rate": 0.001, + "loss": 2.5647, + "step": 2240 + }, + { + "epoch": 0.09480497504018953, + "grad_norm": 0.2674773037433624, + "learning_rate": 0.001, + "loss": 2.4521, + "step": 2241 + }, + { + "epoch": 0.0948472798037059, + "grad_norm": 0.33499273657798767, + "learning_rate": 0.001, + "loss": 1.9591, + "step": 2242 + }, + { + "epoch": 0.09488958456722227, + "grad_norm": 0.28287482261657715, + "learning_rate": 0.001, + "loss": 1.8664, + "step": 2243 + }, + { + "epoch": 0.09493188933073864, + "grad_norm": 0.3287140130996704, + "learning_rate": 0.001, + "loss": 2.2835, + "step": 2244 + }, + { + "epoch": 0.09497419409425502, + "grad_norm": 0.28710776567459106, + "learning_rate": 0.001, + "loss": 2.1672, + "step": 2245 + }, + { + "epoch": 0.09501649885777139, + "grad_norm": 0.5307938456535339, + "learning_rate": 0.001, + "loss": 3.1055, + "step": 2246 + }, + { + "epoch": 0.09505880362128775, + "grad_norm": 0.2794020473957062, + "learning_rate": 0.001, + "loss": 1.4943, + "step": 2247 + }, + { + "epoch": 0.09510110838480412, + "grad_norm": 0.483715295791626, + "learning_rate": 0.001, + "loss": 1.7752, + "step": 2248 + }, + { + "epoch": 0.0951434131483205, + "grad_norm": 0.8411474227905273, + "learning_rate": 0.001, + "loss": 2.9864, + "step": 2249 + }, + { + "epoch": 0.09518571791183687, + "grad_norm": 0.30003300309181213, + "learning_rate": 0.001, + "loss": 4.0293, + "step": 2250 + }, + { + "epoch": 0.09522802267535324, + "grad_norm": 0.2558903396129608, + "learning_rate": 0.001, + "loss": 2.1845, + "step": 2251 + }, + { + "epoch": 0.09527032743886962, + "grad_norm": 0.30151990056037903, + "learning_rate": 0.001, + "loss": 2.3792, + "step": 2252 + }, + { + "epoch": 0.09531263220238599, + "grad_norm": 0.29656192660331726, + "learning_rate": 0.001, + "loss": 3.2587, + "step": 2253 + }, + { + "epoch": 0.09535493696590236, + "grad_norm": 0.8995860815048218, + "learning_rate": 0.001, + "loss": 2.6727, + "step": 2254 + }, + { + "epoch": 0.09539724172941873, + "grad_norm": 0.3521322011947632, + "learning_rate": 0.001, + "loss": 2.4839, + "step": 2255 + }, + { + "epoch": 0.09543954649293511, + "grad_norm": 0.2579616904258728, + "learning_rate": 0.001, + "loss": 2.0373, + "step": 2256 + }, + { + "epoch": 0.09548185125645148, + "grad_norm": 2.846085548400879, + "learning_rate": 0.001, + "loss": 2.3873, + "step": 2257 + }, + { + "epoch": 0.09552415601996785, + "grad_norm": 0.28332656621932983, + "learning_rate": 0.001, + "loss": 2.426, + "step": 2258 + }, + { + "epoch": 0.09556646078348421, + "grad_norm": 1.6657432317733765, + "learning_rate": 0.001, + "loss": 2.0307, + "step": 2259 + }, + { + "epoch": 0.0956087655470006, + "grad_norm": 0.3576366901397705, + "learning_rate": 0.001, + "loss": 2.6188, + "step": 2260 + }, + { + "epoch": 0.09565107031051696, + "grad_norm": 0.8456769585609436, + "learning_rate": 0.001, + "loss": 2.3674, + "step": 2261 + }, + { + "epoch": 0.09569337507403333, + "grad_norm": 0.3482474088668823, + "learning_rate": 0.001, + "loss": 3.4504, + "step": 2262 + }, + { + "epoch": 0.09573567983754971, + "grad_norm": 0.42225900292396545, + "learning_rate": 0.001, + "loss": 2.8715, + "step": 2263 + }, + { + "epoch": 0.09577798460106608, + "grad_norm": 1.451746940612793, + "learning_rate": 0.001, + "loss": 2.8526, + "step": 2264 + }, + { + "epoch": 0.09582028936458245, + "grad_norm": 0.3721246123313904, + "learning_rate": 0.001, + "loss": 2.4929, + "step": 2265 + }, + { + "epoch": 0.09586259412809882, + "grad_norm": 0.9242255091667175, + "learning_rate": 0.001, + "loss": 2.1786, + "step": 2266 + }, + { + "epoch": 0.0959048988916152, + "grad_norm": 0.4310086965560913, + "learning_rate": 0.001, + "loss": 3.3944, + "step": 2267 + }, + { + "epoch": 0.09594720365513157, + "grad_norm": 14.419943809509277, + "learning_rate": 0.001, + "loss": 1.8957, + "step": 2268 + }, + { + "epoch": 0.09598950841864794, + "grad_norm": 0.373689740896225, + "learning_rate": 0.001, + "loss": 2.3684, + "step": 2269 + }, + { + "epoch": 0.0960318131821643, + "grad_norm": 0.3977186381816864, + "learning_rate": 0.001, + "loss": 1.9556, + "step": 2270 + }, + { + "epoch": 0.09607411794568069, + "grad_norm": 0.6978753209114075, + "learning_rate": 0.001, + "loss": 2.1005, + "step": 2271 + }, + { + "epoch": 0.09611642270919706, + "grad_norm": 0.3750711679458618, + "learning_rate": 0.001, + "loss": 2.1204, + "step": 2272 + }, + { + "epoch": 0.09615872747271342, + "grad_norm": 0.5839630365371704, + "learning_rate": 0.001, + "loss": 2.1901, + "step": 2273 + }, + { + "epoch": 0.0962010322362298, + "grad_norm": 0.27384617924690247, + "learning_rate": 0.001, + "loss": 3.4993, + "step": 2274 + }, + { + "epoch": 0.09624333699974617, + "grad_norm": 0.3717498183250427, + "learning_rate": 0.001, + "loss": 1.9125, + "step": 2275 + }, + { + "epoch": 0.09628564176326254, + "grad_norm": 0.34626662731170654, + "learning_rate": 0.001, + "loss": 2.2712, + "step": 2276 + }, + { + "epoch": 0.09632794652677891, + "grad_norm": 1.6960554122924805, + "learning_rate": 0.001, + "loss": 2.3714, + "step": 2277 + }, + { + "epoch": 0.09637025129029529, + "grad_norm": 0.8217235803604126, + "learning_rate": 0.001, + "loss": 2.2524, + "step": 2278 + }, + { + "epoch": 0.09641255605381166, + "grad_norm": 0.2900844216346741, + "learning_rate": 0.001, + "loss": 2.7666, + "step": 2279 + }, + { + "epoch": 0.09645486081732803, + "grad_norm": 0.47249212861061096, + "learning_rate": 0.001, + "loss": 2.5579, + "step": 2280 + }, + { + "epoch": 0.0964971655808444, + "grad_norm": 0.2934323847293854, + "learning_rate": 0.001, + "loss": 2.2327, + "step": 2281 + }, + { + "epoch": 0.09653947034436078, + "grad_norm": 0.7934165596961975, + "learning_rate": 0.001, + "loss": 2.5456, + "step": 2282 + }, + { + "epoch": 0.09658177510787715, + "grad_norm": 0.442386269569397, + "learning_rate": 0.001, + "loss": 2.7316, + "step": 2283 + }, + { + "epoch": 0.09662407987139351, + "grad_norm": 0.3879510760307312, + "learning_rate": 0.001, + "loss": 1.8956, + "step": 2284 + }, + { + "epoch": 0.0966663846349099, + "grad_norm": 0.42023125290870667, + "learning_rate": 0.001, + "loss": 3.0717, + "step": 2285 + }, + { + "epoch": 0.09670868939842626, + "grad_norm": 0.21355478465557098, + "learning_rate": 0.001, + "loss": 2.0076, + "step": 2286 + }, + { + "epoch": 0.09675099416194263, + "grad_norm": 10.379256248474121, + "learning_rate": 0.001, + "loss": 2.8718, + "step": 2287 + }, + { + "epoch": 0.096793298925459, + "grad_norm": 0.3485308289527893, + "learning_rate": 0.001, + "loss": 1.9105, + "step": 2288 + }, + { + "epoch": 0.09683560368897538, + "grad_norm": 1.0859586000442505, + "learning_rate": 0.001, + "loss": 1.7195, + "step": 2289 + }, + { + "epoch": 0.09687790845249175, + "grad_norm": 0.43755942583084106, + "learning_rate": 0.001, + "loss": 2.1547, + "step": 2290 + }, + { + "epoch": 0.09692021321600812, + "grad_norm": 0.26755252480506897, + "learning_rate": 0.001, + "loss": 1.9872, + "step": 2291 + }, + { + "epoch": 0.0969625179795245, + "grad_norm": 0.3288552463054657, + "learning_rate": 0.001, + "loss": 3.2151, + "step": 2292 + }, + { + "epoch": 0.09700482274304087, + "grad_norm": 0.3707546293735504, + "learning_rate": 0.001, + "loss": 2.7954, + "step": 2293 + }, + { + "epoch": 0.09704712750655724, + "grad_norm": 0.33741122484207153, + "learning_rate": 0.001, + "loss": 2.5806, + "step": 2294 + }, + { + "epoch": 0.0970894322700736, + "grad_norm": 0.29366111755371094, + "learning_rate": 0.001, + "loss": 2.161, + "step": 2295 + }, + { + "epoch": 0.09713173703358999, + "grad_norm": 0.29789918661117554, + "learning_rate": 0.001, + "loss": 2.6224, + "step": 2296 + }, + { + "epoch": 0.09717404179710636, + "grad_norm": 2.258840322494507, + "learning_rate": 0.001, + "loss": 1.8646, + "step": 2297 + }, + { + "epoch": 0.09721634656062272, + "grad_norm": 0.2928341329097748, + "learning_rate": 0.001, + "loss": 2.6383, + "step": 2298 + }, + { + "epoch": 0.09725865132413909, + "grad_norm": 0.42597708106040955, + "learning_rate": 0.001, + "loss": 2.1843, + "step": 2299 + }, + { + "epoch": 0.09730095608765547, + "grad_norm": 2.620967149734497, + "learning_rate": 0.001, + "loss": 2.0323, + "step": 2300 + }, + { + "epoch": 0.09734326085117184, + "grad_norm": 0.5456410646438599, + "learning_rate": 0.001, + "loss": 1.7493, + "step": 2301 + }, + { + "epoch": 0.09738556561468821, + "grad_norm": 0.28042513132095337, + "learning_rate": 0.001, + "loss": 2.5266, + "step": 2302 + }, + { + "epoch": 0.09742787037820459, + "grad_norm": 0.23970133066177368, + "learning_rate": 0.001, + "loss": 1.9479, + "step": 2303 + }, + { + "epoch": 0.09747017514172096, + "grad_norm": 0.40894538164138794, + "learning_rate": 0.001, + "loss": 1.8509, + "step": 2304 + }, + { + "epoch": 0.09751247990523733, + "grad_norm": 0.23067353665828705, + "learning_rate": 0.001, + "loss": 2.7972, + "step": 2305 + }, + { + "epoch": 0.0975547846687537, + "grad_norm": 0.36500483751296997, + "learning_rate": 0.001, + "loss": 2.0178, + "step": 2306 + }, + { + "epoch": 0.09759708943227008, + "grad_norm": 0.4092254936695099, + "learning_rate": 0.001, + "loss": 2.9206, + "step": 2307 + }, + { + "epoch": 0.09763939419578645, + "grad_norm": 0.49489879608154297, + "learning_rate": 0.001, + "loss": 2.1519, + "step": 2308 + }, + { + "epoch": 0.09768169895930282, + "grad_norm": 0.39644548296928406, + "learning_rate": 0.001, + "loss": 2.3865, + "step": 2309 + }, + { + "epoch": 0.09772400372281918, + "grad_norm": 0.4900401830673218, + "learning_rate": 0.001, + "loss": 1.6001, + "step": 2310 + }, + { + "epoch": 0.09776630848633557, + "grad_norm": 0.2798866629600525, + "learning_rate": 0.001, + "loss": 2.7082, + "step": 2311 + }, + { + "epoch": 0.09780861324985193, + "grad_norm": 0.23535476624965668, + "learning_rate": 0.001, + "loss": 1.938, + "step": 2312 + }, + { + "epoch": 0.0978509180133683, + "grad_norm": 0.27713868021965027, + "learning_rate": 0.001, + "loss": 2.7315, + "step": 2313 + }, + { + "epoch": 0.09789322277688468, + "grad_norm": 0.3740001618862152, + "learning_rate": 0.001, + "loss": 2.2113, + "step": 2314 + }, + { + "epoch": 0.09793552754040105, + "grad_norm": 0.23348340392112732, + "learning_rate": 0.001, + "loss": 2.2677, + "step": 2315 + }, + { + "epoch": 0.09797783230391742, + "grad_norm": 2.2800018787384033, + "learning_rate": 0.001, + "loss": 2.1095, + "step": 2316 + }, + { + "epoch": 0.09802013706743379, + "grad_norm": 0.2242593616247177, + "learning_rate": 0.001, + "loss": 2.0112, + "step": 2317 + }, + { + "epoch": 0.09806244183095017, + "grad_norm": 0.40007051825523376, + "learning_rate": 0.001, + "loss": 2.9323, + "step": 2318 + }, + { + "epoch": 0.09810474659446654, + "grad_norm": 0.32281118631362915, + "learning_rate": 0.001, + "loss": 2.8329, + "step": 2319 + }, + { + "epoch": 0.0981470513579829, + "grad_norm": 0.32091641426086426, + "learning_rate": 0.001, + "loss": 2.1386, + "step": 2320 + }, + { + "epoch": 0.09818935612149927, + "grad_norm": 0.372270792722702, + "learning_rate": 0.001, + "loss": 3.7813, + "step": 2321 + }, + { + "epoch": 0.09823166088501566, + "grad_norm": 0.23315389454364777, + "learning_rate": 0.001, + "loss": 1.7683, + "step": 2322 + }, + { + "epoch": 0.09827396564853202, + "grad_norm": 0.287507027387619, + "learning_rate": 0.001, + "loss": 2.4763, + "step": 2323 + }, + { + "epoch": 0.09831627041204839, + "grad_norm": 0.33321303129196167, + "learning_rate": 0.001, + "loss": 2.5295, + "step": 2324 + }, + { + "epoch": 0.09835857517556477, + "grad_norm": 0.40260010957717896, + "learning_rate": 0.001, + "loss": 2.4879, + "step": 2325 + }, + { + "epoch": 0.09840087993908114, + "grad_norm": 0.5056272745132446, + "learning_rate": 0.001, + "loss": 3.2216, + "step": 2326 + }, + { + "epoch": 0.09844318470259751, + "grad_norm": 6.566803455352783, + "learning_rate": 0.001, + "loss": 2.4137, + "step": 2327 + }, + { + "epoch": 0.09848548946611388, + "grad_norm": 0.37313172221183777, + "learning_rate": 0.001, + "loss": 2.8742, + "step": 2328 + }, + { + "epoch": 0.09852779422963026, + "grad_norm": 0.7859897613525391, + "learning_rate": 0.001, + "loss": 1.96, + "step": 2329 + }, + { + "epoch": 0.09857009899314663, + "grad_norm": 0.2522431015968323, + "learning_rate": 0.001, + "loss": 2.2632, + "step": 2330 + }, + { + "epoch": 0.098612403756663, + "grad_norm": 8.351020812988281, + "learning_rate": 0.001, + "loss": 3.1826, + "step": 2331 + }, + { + "epoch": 0.09865470852017937, + "grad_norm": 0.4760158360004425, + "learning_rate": 0.001, + "loss": 2.1825, + "step": 2332 + }, + { + "epoch": 0.09869701328369575, + "grad_norm": 0.27335453033447266, + "learning_rate": 0.001, + "loss": 1.8718, + "step": 2333 + }, + { + "epoch": 0.09873931804721212, + "grad_norm": 0.3000699281692505, + "learning_rate": 0.001, + "loss": 2.1485, + "step": 2334 + }, + { + "epoch": 0.09878162281072848, + "grad_norm": 0.2664180099964142, + "learning_rate": 0.001, + "loss": 2.0335, + "step": 2335 + }, + { + "epoch": 0.09882392757424487, + "grad_norm": 1.341559886932373, + "learning_rate": 0.001, + "loss": 1.9941, + "step": 2336 + }, + { + "epoch": 0.09886623233776123, + "grad_norm": 0.2919972836971283, + "learning_rate": 0.001, + "loss": 2.7002, + "step": 2337 + }, + { + "epoch": 0.0989085371012776, + "grad_norm": 0.3330998718738556, + "learning_rate": 0.001, + "loss": 2.831, + "step": 2338 + }, + { + "epoch": 0.09895084186479397, + "grad_norm": 2.2501633167266846, + "learning_rate": 0.001, + "loss": 2.2368, + "step": 2339 + }, + { + "epoch": 0.09899314662831035, + "grad_norm": 0.7335495352745056, + "learning_rate": 0.001, + "loss": 3.2086, + "step": 2340 + }, + { + "epoch": 0.09903545139182672, + "grad_norm": 0.36334070563316345, + "learning_rate": 0.001, + "loss": 3.0316, + "step": 2341 + }, + { + "epoch": 0.09907775615534309, + "grad_norm": 0.35820385813713074, + "learning_rate": 0.001, + "loss": 2.6684, + "step": 2342 + }, + { + "epoch": 0.09912006091885946, + "grad_norm": 0.5417790412902832, + "learning_rate": 0.001, + "loss": 3.4955, + "step": 2343 + }, + { + "epoch": 0.09916236568237584, + "grad_norm": 0.6243907809257507, + "learning_rate": 0.001, + "loss": 2.7038, + "step": 2344 + }, + { + "epoch": 0.0992046704458922, + "grad_norm": 0.4269777834415436, + "learning_rate": 0.001, + "loss": 3.0679, + "step": 2345 + }, + { + "epoch": 0.09924697520940857, + "grad_norm": 0.45058220624923706, + "learning_rate": 0.001, + "loss": 2.0214, + "step": 2346 + }, + { + "epoch": 0.09928927997292496, + "grad_norm": 0.35213160514831543, + "learning_rate": 0.001, + "loss": 1.6954, + "step": 2347 + }, + { + "epoch": 0.09933158473644133, + "grad_norm": 0.36422258615493774, + "learning_rate": 0.001, + "loss": 2.605, + "step": 2348 + }, + { + "epoch": 0.0993738894999577, + "grad_norm": 0.4111798405647278, + "learning_rate": 0.001, + "loss": 2.9701, + "step": 2349 + }, + { + "epoch": 0.09941619426347406, + "grad_norm": 0.2559814453125, + "learning_rate": 0.001, + "loss": 2.0841, + "step": 2350 + }, + { + "epoch": 0.09945849902699044, + "grad_norm": 0.4313149154186249, + "learning_rate": 0.001, + "loss": 2.1517, + "step": 2351 + }, + { + "epoch": 0.09950080379050681, + "grad_norm": 0.7786276340484619, + "learning_rate": 0.001, + "loss": 2.1795, + "step": 2352 + }, + { + "epoch": 0.09954310855402318, + "grad_norm": 0.30976659059524536, + "learning_rate": 0.001, + "loss": 2.1551, + "step": 2353 + }, + { + "epoch": 0.09958541331753956, + "grad_norm": 0.2895033061504364, + "learning_rate": 0.001, + "loss": 2.9111, + "step": 2354 + }, + { + "epoch": 0.09962771808105593, + "grad_norm": 0.28255394101142883, + "learning_rate": 0.001, + "loss": 2.9254, + "step": 2355 + }, + { + "epoch": 0.0996700228445723, + "grad_norm": 1.1467900276184082, + "learning_rate": 0.001, + "loss": 2.8531, + "step": 2356 + }, + { + "epoch": 0.09971232760808867, + "grad_norm": 0.2727132737636566, + "learning_rate": 0.001, + "loss": 2.8811, + "step": 2357 + }, + { + "epoch": 0.09975463237160505, + "grad_norm": 0.2686462104320526, + "learning_rate": 0.001, + "loss": 3.5868, + "step": 2358 + }, + { + "epoch": 0.09979693713512142, + "grad_norm": 0.888546884059906, + "learning_rate": 0.001, + "loss": 2.3209, + "step": 2359 + }, + { + "epoch": 0.09983924189863778, + "grad_norm": 0.96397465467453, + "learning_rate": 0.001, + "loss": 2.3643, + "step": 2360 + }, + { + "epoch": 0.09988154666215415, + "grad_norm": 0.3050881028175354, + "learning_rate": 0.001, + "loss": 2.7307, + "step": 2361 + }, + { + "epoch": 0.09992385142567053, + "grad_norm": 0.2523465156555176, + "learning_rate": 0.001, + "loss": 1.8036, + "step": 2362 + }, + { + "epoch": 0.0999661561891869, + "grad_norm": 0.28606629371643066, + "learning_rate": 0.001, + "loss": 3.3748, + "step": 2363 + }, + { + "epoch": 0.10000846095270327, + "grad_norm": 0.25041037797927856, + "learning_rate": 0.001, + "loss": 1.5891, + "step": 2364 + }, + { + "epoch": 0.10005076571621965, + "grad_norm": 0.36506375670433044, + "learning_rate": 0.001, + "loss": 2.7191, + "step": 2365 + }, + { + "epoch": 0.10009307047973602, + "grad_norm": 0.30195456743240356, + "learning_rate": 0.001, + "loss": 2.418, + "step": 2366 + }, + { + "epoch": 0.10013537524325239, + "grad_norm": 0.2500908374786377, + "learning_rate": 0.001, + "loss": 1.9814, + "step": 2367 + }, + { + "epoch": 0.10017768000676876, + "grad_norm": 0.29026201367378235, + "learning_rate": 0.001, + "loss": 2.4507, + "step": 2368 + }, + { + "epoch": 0.10021998477028514, + "grad_norm": 0.28172945976257324, + "learning_rate": 0.001, + "loss": 2.6413, + "step": 2369 + }, + { + "epoch": 0.10026228953380151, + "grad_norm": 0.2788645625114441, + "learning_rate": 0.001, + "loss": 2.1827, + "step": 2370 + }, + { + "epoch": 0.10030459429731788, + "grad_norm": 0.24209265410900116, + "learning_rate": 0.001, + "loss": 2.1193, + "step": 2371 + }, + { + "epoch": 0.10034689906083424, + "grad_norm": 0.25518038868904114, + "learning_rate": 0.001, + "loss": 2.3543, + "step": 2372 + }, + { + "epoch": 0.10038920382435063, + "grad_norm": 0.28497177362442017, + "learning_rate": 0.001, + "loss": 2.3745, + "step": 2373 + }, + { + "epoch": 0.100431508587867, + "grad_norm": 0.49267375469207764, + "learning_rate": 0.001, + "loss": 1.9744, + "step": 2374 + }, + { + "epoch": 0.10047381335138336, + "grad_norm": 0.26035183668136597, + "learning_rate": 0.001, + "loss": 1.834, + "step": 2375 + }, + { + "epoch": 0.10051611811489974, + "grad_norm": 0.28036877512931824, + "learning_rate": 0.001, + "loss": 3.1908, + "step": 2376 + }, + { + "epoch": 0.10055842287841611, + "grad_norm": 1.9085135459899902, + "learning_rate": 0.001, + "loss": 3.0537, + "step": 2377 + }, + { + "epoch": 0.10060072764193248, + "grad_norm": 0.26107195019721985, + "learning_rate": 0.001, + "loss": 2.0188, + "step": 2378 + }, + { + "epoch": 0.10064303240544885, + "grad_norm": 0.2460990846157074, + "learning_rate": 0.001, + "loss": 3.2141, + "step": 2379 + }, + { + "epoch": 0.10068533716896523, + "grad_norm": 0.3403857946395874, + "learning_rate": 0.001, + "loss": 2.0665, + "step": 2380 + }, + { + "epoch": 0.1007276419324816, + "grad_norm": 0.36075910925865173, + "learning_rate": 0.001, + "loss": 2.6347, + "step": 2381 + }, + { + "epoch": 0.10076994669599797, + "grad_norm": 2.8513433933258057, + "learning_rate": 0.001, + "loss": 2.6082, + "step": 2382 + }, + { + "epoch": 0.10081225145951433, + "grad_norm": 0.30714812874794006, + "learning_rate": 0.001, + "loss": 2.1303, + "step": 2383 + }, + { + "epoch": 0.10085455622303072, + "grad_norm": 0.26115673780441284, + "learning_rate": 0.001, + "loss": 2.4109, + "step": 2384 + }, + { + "epoch": 0.10089686098654709, + "grad_norm": 3.011416435241699, + "learning_rate": 0.001, + "loss": 2.6895, + "step": 2385 + }, + { + "epoch": 0.10093916575006345, + "grad_norm": 0.2893314063549042, + "learning_rate": 0.001, + "loss": 1.6992, + "step": 2386 + }, + { + "epoch": 0.10098147051357984, + "grad_norm": 0.411714106798172, + "learning_rate": 0.001, + "loss": 3.7138, + "step": 2387 + }, + { + "epoch": 0.1010237752770962, + "grad_norm": 0.37683776021003723, + "learning_rate": 0.001, + "loss": 2.6581, + "step": 2388 + }, + { + "epoch": 0.10106608004061257, + "grad_norm": 0.8233429789543152, + "learning_rate": 0.001, + "loss": 3.1353, + "step": 2389 + }, + { + "epoch": 0.10110838480412894, + "grad_norm": 0.7344207167625427, + "learning_rate": 0.001, + "loss": 2.6045, + "step": 2390 + }, + { + "epoch": 0.10115068956764532, + "grad_norm": 0.28514331579208374, + "learning_rate": 0.001, + "loss": 1.9666, + "step": 2391 + }, + { + "epoch": 0.10119299433116169, + "grad_norm": 0.2906087040901184, + "learning_rate": 0.001, + "loss": 1.729, + "step": 2392 + }, + { + "epoch": 0.10123529909467806, + "grad_norm": 0.4669470191001892, + "learning_rate": 0.001, + "loss": 1.5726, + "step": 2393 + }, + { + "epoch": 0.10127760385819443, + "grad_norm": 0.44565969705581665, + "learning_rate": 0.001, + "loss": 2.1407, + "step": 2394 + }, + { + "epoch": 0.10131990862171081, + "grad_norm": 0.2725061774253845, + "learning_rate": 0.001, + "loss": 2.1667, + "step": 2395 + }, + { + "epoch": 0.10136221338522718, + "grad_norm": 0.49515077471733093, + "learning_rate": 0.001, + "loss": 2.4108, + "step": 2396 + }, + { + "epoch": 0.10140451814874354, + "grad_norm": 0.2847639322280884, + "learning_rate": 0.001, + "loss": 2.9909, + "step": 2397 + }, + { + "epoch": 0.10144682291225993, + "grad_norm": 0.8457419872283936, + "learning_rate": 0.001, + "loss": 3.0259, + "step": 2398 + }, + { + "epoch": 0.1014891276757763, + "grad_norm": 9.33295726776123, + "learning_rate": 0.001, + "loss": 3.2422, + "step": 2399 + }, + { + "epoch": 0.10153143243929266, + "grad_norm": 1.8597520589828491, + "learning_rate": 0.001, + "loss": 2.6769, + "step": 2400 + }, + { + "epoch": 0.10157373720280903, + "grad_norm": 0.26994621753692627, + "learning_rate": 0.001, + "loss": 2.2029, + "step": 2401 + }, + { + "epoch": 0.10161604196632541, + "grad_norm": 1.4038335084915161, + "learning_rate": 0.001, + "loss": 3.4995, + "step": 2402 + }, + { + "epoch": 0.10165834672984178, + "grad_norm": 0.6501076221466064, + "learning_rate": 0.001, + "loss": 2.7349, + "step": 2403 + }, + { + "epoch": 0.10170065149335815, + "grad_norm": 0.21357211470603943, + "learning_rate": 0.001, + "loss": 1.9009, + "step": 2404 + }, + { + "epoch": 0.10174295625687452, + "grad_norm": 0.272206574678421, + "learning_rate": 0.001, + "loss": 2.2061, + "step": 2405 + }, + { + "epoch": 0.1017852610203909, + "grad_norm": 0.23111356794834137, + "learning_rate": 0.001, + "loss": 2.2551, + "step": 2406 + }, + { + "epoch": 0.10182756578390727, + "grad_norm": 0.21537300944328308, + "learning_rate": 0.001, + "loss": 3.186, + "step": 2407 + }, + { + "epoch": 0.10186987054742364, + "grad_norm": 0.26508206129074097, + "learning_rate": 0.001, + "loss": 2.1786, + "step": 2408 + }, + { + "epoch": 0.10191217531094002, + "grad_norm": 3.251347303390503, + "learning_rate": 0.001, + "loss": 2.1077, + "step": 2409 + }, + { + "epoch": 0.10195448007445639, + "grad_norm": 0.2736663818359375, + "learning_rate": 0.001, + "loss": 2.0577, + "step": 2410 + }, + { + "epoch": 0.10199678483797275, + "grad_norm": 0.3979964852333069, + "learning_rate": 0.001, + "loss": 3.457, + "step": 2411 + }, + { + "epoch": 0.10203908960148912, + "grad_norm": 0.2330218404531479, + "learning_rate": 0.001, + "loss": 2.2031, + "step": 2412 + }, + { + "epoch": 0.1020813943650055, + "grad_norm": 0.24427588284015656, + "learning_rate": 0.001, + "loss": 1.7257, + "step": 2413 + }, + { + "epoch": 0.10212369912852187, + "grad_norm": 0.7020580172538757, + "learning_rate": 0.001, + "loss": 3.6614, + "step": 2414 + }, + { + "epoch": 0.10216600389203824, + "grad_norm": 0.35481932759284973, + "learning_rate": 0.001, + "loss": 2.5094, + "step": 2415 + }, + { + "epoch": 0.10220830865555462, + "grad_norm": 10.207537651062012, + "learning_rate": 0.001, + "loss": 2.1178, + "step": 2416 + }, + { + "epoch": 0.10225061341907099, + "grad_norm": 0.3748272657394409, + "learning_rate": 0.001, + "loss": 2.5037, + "step": 2417 + }, + { + "epoch": 0.10229291818258736, + "grad_norm": 1.0302212238311768, + "learning_rate": 0.001, + "loss": 1.8157, + "step": 2418 + }, + { + "epoch": 0.10233522294610373, + "grad_norm": 0.35969147086143494, + "learning_rate": 0.001, + "loss": 2.2622, + "step": 2419 + }, + { + "epoch": 0.10237752770962011, + "grad_norm": 0.3533392548561096, + "learning_rate": 0.001, + "loss": 2.4072, + "step": 2420 + }, + { + "epoch": 0.10241983247313648, + "grad_norm": 0.3274601101875305, + "learning_rate": 0.001, + "loss": 2.4427, + "step": 2421 + }, + { + "epoch": 0.10246213723665284, + "grad_norm": 0.35946276783943176, + "learning_rate": 0.001, + "loss": 1.8442, + "step": 2422 + }, + { + "epoch": 0.10250444200016921, + "grad_norm": 0.3757915198802948, + "learning_rate": 0.001, + "loss": 3.0923, + "step": 2423 + }, + { + "epoch": 0.1025467467636856, + "grad_norm": 0.3471343219280243, + "learning_rate": 0.001, + "loss": 2.3881, + "step": 2424 + }, + { + "epoch": 0.10258905152720196, + "grad_norm": 0.2516452372074127, + "learning_rate": 0.001, + "loss": 2.0774, + "step": 2425 + }, + { + "epoch": 0.10263135629071833, + "grad_norm": 0.23940257728099823, + "learning_rate": 0.001, + "loss": 2.3271, + "step": 2426 + }, + { + "epoch": 0.10267366105423471, + "grad_norm": 0.8426430821418762, + "learning_rate": 0.001, + "loss": 2.4435, + "step": 2427 + }, + { + "epoch": 0.10271596581775108, + "grad_norm": 0.7691043019294739, + "learning_rate": 0.001, + "loss": 1.9913, + "step": 2428 + }, + { + "epoch": 0.10275827058126745, + "grad_norm": 0.29252296686172485, + "learning_rate": 0.001, + "loss": 3.3262, + "step": 2429 + }, + { + "epoch": 0.10280057534478382, + "grad_norm": 0.25585702061653137, + "learning_rate": 0.001, + "loss": 2.3882, + "step": 2430 + }, + { + "epoch": 0.1028428801083002, + "grad_norm": 2.2944388389587402, + "learning_rate": 0.001, + "loss": 2.7547, + "step": 2431 + }, + { + "epoch": 0.10288518487181657, + "grad_norm": 0.3069431185722351, + "learning_rate": 0.001, + "loss": 2.5555, + "step": 2432 + }, + { + "epoch": 0.10292748963533294, + "grad_norm": 0.6198341250419617, + "learning_rate": 0.001, + "loss": 2.2501, + "step": 2433 + }, + { + "epoch": 0.1029697943988493, + "grad_norm": 0.23452864587306976, + "learning_rate": 0.001, + "loss": 2.0086, + "step": 2434 + }, + { + "epoch": 0.10301209916236569, + "grad_norm": 0.27118051052093506, + "learning_rate": 0.001, + "loss": 1.6991, + "step": 2435 + }, + { + "epoch": 0.10305440392588205, + "grad_norm": 0.2895638048648834, + "learning_rate": 0.001, + "loss": 2.391, + "step": 2436 + }, + { + "epoch": 0.10309670868939842, + "grad_norm": 0.5216059684753418, + "learning_rate": 0.001, + "loss": 2.0853, + "step": 2437 + }, + { + "epoch": 0.1031390134529148, + "grad_norm": 0.32230275869369507, + "learning_rate": 0.001, + "loss": 2.3947, + "step": 2438 + }, + { + "epoch": 0.10318131821643117, + "grad_norm": 0.34388068318367004, + "learning_rate": 0.001, + "loss": 2.6358, + "step": 2439 + }, + { + "epoch": 0.10322362297994754, + "grad_norm": 0.3274713456630707, + "learning_rate": 0.001, + "loss": 3.2649, + "step": 2440 + }, + { + "epoch": 0.10326592774346391, + "grad_norm": 0.24570752680301666, + "learning_rate": 0.001, + "loss": 2.3323, + "step": 2441 + }, + { + "epoch": 0.10330823250698029, + "grad_norm": 0.23743736743927002, + "learning_rate": 0.001, + "loss": 1.6279, + "step": 2442 + }, + { + "epoch": 0.10335053727049666, + "grad_norm": 0.2327599972486496, + "learning_rate": 0.001, + "loss": 2.0293, + "step": 2443 + }, + { + "epoch": 0.10339284203401303, + "grad_norm": 0.24773749709129333, + "learning_rate": 0.001, + "loss": 2.1824, + "step": 2444 + }, + { + "epoch": 0.1034351467975294, + "grad_norm": 0.3534669578075409, + "learning_rate": 0.001, + "loss": 2.7218, + "step": 2445 + }, + { + "epoch": 0.10347745156104578, + "grad_norm": 0.5942555069923401, + "learning_rate": 0.001, + "loss": 2.3746, + "step": 2446 + }, + { + "epoch": 0.10351975632456215, + "grad_norm": 0.26471078395843506, + "learning_rate": 0.001, + "loss": 2.2947, + "step": 2447 + }, + { + "epoch": 0.10356206108807851, + "grad_norm": 0.3464234173297882, + "learning_rate": 0.001, + "loss": 2.1127, + "step": 2448 + }, + { + "epoch": 0.1036043658515949, + "grad_norm": 1.5699352025985718, + "learning_rate": 0.001, + "loss": 2.5707, + "step": 2449 + }, + { + "epoch": 0.10364667061511126, + "grad_norm": 1.2185924053192139, + "learning_rate": 0.001, + "loss": 2.956, + "step": 2450 + }, + { + "epoch": 0.10368897537862763, + "grad_norm": 0.2412140667438507, + "learning_rate": 0.001, + "loss": 2.1361, + "step": 2451 + }, + { + "epoch": 0.103731280142144, + "grad_norm": 1.7203179597854614, + "learning_rate": 0.001, + "loss": 2.7507, + "step": 2452 + }, + { + "epoch": 0.10377358490566038, + "grad_norm": 0.40216708183288574, + "learning_rate": 0.001, + "loss": 2.3239, + "step": 2453 + }, + { + "epoch": 0.10381588966917675, + "grad_norm": 0.41361749172210693, + "learning_rate": 0.001, + "loss": 2.4953, + "step": 2454 + }, + { + "epoch": 0.10385819443269312, + "grad_norm": 2.6895222663879395, + "learning_rate": 0.001, + "loss": 2.6164, + "step": 2455 + }, + { + "epoch": 0.10390049919620949, + "grad_norm": 0.6734895706176758, + "learning_rate": 0.001, + "loss": 2.5122, + "step": 2456 + }, + { + "epoch": 0.10394280395972587, + "grad_norm": 3.203244686126709, + "learning_rate": 0.001, + "loss": 2.6159, + "step": 2457 + }, + { + "epoch": 0.10398510872324224, + "grad_norm": 7.857353687286377, + "learning_rate": 0.001, + "loss": 2.2798, + "step": 2458 + }, + { + "epoch": 0.1040274134867586, + "grad_norm": 0.4832097589969635, + "learning_rate": 0.001, + "loss": 2.6077, + "step": 2459 + }, + { + "epoch": 0.10406971825027499, + "grad_norm": 1.2383735179901123, + "learning_rate": 0.001, + "loss": 2.2514, + "step": 2460 + }, + { + "epoch": 0.10411202301379135, + "grad_norm": 0.27097123861312866, + "learning_rate": 0.001, + "loss": 1.8764, + "step": 2461 + }, + { + "epoch": 0.10415432777730772, + "grad_norm": 0.2842322587966919, + "learning_rate": 0.001, + "loss": 2.4073, + "step": 2462 + }, + { + "epoch": 0.10419663254082409, + "grad_norm": 0.31297391653060913, + "learning_rate": 0.001, + "loss": 1.8048, + "step": 2463 + }, + { + "epoch": 0.10423893730434047, + "grad_norm": 0.4402758479118347, + "learning_rate": 0.001, + "loss": 3.4671, + "step": 2464 + }, + { + "epoch": 0.10428124206785684, + "grad_norm": 0.28556615114212036, + "learning_rate": 0.001, + "loss": 2.2693, + "step": 2465 + }, + { + "epoch": 0.10432354683137321, + "grad_norm": 0.2614104151725769, + "learning_rate": 0.001, + "loss": 1.8248, + "step": 2466 + }, + { + "epoch": 0.10436585159488958, + "grad_norm": 0.24164602160453796, + "learning_rate": 0.001, + "loss": 2.1898, + "step": 2467 + }, + { + "epoch": 0.10440815635840596, + "grad_norm": 0.3009679913520813, + "learning_rate": 0.001, + "loss": 3.0112, + "step": 2468 + }, + { + "epoch": 0.10445046112192233, + "grad_norm": 0.3488333821296692, + "learning_rate": 0.001, + "loss": 2.0925, + "step": 2469 + }, + { + "epoch": 0.1044927658854387, + "grad_norm": 0.6833105087280273, + "learning_rate": 0.001, + "loss": 2.22, + "step": 2470 + }, + { + "epoch": 0.10453507064895508, + "grad_norm": 0.28377044200897217, + "learning_rate": 0.001, + "loss": 2.2993, + "step": 2471 + }, + { + "epoch": 0.10457737541247145, + "grad_norm": 0.3093280792236328, + "learning_rate": 0.001, + "loss": 2.701, + "step": 2472 + }, + { + "epoch": 0.10461968017598781, + "grad_norm": 0.31424522399902344, + "learning_rate": 0.001, + "loss": 2.6909, + "step": 2473 + }, + { + "epoch": 0.10466198493950418, + "grad_norm": 0.23883268237113953, + "learning_rate": 0.001, + "loss": 3.2359, + "step": 2474 + }, + { + "epoch": 0.10470428970302056, + "grad_norm": 0.3218192458152771, + "learning_rate": 0.001, + "loss": 2.6222, + "step": 2475 + }, + { + "epoch": 0.10474659446653693, + "grad_norm": 0.3537428677082062, + "learning_rate": 0.001, + "loss": 2.6754, + "step": 2476 + }, + { + "epoch": 0.1047888992300533, + "grad_norm": 0.2585582137107849, + "learning_rate": 0.001, + "loss": 2.7918, + "step": 2477 + }, + { + "epoch": 0.10483120399356968, + "grad_norm": 0.28491732478141785, + "learning_rate": 0.001, + "loss": 1.814, + "step": 2478 + }, + { + "epoch": 0.10487350875708605, + "grad_norm": 0.31250932812690735, + "learning_rate": 0.001, + "loss": 2.1781, + "step": 2479 + }, + { + "epoch": 0.10491581352060242, + "grad_norm": 0.6643452644348145, + "learning_rate": 0.001, + "loss": 4.0611, + "step": 2480 + }, + { + "epoch": 0.10495811828411879, + "grad_norm": 0.2877008318901062, + "learning_rate": 0.001, + "loss": 2.3672, + "step": 2481 + }, + { + "epoch": 0.10500042304763517, + "grad_norm": 0.23607558012008667, + "learning_rate": 0.001, + "loss": 1.5298, + "step": 2482 + }, + { + "epoch": 0.10504272781115154, + "grad_norm": 3.870473623275757, + "learning_rate": 0.001, + "loss": 2.4808, + "step": 2483 + }, + { + "epoch": 0.1050850325746679, + "grad_norm": 0.29349610209465027, + "learning_rate": 0.001, + "loss": 1.9366, + "step": 2484 + }, + { + "epoch": 0.10512733733818427, + "grad_norm": 0.23734307289123535, + "learning_rate": 0.001, + "loss": 2.6115, + "step": 2485 + }, + { + "epoch": 0.10516964210170066, + "grad_norm": 0.8626511693000793, + "learning_rate": 0.001, + "loss": 2.4259, + "step": 2486 + }, + { + "epoch": 0.10521194686521702, + "grad_norm": 5.853229999542236, + "learning_rate": 0.001, + "loss": 2.406, + "step": 2487 + }, + { + "epoch": 0.10525425162873339, + "grad_norm": 0.23822012543678284, + "learning_rate": 0.001, + "loss": 3.1322, + "step": 2488 + }, + { + "epoch": 0.10529655639224977, + "grad_norm": 0.2993183732032776, + "learning_rate": 0.001, + "loss": 3.2551, + "step": 2489 + }, + { + "epoch": 0.10533886115576614, + "grad_norm": 1.8715354204177856, + "learning_rate": 0.001, + "loss": 2.2385, + "step": 2490 + }, + { + "epoch": 0.10538116591928251, + "grad_norm": 0.3598434329032898, + "learning_rate": 0.001, + "loss": 2.4187, + "step": 2491 + }, + { + "epoch": 0.10542347068279888, + "grad_norm": 0.5928519368171692, + "learning_rate": 0.001, + "loss": 1.9091, + "step": 2492 + }, + { + "epoch": 0.10546577544631526, + "grad_norm": 1.1429487466812134, + "learning_rate": 0.001, + "loss": 2.5143, + "step": 2493 + }, + { + "epoch": 0.10550808020983163, + "grad_norm": 2.955080270767212, + "learning_rate": 0.001, + "loss": 2.2716, + "step": 2494 + }, + { + "epoch": 0.105550384973348, + "grad_norm": 0.3205156922340393, + "learning_rate": 0.001, + "loss": 3.5673, + "step": 2495 + }, + { + "epoch": 0.10559268973686436, + "grad_norm": 0.31520524621009827, + "learning_rate": 0.001, + "loss": 2.6269, + "step": 2496 + }, + { + "epoch": 0.10563499450038075, + "grad_norm": 0.2760033905506134, + "learning_rate": 0.001, + "loss": 2.1043, + "step": 2497 + }, + { + "epoch": 0.10567729926389711, + "grad_norm": 0.23244822025299072, + "learning_rate": 0.001, + "loss": 2.583, + "step": 2498 + }, + { + "epoch": 0.10571960402741348, + "grad_norm": 0.24715182185173035, + "learning_rate": 0.001, + "loss": 2.147, + "step": 2499 + }, + { + "epoch": 0.10576190879092986, + "grad_norm": 0.3001532256603241, + "learning_rate": 0.001, + "loss": 2.2312, + "step": 2500 + }, + { + "epoch": 0.10580421355444623, + "grad_norm": 0.2582187056541443, + "learning_rate": 0.001, + "loss": 2.7161, + "step": 2501 + }, + { + "epoch": 0.1058465183179626, + "grad_norm": 0.6607329249382019, + "learning_rate": 0.001, + "loss": 2.7597, + "step": 2502 + }, + { + "epoch": 0.10588882308147897, + "grad_norm": 0.2999674677848816, + "learning_rate": 0.001, + "loss": 2.7306, + "step": 2503 + }, + { + "epoch": 0.10593112784499535, + "grad_norm": 0.2406844049692154, + "learning_rate": 0.001, + "loss": 1.7858, + "step": 2504 + }, + { + "epoch": 0.10597343260851172, + "grad_norm": 0.23513177037239075, + "learning_rate": 0.001, + "loss": 2.072, + "step": 2505 + }, + { + "epoch": 0.10601573737202809, + "grad_norm": 0.2586327791213989, + "learning_rate": 0.001, + "loss": 2.1561, + "step": 2506 + }, + { + "epoch": 0.10605804213554446, + "grad_norm": 0.2618561089038849, + "learning_rate": 0.001, + "loss": 1.7579, + "step": 2507 + }, + { + "epoch": 0.10610034689906084, + "grad_norm": 0.34452149271965027, + "learning_rate": 0.001, + "loss": 2.1415, + "step": 2508 + }, + { + "epoch": 0.1061426516625772, + "grad_norm": 0.3686632513999939, + "learning_rate": 0.001, + "loss": 2.3883, + "step": 2509 + }, + { + "epoch": 0.10618495642609357, + "grad_norm": 0.2636253535747528, + "learning_rate": 0.001, + "loss": 2.2841, + "step": 2510 + }, + { + "epoch": 0.10622726118960996, + "grad_norm": 0.2659060060977936, + "learning_rate": 0.001, + "loss": 2.5267, + "step": 2511 + }, + { + "epoch": 0.10626956595312632, + "grad_norm": 0.37761518359184265, + "learning_rate": 0.001, + "loss": 2.7947, + "step": 2512 + }, + { + "epoch": 0.10631187071664269, + "grad_norm": 0.8648855090141296, + "learning_rate": 0.001, + "loss": 2.7321, + "step": 2513 + }, + { + "epoch": 0.10635417548015906, + "grad_norm": 0.26311472058296204, + "learning_rate": 0.001, + "loss": 1.881, + "step": 2514 + }, + { + "epoch": 0.10639648024367544, + "grad_norm": 0.344380259513855, + "learning_rate": 0.001, + "loss": 2.3485, + "step": 2515 + }, + { + "epoch": 0.10643878500719181, + "grad_norm": 0.35995468497276306, + "learning_rate": 0.001, + "loss": 2.1559, + "step": 2516 + }, + { + "epoch": 0.10648108977070818, + "grad_norm": 0.32800522446632385, + "learning_rate": 0.001, + "loss": 2.0882, + "step": 2517 + }, + { + "epoch": 0.10652339453422455, + "grad_norm": 0.3281853497028351, + "learning_rate": 0.001, + "loss": 2.4298, + "step": 2518 + }, + { + "epoch": 0.10656569929774093, + "grad_norm": 0.25847840309143066, + "learning_rate": 0.001, + "loss": 2.1247, + "step": 2519 + }, + { + "epoch": 0.1066080040612573, + "grad_norm": 0.6328796148300171, + "learning_rate": 0.001, + "loss": 2.7698, + "step": 2520 + }, + { + "epoch": 0.10665030882477367, + "grad_norm": 1.8759524822235107, + "learning_rate": 0.001, + "loss": 2.4454, + "step": 2521 + }, + { + "epoch": 0.10669261358829005, + "grad_norm": 0.2672816812992096, + "learning_rate": 0.001, + "loss": 2.3352, + "step": 2522 + }, + { + "epoch": 0.10673491835180642, + "grad_norm": 0.22392208874225616, + "learning_rate": 0.001, + "loss": 1.6423, + "step": 2523 + }, + { + "epoch": 0.10677722311532278, + "grad_norm": 0.43734171986579895, + "learning_rate": 0.001, + "loss": 2.8971, + "step": 2524 + }, + { + "epoch": 0.10681952787883915, + "grad_norm": 0.28167760372161865, + "learning_rate": 0.001, + "loss": 1.9086, + "step": 2525 + }, + { + "epoch": 0.10686183264235553, + "grad_norm": 0.2657049000263214, + "learning_rate": 0.001, + "loss": 3.812, + "step": 2526 + }, + { + "epoch": 0.1069041374058719, + "grad_norm": 0.26134321093559265, + "learning_rate": 0.001, + "loss": 2.2141, + "step": 2527 + }, + { + "epoch": 0.10694644216938827, + "grad_norm": 0.271685928106308, + "learning_rate": 0.001, + "loss": 2.1261, + "step": 2528 + }, + { + "epoch": 0.10698874693290465, + "grad_norm": 0.7208402156829834, + "learning_rate": 0.001, + "loss": 2.3952, + "step": 2529 + }, + { + "epoch": 0.10703105169642102, + "grad_norm": 0.5197535157203674, + "learning_rate": 0.001, + "loss": 2.6504, + "step": 2530 + }, + { + "epoch": 0.10707335645993739, + "grad_norm": 0.27384886145591736, + "learning_rate": 0.001, + "loss": 3.3872, + "step": 2531 + }, + { + "epoch": 0.10711566122345376, + "grad_norm": 4.649167060852051, + "learning_rate": 0.001, + "loss": 2.3127, + "step": 2532 + }, + { + "epoch": 0.10715796598697014, + "grad_norm": 0.5148127675056458, + "learning_rate": 0.001, + "loss": 3.0018, + "step": 2533 + }, + { + "epoch": 0.1072002707504865, + "grad_norm": 1.9632915258407593, + "learning_rate": 0.001, + "loss": 3.2377, + "step": 2534 + }, + { + "epoch": 0.10724257551400287, + "grad_norm": 1.258927583694458, + "learning_rate": 0.001, + "loss": 1.5741, + "step": 2535 + }, + { + "epoch": 0.10728488027751924, + "grad_norm": 0.38562342524528503, + "learning_rate": 0.001, + "loss": 2.2438, + "step": 2536 + }, + { + "epoch": 0.10732718504103562, + "grad_norm": 0.4237939715385437, + "learning_rate": 0.001, + "loss": 2.8422, + "step": 2537 + }, + { + "epoch": 0.10736948980455199, + "grad_norm": 12.856072425842285, + "learning_rate": 0.001, + "loss": 1.9867, + "step": 2538 + }, + { + "epoch": 0.10741179456806836, + "grad_norm": 0.3507222533226013, + "learning_rate": 0.001, + "loss": 2.4188, + "step": 2539 + }, + { + "epoch": 0.10745409933158474, + "grad_norm": 0.29266253113746643, + "learning_rate": 0.001, + "loss": 2.3492, + "step": 2540 + }, + { + "epoch": 0.10749640409510111, + "grad_norm": 0.31082549691200256, + "learning_rate": 0.001, + "loss": 2.4325, + "step": 2541 + }, + { + "epoch": 0.10753870885861748, + "grad_norm": 0.41879016160964966, + "learning_rate": 0.001, + "loss": 3.5761, + "step": 2542 + }, + { + "epoch": 0.10758101362213385, + "grad_norm": 0.2282257229089737, + "learning_rate": 0.001, + "loss": 2.143, + "step": 2543 + }, + { + "epoch": 0.10762331838565023, + "grad_norm": 3.6079633235931396, + "learning_rate": 0.001, + "loss": 2.3143, + "step": 2544 + }, + { + "epoch": 0.1076656231491666, + "grad_norm": 0.24503585696220398, + "learning_rate": 0.001, + "loss": 2.5591, + "step": 2545 + }, + { + "epoch": 0.10770792791268297, + "grad_norm": 0.49733033776283264, + "learning_rate": 0.001, + "loss": 2.3954, + "step": 2546 + }, + { + "epoch": 0.10775023267619933, + "grad_norm": 1.210880994796753, + "learning_rate": 0.001, + "loss": 2.9697, + "step": 2547 + }, + { + "epoch": 0.10779253743971572, + "grad_norm": 0.6089571714401245, + "learning_rate": 0.001, + "loss": 2.2833, + "step": 2548 + }, + { + "epoch": 0.10783484220323208, + "grad_norm": 0.2385062873363495, + "learning_rate": 0.001, + "loss": 3.0483, + "step": 2549 + }, + { + "epoch": 0.10787714696674845, + "grad_norm": 0.2744576334953308, + "learning_rate": 0.001, + "loss": 1.9552, + "step": 2550 + }, + { + "epoch": 0.10791945173026483, + "grad_norm": 0.32292699813842773, + "learning_rate": 0.001, + "loss": 2.3587, + "step": 2551 + }, + { + "epoch": 0.1079617564937812, + "grad_norm": 0.4315103590488434, + "learning_rate": 0.001, + "loss": 3.1126, + "step": 2552 + }, + { + "epoch": 0.10800406125729757, + "grad_norm": 0.38236674666404724, + "learning_rate": 0.001, + "loss": 2.787, + "step": 2553 + }, + { + "epoch": 0.10804636602081394, + "grad_norm": 0.25783267617225647, + "learning_rate": 0.001, + "loss": 1.9616, + "step": 2554 + }, + { + "epoch": 0.10808867078433032, + "grad_norm": 0.6072126030921936, + "learning_rate": 0.001, + "loss": 2.2998, + "step": 2555 + }, + { + "epoch": 0.10813097554784669, + "grad_norm": 1.3105621337890625, + "learning_rate": 0.001, + "loss": 2.144, + "step": 2556 + }, + { + "epoch": 0.10817328031136306, + "grad_norm": 0.3103470504283905, + "learning_rate": 0.001, + "loss": 2.4171, + "step": 2557 + }, + { + "epoch": 0.10821558507487943, + "grad_norm": 2.017469644546509, + "learning_rate": 0.001, + "loss": 1.9626, + "step": 2558 + }, + { + "epoch": 0.10825788983839581, + "grad_norm": 0.6327214241027832, + "learning_rate": 0.001, + "loss": 3.576, + "step": 2559 + }, + { + "epoch": 0.10830019460191218, + "grad_norm": 0.3792698383331299, + "learning_rate": 0.001, + "loss": 2.7368, + "step": 2560 + }, + { + "epoch": 0.10834249936542854, + "grad_norm": 0.3013581931591034, + "learning_rate": 0.001, + "loss": 2.3676, + "step": 2561 + }, + { + "epoch": 0.10838480412894493, + "grad_norm": 0.2713608145713806, + "learning_rate": 0.001, + "loss": 2.6759, + "step": 2562 + }, + { + "epoch": 0.1084271088924613, + "grad_norm": 0.32632705569267273, + "learning_rate": 0.001, + "loss": 3.0297, + "step": 2563 + }, + { + "epoch": 0.10846941365597766, + "grad_norm": 0.48675236105918884, + "learning_rate": 0.001, + "loss": 2.3548, + "step": 2564 + }, + { + "epoch": 0.10851171841949403, + "grad_norm": 0.3283641040325165, + "learning_rate": 0.001, + "loss": 3.2577, + "step": 2565 + }, + { + "epoch": 0.10855402318301041, + "grad_norm": 0.3809754550457001, + "learning_rate": 0.001, + "loss": 1.8483, + "step": 2566 + }, + { + "epoch": 0.10859632794652678, + "grad_norm": 0.5347718596458435, + "learning_rate": 0.001, + "loss": 2.3094, + "step": 2567 + }, + { + "epoch": 0.10863863271004315, + "grad_norm": 0.28041812777519226, + "learning_rate": 0.001, + "loss": 2.4324, + "step": 2568 + }, + { + "epoch": 0.10868093747355952, + "grad_norm": 0.33430221676826477, + "learning_rate": 0.001, + "loss": 2.1342, + "step": 2569 + }, + { + "epoch": 0.1087232422370759, + "grad_norm": 0.23147587478160858, + "learning_rate": 0.001, + "loss": 2.2938, + "step": 2570 + }, + { + "epoch": 0.10876554700059227, + "grad_norm": 0.25908035039901733, + "learning_rate": 0.001, + "loss": 3.265, + "step": 2571 + }, + { + "epoch": 0.10880785176410863, + "grad_norm": 0.3302173316478729, + "learning_rate": 0.001, + "loss": 3.0818, + "step": 2572 + }, + { + "epoch": 0.10885015652762502, + "grad_norm": 0.3298220634460449, + "learning_rate": 0.001, + "loss": 2.6535, + "step": 2573 + }, + { + "epoch": 0.10889246129114138, + "grad_norm": 0.9503449201583862, + "learning_rate": 0.001, + "loss": 1.846, + "step": 2574 + }, + { + "epoch": 0.10893476605465775, + "grad_norm": 0.4278430938720703, + "learning_rate": 0.001, + "loss": 2.8229, + "step": 2575 + }, + { + "epoch": 0.10897707081817412, + "grad_norm": 0.35355350375175476, + "learning_rate": 0.001, + "loss": 2.7936, + "step": 2576 + }, + { + "epoch": 0.1090193755816905, + "grad_norm": 2.9743497371673584, + "learning_rate": 0.001, + "loss": 1.9395, + "step": 2577 + }, + { + "epoch": 0.10906168034520687, + "grad_norm": 1.5361433029174805, + "learning_rate": 0.001, + "loss": 2.2915, + "step": 2578 + }, + { + "epoch": 0.10910398510872324, + "grad_norm": 0.35586532950401306, + "learning_rate": 0.001, + "loss": 2.2418, + "step": 2579 + }, + { + "epoch": 0.10914628987223961, + "grad_norm": 0.7446362972259521, + "learning_rate": 0.001, + "loss": 2.3212, + "step": 2580 + }, + { + "epoch": 0.10918859463575599, + "grad_norm": 0.3713662922382355, + "learning_rate": 0.001, + "loss": 2.0863, + "step": 2581 + }, + { + "epoch": 0.10923089939927236, + "grad_norm": 0.4044400155544281, + "learning_rate": 0.001, + "loss": 2.1731, + "step": 2582 + }, + { + "epoch": 0.10927320416278873, + "grad_norm": 2.1356008052825928, + "learning_rate": 0.001, + "loss": 2.3904, + "step": 2583 + }, + { + "epoch": 0.10931550892630511, + "grad_norm": 0.9785255193710327, + "learning_rate": 0.001, + "loss": 1.529, + "step": 2584 + }, + { + "epoch": 0.10935781368982148, + "grad_norm": 1.0335931777954102, + "learning_rate": 0.001, + "loss": 3.1518, + "step": 2585 + }, + { + "epoch": 0.10940011845333784, + "grad_norm": 0.8856411576271057, + "learning_rate": 0.001, + "loss": 2.0349, + "step": 2586 + }, + { + "epoch": 0.10944242321685421, + "grad_norm": 0.8047211766242981, + "learning_rate": 0.001, + "loss": 2.1286, + "step": 2587 + }, + { + "epoch": 0.1094847279803706, + "grad_norm": 0.5640533566474915, + "learning_rate": 0.001, + "loss": 2.8253, + "step": 2588 + }, + { + "epoch": 0.10952703274388696, + "grad_norm": 0.3256753087043762, + "learning_rate": 0.001, + "loss": 2.0331, + "step": 2589 + }, + { + "epoch": 0.10956933750740333, + "grad_norm": 1.4646278619766235, + "learning_rate": 0.001, + "loss": 2.928, + "step": 2590 + }, + { + "epoch": 0.10961164227091971, + "grad_norm": 5.041200637817383, + "learning_rate": 0.001, + "loss": 2.3683, + "step": 2591 + }, + { + "epoch": 0.10965394703443608, + "grad_norm": 0.8993614912033081, + "learning_rate": 0.001, + "loss": 2.4207, + "step": 2592 + }, + { + "epoch": 0.10969625179795245, + "grad_norm": 0.47238776087760925, + "learning_rate": 0.001, + "loss": 2.7745, + "step": 2593 + }, + { + "epoch": 0.10973855656146882, + "grad_norm": 0.5236493945121765, + "learning_rate": 0.001, + "loss": 2.6803, + "step": 2594 + }, + { + "epoch": 0.1097808613249852, + "grad_norm": 0.49597102403640747, + "learning_rate": 0.001, + "loss": 2.4384, + "step": 2595 + }, + { + "epoch": 0.10982316608850157, + "grad_norm": 0.7567354440689087, + "learning_rate": 0.001, + "loss": 2.0341, + "step": 2596 + }, + { + "epoch": 0.10986547085201794, + "grad_norm": 43.87346649169922, + "learning_rate": 0.001, + "loss": 2.3568, + "step": 2597 + }, + { + "epoch": 0.1099077756155343, + "grad_norm": 0.926131546497345, + "learning_rate": 0.001, + "loss": 3.707, + "step": 2598 + }, + { + "epoch": 0.10995008037905069, + "grad_norm": 0.2967435419559479, + "learning_rate": 0.001, + "loss": 2.0035, + "step": 2599 + }, + { + "epoch": 0.10999238514256705, + "grad_norm": 0.4026656150817871, + "learning_rate": 0.001, + "loss": 2.9419, + "step": 2600 + }, + { + "epoch": 0.11003468990608342, + "grad_norm": 1.9945727586746216, + "learning_rate": 0.001, + "loss": 2.478, + "step": 2601 + }, + { + "epoch": 0.1100769946695998, + "grad_norm": 8.51455307006836, + "learning_rate": 0.001, + "loss": 2.4987, + "step": 2602 + }, + { + "epoch": 0.11011929943311617, + "grad_norm": 7.822841167449951, + "learning_rate": 0.001, + "loss": 2.9194, + "step": 2603 + }, + { + "epoch": 0.11016160419663254, + "grad_norm": 0.6975441575050354, + "learning_rate": 0.001, + "loss": 4.2243, + "step": 2604 + }, + { + "epoch": 0.11020390896014891, + "grad_norm": 9.386311531066895, + "learning_rate": 0.001, + "loss": 2.6628, + "step": 2605 + }, + { + "epoch": 0.11024621372366529, + "grad_norm": 0.5597333908081055, + "learning_rate": 0.001, + "loss": 3.464, + "step": 2606 + }, + { + "epoch": 0.11028851848718166, + "grad_norm": 3.522331714630127, + "learning_rate": 0.001, + "loss": 2.7945, + "step": 2607 + }, + { + "epoch": 0.11033082325069803, + "grad_norm": 0.4466964602470398, + "learning_rate": 0.001, + "loss": 3.2322, + "step": 2608 + }, + { + "epoch": 0.1103731280142144, + "grad_norm": 12.962498664855957, + "learning_rate": 0.001, + "loss": 2.4416, + "step": 2609 + }, + { + "epoch": 0.11041543277773078, + "grad_norm": 0.35099491477012634, + "learning_rate": 0.001, + "loss": 3.0072, + "step": 2610 + }, + { + "epoch": 0.11045773754124714, + "grad_norm": 0.5855743885040283, + "learning_rate": 0.001, + "loss": 3.3016, + "step": 2611 + }, + { + "epoch": 0.11050004230476351, + "grad_norm": 1.2006386518478394, + "learning_rate": 0.001, + "loss": 4.1105, + "step": 2612 + }, + { + "epoch": 0.1105423470682799, + "grad_norm": 0.3867046535015106, + "learning_rate": 0.001, + "loss": 2.4014, + "step": 2613 + }, + { + "epoch": 0.11058465183179626, + "grad_norm": 0.6155429482460022, + "learning_rate": 0.001, + "loss": 3.2606, + "step": 2614 + }, + { + "epoch": 0.11062695659531263, + "grad_norm": 1.17522394657135, + "learning_rate": 0.001, + "loss": 2.7431, + "step": 2615 + }, + { + "epoch": 0.110669261358829, + "grad_norm": 1.3711291551589966, + "learning_rate": 0.001, + "loss": 2.6056, + "step": 2616 + }, + { + "epoch": 0.11071156612234538, + "grad_norm": 0.33851131796836853, + "learning_rate": 0.001, + "loss": 2.9142, + "step": 2617 + }, + { + "epoch": 0.11075387088586175, + "grad_norm": 0.32981690764427185, + "learning_rate": 0.001, + "loss": 2.4273, + "step": 2618 + }, + { + "epoch": 0.11079617564937812, + "grad_norm": 0.3835381269454956, + "learning_rate": 0.001, + "loss": 3.0215, + "step": 2619 + }, + { + "epoch": 0.11083848041289449, + "grad_norm": 0.2677971422672272, + "learning_rate": 0.001, + "loss": 2.0736, + "step": 2620 + }, + { + "epoch": 0.11088078517641087, + "grad_norm": 0.33958640694618225, + "learning_rate": 0.001, + "loss": 3.4959, + "step": 2621 + }, + { + "epoch": 0.11092308993992724, + "grad_norm": 0.281795859336853, + "learning_rate": 0.001, + "loss": 2.2221, + "step": 2622 + }, + { + "epoch": 0.1109653947034436, + "grad_norm": 0.3385681211948395, + "learning_rate": 0.001, + "loss": 1.5377, + "step": 2623 + }, + { + "epoch": 0.11100769946695999, + "grad_norm": 0.466964453458786, + "learning_rate": 0.001, + "loss": 2.3652, + "step": 2624 + }, + { + "epoch": 0.11105000423047635, + "grad_norm": 0.25589802861213684, + "learning_rate": 0.001, + "loss": 1.903, + "step": 2625 + }, + { + "epoch": 0.11109230899399272, + "grad_norm": 0.3010057508945465, + "learning_rate": 0.001, + "loss": 3.1568, + "step": 2626 + }, + { + "epoch": 0.11113461375750909, + "grad_norm": 6.012112140655518, + "learning_rate": 0.001, + "loss": 2.411, + "step": 2627 + }, + { + "epoch": 0.11117691852102547, + "grad_norm": 0.4896494448184967, + "learning_rate": 0.001, + "loss": 3.1489, + "step": 2628 + }, + { + "epoch": 0.11121922328454184, + "grad_norm": 0.33814510703086853, + "learning_rate": 0.001, + "loss": 2.1617, + "step": 2629 + }, + { + "epoch": 0.11126152804805821, + "grad_norm": 0.23706887662410736, + "learning_rate": 0.001, + "loss": 2.3078, + "step": 2630 + }, + { + "epoch": 0.11130383281157458, + "grad_norm": 0.280916690826416, + "learning_rate": 0.001, + "loss": 2.4966, + "step": 2631 + }, + { + "epoch": 0.11134613757509096, + "grad_norm": 1.4228007793426514, + "learning_rate": 0.001, + "loss": 1.9013, + "step": 2632 + }, + { + "epoch": 0.11138844233860733, + "grad_norm": 0.3774404525756836, + "learning_rate": 0.001, + "loss": 2.2246, + "step": 2633 + }, + { + "epoch": 0.1114307471021237, + "grad_norm": 13.576204299926758, + "learning_rate": 0.001, + "loss": 2.8407, + "step": 2634 + }, + { + "epoch": 0.11147305186564008, + "grad_norm": 1.4729946851730347, + "learning_rate": 0.001, + "loss": 2.5873, + "step": 2635 + }, + { + "epoch": 0.11151535662915645, + "grad_norm": 0.2995510697364807, + "learning_rate": 0.001, + "loss": 2.0534, + "step": 2636 + }, + { + "epoch": 0.11155766139267281, + "grad_norm": 0.389316201210022, + "learning_rate": 0.001, + "loss": 3.0107, + "step": 2637 + }, + { + "epoch": 0.11159996615618918, + "grad_norm": 0.375186562538147, + "learning_rate": 0.001, + "loss": 3.2694, + "step": 2638 + }, + { + "epoch": 0.11164227091970556, + "grad_norm": 0.24365709722042084, + "learning_rate": 0.001, + "loss": 2.6694, + "step": 2639 + }, + { + "epoch": 0.11168457568322193, + "grad_norm": 0.35557329654693604, + "learning_rate": 0.001, + "loss": 2.1832, + "step": 2640 + }, + { + "epoch": 0.1117268804467383, + "grad_norm": 0.2812917232513428, + "learning_rate": 0.001, + "loss": 2.6873, + "step": 2641 + }, + { + "epoch": 0.11176918521025467, + "grad_norm": 0.3376398980617523, + "learning_rate": 0.001, + "loss": 2.1201, + "step": 2642 + }, + { + "epoch": 0.11181148997377105, + "grad_norm": 1.3907296657562256, + "learning_rate": 0.001, + "loss": 2.5576, + "step": 2643 + }, + { + "epoch": 0.11185379473728742, + "grad_norm": 3.7346065044403076, + "learning_rate": 0.001, + "loss": 2.812, + "step": 2644 + }, + { + "epoch": 0.11189609950080379, + "grad_norm": 15.240631103515625, + "learning_rate": 0.001, + "loss": 2.7597, + "step": 2645 + }, + { + "epoch": 0.11193840426432017, + "grad_norm": 0.24908733367919922, + "learning_rate": 0.001, + "loss": 2.6942, + "step": 2646 + }, + { + "epoch": 0.11198070902783654, + "grad_norm": 0.43838199973106384, + "learning_rate": 0.001, + "loss": 2.216, + "step": 2647 + }, + { + "epoch": 0.1120230137913529, + "grad_norm": 1.0071830749511719, + "learning_rate": 0.001, + "loss": 2.0025, + "step": 2648 + }, + { + "epoch": 0.11206531855486927, + "grad_norm": 1.4726154804229736, + "learning_rate": 0.001, + "loss": 2.3125, + "step": 2649 + }, + { + "epoch": 0.11210762331838565, + "grad_norm": 0.4552990198135376, + "learning_rate": 0.001, + "loss": 2.402, + "step": 2650 + }, + { + "epoch": 0.11214992808190202, + "grad_norm": 0.4046100378036499, + "learning_rate": 0.001, + "loss": 2.8129, + "step": 2651 + }, + { + "epoch": 0.11219223284541839, + "grad_norm": 0.3966389000415802, + "learning_rate": 0.001, + "loss": 3.0564, + "step": 2652 + }, + { + "epoch": 0.11223453760893477, + "grad_norm": 0.25914904475212097, + "learning_rate": 0.001, + "loss": 2.0611, + "step": 2653 + }, + { + "epoch": 0.11227684237245114, + "grad_norm": 0.2761140763759613, + "learning_rate": 0.001, + "loss": 2.1811, + "step": 2654 + }, + { + "epoch": 0.11231914713596751, + "grad_norm": 0.3019154965877533, + "learning_rate": 0.001, + "loss": 2.5075, + "step": 2655 + }, + { + "epoch": 0.11236145189948388, + "grad_norm": 0.22981767356395721, + "learning_rate": 0.001, + "loss": 1.7464, + "step": 2656 + }, + { + "epoch": 0.11240375666300026, + "grad_norm": 0.32616347074508667, + "learning_rate": 0.001, + "loss": 3.1906, + "step": 2657 + }, + { + "epoch": 0.11244606142651663, + "grad_norm": 0.2503935992717743, + "learning_rate": 0.001, + "loss": 2.4619, + "step": 2658 + }, + { + "epoch": 0.112488366190033, + "grad_norm": 0.27525201439857483, + "learning_rate": 0.001, + "loss": 2.0142, + "step": 2659 + }, + { + "epoch": 0.11253067095354936, + "grad_norm": 0.23904815316200256, + "learning_rate": 0.001, + "loss": 1.9905, + "step": 2660 + }, + { + "epoch": 0.11257297571706575, + "grad_norm": 1.022966980934143, + "learning_rate": 0.001, + "loss": 2.2188, + "step": 2661 + }, + { + "epoch": 0.11261528048058211, + "grad_norm": 0.39735960960388184, + "learning_rate": 0.001, + "loss": 2.3742, + "step": 2662 + }, + { + "epoch": 0.11265758524409848, + "grad_norm": 0.22529076039791107, + "learning_rate": 0.001, + "loss": 2.3343, + "step": 2663 + }, + { + "epoch": 0.11269989000761486, + "grad_norm": 0.6422929167747498, + "learning_rate": 0.001, + "loss": 2.8202, + "step": 2664 + }, + { + "epoch": 0.11274219477113123, + "grad_norm": 0.32373046875, + "learning_rate": 0.001, + "loss": 3.1001, + "step": 2665 + }, + { + "epoch": 0.1127844995346476, + "grad_norm": 0.4991035759449005, + "learning_rate": 0.001, + "loss": 1.8736, + "step": 2666 + }, + { + "epoch": 0.11282680429816397, + "grad_norm": 0.2527635097503662, + "learning_rate": 0.001, + "loss": 2.5017, + "step": 2667 + }, + { + "epoch": 0.11286910906168035, + "grad_norm": 7.938605785369873, + "learning_rate": 0.001, + "loss": 1.5331, + "step": 2668 + }, + { + "epoch": 0.11291141382519672, + "grad_norm": 0.4624551832675934, + "learning_rate": 0.001, + "loss": 3.5508, + "step": 2669 + }, + { + "epoch": 0.11295371858871309, + "grad_norm": 0.28487929701805115, + "learning_rate": 0.001, + "loss": 2.2229, + "step": 2670 + }, + { + "epoch": 0.11299602335222945, + "grad_norm": 1.6842297315597534, + "learning_rate": 0.001, + "loss": 2.3192, + "step": 2671 + }, + { + "epoch": 0.11303832811574584, + "grad_norm": 1.5760769844055176, + "learning_rate": 0.001, + "loss": 2.3325, + "step": 2672 + }, + { + "epoch": 0.1130806328792622, + "grad_norm": 1.3597266674041748, + "learning_rate": 0.001, + "loss": 2.2061, + "step": 2673 + }, + { + "epoch": 0.11312293764277857, + "grad_norm": 0.3656711280345917, + "learning_rate": 0.001, + "loss": 3.6342, + "step": 2674 + }, + { + "epoch": 0.11316524240629496, + "grad_norm": 0.4814014136791229, + "learning_rate": 0.001, + "loss": 2.4345, + "step": 2675 + }, + { + "epoch": 0.11320754716981132, + "grad_norm": 0.24970856308937073, + "learning_rate": 0.001, + "loss": 1.9132, + "step": 2676 + }, + { + "epoch": 0.11324985193332769, + "grad_norm": 0.31391432881355286, + "learning_rate": 0.001, + "loss": 2.6646, + "step": 2677 + }, + { + "epoch": 0.11329215669684406, + "grad_norm": 0.4391164481639862, + "learning_rate": 0.001, + "loss": 2.9287, + "step": 2678 + }, + { + "epoch": 0.11333446146036044, + "grad_norm": 0.3026899993419647, + "learning_rate": 0.001, + "loss": 2.882, + "step": 2679 + }, + { + "epoch": 0.11337676622387681, + "grad_norm": 0.755450963973999, + "learning_rate": 0.001, + "loss": 3.3115, + "step": 2680 + }, + { + "epoch": 0.11341907098739318, + "grad_norm": 0.3200805187225342, + "learning_rate": 0.001, + "loss": 2.3836, + "step": 2681 + }, + { + "epoch": 0.11346137575090955, + "grad_norm": 0.25980228185653687, + "learning_rate": 0.001, + "loss": 1.8265, + "step": 2682 + }, + { + "epoch": 0.11350368051442593, + "grad_norm": 0.4624451696872711, + "learning_rate": 0.001, + "loss": 2.7016, + "step": 2683 + }, + { + "epoch": 0.1135459852779423, + "grad_norm": 0.7080914378166199, + "learning_rate": 0.001, + "loss": 2.4309, + "step": 2684 + }, + { + "epoch": 0.11358829004145866, + "grad_norm": 0.45167163014411926, + "learning_rate": 0.001, + "loss": 3.1995, + "step": 2685 + }, + { + "epoch": 0.11363059480497505, + "grad_norm": 0.21926790475845337, + "learning_rate": 0.001, + "loss": 2.0448, + "step": 2686 + }, + { + "epoch": 0.11367289956849141, + "grad_norm": 1.0838067531585693, + "learning_rate": 0.001, + "loss": 1.8552, + "step": 2687 + }, + { + "epoch": 0.11371520433200778, + "grad_norm": 1.3477392196655273, + "learning_rate": 0.001, + "loss": 2.7008, + "step": 2688 + }, + { + "epoch": 0.11375750909552415, + "grad_norm": 0.41002964973449707, + "learning_rate": 0.001, + "loss": 3.4506, + "step": 2689 + }, + { + "epoch": 0.11379981385904053, + "grad_norm": 0.2277841717004776, + "learning_rate": 0.001, + "loss": 2.1868, + "step": 2690 + }, + { + "epoch": 0.1138421186225569, + "grad_norm": 0.6149588227272034, + "learning_rate": 0.001, + "loss": 2.6013, + "step": 2691 + }, + { + "epoch": 0.11388442338607327, + "grad_norm": 0.36279934644699097, + "learning_rate": 0.001, + "loss": 2.4312, + "step": 2692 + }, + { + "epoch": 0.11392672814958964, + "grad_norm": 0.47520431876182556, + "learning_rate": 0.001, + "loss": 3.0366, + "step": 2693 + }, + { + "epoch": 0.11396903291310602, + "grad_norm": 0.31242018938064575, + "learning_rate": 0.001, + "loss": 2.2829, + "step": 2694 + }, + { + "epoch": 0.11401133767662239, + "grad_norm": 1.7739008665084839, + "learning_rate": 0.001, + "loss": 1.895, + "step": 2695 + }, + { + "epoch": 0.11405364244013876, + "grad_norm": 0.5066441297531128, + "learning_rate": 0.001, + "loss": 2.6342, + "step": 2696 + }, + { + "epoch": 0.11409594720365514, + "grad_norm": 0.24368147552013397, + "learning_rate": 0.001, + "loss": 2.7584, + "step": 2697 + }, + { + "epoch": 0.1141382519671715, + "grad_norm": 0.44066333770751953, + "learning_rate": 0.001, + "loss": 2.3854, + "step": 2698 + }, + { + "epoch": 0.11418055673068787, + "grad_norm": 0.26902955770492554, + "learning_rate": 0.001, + "loss": 2.7149, + "step": 2699 + }, + { + "epoch": 0.11422286149420424, + "grad_norm": 4.966346263885498, + "learning_rate": 0.001, + "loss": 2.2739, + "step": 2700 + }, + { + "epoch": 0.11426516625772062, + "grad_norm": 0.82923823595047, + "learning_rate": 0.001, + "loss": 2.4333, + "step": 2701 + }, + { + "epoch": 0.11430747102123699, + "grad_norm": 0.37718573212623596, + "learning_rate": 0.001, + "loss": 2.846, + "step": 2702 + }, + { + "epoch": 0.11434977578475336, + "grad_norm": 1.1402643918991089, + "learning_rate": 0.001, + "loss": 2.2025, + "step": 2703 + }, + { + "epoch": 0.11439208054826973, + "grad_norm": 0.5933586359024048, + "learning_rate": 0.001, + "loss": 4.4132, + "step": 2704 + }, + { + "epoch": 0.11443438531178611, + "grad_norm": 0.9966050386428833, + "learning_rate": 0.001, + "loss": 3.1715, + "step": 2705 + }, + { + "epoch": 0.11447669007530248, + "grad_norm": 0.3818608820438385, + "learning_rate": 0.001, + "loss": 2.9614, + "step": 2706 + }, + { + "epoch": 0.11451899483881885, + "grad_norm": 0.355037122964859, + "learning_rate": 0.001, + "loss": 2.7848, + "step": 2707 + }, + { + "epoch": 0.11456129960233523, + "grad_norm": 0.4167128801345825, + "learning_rate": 0.001, + "loss": 2.1713, + "step": 2708 + }, + { + "epoch": 0.1146036043658516, + "grad_norm": 0.3600790202617645, + "learning_rate": 0.001, + "loss": 2.0852, + "step": 2709 + }, + { + "epoch": 0.11464590912936796, + "grad_norm": 0.7185443639755249, + "learning_rate": 0.001, + "loss": 2.1573, + "step": 2710 + }, + { + "epoch": 0.11468821389288433, + "grad_norm": 0.5277695655822754, + "learning_rate": 0.001, + "loss": 2.6667, + "step": 2711 + }, + { + "epoch": 0.11473051865640072, + "grad_norm": 2.362328052520752, + "learning_rate": 0.001, + "loss": 2.655, + "step": 2712 + }, + { + "epoch": 0.11477282341991708, + "grad_norm": 1.576801061630249, + "learning_rate": 0.001, + "loss": 3.0455, + "step": 2713 + }, + { + "epoch": 0.11481512818343345, + "grad_norm": 0.5034794807434082, + "learning_rate": 0.001, + "loss": 1.9612, + "step": 2714 + }, + { + "epoch": 0.11485743294694983, + "grad_norm": 0.5581480860710144, + "learning_rate": 0.001, + "loss": 2.1294, + "step": 2715 + }, + { + "epoch": 0.1148997377104662, + "grad_norm": 1.2885347604751587, + "learning_rate": 0.001, + "loss": 3.0082, + "step": 2716 + }, + { + "epoch": 0.11494204247398257, + "grad_norm": 1.621717929840088, + "learning_rate": 0.001, + "loss": 2.4047, + "step": 2717 + }, + { + "epoch": 0.11498434723749894, + "grad_norm": 22.007076263427734, + "learning_rate": 0.001, + "loss": 3.3206, + "step": 2718 + }, + { + "epoch": 0.11502665200101532, + "grad_norm": 3.476715087890625, + "learning_rate": 0.001, + "loss": 2.8062, + "step": 2719 + }, + { + "epoch": 0.11506895676453169, + "grad_norm": 0.24378205835819244, + "learning_rate": 0.001, + "loss": 1.6405, + "step": 2720 + }, + { + "epoch": 0.11511126152804806, + "grad_norm": 3.1240763664245605, + "learning_rate": 0.001, + "loss": 2.9476, + "step": 2721 + }, + { + "epoch": 0.11515356629156442, + "grad_norm": 0.27668654918670654, + "learning_rate": 0.001, + "loss": 1.6878, + "step": 2722 + }, + { + "epoch": 0.1151958710550808, + "grad_norm": 33.68696975708008, + "learning_rate": 0.001, + "loss": 2.2438, + "step": 2723 + }, + { + "epoch": 0.11523817581859717, + "grad_norm": 1.328370213508606, + "learning_rate": 0.001, + "loss": 2.6715, + "step": 2724 + }, + { + "epoch": 0.11528048058211354, + "grad_norm": 22.9268741607666, + "learning_rate": 0.001, + "loss": 3.4431, + "step": 2725 + }, + { + "epoch": 0.11532278534562992, + "grad_norm": 0.6958709955215454, + "learning_rate": 0.001, + "loss": 2.9032, + "step": 2726 + }, + { + "epoch": 0.11536509010914629, + "grad_norm": 0.41907453536987305, + "learning_rate": 0.001, + "loss": 2.925, + "step": 2727 + }, + { + "epoch": 0.11540739487266266, + "grad_norm": 0.2991430461406708, + "learning_rate": 0.001, + "loss": 2.5263, + "step": 2728 + }, + { + "epoch": 0.11544969963617903, + "grad_norm": 0.7529126405715942, + "learning_rate": 0.001, + "loss": 2.8932, + "step": 2729 + }, + { + "epoch": 0.11549200439969541, + "grad_norm": 0.8318426609039307, + "learning_rate": 0.001, + "loss": 2.7539, + "step": 2730 + }, + { + "epoch": 0.11553430916321178, + "grad_norm": 1.7126951217651367, + "learning_rate": 0.001, + "loss": 2.2141, + "step": 2731 + }, + { + "epoch": 0.11557661392672815, + "grad_norm": 0.28930333256721497, + "learning_rate": 0.001, + "loss": 2.4101, + "step": 2732 + }, + { + "epoch": 0.11561891869024452, + "grad_norm": 0.3181939423084259, + "learning_rate": 0.001, + "loss": 2.2034, + "step": 2733 + }, + { + "epoch": 0.1156612234537609, + "grad_norm": 0.34087949991226196, + "learning_rate": 0.001, + "loss": 2.6263, + "step": 2734 + }, + { + "epoch": 0.11570352821727727, + "grad_norm": 1.4377021789550781, + "learning_rate": 0.001, + "loss": 2.2172, + "step": 2735 + }, + { + "epoch": 0.11574583298079363, + "grad_norm": 0.3622700572013855, + "learning_rate": 0.001, + "loss": 2.5439, + "step": 2736 + }, + { + "epoch": 0.11578813774431002, + "grad_norm": 0.27332210540771484, + "learning_rate": 0.001, + "loss": 2.9426, + "step": 2737 + }, + { + "epoch": 0.11583044250782638, + "grad_norm": 32.977970123291016, + "learning_rate": 0.001, + "loss": 2.6618, + "step": 2738 + }, + { + "epoch": 0.11587274727134275, + "grad_norm": 1.1440379619598389, + "learning_rate": 0.001, + "loss": 2.1755, + "step": 2739 + }, + { + "epoch": 0.11591505203485912, + "grad_norm": 0.3366926610469818, + "learning_rate": 0.001, + "loss": 2.1376, + "step": 2740 + }, + { + "epoch": 0.1159573567983755, + "grad_norm": 0.3079652786254883, + "learning_rate": 0.001, + "loss": 2.873, + "step": 2741 + }, + { + "epoch": 0.11599966156189187, + "grad_norm": 0.3998541533946991, + "learning_rate": 0.001, + "loss": 2.7169, + "step": 2742 + }, + { + "epoch": 0.11604196632540824, + "grad_norm": 0.33957117795944214, + "learning_rate": 0.001, + "loss": 2.496, + "step": 2743 + }, + { + "epoch": 0.1160842710889246, + "grad_norm": 0.9444336891174316, + "learning_rate": 0.001, + "loss": 2.6322, + "step": 2744 + }, + { + "epoch": 0.11612657585244099, + "grad_norm": 0.8139225840568542, + "learning_rate": 0.001, + "loss": 2.0574, + "step": 2745 + }, + { + "epoch": 0.11616888061595736, + "grad_norm": 0.2738553583621979, + "learning_rate": 0.001, + "loss": 3.0332, + "step": 2746 + }, + { + "epoch": 0.11621118537947372, + "grad_norm": 0.539919912815094, + "learning_rate": 0.001, + "loss": 2.1122, + "step": 2747 + }, + { + "epoch": 0.1162534901429901, + "grad_norm": 0.29261156916618347, + "learning_rate": 0.001, + "loss": 2.6415, + "step": 2748 + }, + { + "epoch": 0.11629579490650647, + "grad_norm": 0.36019593477249146, + "learning_rate": 0.001, + "loss": 2.0997, + "step": 2749 + }, + { + "epoch": 0.11633809967002284, + "grad_norm": 1.0619240999221802, + "learning_rate": 0.001, + "loss": 2.7637, + "step": 2750 + }, + { + "epoch": 0.11638040443353921, + "grad_norm": 0.32508841156959534, + "learning_rate": 0.001, + "loss": 3.1838, + "step": 2751 + }, + { + "epoch": 0.1164227091970556, + "grad_norm": 0.37493109703063965, + "learning_rate": 0.001, + "loss": 2.8842, + "step": 2752 + }, + { + "epoch": 0.11646501396057196, + "grad_norm": 0.2882426381111145, + "learning_rate": 0.001, + "loss": 2.1448, + "step": 2753 + }, + { + "epoch": 0.11650731872408833, + "grad_norm": 0.30877891182899475, + "learning_rate": 0.001, + "loss": 2.4269, + "step": 2754 + }, + { + "epoch": 0.1165496234876047, + "grad_norm": 0.2244696319103241, + "learning_rate": 0.001, + "loss": 1.6318, + "step": 2755 + }, + { + "epoch": 0.11659192825112108, + "grad_norm": 0.2662592828273773, + "learning_rate": 0.001, + "loss": 2.2368, + "step": 2756 + }, + { + "epoch": 0.11663423301463745, + "grad_norm": 0.7968965172767639, + "learning_rate": 0.001, + "loss": 2.4511, + "step": 2757 + }, + { + "epoch": 0.11667653777815382, + "grad_norm": 0.3993340730667114, + "learning_rate": 0.001, + "loss": 2.2677, + "step": 2758 + }, + { + "epoch": 0.1167188425416702, + "grad_norm": 7.428627014160156, + "learning_rate": 0.001, + "loss": 2.8915, + "step": 2759 + }, + { + "epoch": 0.11676114730518657, + "grad_norm": 0.35218873620033264, + "learning_rate": 0.001, + "loss": 3.2628, + "step": 2760 + }, + { + "epoch": 0.11680345206870293, + "grad_norm": 0.33024418354034424, + "learning_rate": 0.001, + "loss": 2.3512, + "step": 2761 + }, + { + "epoch": 0.1168457568322193, + "grad_norm": 2.4879989624023438, + "learning_rate": 0.001, + "loss": 2.9775, + "step": 2762 + }, + { + "epoch": 0.11688806159573568, + "grad_norm": 0.2612648904323578, + "learning_rate": 0.001, + "loss": 2.8071, + "step": 2763 + }, + { + "epoch": 0.11693036635925205, + "grad_norm": 0.2603396475315094, + "learning_rate": 0.001, + "loss": 2.2835, + "step": 2764 + }, + { + "epoch": 0.11697267112276842, + "grad_norm": 1.3054839372634888, + "learning_rate": 0.001, + "loss": 2.3514, + "step": 2765 + }, + { + "epoch": 0.11701497588628479, + "grad_norm": 0.46820470690727234, + "learning_rate": 0.001, + "loss": 2.3354, + "step": 2766 + }, + { + "epoch": 0.11705728064980117, + "grad_norm": 0.2680201530456543, + "learning_rate": 0.001, + "loss": 2.6435, + "step": 2767 + }, + { + "epoch": 0.11709958541331754, + "grad_norm": 0.2581261694431305, + "learning_rate": 0.001, + "loss": 2.5269, + "step": 2768 + }, + { + "epoch": 0.11714189017683391, + "grad_norm": 0.2201681286096573, + "learning_rate": 0.001, + "loss": 1.9714, + "step": 2769 + }, + { + "epoch": 0.11718419494035029, + "grad_norm": 0.31604522466659546, + "learning_rate": 0.001, + "loss": 2.4044, + "step": 2770 + }, + { + "epoch": 0.11722649970386666, + "grad_norm": 0.23219691216945648, + "learning_rate": 0.001, + "loss": 2.4686, + "step": 2771 + }, + { + "epoch": 0.11726880446738303, + "grad_norm": 0.24609123170375824, + "learning_rate": 0.001, + "loss": 1.7137, + "step": 2772 + }, + { + "epoch": 0.1173111092308994, + "grad_norm": 0.3166695535182953, + "learning_rate": 0.001, + "loss": 2.44, + "step": 2773 + }, + { + "epoch": 0.11735341399441578, + "grad_norm": 0.6801291108131409, + "learning_rate": 0.001, + "loss": 2.8316, + "step": 2774 + }, + { + "epoch": 0.11739571875793214, + "grad_norm": 0.25939130783081055, + "learning_rate": 0.001, + "loss": 2.155, + "step": 2775 + }, + { + "epoch": 0.11743802352144851, + "grad_norm": 0.2404932826757431, + "learning_rate": 0.001, + "loss": 1.9066, + "step": 2776 + }, + { + "epoch": 0.1174803282849649, + "grad_norm": 0.32322371006011963, + "learning_rate": 0.001, + "loss": 3.1624, + "step": 2777 + }, + { + "epoch": 0.11752263304848126, + "grad_norm": 0.29162564873695374, + "learning_rate": 0.001, + "loss": 2.1677, + "step": 2778 + }, + { + "epoch": 0.11756493781199763, + "grad_norm": 0.4201529324054718, + "learning_rate": 0.001, + "loss": 3.65, + "step": 2779 + }, + { + "epoch": 0.117607242575514, + "grad_norm": 0.3757007420063019, + "learning_rate": 0.001, + "loss": 2.7946, + "step": 2780 + }, + { + "epoch": 0.11764954733903038, + "grad_norm": 0.22242939472198486, + "learning_rate": 0.001, + "loss": 2.1342, + "step": 2781 + }, + { + "epoch": 0.11769185210254675, + "grad_norm": 0.7142042517662048, + "learning_rate": 0.001, + "loss": 2.9385, + "step": 2782 + }, + { + "epoch": 0.11773415686606312, + "grad_norm": 0.4159119129180908, + "learning_rate": 0.001, + "loss": 2.2336, + "step": 2783 + }, + { + "epoch": 0.11777646162957948, + "grad_norm": 38.654056549072266, + "learning_rate": 0.001, + "loss": 3.0095, + "step": 2784 + }, + { + "epoch": 0.11781876639309587, + "grad_norm": 0.2876143157482147, + "learning_rate": 0.001, + "loss": 2.1954, + "step": 2785 + }, + { + "epoch": 0.11786107115661223, + "grad_norm": 2.3631954193115234, + "learning_rate": 0.001, + "loss": 3.1529, + "step": 2786 + }, + { + "epoch": 0.1179033759201286, + "grad_norm": 0.2558928430080414, + "learning_rate": 0.001, + "loss": 2.0726, + "step": 2787 + }, + { + "epoch": 0.11794568068364499, + "grad_norm": 2.708045721054077, + "learning_rate": 0.001, + "loss": 2.3912, + "step": 2788 + }, + { + "epoch": 0.11798798544716135, + "grad_norm": 0.44729793071746826, + "learning_rate": 0.001, + "loss": 1.9636, + "step": 2789 + }, + { + "epoch": 0.11803029021067772, + "grad_norm": 0.5945959091186523, + "learning_rate": 0.001, + "loss": 2.6161, + "step": 2790 + }, + { + "epoch": 0.11807259497419409, + "grad_norm": 0.35763904452323914, + "learning_rate": 0.001, + "loss": 3.1271, + "step": 2791 + }, + { + "epoch": 0.11811489973771047, + "grad_norm": 0.24619075655937195, + "learning_rate": 0.001, + "loss": 2.6635, + "step": 2792 + }, + { + "epoch": 0.11815720450122684, + "grad_norm": 0.323395311832428, + "learning_rate": 0.001, + "loss": 3.0143, + "step": 2793 + }, + { + "epoch": 0.11819950926474321, + "grad_norm": 0.2660515010356903, + "learning_rate": 0.001, + "loss": 1.9591, + "step": 2794 + }, + { + "epoch": 0.11824181402825958, + "grad_norm": 0.5746325850486755, + "learning_rate": 0.001, + "loss": 2.3898, + "step": 2795 + }, + { + "epoch": 0.11828411879177596, + "grad_norm": 0.2861972153186798, + "learning_rate": 0.001, + "loss": 2.0267, + "step": 2796 + }, + { + "epoch": 0.11832642355529233, + "grad_norm": 0.2711239457130432, + "learning_rate": 0.001, + "loss": 2.5983, + "step": 2797 + }, + { + "epoch": 0.1183687283188087, + "grad_norm": 0.22106711566448212, + "learning_rate": 0.001, + "loss": 1.6675, + "step": 2798 + }, + { + "epoch": 0.11841103308232508, + "grad_norm": 0.2644370198249817, + "learning_rate": 0.001, + "loss": 2.7655, + "step": 2799 + }, + { + "epoch": 0.11845333784584144, + "grad_norm": 0.24654194712638855, + "learning_rate": 0.001, + "loss": 3.2642, + "step": 2800 + }, + { + "epoch": 0.11849564260935781, + "grad_norm": 0.7044499516487122, + "learning_rate": 0.001, + "loss": 2.9117, + "step": 2801 + }, + { + "epoch": 0.11853794737287418, + "grad_norm": 0.3446405827999115, + "learning_rate": 0.001, + "loss": 2.4988, + "step": 2802 + }, + { + "epoch": 0.11858025213639056, + "grad_norm": 0.27378132939338684, + "learning_rate": 0.001, + "loss": 2.2514, + "step": 2803 + }, + { + "epoch": 0.11862255689990693, + "grad_norm": 0.20993216335773468, + "learning_rate": 0.001, + "loss": 2.4, + "step": 2804 + }, + { + "epoch": 0.1186648616634233, + "grad_norm": 0.5672168731689453, + "learning_rate": 0.001, + "loss": 2.1295, + "step": 2805 + }, + { + "epoch": 0.11870716642693967, + "grad_norm": 0.2943876385688782, + "learning_rate": 0.001, + "loss": 3.7169, + "step": 2806 + }, + { + "epoch": 0.11874947119045605, + "grad_norm": 0.6475502252578735, + "learning_rate": 0.001, + "loss": 2.3737, + "step": 2807 + }, + { + "epoch": 0.11879177595397242, + "grad_norm": 0.3072225749492645, + "learning_rate": 0.001, + "loss": 1.9827, + "step": 2808 + }, + { + "epoch": 0.11883408071748879, + "grad_norm": 0.31037554144859314, + "learning_rate": 0.001, + "loss": 1.558, + "step": 2809 + }, + { + "epoch": 0.11887638548100517, + "grad_norm": 0.23181147873401642, + "learning_rate": 0.001, + "loss": 2.0697, + "step": 2810 + }, + { + "epoch": 0.11891869024452154, + "grad_norm": 10.96942138671875, + "learning_rate": 0.001, + "loss": 1.6794, + "step": 2811 + }, + { + "epoch": 0.1189609950080379, + "grad_norm": 0.42705076932907104, + "learning_rate": 0.001, + "loss": 2.3502, + "step": 2812 + }, + { + "epoch": 0.11900329977155427, + "grad_norm": 0.26363325119018555, + "learning_rate": 0.001, + "loss": 3.0325, + "step": 2813 + }, + { + "epoch": 0.11904560453507065, + "grad_norm": 0.26286640763282776, + "learning_rate": 0.001, + "loss": 2.1994, + "step": 2814 + }, + { + "epoch": 0.11908790929858702, + "grad_norm": 0.2361530065536499, + "learning_rate": 0.001, + "loss": 1.9359, + "step": 2815 + }, + { + "epoch": 0.11913021406210339, + "grad_norm": 0.8257160186767578, + "learning_rate": 0.001, + "loss": 3.0286, + "step": 2816 + }, + { + "epoch": 0.11917251882561976, + "grad_norm": 0.2814808487892151, + "learning_rate": 0.001, + "loss": 2.8173, + "step": 2817 + }, + { + "epoch": 0.11921482358913614, + "grad_norm": 0.23943133652210236, + "learning_rate": 0.001, + "loss": 3.0642, + "step": 2818 + }, + { + "epoch": 0.11925712835265251, + "grad_norm": 0.206394761800766, + "learning_rate": 0.001, + "loss": 1.8372, + "step": 2819 + }, + { + "epoch": 0.11929943311616888, + "grad_norm": 0.29365062713623047, + "learning_rate": 0.001, + "loss": 2.7685, + "step": 2820 + }, + { + "epoch": 0.11934173787968526, + "grad_norm": 0.2639111280441284, + "learning_rate": 0.001, + "loss": 4.2966, + "step": 2821 + }, + { + "epoch": 0.11938404264320163, + "grad_norm": 0.24708105623722076, + "learning_rate": 0.001, + "loss": 2.184, + "step": 2822 + }, + { + "epoch": 0.119426347406718, + "grad_norm": 0.3220023512840271, + "learning_rate": 0.001, + "loss": 2.0879, + "step": 2823 + }, + { + "epoch": 0.11946865217023436, + "grad_norm": 0.30507785081863403, + "learning_rate": 0.001, + "loss": 2.2369, + "step": 2824 + }, + { + "epoch": 0.11951095693375074, + "grad_norm": 0.23936758935451508, + "learning_rate": 0.001, + "loss": 3.1638, + "step": 2825 + }, + { + "epoch": 0.11955326169726711, + "grad_norm": 0.5877100825309753, + "learning_rate": 0.001, + "loss": 2.9414, + "step": 2826 + }, + { + "epoch": 0.11959556646078348, + "grad_norm": 0.21661652624607086, + "learning_rate": 0.001, + "loss": 2.3561, + "step": 2827 + }, + { + "epoch": 0.11963787122429985, + "grad_norm": 0.2344318926334381, + "learning_rate": 0.001, + "loss": 2.9277, + "step": 2828 + }, + { + "epoch": 0.11968017598781623, + "grad_norm": 0.3025057017803192, + "learning_rate": 0.001, + "loss": 2.3685, + "step": 2829 + }, + { + "epoch": 0.1197224807513326, + "grad_norm": 0.24070441722869873, + "learning_rate": 0.001, + "loss": 3.1727, + "step": 2830 + }, + { + "epoch": 0.11976478551484897, + "grad_norm": 0.25270789861679077, + "learning_rate": 0.001, + "loss": 2.9425, + "step": 2831 + }, + { + "epoch": 0.11980709027836535, + "grad_norm": 0.23945005238056183, + "learning_rate": 0.001, + "loss": 3.2619, + "step": 2832 + }, + { + "epoch": 0.11984939504188172, + "grad_norm": 0.257040798664093, + "learning_rate": 0.001, + "loss": 2.4056, + "step": 2833 + }, + { + "epoch": 0.11989169980539809, + "grad_norm": 0.3537012040615082, + "learning_rate": 0.001, + "loss": 2.5206, + "step": 2834 + }, + { + "epoch": 0.11993400456891445, + "grad_norm": 0.9324373006820679, + "learning_rate": 0.001, + "loss": 3.0553, + "step": 2835 + }, + { + "epoch": 0.11997630933243084, + "grad_norm": 0.32961177825927734, + "learning_rate": 0.001, + "loss": 1.9009, + "step": 2836 + }, + { + "epoch": 0.1200186140959472, + "grad_norm": 0.21902523934841156, + "learning_rate": 0.001, + "loss": 2.2892, + "step": 2837 + }, + { + "epoch": 0.12006091885946357, + "grad_norm": 0.35956844687461853, + "learning_rate": 0.001, + "loss": 2.2198, + "step": 2838 + }, + { + "epoch": 0.12010322362297995, + "grad_norm": 0.31855231523513794, + "learning_rate": 0.001, + "loss": 2.6221, + "step": 2839 + }, + { + "epoch": 0.12014552838649632, + "grad_norm": 1.0669035911560059, + "learning_rate": 0.001, + "loss": 2.6779, + "step": 2840 + }, + { + "epoch": 0.12018783315001269, + "grad_norm": 0.5651441812515259, + "learning_rate": 0.001, + "loss": 2.3371, + "step": 2841 + }, + { + "epoch": 0.12023013791352906, + "grad_norm": 0.2813262939453125, + "learning_rate": 0.001, + "loss": 2.847, + "step": 2842 + }, + { + "epoch": 0.12027244267704544, + "grad_norm": 0.21474647521972656, + "learning_rate": 0.001, + "loss": 2.262, + "step": 2843 + }, + { + "epoch": 0.12031474744056181, + "grad_norm": 0.9822518229484558, + "learning_rate": 0.001, + "loss": 2.1823, + "step": 2844 + }, + { + "epoch": 0.12035705220407818, + "grad_norm": 0.4417326748371124, + "learning_rate": 0.001, + "loss": 1.6555, + "step": 2845 + }, + { + "epoch": 0.12039935696759455, + "grad_norm": 0.31064194440841675, + "learning_rate": 0.001, + "loss": 1.981, + "step": 2846 + }, + { + "epoch": 0.12044166173111093, + "grad_norm": 1.0444011688232422, + "learning_rate": 0.001, + "loss": 2.0207, + "step": 2847 + }, + { + "epoch": 0.1204839664946273, + "grad_norm": 0.25042444467544556, + "learning_rate": 0.001, + "loss": 2.2386, + "step": 2848 + }, + { + "epoch": 0.12052627125814366, + "grad_norm": 0.26854464411735535, + "learning_rate": 0.001, + "loss": 2.7635, + "step": 2849 + }, + { + "epoch": 0.12056857602166005, + "grad_norm": 0.28399658203125, + "learning_rate": 0.001, + "loss": 2.6783, + "step": 2850 + }, + { + "epoch": 0.12061088078517641, + "grad_norm": 0.3274761736392975, + "learning_rate": 0.001, + "loss": 2.8354, + "step": 2851 + }, + { + "epoch": 0.12065318554869278, + "grad_norm": 0.44863566756248474, + "learning_rate": 0.001, + "loss": 3.583, + "step": 2852 + }, + { + "epoch": 0.12069549031220915, + "grad_norm": 0.2759772539138794, + "learning_rate": 0.001, + "loss": 2.4853, + "step": 2853 + }, + { + "epoch": 0.12073779507572553, + "grad_norm": 0.3000403344631195, + "learning_rate": 0.001, + "loss": 2.8492, + "step": 2854 + }, + { + "epoch": 0.1207800998392419, + "grad_norm": 0.2891789376735687, + "learning_rate": 0.001, + "loss": 2.5121, + "step": 2855 + }, + { + "epoch": 0.12082240460275827, + "grad_norm": 0.26525014638900757, + "learning_rate": 0.001, + "loss": 3.1642, + "step": 2856 + }, + { + "epoch": 0.12086470936627464, + "grad_norm": 2.695129156112671, + "learning_rate": 0.001, + "loss": 3.6222, + "step": 2857 + }, + { + "epoch": 0.12090701412979102, + "grad_norm": 0.27255064249038696, + "learning_rate": 0.001, + "loss": 2.6515, + "step": 2858 + }, + { + "epoch": 0.12094931889330739, + "grad_norm": 1.2651582956314087, + "learning_rate": 0.001, + "loss": 3.0903, + "step": 2859 + }, + { + "epoch": 0.12099162365682375, + "grad_norm": 0.42988303303718567, + "learning_rate": 0.001, + "loss": 2.2843, + "step": 2860 + }, + { + "epoch": 0.12103392842034014, + "grad_norm": 0.27339550852775574, + "learning_rate": 0.001, + "loss": 2.0482, + "step": 2861 + }, + { + "epoch": 0.1210762331838565, + "grad_norm": 0.43721139430999756, + "learning_rate": 0.001, + "loss": 3.0085, + "step": 2862 + }, + { + "epoch": 0.12111853794737287, + "grad_norm": 0.5759567022323608, + "learning_rate": 0.001, + "loss": 3.0561, + "step": 2863 + }, + { + "epoch": 0.12116084271088924, + "grad_norm": 1.3344426155090332, + "learning_rate": 0.001, + "loss": 2.0301, + "step": 2864 + }, + { + "epoch": 0.12120314747440562, + "grad_norm": 0.325006902217865, + "learning_rate": 0.001, + "loss": 2.0085, + "step": 2865 + }, + { + "epoch": 0.12124545223792199, + "grad_norm": 0.27443817257881165, + "learning_rate": 0.001, + "loss": 1.91, + "step": 2866 + }, + { + "epoch": 0.12128775700143836, + "grad_norm": 0.5229665637016296, + "learning_rate": 0.001, + "loss": 2.2958, + "step": 2867 + }, + { + "epoch": 0.12133006176495473, + "grad_norm": 0.22957605123519897, + "learning_rate": 0.001, + "loss": 1.9397, + "step": 2868 + }, + { + "epoch": 0.12137236652847111, + "grad_norm": 0.23909400403499603, + "learning_rate": 0.001, + "loss": 2.603, + "step": 2869 + }, + { + "epoch": 0.12141467129198748, + "grad_norm": 0.9538590908050537, + "learning_rate": 0.001, + "loss": 1.8949, + "step": 2870 + }, + { + "epoch": 0.12145697605550385, + "grad_norm": 0.25031131505966187, + "learning_rate": 0.001, + "loss": 1.9435, + "step": 2871 + }, + { + "epoch": 0.12149928081902023, + "grad_norm": 0.30926719307899475, + "learning_rate": 0.001, + "loss": 2.2511, + "step": 2872 + }, + { + "epoch": 0.1215415855825366, + "grad_norm": 0.2400166243314743, + "learning_rate": 0.001, + "loss": 2.2112, + "step": 2873 + }, + { + "epoch": 0.12158389034605296, + "grad_norm": 0.5655799508094788, + "learning_rate": 0.001, + "loss": 2.8809, + "step": 2874 + }, + { + "epoch": 0.12162619510956933, + "grad_norm": 0.495080828666687, + "learning_rate": 0.001, + "loss": 2.6534, + "step": 2875 + }, + { + "epoch": 0.12166849987308571, + "grad_norm": 0.46104100346565247, + "learning_rate": 0.001, + "loss": 2.5455, + "step": 2876 + }, + { + "epoch": 0.12171080463660208, + "grad_norm": 3.711001396179199, + "learning_rate": 0.001, + "loss": 2.2507, + "step": 2877 + }, + { + "epoch": 0.12175310940011845, + "grad_norm": 0.30323144793510437, + "learning_rate": 0.001, + "loss": 2.8051, + "step": 2878 + }, + { + "epoch": 0.12179541416363482, + "grad_norm": 0.5401155948638916, + "learning_rate": 0.001, + "loss": 1.596, + "step": 2879 + }, + { + "epoch": 0.1218377189271512, + "grad_norm": 0.28242409229278564, + "learning_rate": 0.001, + "loss": 2.9103, + "step": 2880 + }, + { + "epoch": 0.12188002369066757, + "grad_norm": 0.5616193413734436, + "learning_rate": 0.001, + "loss": 2.0185, + "step": 2881 + }, + { + "epoch": 0.12192232845418394, + "grad_norm": 0.48654183745384216, + "learning_rate": 0.001, + "loss": 1.996, + "step": 2882 + }, + { + "epoch": 0.12196463321770032, + "grad_norm": 0.2724916338920593, + "learning_rate": 0.001, + "loss": 2.6126, + "step": 2883 + }, + { + "epoch": 0.12200693798121669, + "grad_norm": 0.3358596861362457, + "learning_rate": 0.001, + "loss": 2.2338, + "step": 2884 + }, + { + "epoch": 0.12204924274473306, + "grad_norm": 0.28303262591362, + "learning_rate": 0.001, + "loss": 3.4625, + "step": 2885 + }, + { + "epoch": 0.12209154750824942, + "grad_norm": 1.536812663078308, + "learning_rate": 0.001, + "loss": 2.7973, + "step": 2886 + }, + { + "epoch": 0.1221338522717658, + "grad_norm": 0.2855709195137024, + "learning_rate": 0.001, + "loss": 2.583, + "step": 2887 + }, + { + "epoch": 0.12217615703528217, + "grad_norm": 3.504598379135132, + "learning_rate": 0.001, + "loss": 2.1699, + "step": 2888 + }, + { + "epoch": 0.12221846179879854, + "grad_norm": 1.3819570541381836, + "learning_rate": 0.001, + "loss": 3.72, + "step": 2889 + }, + { + "epoch": 0.12226076656231491, + "grad_norm": 0.21111451089382172, + "learning_rate": 0.001, + "loss": 1.495, + "step": 2890 + }, + { + "epoch": 0.12230307132583129, + "grad_norm": 0.4661988317966461, + "learning_rate": 0.001, + "loss": 2.697, + "step": 2891 + }, + { + "epoch": 0.12234537608934766, + "grad_norm": 0.47962015867233276, + "learning_rate": 0.001, + "loss": 1.6233, + "step": 2892 + }, + { + "epoch": 0.12238768085286403, + "grad_norm": 0.33557215332984924, + "learning_rate": 0.001, + "loss": 3.1164, + "step": 2893 + }, + { + "epoch": 0.12242998561638041, + "grad_norm": 0.25374147295951843, + "learning_rate": 0.001, + "loss": 2.2069, + "step": 2894 + }, + { + "epoch": 0.12247229037989678, + "grad_norm": 0.397442489862442, + "learning_rate": 0.001, + "loss": 2.6315, + "step": 2895 + }, + { + "epoch": 0.12251459514341315, + "grad_norm": 0.2583683729171753, + "learning_rate": 0.001, + "loss": 2.516, + "step": 2896 + }, + { + "epoch": 0.12255689990692951, + "grad_norm": 0.3760106861591339, + "learning_rate": 0.001, + "loss": 1.859, + "step": 2897 + }, + { + "epoch": 0.1225992046704459, + "grad_norm": 0.3791460394859314, + "learning_rate": 0.001, + "loss": 2.9503, + "step": 2898 + }, + { + "epoch": 0.12264150943396226, + "grad_norm": 0.5899031162261963, + "learning_rate": 0.001, + "loss": 2.4752, + "step": 2899 + }, + { + "epoch": 0.12268381419747863, + "grad_norm": 0.29078209400177, + "learning_rate": 0.001, + "loss": 2.3481, + "step": 2900 + }, + { + "epoch": 0.12272611896099501, + "grad_norm": 6.856618881225586, + "learning_rate": 0.001, + "loss": 3.0669, + "step": 2901 + }, + { + "epoch": 0.12276842372451138, + "grad_norm": 1.2403521537780762, + "learning_rate": 0.001, + "loss": 1.9102, + "step": 2902 + }, + { + "epoch": 0.12281072848802775, + "grad_norm": 0.6264525055885315, + "learning_rate": 0.001, + "loss": 1.9421, + "step": 2903 + }, + { + "epoch": 0.12285303325154412, + "grad_norm": 0.9500356912612915, + "learning_rate": 0.001, + "loss": 2.146, + "step": 2904 + }, + { + "epoch": 0.1228953380150605, + "grad_norm": 0.27822932600975037, + "learning_rate": 0.001, + "loss": 2.3955, + "step": 2905 + }, + { + "epoch": 0.12293764277857687, + "grad_norm": 0.27553531527519226, + "learning_rate": 0.001, + "loss": 2.3293, + "step": 2906 + }, + { + "epoch": 0.12297994754209324, + "grad_norm": 0.2871011197566986, + "learning_rate": 0.001, + "loss": 2.0411, + "step": 2907 + }, + { + "epoch": 0.1230222523056096, + "grad_norm": 0.3595285713672638, + "learning_rate": 0.001, + "loss": 2.316, + "step": 2908 + }, + { + "epoch": 0.12306455706912599, + "grad_norm": 0.32838475704193115, + "learning_rate": 0.001, + "loss": 3.3189, + "step": 2909 + }, + { + "epoch": 0.12310686183264236, + "grad_norm": 0.2596978545188904, + "learning_rate": 0.001, + "loss": 2.4232, + "step": 2910 + }, + { + "epoch": 0.12314916659615872, + "grad_norm": 0.2212909609079361, + "learning_rate": 0.001, + "loss": 2.0995, + "step": 2911 + }, + { + "epoch": 0.1231914713596751, + "grad_norm": 0.2778913974761963, + "learning_rate": 0.001, + "loss": 2.1786, + "step": 2912 + }, + { + "epoch": 0.12323377612319147, + "grad_norm": 0.23343005776405334, + "learning_rate": 0.001, + "loss": 2.1327, + "step": 2913 + }, + { + "epoch": 0.12327608088670784, + "grad_norm": 0.3342663049697876, + "learning_rate": 0.001, + "loss": 2.605, + "step": 2914 + }, + { + "epoch": 0.12331838565022421, + "grad_norm": 0.23159651458263397, + "learning_rate": 0.001, + "loss": 2.4356, + "step": 2915 + }, + { + "epoch": 0.12336069041374059, + "grad_norm": 0.36005109548568726, + "learning_rate": 0.001, + "loss": 1.9138, + "step": 2916 + }, + { + "epoch": 0.12340299517725696, + "grad_norm": 0.21737025678157806, + "learning_rate": 0.001, + "loss": 1.9291, + "step": 2917 + }, + { + "epoch": 0.12344529994077333, + "grad_norm": 0.9176145195960999, + "learning_rate": 0.001, + "loss": 1.9523, + "step": 2918 + }, + { + "epoch": 0.1234876047042897, + "grad_norm": 0.4270826280117035, + "learning_rate": 0.001, + "loss": 2.6282, + "step": 2919 + }, + { + "epoch": 0.12352990946780608, + "grad_norm": 0.23092937469482422, + "learning_rate": 0.001, + "loss": 2.1475, + "step": 2920 + }, + { + "epoch": 0.12357221423132245, + "grad_norm": 0.32089483737945557, + "learning_rate": 0.001, + "loss": 2.8787, + "step": 2921 + }, + { + "epoch": 0.12361451899483882, + "grad_norm": 0.5019152164459229, + "learning_rate": 0.001, + "loss": 2.4339, + "step": 2922 + }, + { + "epoch": 0.1236568237583552, + "grad_norm": 8.176496505737305, + "learning_rate": 0.001, + "loss": 2.9616, + "step": 2923 + }, + { + "epoch": 0.12369912852187157, + "grad_norm": 0.42185178399086, + "learning_rate": 0.001, + "loss": 3.3083, + "step": 2924 + }, + { + "epoch": 0.12374143328538793, + "grad_norm": 0.6926589608192444, + "learning_rate": 0.001, + "loss": 4.3947, + "step": 2925 + }, + { + "epoch": 0.1237837380489043, + "grad_norm": 2.2707536220550537, + "learning_rate": 0.001, + "loss": 1.8298, + "step": 2926 + }, + { + "epoch": 0.12382604281242068, + "grad_norm": 0.2851990759372711, + "learning_rate": 0.001, + "loss": 2.0727, + "step": 2927 + }, + { + "epoch": 0.12386834757593705, + "grad_norm": 0.7518361210823059, + "learning_rate": 0.001, + "loss": 1.6419, + "step": 2928 + }, + { + "epoch": 0.12391065233945342, + "grad_norm": 0.2845136523246765, + "learning_rate": 0.001, + "loss": 2.6422, + "step": 2929 + }, + { + "epoch": 0.12395295710296979, + "grad_norm": 0.32894209027290344, + "learning_rate": 0.001, + "loss": 2.7414, + "step": 2930 + }, + { + "epoch": 0.12399526186648617, + "grad_norm": 1.5604556798934937, + "learning_rate": 0.001, + "loss": 2.3227, + "step": 2931 + }, + { + "epoch": 0.12403756663000254, + "grad_norm": 0.49824783205986023, + "learning_rate": 0.001, + "loss": 1.9455, + "step": 2932 + }, + { + "epoch": 0.1240798713935189, + "grad_norm": 0.40257930755615234, + "learning_rate": 0.001, + "loss": 2.6387, + "step": 2933 + }, + { + "epoch": 0.12412217615703529, + "grad_norm": 0.27476415038108826, + "learning_rate": 0.001, + "loss": 2.3756, + "step": 2934 + }, + { + "epoch": 0.12416448092055166, + "grad_norm": 0.3869374096393585, + "learning_rate": 0.001, + "loss": 2.8796, + "step": 2935 + }, + { + "epoch": 0.12420678568406802, + "grad_norm": 0.47996222972869873, + "learning_rate": 0.001, + "loss": 2.8597, + "step": 2936 + }, + { + "epoch": 0.12424909044758439, + "grad_norm": 0.2530654966831207, + "learning_rate": 0.001, + "loss": 2.6784, + "step": 2937 + }, + { + "epoch": 0.12429139521110077, + "grad_norm": 0.8970469832420349, + "learning_rate": 0.001, + "loss": 2.6644, + "step": 2938 + }, + { + "epoch": 0.12433369997461714, + "grad_norm": 0.24477270245552063, + "learning_rate": 0.001, + "loss": 2.4868, + "step": 2939 + }, + { + "epoch": 0.12437600473813351, + "grad_norm": 1.4595485925674438, + "learning_rate": 0.001, + "loss": 2.1721, + "step": 2940 + }, + { + "epoch": 0.12441830950164988, + "grad_norm": 0.31102073192596436, + "learning_rate": 0.001, + "loss": 2.7839, + "step": 2941 + }, + { + "epoch": 0.12446061426516626, + "grad_norm": 1.8330522775650024, + "learning_rate": 0.001, + "loss": 3.1596, + "step": 2942 + }, + { + "epoch": 0.12450291902868263, + "grad_norm": 0.30926570296287537, + "learning_rate": 0.001, + "loss": 3.4155, + "step": 2943 + }, + { + "epoch": 0.124545223792199, + "grad_norm": 0.3584842085838318, + "learning_rate": 0.001, + "loss": 5.0056, + "step": 2944 + }, + { + "epoch": 0.12458752855571538, + "grad_norm": 0.24263957142829895, + "learning_rate": 0.001, + "loss": 1.7688, + "step": 2945 + }, + { + "epoch": 0.12462983331923175, + "grad_norm": 0.7257891893386841, + "learning_rate": 0.001, + "loss": 2.9134, + "step": 2946 + }, + { + "epoch": 0.12467213808274812, + "grad_norm": 0.2676026225090027, + "learning_rate": 0.001, + "loss": 3.1154, + "step": 2947 + }, + { + "epoch": 0.12471444284626448, + "grad_norm": 1.5698000192642212, + "learning_rate": 0.001, + "loss": 2.7397, + "step": 2948 + }, + { + "epoch": 0.12475674760978087, + "grad_norm": 0.2666419744491577, + "learning_rate": 0.001, + "loss": 2.6645, + "step": 2949 + }, + { + "epoch": 0.12479905237329723, + "grad_norm": 0.9079283475875854, + "learning_rate": 0.001, + "loss": 1.6547, + "step": 2950 + }, + { + "epoch": 0.1248413571368136, + "grad_norm": 0.27443641424179077, + "learning_rate": 0.001, + "loss": 2.2036, + "step": 2951 + }, + { + "epoch": 0.12488366190032997, + "grad_norm": 0.27384769916534424, + "learning_rate": 0.001, + "loss": 2.2574, + "step": 2952 + }, + { + "epoch": 0.12492596666384635, + "grad_norm": 0.37685632705688477, + "learning_rate": 0.001, + "loss": 2.1753, + "step": 2953 + }, + { + "epoch": 0.12496827142736272, + "grad_norm": 0.5168205499649048, + "learning_rate": 0.001, + "loss": 3.1164, + "step": 2954 + }, + { + "epoch": 0.1250105761908791, + "grad_norm": 0.22727070748806, + "learning_rate": 0.001, + "loss": 1.8189, + "step": 2955 + }, + { + "epoch": 0.12505288095439546, + "grad_norm": 2.466627836227417, + "learning_rate": 0.001, + "loss": 1.925, + "step": 2956 + }, + { + "epoch": 0.12509518571791184, + "grad_norm": 0.37915435433387756, + "learning_rate": 0.001, + "loss": 3.6095, + "step": 2957 + }, + { + "epoch": 0.12513749048142822, + "grad_norm": 0.3377191424369812, + "learning_rate": 0.001, + "loss": 3.52, + "step": 2958 + }, + { + "epoch": 0.12517979524494457, + "grad_norm": 0.5984206199645996, + "learning_rate": 0.001, + "loss": 2.0392, + "step": 2959 + }, + { + "epoch": 0.12522210000846096, + "grad_norm": 0.2659249007701874, + "learning_rate": 0.001, + "loss": 1.8572, + "step": 2960 + }, + { + "epoch": 0.1252644047719773, + "grad_norm": 0.25528255105018616, + "learning_rate": 0.001, + "loss": 2.2558, + "step": 2961 + }, + { + "epoch": 0.1253067095354937, + "grad_norm": 0.3032079339027405, + "learning_rate": 0.001, + "loss": 2.3852, + "step": 2962 + }, + { + "epoch": 0.12534901429901008, + "grad_norm": 0.29467710852622986, + "learning_rate": 0.001, + "loss": 2.2288, + "step": 2963 + }, + { + "epoch": 0.12539131906252643, + "grad_norm": 0.5475602746009827, + "learning_rate": 0.001, + "loss": 2.0179, + "step": 2964 + }, + { + "epoch": 0.1254336238260428, + "grad_norm": 0.2524794340133667, + "learning_rate": 0.001, + "loss": 3.1053, + "step": 2965 + }, + { + "epoch": 0.1254759285895592, + "grad_norm": 0.26027101278305054, + "learning_rate": 0.001, + "loss": 2.4225, + "step": 2966 + }, + { + "epoch": 0.12551823335307555, + "grad_norm": 0.2270876169204712, + "learning_rate": 0.001, + "loss": 1.8372, + "step": 2967 + }, + { + "epoch": 0.12556053811659193, + "grad_norm": 1.1829967498779297, + "learning_rate": 0.001, + "loss": 2.1506, + "step": 2968 + }, + { + "epoch": 0.1256028428801083, + "grad_norm": 0.423272967338562, + "learning_rate": 0.001, + "loss": 2.3921, + "step": 2969 + }, + { + "epoch": 0.12564514764362467, + "grad_norm": 0.8556601405143738, + "learning_rate": 0.001, + "loss": 2.0293, + "step": 2970 + }, + { + "epoch": 0.12568745240714105, + "grad_norm": 0.28667885065078735, + "learning_rate": 0.001, + "loss": 2.2469, + "step": 2971 + }, + { + "epoch": 0.1257297571706574, + "grad_norm": 0.3176821768283844, + "learning_rate": 0.001, + "loss": 2.315, + "step": 2972 + }, + { + "epoch": 0.12577206193417378, + "grad_norm": 0.3208604156970978, + "learning_rate": 0.001, + "loss": 3.3225, + "step": 2973 + }, + { + "epoch": 0.12581436669769017, + "grad_norm": 0.3094156086444855, + "learning_rate": 0.001, + "loss": 1.8009, + "step": 2974 + }, + { + "epoch": 0.12585667146120652, + "grad_norm": 0.2794618606567383, + "learning_rate": 0.001, + "loss": 2.4301, + "step": 2975 + }, + { + "epoch": 0.1258989762247229, + "grad_norm": 1.7522642612457275, + "learning_rate": 0.001, + "loss": 2.5346, + "step": 2976 + }, + { + "epoch": 0.12594128098823928, + "grad_norm": 0.7548322081565857, + "learning_rate": 0.001, + "loss": 2.3579, + "step": 2977 + }, + { + "epoch": 0.12598358575175564, + "grad_norm": 0.31399986147880554, + "learning_rate": 0.001, + "loss": 2.8131, + "step": 2978 + }, + { + "epoch": 0.12602589051527202, + "grad_norm": 0.5363809466362, + "learning_rate": 0.001, + "loss": 2.5607, + "step": 2979 + }, + { + "epoch": 0.1260681952787884, + "grad_norm": 0.495072603225708, + "learning_rate": 0.001, + "loss": 4.0278, + "step": 2980 + }, + { + "epoch": 0.12611050004230476, + "grad_norm": 0.36730262637138367, + "learning_rate": 0.001, + "loss": 2.1536, + "step": 2981 + }, + { + "epoch": 0.12615280480582114, + "grad_norm": 0.31238043308258057, + "learning_rate": 0.001, + "loss": 2.0246, + "step": 2982 + }, + { + "epoch": 0.1261951095693375, + "grad_norm": 0.2928439974784851, + "learning_rate": 0.001, + "loss": 3.6699, + "step": 2983 + }, + { + "epoch": 0.12623741433285388, + "grad_norm": 0.31920191645622253, + "learning_rate": 0.001, + "loss": 3.32, + "step": 2984 + }, + { + "epoch": 0.12627971909637026, + "grad_norm": 0.27571341395378113, + "learning_rate": 0.001, + "loss": 1.9494, + "step": 2985 + }, + { + "epoch": 0.1263220238598866, + "grad_norm": 0.3193123936653137, + "learning_rate": 0.001, + "loss": 2.1162, + "step": 2986 + }, + { + "epoch": 0.126364328623403, + "grad_norm": 0.24550597369670868, + "learning_rate": 0.001, + "loss": 3.5763, + "step": 2987 + }, + { + "epoch": 0.12640663338691938, + "grad_norm": 0.2659911513328552, + "learning_rate": 0.001, + "loss": 2.2886, + "step": 2988 + }, + { + "epoch": 0.12644893815043573, + "grad_norm": 0.3616361916065216, + "learning_rate": 0.001, + "loss": 2.4602, + "step": 2989 + }, + { + "epoch": 0.1264912429139521, + "grad_norm": 1.0882079601287842, + "learning_rate": 0.001, + "loss": 1.6782, + "step": 2990 + }, + { + "epoch": 0.1265335476774685, + "grad_norm": 0.1997089982032776, + "learning_rate": 0.001, + "loss": 2.1267, + "step": 2991 + }, + { + "epoch": 0.12657585244098485, + "grad_norm": 0.2554531991481781, + "learning_rate": 0.001, + "loss": 2.2105, + "step": 2992 + }, + { + "epoch": 0.12661815720450123, + "grad_norm": 1.5472227334976196, + "learning_rate": 0.001, + "loss": 2.6065, + "step": 2993 + }, + { + "epoch": 0.1266604619680176, + "grad_norm": 0.4191298186779022, + "learning_rate": 0.001, + "loss": 3.8673, + "step": 2994 + }, + { + "epoch": 0.12670276673153397, + "grad_norm": 0.3299647569656372, + "learning_rate": 0.001, + "loss": 2.6935, + "step": 2995 + }, + { + "epoch": 0.12674507149505035, + "grad_norm": 0.26964640617370605, + "learning_rate": 0.001, + "loss": 2.0866, + "step": 2996 + }, + { + "epoch": 0.1267873762585667, + "grad_norm": 0.22724339365959167, + "learning_rate": 0.001, + "loss": 1.6615, + "step": 2997 + }, + { + "epoch": 0.12682968102208309, + "grad_norm": 0.28157809376716614, + "learning_rate": 0.001, + "loss": 2.4699, + "step": 2998 + }, + { + "epoch": 0.12687198578559947, + "grad_norm": 0.3091438412666321, + "learning_rate": 0.001, + "loss": 2.359, + "step": 2999 + }, + { + "epoch": 0.12691429054911582, + "grad_norm": 0.2963161766529083, + "learning_rate": 0.001, + "loss": 2.3396, + "step": 3000 + }, + { + "epoch": 0.1269565953126322, + "grad_norm": 0.3389158546924591, + "learning_rate": 0.001, + "loss": 3.0763, + "step": 3001 + }, + { + "epoch": 0.12699890007614859, + "grad_norm": 1.0668145418167114, + "learning_rate": 0.001, + "loss": 2.6003, + "step": 3002 + }, + { + "epoch": 0.12704120483966494, + "grad_norm": 0.28413814306259155, + "learning_rate": 0.001, + "loss": 2.9304, + "step": 3003 + }, + { + "epoch": 0.12708350960318132, + "grad_norm": 0.5639888048171997, + "learning_rate": 0.001, + "loss": 2.1774, + "step": 3004 + }, + { + "epoch": 0.1271258143666977, + "grad_norm": 0.24791397154331207, + "learning_rate": 0.001, + "loss": 2.9347, + "step": 3005 + }, + { + "epoch": 0.12716811913021406, + "grad_norm": 0.23885676264762878, + "learning_rate": 0.001, + "loss": 2.189, + "step": 3006 + }, + { + "epoch": 0.12721042389373044, + "grad_norm": 0.2213631123304367, + "learning_rate": 0.001, + "loss": 2.3451, + "step": 3007 + }, + { + "epoch": 0.1272527286572468, + "grad_norm": 0.29912659525871277, + "learning_rate": 0.001, + "loss": 2.8164, + "step": 3008 + }, + { + "epoch": 0.12729503342076318, + "grad_norm": 0.7422712445259094, + "learning_rate": 0.001, + "loss": 2.4031, + "step": 3009 + }, + { + "epoch": 0.12733733818427956, + "grad_norm": 0.2729654014110565, + "learning_rate": 0.001, + "loss": 2.3894, + "step": 3010 + }, + { + "epoch": 0.1273796429477959, + "grad_norm": 0.2171071618795395, + "learning_rate": 0.001, + "loss": 2.0108, + "step": 3011 + }, + { + "epoch": 0.1274219477113123, + "grad_norm": 0.2428562194108963, + "learning_rate": 0.001, + "loss": 2.9788, + "step": 3012 + }, + { + "epoch": 0.12746425247482868, + "grad_norm": 0.19264620542526245, + "learning_rate": 0.001, + "loss": 1.949, + "step": 3013 + }, + { + "epoch": 0.12750655723834503, + "grad_norm": 0.2231709510087967, + "learning_rate": 0.001, + "loss": 2.2655, + "step": 3014 + }, + { + "epoch": 0.1275488620018614, + "grad_norm": 0.6086738705635071, + "learning_rate": 0.001, + "loss": 3.2293, + "step": 3015 + }, + { + "epoch": 0.1275911667653778, + "grad_norm": 0.20442450046539307, + "learning_rate": 0.001, + "loss": 1.6902, + "step": 3016 + }, + { + "epoch": 0.12763347152889415, + "grad_norm": 0.2568240463733673, + "learning_rate": 0.001, + "loss": 2.3126, + "step": 3017 + }, + { + "epoch": 0.12767577629241053, + "grad_norm": 0.6941277384757996, + "learning_rate": 0.001, + "loss": 2.5573, + "step": 3018 + }, + { + "epoch": 0.12771808105592689, + "grad_norm": 0.2820114493370056, + "learning_rate": 0.001, + "loss": 3.45, + "step": 3019 + }, + { + "epoch": 0.12776038581944327, + "grad_norm": 0.27247634530067444, + "learning_rate": 0.001, + "loss": 3.0103, + "step": 3020 + }, + { + "epoch": 0.12780269058295965, + "grad_norm": 0.30561110377311707, + "learning_rate": 0.001, + "loss": 2.9789, + "step": 3021 + }, + { + "epoch": 0.127844995346476, + "grad_norm": 0.7641719579696655, + "learning_rate": 0.001, + "loss": 1.9984, + "step": 3022 + }, + { + "epoch": 0.12788730010999239, + "grad_norm": 1.2558095455169678, + "learning_rate": 0.001, + "loss": 2.3797, + "step": 3023 + }, + { + "epoch": 0.12792960487350877, + "grad_norm": 0.24033454060554504, + "learning_rate": 0.001, + "loss": 2.5231, + "step": 3024 + }, + { + "epoch": 0.12797190963702512, + "grad_norm": 0.9906525611877441, + "learning_rate": 0.001, + "loss": 3.2656, + "step": 3025 + }, + { + "epoch": 0.1280142144005415, + "grad_norm": 0.35158273577690125, + "learning_rate": 0.001, + "loss": 2.5469, + "step": 3026 + }, + { + "epoch": 0.12805651916405789, + "grad_norm": 0.313698947429657, + "learning_rate": 0.001, + "loss": 2.2092, + "step": 3027 + }, + { + "epoch": 0.12809882392757424, + "grad_norm": 0.2757653594017029, + "learning_rate": 0.001, + "loss": 1.7885, + "step": 3028 + }, + { + "epoch": 0.12814112869109062, + "grad_norm": 0.23092950880527496, + "learning_rate": 0.001, + "loss": 1.9807, + "step": 3029 + }, + { + "epoch": 0.12818343345460698, + "grad_norm": 0.7450085282325745, + "learning_rate": 0.001, + "loss": 1.9991, + "step": 3030 + }, + { + "epoch": 0.12822573821812336, + "grad_norm": 0.362114816904068, + "learning_rate": 0.001, + "loss": 4.5653, + "step": 3031 + }, + { + "epoch": 0.12826804298163974, + "grad_norm": 0.23789288103580475, + "learning_rate": 0.001, + "loss": 2.1967, + "step": 3032 + }, + { + "epoch": 0.1283103477451561, + "grad_norm": 0.27310964465141296, + "learning_rate": 0.001, + "loss": 3.2499, + "step": 3033 + }, + { + "epoch": 0.12835265250867248, + "grad_norm": 0.3834547996520996, + "learning_rate": 0.001, + "loss": 2.3692, + "step": 3034 + }, + { + "epoch": 0.12839495727218886, + "grad_norm": 0.28623712062835693, + "learning_rate": 0.001, + "loss": 2.875, + "step": 3035 + }, + { + "epoch": 0.1284372620357052, + "grad_norm": 0.35939669609069824, + "learning_rate": 0.001, + "loss": 2.4295, + "step": 3036 + }, + { + "epoch": 0.1284795667992216, + "grad_norm": 0.403884619474411, + "learning_rate": 0.001, + "loss": 2.8118, + "step": 3037 + }, + { + "epoch": 0.12852187156273798, + "grad_norm": 0.7596374154090881, + "learning_rate": 0.001, + "loss": 2.5978, + "step": 3038 + }, + { + "epoch": 0.12856417632625433, + "grad_norm": 0.24331355094909668, + "learning_rate": 0.001, + "loss": 2.3842, + "step": 3039 + }, + { + "epoch": 0.1286064810897707, + "grad_norm": 0.23005782067775726, + "learning_rate": 0.001, + "loss": 2.1265, + "step": 3040 + }, + { + "epoch": 0.12864878585328707, + "grad_norm": 0.23082035779953003, + "learning_rate": 0.001, + "loss": 2.273, + "step": 3041 + }, + { + "epoch": 0.12869109061680345, + "grad_norm": 0.26438039541244507, + "learning_rate": 0.001, + "loss": 2.3408, + "step": 3042 + }, + { + "epoch": 0.12873339538031983, + "grad_norm": 0.22107988595962524, + "learning_rate": 0.001, + "loss": 1.9021, + "step": 3043 + }, + { + "epoch": 0.12877570014383619, + "grad_norm": 0.21788084506988525, + "learning_rate": 0.001, + "loss": 3.0715, + "step": 3044 + }, + { + "epoch": 0.12881800490735257, + "grad_norm": 0.23471501469612122, + "learning_rate": 0.001, + "loss": 2.2976, + "step": 3045 + }, + { + "epoch": 0.12886030967086895, + "grad_norm": 0.28260231018066406, + "learning_rate": 0.001, + "loss": 1.8561, + "step": 3046 + }, + { + "epoch": 0.1289026144343853, + "grad_norm": 0.26574090123176575, + "learning_rate": 0.001, + "loss": 2.0225, + "step": 3047 + }, + { + "epoch": 0.1289449191979017, + "grad_norm": 0.34297651052474976, + "learning_rate": 0.001, + "loss": 3.0373, + "step": 3048 + }, + { + "epoch": 0.12898722396141807, + "grad_norm": 0.586717426776886, + "learning_rate": 0.001, + "loss": 2.1005, + "step": 3049 + }, + { + "epoch": 0.12902952872493442, + "grad_norm": 0.21093139052391052, + "learning_rate": 0.001, + "loss": 2.3463, + "step": 3050 + }, + { + "epoch": 0.1290718334884508, + "grad_norm": 0.29111140966415405, + "learning_rate": 0.001, + "loss": 2.1633, + "step": 3051 + }, + { + "epoch": 0.12911413825196716, + "grad_norm": 0.30666249990463257, + "learning_rate": 0.001, + "loss": 3.0201, + "step": 3052 + }, + { + "epoch": 0.12915644301548354, + "grad_norm": 0.24410761892795563, + "learning_rate": 0.001, + "loss": 2.8614, + "step": 3053 + }, + { + "epoch": 0.12919874777899992, + "grad_norm": 0.2161339968442917, + "learning_rate": 0.001, + "loss": 1.7318, + "step": 3054 + }, + { + "epoch": 0.12924105254251628, + "grad_norm": 0.19016364216804504, + "learning_rate": 0.001, + "loss": 2.0828, + "step": 3055 + }, + { + "epoch": 0.12928335730603266, + "grad_norm": 0.23141834139823914, + "learning_rate": 0.001, + "loss": 2.8, + "step": 3056 + }, + { + "epoch": 0.12932566206954904, + "grad_norm": 1.5664108991622925, + "learning_rate": 0.001, + "loss": 2.1793, + "step": 3057 + }, + { + "epoch": 0.1293679668330654, + "grad_norm": 2.544483184814453, + "learning_rate": 0.001, + "loss": 2.9984, + "step": 3058 + }, + { + "epoch": 0.12941027159658178, + "grad_norm": 0.30240291357040405, + "learning_rate": 0.001, + "loss": 2.197, + "step": 3059 + }, + { + "epoch": 0.12945257636009816, + "grad_norm": 0.24638395011425018, + "learning_rate": 0.001, + "loss": 2.2356, + "step": 3060 + }, + { + "epoch": 0.1294948811236145, + "grad_norm": 0.9346217513084412, + "learning_rate": 0.001, + "loss": 2.1051, + "step": 3061 + }, + { + "epoch": 0.1295371858871309, + "grad_norm": 0.256610244512558, + "learning_rate": 0.001, + "loss": 2.3009, + "step": 3062 + }, + { + "epoch": 0.12957949065064725, + "grad_norm": 0.40396857261657715, + "learning_rate": 0.001, + "loss": 2.5668, + "step": 3063 + }, + { + "epoch": 0.12962179541416363, + "grad_norm": 10.963554382324219, + "learning_rate": 0.001, + "loss": 2.1443, + "step": 3064 + }, + { + "epoch": 0.12966410017768001, + "grad_norm": 2.6234822273254395, + "learning_rate": 0.001, + "loss": 2.6211, + "step": 3065 + }, + { + "epoch": 0.12970640494119637, + "grad_norm": 0.3858783543109894, + "learning_rate": 0.001, + "loss": 1.9484, + "step": 3066 + }, + { + "epoch": 0.12974870970471275, + "grad_norm": 4.675527572631836, + "learning_rate": 0.001, + "loss": 2.551, + "step": 3067 + }, + { + "epoch": 0.12979101446822913, + "grad_norm": 0.3626082241535187, + "learning_rate": 0.001, + "loss": 1.7012, + "step": 3068 + }, + { + "epoch": 0.1298333192317455, + "grad_norm": 0.605455756187439, + "learning_rate": 0.001, + "loss": 1.6341, + "step": 3069 + }, + { + "epoch": 0.12987562399526187, + "grad_norm": 0.38578104972839355, + "learning_rate": 0.001, + "loss": 2.0653, + "step": 3070 + }, + { + "epoch": 0.12991792875877825, + "grad_norm": 0.3008464574813843, + "learning_rate": 0.001, + "loss": 2.3254, + "step": 3071 + }, + { + "epoch": 0.1299602335222946, + "grad_norm": 0.31806275248527527, + "learning_rate": 0.001, + "loss": 2.4444, + "step": 3072 + }, + { + "epoch": 0.130002538285811, + "grad_norm": 0.3251250386238098, + "learning_rate": 0.001, + "loss": 2.173, + "step": 3073 + }, + { + "epoch": 0.13004484304932734, + "grad_norm": 0.3980516791343689, + "learning_rate": 0.001, + "loss": 2.8316, + "step": 3074 + }, + { + "epoch": 0.13008714781284372, + "grad_norm": 0.3091077506542206, + "learning_rate": 0.001, + "loss": 2.9065, + "step": 3075 + }, + { + "epoch": 0.1301294525763601, + "grad_norm": 0.6228027939796448, + "learning_rate": 0.001, + "loss": 2.3924, + "step": 3076 + }, + { + "epoch": 0.13017175733987646, + "grad_norm": 0.2532864212989807, + "learning_rate": 0.001, + "loss": 2.3496, + "step": 3077 + }, + { + "epoch": 0.13021406210339284, + "grad_norm": 0.32938069105148315, + "learning_rate": 0.001, + "loss": 2.9943, + "step": 3078 + }, + { + "epoch": 0.13025636686690922, + "grad_norm": 0.7084699273109436, + "learning_rate": 0.001, + "loss": 2.3694, + "step": 3079 + }, + { + "epoch": 0.13029867163042558, + "grad_norm": 0.2795187830924988, + "learning_rate": 0.001, + "loss": 2.4931, + "step": 3080 + }, + { + "epoch": 0.13034097639394196, + "grad_norm": 2.5224292278289795, + "learning_rate": 0.001, + "loss": 2.2264, + "step": 3081 + }, + { + "epoch": 0.13038328115745834, + "grad_norm": 0.25982218980789185, + "learning_rate": 0.001, + "loss": 2.2179, + "step": 3082 + }, + { + "epoch": 0.1304255859209747, + "grad_norm": 0.35819700360298157, + "learning_rate": 0.001, + "loss": 2.5449, + "step": 3083 + }, + { + "epoch": 0.13046789068449108, + "grad_norm": 0.7076107263565063, + "learning_rate": 0.001, + "loss": 2.2326, + "step": 3084 + }, + { + "epoch": 0.13051019544800743, + "grad_norm": 0.4517766237258911, + "learning_rate": 0.001, + "loss": 3.3274, + "step": 3085 + }, + { + "epoch": 0.13055250021152381, + "grad_norm": 0.3324654698371887, + "learning_rate": 0.001, + "loss": 2.1643, + "step": 3086 + }, + { + "epoch": 0.1305948049750402, + "grad_norm": 0.2703245282173157, + "learning_rate": 0.001, + "loss": 2.2756, + "step": 3087 + }, + { + "epoch": 0.13063710973855655, + "grad_norm": 0.24688389897346497, + "learning_rate": 0.001, + "loss": 1.4894, + "step": 3088 + }, + { + "epoch": 0.13067941450207293, + "grad_norm": 0.2690267860889435, + "learning_rate": 0.001, + "loss": 2.0199, + "step": 3089 + }, + { + "epoch": 0.13072171926558931, + "grad_norm": 0.2187577337026596, + "learning_rate": 0.001, + "loss": 1.8486, + "step": 3090 + }, + { + "epoch": 0.13076402402910567, + "grad_norm": 0.3903275728225708, + "learning_rate": 0.001, + "loss": 1.9344, + "step": 3091 + }, + { + "epoch": 0.13080632879262205, + "grad_norm": 1.0155338048934937, + "learning_rate": 0.001, + "loss": 2.6314, + "step": 3092 + }, + { + "epoch": 0.13084863355613843, + "grad_norm": 2.3899335861206055, + "learning_rate": 0.001, + "loss": 2.6188, + "step": 3093 + }, + { + "epoch": 0.1308909383196548, + "grad_norm": 0.25962334871292114, + "learning_rate": 0.001, + "loss": 2.4554, + "step": 3094 + }, + { + "epoch": 0.13093324308317117, + "grad_norm": 0.4966904819011688, + "learning_rate": 0.001, + "loss": 2.3855, + "step": 3095 + }, + { + "epoch": 0.13097554784668752, + "grad_norm": 0.26274529099464417, + "learning_rate": 0.001, + "loss": 2.2674, + "step": 3096 + }, + { + "epoch": 0.1310178526102039, + "grad_norm": 0.7305838465690613, + "learning_rate": 0.001, + "loss": 3.0309, + "step": 3097 + }, + { + "epoch": 0.1310601573737203, + "grad_norm": 0.2599664330482483, + "learning_rate": 0.001, + "loss": 3.2799, + "step": 3098 + }, + { + "epoch": 0.13110246213723664, + "grad_norm": 0.3204602301120758, + "learning_rate": 0.001, + "loss": 1.795, + "step": 3099 + }, + { + "epoch": 0.13114476690075302, + "grad_norm": 0.4395667612552643, + "learning_rate": 0.001, + "loss": 2.2309, + "step": 3100 + }, + { + "epoch": 0.1311870716642694, + "grad_norm": 0.5608950257301331, + "learning_rate": 0.001, + "loss": 3.5101, + "step": 3101 + }, + { + "epoch": 0.13122937642778576, + "grad_norm": 0.28046101331710815, + "learning_rate": 0.001, + "loss": 2.8559, + "step": 3102 + }, + { + "epoch": 0.13127168119130214, + "grad_norm": 0.7474309206008911, + "learning_rate": 0.001, + "loss": 2.1403, + "step": 3103 + }, + { + "epoch": 0.13131398595481852, + "grad_norm": 0.3261275291442871, + "learning_rate": 0.001, + "loss": 2.4168, + "step": 3104 + }, + { + "epoch": 0.13135629071833488, + "grad_norm": 0.29234766960144043, + "learning_rate": 0.001, + "loss": 1.9736, + "step": 3105 + }, + { + "epoch": 0.13139859548185126, + "grad_norm": 0.32281428575515747, + "learning_rate": 0.001, + "loss": 2.9004, + "step": 3106 + }, + { + "epoch": 0.13144090024536761, + "grad_norm": 0.3510703146457672, + "learning_rate": 0.001, + "loss": 3.3073, + "step": 3107 + }, + { + "epoch": 0.131483205008884, + "grad_norm": 0.25277993083000183, + "learning_rate": 0.001, + "loss": 1.9087, + "step": 3108 + }, + { + "epoch": 0.13152550977240038, + "grad_norm": 0.5728982090950012, + "learning_rate": 0.001, + "loss": 2.3717, + "step": 3109 + }, + { + "epoch": 0.13156781453591673, + "grad_norm": 0.2837635278701782, + "learning_rate": 0.001, + "loss": 3.2256, + "step": 3110 + }, + { + "epoch": 0.13161011929943311, + "grad_norm": 0.9751304984092712, + "learning_rate": 0.001, + "loss": 1.9881, + "step": 3111 + }, + { + "epoch": 0.1316524240629495, + "grad_norm": 0.26259645819664, + "learning_rate": 0.001, + "loss": 2.9728, + "step": 3112 + }, + { + "epoch": 0.13169472882646585, + "grad_norm": 0.28536665439605713, + "learning_rate": 0.001, + "loss": 2.4389, + "step": 3113 + }, + { + "epoch": 0.13173703358998223, + "grad_norm": 0.8838117718696594, + "learning_rate": 0.001, + "loss": 2.7865, + "step": 3114 + }, + { + "epoch": 0.13177933835349862, + "grad_norm": 0.351610392332077, + "learning_rate": 0.001, + "loss": 4.3358, + "step": 3115 + }, + { + "epoch": 0.13182164311701497, + "grad_norm": 0.6937295198440552, + "learning_rate": 0.001, + "loss": 3.2516, + "step": 3116 + }, + { + "epoch": 0.13186394788053135, + "grad_norm": 0.24456751346588135, + "learning_rate": 0.001, + "loss": 3.5256, + "step": 3117 + }, + { + "epoch": 0.13190625264404773, + "grad_norm": 1.5360091924667358, + "learning_rate": 0.001, + "loss": 2.8469, + "step": 3118 + }, + { + "epoch": 0.1319485574075641, + "grad_norm": 0.3377660810947418, + "learning_rate": 0.001, + "loss": 2.5228, + "step": 3119 + }, + { + "epoch": 0.13199086217108047, + "grad_norm": 0.3961987793445587, + "learning_rate": 0.001, + "loss": 2.7447, + "step": 3120 + }, + { + "epoch": 0.13203316693459682, + "grad_norm": 0.32120051980018616, + "learning_rate": 0.001, + "loss": 2.3102, + "step": 3121 + }, + { + "epoch": 0.1320754716981132, + "grad_norm": 0.3845050036907196, + "learning_rate": 0.001, + "loss": 3.7081, + "step": 3122 + }, + { + "epoch": 0.1321177764616296, + "grad_norm": 0.2517256736755371, + "learning_rate": 0.001, + "loss": 2.6708, + "step": 3123 + }, + { + "epoch": 0.13216008122514594, + "grad_norm": 0.25273680686950684, + "learning_rate": 0.001, + "loss": 2.748, + "step": 3124 + }, + { + "epoch": 0.13220238598866232, + "grad_norm": 0.3618360161781311, + "learning_rate": 0.001, + "loss": 3.418, + "step": 3125 + }, + { + "epoch": 0.1322446907521787, + "grad_norm": 0.4369431436061859, + "learning_rate": 0.001, + "loss": 3.6098, + "step": 3126 + }, + { + "epoch": 0.13228699551569506, + "grad_norm": 0.28894707560539246, + "learning_rate": 0.001, + "loss": 1.7968, + "step": 3127 + }, + { + "epoch": 0.13232930027921144, + "grad_norm": 0.29500776529312134, + "learning_rate": 0.001, + "loss": 2.6331, + "step": 3128 + }, + { + "epoch": 0.13237160504272782, + "grad_norm": 0.8470263481140137, + "learning_rate": 0.001, + "loss": 2.9434, + "step": 3129 + }, + { + "epoch": 0.13241390980624418, + "grad_norm": 0.20783580839633942, + "learning_rate": 0.001, + "loss": 1.635, + "step": 3130 + }, + { + "epoch": 0.13245621456976056, + "grad_norm": 0.4847624897956848, + "learning_rate": 0.001, + "loss": 2.6969, + "step": 3131 + }, + { + "epoch": 0.13249851933327692, + "grad_norm": 0.2904376983642578, + "learning_rate": 0.001, + "loss": 3.1772, + "step": 3132 + }, + { + "epoch": 0.1325408240967933, + "grad_norm": 0.29514384269714355, + "learning_rate": 0.001, + "loss": 2.2651, + "step": 3133 + }, + { + "epoch": 0.13258312886030968, + "grad_norm": 0.3028735816478729, + "learning_rate": 0.001, + "loss": 2.2932, + "step": 3134 + }, + { + "epoch": 0.13262543362382603, + "grad_norm": 0.2190832495689392, + "learning_rate": 0.001, + "loss": 2.291, + "step": 3135 + }, + { + "epoch": 0.13266773838734242, + "grad_norm": 0.4608692526817322, + "learning_rate": 0.001, + "loss": 2.9313, + "step": 3136 + }, + { + "epoch": 0.1327100431508588, + "grad_norm": 0.5700821280479431, + "learning_rate": 0.001, + "loss": 2.6597, + "step": 3137 + }, + { + "epoch": 0.13275234791437515, + "grad_norm": 0.41855689883232117, + "learning_rate": 0.001, + "loss": 2.2829, + "step": 3138 + }, + { + "epoch": 0.13279465267789153, + "grad_norm": 0.2460172474384308, + "learning_rate": 0.001, + "loss": 2.0043, + "step": 3139 + }, + { + "epoch": 0.13283695744140792, + "grad_norm": 0.26085442304611206, + "learning_rate": 0.001, + "loss": 2.8132, + "step": 3140 + }, + { + "epoch": 0.13287926220492427, + "grad_norm": 0.27267521619796753, + "learning_rate": 0.001, + "loss": 1.9265, + "step": 3141 + }, + { + "epoch": 0.13292156696844065, + "grad_norm": 0.4825087785720825, + "learning_rate": 0.001, + "loss": 2.5328, + "step": 3142 + }, + { + "epoch": 0.132963871731957, + "grad_norm": 0.7955046892166138, + "learning_rate": 0.001, + "loss": 2.4661, + "step": 3143 + }, + { + "epoch": 0.1330061764954734, + "grad_norm": 0.22219106554985046, + "learning_rate": 0.001, + "loss": 2.8113, + "step": 3144 + }, + { + "epoch": 0.13304848125898977, + "grad_norm": 0.20542827248573303, + "learning_rate": 0.001, + "loss": 1.5933, + "step": 3145 + }, + { + "epoch": 0.13309078602250612, + "grad_norm": 1.0127766132354736, + "learning_rate": 0.001, + "loss": 2.4128, + "step": 3146 + }, + { + "epoch": 0.1331330907860225, + "grad_norm": 0.7383486032485962, + "learning_rate": 0.001, + "loss": 2.2777, + "step": 3147 + }, + { + "epoch": 0.1331753955495389, + "grad_norm": 0.714616596698761, + "learning_rate": 0.001, + "loss": 2.5478, + "step": 3148 + }, + { + "epoch": 0.13321770031305524, + "grad_norm": 0.4222099781036377, + "learning_rate": 0.001, + "loss": 2.4327, + "step": 3149 + }, + { + "epoch": 0.13326000507657162, + "grad_norm": 0.3047798275947571, + "learning_rate": 0.001, + "loss": 1.8279, + "step": 3150 + }, + { + "epoch": 0.133302309840088, + "grad_norm": 32.82894515991211, + "learning_rate": 0.001, + "loss": 2.1241, + "step": 3151 + }, + { + "epoch": 0.13334461460360436, + "grad_norm": 4.569951057434082, + "learning_rate": 0.001, + "loss": 2.4177, + "step": 3152 + }, + { + "epoch": 0.13338691936712074, + "grad_norm": 0.2539486587047577, + "learning_rate": 0.001, + "loss": 2.6896, + "step": 3153 + }, + { + "epoch": 0.1334292241306371, + "grad_norm": 0.3113529086112976, + "learning_rate": 0.001, + "loss": 2.3834, + "step": 3154 + }, + { + "epoch": 0.13347152889415348, + "grad_norm": 0.5559782981872559, + "learning_rate": 0.001, + "loss": 3.521, + "step": 3155 + }, + { + "epoch": 0.13351383365766986, + "grad_norm": 0.3125370144844055, + "learning_rate": 0.001, + "loss": 2.5982, + "step": 3156 + }, + { + "epoch": 0.13355613842118622, + "grad_norm": 0.2650197744369507, + "learning_rate": 0.001, + "loss": 2.1565, + "step": 3157 + }, + { + "epoch": 0.1335984431847026, + "grad_norm": 0.3328724503517151, + "learning_rate": 0.001, + "loss": 2.0653, + "step": 3158 + }, + { + "epoch": 0.13364074794821898, + "grad_norm": 0.2736479341983795, + "learning_rate": 0.001, + "loss": 3.6324, + "step": 3159 + }, + { + "epoch": 0.13368305271173533, + "grad_norm": 0.3234842121601105, + "learning_rate": 0.001, + "loss": 2.1484, + "step": 3160 + }, + { + "epoch": 0.13372535747525172, + "grad_norm": 0.2782366871833801, + "learning_rate": 0.001, + "loss": 4.0075, + "step": 3161 + }, + { + "epoch": 0.1337676622387681, + "grad_norm": 0.2664390504360199, + "learning_rate": 0.001, + "loss": 1.7258, + "step": 3162 + }, + { + "epoch": 0.13380996700228445, + "grad_norm": 0.23455047607421875, + "learning_rate": 0.001, + "loss": 1.9677, + "step": 3163 + }, + { + "epoch": 0.13385227176580083, + "grad_norm": 0.22314786911010742, + "learning_rate": 0.001, + "loss": 2.3173, + "step": 3164 + }, + { + "epoch": 0.1338945765293172, + "grad_norm": 0.2761704921722412, + "learning_rate": 0.001, + "loss": 1.8873, + "step": 3165 + }, + { + "epoch": 0.13393688129283357, + "grad_norm": 0.6037988066673279, + "learning_rate": 0.001, + "loss": 2.6251, + "step": 3166 + }, + { + "epoch": 0.13397918605634995, + "grad_norm": 0.42490071058273315, + "learning_rate": 0.001, + "loss": 2.1205, + "step": 3167 + }, + { + "epoch": 0.1340214908198663, + "grad_norm": 0.27945172786712646, + "learning_rate": 0.001, + "loss": 2.0086, + "step": 3168 + }, + { + "epoch": 0.1340637955833827, + "grad_norm": 0.2539001703262329, + "learning_rate": 0.001, + "loss": 2.1604, + "step": 3169 + }, + { + "epoch": 0.13410610034689907, + "grad_norm": 0.23439648747444153, + "learning_rate": 0.001, + "loss": 2.4407, + "step": 3170 + }, + { + "epoch": 0.13414840511041543, + "grad_norm": 0.3301600515842438, + "learning_rate": 0.001, + "loss": 2.0621, + "step": 3171 + }, + { + "epoch": 0.1341907098739318, + "grad_norm": 2.253829002380371, + "learning_rate": 0.001, + "loss": 2.1485, + "step": 3172 + }, + { + "epoch": 0.1342330146374482, + "grad_norm": 0.34564846754074097, + "learning_rate": 0.001, + "loss": 2.2515, + "step": 3173 + }, + { + "epoch": 0.13427531940096454, + "grad_norm": 0.269344300031662, + "learning_rate": 0.001, + "loss": 3.2911, + "step": 3174 + }, + { + "epoch": 0.13431762416448093, + "grad_norm": 0.2677054703235626, + "learning_rate": 0.001, + "loss": 1.9107, + "step": 3175 + }, + { + "epoch": 0.13435992892799728, + "grad_norm": 1.6993762254714966, + "learning_rate": 0.001, + "loss": 2.3414, + "step": 3176 + }, + { + "epoch": 0.13440223369151366, + "grad_norm": 0.29692259430885315, + "learning_rate": 0.001, + "loss": 2.1323, + "step": 3177 + }, + { + "epoch": 0.13444453845503004, + "grad_norm": 0.24042674899101257, + "learning_rate": 0.001, + "loss": 2.9594, + "step": 3178 + }, + { + "epoch": 0.1344868432185464, + "grad_norm": 0.7225860357284546, + "learning_rate": 0.001, + "loss": 2.6567, + "step": 3179 + }, + { + "epoch": 0.13452914798206278, + "grad_norm": 0.24381719529628754, + "learning_rate": 0.001, + "loss": 2.0548, + "step": 3180 + }, + { + "epoch": 0.13457145274557916, + "grad_norm": 0.2055826634168625, + "learning_rate": 0.001, + "loss": 2.3845, + "step": 3181 + }, + { + "epoch": 0.13461375750909552, + "grad_norm": 0.2573340833187103, + "learning_rate": 0.001, + "loss": 2.0984, + "step": 3182 + }, + { + "epoch": 0.1346560622726119, + "grad_norm": 0.2962018847465515, + "learning_rate": 0.001, + "loss": 2.5636, + "step": 3183 + }, + { + "epoch": 0.13469836703612828, + "grad_norm": 0.40545910596847534, + "learning_rate": 0.001, + "loss": 2.3188, + "step": 3184 + }, + { + "epoch": 0.13474067179964463, + "grad_norm": 0.6234009861946106, + "learning_rate": 0.001, + "loss": 1.9832, + "step": 3185 + }, + { + "epoch": 0.13478297656316102, + "grad_norm": 0.9001304507255554, + "learning_rate": 0.001, + "loss": 2.0331, + "step": 3186 + }, + { + "epoch": 0.13482528132667737, + "grad_norm": 3.3280558586120605, + "learning_rate": 0.001, + "loss": 2.8476, + "step": 3187 + }, + { + "epoch": 0.13486758609019375, + "grad_norm": 0.25616398453712463, + "learning_rate": 0.001, + "loss": 2.8065, + "step": 3188 + }, + { + "epoch": 0.13490989085371013, + "grad_norm": 4.6577467918396, + "learning_rate": 0.001, + "loss": 2.244, + "step": 3189 + }, + { + "epoch": 0.1349521956172265, + "grad_norm": 0.37131521105766296, + "learning_rate": 0.001, + "loss": 2.2248, + "step": 3190 + }, + { + "epoch": 0.13499450038074287, + "grad_norm": 0.4129955768585205, + "learning_rate": 0.001, + "loss": 2.701, + "step": 3191 + }, + { + "epoch": 0.13503680514425925, + "grad_norm": 0.24256403744220734, + "learning_rate": 0.001, + "loss": 2.0321, + "step": 3192 + }, + { + "epoch": 0.1350791099077756, + "grad_norm": 0.4478329122066498, + "learning_rate": 0.001, + "loss": 2.888, + "step": 3193 + }, + { + "epoch": 0.135121414671292, + "grad_norm": 0.24205787479877472, + "learning_rate": 0.001, + "loss": 2.9618, + "step": 3194 + }, + { + "epoch": 0.13516371943480837, + "grad_norm": 0.2591802179813385, + "learning_rate": 0.001, + "loss": 2.3314, + "step": 3195 + }, + { + "epoch": 0.13520602419832473, + "grad_norm": 0.26308709383010864, + "learning_rate": 0.001, + "loss": 1.5799, + "step": 3196 + }, + { + "epoch": 0.1352483289618411, + "grad_norm": 2.3362512588500977, + "learning_rate": 0.001, + "loss": 2.72, + "step": 3197 + }, + { + "epoch": 0.13529063372535746, + "grad_norm": 0.25793129205703735, + "learning_rate": 0.001, + "loss": 3.2309, + "step": 3198 + }, + { + "epoch": 0.13533293848887384, + "grad_norm": 0.24646157026290894, + "learning_rate": 0.001, + "loss": 2.5992, + "step": 3199 + }, + { + "epoch": 0.13537524325239023, + "grad_norm": 1.71589195728302, + "learning_rate": 0.001, + "loss": 2.0522, + "step": 3200 + }, + { + "epoch": 0.13541754801590658, + "grad_norm": 0.24351108074188232, + "learning_rate": 0.001, + "loss": 1.7597, + "step": 3201 + }, + { + "epoch": 0.13545985277942296, + "grad_norm": 0.23399876058101654, + "learning_rate": 0.001, + "loss": 3.4004, + "step": 3202 + }, + { + "epoch": 0.13550215754293934, + "grad_norm": 0.2488681972026825, + "learning_rate": 0.001, + "loss": 3.0523, + "step": 3203 + }, + { + "epoch": 0.1355444623064557, + "grad_norm": 0.276132732629776, + "learning_rate": 0.001, + "loss": 3.5311, + "step": 3204 + }, + { + "epoch": 0.13558676706997208, + "grad_norm": 0.2650410830974579, + "learning_rate": 0.001, + "loss": 2.3133, + "step": 3205 + }, + { + "epoch": 0.13562907183348846, + "grad_norm": 8.522130966186523, + "learning_rate": 0.001, + "loss": 2.3636, + "step": 3206 + }, + { + "epoch": 0.13567137659700482, + "grad_norm": 0.26246312260627747, + "learning_rate": 0.001, + "loss": 2.2168, + "step": 3207 + }, + { + "epoch": 0.1357136813605212, + "grad_norm": 0.29150646924972534, + "learning_rate": 0.001, + "loss": 2.2161, + "step": 3208 + }, + { + "epoch": 0.13575598612403755, + "grad_norm": 0.2513675093650818, + "learning_rate": 0.001, + "loss": 2.0227, + "step": 3209 + }, + { + "epoch": 0.13579829088755394, + "grad_norm": 0.274081289768219, + "learning_rate": 0.001, + "loss": 3.2165, + "step": 3210 + }, + { + "epoch": 0.13584059565107032, + "grad_norm": 0.7095363736152649, + "learning_rate": 0.001, + "loss": 2.1727, + "step": 3211 + }, + { + "epoch": 0.13588290041458667, + "grad_norm": 0.24826420843601227, + "learning_rate": 0.001, + "loss": 2.9115, + "step": 3212 + }, + { + "epoch": 0.13592520517810305, + "grad_norm": 0.2485196441411972, + "learning_rate": 0.001, + "loss": 2.705, + "step": 3213 + }, + { + "epoch": 0.13596750994161944, + "grad_norm": 1.0916763544082642, + "learning_rate": 0.001, + "loss": 2.5535, + "step": 3214 + }, + { + "epoch": 0.1360098147051358, + "grad_norm": 0.29727303981781006, + "learning_rate": 0.001, + "loss": 3.2918, + "step": 3215 + }, + { + "epoch": 0.13605211946865217, + "grad_norm": 0.24911724030971527, + "learning_rate": 0.001, + "loss": 2.0456, + "step": 3216 + }, + { + "epoch": 0.13609442423216855, + "grad_norm": 0.2424779087305069, + "learning_rate": 0.001, + "loss": 2.466, + "step": 3217 + }, + { + "epoch": 0.1361367289956849, + "grad_norm": 0.23129363358020782, + "learning_rate": 0.001, + "loss": 2.5406, + "step": 3218 + }, + { + "epoch": 0.1361790337592013, + "grad_norm": 0.2544875741004944, + "learning_rate": 0.001, + "loss": 2.7152, + "step": 3219 + }, + { + "epoch": 0.13622133852271764, + "grad_norm": 0.28286901116371155, + "learning_rate": 0.001, + "loss": 2.7993, + "step": 3220 + }, + { + "epoch": 0.13626364328623403, + "grad_norm": 0.3165169060230255, + "learning_rate": 0.001, + "loss": 1.8458, + "step": 3221 + }, + { + "epoch": 0.1363059480497504, + "grad_norm": 0.2771192491054535, + "learning_rate": 0.001, + "loss": 2.2885, + "step": 3222 + }, + { + "epoch": 0.13634825281326676, + "grad_norm": 0.2702016830444336, + "learning_rate": 0.001, + "loss": 2.1035, + "step": 3223 + }, + { + "epoch": 0.13639055757678314, + "grad_norm": 0.20894509553909302, + "learning_rate": 0.001, + "loss": 2.4901, + "step": 3224 + }, + { + "epoch": 0.13643286234029953, + "grad_norm": 0.4794827997684479, + "learning_rate": 0.001, + "loss": 2.3976, + "step": 3225 + }, + { + "epoch": 0.13647516710381588, + "grad_norm": 0.3762035667896271, + "learning_rate": 0.001, + "loss": 2.3773, + "step": 3226 + }, + { + "epoch": 0.13651747186733226, + "grad_norm": 0.9282992482185364, + "learning_rate": 0.001, + "loss": 2.3559, + "step": 3227 + }, + { + "epoch": 0.13655977663084864, + "grad_norm": 0.24388918280601501, + "learning_rate": 0.001, + "loss": 2.6357, + "step": 3228 + }, + { + "epoch": 0.136602081394365, + "grad_norm": 0.22450682520866394, + "learning_rate": 0.001, + "loss": 3.0694, + "step": 3229 + }, + { + "epoch": 0.13664438615788138, + "grad_norm": 0.22949005663394928, + "learning_rate": 0.001, + "loss": 2.4709, + "step": 3230 + }, + { + "epoch": 0.13668669092139776, + "grad_norm": 0.2965823709964752, + "learning_rate": 0.001, + "loss": 2.0617, + "step": 3231 + }, + { + "epoch": 0.13672899568491412, + "grad_norm": 0.6691174507141113, + "learning_rate": 0.001, + "loss": 2.4596, + "step": 3232 + }, + { + "epoch": 0.1367713004484305, + "grad_norm": 0.21386466920375824, + "learning_rate": 0.001, + "loss": 2.3095, + "step": 3233 + }, + { + "epoch": 0.13681360521194685, + "grad_norm": 0.3088574707508087, + "learning_rate": 0.001, + "loss": 2.1195, + "step": 3234 + }, + { + "epoch": 0.13685590997546324, + "grad_norm": 0.23426178097724915, + "learning_rate": 0.001, + "loss": 2.036, + "step": 3235 + }, + { + "epoch": 0.13689821473897962, + "grad_norm": 0.2373049259185791, + "learning_rate": 0.001, + "loss": 2.3527, + "step": 3236 + }, + { + "epoch": 0.13694051950249597, + "grad_norm": 0.253162682056427, + "learning_rate": 0.001, + "loss": 1.8557, + "step": 3237 + }, + { + "epoch": 0.13698282426601235, + "grad_norm": 0.23162192106246948, + "learning_rate": 0.001, + "loss": 2.3313, + "step": 3238 + }, + { + "epoch": 0.13702512902952874, + "grad_norm": 0.20582619309425354, + "learning_rate": 0.001, + "loss": 2.1614, + "step": 3239 + }, + { + "epoch": 0.1370674337930451, + "grad_norm": 0.23828250169754028, + "learning_rate": 0.001, + "loss": 2.4971, + "step": 3240 + }, + { + "epoch": 0.13710973855656147, + "grad_norm": 0.2543167471885681, + "learning_rate": 0.001, + "loss": 2.4651, + "step": 3241 + }, + { + "epoch": 0.13715204332007785, + "grad_norm": 1.886708378791809, + "learning_rate": 0.001, + "loss": 2.7207, + "step": 3242 + }, + { + "epoch": 0.1371943480835942, + "grad_norm": 0.2185901701450348, + "learning_rate": 0.001, + "loss": 1.9575, + "step": 3243 + }, + { + "epoch": 0.1372366528471106, + "grad_norm": 0.25897449254989624, + "learning_rate": 0.001, + "loss": 3.6101, + "step": 3244 + }, + { + "epoch": 0.13727895761062694, + "grad_norm": 0.22902828454971313, + "learning_rate": 0.001, + "loss": 2.5504, + "step": 3245 + }, + { + "epoch": 0.13732126237414333, + "grad_norm": 0.9940201044082642, + "learning_rate": 0.001, + "loss": 2.4984, + "step": 3246 + }, + { + "epoch": 0.1373635671376597, + "grad_norm": 0.5756583213806152, + "learning_rate": 0.001, + "loss": 2.0633, + "step": 3247 + }, + { + "epoch": 0.13740587190117606, + "grad_norm": 0.24829471111297607, + "learning_rate": 0.001, + "loss": 2.2558, + "step": 3248 + }, + { + "epoch": 0.13744817666469245, + "grad_norm": 0.2848198413848877, + "learning_rate": 0.001, + "loss": 2.2559, + "step": 3249 + }, + { + "epoch": 0.13749048142820883, + "grad_norm": 0.2882290184497833, + "learning_rate": 0.001, + "loss": 2.2497, + "step": 3250 + }, + { + "epoch": 0.13753278619172518, + "grad_norm": 0.3670541048049927, + "learning_rate": 0.001, + "loss": 2.8339, + "step": 3251 + }, + { + "epoch": 0.13757509095524156, + "grad_norm": 0.2615547180175781, + "learning_rate": 0.001, + "loss": 2.7191, + "step": 3252 + }, + { + "epoch": 0.13761739571875795, + "grad_norm": 0.2824948728084564, + "learning_rate": 0.001, + "loss": 2.4394, + "step": 3253 + }, + { + "epoch": 0.1376597004822743, + "grad_norm": 0.2855933904647827, + "learning_rate": 0.001, + "loss": 1.7882, + "step": 3254 + }, + { + "epoch": 0.13770200524579068, + "grad_norm": 0.22275911271572113, + "learning_rate": 0.001, + "loss": 2.8593, + "step": 3255 + }, + { + "epoch": 0.13774431000930704, + "grad_norm": 0.29241636395454407, + "learning_rate": 0.001, + "loss": 2.4057, + "step": 3256 + }, + { + "epoch": 0.13778661477282342, + "grad_norm": 0.2222166359424591, + "learning_rate": 0.001, + "loss": 2.4944, + "step": 3257 + }, + { + "epoch": 0.1378289195363398, + "grad_norm": 0.23610618710517883, + "learning_rate": 0.001, + "loss": 2.5925, + "step": 3258 + }, + { + "epoch": 0.13787122429985615, + "grad_norm": 0.25960981845855713, + "learning_rate": 0.001, + "loss": 2.085, + "step": 3259 + }, + { + "epoch": 0.13791352906337254, + "grad_norm": 0.23140114545822144, + "learning_rate": 0.001, + "loss": 2.1499, + "step": 3260 + }, + { + "epoch": 0.13795583382688892, + "grad_norm": 1.9852933883666992, + "learning_rate": 0.001, + "loss": 2.7648, + "step": 3261 + }, + { + "epoch": 0.13799813859040527, + "grad_norm": 0.27138063311576843, + "learning_rate": 0.001, + "loss": 2.131, + "step": 3262 + }, + { + "epoch": 0.13804044335392165, + "grad_norm": 0.23058192431926727, + "learning_rate": 0.001, + "loss": 1.905, + "step": 3263 + }, + { + "epoch": 0.13808274811743804, + "grad_norm": 0.360061377286911, + "learning_rate": 0.001, + "loss": 1.958, + "step": 3264 + }, + { + "epoch": 0.1381250528809544, + "grad_norm": 0.2711131274700165, + "learning_rate": 0.001, + "loss": 2.4258, + "step": 3265 + }, + { + "epoch": 0.13816735764447077, + "grad_norm": 0.30782273411750793, + "learning_rate": 0.001, + "loss": 3.0677, + "step": 3266 + }, + { + "epoch": 0.13820966240798713, + "grad_norm": 0.2656923234462738, + "learning_rate": 0.001, + "loss": 1.8376, + "step": 3267 + }, + { + "epoch": 0.1382519671715035, + "grad_norm": 0.24168632924556732, + "learning_rate": 0.001, + "loss": 1.9629, + "step": 3268 + }, + { + "epoch": 0.1382942719350199, + "grad_norm": 0.34883561730384827, + "learning_rate": 0.001, + "loss": 3.1204, + "step": 3269 + }, + { + "epoch": 0.13833657669853625, + "grad_norm": 0.25927019119262695, + "learning_rate": 0.001, + "loss": 2.9234, + "step": 3270 + }, + { + "epoch": 0.13837888146205263, + "grad_norm": 0.38157638907432556, + "learning_rate": 0.001, + "loss": 2.7691, + "step": 3271 + }, + { + "epoch": 0.138421186225569, + "grad_norm": 0.4185699224472046, + "learning_rate": 0.001, + "loss": 2.306, + "step": 3272 + }, + { + "epoch": 0.13846349098908536, + "grad_norm": 0.22628718614578247, + "learning_rate": 0.001, + "loss": 2.0184, + "step": 3273 + }, + { + "epoch": 0.13850579575260175, + "grad_norm": 0.25233402848243713, + "learning_rate": 0.001, + "loss": 2.845, + "step": 3274 + }, + { + "epoch": 0.13854810051611813, + "grad_norm": 0.26197144389152527, + "learning_rate": 0.001, + "loss": 1.7947, + "step": 3275 + }, + { + "epoch": 0.13859040527963448, + "grad_norm": 0.38328817486763, + "learning_rate": 0.001, + "loss": 2.4887, + "step": 3276 + }, + { + "epoch": 0.13863271004315086, + "grad_norm": 0.2760562300682068, + "learning_rate": 0.001, + "loss": 2.0466, + "step": 3277 + }, + { + "epoch": 0.13867501480666722, + "grad_norm": 0.2207472026348114, + "learning_rate": 0.001, + "loss": 2.2844, + "step": 3278 + }, + { + "epoch": 0.1387173195701836, + "grad_norm": 0.24221129715442657, + "learning_rate": 0.001, + "loss": 2.7634, + "step": 3279 + }, + { + "epoch": 0.13875962433369998, + "grad_norm": 0.23639145493507385, + "learning_rate": 0.001, + "loss": 2.3057, + "step": 3280 + }, + { + "epoch": 0.13880192909721634, + "grad_norm": 0.6643745303153992, + "learning_rate": 0.001, + "loss": 2.1484, + "step": 3281 + }, + { + "epoch": 0.13884423386073272, + "grad_norm": 0.29646036028862, + "learning_rate": 0.001, + "loss": 1.7328, + "step": 3282 + }, + { + "epoch": 0.1388865386242491, + "grad_norm": 0.7454001307487488, + "learning_rate": 0.001, + "loss": 2.0942, + "step": 3283 + }, + { + "epoch": 0.13892884338776545, + "grad_norm": 1.1968051195144653, + "learning_rate": 0.001, + "loss": 2.9441, + "step": 3284 + }, + { + "epoch": 0.13897114815128184, + "grad_norm": 0.23698095977306366, + "learning_rate": 0.001, + "loss": 2.6655, + "step": 3285 + }, + { + "epoch": 0.13901345291479822, + "grad_norm": 0.6837499141693115, + "learning_rate": 0.001, + "loss": 2.2513, + "step": 3286 + }, + { + "epoch": 0.13905575767831457, + "grad_norm": 0.48883894085884094, + "learning_rate": 0.001, + "loss": 2.6241, + "step": 3287 + }, + { + "epoch": 0.13909806244183096, + "grad_norm": 0.5206930637359619, + "learning_rate": 0.001, + "loss": 2.2263, + "step": 3288 + }, + { + "epoch": 0.1391403672053473, + "grad_norm": 0.40440839529037476, + "learning_rate": 0.001, + "loss": 3.109, + "step": 3289 + }, + { + "epoch": 0.1391826719688637, + "grad_norm": 1.5446665287017822, + "learning_rate": 0.001, + "loss": 2.3798, + "step": 3290 + }, + { + "epoch": 0.13922497673238007, + "grad_norm": 0.6700052618980408, + "learning_rate": 0.001, + "loss": 1.8315, + "step": 3291 + }, + { + "epoch": 0.13926728149589643, + "grad_norm": 0.5063325762748718, + "learning_rate": 0.001, + "loss": 2.6604, + "step": 3292 + }, + { + "epoch": 0.1393095862594128, + "grad_norm": 0.38244378566741943, + "learning_rate": 0.001, + "loss": 2.4225, + "step": 3293 + }, + { + "epoch": 0.1393518910229292, + "grad_norm": 0.3094440996646881, + "learning_rate": 0.001, + "loss": 2.6557, + "step": 3294 + }, + { + "epoch": 0.13939419578644555, + "grad_norm": 0.2953416109085083, + "learning_rate": 0.001, + "loss": 2.1801, + "step": 3295 + }, + { + "epoch": 0.13943650054996193, + "grad_norm": 0.2831237018108368, + "learning_rate": 0.001, + "loss": 2.5021, + "step": 3296 + }, + { + "epoch": 0.1394788053134783, + "grad_norm": 14.991360664367676, + "learning_rate": 0.001, + "loss": 3.954, + "step": 3297 + }, + { + "epoch": 0.13952111007699466, + "grad_norm": 0.26766881346702576, + "learning_rate": 0.001, + "loss": 2.3572, + "step": 3298 + }, + { + "epoch": 0.13956341484051105, + "grad_norm": 0.4463373124599457, + "learning_rate": 0.001, + "loss": 2.2735, + "step": 3299 + }, + { + "epoch": 0.1396057196040274, + "grad_norm": 0.30204683542251587, + "learning_rate": 0.001, + "loss": 2.7257, + "step": 3300 + }, + { + "epoch": 0.13964802436754378, + "grad_norm": 0.34700825810432434, + "learning_rate": 0.001, + "loss": 3.2625, + "step": 3301 + }, + { + "epoch": 0.13969032913106016, + "grad_norm": 0.348042756319046, + "learning_rate": 0.001, + "loss": 2.5979, + "step": 3302 + }, + { + "epoch": 0.13973263389457652, + "grad_norm": 0.38788822293281555, + "learning_rate": 0.001, + "loss": 2.5037, + "step": 3303 + }, + { + "epoch": 0.1397749386580929, + "grad_norm": 0.24049419164657593, + "learning_rate": 0.001, + "loss": 2.2218, + "step": 3304 + }, + { + "epoch": 0.13981724342160928, + "grad_norm": 0.2679419219493866, + "learning_rate": 0.001, + "loss": 2.8018, + "step": 3305 + }, + { + "epoch": 0.13985954818512564, + "grad_norm": 0.2681131064891815, + "learning_rate": 0.001, + "loss": 3.2434, + "step": 3306 + }, + { + "epoch": 0.13990185294864202, + "grad_norm": 0.25279781222343445, + "learning_rate": 0.001, + "loss": 2.6153, + "step": 3307 + }, + { + "epoch": 0.1399441577121584, + "grad_norm": 0.23080165684223175, + "learning_rate": 0.001, + "loss": 2.5528, + "step": 3308 + }, + { + "epoch": 0.13998646247567476, + "grad_norm": 0.21056526899337769, + "learning_rate": 0.001, + "loss": 2.2255, + "step": 3309 + }, + { + "epoch": 0.14002876723919114, + "grad_norm": 1.0948318243026733, + "learning_rate": 0.001, + "loss": 1.8427, + "step": 3310 + }, + { + "epoch": 0.1400710720027075, + "grad_norm": 0.27404388785362244, + "learning_rate": 0.001, + "loss": 1.971, + "step": 3311 + }, + { + "epoch": 0.14011337676622387, + "grad_norm": 0.23093107342720032, + "learning_rate": 0.001, + "loss": 2.145, + "step": 3312 + }, + { + "epoch": 0.14015568152974026, + "grad_norm": 0.8817729949951172, + "learning_rate": 0.001, + "loss": 2.4664, + "step": 3313 + }, + { + "epoch": 0.1401979862932566, + "grad_norm": 0.3034093677997589, + "learning_rate": 0.001, + "loss": 2.2661, + "step": 3314 + }, + { + "epoch": 0.140240291056773, + "grad_norm": 0.2352837473154068, + "learning_rate": 0.001, + "loss": 1.6646, + "step": 3315 + }, + { + "epoch": 0.14028259582028937, + "grad_norm": 0.2697775661945343, + "learning_rate": 0.001, + "loss": 2.322, + "step": 3316 + }, + { + "epoch": 0.14032490058380573, + "grad_norm": 0.889599621295929, + "learning_rate": 0.001, + "loss": 2.4923, + "step": 3317 + }, + { + "epoch": 0.1403672053473221, + "grad_norm": 0.3509933352470398, + "learning_rate": 0.001, + "loss": 2.4785, + "step": 3318 + }, + { + "epoch": 0.1404095101108385, + "grad_norm": 0.2816075086593628, + "learning_rate": 0.001, + "loss": 2.1828, + "step": 3319 + }, + { + "epoch": 0.14045181487435485, + "grad_norm": 0.4183788299560547, + "learning_rate": 0.001, + "loss": 2.2104, + "step": 3320 + }, + { + "epoch": 0.14049411963787123, + "grad_norm": 31.530492782592773, + "learning_rate": 0.001, + "loss": 2.0174, + "step": 3321 + }, + { + "epoch": 0.14053642440138758, + "grad_norm": 0.3534470796585083, + "learning_rate": 0.001, + "loss": 2.234, + "step": 3322 + }, + { + "epoch": 0.14057872916490396, + "grad_norm": 0.3242388367652893, + "learning_rate": 0.001, + "loss": 2.3006, + "step": 3323 + }, + { + "epoch": 0.14062103392842035, + "grad_norm": 1.5292270183563232, + "learning_rate": 0.001, + "loss": 2.3741, + "step": 3324 + }, + { + "epoch": 0.1406633386919367, + "grad_norm": 0.5664166212081909, + "learning_rate": 0.001, + "loss": 2.2073, + "step": 3325 + }, + { + "epoch": 0.14070564345545308, + "grad_norm": 0.4291297495365143, + "learning_rate": 0.001, + "loss": 2.9416, + "step": 3326 + }, + { + "epoch": 0.14074794821896947, + "grad_norm": 0.3085836172103882, + "learning_rate": 0.001, + "loss": 3.2857, + "step": 3327 + }, + { + "epoch": 0.14079025298248582, + "grad_norm": 0.3337254524230957, + "learning_rate": 0.001, + "loss": 2.8056, + "step": 3328 + }, + { + "epoch": 0.1408325577460022, + "grad_norm": 0.267048180103302, + "learning_rate": 0.001, + "loss": 2.1272, + "step": 3329 + }, + { + "epoch": 0.14087486250951858, + "grad_norm": 0.8167191743850708, + "learning_rate": 0.001, + "loss": 2.2273, + "step": 3330 + }, + { + "epoch": 0.14091716727303494, + "grad_norm": 0.426624596118927, + "learning_rate": 0.001, + "loss": 3.7321, + "step": 3331 + }, + { + "epoch": 0.14095947203655132, + "grad_norm": 0.2855152189731598, + "learning_rate": 0.001, + "loss": 3.1353, + "step": 3332 + }, + { + "epoch": 0.14100177680006767, + "grad_norm": 0.647718071937561, + "learning_rate": 0.001, + "loss": 2.2319, + "step": 3333 + }, + { + "epoch": 0.14104408156358406, + "grad_norm": 0.7934921383857727, + "learning_rate": 0.001, + "loss": 2.3643, + "step": 3334 + }, + { + "epoch": 0.14108638632710044, + "grad_norm": 0.22644270956516266, + "learning_rate": 0.001, + "loss": 2.802, + "step": 3335 + }, + { + "epoch": 0.1411286910906168, + "grad_norm": 0.5594651699066162, + "learning_rate": 0.001, + "loss": 2.4036, + "step": 3336 + }, + { + "epoch": 0.14117099585413317, + "grad_norm": 0.34176865220069885, + "learning_rate": 0.001, + "loss": 3.0433, + "step": 3337 + }, + { + "epoch": 0.14121330061764956, + "grad_norm": 0.9747275114059448, + "learning_rate": 0.001, + "loss": 2.6015, + "step": 3338 + }, + { + "epoch": 0.1412556053811659, + "grad_norm": 0.4771696925163269, + "learning_rate": 0.001, + "loss": 2.0621, + "step": 3339 + }, + { + "epoch": 0.1412979101446823, + "grad_norm": 0.2995487451553345, + "learning_rate": 0.001, + "loss": 2.6303, + "step": 3340 + }, + { + "epoch": 0.14134021490819867, + "grad_norm": 0.3293488919734955, + "learning_rate": 0.001, + "loss": 3.3868, + "step": 3341 + }, + { + "epoch": 0.14138251967171503, + "grad_norm": 0.2893940508365631, + "learning_rate": 0.001, + "loss": 3.0442, + "step": 3342 + }, + { + "epoch": 0.1414248244352314, + "grad_norm": 0.31001016497612, + "learning_rate": 0.001, + "loss": 2.3384, + "step": 3343 + }, + { + "epoch": 0.14146712919874777, + "grad_norm": 0.31601953506469727, + "learning_rate": 0.001, + "loss": 2.9162, + "step": 3344 + }, + { + "epoch": 0.14150943396226415, + "grad_norm": 0.27529218792915344, + "learning_rate": 0.001, + "loss": 2.4267, + "step": 3345 + }, + { + "epoch": 0.14155173872578053, + "grad_norm": 0.6147814393043518, + "learning_rate": 0.001, + "loss": 1.7324, + "step": 3346 + }, + { + "epoch": 0.14159404348929688, + "grad_norm": 0.25061115622520447, + "learning_rate": 0.001, + "loss": 2.4223, + "step": 3347 + }, + { + "epoch": 0.14163634825281327, + "grad_norm": 7.874605178833008, + "learning_rate": 0.001, + "loss": 2.3057, + "step": 3348 + }, + { + "epoch": 0.14167865301632965, + "grad_norm": 0.23804105818271637, + "learning_rate": 0.001, + "loss": 1.9634, + "step": 3349 + }, + { + "epoch": 0.141720957779846, + "grad_norm": 1.9372200965881348, + "learning_rate": 0.001, + "loss": 2.3494, + "step": 3350 + }, + { + "epoch": 0.14176326254336238, + "grad_norm": 0.3472435772418976, + "learning_rate": 0.001, + "loss": 2.9078, + "step": 3351 + }, + { + "epoch": 0.14180556730687877, + "grad_norm": 0.35763394832611084, + "learning_rate": 0.001, + "loss": 3.6208, + "step": 3352 + }, + { + "epoch": 0.14184787207039512, + "grad_norm": 0.3070499300956726, + "learning_rate": 0.001, + "loss": 1.5918, + "step": 3353 + }, + { + "epoch": 0.1418901768339115, + "grad_norm": 0.4274042248725891, + "learning_rate": 0.001, + "loss": 2.3483, + "step": 3354 + }, + { + "epoch": 0.14193248159742788, + "grad_norm": 0.2639428675174713, + "learning_rate": 0.001, + "loss": 2.1166, + "step": 3355 + }, + { + "epoch": 0.14197478636094424, + "grad_norm": 0.23096594214439392, + "learning_rate": 0.001, + "loss": 1.9444, + "step": 3356 + }, + { + "epoch": 0.14201709112446062, + "grad_norm": 0.2586401402950287, + "learning_rate": 0.001, + "loss": 2.3109, + "step": 3357 + }, + { + "epoch": 0.14205939588797697, + "grad_norm": 0.26827335357666016, + "learning_rate": 0.001, + "loss": 2.2147, + "step": 3358 + }, + { + "epoch": 0.14210170065149336, + "grad_norm": 0.26217904686927795, + "learning_rate": 0.001, + "loss": 2.1678, + "step": 3359 + }, + { + "epoch": 0.14214400541500974, + "grad_norm": 0.45599186420440674, + "learning_rate": 0.001, + "loss": 2.3742, + "step": 3360 + }, + { + "epoch": 0.1421863101785261, + "grad_norm": 0.24741707742214203, + "learning_rate": 0.001, + "loss": 1.727, + "step": 3361 + }, + { + "epoch": 0.14222861494204248, + "grad_norm": 0.300525963306427, + "learning_rate": 0.001, + "loss": 2.6092, + "step": 3362 + }, + { + "epoch": 0.14227091970555886, + "grad_norm": 0.27571022510528564, + "learning_rate": 0.001, + "loss": 2.3276, + "step": 3363 + }, + { + "epoch": 0.1423132244690752, + "grad_norm": 0.42578956484794617, + "learning_rate": 0.001, + "loss": 1.5469, + "step": 3364 + }, + { + "epoch": 0.1423555292325916, + "grad_norm": 0.2815234661102295, + "learning_rate": 0.001, + "loss": 3.2236, + "step": 3365 + }, + { + "epoch": 0.14239783399610798, + "grad_norm": 3.376448392868042, + "learning_rate": 0.001, + "loss": 2.2013, + "step": 3366 + }, + { + "epoch": 0.14244013875962433, + "grad_norm": 1.9560842514038086, + "learning_rate": 0.001, + "loss": 2.0186, + "step": 3367 + }, + { + "epoch": 0.1424824435231407, + "grad_norm": 0.23897530138492584, + "learning_rate": 0.001, + "loss": 2.2577, + "step": 3368 + }, + { + "epoch": 0.14252474828665707, + "grad_norm": 0.25748342275619507, + "learning_rate": 0.001, + "loss": 2.3233, + "step": 3369 + }, + { + "epoch": 0.14256705305017345, + "grad_norm": 0.2940260171890259, + "learning_rate": 0.001, + "loss": 1.8611, + "step": 3370 + }, + { + "epoch": 0.14260935781368983, + "grad_norm": 0.5368818640708923, + "learning_rate": 0.001, + "loss": 5.5408, + "step": 3371 + }, + { + "epoch": 0.14265166257720618, + "grad_norm": 0.2916557788848877, + "learning_rate": 0.001, + "loss": 2.0839, + "step": 3372 + }, + { + "epoch": 0.14269396734072257, + "grad_norm": 0.21450082957744598, + "learning_rate": 0.001, + "loss": 3.1867, + "step": 3373 + }, + { + "epoch": 0.14273627210423895, + "grad_norm": 0.43757933378219604, + "learning_rate": 0.001, + "loss": 3.901, + "step": 3374 + }, + { + "epoch": 0.1427785768677553, + "grad_norm": 0.2997872829437256, + "learning_rate": 0.001, + "loss": 2.2917, + "step": 3375 + }, + { + "epoch": 0.14282088163127168, + "grad_norm": 0.26807701587677, + "learning_rate": 0.001, + "loss": 1.9812, + "step": 3376 + }, + { + "epoch": 0.14286318639478807, + "grad_norm": 0.7264923453330994, + "learning_rate": 0.001, + "loss": 1.7461, + "step": 3377 + }, + { + "epoch": 0.14290549115830442, + "grad_norm": 0.23869849741458893, + "learning_rate": 0.001, + "loss": 2.7253, + "step": 3378 + }, + { + "epoch": 0.1429477959218208, + "grad_norm": 2.684951066970825, + "learning_rate": 0.001, + "loss": 1.8465, + "step": 3379 + }, + { + "epoch": 0.14299010068533716, + "grad_norm": 0.33340945839881897, + "learning_rate": 0.001, + "loss": 2.7263, + "step": 3380 + }, + { + "epoch": 0.14303240544885354, + "grad_norm": 0.8364661335945129, + "learning_rate": 0.001, + "loss": 3.4191, + "step": 3381 + }, + { + "epoch": 0.14307471021236992, + "grad_norm": 0.2745528221130371, + "learning_rate": 0.001, + "loss": 2.9753, + "step": 3382 + }, + { + "epoch": 0.14311701497588628, + "grad_norm": 0.5543707609176636, + "learning_rate": 0.001, + "loss": 2.9777, + "step": 3383 + }, + { + "epoch": 0.14315931973940266, + "grad_norm": 0.31856632232666016, + "learning_rate": 0.001, + "loss": 3.4294, + "step": 3384 + }, + { + "epoch": 0.14320162450291904, + "grad_norm": 0.2563943862915039, + "learning_rate": 0.001, + "loss": 2.3104, + "step": 3385 + }, + { + "epoch": 0.1432439292664354, + "grad_norm": 1.2910809516906738, + "learning_rate": 0.001, + "loss": 2.1959, + "step": 3386 + }, + { + "epoch": 0.14328623402995178, + "grad_norm": 0.261447548866272, + "learning_rate": 0.001, + "loss": 1.9312, + "step": 3387 + }, + { + "epoch": 0.14332853879346816, + "grad_norm": 1.013630747795105, + "learning_rate": 0.001, + "loss": 1.6115, + "step": 3388 + }, + { + "epoch": 0.1433708435569845, + "grad_norm": 2.35341477394104, + "learning_rate": 0.001, + "loss": 2.19, + "step": 3389 + }, + { + "epoch": 0.1434131483205009, + "grad_norm": 0.24885472655296326, + "learning_rate": 0.001, + "loss": 2.1222, + "step": 3390 + }, + { + "epoch": 0.14345545308401725, + "grad_norm": 0.45557206869125366, + "learning_rate": 0.001, + "loss": 2.2925, + "step": 3391 + }, + { + "epoch": 0.14349775784753363, + "grad_norm": 0.7096180319786072, + "learning_rate": 0.001, + "loss": 1.7682, + "step": 3392 + }, + { + "epoch": 0.14354006261105, + "grad_norm": 0.6223247051239014, + "learning_rate": 0.001, + "loss": 2.476, + "step": 3393 + }, + { + "epoch": 0.14358236737456637, + "grad_norm": 0.3025702238082886, + "learning_rate": 0.001, + "loss": 2.1303, + "step": 3394 + }, + { + "epoch": 0.14362467213808275, + "grad_norm": 0.6526556015014648, + "learning_rate": 0.001, + "loss": 2.4046, + "step": 3395 + }, + { + "epoch": 0.14366697690159913, + "grad_norm": 0.40125930309295654, + "learning_rate": 0.001, + "loss": 3.7139, + "step": 3396 + }, + { + "epoch": 0.14370928166511548, + "grad_norm": 0.26466187834739685, + "learning_rate": 0.001, + "loss": 2.4826, + "step": 3397 + }, + { + "epoch": 0.14375158642863187, + "grad_norm": 0.26691725850105286, + "learning_rate": 0.001, + "loss": 1.7453, + "step": 3398 + }, + { + "epoch": 0.14379389119214825, + "grad_norm": 0.2618395686149597, + "learning_rate": 0.001, + "loss": 2.1459, + "step": 3399 + }, + { + "epoch": 0.1438361959556646, + "grad_norm": 0.7489890456199646, + "learning_rate": 0.001, + "loss": 2.8917, + "step": 3400 + }, + { + "epoch": 0.14387850071918099, + "grad_norm": 0.24983298778533936, + "learning_rate": 0.001, + "loss": 1.8924, + "step": 3401 + }, + { + "epoch": 0.14392080548269734, + "grad_norm": 0.210697740316391, + "learning_rate": 0.001, + "loss": 2.1125, + "step": 3402 + }, + { + "epoch": 0.14396311024621372, + "grad_norm": 0.25680264830589294, + "learning_rate": 0.001, + "loss": 2.1817, + "step": 3403 + }, + { + "epoch": 0.1440054150097301, + "grad_norm": 0.2333562821149826, + "learning_rate": 0.001, + "loss": 2.1268, + "step": 3404 + }, + { + "epoch": 0.14404771977324646, + "grad_norm": 0.26086026430130005, + "learning_rate": 0.001, + "loss": 2.647, + "step": 3405 + }, + { + "epoch": 0.14409002453676284, + "grad_norm": 0.24561172723770142, + "learning_rate": 0.001, + "loss": 1.7931, + "step": 3406 + }, + { + "epoch": 0.14413232930027922, + "grad_norm": 0.2474185675382614, + "learning_rate": 0.001, + "loss": 2.4043, + "step": 3407 + }, + { + "epoch": 0.14417463406379558, + "grad_norm": 0.2294658124446869, + "learning_rate": 0.001, + "loss": 2.243, + "step": 3408 + }, + { + "epoch": 0.14421693882731196, + "grad_norm": 0.2385028749704361, + "learning_rate": 0.001, + "loss": 2.5765, + "step": 3409 + }, + { + "epoch": 0.14425924359082834, + "grad_norm": 0.2432500123977661, + "learning_rate": 0.001, + "loss": 1.8027, + "step": 3410 + }, + { + "epoch": 0.1443015483543447, + "grad_norm": 0.2768508791923523, + "learning_rate": 0.001, + "loss": 2.4532, + "step": 3411 + }, + { + "epoch": 0.14434385311786108, + "grad_norm": 0.892453134059906, + "learning_rate": 0.001, + "loss": 2.4542, + "step": 3412 + }, + { + "epoch": 0.14438615788137743, + "grad_norm": 0.2457326054573059, + "learning_rate": 0.001, + "loss": 2.1699, + "step": 3413 + }, + { + "epoch": 0.1444284626448938, + "grad_norm": 0.26426583528518677, + "learning_rate": 0.001, + "loss": 2.1309, + "step": 3414 + }, + { + "epoch": 0.1444707674084102, + "grad_norm": 0.2871556282043457, + "learning_rate": 0.001, + "loss": 3.3215, + "step": 3415 + }, + { + "epoch": 0.14451307217192655, + "grad_norm": 0.2520632743835449, + "learning_rate": 0.001, + "loss": 1.9709, + "step": 3416 + }, + { + "epoch": 0.14455537693544293, + "grad_norm": 0.38204339146614075, + "learning_rate": 0.001, + "loss": 3.6021, + "step": 3417 + }, + { + "epoch": 0.1445976816989593, + "grad_norm": 0.26071059703826904, + "learning_rate": 0.001, + "loss": 2.2053, + "step": 3418 + }, + { + "epoch": 0.14463998646247567, + "grad_norm": 0.2733023762702942, + "learning_rate": 0.001, + "loss": 3.47, + "step": 3419 + }, + { + "epoch": 0.14468229122599205, + "grad_norm": 1.002029538154602, + "learning_rate": 0.001, + "loss": 3.2734, + "step": 3420 + }, + { + "epoch": 0.14472459598950843, + "grad_norm": 0.3164158761501312, + "learning_rate": 0.001, + "loss": 2.763, + "step": 3421 + }, + { + "epoch": 0.14476690075302479, + "grad_norm": 0.2513934075832367, + "learning_rate": 0.001, + "loss": 2.0143, + "step": 3422 + }, + { + "epoch": 0.14480920551654117, + "grad_norm": 0.24494628608226776, + "learning_rate": 0.001, + "loss": 2.298, + "step": 3423 + }, + { + "epoch": 0.14485151028005752, + "grad_norm": 0.27729180455207825, + "learning_rate": 0.001, + "loss": 2.6749, + "step": 3424 + }, + { + "epoch": 0.1448938150435739, + "grad_norm": 0.26897814869880676, + "learning_rate": 0.001, + "loss": 3.574, + "step": 3425 + }, + { + "epoch": 0.14493611980709029, + "grad_norm": 0.22974863648414612, + "learning_rate": 0.001, + "loss": 3.4897, + "step": 3426 + }, + { + "epoch": 0.14497842457060664, + "grad_norm": 0.22176092863082886, + "learning_rate": 0.001, + "loss": 1.9926, + "step": 3427 + }, + { + "epoch": 0.14502072933412302, + "grad_norm": 0.38495469093322754, + "learning_rate": 0.001, + "loss": 2.5689, + "step": 3428 + }, + { + "epoch": 0.1450630340976394, + "grad_norm": 0.22981159389019012, + "learning_rate": 0.001, + "loss": 2.2184, + "step": 3429 + }, + { + "epoch": 0.14510533886115576, + "grad_norm": 0.2774735987186432, + "learning_rate": 0.001, + "loss": 3.0189, + "step": 3430 + }, + { + "epoch": 0.14514764362467214, + "grad_norm": 0.2698344886302948, + "learning_rate": 0.001, + "loss": 2.4688, + "step": 3431 + }, + { + "epoch": 0.14518994838818852, + "grad_norm": 0.6816653609275818, + "learning_rate": 0.001, + "loss": 2.0268, + "step": 3432 + }, + { + "epoch": 0.14523225315170488, + "grad_norm": 0.29013943672180176, + "learning_rate": 0.001, + "loss": 3.2067, + "step": 3433 + }, + { + "epoch": 0.14527455791522126, + "grad_norm": 0.22570838034152985, + "learning_rate": 0.001, + "loss": 3.0135, + "step": 3434 + }, + { + "epoch": 0.1453168626787376, + "grad_norm": 0.21075336635112762, + "learning_rate": 0.001, + "loss": 2.242, + "step": 3435 + }, + { + "epoch": 0.145359167442254, + "grad_norm": 0.2940874993801117, + "learning_rate": 0.001, + "loss": 3.109, + "step": 3436 + }, + { + "epoch": 0.14540147220577038, + "grad_norm": 0.21364165842533112, + "learning_rate": 0.001, + "loss": 1.3964, + "step": 3437 + }, + { + "epoch": 0.14544377696928673, + "grad_norm": 0.2253018319606781, + "learning_rate": 0.001, + "loss": 2.2411, + "step": 3438 + }, + { + "epoch": 0.1454860817328031, + "grad_norm": 0.2131890505552292, + "learning_rate": 0.001, + "loss": 1.7277, + "step": 3439 + }, + { + "epoch": 0.1455283864963195, + "grad_norm": 0.2189098596572876, + "learning_rate": 0.001, + "loss": 1.7839, + "step": 3440 + }, + { + "epoch": 0.14557069125983585, + "grad_norm": 2.004612922668457, + "learning_rate": 0.001, + "loss": 1.8655, + "step": 3441 + }, + { + "epoch": 0.14561299602335223, + "grad_norm": 0.2156573235988617, + "learning_rate": 0.001, + "loss": 2.3993, + "step": 3442 + }, + { + "epoch": 0.1456553007868686, + "grad_norm": 0.30679431557655334, + "learning_rate": 0.001, + "loss": 2.738, + "step": 3443 + }, + { + "epoch": 0.14569760555038497, + "grad_norm": 0.2574104070663452, + "learning_rate": 0.001, + "loss": 2.4518, + "step": 3444 + }, + { + "epoch": 0.14573991031390135, + "grad_norm": 0.2621307075023651, + "learning_rate": 0.001, + "loss": 2.3622, + "step": 3445 + }, + { + "epoch": 0.1457822150774177, + "grad_norm": 0.30141088366508484, + "learning_rate": 0.001, + "loss": 2.494, + "step": 3446 + }, + { + "epoch": 0.14582451984093409, + "grad_norm": 0.19590161740779877, + "learning_rate": 0.001, + "loss": 1.6406, + "step": 3447 + }, + { + "epoch": 0.14586682460445047, + "grad_norm": 0.21325886249542236, + "learning_rate": 0.001, + "loss": 1.6452, + "step": 3448 + }, + { + "epoch": 0.14590912936796682, + "grad_norm": 0.22410674393177032, + "learning_rate": 0.001, + "loss": 2.4961, + "step": 3449 + }, + { + "epoch": 0.1459514341314832, + "grad_norm": 0.2333114743232727, + "learning_rate": 0.001, + "loss": 1.9583, + "step": 3450 + }, + { + "epoch": 0.1459937388949996, + "grad_norm": 0.43659529089927673, + "learning_rate": 0.001, + "loss": 2.4177, + "step": 3451 + }, + { + "epoch": 0.14603604365851594, + "grad_norm": 0.23108899593353271, + "learning_rate": 0.001, + "loss": 1.9033, + "step": 3452 + }, + { + "epoch": 0.14607834842203232, + "grad_norm": 0.2105552852153778, + "learning_rate": 0.001, + "loss": 1.7711, + "step": 3453 + }, + { + "epoch": 0.1461206531855487, + "grad_norm": 0.22220350801944733, + "learning_rate": 0.001, + "loss": 3.0378, + "step": 3454 + }, + { + "epoch": 0.14616295794906506, + "grad_norm": 0.2945927679538727, + "learning_rate": 0.001, + "loss": 2.0337, + "step": 3455 + }, + { + "epoch": 0.14620526271258144, + "grad_norm": 0.26347851753234863, + "learning_rate": 0.001, + "loss": 2.1827, + "step": 3456 + }, + { + "epoch": 0.1462475674760978, + "grad_norm": 1.3712072372436523, + "learning_rate": 0.001, + "loss": 2.5637, + "step": 3457 + }, + { + "epoch": 0.14628987223961418, + "grad_norm": 0.33388158679008484, + "learning_rate": 0.001, + "loss": 1.8558, + "step": 3458 + }, + { + "epoch": 0.14633217700313056, + "grad_norm": 0.23212599754333496, + "learning_rate": 0.001, + "loss": 2.7854, + "step": 3459 + }, + { + "epoch": 0.1463744817666469, + "grad_norm": 0.2477710247039795, + "learning_rate": 0.001, + "loss": 2.2062, + "step": 3460 + }, + { + "epoch": 0.1464167865301633, + "grad_norm": 0.5189692974090576, + "learning_rate": 0.001, + "loss": 2.5085, + "step": 3461 + }, + { + "epoch": 0.14645909129367968, + "grad_norm": 0.39620745182037354, + "learning_rate": 0.001, + "loss": 2.913, + "step": 3462 + }, + { + "epoch": 0.14650139605719603, + "grad_norm": 0.23096932470798492, + "learning_rate": 0.001, + "loss": 2.7137, + "step": 3463 + }, + { + "epoch": 0.1465437008207124, + "grad_norm": 0.2657645642757416, + "learning_rate": 0.001, + "loss": 2.3191, + "step": 3464 + }, + { + "epoch": 0.1465860055842288, + "grad_norm": 0.2581949532032013, + "learning_rate": 0.001, + "loss": 2.4491, + "step": 3465 + }, + { + "epoch": 0.14662831034774515, + "grad_norm": 0.21660561859607697, + "learning_rate": 0.001, + "loss": 2.4903, + "step": 3466 + }, + { + "epoch": 0.14667061511126153, + "grad_norm": 0.2140873819589615, + "learning_rate": 0.001, + "loss": 2.8028, + "step": 3467 + }, + { + "epoch": 0.1467129198747779, + "grad_norm": 0.23401807248592377, + "learning_rate": 0.001, + "loss": 1.9589, + "step": 3468 + }, + { + "epoch": 0.14675522463829427, + "grad_norm": 0.43440452218055725, + "learning_rate": 0.001, + "loss": 2.3189, + "step": 3469 + }, + { + "epoch": 0.14679752940181065, + "grad_norm": 0.2246183156967163, + "learning_rate": 0.001, + "loss": 2.8871, + "step": 3470 + }, + { + "epoch": 0.146839834165327, + "grad_norm": 0.25875502824783325, + "learning_rate": 0.001, + "loss": 2.123, + "step": 3471 + }, + { + "epoch": 0.1468821389288434, + "grad_norm": 0.2841735780239105, + "learning_rate": 0.001, + "loss": 2.4137, + "step": 3472 + }, + { + "epoch": 0.14692444369235977, + "grad_norm": 0.3783358037471771, + "learning_rate": 0.001, + "loss": 2.2655, + "step": 3473 + }, + { + "epoch": 0.14696674845587612, + "grad_norm": 0.37711983919143677, + "learning_rate": 0.001, + "loss": 2.7385, + "step": 3474 + }, + { + "epoch": 0.1470090532193925, + "grad_norm": 0.2421020269393921, + "learning_rate": 0.001, + "loss": 3.2295, + "step": 3475 + }, + { + "epoch": 0.1470513579829089, + "grad_norm": 0.3066774308681488, + "learning_rate": 0.001, + "loss": 2.2816, + "step": 3476 + }, + { + "epoch": 0.14709366274642524, + "grad_norm": 0.29278188943862915, + "learning_rate": 0.001, + "loss": 2.3431, + "step": 3477 + }, + { + "epoch": 0.14713596750994162, + "grad_norm": 0.259764701128006, + "learning_rate": 0.001, + "loss": 2.7806, + "step": 3478 + }, + { + "epoch": 0.147178272273458, + "grad_norm": 0.2844379246234894, + "learning_rate": 0.001, + "loss": 2.9955, + "step": 3479 + }, + { + "epoch": 0.14722057703697436, + "grad_norm": 0.46466735005378723, + "learning_rate": 0.001, + "loss": 2.212, + "step": 3480 + }, + { + "epoch": 0.14726288180049074, + "grad_norm": 0.2156391441822052, + "learning_rate": 0.001, + "loss": 1.6904, + "step": 3481 + }, + { + "epoch": 0.1473051865640071, + "grad_norm": 0.3095730245113373, + "learning_rate": 0.001, + "loss": 2.1532, + "step": 3482 + }, + { + "epoch": 0.14734749132752348, + "grad_norm": 0.23583956062793732, + "learning_rate": 0.001, + "loss": 1.8892, + "step": 3483 + }, + { + "epoch": 0.14738979609103986, + "grad_norm": 0.19967339932918549, + "learning_rate": 0.001, + "loss": 2.1184, + "step": 3484 + }, + { + "epoch": 0.14743210085455621, + "grad_norm": 0.22221580147743225, + "learning_rate": 0.001, + "loss": 2.1233, + "step": 3485 + }, + { + "epoch": 0.1474744056180726, + "grad_norm": 0.21620295941829681, + "learning_rate": 0.001, + "loss": 1.8619, + "step": 3486 + }, + { + "epoch": 0.14751671038158898, + "grad_norm": 0.2938464283943176, + "learning_rate": 0.001, + "loss": 2.0775, + "step": 3487 + }, + { + "epoch": 0.14755901514510533, + "grad_norm": 0.4123593270778656, + "learning_rate": 0.001, + "loss": 2.9428, + "step": 3488 + }, + { + "epoch": 0.14760131990862171, + "grad_norm": 0.3647308647632599, + "learning_rate": 0.001, + "loss": 2.6945, + "step": 3489 + }, + { + "epoch": 0.1476436246721381, + "grad_norm": 0.9250668287277222, + "learning_rate": 0.001, + "loss": 2.283, + "step": 3490 + }, + { + "epoch": 0.14768592943565445, + "grad_norm": 0.19745373725891113, + "learning_rate": 0.001, + "loss": 1.9975, + "step": 3491 + }, + { + "epoch": 0.14772823419917083, + "grad_norm": 0.30542847514152527, + "learning_rate": 0.001, + "loss": 2.3923, + "step": 3492 + }, + { + "epoch": 0.1477705389626872, + "grad_norm": 0.23577959835529327, + "learning_rate": 0.001, + "loss": 1.8087, + "step": 3493 + }, + { + "epoch": 0.14781284372620357, + "grad_norm": 1.1520733833312988, + "learning_rate": 0.001, + "loss": 2.0672, + "step": 3494 + }, + { + "epoch": 0.14785514848971995, + "grad_norm": 0.4492546617984772, + "learning_rate": 0.001, + "loss": 2.4425, + "step": 3495 + }, + { + "epoch": 0.1478974532532363, + "grad_norm": 0.2272578924894333, + "learning_rate": 0.001, + "loss": 2.0611, + "step": 3496 + }, + { + "epoch": 0.1479397580167527, + "grad_norm": 1.9856147766113281, + "learning_rate": 0.001, + "loss": 2.5383, + "step": 3497 + }, + { + "epoch": 0.14798206278026907, + "grad_norm": 0.32524406909942627, + "learning_rate": 0.001, + "loss": 2.6931, + "step": 3498 + }, + { + "epoch": 0.14802436754378542, + "grad_norm": 0.3067854046821594, + "learning_rate": 0.001, + "loss": 2.6123, + "step": 3499 + }, + { + "epoch": 0.1480666723073018, + "grad_norm": 0.2817644476890564, + "learning_rate": 0.001, + "loss": 2.0649, + "step": 3500 + }, + { + "epoch": 0.1481089770708182, + "grad_norm": 1.7400952577590942, + "learning_rate": 0.001, + "loss": 2.2053, + "step": 3501 + }, + { + "epoch": 0.14815128183433454, + "grad_norm": 0.2977108061313629, + "learning_rate": 0.001, + "loss": 2.1427, + "step": 3502 + }, + { + "epoch": 0.14819358659785092, + "grad_norm": 0.3165165185928345, + "learning_rate": 0.001, + "loss": 3.425, + "step": 3503 + }, + { + "epoch": 0.14823589136136728, + "grad_norm": 0.25336670875549316, + "learning_rate": 0.001, + "loss": 1.8021, + "step": 3504 + }, + { + "epoch": 0.14827819612488366, + "grad_norm": 1.0275665521621704, + "learning_rate": 0.001, + "loss": 2.5291, + "step": 3505 + }, + { + "epoch": 0.14832050088840004, + "grad_norm": 0.291814923286438, + "learning_rate": 0.001, + "loss": 1.9457, + "step": 3506 + }, + { + "epoch": 0.1483628056519164, + "grad_norm": 0.25686556100845337, + "learning_rate": 0.001, + "loss": 2.059, + "step": 3507 + }, + { + "epoch": 0.14840511041543278, + "grad_norm": 0.2979572117328644, + "learning_rate": 0.001, + "loss": 1.9924, + "step": 3508 + }, + { + "epoch": 0.14844741517894916, + "grad_norm": 0.2901727259159088, + "learning_rate": 0.001, + "loss": 2.0276, + "step": 3509 + }, + { + "epoch": 0.14848971994246551, + "grad_norm": 0.34327811002731323, + "learning_rate": 0.001, + "loss": 2.7836, + "step": 3510 + }, + { + "epoch": 0.1485320247059819, + "grad_norm": 0.38972118496894836, + "learning_rate": 0.001, + "loss": 3.0989, + "step": 3511 + }, + { + "epoch": 0.14857432946949828, + "grad_norm": 11.712198257446289, + "learning_rate": 0.001, + "loss": 3.3529, + "step": 3512 + }, + { + "epoch": 0.14861663423301463, + "grad_norm": 5.345020771026611, + "learning_rate": 0.001, + "loss": 3.9772, + "step": 3513 + }, + { + "epoch": 0.14865893899653101, + "grad_norm": 0.3382362425327301, + "learning_rate": 0.001, + "loss": 2.0698, + "step": 3514 + }, + { + "epoch": 0.14870124376004737, + "grad_norm": 0.24800531566143036, + "learning_rate": 0.001, + "loss": 2.5415, + "step": 3515 + }, + { + "epoch": 0.14874354852356375, + "grad_norm": 0.2699640393257141, + "learning_rate": 0.001, + "loss": 3.1028, + "step": 3516 + }, + { + "epoch": 0.14878585328708013, + "grad_norm": 0.273624062538147, + "learning_rate": 0.001, + "loss": 2.3766, + "step": 3517 + }, + { + "epoch": 0.1488281580505965, + "grad_norm": 0.2503005266189575, + "learning_rate": 0.001, + "loss": 2.6137, + "step": 3518 + }, + { + "epoch": 0.14887046281411287, + "grad_norm": 0.8663057684898376, + "learning_rate": 0.001, + "loss": 1.9792, + "step": 3519 + }, + { + "epoch": 0.14891276757762925, + "grad_norm": 0.25076824426651, + "learning_rate": 0.001, + "loss": 2.2382, + "step": 3520 + }, + { + "epoch": 0.1489550723411456, + "grad_norm": 0.34838005900382996, + "learning_rate": 0.001, + "loss": 2.0448, + "step": 3521 + }, + { + "epoch": 0.148997377104662, + "grad_norm": 0.2928178012371063, + "learning_rate": 0.001, + "loss": 3.3827, + "step": 3522 + }, + { + "epoch": 0.14903968186817837, + "grad_norm": 1.0668600797653198, + "learning_rate": 0.001, + "loss": 2.3127, + "step": 3523 + }, + { + "epoch": 0.14908198663169472, + "grad_norm": 0.32312437891960144, + "learning_rate": 0.001, + "loss": 2.3694, + "step": 3524 + }, + { + "epoch": 0.1491242913952111, + "grad_norm": 19.220008850097656, + "learning_rate": 0.001, + "loss": 1.897, + "step": 3525 + }, + { + "epoch": 0.14916659615872746, + "grad_norm": 0.2430810034275055, + "learning_rate": 0.001, + "loss": 2.16, + "step": 3526 + }, + { + "epoch": 0.14920890092224384, + "grad_norm": 0.3711296319961548, + "learning_rate": 0.001, + "loss": 2.4911, + "step": 3527 + }, + { + "epoch": 0.14925120568576022, + "grad_norm": 0.4155080020427704, + "learning_rate": 0.001, + "loss": 3.8939, + "step": 3528 + }, + { + "epoch": 0.14929351044927658, + "grad_norm": 0.3918618857860565, + "learning_rate": 0.001, + "loss": 2.4925, + "step": 3529 + }, + { + "epoch": 0.14933581521279296, + "grad_norm": 0.3465852439403534, + "learning_rate": 0.001, + "loss": 2.4602, + "step": 3530 + }, + { + "epoch": 0.14937811997630934, + "grad_norm": 0.2546325623989105, + "learning_rate": 0.001, + "loss": 3.3572, + "step": 3531 + }, + { + "epoch": 0.1494204247398257, + "grad_norm": 0.24824251234531403, + "learning_rate": 0.001, + "loss": 2.3866, + "step": 3532 + }, + { + "epoch": 0.14946272950334208, + "grad_norm": 0.783733606338501, + "learning_rate": 0.001, + "loss": 2.2685, + "step": 3533 + }, + { + "epoch": 0.14950503426685846, + "grad_norm": 0.27685683965682983, + "learning_rate": 0.001, + "loss": 2.6839, + "step": 3534 + }, + { + "epoch": 0.14954733903037482, + "grad_norm": 0.9122802019119263, + "learning_rate": 0.001, + "loss": 2.1993, + "step": 3535 + }, + { + "epoch": 0.1495896437938912, + "grad_norm": 0.20734846591949463, + "learning_rate": 0.001, + "loss": 2.7781, + "step": 3536 + }, + { + "epoch": 0.14963194855740755, + "grad_norm": 0.303323894739151, + "learning_rate": 0.001, + "loss": 3.147, + "step": 3537 + }, + { + "epoch": 0.14967425332092393, + "grad_norm": 3.6282622814178467, + "learning_rate": 0.001, + "loss": 3.1942, + "step": 3538 + }, + { + "epoch": 0.14971655808444032, + "grad_norm": 0.5021495223045349, + "learning_rate": 0.001, + "loss": 2.3829, + "step": 3539 + }, + { + "epoch": 0.14975886284795667, + "grad_norm": 0.3432963788509369, + "learning_rate": 0.001, + "loss": 2.3058, + "step": 3540 + }, + { + "epoch": 0.14980116761147305, + "grad_norm": 0.46659693121910095, + "learning_rate": 0.001, + "loss": 2.5437, + "step": 3541 + }, + { + "epoch": 0.14984347237498943, + "grad_norm": 0.2588510513305664, + "learning_rate": 0.001, + "loss": 1.9825, + "step": 3542 + }, + { + "epoch": 0.1498857771385058, + "grad_norm": 0.8514876961708069, + "learning_rate": 0.001, + "loss": 2.1087, + "step": 3543 + }, + { + "epoch": 0.14992808190202217, + "grad_norm": 1.0069010257720947, + "learning_rate": 0.001, + "loss": 1.9367, + "step": 3544 + }, + { + "epoch": 0.14997038666553855, + "grad_norm": 0.3433607220649719, + "learning_rate": 0.001, + "loss": 1.9445, + "step": 3545 + }, + { + "epoch": 0.1500126914290549, + "grad_norm": 0.9383977055549622, + "learning_rate": 0.001, + "loss": 3.1781, + "step": 3546 + }, + { + "epoch": 0.1500549961925713, + "grad_norm": 0.6198011040687561, + "learning_rate": 0.001, + "loss": 2.755, + "step": 3547 + }, + { + "epoch": 0.15009730095608764, + "grad_norm": 0.7712514400482178, + "learning_rate": 0.001, + "loss": 2.3735, + "step": 3548 + }, + { + "epoch": 0.15013960571960402, + "grad_norm": 0.8419329524040222, + "learning_rate": 0.001, + "loss": 1.8685, + "step": 3549 + }, + { + "epoch": 0.1501819104831204, + "grad_norm": 0.38785937428474426, + "learning_rate": 0.001, + "loss": 2.7159, + "step": 3550 + }, + { + "epoch": 0.15022421524663676, + "grad_norm": 0.25905489921569824, + "learning_rate": 0.001, + "loss": 1.916, + "step": 3551 + }, + { + "epoch": 0.15026652001015314, + "grad_norm": 0.28637632727622986, + "learning_rate": 0.001, + "loss": 2.152, + "step": 3552 + }, + { + "epoch": 0.15030882477366952, + "grad_norm": 0.4733045697212219, + "learning_rate": 0.001, + "loss": 2.2419, + "step": 3553 + }, + { + "epoch": 0.15035112953718588, + "grad_norm": 0.3638868033885956, + "learning_rate": 0.001, + "loss": 2.0374, + "step": 3554 + }, + { + "epoch": 0.15039343430070226, + "grad_norm": 0.4152970612049103, + "learning_rate": 0.001, + "loss": 2.3609, + "step": 3555 + }, + { + "epoch": 0.15043573906421864, + "grad_norm": 1.5988473892211914, + "learning_rate": 0.001, + "loss": 1.8426, + "step": 3556 + }, + { + "epoch": 0.150478043827735, + "grad_norm": 0.24985834956169128, + "learning_rate": 0.001, + "loss": 2.2193, + "step": 3557 + }, + { + "epoch": 0.15052034859125138, + "grad_norm": 7.093243598937988, + "learning_rate": 0.001, + "loss": 1.9872, + "step": 3558 + }, + { + "epoch": 0.15056265335476773, + "grad_norm": 0.5130526423454285, + "learning_rate": 0.001, + "loss": 3.1589, + "step": 3559 + }, + { + "epoch": 0.15060495811828412, + "grad_norm": 0.9955002665519714, + "learning_rate": 0.001, + "loss": 2.2292, + "step": 3560 + }, + { + "epoch": 0.1506472628818005, + "grad_norm": 0.4683253765106201, + "learning_rate": 0.001, + "loss": 1.9829, + "step": 3561 + }, + { + "epoch": 0.15068956764531685, + "grad_norm": 0.24029046297073364, + "learning_rate": 0.001, + "loss": 2.5349, + "step": 3562 + }, + { + "epoch": 0.15073187240883323, + "grad_norm": 0.24601727724075317, + "learning_rate": 0.001, + "loss": 2.589, + "step": 3563 + }, + { + "epoch": 0.15077417717234962, + "grad_norm": 0.2925271987915039, + "learning_rate": 0.001, + "loss": 2.6538, + "step": 3564 + }, + { + "epoch": 0.15081648193586597, + "grad_norm": 0.6500786542892456, + "learning_rate": 0.001, + "loss": 2.3506, + "step": 3565 + }, + { + "epoch": 0.15085878669938235, + "grad_norm": 0.3403443396091461, + "learning_rate": 0.001, + "loss": 3.4838, + "step": 3566 + }, + { + "epoch": 0.15090109146289873, + "grad_norm": 0.2680628299713135, + "learning_rate": 0.001, + "loss": 3.188, + "step": 3567 + }, + { + "epoch": 0.1509433962264151, + "grad_norm": 0.25424718856811523, + "learning_rate": 0.001, + "loss": 2.5081, + "step": 3568 + }, + { + "epoch": 0.15098570098993147, + "grad_norm": 0.24566307663917542, + "learning_rate": 0.001, + "loss": 2.1124, + "step": 3569 + }, + { + "epoch": 0.15102800575344782, + "grad_norm": 2.2985541820526123, + "learning_rate": 0.001, + "loss": 2.8511, + "step": 3570 + }, + { + "epoch": 0.1510703105169642, + "grad_norm": 0.2778152823448181, + "learning_rate": 0.001, + "loss": 2.8095, + "step": 3571 + }, + { + "epoch": 0.1511126152804806, + "grad_norm": 1.3614428043365479, + "learning_rate": 0.001, + "loss": 2.2643, + "step": 3572 + }, + { + "epoch": 0.15115492004399694, + "grad_norm": 4.856557846069336, + "learning_rate": 0.001, + "loss": 2.3387, + "step": 3573 + }, + { + "epoch": 0.15119722480751333, + "grad_norm": 0.2886848449707031, + "learning_rate": 0.001, + "loss": 3.1371, + "step": 3574 + }, + { + "epoch": 0.1512395295710297, + "grad_norm": 0.2378971427679062, + "learning_rate": 0.001, + "loss": 2.4346, + "step": 3575 + }, + { + "epoch": 0.15128183433454606, + "grad_norm": 0.28027504682540894, + "learning_rate": 0.001, + "loss": 2.2992, + "step": 3576 + }, + { + "epoch": 0.15132413909806244, + "grad_norm": 0.9018852114677429, + "learning_rate": 0.001, + "loss": 1.6448, + "step": 3577 + }, + { + "epoch": 0.15136644386157883, + "grad_norm": 0.28057655692100525, + "learning_rate": 0.001, + "loss": 1.584, + "step": 3578 + }, + { + "epoch": 0.15140874862509518, + "grad_norm": 0.6362797021865845, + "learning_rate": 0.001, + "loss": 3.2339, + "step": 3579 + }, + { + "epoch": 0.15145105338861156, + "grad_norm": 2.48857045173645, + "learning_rate": 0.001, + "loss": 2.0265, + "step": 3580 + }, + { + "epoch": 0.15149335815212792, + "grad_norm": 0.341279536485672, + "learning_rate": 0.001, + "loss": 2.7349, + "step": 3581 + }, + { + "epoch": 0.1515356629156443, + "grad_norm": 0.39645954966545105, + "learning_rate": 0.001, + "loss": 2.335, + "step": 3582 + }, + { + "epoch": 0.15157796767916068, + "grad_norm": 2.092052698135376, + "learning_rate": 0.001, + "loss": 1.5818, + "step": 3583 + }, + { + "epoch": 0.15162027244267703, + "grad_norm": 5.003619194030762, + "learning_rate": 0.001, + "loss": 2.547, + "step": 3584 + }, + { + "epoch": 0.15166257720619342, + "grad_norm": 0.3023916482925415, + "learning_rate": 0.001, + "loss": 1.8312, + "step": 3585 + }, + { + "epoch": 0.1517048819697098, + "grad_norm": 0.5600648522377014, + "learning_rate": 0.001, + "loss": 2.258, + "step": 3586 + }, + { + "epoch": 0.15174718673322615, + "grad_norm": 0.38749417662620544, + "learning_rate": 0.001, + "loss": 2.2861, + "step": 3587 + }, + { + "epoch": 0.15178949149674253, + "grad_norm": 1.0356155633926392, + "learning_rate": 0.001, + "loss": 3.6475, + "step": 3588 + }, + { + "epoch": 0.15183179626025892, + "grad_norm": 0.27420303225517273, + "learning_rate": 0.001, + "loss": 2.0222, + "step": 3589 + }, + { + "epoch": 0.15187410102377527, + "grad_norm": 1.7488365173339844, + "learning_rate": 0.001, + "loss": 2.0034, + "step": 3590 + }, + { + "epoch": 0.15191640578729165, + "grad_norm": 0.3303074538707733, + "learning_rate": 0.001, + "loss": 2.4819, + "step": 3591 + }, + { + "epoch": 0.151958710550808, + "grad_norm": 0.8918514847755432, + "learning_rate": 0.001, + "loss": 2.3028, + "step": 3592 + }, + { + "epoch": 0.1520010153143244, + "grad_norm": 0.40100759267807007, + "learning_rate": 0.001, + "loss": 2.3375, + "step": 3593 + }, + { + "epoch": 0.15204332007784077, + "grad_norm": 0.4550624489784241, + "learning_rate": 0.001, + "loss": 4.096, + "step": 3594 + }, + { + "epoch": 0.15208562484135713, + "grad_norm": 0.7799673080444336, + "learning_rate": 0.001, + "loss": 2.6198, + "step": 3595 + }, + { + "epoch": 0.1521279296048735, + "grad_norm": 0.34559065103530884, + "learning_rate": 0.001, + "loss": 2.9821, + "step": 3596 + }, + { + "epoch": 0.1521702343683899, + "grad_norm": 0.3571106493473053, + "learning_rate": 0.001, + "loss": 2.503, + "step": 3597 + }, + { + "epoch": 0.15221253913190624, + "grad_norm": 0.3727911710739136, + "learning_rate": 0.001, + "loss": 2.5119, + "step": 3598 + }, + { + "epoch": 0.15225484389542263, + "grad_norm": 4.5436015129089355, + "learning_rate": 0.001, + "loss": 3.7948, + "step": 3599 + }, + { + "epoch": 0.152297148658939, + "grad_norm": 0.3875895142555237, + "learning_rate": 0.001, + "loss": 2.6795, + "step": 3600 + }, + { + "epoch": 0.15233945342245536, + "grad_norm": 0.3254016935825348, + "learning_rate": 0.001, + "loss": 2.9257, + "step": 3601 + }, + { + "epoch": 0.15238175818597174, + "grad_norm": 0.27709123492240906, + "learning_rate": 0.001, + "loss": 2.9524, + "step": 3602 + }, + { + "epoch": 0.15242406294948813, + "grad_norm": 0.236286923289299, + "learning_rate": 0.001, + "loss": 2.0965, + "step": 3603 + }, + { + "epoch": 0.15246636771300448, + "grad_norm": 0.29797059297561646, + "learning_rate": 0.001, + "loss": 2.974, + "step": 3604 + }, + { + "epoch": 0.15250867247652086, + "grad_norm": 0.9622611999511719, + "learning_rate": 0.001, + "loss": 1.8431, + "step": 3605 + }, + { + "epoch": 0.15255097724003722, + "grad_norm": 0.32603511214256287, + "learning_rate": 0.001, + "loss": 3.1559, + "step": 3606 + }, + { + "epoch": 0.1525932820035536, + "grad_norm": 0.965201199054718, + "learning_rate": 0.001, + "loss": 3.2636, + "step": 3607 + }, + { + "epoch": 0.15263558676706998, + "grad_norm": 0.28725409507751465, + "learning_rate": 0.001, + "loss": 1.9986, + "step": 3608 + }, + { + "epoch": 0.15267789153058633, + "grad_norm": 0.28865811228752136, + "learning_rate": 0.001, + "loss": 3.1487, + "step": 3609 + }, + { + "epoch": 0.15272019629410272, + "grad_norm": 0.22873492538928986, + "learning_rate": 0.001, + "loss": 1.9311, + "step": 3610 + }, + { + "epoch": 0.1527625010576191, + "grad_norm": 0.4197172522544861, + "learning_rate": 0.001, + "loss": 2.2263, + "step": 3611 + }, + { + "epoch": 0.15280480582113545, + "grad_norm": 0.3066697120666504, + "learning_rate": 0.001, + "loss": 2.3186, + "step": 3612 + }, + { + "epoch": 0.15284711058465184, + "grad_norm": 0.23598486185073853, + "learning_rate": 0.001, + "loss": 1.9821, + "step": 3613 + }, + { + "epoch": 0.15288941534816822, + "grad_norm": 0.21602579951286316, + "learning_rate": 0.001, + "loss": 1.9219, + "step": 3614 + }, + { + "epoch": 0.15293172011168457, + "grad_norm": 0.28833669424057007, + "learning_rate": 0.001, + "loss": 2.1563, + "step": 3615 + }, + { + "epoch": 0.15297402487520095, + "grad_norm": 0.6677043437957764, + "learning_rate": 0.001, + "loss": 2.4993, + "step": 3616 + }, + { + "epoch": 0.1530163296387173, + "grad_norm": 0.29518651962280273, + "learning_rate": 0.001, + "loss": 3.194, + "step": 3617 + }, + { + "epoch": 0.1530586344022337, + "grad_norm": 4.215554237365723, + "learning_rate": 0.001, + "loss": 2.5791, + "step": 3618 + }, + { + "epoch": 0.15310093916575007, + "grad_norm": 0.27064064145088196, + "learning_rate": 0.001, + "loss": 3.0186, + "step": 3619 + }, + { + "epoch": 0.15314324392926643, + "grad_norm": 0.22626028954982758, + "learning_rate": 0.001, + "loss": 1.7413, + "step": 3620 + }, + { + "epoch": 0.1531855486927828, + "grad_norm": 0.5641341805458069, + "learning_rate": 0.001, + "loss": 2.2514, + "step": 3621 + }, + { + "epoch": 0.1532278534562992, + "grad_norm": 0.6790673136711121, + "learning_rate": 0.001, + "loss": 1.9258, + "step": 3622 + }, + { + "epoch": 0.15327015821981554, + "grad_norm": 1.0468883514404297, + "learning_rate": 0.001, + "loss": 2.5064, + "step": 3623 + }, + { + "epoch": 0.15331246298333193, + "grad_norm": 0.710141658782959, + "learning_rate": 0.001, + "loss": 2.2824, + "step": 3624 + }, + { + "epoch": 0.1533547677468483, + "grad_norm": 0.46319958567619324, + "learning_rate": 0.001, + "loss": 2.4836, + "step": 3625 + }, + { + "epoch": 0.15339707251036466, + "grad_norm": 0.24517352879047394, + "learning_rate": 0.001, + "loss": 2.353, + "step": 3626 + }, + { + "epoch": 0.15343937727388104, + "grad_norm": 0.25180742144584656, + "learning_rate": 0.001, + "loss": 2.1456, + "step": 3627 + }, + { + "epoch": 0.1534816820373974, + "grad_norm": 0.8973338603973389, + "learning_rate": 0.001, + "loss": 2.0089, + "step": 3628 + }, + { + "epoch": 0.15352398680091378, + "grad_norm": 0.2948894500732422, + "learning_rate": 0.001, + "loss": 1.8217, + "step": 3629 + }, + { + "epoch": 0.15356629156443016, + "grad_norm": 0.4694260358810425, + "learning_rate": 0.001, + "loss": 1.9615, + "step": 3630 + }, + { + "epoch": 0.15360859632794652, + "grad_norm": 0.2402891218662262, + "learning_rate": 0.001, + "loss": 2.4652, + "step": 3631 + }, + { + "epoch": 0.1536509010914629, + "grad_norm": 2.8009684085845947, + "learning_rate": 0.001, + "loss": 2.6741, + "step": 3632 + }, + { + "epoch": 0.15369320585497928, + "grad_norm": 0.3112724721431732, + "learning_rate": 0.001, + "loss": 2.6238, + "step": 3633 + }, + { + "epoch": 0.15373551061849564, + "grad_norm": 0.29018670320510864, + "learning_rate": 0.001, + "loss": 2.4207, + "step": 3634 + }, + { + "epoch": 0.15377781538201202, + "grad_norm": 0.8226271867752075, + "learning_rate": 0.001, + "loss": 2.0828, + "step": 3635 + }, + { + "epoch": 0.1538201201455284, + "grad_norm": 0.26996588706970215, + "learning_rate": 0.001, + "loss": 2.2658, + "step": 3636 + }, + { + "epoch": 0.15386242490904475, + "grad_norm": 0.33914855122566223, + "learning_rate": 0.001, + "loss": 2.4518, + "step": 3637 + }, + { + "epoch": 0.15390472967256114, + "grad_norm": 0.37279003858566284, + "learning_rate": 0.001, + "loss": 2.0381, + "step": 3638 + }, + { + "epoch": 0.1539470344360775, + "grad_norm": 0.4848242700099945, + "learning_rate": 0.001, + "loss": 2.8453, + "step": 3639 + }, + { + "epoch": 0.15398933919959387, + "grad_norm": 0.3430381119251251, + "learning_rate": 0.001, + "loss": 1.9578, + "step": 3640 + }, + { + "epoch": 0.15403164396311025, + "grad_norm": 0.2790931165218353, + "learning_rate": 0.001, + "loss": 1.5198, + "step": 3641 + }, + { + "epoch": 0.1540739487266266, + "grad_norm": 0.23120807111263275, + "learning_rate": 0.001, + "loss": 2.044, + "step": 3642 + }, + { + "epoch": 0.154116253490143, + "grad_norm": 0.26103150844573975, + "learning_rate": 0.001, + "loss": 2.3744, + "step": 3643 + }, + { + "epoch": 0.15415855825365937, + "grad_norm": 0.21912577748298645, + "learning_rate": 0.001, + "loss": 1.8251, + "step": 3644 + }, + { + "epoch": 0.15420086301717573, + "grad_norm": 0.268587201833725, + "learning_rate": 0.001, + "loss": 4.6103, + "step": 3645 + }, + { + "epoch": 0.1542431677806921, + "grad_norm": 0.24096794426441193, + "learning_rate": 0.001, + "loss": 1.8308, + "step": 3646 + }, + { + "epoch": 0.1542854725442085, + "grad_norm": 0.5062342882156372, + "learning_rate": 0.001, + "loss": 1.7605, + "step": 3647 + }, + { + "epoch": 0.15432777730772484, + "grad_norm": 1.528459072113037, + "learning_rate": 0.001, + "loss": 3.088, + "step": 3648 + }, + { + "epoch": 0.15437008207124123, + "grad_norm": 0.27189287543296814, + "learning_rate": 0.001, + "loss": 1.9692, + "step": 3649 + }, + { + "epoch": 0.15441238683475758, + "grad_norm": 0.22782421112060547, + "learning_rate": 0.001, + "loss": 1.9446, + "step": 3650 + }, + { + "epoch": 0.15445469159827396, + "grad_norm": 0.24183671176433563, + "learning_rate": 0.001, + "loss": 2.9768, + "step": 3651 + }, + { + "epoch": 0.15449699636179035, + "grad_norm": 0.37327948212623596, + "learning_rate": 0.001, + "loss": 2.5958, + "step": 3652 + }, + { + "epoch": 0.1545393011253067, + "grad_norm": 0.2614731788635254, + "learning_rate": 0.001, + "loss": 2.8753, + "step": 3653 + }, + { + "epoch": 0.15458160588882308, + "grad_norm": 0.30423954129219055, + "learning_rate": 0.001, + "loss": 2.9914, + "step": 3654 + }, + { + "epoch": 0.15462391065233946, + "grad_norm": 0.2651650607585907, + "learning_rate": 0.001, + "loss": 2.2149, + "step": 3655 + }, + { + "epoch": 0.15466621541585582, + "grad_norm": 0.29150527715682983, + "learning_rate": 0.001, + "loss": 3.1139, + "step": 3656 + }, + { + "epoch": 0.1547085201793722, + "grad_norm": 0.26949405670166016, + "learning_rate": 0.001, + "loss": 3.0504, + "step": 3657 + }, + { + "epoch": 0.15475082494288858, + "grad_norm": 0.277124285697937, + "learning_rate": 0.001, + "loss": 2.7155, + "step": 3658 + }, + { + "epoch": 0.15479312970640494, + "grad_norm": 0.24489636719226837, + "learning_rate": 0.001, + "loss": 2.4702, + "step": 3659 + }, + { + "epoch": 0.15483543446992132, + "grad_norm": 0.18855103850364685, + "learning_rate": 0.001, + "loss": 2.2161, + "step": 3660 + }, + { + "epoch": 0.15487773923343767, + "grad_norm": 0.311081200838089, + "learning_rate": 0.001, + "loss": 2.6019, + "step": 3661 + }, + { + "epoch": 0.15492004399695405, + "grad_norm": 0.22517842054367065, + "learning_rate": 0.001, + "loss": 2.035, + "step": 3662 + }, + { + "epoch": 0.15496234876047044, + "grad_norm": 0.20022818446159363, + "learning_rate": 0.001, + "loss": 3.0709, + "step": 3663 + }, + { + "epoch": 0.1550046535239868, + "grad_norm": 0.3839741349220276, + "learning_rate": 0.001, + "loss": 1.8192, + "step": 3664 + }, + { + "epoch": 0.15504695828750317, + "grad_norm": 0.3007325828075409, + "learning_rate": 0.001, + "loss": 3.0195, + "step": 3665 + }, + { + "epoch": 0.15508926305101955, + "grad_norm": 0.2808815538883209, + "learning_rate": 0.001, + "loss": 2.1549, + "step": 3666 + }, + { + "epoch": 0.1551315678145359, + "grad_norm": 0.23041900992393494, + "learning_rate": 0.001, + "loss": 1.8212, + "step": 3667 + }, + { + "epoch": 0.1551738725780523, + "grad_norm": 0.2662278711795807, + "learning_rate": 0.001, + "loss": 2.8651, + "step": 3668 + }, + { + "epoch": 0.15521617734156867, + "grad_norm": 1.5377527475357056, + "learning_rate": 0.001, + "loss": 2.4422, + "step": 3669 + }, + { + "epoch": 0.15525848210508503, + "grad_norm": 0.22316411137580872, + "learning_rate": 0.001, + "loss": 2.1897, + "step": 3670 + }, + { + "epoch": 0.1553007868686014, + "grad_norm": 0.26653704047203064, + "learning_rate": 0.001, + "loss": 2.4008, + "step": 3671 + }, + { + "epoch": 0.15534309163211776, + "grad_norm": 0.2589184045791626, + "learning_rate": 0.001, + "loss": 2.8367, + "step": 3672 + }, + { + "epoch": 0.15538539639563415, + "grad_norm": 0.26263269782066345, + "learning_rate": 0.001, + "loss": 1.9108, + "step": 3673 + }, + { + "epoch": 0.15542770115915053, + "grad_norm": 0.3323366343975067, + "learning_rate": 0.001, + "loss": 2.1237, + "step": 3674 + }, + { + "epoch": 0.15547000592266688, + "grad_norm": 0.22809675335884094, + "learning_rate": 0.001, + "loss": 1.8055, + "step": 3675 + }, + { + "epoch": 0.15551231068618326, + "grad_norm": 0.2392113357782364, + "learning_rate": 0.001, + "loss": 3.3006, + "step": 3676 + }, + { + "epoch": 0.15555461544969965, + "grad_norm": 0.24501052498817444, + "learning_rate": 0.001, + "loss": 2.4065, + "step": 3677 + }, + { + "epoch": 0.155596920213216, + "grad_norm": 1.7419697046279907, + "learning_rate": 0.001, + "loss": 2.0626, + "step": 3678 + }, + { + "epoch": 0.15563922497673238, + "grad_norm": 0.2538470923900604, + "learning_rate": 0.001, + "loss": 2.548, + "step": 3679 + }, + { + "epoch": 0.15568152974024876, + "grad_norm": 0.27344977855682373, + "learning_rate": 0.001, + "loss": 1.8455, + "step": 3680 + }, + { + "epoch": 0.15572383450376512, + "grad_norm": 5.969301700592041, + "learning_rate": 0.001, + "loss": 1.987, + "step": 3681 + }, + { + "epoch": 0.1557661392672815, + "grad_norm": 0.21255825459957123, + "learning_rate": 0.001, + "loss": 2.1096, + "step": 3682 + }, + { + "epoch": 0.15580844403079785, + "grad_norm": 0.25273945927619934, + "learning_rate": 0.001, + "loss": 2.4582, + "step": 3683 + }, + { + "epoch": 0.15585074879431424, + "grad_norm": 0.2562407851219177, + "learning_rate": 0.001, + "loss": 1.9308, + "step": 3684 + }, + { + "epoch": 0.15589305355783062, + "grad_norm": 0.596889853477478, + "learning_rate": 0.001, + "loss": 2.817, + "step": 3685 + }, + { + "epoch": 0.15593535832134697, + "grad_norm": 0.55907142162323, + "learning_rate": 0.001, + "loss": 2.8084, + "step": 3686 + }, + { + "epoch": 0.15597766308486335, + "grad_norm": 2.2937467098236084, + "learning_rate": 0.001, + "loss": 2.1206, + "step": 3687 + }, + { + "epoch": 0.15601996784837974, + "grad_norm": 0.2151869237422943, + "learning_rate": 0.001, + "loss": 2.1484, + "step": 3688 + }, + { + "epoch": 0.1560622726118961, + "grad_norm": 0.2432985007762909, + "learning_rate": 0.001, + "loss": 2.1668, + "step": 3689 + }, + { + "epoch": 0.15610457737541247, + "grad_norm": 0.26198938488960266, + "learning_rate": 0.001, + "loss": 2.4545, + "step": 3690 + }, + { + "epoch": 0.15614688213892886, + "grad_norm": 0.2777709364891052, + "learning_rate": 0.001, + "loss": 2.664, + "step": 3691 + }, + { + "epoch": 0.1561891869024452, + "grad_norm": 0.28329089283943176, + "learning_rate": 0.001, + "loss": 2.5311, + "step": 3692 + }, + { + "epoch": 0.1562314916659616, + "grad_norm": 0.3065381944179535, + "learning_rate": 0.001, + "loss": 2.1369, + "step": 3693 + }, + { + "epoch": 0.15627379642947795, + "grad_norm": 0.28511857986450195, + "learning_rate": 0.001, + "loss": 2.6129, + "step": 3694 + }, + { + "epoch": 0.15631610119299433, + "grad_norm": 0.24408607184886932, + "learning_rate": 0.001, + "loss": 2.0128, + "step": 3695 + }, + { + "epoch": 0.1563584059565107, + "grad_norm": 0.8473067879676819, + "learning_rate": 0.001, + "loss": 2.2145, + "step": 3696 + }, + { + "epoch": 0.15640071072002706, + "grad_norm": 0.42082521319389343, + "learning_rate": 0.001, + "loss": 2.1128, + "step": 3697 + }, + { + "epoch": 0.15644301548354345, + "grad_norm": 0.42995426058769226, + "learning_rate": 0.001, + "loss": 2.3262, + "step": 3698 + }, + { + "epoch": 0.15648532024705983, + "grad_norm": 0.3102007210254669, + "learning_rate": 0.001, + "loss": 2.7998, + "step": 3699 + }, + { + "epoch": 0.15652762501057618, + "grad_norm": 0.3263281285762787, + "learning_rate": 0.001, + "loss": 2.2744, + "step": 3700 + }, + { + "epoch": 0.15656992977409256, + "grad_norm": 2.9346940517425537, + "learning_rate": 0.001, + "loss": 2.1367, + "step": 3701 + }, + { + "epoch": 0.15661223453760895, + "grad_norm": 0.298091858625412, + "learning_rate": 0.001, + "loss": 3.0486, + "step": 3702 + }, + { + "epoch": 0.1566545393011253, + "grad_norm": 0.5562085509300232, + "learning_rate": 0.001, + "loss": 2.0666, + "step": 3703 + }, + { + "epoch": 0.15669684406464168, + "grad_norm": 0.26188915967941284, + "learning_rate": 0.001, + "loss": 1.9705, + "step": 3704 + }, + { + "epoch": 0.15673914882815804, + "grad_norm": 0.3531795144081116, + "learning_rate": 0.001, + "loss": 2.6941, + "step": 3705 + }, + { + "epoch": 0.15678145359167442, + "grad_norm": 0.32536250352859497, + "learning_rate": 0.001, + "loss": 2.1217, + "step": 3706 + }, + { + "epoch": 0.1568237583551908, + "grad_norm": 0.2858486771583557, + "learning_rate": 0.001, + "loss": 2.2795, + "step": 3707 + }, + { + "epoch": 0.15686606311870716, + "grad_norm": 0.2625608742237091, + "learning_rate": 0.001, + "loss": 3.3786, + "step": 3708 + }, + { + "epoch": 0.15690836788222354, + "grad_norm": 0.2354310154914856, + "learning_rate": 0.001, + "loss": 2.0803, + "step": 3709 + }, + { + "epoch": 0.15695067264573992, + "grad_norm": 0.25832563638687134, + "learning_rate": 0.001, + "loss": 3.1712, + "step": 3710 + }, + { + "epoch": 0.15699297740925627, + "grad_norm": 2.9774281978607178, + "learning_rate": 0.001, + "loss": 2.6699, + "step": 3711 + }, + { + "epoch": 0.15703528217277266, + "grad_norm": 0.945013165473938, + "learning_rate": 0.001, + "loss": 3.3384, + "step": 3712 + }, + { + "epoch": 0.15707758693628904, + "grad_norm": 0.7904821634292603, + "learning_rate": 0.001, + "loss": 2.3788, + "step": 3713 + }, + { + "epoch": 0.1571198916998054, + "grad_norm": 0.2616831958293915, + "learning_rate": 0.001, + "loss": 2.8228, + "step": 3714 + }, + { + "epoch": 0.15716219646332177, + "grad_norm": 0.5680962204933167, + "learning_rate": 0.001, + "loss": 2.8964, + "step": 3715 + }, + { + "epoch": 0.15720450122683813, + "grad_norm": 1.2796063423156738, + "learning_rate": 0.001, + "loss": 2.6141, + "step": 3716 + }, + { + "epoch": 0.1572468059903545, + "grad_norm": 0.34902986884117126, + "learning_rate": 0.001, + "loss": 3.4248, + "step": 3717 + }, + { + "epoch": 0.1572891107538709, + "grad_norm": 0.30092769861221313, + "learning_rate": 0.001, + "loss": 2.7998, + "step": 3718 + }, + { + "epoch": 0.15733141551738725, + "grad_norm": 0.2545045018196106, + "learning_rate": 0.001, + "loss": 2.4546, + "step": 3719 + }, + { + "epoch": 0.15737372028090363, + "grad_norm": 0.30985644459724426, + "learning_rate": 0.001, + "loss": 2.1622, + "step": 3720 + }, + { + "epoch": 0.15741602504442, + "grad_norm": 0.21358482539653778, + "learning_rate": 0.001, + "loss": 1.9671, + "step": 3721 + }, + { + "epoch": 0.15745832980793636, + "grad_norm": 0.2403787076473236, + "learning_rate": 0.001, + "loss": 1.8596, + "step": 3722 + }, + { + "epoch": 0.15750063457145275, + "grad_norm": 0.25962314009666443, + "learning_rate": 0.001, + "loss": 2.595, + "step": 3723 + }, + { + "epoch": 0.15754293933496913, + "grad_norm": 0.9243412613868713, + "learning_rate": 0.001, + "loss": 2.2004, + "step": 3724 + }, + { + "epoch": 0.15758524409848548, + "grad_norm": 0.2106955498456955, + "learning_rate": 0.001, + "loss": 3.3038, + "step": 3725 + }, + { + "epoch": 0.15762754886200186, + "grad_norm": 0.26728370785713196, + "learning_rate": 0.001, + "loss": 2.3843, + "step": 3726 + }, + { + "epoch": 0.15766985362551825, + "grad_norm": 0.26427310705184937, + "learning_rate": 0.001, + "loss": 2.1108, + "step": 3727 + }, + { + "epoch": 0.1577121583890346, + "grad_norm": 0.28308454155921936, + "learning_rate": 0.001, + "loss": 2.4893, + "step": 3728 + }, + { + "epoch": 0.15775446315255098, + "grad_norm": 0.266253262758255, + "learning_rate": 0.001, + "loss": 2.521, + "step": 3729 + }, + { + "epoch": 0.15779676791606734, + "grad_norm": 0.4611959159374237, + "learning_rate": 0.001, + "loss": 2.2577, + "step": 3730 + }, + { + "epoch": 0.15783907267958372, + "grad_norm": 0.3216572701931, + "learning_rate": 0.001, + "loss": 3.0579, + "step": 3731 + }, + { + "epoch": 0.1578813774431001, + "grad_norm": 0.2995474934577942, + "learning_rate": 0.001, + "loss": 2.8257, + "step": 3732 + }, + { + "epoch": 0.15792368220661646, + "grad_norm": 0.704836368560791, + "learning_rate": 0.001, + "loss": 2.7877, + "step": 3733 + }, + { + "epoch": 0.15796598697013284, + "grad_norm": 0.4456735849380493, + "learning_rate": 0.001, + "loss": 2.1902, + "step": 3734 + }, + { + "epoch": 0.15800829173364922, + "grad_norm": 0.21803617477416992, + "learning_rate": 0.001, + "loss": 2.4335, + "step": 3735 + }, + { + "epoch": 0.15805059649716557, + "grad_norm": 0.24227195978164673, + "learning_rate": 0.001, + "loss": 1.9287, + "step": 3736 + }, + { + "epoch": 0.15809290126068196, + "grad_norm": 0.2500062882900238, + "learning_rate": 0.001, + "loss": 2.7866, + "step": 3737 + }, + { + "epoch": 0.15813520602419834, + "grad_norm": 0.24755047261714935, + "learning_rate": 0.001, + "loss": 2.8725, + "step": 3738 + }, + { + "epoch": 0.1581775107877147, + "grad_norm": 0.26385197043418884, + "learning_rate": 0.001, + "loss": 2.4956, + "step": 3739 + }, + { + "epoch": 0.15821981555123107, + "grad_norm": 0.3932577967643738, + "learning_rate": 0.001, + "loss": 2.5877, + "step": 3740 + }, + { + "epoch": 0.15826212031474743, + "grad_norm": 0.21887458860874176, + "learning_rate": 0.001, + "loss": 2.3305, + "step": 3741 + }, + { + "epoch": 0.1583044250782638, + "grad_norm": 0.2982005774974823, + "learning_rate": 0.001, + "loss": 3.3444, + "step": 3742 + }, + { + "epoch": 0.1583467298417802, + "grad_norm": 0.4082737863063812, + "learning_rate": 0.001, + "loss": 3.0144, + "step": 3743 + }, + { + "epoch": 0.15838903460529655, + "grad_norm": 0.4862198829650879, + "learning_rate": 0.001, + "loss": 2.7298, + "step": 3744 + }, + { + "epoch": 0.15843133936881293, + "grad_norm": 0.2732762098312378, + "learning_rate": 0.001, + "loss": 2.9059, + "step": 3745 + }, + { + "epoch": 0.1584736441323293, + "grad_norm": 0.2768647372722626, + "learning_rate": 0.001, + "loss": 2.005, + "step": 3746 + }, + { + "epoch": 0.15851594889584567, + "grad_norm": 0.2584654688835144, + "learning_rate": 0.001, + "loss": 2.2505, + "step": 3747 + }, + { + "epoch": 0.15855825365936205, + "grad_norm": 0.2902711033821106, + "learning_rate": 0.001, + "loss": 2.4176, + "step": 3748 + }, + { + "epoch": 0.15860055842287843, + "grad_norm": 0.2150590866804123, + "learning_rate": 0.001, + "loss": 1.8724, + "step": 3749 + }, + { + "epoch": 0.15864286318639478, + "grad_norm": 1.3695828914642334, + "learning_rate": 0.001, + "loss": 1.9712, + "step": 3750 + }, + { + "epoch": 0.15868516794991117, + "grad_norm": 0.3454398810863495, + "learning_rate": 0.001, + "loss": 2.6138, + "step": 3751 + }, + { + "epoch": 0.15872747271342752, + "grad_norm": 0.7297479510307312, + "learning_rate": 0.001, + "loss": 3.5123, + "step": 3752 + }, + { + "epoch": 0.1587697774769439, + "grad_norm": 0.43115225434303284, + "learning_rate": 0.001, + "loss": 1.886, + "step": 3753 + }, + { + "epoch": 0.15881208224046028, + "grad_norm": 0.2626439929008484, + "learning_rate": 0.001, + "loss": 1.6985, + "step": 3754 + }, + { + "epoch": 0.15885438700397664, + "grad_norm": 1.067133903503418, + "learning_rate": 0.001, + "loss": 2.0347, + "step": 3755 + }, + { + "epoch": 0.15889669176749302, + "grad_norm": 0.28213247656822205, + "learning_rate": 0.001, + "loss": 2.6482, + "step": 3756 + }, + { + "epoch": 0.1589389965310094, + "grad_norm": 0.3096356689929962, + "learning_rate": 0.001, + "loss": 2.4303, + "step": 3757 + }, + { + "epoch": 0.15898130129452576, + "grad_norm": 1.1172544956207275, + "learning_rate": 0.001, + "loss": 1.4872, + "step": 3758 + }, + { + "epoch": 0.15902360605804214, + "grad_norm": 0.5289722681045532, + "learning_rate": 0.001, + "loss": 2.2845, + "step": 3759 + }, + { + "epoch": 0.15906591082155852, + "grad_norm": 0.41261282563209534, + "learning_rate": 0.001, + "loss": 2.7228, + "step": 3760 + }, + { + "epoch": 0.15910821558507487, + "grad_norm": 0.2809741199016571, + "learning_rate": 0.001, + "loss": 2.4155, + "step": 3761 + }, + { + "epoch": 0.15915052034859126, + "grad_norm": 0.6320580840110779, + "learning_rate": 0.001, + "loss": 1.6108, + "step": 3762 + }, + { + "epoch": 0.1591928251121076, + "grad_norm": 0.2594701051712036, + "learning_rate": 0.001, + "loss": 1.7907, + "step": 3763 + }, + { + "epoch": 0.159235129875624, + "grad_norm": 0.266510933637619, + "learning_rate": 0.001, + "loss": 1.9833, + "step": 3764 + }, + { + "epoch": 0.15927743463914038, + "grad_norm": 0.24247369170188904, + "learning_rate": 0.001, + "loss": 2.0623, + "step": 3765 + }, + { + "epoch": 0.15931973940265673, + "grad_norm": 0.23818761110305786, + "learning_rate": 0.001, + "loss": 2.2446, + "step": 3766 + }, + { + "epoch": 0.1593620441661731, + "grad_norm": 0.27693408727645874, + "learning_rate": 0.001, + "loss": 2.7074, + "step": 3767 + }, + { + "epoch": 0.1594043489296895, + "grad_norm": 0.6779216527938843, + "learning_rate": 0.001, + "loss": 2.1112, + "step": 3768 + }, + { + "epoch": 0.15944665369320585, + "grad_norm": 0.5291603207588196, + "learning_rate": 0.001, + "loss": 2.0643, + "step": 3769 + }, + { + "epoch": 0.15948895845672223, + "grad_norm": 0.2355789989233017, + "learning_rate": 0.001, + "loss": 2.9922, + "step": 3770 + }, + { + "epoch": 0.1595312632202386, + "grad_norm": 0.334757536649704, + "learning_rate": 0.001, + "loss": 3.6081, + "step": 3771 + }, + { + "epoch": 0.15957356798375497, + "grad_norm": 0.2403586208820343, + "learning_rate": 0.001, + "loss": 1.8171, + "step": 3772 + }, + { + "epoch": 0.15961587274727135, + "grad_norm": 0.697832465171814, + "learning_rate": 0.001, + "loss": 2.3378, + "step": 3773 + }, + { + "epoch": 0.1596581775107877, + "grad_norm": 0.7586553692817688, + "learning_rate": 0.001, + "loss": 2.3106, + "step": 3774 + }, + { + "epoch": 0.15970048227430408, + "grad_norm": 0.365491658449173, + "learning_rate": 0.001, + "loss": 3.1585, + "step": 3775 + }, + { + "epoch": 0.15974278703782047, + "grad_norm": 1.0467041730880737, + "learning_rate": 0.001, + "loss": 3.8686, + "step": 3776 + }, + { + "epoch": 0.15978509180133682, + "grad_norm": 0.23686069250106812, + "learning_rate": 0.001, + "loss": 3.3134, + "step": 3777 + }, + { + "epoch": 0.1598273965648532, + "grad_norm": 0.2806885838508606, + "learning_rate": 0.001, + "loss": 2.4206, + "step": 3778 + }, + { + "epoch": 0.15986970132836958, + "grad_norm": 0.25082680583000183, + "learning_rate": 0.001, + "loss": 2.3114, + "step": 3779 + }, + { + "epoch": 0.15991200609188594, + "grad_norm": 0.22670866549015045, + "learning_rate": 0.001, + "loss": 2.852, + "step": 3780 + }, + { + "epoch": 0.15995431085540232, + "grad_norm": 0.2668560743331909, + "learning_rate": 0.001, + "loss": 2.3196, + "step": 3781 + }, + { + "epoch": 0.1599966156189187, + "grad_norm": 1.557098150253296, + "learning_rate": 0.001, + "loss": 1.6908, + "step": 3782 + }, + { + "epoch": 0.16003892038243506, + "grad_norm": 0.22779476642608643, + "learning_rate": 0.001, + "loss": 3.2161, + "step": 3783 + }, + { + "epoch": 0.16008122514595144, + "grad_norm": 0.777952253818512, + "learning_rate": 0.001, + "loss": 1.5536, + "step": 3784 + }, + { + "epoch": 0.1601235299094678, + "grad_norm": 1.5574365854263306, + "learning_rate": 0.001, + "loss": 2.5072, + "step": 3785 + }, + { + "epoch": 0.16016583467298418, + "grad_norm": 0.299005925655365, + "learning_rate": 0.001, + "loss": 3.0707, + "step": 3786 + }, + { + "epoch": 0.16020813943650056, + "grad_norm": 0.3228248953819275, + "learning_rate": 0.001, + "loss": 2.3951, + "step": 3787 + }, + { + "epoch": 0.1602504442000169, + "grad_norm": 1.0663105249404907, + "learning_rate": 0.001, + "loss": 2.2717, + "step": 3788 + }, + { + "epoch": 0.1602927489635333, + "grad_norm": 3.4064865112304688, + "learning_rate": 0.001, + "loss": 2.4347, + "step": 3789 + }, + { + "epoch": 0.16033505372704968, + "grad_norm": 0.3657093048095703, + "learning_rate": 0.001, + "loss": 2.6096, + "step": 3790 + }, + { + "epoch": 0.16037735849056603, + "grad_norm": 0.3097366690635681, + "learning_rate": 0.001, + "loss": 2.0905, + "step": 3791 + }, + { + "epoch": 0.1604196632540824, + "grad_norm": 0.35716721415519714, + "learning_rate": 0.001, + "loss": 2.5812, + "step": 3792 + }, + { + "epoch": 0.1604619680175988, + "grad_norm": 0.32534101605415344, + "learning_rate": 0.001, + "loss": 2.9844, + "step": 3793 + }, + { + "epoch": 0.16050427278111515, + "grad_norm": 0.36417415738105774, + "learning_rate": 0.001, + "loss": 2.5366, + "step": 3794 + }, + { + "epoch": 0.16054657754463153, + "grad_norm": 0.26519954204559326, + "learning_rate": 0.001, + "loss": 2.7738, + "step": 3795 + }, + { + "epoch": 0.16058888230814788, + "grad_norm": 0.473812073469162, + "learning_rate": 0.001, + "loss": 2.2917, + "step": 3796 + }, + { + "epoch": 0.16063118707166427, + "grad_norm": 0.3712599277496338, + "learning_rate": 0.001, + "loss": 2.2126, + "step": 3797 + }, + { + "epoch": 0.16067349183518065, + "grad_norm": 0.30291634798049927, + "learning_rate": 0.001, + "loss": 2.1419, + "step": 3798 + }, + { + "epoch": 0.160715796598697, + "grad_norm": 0.30859270691871643, + "learning_rate": 0.001, + "loss": 1.9372, + "step": 3799 + }, + { + "epoch": 0.16075810136221338, + "grad_norm": 0.24209967255592346, + "learning_rate": 0.001, + "loss": 1.9439, + "step": 3800 + }, + { + "epoch": 0.16080040612572977, + "grad_norm": 0.30465030670166016, + "learning_rate": 0.001, + "loss": 3.1223, + "step": 3801 + }, + { + "epoch": 0.16084271088924612, + "grad_norm": 0.29678988456726074, + "learning_rate": 0.001, + "loss": 2.2062, + "step": 3802 + }, + { + "epoch": 0.1608850156527625, + "grad_norm": 0.21192920207977295, + "learning_rate": 0.001, + "loss": 2.1279, + "step": 3803 + }, + { + "epoch": 0.16092732041627889, + "grad_norm": 2.0357847213745117, + "learning_rate": 0.001, + "loss": 3.1239, + "step": 3804 + }, + { + "epoch": 0.16096962517979524, + "grad_norm": 0.31351718306541443, + "learning_rate": 0.001, + "loss": 2.176, + "step": 3805 + }, + { + "epoch": 0.16101192994331162, + "grad_norm": 0.3054656684398651, + "learning_rate": 0.001, + "loss": 2.5238, + "step": 3806 + }, + { + "epoch": 0.16105423470682798, + "grad_norm": 0.5278902053833008, + "learning_rate": 0.001, + "loss": 1.6492, + "step": 3807 + }, + { + "epoch": 0.16109653947034436, + "grad_norm": 0.300001323223114, + "learning_rate": 0.001, + "loss": 2.562, + "step": 3808 + }, + { + "epoch": 0.16113884423386074, + "grad_norm": 0.34436389803886414, + "learning_rate": 0.001, + "loss": 3.3102, + "step": 3809 + }, + { + "epoch": 0.1611811489973771, + "grad_norm": 0.33348026871681213, + "learning_rate": 0.001, + "loss": 3.1311, + "step": 3810 + }, + { + "epoch": 0.16122345376089348, + "grad_norm": 0.24410240352153778, + "learning_rate": 0.001, + "loss": 3.0828, + "step": 3811 + }, + { + "epoch": 0.16126575852440986, + "grad_norm": 0.21113960444927216, + "learning_rate": 0.001, + "loss": 1.9945, + "step": 3812 + }, + { + "epoch": 0.1613080632879262, + "grad_norm": 0.2360842376947403, + "learning_rate": 0.001, + "loss": 1.9214, + "step": 3813 + }, + { + "epoch": 0.1613503680514426, + "grad_norm": 0.3132474720478058, + "learning_rate": 0.001, + "loss": 3.3039, + "step": 3814 + }, + { + "epoch": 0.16139267281495898, + "grad_norm": 0.23220689594745636, + "learning_rate": 0.001, + "loss": 2.3121, + "step": 3815 + }, + { + "epoch": 0.16143497757847533, + "grad_norm": 0.3871288001537323, + "learning_rate": 0.001, + "loss": 3.1045, + "step": 3816 + }, + { + "epoch": 0.1614772823419917, + "grad_norm": 0.2573937773704529, + "learning_rate": 0.001, + "loss": 2.2547, + "step": 3817 + }, + { + "epoch": 0.16151958710550807, + "grad_norm": 0.23230381309986115, + "learning_rate": 0.001, + "loss": 1.9308, + "step": 3818 + }, + { + "epoch": 0.16156189186902445, + "grad_norm": 0.2752211093902588, + "learning_rate": 0.001, + "loss": 3.6371, + "step": 3819 + }, + { + "epoch": 0.16160419663254083, + "grad_norm": 0.2206379473209381, + "learning_rate": 0.001, + "loss": 3.6659, + "step": 3820 + }, + { + "epoch": 0.16164650139605719, + "grad_norm": 0.5794355869293213, + "learning_rate": 0.001, + "loss": 2.3093, + "step": 3821 + }, + { + "epoch": 0.16168880615957357, + "grad_norm": 0.23683209717273712, + "learning_rate": 0.001, + "loss": 2.5634, + "step": 3822 + }, + { + "epoch": 0.16173111092308995, + "grad_norm": 0.2306908518075943, + "learning_rate": 0.001, + "loss": 1.9803, + "step": 3823 + }, + { + "epoch": 0.1617734156866063, + "grad_norm": 0.22796887159347534, + "learning_rate": 0.001, + "loss": 2.3285, + "step": 3824 + }, + { + "epoch": 0.16181572045012269, + "grad_norm": 0.27235713601112366, + "learning_rate": 0.001, + "loss": 2.6457, + "step": 3825 + }, + { + "epoch": 0.16185802521363907, + "grad_norm": 1.1337499618530273, + "learning_rate": 0.001, + "loss": 2.7345, + "step": 3826 + }, + { + "epoch": 0.16190032997715542, + "grad_norm": 0.22080039978027344, + "learning_rate": 0.001, + "loss": 2.0913, + "step": 3827 + }, + { + "epoch": 0.1619426347406718, + "grad_norm": 0.24313177168369293, + "learning_rate": 0.001, + "loss": 2.4784, + "step": 3828 + }, + { + "epoch": 0.16198493950418816, + "grad_norm": 0.28508949279785156, + "learning_rate": 0.001, + "loss": 2.8932, + "step": 3829 + }, + { + "epoch": 0.16202724426770454, + "grad_norm": 0.37626540660858154, + "learning_rate": 0.001, + "loss": 2.596, + "step": 3830 + }, + { + "epoch": 0.16206954903122092, + "grad_norm": 0.22318224608898163, + "learning_rate": 0.001, + "loss": 2.2057, + "step": 3831 + }, + { + "epoch": 0.16211185379473728, + "grad_norm": 0.2062063217163086, + "learning_rate": 0.001, + "loss": 1.5958, + "step": 3832 + }, + { + "epoch": 0.16215415855825366, + "grad_norm": 0.2109755128622055, + "learning_rate": 0.001, + "loss": 2.1923, + "step": 3833 + }, + { + "epoch": 0.16219646332177004, + "grad_norm": 0.21446409821510315, + "learning_rate": 0.001, + "loss": 2.1621, + "step": 3834 + }, + { + "epoch": 0.1622387680852864, + "grad_norm": 0.21408231556415558, + "learning_rate": 0.001, + "loss": 2.683, + "step": 3835 + }, + { + "epoch": 0.16228107284880278, + "grad_norm": 0.2532133460044861, + "learning_rate": 0.001, + "loss": 2.2158, + "step": 3836 + }, + { + "epoch": 0.16232337761231916, + "grad_norm": 0.27173325419425964, + "learning_rate": 0.001, + "loss": 2.2614, + "step": 3837 + }, + { + "epoch": 0.1623656823758355, + "grad_norm": 0.2316475659608841, + "learning_rate": 0.001, + "loss": 2.1505, + "step": 3838 + }, + { + "epoch": 0.1624079871393519, + "grad_norm": 0.23597820103168488, + "learning_rate": 0.001, + "loss": 2.2652, + "step": 3839 + }, + { + "epoch": 0.16245029190286828, + "grad_norm": 0.23943273723125458, + "learning_rate": 0.001, + "loss": 2.0347, + "step": 3840 + }, + { + "epoch": 0.16249259666638463, + "grad_norm": 0.8113381266593933, + "learning_rate": 0.001, + "loss": 2.0472, + "step": 3841 + }, + { + "epoch": 0.162534901429901, + "grad_norm": 2.3139660358428955, + "learning_rate": 0.001, + "loss": 1.8397, + "step": 3842 + }, + { + "epoch": 0.16257720619341737, + "grad_norm": 0.42972567677497864, + "learning_rate": 0.001, + "loss": 2.0672, + "step": 3843 + }, + { + "epoch": 0.16261951095693375, + "grad_norm": 0.2802884578704834, + "learning_rate": 0.001, + "loss": 1.9931, + "step": 3844 + }, + { + "epoch": 0.16266181572045013, + "grad_norm": 0.3165445625782013, + "learning_rate": 0.001, + "loss": 2.3039, + "step": 3845 + }, + { + "epoch": 0.16270412048396649, + "grad_norm": 0.5963391661643982, + "learning_rate": 0.001, + "loss": 1.952, + "step": 3846 + }, + { + "epoch": 0.16274642524748287, + "grad_norm": 0.25218188762664795, + "learning_rate": 0.001, + "loss": 2.3469, + "step": 3847 + }, + { + "epoch": 0.16278873001099925, + "grad_norm": 0.31111615896224976, + "learning_rate": 0.001, + "loss": 2.0224, + "step": 3848 + }, + { + "epoch": 0.1628310347745156, + "grad_norm": 0.3019029200077057, + "learning_rate": 0.001, + "loss": 2.4928, + "step": 3849 + }, + { + "epoch": 0.16287333953803199, + "grad_norm": 0.22773532569408417, + "learning_rate": 0.001, + "loss": 2.9932, + "step": 3850 + }, + { + "epoch": 0.16291564430154837, + "grad_norm": 0.2483637034893036, + "learning_rate": 0.001, + "loss": 2.8444, + "step": 3851 + }, + { + "epoch": 0.16295794906506472, + "grad_norm": 0.2056424915790558, + "learning_rate": 0.001, + "loss": 1.9696, + "step": 3852 + }, + { + "epoch": 0.1630002538285811, + "grad_norm": 0.22549600899219513, + "learning_rate": 0.001, + "loss": 1.7296, + "step": 3853 + }, + { + "epoch": 0.16304255859209746, + "grad_norm": 0.2797006368637085, + "learning_rate": 0.001, + "loss": 2.0571, + "step": 3854 + }, + { + "epoch": 0.16308486335561384, + "grad_norm": 0.2678796947002411, + "learning_rate": 0.001, + "loss": 1.9062, + "step": 3855 + }, + { + "epoch": 0.16312716811913022, + "grad_norm": 0.24409295618534088, + "learning_rate": 0.001, + "loss": 2.4397, + "step": 3856 + }, + { + "epoch": 0.16316947288264658, + "grad_norm": 0.25241202116012573, + "learning_rate": 0.001, + "loss": 2.1752, + "step": 3857 + }, + { + "epoch": 0.16321177764616296, + "grad_norm": 0.23605115711688995, + "learning_rate": 0.001, + "loss": 2.9556, + "step": 3858 + }, + { + "epoch": 0.16325408240967934, + "grad_norm": 0.26487863063812256, + "learning_rate": 0.001, + "loss": 1.9874, + "step": 3859 + }, + { + "epoch": 0.1632963871731957, + "grad_norm": 0.24517613649368286, + "learning_rate": 0.001, + "loss": 2.321, + "step": 3860 + }, + { + "epoch": 0.16333869193671208, + "grad_norm": 0.20442011952400208, + "learning_rate": 0.001, + "loss": 1.8018, + "step": 3861 + }, + { + "epoch": 0.16338099670022846, + "grad_norm": 0.24029582738876343, + "learning_rate": 0.001, + "loss": 2.722, + "step": 3862 + }, + { + "epoch": 0.1634233014637448, + "grad_norm": 0.2389087826013565, + "learning_rate": 0.001, + "loss": 1.9895, + "step": 3863 + }, + { + "epoch": 0.1634656062272612, + "grad_norm": 0.2552791237831116, + "learning_rate": 0.001, + "loss": 2.4092, + "step": 3864 + }, + { + "epoch": 0.16350791099077755, + "grad_norm": 0.20444203913211823, + "learning_rate": 0.001, + "loss": 1.8636, + "step": 3865 + }, + { + "epoch": 0.16355021575429393, + "grad_norm": 1.1959320306777954, + "learning_rate": 0.001, + "loss": 2.6259, + "step": 3866 + }, + { + "epoch": 0.1635925205178103, + "grad_norm": 0.20237359404563904, + "learning_rate": 0.001, + "loss": 1.7948, + "step": 3867 + }, + { + "epoch": 0.16363482528132667, + "grad_norm": 0.21189026534557343, + "learning_rate": 0.001, + "loss": 2.6693, + "step": 3868 + }, + { + "epoch": 0.16367713004484305, + "grad_norm": 0.25462326407432556, + "learning_rate": 0.001, + "loss": 1.9829, + "step": 3869 + }, + { + "epoch": 0.16371943480835943, + "grad_norm": 0.24201983213424683, + "learning_rate": 0.001, + "loss": 2.1617, + "step": 3870 + }, + { + "epoch": 0.1637617395718758, + "grad_norm": 0.22964642941951752, + "learning_rate": 0.001, + "loss": 2.9446, + "step": 3871 + }, + { + "epoch": 0.16380404433539217, + "grad_norm": 0.2601517140865326, + "learning_rate": 0.001, + "loss": 2.3314, + "step": 3872 + }, + { + "epoch": 0.16384634909890855, + "grad_norm": 0.19910211861133575, + "learning_rate": 0.001, + "loss": 2.236, + "step": 3873 + }, + { + "epoch": 0.1638886538624249, + "grad_norm": 0.2370014637708664, + "learning_rate": 0.001, + "loss": 2.767, + "step": 3874 + }, + { + "epoch": 0.1639309586259413, + "grad_norm": 0.273014098405838, + "learning_rate": 0.001, + "loss": 2.8735, + "step": 3875 + }, + { + "epoch": 0.16397326338945764, + "grad_norm": 0.2136770635843277, + "learning_rate": 0.001, + "loss": 2.2062, + "step": 3876 + }, + { + "epoch": 0.16401556815297402, + "grad_norm": 0.196532741189003, + "learning_rate": 0.001, + "loss": 2.1683, + "step": 3877 + }, + { + "epoch": 0.1640578729164904, + "grad_norm": 1.7746950387954712, + "learning_rate": 0.001, + "loss": 2.3646, + "step": 3878 + }, + { + "epoch": 0.16410017768000676, + "grad_norm": 0.29451486468315125, + "learning_rate": 0.001, + "loss": 2.2666, + "step": 3879 + }, + { + "epoch": 0.16414248244352314, + "grad_norm": 0.6255790591239929, + "learning_rate": 0.001, + "loss": 2.7286, + "step": 3880 + }, + { + "epoch": 0.16418478720703952, + "grad_norm": 0.49284324049949646, + "learning_rate": 0.001, + "loss": 2.0154, + "step": 3881 + }, + { + "epoch": 0.16422709197055588, + "grad_norm": 0.27464428544044495, + "learning_rate": 0.001, + "loss": 2.347, + "step": 3882 + }, + { + "epoch": 0.16426939673407226, + "grad_norm": 0.2803431451320648, + "learning_rate": 0.001, + "loss": 2.6582, + "step": 3883 + }, + { + "epoch": 0.16431170149758864, + "grad_norm": 0.2514314353466034, + "learning_rate": 0.001, + "loss": 2.6842, + "step": 3884 + }, + { + "epoch": 0.164354006261105, + "grad_norm": 0.30920740962028503, + "learning_rate": 0.001, + "loss": 2.5913, + "step": 3885 + }, + { + "epoch": 0.16439631102462138, + "grad_norm": 0.6570300459861755, + "learning_rate": 0.001, + "loss": 2.4439, + "step": 3886 + }, + { + "epoch": 0.16443861578813773, + "grad_norm": 0.24634666740894318, + "learning_rate": 0.001, + "loss": 2.1094, + "step": 3887 + }, + { + "epoch": 0.16448092055165411, + "grad_norm": 0.2316695600748062, + "learning_rate": 0.001, + "loss": 2.3292, + "step": 3888 + }, + { + "epoch": 0.1645232253151705, + "grad_norm": 0.6251176595687866, + "learning_rate": 0.001, + "loss": 2.9064, + "step": 3889 + }, + { + "epoch": 0.16456553007868685, + "grad_norm": 0.28041884303092957, + "learning_rate": 0.001, + "loss": 2.4278, + "step": 3890 + }, + { + "epoch": 0.16460783484220323, + "grad_norm": 0.6248710751533508, + "learning_rate": 0.001, + "loss": 3.7971, + "step": 3891 + }, + { + "epoch": 0.16465013960571961, + "grad_norm": 0.24960578978061676, + "learning_rate": 0.001, + "loss": 3.1562, + "step": 3892 + }, + { + "epoch": 0.16469244436923597, + "grad_norm": 0.24922244250774384, + "learning_rate": 0.001, + "loss": 3.1838, + "step": 3893 + }, + { + "epoch": 0.16473474913275235, + "grad_norm": 0.21565461158752441, + "learning_rate": 0.001, + "loss": 2.5373, + "step": 3894 + }, + { + "epoch": 0.16477705389626873, + "grad_norm": 0.30223679542541504, + "learning_rate": 0.001, + "loss": 1.9657, + "step": 3895 + }, + { + "epoch": 0.1648193586597851, + "grad_norm": 0.23021796345710754, + "learning_rate": 0.001, + "loss": 2.0286, + "step": 3896 + }, + { + "epoch": 0.16486166342330147, + "grad_norm": 0.22594904899597168, + "learning_rate": 0.001, + "loss": 2.0249, + "step": 3897 + }, + { + "epoch": 0.16490396818681782, + "grad_norm": 0.20173117518424988, + "learning_rate": 0.001, + "loss": 2.8041, + "step": 3898 + }, + { + "epoch": 0.1649462729503342, + "grad_norm": 0.2069264054298401, + "learning_rate": 0.001, + "loss": 1.9167, + "step": 3899 + }, + { + "epoch": 0.1649885777138506, + "grad_norm": 0.4964730739593506, + "learning_rate": 0.001, + "loss": 2.2618, + "step": 3900 + }, + { + "epoch": 0.16503088247736694, + "grad_norm": 0.2436501681804657, + "learning_rate": 0.001, + "loss": 1.9212, + "step": 3901 + }, + { + "epoch": 0.16507318724088332, + "grad_norm": 0.27425655722618103, + "learning_rate": 0.001, + "loss": 2.2874, + "step": 3902 + }, + { + "epoch": 0.1651154920043997, + "grad_norm": 0.18915478885173798, + "learning_rate": 0.001, + "loss": 1.7265, + "step": 3903 + }, + { + "epoch": 0.16515779676791606, + "grad_norm": 0.34025347232818604, + "learning_rate": 0.001, + "loss": 2.0228, + "step": 3904 + }, + { + "epoch": 0.16520010153143244, + "grad_norm": 0.3264094889163971, + "learning_rate": 0.001, + "loss": 2.3233, + "step": 3905 + }, + { + "epoch": 0.16524240629494882, + "grad_norm": 0.2108362913131714, + "learning_rate": 0.001, + "loss": 2.6092, + "step": 3906 + }, + { + "epoch": 0.16528471105846518, + "grad_norm": 0.27986034750938416, + "learning_rate": 0.001, + "loss": 1.6611, + "step": 3907 + }, + { + "epoch": 0.16532701582198156, + "grad_norm": 0.22576811909675598, + "learning_rate": 0.001, + "loss": 1.8204, + "step": 3908 + }, + { + "epoch": 0.16536932058549791, + "grad_norm": 0.5198656916618347, + "learning_rate": 0.001, + "loss": 2.44, + "step": 3909 + }, + { + "epoch": 0.1654116253490143, + "grad_norm": 0.24308837950229645, + "learning_rate": 0.001, + "loss": 2.2157, + "step": 3910 + }, + { + "epoch": 0.16545393011253068, + "grad_norm": 0.2985725402832031, + "learning_rate": 0.001, + "loss": 2.6967, + "step": 3911 + }, + { + "epoch": 0.16549623487604703, + "grad_norm": 0.2176986038684845, + "learning_rate": 0.001, + "loss": 1.9572, + "step": 3912 + }, + { + "epoch": 0.16553853963956341, + "grad_norm": 0.49889811873435974, + "learning_rate": 0.001, + "loss": 2.2016, + "step": 3913 + }, + { + "epoch": 0.1655808444030798, + "grad_norm": 0.23154956102371216, + "learning_rate": 0.001, + "loss": 2.0423, + "step": 3914 + }, + { + "epoch": 0.16562314916659615, + "grad_norm": 0.2753046751022339, + "learning_rate": 0.001, + "loss": 1.9889, + "step": 3915 + }, + { + "epoch": 0.16566545393011253, + "grad_norm": 0.19035251438617706, + "learning_rate": 0.001, + "loss": 2.0711, + "step": 3916 + }, + { + "epoch": 0.16570775869362891, + "grad_norm": 0.23184069991111755, + "learning_rate": 0.001, + "loss": 1.9129, + "step": 3917 + }, + { + "epoch": 0.16575006345714527, + "grad_norm": 0.20346291363239288, + "learning_rate": 0.001, + "loss": 2.411, + "step": 3918 + }, + { + "epoch": 0.16579236822066165, + "grad_norm": 0.24841812252998352, + "learning_rate": 0.001, + "loss": 1.8754, + "step": 3919 + }, + { + "epoch": 0.165834672984178, + "grad_norm": 0.3259766697883606, + "learning_rate": 0.001, + "loss": 2.4933, + "step": 3920 + }, + { + "epoch": 0.1658769777476944, + "grad_norm": 0.27737030386924744, + "learning_rate": 0.001, + "loss": 2.1435, + "step": 3921 + }, + { + "epoch": 0.16591928251121077, + "grad_norm": 1.7625619173049927, + "learning_rate": 0.001, + "loss": 2.05, + "step": 3922 + }, + { + "epoch": 0.16596158727472712, + "grad_norm": 0.24628551304340363, + "learning_rate": 0.001, + "loss": 2.3493, + "step": 3923 + }, + { + "epoch": 0.1660038920382435, + "grad_norm": 0.29456469416618347, + "learning_rate": 0.001, + "loss": 2.2105, + "step": 3924 + }, + { + "epoch": 0.1660461968017599, + "grad_norm": 0.20997002720832825, + "learning_rate": 0.001, + "loss": 1.8444, + "step": 3925 + }, + { + "epoch": 0.16608850156527624, + "grad_norm": 2.3074915409088135, + "learning_rate": 0.001, + "loss": 2.2948, + "step": 3926 + }, + { + "epoch": 0.16613080632879262, + "grad_norm": 0.3753257691860199, + "learning_rate": 0.001, + "loss": 3.4608, + "step": 3927 + }, + { + "epoch": 0.166173111092309, + "grad_norm": 0.32698577642440796, + "learning_rate": 0.001, + "loss": 2.5853, + "step": 3928 + }, + { + "epoch": 0.16621541585582536, + "grad_norm": 0.22866463661193848, + "learning_rate": 0.001, + "loss": 2.7564, + "step": 3929 + }, + { + "epoch": 0.16625772061934174, + "grad_norm": 0.30471017956733704, + "learning_rate": 0.001, + "loss": 3.0827, + "step": 3930 + }, + { + "epoch": 0.1663000253828581, + "grad_norm": 0.2387181669473648, + "learning_rate": 0.001, + "loss": 2.8088, + "step": 3931 + }, + { + "epoch": 0.16634233014637448, + "grad_norm": 0.2087489813566208, + "learning_rate": 0.001, + "loss": 1.9849, + "step": 3932 + }, + { + "epoch": 0.16638463490989086, + "grad_norm": 0.22151561081409454, + "learning_rate": 0.001, + "loss": 2.8208, + "step": 3933 + }, + { + "epoch": 0.16642693967340721, + "grad_norm": 0.21167586743831635, + "learning_rate": 0.001, + "loss": 2.9183, + "step": 3934 + }, + { + "epoch": 0.1664692444369236, + "grad_norm": 0.2525525391101837, + "learning_rate": 0.001, + "loss": 3.3236, + "step": 3935 + }, + { + "epoch": 0.16651154920043998, + "grad_norm": 0.26379460096359253, + "learning_rate": 0.001, + "loss": 2.4491, + "step": 3936 + }, + { + "epoch": 0.16655385396395633, + "grad_norm": 0.2644009590148926, + "learning_rate": 0.001, + "loss": 2.4049, + "step": 3937 + }, + { + "epoch": 0.16659615872747272, + "grad_norm": 9.060093879699707, + "learning_rate": 0.001, + "loss": 2.6395, + "step": 3938 + }, + { + "epoch": 0.1666384634909891, + "grad_norm": 0.22047647833824158, + "learning_rate": 0.001, + "loss": 1.9525, + "step": 3939 + }, + { + "epoch": 0.16668076825450545, + "grad_norm": 0.5126127004623413, + "learning_rate": 0.001, + "loss": 1.6991, + "step": 3940 + }, + { + "epoch": 0.16672307301802183, + "grad_norm": 0.25247883796691895, + "learning_rate": 0.001, + "loss": 2.1926, + "step": 3941 + }, + { + "epoch": 0.1667653777815382, + "grad_norm": 0.25880908966064453, + "learning_rate": 0.001, + "loss": 2.1134, + "step": 3942 + }, + { + "epoch": 0.16680768254505457, + "grad_norm": 0.23852358758449554, + "learning_rate": 0.001, + "loss": 2.3002, + "step": 3943 + }, + { + "epoch": 0.16684998730857095, + "grad_norm": 0.20607604086399078, + "learning_rate": 0.001, + "loss": 1.8629, + "step": 3944 + }, + { + "epoch": 0.1668922920720873, + "grad_norm": 0.279757559299469, + "learning_rate": 0.001, + "loss": 3.2208, + "step": 3945 + }, + { + "epoch": 0.1669345968356037, + "grad_norm": 0.2418821156024933, + "learning_rate": 0.001, + "loss": 2.4583, + "step": 3946 + }, + { + "epoch": 0.16697690159912007, + "grad_norm": 0.19842597842216492, + "learning_rate": 0.001, + "loss": 2.4909, + "step": 3947 + }, + { + "epoch": 0.16701920636263642, + "grad_norm": 0.6748121976852417, + "learning_rate": 0.001, + "loss": 2.0367, + "step": 3948 + }, + { + "epoch": 0.1670615111261528, + "grad_norm": 0.3548758625984192, + "learning_rate": 0.001, + "loss": 2.2625, + "step": 3949 + }, + { + "epoch": 0.1671038158896692, + "grad_norm": 0.23522132635116577, + "learning_rate": 0.001, + "loss": 1.9258, + "step": 3950 + }, + { + "epoch": 0.16714612065318554, + "grad_norm": 1.6471339464187622, + "learning_rate": 0.001, + "loss": 1.9447, + "step": 3951 + }, + { + "epoch": 0.16718842541670192, + "grad_norm": 0.9741820693016052, + "learning_rate": 0.001, + "loss": 2.5284, + "step": 3952 + }, + { + "epoch": 0.16723073018021828, + "grad_norm": 0.2638908624649048, + "learning_rate": 0.001, + "loss": 3.4217, + "step": 3953 + }, + { + "epoch": 0.16727303494373466, + "grad_norm": 0.2375737726688385, + "learning_rate": 0.001, + "loss": 2.1506, + "step": 3954 + }, + { + "epoch": 0.16731533970725104, + "grad_norm": 0.24627238512039185, + "learning_rate": 0.001, + "loss": 2.8165, + "step": 3955 + }, + { + "epoch": 0.1673576444707674, + "grad_norm": 0.5549525618553162, + "learning_rate": 0.001, + "loss": 2.7948, + "step": 3956 + }, + { + "epoch": 0.16739994923428378, + "grad_norm": 9.49412727355957, + "learning_rate": 0.001, + "loss": 2.6595, + "step": 3957 + }, + { + "epoch": 0.16744225399780016, + "grad_norm": 0.2718313932418823, + "learning_rate": 0.001, + "loss": 1.5429, + "step": 3958 + }, + { + "epoch": 0.16748455876131652, + "grad_norm": 0.2949593663215637, + "learning_rate": 0.001, + "loss": 2.1059, + "step": 3959 + }, + { + "epoch": 0.1675268635248329, + "grad_norm": 0.4094192385673523, + "learning_rate": 0.001, + "loss": 2.1578, + "step": 3960 + }, + { + "epoch": 0.16756916828834928, + "grad_norm": 0.23077575862407684, + "learning_rate": 0.001, + "loss": 1.7893, + "step": 3961 + }, + { + "epoch": 0.16761147305186563, + "grad_norm": 3.148542881011963, + "learning_rate": 0.001, + "loss": 2.0162, + "step": 3962 + }, + { + "epoch": 0.16765377781538202, + "grad_norm": 0.2932436764240265, + "learning_rate": 0.001, + "loss": 3.187, + "step": 3963 + }, + { + "epoch": 0.1676960825788984, + "grad_norm": 4.55126953125, + "learning_rate": 0.001, + "loss": 4.0645, + "step": 3964 + }, + { + "epoch": 0.16773838734241475, + "grad_norm": 0.31805071234703064, + "learning_rate": 0.001, + "loss": 1.9577, + "step": 3965 + }, + { + "epoch": 0.16778069210593113, + "grad_norm": 0.6054160594940186, + "learning_rate": 0.001, + "loss": 2.4387, + "step": 3966 + }, + { + "epoch": 0.1678229968694475, + "grad_norm": 0.34546828269958496, + "learning_rate": 0.001, + "loss": 1.7153, + "step": 3967 + }, + { + "epoch": 0.16786530163296387, + "grad_norm": 0.29277992248535156, + "learning_rate": 0.001, + "loss": 2.0135, + "step": 3968 + }, + { + "epoch": 0.16790760639648025, + "grad_norm": 0.25595077872276306, + "learning_rate": 0.001, + "loss": 3.5568, + "step": 3969 + }, + { + "epoch": 0.1679499111599966, + "grad_norm": 0.38943490386009216, + "learning_rate": 0.001, + "loss": 2.8329, + "step": 3970 + }, + { + "epoch": 0.167992215923513, + "grad_norm": 0.23987014591693878, + "learning_rate": 0.001, + "loss": 2.4131, + "step": 3971 + }, + { + "epoch": 0.16803452068702937, + "grad_norm": 0.3031393587589264, + "learning_rate": 0.001, + "loss": 2.5168, + "step": 3972 + }, + { + "epoch": 0.16807682545054572, + "grad_norm": 0.27220430970191956, + "learning_rate": 0.001, + "loss": 3.0367, + "step": 3973 + }, + { + "epoch": 0.1681191302140621, + "grad_norm": 0.3574322462081909, + "learning_rate": 0.001, + "loss": 1.6484, + "step": 3974 + }, + { + "epoch": 0.1681614349775785, + "grad_norm": 0.6500173211097717, + "learning_rate": 0.001, + "loss": 2.2295, + "step": 3975 + }, + { + "epoch": 0.16820373974109484, + "grad_norm": 0.3387952744960785, + "learning_rate": 0.001, + "loss": 2.0314, + "step": 3976 + }, + { + "epoch": 0.16824604450461123, + "grad_norm": 0.21707701683044434, + "learning_rate": 0.001, + "loss": 2.2662, + "step": 3977 + }, + { + "epoch": 0.16828834926812758, + "grad_norm": 0.2807854115962982, + "learning_rate": 0.001, + "loss": 3.0303, + "step": 3978 + }, + { + "epoch": 0.16833065403164396, + "grad_norm": 0.19229240715503693, + "learning_rate": 0.001, + "loss": 2.87, + "step": 3979 + }, + { + "epoch": 0.16837295879516034, + "grad_norm": 0.24821369349956512, + "learning_rate": 0.001, + "loss": 3.3929, + "step": 3980 + }, + { + "epoch": 0.1684152635586767, + "grad_norm": 0.2526116967201233, + "learning_rate": 0.001, + "loss": 2.3234, + "step": 3981 + }, + { + "epoch": 0.16845756832219308, + "grad_norm": 0.3005029857158661, + "learning_rate": 0.001, + "loss": 1.7588, + "step": 3982 + }, + { + "epoch": 0.16849987308570946, + "grad_norm": 2.6109414100646973, + "learning_rate": 0.001, + "loss": 2.1013, + "step": 3983 + }, + { + "epoch": 0.16854217784922582, + "grad_norm": 0.605918824672699, + "learning_rate": 0.001, + "loss": 3.1133, + "step": 3984 + }, + { + "epoch": 0.1685844826127422, + "grad_norm": 0.21746625006198883, + "learning_rate": 0.001, + "loss": 2.9431, + "step": 3985 + }, + { + "epoch": 0.16862678737625858, + "grad_norm": 0.260303795337677, + "learning_rate": 0.001, + "loss": 2.0652, + "step": 3986 + }, + { + "epoch": 0.16866909213977493, + "grad_norm": 0.2302677035331726, + "learning_rate": 0.001, + "loss": 3.3315, + "step": 3987 + }, + { + "epoch": 0.16871139690329132, + "grad_norm": 0.34791648387908936, + "learning_rate": 0.001, + "loss": 2.44, + "step": 3988 + }, + { + "epoch": 0.16875370166680767, + "grad_norm": 1.0129145383834839, + "learning_rate": 0.001, + "loss": 1.8525, + "step": 3989 + }, + { + "epoch": 0.16879600643032405, + "grad_norm": 0.2503008246421814, + "learning_rate": 0.001, + "loss": 1.666, + "step": 3990 + }, + { + "epoch": 0.16883831119384043, + "grad_norm": 0.2816570997238159, + "learning_rate": 0.001, + "loss": 2.4679, + "step": 3991 + }, + { + "epoch": 0.1688806159573568, + "grad_norm": 0.24317888915538788, + "learning_rate": 0.001, + "loss": 2.6195, + "step": 3992 + }, + { + "epoch": 0.16892292072087317, + "grad_norm": 0.3003460466861725, + "learning_rate": 0.001, + "loss": 2.6692, + "step": 3993 + }, + { + "epoch": 0.16896522548438955, + "grad_norm": 0.2870309054851532, + "learning_rate": 0.001, + "loss": 2.37, + "step": 3994 + }, + { + "epoch": 0.1690075302479059, + "grad_norm": 0.35769209265708923, + "learning_rate": 0.001, + "loss": 2.6095, + "step": 3995 + }, + { + "epoch": 0.1690498350114223, + "grad_norm": 0.647505521774292, + "learning_rate": 0.001, + "loss": 2.4609, + "step": 3996 + }, + { + "epoch": 0.16909213977493867, + "grad_norm": 0.2910658121109009, + "learning_rate": 0.001, + "loss": 2.1234, + "step": 3997 + }, + { + "epoch": 0.16913444453845503, + "grad_norm": 0.4279431402683258, + "learning_rate": 0.001, + "loss": 2.5119, + "step": 3998 + }, + { + "epoch": 0.1691767493019714, + "grad_norm": 0.7346844673156738, + "learning_rate": 0.001, + "loss": 2.1146, + "step": 3999 + }, + { + "epoch": 0.16921905406548776, + "grad_norm": 0.24350988864898682, + "learning_rate": 0.001, + "loss": 2.28, + "step": 4000 + }, + { + "epoch": 0.16926135882900414, + "grad_norm": 0.2387346774339676, + "learning_rate": 0.001, + "loss": 2.8811, + "step": 4001 + }, + { + "epoch": 0.16930366359252053, + "grad_norm": 0.7333387732505798, + "learning_rate": 0.001, + "loss": 2.0075, + "step": 4002 + }, + { + "epoch": 0.16934596835603688, + "grad_norm": 0.289885938167572, + "learning_rate": 0.001, + "loss": 1.9186, + "step": 4003 + }, + { + "epoch": 0.16938827311955326, + "grad_norm": 0.3716363310813904, + "learning_rate": 0.001, + "loss": 3.237, + "step": 4004 + }, + { + "epoch": 0.16943057788306964, + "grad_norm": 0.29184696078300476, + "learning_rate": 0.001, + "loss": 1.9219, + "step": 4005 + }, + { + "epoch": 0.169472882646586, + "grad_norm": 2.09316349029541, + "learning_rate": 0.001, + "loss": 2.8952, + "step": 4006 + }, + { + "epoch": 0.16951518741010238, + "grad_norm": 0.30977028608322144, + "learning_rate": 0.001, + "loss": 1.6384, + "step": 4007 + }, + { + "epoch": 0.16955749217361876, + "grad_norm": 0.2913471460342407, + "learning_rate": 0.001, + "loss": 2.3069, + "step": 4008 + }, + { + "epoch": 0.16959979693713512, + "grad_norm": 0.2933041751384735, + "learning_rate": 0.001, + "loss": 2.115, + "step": 4009 + }, + { + "epoch": 0.1696421017006515, + "grad_norm": 0.29783979058265686, + "learning_rate": 0.001, + "loss": 3.0471, + "step": 4010 + }, + { + "epoch": 0.16968440646416785, + "grad_norm": 0.46605968475341797, + "learning_rate": 0.001, + "loss": 2.4636, + "step": 4011 + }, + { + "epoch": 0.16972671122768423, + "grad_norm": 0.346618115901947, + "learning_rate": 0.001, + "loss": 2.6208, + "step": 4012 + }, + { + "epoch": 0.16976901599120062, + "grad_norm": 0.279699444770813, + "learning_rate": 0.001, + "loss": 2.4847, + "step": 4013 + }, + { + "epoch": 0.16981132075471697, + "grad_norm": 0.5628467798233032, + "learning_rate": 0.001, + "loss": 2.4294, + "step": 4014 + }, + { + "epoch": 0.16985362551823335, + "grad_norm": 0.30765336751937866, + "learning_rate": 0.001, + "loss": 2.5887, + "step": 4015 + }, + { + "epoch": 0.16989593028174974, + "grad_norm": 0.43252626061439514, + "learning_rate": 0.001, + "loss": 2.6351, + "step": 4016 + }, + { + "epoch": 0.1699382350452661, + "grad_norm": 0.7735976576805115, + "learning_rate": 0.001, + "loss": 2.4618, + "step": 4017 + }, + { + "epoch": 0.16998053980878247, + "grad_norm": 0.4775107204914093, + "learning_rate": 0.001, + "loss": 2.93, + "step": 4018 + }, + { + "epoch": 0.17002284457229885, + "grad_norm": 0.2542901933193207, + "learning_rate": 0.001, + "loss": 1.9901, + "step": 4019 + }, + { + "epoch": 0.1700651493358152, + "grad_norm": 0.7986733317375183, + "learning_rate": 0.001, + "loss": 1.8875, + "step": 4020 + }, + { + "epoch": 0.1701074540993316, + "grad_norm": 0.24276584386825562, + "learning_rate": 0.001, + "loss": 2.8431, + "step": 4021 + }, + { + "epoch": 0.17014975886284794, + "grad_norm": 0.29002276062965393, + "learning_rate": 0.001, + "loss": 2.1358, + "step": 4022 + }, + { + "epoch": 0.17019206362636433, + "grad_norm": 0.7131394743919373, + "learning_rate": 0.001, + "loss": 2.1005, + "step": 4023 + }, + { + "epoch": 0.1702343683898807, + "grad_norm": 0.22828415036201477, + "learning_rate": 0.001, + "loss": 1.7578, + "step": 4024 + }, + { + "epoch": 0.17027667315339706, + "grad_norm": 0.2842770218849182, + "learning_rate": 0.001, + "loss": 2.5149, + "step": 4025 + }, + { + "epoch": 0.17031897791691344, + "grad_norm": 0.24497929215431213, + "learning_rate": 0.001, + "loss": 2.7276, + "step": 4026 + }, + { + "epoch": 0.17036128268042983, + "grad_norm": 0.2938736379146576, + "learning_rate": 0.001, + "loss": 3.0422, + "step": 4027 + }, + { + "epoch": 0.17040358744394618, + "grad_norm": 0.2794887125492096, + "learning_rate": 0.001, + "loss": 1.9868, + "step": 4028 + }, + { + "epoch": 0.17044589220746256, + "grad_norm": 0.23071792721748352, + "learning_rate": 0.001, + "loss": 3.0668, + "step": 4029 + }, + { + "epoch": 0.17048819697097894, + "grad_norm": 0.2662751376628876, + "learning_rate": 0.001, + "loss": 2.5487, + "step": 4030 + }, + { + "epoch": 0.1705305017344953, + "grad_norm": 0.21133390069007874, + "learning_rate": 0.001, + "loss": 2.2983, + "step": 4031 + }, + { + "epoch": 0.17057280649801168, + "grad_norm": 0.22290173172950745, + "learning_rate": 0.001, + "loss": 2.4885, + "step": 4032 + }, + { + "epoch": 0.17061511126152804, + "grad_norm": 0.29108884930610657, + "learning_rate": 0.001, + "loss": 1.848, + "step": 4033 + }, + { + "epoch": 0.17065741602504442, + "grad_norm": 0.28918585181236267, + "learning_rate": 0.001, + "loss": 2.7061, + "step": 4034 + }, + { + "epoch": 0.1706997207885608, + "grad_norm": 1.4359923601150513, + "learning_rate": 0.001, + "loss": 2.7202, + "step": 4035 + }, + { + "epoch": 0.17074202555207715, + "grad_norm": 0.6065989136695862, + "learning_rate": 0.001, + "loss": 2.978, + "step": 4036 + }, + { + "epoch": 0.17078433031559354, + "grad_norm": 1.2570323944091797, + "learning_rate": 0.001, + "loss": 2.1911, + "step": 4037 + }, + { + "epoch": 0.17082663507910992, + "grad_norm": 0.2336377501487732, + "learning_rate": 0.001, + "loss": 2.2424, + "step": 4038 + }, + { + "epoch": 0.17086893984262627, + "grad_norm": 3.624565601348877, + "learning_rate": 0.001, + "loss": 1.9867, + "step": 4039 + }, + { + "epoch": 0.17091124460614265, + "grad_norm": 0.21247118711471558, + "learning_rate": 0.001, + "loss": 2.1356, + "step": 4040 + }, + { + "epoch": 0.17095354936965904, + "grad_norm": 0.7525285482406616, + "learning_rate": 0.001, + "loss": 2.7077, + "step": 4041 + }, + { + "epoch": 0.1709958541331754, + "grad_norm": 0.3459639549255371, + "learning_rate": 0.001, + "loss": 2.9617, + "step": 4042 + }, + { + "epoch": 0.17103815889669177, + "grad_norm": 0.28101351857185364, + "learning_rate": 0.001, + "loss": 2.4219, + "step": 4043 + }, + { + "epoch": 0.17108046366020813, + "grad_norm": 0.262946218252182, + "learning_rate": 0.001, + "loss": 2.1022, + "step": 4044 + }, + { + "epoch": 0.1711227684237245, + "grad_norm": 134.08250427246094, + "learning_rate": 0.001, + "loss": 2.2692, + "step": 4045 + }, + { + "epoch": 0.1711650731872409, + "grad_norm": 0.24002955853939056, + "learning_rate": 0.001, + "loss": 2.385, + "step": 4046 + }, + { + "epoch": 0.17120737795075724, + "grad_norm": 0.29694628715515137, + "learning_rate": 0.001, + "loss": 2.6118, + "step": 4047 + }, + { + "epoch": 0.17124968271427363, + "grad_norm": 12.005644798278809, + "learning_rate": 0.001, + "loss": 2.6029, + "step": 4048 + }, + { + "epoch": 0.17129198747779, + "grad_norm": 0.2015325427055359, + "learning_rate": 0.001, + "loss": 1.6005, + "step": 4049 + }, + { + "epoch": 0.17133429224130636, + "grad_norm": 0.18817874789237976, + "learning_rate": 0.001, + "loss": 2.2318, + "step": 4050 + }, + { + "epoch": 0.17137659700482274, + "grad_norm": 0.22923152148723602, + "learning_rate": 0.001, + "loss": 1.8762, + "step": 4051 + }, + { + "epoch": 0.17141890176833913, + "grad_norm": 0.2749441862106323, + "learning_rate": 0.001, + "loss": 2.4774, + "step": 4052 + }, + { + "epoch": 0.17146120653185548, + "grad_norm": 0.23520220816135406, + "learning_rate": 0.001, + "loss": 1.9863, + "step": 4053 + }, + { + "epoch": 0.17150351129537186, + "grad_norm": 0.21112807095050812, + "learning_rate": 0.001, + "loss": 1.7663, + "step": 4054 + }, + { + "epoch": 0.17154581605888822, + "grad_norm": 0.23605698347091675, + "learning_rate": 0.001, + "loss": 2.1954, + "step": 4055 + }, + { + "epoch": 0.1715881208224046, + "grad_norm": 0.19001781940460205, + "learning_rate": 0.001, + "loss": 2.1111, + "step": 4056 + }, + { + "epoch": 0.17163042558592098, + "grad_norm": 0.34026920795440674, + "learning_rate": 0.001, + "loss": 2.5831, + "step": 4057 + }, + { + "epoch": 0.17167273034943734, + "grad_norm": 3.0105605125427246, + "learning_rate": 0.001, + "loss": 3.2851, + "step": 4058 + }, + { + "epoch": 0.17171503511295372, + "grad_norm": 0.2173168957233429, + "learning_rate": 0.001, + "loss": 1.7838, + "step": 4059 + }, + { + "epoch": 0.1717573398764701, + "grad_norm": 0.23681773245334625, + "learning_rate": 0.001, + "loss": 2.5279, + "step": 4060 + }, + { + "epoch": 0.17179964463998645, + "grad_norm": 0.4054577648639679, + "learning_rate": 0.001, + "loss": 1.787, + "step": 4061 + }, + { + "epoch": 0.17184194940350284, + "grad_norm": 0.6441399455070496, + "learning_rate": 0.001, + "loss": 2.2311, + "step": 4062 + }, + { + "epoch": 0.17188425416701922, + "grad_norm": 1.3562344312667847, + "learning_rate": 0.001, + "loss": 2.7153, + "step": 4063 + }, + { + "epoch": 0.17192655893053557, + "grad_norm": 0.26166918873786926, + "learning_rate": 0.001, + "loss": 3.0784, + "step": 4064 + }, + { + "epoch": 0.17196886369405195, + "grad_norm": 0.22632955014705658, + "learning_rate": 0.001, + "loss": 1.6698, + "step": 4065 + }, + { + "epoch": 0.1720111684575683, + "grad_norm": 0.7687202095985413, + "learning_rate": 0.001, + "loss": 2.1554, + "step": 4066 + }, + { + "epoch": 0.1720534732210847, + "grad_norm": 0.6964684128761292, + "learning_rate": 0.001, + "loss": 2.7002, + "step": 4067 + }, + { + "epoch": 0.17209577798460107, + "grad_norm": 0.4354396164417267, + "learning_rate": 0.001, + "loss": 3.5678, + "step": 4068 + }, + { + "epoch": 0.17213808274811743, + "grad_norm": 0.2912690341472626, + "learning_rate": 0.001, + "loss": 2.0832, + "step": 4069 + }, + { + "epoch": 0.1721803875116338, + "grad_norm": 1.5569998025894165, + "learning_rate": 0.001, + "loss": 2.1359, + "step": 4070 + }, + { + "epoch": 0.1722226922751502, + "grad_norm": 0.893632709980011, + "learning_rate": 0.001, + "loss": 3.0861, + "step": 4071 + }, + { + "epoch": 0.17226499703866655, + "grad_norm": 0.7286151051521301, + "learning_rate": 0.001, + "loss": 2.3538, + "step": 4072 + }, + { + "epoch": 0.17230730180218293, + "grad_norm": 0.29030993580818176, + "learning_rate": 0.001, + "loss": 2.4437, + "step": 4073 + }, + { + "epoch": 0.1723496065656993, + "grad_norm": 0.28612467646598816, + "learning_rate": 0.001, + "loss": 2.9171, + "step": 4074 + }, + { + "epoch": 0.17239191132921566, + "grad_norm": 3.892707109451294, + "learning_rate": 0.001, + "loss": 2.1065, + "step": 4075 + }, + { + "epoch": 0.17243421609273205, + "grad_norm": 0.24053117632865906, + "learning_rate": 0.001, + "loss": 2.6832, + "step": 4076 + }, + { + "epoch": 0.1724765208562484, + "grad_norm": 0.2183643877506256, + "learning_rate": 0.001, + "loss": 3.0531, + "step": 4077 + }, + { + "epoch": 0.17251882561976478, + "grad_norm": 1.0080100297927856, + "learning_rate": 0.001, + "loss": 2.9435, + "step": 4078 + }, + { + "epoch": 0.17256113038328116, + "grad_norm": 0.3189499080181122, + "learning_rate": 0.001, + "loss": 2.0975, + "step": 4079 + }, + { + "epoch": 0.17260343514679752, + "grad_norm": 0.2897562086582184, + "learning_rate": 0.001, + "loss": 2.5301, + "step": 4080 + }, + { + "epoch": 0.1726457399103139, + "grad_norm": 0.2769043743610382, + "learning_rate": 0.001, + "loss": 1.8059, + "step": 4081 + }, + { + "epoch": 0.17268804467383028, + "grad_norm": 0.8298352360725403, + "learning_rate": 0.001, + "loss": 2.0551, + "step": 4082 + }, + { + "epoch": 0.17273034943734664, + "grad_norm": 0.26442840695381165, + "learning_rate": 0.001, + "loss": 1.9421, + "step": 4083 + }, + { + "epoch": 0.17277265420086302, + "grad_norm": 0.27219903469085693, + "learning_rate": 0.001, + "loss": 2.6075, + "step": 4084 + }, + { + "epoch": 0.1728149589643794, + "grad_norm": 0.21034714579582214, + "learning_rate": 0.001, + "loss": 2.0325, + "step": 4085 + }, + { + "epoch": 0.17285726372789575, + "grad_norm": 3.3212363719940186, + "learning_rate": 0.001, + "loss": 1.9906, + "step": 4086 + }, + { + "epoch": 0.17289956849141214, + "grad_norm": 0.9745553135871887, + "learning_rate": 0.001, + "loss": 2.8193, + "step": 4087 + }, + { + "epoch": 0.17294187325492852, + "grad_norm": 0.2681751847267151, + "learning_rate": 0.001, + "loss": 2.3555, + "step": 4088 + }, + { + "epoch": 0.17298417801844487, + "grad_norm": 0.31693801283836365, + "learning_rate": 0.001, + "loss": 2.0349, + "step": 4089 + }, + { + "epoch": 0.17302648278196125, + "grad_norm": 0.32253557443618774, + "learning_rate": 0.001, + "loss": 1.9938, + "step": 4090 + }, + { + "epoch": 0.1730687875454776, + "grad_norm": 0.25818389654159546, + "learning_rate": 0.001, + "loss": 2.6496, + "step": 4091 + }, + { + "epoch": 0.173111092308994, + "grad_norm": 0.24890708923339844, + "learning_rate": 0.001, + "loss": 1.7513, + "step": 4092 + }, + { + "epoch": 0.17315339707251037, + "grad_norm": 0.22379307448863983, + "learning_rate": 0.001, + "loss": 1.7251, + "step": 4093 + }, + { + "epoch": 0.17319570183602673, + "grad_norm": 0.24759642779827118, + "learning_rate": 0.001, + "loss": 3.0331, + "step": 4094 + }, + { + "epoch": 0.1732380065995431, + "grad_norm": 0.3050745129585266, + "learning_rate": 0.001, + "loss": 3.0863, + "step": 4095 + }, + { + "epoch": 0.1732803113630595, + "grad_norm": 0.29657745361328125, + "learning_rate": 0.001, + "loss": 3.5831, + "step": 4096 + }, + { + "epoch": 0.17332261612657585, + "grad_norm": 0.254731684923172, + "learning_rate": 0.001, + "loss": 1.9884, + "step": 4097 + }, + { + "epoch": 0.17336492089009223, + "grad_norm": 0.24810083210468292, + "learning_rate": 0.001, + "loss": 1.6098, + "step": 4098 + }, + { + "epoch": 0.1734072256536086, + "grad_norm": 0.7149272561073303, + "learning_rate": 0.001, + "loss": 1.6882, + "step": 4099 + }, + { + "epoch": 0.17344953041712496, + "grad_norm": 0.34439778327941895, + "learning_rate": 0.001, + "loss": 3.468, + "step": 4100 + }, + { + "epoch": 0.17349183518064135, + "grad_norm": 0.27477285265922546, + "learning_rate": 0.001, + "loss": 3.7453, + "step": 4101 + }, + { + "epoch": 0.1735341399441577, + "grad_norm": 0.2810986042022705, + "learning_rate": 0.001, + "loss": 2.4957, + "step": 4102 + }, + { + "epoch": 0.17357644470767408, + "grad_norm": 0.24247132241725922, + "learning_rate": 0.001, + "loss": 1.6966, + "step": 4103 + }, + { + "epoch": 0.17361874947119046, + "grad_norm": 0.1760856807231903, + "learning_rate": 0.001, + "loss": 2.591, + "step": 4104 + }, + { + "epoch": 0.17366105423470682, + "grad_norm": 0.25415733456611633, + "learning_rate": 0.001, + "loss": 2.1011, + "step": 4105 + }, + { + "epoch": 0.1737033589982232, + "grad_norm": 2.5164103507995605, + "learning_rate": 0.001, + "loss": 2.035, + "step": 4106 + }, + { + "epoch": 0.17374566376173958, + "grad_norm": 0.5804241895675659, + "learning_rate": 0.001, + "loss": 2.2759, + "step": 4107 + }, + { + "epoch": 0.17378796852525594, + "grad_norm": 0.24741019308567047, + "learning_rate": 0.001, + "loss": 2.2455, + "step": 4108 + }, + { + "epoch": 0.17383027328877232, + "grad_norm": 0.230664923787117, + "learning_rate": 0.001, + "loss": 1.8416, + "step": 4109 + }, + { + "epoch": 0.1738725780522887, + "grad_norm": 0.3088621497154236, + "learning_rate": 0.001, + "loss": 2.0536, + "step": 4110 + }, + { + "epoch": 0.17391488281580506, + "grad_norm": 0.21473775804042816, + "learning_rate": 0.001, + "loss": 1.4643, + "step": 4111 + }, + { + "epoch": 0.17395718757932144, + "grad_norm": 0.23546308279037476, + "learning_rate": 0.001, + "loss": 2.4978, + "step": 4112 + }, + { + "epoch": 0.1739994923428378, + "grad_norm": 0.24598172307014465, + "learning_rate": 0.001, + "loss": 3.8651, + "step": 4113 + }, + { + "epoch": 0.17404179710635417, + "grad_norm": 0.24466127157211304, + "learning_rate": 0.001, + "loss": 3.8869, + "step": 4114 + }, + { + "epoch": 0.17408410186987056, + "grad_norm": 0.21379978954792023, + "learning_rate": 0.001, + "loss": 3.1837, + "step": 4115 + }, + { + "epoch": 0.1741264066333869, + "grad_norm": 3.490262031555176, + "learning_rate": 0.001, + "loss": 1.6441, + "step": 4116 + }, + { + "epoch": 0.1741687113969033, + "grad_norm": 0.30612412095069885, + "learning_rate": 0.001, + "loss": 2.0432, + "step": 4117 + }, + { + "epoch": 0.17421101616041967, + "grad_norm": 0.32725802063941956, + "learning_rate": 0.001, + "loss": 2.1969, + "step": 4118 + }, + { + "epoch": 0.17425332092393603, + "grad_norm": 0.34883803129196167, + "learning_rate": 0.001, + "loss": 2.9586, + "step": 4119 + }, + { + "epoch": 0.1742956256874524, + "grad_norm": 0.2720988690853119, + "learning_rate": 0.001, + "loss": 2.6285, + "step": 4120 + }, + { + "epoch": 0.1743379304509688, + "grad_norm": 0.47883912920951843, + "learning_rate": 0.001, + "loss": 2.1746, + "step": 4121 + }, + { + "epoch": 0.17438023521448515, + "grad_norm": 0.3157065212726593, + "learning_rate": 0.001, + "loss": 2.5907, + "step": 4122 + }, + { + "epoch": 0.17442253997800153, + "grad_norm": 0.22530221939086914, + "learning_rate": 0.001, + "loss": 1.8267, + "step": 4123 + }, + { + "epoch": 0.17446484474151788, + "grad_norm": 0.6682106852531433, + "learning_rate": 0.001, + "loss": 1.8648, + "step": 4124 + }, + { + "epoch": 0.17450714950503426, + "grad_norm": 0.24948126077651978, + "learning_rate": 0.001, + "loss": 3.0028, + "step": 4125 + }, + { + "epoch": 0.17454945426855065, + "grad_norm": 0.23554354906082153, + "learning_rate": 0.001, + "loss": 2.8481, + "step": 4126 + }, + { + "epoch": 0.174591759032067, + "grad_norm": 0.2798170745372772, + "learning_rate": 0.001, + "loss": 2.5993, + "step": 4127 + }, + { + "epoch": 0.17463406379558338, + "grad_norm": 0.1892385184764862, + "learning_rate": 0.001, + "loss": 2.373, + "step": 4128 + }, + { + "epoch": 0.17467636855909977, + "grad_norm": 4.151578426361084, + "learning_rate": 0.001, + "loss": 2.2752, + "step": 4129 + }, + { + "epoch": 0.17471867332261612, + "grad_norm": 0.2719863951206207, + "learning_rate": 0.001, + "loss": 2.4036, + "step": 4130 + }, + { + "epoch": 0.1747609780861325, + "grad_norm": 0.27087652683258057, + "learning_rate": 0.001, + "loss": 2.1397, + "step": 4131 + }, + { + "epoch": 0.17480328284964888, + "grad_norm": 0.27612924575805664, + "learning_rate": 0.001, + "loss": 3.2754, + "step": 4132 + }, + { + "epoch": 0.17484558761316524, + "grad_norm": 0.32560476660728455, + "learning_rate": 0.001, + "loss": 2.3888, + "step": 4133 + }, + { + "epoch": 0.17488789237668162, + "grad_norm": 0.5404102802276611, + "learning_rate": 0.001, + "loss": 2.7755, + "step": 4134 + }, + { + "epoch": 0.17493019714019797, + "grad_norm": 0.28054457902908325, + "learning_rate": 0.001, + "loss": 2.4661, + "step": 4135 + }, + { + "epoch": 0.17497250190371436, + "grad_norm": 7.133134365081787, + "learning_rate": 0.001, + "loss": 2.3996, + "step": 4136 + }, + { + "epoch": 0.17501480666723074, + "grad_norm": 0.4292699098587036, + "learning_rate": 0.001, + "loss": 2.2355, + "step": 4137 + }, + { + "epoch": 0.1750571114307471, + "grad_norm": 0.2766590416431427, + "learning_rate": 0.001, + "loss": 2.556, + "step": 4138 + }, + { + "epoch": 0.17509941619426347, + "grad_norm": 0.26670682430267334, + "learning_rate": 0.001, + "loss": 2.2956, + "step": 4139 + }, + { + "epoch": 0.17514172095777986, + "grad_norm": 0.2878996729850769, + "learning_rate": 0.001, + "loss": 2.3641, + "step": 4140 + }, + { + "epoch": 0.1751840257212962, + "grad_norm": 0.4456641376018524, + "learning_rate": 0.001, + "loss": 2.4915, + "step": 4141 + }, + { + "epoch": 0.1752263304848126, + "grad_norm": 0.28615471720695496, + "learning_rate": 0.001, + "loss": 2.2347, + "step": 4142 + }, + { + "epoch": 0.17526863524832897, + "grad_norm": 0.28259655833244324, + "learning_rate": 0.001, + "loss": 2.1569, + "step": 4143 + }, + { + "epoch": 0.17531094001184533, + "grad_norm": 0.22979553043842316, + "learning_rate": 0.001, + "loss": 2.1944, + "step": 4144 + }, + { + "epoch": 0.1753532447753617, + "grad_norm": 0.6153604984283447, + "learning_rate": 0.001, + "loss": 2.1908, + "step": 4145 + }, + { + "epoch": 0.17539554953887806, + "grad_norm": 1.9100154638290405, + "learning_rate": 0.001, + "loss": 2.154, + "step": 4146 + }, + { + "epoch": 0.17543785430239445, + "grad_norm": 0.23111866414546967, + "learning_rate": 0.001, + "loss": 1.4795, + "step": 4147 + }, + { + "epoch": 0.17548015906591083, + "grad_norm": 0.21934235095977783, + "learning_rate": 0.001, + "loss": 2.0412, + "step": 4148 + }, + { + "epoch": 0.17552246382942718, + "grad_norm": 0.31571948528289795, + "learning_rate": 0.001, + "loss": 1.7655, + "step": 4149 + }, + { + "epoch": 0.17556476859294357, + "grad_norm": 0.35099247097969055, + "learning_rate": 0.001, + "loss": 2.8888, + "step": 4150 + }, + { + "epoch": 0.17560707335645995, + "grad_norm": 0.25058233737945557, + "learning_rate": 0.001, + "loss": 1.6825, + "step": 4151 + }, + { + "epoch": 0.1756493781199763, + "grad_norm": 0.35002222657203674, + "learning_rate": 0.001, + "loss": 2.3041, + "step": 4152 + }, + { + "epoch": 0.17569168288349268, + "grad_norm": 0.2953129708766937, + "learning_rate": 0.001, + "loss": 2.1666, + "step": 4153 + }, + { + "epoch": 0.17573398764700907, + "grad_norm": 0.2621290981769562, + "learning_rate": 0.001, + "loss": 3.3843, + "step": 4154 + }, + { + "epoch": 0.17577629241052542, + "grad_norm": 0.2586938738822937, + "learning_rate": 0.001, + "loss": 1.8781, + "step": 4155 + }, + { + "epoch": 0.1758185971740418, + "grad_norm": 0.29512864351272583, + "learning_rate": 0.001, + "loss": 2.2121, + "step": 4156 + }, + { + "epoch": 0.17586090193755816, + "grad_norm": 0.2333596795797348, + "learning_rate": 0.001, + "loss": 2.6819, + "step": 4157 + }, + { + "epoch": 0.17590320670107454, + "grad_norm": 0.2719203531742096, + "learning_rate": 0.001, + "loss": 2.4844, + "step": 4158 + }, + { + "epoch": 0.17594551146459092, + "grad_norm": 0.9604827761650085, + "learning_rate": 0.001, + "loss": 2.4097, + "step": 4159 + }, + { + "epoch": 0.17598781622810727, + "grad_norm": 0.29836225509643555, + "learning_rate": 0.001, + "loss": 1.9671, + "step": 4160 + }, + { + "epoch": 0.17603012099162366, + "grad_norm": 0.2874457538127899, + "learning_rate": 0.001, + "loss": 2.8141, + "step": 4161 + }, + { + "epoch": 0.17607242575514004, + "grad_norm": 0.3310123085975647, + "learning_rate": 0.001, + "loss": 2.1315, + "step": 4162 + }, + { + "epoch": 0.1761147305186564, + "grad_norm": 0.26700732111930847, + "learning_rate": 0.001, + "loss": 2.7584, + "step": 4163 + }, + { + "epoch": 0.17615703528217277, + "grad_norm": 0.2958766520023346, + "learning_rate": 0.001, + "loss": 2.1226, + "step": 4164 + }, + { + "epoch": 0.17619934004568916, + "grad_norm": 0.30483001470565796, + "learning_rate": 0.001, + "loss": 2.3202, + "step": 4165 + }, + { + "epoch": 0.1762416448092055, + "grad_norm": 0.792739987373352, + "learning_rate": 0.001, + "loss": 2.3611, + "step": 4166 + }, + { + "epoch": 0.1762839495727219, + "grad_norm": 0.39795559644699097, + "learning_rate": 0.001, + "loss": 3.8292, + "step": 4167 + }, + { + "epoch": 0.17632625433623825, + "grad_norm": 0.5967978239059448, + "learning_rate": 0.001, + "loss": 2.3902, + "step": 4168 + }, + { + "epoch": 0.17636855909975463, + "grad_norm": 0.25690481066703796, + "learning_rate": 0.001, + "loss": 3.1277, + "step": 4169 + }, + { + "epoch": 0.176410863863271, + "grad_norm": 0.3368457555770874, + "learning_rate": 0.001, + "loss": 2.4135, + "step": 4170 + }, + { + "epoch": 0.17645316862678737, + "grad_norm": 0.5564723014831543, + "learning_rate": 0.001, + "loss": 2.2856, + "step": 4171 + }, + { + "epoch": 0.17649547339030375, + "grad_norm": 0.28822070360183716, + "learning_rate": 0.001, + "loss": 2.0568, + "step": 4172 + }, + { + "epoch": 0.17653777815382013, + "grad_norm": 0.2540615200996399, + "learning_rate": 0.001, + "loss": 2.7371, + "step": 4173 + }, + { + "epoch": 0.17658008291733648, + "grad_norm": 0.30011865496635437, + "learning_rate": 0.001, + "loss": 2.0551, + "step": 4174 + }, + { + "epoch": 0.17662238768085287, + "grad_norm": 0.28831255435943604, + "learning_rate": 0.001, + "loss": 2.3959, + "step": 4175 + }, + { + "epoch": 0.17666469244436925, + "grad_norm": 0.2988402843475342, + "learning_rate": 0.001, + "loss": 1.8196, + "step": 4176 + }, + { + "epoch": 0.1767069972078856, + "grad_norm": 1.3660717010498047, + "learning_rate": 0.001, + "loss": 1.853, + "step": 4177 + }, + { + "epoch": 0.17674930197140198, + "grad_norm": 0.25615715980529785, + "learning_rate": 0.001, + "loss": 2.7053, + "step": 4178 + }, + { + "epoch": 0.17679160673491834, + "grad_norm": 0.25990021228790283, + "learning_rate": 0.001, + "loss": 3.0701, + "step": 4179 + }, + { + "epoch": 0.17683391149843472, + "grad_norm": 0.24227409064769745, + "learning_rate": 0.001, + "loss": 2.085, + "step": 4180 + }, + { + "epoch": 0.1768762162619511, + "grad_norm": 0.2668752670288086, + "learning_rate": 0.001, + "loss": 2.9824, + "step": 4181 + }, + { + "epoch": 0.17691852102546746, + "grad_norm": 0.2560098469257355, + "learning_rate": 0.001, + "loss": 1.9853, + "step": 4182 + }, + { + "epoch": 0.17696082578898384, + "grad_norm": 26.13444709777832, + "learning_rate": 0.001, + "loss": 1.7282, + "step": 4183 + }, + { + "epoch": 0.17700313055250022, + "grad_norm": 0.28741776943206787, + "learning_rate": 0.001, + "loss": 1.9061, + "step": 4184 + }, + { + "epoch": 0.17704543531601658, + "grad_norm": 0.2741623520851135, + "learning_rate": 0.001, + "loss": 2.4942, + "step": 4185 + }, + { + "epoch": 0.17708774007953296, + "grad_norm": 0.24903661012649536, + "learning_rate": 0.001, + "loss": 3.1056, + "step": 4186 + }, + { + "epoch": 0.17713004484304934, + "grad_norm": 0.21220599114894867, + "learning_rate": 0.001, + "loss": 1.4461, + "step": 4187 + }, + { + "epoch": 0.1771723496065657, + "grad_norm": 0.3349112570285797, + "learning_rate": 0.001, + "loss": 2.0917, + "step": 4188 + }, + { + "epoch": 0.17721465437008208, + "grad_norm": 0.7024485468864441, + "learning_rate": 0.001, + "loss": 2.4401, + "step": 4189 + }, + { + "epoch": 0.17725695913359843, + "grad_norm": 0.2594164311885834, + "learning_rate": 0.001, + "loss": 2.3743, + "step": 4190 + }, + { + "epoch": 0.1772992638971148, + "grad_norm": 0.26141050457954407, + "learning_rate": 0.001, + "loss": 3.0745, + "step": 4191 + }, + { + "epoch": 0.1773415686606312, + "grad_norm": 0.25073671340942383, + "learning_rate": 0.001, + "loss": 2.8072, + "step": 4192 + }, + { + "epoch": 0.17738387342414755, + "grad_norm": 0.25238800048828125, + "learning_rate": 0.001, + "loss": 2.5714, + "step": 4193 + }, + { + "epoch": 0.17742617818766393, + "grad_norm": 2.378894805908203, + "learning_rate": 0.001, + "loss": 3.8264, + "step": 4194 + }, + { + "epoch": 0.1774684829511803, + "grad_norm": 0.2004442662000656, + "learning_rate": 0.001, + "loss": 3.0773, + "step": 4195 + }, + { + "epoch": 0.17751078771469667, + "grad_norm": 0.48582643270492554, + "learning_rate": 0.001, + "loss": 2.3928, + "step": 4196 + }, + { + "epoch": 0.17755309247821305, + "grad_norm": 0.3151031732559204, + "learning_rate": 0.001, + "loss": 2.8011, + "step": 4197 + }, + { + "epoch": 0.17759539724172943, + "grad_norm": 0.257080078125, + "learning_rate": 0.001, + "loss": 2.5786, + "step": 4198 + }, + { + "epoch": 0.17763770200524578, + "grad_norm": 0.42748337984085083, + "learning_rate": 0.001, + "loss": 2.5647, + "step": 4199 + }, + { + "epoch": 0.17768000676876217, + "grad_norm": 0.3146316409111023, + "learning_rate": 0.001, + "loss": 2.9859, + "step": 4200 + }, + { + "epoch": 0.17772231153227852, + "grad_norm": 0.2928094267845154, + "learning_rate": 0.001, + "loss": 2.3672, + "step": 4201 + }, + { + "epoch": 0.1777646162957949, + "grad_norm": 0.31037336587905884, + "learning_rate": 0.001, + "loss": 2.9569, + "step": 4202 + }, + { + "epoch": 0.17780692105931128, + "grad_norm": 0.2981893718242645, + "learning_rate": 0.001, + "loss": 2.2674, + "step": 4203 + }, + { + "epoch": 0.17784922582282764, + "grad_norm": 0.26700565218925476, + "learning_rate": 0.001, + "loss": 1.9258, + "step": 4204 + }, + { + "epoch": 0.17789153058634402, + "grad_norm": 0.3305959701538086, + "learning_rate": 0.001, + "loss": 1.9121, + "step": 4205 + }, + { + "epoch": 0.1779338353498604, + "grad_norm": 0.323095440864563, + "learning_rate": 0.001, + "loss": 2.6363, + "step": 4206 + }, + { + "epoch": 0.17797614011337676, + "grad_norm": 0.5207691788673401, + "learning_rate": 0.001, + "loss": 2.3411, + "step": 4207 + }, + { + "epoch": 0.17801844487689314, + "grad_norm": 0.5256071090698242, + "learning_rate": 0.001, + "loss": 2.0633, + "step": 4208 + }, + { + "epoch": 0.17806074964040952, + "grad_norm": 0.20449815690517426, + "learning_rate": 0.001, + "loss": 1.5104, + "step": 4209 + }, + { + "epoch": 0.17810305440392588, + "grad_norm": 0.3280058801174164, + "learning_rate": 0.001, + "loss": 2.0113, + "step": 4210 + }, + { + "epoch": 0.17814535916744226, + "grad_norm": 0.20661139488220215, + "learning_rate": 0.001, + "loss": 1.8537, + "step": 4211 + }, + { + "epoch": 0.17818766393095864, + "grad_norm": 0.22364257276058197, + "learning_rate": 0.001, + "loss": 2.2209, + "step": 4212 + }, + { + "epoch": 0.178229968694475, + "grad_norm": 0.3003136217594147, + "learning_rate": 0.001, + "loss": 2.3106, + "step": 4213 + }, + { + "epoch": 0.17827227345799138, + "grad_norm": 0.39940145611763, + "learning_rate": 0.001, + "loss": 2.6153, + "step": 4214 + }, + { + "epoch": 0.17831457822150773, + "grad_norm": 0.2537018954753876, + "learning_rate": 0.001, + "loss": 2.6277, + "step": 4215 + }, + { + "epoch": 0.1783568829850241, + "grad_norm": 0.25565820932388306, + "learning_rate": 0.001, + "loss": 2.7391, + "step": 4216 + }, + { + "epoch": 0.1783991877485405, + "grad_norm": 0.3583895266056061, + "learning_rate": 0.001, + "loss": 2.151, + "step": 4217 + }, + { + "epoch": 0.17844149251205685, + "grad_norm": 0.21725155413150787, + "learning_rate": 0.001, + "loss": 2.0606, + "step": 4218 + }, + { + "epoch": 0.17848379727557323, + "grad_norm": 0.21374504268169403, + "learning_rate": 0.001, + "loss": 2.0371, + "step": 4219 + }, + { + "epoch": 0.1785261020390896, + "grad_norm": 0.21078385412693024, + "learning_rate": 0.001, + "loss": 1.8766, + "step": 4220 + }, + { + "epoch": 0.17856840680260597, + "grad_norm": 0.2687901258468628, + "learning_rate": 0.001, + "loss": 2.4474, + "step": 4221 + }, + { + "epoch": 0.17861071156612235, + "grad_norm": 0.3007321357727051, + "learning_rate": 0.001, + "loss": 2.4974, + "step": 4222 + }, + { + "epoch": 0.17865301632963873, + "grad_norm": 0.35999658703804016, + "learning_rate": 0.001, + "loss": 2.2235, + "step": 4223 + }, + { + "epoch": 0.17869532109315509, + "grad_norm": 0.5664570331573486, + "learning_rate": 0.001, + "loss": 2.6284, + "step": 4224 + }, + { + "epoch": 0.17873762585667147, + "grad_norm": 0.28979048132896423, + "learning_rate": 0.001, + "loss": 2.0292, + "step": 4225 + }, + { + "epoch": 0.17877993062018782, + "grad_norm": 0.3224925398826599, + "learning_rate": 0.001, + "loss": 1.9622, + "step": 4226 + }, + { + "epoch": 0.1788222353837042, + "grad_norm": 4.19497537612915, + "learning_rate": 0.001, + "loss": 2.2633, + "step": 4227 + }, + { + "epoch": 0.17886454014722059, + "grad_norm": 0.252350389957428, + "learning_rate": 0.001, + "loss": 3.7678, + "step": 4228 + }, + { + "epoch": 0.17890684491073694, + "grad_norm": 0.26426586508750916, + "learning_rate": 0.001, + "loss": 2.3954, + "step": 4229 + }, + { + "epoch": 0.17894914967425332, + "grad_norm": 0.24827632308006287, + "learning_rate": 0.001, + "loss": 1.9709, + "step": 4230 + }, + { + "epoch": 0.1789914544377697, + "grad_norm": 4.474344253540039, + "learning_rate": 0.001, + "loss": 2.9707, + "step": 4231 + }, + { + "epoch": 0.17903375920128606, + "grad_norm": 0.4154488444328308, + "learning_rate": 0.001, + "loss": 2.6213, + "step": 4232 + }, + { + "epoch": 0.17907606396480244, + "grad_norm": 0.2699248492717743, + "learning_rate": 0.001, + "loss": 2.1127, + "step": 4233 + }, + { + "epoch": 0.17911836872831882, + "grad_norm": 0.2923453152179718, + "learning_rate": 0.001, + "loss": 1.9264, + "step": 4234 + }, + { + "epoch": 0.17916067349183518, + "grad_norm": 0.31689560413360596, + "learning_rate": 0.001, + "loss": 2.5336, + "step": 4235 + }, + { + "epoch": 0.17920297825535156, + "grad_norm": 0.2745906412601471, + "learning_rate": 0.001, + "loss": 1.8758, + "step": 4236 + }, + { + "epoch": 0.1792452830188679, + "grad_norm": 1.625520944595337, + "learning_rate": 0.001, + "loss": 4.2199, + "step": 4237 + }, + { + "epoch": 0.1792875877823843, + "grad_norm": 0.32071831822395325, + "learning_rate": 0.001, + "loss": 2.993, + "step": 4238 + }, + { + "epoch": 0.17932989254590068, + "grad_norm": 0.2907838225364685, + "learning_rate": 0.001, + "loss": 2.3436, + "step": 4239 + }, + { + "epoch": 0.17937219730941703, + "grad_norm": 0.25917503237724304, + "learning_rate": 0.001, + "loss": 1.7643, + "step": 4240 + }, + { + "epoch": 0.1794145020729334, + "grad_norm": 0.35071495175361633, + "learning_rate": 0.001, + "loss": 2.6049, + "step": 4241 + }, + { + "epoch": 0.1794568068364498, + "grad_norm": 0.402885377407074, + "learning_rate": 0.001, + "loss": 3.151, + "step": 4242 + }, + { + "epoch": 0.17949911159996615, + "grad_norm": 0.2622643709182739, + "learning_rate": 0.001, + "loss": 1.8995, + "step": 4243 + }, + { + "epoch": 0.17954141636348253, + "grad_norm": 0.2709498107433319, + "learning_rate": 0.001, + "loss": 2.5256, + "step": 4244 + }, + { + "epoch": 0.1795837211269989, + "grad_norm": 0.27462783455848694, + "learning_rate": 0.001, + "loss": 2.714, + "step": 4245 + }, + { + "epoch": 0.17962602589051527, + "grad_norm": 0.20611165463924408, + "learning_rate": 0.001, + "loss": 1.6344, + "step": 4246 + }, + { + "epoch": 0.17966833065403165, + "grad_norm": 0.32255131006240845, + "learning_rate": 0.001, + "loss": 2.2145, + "step": 4247 + }, + { + "epoch": 0.179710635417548, + "grad_norm": 0.21528469026088715, + "learning_rate": 0.001, + "loss": 1.8803, + "step": 4248 + }, + { + "epoch": 0.17975294018106439, + "grad_norm": 0.21335670351982117, + "learning_rate": 0.001, + "loss": 2.8012, + "step": 4249 + }, + { + "epoch": 0.17979524494458077, + "grad_norm": 1.517720341682434, + "learning_rate": 0.001, + "loss": 2.1899, + "step": 4250 + }, + { + "epoch": 0.17983754970809712, + "grad_norm": 0.762283205986023, + "learning_rate": 0.001, + "loss": 3.0798, + "step": 4251 + }, + { + "epoch": 0.1798798544716135, + "grad_norm": 0.23315021395683289, + "learning_rate": 0.001, + "loss": 2.9799, + "step": 4252 + }, + { + "epoch": 0.17992215923512989, + "grad_norm": 1.0861098766326904, + "learning_rate": 0.001, + "loss": 3.0077, + "step": 4253 + }, + { + "epoch": 0.17996446399864624, + "grad_norm": 0.4819357991218567, + "learning_rate": 0.001, + "loss": 3.5156, + "step": 4254 + }, + { + "epoch": 0.18000676876216262, + "grad_norm": 0.5548194646835327, + "learning_rate": 0.001, + "loss": 2.9003, + "step": 4255 + }, + { + "epoch": 0.180049073525679, + "grad_norm": 8.51793384552002, + "learning_rate": 0.001, + "loss": 2.8978, + "step": 4256 + }, + { + "epoch": 0.18009137828919536, + "grad_norm": 0.37549543380737305, + "learning_rate": 0.001, + "loss": 2.1621, + "step": 4257 + }, + { + "epoch": 0.18013368305271174, + "grad_norm": 1.0575660467147827, + "learning_rate": 0.001, + "loss": 2.7155, + "step": 4258 + }, + { + "epoch": 0.1801759878162281, + "grad_norm": 0.26176881790161133, + "learning_rate": 0.001, + "loss": 1.717, + "step": 4259 + }, + { + "epoch": 0.18021829257974448, + "grad_norm": 0.418062299489975, + "learning_rate": 0.001, + "loss": 3.2709, + "step": 4260 + }, + { + "epoch": 0.18026059734326086, + "grad_norm": 0.3230735957622528, + "learning_rate": 0.001, + "loss": 2.752, + "step": 4261 + }, + { + "epoch": 0.1803029021067772, + "grad_norm": 0.3032633662223816, + "learning_rate": 0.001, + "loss": 1.8807, + "step": 4262 + }, + { + "epoch": 0.1803452068702936, + "grad_norm": 0.37252533435821533, + "learning_rate": 0.001, + "loss": 2.2327, + "step": 4263 + }, + { + "epoch": 0.18038751163380998, + "grad_norm": 0.8877015709877014, + "learning_rate": 0.001, + "loss": 2.6157, + "step": 4264 + }, + { + "epoch": 0.18042981639732633, + "grad_norm": 0.6750360131263733, + "learning_rate": 0.001, + "loss": 2.3641, + "step": 4265 + }, + { + "epoch": 0.1804721211608427, + "grad_norm": 0.626649796962738, + "learning_rate": 0.001, + "loss": 3.1486, + "step": 4266 + }, + { + "epoch": 0.1805144259243591, + "grad_norm": 0.3944655656814575, + "learning_rate": 0.001, + "loss": 2.4488, + "step": 4267 + }, + { + "epoch": 0.18055673068787545, + "grad_norm": 0.28062745928764343, + "learning_rate": 0.001, + "loss": 1.9655, + "step": 4268 + }, + { + "epoch": 0.18059903545139183, + "grad_norm": 2.311063051223755, + "learning_rate": 0.001, + "loss": 2.6447, + "step": 4269 + }, + { + "epoch": 0.18064134021490819, + "grad_norm": 0.30582699179649353, + "learning_rate": 0.001, + "loss": 2.7865, + "step": 4270 + }, + { + "epoch": 0.18068364497842457, + "grad_norm": 0.6388780474662781, + "learning_rate": 0.001, + "loss": 3.422, + "step": 4271 + }, + { + "epoch": 0.18072594974194095, + "grad_norm": 0.24511829018592834, + "learning_rate": 0.001, + "loss": 1.925, + "step": 4272 + }, + { + "epoch": 0.1807682545054573, + "grad_norm": 10.821248054504395, + "learning_rate": 0.001, + "loss": 2.6423, + "step": 4273 + }, + { + "epoch": 0.1808105592689737, + "grad_norm": 0.9741492867469788, + "learning_rate": 0.001, + "loss": 2.614, + "step": 4274 + }, + { + "epoch": 0.18085286403249007, + "grad_norm": 0.4937375485897064, + "learning_rate": 0.001, + "loss": 2.9155, + "step": 4275 + }, + { + "epoch": 0.18089516879600642, + "grad_norm": 0.32273703813552856, + "learning_rate": 0.001, + "loss": 3.4425, + "step": 4276 + }, + { + "epoch": 0.1809374735595228, + "grad_norm": 0.3622698187828064, + "learning_rate": 0.001, + "loss": 2.5883, + "step": 4277 + }, + { + "epoch": 0.1809797783230392, + "grad_norm": 0.2949691116809845, + "learning_rate": 0.001, + "loss": 3.5752, + "step": 4278 + }, + { + "epoch": 0.18102208308655554, + "grad_norm": 0.2598804533481598, + "learning_rate": 0.001, + "loss": 1.8536, + "step": 4279 + }, + { + "epoch": 0.18106438785007192, + "grad_norm": 1.1611905097961426, + "learning_rate": 0.001, + "loss": 3.0881, + "step": 4280 + }, + { + "epoch": 0.18110669261358828, + "grad_norm": 1.2100458145141602, + "learning_rate": 0.001, + "loss": 2.2188, + "step": 4281 + }, + { + "epoch": 0.18114899737710466, + "grad_norm": 0.5973420739173889, + "learning_rate": 0.001, + "loss": 2.6307, + "step": 4282 + }, + { + "epoch": 0.18119130214062104, + "grad_norm": 0.29785507917404175, + "learning_rate": 0.001, + "loss": 2.6614, + "step": 4283 + }, + { + "epoch": 0.1812336069041374, + "grad_norm": 0.2885024845600128, + "learning_rate": 0.001, + "loss": 2.4603, + "step": 4284 + }, + { + "epoch": 0.18127591166765378, + "grad_norm": 5.69395112991333, + "learning_rate": 0.001, + "loss": 2.2926, + "step": 4285 + }, + { + "epoch": 0.18131821643117016, + "grad_norm": 0.9888959527015686, + "learning_rate": 0.001, + "loss": 3.3646, + "step": 4286 + }, + { + "epoch": 0.1813605211946865, + "grad_norm": 1.5482476949691772, + "learning_rate": 0.001, + "loss": 1.983, + "step": 4287 + }, + { + "epoch": 0.1814028259582029, + "grad_norm": 1.1126073598861694, + "learning_rate": 0.001, + "loss": 2.3665, + "step": 4288 + }, + { + "epoch": 0.18144513072171928, + "grad_norm": 3.9008285999298096, + "learning_rate": 0.001, + "loss": 2.3922, + "step": 4289 + }, + { + "epoch": 0.18148743548523563, + "grad_norm": 0.30987706780433655, + "learning_rate": 0.001, + "loss": 2.7724, + "step": 4290 + }, + { + "epoch": 0.18152974024875201, + "grad_norm": 0.5834580659866333, + "learning_rate": 0.001, + "loss": 3.4511, + "step": 4291 + }, + { + "epoch": 0.18157204501226837, + "grad_norm": 0.37197399139404297, + "learning_rate": 0.001, + "loss": 2.7004, + "step": 4292 + }, + { + "epoch": 0.18161434977578475, + "grad_norm": 0.2617497742176056, + "learning_rate": 0.001, + "loss": 2.2616, + "step": 4293 + }, + { + "epoch": 0.18165665453930113, + "grad_norm": 0.25308120250701904, + "learning_rate": 0.001, + "loss": 2.5587, + "step": 4294 + }, + { + "epoch": 0.1816989593028175, + "grad_norm": 0.22817359864711761, + "learning_rate": 0.001, + "loss": 2.3229, + "step": 4295 + }, + { + "epoch": 0.18174126406633387, + "grad_norm": 0.27048158645629883, + "learning_rate": 0.001, + "loss": 2.0208, + "step": 4296 + }, + { + "epoch": 0.18178356882985025, + "grad_norm": 0.7957993149757385, + "learning_rate": 0.001, + "loss": 2.4789, + "step": 4297 + }, + { + "epoch": 0.1818258735933666, + "grad_norm": 0.2634366452693939, + "learning_rate": 0.001, + "loss": 2.9582, + "step": 4298 + }, + { + "epoch": 0.181868178356883, + "grad_norm": 0.2717626094818115, + "learning_rate": 0.001, + "loss": 2.3598, + "step": 4299 + }, + { + "epoch": 0.18191048312039937, + "grad_norm": 1.2515925168991089, + "learning_rate": 0.001, + "loss": 2.4262, + "step": 4300 + }, + { + "epoch": 0.18195278788391572, + "grad_norm": 0.307389497756958, + "learning_rate": 0.001, + "loss": 2.2833, + "step": 4301 + }, + { + "epoch": 0.1819950926474321, + "grad_norm": 0.6592715978622437, + "learning_rate": 0.001, + "loss": 2.8568, + "step": 4302 + }, + { + "epoch": 0.18203739741094846, + "grad_norm": 0.24115906655788422, + "learning_rate": 0.001, + "loss": 2.0772, + "step": 4303 + }, + { + "epoch": 0.18207970217446484, + "grad_norm": 0.3468441963195801, + "learning_rate": 0.001, + "loss": 3.1513, + "step": 4304 + }, + { + "epoch": 0.18212200693798122, + "grad_norm": 0.3268192708492279, + "learning_rate": 0.001, + "loss": 2.2592, + "step": 4305 + }, + { + "epoch": 0.18216431170149758, + "grad_norm": 0.4834609925746918, + "learning_rate": 0.001, + "loss": 1.9173, + "step": 4306 + }, + { + "epoch": 0.18220661646501396, + "grad_norm": 0.29343941807746887, + "learning_rate": 0.001, + "loss": 4.07, + "step": 4307 + }, + { + "epoch": 0.18224892122853034, + "grad_norm": 0.5635595321655273, + "learning_rate": 0.001, + "loss": 3.5071, + "step": 4308 + }, + { + "epoch": 0.1822912259920467, + "grad_norm": 6.358903884887695, + "learning_rate": 0.001, + "loss": 2.6644, + "step": 4309 + }, + { + "epoch": 0.18233353075556308, + "grad_norm": 1.2210367918014526, + "learning_rate": 0.001, + "loss": 3.2344, + "step": 4310 + }, + { + "epoch": 0.18237583551907946, + "grad_norm": 0.3168500065803528, + "learning_rate": 0.001, + "loss": 1.8108, + "step": 4311 + }, + { + "epoch": 0.18241814028259581, + "grad_norm": 0.2486400455236435, + "learning_rate": 0.001, + "loss": 2.1005, + "step": 4312 + }, + { + "epoch": 0.1824604450461122, + "grad_norm": 0.2820669114589691, + "learning_rate": 0.001, + "loss": 2.4824, + "step": 4313 + }, + { + "epoch": 0.18250274980962855, + "grad_norm": 0.23513592779636383, + "learning_rate": 0.001, + "loss": 2.2421, + "step": 4314 + }, + { + "epoch": 0.18254505457314493, + "grad_norm": 0.33237895369529724, + "learning_rate": 0.001, + "loss": 2.6718, + "step": 4315 + }, + { + "epoch": 0.18258735933666131, + "grad_norm": 1.0040096044540405, + "learning_rate": 0.001, + "loss": 2.4138, + "step": 4316 + }, + { + "epoch": 0.18262966410017767, + "grad_norm": 2.4072158336639404, + "learning_rate": 0.001, + "loss": 2.3787, + "step": 4317 + }, + { + "epoch": 0.18267196886369405, + "grad_norm": 0.2276010513305664, + "learning_rate": 0.001, + "loss": 2.5066, + "step": 4318 + }, + { + "epoch": 0.18271427362721043, + "grad_norm": 0.2673986256122589, + "learning_rate": 0.001, + "loss": 2.4379, + "step": 4319 + }, + { + "epoch": 0.1827565783907268, + "grad_norm": 0.3459979295730591, + "learning_rate": 0.001, + "loss": 2.8818, + "step": 4320 + }, + { + "epoch": 0.18279888315424317, + "grad_norm": 0.30909305810928345, + "learning_rate": 0.001, + "loss": 2.359, + "step": 4321 + }, + { + "epoch": 0.18284118791775955, + "grad_norm": 0.29360231757164, + "learning_rate": 0.001, + "loss": 2.3982, + "step": 4322 + }, + { + "epoch": 0.1828834926812759, + "grad_norm": 0.3940381407737732, + "learning_rate": 0.001, + "loss": 2.7807, + "step": 4323 + }, + { + "epoch": 0.1829257974447923, + "grad_norm": 0.7100034356117249, + "learning_rate": 0.001, + "loss": 3.6592, + "step": 4324 + }, + { + "epoch": 0.18296810220830864, + "grad_norm": 4.326420307159424, + "learning_rate": 0.001, + "loss": 2.7129, + "step": 4325 + }, + { + "epoch": 0.18301040697182502, + "grad_norm": 0.2793787717819214, + "learning_rate": 0.001, + "loss": 3.1424, + "step": 4326 + }, + { + "epoch": 0.1830527117353414, + "grad_norm": 0.9387155175209045, + "learning_rate": 0.001, + "loss": 3.1617, + "step": 4327 + }, + { + "epoch": 0.18309501649885776, + "grad_norm": 0.3149893581867218, + "learning_rate": 0.001, + "loss": 2.914, + "step": 4328 + }, + { + "epoch": 0.18313732126237414, + "grad_norm": 1.0925941467285156, + "learning_rate": 0.001, + "loss": 2.902, + "step": 4329 + }, + { + "epoch": 0.18317962602589052, + "grad_norm": 0.3763814866542816, + "learning_rate": 0.001, + "loss": 3.5513, + "step": 4330 + }, + { + "epoch": 0.18322193078940688, + "grad_norm": 0.2654663622379303, + "learning_rate": 0.001, + "loss": 3.1993, + "step": 4331 + }, + { + "epoch": 0.18326423555292326, + "grad_norm": 0.34242555499076843, + "learning_rate": 0.001, + "loss": 2.4377, + "step": 4332 + }, + { + "epoch": 0.18330654031643964, + "grad_norm": 0.37130022048950195, + "learning_rate": 0.001, + "loss": 3.27, + "step": 4333 + }, + { + "epoch": 0.183348845079956, + "grad_norm": 0.4086019694805145, + "learning_rate": 0.001, + "loss": 2.3096, + "step": 4334 + }, + { + "epoch": 0.18339114984347238, + "grad_norm": 0.2771133482456207, + "learning_rate": 0.001, + "loss": 3.351, + "step": 4335 + }, + { + "epoch": 0.18343345460698876, + "grad_norm": 0.3115023970603943, + "learning_rate": 0.001, + "loss": 1.9335, + "step": 4336 + }, + { + "epoch": 0.18347575937050511, + "grad_norm": 0.513750433921814, + "learning_rate": 0.001, + "loss": 3.1363, + "step": 4337 + }, + { + "epoch": 0.1835180641340215, + "grad_norm": 0.3518810570240021, + "learning_rate": 0.001, + "loss": 3.5947, + "step": 4338 + }, + { + "epoch": 0.18356036889753785, + "grad_norm": 0.2949376106262207, + "learning_rate": 0.001, + "loss": 2.2923, + "step": 4339 + }, + { + "epoch": 0.18360267366105423, + "grad_norm": 0.2582836449146271, + "learning_rate": 0.001, + "loss": 3.0664, + "step": 4340 + }, + { + "epoch": 0.18364497842457062, + "grad_norm": 0.19488857686519623, + "learning_rate": 0.001, + "loss": 2.7243, + "step": 4341 + }, + { + "epoch": 0.18368728318808697, + "grad_norm": 0.26442408561706543, + "learning_rate": 0.001, + "loss": 3.4023, + "step": 4342 + }, + { + "epoch": 0.18372958795160335, + "grad_norm": 0.22754669189453125, + "learning_rate": 0.001, + "loss": 2.4041, + "step": 4343 + }, + { + "epoch": 0.18377189271511973, + "grad_norm": 0.22837084531784058, + "learning_rate": 0.001, + "loss": 3.132, + "step": 4344 + }, + { + "epoch": 0.1838141974786361, + "grad_norm": 6.180835723876953, + "learning_rate": 0.001, + "loss": 1.6405, + "step": 4345 + }, + { + "epoch": 0.18385650224215247, + "grad_norm": 0.21038007736206055, + "learning_rate": 0.001, + "loss": 2.0711, + "step": 4346 + }, + { + "epoch": 0.18389880700566885, + "grad_norm": 0.2657643258571625, + "learning_rate": 0.001, + "loss": 2.2425, + "step": 4347 + }, + { + "epoch": 0.1839411117691852, + "grad_norm": 1.5151898860931396, + "learning_rate": 0.001, + "loss": 2.9717, + "step": 4348 + }, + { + "epoch": 0.1839834165327016, + "grad_norm": 0.2588333189487457, + "learning_rate": 0.001, + "loss": 2.7555, + "step": 4349 + }, + { + "epoch": 0.18402572129621794, + "grad_norm": 0.28635910153388977, + "learning_rate": 0.001, + "loss": 2.7952, + "step": 4350 + }, + { + "epoch": 0.18406802605973432, + "grad_norm": 0.32209694385528564, + "learning_rate": 0.001, + "loss": 2.6064, + "step": 4351 + }, + { + "epoch": 0.1841103308232507, + "grad_norm": 2.678985357284546, + "learning_rate": 0.001, + "loss": 2.9109, + "step": 4352 + }, + { + "epoch": 0.18415263558676706, + "grad_norm": 0.25267353653907776, + "learning_rate": 0.001, + "loss": 2.4621, + "step": 4353 + }, + { + "epoch": 0.18419494035028344, + "grad_norm": 0.24416348338127136, + "learning_rate": 0.001, + "loss": 1.9122, + "step": 4354 + }, + { + "epoch": 0.18423724511379982, + "grad_norm": 0.24533572793006897, + "learning_rate": 0.001, + "loss": 2.3288, + "step": 4355 + }, + { + "epoch": 0.18427954987731618, + "grad_norm": 0.2123999446630478, + "learning_rate": 0.001, + "loss": 2.8654, + "step": 4356 + }, + { + "epoch": 0.18432185464083256, + "grad_norm": 0.5792009234428406, + "learning_rate": 0.001, + "loss": 2.8559, + "step": 4357 + }, + { + "epoch": 0.18436415940434894, + "grad_norm": 0.30343350768089294, + "learning_rate": 0.001, + "loss": 2.3648, + "step": 4358 + }, + { + "epoch": 0.1844064641678653, + "grad_norm": 0.23615819215774536, + "learning_rate": 0.001, + "loss": 1.7143, + "step": 4359 + }, + { + "epoch": 0.18444876893138168, + "grad_norm": 0.2771995961666107, + "learning_rate": 0.001, + "loss": 2.4038, + "step": 4360 + }, + { + "epoch": 0.18449107369489803, + "grad_norm": 0.7265622615814209, + "learning_rate": 0.001, + "loss": 2.8253, + "step": 4361 + }, + { + "epoch": 0.18453337845841442, + "grad_norm": 0.1909170150756836, + "learning_rate": 0.001, + "loss": 2.1125, + "step": 4362 + }, + { + "epoch": 0.1845756832219308, + "grad_norm": 0.2617343068122864, + "learning_rate": 0.001, + "loss": 2.8421, + "step": 4363 + }, + { + "epoch": 0.18461798798544715, + "grad_norm": 0.6342441439628601, + "learning_rate": 0.001, + "loss": 2.0168, + "step": 4364 + }, + { + "epoch": 0.18466029274896353, + "grad_norm": 0.3092588484287262, + "learning_rate": 0.001, + "loss": 2.8462, + "step": 4365 + }, + { + "epoch": 0.18470259751247992, + "grad_norm": 0.27178463339805603, + "learning_rate": 0.001, + "loss": 2.1236, + "step": 4366 + }, + { + "epoch": 0.18474490227599627, + "grad_norm": 10.186301231384277, + "learning_rate": 0.001, + "loss": 2.949, + "step": 4367 + }, + { + "epoch": 0.18478720703951265, + "grad_norm": 0.23955731093883514, + "learning_rate": 0.001, + "loss": 2.6389, + "step": 4368 + }, + { + "epoch": 0.18482951180302903, + "grad_norm": 1.5058904886245728, + "learning_rate": 0.001, + "loss": 2.7837, + "step": 4369 + }, + { + "epoch": 0.1848718165665454, + "grad_norm": 2.1568853855133057, + "learning_rate": 0.001, + "loss": 3.1977, + "step": 4370 + }, + { + "epoch": 0.18491412133006177, + "grad_norm": 0.23633380234241486, + "learning_rate": 0.001, + "loss": 1.8045, + "step": 4371 + }, + { + "epoch": 0.18495642609357812, + "grad_norm": 0.31348666548728943, + "learning_rate": 0.001, + "loss": 2.6933, + "step": 4372 + }, + { + "epoch": 0.1849987308570945, + "grad_norm": 0.3345177471637726, + "learning_rate": 0.001, + "loss": 2.535, + "step": 4373 + }, + { + "epoch": 0.1850410356206109, + "grad_norm": 0.26245883107185364, + "learning_rate": 0.001, + "loss": 1.6552, + "step": 4374 + }, + { + "epoch": 0.18508334038412724, + "grad_norm": 0.5210462808609009, + "learning_rate": 0.001, + "loss": 2.2235, + "step": 4375 + }, + { + "epoch": 0.18512564514764362, + "grad_norm": 1.8073543310165405, + "learning_rate": 0.001, + "loss": 2.5607, + "step": 4376 + }, + { + "epoch": 0.18516794991116, + "grad_norm": 1.0985215902328491, + "learning_rate": 0.001, + "loss": 2.2265, + "step": 4377 + }, + { + "epoch": 0.18521025467467636, + "grad_norm": 0.4432186484336853, + "learning_rate": 0.001, + "loss": 3.1894, + "step": 4378 + }, + { + "epoch": 0.18525255943819274, + "grad_norm": 10.145984649658203, + "learning_rate": 0.001, + "loss": 2.348, + "step": 4379 + }, + { + "epoch": 0.18529486420170913, + "grad_norm": 0.7287303805351257, + "learning_rate": 0.001, + "loss": 3.4078, + "step": 4380 + }, + { + "epoch": 0.18533716896522548, + "grad_norm": 0.4048196077346802, + "learning_rate": 0.001, + "loss": 2.3973, + "step": 4381 + }, + { + "epoch": 0.18537947372874186, + "grad_norm": 0.3356500566005707, + "learning_rate": 0.001, + "loss": 1.8402, + "step": 4382 + }, + { + "epoch": 0.18542177849225822, + "grad_norm": 0.3641221225261688, + "learning_rate": 0.001, + "loss": 2.937, + "step": 4383 + }, + { + "epoch": 0.1854640832557746, + "grad_norm": 0.3666855990886688, + "learning_rate": 0.001, + "loss": 2.2261, + "step": 4384 + }, + { + "epoch": 0.18550638801929098, + "grad_norm": 0.34403887391090393, + "learning_rate": 0.001, + "loss": 2.4207, + "step": 4385 + }, + { + "epoch": 0.18554869278280733, + "grad_norm": 0.3421815037727356, + "learning_rate": 0.001, + "loss": 2.913, + "step": 4386 + }, + { + "epoch": 0.18559099754632372, + "grad_norm": 0.22695082426071167, + "learning_rate": 0.001, + "loss": 1.8004, + "step": 4387 + }, + { + "epoch": 0.1856333023098401, + "grad_norm": 0.24995842576026917, + "learning_rate": 0.001, + "loss": 1.5226, + "step": 4388 + }, + { + "epoch": 0.18567560707335645, + "grad_norm": 0.23015795648097992, + "learning_rate": 0.001, + "loss": 2.1822, + "step": 4389 + }, + { + "epoch": 0.18571791183687283, + "grad_norm": 0.26571375131607056, + "learning_rate": 0.001, + "loss": 2.0279, + "step": 4390 + }, + { + "epoch": 0.18576021660038922, + "grad_norm": 0.22029922902584076, + "learning_rate": 0.001, + "loss": 2.4631, + "step": 4391 + }, + { + "epoch": 0.18580252136390557, + "grad_norm": 0.24884076416492462, + "learning_rate": 0.001, + "loss": 2.1439, + "step": 4392 + }, + { + "epoch": 0.18584482612742195, + "grad_norm": 0.27514705061912537, + "learning_rate": 0.001, + "loss": 2.0805, + "step": 4393 + }, + { + "epoch": 0.1858871308909383, + "grad_norm": 0.23706066608428955, + "learning_rate": 0.001, + "loss": 2.3177, + "step": 4394 + }, + { + "epoch": 0.1859294356544547, + "grad_norm": 0.21739627420902252, + "learning_rate": 0.001, + "loss": 1.9557, + "step": 4395 + }, + { + "epoch": 0.18597174041797107, + "grad_norm": 0.3604746460914612, + "learning_rate": 0.001, + "loss": 1.6202, + "step": 4396 + }, + { + "epoch": 0.18601404518148743, + "grad_norm": 0.25236776471138, + "learning_rate": 0.001, + "loss": 2.6437, + "step": 4397 + }, + { + "epoch": 0.1860563499450038, + "grad_norm": 0.23242443799972534, + "learning_rate": 0.001, + "loss": 3.186, + "step": 4398 + }, + { + "epoch": 0.1860986547085202, + "grad_norm": 0.3794444799423218, + "learning_rate": 0.001, + "loss": 2.4195, + "step": 4399 + }, + { + "epoch": 0.18614095947203654, + "grad_norm": 0.23218633234500885, + "learning_rate": 0.001, + "loss": 2.6508, + "step": 4400 + }, + { + "epoch": 0.18618326423555293, + "grad_norm": 0.20233222842216492, + "learning_rate": 0.001, + "loss": 1.5472, + "step": 4401 + }, + { + "epoch": 0.1862255689990693, + "grad_norm": 0.6316521167755127, + "learning_rate": 0.001, + "loss": 3.3565, + "step": 4402 + }, + { + "epoch": 0.18626787376258566, + "grad_norm": 0.40583282709121704, + "learning_rate": 0.001, + "loss": 2.4887, + "step": 4403 + }, + { + "epoch": 0.18631017852610204, + "grad_norm": 0.35350820422172546, + "learning_rate": 0.001, + "loss": 2.5153, + "step": 4404 + }, + { + "epoch": 0.1863524832896184, + "grad_norm": 0.2667691707611084, + "learning_rate": 0.001, + "loss": 3.2235, + "step": 4405 + }, + { + "epoch": 0.18639478805313478, + "grad_norm": 0.7131944894790649, + "learning_rate": 0.001, + "loss": 3.317, + "step": 4406 + }, + { + "epoch": 0.18643709281665116, + "grad_norm": 0.2557836174964905, + "learning_rate": 0.001, + "loss": 2.8428, + "step": 4407 + }, + { + "epoch": 0.18647939758016752, + "grad_norm": 0.18979831039905548, + "learning_rate": 0.001, + "loss": 2.1681, + "step": 4408 + }, + { + "epoch": 0.1865217023436839, + "grad_norm": 0.8334029316902161, + "learning_rate": 0.001, + "loss": 1.9816, + "step": 4409 + }, + { + "epoch": 0.18656400710720028, + "grad_norm": 0.2034035176038742, + "learning_rate": 0.001, + "loss": 1.8831, + "step": 4410 + }, + { + "epoch": 0.18660631187071663, + "grad_norm": 0.25661614537239075, + "learning_rate": 0.001, + "loss": 2.1606, + "step": 4411 + }, + { + "epoch": 0.18664861663423302, + "grad_norm": 0.21977266669273376, + "learning_rate": 0.001, + "loss": 2.2903, + "step": 4412 + }, + { + "epoch": 0.1866909213977494, + "grad_norm": 0.5622522234916687, + "learning_rate": 0.001, + "loss": 3.0276, + "step": 4413 + }, + { + "epoch": 0.18673322616126575, + "grad_norm": 0.2287818193435669, + "learning_rate": 0.001, + "loss": 3.0729, + "step": 4414 + }, + { + "epoch": 0.18677553092478213, + "grad_norm": 0.2905156910419464, + "learning_rate": 0.001, + "loss": 3.1699, + "step": 4415 + }, + { + "epoch": 0.1868178356882985, + "grad_norm": 0.23308047652244568, + "learning_rate": 0.001, + "loss": 2.4299, + "step": 4416 + }, + { + "epoch": 0.18686014045181487, + "grad_norm": 0.23100613057613373, + "learning_rate": 0.001, + "loss": 2.6107, + "step": 4417 + }, + { + "epoch": 0.18690244521533125, + "grad_norm": 0.2881180942058563, + "learning_rate": 0.001, + "loss": 2.3265, + "step": 4418 + }, + { + "epoch": 0.1869447499788476, + "grad_norm": 0.26562678813934326, + "learning_rate": 0.001, + "loss": 2.6836, + "step": 4419 + }, + { + "epoch": 0.186987054742364, + "grad_norm": 0.21036438643932343, + "learning_rate": 0.001, + "loss": 1.9917, + "step": 4420 + }, + { + "epoch": 0.18702935950588037, + "grad_norm": 0.27544263005256653, + "learning_rate": 0.001, + "loss": 3.4425, + "step": 4421 + }, + { + "epoch": 0.18707166426939673, + "grad_norm": 0.2592555284500122, + "learning_rate": 0.001, + "loss": 2.09, + "step": 4422 + }, + { + "epoch": 0.1871139690329131, + "grad_norm": 0.5881548523902893, + "learning_rate": 0.001, + "loss": 2.1932, + "step": 4423 + }, + { + "epoch": 0.1871562737964295, + "grad_norm": 0.20673885941505432, + "learning_rate": 0.001, + "loss": 2.2809, + "step": 4424 + }, + { + "epoch": 0.18719857855994584, + "grad_norm": 0.20291772484779358, + "learning_rate": 0.001, + "loss": 2.0593, + "step": 4425 + }, + { + "epoch": 0.18724088332346223, + "grad_norm": 0.270967960357666, + "learning_rate": 0.001, + "loss": 1.6903, + "step": 4426 + }, + { + "epoch": 0.18728318808697858, + "grad_norm": 0.35140615701675415, + "learning_rate": 0.001, + "loss": 2.364, + "step": 4427 + }, + { + "epoch": 0.18732549285049496, + "grad_norm": 0.20926691591739655, + "learning_rate": 0.001, + "loss": 1.9464, + "step": 4428 + }, + { + "epoch": 0.18736779761401134, + "grad_norm": 0.28219670057296753, + "learning_rate": 0.001, + "loss": 3.2835, + "step": 4429 + }, + { + "epoch": 0.1874101023775277, + "grad_norm": 3.9769115447998047, + "learning_rate": 0.001, + "loss": 2.0957, + "step": 4430 + }, + { + "epoch": 0.18745240714104408, + "grad_norm": 0.24822956323623657, + "learning_rate": 0.001, + "loss": 2.3957, + "step": 4431 + }, + { + "epoch": 0.18749471190456046, + "grad_norm": 0.22063516080379486, + "learning_rate": 0.001, + "loss": 2.1515, + "step": 4432 + }, + { + "epoch": 0.18753701666807682, + "grad_norm": 0.25612160563468933, + "learning_rate": 0.001, + "loss": 2.687, + "step": 4433 + }, + { + "epoch": 0.1875793214315932, + "grad_norm": 0.2492561936378479, + "learning_rate": 0.001, + "loss": 2.4043, + "step": 4434 + }, + { + "epoch": 0.18762162619510958, + "grad_norm": 0.28341639041900635, + "learning_rate": 0.001, + "loss": 2.0796, + "step": 4435 + }, + { + "epoch": 0.18766393095862594, + "grad_norm": 0.2945077121257782, + "learning_rate": 0.001, + "loss": 2.1302, + "step": 4436 + }, + { + "epoch": 0.18770623572214232, + "grad_norm": 0.3373602330684662, + "learning_rate": 0.001, + "loss": 2.0283, + "step": 4437 + }, + { + "epoch": 0.18774854048565867, + "grad_norm": 0.24458883702754974, + "learning_rate": 0.001, + "loss": 2.6162, + "step": 4438 + }, + { + "epoch": 0.18779084524917505, + "grad_norm": 0.21525456011295319, + "learning_rate": 0.001, + "loss": 2.0196, + "step": 4439 + }, + { + "epoch": 0.18783315001269144, + "grad_norm": 0.22924627363681793, + "learning_rate": 0.001, + "loss": 2.1364, + "step": 4440 + }, + { + "epoch": 0.1878754547762078, + "grad_norm": 0.22823546826839447, + "learning_rate": 0.001, + "loss": 3.0261, + "step": 4441 + }, + { + "epoch": 0.18791775953972417, + "grad_norm": 0.37336304783821106, + "learning_rate": 0.001, + "loss": 2.0595, + "step": 4442 + }, + { + "epoch": 0.18796006430324055, + "grad_norm": 2.6956934928894043, + "learning_rate": 0.001, + "loss": 1.9668, + "step": 4443 + }, + { + "epoch": 0.1880023690667569, + "grad_norm": 0.7836053371429443, + "learning_rate": 0.001, + "loss": 2.3485, + "step": 4444 + }, + { + "epoch": 0.1880446738302733, + "grad_norm": 1.6367077827453613, + "learning_rate": 0.001, + "loss": 3.084, + "step": 4445 + }, + { + "epoch": 0.18808697859378967, + "grad_norm": 0.26571542024612427, + "learning_rate": 0.001, + "loss": 2.4098, + "step": 4446 + }, + { + "epoch": 0.18812928335730603, + "grad_norm": 0.2825041711330414, + "learning_rate": 0.001, + "loss": 2.2918, + "step": 4447 + }, + { + "epoch": 0.1881715881208224, + "grad_norm": 0.246236190199852, + "learning_rate": 0.001, + "loss": 2.4611, + "step": 4448 + }, + { + "epoch": 0.1882138928843388, + "grad_norm": 0.5740474462509155, + "learning_rate": 0.001, + "loss": 2.9142, + "step": 4449 + }, + { + "epoch": 0.18825619764785514, + "grad_norm": 0.37695226073265076, + "learning_rate": 0.001, + "loss": 1.6922, + "step": 4450 + }, + { + "epoch": 0.18829850241137153, + "grad_norm": 1.8497018814086914, + "learning_rate": 0.001, + "loss": 2.2615, + "step": 4451 + }, + { + "epoch": 0.18834080717488788, + "grad_norm": 0.2767746150493622, + "learning_rate": 0.001, + "loss": 2.1198, + "step": 4452 + }, + { + "epoch": 0.18838311193840426, + "grad_norm": 1.5770310163497925, + "learning_rate": 0.001, + "loss": 2.4928, + "step": 4453 + }, + { + "epoch": 0.18842541670192064, + "grad_norm": 0.23543912172317505, + "learning_rate": 0.001, + "loss": 1.8614, + "step": 4454 + }, + { + "epoch": 0.188467721465437, + "grad_norm": 0.26487410068511963, + "learning_rate": 0.001, + "loss": 2.2644, + "step": 4455 + }, + { + "epoch": 0.18851002622895338, + "grad_norm": 0.26625239849090576, + "learning_rate": 0.001, + "loss": 2.4482, + "step": 4456 + }, + { + "epoch": 0.18855233099246976, + "grad_norm": 0.21155591309070587, + "learning_rate": 0.001, + "loss": 1.929, + "step": 4457 + }, + { + "epoch": 0.18859463575598612, + "grad_norm": 0.25420722365379333, + "learning_rate": 0.001, + "loss": 2.1863, + "step": 4458 + }, + { + "epoch": 0.1886369405195025, + "grad_norm": 0.27316761016845703, + "learning_rate": 0.001, + "loss": 1.9442, + "step": 4459 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 3.036153793334961, + "learning_rate": 0.001, + "loss": 3.9133, + "step": 4460 + }, + { + "epoch": 0.18872155004653524, + "grad_norm": 0.22309522330760956, + "learning_rate": 0.001, + "loss": 2.348, + "step": 4461 + }, + { + "epoch": 0.18876385481005162, + "grad_norm": 0.21288998425006866, + "learning_rate": 0.001, + "loss": 1.5457, + "step": 4462 + }, + { + "epoch": 0.18880615957356797, + "grad_norm": 3.6792640686035156, + "learning_rate": 0.001, + "loss": 2.3953, + "step": 4463 + }, + { + "epoch": 0.18884846433708435, + "grad_norm": 0.3118633031845093, + "learning_rate": 0.001, + "loss": 3.1294, + "step": 4464 + }, + { + "epoch": 0.18889076910060074, + "grad_norm": 2.057178497314453, + "learning_rate": 0.001, + "loss": 1.7783, + "step": 4465 + }, + { + "epoch": 0.1889330738641171, + "grad_norm": 0.6279289722442627, + "learning_rate": 0.001, + "loss": 1.8288, + "step": 4466 + }, + { + "epoch": 0.18897537862763347, + "grad_norm": 0.3405144214630127, + "learning_rate": 0.001, + "loss": 2.3726, + "step": 4467 + }, + { + "epoch": 0.18901768339114985, + "grad_norm": 0.2901994585990906, + "learning_rate": 0.001, + "loss": 1.8338, + "step": 4468 + }, + { + "epoch": 0.1890599881546662, + "grad_norm": 0.3155980706214905, + "learning_rate": 0.001, + "loss": 3.4189, + "step": 4469 + }, + { + "epoch": 0.1891022929181826, + "grad_norm": 0.5617590546607971, + "learning_rate": 0.001, + "loss": 2.9148, + "step": 4470 + }, + { + "epoch": 0.18914459768169897, + "grad_norm": 0.3109961450099945, + "learning_rate": 0.001, + "loss": 2.4199, + "step": 4471 + }, + { + "epoch": 0.18918690244521533, + "grad_norm": 0.23848043382167816, + "learning_rate": 0.001, + "loss": 2.1432, + "step": 4472 + }, + { + "epoch": 0.1892292072087317, + "grad_norm": 0.2614109218120575, + "learning_rate": 0.001, + "loss": 1.9615, + "step": 4473 + }, + { + "epoch": 0.18927151197224806, + "grad_norm": 0.30078810453414917, + "learning_rate": 0.001, + "loss": 2.6696, + "step": 4474 + }, + { + "epoch": 0.18931381673576445, + "grad_norm": 0.25650012493133545, + "learning_rate": 0.001, + "loss": 1.8638, + "step": 4475 + }, + { + "epoch": 0.18935612149928083, + "grad_norm": 0.21869732439517975, + "learning_rate": 0.001, + "loss": 1.5891, + "step": 4476 + }, + { + "epoch": 0.18939842626279718, + "grad_norm": 8.205790519714355, + "learning_rate": 0.001, + "loss": 1.5669, + "step": 4477 + }, + { + "epoch": 0.18944073102631356, + "grad_norm": 0.2447681427001953, + "learning_rate": 0.001, + "loss": 2.9438, + "step": 4478 + }, + { + "epoch": 0.18948303578982995, + "grad_norm": 1.446384072303772, + "learning_rate": 0.001, + "loss": 2.7589, + "step": 4479 + }, + { + "epoch": 0.1895253405533463, + "grad_norm": 0.8409281373023987, + "learning_rate": 0.001, + "loss": 1.9919, + "step": 4480 + }, + { + "epoch": 0.18956764531686268, + "grad_norm": 0.27768608927726746, + "learning_rate": 0.001, + "loss": 2.4846, + "step": 4481 + }, + { + "epoch": 0.18960995008037906, + "grad_norm": 0.4152839779853821, + "learning_rate": 0.001, + "loss": 3.0854, + "step": 4482 + }, + { + "epoch": 0.18965225484389542, + "grad_norm": 0.2601878046989441, + "learning_rate": 0.001, + "loss": 2.7528, + "step": 4483 + }, + { + "epoch": 0.1896945596074118, + "grad_norm": 0.2510074973106384, + "learning_rate": 0.001, + "loss": 2.1918, + "step": 4484 + }, + { + "epoch": 0.18973686437092815, + "grad_norm": 0.22711610794067383, + "learning_rate": 0.001, + "loss": 2.3735, + "step": 4485 + }, + { + "epoch": 0.18977916913444454, + "grad_norm": 0.35560375452041626, + "learning_rate": 0.001, + "loss": 2.2848, + "step": 4486 + }, + { + "epoch": 0.18982147389796092, + "grad_norm": 0.27158334851264954, + "learning_rate": 0.001, + "loss": 2.6733, + "step": 4487 + }, + { + "epoch": 0.18986377866147727, + "grad_norm": 0.2838427424430847, + "learning_rate": 0.001, + "loss": 2.8558, + "step": 4488 + }, + { + "epoch": 0.18990608342499365, + "grad_norm": 0.2345813810825348, + "learning_rate": 0.001, + "loss": 3.6895, + "step": 4489 + }, + { + "epoch": 0.18994838818851004, + "grad_norm": 0.2404671013355255, + "learning_rate": 0.001, + "loss": 2.1784, + "step": 4490 + }, + { + "epoch": 0.1899906929520264, + "grad_norm": 0.2407745122909546, + "learning_rate": 0.001, + "loss": 2.8477, + "step": 4491 + }, + { + "epoch": 0.19003299771554277, + "grad_norm": 0.2435643970966339, + "learning_rate": 0.001, + "loss": 2.7741, + "step": 4492 + }, + { + "epoch": 0.19007530247905915, + "grad_norm": 0.20882461965084076, + "learning_rate": 0.001, + "loss": 2.0404, + "step": 4493 + }, + { + "epoch": 0.1901176072425755, + "grad_norm": 0.20815284550189972, + "learning_rate": 0.001, + "loss": 1.4838, + "step": 4494 + }, + { + "epoch": 0.1901599120060919, + "grad_norm": 0.5364993810653687, + "learning_rate": 0.001, + "loss": 2.5901, + "step": 4495 + }, + { + "epoch": 0.19020221676960825, + "grad_norm": 0.18857216835021973, + "learning_rate": 0.001, + "loss": 2.0745, + "step": 4496 + }, + { + "epoch": 0.19024452153312463, + "grad_norm": 0.22083306312561035, + "learning_rate": 0.001, + "loss": 2.2613, + "step": 4497 + }, + { + "epoch": 0.190286826296641, + "grad_norm": 0.21830333769321442, + "learning_rate": 0.001, + "loss": 2.4491, + "step": 4498 + }, + { + "epoch": 0.19032913106015736, + "grad_norm": 0.918835461139679, + "learning_rate": 0.001, + "loss": 2.8763, + "step": 4499 + }, + { + "epoch": 0.19037143582367375, + "grad_norm": 0.2459859997034073, + "learning_rate": 0.001, + "loss": 2.3936, + "step": 4500 + }, + { + "epoch": 0.19041374058719013, + "grad_norm": 0.2378329634666443, + "learning_rate": 0.001, + "loss": 1.8399, + "step": 4501 + }, + { + "epoch": 0.19045604535070648, + "grad_norm": 0.3564843237400055, + "learning_rate": 0.001, + "loss": 3.5808, + "step": 4502 + }, + { + "epoch": 0.19049835011422286, + "grad_norm": 0.5608360171318054, + "learning_rate": 0.001, + "loss": 2.1496, + "step": 4503 + }, + { + "epoch": 0.19054065487773925, + "grad_norm": 0.3090474009513855, + "learning_rate": 0.001, + "loss": 2.3587, + "step": 4504 + }, + { + "epoch": 0.1905829596412556, + "grad_norm": 0.2552463114261627, + "learning_rate": 0.001, + "loss": 1.866, + "step": 4505 + }, + { + "epoch": 0.19062526440477198, + "grad_norm": 0.41491034626960754, + "learning_rate": 0.001, + "loss": 2.1764, + "step": 4506 + }, + { + "epoch": 0.19066756916828834, + "grad_norm": 0.2689819633960724, + "learning_rate": 0.001, + "loss": 1.6736, + "step": 4507 + }, + { + "epoch": 0.19070987393180472, + "grad_norm": 0.2358677089214325, + "learning_rate": 0.001, + "loss": 1.9717, + "step": 4508 + }, + { + "epoch": 0.1907521786953211, + "grad_norm": 0.2064763307571411, + "learning_rate": 0.001, + "loss": 2.6779, + "step": 4509 + }, + { + "epoch": 0.19079448345883745, + "grad_norm": 0.24154812097549438, + "learning_rate": 0.001, + "loss": 1.7884, + "step": 4510 + }, + { + "epoch": 0.19083678822235384, + "grad_norm": 0.22353674471378326, + "learning_rate": 0.001, + "loss": 1.9496, + "step": 4511 + }, + { + "epoch": 0.19087909298587022, + "grad_norm": 0.2455575168132782, + "learning_rate": 0.001, + "loss": 2.3023, + "step": 4512 + }, + { + "epoch": 0.19092139774938657, + "grad_norm": 0.22272691130638123, + "learning_rate": 0.001, + "loss": 2.6156, + "step": 4513 + }, + { + "epoch": 0.19096370251290296, + "grad_norm": 0.21342520415782928, + "learning_rate": 0.001, + "loss": 2.2985, + "step": 4514 + }, + { + "epoch": 0.19100600727641934, + "grad_norm": 0.21540196239948273, + "learning_rate": 0.001, + "loss": 2.5549, + "step": 4515 + }, + { + "epoch": 0.1910483120399357, + "grad_norm": 0.20063146948814392, + "learning_rate": 0.001, + "loss": 3.4714, + "step": 4516 + }, + { + "epoch": 0.19109061680345207, + "grad_norm": 0.1966932862997055, + "learning_rate": 0.001, + "loss": 2.1782, + "step": 4517 + }, + { + "epoch": 0.19113292156696843, + "grad_norm": 1.2073802947998047, + "learning_rate": 0.001, + "loss": 2.668, + "step": 4518 + }, + { + "epoch": 0.1911752263304848, + "grad_norm": 0.3695375323295593, + "learning_rate": 0.001, + "loss": 3.0143, + "step": 4519 + }, + { + "epoch": 0.1912175310940012, + "grad_norm": 0.21040910482406616, + "learning_rate": 0.001, + "loss": 2.2652, + "step": 4520 + }, + { + "epoch": 0.19125983585751755, + "grad_norm": 0.2612590491771698, + "learning_rate": 0.001, + "loss": 2.0682, + "step": 4521 + }, + { + "epoch": 0.19130214062103393, + "grad_norm": 1.0337492227554321, + "learning_rate": 0.001, + "loss": 1.7501, + "step": 4522 + }, + { + "epoch": 0.1913444453845503, + "grad_norm": 0.2554808259010315, + "learning_rate": 0.001, + "loss": 2.4155, + "step": 4523 + }, + { + "epoch": 0.19138675014806666, + "grad_norm": 2.867785692214966, + "learning_rate": 0.001, + "loss": 1.7372, + "step": 4524 + }, + { + "epoch": 0.19142905491158305, + "grad_norm": 3.803913116455078, + "learning_rate": 0.001, + "loss": 2.8557, + "step": 4525 + }, + { + "epoch": 0.19147135967509943, + "grad_norm": 0.671362578868866, + "learning_rate": 0.001, + "loss": 1.9373, + "step": 4526 + }, + { + "epoch": 0.19151366443861578, + "grad_norm": 0.33404502272605896, + "learning_rate": 0.001, + "loss": 2.3405, + "step": 4527 + }, + { + "epoch": 0.19155596920213216, + "grad_norm": 7.813088893890381, + "learning_rate": 0.001, + "loss": 3.0833, + "step": 4528 + }, + { + "epoch": 0.19159827396564852, + "grad_norm": 0.33136120438575745, + "learning_rate": 0.001, + "loss": 3.0437, + "step": 4529 + }, + { + "epoch": 0.1916405787291649, + "grad_norm": 0.3208834230899811, + "learning_rate": 0.001, + "loss": 2.8585, + "step": 4530 + }, + { + "epoch": 0.19168288349268128, + "grad_norm": 0.46846067905426025, + "learning_rate": 0.001, + "loss": 3.273, + "step": 4531 + }, + { + "epoch": 0.19172518825619764, + "grad_norm": 0.4278513789176941, + "learning_rate": 0.001, + "loss": 2.7628, + "step": 4532 + }, + { + "epoch": 0.19176749301971402, + "grad_norm": 0.3164735734462738, + "learning_rate": 0.001, + "loss": 3.0253, + "step": 4533 + }, + { + "epoch": 0.1918097977832304, + "grad_norm": 0.22496308386325836, + "learning_rate": 0.001, + "loss": 2.1171, + "step": 4534 + }, + { + "epoch": 0.19185210254674676, + "grad_norm": 0.2776702344417572, + "learning_rate": 0.001, + "loss": 2.7838, + "step": 4535 + }, + { + "epoch": 0.19189440731026314, + "grad_norm": 0.6228007674217224, + "learning_rate": 0.001, + "loss": 2.2344, + "step": 4536 + }, + { + "epoch": 0.19193671207377952, + "grad_norm": 0.48833128809928894, + "learning_rate": 0.001, + "loss": 2.986, + "step": 4537 + }, + { + "epoch": 0.19197901683729587, + "grad_norm": 0.3756866753101349, + "learning_rate": 0.001, + "loss": 1.9291, + "step": 4538 + }, + { + "epoch": 0.19202132160081226, + "grad_norm": 0.278314471244812, + "learning_rate": 0.001, + "loss": 2.1759, + "step": 4539 + }, + { + "epoch": 0.1920636263643286, + "grad_norm": 1.2533029317855835, + "learning_rate": 0.001, + "loss": 2.107, + "step": 4540 + }, + { + "epoch": 0.192105931127845, + "grad_norm": 0.2423766851425171, + "learning_rate": 0.001, + "loss": 1.7797, + "step": 4541 + }, + { + "epoch": 0.19214823589136137, + "grad_norm": 0.3214251399040222, + "learning_rate": 0.001, + "loss": 2.94, + "step": 4542 + }, + { + "epoch": 0.19219054065487773, + "grad_norm": 0.4931315779685974, + "learning_rate": 0.001, + "loss": 1.9467, + "step": 4543 + }, + { + "epoch": 0.1922328454183941, + "grad_norm": 2.458348035812378, + "learning_rate": 0.001, + "loss": 2.5617, + "step": 4544 + }, + { + "epoch": 0.1922751501819105, + "grad_norm": 1.2763197422027588, + "learning_rate": 0.001, + "loss": 2.5777, + "step": 4545 + }, + { + "epoch": 0.19231745494542685, + "grad_norm": 0.2197532057762146, + "learning_rate": 0.001, + "loss": 1.8814, + "step": 4546 + }, + { + "epoch": 0.19235975970894323, + "grad_norm": 0.335099458694458, + "learning_rate": 0.001, + "loss": 1.9736, + "step": 4547 + }, + { + "epoch": 0.1924020644724596, + "grad_norm": 0.24733540415763855, + "learning_rate": 0.001, + "loss": 1.7578, + "step": 4548 + }, + { + "epoch": 0.19244436923597597, + "grad_norm": 0.2788896858692169, + "learning_rate": 0.001, + "loss": 3.6972, + "step": 4549 + }, + { + "epoch": 0.19248667399949235, + "grad_norm": 0.250205397605896, + "learning_rate": 0.001, + "loss": 2.5927, + "step": 4550 + }, + { + "epoch": 0.1925289787630087, + "grad_norm": 0.9215936660766602, + "learning_rate": 0.001, + "loss": 1.9442, + "step": 4551 + }, + { + "epoch": 0.19257128352652508, + "grad_norm": 0.2189418524503708, + "learning_rate": 0.001, + "loss": 2.3595, + "step": 4552 + }, + { + "epoch": 0.19261358829004147, + "grad_norm": 0.2297450602054596, + "learning_rate": 0.001, + "loss": 2.0018, + "step": 4553 + }, + { + "epoch": 0.19265589305355782, + "grad_norm": 0.8725435137748718, + "learning_rate": 0.001, + "loss": 2.3299, + "step": 4554 + }, + { + "epoch": 0.1926981978170742, + "grad_norm": 0.2853243350982666, + "learning_rate": 0.001, + "loss": 3.1252, + "step": 4555 + }, + { + "epoch": 0.19274050258059058, + "grad_norm": 0.26800212264060974, + "learning_rate": 0.001, + "loss": 2.5468, + "step": 4556 + }, + { + "epoch": 0.19278280734410694, + "grad_norm": 0.4624558091163635, + "learning_rate": 0.001, + "loss": 2.0941, + "step": 4557 + }, + { + "epoch": 0.19282511210762332, + "grad_norm": 0.2232256382703781, + "learning_rate": 0.001, + "loss": 2.4683, + "step": 4558 + }, + { + "epoch": 0.1928674168711397, + "grad_norm": 0.2688036859035492, + "learning_rate": 0.001, + "loss": 2.223, + "step": 4559 + }, + { + "epoch": 0.19290972163465606, + "grad_norm": 0.2234850972890854, + "learning_rate": 0.001, + "loss": 2.6623, + "step": 4560 + }, + { + "epoch": 0.19295202639817244, + "grad_norm": 0.3354385495185852, + "learning_rate": 0.001, + "loss": 3.2651, + "step": 4561 + }, + { + "epoch": 0.1929943311616888, + "grad_norm": 0.26778674125671387, + "learning_rate": 0.001, + "loss": 2.6781, + "step": 4562 + }, + { + "epoch": 0.19303663592520517, + "grad_norm": 0.2896103858947754, + "learning_rate": 0.001, + "loss": 2.2987, + "step": 4563 + }, + { + "epoch": 0.19307894068872156, + "grad_norm": 0.21023137867450714, + "learning_rate": 0.001, + "loss": 1.6636, + "step": 4564 + }, + { + "epoch": 0.1931212454522379, + "grad_norm": 0.2824184000492096, + "learning_rate": 0.001, + "loss": 2.5228, + "step": 4565 + }, + { + "epoch": 0.1931635502157543, + "grad_norm": 0.22193598747253418, + "learning_rate": 0.001, + "loss": 1.9797, + "step": 4566 + }, + { + "epoch": 0.19320585497927067, + "grad_norm": 0.26808369159698486, + "learning_rate": 0.001, + "loss": 2.533, + "step": 4567 + }, + { + "epoch": 0.19324815974278703, + "grad_norm": 0.2427438348531723, + "learning_rate": 0.001, + "loss": 2.6625, + "step": 4568 + }, + { + "epoch": 0.1932904645063034, + "grad_norm": 0.2184421867132187, + "learning_rate": 0.001, + "loss": 2.5565, + "step": 4569 + }, + { + "epoch": 0.1933327692698198, + "grad_norm": 0.4565901756286621, + "learning_rate": 0.001, + "loss": 2.0758, + "step": 4570 + }, + { + "epoch": 0.19337507403333615, + "grad_norm": 0.9183976650238037, + "learning_rate": 0.001, + "loss": 2.1397, + "step": 4571 + }, + { + "epoch": 0.19341737879685253, + "grad_norm": 0.20792505145072937, + "learning_rate": 0.001, + "loss": 2.7359, + "step": 4572 + }, + { + "epoch": 0.1934596835603689, + "grad_norm": 0.7904244661331177, + "learning_rate": 0.001, + "loss": 3.3685, + "step": 4573 + }, + { + "epoch": 0.19350198832388527, + "grad_norm": 0.2975200116634369, + "learning_rate": 0.001, + "loss": 3.3311, + "step": 4574 + }, + { + "epoch": 0.19354429308740165, + "grad_norm": 4.709847450256348, + "learning_rate": 0.001, + "loss": 2.6445, + "step": 4575 + }, + { + "epoch": 0.193586597850918, + "grad_norm": 0.23912140727043152, + "learning_rate": 0.001, + "loss": 2.272, + "step": 4576 + }, + { + "epoch": 0.19362890261443438, + "grad_norm": 0.3137105405330658, + "learning_rate": 0.001, + "loss": 2.5401, + "step": 4577 + }, + { + "epoch": 0.19367120737795077, + "grad_norm": 0.27049148082733154, + "learning_rate": 0.001, + "loss": 2.8427, + "step": 4578 + }, + { + "epoch": 0.19371351214146712, + "grad_norm": 0.3511445224285126, + "learning_rate": 0.001, + "loss": 1.8104, + "step": 4579 + }, + { + "epoch": 0.1937558169049835, + "grad_norm": 0.280909925699234, + "learning_rate": 0.001, + "loss": 1.8538, + "step": 4580 + }, + { + "epoch": 0.19379812166849988, + "grad_norm": 0.2330571413040161, + "learning_rate": 0.001, + "loss": 1.9844, + "step": 4581 + }, + { + "epoch": 0.19384042643201624, + "grad_norm": 0.2649915814399719, + "learning_rate": 0.001, + "loss": 3.0022, + "step": 4582 + }, + { + "epoch": 0.19388273119553262, + "grad_norm": 0.28710460662841797, + "learning_rate": 0.001, + "loss": 2.5591, + "step": 4583 + }, + { + "epoch": 0.193925035959049, + "grad_norm": 0.22108560800552368, + "learning_rate": 0.001, + "loss": 3.5141, + "step": 4584 + }, + { + "epoch": 0.19396734072256536, + "grad_norm": 0.21730417013168335, + "learning_rate": 0.001, + "loss": 2.453, + "step": 4585 + }, + { + "epoch": 0.19400964548608174, + "grad_norm": 0.4755695164203644, + "learning_rate": 0.001, + "loss": 2.3401, + "step": 4586 + }, + { + "epoch": 0.1940519502495981, + "grad_norm": 0.2430569976568222, + "learning_rate": 0.001, + "loss": 2.5329, + "step": 4587 + }, + { + "epoch": 0.19409425501311448, + "grad_norm": 3.246774435043335, + "learning_rate": 0.001, + "loss": 2.1003, + "step": 4588 + }, + { + "epoch": 0.19413655977663086, + "grad_norm": 0.22115357220172882, + "learning_rate": 0.001, + "loss": 2.791, + "step": 4589 + }, + { + "epoch": 0.1941788645401472, + "grad_norm": 0.3158797025680542, + "learning_rate": 0.001, + "loss": 2.5248, + "step": 4590 + }, + { + "epoch": 0.1942211693036636, + "grad_norm": 0.21238943934440613, + "learning_rate": 0.001, + "loss": 2.0581, + "step": 4591 + }, + { + "epoch": 0.19426347406717998, + "grad_norm": 0.2093350887298584, + "learning_rate": 0.001, + "loss": 2.4956, + "step": 4592 + }, + { + "epoch": 0.19430577883069633, + "grad_norm": 0.22821201384067535, + "learning_rate": 0.001, + "loss": 3.2084, + "step": 4593 + }, + { + "epoch": 0.1943480835942127, + "grad_norm": 0.26408034563064575, + "learning_rate": 0.001, + "loss": 2.2652, + "step": 4594 + }, + { + "epoch": 0.1943903883577291, + "grad_norm": 0.24591988325119019, + "learning_rate": 0.001, + "loss": 2.1064, + "step": 4595 + }, + { + "epoch": 0.19443269312124545, + "grad_norm": 5.068770408630371, + "learning_rate": 0.001, + "loss": 1.804, + "step": 4596 + }, + { + "epoch": 0.19447499788476183, + "grad_norm": 0.22931112349033356, + "learning_rate": 0.001, + "loss": 1.7943, + "step": 4597 + }, + { + "epoch": 0.19451730264827818, + "grad_norm": 0.5310946702957153, + "learning_rate": 0.001, + "loss": 2.0597, + "step": 4598 + }, + { + "epoch": 0.19455960741179457, + "grad_norm": 0.30972105264663696, + "learning_rate": 0.001, + "loss": 3.2119, + "step": 4599 + }, + { + "epoch": 0.19460191217531095, + "grad_norm": 0.2220851629972458, + "learning_rate": 0.001, + "loss": 2.2495, + "step": 4600 + }, + { + "epoch": 0.1946442169388273, + "grad_norm": 0.871143639087677, + "learning_rate": 0.001, + "loss": 2.6081, + "step": 4601 + }, + { + "epoch": 0.19468652170234368, + "grad_norm": 0.2440234124660492, + "learning_rate": 0.001, + "loss": 3.2444, + "step": 4602 + }, + { + "epoch": 0.19472882646586007, + "grad_norm": 15.243217468261719, + "learning_rate": 0.001, + "loss": 2.0989, + "step": 4603 + }, + { + "epoch": 0.19477113122937642, + "grad_norm": 0.22179925441741943, + "learning_rate": 0.001, + "loss": 2.6825, + "step": 4604 + }, + { + "epoch": 0.1948134359928928, + "grad_norm": 0.2920258641242981, + "learning_rate": 0.001, + "loss": 1.8067, + "step": 4605 + }, + { + "epoch": 0.19485574075640918, + "grad_norm": 0.45571228861808777, + "learning_rate": 0.001, + "loss": 2.4332, + "step": 4606 + }, + { + "epoch": 0.19489804551992554, + "grad_norm": 0.21809923648834229, + "learning_rate": 0.001, + "loss": 1.9698, + "step": 4607 + }, + { + "epoch": 0.19494035028344192, + "grad_norm": 0.22986778616905212, + "learning_rate": 0.001, + "loss": 2.3261, + "step": 4608 + }, + { + "epoch": 0.19498265504695828, + "grad_norm": 0.2756772041320801, + "learning_rate": 0.001, + "loss": 3.1784, + "step": 4609 + }, + { + "epoch": 0.19502495981047466, + "grad_norm": 0.20755811035633087, + "learning_rate": 0.001, + "loss": 2.1669, + "step": 4610 + }, + { + "epoch": 0.19506726457399104, + "grad_norm": 0.33731144666671753, + "learning_rate": 0.001, + "loss": 2.3201, + "step": 4611 + }, + { + "epoch": 0.1951095693375074, + "grad_norm": 0.35516980290412903, + "learning_rate": 0.001, + "loss": 2.119, + "step": 4612 + }, + { + "epoch": 0.19515187410102378, + "grad_norm": 0.21821679174900055, + "learning_rate": 0.001, + "loss": 3.3628, + "step": 4613 + }, + { + "epoch": 0.19519417886454016, + "grad_norm": 0.23565897345542908, + "learning_rate": 0.001, + "loss": 2.407, + "step": 4614 + }, + { + "epoch": 0.1952364836280565, + "grad_norm": 0.19071324169635773, + "learning_rate": 0.001, + "loss": 2.0236, + "step": 4615 + }, + { + "epoch": 0.1952787883915729, + "grad_norm": 0.21241721510887146, + "learning_rate": 0.001, + "loss": 1.9463, + "step": 4616 + }, + { + "epoch": 0.19532109315508928, + "grad_norm": 0.21527446806430817, + "learning_rate": 0.001, + "loss": 2.137, + "step": 4617 + }, + { + "epoch": 0.19536339791860563, + "grad_norm": 0.21654748916625977, + "learning_rate": 0.001, + "loss": 2.9063, + "step": 4618 + }, + { + "epoch": 0.195405702682122, + "grad_norm": 0.26138582825660706, + "learning_rate": 0.001, + "loss": 2.4052, + "step": 4619 + }, + { + "epoch": 0.19544800744563837, + "grad_norm": 0.2170308381319046, + "learning_rate": 0.001, + "loss": 1.9937, + "step": 4620 + }, + { + "epoch": 0.19549031220915475, + "grad_norm": 1.4368832111358643, + "learning_rate": 0.001, + "loss": 2.1986, + "step": 4621 + }, + { + "epoch": 0.19553261697267113, + "grad_norm": 0.21207021176815033, + "learning_rate": 0.001, + "loss": 1.8493, + "step": 4622 + }, + { + "epoch": 0.19557492173618748, + "grad_norm": 0.3097752034664154, + "learning_rate": 0.001, + "loss": 1.6055, + "step": 4623 + }, + { + "epoch": 0.19561722649970387, + "grad_norm": 0.2724337875843048, + "learning_rate": 0.001, + "loss": 2.3367, + "step": 4624 + }, + { + "epoch": 0.19565953126322025, + "grad_norm": 0.21333764493465424, + "learning_rate": 0.001, + "loss": 1.6781, + "step": 4625 + }, + { + "epoch": 0.1957018360267366, + "grad_norm": 0.30645281076431274, + "learning_rate": 0.001, + "loss": 2.288, + "step": 4626 + }, + { + "epoch": 0.19574414079025299, + "grad_norm": 0.2575608193874359, + "learning_rate": 0.001, + "loss": 2.4528, + "step": 4627 + }, + { + "epoch": 0.19578644555376937, + "grad_norm": 0.21486923098564148, + "learning_rate": 0.001, + "loss": 1.9437, + "step": 4628 + }, + { + "epoch": 0.19582875031728572, + "grad_norm": 0.24129249155521393, + "learning_rate": 0.001, + "loss": 3.5545, + "step": 4629 + }, + { + "epoch": 0.1958710550808021, + "grad_norm": 0.3026060461997986, + "learning_rate": 0.001, + "loss": 2.1121, + "step": 4630 + }, + { + "epoch": 0.19591335984431846, + "grad_norm": 0.2635842561721802, + "learning_rate": 0.001, + "loss": 2.1426, + "step": 4631 + }, + { + "epoch": 0.19595566460783484, + "grad_norm": 0.38324952125549316, + "learning_rate": 0.001, + "loss": 2.0874, + "step": 4632 + }, + { + "epoch": 0.19599796937135122, + "grad_norm": 0.6430294513702393, + "learning_rate": 0.001, + "loss": 2.474, + "step": 4633 + }, + { + "epoch": 0.19604027413486758, + "grad_norm": 0.2072354555130005, + "learning_rate": 0.001, + "loss": 3.0056, + "step": 4634 + }, + { + "epoch": 0.19608257889838396, + "grad_norm": 0.22297710180282593, + "learning_rate": 0.001, + "loss": 2.1925, + "step": 4635 + }, + { + "epoch": 0.19612488366190034, + "grad_norm": 2.166142702102661, + "learning_rate": 0.001, + "loss": 2.1752, + "step": 4636 + }, + { + "epoch": 0.1961671884254167, + "grad_norm": 0.21612171828746796, + "learning_rate": 0.001, + "loss": 3.9448, + "step": 4637 + }, + { + "epoch": 0.19620949318893308, + "grad_norm": 0.2798874080181122, + "learning_rate": 0.001, + "loss": 2.152, + "step": 4638 + }, + { + "epoch": 0.19625179795244946, + "grad_norm": 0.29378223419189453, + "learning_rate": 0.001, + "loss": 1.913, + "step": 4639 + }, + { + "epoch": 0.1962941027159658, + "grad_norm": 0.37038248777389526, + "learning_rate": 0.001, + "loss": 2.5038, + "step": 4640 + }, + { + "epoch": 0.1963364074794822, + "grad_norm": 0.2652187645435333, + "learning_rate": 0.001, + "loss": 2.1267, + "step": 4641 + }, + { + "epoch": 0.19637871224299855, + "grad_norm": 0.27471134066581726, + "learning_rate": 0.001, + "loss": 2.1653, + "step": 4642 + }, + { + "epoch": 0.19642101700651493, + "grad_norm": 0.19736327230930328, + "learning_rate": 0.001, + "loss": 2.9415, + "step": 4643 + }, + { + "epoch": 0.1964633217700313, + "grad_norm": 0.22357088327407837, + "learning_rate": 0.001, + "loss": 3.3843, + "step": 4644 + }, + { + "epoch": 0.19650562653354767, + "grad_norm": 0.22968696057796478, + "learning_rate": 0.001, + "loss": 2.0118, + "step": 4645 + }, + { + "epoch": 0.19654793129706405, + "grad_norm": 1.8765842914581299, + "learning_rate": 0.001, + "loss": 2.4737, + "step": 4646 + }, + { + "epoch": 0.19659023606058043, + "grad_norm": 0.41798344254493713, + "learning_rate": 0.001, + "loss": 2.6721, + "step": 4647 + }, + { + "epoch": 0.19663254082409679, + "grad_norm": 0.20289246737957, + "learning_rate": 0.001, + "loss": 3.4892, + "step": 4648 + }, + { + "epoch": 0.19667484558761317, + "grad_norm": 0.2355775684118271, + "learning_rate": 0.001, + "loss": 1.8723, + "step": 4649 + }, + { + "epoch": 0.19671715035112955, + "grad_norm": 0.2144707590341568, + "learning_rate": 0.001, + "loss": 2.787, + "step": 4650 + }, + { + "epoch": 0.1967594551146459, + "grad_norm": 0.23321262001991272, + "learning_rate": 0.001, + "loss": 1.7408, + "step": 4651 + }, + { + "epoch": 0.19680175987816229, + "grad_norm": 0.21460668742656708, + "learning_rate": 0.001, + "loss": 1.8128, + "step": 4652 + }, + { + "epoch": 0.19684406464167864, + "grad_norm": 0.6912943124771118, + "learning_rate": 0.001, + "loss": 2.8737, + "step": 4653 + }, + { + "epoch": 0.19688636940519502, + "grad_norm": 0.24109399318695068, + "learning_rate": 0.001, + "loss": 2.2734, + "step": 4654 + }, + { + "epoch": 0.1969286741687114, + "grad_norm": 0.47939634323120117, + "learning_rate": 0.001, + "loss": 1.6569, + "step": 4655 + }, + { + "epoch": 0.19697097893222776, + "grad_norm": 2.933166742324829, + "learning_rate": 0.001, + "loss": 2.6041, + "step": 4656 + }, + { + "epoch": 0.19701328369574414, + "grad_norm": 0.8214706778526306, + "learning_rate": 0.001, + "loss": 2.518, + "step": 4657 + }, + { + "epoch": 0.19705558845926052, + "grad_norm": 0.2619834542274475, + "learning_rate": 0.001, + "loss": 3.2975, + "step": 4658 + }, + { + "epoch": 0.19709789322277688, + "grad_norm": 0.3553191125392914, + "learning_rate": 0.001, + "loss": 3.8384, + "step": 4659 + }, + { + "epoch": 0.19714019798629326, + "grad_norm": 0.1962711215019226, + "learning_rate": 0.001, + "loss": 1.984, + "step": 4660 + }, + { + "epoch": 0.19718250274980964, + "grad_norm": 0.2404264360666275, + "learning_rate": 0.001, + "loss": 2.4286, + "step": 4661 + }, + { + "epoch": 0.197224807513326, + "grad_norm": 0.24695684015750885, + "learning_rate": 0.001, + "loss": 2.8653, + "step": 4662 + }, + { + "epoch": 0.19726711227684238, + "grad_norm": 20.713294982910156, + "learning_rate": 0.001, + "loss": 2.2609, + "step": 4663 + }, + { + "epoch": 0.19730941704035873, + "grad_norm": 0.25775420665740967, + "learning_rate": 0.001, + "loss": 2.2937, + "step": 4664 + }, + { + "epoch": 0.1973517218038751, + "grad_norm": 0.24397379159927368, + "learning_rate": 0.001, + "loss": 2.0571, + "step": 4665 + }, + { + "epoch": 0.1973940265673915, + "grad_norm": 0.38282090425491333, + "learning_rate": 0.001, + "loss": 2.2702, + "step": 4666 + }, + { + "epoch": 0.19743633133090785, + "grad_norm": 0.23603591322898865, + "learning_rate": 0.001, + "loss": 2.01, + "step": 4667 + }, + { + "epoch": 0.19747863609442423, + "grad_norm": 0.2705064117908478, + "learning_rate": 0.001, + "loss": 2.6685, + "step": 4668 + }, + { + "epoch": 0.1975209408579406, + "grad_norm": 3.485954999923706, + "learning_rate": 0.001, + "loss": 3.6868, + "step": 4669 + }, + { + "epoch": 0.19756324562145697, + "grad_norm": 0.3059000074863434, + "learning_rate": 0.001, + "loss": 2.5588, + "step": 4670 + }, + { + "epoch": 0.19760555038497335, + "grad_norm": 2.187786102294922, + "learning_rate": 0.001, + "loss": 3.4677, + "step": 4671 + }, + { + "epoch": 0.19764785514848973, + "grad_norm": 0.25730597972869873, + "learning_rate": 0.001, + "loss": 2.3108, + "step": 4672 + }, + { + "epoch": 0.19769015991200609, + "grad_norm": 0.2296760380268097, + "learning_rate": 0.001, + "loss": 1.5381, + "step": 4673 + }, + { + "epoch": 0.19773246467552247, + "grad_norm": 0.31700170040130615, + "learning_rate": 0.001, + "loss": 3.6578, + "step": 4674 + }, + { + "epoch": 0.19777476943903882, + "grad_norm": 0.43023067712783813, + "learning_rate": 0.001, + "loss": 3.3044, + "step": 4675 + }, + { + "epoch": 0.1978170742025552, + "grad_norm": 1.113132357597351, + "learning_rate": 0.001, + "loss": 3.9424, + "step": 4676 + }, + { + "epoch": 0.1978593789660716, + "grad_norm": 0.2837026119232178, + "learning_rate": 0.001, + "loss": 2.3631, + "step": 4677 + }, + { + "epoch": 0.19790168372958794, + "grad_norm": 0.29237067699432373, + "learning_rate": 0.001, + "loss": 2.443, + "step": 4678 + }, + { + "epoch": 0.19794398849310432, + "grad_norm": 0.3563275635242462, + "learning_rate": 0.001, + "loss": 2.2434, + "step": 4679 + }, + { + "epoch": 0.1979862932566207, + "grad_norm": 1.2392581701278687, + "learning_rate": 0.001, + "loss": 3.6908, + "step": 4680 + }, + { + "epoch": 0.19802859802013706, + "grad_norm": 2.4038748741149902, + "learning_rate": 0.001, + "loss": 1.984, + "step": 4681 + }, + { + "epoch": 0.19807090278365344, + "grad_norm": 0.44923508167266846, + "learning_rate": 0.001, + "loss": 2.7697, + "step": 4682 + }, + { + "epoch": 0.19811320754716982, + "grad_norm": 0.3151043951511383, + "learning_rate": 0.001, + "loss": 2.9882, + "step": 4683 + }, + { + "epoch": 0.19815551231068618, + "grad_norm": 0.46017053723335266, + "learning_rate": 0.001, + "loss": 1.9041, + "step": 4684 + }, + { + "epoch": 0.19819781707420256, + "grad_norm": 0.3234018385410309, + "learning_rate": 0.001, + "loss": 1.8249, + "step": 4685 + }, + { + "epoch": 0.1982401218377189, + "grad_norm": 0.3335753083229065, + "learning_rate": 0.001, + "loss": 2.3614, + "step": 4686 + }, + { + "epoch": 0.1982824266012353, + "grad_norm": 0.382220059633255, + "learning_rate": 0.001, + "loss": 2.4, + "step": 4687 + }, + { + "epoch": 0.19832473136475168, + "grad_norm": 0.9957152605056763, + "learning_rate": 0.001, + "loss": 3.0074, + "step": 4688 + }, + { + "epoch": 0.19836703612826803, + "grad_norm": 0.2819012701511383, + "learning_rate": 0.001, + "loss": 3.5457, + "step": 4689 + }, + { + "epoch": 0.1984093408917844, + "grad_norm": 2.3886940479278564, + "learning_rate": 0.001, + "loss": 3.4807, + "step": 4690 + }, + { + "epoch": 0.1984516456553008, + "grad_norm": 0.35311177372932434, + "learning_rate": 0.001, + "loss": 3.1578, + "step": 4691 + }, + { + "epoch": 0.19849395041881715, + "grad_norm": 1.2206075191497803, + "learning_rate": 0.001, + "loss": 2.128, + "step": 4692 + }, + { + "epoch": 0.19853625518233353, + "grad_norm": 5.357852458953857, + "learning_rate": 0.001, + "loss": 2.2252, + "step": 4693 + }, + { + "epoch": 0.19857855994584991, + "grad_norm": 0.4290313422679901, + "learning_rate": 0.001, + "loss": 2.0768, + "step": 4694 + }, + { + "epoch": 0.19862086470936627, + "grad_norm": 0.22791223227977753, + "learning_rate": 0.001, + "loss": 1.7422, + "step": 4695 + }, + { + "epoch": 0.19866316947288265, + "grad_norm": 0.24464336037635803, + "learning_rate": 0.001, + "loss": 2.6629, + "step": 4696 + }, + { + "epoch": 0.19870547423639903, + "grad_norm": 1.3040649890899658, + "learning_rate": 0.001, + "loss": 2.1108, + "step": 4697 + }, + { + "epoch": 0.1987477789999154, + "grad_norm": 0.5187906622886658, + "learning_rate": 0.001, + "loss": 2.7388, + "step": 4698 + }, + { + "epoch": 0.19879008376343177, + "grad_norm": 0.2980749011039734, + "learning_rate": 0.001, + "loss": 3.4038, + "step": 4699 + }, + { + "epoch": 0.19883238852694812, + "grad_norm": 0.3226117193698883, + "learning_rate": 0.001, + "loss": 2.8376, + "step": 4700 + }, + { + "epoch": 0.1988746932904645, + "grad_norm": 0.2342327982187271, + "learning_rate": 0.001, + "loss": 1.9718, + "step": 4701 + }, + { + "epoch": 0.1989169980539809, + "grad_norm": 0.23881351947784424, + "learning_rate": 0.001, + "loss": 3.1819, + "step": 4702 + }, + { + "epoch": 0.19895930281749724, + "grad_norm": 0.1929883062839508, + "learning_rate": 0.001, + "loss": 2.0568, + "step": 4703 + }, + { + "epoch": 0.19900160758101362, + "grad_norm": 0.25098490715026855, + "learning_rate": 0.001, + "loss": 2.0162, + "step": 4704 + }, + { + "epoch": 0.19904391234453, + "grad_norm": 0.25516989827156067, + "learning_rate": 0.001, + "loss": 2.6734, + "step": 4705 + }, + { + "epoch": 0.19908621710804636, + "grad_norm": 0.2264355570077896, + "learning_rate": 0.001, + "loss": 2.4311, + "step": 4706 + }, + { + "epoch": 0.19912852187156274, + "grad_norm": 0.18645042181015015, + "learning_rate": 0.001, + "loss": 1.6494, + "step": 4707 + }, + { + "epoch": 0.19917082663507912, + "grad_norm": 9.609095573425293, + "learning_rate": 0.001, + "loss": 2.3706, + "step": 4708 + }, + { + "epoch": 0.19921313139859548, + "grad_norm": 0.2301723212003708, + "learning_rate": 0.001, + "loss": 2.3175, + "step": 4709 + }, + { + "epoch": 0.19925543616211186, + "grad_norm": 0.30773094296455383, + "learning_rate": 0.001, + "loss": 3.0642, + "step": 4710 + }, + { + "epoch": 0.19929774092562821, + "grad_norm": 0.48115605115890503, + "learning_rate": 0.001, + "loss": 2.8732, + "step": 4711 + }, + { + "epoch": 0.1993400456891446, + "grad_norm": 0.20981243252754211, + "learning_rate": 0.001, + "loss": 2.5917, + "step": 4712 + }, + { + "epoch": 0.19938235045266098, + "grad_norm": 0.22592686116695404, + "learning_rate": 0.001, + "loss": 2.8297, + "step": 4713 + }, + { + "epoch": 0.19942465521617733, + "grad_norm": 0.27815452218055725, + "learning_rate": 0.001, + "loss": 2.3364, + "step": 4714 + }, + { + "epoch": 0.19946695997969371, + "grad_norm": 0.23436151444911957, + "learning_rate": 0.001, + "loss": 2.8687, + "step": 4715 + }, + { + "epoch": 0.1995092647432101, + "grad_norm": 1.096976637840271, + "learning_rate": 0.001, + "loss": 2.2385, + "step": 4716 + }, + { + "epoch": 0.19955156950672645, + "grad_norm": 0.22871260344982147, + "learning_rate": 0.001, + "loss": 2.3228, + "step": 4717 + }, + { + "epoch": 0.19959387427024283, + "grad_norm": 0.7824380397796631, + "learning_rate": 0.001, + "loss": 2.2027, + "step": 4718 + }, + { + "epoch": 0.19963617903375921, + "grad_norm": 0.20783767104148865, + "learning_rate": 0.001, + "loss": 1.4938, + "step": 4719 + }, + { + "epoch": 0.19967848379727557, + "grad_norm": 0.25429844856262207, + "learning_rate": 0.001, + "loss": 1.8029, + "step": 4720 + }, + { + "epoch": 0.19972078856079195, + "grad_norm": 0.37199491262435913, + "learning_rate": 0.001, + "loss": 2.5307, + "step": 4721 + }, + { + "epoch": 0.1997630933243083, + "grad_norm": 0.258245587348938, + "learning_rate": 0.001, + "loss": 2.2374, + "step": 4722 + }, + { + "epoch": 0.1998053980878247, + "grad_norm": 0.2625525891780853, + "learning_rate": 0.001, + "loss": 3.0043, + "step": 4723 + }, + { + "epoch": 0.19984770285134107, + "grad_norm": 6.895899772644043, + "learning_rate": 0.001, + "loss": 2.5071, + "step": 4724 + }, + { + "epoch": 0.19989000761485742, + "grad_norm": 0.5165795087814331, + "learning_rate": 0.001, + "loss": 2.563, + "step": 4725 + }, + { + "epoch": 0.1999323123783738, + "grad_norm": 1.534584879875183, + "learning_rate": 0.001, + "loss": 2.2309, + "step": 4726 + }, + { + "epoch": 0.1999746171418902, + "grad_norm": 3.5470550060272217, + "learning_rate": 0.001, + "loss": 2.8135, + "step": 4727 + }, + { + "epoch": 0.20001692190540654, + "grad_norm": 0.46171459555625916, + "learning_rate": 0.001, + "loss": 2.4945, + "step": 4728 + }, + { + "epoch": 0.20005922666892292, + "grad_norm": 0.2912849485874176, + "learning_rate": 0.001, + "loss": 2.1541, + "step": 4729 + }, + { + "epoch": 0.2001015314324393, + "grad_norm": 0.2877223491668701, + "learning_rate": 0.001, + "loss": 2.4918, + "step": 4730 + }, + { + "epoch": 0.20014383619595566, + "grad_norm": 0.28247973322868347, + "learning_rate": 0.001, + "loss": 3.4501, + "step": 4731 + }, + { + "epoch": 0.20018614095947204, + "grad_norm": 0.24483604729175568, + "learning_rate": 0.001, + "loss": 3.568, + "step": 4732 + }, + { + "epoch": 0.2002284457229884, + "grad_norm": 0.2491256296634674, + "learning_rate": 0.001, + "loss": 2.456, + "step": 4733 + }, + { + "epoch": 0.20027075048650478, + "grad_norm": 0.21148993074893951, + "learning_rate": 0.001, + "loss": 1.9497, + "step": 4734 + }, + { + "epoch": 0.20031305525002116, + "grad_norm": 0.24428227543830872, + "learning_rate": 0.001, + "loss": 2.6689, + "step": 4735 + }, + { + "epoch": 0.20035536001353751, + "grad_norm": 0.21488066017627716, + "learning_rate": 0.001, + "loss": 2.897, + "step": 4736 + }, + { + "epoch": 0.2003976647770539, + "grad_norm": 0.2354801744222641, + "learning_rate": 0.001, + "loss": 2.4959, + "step": 4737 + }, + { + "epoch": 0.20043996954057028, + "grad_norm": 0.3543696403503418, + "learning_rate": 0.001, + "loss": 2.3337, + "step": 4738 + }, + { + "epoch": 0.20048227430408663, + "grad_norm": 0.21295201778411865, + "learning_rate": 0.001, + "loss": 2.2182, + "step": 4739 + }, + { + "epoch": 0.20052457906760301, + "grad_norm": 0.2649983763694763, + "learning_rate": 0.001, + "loss": 2.0836, + "step": 4740 + }, + { + "epoch": 0.2005668838311194, + "grad_norm": 0.4929693341255188, + "learning_rate": 0.001, + "loss": 2.1557, + "step": 4741 + }, + { + "epoch": 0.20060918859463575, + "grad_norm": 0.24373185634613037, + "learning_rate": 0.001, + "loss": 2.2373, + "step": 4742 + }, + { + "epoch": 0.20065149335815213, + "grad_norm": 0.2782137095928192, + "learning_rate": 0.001, + "loss": 2.0663, + "step": 4743 + }, + { + "epoch": 0.2006937981216685, + "grad_norm": 0.9865705370903015, + "learning_rate": 0.001, + "loss": 2.4, + "step": 4744 + }, + { + "epoch": 0.20073610288518487, + "grad_norm": 0.24436363577842712, + "learning_rate": 0.001, + "loss": 3.0016, + "step": 4745 + }, + { + "epoch": 0.20077840764870125, + "grad_norm": 0.22285853326320648, + "learning_rate": 0.001, + "loss": 2.4025, + "step": 4746 + }, + { + "epoch": 0.2008207124122176, + "grad_norm": 0.2801361083984375, + "learning_rate": 0.001, + "loss": 3.0465, + "step": 4747 + }, + { + "epoch": 0.200863017175734, + "grad_norm": 1.4615576267242432, + "learning_rate": 0.001, + "loss": 1.813, + "step": 4748 + }, + { + "epoch": 0.20090532193925037, + "grad_norm": 0.21313884854316711, + "learning_rate": 0.001, + "loss": 1.858, + "step": 4749 + }, + { + "epoch": 0.20094762670276672, + "grad_norm": 0.7325378060340881, + "learning_rate": 0.001, + "loss": 3.6507, + "step": 4750 + }, + { + "epoch": 0.2009899314662831, + "grad_norm": 102.32513427734375, + "learning_rate": 0.001, + "loss": 3.3369, + "step": 4751 + }, + { + "epoch": 0.2010322362297995, + "grad_norm": 0.3230404555797577, + "learning_rate": 0.001, + "loss": 3.2636, + "step": 4752 + }, + { + "epoch": 0.20107454099331584, + "grad_norm": 0.27479425072669983, + "learning_rate": 0.001, + "loss": 2.8126, + "step": 4753 + }, + { + "epoch": 0.20111684575683222, + "grad_norm": 1.0016082525253296, + "learning_rate": 0.001, + "loss": 3.3368, + "step": 4754 + }, + { + "epoch": 0.20115915052034858, + "grad_norm": 2.433990478515625, + "learning_rate": 0.001, + "loss": 2.5722, + "step": 4755 + }, + { + "epoch": 0.20120145528386496, + "grad_norm": 0.29447877407073975, + "learning_rate": 0.001, + "loss": 2.011, + "step": 4756 + }, + { + "epoch": 0.20124376004738134, + "grad_norm": 0.23931360244750977, + "learning_rate": 0.001, + "loss": 3.1154, + "step": 4757 + }, + { + "epoch": 0.2012860648108977, + "grad_norm": 0.24119767546653748, + "learning_rate": 0.001, + "loss": 2.6907, + "step": 4758 + }, + { + "epoch": 0.20132836957441408, + "grad_norm": 0.2806399166584015, + "learning_rate": 0.001, + "loss": 2.435, + "step": 4759 + }, + { + "epoch": 0.20137067433793046, + "grad_norm": 0.2589947581291199, + "learning_rate": 0.001, + "loss": 3.0918, + "step": 4760 + }, + { + "epoch": 0.20141297910144682, + "grad_norm": 0.2585409879684448, + "learning_rate": 0.001, + "loss": 1.6908, + "step": 4761 + }, + { + "epoch": 0.2014552838649632, + "grad_norm": 0.4902653992176056, + "learning_rate": 0.001, + "loss": 2.5211, + "step": 4762 + }, + { + "epoch": 0.20149758862847958, + "grad_norm": 0.2097211331129074, + "learning_rate": 0.001, + "loss": 1.7957, + "step": 4763 + }, + { + "epoch": 0.20153989339199593, + "grad_norm": 0.26205360889434814, + "learning_rate": 0.001, + "loss": 1.7048, + "step": 4764 + }, + { + "epoch": 0.20158219815551232, + "grad_norm": 1.476196527481079, + "learning_rate": 0.001, + "loss": 2.1979, + "step": 4765 + }, + { + "epoch": 0.20162450291902867, + "grad_norm": 0.26606282591819763, + "learning_rate": 0.001, + "loss": 1.9978, + "step": 4766 + }, + { + "epoch": 0.20166680768254505, + "grad_norm": 0.7488316297531128, + "learning_rate": 0.001, + "loss": 2.2828, + "step": 4767 + }, + { + "epoch": 0.20170911244606143, + "grad_norm": 0.2467147260904312, + "learning_rate": 0.001, + "loss": 2.2932, + "step": 4768 + }, + { + "epoch": 0.2017514172095778, + "grad_norm": 0.3320417106151581, + "learning_rate": 0.001, + "loss": 2.5512, + "step": 4769 + }, + { + "epoch": 0.20179372197309417, + "grad_norm": 0.827613115310669, + "learning_rate": 0.001, + "loss": 2.9978, + "step": 4770 + }, + { + "epoch": 0.20183602673661055, + "grad_norm": 0.2728056311607361, + "learning_rate": 0.001, + "loss": 2.3317, + "step": 4771 + }, + { + "epoch": 0.2018783315001269, + "grad_norm": 12.807820320129395, + "learning_rate": 0.001, + "loss": 3.0231, + "step": 4772 + }, + { + "epoch": 0.2019206362636433, + "grad_norm": 0.25953209400177, + "learning_rate": 0.001, + "loss": 2.5744, + "step": 4773 + }, + { + "epoch": 0.20196294102715967, + "grad_norm": 0.2514730393886566, + "learning_rate": 0.001, + "loss": 1.816, + "step": 4774 + }, + { + "epoch": 0.20200524579067602, + "grad_norm": 0.3252697288990021, + "learning_rate": 0.001, + "loss": 1.7334, + "step": 4775 + }, + { + "epoch": 0.2020475505541924, + "grad_norm": 0.24858559668064117, + "learning_rate": 0.001, + "loss": 1.9274, + "step": 4776 + }, + { + "epoch": 0.20208985531770876, + "grad_norm": 0.24600395560264587, + "learning_rate": 0.001, + "loss": 2.0192, + "step": 4777 + }, + { + "epoch": 0.20213216008122514, + "grad_norm": 0.2425483614206314, + "learning_rate": 0.001, + "loss": 1.9709, + "step": 4778 + }, + { + "epoch": 0.20217446484474152, + "grad_norm": 0.30808165669441223, + "learning_rate": 0.001, + "loss": 3.5698, + "step": 4779 + }, + { + "epoch": 0.20221676960825788, + "grad_norm": 0.23074059188365936, + "learning_rate": 0.001, + "loss": 2.4182, + "step": 4780 + }, + { + "epoch": 0.20225907437177426, + "grad_norm": 0.22748015820980072, + "learning_rate": 0.001, + "loss": 2.067, + "step": 4781 + }, + { + "epoch": 0.20230137913529064, + "grad_norm": 0.23154856264591217, + "learning_rate": 0.001, + "loss": 1.9049, + "step": 4782 + }, + { + "epoch": 0.202343683898807, + "grad_norm": 0.23665709793567657, + "learning_rate": 0.001, + "loss": 2.272, + "step": 4783 + }, + { + "epoch": 0.20238598866232338, + "grad_norm": 0.20455607771873474, + "learning_rate": 0.001, + "loss": 2.1121, + "step": 4784 + }, + { + "epoch": 0.20242829342583976, + "grad_norm": 0.4246728718280792, + "learning_rate": 0.001, + "loss": 2.6021, + "step": 4785 + }, + { + "epoch": 0.20247059818935612, + "grad_norm": 0.235957533121109, + "learning_rate": 0.001, + "loss": 3.1363, + "step": 4786 + }, + { + "epoch": 0.2025129029528725, + "grad_norm": 0.20879578590393066, + "learning_rate": 0.001, + "loss": 2.4947, + "step": 4787 + }, + { + "epoch": 0.20255520771638885, + "grad_norm": 0.22662273049354553, + "learning_rate": 0.001, + "loss": 2.2168, + "step": 4788 + }, + { + "epoch": 0.20259751247990523, + "grad_norm": 0.39572110772132874, + "learning_rate": 0.001, + "loss": 2.7348, + "step": 4789 + }, + { + "epoch": 0.20263981724342162, + "grad_norm": 0.19888943433761597, + "learning_rate": 0.001, + "loss": 2.4301, + "step": 4790 + }, + { + "epoch": 0.20268212200693797, + "grad_norm": 0.2209424376487732, + "learning_rate": 0.001, + "loss": 2.1574, + "step": 4791 + }, + { + "epoch": 0.20272442677045435, + "grad_norm": 0.25813013315200806, + "learning_rate": 0.001, + "loss": 2.2174, + "step": 4792 + }, + { + "epoch": 0.20276673153397073, + "grad_norm": 0.28615349531173706, + "learning_rate": 0.001, + "loss": 2.5578, + "step": 4793 + }, + { + "epoch": 0.2028090362974871, + "grad_norm": 0.25967228412628174, + "learning_rate": 0.001, + "loss": 2.1363, + "step": 4794 + }, + { + "epoch": 0.20285134106100347, + "grad_norm": 0.6237789988517761, + "learning_rate": 0.001, + "loss": 2.4102, + "step": 4795 + }, + { + "epoch": 0.20289364582451985, + "grad_norm": 0.20853352546691895, + "learning_rate": 0.001, + "loss": 2.2166, + "step": 4796 + }, + { + "epoch": 0.2029359505880362, + "grad_norm": 0.24653717875480652, + "learning_rate": 0.001, + "loss": 2.9338, + "step": 4797 + }, + { + "epoch": 0.2029782553515526, + "grad_norm": 0.4108681082725525, + "learning_rate": 0.001, + "loss": 1.4424, + "step": 4798 + }, + { + "epoch": 0.20302056011506894, + "grad_norm": 0.40089255571365356, + "learning_rate": 0.001, + "loss": 4.6068, + "step": 4799 + }, + { + "epoch": 0.20306286487858533, + "grad_norm": 0.24011704325675964, + "learning_rate": 0.001, + "loss": 2.9857, + "step": 4800 + }, + { + "epoch": 0.2031051696421017, + "grad_norm": 0.27986645698547363, + "learning_rate": 0.001, + "loss": 2.6674, + "step": 4801 + }, + { + "epoch": 0.20314747440561806, + "grad_norm": 0.23206351697444916, + "learning_rate": 0.001, + "loss": 2.4333, + "step": 4802 + }, + { + "epoch": 0.20318977916913444, + "grad_norm": 0.5023967623710632, + "learning_rate": 0.001, + "loss": 2.2802, + "step": 4803 + }, + { + "epoch": 0.20323208393265083, + "grad_norm": 0.4342386722564697, + "learning_rate": 0.001, + "loss": 2.8626, + "step": 4804 + }, + { + "epoch": 0.20327438869616718, + "grad_norm": 0.21476680040359497, + "learning_rate": 0.001, + "loss": 1.9667, + "step": 4805 + }, + { + "epoch": 0.20331669345968356, + "grad_norm": 0.22180813550949097, + "learning_rate": 0.001, + "loss": 1.9456, + "step": 4806 + }, + { + "epoch": 0.20335899822319994, + "grad_norm": 0.21526440978050232, + "learning_rate": 0.001, + "loss": 2.1147, + "step": 4807 + }, + { + "epoch": 0.2034013029867163, + "grad_norm": 0.22872960567474365, + "learning_rate": 0.001, + "loss": 2.3991, + "step": 4808 + }, + { + "epoch": 0.20344360775023268, + "grad_norm": 0.21979276835918427, + "learning_rate": 0.001, + "loss": 1.7426, + "step": 4809 + }, + { + "epoch": 0.20348591251374903, + "grad_norm": 0.20522305369377136, + "learning_rate": 0.001, + "loss": 2.61, + "step": 4810 + }, + { + "epoch": 0.20352821727726542, + "grad_norm": 0.24340516328811646, + "learning_rate": 0.001, + "loss": 3.118, + "step": 4811 + }, + { + "epoch": 0.2035705220407818, + "grad_norm": 0.7604936361312866, + "learning_rate": 0.001, + "loss": 2.2295, + "step": 4812 + }, + { + "epoch": 0.20361282680429815, + "grad_norm": 0.20879897475242615, + "learning_rate": 0.001, + "loss": 3.0593, + "step": 4813 + }, + { + "epoch": 0.20365513156781453, + "grad_norm": 0.22496330738067627, + "learning_rate": 0.001, + "loss": 1.967, + "step": 4814 + }, + { + "epoch": 0.20369743633133092, + "grad_norm": 0.6500147581100464, + "learning_rate": 0.001, + "loss": 2.5665, + "step": 4815 + }, + { + "epoch": 0.20373974109484727, + "grad_norm": 0.2536025941371918, + "learning_rate": 0.001, + "loss": 3.4292, + "step": 4816 + }, + { + "epoch": 0.20378204585836365, + "grad_norm": 0.25820499658584595, + "learning_rate": 0.001, + "loss": 2.0012, + "step": 4817 + }, + { + "epoch": 0.20382435062188003, + "grad_norm": 0.886145830154419, + "learning_rate": 0.001, + "loss": 1.7369, + "step": 4818 + }, + { + "epoch": 0.2038666553853964, + "grad_norm": 0.19858545064926147, + "learning_rate": 0.001, + "loss": 1.86, + "step": 4819 + }, + { + "epoch": 0.20390896014891277, + "grad_norm": 0.6188323497772217, + "learning_rate": 0.001, + "loss": 1.8931, + "step": 4820 + }, + { + "epoch": 0.20395126491242915, + "grad_norm": 0.2777750790119171, + "learning_rate": 0.001, + "loss": 3.6006, + "step": 4821 + }, + { + "epoch": 0.2039935696759455, + "grad_norm": 0.26666122674942017, + "learning_rate": 0.001, + "loss": 2.4877, + "step": 4822 + }, + { + "epoch": 0.2040358744394619, + "grad_norm": 0.2691092789173126, + "learning_rate": 0.001, + "loss": 2.1836, + "step": 4823 + }, + { + "epoch": 0.20407817920297824, + "grad_norm": 0.24200117588043213, + "learning_rate": 0.001, + "loss": 2.5001, + "step": 4824 + }, + { + "epoch": 0.20412048396649463, + "grad_norm": 0.2400779128074646, + "learning_rate": 0.001, + "loss": 1.6834, + "step": 4825 + }, + { + "epoch": 0.204162788730011, + "grad_norm": 0.23831325769424438, + "learning_rate": 0.001, + "loss": 1.8685, + "step": 4826 + }, + { + "epoch": 0.20420509349352736, + "grad_norm": 0.2239019125699997, + "learning_rate": 0.001, + "loss": 2.894, + "step": 4827 + }, + { + "epoch": 0.20424739825704374, + "grad_norm": 0.23331467807292938, + "learning_rate": 0.001, + "loss": 2.893, + "step": 4828 + }, + { + "epoch": 0.20428970302056013, + "grad_norm": 0.2696641981601715, + "learning_rate": 0.001, + "loss": 3.3553, + "step": 4829 + }, + { + "epoch": 0.20433200778407648, + "grad_norm": 0.45643630623817444, + "learning_rate": 0.001, + "loss": 2.6336, + "step": 4830 + }, + { + "epoch": 0.20437431254759286, + "grad_norm": 0.5224469900131226, + "learning_rate": 0.001, + "loss": 2.7963, + "step": 4831 + }, + { + "epoch": 0.20441661731110924, + "grad_norm": 0.20430107414722443, + "learning_rate": 0.001, + "loss": 2.001, + "step": 4832 + }, + { + "epoch": 0.2044589220746256, + "grad_norm": 0.47029760479927063, + "learning_rate": 0.001, + "loss": 1.7341, + "step": 4833 + }, + { + "epoch": 0.20450122683814198, + "grad_norm": 0.20866377651691437, + "learning_rate": 0.001, + "loss": 2.1228, + "step": 4834 + }, + { + "epoch": 0.20454353160165833, + "grad_norm": 0.23199744522571564, + "learning_rate": 0.001, + "loss": 1.6639, + "step": 4835 + }, + { + "epoch": 0.20458583636517472, + "grad_norm": 0.23351909220218658, + "learning_rate": 0.001, + "loss": 2.9277, + "step": 4836 + }, + { + "epoch": 0.2046281411286911, + "grad_norm": 0.2393893003463745, + "learning_rate": 0.001, + "loss": 2.4585, + "step": 4837 + }, + { + "epoch": 0.20467044589220745, + "grad_norm": 0.19761449098587036, + "learning_rate": 0.001, + "loss": 1.9734, + "step": 4838 + }, + { + "epoch": 0.20471275065572384, + "grad_norm": 0.45381811261177063, + "learning_rate": 0.001, + "loss": 2.3705, + "step": 4839 + }, + { + "epoch": 0.20475505541924022, + "grad_norm": 0.25361669063568115, + "learning_rate": 0.001, + "loss": 2.7643, + "step": 4840 + }, + { + "epoch": 0.20479736018275657, + "grad_norm": 0.23903019726276398, + "learning_rate": 0.001, + "loss": 2.1605, + "step": 4841 + }, + { + "epoch": 0.20483966494627295, + "grad_norm": 1.0079925060272217, + "learning_rate": 0.001, + "loss": 2.5728, + "step": 4842 + }, + { + "epoch": 0.20488196970978934, + "grad_norm": 0.22258150577545166, + "learning_rate": 0.001, + "loss": 2.7058, + "step": 4843 + }, + { + "epoch": 0.2049242744733057, + "grad_norm": 0.22821152210235596, + "learning_rate": 0.001, + "loss": 1.7562, + "step": 4844 + }, + { + "epoch": 0.20496657923682207, + "grad_norm": 0.29148948192596436, + "learning_rate": 0.001, + "loss": 2.5153, + "step": 4845 + }, + { + "epoch": 0.20500888400033843, + "grad_norm": 0.37664884328842163, + "learning_rate": 0.001, + "loss": 2.9711, + "step": 4846 + }, + { + "epoch": 0.2050511887638548, + "grad_norm": 0.2692352533340454, + "learning_rate": 0.001, + "loss": 2.3962, + "step": 4847 + }, + { + "epoch": 0.2050934935273712, + "grad_norm": 0.2488355189561844, + "learning_rate": 0.001, + "loss": 2.6436, + "step": 4848 + }, + { + "epoch": 0.20513579829088754, + "grad_norm": 33.88499069213867, + "learning_rate": 0.001, + "loss": 1.9808, + "step": 4849 + }, + { + "epoch": 0.20517810305440393, + "grad_norm": 0.24988500773906708, + "learning_rate": 0.001, + "loss": 3.2433, + "step": 4850 + }, + { + "epoch": 0.2052204078179203, + "grad_norm": 0.2517193853855133, + "learning_rate": 0.001, + "loss": 2.9665, + "step": 4851 + }, + { + "epoch": 0.20526271258143666, + "grad_norm": 0.38323888182640076, + "learning_rate": 0.001, + "loss": 1.9108, + "step": 4852 + }, + { + "epoch": 0.20530501734495304, + "grad_norm": 0.21491578221321106, + "learning_rate": 0.001, + "loss": 1.8844, + "step": 4853 + }, + { + "epoch": 0.20534732210846943, + "grad_norm": 0.23121948540210724, + "learning_rate": 0.001, + "loss": 2.4052, + "step": 4854 + }, + { + "epoch": 0.20538962687198578, + "grad_norm": 1.7590092420578003, + "learning_rate": 0.001, + "loss": 2.9853, + "step": 4855 + }, + { + "epoch": 0.20543193163550216, + "grad_norm": 0.3250569701194763, + "learning_rate": 0.001, + "loss": 2.9398, + "step": 4856 + }, + { + "epoch": 0.20547423639901852, + "grad_norm": 0.5734913349151611, + "learning_rate": 0.001, + "loss": 1.8469, + "step": 4857 + }, + { + "epoch": 0.2055165411625349, + "grad_norm": 0.2421615719795227, + "learning_rate": 0.001, + "loss": 2.0313, + "step": 4858 + }, + { + "epoch": 0.20555884592605128, + "grad_norm": 0.2648211121559143, + "learning_rate": 0.001, + "loss": 2.3516, + "step": 4859 + }, + { + "epoch": 0.20560115068956764, + "grad_norm": 0.5653280019760132, + "learning_rate": 0.001, + "loss": 1.7878, + "step": 4860 + }, + { + "epoch": 0.20564345545308402, + "grad_norm": 0.4042348861694336, + "learning_rate": 0.001, + "loss": 2.8081, + "step": 4861 + }, + { + "epoch": 0.2056857602166004, + "grad_norm": 0.25248414278030396, + "learning_rate": 0.001, + "loss": 3.5081, + "step": 4862 + }, + { + "epoch": 0.20572806498011675, + "grad_norm": 0.9863622784614563, + "learning_rate": 0.001, + "loss": 3.1157, + "step": 4863 + }, + { + "epoch": 0.20577036974363314, + "grad_norm": 0.2207251340150833, + "learning_rate": 0.001, + "loss": 2.1768, + "step": 4864 + }, + { + "epoch": 0.20581267450714952, + "grad_norm": 0.2811149060726166, + "learning_rate": 0.001, + "loss": 3.3907, + "step": 4865 + }, + { + "epoch": 0.20585497927066587, + "grad_norm": 0.9956902265548706, + "learning_rate": 0.001, + "loss": 2.2663, + "step": 4866 + }, + { + "epoch": 0.20589728403418225, + "grad_norm": 0.38394954800605774, + "learning_rate": 0.001, + "loss": 2.641, + "step": 4867 + }, + { + "epoch": 0.2059395887976986, + "grad_norm": 0.1891484558582306, + "learning_rate": 0.001, + "loss": 2.0367, + "step": 4868 + }, + { + "epoch": 0.205981893561215, + "grad_norm": 0.21494343876838684, + "learning_rate": 0.001, + "loss": 3.1822, + "step": 4869 + }, + { + "epoch": 0.20602419832473137, + "grad_norm": 0.27545684576034546, + "learning_rate": 0.001, + "loss": 3.4021, + "step": 4870 + }, + { + "epoch": 0.20606650308824773, + "grad_norm": 0.23246289789676666, + "learning_rate": 0.001, + "loss": 2.2404, + "step": 4871 + }, + { + "epoch": 0.2061088078517641, + "grad_norm": 0.826824963092804, + "learning_rate": 0.001, + "loss": 3.1329, + "step": 4872 + }, + { + "epoch": 0.2061511126152805, + "grad_norm": 0.38859671354293823, + "learning_rate": 0.001, + "loss": 2.1988, + "step": 4873 + }, + { + "epoch": 0.20619341737879684, + "grad_norm": 0.22136381268501282, + "learning_rate": 0.001, + "loss": 2.243, + "step": 4874 + }, + { + "epoch": 0.20623572214231323, + "grad_norm": 0.24336136877536774, + "learning_rate": 0.001, + "loss": 3.3275, + "step": 4875 + }, + { + "epoch": 0.2062780269058296, + "grad_norm": 0.24462832510471344, + "learning_rate": 0.001, + "loss": 2.9589, + "step": 4876 + }, + { + "epoch": 0.20632033166934596, + "grad_norm": 7.4848175048828125, + "learning_rate": 0.001, + "loss": 1.9259, + "step": 4877 + }, + { + "epoch": 0.20636263643286235, + "grad_norm": 0.25238528847694397, + "learning_rate": 0.001, + "loss": 1.9616, + "step": 4878 + }, + { + "epoch": 0.2064049411963787, + "grad_norm": 1.0370349884033203, + "learning_rate": 0.001, + "loss": 2.0602, + "step": 4879 + }, + { + "epoch": 0.20644724595989508, + "grad_norm": 0.2667924165725708, + "learning_rate": 0.001, + "loss": 2.1587, + "step": 4880 + }, + { + "epoch": 0.20648955072341146, + "grad_norm": 5.384062767028809, + "learning_rate": 0.001, + "loss": 2.1591, + "step": 4881 + }, + { + "epoch": 0.20653185548692782, + "grad_norm": 0.653923511505127, + "learning_rate": 0.001, + "loss": 2.1794, + "step": 4882 + }, + { + "epoch": 0.2065741602504442, + "grad_norm": 6.2755937576293945, + "learning_rate": 0.001, + "loss": 2.7158, + "step": 4883 + }, + { + "epoch": 0.20661646501396058, + "grad_norm": 0.5690131187438965, + "learning_rate": 0.001, + "loss": 2.2172, + "step": 4884 + }, + { + "epoch": 0.20665876977747694, + "grad_norm": 0.6004669070243835, + "learning_rate": 0.001, + "loss": 2.4962, + "step": 4885 + }, + { + "epoch": 0.20670107454099332, + "grad_norm": 0.5099846124649048, + "learning_rate": 0.001, + "loss": 2.7323, + "step": 4886 + }, + { + "epoch": 0.2067433793045097, + "grad_norm": 0.8244269490242004, + "learning_rate": 0.001, + "loss": 2.5037, + "step": 4887 + }, + { + "epoch": 0.20678568406802605, + "grad_norm": 0.7311514616012573, + "learning_rate": 0.001, + "loss": 2.8543, + "step": 4888 + }, + { + "epoch": 0.20682798883154244, + "grad_norm": 1.6396245956420898, + "learning_rate": 0.001, + "loss": 3.5585, + "step": 4889 + }, + { + "epoch": 0.2068702935950588, + "grad_norm": 0.7014549374580383, + "learning_rate": 0.001, + "loss": 2.8487, + "step": 4890 + }, + { + "epoch": 0.20691259835857517, + "grad_norm": 0.7353118062019348, + "learning_rate": 0.001, + "loss": 2.7439, + "step": 4891 + }, + { + "epoch": 0.20695490312209155, + "grad_norm": 0.3765983283519745, + "learning_rate": 0.001, + "loss": 4.2012, + "step": 4892 + }, + { + "epoch": 0.2069972078856079, + "grad_norm": 1.587646245956421, + "learning_rate": 0.001, + "loss": 2.1776, + "step": 4893 + }, + { + "epoch": 0.2070395126491243, + "grad_norm": 0.45445260405540466, + "learning_rate": 0.001, + "loss": 2.7538, + "step": 4894 + }, + { + "epoch": 0.20708181741264067, + "grad_norm": 0.5824608206748962, + "learning_rate": 0.001, + "loss": 2.4091, + "step": 4895 + }, + { + "epoch": 0.20712412217615703, + "grad_norm": 0.29046350717544556, + "learning_rate": 0.001, + "loss": 2.3128, + "step": 4896 + }, + { + "epoch": 0.2071664269396734, + "grad_norm": 0.2888144552707672, + "learning_rate": 0.001, + "loss": 2.5121, + "step": 4897 + }, + { + "epoch": 0.2072087317031898, + "grad_norm": 0.4128365218639374, + "learning_rate": 0.001, + "loss": 2.8964, + "step": 4898 + }, + { + "epoch": 0.20725103646670615, + "grad_norm": 2.1219639778137207, + "learning_rate": 0.001, + "loss": 2.9168, + "step": 4899 + }, + { + "epoch": 0.20729334123022253, + "grad_norm": 0.910344660282135, + "learning_rate": 0.001, + "loss": 4.5335, + "step": 4900 + }, + { + "epoch": 0.20733564599373888, + "grad_norm": 0.2882691025733948, + "learning_rate": 0.001, + "loss": 2.8958, + "step": 4901 + }, + { + "epoch": 0.20737795075725526, + "grad_norm": 0.2687327563762665, + "learning_rate": 0.001, + "loss": 2.1742, + "step": 4902 + }, + { + "epoch": 0.20742025552077165, + "grad_norm": 0.8853581547737122, + "learning_rate": 0.001, + "loss": 1.9429, + "step": 4903 + }, + { + "epoch": 0.207462560284288, + "grad_norm": 1.759624719619751, + "learning_rate": 0.001, + "loss": 2.8563, + "step": 4904 + }, + { + "epoch": 0.20750486504780438, + "grad_norm": 3.137960910797119, + "learning_rate": 0.001, + "loss": 3.4388, + "step": 4905 + }, + { + "epoch": 0.20754716981132076, + "grad_norm": 3.9103238582611084, + "learning_rate": 0.001, + "loss": 2.4237, + "step": 4906 + }, + { + "epoch": 0.20758947457483712, + "grad_norm": 0.40916767716407776, + "learning_rate": 0.001, + "loss": 2.2436, + "step": 4907 + }, + { + "epoch": 0.2076317793383535, + "grad_norm": 0.30772218108177185, + "learning_rate": 0.001, + "loss": 2.3738, + "step": 4908 + }, + { + "epoch": 0.20767408410186988, + "grad_norm": 1.7794135808944702, + "learning_rate": 0.001, + "loss": 3.5238, + "step": 4909 + }, + { + "epoch": 0.20771638886538624, + "grad_norm": 0.23665037751197815, + "learning_rate": 0.001, + "loss": 1.9228, + "step": 4910 + }, + { + "epoch": 0.20775869362890262, + "grad_norm": 0.2872195541858673, + "learning_rate": 0.001, + "loss": 2.6802, + "step": 4911 + }, + { + "epoch": 0.20780099839241897, + "grad_norm": 6.244613170623779, + "learning_rate": 0.001, + "loss": 2.8313, + "step": 4912 + }, + { + "epoch": 0.20784330315593535, + "grad_norm": 0.2514772117137909, + "learning_rate": 0.001, + "loss": 2.0803, + "step": 4913 + }, + { + "epoch": 0.20788560791945174, + "grad_norm": 11.249589920043945, + "learning_rate": 0.001, + "loss": 2.0849, + "step": 4914 + }, + { + "epoch": 0.2079279126829681, + "grad_norm": 0.3307616114616394, + "learning_rate": 0.001, + "loss": 2.9033, + "step": 4915 + }, + { + "epoch": 0.20797021744648447, + "grad_norm": 0.3316502869129181, + "learning_rate": 0.001, + "loss": 2.3176, + "step": 4916 + }, + { + "epoch": 0.20801252221000086, + "grad_norm": 0.4694264829158783, + "learning_rate": 0.001, + "loss": 3.0293, + "step": 4917 + }, + { + "epoch": 0.2080548269735172, + "grad_norm": 0.2822750210762024, + "learning_rate": 0.001, + "loss": 2.4169, + "step": 4918 + }, + { + "epoch": 0.2080971317370336, + "grad_norm": 0.604796826839447, + "learning_rate": 0.001, + "loss": 2.7267, + "step": 4919 + }, + { + "epoch": 0.20813943650054997, + "grad_norm": 0.23968219757080078, + "learning_rate": 0.001, + "loss": 2.7295, + "step": 4920 + }, + { + "epoch": 0.20818174126406633, + "grad_norm": 0.2379380464553833, + "learning_rate": 0.001, + "loss": 2.2191, + "step": 4921 + }, + { + "epoch": 0.2082240460275827, + "grad_norm": 0.2360192835330963, + "learning_rate": 0.001, + "loss": 1.8814, + "step": 4922 + }, + { + "epoch": 0.20826635079109906, + "grad_norm": 0.40133988857269287, + "learning_rate": 0.001, + "loss": 1.9194, + "step": 4923 + }, + { + "epoch": 0.20830865555461545, + "grad_norm": 0.314656525850296, + "learning_rate": 0.001, + "loss": 2.3935, + "step": 4924 + }, + { + "epoch": 0.20835096031813183, + "grad_norm": 1.1727277040481567, + "learning_rate": 0.001, + "loss": 2.8302, + "step": 4925 + }, + { + "epoch": 0.20839326508164818, + "grad_norm": 0.22926130890846252, + "learning_rate": 0.001, + "loss": 2.7978, + "step": 4926 + }, + { + "epoch": 0.20843556984516456, + "grad_norm": 0.4540160596370697, + "learning_rate": 0.001, + "loss": 1.8122, + "step": 4927 + }, + { + "epoch": 0.20847787460868095, + "grad_norm": 23.994537353515625, + "learning_rate": 0.001, + "loss": 2.6313, + "step": 4928 + }, + { + "epoch": 0.2085201793721973, + "grad_norm": 0.6194539070129395, + "learning_rate": 0.001, + "loss": 2.9595, + "step": 4929 + }, + { + "epoch": 0.20856248413571368, + "grad_norm": 13.033817291259766, + "learning_rate": 0.001, + "loss": 2.1685, + "step": 4930 + }, + { + "epoch": 0.20860478889923006, + "grad_norm": 0.323756605386734, + "learning_rate": 0.001, + "loss": 2.9983, + "step": 4931 + }, + { + "epoch": 0.20864709366274642, + "grad_norm": 4.008752346038818, + "learning_rate": 0.001, + "loss": 3.0625, + "step": 4932 + }, + { + "epoch": 0.2086893984262628, + "grad_norm": 0.28806641697883606, + "learning_rate": 0.001, + "loss": 2.3397, + "step": 4933 + }, + { + "epoch": 0.20873170318977916, + "grad_norm": 0.49707770347595215, + "learning_rate": 0.001, + "loss": 2.1648, + "step": 4934 + }, + { + "epoch": 0.20877400795329554, + "grad_norm": 0.29439014196395874, + "learning_rate": 0.001, + "loss": 2.0093, + "step": 4935 + }, + { + "epoch": 0.20881631271681192, + "grad_norm": 0.7795244455337524, + "learning_rate": 0.001, + "loss": 2.36, + "step": 4936 + }, + { + "epoch": 0.20885861748032827, + "grad_norm": 0.2714845836162567, + "learning_rate": 0.001, + "loss": 1.7267, + "step": 4937 + }, + { + "epoch": 0.20890092224384466, + "grad_norm": 0.4351987838745117, + "learning_rate": 0.001, + "loss": 2.605, + "step": 4938 + }, + { + "epoch": 0.20894322700736104, + "grad_norm": 0.39439719915390015, + "learning_rate": 0.001, + "loss": 2.6728, + "step": 4939 + }, + { + "epoch": 0.2089855317708774, + "grad_norm": 0.8820605874061584, + "learning_rate": 0.001, + "loss": 3.4102, + "step": 4940 + }, + { + "epoch": 0.20902783653439377, + "grad_norm": 0.3229409158229828, + "learning_rate": 0.001, + "loss": 2.9257, + "step": 4941 + }, + { + "epoch": 0.20907014129791016, + "grad_norm": 0.3107205629348755, + "learning_rate": 0.001, + "loss": 2.3553, + "step": 4942 + }, + { + "epoch": 0.2091124460614265, + "grad_norm": 0.36764320731163025, + "learning_rate": 0.001, + "loss": 3.036, + "step": 4943 + }, + { + "epoch": 0.2091547508249429, + "grad_norm": 0.2639576494693756, + "learning_rate": 0.001, + "loss": 2.8147, + "step": 4944 + }, + { + "epoch": 0.20919705558845927, + "grad_norm": 0.24890625476837158, + "learning_rate": 0.001, + "loss": 1.888, + "step": 4945 + }, + { + "epoch": 0.20923936035197563, + "grad_norm": 0.28734108805656433, + "learning_rate": 0.001, + "loss": 2.3685, + "step": 4946 + }, + { + "epoch": 0.209281665115492, + "grad_norm": 1.45154869556427, + "learning_rate": 0.001, + "loss": 2.8914, + "step": 4947 + }, + { + "epoch": 0.20932396987900836, + "grad_norm": 0.24972866475582123, + "learning_rate": 0.001, + "loss": 2.1973, + "step": 4948 + }, + { + "epoch": 0.20936627464252475, + "grad_norm": 0.7616045475006104, + "learning_rate": 0.001, + "loss": 2.4037, + "step": 4949 + }, + { + "epoch": 0.20940857940604113, + "grad_norm": 0.3818029463291168, + "learning_rate": 0.001, + "loss": 2.3215, + "step": 4950 + }, + { + "epoch": 0.20945088416955748, + "grad_norm": 32.00923156738281, + "learning_rate": 0.001, + "loss": 2.4591, + "step": 4951 + }, + { + "epoch": 0.20949318893307387, + "grad_norm": 0.31729358434677124, + "learning_rate": 0.001, + "loss": 2.4413, + "step": 4952 + }, + { + "epoch": 0.20953549369659025, + "grad_norm": 2.464691638946533, + "learning_rate": 0.001, + "loss": 2.4398, + "step": 4953 + }, + { + "epoch": 0.2095777984601066, + "grad_norm": 35.59858322143555, + "learning_rate": 0.001, + "loss": 2.887, + "step": 4954 + }, + { + "epoch": 0.20962010322362298, + "grad_norm": 0.5536134243011475, + "learning_rate": 0.001, + "loss": 2.9387, + "step": 4955 + }, + { + "epoch": 0.20966240798713937, + "grad_norm": 1.5458406209945679, + "learning_rate": 0.001, + "loss": 2.1783, + "step": 4956 + }, + { + "epoch": 0.20970471275065572, + "grad_norm": 0.8525711894035339, + "learning_rate": 0.001, + "loss": 3.0125, + "step": 4957 + }, + { + "epoch": 0.2097470175141721, + "grad_norm": 3.157151460647583, + "learning_rate": 0.001, + "loss": 2.9392, + "step": 4958 + }, + { + "epoch": 0.20978932227768846, + "grad_norm": 0.42130786180496216, + "learning_rate": 0.001, + "loss": 2.071, + "step": 4959 + }, + { + "epoch": 0.20983162704120484, + "grad_norm": 2.541234016418457, + "learning_rate": 0.001, + "loss": 4.1087, + "step": 4960 + }, + { + "epoch": 0.20987393180472122, + "grad_norm": 0.24158483743667603, + "learning_rate": 0.001, + "loss": 1.9446, + "step": 4961 + }, + { + "epoch": 0.20991623656823757, + "grad_norm": 0.45275890827178955, + "learning_rate": 0.001, + "loss": 2.1538, + "step": 4962 + }, + { + "epoch": 0.20995854133175396, + "grad_norm": 1.040623664855957, + "learning_rate": 0.001, + "loss": 2.0899, + "step": 4963 + }, + { + "epoch": 0.21000084609527034, + "grad_norm": 0.2312449812889099, + "learning_rate": 0.001, + "loss": 1.9906, + "step": 4964 + }, + { + "epoch": 0.2100431508587867, + "grad_norm": 0.33807173371315, + "learning_rate": 0.001, + "loss": 2.1848, + "step": 4965 + }, + { + "epoch": 0.21008545562230307, + "grad_norm": 0.256266325712204, + "learning_rate": 0.001, + "loss": 2.1486, + "step": 4966 + }, + { + "epoch": 0.21012776038581946, + "grad_norm": 0.4030238687992096, + "learning_rate": 0.001, + "loss": 1.9343, + "step": 4967 + }, + { + "epoch": 0.2101700651493358, + "grad_norm": 0.33248621225357056, + "learning_rate": 0.001, + "loss": 2.2524, + "step": 4968 + }, + { + "epoch": 0.2102123699128522, + "grad_norm": 0.3359518349170685, + "learning_rate": 0.001, + "loss": 2.6882, + "step": 4969 + }, + { + "epoch": 0.21025467467636855, + "grad_norm": 0.3091052174568176, + "learning_rate": 0.001, + "loss": 3.1879, + "step": 4970 + }, + { + "epoch": 0.21029697943988493, + "grad_norm": 0.21481722593307495, + "learning_rate": 0.001, + "loss": 1.8638, + "step": 4971 + }, + { + "epoch": 0.2103392842034013, + "grad_norm": 0.21660932898521423, + "learning_rate": 0.001, + "loss": 2.0017, + "step": 4972 + }, + { + "epoch": 0.21038158896691767, + "grad_norm": 0.4549258351325989, + "learning_rate": 0.001, + "loss": 3.1636, + "step": 4973 + }, + { + "epoch": 0.21042389373043405, + "grad_norm": 1.2132890224456787, + "learning_rate": 0.001, + "loss": 3.037, + "step": 4974 + }, + { + "epoch": 0.21046619849395043, + "grad_norm": 0.24998739361763, + "learning_rate": 0.001, + "loss": 2.3893, + "step": 4975 + }, + { + "epoch": 0.21050850325746678, + "grad_norm": 0.2769694924354553, + "learning_rate": 0.001, + "loss": 2.0534, + "step": 4976 + }, + { + "epoch": 0.21055080802098317, + "grad_norm": 8.817028999328613, + "learning_rate": 0.001, + "loss": 2.5478, + "step": 4977 + }, + { + "epoch": 0.21059311278449955, + "grad_norm": 0.840341329574585, + "learning_rate": 0.001, + "loss": 2.5962, + "step": 4978 + }, + { + "epoch": 0.2106354175480159, + "grad_norm": 0.33536335825920105, + "learning_rate": 0.001, + "loss": 1.973, + "step": 4979 + }, + { + "epoch": 0.21067772231153228, + "grad_norm": 0.34550049901008606, + "learning_rate": 0.001, + "loss": 2.3634, + "step": 4980 + }, + { + "epoch": 0.21072002707504864, + "grad_norm": 0.4002279043197632, + "learning_rate": 0.001, + "loss": 2.7841, + "step": 4981 + }, + { + "epoch": 0.21076233183856502, + "grad_norm": 0.3583948314189911, + "learning_rate": 0.001, + "loss": 3.248, + "step": 4982 + }, + { + "epoch": 0.2108046366020814, + "grad_norm": 0.2348349541425705, + "learning_rate": 0.001, + "loss": 2.4177, + "step": 4983 + }, + { + "epoch": 0.21084694136559776, + "grad_norm": 0.2678260803222656, + "learning_rate": 0.001, + "loss": 2.775, + "step": 4984 + }, + { + "epoch": 0.21088924612911414, + "grad_norm": 0.24458220601081848, + "learning_rate": 0.001, + "loss": 2.3764, + "step": 4985 + }, + { + "epoch": 0.21093155089263052, + "grad_norm": 0.38108593225479126, + "learning_rate": 0.001, + "loss": 2.247, + "step": 4986 + }, + { + "epoch": 0.21097385565614687, + "grad_norm": 0.21886327862739563, + "learning_rate": 0.001, + "loss": 2.7944, + "step": 4987 + }, + { + "epoch": 0.21101616041966326, + "grad_norm": 7.095550537109375, + "learning_rate": 0.001, + "loss": 1.5504, + "step": 4988 + }, + { + "epoch": 0.21105846518317964, + "grad_norm": 0.2182372361421585, + "learning_rate": 0.001, + "loss": 2.7729, + "step": 4989 + }, + { + "epoch": 0.211100769946696, + "grad_norm": 0.24648889899253845, + "learning_rate": 0.001, + "loss": 2.2145, + "step": 4990 + }, + { + "epoch": 0.21114307471021238, + "grad_norm": 0.22735580801963806, + "learning_rate": 0.001, + "loss": 1.9403, + "step": 4991 + }, + { + "epoch": 0.21118537947372873, + "grad_norm": 2.2606966495513916, + "learning_rate": 0.001, + "loss": 2.5648, + "step": 4992 + }, + { + "epoch": 0.2112276842372451, + "grad_norm": 1.0740227699279785, + "learning_rate": 0.001, + "loss": 3.2116, + "step": 4993 + }, + { + "epoch": 0.2112699890007615, + "grad_norm": 1.1305190324783325, + "learning_rate": 0.001, + "loss": 3.5133, + "step": 4994 + }, + { + "epoch": 0.21131229376427785, + "grad_norm": 0.8290959000587463, + "learning_rate": 0.001, + "loss": 2.4756, + "step": 4995 + }, + { + "epoch": 0.21135459852779423, + "grad_norm": 0.30517685413360596, + "learning_rate": 0.001, + "loss": 2.2226, + "step": 4996 + }, + { + "epoch": 0.2113969032913106, + "grad_norm": 0.25863179564476013, + "learning_rate": 0.001, + "loss": 2.1738, + "step": 4997 + }, + { + "epoch": 0.21143920805482697, + "grad_norm": 0.33867284655570984, + "learning_rate": 0.001, + "loss": 2.6182, + "step": 4998 + }, + { + "epoch": 0.21148151281834335, + "grad_norm": 0.3169691264629364, + "learning_rate": 0.001, + "loss": 2.8752, + "step": 4999 + }, + { + "epoch": 0.21152381758185973, + "grad_norm": 0.2934655249118805, + "learning_rate": 0.001, + "loss": 3.4324, + "step": 5000 + }, + { + "epoch": 0.21156612234537608, + "grad_norm": 1.9539580345153809, + "learning_rate": 0.001, + "loss": 2.0974, + "step": 5001 + }, + { + "epoch": 0.21160842710889247, + "grad_norm": 0.28352606296539307, + "learning_rate": 0.001, + "loss": 2.3892, + "step": 5002 + }, + { + "epoch": 0.21165073187240882, + "grad_norm": 0.36468642950057983, + "learning_rate": 0.001, + "loss": 2.67, + "step": 5003 + }, + { + "epoch": 0.2116930366359252, + "grad_norm": 0.28954243659973145, + "learning_rate": 0.001, + "loss": 2.4779, + "step": 5004 + }, + { + "epoch": 0.21173534139944158, + "grad_norm": 0.2951817810535431, + "learning_rate": 0.001, + "loss": 2.6193, + "step": 5005 + }, + { + "epoch": 0.21177764616295794, + "grad_norm": 0.24036556482315063, + "learning_rate": 0.001, + "loss": 2.362, + "step": 5006 + }, + { + "epoch": 0.21181995092647432, + "grad_norm": 0.23176927864551544, + "learning_rate": 0.001, + "loss": 2.4304, + "step": 5007 + }, + { + "epoch": 0.2118622556899907, + "grad_norm": 0.2653713822364807, + "learning_rate": 0.001, + "loss": 2.4899, + "step": 5008 + }, + { + "epoch": 0.21190456045350706, + "grad_norm": 0.22608830034732819, + "learning_rate": 0.001, + "loss": 1.915, + "step": 5009 + }, + { + "epoch": 0.21194686521702344, + "grad_norm": 0.26712745428085327, + "learning_rate": 0.001, + "loss": 2.8369, + "step": 5010 + }, + { + "epoch": 0.21198916998053982, + "grad_norm": 0.23458565771579742, + "learning_rate": 0.001, + "loss": 2.713, + "step": 5011 + }, + { + "epoch": 0.21203147474405618, + "grad_norm": 0.3184855282306671, + "learning_rate": 0.001, + "loss": 3.2585, + "step": 5012 + }, + { + "epoch": 0.21207377950757256, + "grad_norm": 0.29982033371925354, + "learning_rate": 0.001, + "loss": 2.6947, + "step": 5013 + }, + { + "epoch": 0.2121160842710889, + "grad_norm": 0.22614946961402893, + "learning_rate": 0.001, + "loss": 1.8255, + "step": 5014 + }, + { + "epoch": 0.2121583890346053, + "grad_norm": 0.22984737157821655, + "learning_rate": 0.001, + "loss": 1.9238, + "step": 5015 + }, + { + "epoch": 0.21220069379812168, + "grad_norm": 0.2999686598777771, + "learning_rate": 0.001, + "loss": 2.102, + "step": 5016 + }, + { + "epoch": 0.21224299856163803, + "grad_norm": 0.21866334974765778, + "learning_rate": 0.001, + "loss": 2.5355, + "step": 5017 + }, + { + "epoch": 0.2122853033251544, + "grad_norm": 0.30054131150245667, + "learning_rate": 0.001, + "loss": 1.7776, + "step": 5018 + }, + { + "epoch": 0.2123276080886708, + "grad_norm": 0.28100132942199707, + "learning_rate": 0.001, + "loss": 3.4669, + "step": 5019 + }, + { + "epoch": 0.21236991285218715, + "grad_norm": 0.2859184443950653, + "learning_rate": 0.001, + "loss": 1.9491, + "step": 5020 + }, + { + "epoch": 0.21241221761570353, + "grad_norm": 1.7632389068603516, + "learning_rate": 0.001, + "loss": 2.4757, + "step": 5021 + }, + { + "epoch": 0.2124545223792199, + "grad_norm": 0.4322301149368286, + "learning_rate": 0.001, + "loss": 1.9597, + "step": 5022 + }, + { + "epoch": 0.21249682714273627, + "grad_norm": 0.27014920115470886, + "learning_rate": 0.001, + "loss": 2.5075, + "step": 5023 + }, + { + "epoch": 0.21253913190625265, + "grad_norm": 0.21609550714492798, + "learning_rate": 0.001, + "loss": 2.2271, + "step": 5024 + }, + { + "epoch": 0.212581436669769, + "grad_norm": 0.2416316717863083, + "learning_rate": 0.001, + "loss": 2.0308, + "step": 5025 + }, + { + "epoch": 0.21262374143328538, + "grad_norm": 0.6305524110794067, + "learning_rate": 0.001, + "loss": 2.16, + "step": 5026 + }, + { + "epoch": 0.21266604619680177, + "grad_norm": 0.21432656049728394, + "learning_rate": 0.001, + "loss": 2.6648, + "step": 5027 + }, + { + "epoch": 0.21270835096031812, + "grad_norm": 0.854989767074585, + "learning_rate": 0.001, + "loss": 2.3023, + "step": 5028 + }, + { + "epoch": 0.2127506557238345, + "grad_norm": 0.2650429308414459, + "learning_rate": 0.001, + "loss": 2.1294, + "step": 5029 + }, + { + "epoch": 0.21279296048735089, + "grad_norm": 0.1896660178899765, + "learning_rate": 0.001, + "loss": 1.7712, + "step": 5030 + }, + { + "epoch": 0.21283526525086724, + "grad_norm": 0.4741089940071106, + "learning_rate": 0.001, + "loss": 2.5271, + "step": 5031 + }, + { + "epoch": 0.21287757001438362, + "grad_norm": 0.22527028620243073, + "learning_rate": 0.001, + "loss": 2.7449, + "step": 5032 + }, + { + "epoch": 0.2129198747779, + "grad_norm": 0.2307746261358261, + "learning_rate": 0.001, + "loss": 2.8616, + "step": 5033 + }, + { + "epoch": 0.21296217954141636, + "grad_norm": 0.4615713655948639, + "learning_rate": 0.001, + "loss": 1.8831, + "step": 5034 + }, + { + "epoch": 0.21300448430493274, + "grad_norm": 23.890384674072266, + "learning_rate": 0.001, + "loss": 1.4897, + "step": 5035 + }, + { + "epoch": 0.2130467890684491, + "grad_norm": 0.497530072927475, + "learning_rate": 0.001, + "loss": 2.1057, + "step": 5036 + }, + { + "epoch": 0.21308909383196548, + "grad_norm": 4.099823474884033, + "learning_rate": 0.001, + "loss": 2.2474, + "step": 5037 + }, + { + "epoch": 0.21313139859548186, + "grad_norm": 0.23300927877426147, + "learning_rate": 0.001, + "loss": 2.6447, + "step": 5038 + }, + { + "epoch": 0.2131737033589982, + "grad_norm": 1.2628371715545654, + "learning_rate": 0.001, + "loss": 2.3729, + "step": 5039 + }, + { + "epoch": 0.2132160081225146, + "grad_norm": 3.1556689739227295, + "learning_rate": 0.001, + "loss": 2.0734, + "step": 5040 + }, + { + "epoch": 0.21325831288603098, + "grad_norm": 0.23001788556575775, + "learning_rate": 0.001, + "loss": 2.0797, + "step": 5041 + }, + { + "epoch": 0.21330061764954733, + "grad_norm": 0.3631579577922821, + "learning_rate": 0.001, + "loss": 2.1848, + "step": 5042 + }, + { + "epoch": 0.2133429224130637, + "grad_norm": 0.2023169994354248, + "learning_rate": 0.001, + "loss": 2.0585, + "step": 5043 + }, + { + "epoch": 0.2133852271765801, + "grad_norm": 0.25876331329345703, + "learning_rate": 0.001, + "loss": 2.2102, + "step": 5044 + }, + { + "epoch": 0.21342753194009645, + "grad_norm": 0.22901210188865662, + "learning_rate": 0.001, + "loss": 2.1429, + "step": 5045 + }, + { + "epoch": 0.21346983670361283, + "grad_norm": 0.472971647977829, + "learning_rate": 0.001, + "loss": 2.8811, + "step": 5046 + }, + { + "epoch": 0.21351214146712919, + "grad_norm": 0.47565358877182007, + "learning_rate": 0.001, + "loss": 2.3727, + "step": 5047 + }, + { + "epoch": 0.21355444623064557, + "grad_norm": 3.025669813156128, + "learning_rate": 0.001, + "loss": 1.6324, + "step": 5048 + }, + { + "epoch": 0.21359675099416195, + "grad_norm": 0.7364016175270081, + "learning_rate": 0.001, + "loss": 2.7439, + "step": 5049 + }, + { + "epoch": 0.2136390557576783, + "grad_norm": 0.25837811827659607, + "learning_rate": 0.001, + "loss": 2.591, + "step": 5050 + }, + { + "epoch": 0.21368136052119469, + "grad_norm": 0.556201159954071, + "learning_rate": 0.001, + "loss": 2.0559, + "step": 5051 + }, + { + "epoch": 0.21372366528471107, + "grad_norm": 0.23484918475151062, + "learning_rate": 0.001, + "loss": 2.0903, + "step": 5052 + }, + { + "epoch": 0.21376597004822742, + "grad_norm": 0.20582491159439087, + "learning_rate": 0.001, + "loss": 2.5129, + "step": 5053 + }, + { + "epoch": 0.2138082748117438, + "grad_norm": 0.4761744439601898, + "learning_rate": 0.001, + "loss": 2.3142, + "step": 5054 + }, + { + "epoch": 0.21385057957526019, + "grad_norm": 0.5703874230384827, + "learning_rate": 0.001, + "loss": 2.5662, + "step": 5055 + }, + { + "epoch": 0.21389288433877654, + "grad_norm": 0.19592377543449402, + "learning_rate": 0.001, + "loss": 2.497, + "step": 5056 + }, + { + "epoch": 0.21393518910229292, + "grad_norm": 0.21815036237239838, + "learning_rate": 0.001, + "loss": 2.4716, + "step": 5057 + }, + { + "epoch": 0.2139774938658093, + "grad_norm": 0.976707398891449, + "learning_rate": 0.001, + "loss": 2.0707, + "step": 5058 + }, + { + "epoch": 0.21401979862932566, + "grad_norm": 1.2479373216629028, + "learning_rate": 0.001, + "loss": 1.7482, + "step": 5059 + }, + { + "epoch": 0.21406210339284204, + "grad_norm": 0.31325292587280273, + "learning_rate": 0.001, + "loss": 2.5769, + "step": 5060 + }, + { + "epoch": 0.2141044081563584, + "grad_norm": 3.2416415214538574, + "learning_rate": 0.001, + "loss": 2.0277, + "step": 5061 + }, + { + "epoch": 0.21414671291987478, + "grad_norm": 0.21428047120571136, + "learning_rate": 0.001, + "loss": 1.7499, + "step": 5062 + }, + { + "epoch": 0.21418901768339116, + "grad_norm": 0.7413913607597351, + "learning_rate": 0.001, + "loss": 3.0148, + "step": 5063 + }, + { + "epoch": 0.2142313224469075, + "grad_norm": 3.3833765983581543, + "learning_rate": 0.001, + "loss": 2.9668, + "step": 5064 + }, + { + "epoch": 0.2142736272104239, + "grad_norm": 0.934876024723053, + "learning_rate": 0.001, + "loss": 2.7919, + "step": 5065 + }, + { + "epoch": 0.21431593197394028, + "grad_norm": 0.4856380224227905, + "learning_rate": 0.001, + "loss": 2.3463, + "step": 5066 + }, + { + "epoch": 0.21435823673745663, + "grad_norm": 0.5465160012245178, + "learning_rate": 0.001, + "loss": 2.8415, + "step": 5067 + }, + { + "epoch": 0.214400541500973, + "grad_norm": 0.22289830446243286, + "learning_rate": 0.001, + "loss": 2.0494, + "step": 5068 + }, + { + "epoch": 0.2144428462644894, + "grad_norm": 2.262659788131714, + "learning_rate": 0.001, + "loss": 2.1616, + "step": 5069 + }, + { + "epoch": 0.21448515102800575, + "grad_norm": 0.2547045052051544, + "learning_rate": 0.001, + "loss": 2.714, + "step": 5070 + }, + { + "epoch": 0.21452745579152213, + "grad_norm": 0.2590230703353882, + "learning_rate": 0.001, + "loss": 2.928, + "step": 5071 + }, + { + "epoch": 0.21456976055503849, + "grad_norm": 0.42084890604019165, + "learning_rate": 0.001, + "loss": 3.0684, + "step": 5072 + }, + { + "epoch": 0.21461206531855487, + "grad_norm": 0.7488611340522766, + "learning_rate": 0.001, + "loss": 2.8074, + "step": 5073 + }, + { + "epoch": 0.21465437008207125, + "grad_norm": 0.32366877794265747, + "learning_rate": 0.001, + "loss": 3.2808, + "step": 5074 + }, + { + "epoch": 0.2146966748455876, + "grad_norm": 0.20374761521816254, + "learning_rate": 0.001, + "loss": 2.5365, + "step": 5075 + }, + { + "epoch": 0.21473897960910399, + "grad_norm": 0.23549343645572662, + "learning_rate": 0.001, + "loss": 3.1777, + "step": 5076 + }, + { + "epoch": 0.21478128437262037, + "grad_norm": 17.203941345214844, + "learning_rate": 0.001, + "loss": 2.5717, + "step": 5077 + }, + { + "epoch": 0.21482358913613672, + "grad_norm": 0.25522732734680176, + "learning_rate": 0.001, + "loss": 2.9367, + "step": 5078 + }, + { + "epoch": 0.2148658938996531, + "grad_norm": 0.3394809365272522, + "learning_rate": 0.001, + "loss": 2.3561, + "step": 5079 + }, + { + "epoch": 0.2149081986631695, + "grad_norm": 0.21299435198307037, + "learning_rate": 0.001, + "loss": 2.7499, + "step": 5080 + }, + { + "epoch": 0.21495050342668584, + "grad_norm": 0.1964573711156845, + "learning_rate": 0.001, + "loss": 2.3468, + "step": 5081 + }, + { + "epoch": 0.21499280819020222, + "grad_norm": 0.3184814155101776, + "learning_rate": 0.001, + "loss": 2.4803, + "step": 5082 + }, + { + "epoch": 0.21503511295371858, + "grad_norm": 0.34336036443710327, + "learning_rate": 0.001, + "loss": 3.3502, + "step": 5083 + }, + { + "epoch": 0.21507741771723496, + "grad_norm": 0.25828707218170166, + "learning_rate": 0.001, + "loss": 3.5646, + "step": 5084 + }, + { + "epoch": 0.21511972248075134, + "grad_norm": 0.4242289066314697, + "learning_rate": 0.001, + "loss": 2.8577, + "step": 5085 + }, + { + "epoch": 0.2151620272442677, + "grad_norm": 0.18986240029335022, + "learning_rate": 0.001, + "loss": 1.9367, + "step": 5086 + }, + { + "epoch": 0.21520433200778408, + "grad_norm": 0.19585952162742615, + "learning_rate": 0.001, + "loss": 2.1596, + "step": 5087 + }, + { + "epoch": 0.21524663677130046, + "grad_norm": 0.3815001845359802, + "learning_rate": 0.001, + "loss": 2.9462, + "step": 5088 + }, + { + "epoch": 0.2152889415348168, + "grad_norm": 0.26725849509239197, + "learning_rate": 0.001, + "loss": 3.1378, + "step": 5089 + }, + { + "epoch": 0.2153312462983332, + "grad_norm": 0.4309972822666168, + "learning_rate": 0.001, + "loss": 3.3749, + "step": 5090 + }, + { + "epoch": 0.21537355106184958, + "grad_norm": 0.19142082333564758, + "learning_rate": 0.001, + "loss": 2.2686, + "step": 5091 + }, + { + "epoch": 0.21541585582536593, + "grad_norm": 0.22003363072872162, + "learning_rate": 0.001, + "loss": 1.996, + "step": 5092 + }, + { + "epoch": 0.2154581605888823, + "grad_norm": 0.30616286396980286, + "learning_rate": 0.001, + "loss": 3.3171, + "step": 5093 + }, + { + "epoch": 0.21550046535239867, + "grad_norm": 8.013957977294922, + "learning_rate": 0.001, + "loss": 2.2265, + "step": 5094 + }, + { + "epoch": 0.21554277011591505, + "grad_norm": 0.3616850972175598, + "learning_rate": 0.001, + "loss": 3.1465, + "step": 5095 + }, + { + "epoch": 0.21558507487943143, + "grad_norm": 1.7922744750976562, + "learning_rate": 0.001, + "loss": 2.7905, + "step": 5096 + }, + { + "epoch": 0.2156273796429478, + "grad_norm": 0.18036864697933197, + "learning_rate": 0.001, + "loss": 1.8923, + "step": 5097 + }, + { + "epoch": 0.21566968440646417, + "grad_norm": 0.5982964038848877, + "learning_rate": 0.001, + "loss": 2.0202, + "step": 5098 + }, + { + "epoch": 0.21571198916998055, + "grad_norm": 0.37752029299736023, + "learning_rate": 0.001, + "loss": 1.8078, + "step": 5099 + }, + { + "epoch": 0.2157542939334969, + "grad_norm": 0.28953149914741516, + "learning_rate": 0.001, + "loss": 2.2722, + "step": 5100 + }, + { + "epoch": 0.2157965986970133, + "grad_norm": 0.275361031293869, + "learning_rate": 0.001, + "loss": 1.8862, + "step": 5101 + }, + { + "epoch": 0.21583890346052967, + "grad_norm": 2.51975679397583, + "learning_rate": 0.001, + "loss": 2.4179, + "step": 5102 + }, + { + "epoch": 0.21588120822404602, + "grad_norm": 3.455864191055298, + "learning_rate": 0.001, + "loss": 2.9148, + "step": 5103 + }, + { + "epoch": 0.2159235129875624, + "grad_norm": 0.20818361639976501, + "learning_rate": 0.001, + "loss": 1.5796, + "step": 5104 + }, + { + "epoch": 0.21596581775107876, + "grad_norm": 0.2759215831756592, + "learning_rate": 0.001, + "loss": 3.4658, + "step": 5105 + }, + { + "epoch": 0.21600812251459514, + "grad_norm": 0.2338385432958603, + "learning_rate": 0.001, + "loss": 2.5726, + "step": 5106 + }, + { + "epoch": 0.21605042727811152, + "grad_norm": 0.2205616980791092, + "learning_rate": 0.001, + "loss": 4.0847, + "step": 5107 + }, + { + "epoch": 0.21609273204162788, + "grad_norm": 0.8315731287002563, + "learning_rate": 0.001, + "loss": 2.2934, + "step": 5108 + }, + { + "epoch": 0.21613503680514426, + "grad_norm": 0.22039955854415894, + "learning_rate": 0.001, + "loss": 2.0346, + "step": 5109 + }, + { + "epoch": 0.21617734156866064, + "grad_norm": 0.89706951379776, + "learning_rate": 0.001, + "loss": 2.0214, + "step": 5110 + }, + { + "epoch": 0.216219646332177, + "grad_norm": 13.432646751403809, + "learning_rate": 0.001, + "loss": 2.9813, + "step": 5111 + }, + { + "epoch": 0.21626195109569338, + "grad_norm": 0.20482416450977325, + "learning_rate": 0.001, + "loss": 2.2288, + "step": 5112 + }, + { + "epoch": 0.21630425585920976, + "grad_norm": 0.4937507212162018, + "learning_rate": 0.001, + "loss": 1.7228, + "step": 5113 + }, + { + "epoch": 0.21634656062272611, + "grad_norm": 0.3154240548610687, + "learning_rate": 0.001, + "loss": 2.3277, + "step": 5114 + }, + { + "epoch": 0.2163888653862425, + "grad_norm": 0.4587913453578949, + "learning_rate": 0.001, + "loss": 2.2894, + "step": 5115 + }, + { + "epoch": 0.21643117014975885, + "grad_norm": 0.39989644289016724, + "learning_rate": 0.001, + "loss": 2.5237, + "step": 5116 + }, + { + "epoch": 0.21647347491327523, + "grad_norm": 0.21547210216522217, + "learning_rate": 0.001, + "loss": 2.0295, + "step": 5117 + }, + { + "epoch": 0.21651577967679161, + "grad_norm": 2.3156797885894775, + "learning_rate": 0.001, + "loss": 2.5201, + "step": 5118 + }, + { + "epoch": 0.21655808444030797, + "grad_norm": 0.19337671995162964, + "learning_rate": 0.001, + "loss": 1.8627, + "step": 5119 + }, + { + "epoch": 0.21660038920382435, + "grad_norm": 0.24067647755146027, + "learning_rate": 0.001, + "loss": 2.2175, + "step": 5120 + }, + { + "epoch": 0.21664269396734073, + "grad_norm": 10.653289794921875, + "learning_rate": 0.001, + "loss": 3.6184, + "step": 5121 + }, + { + "epoch": 0.2166849987308571, + "grad_norm": 0.22319035232067108, + "learning_rate": 0.001, + "loss": 2.0583, + "step": 5122 + }, + { + "epoch": 0.21672730349437347, + "grad_norm": 13.307822227478027, + "learning_rate": 0.001, + "loss": 1.885, + "step": 5123 + }, + { + "epoch": 0.21676960825788985, + "grad_norm": 41.82732009887695, + "learning_rate": 0.001, + "loss": 3.325, + "step": 5124 + }, + { + "epoch": 0.2168119130214062, + "grad_norm": 0.20223648846149445, + "learning_rate": 0.001, + "loss": 1.657, + "step": 5125 + }, + { + "epoch": 0.2168542177849226, + "grad_norm": 0.708741307258606, + "learning_rate": 0.001, + "loss": 2.2723, + "step": 5126 + }, + { + "epoch": 0.21689652254843894, + "grad_norm": 0.2860840857028961, + "learning_rate": 0.001, + "loss": 2.3074, + "step": 5127 + }, + { + "epoch": 0.21693882731195532, + "grad_norm": 0.4499269425868988, + "learning_rate": 0.001, + "loss": 2.5004, + "step": 5128 + }, + { + "epoch": 0.2169811320754717, + "grad_norm": 0.3753020763397217, + "learning_rate": 0.001, + "loss": 2.7281, + "step": 5129 + }, + { + "epoch": 0.21702343683898806, + "grad_norm": 6.676267623901367, + "learning_rate": 0.001, + "loss": 2.5515, + "step": 5130 + }, + { + "epoch": 0.21706574160250444, + "grad_norm": 2.6914873123168945, + "learning_rate": 0.001, + "loss": 2.5111, + "step": 5131 + }, + { + "epoch": 0.21710804636602082, + "grad_norm": 0.24604924023151398, + "learning_rate": 0.001, + "loss": 2.1823, + "step": 5132 + }, + { + "epoch": 0.21715035112953718, + "grad_norm": 0.2534256875514984, + "learning_rate": 0.001, + "loss": 2.6668, + "step": 5133 + }, + { + "epoch": 0.21719265589305356, + "grad_norm": 0.32457131147384644, + "learning_rate": 0.001, + "loss": 3.2796, + "step": 5134 + }, + { + "epoch": 0.21723496065656994, + "grad_norm": 0.3132564425468445, + "learning_rate": 0.001, + "loss": 3.2073, + "step": 5135 + }, + { + "epoch": 0.2172772654200863, + "grad_norm": 0.2281041145324707, + "learning_rate": 0.001, + "loss": 2.6315, + "step": 5136 + }, + { + "epoch": 0.21731957018360268, + "grad_norm": 0.7160950303077698, + "learning_rate": 0.001, + "loss": 2.3004, + "step": 5137 + }, + { + "epoch": 0.21736187494711903, + "grad_norm": 0.2322395145893097, + "learning_rate": 0.001, + "loss": 2.7504, + "step": 5138 + }, + { + "epoch": 0.21740417971063541, + "grad_norm": 0.3613683879375458, + "learning_rate": 0.001, + "loss": 2.5574, + "step": 5139 + }, + { + "epoch": 0.2174464844741518, + "grad_norm": 0.24612249433994293, + "learning_rate": 0.001, + "loss": 2.8064, + "step": 5140 + }, + { + "epoch": 0.21748878923766815, + "grad_norm": 0.2229098081588745, + "learning_rate": 0.001, + "loss": 2.07, + "step": 5141 + }, + { + "epoch": 0.21753109400118453, + "grad_norm": 0.19228215515613556, + "learning_rate": 0.001, + "loss": 1.859, + "step": 5142 + }, + { + "epoch": 0.21757339876470091, + "grad_norm": 0.3609655499458313, + "learning_rate": 0.001, + "loss": 3.1708, + "step": 5143 + }, + { + "epoch": 0.21761570352821727, + "grad_norm": 0.3540642261505127, + "learning_rate": 0.001, + "loss": 2.7362, + "step": 5144 + }, + { + "epoch": 0.21765800829173365, + "grad_norm": 0.37258389592170715, + "learning_rate": 0.001, + "loss": 2.3635, + "step": 5145 + }, + { + "epoch": 0.21770031305525003, + "grad_norm": 0.2843034863471985, + "learning_rate": 0.001, + "loss": 2.8098, + "step": 5146 + }, + { + "epoch": 0.2177426178187664, + "grad_norm": 0.2767336368560791, + "learning_rate": 0.001, + "loss": 2.7574, + "step": 5147 + }, + { + "epoch": 0.21778492258228277, + "grad_norm": 0.21004167199134827, + "learning_rate": 0.001, + "loss": 2.2051, + "step": 5148 + }, + { + "epoch": 0.21782722734579912, + "grad_norm": 0.23345938324928284, + "learning_rate": 0.001, + "loss": 3.7967, + "step": 5149 + }, + { + "epoch": 0.2178695321093155, + "grad_norm": 0.44973114132881165, + "learning_rate": 0.001, + "loss": 3.6378, + "step": 5150 + }, + { + "epoch": 0.2179118368728319, + "grad_norm": 0.26152345538139343, + "learning_rate": 0.001, + "loss": 2.7861, + "step": 5151 + }, + { + "epoch": 0.21795414163634824, + "grad_norm": 0.3166126012802124, + "learning_rate": 0.001, + "loss": 2.9621, + "step": 5152 + }, + { + "epoch": 0.21799644639986462, + "grad_norm": 0.2214854210615158, + "learning_rate": 0.001, + "loss": 2.0622, + "step": 5153 + }, + { + "epoch": 0.218038751163381, + "grad_norm": 0.21160806715488434, + "learning_rate": 0.001, + "loss": 2.9396, + "step": 5154 + }, + { + "epoch": 0.21808105592689736, + "grad_norm": 0.21767565608024597, + "learning_rate": 0.001, + "loss": 1.7393, + "step": 5155 + }, + { + "epoch": 0.21812336069041374, + "grad_norm": 18.17046356201172, + "learning_rate": 0.001, + "loss": 2.2801, + "step": 5156 + }, + { + "epoch": 0.21816566545393012, + "grad_norm": 0.4204169511795044, + "learning_rate": 0.001, + "loss": 2.1432, + "step": 5157 + }, + { + "epoch": 0.21820797021744648, + "grad_norm": 0.42446818947792053, + "learning_rate": 0.001, + "loss": 1.7413, + "step": 5158 + }, + { + "epoch": 0.21825027498096286, + "grad_norm": 0.2123391479253769, + "learning_rate": 0.001, + "loss": 2.2634, + "step": 5159 + }, + { + "epoch": 0.21829257974447921, + "grad_norm": 0.2388673722743988, + "learning_rate": 0.001, + "loss": 2.2383, + "step": 5160 + }, + { + "epoch": 0.2183348845079956, + "grad_norm": 0.870901882648468, + "learning_rate": 0.001, + "loss": 3.157, + "step": 5161 + }, + { + "epoch": 0.21837718927151198, + "grad_norm": 0.3035809397697449, + "learning_rate": 0.001, + "loss": 3.1643, + "step": 5162 + }, + { + "epoch": 0.21841949403502833, + "grad_norm": 0.40469470620155334, + "learning_rate": 0.001, + "loss": 2.211, + "step": 5163 + }, + { + "epoch": 0.21846179879854472, + "grad_norm": 0.22423990070819855, + "learning_rate": 0.001, + "loss": 1.9705, + "step": 5164 + }, + { + "epoch": 0.2185041035620611, + "grad_norm": 0.20097650587558746, + "learning_rate": 0.001, + "loss": 2.2083, + "step": 5165 + }, + { + "epoch": 0.21854640832557745, + "grad_norm": 0.2772468626499176, + "learning_rate": 0.001, + "loss": 2.8581, + "step": 5166 + }, + { + "epoch": 0.21858871308909383, + "grad_norm": 0.20195744931697845, + "learning_rate": 0.001, + "loss": 1.8894, + "step": 5167 + }, + { + "epoch": 0.21863101785261022, + "grad_norm": 0.21510477364063263, + "learning_rate": 0.001, + "loss": 2.1195, + "step": 5168 + }, + { + "epoch": 0.21867332261612657, + "grad_norm": 0.2677229344844818, + "learning_rate": 0.001, + "loss": 2.9506, + "step": 5169 + }, + { + "epoch": 0.21871562737964295, + "grad_norm": 1.1444584131240845, + "learning_rate": 0.001, + "loss": 1.6645, + "step": 5170 + }, + { + "epoch": 0.2187579321431593, + "grad_norm": 0.30999642610549927, + "learning_rate": 0.001, + "loss": 2.7087, + "step": 5171 + }, + { + "epoch": 0.2188002369066757, + "grad_norm": 0.20951640605926514, + "learning_rate": 0.001, + "loss": 2.8629, + "step": 5172 + }, + { + "epoch": 0.21884254167019207, + "grad_norm": 0.2823593020439148, + "learning_rate": 0.001, + "loss": 1.8454, + "step": 5173 + }, + { + "epoch": 0.21888484643370842, + "grad_norm": 0.19759036600589752, + "learning_rate": 0.001, + "loss": 1.6832, + "step": 5174 + }, + { + "epoch": 0.2189271511972248, + "grad_norm": 1.2790015935897827, + "learning_rate": 0.001, + "loss": 2.2598, + "step": 5175 + }, + { + "epoch": 0.2189694559607412, + "grad_norm": 0.3004913926124573, + "learning_rate": 0.001, + "loss": 2.8445, + "step": 5176 + }, + { + "epoch": 0.21901176072425754, + "grad_norm": 0.1984034925699234, + "learning_rate": 0.001, + "loss": 2.0588, + "step": 5177 + }, + { + "epoch": 0.21905406548777392, + "grad_norm": 0.6654098033905029, + "learning_rate": 0.001, + "loss": 2.5635, + "step": 5178 + }, + { + "epoch": 0.2190963702512903, + "grad_norm": 0.21716728806495667, + "learning_rate": 0.001, + "loss": 1.8904, + "step": 5179 + }, + { + "epoch": 0.21913867501480666, + "grad_norm": 0.2529633939266205, + "learning_rate": 0.001, + "loss": 3.3847, + "step": 5180 + }, + { + "epoch": 0.21918097977832304, + "grad_norm": 0.24113009870052338, + "learning_rate": 0.001, + "loss": 2.8504, + "step": 5181 + }, + { + "epoch": 0.21922328454183942, + "grad_norm": 0.23012512922286987, + "learning_rate": 0.001, + "loss": 1.8835, + "step": 5182 + }, + { + "epoch": 0.21926558930535578, + "grad_norm": 0.2334389090538025, + "learning_rate": 0.001, + "loss": 2.9091, + "step": 5183 + }, + { + "epoch": 0.21930789406887216, + "grad_norm": 3.1486589908599854, + "learning_rate": 0.001, + "loss": 3.0328, + "step": 5184 + }, + { + "epoch": 0.21935019883238852, + "grad_norm": 0.18506896495819092, + "learning_rate": 0.001, + "loss": 3.1343, + "step": 5185 + }, + { + "epoch": 0.2193925035959049, + "grad_norm": 0.24055512249469757, + "learning_rate": 0.001, + "loss": 1.9749, + "step": 5186 + }, + { + "epoch": 0.21943480835942128, + "grad_norm": 0.3492937684059143, + "learning_rate": 0.001, + "loss": 2.3837, + "step": 5187 + }, + { + "epoch": 0.21947711312293763, + "grad_norm": 0.2223334014415741, + "learning_rate": 0.001, + "loss": 2.7189, + "step": 5188 + }, + { + "epoch": 0.21951941788645402, + "grad_norm": 2.0929033756256104, + "learning_rate": 0.001, + "loss": 2.3026, + "step": 5189 + }, + { + "epoch": 0.2195617226499704, + "grad_norm": 3.1999146938323975, + "learning_rate": 0.001, + "loss": 1.7523, + "step": 5190 + }, + { + "epoch": 0.21960402741348675, + "grad_norm": 0.19789402186870575, + "learning_rate": 0.001, + "loss": 1.6299, + "step": 5191 + }, + { + "epoch": 0.21964633217700313, + "grad_norm": 0.23743316531181335, + "learning_rate": 0.001, + "loss": 3.3135, + "step": 5192 + }, + { + "epoch": 0.21968863694051952, + "grad_norm": 0.2639296054840088, + "learning_rate": 0.001, + "loss": 2.3337, + "step": 5193 + }, + { + "epoch": 0.21973094170403587, + "grad_norm": 0.24590055644512177, + "learning_rate": 0.001, + "loss": 2.4579, + "step": 5194 + }, + { + "epoch": 0.21977324646755225, + "grad_norm": 0.2431252896785736, + "learning_rate": 0.001, + "loss": 2.1513, + "step": 5195 + }, + { + "epoch": 0.2198155512310686, + "grad_norm": 0.22125622630119324, + "learning_rate": 0.001, + "loss": 2.3895, + "step": 5196 + }, + { + "epoch": 0.219857855994585, + "grad_norm": 0.19629360735416412, + "learning_rate": 0.001, + "loss": 2.045, + "step": 5197 + }, + { + "epoch": 0.21990016075810137, + "grad_norm": 0.23924358189105988, + "learning_rate": 0.001, + "loss": 2.2106, + "step": 5198 + }, + { + "epoch": 0.21994246552161772, + "grad_norm": 0.21963438391685486, + "learning_rate": 0.001, + "loss": 2.2913, + "step": 5199 + }, + { + "epoch": 0.2199847702851341, + "grad_norm": 0.2901560366153717, + "learning_rate": 0.001, + "loss": 2.8481, + "step": 5200 + }, + { + "epoch": 0.2200270750486505, + "grad_norm": 0.2538588345050812, + "learning_rate": 0.001, + "loss": 3.4287, + "step": 5201 + }, + { + "epoch": 0.22006937981216684, + "grad_norm": 0.23842386901378632, + "learning_rate": 0.001, + "loss": 3.3795, + "step": 5202 + }, + { + "epoch": 0.22011168457568323, + "grad_norm": 0.255536288022995, + "learning_rate": 0.001, + "loss": 3.2765, + "step": 5203 + }, + { + "epoch": 0.2201539893391996, + "grad_norm": 4.658047199249268, + "learning_rate": 0.001, + "loss": 2.3231, + "step": 5204 + }, + { + "epoch": 0.22019629410271596, + "grad_norm": 0.20021434128284454, + "learning_rate": 0.001, + "loss": 2.3354, + "step": 5205 + }, + { + "epoch": 0.22023859886623234, + "grad_norm": 0.19822002947330475, + "learning_rate": 0.001, + "loss": 1.7009, + "step": 5206 + }, + { + "epoch": 0.2202809036297487, + "grad_norm": 0.21478721499443054, + "learning_rate": 0.001, + "loss": 2.0951, + "step": 5207 + }, + { + "epoch": 0.22032320839326508, + "grad_norm": 0.21483932435512543, + "learning_rate": 0.001, + "loss": 2.7571, + "step": 5208 + }, + { + "epoch": 0.22036551315678146, + "grad_norm": 0.23699632287025452, + "learning_rate": 0.001, + "loss": 2.2913, + "step": 5209 + }, + { + "epoch": 0.22040781792029782, + "grad_norm": 0.26198896765708923, + "learning_rate": 0.001, + "loss": 4.5366, + "step": 5210 + }, + { + "epoch": 0.2204501226838142, + "grad_norm": 0.21379588544368744, + "learning_rate": 0.001, + "loss": 1.9979, + "step": 5211 + }, + { + "epoch": 0.22049242744733058, + "grad_norm": 0.2645533084869385, + "learning_rate": 0.001, + "loss": 2.2221, + "step": 5212 + }, + { + "epoch": 0.22053473221084693, + "grad_norm": 0.19984637200832367, + "learning_rate": 0.001, + "loss": 2.0598, + "step": 5213 + }, + { + "epoch": 0.22057703697436332, + "grad_norm": 0.2022748589515686, + "learning_rate": 0.001, + "loss": 2.3214, + "step": 5214 + }, + { + "epoch": 0.2206193417378797, + "grad_norm": 0.20694908499717712, + "learning_rate": 0.001, + "loss": 2.0205, + "step": 5215 + }, + { + "epoch": 0.22066164650139605, + "grad_norm": 0.2556994557380676, + "learning_rate": 0.001, + "loss": 2.4558, + "step": 5216 + }, + { + "epoch": 0.22070395126491243, + "grad_norm": 0.7074611186981201, + "learning_rate": 0.001, + "loss": 1.6551, + "step": 5217 + }, + { + "epoch": 0.2207462560284288, + "grad_norm": 0.19164715707302094, + "learning_rate": 0.001, + "loss": 2.1213, + "step": 5218 + }, + { + "epoch": 0.22078856079194517, + "grad_norm": 0.32563894987106323, + "learning_rate": 0.001, + "loss": 2.366, + "step": 5219 + }, + { + "epoch": 0.22083086555546155, + "grad_norm": 0.21705080568790436, + "learning_rate": 0.001, + "loss": 2.4052, + "step": 5220 + }, + { + "epoch": 0.2208731703189779, + "grad_norm": 0.20345237851142883, + "learning_rate": 0.001, + "loss": 2.2577, + "step": 5221 + }, + { + "epoch": 0.2209154750824943, + "grad_norm": 0.2837914824485779, + "learning_rate": 0.001, + "loss": 2.1825, + "step": 5222 + }, + { + "epoch": 0.22095777984601067, + "grad_norm": 1.1149832010269165, + "learning_rate": 0.001, + "loss": 2.357, + "step": 5223 + }, + { + "epoch": 0.22100008460952703, + "grad_norm": 0.23472867906093597, + "learning_rate": 0.001, + "loss": 2.6525, + "step": 5224 + }, + { + "epoch": 0.2210423893730434, + "grad_norm": 0.6274850964546204, + "learning_rate": 0.001, + "loss": 3.9738, + "step": 5225 + }, + { + "epoch": 0.2210846941365598, + "grad_norm": 0.21737419068813324, + "learning_rate": 0.001, + "loss": 2.4552, + "step": 5226 + }, + { + "epoch": 0.22112699890007614, + "grad_norm": 0.2056940793991089, + "learning_rate": 0.001, + "loss": 2.8104, + "step": 5227 + }, + { + "epoch": 0.22116930366359253, + "grad_norm": 0.2175094336271286, + "learning_rate": 0.001, + "loss": 2.1255, + "step": 5228 + }, + { + "epoch": 0.22121160842710888, + "grad_norm": 0.39655065536499023, + "learning_rate": 0.001, + "loss": 1.9519, + "step": 5229 + }, + { + "epoch": 0.22125391319062526, + "grad_norm": 0.20697876811027527, + "learning_rate": 0.001, + "loss": 2.4179, + "step": 5230 + }, + { + "epoch": 0.22129621795414164, + "grad_norm": 0.217100128531456, + "learning_rate": 0.001, + "loss": 2.7213, + "step": 5231 + }, + { + "epoch": 0.221338522717658, + "grad_norm": 0.21442492306232452, + "learning_rate": 0.001, + "loss": 2.8483, + "step": 5232 + }, + { + "epoch": 0.22138082748117438, + "grad_norm": 0.21602658927440643, + "learning_rate": 0.001, + "loss": 1.6262, + "step": 5233 + }, + { + "epoch": 0.22142313224469076, + "grad_norm": 0.22940513491630554, + "learning_rate": 0.001, + "loss": 1.9484, + "step": 5234 + }, + { + "epoch": 0.22146543700820712, + "grad_norm": 3.8185322284698486, + "learning_rate": 0.001, + "loss": 2.8154, + "step": 5235 + }, + { + "epoch": 0.2215077417717235, + "grad_norm": 0.28936222195625305, + "learning_rate": 0.001, + "loss": 2.8811, + "step": 5236 + }, + { + "epoch": 0.22155004653523988, + "grad_norm": 0.24605505168437958, + "learning_rate": 0.001, + "loss": 3.03, + "step": 5237 + }, + { + "epoch": 0.22159235129875623, + "grad_norm": 0.20465493202209473, + "learning_rate": 0.001, + "loss": 2.3525, + "step": 5238 + }, + { + "epoch": 0.22163465606227262, + "grad_norm": 0.20358186960220337, + "learning_rate": 0.001, + "loss": 2.1494, + "step": 5239 + }, + { + "epoch": 0.22167696082578897, + "grad_norm": 0.2088003009557724, + "learning_rate": 0.001, + "loss": 1.9404, + "step": 5240 + }, + { + "epoch": 0.22171926558930535, + "grad_norm": 0.20697228610515594, + "learning_rate": 0.001, + "loss": 3.8656, + "step": 5241 + }, + { + "epoch": 0.22176157035282174, + "grad_norm": 0.22819316387176514, + "learning_rate": 0.001, + "loss": 2.1, + "step": 5242 + }, + { + "epoch": 0.2218038751163381, + "grad_norm": 0.32436373829841614, + "learning_rate": 0.001, + "loss": 3.2291, + "step": 5243 + }, + { + "epoch": 0.22184617987985447, + "grad_norm": 0.3842344880104065, + "learning_rate": 0.001, + "loss": 1.9864, + "step": 5244 + }, + { + "epoch": 0.22188848464337085, + "grad_norm": 0.22547200322151184, + "learning_rate": 0.001, + "loss": 2.1806, + "step": 5245 + }, + { + "epoch": 0.2219307894068872, + "grad_norm": 0.201228067278862, + "learning_rate": 0.001, + "loss": 2.6097, + "step": 5246 + }, + { + "epoch": 0.2219730941704036, + "grad_norm": 0.21469905972480774, + "learning_rate": 0.001, + "loss": 1.884, + "step": 5247 + }, + { + "epoch": 0.22201539893391997, + "grad_norm": 17.149986267089844, + "learning_rate": 0.001, + "loss": 2.0795, + "step": 5248 + }, + { + "epoch": 0.22205770369743633, + "grad_norm": 2.927840232849121, + "learning_rate": 0.001, + "loss": 2.0508, + "step": 5249 + }, + { + "epoch": 0.2221000084609527, + "grad_norm": 0.20757484436035156, + "learning_rate": 0.001, + "loss": 3.9396, + "step": 5250 + }, + { + "epoch": 0.22214231322446906, + "grad_norm": 0.22015203535556793, + "learning_rate": 0.001, + "loss": 2.512, + "step": 5251 + }, + { + "epoch": 0.22218461798798544, + "grad_norm": 0.22938333451747894, + "learning_rate": 0.001, + "loss": 2.2927, + "step": 5252 + }, + { + "epoch": 0.22222692275150183, + "grad_norm": 0.24902082979679108, + "learning_rate": 0.001, + "loss": 2.9212, + "step": 5253 + }, + { + "epoch": 0.22226922751501818, + "grad_norm": 0.23875977098941803, + "learning_rate": 0.001, + "loss": 2.0939, + "step": 5254 + }, + { + "epoch": 0.22231153227853456, + "grad_norm": 0.21278776228427887, + "learning_rate": 0.001, + "loss": 1.9339, + "step": 5255 + }, + { + "epoch": 0.22235383704205094, + "grad_norm": 0.2972838282585144, + "learning_rate": 0.001, + "loss": 3.0246, + "step": 5256 + }, + { + "epoch": 0.2223961418055673, + "grad_norm": 0.20768025517463684, + "learning_rate": 0.001, + "loss": 1.9835, + "step": 5257 + }, + { + "epoch": 0.22243844656908368, + "grad_norm": 5.894577980041504, + "learning_rate": 0.001, + "loss": 2.6385, + "step": 5258 + }, + { + "epoch": 0.22248075133260006, + "grad_norm": 0.2140832394361496, + "learning_rate": 0.001, + "loss": 2.7344, + "step": 5259 + }, + { + "epoch": 0.22252305609611642, + "grad_norm": 0.20773886144161224, + "learning_rate": 0.001, + "loss": 2.4667, + "step": 5260 + }, + { + "epoch": 0.2225653608596328, + "grad_norm": 0.21243473887443542, + "learning_rate": 0.001, + "loss": 2.1209, + "step": 5261 + }, + { + "epoch": 0.22260766562314915, + "grad_norm": 0.2120569497346878, + "learning_rate": 0.001, + "loss": 2.1573, + "step": 5262 + }, + { + "epoch": 0.22264997038666554, + "grad_norm": 0.25889766216278076, + "learning_rate": 0.001, + "loss": 2.5132, + "step": 5263 + }, + { + "epoch": 0.22269227515018192, + "grad_norm": 0.2106718271970749, + "learning_rate": 0.001, + "loss": 1.8965, + "step": 5264 + }, + { + "epoch": 0.22273457991369827, + "grad_norm": 0.2909772992134094, + "learning_rate": 0.001, + "loss": 2.1466, + "step": 5265 + }, + { + "epoch": 0.22277688467721465, + "grad_norm": 1.4828362464904785, + "learning_rate": 0.001, + "loss": 2.8965, + "step": 5266 + }, + { + "epoch": 0.22281918944073104, + "grad_norm": 0.23213310539722443, + "learning_rate": 0.001, + "loss": 2.0563, + "step": 5267 + }, + { + "epoch": 0.2228614942042474, + "grad_norm": 0.2343592345714569, + "learning_rate": 0.001, + "loss": 2.6482, + "step": 5268 + }, + { + "epoch": 0.22290379896776377, + "grad_norm": 0.2614687979221344, + "learning_rate": 0.001, + "loss": 2.7576, + "step": 5269 + }, + { + "epoch": 0.22294610373128015, + "grad_norm": 4.1902008056640625, + "learning_rate": 0.001, + "loss": 2.9158, + "step": 5270 + }, + { + "epoch": 0.2229884084947965, + "grad_norm": 0.20279575884342194, + "learning_rate": 0.001, + "loss": 2.205, + "step": 5271 + }, + { + "epoch": 0.2230307132583129, + "grad_norm": 0.373855322599411, + "learning_rate": 0.001, + "loss": 2.6883, + "step": 5272 + }, + { + "epoch": 0.22307301802182924, + "grad_norm": 0.3078487813472748, + "learning_rate": 0.001, + "loss": 1.9063, + "step": 5273 + }, + { + "epoch": 0.22311532278534563, + "grad_norm": 0.3240393102169037, + "learning_rate": 0.001, + "loss": 2.6232, + "step": 5274 + }, + { + "epoch": 0.223157627548862, + "grad_norm": 0.3263351321220398, + "learning_rate": 0.001, + "loss": 2.3034, + "step": 5275 + }, + { + "epoch": 0.22319993231237836, + "grad_norm": 0.7479755282402039, + "learning_rate": 0.001, + "loss": 1.8465, + "step": 5276 + }, + { + "epoch": 0.22324223707589474, + "grad_norm": 2.0020651817321777, + "learning_rate": 0.001, + "loss": 1.859, + "step": 5277 + }, + { + "epoch": 0.22328454183941113, + "grad_norm": 0.2206108123064041, + "learning_rate": 0.001, + "loss": 1.5864, + "step": 5278 + }, + { + "epoch": 0.22332684660292748, + "grad_norm": 3.234041452407837, + "learning_rate": 0.001, + "loss": 2.1619, + "step": 5279 + }, + { + "epoch": 0.22336915136644386, + "grad_norm": 0.21610672771930695, + "learning_rate": 0.001, + "loss": 1.948, + "step": 5280 + }, + { + "epoch": 0.22341145612996025, + "grad_norm": 1.4125299453735352, + "learning_rate": 0.001, + "loss": 2.7824, + "step": 5281 + }, + { + "epoch": 0.2234537608934766, + "grad_norm": 0.19420327246189117, + "learning_rate": 0.001, + "loss": 1.985, + "step": 5282 + }, + { + "epoch": 0.22349606565699298, + "grad_norm": 0.26585274934768677, + "learning_rate": 0.001, + "loss": 2.6315, + "step": 5283 + }, + { + "epoch": 0.22353837042050934, + "grad_norm": 1.4780011177062988, + "learning_rate": 0.001, + "loss": 2.5902, + "step": 5284 + }, + { + "epoch": 0.22358067518402572, + "grad_norm": 0.29642167687416077, + "learning_rate": 0.001, + "loss": 1.7713, + "step": 5285 + }, + { + "epoch": 0.2236229799475421, + "grad_norm": 0.192737877368927, + "learning_rate": 0.001, + "loss": 2.7865, + "step": 5286 + }, + { + "epoch": 0.22366528471105845, + "grad_norm": 2.1191747188568115, + "learning_rate": 0.001, + "loss": 2.4322, + "step": 5287 + }, + { + "epoch": 0.22370758947457484, + "grad_norm": 1.5505908727645874, + "learning_rate": 0.001, + "loss": 2.4076, + "step": 5288 + }, + { + "epoch": 0.22374989423809122, + "grad_norm": 0.2791816294193268, + "learning_rate": 0.001, + "loss": 2.6396, + "step": 5289 + }, + { + "epoch": 0.22379219900160757, + "grad_norm": 0.5878250598907471, + "learning_rate": 0.001, + "loss": 2.4715, + "step": 5290 + }, + { + "epoch": 0.22383450376512395, + "grad_norm": 0.22260555624961853, + "learning_rate": 0.001, + "loss": 1.734, + "step": 5291 + }, + { + "epoch": 0.22387680852864034, + "grad_norm": 0.27154138684272766, + "learning_rate": 0.001, + "loss": 2.9405, + "step": 5292 + }, + { + "epoch": 0.2239191132921567, + "grad_norm": 0.24614472687244415, + "learning_rate": 0.001, + "loss": 3.2545, + "step": 5293 + }, + { + "epoch": 0.22396141805567307, + "grad_norm": 0.26551756262779236, + "learning_rate": 0.001, + "loss": 2.62, + "step": 5294 + }, + { + "epoch": 0.22400372281918943, + "grad_norm": 0.23047612607479095, + "learning_rate": 0.001, + "loss": 1.9018, + "step": 5295 + }, + { + "epoch": 0.2240460275827058, + "grad_norm": 0.2711014151573181, + "learning_rate": 0.001, + "loss": 2.3669, + "step": 5296 + }, + { + "epoch": 0.2240883323462222, + "grad_norm": 0.2241722196340561, + "learning_rate": 0.001, + "loss": 2.2924, + "step": 5297 + }, + { + "epoch": 0.22413063710973855, + "grad_norm": 0.36074158549308777, + "learning_rate": 0.001, + "loss": 2.4914, + "step": 5298 + }, + { + "epoch": 0.22417294187325493, + "grad_norm": 0.21040941774845123, + "learning_rate": 0.001, + "loss": 2.2778, + "step": 5299 + }, + { + "epoch": 0.2242152466367713, + "grad_norm": 0.17329736053943634, + "learning_rate": 0.001, + "loss": 2.7799, + "step": 5300 + }, + { + "epoch": 0.22425755140028766, + "grad_norm": 1.4445072412490845, + "learning_rate": 0.001, + "loss": 2.6139, + "step": 5301 + }, + { + "epoch": 0.22429985616380405, + "grad_norm": 0.20153391361236572, + "learning_rate": 0.001, + "loss": 2.4595, + "step": 5302 + }, + { + "epoch": 0.22434216092732043, + "grad_norm": 0.3603563606739044, + "learning_rate": 0.001, + "loss": 3.4425, + "step": 5303 + }, + { + "epoch": 0.22438446569083678, + "grad_norm": 0.2601318955421448, + "learning_rate": 0.001, + "loss": 3.1128, + "step": 5304 + }, + { + "epoch": 0.22442677045435316, + "grad_norm": 0.1929735243320465, + "learning_rate": 0.001, + "loss": 1.8997, + "step": 5305 + }, + { + "epoch": 0.22446907521786955, + "grad_norm": 0.26424339413642883, + "learning_rate": 0.001, + "loss": 2.3221, + "step": 5306 + }, + { + "epoch": 0.2245113799813859, + "grad_norm": 0.396243155002594, + "learning_rate": 0.001, + "loss": 2.9599, + "step": 5307 + }, + { + "epoch": 0.22455368474490228, + "grad_norm": 1.7157014608383179, + "learning_rate": 0.001, + "loss": 2.1441, + "step": 5308 + }, + { + "epoch": 0.22459598950841864, + "grad_norm": 0.33265525102615356, + "learning_rate": 0.001, + "loss": 1.9819, + "step": 5309 + }, + { + "epoch": 0.22463829427193502, + "grad_norm": 0.6093931198120117, + "learning_rate": 0.001, + "loss": 2.9678, + "step": 5310 + }, + { + "epoch": 0.2246805990354514, + "grad_norm": 0.2151605635881424, + "learning_rate": 0.001, + "loss": 2.5349, + "step": 5311 + }, + { + "epoch": 0.22472290379896775, + "grad_norm": 8.028112411499023, + "learning_rate": 0.001, + "loss": 2.6196, + "step": 5312 + }, + { + "epoch": 0.22476520856248414, + "grad_norm": 1.0721027851104736, + "learning_rate": 0.001, + "loss": 3.4655, + "step": 5313 + }, + { + "epoch": 0.22480751332600052, + "grad_norm": 0.20032766461372375, + "learning_rate": 0.001, + "loss": 2.1955, + "step": 5314 + }, + { + "epoch": 0.22484981808951687, + "grad_norm": 0.20679010450839996, + "learning_rate": 0.001, + "loss": 1.743, + "step": 5315 + }, + { + "epoch": 0.22489212285303326, + "grad_norm": 0.23269832134246826, + "learning_rate": 0.001, + "loss": 1.9682, + "step": 5316 + }, + { + "epoch": 0.22493442761654964, + "grad_norm": 0.2920213043689728, + "learning_rate": 0.001, + "loss": 2.6719, + "step": 5317 + }, + { + "epoch": 0.224976732380066, + "grad_norm": 0.31730595231056213, + "learning_rate": 0.001, + "loss": 2.7025, + "step": 5318 + }, + { + "epoch": 0.22501903714358237, + "grad_norm": 0.20346704125404358, + "learning_rate": 0.001, + "loss": 2.528, + "step": 5319 + }, + { + "epoch": 0.22506134190709873, + "grad_norm": 0.9651232957839966, + "learning_rate": 0.001, + "loss": 2.3393, + "step": 5320 + }, + { + "epoch": 0.2251036466706151, + "grad_norm": 3.8393514156341553, + "learning_rate": 0.001, + "loss": 1.8216, + "step": 5321 + }, + { + "epoch": 0.2251459514341315, + "grad_norm": 0.1834096908569336, + "learning_rate": 0.001, + "loss": 2.0548, + "step": 5322 + }, + { + "epoch": 0.22518825619764785, + "grad_norm": 0.4497200846672058, + "learning_rate": 0.001, + "loss": 2.9522, + "step": 5323 + }, + { + "epoch": 0.22523056096116423, + "grad_norm": 0.21612843871116638, + "learning_rate": 0.001, + "loss": 2.9417, + "step": 5324 + }, + { + "epoch": 0.2252728657246806, + "grad_norm": 0.23549750447273254, + "learning_rate": 0.001, + "loss": 1.8287, + "step": 5325 + }, + { + "epoch": 0.22531517048819696, + "grad_norm": 0.25662145018577576, + "learning_rate": 0.001, + "loss": 3.0902, + "step": 5326 + }, + { + "epoch": 0.22535747525171335, + "grad_norm": 0.24648882448673248, + "learning_rate": 0.001, + "loss": 2.5687, + "step": 5327 + }, + { + "epoch": 0.22539978001522973, + "grad_norm": 0.23536236584186554, + "learning_rate": 0.001, + "loss": 2.0515, + "step": 5328 + }, + { + "epoch": 0.22544208477874608, + "grad_norm": 0.25583964586257935, + "learning_rate": 0.001, + "loss": 2.2743, + "step": 5329 + }, + { + "epoch": 0.22548438954226246, + "grad_norm": 6.257376194000244, + "learning_rate": 0.001, + "loss": 1.6795, + "step": 5330 + }, + { + "epoch": 0.22552669430577882, + "grad_norm": 1.3603640794754028, + "learning_rate": 0.001, + "loss": 3.1865, + "step": 5331 + }, + { + "epoch": 0.2255689990692952, + "grad_norm": 0.2052699625492096, + "learning_rate": 0.001, + "loss": 2.1218, + "step": 5332 + }, + { + "epoch": 0.22561130383281158, + "grad_norm": 0.2767829895019531, + "learning_rate": 0.001, + "loss": 2.3987, + "step": 5333 + }, + { + "epoch": 0.22565360859632794, + "grad_norm": 0.22107641398906708, + "learning_rate": 0.001, + "loss": 1.91, + "step": 5334 + }, + { + "epoch": 0.22569591335984432, + "grad_norm": 0.5406429767608643, + "learning_rate": 0.001, + "loss": 2.1801, + "step": 5335 + }, + { + "epoch": 0.2257382181233607, + "grad_norm": 0.3691091239452362, + "learning_rate": 0.001, + "loss": 2.8873, + "step": 5336 + }, + { + "epoch": 0.22578052288687706, + "grad_norm": 0.25709986686706543, + "learning_rate": 0.001, + "loss": 3.6867, + "step": 5337 + }, + { + "epoch": 0.22582282765039344, + "grad_norm": 1.0610352754592896, + "learning_rate": 0.001, + "loss": 2.0219, + "step": 5338 + }, + { + "epoch": 0.22586513241390982, + "grad_norm": 0.2490183711051941, + "learning_rate": 0.001, + "loss": 3.2717, + "step": 5339 + }, + { + "epoch": 0.22590743717742617, + "grad_norm": 0.3832576870918274, + "learning_rate": 0.001, + "loss": 2.6887, + "step": 5340 + }, + { + "epoch": 0.22594974194094256, + "grad_norm": 0.20792420208454132, + "learning_rate": 0.001, + "loss": 1.4877, + "step": 5341 + }, + { + "epoch": 0.2259920467044589, + "grad_norm": 0.2878759503364563, + "learning_rate": 0.001, + "loss": 2.5409, + "step": 5342 + }, + { + "epoch": 0.2260343514679753, + "grad_norm": 0.20281533896923065, + "learning_rate": 0.001, + "loss": 1.8222, + "step": 5343 + }, + { + "epoch": 0.22607665623149167, + "grad_norm": 0.3520990014076233, + "learning_rate": 0.001, + "loss": 2.7768, + "step": 5344 + }, + { + "epoch": 0.22611896099500803, + "grad_norm": 0.25387975573539734, + "learning_rate": 0.001, + "loss": 2.5626, + "step": 5345 + }, + { + "epoch": 0.2261612657585244, + "grad_norm": 0.2082633674144745, + "learning_rate": 0.001, + "loss": 1.8055, + "step": 5346 + }, + { + "epoch": 0.2262035705220408, + "grad_norm": 0.24384428560733795, + "learning_rate": 0.001, + "loss": 3.0511, + "step": 5347 + }, + { + "epoch": 0.22624587528555715, + "grad_norm": 0.2108755111694336, + "learning_rate": 0.001, + "loss": 2.4456, + "step": 5348 + }, + { + "epoch": 0.22628818004907353, + "grad_norm": 0.23625120520591736, + "learning_rate": 0.001, + "loss": 2.0264, + "step": 5349 + }, + { + "epoch": 0.2263304848125899, + "grad_norm": 0.19359584152698517, + "learning_rate": 0.001, + "loss": 2.1669, + "step": 5350 + }, + { + "epoch": 0.22637278957610626, + "grad_norm": 0.23231805860996246, + "learning_rate": 0.001, + "loss": 2.3967, + "step": 5351 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 0.2224705070257187, + "learning_rate": 0.001, + "loss": 3.0155, + "step": 5352 + }, + { + "epoch": 0.226457399103139, + "grad_norm": 0.18623998761177063, + "learning_rate": 0.001, + "loss": 2.7007, + "step": 5353 + }, + { + "epoch": 0.22649970386665538, + "grad_norm": 0.190797358751297, + "learning_rate": 0.001, + "loss": 1.8014, + "step": 5354 + }, + { + "epoch": 0.22654200863017177, + "grad_norm": 0.2000328004360199, + "learning_rate": 0.001, + "loss": 2.1544, + "step": 5355 + }, + { + "epoch": 0.22658431339368812, + "grad_norm": 0.26492246985435486, + "learning_rate": 0.001, + "loss": 2.5089, + "step": 5356 + }, + { + "epoch": 0.2266266181572045, + "grad_norm": 0.22106075286865234, + "learning_rate": 0.001, + "loss": 2.0971, + "step": 5357 + }, + { + "epoch": 0.22666892292072088, + "grad_norm": 0.20364297926425934, + "learning_rate": 0.001, + "loss": 2.6095, + "step": 5358 + }, + { + "epoch": 0.22671122768423724, + "grad_norm": 0.30333849787712097, + "learning_rate": 0.001, + "loss": 1.6666, + "step": 5359 + }, + { + "epoch": 0.22675353244775362, + "grad_norm": 0.20348311960697174, + "learning_rate": 0.001, + "loss": 1.8586, + "step": 5360 + }, + { + "epoch": 0.22679583721127, + "grad_norm": 0.265525221824646, + "learning_rate": 0.001, + "loss": 2.3425, + "step": 5361 + }, + { + "epoch": 0.22683814197478636, + "grad_norm": 0.2070625275373459, + "learning_rate": 0.001, + "loss": 1.855, + "step": 5362 + }, + { + "epoch": 0.22688044673830274, + "grad_norm": 0.24439737200737, + "learning_rate": 0.001, + "loss": 2.8247, + "step": 5363 + }, + { + "epoch": 0.2269227515018191, + "grad_norm": 0.18254782259464264, + "learning_rate": 0.001, + "loss": 2.1448, + "step": 5364 + }, + { + "epoch": 0.22696505626533547, + "grad_norm": 0.2069178968667984, + "learning_rate": 0.001, + "loss": 2.2075, + "step": 5365 + }, + { + "epoch": 0.22700736102885186, + "grad_norm": 0.2451086789369583, + "learning_rate": 0.001, + "loss": 2.7744, + "step": 5366 + }, + { + "epoch": 0.2270496657923682, + "grad_norm": 0.2338859587907791, + "learning_rate": 0.001, + "loss": 2.2202, + "step": 5367 + }, + { + "epoch": 0.2270919705558846, + "grad_norm": 0.17298074066638947, + "learning_rate": 0.001, + "loss": 2.8615, + "step": 5368 + }, + { + "epoch": 0.22713427531940097, + "grad_norm": 0.20199984312057495, + "learning_rate": 0.001, + "loss": 2.1147, + "step": 5369 + }, + { + "epoch": 0.22717658008291733, + "grad_norm": 0.18716251850128174, + "learning_rate": 0.001, + "loss": 2.4151, + "step": 5370 + }, + { + "epoch": 0.2272188848464337, + "grad_norm": 0.22359853982925415, + "learning_rate": 0.001, + "loss": 2.2439, + "step": 5371 + }, + { + "epoch": 0.2272611896099501, + "grad_norm": 0.1923590749502182, + "learning_rate": 0.001, + "loss": 1.878, + "step": 5372 + }, + { + "epoch": 0.22730349437346645, + "grad_norm": 0.20053477585315704, + "learning_rate": 0.001, + "loss": 2.8747, + "step": 5373 + }, + { + "epoch": 0.22734579913698283, + "grad_norm": 0.20686081051826477, + "learning_rate": 0.001, + "loss": 1.8966, + "step": 5374 + }, + { + "epoch": 0.22738810390049918, + "grad_norm": 0.2037486433982849, + "learning_rate": 0.001, + "loss": 2.1247, + "step": 5375 + }, + { + "epoch": 0.22743040866401557, + "grad_norm": 0.2048976719379425, + "learning_rate": 0.001, + "loss": 2.5233, + "step": 5376 + }, + { + "epoch": 0.22747271342753195, + "grad_norm": 0.22234363853931427, + "learning_rate": 0.001, + "loss": 3.5596, + "step": 5377 + }, + { + "epoch": 0.2275150181910483, + "grad_norm": 0.17984293401241302, + "learning_rate": 0.001, + "loss": 1.7908, + "step": 5378 + }, + { + "epoch": 0.22755732295456468, + "grad_norm": 0.19727393984794617, + "learning_rate": 0.001, + "loss": 1.9901, + "step": 5379 + }, + { + "epoch": 0.22759962771808107, + "grad_norm": 0.6455583572387695, + "learning_rate": 0.001, + "loss": 2.2072, + "step": 5380 + }, + { + "epoch": 0.22764193248159742, + "grad_norm": 0.1867457777261734, + "learning_rate": 0.001, + "loss": 2.5531, + "step": 5381 + }, + { + "epoch": 0.2276842372451138, + "grad_norm": 0.23731602728366852, + "learning_rate": 0.001, + "loss": 2.3504, + "step": 5382 + }, + { + "epoch": 0.22772654200863018, + "grad_norm": 0.1865188479423523, + "learning_rate": 0.001, + "loss": 2.1391, + "step": 5383 + }, + { + "epoch": 0.22776884677214654, + "grad_norm": 0.26208534836769104, + "learning_rate": 0.001, + "loss": 2.0536, + "step": 5384 + }, + { + "epoch": 0.22781115153566292, + "grad_norm": 0.16341978311538696, + "learning_rate": 0.001, + "loss": 2.8151, + "step": 5385 + }, + { + "epoch": 0.22785345629917927, + "grad_norm": 0.2644388675689697, + "learning_rate": 0.001, + "loss": 2.5974, + "step": 5386 + }, + { + "epoch": 0.22789576106269566, + "grad_norm": 0.18066078424453735, + "learning_rate": 0.001, + "loss": 1.6, + "step": 5387 + }, + { + "epoch": 0.22793806582621204, + "grad_norm": 0.34033629298210144, + "learning_rate": 0.001, + "loss": 1.4947, + "step": 5388 + }, + { + "epoch": 0.2279803705897284, + "grad_norm": 0.19720013439655304, + "learning_rate": 0.001, + "loss": 1.9336, + "step": 5389 + }, + { + "epoch": 0.22802267535324477, + "grad_norm": 0.20138509571552277, + "learning_rate": 0.001, + "loss": 2.4402, + "step": 5390 + }, + { + "epoch": 0.22806498011676116, + "grad_norm": 0.163251593708992, + "learning_rate": 0.001, + "loss": 1.7993, + "step": 5391 + }, + { + "epoch": 0.2281072848802775, + "grad_norm": 0.20354165136814117, + "learning_rate": 0.001, + "loss": 2.5252, + "step": 5392 + }, + { + "epoch": 0.2281495896437939, + "grad_norm": 6.216144561767578, + "learning_rate": 0.001, + "loss": 2.3808, + "step": 5393 + }, + { + "epoch": 0.22819189440731028, + "grad_norm": 0.21125631034374237, + "learning_rate": 0.001, + "loss": 2.29, + "step": 5394 + }, + { + "epoch": 0.22823419917082663, + "grad_norm": 0.19609850645065308, + "learning_rate": 0.001, + "loss": 1.9172, + "step": 5395 + }, + { + "epoch": 0.228276503934343, + "grad_norm": 0.22020597755908966, + "learning_rate": 0.001, + "loss": 3.1209, + "step": 5396 + }, + { + "epoch": 0.22831880869785937, + "grad_norm": 0.21470773220062256, + "learning_rate": 0.001, + "loss": 2.4758, + "step": 5397 + }, + { + "epoch": 0.22836111346137575, + "grad_norm": 0.24168118834495544, + "learning_rate": 0.001, + "loss": 2.8874, + "step": 5398 + }, + { + "epoch": 0.22840341822489213, + "grad_norm": 0.2095698118209839, + "learning_rate": 0.001, + "loss": 2.1628, + "step": 5399 + }, + { + "epoch": 0.22844572298840848, + "grad_norm": 0.2221812754869461, + "learning_rate": 0.001, + "loss": 2.1466, + "step": 5400 + }, + { + "epoch": 0.22848802775192487, + "grad_norm": 0.17952899634838104, + "learning_rate": 0.001, + "loss": 2.3154, + "step": 5401 + }, + { + "epoch": 0.22853033251544125, + "grad_norm": 0.1853719800710678, + "learning_rate": 0.001, + "loss": 1.7711, + "step": 5402 + }, + { + "epoch": 0.2285726372789576, + "grad_norm": 0.2924804985523224, + "learning_rate": 0.001, + "loss": 1.6892, + "step": 5403 + }, + { + "epoch": 0.22861494204247398, + "grad_norm": 0.20289921760559082, + "learning_rate": 0.001, + "loss": 2.1891, + "step": 5404 + }, + { + "epoch": 0.22865724680599037, + "grad_norm": 0.2075899988412857, + "learning_rate": 0.001, + "loss": 2.1022, + "step": 5405 + }, + { + "epoch": 0.22869955156950672, + "grad_norm": 0.19035397469997406, + "learning_rate": 0.001, + "loss": 1.5325, + "step": 5406 + }, + { + "epoch": 0.2287418563330231, + "grad_norm": 9.618221282958984, + "learning_rate": 0.001, + "loss": 2.1168, + "step": 5407 + }, + { + "epoch": 0.22878416109653946, + "grad_norm": 0.20112530887126923, + "learning_rate": 0.001, + "loss": 2.3236, + "step": 5408 + }, + { + "epoch": 0.22882646586005584, + "grad_norm": 0.24566976726055145, + "learning_rate": 0.001, + "loss": 2.5824, + "step": 5409 + }, + { + "epoch": 0.22886877062357222, + "grad_norm": 0.42258378863334656, + "learning_rate": 0.001, + "loss": 2.3928, + "step": 5410 + }, + { + "epoch": 0.22891107538708858, + "grad_norm": 0.17424896359443665, + "learning_rate": 0.001, + "loss": 2.8459, + "step": 5411 + }, + { + "epoch": 0.22895338015060496, + "grad_norm": 0.23018798232078552, + "learning_rate": 0.001, + "loss": 2.3751, + "step": 5412 + }, + { + "epoch": 0.22899568491412134, + "grad_norm": 16.62144660949707, + "learning_rate": 0.001, + "loss": 1.8728, + "step": 5413 + }, + { + "epoch": 0.2290379896776377, + "grad_norm": 0.19507117569446564, + "learning_rate": 0.001, + "loss": 2.0108, + "step": 5414 + }, + { + "epoch": 0.22908029444115408, + "grad_norm": 0.18236559629440308, + "learning_rate": 0.001, + "loss": 1.877, + "step": 5415 + }, + { + "epoch": 0.22912259920467046, + "grad_norm": 0.18296857178211212, + "learning_rate": 0.001, + "loss": 2.3096, + "step": 5416 + }, + { + "epoch": 0.2291649039681868, + "grad_norm": 0.22751617431640625, + "learning_rate": 0.001, + "loss": 2.0818, + "step": 5417 + }, + { + "epoch": 0.2292072087317032, + "grad_norm": 0.22728577256202698, + "learning_rate": 0.001, + "loss": 2.0094, + "step": 5418 + }, + { + "epoch": 0.22924951349521955, + "grad_norm": 0.20583270490169525, + "learning_rate": 0.001, + "loss": 1.8739, + "step": 5419 + }, + { + "epoch": 0.22929181825873593, + "grad_norm": 0.2076723724603653, + "learning_rate": 0.001, + "loss": 1.8793, + "step": 5420 + }, + { + "epoch": 0.2293341230222523, + "grad_norm": 0.23529089987277985, + "learning_rate": 0.001, + "loss": 1.4192, + "step": 5421 + }, + { + "epoch": 0.22937642778576867, + "grad_norm": 1.0901966094970703, + "learning_rate": 0.001, + "loss": 2.0787, + "step": 5422 + }, + { + "epoch": 0.22941873254928505, + "grad_norm": 0.21631963551044464, + "learning_rate": 0.001, + "loss": 2.0501, + "step": 5423 + }, + { + "epoch": 0.22946103731280143, + "grad_norm": 0.2734993100166321, + "learning_rate": 0.001, + "loss": 3.2331, + "step": 5424 + }, + { + "epoch": 0.22950334207631778, + "grad_norm": 0.23572872579097748, + "learning_rate": 0.001, + "loss": 2.8713, + "step": 5425 + }, + { + "epoch": 0.22954564683983417, + "grad_norm": 0.19519785046577454, + "learning_rate": 0.001, + "loss": 2.4311, + "step": 5426 + }, + { + "epoch": 0.22958795160335055, + "grad_norm": 0.19540639221668243, + "learning_rate": 0.001, + "loss": 2.0317, + "step": 5427 + }, + { + "epoch": 0.2296302563668669, + "grad_norm": 0.28334343433380127, + "learning_rate": 0.001, + "loss": 3.7543, + "step": 5428 + }, + { + "epoch": 0.22967256113038328, + "grad_norm": 0.18688379228115082, + "learning_rate": 0.001, + "loss": 1.9003, + "step": 5429 + }, + { + "epoch": 0.22971486589389967, + "grad_norm": 1.3220264911651611, + "learning_rate": 0.001, + "loss": 1.7229, + "step": 5430 + }, + { + "epoch": 0.22975717065741602, + "grad_norm": 0.23925921320915222, + "learning_rate": 0.001, + "loss": 2.1974, + "step": 5431 + }, + { + "epoch": 0.2297994754209324, + "grad_norm": 0.25730374455451965, + "learning_rate": 0.001, + "loss": 3.2443, + "step": 5432 + }, + { + "epoch": 0.22984178018444876, + "grad_norm": 1.0473192930221558, + "learning_rate": 0.001, + "loss": 2.2703, + "step": 5433 + }, + { + "epoch": 0.22988408494796514, + "grad_norm": 0.20525427162647247, + "learning_rate": 0.001, + "loss": 2.2747, + "step": 5434 + }, + { + "epoch": 0.22992638971148152, + "grad_norm": 0.24831177294254303, + "learning_rate": 0.001, + "loss": 2.3047, + "step": 5435 + }, + { + "epoch": 0.22996869447499788, + "grad_norm": 0.16241610050201416, + "learning_rate": 0.001, + "loss": 1.5626, + "step": 5436 + }, + { + "epoch": 0.23001099923851426, + "grad_norm": 0.23656898736953735, + "learning_rate": 0.001, + "loss": 1.8401, + "step": 5437 + }, + { + "epoch": 0.23005330400203064, + "grad_norm": 0.21393883228302002, + "learning_rate": 0.001, + "loss": 2.2871, + "step": 5438 + }, + { + "epoch": 0.230095608765547, + "grad_norm": 0.471842497587204, + "learning_rate": 0.001, + "loss": 2.9926, + "step": 5439 + }, + { + "epoch": 0.23013791352906338, + "grad_norm": 0.20878411829471588, + "learning_rate": 0.001, + "loss": 1.7661, + "step": 5440 + }, + { + "epoch": 0.23018021829257976, + "grad_norm": 0.21290957927703857, + "learning_rate": 0.001, + "loss": 2.2779, + "step": 5441 + }, + { + "epoch": 0.2302225230560961, + "grad_norm": 0.2180158644914627, + "learning_rate": 0.001, + "loss": 2.9909, + "step": 5442 + }, + { + "epoch": 0.2302648278196125, + "grad_norm": 0.22294098138809204, + "learning_rate": 0.001, + "loss": 2.7471, + "step": 5443 + }, + { + "epoch": 0.23030713258312885, + "grad_norm": 0.20272763073444366, + "learning_rate": 0.001, + "loss": 2.2887, + "step": 5444 + }, + { + "epoch": 0.23034943734664523, + "grad_norm": 0.5208730697631836, + "learning_rate": 0.001, + "loss": 1.786, + "step": 5445 + }, + { + "epoch": 0.2303917421101616, + "grad_norm": 0.23959921300411224, + "learning_rate": 0.001, + "loss": 2.5374, + "step": 5446 + }, + { + "epoch": 0.23043404687367797, + "grad_norm": 0.22310377657413483, + "learning_rate": 0.001, + "loss": 2.9862, + "step": 5447 + }, + { + "epoch": 0.23047635163719435, + "grad_norm": 0.27970150113105774, + "learning_rate": 0.001, + "loss": 2.2442, + "step": 5448 + }, + { + "epoch": 0.23051865640071073, + "grad_norm": 0.230902299284935, + "learning_rate": 0.001, + "loss": 1.6546, + "step": 5449 + }, + { + "epoch": 0.23056096116422709, + "grad_norm": 0.22653928399085999, + "learning_rate": 0.001, + "loss": 2.9463, + "step": 5450 + }, + { + "epoch": 0.23060326592774347, + "grad_norm": 0.25787416100502014, + "learning_rate": 0.001, + "loss": 2.4843, + "step": 5451 + }, + { + "epoch": 0.23064557069125985, + "grad_norm": 0.44561514258384705, + "learning_rate": 0.001, + "loss": 2.011, + "step": 5452 + }, + { + "epoch": 0.2306878754547762, + "grad_norm": 0.27553293108940125, + "learning_rate": 0.001, + "loss": 2.7563, + "step": 5453 + }, + { + "epoch": 0.23073018021829259, + "grad_norm": 0.2613121569156647, + "learning_rate": 0.001, + "loss": 2.2846, + "step": 5454 + }, + { + "epoch": 0.23077248498180894, + "grad_norm": 0.36091411113739014, + "learning_rate": 0.001, + "loss": 3.0635, + "step": 5455 + }, + { + "epoch": 0.23081478974532532, + "grad_norm": 0.19998879730701447, + "learning_rate": 0.001, + "loss": 2.578, + "step": 5456 + }, + { + "epoch": 0.2308570945088417, + "grad_norm": 0.235711470246315, + "learning_rate": 0.001, + "loss": 2.1896, + "step": 5457 + }, + { + "epoch": 0.23089939927235806, + "grad_norm": 0.7797324061393738, + "learning_rate": 0.001, + "loss": 2.4483, + "step": 5458 + }, + { + "epoch": 0.23094170403587444, + "grad_norm": 0.17661023139953613, + "learning_rate": 0.001, + "loss": 2.4288, + "step": 5459 + }, + { + "epoch": 0.23098400879939082, + "grad_norm": 0.18640023469924927, + "learning_rate": 0.001, + "loss": 2.1796, + "step": 5460 + }, + { + "epoch": 0.23102631356290718, + "grad_norm": 7.270983695983887, + "learning_rate": 0.001, + "loss": 2.1537, + "step": 5461 + }, + { + "epoch": 0.23106861832642356, + "grad_norm": 0.5781380534172058, + "learning_rate": 0.001, + "loss": 2.9149, + "step": 5462 + }, + { + "epoch": 0.23111092308993994, + "grad_norm": 0.21426080167293549, + "learning_rate": 0.001, + "loss": 2.6075, + "step": 5463 + }, + { + "epoch": 0.2311532278534563, + "grad_norm": 0.24475079774856567, + "learning_rate": 0.001, + "loss": 2.4721, + "step": 5464 + }, + { + "epoch": 0.23119553261697268, + "grad_norm": 1.0542508363723755, + "learning_rate": 0.001, + "loss": 2.0507, + "step": 5465 + }, + { + "epoch": 0.23123783738048903, + "grad_norm": 0.2142096906900406, + "learning_rate": 0.001, + "loss": 1.8177, + "step": 5466 + }, + { + "epoch": 0.2312801421440054, + "grad_norm": 0.25511589646339417, + "learning_rate": 0.001, + "loss": 2.9547, + "step": 5467 + }, + { + "epoch": 0.2313224469075218, + "grad_norm": 4.037811279296875, + "learning_rate": 0.001, + "loss": 2.0729, + "step": 5468 + }, + { + "epoch": 0.23136475167103815, + "grad_norm": 0.42371609807014465, + "learning_rate": 0.001, + "loss": 2.684, + "step": 5469 + }, + { + "epoch": 0.23140705643455453, + "grad_norm": 0.25769296288490295, + "learning_rate": 0.001, + "loss": 1.8104, + "step": 5470 + }, + { + "epoch": 0.2314493611980709, + "grad_norm": 0.9849298596382141, + "learning_rate": 0.001, + "loss": 3.1677, + "step": 5471 + }, + { + "epoch": 0.23149166596158727, + "grad_norm": 10.10745620727539, + "learning_rate": 0.001, + "loss": 2.3539, + "step": 5472 + }, + { + "epoch": 0.23153397072510365, + "grad_norm": 0.28518936038017273, + "learning_rate": 0.001, + "loss": 3.5426, + "step": 5473 + }, + { + "epoch": 0.23157627548862003, + "grad_norm": 0.40552154183387756, + "learning_rate": 0.001, + "loss": 2.9542, + "step": 5474 + }, + { + "epoch": 0.23161858025213639, + "grad_norm": 0.26154863834381104, + "learning_rate": 0.001, + "loss": 1.9963, + "step": 5475 + }, + { + "epoch": 0.23166088501565277, + "grad_norm": 0.2571628987789154, + "learning_rate": 0.001, + "loss": 1.95, + "step": 5476 + }, + { + "epoch": 0.23170318977916912, + "grad_norm": 0.2619466185569763, + "learning_rate": 0.001, + "loss": 2.0637, + "step": 5477 + }, + { + "epoch": 0.2317454945426855, + "grad_norm": 0.28519323468208313, + "learning_rate": 0.001, + "loss": 2.5762, + "step": 5478 + }, + { + "epoch": 0.23178779930620189, + "grad_norm": 1.2424720525741577, + "learning_rate": 0.001, + "loss": 3.0967, + "step": 5479 + }, + { + "epoch": 0.23183010406971824, + "grad_norm": 0.4895488917827606, + "learning_rate": 0.001, + "loss": 3.3258, + "step": 5480 + }, + { + "epoch": 0.23187240883323462, + "grad_norm": 0.39501601457595825, + "learning_rate": 0.001, + "loss": 2.6346, + "step": 5481 + }, + { + "epoch": 0.231914713596751, + "grad_norm": 0.2693521976470947, + "learning_rate": 0.001, + "loss": 2.1174, + "step": 5482 + }, + { + "epoch": 0.23195701836026736, + "grad_norm": 0.2762487828731537, + "learning_rate": 0.001, + "loss": 2.0984, + "step": 5483 + }, + { + "epoch": 0.23199932312378374, + "grad_norm": 0.2225947082042694, + "learning_rate": 0.001, + "loss": 1.9871, + "step": 5484 + }, + { + "epoch": 0.23204162788730012, + "grad_norm": 0.38060519099235535, + "learning_rate": 0.001, + "loss": 2.3824, + "step": 5485 + }, + { + "epoch": 0.23208393265081648, + "grad_norm": 0.21459133923053741, + "learning_rate": 0.001, + "loss": 2.0364, + "step": 5486 + }, + { + "epoch": 0.23212623741433286, + "grad_norm": 0.5198157429695129, + "learning_rate": 0.001, + "loss": 3.1656, + "step": 5487 + }, + { + "epoch": 0.2321685421778492, + "grad_norm": 0.675767183303833, + "learning_rate": 0.001, + "loss": 3.456, + "step": 5488 + }, + { + "epoch": 0.2322108469413656, + "grad_norm": 0.22795476019382477, + "learning_rate": 0.001, + "loss": 3.3561, + "step": 5489 + }, + { + "epoch": 0.23225315170488198, + "grad_norm": 0.9936857223510742, + "learning_rate": 0.001, + "loss": 2.4905, + "step": 5490 + }, + { + "epoch": 0.23229545646839833, + "grad_norm": 0.3095644414424896, + "learning_rate": 0.001, + "loss": 2.0372, + "step": 5491 + }, + { + "epoch": 0.2323377612319147, + "grad_norm": 0.18799656629562378, + "learning_rate": 0.001, + "loss": 1.9836, + "step": 5492 + }, + { + "epoch": 0.2323800659954311, + "grad_norm": 0.21074536442756653, + "learning_rate": 0.001, + "loss": 2.2601, + "step": 5493 + }, + { + "epoch": 0.23242237075894745, + "grad_norm": 0.32169005274772644, + "learning_rate": 0.001, + "loss": 3.0188, + "step": 5494 + }, + { + "epoch": 0.23246467552246383, + "grad_norm": 0.21272505819797516, + "learning_rate": 0.001, + "loss": 2.1103, + "step": 5495 + }, + { + "epoch": 0.2325069802859802, + "grad_norm": 0.4344709515571594, + "learning_rate": 0.001, + "loss": 3.0217, + "step": 5496 + }, + { + "epoch": 0.23254928504949657, + "grad_norm": 1.6699187755584717, + "learning_rate": 0.001, + "loss": 2.5814, + "step": 5497 + }, + { + "epoch": 0.23259158981301295, + "grad_norm": 0.22509506344795227, + "learning_rate": 0.001, + "loss": 2.8708, + "step": 5498 + }, + { + "epoch": 0.2326338945765293, + "grad_norm": 0.24898000061511993, + "learning_rate": 0.001, + "loss": 2.8715, + "step": 5499 + }, + { + "epoch": 0.2326761993400457, + "grad_norm": 0.7220833897590637, + "learning_rate": 0.001, + "loss": 2.1767, + "step": 5500 + }, + { + "epoch": 0.23271850410356207, + "grad_norm": 0.25471919775009155, + "learning_rate": 0.001, + "loss": 3.0134, + "step": 5501 + }, + { + "epoch": 0.23276080886707842, + "grad_norm": 0.2646358907222748, + "learning_rate": 0.001, + "loss": 2.0253, + "step": 5502 + }, + { + "epoch": 0.2328031136305948, + "grad_norm": 0.20259220898151398, + "learning_rate": 0.001, + "loss": 2.053, + "step": 5503 + }, + { + "epoch": 0.2328454183941112, + "grad_norm": 0.49306970834732056, + "learning_rate": 0.001, + "loss": 2.2513, + "step": 5504 + }, + { + "epoch": 0.23288772315762754, + "grad_norm": 0.2066371738910675, + "learning_rate": 0.001, + "loss": 2.7308, + "step": 5505 + }, + { + "epoch": 0.23293002792114392, + "grad_norm": 0.20641134679317474, + "learning_rate": 0.001, + "loss": 2.5732, + "step": 5506 + }, + { + "epoch": 0.2329723326846603, + "grad_norm": 0.21827174723148346, + "learning_rate": 0.001, + "loss": 2.6558, + "step": 5507 + }, + { + "epoch": 0.23301463744817666, + "grad_norm": 0.2925966680049896, + "learning_rate": 0.001, + "loss": 2.6788, + "step": 5508 + }, + { + "epoch": 0.23305694221169304, + "grad_norm": 0.2853943109512329, + "learning_rate": 0.001, + "loss": 2.5224, + "step": 5509 + }, + { + "epoch": 0.2330992469752094, + "grad_norm": 0.20848669111728668, + "learning_rate": 0.001, + "loss": 2.3025, + "step": 5510 + }, + { + "epoch": 0.23314155173872578, + "grad_norm": 1.7982761859893799, + "learning_rate": 0.001, + "loss": 2.1487, + "step": 5511 + }, + { + "epoch": 0.23318385650224216, + "grad_norm": 0.2628767490386963, + "learning_rate": 0.001, + "loss": 2.7454, + "step": 5512 + }, + { + "epoch": 0.2332261612657585, + "grad_norm": 0.21487084031105042, + "learning_rate": 0.001, + "loss": 1.7897, + "step": 5513 + }, + { + "epoch": 0.2332684660292749, + "grad_norm": 0.2565120756626129, + "learning_rate": 0.001, + "loss": 2.3647, + "step": 5514 + }, + { + "epoch": 0.23331077079279128, + "grad_norm": 0.2229587584733963, + "learning_rate": 0.001, + "loss": 1.7196, + "step": 5515 + }, + { + "epoch": 0.23335307555630763, + "grad_norm": 0.23122352361679077, + "learning_rate": 0.001, + "loss": 3.264, + "step": 5516 + }, + { + "epoch": 0.23339538031982401, + "grad_norm": 0.2294238656759262, + "learning_rate": 0.001, + "loss": 2.1468, + "step": 5517 + }, + { + "epoch": 0.2334376850833404, + "grad_norm": 1.3923436403274536, + "learning_rate": 0.001, + "loss": 2.8618, + "step": 5518 + }, + { + "epoch": 0.23347998984685675, + "grad_norm": 4.941551685333252, + "learning_rate": 0.001, + "loss": 2.7653, + "step": 5519 + }, + { + "epoch": 0.23352229461037313, + "grad_norm": 0.23686207830905914, + "learning_rate": 0.001, + "loss": 1.6694, + "step": 5520 + }, + { + "epoch": 0.2335645993738895, + "grad_norm": 0.38587716221809387, + "learning_rate": 0.001, + "loss": 2.2298, + "step": 5521 + }, + { + "epoch": 0.23360690413740587, + "grad_norm": 0.2625442147254944, + "learning_rate": 0.001, + "loss": 2.0646, + "step": 5522 + }, + { + "epoch": 0.23364920890092225, + "grad_norm": 0.2582058906555176, + "learning_rate": 0.001, + "loss": 2.6844, + "step": 5523 + }, + { + "epoch": 0.2336915136644386, + "grad_norm": 2.8828814029693604, + "learning_rate": 0.001, + "loss": 2.1936, + "step": 5524 + }, + { + "epoch": 0.233733818427955, + "grad_norm": 0.2290409654378891, + "learning_rate": 0.001, + "loss": 1.9075, + "step": 5525 + }, + { + "epoch": 0.23377612319147137, + "grad_norm": 0.1932823210954666, + "learning_rate": 0.001, + "loss": 3.3037, + "step": 5526 + }, + { + "epoch": 0.23381842795498772, + "grad_norm": 0.20359812676906586, + "learning_rate": 0.001, + "loss": 2.209, + "step": 5527 + }, + { + "epoch": 0.2338607327185041, + "grad_norm": 0.3113921582698822, + "learning_rate": 0.001, + "loss": 1.766, + "step": 5528 + }, + { + "epoch": 0.2339030374820205, + "grad_norm": 20.93682098388672, + "learning_rate": 0.001, + "loss": 2.5909, + "step": 5529 + }, + { + "epoch": 0.23394534224553684, + "grad_norm": 0.2424592673778534, + "learning_rate": 0.001, + "loss": 2.3092, + "step": 5530 + }, + { + "epoch": 0.23398764700905322, + "grad_norm": 0.22494091093540192, + "learning_rate": 0.001, + "loss": 2.2942, + "step": 5531 + }, + { + "epoch": 0.23402995177256958, + "grad_norm": 0.20457018911838531, + "learning_rate": 0.001, + "loss": 2.31, + "step": 5532 + }, + { + "epoch": 0.23407225653608596, + "grad_norm": 0.32877421379089355, + "learning_rate": 0.001, + "loss": 1.8861, + "step": 5533 + }, + { + "epoch": 0.23411456129960234, + "grad_norm": 0.42228415608406067, + "learning_rate": 0.001, + "loss": 2.7353, + "step": 5534 + }, + { + "epoch": 0.2341568660631187, + "grad_norm": 0.47094154357910156, + "learning_rate": 0.001, + "loss": 3.8558, + "step": 5535 + }, + { + "epoch": 0.23419917082663508, + "grad_norm": 0.2175481915473938, + "learning_rate": 0.001, + "loss": 2.0443, + "step": 5536 + }, + { + "epoch": 0.23424147559015146, + "grad_norm": 3.194316864013672, + "learning_rate": 0.001, + "loss": 2.5488, + "step": 5537 + }, + { + "epoch": 0.23428378035366781, + "grad_norm": 0.3026067912578583, + "learning_rate": 0.001, + "loss": 2.4307, + "step": 5538 + }, + { + "epoch": 0.2343260851171842, + "grad_norm": 0.8778800368309021, + "learning_rate": 0.001, + "loss": 1.6685, + "step": 5539 + }, + { + "epoch": 0.23436838988070058, + "grad_norm": 0.1916915625333786, + "learning_rate": 0.001, + "loss": 2.2532, + "step": 5540 + }, + { + "epoch": 0.23441069464421693, + "grad_norm": 0.28568512201309204, + "learning_rate": 0.001, + "loss": 2.0453, + "step": 5541 + }, + { + "epoch": 0.23445299940773331, + "grad_norm": 0.5260047316551208, + "learning_rate": 0.001, + "loss": 3.2691, + "step": 5542 + }, + { + "epoch": 0.23449530417124967, + "grad_norm": 0.2400904893875122, + "learning_rate": 0.001, + "loss": 2.483, + "step": 5543 + }, + { + "epoch": 0.23453760893476605, + "grad_norm": 0.232883483171463, + "learning_rate": 0.001, + "loss": 3.0007, + "step": 5544 + }, + { + "epoch": 0.23457991369828243, + "grad_norm": 0.21904799342155457, + "learning_rate": 0.001, + "loss": 2.0221, + "step": 5545 + }, + { + "epoch": 0.2346222184617988, + "grad_norm": 0.27808424830436707, + "learning_rate": 0.001, + "loss": 1.7361, + "step": 5546 + }, + { + "epoch": 0.23466452322531517, + "grad_norm": 0.33674612641334534, + "learning_rate": 0.001, + "loss": 2.5554, + "step": 5547 + }, + { + "epoch": 0.23470682798883155, + "grad_norm": 0.21071641147136688, + "learning_rate": 0.001, + "loss": 2.0207, + "step": 5548 + }, + { + "epoch": 0.2347491327523479, + "grad_norm": 0.20552270114421844, + "learning_rate": 0.001, + "loss": 2.2293, + "step": 5549 + }, + { + "epoch": 0.2347914375158643, + "grad_norm": 0.31367790699005127, + "learning_rate": 0.001, + "loss": 2.6712, + "step": 5550 + }, + { + "epoch": 0.23483374227938067, + "grad_norm": 0.7364101409912109, + "learning_rate": 0.001, + "loss": 2.0776, + "step": 5551 + }, + { + "epoch": 0.23487604704289702, + "grad_norm": 0.24621638655662537, + "learning_rate": 0.001, + "loss": 2.0778, + "step": 5552 + }, + { + "epoch": 0.2349183518064134, + "grad_norm": 0.23604516685009003, + "learning_rate": 0.001, + "loss": 3.3079, + "step": 5553 + }, + { + "epoch": 0.2349606565699298, + "grad_norm": 0.8118863105773926, + "learning_rate": 0.001, + "loss": 2.4182, + "step": 5554 + }, + { + "epoch": 0.23500296133344614, + "grad_norm": 0.21892663836479187, + "learning_rate": 0.001, + "loss": 2.2327, + "step": 5555 + }, + { + "epoch": 0.23504526609696252, + "grad_norm": 0.28527218103408813, + "learning_rate": 0.001, + "loss": 3.7358, + "step": 5556 + }, + { + "epoch": 0.23508757086047888, + "grad_norm": 0.2317507416009903, + "learning_rate": 0.001, + "loss": 3.2241, + "step": 5557 + }, + { + "epoch": 0.23512987562399526, + "grad_norm": 0.23719051480293274, + "learning_rate": 0.001, + "loss": 2.1869, + "step": 5558 + }, + { + "epoch": 0.23517218038751164, + "grad_norm": 0.2504566013813019, + "learning_rate": 0.001, + "loss": 2.2632, + "step": 5559 + }, + { + "epoch": 0.235214485151028, + "grad_norm": 0.2797103226184845, + "learning_rate": 0.001, + "loss": 2.0476, + "step": 5560 + }, + { + "epoch": 0.23525678991454438, + "grad_norm": 0.223208948969841, + "learning_rate": 0.001, + "loss": 2.3264, + "step": 5561 + }, + { + "epoch": 0.23529909467806076, + "grad_norm": 2.268007755279541, + "learning_rate": 0.001, + "loss": 3.4196, + "step": 5562 + }, + { + "epoch": 0.23534139944157711, + "grad_norm": 0.20520193874835968, + "learning_rate": 0.001, + "loss": 2.1592, + "step": 5563 + }, + { + "epoch": 0.2353837042050935, + "grad_norm": 0.1952827274799347, + "learning_rate": 0.001, + "loss": 2.2686, + "step": 5564 + }, + { + "epoch": 0.23542600896860988, + "grad_norm": 0.24060063064098358, + "learning_rate": 0.001, + "loss": 2.6335, + "step": 5565 + }, + { + "epoch": 0.23546831373212623, + "grad_norm": 0.18970201909542084, + "learning_rate": 0.001, + "loss": 2.1595, + "step": 5566 + }, + { + "epoch": 0.23551061849564262, + "grad_norm": 0.20681913197040558, + "learning_rate": 0.001, + "loss": 2.7761, + "step": 5567 + }, + { + "epoch": 0.23555292325915897, + "grad_norm": 1.009856104850769, + "learning_rate": 0.001, + "loss": 3.113, + "step": 5568 + }, + { + "epoch": 0.23559522802267535, + "grad_norm": 0.20202496647834778, + "learning_rate": 0.001, + "loss": 2.7176, + "step": 5569 + }, + { + "epoch": 0.23563753278619173, + "grad_norm": 0.2296670824289322, + "learning_rate": 0.001, + "loss": 1.9414, + "step": 5570 + }, + { + "epoch": 0.2356798375497081, + "grad_norm": 2.0302107334136963, + "learning_rate": 0.001, + "loss": 3.5255, + "step": 5571 + }, + { + "epoch": 0.23572214231322447, + "grad_norm": 0.2701435983181, + "learning_rate": 0.001, + "loss": 2.0376, + "step": 5572 + }, + { + "epoch": 0.23576444707674085, + "grad_norm": 1.3323795795440674, + "learning_rate": 0.001, + "loss": 3.0352, + "step": 5573 + }, + { + "epoch": 0.2358067518402572, + "grad_norm": 0.2649135887622833, + "learning_rate": 0.001, + "loss": 2.5785, + "step": 5574 + }, + { + "epoch": 0.2358490566037736, + "grad_norm": 0.21373842656612396, + "learning_rate": 0.001, + "loss": 1.9535, + "step": 5575 + }, + { + "epoch": 0.23589136136728997, + "grad_norm": 0.2295471578836441, + "learning_rate": 0.001, + "loss": 1.9545, + "step": 5576 + }, + { + "epoch": 0.23593366613080632, + "grad_norm": 0.32685786485671997, + "learning_rate": 0.001, + "loss": 1.8934, + "step": 5577 + }, + { + "epoch": 0.2359759708943227, + "grad_norm": 0.22259478271007538, + "learning_rate": 0.001, + "loss": 2.1198, + "step": 5578 + }, + { + "epoch": 0.23601827565783906, + "grad_norm": 0.2274249643087387, + "learning_rate": 0.001, + "loss": 2.2864, + "step": 5579 + }, + { + "epoch": 0.23606058042135544, + "grad_norm": 0.23607851564884186, + "learning_rate": 0.001, + "loss": 3.7908, + "step": 5580 + }, + { + "epoch": 0.23610288518487182, + "grad_norm": 0.22319360077381134, + "learning_rate": 0.001, + "loss": 1.8991, + "step": 5581 + }, + { + "epoch": 0.23614518994838818, + "grad_norm": 0.1910727173089981, + "learning_rate": 0.001, + "loss": 2.1673, + "step": 5582 + }, + { + "epoch": 0.23618749471190456, + "grad_norm": 0.35509058833122253, + "learning_rate": 0.001, + "loss": 1.9815, + "step": 5583 + }, + { + "epoch": 0.23622979947542094, + "grad_norm": 0.5298141837120056, + "learning_rate": 0.001, + "loss": 2.5921, + "step": 5584 + }, + { + "epoch": 0.2362721042389373, + "grad_norm": 0.2098468840122223, + "learning_rate": 0.001, + "loss": 2.8481, + "step": 5585 + }, + { + "epoch": 0.23631440900245368, + "grad_norm": 0.6692335605621338, + "learning_rate": 0.001, + "loss": 2.6247, + "step": 5586 + }, + { + "epoch": 0.23635671376597006, + "grad_norm": 0.2933887541294098, + "learning_rate": 0.001, + "loss": 2.677, + "step": 5587 + }, + { + "epoch": 0.23639901852948642, + "grad_norm": 1.0850200653076172, + "learning_rate": 0.001, + "loss": 2.6977, + "step": 5588 + }, + { + "epoch": 0.2364413232930028, + "grad_norm": 8.362639427185059, + "learning_rate": 0.001, + "loss": 2.4108, + "step": 5589 + }, + { + "epoch": 0.23648362805651915, + "grad_norm": 0.1976199746131897, + "learning_rate": 0.001, + "loss": 2.2232, + "step": 5590 + }, + { + "epoch": 0.23652593282003553, + "grad_norm": 1.5596935749053955, + "learning_rate": 0.001, + "loss": 2.3909, + "step": 5591 + }, + { + "epoch": 0.23656823758355192, + "grad_norm": 0.3760131895542145, + "learning_rate": 0.001, + "loss": 2.1015, + "step": 5592 + }, + { + "epoch": 0.23661054234706827, + "grad_norm": 0.4867786169052124, + "learning_rate": 0.001, + "loss": 2.3708, + "step": 5593 + }, + { + "epoch": 0.23665284711058465, + "grad_norm": 0.3288508355617523, + "learning_rate": 0.001, + "loss": 3.3302, + "step": 5594 + }, + { + "epoch": 0.23669515187410103, + "grad_norm": 0.3380516767501831, + "learning_rate": 0.001, + "loss": 2.3632, + "step": 5595 + }, + { + "epoch": 0.2367374566376174, + "grad_norm": 0.2778700590133667, + "learning_rate": 0.001, + "loss": 2.2904, + "step": 5596 + }, + { + "epoch": 0.23677976140113377, + "grad_norm": 2.0466156005859375, + "learning_rate": 0.001, + "loss": 2.2768, + "step": 5597 + }, + { + "epoch": 0.23682206616465015, + "grad_norm": 0.5960863828659058, + "learning_rate": 0.001, + "loss": 2.5767, + "step": 5598 + }, + { + "epoch": 0.2368643709281665, + "grad_norm": 0.3480601906776428, + "learning_rate": 0.001, + "loss": 2.7427, + "step": 5599 + }, + { + "epoch": 0.2369066756916829, + "grad_norm": 0.22647280991077423, + "learning_rate": 0.001, + "loss": 4.0171, + "step": 5600 + }, + { + "epoch": 0.23694898045519924, + "grad_norm": 0.37474966049194336, + "learning_rate": 0.001, + "loss": 2.2962, + "step": 5601 + }, + { + "epoch": 0.23699128521871562, + "grad_norm": 0.35157617926597595, + "learning_rate": 0.001, + "loss": 3.5374, + "step": 5602 + }, + { + "epoch": 0.237033589982232, + "grad_norm": 1.5529965162277222, + "learning_rate": 0.001, + "loss": 3.0981, + "step": 5603 + }, + { + "epoch": 0.23707589474574836, + "grad_norm": 0.4362848103046417, + "learning_rate": 0.001, + "loss": 3.007, + "step": 5604 + }, + { + "epoch": 0.23711819950926474, + "grad_norm": 0.2729444205760956, + "learning_rate": 0.001, + "loss": 2.2965, + "step": 5605 + }, + { + "epoch": 0.23716050427278113, + "grad_norm": 0.2660609781742096, + "learning_rate": 0.001, + "loss": 2.2928, + "step": 5606 + }, + { + "epoch": 0.23720280903629748, + "grad_norm": 0.23978281021118164, + "learning_rate": 0.001, + "loss": 2.5681, + "step": 5607 + }, + { + "epoch": 0.23724511379981386, + "grad_norm": 0.21553561091423035, + "learning_rate": 0.001, + "loss": 1.9796, + "step": 5608 + }, + { + "epoch": 0.23728741856333024, + "grad_norm": 0.5897154808044434, + "learning_rate": 0.001, + "loss": 2.986, + "step": 5609 + }, + { + "epoch": 0.2373297233268466, + "grad_norm": 0.22807319462299347, + "learning_rate": 0.001, + "loss": 3.1874, + "step": 5610 + }, + { + "epoch": 0.23737202809036298, + "grad_norm": 0.22051000595092773, + "learning_rate": 0.001, + "loss": 1.8367, + "step": 5611 + }, + { + "epoch": 0.23741433285387933, + "grad_norm": 59.09831237792969, + "learning_rate": 0.001, + "loss": 2.6667, + "step": 5612 + }, + { + "epoch": 0.23745663761739572, + "grad_norm": 0.21315006911754608, + "learning_rate": 0.001, + "loss": 2.1231, + "step": 5613 + }, + { + "epoch": 0.2374989423809121, + "grad_norm": 0.45930665731430054, + "learning_rate": 0.001, + "loss": 3.1726, + "step": 5614 + }, + { + "epoch": 0.23754124714442845, + "grad_norm": 0.5922689437866211, + "learning_rate": 0.001, + "loss": 1.8162, + "step": 5615 + }, + { + "epoch": 0.23758355190794483, + "grad_norm": 0.23892509937286377, + "learning_rate": 0.001, + "loss": 2.2867, + "step": 5616 + }, + { + "epoch": 0.23762585667146122, + "grad_norm": 0.2679390013217926, + "learning_rate": 0.001, + "loss": 2.9999, + "step": 5617 + }, + { + "epoch": 0.23766816143497757, + "grad_norm": 0.7704877257347107, + "learning_rate": 0.001, + "loss": 2.2267, + "step": 5618 + }, + { + "epoch": 0.23771046619849395, + "grad_norm": 0.4141584038734436, + "learning_rate": 0.001, + "loss": 3.1586, + "step": 5619 + }, + { + "epoch": 0.23775277096201033, + "grad_norm": 0.6287165284156799, + "learning_rate": 0.001, + "loss": 2.379, + "step": 5620 + }, + { + "epoch": 0.2377950757255267, + "grad_norm": 0.19241921603679657, + "learning_rate": 0.001, + "loss": 1.791, + "step": 5621 + }, + { + "epoch": 0.23783738048904307, + "grad_norm": 0.2662370502948761, + "learning_rate": 0.001, + "loss": 1.7727, + "step": 5622 + }, + { + "epoch": 0.23787968525255943, + "grad_norm": 68.88314819335938, + "learning_rate": 0.001, + "loss": 2.168, + "step": 5623 + }, + { + "epoch": 0.2379219900160758, + "grad_norm": 0.8763318061828613, + "learning_rate": 0.001, + "loss": 3.0477, + "step": 5624 + }, + { + "epoch": 0.2379642947795922, + "grad_norm": 0.878894567489624, + "learning_rate": 0.001, + "loss": 1.9467, + "step": 5625 + }, + { + "epoch": 0.23800659954310854, + "grad_norm": 0.4422621428966522, + "learning_rate": 0.001, + "loss": 2.7905, + "step": 5626 + }, + { + "epoch": 0.23804890430662493, + "grad_norm": 5.424615383148193, + "learning_rate": 0.001, + "loss": 2.3152, + "step": 5627 + }, + { + "epoch": 0.2380912090701413, + "grad_norm": 2.8828539848327637, + "learning_rate": 0.001, + "loss": 2.4996, + "step": 5628 + }, + { + "epoch": 0.23813351383365766, + "grad_norm": 1.1331160068511963, + "learning_rate": 0.001, + "loss": 2.6441, + "step": 5629 + }, + { + "epoch": 0.23817581859717404, + "grad_norm": 1.2550442218780518, + "learning_rate": 0.001, + "loss": 3.1826, + "step": 5630 + }, + { + "epoch": 0.23821812336069043, + "grad_norm": 0.27109989523887634, + "learning_rate": 0.001, + "loss": 3.46, + "step": 5631 + }, + { + "epoch": 0.23826042812420678, + "grad_norm": 0.2896735966205597, + "learning_rate": 0.001, + "loss": 2.7865, + "step": 5632 + }, + { + "epoch": 0.23830273288772316, + "grad_norm": 0.2764877676963806, + "learning_rate": 0.001, + "loss": 2.0332, + "step": 5633 + }, + { + "epoch": 0.23834503765123952, + "grad_norm": 1.915027141571045, + "learning_rate": 0.001, + "loss": 2.3694, + "step": 5634 + }, + { + "epoch": 0.2383873424147559, + "grad_norm": 0.4358770549297333, + "learning_rate": 0.001, + "loss": 2.3084, + "step": 5635 + }, + { + "epoch": 0.23842964717827228, + "grad_norm": 0.2749125063419342, + "learning_rate": 0.001, + "loss": 2.1625, + "step": 5636 + }, + { + "epoch": 0.23847195194178863, + "grad_norm": 0.29137587547302246, + "learning_rate": 0.001, + "loss": 3.2571, + "step": 5637 + }, + { + "epoch": 0.23851425670530502, + "grad_norm": 0.2710089385509491, + "learning_rate": 0.001, + "loss": 2.8085, + "step": 5638 + }, + { + "epoch": 0.2385565614688214, + "grad_norm": 0.3743971288204193, + "learning_rate": 0.001, + "loss": 3.0382, + "step": 5639 + }, + { + "epoch": 0.23859886623233775, + "grad_norm": 0.6103678345680237, + "learning_rate": 0.001, + "loss": 1.8752, + "step": 5640 + }, + { + "epoch": 0.23864117099585413, + "grad_norm": 0.2807563543319702, + "learning_rate": 0.001, + "loss": 2.6665, + "step": 5641 + }, + { + "epoch": 0.23868347575937052, + "grad_norm": 0.4093710780143738, + "learning_rate": 0.001, + "loss": 2.1668, + "step": 5642 + }, + { + "epoch": 0.23872578052288687, + "grad_norm": 0.2517501711845398, + "learning_rate": 0.001, + "loss": 3.2762, + "step": 5643 + }, + { + "epoch": 0.23876808528640325, + "grad_norm": 0.21582388877868652, + "learning_rate": 0.001, + "loss": 2.8667, + "step": 5644 + }, + { + "epoch": 0.2388103900499196, + "grad_norm": 0.22406849265098572, + "learning_rate": 0.001, + "loss": 2.279, + "step": 5645 + }, + { + "epoch": 0.238852694813436, + "grad_norm": 0.17993216216564178, + "learning_rate": 0.001, + "loss": 1.6104, + "step": 5646 + }, + { + "epoch": 0.23889499957695237, + "grad_norm": 0.29950782656669617, + "learning_rate": 0.001, + "loss": 2.2527, + "step": 5647 + }, + { + "epoch": 0.23893730434046873, + "grad_norm": 0.20286992192268372, + "learning_rate": 0.001, + "loss": 2.5939, + "step": 5648 + }, + { + "epoch": 0.2389796091039851, + "grad_norm": 0.5291157960891724, + "learning_rate": 0.001, + "loss": 2.1778, + "step": 5649 + }, + { + "epoch": 0.2390219138675015, + "grad_norm": 0.34574270248413086, + "learning_rate": 0.001, + "loss": 2.5221, + "step": 5650 + }, + { + "epoch": 0.23906421863101784, + "grad_norm": 0.25159138441085815, + "learning_rate": 0.001, + "loss": 2.4263, + "step": 5651 + }, + { + "epoch": 0.23910652339453423, + "grad_norm": 1.9903714656829834, + "learning_rate": 0.001, + "loss": 2.5326, + "step": 5652 + }, + { + "epoch": 0.2391488281580506, + "grad_norm": 0.26104992628097534, + "learning_rate": 0.001, + "loss": 3.192, + "step": 5653 + }, + { + "epoch": 0.23919113292156696, + "grad_norm": 0.21419936418533325, + "learning_rate": 0.001, + "loss": 2.0116, + "step": 5654 + }, + { + "epoch": 0.23923343768508334, + "grad_norm": 0.49943071603775024, + "learning_rate": 0.001, + "loss": 1.9931, + "step": 5655 + }, + { + "epoch": 0.2392757424485997, + "grad_norm": 0.23732519149780273, + "learning_rate": 0.001, + "loss": 2.1189, + "step": 5656 + }, + { + "epoch": 0.23931804721211608, + "grad_norm": 0.2006048560142517, + "learning_rate": 0.001, + "loss": 2.1357, + "step": 5657 + }, + { + "epoch": 0.23936035197563246, + "grad_norm": 0.2843586504459381, + "learning_rate": 0.001, + "loss": 2.3875, + "step": 5658 + }, + { + "epoch": 0.23940265673914882, + "grad_norm": 0.16184455156326294, + "learning_rate": 0.001, + "loss": 2.2042, + "step": 5659 + }, + { + "epoch": 0.2394449615026652, + "grad_norm": 0.1909385770559311, + "learning_rate": 0.001, + "loss": 2.8616, + "step": 5660 + }, + { + "epoch": 0.23948726626618158, + "grad_norm": 0.18999890983104706, + "learning_rate": 0.001, + "loss": 1.9102, + "step": 5661 + }, + { + "epoch": 0.23952957102969794, + "grad_norm": 2.6552658081054688, + "learning_rate": 0.001, + "loss": 2.8128, + "step": 5662 + }, + { + "epoch": 0.23957187579321432, + "grad_norm": 0.21462197601795197, + "learning_rate": 0.001, + "loss": 2.404, + "step": 5663 + }, + { + "epoch": 0.2396141805567307, + "grad_norm": 0.27352675795555115, + "learning_rate": 0.001, + "loss": 2.4358, + "step": 5664 + }, + { + "epoch": 0.23965648532024705, + "grad_norm": 0.22441208362579346, + "learning_rate": 0.001, + "loss": 2.3097, + "step": 5665 + }, + { + "epoch": 0.23969879008376344, + "grad_norm": 0.3627524673938751, + "learning_rate": 0.001, + "loss": 3.7734, + "step": 5666 + }, + { + "epoch": 0.23974109484727982, + "grad_norm": 0.4762495756149292, + "learning_rate": 0.001, + "loss": 2.1284, + "step": 5667 + }, + { + "epoch": 0.23978339961079617, + "grad_norm": 0.2010953575372696, + "learning_rate": 0.001, + "loss": 2.329, + "step": 5668 + }, + { + "epoch": 0.23982570437431255, + "grad_norm": 0.24924032390117645, + "learning_rate": 0.001, + "loss": 3.1285, + "step": 5669 + }, + { + "epoch": 0.2398680091378289, + "grad_norm": 0.6464244723320007, + "learning_rate": 0.001, + "loss": 2.4777, + "step": 5670 + }, + { + "epoch": 0.2399103139013453, + "grad_norm": 0.1977522075176239, + "learning_rate": 0.001, + "loss": 2.1083, + "step": 5671 + }, + { + "epoch": 0.23995261866486167, + "grad_norm": 0.1925719529390335, + "learning_rate": 0.001, + "loss": 2.2735, + "step": 5672 + }, + { + "epoch": 0.23999492342837803, + "grad_norm": 0.18989835679531097, + "learning_rate": 0.001, + "loss": 2.142, + "step": 5673 + }, + { + "epoch": 0.2400372281918944, + "grad_norm": 0.21661527454853058, + "learning_rate": 0.001, + "loss": 2.0072, + "step": 5674 + }, + { + "epoch": 0.2400795329554108, + "grad_norm": 1.2266058921813965, + "learning_rate": 0.001, + "loss": 2.5989, + "step": 5675 + }, + { + "epoch": 0.24012183771892714, + "grad_norm": 0.20844648778438568, + "learning_rate": 0.001, + "loss": 1.9061, + "step": 5676 + }, + { + "epoch": 0.24016414248244353, + "grad_norm": 0.27534279227256775, + "learning_rate": 0.001, + "loss": 2.631, + "step": 5677 + }, + { + "epoch": 0.2402064472459599, + "grad_norm": 0.2036500722169876, + "learning_rate": 0.001, + "loss": 1.604, + "step": 5678 + }, + { + "epoch": 0.24024875200947626, + "grad_norm": 0.18914766609668732, + "learning_rate": 0.001, + "loss": 1.9546, + "step": 5679 + }, + { + "epoch": 0.24029105677299264, + "grad_norm": 0.2073379009962082, + "learning_rate": 0.001, + "loss": 2.5842, + "step": 5680 + }, + { + "epoch": 0.240333361536509, + "grad_norm": 0.32036593556404114, + "learning_rate": 0.001, + "loss": 2.5532, + "step": 5681 + }, + { + "epoch": 0.24037566630002538, + "grad_norm": 0.1830267608165741, + "learning_rate": 0.001, + "loss": 2.4035, + "step": 5682 + }, + { + "epoch": 0.24041797106354176, + "grad_norm": 0.1883174479007721, + "learning_rate": 0.001, + "loss": 2.1087, + "step": 5683 + }, + { + "epoch": 0.24046027582705812, + "grad_norm": 0.2873155176639557, + "learning_rate": 0.001, + "loss": 2.2658, + "step": 5684 + }, + { + "epoch": 0.2405025805905745, + "grad_norm": 0.22399446368217468, + "learning_rate": 0.001, + "loss": 1.8286, + "step": 5685 + }, + { + "epoch": 0.24054488535409088, + "grad_norm": 0.7013282775878906, + "learning_rate": 0.001, + "loss": 2.4934, + "step": 5686 + }, + { + "epoch": 0.24058719011760724, + "grad_norm": 0.21409347653388977, + "learning_rate": 0.001, + "loss": 2.2924, + "step": 5687 + }, + { + "epoch": 0.24062949488112362, + "grad_norm": 2.542304515838623, + "learning_rate": 0.001, + "loss": 2.3646, + "step": 5688 + }, + { + "epoch": 0.24067179964464, + "grad_norm": 0.39416250586509705, + "learning_rate": 0.001, + "loss": 2.9602, + "step": 5689 + }, + { + "epoch": 0.24071410440815635, + "grad_norm": 0.307919979095459, + "learning_rate": 0.001, + "loss": 1.6349, + "step": 5690 + }, + { + "epoch": 0.24075640917167274, + "grad_norm": 0.24086859822273254, + "learning_rate": 0.001, + "loss": 2.0097, + "step": 5691 + }, + { + "epoch": 0.2407987139351891, + "grad_norm": 0.1972251832485199, + "learning_rate": 0.001, + "loss": 1.7443, + "step": 5692 + }, + { + "epoch": 0.24084101869870547, + "grad_norm": 0.5142681002616882, + "learning_rate": 0.001, + "loss": 2.1825, + "step": 5693 + }, + { + "epoch": 0.24088332346222185, + "grad_norm": 0.28559404611587524, + "learning_rate": 0.001, + "loss": 2.4702, + "step": 5694 + }, + { + "epoch": 0.2409256282257382, + "grad_norm": 0.20487889647483826, + "learning_rate": 0.001, + "loss": 1.9676, + "step": 5695 + }, + { + "epoch": 0.2409679329892546, + "grad_norm": 0.20150291919708252, + "learning_rate": 0.001, + "loss": 1.9557, + "step": 5696 + }, + { + "epoch": 0.24101023775277097, + "grad_norm": 0.2249046266078949, + "learning_rate": 0.001, + "loss": 2.1542, + "step": 5697 + }, + { + "epoch": 0.24105254251628733, + "grad_norm": 0.20710447430610657, + "learning_rate": 0.001, + "loss": 1.6919, + "step": 5698 + }, + { + "epoch": 0.2410948472798037, + "grad_norm": 0.2290615290403366, + "learning_rate": 0.001, + "loss": 2.3559, + "step": 5699 + }, + { + "epoch": 0.2411371520433201, + "grad_norm": 0.3252545893192291, + "learning_rate": 0.001, + "loss": 2.7864, + "step": 5700 + }, + { + "epoch": 0.24117945680683645, + "grad_norm": 0.21582552790641785, + "learning_rate": 0.001, + "loss": 1.7943, + "step": 5701 + }, + { + "epoch": 0.24122176157035283, + "grad_norm": 0.2762306332588196, + "learning_rate": 0.001, + "loss": 2.1902, + "step": 5702 + }, + { + "epoch": 0.24126406633386918, + "grad_norm": 0.22092297673225403, + "learning_rate": 0.001, + "loss": 2.4124, + "step": 5703 + }, + { + "epoch": 0.24130637109738556, + "grad_norm": 1.6592704057693481, + "learning_rate": 0.001, + "loss": 1.9375, + "step": 5704 + }, + { + "epoch": 0.24134867586090195, + "grad_norm": 0.2536323666572571, + "learning_rate": 0.001, + "loss": 2.0439, + "step": 5705 + }, + { + "epoch": 0.2413909806244183, + "grad_norm": 0.23811465501785278, + "learning_rate": 0.001, + "loss": 2.3419, + "step": 5706 + }, + { + "epoch": 0.24143328538793468, + "grad_norm": 8.864980697631836, + "learning_rate": 0.001, + "loss": 3.1004, + "step": 5707 + }, + { + "epoch": 0.24147559015145106, + "grad_norm": 0.23312745988368988, + "learning_rate": 0.001, + "loss": 2.076, + "step": 5708 + }, + { + "epoch": 0.24151789491496742, + "grad_norm": 0.2500420808792114, + "learning_rate": 0.001, + "loss": 1.8015, + "step": 5709 + }, + { + "epoch": 0.2415601996784838, + "grad_norm": 0.397246390581131, + "learning_rate": 0.001, + "loss": 2.0747, + "step": 5710 + }, + { + "epoch": 0.24160250444200018, + "grad_norm": 0.256539523601532, + "learning_rate": 0.001, + "loss": 2.0406, + "step": 5711 + }, + { + "epoch": 0.24164480920551654, + "grad_norm": 0.2969229817390442, + "learning_rate": 0.001, + "loss": 2.8112, + "step": 5712 + }, + { + "epoch": 0.24168711396903292, + "grad_norm": 0.35546842217445374, + "learning_rate": 0.001, + "loss": 3.5403, + "step": 5713 + }, + { + "epoch": 0.24172941873254927, + "grad_norm": 0.22269092500209808, + "learning_rate": 0.001, + "loss": 1.9222, + "step": 5714 + }, + { + "epoch": 0.24177172349606565, + "grad_norm": 0.20512692630290985, + "learning_rate": 0.001, + "loss": 1.9088, + "step": 5715 + }, + { + "epoch": 0.24181402825958204, + "grad_norm": 0.26039499044418335, + "learning_rate": 0.001, + "loss": 2.0708, + "step": 5716 + }, + { + "epoch": 0.2418563330230984, + "grad_norm": 38.39719009399414, + "learning_rate": 0.001, + "loss": 1.9211, + "step": 5717 + }, + { + "epoch": 0.24189863778661477, + "grad_norm": 0.3820383548736572, + "learning_rate": 0.001, + "loss": 3.4881, + "step": 5718 + }, + { + "epoch": 0.24194094255013116, + "grad_norm": 0.3408060073852539, + "learning_rate": 0.001, + "loss": 2.9283, + "step": 5719 + }, + { + "epoch": 0.2419832473136475, + "grad_norm": 0.36963334679603577, + "learning_rate": 0.001, + "loss": 3.4062, + "step": 5720 + }, + { + "epoch": 0.2420255520771639, + "grad_norm": 1.3119021654129028, + "learning_rate": 0.001, + "loss": 3.7547, + "step": 5721 + }, + { + "epoch": 0.24206785684068027, + "grad_norm": 0.27805081009864807, + "learning_rate": 0.001, + "loss": 3.2918, + "step": 5722 + }, + { + "epoch": 0.24211016160419663, + "grad_norm": 0.3566412031650543, + "learning_rate": 0.001, + "loss": 2.1117, + "step": 5723 + }, + { + "epoch": 0.242152466367713, + "grad_norm": 0.3360730707645416, + "learning_rate": 0.001, + "loss": 2.1076, + "step": 5724 + }, + { + "epoch": 0.24219477113122936, + "grad_norm": 0.27928465604782104, + "learning_rate": 0.001, + "loss": 2.2339, + "step": 5725 + }, + { + "epoch": 0.24223707589474575, + "grad_norm": 2.314484119415283, + "learning_rate": 0.001, + "loss": 2.1504, + "step": 5726 + }, + { + "epoch": 0.24227938065826213, + "grad_norm": 0.2751530706882477, + "learning_rate": 0.001, + "loss": 2.5127, + "step": 5727 + }, + { + "epoch": 0.24232168542177848, + "grad_norm": 0.20994529128074646, + "learning_rate": 0.001, + "loss": 2.1564, + "step": 5728 + }, + { + "epoch": 0.24236399018529486, + "grad_norm": 0.246437668800354, + "learning_rate": 0.001, + "loss": 2.3628, + "step": 5729 + }, + { + "epoch": 0.24240629494881125, + "grad_norm": 2.577066659927368, + "learning_rate": 0.001, + "loss": 2.6037, + "step": 5730 + }, + { + "epoch": 0.2424485997123276, + "grad_norm": 0.2616536021232605, + "learning_rate": 0.001, + "loss": 2.1903, + "step": 5731 + }, + { + "epoch": 0.24249090447584398, + "grad_norm": 0.24390681087970734, + "learning_rate": 0.001, + "loss": 2.4864, + "step": 5732 + }, + { + "epoch": 0.24253320923936036, + "grad_norm": 0.18465809524059296, + "learning_rate": 0.001, + "loss": 1.9357, + "step": 5733 + }, + { + "epoch": 0.24257551400287672, + "grad_norm": 1.6933213472366333, + "learning_rate": 0.001, + "loss": 1.7803, + "step": 5734 + }, + { + "epoch": 0.2426178187663931, + "grad_norm": 0.25248533487319946, + "learning_rate": 0.001, + "loss": 2.9161, + "step": 5735 + }, + { + "epoch": 0.24266012352990945, + "grad_norm": 0.20103050768375397, + "learning_rate": 0.001, + "loss": 1.9586, + "step": 5736 + }, + { + "epoch": 0.24270242829342584, + "grad_norm": 0.8724969029426575, + "learning_rate": 0.001, + "loss": 1.979, + "step": 5737 + }, + { + "epoch": 0.24274473305694222, + "grad_norm": 0.23048318922519684, + "learning_rate": 0.001, + "loss": 2.1012, + "step": 5738 + }, + { + "epoch": 0.24278703782045857, + "grad_norm": 0.24367755651474, + "learning_rate": 0.001, + "loss": 2.41, + "step": 5739 + }, + { + "epoch": 0.24282934258397496, + "grad_norm": 0.4339103102684021, + "learning_rate": 0.001, + "loss": 2.6809, + "step": 5740 + }, + { + "epoch": 0.24287164734749134, + "grad_norm": 1.0650697946548462, + "learning_rate": 0.001, + "loss": 2.7325, + "step": 5741 + }, + { + "epoch": 0.2429139521110077, + "grad_norm": 2.662815809249878, + "learning_rate": 0.001, + "loss": 1.9265, + "step": 5742 + }, + { + "epoch": 0.24295625687452407, + "grad_norm": 0.1942417025566101, + "learning_rate": 0.001, + "loss": 1.7619, + "step": 5743 + }, + { + "epoch": 0.24299856163804046, + "grad_norm": 0.28779444098472595, + "learning_rate": 0.001, + "loss": 2.8304, + "step": 5744 + }, + { + "epoch": 0.2430408664015568, + "grad_norm": 0.28456562757492065, + "learning_rate": 0.001, + "loss": 3.3492, + "step": 5745 + }, + { + "epoch": 0.2430831711650732, + "grad_norm": 2.139453172683716, + "learning_rate": 0.001, + "loss": 3.8633, + "step": 5746 + }, + { + "epoch": 0.24312547592858955, + "grad_norm": 0.35128921270370483, + "learning_rate": 0.001, + "loss": 2.3143, + "step": 5747 + }, + { + "epoch": 0.24316778069210593, + "grad_norm": 0.3761100172996521, + "learning_rate": 0.001, + "loss": 2.5602, + "step": 5748 + }, + { + "epoch": 0.2432100854556223, + "grad_norm": 0.3312492370605469, + "learning_rate": 0.001, + "loss": 2.6397, + "step": 5749 + }, + { + "epoch": 0.24325239021913866, + "grad_norm": 14.835386276245117, + "learning_rate": 0.001, + "loss": 2.1131, + "step": 5750 + }, + { + "epoch": 0.24329469498265505, + "grad_norm": 0.33980563282966614, + "learning_rate": 0.001, + "loss": 3.0289, + "step": 5751 + }, + { + "epoch": 0.24333699974617143, + "grad_norm": 0.3430834412574768, + "learning_rate": 0.001, + "loss": 2.298, + "step": 5752 + }, + { + "epoch": 0.24337930450968778, + "grad_norm": 0.2788030207157135, + "learning_rate": 0.001, + "loss": 2.3714, + "step": 5753 + }, + { + "epoch": 0.24342160927320416, + "grad_norm": 0.242362380027771, + "learning_rate": 0.001, + "loss": 2.3746, + "step": 5754 + }, + { + "epoch": 0.24346391403672055, + "grad_norm": 0.4052402079105377, + "learning_rate": 0.001, + "loss": 3.5768, + "step": 5755 + }, + { + "epoch": 0.2435062188002369, + "grad_norm": 0.24132952094078064, + "learning_rate": 0.001, + "loss": 3.8857, + "step": 5756 + }, + { + "epoch": 0.24354852356375328, + "grad_norm": 1.1747184991836548, + "learning_rate": 0.001, + "loss": 2.5792, + "step": 5757 + }, + { + "epoch": 0.24359082832726964, + "grad_norm": 0.3181629776954651, + "learning_rate": 0.001, + "loss": 2.4274, + "step": 5758 + }, + { + "epoch": 0.24363313309078602, + "grad_norm": 0.22457624971866608, + "learning_rate": 0.001, + "loss": 2.6695, + "step": 5759 + }, + { + "epoch": 0.2436754378543024, + "grad_norm": 0.24323022365570068, + "learning_rate": 0.001, + "loss": 2.0115, + "step": 5760 + }, + { + "epoch": 0.24371774261781876, + "grad_norm": 0.2709481120109558, + "learning_rate": 0.001, + "loss": 2.3548, + "step": 5761 + }, + { + "epoch": 0.24376004738133514, + "grad_norm": 0.20824141800403595, + "learning_rate": 0.001, + "loss": 1.701, + "step": 5762 + }, + { + "epoch": 0.24380235214485152, + "grad_norm": 1.6470791101455688, + "learning_rate": 0.001, + "loss": 2.4078, + "step": 5763 + }, + { + "epoch": 0.24384465690836787, + "grad_norm": 0.2733412981033325, + "learning_rate": 0.001, + "loss": 2.0431, + "step": 5764 + }, + { + "epoch": 0.24388696167188426, + "grad_norm": 0.2784044146537781, + "learning_rate": 0.001, + "loss": 2.6605, + "step": 5765 + }, + { + "epoch": 0.24392926643540064, + "grad_norm": 0.661558210849762, + "learning_rate": 0.001, + "loss": 2.7723, + "step": 5766 + }, + { + "epoch": 0.243971571198917, + "grad_norm": 0.24639226496219635, + "learning_rate": 0.001, + "loss": 2.2698, + "step": 5767 + }, + { + "epoch": 0.24401387596243337, + "grad_norm": 0.2241000235080719, + "learning_rate": 0.001, + "loss": 2.6614, + "step": 5768 + }, + { + "epoch": 0.24405618072594973, + "grad_norm": 0.23688313364982605, + "learning_rate": 0.001, + "loss": 3.4856, + "step": 5769 + }, + { + "epoch": 0.2440984854894661, + "grad_norm": 0.22975680232048035, + "learning_rate": 0.001, + "loss": 2.7368, + "step": 5770 + }, + { + "epoch": 0.2441407902529825, + "grad_norm": 0.2196754515171051, + "learning_rate": 0.001, + "loss": 2.1829, + "step": 5771 + }, + { + "epoch": 0.24418309501649885, + "grad_norm": 0.21035362780094147, + "learning_rate": 0.001, + "loss": 3.0376, + "step": 5772 + }, + { + "epoch": 0.24422539978001523, + "grad_norm": 0.19896003603935242, + "learning_rate": 0.001, + "loss": 1.9874, + "step": 5773 + }, + { + "epoch": 0.2442677045435316, + "grad_norm": 0.24322247505187988, + "learning_rate": 0.001, + "loss": 3.4208, + "step": 5774 + }, + { + "epoch": 0.24431000930704797, + "grad_norm": 0.20282556116580963, + "learning_rate": 0.001, + "loss": 2.0628, + "step": 5775 + }, + { + "epoch": 0.24435231407056435, + "grad_norm": 0.19425134360790253, + "learning_rate": 0.001, + "loss": 2.9513, + "step": 5776 + }, + { + "epoch": 0.24439461883408073, + "grad_norm": 0.26466214656829834, + "learning_rate": 0.001, + "loss": 2.5617, + "step": 5777 + }, + { + "epoch": 0.24443692359759708, + "grad_norm": 0.2062908262014389, + "learning_rate": 0.001, + "loss": 2.3014, + "step": 5778 + }, + { + "epoch": 0.24447922836111347, + "grad_norm": 0.17950376868247986, + "learning_rate": 0.001, + "loss": 2.2402, + "step": 5779 + }, + { + "epoch": 0.24452153312462982, + "grad_norm": 0.17766666412353516, + "learning_rate": 0.001, + "loss": 1.9251, + "step": 5780 + }, + { + "epoch": 0.2445638378881462, + "grad_norm": 0.20533357560634613, + "learning_rate": 0.001, + "loss": 1.8011, + "step": 5781 + }, + { + "epoch": 0.24460614265166258, + "grad_norm": 0.21567344665527344, + "learning_rate": 0.001, + "loss": 2.074, + "step": 5782 + }, + { + "epoch": 0.24464844741517894, + "grad_norm": 0.17948715388774872, + "learning_rate": 0.001, + "loss": 2.2158, + "step": 5783 + }, + { + "epoch": 0.24469075217869532, + "grad_norm": 0.2008078396320343, + "learning_rate": 0.001, + "loss": 1.9838, + "step": 5784 + }, + { + "epoch": 0.2447330569422117, + "grad_norm": 0.6070753931999207, + "learning_rate": 0.001, + "loss": 2.0619, + "step": 5785 + }, + { + "epoch": 0.24477536170572806, + "grad_norm": 0.21907269954681396, + "learning_rate": 0.001, + "loss": 2.5838, + "step": 5786 + }, + { + "epoch": 0.24481766646924444, + "grad_norm": 0.46992072463035583, + "learning_rate": 0.001, + "loss": 3.1865, + "step": 5787 + }, + { + "epoch": 0.24485997123276082, + "grad_norm": 0.16599377989768982, + "learning_rate": 0.001, + "loss": 1.9173, + "step": 5788 + }, + { + "epoch": 0.24490227599627717, + "grad_norm": 0.1835167109966278, + "learning_rate": 0.001, + "loss": 2.0218, + "step": 5789 + }, + { + "epoch": 0.24494458075979356, + "grad_norm": 3.3056118488311768, + "learning_rate": 0.001, + "loss": 2.8621, + "step": 5790 + }, + { + "epoch": 0.24498688552330994, + "grad_norm": 0.4746275842189789, + "learning_rate": 0.001, + "loss": 2.476, + "step": 5791 + }, + { + "epoch": 0.2450291902868263, + "grad_norm": 0.48032623529434204, + "learning_rate": 0.001, + "loss": 2.3223, + "step": 5792 + }, + { + "epoch": 0.24507149505034267, + "grad_norm": 0.31363826990127563, + "learning_rate": 0.001, + "loss": 3.0721, + "step": 5793 + }, + { + "epoch": 0.24511379981385903, + "grad_norm": 0.1737576723098755, + "learning_rate": 0.001, + "loss": 2.0079, + "step": 5794 + }, + { + "epoch": 0.2451561045773754, + "grad_norm": 0.8084182143211365, + "learning_rate": 0.001, + "loss": 2.1507, + "step": 5795 + }, + { + "epoch": 0.2451984093408918, + "grad_norm": 0.1980573832988739, + "learning_rate": 0.001, + "loss": 2.65, + "step": 5796 + }, + { + "epoch": 0.24524071410440815, + "grad_norm": 0.32529035210609436, + "learning_rate": 0.001, + "loss": 2.2491, + "step": 5797 + }, + { + "epoch": 0.24528301886792453, + "grad_norm": 0.2029356062412262, + "learning_rate": 0.001, + "loss": 2.3983, + "step": 5798 + }, + { + "epoch": 0.2453253236314409, + "grad_norm": 0.1999257355928421, + "learning_rate": 0.001, + "loss": 2.4885, + "step": 5799 + }, + { + "epoch": 0.24536762839495727, + "grad_norm": 0.34168320894241333, + "learning_rate": 0.001, + "loss": 2.712, + "step": 5800 + }, + { + "epoch": 0.24540993315847365, + "grad_norm": 0.24313102662563324, + "learning_rate": 0.001, + "loss": 2.064, + "step": 5801 + }, + { + "epoch": 0.24545223792199003, + "grad_norm": 0.2077663093805313, + "learning_rate": 0.001, + "loss": 2.3874, + "step": 5802 + }, + { + "epoch": 0.24549454268550638, + "grad_norm": 0.21080611646175385, + "learning_rate": 0.001, + "loss": 4.459, + "step": 5803 + }, + { + "epoch": 0.24553684744902277, + "grad_norm": 0.22150756418704987, + "learning_rate": 0.001, + "loss": 2.0071, + "step": 5804 + }, + { + "epoch": 0.24557915221253912, + "grad_norm": 0.27595874667167664, + "learning_rate": 0.001, + "loss": 2.2498, + "step": 5805 + }, + { + "epoch": 0.2456214569760555, + "grad_norm": 38.239009857177734, + "learning_rate": 0.001, + "loss": 1.7931, + "step": 5806 + }, + { + "epoch": 0.24566376173957188, + "grad_norm": 0.24801005423069, + "learning_rate": 0.001, + "loss": 2.6051, + "step": 5807 + }, + { + "epoch": 0.24570606650308824, + "grad_norm": 0.26121652126312256, + "learning_rate": 0.001, + "loss": 2.9589, + "step": 5808 + }, + { + "epoch": 0.24574837126660462, + "grad_norm": 0.18741196393966675, + "learning_rate": 0.001, + "loss": 1.4772, + "step": 5809 + }, + { + "epoch": 0.245790676030121, + "grad_norm": 0.2526096701622009, + "learning_rate": 0.001, + "loss": 1.857, + "step": 5810 + }, + { + "epoch": 0.24583298079363736, + "grad_norm": 0.28198450803756714, + "learning_rate": 0.001, + "loss": 2.0914, + "step": 5811 + }, + { + "epoch": 0.24587528555715374, + "grad_norm": 0.22132883965969086, + "learning_rate": 0.001, + "loss": 2.4616, + "step": 5812 + }, + { + "epoch": 0.24591759032067012, + "grad_norm": 0.26563504338264465, + "learning_rate": 0.001, + "loss": 2.2087, + "step": 5813 + }, + { + "epoch": 0.24595989508418648, + "grad_norm": 0.35990941524505615, + "learning_rate": 0.001, + "loss": 2.3027, + "step": 5814 + }, + { + "epoch": 0.24600219984770286, + "grad_norm": 0.20027977228164673, + "learning_rate": 0.001, + "loss": 3.1369, + "step": 5815 + }, + { + "epoch": 0.2460445046112192, + "grad_norm": 0.9017431735992432, + "learning_rate": 0.001, + "loss": 2.6157, + "step": 5816 + }, + { + "epoch": 0.2460868093747356, + "grad_norm": 1.3316782712936401, + "learning_rate": 0.001, + "loss": 2.9245, + "step": 5817 + }, + { + "epoch": 0.24612911413825198, + "grad_norm": 0.24641363322734833, + "learning_rate": 0.001, + "loss": 2.1923, + "step": 5818 + }, + { + "epoch": 0.24617141890176833, + "grad_norm": 0.3345772325992584, + "learning_rate": 0.001, + "loss": 2.3912, + "step": 5819 + }, + { + "epoch": 0.2462137236652847, + "grad_norm": 0.23146621882915497, + "learning_rate": 0.001, + "loss": 2.2022, + "step": 5820 + }, + { + "epoch": 0.2462560284288011, + "grad_norm": 0.3035587966442108, + "learning_rate": 0.001, + "loss": 2.5982, + "step": 5821 + }, + { + "epoch": 0.24629833319231745, + "grad_norm": 0.4304903447628021, + "learning_rate": 0.001, + "loss": 2.566, + "step": 5822 + }, + { + "epoch": 0.24634063795583383, + "grad_norm": 0.21231508255004883, + "learning_rate": 0.001, + "loss": 2.6924, + "step": 5823 + }, + { + "epoch": 0.2463829427193502, + "grad_norm": 0.21332456171512604, + "learning_rate": 0.001, + "loss": 2.458, + "step": 5824 + }, + { + "epoch": 0.24642524748286657, + "grad_norm": 0.24201619625091553, + "learning_rate": 0.001, + "loss": 2.1323, + "step": 5825 + }, + { + "epoch": 0.24646755224638295, + "grad_norm": 0.20796683430671692, + "learning_rate": 0.001, + "loss": 2.2657, + "step": 5826 + }, + { + "epoch": 0.2465098570098993, + "grad_norm": 0.3387661278247833, + "learning_rate": 0.001, + "loss": 1.8694, + "step": 5827 + }, + { + "epoch": 0.24655216177341568, + "grad_norm": 0.5283912420272827, + "learning_rate": 0.001, + "loss": 2.0767, + "step": 5828 + }, + { + "epoch": 0.24659446653693207, + "grad_norm": 0.21991148591041565, + "learning_rate": 0.001, + "loss": 3.0532, + "step": 5829 + }, + { + "epoch": 0.24663677130044842, + "grad_norm": 0.2817951738834381, + "learning_rate": 0.001, + "loss": 2.8324, + "step": 5830 + }, + { + "epoch": 0.2466790760639648, + "grad_norm": 0.2231799066066742, + "learning_rate": 0.001, + "loss": 3.0957, + "step": 5831 + }, + { + "epoch": 0.24672138082748118, + "grad_norm": 0.23108860850334167, + "learning_rate": 0.001, + "loss": 2.1957, + "step": 5832 + }, + { + "epoch": 0.24676368559099754, + "grad_norm": 0.47794216871261597, + "learning_rate": 0.001, + "loss": 2.4318, + "step": 5833 + }, + { + "epoch": 0.24680599035451392, + "grad_norm": 0.3009633421897888, + "learning_rate": 0.001, + "loss": 2.801, + "step": 5834 + }, + { + "epoch": 0.2468482951180303, + "grad_norm": 0.1829899400472641, + "learning_rate": 0.001, + "loss": 1.762, + "step": 5835 + }, + { + "epoch": 0.24689059988154666, + "grad_norm": 0.25043395161628723, + "learning_rate": 0.001, + "loss": 2.103, + "step": 5836 + }, + { + "epoch": 0.24693290464506304, + "grad_norm": 1.0401873588562012, + "learning_rate": 0.001, + "loss": 2.5898, + "step": 5837 + }, + { + "epoch": 0.2469752094085794, + "grad_norm": 0.22603413462638855, + "learning_rate": 0.001, + "loss": 3.3657, + "step": 5838 + }, + { + "epoch": 0.24701751417209578, + "grad_norm": 0.25399866700172424, + "learning_rate": 0.001, + "loss": 2.7811, + "step": 5839 + }, + { + "epoch": 0.24705981893561216, + "grad_norm": 0.24066664278507233, + "learning_rate": 0.001, + "loss": 3.1812, + "step": 5840 + }, + { + "epoch": 0.2471021236991285, + "grad_norm": 0.27932751178741455, + "learning_rate": 0.001, + "loss": 2.5269, + "step": 5841 + }, + { + "epoch": 0.2471444284626449, + "grad_norm": 0.2165471911430359, + "learning_rate": 0.001, + "loss": 1.9364, + "step": 5842 + }, + { + "epoch": 0.24718673322616128, + "grad_norm": 0.1947283148765564, + "learning_rate": 0.001, + "loss": 1.9661, + "step": 5843 + }, + { + "epoch": 0.24722903798967763, + "grad_norm": 0.245658740401268, + "learning_rate": 0.001, + "loss": 2.2399, + "step": 5844 + }, + { + "epoch": 0.247271342753194, + "grad_norm": 0.21546167135238647, + "learning_rate": 0.001, + "loss": 2.4518, + "step": 5845 + }, + { + "epoch": 0.2473136475167104, + "grad_norm": 0.18316084146499634, + "learning_rate": 0.001, + "loss": 1.8123, + "step": 5846 + }, + { + "epoch": 0.24735595228022675, + "grad_norm": 2.9340388774871826, + "learning_rate": 0.001, + "loss": 2.8578, + "step": 5847 + }, + { + "epoch": 0.24739825704374313, + "grad_norm": 0.16483551263809204, + "learning_rate": 0.001, + "loss": 1.7181, + "step": 5848 + }, + { + "epoch": 0.24744056180725948, + "grad_norm": 0.18957403302192688, + "learning_rate": 0.001, + "loss": 2.1544, + "step": 5849 + }, + { + "epoch": 0.24748286657077587, + "grad_norm": 0.19052253663539886, + "learning_rate": 0.001, + "loss": 1.8171, + "step": 5850 + }, + { + "epoch": 0.24752517133429225, + "grad_norm": 0.19511641561985016, + "learning_rate": 0.001, + "loss": 1.7506, + "step": 5851 + }, + { + "epoch": 0.2475674760978086, + "grad_norm": 0.7128787040710449, + "learning_rate": 0.001, + "loss": 1.6148, + "step": 5852 + }, + { + "epoch": 0.24760978086132499, + "grad_norm": 0.1971040964126587, + "learning_rate": 0.001, + "loss": 2.5899, + "step": 5853 + }, + { + "epoch": 0.24765208562484137, + "grad_norm": 0.3237464129924774, + "learning_rate": 0.001, + "loss": 1.9698, + "step": 5854 + }, + { + "epoch": 0.24769439038835772, + "grad_norm": 0.20334038138389587, + "learning_rate": 0.001, + "loss": 1.8541, + "step": 5855 + }, + { + "epoch": 0.2477366951518741, + "grad_norm": 0.19235415756702423, + "learning_rate": 0.001, + "loss": 2.1102, + "step": 5856 + }, + { + "epoch": 0.24777899991539049, + "grad_norm": 0.18635153770446777, + "learning_rate": 0.001, + "loss": 1.7164, + "step": 5857 + }, + { + "epoch": 0.24782130467890684, + "grad_norm": 0.8393286466598511, + "learning_rate": 0.001, + "loss": 2.0415, + "step": 5858 + }, + { + "epoch": 0.24786360944242322, + "grad_norm": 0.20402997732162476, + "learning_rate": 0.001, + "loss": 2.5329, + "step": 5859 + }, + { + "epoch": 0.24790591420593958, + "grad_norm": 0.20956696569919586, + "learning_rate": 0.001, + "loss": 1.7601, + "step": 5860 + }, + { + "epoch": 0.24794821896945596, + "grad_norm": 0.3180994391441345, + "learning_rate": 0.001, + "loss": 2.319, + "step": 5861 + }, + { + "epoch": 0.24799052373297234, + "grad_norm": 0.4629594087600708, + "learning_rate": 0.001, + "loss": 2.763, + "step": 5862 + }, + { + "epoch": 0.2480328284964887, + "grad_norm": 0.2035917043685913, + "learning_rate": 0.001, + "loss": 2.7616, + "step": 5863 + }, + { + "epoch": 0.24807513326000508, + "grad_norm": 0.20831398665905, + "learning_rate": 0.001, + "loss": 3.2368, + "step": 5864 + }, + { + "epoch": 0.24811743802352146, + "grad_norm": 0.24462851881980896, + "learning_rate": 0.001, + "loss": 2.8819, + "step": 5865 + }, + { + "epoch": 0.2481597427870378, + "grad_norm": 0.72509765625, + "learning_rate": 0.001, + "loss": 1.4765, + "step": 5866 + }, + { + "epoch": 0.2482020475505542, + "grad_norm": 0.2004188746213913, + "learning_rate": 0.001, + "loss": 2.7114, + "step": 5867 + }, + { + "epoch": 0.24824435231407058, + "grad_norm": 0.16426634788513184, + "learning_rate": 0.001, + "loss": 2.2513, + "step": 5868 + }, + { + "epoch": 0.24828665707758693, + "grad_norm": 0.23938223719596863, + "learning_rate": 0.001, + "loss": 2.5919, + "step": 5869 + }, + { + "epoch": 0.2483289618411033, + "grad_norm": 0.20206694304943085, + "learning_rate": 0.001, + "loss": 1.8613, + "step": 5870 + }, + { + "epoch": 0.24837126660461967, + "grad_norm": 0.20341122150421143, + "learning_rate": 0.001, + "loss": 1.6956, + "step": 5871 + }, + { + "epoch": 0.24841357136813605, + "grad_norm": 0.16674411296844482, + "learning_rate": 0.001, + "loss": 2.3249, + "step": 5872 + }, + { + "epoch": 0.24845587613165243, + "grad_norm": 0.17640060186386108, + "learning_rate": 0.001, + "loss": 1.5507, + "step": 5873 + }, + { + "epoch": 0.24849818089516879, + "grad_norm": 0.2345527559518814, + "learning_rate": 0.001, + "loss": 2.1983, + "step": 5874 + }, + { + "epoch": 0.24854048565868517, + "grad_norm": 2.3376219272613525, + "learning_rate": 0.001, + "loss": 2.1126, + "step": 5875 + }, + { + "epoch": 0.24858279042220155, + "grad_norm": 0.24027837812900543, + "learning_rate": 0.001, + "loss": 1.968, + "step": 5876 + }, + { + "epoch": 0.2486250951857179, + "grad_norm": 0.22010159492492676, + "learning_rate": 0.001, + "loss": 1.8924, + "step": 5877 + }, + { + "epoch": 0.24866739994923429, + "grad_norm": 0.1861301064491272, + "learning_rate": 0.001, + "loss": 2.1468, + "step": 5878 + }, + { + "epoch": 0.24870970471275067, + "grad_norm": 0.2004723697900772, + "learning_rate": 0.001, + "loss": 2.5531, + "step": 5879 + }, + { + "epoch": 0.24875200947626702, + "grad_norm": 0.33933591842651367, + "learning_rate": 0.001, + "loss": 2.7241, + "step": 5880 + }, + { + "epoch": 0.2487943142397834, + "grad_norm": 0.26953572034835815, + "learning_rate": 0.001, + "loss": 2.3623, + "step": 5881 + }, + { + "epoch": 0.24883661900329976, + "grad_norm": 0.1680213063955307, + "learning_rate": 0.001, + "loss": 2.5205, + "step": 5882 + }, + { + "epoch": 0.24887892376681614, + "grad_norm": 0.17806370556354523, + "learning_rate": 0.001, + "loss": 2.0251, + "step": 5883 + }, + { + "epoch": 0.24892122853033252, + "grad_norm": 0.44884923100471497, + "learning_rate": 0.001, + "loss": 1.8082, + "step": 5884 + }, + { + "epoch": 0.24896353329384888, + "grad_norm": 0.28582075238227844, + "learning_rate": 0.001, + "loss": 3.7704, + "step": 5885 + }, + { + "epoch": 0.24900583805736526, + "grad_norm": 0.21283996105194092, + "learning_rate": 0.001, + "loss": 1.7666, + "step": 5886 + }, + { + "epoch": 0.24904814282088164, + "grad_norm": 0.22247281670570374, + "learning_rate": 0.001, + "loss": 2.6567, + "step": 5887 + }, + { + "epoch": 0.249090447584398, + "grad_norm": 0.2375125139951706, + "learning_rate": 0.001, + "loss": 2.3062, + "step": 5888 + }, + { + "epoch": 0.24913275234791438, + "grad_norm": 0.3201584219932556, + "learning_rate": 0.001, + "loss": 3.1853, + "step": 5889 + }, + { + "epoch": 0.24917505711143076, + "grad_norm": 0.21060162782669067, + "learning_rate": 0.001, + "loss": 2.4099, + "step": 5890 + }, + { + "epoch": 0.2492173618749471, + "grad_norm": 0.2675623595714569, + "learning_rate": 0.001, + "loss": 2.5034, + "step": 5891 + }, + { + "epoch": 0.2492596666384635, + "grad_norm": 3.282320022583008, + "learning_rate": 0.001, + "loss": 1.659, + "step": 5892 + }, + { + "epoch": 0.24930197140197985, + "grad_norm": 0.28002822399139404, + "learning_rate": 0.001, + "loss": 2.2949, + "step": 5893 + }, + { + "epoch": 0.24934427616549623, + "grad_norm": 0.17490912973880768, + "learning_rate": 0.001, + "loss": 2.7314, + "step": 5894 + }, + { + "epoch": 0.2493865809290126, + "grad_norm": 0.5958810448646545, + "learning_rate": 0.001, + "loss": 2.6676, + "step": 5895 + }, + { + "epoch": 0.24942888569252897, + "grad_norm": 0.18583674728870392, + "learning_rate": 0.001, + "loss": 2.1634, + "step": 5896 + }, + { + "epoch": 0.24947119045604535, + "grad_norm": 0.2628713548183441, + "learning_rate": 0.001, + "loss": 2.3261, + "step": 5897 + }, + { + "epoch": 0.24951349521956173, + "grad_norm": 0.1902725249528885, + "learning_rate": 0.001, + "loss": 2.037, + "step": 5898 + }, + { + "epoch": 0.24955579998307809, + "grad_norm": 0.8447865843772888, + "learning_rate": 0.001, + "loss": 1.9191, + "step": 5899 + }, + { + "epoch": 0.24959810474659447, + "grad_norm": 0.25084027647972107, + "learning_rate": 0.001, + "loss": 2.6923, + "step": 5900 + }, + { + "epoch": 0.24964040951011085, + "grad_norm": 0.2469928115606308, + "learning_rate": 0.001, + "loss": 2.6025, + "step": 5901 + }, + { + "epoch": 0.2496827142736272, + "grad_norm": 0.18557626008987427, + "learning_rate": 0.001, + "loss": 1.8872, + "step": 5902 + }, + { + "epoch": 0.2497250190371436, + "grad_norm": 0.22026236355304718, + "learning_rate": 0.001, + "loss": 2.0827, + "step": 5903 + }, + { + "epoch": 0.24976732380065994, + "grad_norm": 0.21095015108585358, + "learning_rate": 0.001, + "loss": 2.2376, + "step": 5904 + }, + { + "epoch": 0.24980962856417632, + "grad_norm": 0.18303923308849335, + "learning_rate": 0.001, + "loss": 3.1426, + "step": 5905 + }, + { + "epoch": 0.2498519333276927, + "grad_norm": 0.9932180643081665, + "learning_rate": 0.001, + "loss": 2.3002, + "step": 5906 + }, + { + "epoch": 0.24989423809120906, + "grad_norm": 0.2075134962797165, + "learning_rate": 0.001, + "loss": 2.2085, + "step": 5907 + }, + { + "epoch": 0.24993654285472544, + "grad_norm": 0.1895306408405304, + "learning_rate": 0.001, + "loss": 1.8905, + "step": 5908 + }, + { + "epoch": 0.24997884761824182, + "grad_norm": 0.2975296378135681, + "learning_rate": 0.001, + "loss": 3.0346, + "step": 5909 + }, + { + "epoch": 0.2500211523817582, + "grad_norm": 0.19449001550674438, + "learning_rate": 0.001, + "loss": 2.3004, + "step": 5910 + }, + { + "epoch": 0.25006345714527456, + "grad_norm": 0.24991470575332642, + "learning_rate": 0.001, + "loss": 3.1309, + "step": 5911 + }, + { + "epoch": 0.2501057619087909, + "grad_norm": 0.35687020421028137, + "learning_rate": 0.001, + "loss": 2.2765, + "step": 5912 + }, + { + "epoch": 0.2501480666723073, + "grad_norm": 0.17699697613716125, + "learning_rate": 0.001, + "loss": 2.4377, + "step": 5913 + }, + { + "epoch": 0.2501903714358237, + "grad_norm": 0.2586126923561096, + "learning_rate": 0.001, + "loss": 3.4109, + "step": 5914 + }, + { + "epoch": 0.25023267619934003, + "grad_norm": 0.21270103752613068, + "learning_rate": 0.001, + "loss": 1.8928, + "step": 5915 + }, + { + "epoch": 0.25027498096285644, + "grad_norm": 0.225282222032547, + "learning_rate": 0.001, + "loss": 2.3448, + "step": 5916 + }, + { + "epoch": 0.2503172857263728, + "grad_norm": 0.18407504260540009, + "learning_rate": 0.001, + "loss": 2.0276, + "step": 5917 + }, + { + "epoch": 0.25035959048988915, + "grad_norm": 0.22186896204948425, + "learning_rate": 0.001, + "loss": 2.0891, + "step": 5918 + }, + { + "epoch": 0.25040189525340556, + "grad_norm": 2.6034348011016846, + "learning_rate": 0.001, + "loss": 2.0726, + "step": 5919 + }, + { + "epoch": 0.2504442000169219, + "grad_norm": 0.20303837954998016, + "learning_rate": 0.001, + "loss": 2.3935, + "step": 5920 + }, + { + "epoch": 0.25048650478043827, + "grad_norm": 0.18995042145252228, + "learning_rate": 0.001, + "loss": 2.0791, + "step": 5921 + }, + { + "epoch": 0.2505288095439546, + "grad_norm": 0.17359282076358795, + "learning_rate": 0.001, + "loss": 2.0846, + "step": 5922 + }, + { + "epoch": 0.25057111430747103, + "grad_norm": 0.2554149031639099, + "learning_rate": 0.001, + "loss": 1.8015, + "step": 5923 + }, + { + "epoch": 0.2506134190709874, + "grad_norm": 0.4858115315437317, + "learning_rate": 0.001, + "loss": 2.3795, + "step": 5924 + }, + { + "epoch": 0.25065572383450374, + "grad_norm": 0.3626372516155243, + "learning_rate": 0.001, + "loss": 2.0548, + "step": 5925 + }, + { + "epoch": 0.25069802859802015, + "grad_norm": 0.17694664001464844, + "learning_rate": 0.001, + "loss": 1.7527, + "step": 5926 + }, + { + "epoch": 0.2507403333615365, + "grad_norm": 0.5203558206558228, + "learning_rate": 0.001, + "loss": 3.2289, + "step": 5927 + }, + { + "epoch": 0.25078263812505286, + "grad_norm": 0.1976146101951599, + "learning_rate": 0.001, + "loss": 2.4577, + "step": 5928 + }, + { + "epoch": 0.25082494288856927, + "grad_norm": 0.2331826388835907, + "learning_rate": 0.001, + "loss": 3.3164, + "step": 5929 + }, + { + "epoch": 0.2508672476520856, + "grad_norm": 0.4622613787651062, + "learning_rate": 0.001, + "loss": 1.578, + "step": 5930 + }, + { + "epoch": 0.250909552415602, + "grad_norm": 0.20590519905090332, + "learning_rate": 0.001, + "loss": 2.2907, + "step": 5931 + }, + { + "epoch": 0.2509518571791184, + "grad_norm": 0.20743978023529053, + "learning_rate": 0.001, + "loss": 2.0356, + "step": 5932 + }, + { + "epoch": 0.25099416194263474, + "grad_norm": 4.0422892570495605, + "learning_rate": 0.001, + "loss": 2.584, + "step": 5933 + }, + { + "epoch": 0.2510364667061511, + "grad_norm": 0.4900633990764618, + "learning_rate": 0.001, + "loss": 3.1356, + "step": 5934 + }, + { + "epoch": 0.2510787714696675, + "grad_norm": 0.19140180945396423, + "learning_rate": 0.001, + "loss": 2.1572, + "step": 5935 + }, + { + "epoch": 0.25112107623318386, + "grad_norm": 0.2231162041425705, + "learning_rate": 0.001, + "loss": 2.8386, + "step": 5936 + }, + { + "epoch": 0.2511633809967002, + "grad_norm": 1.3613148927688599, + "learning_rate": 0.001, + "loss": 2.2125, + "step": 5937 + }, + { + "epoch": 0.2512056857602166, + "grad_norm": 0.23480670154094696, + "learning_rate": 0.001, + "loss": 2.3708, + "step": 5938 + }, + { + "epoch": 0.251247990523733, + "grad_norm": 0.5639317035675049, + "learning_rate": 0.001, + "loss": 2.2903, + "step": 5939 + }, + { + "epoch": 0.25129029528724933, + "grad_norm": 1.7350952625274658, + "learning_rate": 0.001, + "loss": 2.4496, + "step": 5940 + }, + { + "epoch": 0.25133260005076574, + "grad_norm": 0.1928754597902298, + "learning_rate": 0.001, + "loss": 2.6528, + "step": 5941 + }, + { + "epoch": 0.2513749048142821, + "grad_norm": 0.27500829100608826, + "learning_rate": 0.001, + "loss": 2.7894, + "step": 5942 + }, + { + "epoch": 0.25141720957779845, + "grad_norm": 0.2504851520061493, + "learning_rate": 0.001, + "loss": 3.2059, + "step": 5943 + }, + { + "epoch": 0.2514595143413148, + "grad_norm": 0.45656251907348633, + "learning_rate": 0.001, + "loss": 2.6401, + "step": 5944 + }, + { + "epoch": 0.2515018191048312, + "grad_norm": 0.2430989146232605, + "learning_rate": 0.001, + "loss": 2.3806, + "step": 5945 + }, + { + "epoch": 0.25154412386834757, + "grad_norm": 0.44327855110168457, + "learning_rate": 0.001, + "loss": 1.9777, + "step": 5946 + }, + { + "epoch": 0.2515864286318639, + "grad_norm": 0.2197687029838562, + "learning_rate": 0.001, + "loss": 2.3825, + "step": 5947 + }, + { + "epoch": 0.25162873339538033, + "grad_norm": 0.24564971029758453, + "learning_rate": 0.001, + "loss": 2.022, + "step": 5948 + }, + { + "epoch": 0.2516710381588967, + "grad_norm": 0.41810470819473267, + "learning_rate": 0.001, + "loss": 2.4189, + "step": 5949 + }, + { + "epoch": 0.25171334292241304, + "grad_norm": 0.2302566021680832, + "learning_rate": 0.001, + "loss": 2.1172, + "step": 5950 + }, + { + "epoch": 0.25175564768592945, + "grad_norm": 0.5698367357254028, + "learning_rate": 0.001, + "loss": 1.8955, + "step": 5951 + }, + { + "epoch": 0.2517979524494458, + "grad_norm": 0.20264704525470734, + "learning_rate": 0.001, + "loss": 2.55, + "step": 5952 + }, + { + "epoch": 0.25184025721296216, + "grad_norm": 0.19961802661418915, + "learning_rate": 0.001, + "loss": 2.1735, + "step": 5953 + }, + { + "epoch": 0.25188256197647857, + "grad_norm": 0.24171172082424164, + "learning_rate": 0.001, + "loss": 2.9635, + "step": 5954 + }, + { + "epoch": 0.2519248667399949, + "grad_norm": 0.19584928452968597, + "learning_rate": 0.001, + "loss": 2.0816, + "step": 5955 + }, + { + "epoch": 0.2519671715035113, + "grad_norm": 0.7652029991149902, + "learning_rate": 0.001, + "loss": 2.8902, + "step": 5956 + }, + { + "epoch": 0.2520094762670277, + "grad_norm": 0.22400104999542236, + "learning_rate": 0.001, + "loss": 2.0901, + "step": 5957 + }, + { + "epoch": 0.25205178103054404, + "grad_norm": 0.21903732419013977, + "learning_rate": 0.001, + "loss": 2.0618, + "step": 5958 + }, + { + "epoch": 0.2520940857940604, + "grad_norm": 0.24086321890354156, + "learning_rate": 0.001, + "loss": 2.9158, + "step": 5959 + }, + { + "epoch": 0.2521363905575768, + "grad_norm": 2.2688310146331787, + "learning_rate": 0.001, + "loss": 2.2415, + "step": 5960 + }, + { + "epoch": 0.25217869532109316, + "grad_norm": 0.2769038677215576, + "learning_rate": 0.001, + "loss": 2.5179, + "step": 5961 + }, + { + "epoch": 0.2522210000846095, + "grad_norm": 0.19732657074928284, + "learning_rate": 0.001, + "loss": 1.5416, + "step": 5962 + }, + { + "epoch": 0.2522633048481259, + "grad_norm": 0.21410557627677917, + "learning_rate": 0.001, + "loss": 1.9668, + "step": 5963 + }, + { + "epoch": 0.2523056096116423, + "grad_norm": 0.20264627039432526, + "learning_rate": 0.001, + "loss": 1.8539, + "step": 5964 + }, + { + "epoch": 0.25234791437515863, + "grad_norm": 0.9442425966262817, + "learning_rate": 0.001, + "loss": 2.2133, + "step": 5965 + }, + { + "epoch": 0.252390219138675, + "grad_norm": 0.17984417080879211, + "learning_rate": 0.001, + "loss": 1.9855, + "step": 5966 + }, + { + "epoch": 0.2524325239021914, + "grad_norm": 0.3779272735118866, + "learning_rate": 0.001, + "loss": 3.2348, + "step": 5967 + }, + { + "epoch": 0.25247482866570775, + "grad_norm": 0.23306319117546082, + "learning_rate": 0.001, + "loss": 2.9164, + "step": 5968 + }, + { + "epoch": 0.2525171334292241, + "grad_norm": 0.21737343072891235, + "learning_rate": 0.001, + "loss": 2.4458, + "step": 5969 + }, + { + "epoch": 0.2525594381927405, + "grad_norm": 0.38391655683517456, + "learning_rate": 0.001, + "loss": 1.901, + "step": 5970 + }, + { + "epoch": 0.25260174295625687, + "grad_norm": 0.21682848036289215, + "learning_rate": 0.001, + "loss": 2.745, + "step": 5971 + }, + { + "epoch": 0.2526440477197732, + "grad_norm": 0.1871444135904312, + "learning_rate": 0.001, + "loss": 2.6208, + "step": 5972 + }, + { + "epoch": 0.25268635248328963, + "grad_norm": 0.23921331763267517, + "learning_rate": 0.001, + "loss": 2.7021, + "step": 5973 + }, + { + "epoch": 0.252728657246806, + "grad_norm": 0.17137141525745392, + "learning_rate": 0.001, + "loss": 2.0486, + "step": 5974 + }, + { + "epoch": 0.25277096201032234, + "grad_norm": 0.5864259600639343, + "learning_rate": 0.001, + "loss": 1.7707, + "step": 5975 + }, + { + "epoch": 0.25281326677383875, + "grad_norm": 0.18779319524765015, + "learning_rate": 0.001, + "loss": 1.8549, + "step": 5976 + }, + { + "epoch": 0.2528555715373551, + "grad_norm": 0.2662065625190735, + "learning_rate": 0.001, + "loss": 2.9272, + "step": 5977 + }, + { + "epoch": 0.25289787630087146, + "grad_norm": 0.23567529022693634, + "learning_rate": 0.001, + "loss": 2.6869, + "step": 5978 + }, + { + "epoch": 0.25294018106438787, + "grad_norm": 0.1899128556251526, + "learning_rate": 0.001, + "loss": 1.992, + "step": 5979 + }, + { + "epoch": 0.2529824858279042, + "grad_norm": 0.23404709994792938, + "learning_rate": 0.001, + "loss": 2.3842, + "step": 5980 + }, + { + "epoch": 0.2530247905914206, + "grad_norm": 0.5312284827232361, + "learning_rate": 0.001, + "loss": 2.2135, + "step": 5981 + }, + { + "epoch": 0.253067095354937, + "grad_norm": 0.2125319540500641, + "learning_rate": 0.001, + "loss": 2.3937, + "step": 5982 + }, + { + "epoch": 0.25310940011845334, + "grad_norm": 0.20524631440639496, + "learning_rate": 0.001, + "loss": 2.5639, + "step": 5983 + }, + { + "epoch": 0.2531517048819697, + "grad_norm": 0.20020252466201782, + "learning_rate": 0.001, + "loss": 2.6334, + "step": 5984 + }, + { + "epoch": 0.2531940096454861, + "grad_norm": 0.2192360907793045, + "learning_rate": 0.001, + "loss": 2.2578, + "step": 5985 + }, + { + "epoch": 0.25323631440900246, + "grad_norm": 0.16975408792495728, + "learning_rate": 0.001, + "loss": 1.8028, + "step": 5986 + }, + { + "epoch": 0.2532786191725188, + "grad_norm": 1.4105056524276733, + "learning_rate": 0.001, + "loss": 3.1986, + "step": 5987 + }, + { + "epoch": 0.2533209239360352, + "grad_norm": 1.0389015674591064, + "learning_rate": 0.001, + "loss": 2.7766, + "step": 5988 + }, + { + "epoch": 0.2533632286995516, + "grad_norm": 1.0410923957824707, + "learning_rate": 0.001, + "loss": 2.4953, + "step": 5989 + }, + { + "epoch": 0.25340553346306793, + "grad_norm": 0.24617451429367065, + "learning_rate": 0.001, + "loss": 1.7532, + "step": 5990 + }, + { + "epoch": 0.2534478382265843, + "grad_norm": 0.22218583524227142, + "learning_rate": 0.001, + "loss": 1.8373, + "step": 5991 + }, + { + "epoch": 0.2534901429901007, + "grad_norm": 0.3823182284832001, + "learning_rate": 0.001, + "loss": 1.8441, + "step": 5992 + }, + { + "epoch": 0.25353244775361705, + "grad_norm": 0.2810841202735901, + "learning_rate": 0.001, + "loss": 1.8284, + "step": 5993 + }, + { + "epoch": 0.2535747525171334, + "grad_norm": 0.21739636361598969, + "learning_rate": 0.001, + "loss": 2.66, + "step": 5994 + }, + { + "epoch": 0.2536170572806498, + "grad_norm": 0.8973679542541504, + "learning_rate": 0.001, + "loss": 2.4392, + "step": 5995 + }, + { + "epoch": 0.25365936204416617, + "grad_norm": 0.24056527018547058, + "learning_rate": 0.001, + "loss": 2.4884, + "step": 5996 + }, + { + "epoch": 0.2537016668076825, + "grad_norm": 0.7096660733222961, + "learning_rate": 0.001, + "loss": 2.4973, + "step": 5997 + }, + { + "epoch": 0.25374397157119893, + "grad_norm": 3.935514450073242, + "learning_rate": 0.001, + "loss": 2.5481, + "step": 5998 + }, + { + "epoch": 0.2537862763347153, + "grad_norm": 0.22361649572849274, + "learning_rate": 0.001, + "loss": 1.8946, + "step": 5999 + }, + { + "epoch": 0.25382858109823164, + "grad_norm": 1.974503755569458, + "learning_rate": 0.001, + "loss": 2.4045, + "step": 6000 + }, + { + "epoch": 0.25387088586174805, + "grad_norm": 0.44096559286117554, + "learning_rate": 0.001, + "loss": 2.4101, + "step": 6001 + }, + { + "epoch": 0.2539131906252644, + "grad_norm": 0.4442439079284668, + "learning_rate": 0.001, + "loss": 2.372, + "step": 6002 + }, + { + "epoch": 0.25395549538878076, + "grad_norm": 0.24630646407604218, + "learning_rate": 0.001, + "loss": 2.3218, + "step": 6003 + }, + { + "epoch": 0.25399780015229717, + "grad_norm": 0.32622864842414856, + "learning_rate": 0.001, + "loss": 3.0714, + "step": 6004 + }, + { + "epoch": 0.2540401049158135, + "grad_norm": 0.2776721119880676, + "learning_rate": 0.001, + "loss": 2.2819, + "step": 6005 + }, + { + "epoch": 0.2540824096793299, + "grad_norm": 0.22410140931606293, + "learning_rate": 0.001, + "loss": 2.2823, + "step": 6006 + }, + { + "epoch": 0.2541247144428463, + "grad_norm": 0.7463712096214294, + "learning_rate": 0.001, + "loss": 2.9199, + "step": 6007 + }, + { + "epoch": 0.25416701920636264, + "grad_norm": 0.3214509189128876, + "learning_rate": 0.001, + "loss": 3.1198, + "step": 6008 + }, + { + "epoch": 0.254209323969879, + "grad_norm": 0.2701822817325592, + "learning_rate": 0.001, + "loss": 1.9467, + "step": 6009 + }, + { + "epoch": 0.2542516287333954, + "grad_norm": 0.28340399265289307, + "learning_rate": 0.001, + "loss": 1.888, + "step": 6010 + }, + { + "epoch": 0.25429393349691176, + "grad_norm": 0.3803146779537201, + "learning_rate": 0.001, + "loss": 3.7349, + "step": 6011 + }, + { + "epoch": 0.2543362382604281, + "grad_norm": 0.2079993635416031, + "learning_rate": 0.001, + "loss": 2.412, + "step": 6012 + }, + { + "epoch": 0.25437854302394447, + "grad_norm": 0.2278539538383484, + "learning_rate": 0.001, + "loss": 2.1716, + "step": 6013 + }, + { + "epoch": 0.2544208477874609, + "grad_norm": 0.23987050354480743, + "learning_rate": 0.001, + "loss": 2.7047, + "step": 6014 + }, + { + "epoch": 0.25446315255097723, + "grad_norm": 0.32305946946144104, + "learning_rate": 0.001, + "loss": 2.1134, + "step": 6015 + }, + { + "epoch": 0.2545054573144936, + "grad_norm": 0.3197750449180603, + "learning_rate": 0.001, + "loss": 2.5648, + "step": 6016 + }, + { + "epoch": 0.25454776207801, + "grad_norm": 0.2103656828403473, + "learning_rate": 0.001, + "loss": 2.1189, + "step": 6017 + }, + { + "epoch": 0.25459006684152635, + "grad_norm": 0.6218228936195374, + "learning_rate": 0.001, + "loss": 1.9693, + "step": 6018 + }, + { + "epoch": 0.2546323716050427, + "grad_norm": 0.6921700835227966, + "learning_rate": 0.001, + "loss": 2.4787, + "step": 6019 + }, + { + "epoch": 0.2546746763685591, + "grad_norm": 0.20332172513008118, + "learning_rate": 0.001, + "loss": 1.9037, + "step": 6020 + }, + { + "epoch": 0.25471698113207547, + "grad_norm": 0.5180513262748718, + "learning_rate": 0.001, + "loss": 2.5619, + "step": 6021 + }, + { + "epoch": 0.2547592858955918, + "grad_norm": 0.199415922164917, + "learning_rate": 0.001, + "loss": 2.1282, + "step": 6022 + }, + { + "epoch": 0.25480159065910823, + "grad_norm": 0.20138099789619446, + "learning_rate": 0.001, + "loss": 1.8072, + "step": 6023 + }, + { + "epoch": 0.2548438954226246, + "grad_norm": 0.24892349541187286, + "learning_rate": 0.001, + "loss": 2.2655, + "step": 6024 + }, + { + "epoch": 0.25488620018614094, + "grad_norm": 0.4697567820549011, + "learning_rate": 0.001, + "loss": 2.9607, + "step": 6025 + }, + { + "epoch": 0.25492850494965735, + "grad_norm": 0.21022146940231323, + "learning_rate": 0.001, + "loss": 2.0065, + "step": 6026 + }, + { + "epoch": 0.2549708097131737, + "grad_norm": 2.630333423614502, + "learning_rate": 0.001, + "loss": 1.862, + "step": 6027 + }, + { + "epoch": 0.25501311447669006, + "grad_norm": 0.2536892890930176, + "learning_rate": 0.001, + "loss": 2.4433, + "step": 6028 + }, + { + "epoch": 0.25505541924020647, + "grad_norm": 0.212021142244339, + "learning_rate": 0.001, + "loss": 2.0887, + "step": 6029 + }, + { + "epoch": 0.2550977240037228, + "grad_norm": 0.2172902524471283, + "learning_rate": 0.001, + "loss": 1.958, + "step": 6030 + }, + { + "epoch": 0.2551400287672392, + "grad_norm": 0.22086498141288757, + "learning_rate": 0.001, + "loss": 2.5838, + "step": 6031 + }, + { + "epoch": 0.2551823335307556, + "grad_norm": 0.2510300576686859, + "learning_rate": 0.001, + "loss": 1.6274, + "step": 6032 + }, + { + "epoch": 0.25522463829427194, + "grad_norm": 1.1860932111740112, + "learning_rate": 0.001, + "loss": 3.0628, + "step": 6033 + }, + { + "epoch": 0.2552669430577883, + "grad_norm": 7.115164756774902, + "learning_rate": 0.001, + "loss": 3.3898, + "step": 6034 + }, + { + "epoch": 0.25530924782130465, + "grad_norm": 0.2606843411922455, + "learning_rate": 0.001, + "loss": 2.2712, + "step": 6035 + }, + { + "epoch": 0.25535155258482106, + "grad_norm": 0.25481027364730835, + "learning_rate": 0.001, + "loss": 2.5696, + "step": 6036 + }, + { + "epoch": 0.2553938573483374, + "grad_norm": 0.19426991045475006, + "learning_rate": 0.001, + "loss": 1.8278, + "step": 6037 + }, + { + "epoch": 0.25543616211185377, + "grad_norm": 0.23399263620376587, + "learning_rate": 0.001, + "loss": 2.0001, + "step": 6038 + }, + { + "epoch": 0.2554784668753702, + "grad_norm": 0.2326452136039734, + "learning_rate": 0.001, + "loss": 1.9532, + "step": 6039 + }, + { + "epoch": 0.25552077163888653, + "grad_norm": 0.5606713891029358, + "learning_rate": 0.001, + "loss": 2.2031, + "step": 6040 + }, + { + "epoch": 0.2555630764024029, + "grad_norm": 1.2523069381713867, + "learning_rate": 0.001, + "loss": 1.9169, + "step": 6041 + }, + { + "epoch": 0.2556053811659193, + "grad_norm": 0.21582403779029846, + "learning_rate": 0.001, + "loss": 2.3978, + "step": 6042 + }, + { + "epoch": 0.25564768592943565, + "grad_norm": 0.3164418339729309, + "learning_rate": 0.001, + "loss": 2.7011, + "step": 6043 + }, + { + "epoch": 0.255689990692952, + "grad_norm": 1.014667272567749, + "learning_rate": 0.001, + "loss": 2.3931, + "step": 6044 + }, + { + "epoch": 0.2557322954564684, + "grad_norm": 0.2742740213871002, + "learning_rate": 0.001, + "loss": 2.705, + "step": 6045 + }, + { + "epoch": 0.25577460021998477, + "grad_norm": 0.3064813017845154, + "learning_rate": 0.001, + "loss": 2.5047, + "step": 6046 + }, + { + "epoch": 0.2558169049835011, + "grad_norm": 0.2514945864677429, + "learning_rate": 0.001, + "loss": 1.743, + "step": 6047 + }, + { + "epoch": 0.25585920974701754, + "grad_norm": 0.48261088132858276, + "learning_rate": 0.001, + "loss": 2.7783, + "step": 6048 + }, + { + "epoch": 0.2559015145105339, + "grad_norm": 0.2113606333732605, + "learning_rate": 0.001, + "loss": 2.6006, + "step": 6049 + }, + { + "epoch": 0.25594381927405024, + "grad_norm": 0.21159514784812927, + "learning_rate": 0.001, + "loss": 1.9746, + "step": 6050 + }, + { + "epoch": 0.25598612403756665, + "grad_norm": 0.30281734466552734, + "learning_rate": 0.001, + "loss": 2.6729, + "step": 6051 + }, + { + "epoch": 0.256028428801083, + "grad_norm": 0.19452209770679474, + "learning_rate": 0.001, + "loss": 1.9967, + "step": 6052 + }, + { + "epoch": 0.25607073356459936, + "grad_norm": 0.19158554077148438, + "learning_rate": 0.001, + "loss": 2.0234, + "step": 6053 + }, + { + "epoch": 0.25611303832811577, + "grad_norm": 0.3292371332645416, + "learning_rate": 0.001, + "loss": 1.8973, + "step": 6054 + }, + { + "epoch": 0.2561553430916321, + "grad_norm": 0.1782860904932022, + "learning_rate": 0.001, + "loss": 1.786, + "step": 6055 + }, + { + "epoch": 0.2561976478551485, + "grad_norm": 0.2003399133682251, + "learning_rate": 0.001, + "loss": 2.1325, + "step": 6056 + }, + { + "epoch": 0.25623995261866483, + "grad_norm": 3.0092291831970215, + "learning_rate": 0.001, + "loss": 2.6524, + "step": 6057 + }, + { + "epoch": 0.25628225738218124, + "grad_norm": 0.21355977654457092, + "learning_rate": 0.001, + "loss": 2.2556, + "step": 6058 + }, + { + "epoch": 0.2563245621456976, + "grad_norm": 0.2934946119785309, + "learning_rate": 0.001, + "loss": 2.8166, + "step": 6059 + }, + { + "epoch": 0.25636686690921395, + "grad_norm": 0.1829323172569275, + "learning_rate": 0.001, + "loss": 1.8922, + "step": 6060 + }, + { + "epoch": 0.25640917167273036, + "grad_norm": 0.2807469069957733, + "learning_rate": 0.001, + "loss": 2.3787, + "step": 6061 + }, + { + "epoch": 0.2564514764362467, + "grad_norm": 0.22693084180355072, + "learning_rate": 0.001, + "loss": 2.1324, + "step": 6062 + }, + { + "epoch": 0.25649378119976307, + "grad_norm": 0.1689741611480713, + "learning_rate": 0.001, + "loss": 2.1221, + "step": 6063 + }, + { + "epoch": 0.2565360859632795, + "grad_norm": 1.339165449142456, + "learning_rate": 0.001, + "loss": 1.8906, + "step": 6064 + }, + { + "epoch": 0.25657839072679584, + "grad_norm": 1.1260621547698975, + "learning_rate": 0.001, + "loss": 2.2244, + "step": 6065 + }, + { + "epoch": 0.2566206954903122, + "grad_norm": 0.2071322202682495, + "learning_rate": 0.001, + "loss": 2.0834, + "step": 6066 + }, + { + "epoch": 0.2566630002538286, + "grad_norm": 0.2302442193031311, + "learning_rate": 0.001, + "loss": 1.8294, + "step": 6067 + }, + { + "epoch": 0.25670530501734495, + "grad_norm": 0.4180930554866791, + "learning_rate": 0.001, + "loss": 2.1456, + "step": 6068 + }, + { + "epoch": 0.2567476097808613, + "grad_norm": 0.2579159736633301, + "learning_rate": 0.001, + "loss": 2.7813, + "step": 6069 + }, + { + "epoch": 0.2567899145443777, + "grad_norm": 0.3067325949668884, + "learning_rate": 0.001, + "loss": 2.3948, + "step": 6070 + }, + { + "epoch": 0.25683221930789407, + "grad_norm": 0.2110936939716339, + "learning_rate": 0.001, + "loss": 2.5168, + "step": 6071 + }, + { + "epoch": 0.2568745240714104, + "grad_norm": 0.4867793023586273, + "learning_rate": 0.001, + "loss": 3.3979, + "step": 6072 + }, + { + "epoch": 0.25691682883492684, + "grad_norm": 0.22111758589744568, + "learning_rate": 0.001, + "loss": 1.9811, + "step": 6073 + }, + { + "epoch": 0.2569591335984432, + "grad_norm": 0.7423004508018494, + "learning_rate": 0.001, + "loss": 2.1127, + "step": 6074 + }, + { + "epoch": 0.25700143836195954, + "grad_norm": 0.32565468549728394, + "learning_rate": 0.001, + "loss": 1.619, + "step": 6075 + }, + { + "epoch": 0.25704374312547595, + "grad_norm": 5.436065673828125, + "learning_rate": 0.001, + "loss": 2.3115, + "step": 6076 + }, + { + "epoch": 0.2570860478889923, + "grad_norm": 0.3309813141822815, + "learning_rate": 0.001, + "loss": 2.1493, + "step": 6077 + }, + { + "epoch": 0.25712835265250866, + "grad_norm": 0.7553781270980835, + "learning_rate": 0.001, + "loss": 3.0408, + "step": 6078 + }, + { + "epoch": 0.257170657416025, + "grad_norm": 0.20468690991401672, + "learning_rate": 0.001, + "loss": 2.1414, + "step": 6079 + }, + { + "epoch": 0.2572129621795414, + "grad_norm": 3.904078483581543, + "learning_rate": 0.001, + "loss": 2.7939, + "step": 6080 + }, + { + "epoch": 0.2572552669430578, + "grad_norm": 0.3017195165157318, + "learning_rate": 0.001, + "loss": 2.5183, + "step": 6081 + }, + { + "epoch": 0.25729757170657414, + "grad_norm": 0.3187876343727112, + "learning_rate": 0.001, + "loss": 3.2043, + "step": 6082 + }, + { + "epoch": 0.25733987647009055, + "grad_norm": 0.29611068964004517, + "learning_rate": 0.001, + "loss": 2.7729, + "step": 6083 + }, + { + "epoch": 0.2573821812336069, + "grad_norm": 0.26517346501350403, + "learning_rate": 0.001, + "loss": 3.1814, + "step": 6084 + }, + { + "epoch": 0.25742448599712325, + "grad_norm": 0.22469617426395416, + "learning_rate": 0.001, + "loss": 2.2341, + "step": 6085 + }, + { + "epoch": 0.25746679076063966, + "grad_norm": 0.7284866571426392, + "learning_rate": 0.001, + "loss": 2.2107, + "step": 6086 + }, + { + "epoch": 0.257509095524156, + "grad_norm": 0.35012027621269226, + "learning_rate": 0.001, + "loss": 2.9378, + "step": 6087 + }, + { + "epoch": 0.25755140028767237, + "grad_norm": 0.23448063433170319, + "learning_rate": 0.001, + "loss": 2.6047, + "step": 6088 + }, + { + "epoch": 0.2575937050511888, + "grad_norm": 0.185800239443779, + "learning_rate": 0.001, + "loss": 3.7792, + "step": 6089 + }, + { + "epoch": 0.25763600981470514, + "grad_norm": 0.8078601956367493, + "learning_rate": 0.001, + "loss": 2.4504, + "step": 6090 + }, + { + "epoch": 0.2576783145782215, + "grad_norm": 0.22115640342235565, + "learning_rate": 0.001, + "loss": 2.1107, + "step": 6091 + }, + { + "epoch": 0.2577206193417379, + "grad_norm": 0.2682878375053406, + "learning_rate": 0.001, + "loss": 2.3305, + "step": 6092 + }, + { + "epoch": 0.25776292410525425, + "grad_norm": 1.0689104795455933, + "learning_rate": 0.001, + "loss": 2.6872, + "step": 6093 + }, + { + "epoch": 0.2578052288687706, + "grad_norm": 0.22226876020431519, + "learning_rate": 0.001, + "loss": 2.9591, + "step": 6094 + }, + { + "epoch": 0.257847533632287, + "grad_norm": 0.2324611246585846, + "learning_rate": 0.001, + "loss": 1.431, + "step": 6095 + }, + { + "epoch": 0.2578898383958034, + "grad_norm": 0.2755308449268341, + "learning_rate": 0.001, + "loss": 2.6437, + "step": 6096 + }, + { + "epoch": 0.2579321431593197, + "grad_norm": 0.39930108189582825, + "learning_rate": 0.001, + "loss": 2.093, + "step": 6097 + }, + { + "epoch": 0.25797444792283614, + "grad_norm": 0.5387210845947266, + "learning_rate": 0.001, + "loss": 4.0456, + "step": 6098 + }, + { + "epoch": 0.2580167526863525, + "grad_norm": 0.5283278822898865, + "learning_rate": 0.001, + "loss": 2.4854, + "step": 6099 + }, + { + "epoch": 0.25805905744986884, + "grad_norm": 0.313729465007782, + "learning_rate": 0.001, + "loss": 2.2843, + "step": 6100 + }, + { + "epoch": 0.25810136221338525, + "grad_norm": 0.21948851644992828, + "learning_rate": 0.001, + "loss": 2.1515, + "step": 6101 + }, + { + "epoch": 0.2581436669769016, + "grad_norm": 0.21910326182842255, + "learning_rate": 0.001, + "loss": 2.5378, + "step": 6102 + }, + { + "epoch": 0.25818597174041796, + "grad_norm": 0.22086754441261292, + "learning_rate": 0.001, + "loss": 2.3505, + "step": 6103 + }, + { + "epoch": 0.2582282765039343, + "grad_norm": 0.23505359888076782, + "learning_rate": 0.001, + "loss": 2.2017, + "step": 6104 + }, + { + "epoch": 0.2582705812674507, + "grad_norm": 0.23428547382354736, + "learning_rate": 0.001, + "loss": 1.8068, + "step": 6105 + }, + { + "epoch": 0.2583128860309671, + "grad_norm": 0.16871504485607147, + "learning_rate": 0.001, + "loss": 1.9511, + "step": 6106 + }, + { + "epoch": 0.25835519079448344, + "grad_norm": 0.19622443616390228, + "learning_rate": 0.001, + "loss": 2.0708, + "step": 6107 + }, + { + "epoch": 0.25839749555799985, + "grad_norm": 0.2113541215658188, + "learning_rate": 0.001, + "loss": 2.1059, + "step": 6108 + }, + { + "epoch": 0.2584398003215162, + "grad_norm": 0.2662128806114197, + "learning_rate": 0.001, + "loss": 2.7546, + "step": 6109 + }, + { + "epoch": 0.25848210508503255, + "grad_norm": 0.18668299913406372, + "learning_rate": 0.001, + "loss": 1.9433, + "step": 6110 + }, + { + "epoch": 0.25852440984854896, + "grad_norm": 0.31358543038368225, + "learning_rate": 0.001, + "loss": 1.6306, + "step": 6111 + }, + { + "epoch": 0.2585667146120653, + "grad_norm": 0.23096436262130737, + "learning_rate": 0.001, + "loss": 1.8918, + "step": 6112 + }, + { + "epoch": 0.2586090193755817, + "grad_norm": 0.21152295172214508, + "learning_rate": 0.001, + "loss": 2.1265, + "step": 6113 + }, + { + "epoch": 0.2586513241390981, + "grad_norm": 0.40861940383911133, + "learning_rate": 0.001, + "loss": 1.994, + "step": 6114 + }, + { + "epoch": 0.25869362890261444, + "grad_norm": 0.21992965042591095, + "learning_rate": 0.001, + "loss": 2.5881, + "step": 6115 + }, + { + "epoch": 0.2587359336661308, + "grad_norm": 0.43303796648979187, + "learning_rate": 0.001, + "loss": 3.6542, + "step": 6116 + }, + { + "epoch": 0.2587782384296472, + "grad_norm": 0.2100251019001007, + "learning_rate": 0.001, + "loss": 2.3924, + "step": 6117 + }, + { + "epoch": 0.25882054319316355, + "grad_norm": 0.18890580534934998, + "learning_rate": 0.001, + "loss": 2.7952, + "step": 6118 + }, + { + "epoch": 0.2588628479566799, + "grad_norm": 2.6557726860046387, + "learning_rate": 0.001, + "loss": 1.9914, + "step": 6119 + }, + { + "epoch": 0.2589051527201963, + "grad_norm": 0.16840679943561554, + "learning_rate": 0.001, + "loss": 3.2511, + "step": 6120 + }, + { + "epoch": 0.2589474574837127, + "grad_norm": 0.2200114130973816, + "learning_rate": 0.001, + "loss": 1.7654, + "step": 6121 + }, + { + "epoch": 0.258989762247229, + "grad_norm": 0.2384369820356369, + "learning_rate": 0.001, + "loss": 2.5738, + "step": 6122 + }, + { + "epoch": 0.25903206701074544, + "grad_norm": 0.2304677963256836, + "learning_rate": 0.001, + "loss": 1.8705, + "step": 6123 + }, + { + "epoch": 0.2590743717742618, + "grad_norm": 0.23483799397945404, + "learning_rate": 0.001, + "loss": 2.1473, + "step": 6124 + }, + { + "epoch": 0.25911667653777815, + "grad_norm": 0.3107430040836334, + "learning_rate": 0.001, + "loss": 2.4566, + "step": 6125 + }, + { + "epoch": 0.2591589813012945, + "grad_norm": 0.4102107882499695, + "learning_rate": 0.001, + "loss": 2.1083, + "step": 6126 + }, + { + "epoch": 0.2592012860648109, + "grad_norm": 0.28709524869918823, + "learning_rate": 0.001, + "loss": 2.6238, + "step": 6127 + }, + { + "epoch": 0.25924359082832726, + "grad_norm": 0.23573820292949677, + "learning_rate": 0.001, + "loss": 1.296, + "step": 6128 + }, + { + "epoch": 0.2592858955918436, + "grad_norm": 0.19910579919815063, + "learning_rate": 0.001, + "loss": 3.5181, + "step": 6129 + }, + { + "epoch": 0.25932820035536003, + "grad_norm": 0.2777842879295349, + "learning_rate": 0.001, + "loss": 2.2883, + "step": 6130 + }, + { + "epoch": 0.2593705051188764, + "grad_norm": 0.2618696391582489, + "learning_rate": 0.001, + "loss": 2.7957, + "step": 6131 + }, + { + "epoch": 0.25941280988239274, + "grad_norm": 0.23511484265327454, + "learning_rate": 0.001, + "loss": 2.1526, + "step": 6132 + }, + { + "epoch": 0.25945511464590915, + "grad_norm": 0.36623239517211914, + "learning_rate": 0.001, + "loss": 1.851, + "step": 6133 + }, + { + "epoch": 0.2594974194094255, + "grad_norm": 2.1750786304473877, + "learning_rate": 0.001, + "loss": 2.3166, + "step": 6134 + }, + { + "epoch": 0.25953972417294185, + "grad_norm": 4.039839744567871, + "learning_rate": 0.001, + "loss": 2.4438, + "step": 6135 + }, + { + "epoch": 0.25958202893645826, + "grad_norm": 0.30647915601730347, + "learning_rate": 0.001, + "loss": 2.255, + "step": 6136 + }, + { + "epoch": 0.2596243336999746, + "grad_norm": 0.23755225539207458, + "learning_rate": 0.001, + "loss": 2.0286, + "step": 6137 + }, + { + "epoch": 0.259666638463491, + "grad_norm": 4.9150919914245605, + "learning_rate": 0.001, + "loss": 2.212, + "step": 6138 + }, + { + "epoch": 0.2597089432270074, + "grad_norm": 0.3311949670314789, + "learning_rate": 0.001, + "loss": 3.6409, + "step": 6139 + }, + { + "epoch": 0.25975124799052374, + "grad_norm": 0.3053383529186249, + "learning_rate": 0.001, + "loss": 2.218, + "step": 6140 + }, + { + "epoch": 0.2597935527540401, + "grad_norm": 0.47039172053337097, + "learning_rate": 0.001, + "loss": 2.3472, + "step": 6141 + }, + { + "epoch": 0.2598358575175565, + "grad_norm": 0.27455034852027893, + "learning_rate": 0.001, + "loss": 2.3315, + "step": 6142 + }, + { + "epoch": 0.25987816228107286, + "grad_norm": 0.34887149930000305, + "learning_rate": 0.001, + "loss": 2.2812, + "step": 6143 + }, + { + "epoch": 0.2599204670445892, + "grad_norm": 0.20073916018009186, + "learning_rate": 0.001, + "loss": 1.674, + "step": 6144 + }, + { + "epoch": 0.2599627718081056, + "grad_norm": 0.5015227794647217, + "learning_rate": 0.001, + "loss": 3.2171, + "step": 6145 + }, + { + "epoch": 0.260005076571622, + "grad_norm": 12.801533699035645, + "learning_rate": 0.001, + "loss": 2.7793, + "step": 6146 + }, + { + "epoch": 0.26004738133513833, + "grad_norm": 0.8890368938446045, + "learning_rate": 0.001, + "loss": 2.3024, + "step": 6147 + }, + { + "epoch": 0.2600896860986547, + "grad_norm": 0.24711109697818756, + "learning_rate": 0.001, + "loss": 2.2968, + "step": 6148 + }, + { + "epoch": 0.2601319908621711, + "grad_norm": 2.7126057147979736, + "learning_rate": 0.001, + "loss": 1.9963, + "step": 6149 + }, + { + "epoch": 0.26017429562568745, + "grad_norm": 0.18857067823410034, + "learning_rate": 0.001, + "loss": 2.1632, + "step": 6150 + }, + { + "epoch": 0.2602166003892038, + "grad_norm": 0.2537596821784973, + "learning_rate": 0.001, + "loss": 1.9813, + "step": 6151 + }, + { + "epoch": 0.2602589051527202, + "grad_norm": 0.2444206178188324, + "learning_rate": 0.001, + "loss": 3.3214, + "step": 6152 + }, + { + "epoch": 0.26030120991623656, + "grad_norm": 0.19725318253040314, + "learning_rate": 0.001, + "loss": 1.9918, + "step": 6153 + }, + { + "epoch": 0.2603435146797529, + "grad_norm": 0.38273319602012634, + "learning_rate": 0.001, + "loss": 3.036, + "step": 6154 + }, + { + "epoch": 0.26038581944326933, + "grad_norm": 1.0224730968475342, + "learning_rate": 0.001, + "loss": 1.9985, + "step": 6155 + }, + { + "epoch": 0.2604281242067857, + "grad_norm": 0.6665830016136169, + "learning_rate": 0.001, + "loss": 3.0693, + "step": 6156 + }, + { + "epoch": 0.26047042897030204, + "grad_norm": 0.20416006445884705, + "learning_rate": 0.001, + "loss": 2.3206, + "step": 6157 + }, + { + "epoch": 0.26051273373381845, + "grad_norm": 0.3051769733428955, + "learning_rate": 0.001, + "loss": 3.7681, + "step": 6158 + }, + { + "epoch": 0.2605550384973348, + "grad_norm": 0.2127608060836792, + "learning_rate": 0.001, + "loss": 2.4234, + "step": 6159 + }, + { + "epoch": 0.26059734326085116, + "grad_norm": 0.28691282868385315, + "learning_rate": 0.001, + "loss": 1.7813, + "step": 6160 + }, + { + "epoch": 0.26063964802436757, + "grad_norm": 2.052556276321411, + "learning_rate": 0.001, + "loss": 3.2356, + "step": 6161 + }, + { + "epoch": 0.2606819527878839, + "grad_norm": 0.20295457541942596, + "learning_rate": 0.001, + "loss": 2.1718, + "step": 6162 + }, + { + "epoch": 0.2607242575514003, + "grad_norm": 0.19287841022014618, + "learning_rate": 0.001, + "loss": 1.7287, + "step": 6163 + }, + { + "epoch": 0.2607665623149167, + "grad_norm": 0.3149377405643463, + "learning_rate": 0.001, + "loss": 3.1882, + "step": 6164 + }, + { + "epoch": 0.26080886707843304, + "grad_norm": 0.21928805112838745, + "learning_rate": 0.001, + "loss": 2.8839, + "step": 6165 + }, + { + "epoch": 0.2608511718419494, + "grad_norm": 0.30192065238952637, + "learning_rate": 0.001, + "loss": 2.0906, + "step": 6166 + }, + { + "epoch": 0.2608934766054658, + "grad_norm": 0.22280140221118927, + "learning_rate": 0.001, + "loss": 2.7337, + "step": 6167 + }, + { + "epoch": 0.26093578136898216, + "grad_norm": 0.19599604606628418, + "learning_rate": 0.001, + "loss": 2.2517, + "step": 6168 + }, + { + "epoch": 0.2609780861324985, + "grad_norm": 0.20323944091796875, + "learning_rate": 0.001, + "loss": 1.906, + "step": 6169 + }, + { + "epoch": 0.26102039089601486, + "grad_norm": 0.5633019804954529, + "learning_rate": 0.001, + "loss": 2.1333, + "step": 6170 + }, + { + "epoch": 0.2610626956595313, + "grad_norm": 0.20369386672973633, + "learning_rate": 0.001, + "loss": 1.9435, + "step": 6171 + }, + { + "epoch": 0.26110500042304763, + "grad_norm": 0.37366142868995667, + "learning_rate": 0.001, + "loss": 2.2786, + "step": 6172 + }, + { + "epoch": 0.261147305186564, + "grad_norm": 0.21656358242034912, + "learning_rate": 0.001, + "loss": 2.4738, + "step": 6173 + }, + { + "epoch": 0.2611896099500804, + "grad_norm": 0.1992012858390808, + "learning_rate": 0.001, + "loss": 2.1982, + "step": 6174 + }, + { + "epoch": 0.26123191471359675, + "grad_norm": 0.32912105321884155, + "learning_rate": 0.001, + "loss": 2.3972, + "step": 6175 + }, + { + "epoch": 0.2612742194771131, + "grad_norm": 0.16803164780139923, + "learning_rate": 0.001, + "loss": 1.9239, + "step": 6176 + }, + { + "epoch": 0.2613165242406295, + "grad_norm": 0.2005642205476761, + "learning_rate": 0.001, + "loss": 2.1696, + "step": 6177 + }, + { + "epoch": 0.26135882900414587, + "grad_norm": 0.22105731070041656, + "learning_rate": 0.001, + "loss": 3.1925, + "step": 6178 + }, + { + "epoch": 0.2614011337676622, + "grad_norm": 0.20076580345630646, + "learning_rate": 0.001, + "loss": 2.1953, + "step": 6179 + }, + { + "epoch": 0.26144343853117863, + "grad_norm": 0.19129331409931183, + "learning_rate": 0.001, + "loss": 2.6739, + "step": 6180 + }, + { + "epoch": 0.261485743294695, + "grad_norm": 0.17908786237239838, + "learning_rate": 0.001, + "loss": 1.8419, + "step": 6181 + }, + { + "epoch": 0.26152804805821134, + "grad_norm": 0.2999635636806488, + "learning_rate": 0.001, + "loss": 2.5631, + "step": 6182 + }, + { + "epoch": 0.26157035282172775, + "grad_norm": 0.1999271810054779, + "learning_rate": 0.001, + "loss": 2.2973, + "step": 6183 + }, + { + "epoch": 0.2616126575852441, + "grad_norm": 0.19798393547534943, + "learning_rate": 0.001, + "loss": 2.4701, + "step": 6184 + }, + { + "epoch": 0.26165496234876046, + "grad_norm": 0.195541650056839, + "learning_rate": 0.001, + "loss": 3.1255, + "step": 6185 + }, + { + "epoch": 0.26169726711227687, + "grad_norm": 0.18188412487506866, + "learning_rate": 0.001, + "loss": 2.1795, + "step": 6186 + }, + { + "epoch": 0.2617395718757932, + "grad_norm": 0.22832369804382324, + "learning_rate": 0.001, + "loss": 3.4992, + "step": 6187 + }, + { + "epoch": 0.2617818766393096, + "grad_norm": 0.20645490288734436, + "learning_rate": 0.001, + "loss": 2.6053, + "step": 6188 + }, + { + "epoch": 0.261824181402826, + "grad_norm": 0.20869730412960052, + "learning_rate": 0.001, + "loss": 2.9876, + "step": 6189 + }, + { + "epoch": 0.26186648616634234, + "grad_norm": 1.8209075927734375, + "learning_rate": 0.001, + "loss": 1.7739, + "step": 6190 + }, + { + "epoch": 0.2619087909298587, + "grad_norm": 0.24617734551429749, + "learning_rate": 0.001, + "loss": 2.9264, + "step": 6191 + }, + { + "epoch": 0.26195109569337505, + "grad_norm": 0.5746178030967712, + "learning_rate": 0.001, + "loss": 2.4101, + "step": 6192 + }, + { + "epoch": 0.26199340045689146, + "grad_norm": 0.3251841366291046, + "learning_rate": 0.001, + "loss": 1.9637, + "step": 6193 + }, + { + "epoch": 0.2620357052204078, + "grad_norm": 0.19260448217391968, + "learning_rate": 0.001, + "loss": 2.7791, + "step": 6194 + }, + { + "epoch": 0.26207800998392417, + "grad_norm": 0.1845528781414032, + "learning_rate": 0.001, + "loss": 2.2625, + "step": 6195 + }, + { + "epoch": 0.2621203147474406, + "grad_norm": 0.230647012591362, + "learning_rate": 0.001, + "loss": 3.0717, + "step": 6196 + }, + { + "epoch": 0.26216261951095693, + "grad_norm": 0.2902037799358368, + "learning_rate": 0.001, + "loss": 1.7595, + "step": 6197 + }, + { + "epoch": 0.2622049242744733, + "grad_norm": 0.20350608229637146, + "learning_rate": 0.001, + "loss": 2.0974, + "step": 6198 + }, + { + "epoch": 0.2622472290379897, + "grad_norm": 0.6259804964065552, + "learning_rate": 0.001, + "loss": 2.2856, + "step": 6199 + }, + { + "epoch": 0.26228953380150605, + "grad_norm": 0.16990482807159424, + "learning_rate": 0.001, + "loss": 2.5884, + "step": 6200 + }, + { + "epoch": 0.2623318385650224, + "grad_norm": 0.2967141568660736, + "learning_rate": 0.001, + "loss": 2.3306, + "step": 6201 + }, + { + "epoch": 0.2623741433285388, + "grad_norm": 0.3053082823753357, + "learning_rate": 0.001, + "loss": 2.2518, + "step": 6202 + }, + { + "epoch": 0.26241644809205517, + "grad_norm": 0.9330534338951111, + "learning_rate": 0.001, + "loss": 2.6071, + "step": 6203 + }, + { + "epoch": 0.2624587528555715, + "grad_norm": 0.19737285375595093, + "learning_rate": 0.001, + "loss": 2.0726, + "step": 6204 + }, + { + "epoch": 0.26250105761908793, + "grad_norm": 0.23269009590148926, + "learning_rate": 0.001, + "loss": 1.6949, + "step": 6205 + }, + { + "epoch": 0.2625433623826043, + "grad_norm": 0.1986543983221054, + "learning_rate": 0.001, + "loss": 2.53, + "step": 6206 + }, + { + "epoch": 0.26258566714612064, + "grad_norm": 1.0258678197860718, + "learning_rate": 0.001, + "loss": 2.4325, + "step": 6207 + }, + { + "epoch": 0.26262797190963705, + "grad_norm": 0.6095445156097412, + "learning_rate": 0.001, + "loss": 2.5046, + "step": 6208 + }, + { + "epoch": 0.2626702766731534, + "grad_norm": 1.8578591346740723, + "learning_rate": 0.001, + "loss": 2.3447, + "step": 6209 + }, + { + "epoch": 0.26271258143666976, + "grad_norm": 5.6780619621276855, + "learning_rate": 0.001, + "loss": 2.2678, + "step": 6210 + }, + { + "epoch": 0.26275488620018617, + "grad_norm": 0.2798073887825012, + "learning_rate": 0.001, + "loss": 2.0733, + "step": 6211 + }, + { + "epoch": 0.2627971909637025, + "grad_norm": 1.4034373760223389, + "learning_rate": 0.001, + "loss": 1.8523, + "step": 6212 + }, + { + "epoch": 0.2628394957272189, + "grad_norm": 0.198062464594841, + "learning_rate": 0.001, + "loss": 1.4889, + "step": 6213 + }, + { + "epoch": 0.26288180049073523, + "grad_norm": 0.2564472556114197, + "learning_rate": 0.001, + "loss": 2.4133, + "step": 6214 + }, + { + "epoch": 0.26292410525425164, + "grad_norm": 2.446117877960205, + "learning_rate": 0.001, + "loss": 1.8318, + "step": 6215 + }, + { + "epoch": 0.262966410017768, + "grad_norm": 0.28336429595947266, + "learning_rate": 0.001, + "loss": 3.2593, + "step": 6216 + }, + { + "epoch": 0.26300871478128435, + "grad_norm": 0.29390016198158264, + "learning_rate": 0.001, + "loss": 2.292, + "step": 6217 + }, + { + "epoch": 0.26305101954480076, + "grad_norm": 0.6104854941368103, + "learning_rate": 0.001, + "loss": 1.9547, + "step": 6218 + }, + { + "epoch": 0.2630933243083171, + "grad_norm": 1.6165142059326172, + "learning_rate": 0.001, + "loss": 2.2211, + "step": 6219 + }, + { + "epoch": 0.26313562907183347, + "grad_norm": 2.914968252182007, + "learning_rate": 0.001, + "loss": 2.4817, + "step": 6220 + }, + { + "epoch": 0.2631779338353499, + "grad_norm": 0.24600031971931458, + "learning_rate": 0.001, + "loss": 2.0624, + "step": 6221 + }, + { + "epoch": 0.26322023859886623, + "grad_norm": 0.23349520564079285, + "learning_rate": 0.001, + "loss": 1.9119, + "step": 6222 + }, + { + "epoch": 0.2632625433623826, + "grad_norm": 0.2566237151622772, + "learning_rate": 0.001, + "loss": 3.1059, + "step": 6223 + }, + { + "epoch": 0.263304848125899, + "grad_norm": 0.27912285923957825, + "learning_rate": 0.001, + "loss": 2.2333, + "step": 6224 + }, + { + "epoch": 0.26334715288941535, + "grad_norm": 0.2630113661289215, + "learning_rate": 0.001, + "loss": 2.0277, + "step": 6225 + }, + { + "epoch": 0.2633894576529317, + "grad_norm": 0.34040457010269165, + "learning_rate": 0.001, + "loss": 2.6417, + "step": 6226 + }, + { + "epoch": 0.2634317624164481, + "grad_norm": 0.7498903870582581, + "learning_rate": 0.001, + "loss": 2.513, + "step": 6227 + }, + { + "epoch": 0.26347406717996447, + "grad_norm": 0.4051353633403778, + "learning_rate": 0.001, + "loss": 2.2107, + "step": 6228 + }, + { + "epoch": 0.2635163719434808, + "grad_norm": 0.288908451795578, + "learning_rate": 0.001, + "loss": 3.0754, + "step": 6229 + }, + { + "epoch": 0.26355867670699723, + "grad_norm": 0.40644702315330505, + "learning_rate": 0.001, + "loss": 3.2407, + "step": 6230 + }, + { + "epoch": 0.2636009814705136, + "grad_norm": 0.19740119576454163, + "learning_rate": 0.001, + "loss": 2.0704, + "step": 6231 + }, + { + "epoch": 0.26364328623402994, + "grad_norm": 0.22680628299713135, + "learning_rate": 0.001, + "loss": 2.6613, + "step": 6232 + }, + { + "epoch": 0.26368559099754635, + "grad_norm": 0.34792256355285645, + "learning_rate": 0.001, + "loss": 1.6583, + "step": 6233 + }, + { + "epoch": 0.2637278957610627, + "grad_norm": 0.57359379529953, + "learning_rate": 0.001, + "loss": 3.129, + "step": 6234 + }, + { + "epoch": 0.26377020052457906, + "grad_norm": 0.3252086937427521, + "learning_rate": 0.001, + "loss": 4.0645, + "step": 6235 + }, + { + "epoch": 0.26381250528809547, + "grad_norm": 0.7273129224777222, + "learning_rate": 0.001, + "loss": 2.7411, + "step": 6236 + }, + { + "epoch": 0.2638548100516118, + "grad_norm": 0.22891153395175934, + "learning_rate": 0.001, + "loss": 2.2799, + "step": 6237 + }, + { + "epoch": 0.2638971148151282, + "grad_norm": 0.4040292501449585, + "learning_rate": 0.001, + "loss": 2.375, + "step": 6238 + }, + { + "epoch": 0.26393941957864453, + "grad_norm": 20.426698684692383, + "learning_rate": 0.001, + "loss": 1.8637, + "step": 6239 + }, + { + "epoch": 0.26398172434216094, + "grad_norm": 0.26762643456459045, + "learning_rate": 0.001, + "loss": 2.1107, + "step": 6240 + }, + { + "epoch": 0.2640240291056773, + "grad_norm": 1.0472038984298706, + "learning_rate": 0.001, + "loss": 2.1314, + "step": 6241 + }, + { + "epoch": 0.26406633386919365, + "grad_norm": 0.7407483458518982, + "learning_rate": 0.001, + "loss": 2.1423, + "step": 6242 + }, + { + "epoch": 0.26410863863271006, + "grad_norm": 0.45358023047447205, + "learning_rate": 0.001, + "loss": 2.017, + "step": 6243 + }, + { + "epoch": 0.2641509433962264, + "grad_norm": 0.40108269453048706, + "learning_rate": 0.001, + "loss": 2.1408, + "step": 6244 + }, + { + "epoch": 0.26419324815974277, + "grad_norm": 0.20317232608795166, + "learning_rate": 0.001, + "loss": 2.2763, + "step": 6245 + }, + { + "epoch": 0.2642355529232592, + "grad_norm": 0.21886859834194183, + "learning_rate": 0.001, + "loss": 2.9445, + "step": 6246 + }, + { + "epoch": 0.26427785768677553, + "grad_norm": 0.2402205467224121, + "learning_rate": 0.001, + "loss": 3.2153, + "step": 6247 + }, + { + "epoch": 0.2643201624502919, + "grad_norm": 0.4614834487438202, + "learning_rate": 0.001, + "loss": 2.5112, + "step": 6248 + }, + { + "epoch": 0.2643624672138083, + "grad_norm": 0.4552881717681885, + "learning_rate": 0.001, + "loss": 2.1616, + "step": 6249 + }, + { + "epoch": 0.26440477197732465, + "grad_norm": 0.20050367712974548, + "learning_rate": 0.001, + "loss": 2.7497, + "step": 6250 + }, + { + "epoch": 0.264447076740841, + "grad_norm": 0.21264247596263885, + "learning_rate": 0.001, + "loss": 2.3534, + "step": 6251 + }, + { + "epoch": 0.2644893815043574, + "grad_norm": 0.9750521183013916, + "learning_rate": 0.001, + "loss": 1.9891, + "step": 6252 + }, + { + "epoch": 0.26453168626787377, + "grad_norm": 0.7031003832817078, + "learning_rate": 0.001, + "loss": 3.7157, + "step": 6253 + }, + { + "epoch": 0.2645739910313901, + "grad_norm": 1.4832327365875244, + "learning_rate": 0.001, + "loss": 2.1368, + "step": 6254 + }, + { + "epoch": 0.26461629579490653, + "grad_norm": 0.6920870542526245, + "learning_rate": 0.001, + "loss": 2.8061, + "step": 6255 + }, + { + "epoch": 0.2646586005584229, + "grad_norm": 0.5719873309135437, + "learning_rate": 0.001, + "loss": 2.5427, + "step": 6256 + }, + { + "epoch": 0.26470090532193924, + "grad_norm": 0.23134736716747284, + "learning_rate": 0.001, + "loss": 2.2727, + "step": 6257 + }, + { + "epoch": 0.26474321008545565, + "grad_norm": 0.1953703761100769, + "learning_rate": 0.001, + "loss": 1.9514, + "step": 6258 + }, + { + "epoch": 0.264785514848972, + "grad_norm": 0.27613356709480286, + "learning_rate": 0.001, + "loss": 2.8069, + "step": 6259 + }, + { + "epoch": 0.26482781961248836, + "grad_norm": 33.50883102416992, + "learning_rate": 0.001, + "loss": 2.0961, + "step": 6260 + }, + { + "epoch": 0.2648701243760047, + "grad_norm": 0.2973005771636963, + "learning_rate": 0.001, + "loss": 3.3931, + "step": 6261 + }, + { + "epoch": 0.2649124291395211, + "grad_norm": 1.8105061054229736, + "learning_rate": 0.001, + "loss": 2.3645, + "step": 6262 + }, + { + "epoch": 0.2649547339030375, + "grad_norm": 0.9858757853507996, + "learning_rate": 0.001, + "loss": 2.1567, + "step": 6263 + }, + { + "epoch": 0.26499703866655383, + "grad_norm": 0.3192642629146576, + "learning_rate": 0.001, + "loss": 2.5659, + "step": 6264 + }, + { + "epoch": 0.26503934343007024, + "grad_norm": 0.21430915594100952, + "learning_rate": 0.001, + "loss": 2.5984, + "step": 6265 + }, + { + "epoch": 0.2650816481935866, + "grad_norm": 0.45341306924819946, + "learning_rate": 0.001, + "loss": 2.8938, + "step": 6266 + }, + { + "epoch": 0.26512395295710295, + "grad_norm": 0.2323470115661621, + "learning_rate": 0.001, + "loss": 1.904, + "step": 6267 + }, + { + "epoch": 0.26516625772061936, + "grad_norm": 0.1812925934791565, + "learning_rate": 0.001, + "loss": 1.8184, + "step": 6268 + }, + { + "epoch": 0.2652085624841357, + "grad_norm": 0.23729638755321503, + "learning_rate": 0.001, + "loss": 2.2054, + "step": 6269 + }, + { + "epoch": 0.26525086724765207, + "grad_norm": 0.24241091310977936, + "learning_rate": 0.001, + "loss": 2.2426, + "step": 6270 + }, + { + "epoch": 0.2652931720111685, + "grad_norm": 0.22026705741882324, + "learning_rate": 0.001, + "loss": 1.9184, + "step": 6271 + }, + { + "epoch": 0.26533547677468483, + "grad_norm": 0.3513158857822418, + "learning_rate": 0.001, + "loss": 2.5154, + "step": 6272 + }, + { + "epoch": 0.2653777815382012, + "grad_norm": 1.525829792022705, + "learning_rate": 0.001, + "loss": 1.7338, + "step": 6273 + }, + { + "epoch": 0.2654200863017176, + "grad_norm": 2.88657808303833, + "learning_rate": 0.001, + "loss": 2.0355, + "step": 6274 + }, + { + "epoch": 0.26546239106523395, + "grad_norm": 1.638237476348877, + "learning_rate": 0.001, + "loss": 2.125, + "step": 6275 + }, + { + "epoch": 0.2655046958287503, + "grad_norm": 0.27576902508735657, + "learning_rate": 0.001, + "loss": 2.2267, + "step": 6276 + }, + { + "epoch": 0.2655470005922667, + "grad_norm": 1.6162000894546509, + "learning_rate": 0.001, + "loss": 2.4581, + "step": 6277 + }, + { + "epoch": 0.26558930535578307, + "grad_norm": 0.23247942328453064, + "learning_rate": 0.001, + "loss": 2.4351, + "step": 6278 + }, + { + "epoch": 0.2656316101192994, + "grad_norm": 0.28002870082855225, + "learning_rate": 0.001, + "loss": 2.1524, + "step": 6279 + }, + { + "epoch": 0.26567391488281583, + "grad_norm": 0.23175565898418427, + "learning_rate": 0.001, + "loss": 2.1924, + "step": 6280 + }, + { + "epoch": 0.2657162196463322, + "grad_norm": 0.7777692675590515, + "learning_rate": 0.001, + "loss": 2.6283, + "step": 6281 + }, + { + "epoch": 0.26575852440984854, + "grad_norm": 1.0904306173324585, + "learning_rate": 0.001, + "loss": 2.1067, + "step": 6282 + }, + { + "epoch": 0.2658008291733649, + "grad_norm": 0.6720946431159973, + "learning_rate": 0.001, + "loss": 1.938, + "step": 6283 + }, + { + "epoch": 0.2658431339368813, + "grad_norm": 7.82070255279541, + "learning_rate": 0.001, + "loss": 2.4127, + "step": 6284 + }, + { + "epoch": 0.26588543870039766, + "grad_norm": 0.4037260115146637, + "learning_rate": 0.001, + "loss": 3.3877, + "step": 6285 + }, + { + "epoch": 0.265927743463914, + "grad_norm": 1.355433702468872, + "learning_rate": 0.001, + "loss": 1.7886, + "step": 6286 + }, + { + "epoch": 0.2659700482274304, + "grad_norm": 0.42245426774024963, + "learning_rate": 0.001, + "loss": 3.5764, + "step": 6287 + }, + { + "epoch": 0.2660123529909468, + "grad_norm": 0.24568170309066772, + "learning_rate": 0.001, + "loss": 1.8148, + "step": 6288 + }, + { + "epoch": 0.26605465775446313, + "grad_norm": 6.263239860534668, + "learning_rate": 0.001, + "loss": 1.8413, + "step": 6289 + }, + { + "epoch": 0.26609696251797954, + "grad_norm": 0.2733914256095886, + "learning_rate": 0.001, + "loss": 2.3739, + "step": 6290 + }, + { + "epoch": 0.2661392672814959, + "grad_norm": 0.30786386132240295, + "learning_rate": 0.001, + "loss": 2.0434, + "step": 6291 + }, + { + "epoch": 0.26618157204501225, + "grad_norm": 0.29940205812454224, + "learning_rate": 0.001, + "loss": 2.5419, + "step": 6292 + }, + { + "epoch": 0.26622387680852866, + "grad_norm": 0.43194735050201416, + "learning_rate": 0.001, + "loss": 2.0004, + "step": 6293 + }, + { + "epoch": 0.266266181572045, + "grad_norm": 0.3845593333244324, + "learning_rate": 0.001, + "loss": 2.1427, + "step": 6294 + }, + { + "epoch": 0.26630848633556137, + "grad_norm": 4.627384185791016, + "learning_rate": 0.001, + "loss": 3.1407, + "step": 6295 + }, + { + "epoch": 0.2663507910990778, + "grad_norm": 0.3965384066104889, + "learning_rate": 0.001, + "loss": 2.1406, + "step": 6296 + }, + { + "epoch": 0.26639309586259413, + "grad_norm": 2.6226112842559814, + "learning_rate": 0.001, + "loss": 1.7126, + "step": 6297 + }, + { + "epoch": 0.2664354006261105, + "grad_norm": 1.5446035861968994, + "learning_rate": 0.001, + "loss": 1.9526, + "step": 6298 + }, + { + "epoch": 0.2664777053896269, + "grad_norm": 2.2716145515441895, + "learning_rate": 0.001, + "loss": 2.4368, + "step": 6299 + }, + { + "epoch": 0.26652001015314325, + "grad_norm": 0.33402782678604126, + "learning_rate": 0.001, + "loss": 1.8137, + "step": 6300 + }, + { + "epoch": 0.2665623149166596, + "grad_norm": 0.9563067555427551, + "learning_rate": 0.001, + "loss": 2.6016, + "step": 6301 + }, + { + "epoch": 0.266604619680176, + "grad_norm": 0.3132352828979492, + "learning_rate": 0.001, + "loss": 2.9526, + "step": 6302 + }, + { + "epoch": 0.26664692444369237, + "grad_norm": 0.6881158351898193, + "learning_rate": 0.001, + "loss": 2.4045, + "step": 6303 + }, + { + "epoch": 0.2666892292072087, + "grad_norm": 1.5459450483322144, + "learning_rate": 0.001, + "loss": 2.9012, + "step": 6304 + }, + { + "epoch": 0.2667315339707251, + "grad_norm": 0.40609946846961975, + "learning_rate": 0.001, + "loss": 1.9667, + "step": 6305 + }, + { + "epoch": 0.2667738387342415, + "grad_norm": 0.35249534249305725, + "learning_rate": 0.001, + "loss": 3.4398, + "step": 6306 + }, + { + "epoch": 0.26681614349775784, + "grad_norm": 0.981610894203186, + "learning_rate": 0.001, + "loss": 3.0861, + "step": 6307 + }, + { + "epoch": 0.2668584482612742, + "grad_norm": 0.530106246471405, + "learning_rate": 0.001, + "loss": 2.5613, + "step": 6308 + }, + { + "epoch": 0.2669007530247906, + "grad_norm": 2.370054244995117, + "learning_rate": 0.001, + "loss": 2.989, + "step": 6309 + }, + { + "epoch": 0.26694305778830696, + "grad_norm": 0.7922685742378235, + "learning_rate": 0.001, + "loss": 2.8242, + "step": 6310 + }, + { + "epoch": 0.2669853625518233, + "grad_norm": 0.472288578748703, + "learning_rate": 0.001, + "loss": 2.9797, + "step": 6311 + }, + { + "epoch": 0.2670276673153397, + "grad_norm": 7.536009788513184, + "learning_rate": 0.001, + "loss": 3.7025, + "step": 6312 + }, + { + "epoch": 0.2670699720788561, + "grad_norm": 0.35234659910202026, + "learning_rate": 0.001, + "loss": 2.187, + "step": 6313 + }, + { + "epoch": 0.26711227684237243, + "grad_norm": 0.45496541261672974, + "learning_rate": 0.001, + "loss": 2.8294, + "step": 6314 + }, + { + "epoch": 0.26715458160588884, + "grad_norm": 0.4241243302822113, + "learning_rate": 0.001, + "loss": 2.8042, + "step": 6315 + }, + { + "epoch": 0.2671968863694052, + "grad_norm": 0.49575382471084595, + "learning_rate": 0.001, + "loss": 2.7069, + "step": 6316 + }, + { + "epoch": 0.26723919113292155, + "grad_norm": 0.32094845175743103, + "learning_rate": 0.001, + "loss": 2.2842, + "step": 6317 + }, + { + "epoch": 0.26728149589643796, + "grad_norm": 2.19527268409729, + "learning_rate": 0.001, + "loss": 3.9412, + "step": 6318 + }, + { + "epoch": 0.2673238006599543, + "grad_norm": 0.24995988607406616, + "learning_rate": 0.001, + "loss": 2.4964, + "step": 6319 + }, + { + "epoch": 0.26736610542347067, + "grad_norm": 4.221808433532715, + "learning_rate": 0.001, + "loss": 2.6365, + "step": 6320 + }, + { + "epoch": 0.2674084101869871, + "grad_norm": 0.434870183467865, + "learning_rate": 0.001, + "loss": 2.3938, + "step": 6321 + }, + { + "epoch": 0.26745071495050343, + "grad_norm": 0.32494959235191345, + "learning_rate": 0.001, + "loss": 3.9073, + "step": 6322 + }, + { + "epoch": 0.2674930197140198, + "grad_norm": 0.314816951751709, + "learning_rate": 0.001, + "loss": 3.404, + "step": 6323 + }, + { + "epoch": 0.2675353244775362, + "grad_norm": 0.3015631139278412, + "learning_rate": 0.001, + "loss": 2.1681, + "step": 6324 + }, + { + "epoch": 0.26757762924105255, + "grad_norm": 0.23250603675842285, + "learning_rate": 0.001, + "loss": 2.4507, + "step": 6325 + }, + { + "epoch": 0.2676199340045689, + "grad_norm": 0.77018141746521, + "learning_rate": 0.001, + "loss": 3.565, + "step": 6326 + }, + { + "epoch": 0.26766223876808526, + "grad_norm": 2.2710511684417725, + "learning_rate": 0.001, + "loss": 2.027, + "step": 6327 + }, + { + "epoch": 0.26770454353160167, + "grad_norm": 0.3492465317249298, + "learning_rate": 0.001, + "loss": 3.2861, + "step": 6328 + }, + { + "epoch": 0.267746848295118, + "grad_norm": 2.129352331161499, + "learning_rate": 0.001, + "loss": 1.7412, + "step": 6329 + }, + { + "epoch": 0.2677891530586344, + "grad_norm": 0.8317650556564331, + "learning_rate": 0.001, + "loss": 3.3198, + "step": 6330 + }, + { + "epoch": 0.2678314578221508, + "grad_norm": 0.3606610894203186, + "learning_rate": 0.001, + "loss": 3.1525, + "step": 6331 + }, + { + "epoch": 0.26787376258566714, + "grad_norm": 0.2575969696044922, + "learning_rate": 0.001, + "loss": 1.9891, + "step": 6332 + }, + { + "epoch": 0.2679160673491835, + "grad_norm": 0.26098647713661194, + "learning_rate": 0.001, + "loss": 2.6293, + "step": 6333 + }, + { + "epoch": 0.2679583721126999, + "grad_norm": 0.30769988894462585, + "learning_rate": 0.001, + "loss": 2.5751, + "step": 6334 + }, + { + "epoch": 0.26800067687621626, + "grad_norm": 0.9987971186637878, + "learning_rate": 0.001, + "loss": 2.2541, + "step": 6335 + }, + { + "epoch": 0.2680429816397326, + "grad_norm": 0.20395010709762573, + "learning_rate": 0.001, + "loss": 1.7506, + "step": 6336 + }, + { + "epoch": 0.268085286403249, + "grad_norm": 0.23569338023662567, + "learning_rate": 0.001, + "loss": 1.9746, + "step": 6337 + }, + { + "epoch": 0.2681275911667654, + "grad_norm": 0.3824636936187744, + "learning_rate": 0.001, + "loss": 3.2225, + "step": 6338 + }, + { + "epoch": 0.26816989593028173, + "grad_norm": 0.46429404616355896, + "learning_rate": 0.001, + "loss": 3.0844, + "step": 6339 + }, + { + "epoch": 0.26821220069379814, + "grad_norm": 0.6086142063140869, + "learning_rate": 0.001, + "loss": 3.0908, + "step": 6340 + }, + { + "epoch": 0.2682545054573145, + "grad_norm": 0.24936416745185852, + "learning_rate": 0.001, + "loss": 2.1276, + "step": 6341 + }, + { + "epoch": 0.26829681022083085, + "grad_norm": 0.41220951080322266, + "learning_rate": 0.001, + "loss": 2.7437, + "step": 6342 + }, + { + "epoch": 0.26833911498434726, + "grad_norm": 0.2830599546432495, + "learning_rate": 0.001, + "loss": 2.2482, + "step": 6343 + }, + { + "epoch": 0.2683814197478636, + "grad_norm": 0.2458266317844391, + "learning_rate": 0.001, + "loss": 2.1775, + "step": 6344 + }, + { + "epoch": 0.26842372451137997, + "grad_norm": 0.33611828088760376, + "learning_rate": 0.001, + "loss": 2.4973, + "step": 6345 + }, + { + "epoch": 0.2684660292748964, + "grad_norm": 0.7442197799682617, + "learning_rate": 0.001, + "loss": 3.3098, + "step": 6346 + }, + { + "epoch": 0.26850833403841273, + "grad_norm": 0.23901230096817017, + "learning_rate": 0.001, + "loss": 2.303, + "step": 6347 + }, + { + "epoch": 0.2685506388019291, + "grad_norm": 0.22979353368282318, + "learning_rate": 0.001, + "loss": 1.7865, + "step": 6348 + }, + { + "epoch": 0.2685929435654455, + "grad_norm": 0.3396184742450714, + "learning_rate": 0.001, + "loss": 3.8322, + "step": 6349 + }, + { + "epoch": 0.26863524832896185, + "grad_norm": 0.2517753541469574, + "learning_rate": 0.001, + "loss": 2.2335, + "step": 6350 + }, + { + "epoch": 0.2686775530924782, + "grad_norm": 0.4144297242164612, + "learning_rate": 0.001, + "loss": 3.4627, + "step": 6351 + }, + { + "epoch": 0.26871985785599456, + "grad_norm": 0.891348123550415, + "learning_rate": 0.001, + "loss": 2.9963, + "step": 6352 + }, + { + "epoch": 0.26876216261951097, + "grad_norm": 0.18582716584205627, + "learning_rate": 0.001, + "loss": 2.2934, + "step": 6353 + }, + { + "epoch": 0.2688044673830273, + "grad_norm": 0.27463603019714355, + "learning_rate": 0.001, + "loss": 2.0024, + "step": 6354 + }, + { + "epoch": 0.2688467721465437, + "grad_norm": 7.595641613006592, + "learning_rate": 0.001, + "loss": 2.3449, + "step": 6355 + }, + { + "epoch": 0.2688890769100601, + "grad_norm": 1.0186480283737183, + "learning_rate": 0.001, + "loss": 2.4801, + "step": 6356 + }, + { + "epoch": 0.26893138167357644, + "grad_norm": 0.25770995020866394, + "learning_rate": 0.001, + "loss": 3.5221, + "step": 6357 + }, + { + "epoch": 0.2689736864370928, + "grad_norm": 0.5055752396583557, + "learning_rate": 0.001, + "loss": 3.0373, + "step": 6358 + }, + { + "epoch": 0.2690159912006092, + "grad_norm": 0.25883978605270386, + "learning_rate": 0.001, + "loss": 2.6142, + "step": 6359 + }, + { + "epoch": 0.26905829596412556, + "grad_norm": 0.2824064791202545, + "learning_rate": 0.001, + "loss": 2.4801, + "step": 6360 + }, + { + "epoch": 0.2691006007276419, + "grad_norm": 0.3055369555950165, + "learning_rate": 0.001, + "loss": 2.5225, + "step": 6361 + }, + { + "epoch": 0.2691429054911583, + "grad_norm": 0.27755987644195557, + "learning_rate": 0.001, + "loss": 2.5036, + "step": 6362 + }, + { + "epoch": 0.2691852102546747, + "grad_norm": 0.24744181334972382, + "learning_rate": 0.001, + "loss": 1.9392, + "step": 6363 + }, + { + "epoch": 0.26922751501819103, + "grad_norm": 0.23263895511627197, + "learning_rate": 0.001, + "loss": 1.7806, + "step": 6364 + }, + { + "epoch": 0.26926981978170744, + "grad_norm": 0.30707335472106934, + "learning_rate": 0.001, + "loss": 2.3249, + "step": 6365 + }, + { + "epoch": 0.2693121245452238, + "grad_norm": 0.17739908397197723, + "learning_rate": 0.001, + "loss": 1.8609, + "step": 6366 + }, + { + "epoch": 0.26935442930874015, + "grad_norm": 0.299203097820282, + "learning_rate": 0.001, + "loss": 2.6187, + "step": 6367 + }, + { + "epoch": 0.26939673407225656, + "grad_norm": 0.27614837884902954, + "learning_rate": 0.001, + "loss": 2.5299, + "step": 6368 + }, + { + "epoch": 0.2694390388357729, + "grad_norm": 1.2516486644744873, + "learning_rate": 0.001, + "loss": 2.1565, + "step": 6369 + }, + { + "epoch": 0.26948134359928927, + "grad_norm": 0.43937593698501587, + "learning_rate": 0.001, + "loss": 3.1544, + "step": 6370 + }, + { + "epoch": 0.2695236483628057, + "grad_norm": 0.272873193025589, + "learning_rate": 0.001, + "loss": 2.6493, + "step": 6371 + }, + { + "epoch": 0.26956595312632203, + "grad_norm": 0.3647904694080353, + "learning_rate": 0.001, + "loss": 2.4063, + "step": 6372 + }, + { + "epoch": 0.2696082578898384, + "grad_norm": 1.3242371082305908, + "learning_rate": 0.001, + "loss": 3.4809, + "step": 6373 + }, + { + "epoch": 0.26965056265335474, + "grad_norm": 0.23816105723381042, + "learning_rate": 0.001, + "loss": 1.8286, + "step": 6374 + }, + { + "epoch": 0.26969286741687115, + "grad_norm": 0.22475877404212952, + "learning_rate": 0.001, + "loss": 2.1492, + "step": 6375 + }, + { + "epoch": 0.2697351721803875, + "grad_norm": 0.33347398042678833, + "learning_rate": 0.001, + "loss": 2.8582, + "step": 6376 + }, + { + "epoch": 0.26977747694390386, + "grad_norm": 0.22079631686210632, + "learning_rate": 0.001, + "loss": 2.6284, + "step": 6377 + }, + { + "epoch": 0.26981978170742027, + "grad_norm": 0.24591165781021118, + "learning_rate": 0.001, + "loss": 2.1861, + "step": 6378 + }, + { + "epoch": 0.2698620864709366, + "grad_norm": 0.25623226165771484, + "learning_rate": 0.001, + "loss": 2.7529, + "step": 6379 + }, + { + "epoch": 0.269904391234453, + "grad_norm": 0.3840174078941345, + "learning_rate": 0.001, + "loss": 2.0479, + "step": 6380 + }, + { + "epoch": 0.2699466959979694, + "grad_norm": 0.29531073570251465, + "learning_rate": 0.001, + "loss": 2.1461, + "step": 6381 + }, + { + "epoch": 0.26998900076148574, + "grad_norm": 0.22676776349544525, + "learning_rate": 0.001, + "loss": 2.2194, + "step": 6382 + }, + { + "epoch": 0.2700313055250021, + "grad_norm": 0.233690544962883, + "learning_rate": 0.001, + "loss": 1.6935, + "step": 6383 + }, + { + "epoch": 0.2700736102885185, + "grad_norm": 0.5040211081504822, + "learning_rate": 0.001, + "loss": 3.0911, + "step": 6384 + }, + { + "epoch": 0.27011591505203486, + "grad_norm": 0.4640330374240875, + "learning_rate": 0.001, + "loss": 1.8026, + "step": 6385 + }, + { + "epoch": 0.2701582198155512, + "grad_norm": 0.21546930074691772, + "learning_rate": 0.001, + "loss": 3.1936, + "step": 6386 + }, + { + "epoch": 0.2702005245790676, + "grad_norm": 0.22518888115882874, + "learning_rate": 0.001, + "loss": 2.1559, + "step": 6387 + }, + { + "epoch": 0.270242829342584, + "grad_norm": 0.3616427183151245, + "learning_rate": 0.001, + "loss": 2.9413, + "step": 6388 + }, + { + "epoch": 0.27028513410610033, + "grad_norm": 0.22333014011383057, + "learning_rate": 0.001, + "loss": 2.1103, + "step": 6389 + }, + { + "epoch": 0.27032743886961674, + "grad_norm": 3.191633462905884, + "learning_rate": 0.001, + "loss": 2.508, + "step": 6390 + }, + { + "epoch": 0.2703697436331331, + "grad_norm": 0.24529823660850525, + "learning_rate": 0.001, + "loss": 2.4031, + "step": 6391 + }, + { + "epoch": 0.27041204839664945, + "grad_norm": 0.21744905412197113, + "learning_rate": 0.001, + "loss": 3.51, + "step": 6392 + }, + { + "epoch": 0.27045435316016586, + "grad_norm": 0.422944575548172, + "learning_rate": 0.001, + "loss": 3.0605, + "step": 6393 + }, + { + "epoch": 0.2704966579236822, + "grad_norm": 0.29786035418510437, + "learning_rate": 0.001, + "loss": 2.2543, + "step": 6394 + }, + { + "epoch": 0.27053896268719857, + "grad_norm": 0.5101473927497864, + "learning_rate": 0.001, + "loss": 3.697, + "step": 6395 + }, + { + "epoch": 0.2705812674507149, + "grad_norm": 0.2863130569458008, + "learning_rate": 0.001, + "loss": 2.2927, + "step": 6396 + }, + { + "epoch": 0.27062357221423133, + "grad_norm": 0.21719864010810852, + "learning_rate": 0.001, + "loss": 1.9034, + "step": 6397 + }, + { + "epoch": 0.2706658769777477, + "grad_norm": 0.5843220353126526, + "learning_rate": 0.001, + "loss": 2.0929, + "step": 6398 + }, + { + "epoch": 0.27070818174126404, + "grad_norm": 0.2397383600473404, + "learning_rate": 0.001, + "loss": 1.5715, + "step": 6399 + }, + { + "epoch": 0.27075048650478045, + "grad_norm": 0.29469746351242065, + "learning_rate": 0.001, + "loss": 3.1697, + "step": 6400 + }, + { + "epoch": 0.2707927912682968, + "grad_norm": 0.4289894998073578, + "learning_rate": 0.001, + "loss": 2.8917, + "step": 6401 + }, + { + "epoch": 0.27083509603181316, + "grad_norm": 0.40637311339378357, + "learning_rate": 0.001, + "loss": 2.5034, + "step": 6402 + }, + { + "epoch": 0.27087740079532957, + "grad_norm": 1.2302122116088867, + "learning_rate": 0.001, + "loss": 2.5266, + "step": 6403 + }, + { + "epoch": 0.2709197055588459, + "grad_norm": 0.7081871628761292, + "learning_rate": 0.001, + "loss": 2.5219, + "step": 6404 + }, + { + "epoch": 0.2709620103223623, + "grad_norm": 0.1945783644914627, + "learning_rate": 0.001, + "loss": 2.5327, + "step": 6405 + }, + { + "epoch": 0.2710043150858787, + "grad_norm": 2.336282968521118, + "learning_rate": 0.001, + "loss": 2.8066, + "step": 6406 + }, + { + "epoch": 0.27104661984939504, + "grad_norm": 0.2778642177581787, + "learning_rate": 0.001, + "loss": 2.8345, + "step": 6407 + }, + { + "epoch": 0.2710889246129114, + "grad_norm": 0.3541242778301239, + "learning_rate": 0.001, + "loss": 2.5382, + "step": 6408 + }, + { + "epoch": 0.2711312293764278, + "grad_norm": 3.476750373840332, + "learning_rate": 0.001, + "loss": 2.3818, + "step": 6409 + }, + { + "epoch": 0.27117353413994416, + "grad_norm": 0.2823469936847687, + "learning_rate": 0.001, + "loss": 2.2147, + "step": 6410 + }, + { + "epoch": 0.2712158389034605, + "grad_norm": 0.2130715698003769, + "learning_rate": 0.001, + "loss": 2.3666, + "step": 6411 + }, + { + "epoch": 0.2712581436669769, + "grad_norm": 0.210503488779068, + "learning_rate": 0.001, + "loss": 1.7467, + "step": 6412 + }, + { + "epoch": 0.2713004484304933, + "grad_norm": 0.22151534259319305, + "learning_rate": 0.001, + "loss": 2.9495, + "step": 6413 + }, + { + "epoch": 0.27134275319400963, + "grad_norm": 0.24885092675685883, + "learning_rate": 0.001, + "loss": 2.2498, + "step": 6414 + }, + { + "epoch": 0.27138505795752604, + "grad_norm": 0.23562543094158173, + "learning_rate": 0.001, + "loss": 2.2867, + "step": 6415 + }, + { + "epoch": 0.2714273627210424, + "grad_norm": 0.20105452835559845, + "learning_rate": 0.001, + "loss": 2.7994, + "step": 6416 + }, + { + "epoch": 0.27146966748455875, + "grad_norm": 0.2746374011039734, + "learning_rate": 0.001, + "loss": 1.9023, + "step": 6417 + }, + { + "epoch": 0.2715119722480751, + "grad_norm": 1.5132687091827393, + "learning_rate": 0.001, + "loss": 1.8792, + "step": 6418 + }, + { + "epoch": 0.2715542770115915, + "grad_norm": 0.24032434821128845, + "learning_rate": 0.001, + "loss": 1.9926, + "step": 6419 + }, + { + "epoch": 0.27159658177510787, + "grad_norm": 0.37214019894599915, + "learning_rate": 0.001, + "loss": 2.6278, + "step": 6420 + }, + { + "epoch": 0.2716388865386242, + "grad_norm": 0.2632519006729126, + "learning_rate": 0.001, + "loss": 2.4603, + "step": 6421 + }, + { + "epoch": 0.27168119130214063, + "grad_norm": 1.0849531888961792, + "learning_rate": 0.001, + "loss": 2.8918, + "step": 6422 + }, + { + "epoch": 0.271723496065657, + "grad_norm": 3.291983127593994, + "learning_rate": 0.001, + "loss": 2.1872, + "step": 6423 + }, + { + "epoch": 0.27176580082917334, + "grad_norm": 0.26407113671302795, + "learning_rate": 0.001, + "loss": 1.9061, + "step": 6424 + }, + { + "epoch": 0.27180810559268975, + "grad_norm": 0.2467721402645111, + "learning_rate": 0.001, + "loss": 2.1413, + "step": 6425 + }, + { + "epoch": 0.2718504103562061, + "grad_norm": 2.2249884605407715, + "learning_rate": 0.001, + "loss": 2.9308, + "step": 6426 + }, + { + "epoch": 0.27189271511972246, + "grad_norm": 0.866671621799469, + "learning_rate": 0.001, + "loss": 2.4567, + "step": 6427 + }, + { + "epoch": 0.27193501988323887, + "grad_norm": 0.32258787751197815, + "learning_rate": 0.001, + "loss": 3.198, + "step": 6428 + }, + { + "epoch": 0.2719773246467552, + "grad_norm": 2.03613018989563, + "learning_rate": 0.001, + "loss": 3.3261, + "step": 6429 + }, + { + "epoch": 0.2720196294102716, + "grad_norm": 0.9766329526901245, + "learning_rate": 0.001, + "loss": 2.2824, + "step": 6430 + }, + { + "epoch": 0.272061934173788, + "grad_norm": 0.31005382537841797, + "learning_rate": 0.001, + "loss": 3.2165, + "step": 6431 + }, + { + "epoch": 0.27210423893730434, + "grad_norm": 0.32901322841644287, + "learning_rate": 0.001, + "loss": 1.9921, + "step": 6432 + }, + { + "epoch": 0.2721465437008207, + "grad_norm": 0.24402910470962524, + "learning_rate": 0.001, + "loss": 3.7147, + "step": 6433 + }, + { + "epoch": 0.2721888484643371, + "grad_norm": 0.24346059560775757, + "learning_rate": 0.001, + "loss": 2.2568, + "step": 6434 + }, + { + "epoch": 0.27223115322785346, + "grad_norm": 0.27896568179130554, + "learning_rate": 0.001, + "loss": 2.3955, + "step": 6435 + }, + { + "epoch": 0.2722734579913698, + "grad_norm": 0.7607430219650269, + "learning_rate": 0.001, + "loss": 2.6034, + "step": 6436 + }, + { + "epoch": 0.2723157627548862, + "grad_norm": 0.5905636548995972, + "learning_rate": 0.001, + "loss": 2.1869, + "step": 6437 + }, + { + "epoch": 0.2723580675184026, + "grad_norm": 0.24976111948490143, + "learning_rate": 0.001, + "loss": 2.6681, + "step": 6438 + }, + { + "epoch": 0.27240037228191893, + "grad_norm": 0.4961627423763275, + "learning_rate": 0.001, + "loss": 3.1772, + "step": 6439 + }, + { + "epoch": 0.2724426770454353, + "grad_norm": 0.23998108506202698, + "learning_rate": 0.001, + "loss": 2.0783, + "step": 6440 + }, + { + "epoch": 0.2724849818089517, + "grad_norm": 1.2053577899932861, + "learning_rate": 0.001, + "loss": 3.2813, + "step": 6441 + }, + { + "epoch": 0.27252728657246805, + "grad_norm": 0.2748907804489136, + "learning_rate": 0.001, + "loss": 2.2498, + "step": 6442 + }, + { + "epoch": 0.2725695913359844, + "grad_norm": 0.31637445092201233, + "learning_rate": 0.001, + "loss": 2.2979, + "step": 6443 + }, + { + "epoch": 0.2726118960995008, + "grad_norm": 1.9223753213882446, + "learning_rate": 0.001, + "loss": 2.6186, + "step": 6444 + }, + { + "epoch": 0.27265420086301717, + "grad_norm": 0.6588495969772339, + "learning_rate": 0.001, + "loss": 2.2728, + "step": 6445 + }, + { + "epoch": 0.2726965056265335, + "grad_norm": 4.742616176605225, + "learning_rate": 0.001, + "loss": 2.0405, + "step": 6446 + }, + { + "epoch": 0.27273881039004993, + "grad_norm": 0.25673648715019226, + "learning_rate": 0.001, + "loss": 2.5662, + "step": 6447 + }, + { + "epoch": 0.2727811151535663, + "grad_norm": 0.22692641615867615, + "learning_rate": 0.001, + "loss": 1.8061, + "step": 6448 + }, + { + "epoch": 0.27282341991708264, + "grad_norm": 0.33545932173728943, + "learning_rate": 0.001, + "loss": 1.6433, + "step": 6449 + }, + { + "epoch": 0.27286572468059905, + "grad_norm": 0.2357272207736969, + "learning_rate": 0.001, + "loss": 2.672, + "step": 6450 + }, + { + "epoch": 0.2729080294441154, + "grad_norm": 0.28300443291664124, + "learning_rate": 0.001, + "loss": 2.1577, + "step": 6451 + }, + { + "epoch": 0.27295033420763176, + "grad_norm": 0.2952788472175598, + "learning_rate": 0.001, + "loss": 2.1633, + "step": 6452 + }, + { + "epoch": 0.27299263897114817, + "grad_norm": 0.2804337441921234, + "learning_rate": 0.001, + "loss": 1.9271, + "step": 6453 + }, + { + "epoch": 0.2730349437346645, + "grad_norm": 0.24086658656597137, + "learning_rate": 0.001, + "loss": 2.162, + "step": 6454 + }, + { + "epoch": 0.2730772484981809, + "grad_norm": 0.26707324385643005, + "learning_rate": 0.001, + "loss": 2.1178, + "step": 6455 + }, + { + "epoch": 0.2731195532616973, + "grad_norm": 0.26330190896987915, + "learning_rate": 0.001, + "loss": 2.5571, + "step": 6456 + }, + { + "epoch": 0.27316185802521364, + "grad_norm": 1.0483336448669434, + "learning_rate": 0.001, + "loss": 2.2618, + "step": 6457 + }, + { + "epoch": 0.27320416278873, + "grad_norm": 0.28875404596328735, + "learning_rate": 0.001, + "loss": 2.8507, + "step": 6458 + }, + { + "epoch": 0.2732464675522464, + "grad_norm": 0.3092382550239563, + "learning_rate": 0.001, + "loss": 2.0755, + "step": 6459 + }, + { + "epoch": 0.27328877231576276, + "grad_norm": 0.24076971411705017, + "learning_rate": 0.001, + "loss": 2.2558, + "step": 6460 + }, + { + "epoch": 0.2733310770792791, + "grad_norm": 0.26437196135520935, + "learning_rate": 0.001, + "loss": 3.098, + "step": 6461 + }, + { + "epoch": 0.2733733818427955, + "grad_norm": 0.2675298750400543, + "learning_rate": 0.001, + "loss": 3.5501, + "step": 6462 + }, + { + "epoch": 0.2734156866063119, + "grad_norm": 0.3043130338191986, + "learning_rate": 0.001, + "loss": 2.3021, + "step": 6463 + }, + { + "epoch": 0.27345799136982823, + "grad_norm": 1.052797555923462, + "learning_rate": 0.001, + "loss": 2.6146, + "step": 6464 + }, + { + "epoch": 0.2735002961333446, + "grad_norm": 0.23038017749786377, + "learning_rate": 0.001, + "loss": 1.6309, + "step": 6465 + }, + { + "epoch": 0.273542600896861, + "grad_norm": 0.3373502194881439, + "learning_rate": 0.001, + "loss": 2.1514, + "step": 6466 + }, + { + "epoch": 0.27358490566037735, + "grad_norm": 0.23279638588428497, + "learning_rate": 0.001, + "loss": 2.5891, + "step": 6467 + }, + { + "epoch": 0.2736272104238937, + "grad_norm": 0.507875919342041, + "learning_rate": 0.001, + "loss": 2.5316, + "step": 6468 + }, + { + "epoch": 0.2736695151874101, + "grad_norm": 0.21502485871315002, + "learning_rate": 0.001, + "loss": 2.2808, + "step": 6469 + }, + { + "epoch": 0.27371181995092647, + "grad_norm": 0.19854310154914856, + "learning_rate": 0.001, + "loss": 3.3566, + "step": 6470 + }, + { + "epoch": 0.2737541247144428, + "grad_norm": 0.2249358594417572, + "learning_rate": 0.001, + "loss": 1.8404, + "step": 6471 + }, + { + "epoch": 0.27379642947795924, + "grad_norm": 0.2288307249546051, + "learning_rate": 0.001, + "loss": 1.924, + "step": 6472 + }, + { + "epoch": 0.2738387342414756, + "grad_norm": 0.24865660071372986, + "learning_rate": 0.001, + "loss": 2.3673, + "step": 6473 + }, + { + "epoch": 0.27388103900499194, + "grad_norm": 0.24267597496509552, + "learning_rate": 0.001, + "loss": 1.9281, + "step": 6474 + }, + { + "epoch": 0.27392334376850835, + "grad_norm": 0.2626087963581085, + "learning_rate": 0.001, + "loss": 2.4203, + "step": 6475 + }, + { + "epoch": 0.2739656485320247, + "grad_norm": 0.8168473839759827, + "learning_rate": 0.001, + "loss": 2.6164, + "step": 6476 + }, + { + "epoch": 0.27400795329554106, + "grad_norm": 0.18421576917171478, + "learning_rate": 0.001, + "loss": 1.9259, + "step": 6477 + }, + { + "epoch": 0.27405025805905747, + "grad_norm": 0.2525613307952881, + "learning_rate": 0.001, + "loss": 2.3877, + "step": 6478 + }, + { + "epoch": 0.2740925628225738, + "grad_norm": 0.20929360389709473, + "learning_rate": 0.001, + "loss": 1.6114, + "step": 6479 + }, + { + "epoch": 0.2741348675860902, + "grad_norm": 0.4461418688297272, + "learning_rate": 0.001, + "loss": 2.563, + "step": 6480 + }, + { + "epoch": 0.2741771723496066, + "grad_norm": 0.2533968389034271, + "learning_rate": 0.001, + "loss": 2.7533, + "step": 6481 + }, + { + "epoch": 0.27421947711312294, + "grad_norm": 1.817432165145874, + "learning_rate": 0.001, + "loss": 2.2708, + "step": 6482 + }, + { + "epoch": 0.2742617818766393, + "grad_norm": 1.1035431623458862, + "learning_rate": 0.001, + "loss": 2.2485, + "step": 6483 + }, + { + "epoch": 0.2743040866401557, + "grad_norm": 0.4007866680622101, + "learning_rate": 0.001, + "loss": 2.566, + "step": 6484 + }, + { + "epoch": 0.27434639140367206, + "grad_norm": 0.569590151309967, + "learning_rate": 0.001, + "loss": 1.9645, + "step": 6485 + }, + { + "epoch": 0.2743886961671884, + "grad_norm": 0.2620263695716858, + "learning_rate": 0.001, + "loss": 2.1197, + "step": 6486 + }, + { + "epoch": 0.27443100093070477, + "grad_norm": 0.6177977323532104, + "learning_rate": 0.001, + "loss": 2.6985, + "step": 6487 + }, + { + "epoch": 0.2744733056942212, + "grad_norm": 0.22808204591274261, + "learning_rate": 0.001, + "loss": 3.8149, + "step": 6488 + }, + { + "epoch": 0.27451561045773754, + "grad_norm": 0.2721755802631378, + "learning_rate": 0.001, + "loss": 2.389, + "step": 6489 + }, + { + "epoch": 0.2745579152212539, + "grad_norm": 0.35955166816711426, + "learning_rate": 0.001, + "loss": 2.1531, + "step": 6490 + }, + { + "epoch": 0.2746002199847703, + "grad_norm": 0.2631028890609741, + "learning_rate": 0.001, + "loss": 1.944, + "step": 6491 + }, + { + "epoch": 0.27464252474828665, + "grad_norm": 0.40836191177368164, + "learning_rate": 0.001, + "loss": 2.219, + "step": 6492 + }, + { + "epoch": 0.274684829511803, + "grad_norm": 0.29734811186790466, + "learning_rate": 0.001, + "loss": 2.4754, + "step": 6493 + }, + { + "epoch": 0.2747271342753194, + "grad_norm": 0.3713732063770294, + "learning_rate": 0.001, + "loss": 3.1679, + "step": 6494 + }, + { + "epoch": 0.27476943903883577, + "grad_norm": 0.23647406697273254, + "learning_rate": 0.001, + "loss": 2.3628, + "step": 6495 + }, + { + "epoch": 0.2748117438023521, + "grad_norm": 0.19687795639038086, + "learning_rate": 0.001, + "loss": 2.8735, + "step": 6496 + }, + { + "epoch": 0.27485404856586854, + "grad_norm": 2.3540210723876953, + "learning_rate": 0.001, + "loss": 2.1741, + "step": 6497 + }, + { + "epoch": 0.2748963533293849, + "grad_norm": 0.5432139039039612, + "learning_rate": 0.001, + "loss": 2.0557, + "step": 6498 + }, + { + "epoch": 0.27493865809290124, + "grad_norm": 0.4527101516723633, + "learning_rate": 0.001, + "loss": 3.2012, + "step": 6499 + }, + { + "epoch": 0.27498096285641765, + "grad_norm": 0.22198386490345, + "learning_rate": 0.001, + "loss": 2.1226, + "step": 6500 + }, + { + "epoch": 0.275023267619934, + "grad_norm": 0.2468535304069519, + "learning_rate": 0.001, + "loss": 2.4435, + "step": 6501 + }, + { + "epoch": 0.27506557238345036, + "grad_norm": 0.22474850714206696, + "learning_rate": 0.001, + "loss": 2.0643, + "step": 6502 + }, + { + "epoch": 0.2751078771469668, + "grad_norm": 0.38860535621643066, + "learning_rate": 0.001, + "loss": 2.0566, + "step": 6503 + }, + { + "epoch": 0.2751501819104831, + "grad_norm": 0.4149872064590454, + "learning_rate": 0.001, + "loss": 2.1151, + "step": 6504 + }, + { + "epoch": 0.2751924866739995, + "grad_norm": 0.8003644943237305, + "learning_rate": 0.001, + "loss": 3.1268, + "step": 6505 + }, + { + "epoch": 0.2752347914375159, + "grad_norm": 0.2186277061700821, + "learning_rate": 0.001, + "loss": 1.8286, + "step": 6506 + }, + { + "epoch": 0.27527709620103225, + "grad_norm": 0.2277793288230896, + "learning_rate": 0.001, + "loss": 2.1968, + "step": 6507 + }, + { + "epoch": 0.2753194009645486, + "grad_norm": 0.2010040283203125, + "learning_rate": 0.001, + "loss": 2.1456, + "step": 6508 + }, + { + "epoch": 0.27536170572806495, + "grad_norm": 0.9116567373275757, + "learning_rate": 0.001, + "loss": 2.6635, + "step": 6509 + }, + { + "epoch": 0.27540401049158136, + "grad_norm": 0.20740988850593567, + "learning_rate": 0.001, + "loss": 2.2698, + "step": 6510 + }, + { + "epoch": 0.2754463152550977, + "grad_norm": 0.22273097932338715, + "learning_rate": 0.001, + "loss": 2.3287, + "step": 6511 + }, + { + "epoch": 0.27548862001861407, + "grad_norm": 0.28017768263816833, + "learning_rate": 0.001, + "loss": 2.3398, + "step": 6512 + }, + { + "epoch": 0.2755309247821305, + "grad_norm": 0.19713011384010315, + "learning_rate": 0.001, + "loss": 1.8994, + "step": 6513 + }, + { + "epoch": 0.27557322954564684, + "grad_norm": 0.19678883254528046, + "learning_rate": 0.001, + "loss": 2.0113, + "step": 6514 + }, + { + "epoch": 0.2756155343091632, + "grad_norm": 25.500926971435547, + "learning_rate": 0.001, + "loss": 1.8373, + "step": 6515 + }, + { + "epoch": 0.2756578390726796, + "grad_norm": 2.405385732650757, + "learning_rate": 0.001, + "loss": 2.077, + "step": 6516 + }, + { + "epoch": 0.27570014383619595, + "grad_norm": 0.9735079407691956, + "learning_rate": 0.001, + "loss": 2.5276, + "step": 6517 + }, + { + "epoch": 0.2757424485997123, + "grad_norm": 0.3777754604816437, + "learning_rate": 0.001, + "loss": 2.6313, + "step": 6518 + }, + { + "epoch": 0.2757847533632287, + "grad_norm": 0.25702303647994995, + "learning_rate": 0.001, + "loss": 2.9321, + "step": 6519 + }, + { + "epoch": 0.2758270581267451, + "grad_norm": 1.9004207849502563, + "learning_rate": 0.001, + "loss": 1.8612, + "step": 6520 + }, + { + "epoch": 0.2758693628902614, + "grad_norm": 0.2935592532157898, + "learning_rate": 0.001, + "loss": 3.1381, + "step": 6521 + }, + { + "epoch": 0.27591166765377784, + "grad_norm": 0.7922021150588989, + "learning_rate": 0.001, + "loss": 2.1313, + "step": 6522 + }, + { + "epoch": 0.2759539724172942, + "grad_norm": 0.2765513062477112, + "learning_rate": 0.001, + "loss": 2.7907, + "step": 6523 + }, + { + "epoch": 0.27599627718081055, + "grad_norm": 0.24716876447200775, + "learning_rate": 0.001, + "loss": 2.1889, + "step": 6524 + }, + { + "epoch": 0.27603858194432696, + "grad_norm": 0.24158848822116852, + "learning_rate": 0.001, + "loss": 1.8922, + "step": 6525 + }, + { + "epoch": 0.2760808867078433, + "grad_norm": 0.20398221909999847, + "learning_rate": 0.001, + "loss": 2.2493, + "step": 6526 + }, + { + "epoch": 0.27612319147135966, + "grad_norm": 0.22118747234344482, + "learning_rate": 0.001, + "loss": 2.6995, + "step": 6527 + }, + { + "epoch": 0.2761654962348761, + "grad_norm": 0.21023960411548615, + "learning_rate": 0.001, + "loss": 2.1655, + "step": 6528 + }, + { + "epoch": 0.2762078009983924, + "grad_norm": 0.2214033603668213, + "learning_rate": 0.001, + "loss": 1.7361, + "step": 6529 + }, + { + "epoch": 0.2762501057619088, + "grad_norm": 0.4276762306690216, + "learning_rate": 0.001, + "loss": 2.3999, + "step": 6530 + }, + { + "epoch": 0.27629241052542514, + "grad_norm": 0.2657619118690491, + "learning_rate": 0.001, + "loss": 3.2196, + "step": 6531 + }, + { + "epoch": 0.27633471528894155, + "grad_norm": 0.47785693407058716, + "learning_rate": 0.001, + "loss": 1.933, + "step": 6532 + }, + { + "epoch": 0.2763770200524579, + "grad_norm": 0.6065228581428528, + "learning_rate": 0.001, + "loss": 2.8416, + "step": 6533 + }, + { + "epoch": 0.27641932481597425, + "grad_norm": 0.18479658663272858, + "learning_rate": 0.001, + "loss": 1.9656, + "step": 6534 + }, + { + "epoch": 0.27646162957949066, + "grad_norm": 0.2248462438583374, + "learning_rate": 0.001, + "loss": 2.1931, + "step": 6535 + }, + { + "epoch": 0.276503934343007, + "grad_norm": 0.23013022541999817, + "learning_rate": 0.001, + "loss": 3.1781, + "step": 6536 + }, + { + "epoch": 0.2765462391065234, + "grad_norm": 0.23272770643234253, + "learning_rate": 0.001, + "loss": 2.345, + "step": 6537 + }, + { + "epoch": 0.2765885438700398, + "grad_norm": 6.299062252044678, + "learning_rate": 0.001, + "loss": 2.0466, + "step": 6538 + }, + { + "epoch": 0.27663084863355614, + "grad_norm": 0.2376430779695511, + "learning_rate": 0.001, + "loss": 2.2214, + "step": 6539 + }, + { + "epoch": 0.2766731533970725, + "grad_norm": 4.812066555023193, + "learning_rate": 0.001, + "loss": 2.6048, + "step": 6540 + }, + { + "epoch": 0.2767154581605889, + "grad_norm": 2.5265612602233887, + "learning_rate": 0.001, + "loss": 2.3992, + "step": 6541 + }, + { + "epoch": 0.27675776292410526, + "grad_norm": 0.3985212445259094, + "learning_rate": 0.001, + "loss": 3.2, + "step": 6542 + }, + { + "epoch": 0.2768000676876216, + "grad_norm": 4.070377349853516, + "learning_rate": 0.001, + "loss": 2.1483, + "step": 6543 + }, + { + "epoch": 0.276842372451138, + "grad_norm": 0.7375186681747437, + "learning_rate": 0.001, + "loss": 1.9092, + "step": 6544 + }, + { + "epoch": 0.2768846772146544, + "grad_norm": 0.2642837464809418, + "learning_rate": 0.001, + "loss": 2.5145, + "step": 6545 + }, + { + "epoch": 0.2769269819781707, + "grad_norm": 0.23197656869888306, + "learning_rate": 0.001, + "loss": 2.6426, + "step": 6546 + }, + { + "epoch": 0.27696928674168714, + "grad_norm": 0.4172825813293457, + "learning_rate": 0.001, + "loss": 1.7941, + "step": 6547 + }, + { + "epoch": 0.2770115915052035, + "grad_norm": 0.1864205151796341, + "learning_rate": 0.001, + "loss": 1.6709, + "step": 6548 + }, + { + "epoch": 0.27705389626871985, + "grad_norm": 1.1688998937606812, + "learning_rate": 0.001, + "loss": 2.374, + "step": 6549 + }, + { + "epoch": 0.27709620103223626, + "grad_norm": 0.26885920763015747, + "learning_rate": 0.001, + "loss": 2.6946, + "step": 6550 + }, + { + "epoch": 0.2771385057957526, + "grad_norm": 0.6076298952102661, + "learning_rate": 0.001, + "loss": 2.1671, + "step": 6551 + }, + { + "epoch": 0.27718081055926896, + "grad_norm": 333.4671325683594, + "learning_rate": 0.001, + "loss": 2.3423, + "step": 6552 + }, + { + "epoch": 0.2772231153227853, + "grad_norm": 0.7827109098434448, + "learning_rate": 0.001, + "loss": 2.1145, + "step": 6553 + }, + { + "epoch": 0.27726542008630173, + "grad_norm": 2.1822445392608643, + "learning_rate": 0.001, + "loss": 1.8533, + "step": 6554 + }, + { + "epoch": 0.2773077248498181, + "grad_norm": 81.6081314086914, + "learning_rate": 0.001, + "loss": 2.4029, + "step": 6555 + }, + { + "epoch": 0.27735002961333444, + "grad_norm": 1.5109821557998657, + "learning_rate": 0.001, + "loss": 2.849, + "step": 6556 + }, + { + "epoch": 0.27739233437685085, + "grad_norm": 3.2216641902923584, + "learning_rate": 0.001, + "loss": 2.4335, + "step": 6557 + }, + { + "epoch": 0.2774346391403672, + "grad_norm": 0.28260374069213867, + "learning_rate": 0.001, + "loss": 2.8565, + "step": 6558 + }, + { + "epoch": 0.27747694390388356, + "grad_norm": 0.2404414415359497, + "learning_rate": 0.001, + "loss": 2.3991, + "step": 6559 + }, + { + "epoch": 0.27751924866739996, + "grad_norm": 0.2975144386291504, + "learning_rate": 0.001, + "loss": 2.8319, + "step": 6560 + }, + { + "epoch": 0.2775615534309163, + "grad_norm": 8.33987045288086, + "learning_rate": 0.001, + "loss": 2.9683, + "step": 6561 + }, + { + "epoch": 0.2776038581944327, + "grad_norm": 1.107771396636963, + "learning_rate": 0.001, + "loss": 2.3978, + "step": 6562 + }, + { + "epoch": 0.2776461629579491, + "grad_norm": 0.24816367030143738, + "learning_rate": 0.001, + "loss": 2.367, + "step": 6563 + }, + { + "epoch": 0.27768846772146544, + "grad_norm": 1.1018351316452026, + "learning_rate": 0.001, + "loss": 2.62, + "step": 6564 + }, + { + "epoch": 0.2777307724849818, + "grad_norm": 1.640609860420227, + "learning_rate": 0.001, + "loss": 1.867, + "step": 6565 + }, + { + "epoch": 0.2777730772484982, + "grad_norm": 0.2782337963581085, + "learning_rate": 0.001, + "loss": 2.3716, + "step": 6566 + }, + { + "epoch": 0.27781538201201456, + "grad_norm": 1.2613625526428223, + "learning_rate": 0.001, + "loss": 2.6911, + "step": 6567 + }, + { + "epoch": 0.2778576867755309, + "grad_norm": 3.5542197227478027, + "learning_rate": 0.001, + "loss": 2.6192, + "step": 6568 + }, + { + "epoch": 0.2778999915390473, + "grad_norm": 0.22724595665931702, + "learning_rate": 0.001, + "loss": 1.9024, + "step": 6569 + }, + { + "epoch": 0.2779422963025637, + "grad_norm": 0.21525609493255615, + "learning_rate": 0.001, + "loss": 2.0792, + "step": 6570 + }, + { + "epoch": 0.27798460106608003, + "grad_norm": 1.333457589149475, + "learning_rate": 0.001, + "loss": 2.5817, + "step": 6571 + }, + { + "epoch": 0.27802690582959644, + "grad_norm": 0.2638329863548279, + "learning_rate": 0.001, + "loss": 1.8625, + "step": 6572 + }, + { + "epoch": 0.2780692105931128, + "grad_norm": 0.5335271954536438, + "learning_rate": 0.001, + "loss": 2.255, + "step": 6573 + }, + { + "epoch": 0.27811151535662915, + "grad_norm": 0.8093989491462708, + "learning_rate": 0.001, + "loss": 2.6137, + "step": 6574 + }, + { + "epoch": 0.2781538201201455, + "grad_norm": 0.2589428126811981, + "learning_rate": 0.001, + "loss": 2.4491, + "step": 6575 + }, + { + "epoch": 0.2781961248836619, + "grad_norm": 1.0892338752746582, + "learning_rate": 0.001, + "loss": 2.1675, + "step": 6576 + }, + { + "epoch": 0.27823842964717826, + "grad_norm": 0.3066917657852173, + "learning_rate": 0.001, + "loss": 2.3011, + "step": 6577 + }, + { + "epoch": 0.2782807344106946, + "grad_norm": 0.6517425179481506, + "learning_rate": 0.001, + "loss": 2.241, + "step": 6578 + }, + { + "epoch": 0.27832303917421103, + "grad_norm": 0.7544508576393127, + "learning_rate": 0.001, + "loss": 2.6692, + "step": 6579 + }, + { + "epoch": 0.2783653439377274, + "grad_norm": 0.2718650698661804, + "learning_rate": 0.001, + "loss": 2.6371, + "step": 6580 + }, + { + "epoch": 0.27840764870124374, + "grad_norm": 1.2337608337402344, + "learning_rate": 0.001, + "loss": 2.6096, + "step": 6581 + }, + { + "epoch": 0.27844995346476015, + "grad_norm": 0.36760374903678894, + "learning_rate": 0.001, + "loss": 2.2351, + "step": 6582 + }, + { + "epoch": 0.2784922582282765, + "grad_norm": 0.25954657793045044, + "learning_rate": 0.001, + "loss": 2.1766, + "step": 6583 + }, + { + "epoch": 0.27853456299179286, + "grad_norm": 0.606953501701355, + "learning_rate": 0.001, + "loss": 2.1137, + "step": 6584 + }, + { + "epoch": 0.27857686775530927, + "grad_norm": 0.2433394342660904, + "learning_rate": 0.001, + "loss": 1.8739, + "step": 6585 + }, + { + "epoch": 0.2786191725188256, + "grad_norm": 0.2681048810482025, + "learning_rate": 0.001, + "loss": 2.2242, + "step": 6586 + }, + { + "epoch": 0.278661477282342, + "grad_norm": 0.37554433941841125, + "learning_rate": 0.001, + "loss": 2.5414, + "step": 6587 + }, + { + "epoch": 0.2787037820458584, + "grad_norm": 0.37288689613342285, + "learning_rate": 0.001, + "loss": 2.7379, + "step": 6588 + }, + { + "epoch": 0.27874608680937474, + "grad_norm": 1.6531394720077515, + "learning_rate": 0.001, + "loss": 2.3285, + "step": 6589 + }, + { + "epoch": 0.2787883915728911, + "grad_norm": 0.4495515823364258, + "learning_rate": 0.001, + "loss": 1.9095, + "step": 6590 + }, + { + "epoch": 0.2788306963364075, + "grad_norm": 0.22591541707515717, + "learning_rate": 0.001, + "loss": 1.9806, + "step": 6591 + }, + { + "epoch": 0.27887300109992386, + "grad_norm": 0.4093262255191803, + "learning_rate": 0.001, + "loss": 2.9568, + "step": 6592 + }, + { + "epoch": 0.2789153058634402, + "grad_norm": 0.4655383825302124, + "learning_rate": 0.001, + "loss": 2.2092, + "step": 6593 + }, + { + "epoch": 0.2789576106269566, + "grad_norm": 0.1933087557554245, + "learning_rate": 0.001, + "loss": 2.6561, + "step": 6594 + }, + { + "epoch": 0.278999915390473, + "grad_norm": 0.7289586663246155, + "learning_rate": 0.001, + "loss": 2.2029, + "step": 6595 + }, + { + "epoch": 0.27904222015398933, + "grad_norm": 0.33073756098747253, + "learning_rate": 0.001, + "loss": 2.1711, + "step": 6596 + }, + { + "epoch": 0.27908452491750574, + "grad_norm": 0.1785137802362442, + "learning_rate": 0.001, + "loss": 1.7439, + "step": 6597 + }, + { + "epoch": 0.2791268296810221, + "grad_norm": 0.23087824881076813, + "learning_rate": 0.001, + "loss": 2.3974, + "step": 6598 + }, + { + "epoch": 0.27916913444453845, + "grad_norm": 0.2256430834531784, + "learning_rate": 0.001, + "loss": 2.225, + "step": 6599 + }, + { + "epoch": 0.2792114392080548, + "grad_norm": 0.27692413330078125, + "learning_rate": 0.001, + "loss": 1.9553, + "step": 6600 + }, + { + "epoch": 0.2792537439715712, + "grad_norm": 0.28464028239250183, + "learning_rate": 0.001, + "loss": 3.7032, + "step": 6601 + }, + { + "epoch": 0.27929604873508757, + "grad_norm": 0.22122737765312195, + "learning_rate": 0.001, + "loss": 1.9469, + "step": 6602 + }, + { + "epoch": 0.2793383534986039, + "grad_norm": 0.2612067461013794, + "learning_rate": 0.001, + "loss": 1.7212, + "step": 6603 + }, + { + "epoch": 0.27938065826212033, + "grad_norm": 0.21582959592342377, + "learning_rate": 0.001, + "loss": 2.3386, + "step": 6604 + }, + { + "epoch": 0.2794229630256367, + "grad_norm": 0.258429616689682, + "learning_rate": 0.001, + "loss": 3.188, + "step": 6605 + }, + { + "epoch": 0.27946526778915304, + "grad_norm": 0.23203891515731812, + "learning_rate": 0.001, + "loss": 2.3323, + "step": 6606 + }, + { + "epoch": 0.27950757255266945, + "grad_norm": 0.37351715564727783, + "learning_rate": 0.001, + "loss": 3.2925, + "step": 6607 + }, + { + "epoch": 0.2795498773161858, + "grad_norm": 0.2870527505874634, + "learning_rate": 0.001, + "loss": 2.128, + "step": 6608 + }, + { + "epoch": 0.27959218207970216, + "grad_norm": 0.2394646555185318, + "learning_rate": 0.001, + "loss": 2.1182, + "step": 6609 + }, + { + "epoch": 0.27963448684321857, + "grad_norm": 1.8435173034667969, + "learning_rate": 0.001, + "loss": 2.1496, + "step": 6610 + }, + { + "epoch": 0.2796767916067349, + "grad_norm": 0.689439058303833, + "learning_rate": 0.001, + "loss": 1.6171, + "step": 6611 + }, + { + "epoch": 0.2797190963702513, + "grad_norm": 0.3655761182308197, + "learning_rate": 0.001, + "loss": 1.8437, + "step": 6612 + }, + { + "epoch": 0.2797614011337677, + "grad_norm": 0.21056896448135376, + "learning_rate": 0.001, + "loss": 3.201, + "step": 6613 + }, + { + "epoch": 0.27980370589728404, + "grad_norm": 0.20994161069393158, + "learning_rate": 0.001, + "loss": 2.0407, + "step": 6614 + }, + { + "epoch": 0.2798460106608004, + "grad_norm": 0.20594006776809692, + "learning_rate": 0.001, + "loss": 2.0774, + "step": 6615 + }, + { + "epoch": 0.2798883154243168, + "grad_norm": 0.24466033279895782, + "learning_rate": 0.001, + "loss": 2.101, + "step": 6616 + }, + { + "epoch": 0.27993062018783316, + "grad_norm": 0.23356914520263672, + "learning_rate": 0.001, + "loss": 3.0406, + "step": 6617 + }, + { + "epoch": 0.2799729249513495, + "grad_norm": 0.3080406188964844, + "learning_rate": 0.001, + "loss": 1.7894, + "step": 6618 + }, + { + "epoch": 0.2800152297148659, + "grad_norm": 0.28324589133262634, + "learning_rate": 0.001, + "loss": 2.5605, + "step": 6619 + }, + { + "epoch": 0.2800575344783823, + "grad_norm": 0.4697299599647522, + "learning_rate": 0.001, + "loss": 2.2976, + "step": 6620 + }, + { + "epoch": 0.28009983924189863, + "grad_norm": 0.7480176091194153, + "learning_rate": 0.001, + "loss": 3.3983, + "step": 6621 + }, + { + "epoch": 0.280142144005415, + "grad_norm": 1.2225714921951294, + "learning_rate": 0.001, + "loss": 2.6653, + "step": 6622 + }, + { + "epoch": 0.2801844487689314, + "grad_norm": 1.5693175792694092, + "learning_rate": 0.001, + "loss": 2.5905, + "step": 6623 + }, + { + "epoch": 0.28022675353244775, + "grad_norm": 1.6206374168395996, + "learning_rate": 0.001, + "loss": 2.0437, + "step": 6624 + }, + { + "epoch": 0.2802690582959641, + "grad_norm": 0.19221502542495728, + "learning_rate": 0.001, + "loss": 2.4707, + "step": 6625 + }, + { + "epoch": 0.2803113630594805, + "grad_norm": 0.8016247153282166, + "learning_rate": 0.001, + "loss": 2.6747, + "step": 6626 + }, + { + "epoch": 0.28035366782299687, + "grad_norm": 0.21207623183727264, + "learning_rate": 0.001, + "loss": 2.1313, + "step": 6627 + }, + { + "epoch": 0.2803959725865132, + "grad_norm": 0.22111967206001282, + "learning_rate": 0.001, + "loss": 3.0506, + "step": 6628 + }, + { + "epoch": 0.28043827735002963, + "grad_norm": 0.21571913361549377, + "learning_rate": 0.001, + "loss": 1.8644, + "step": 6629 + }, + { + "epoch": 0.280480582113546, + "grad_norm": 0.23307397961616516, + "learning_rate": 0.001, + "loss": 2.8528, + "step": 6630 + }, + { + "epoch": 0.28052288687706234, + "grad_norm": 1.7991856336593628, + "learning_rate": 0.001, + "loss": 2.2838, + "step": 6631 + }, + { + "epoch": 0.28056519164057875, + "grad_norm": 0.23802727460861206, + "learning_rate": 0.001, + "loss": 2.0143, + "step": 6632 + }, + { + "epoch": 0.2806074964040951, + "grad_norm": 0.19570757448673248, + "learning_rate": 0.001, + "loss": 1.7036, + "step": 6633 + }, + { + "epoch": 0.28064980116761146, + "grad_norm": 0.27703866362571716, + "learning_rate": 0.001, + "loss": 4.0551, + "step": 6634 + }, + { + "epoch": 0.28069210593112787, + "grad_norm": 0.30882275104522705, + "learning_rate": 0.001, + "loss": 2.4837, + "step": 6635 + }, + { + "epoch": 0.2807344106946442, + "grad_norm": 0.2575119137763977, + "learning_rate": 0.001, + "loss": 2.4555, + "step": 6636 + }, + { + "epoch": 0.2807767154581606, + "grad_norm": 3.0365078449249268, + "learning_rate": 0.001, + "loss": 1.8518, + "step": 6637 + }, + { + "epoch": 0.280819020221677, + "grad_norm": 0.21521303057670593, + "learning_rate": 0.001, + "loss": 2.1552, + "step": 6638 + }, + { + "epoch": 0.28086132498519334, + "grad_norm": 0.2531343698501587, + "learning_rate": 0.001, + "loss": 2.9282, + "step": 6639 + }, + { + "epoch": 0.2809036297487097, + "grad_norm": 0.1944054663181305, + "learning_rate": 0.001, + "loss": 2.3841, + "step": 6640 + }, + { + "epoch": 0.2809459345122261, + "grad_norm": 0.18977178633213043, + "learning_rate": 0.001, + "loss": 1.9523, + "step": 6641 + }, + { + "epoch": 0.28098823927574246, + "grad_norm": 0.20107170939445496, + "learning_rate": 0.001, + "loss": 1.9213, + "step": 6642 + }, + { + "epoch": 0.2810305440392588, + "grad_norm": 1.5466896295547485, + "learning_rate": 0.001, + "loss": 2.2963, + "step": 6643 + }, + { + "epoch": 0.28107284880277517, + "grad_norm": 0.20323142409324646, + "learning_rate": 0.001, + "loss": 2.3657, + "step": 6644 + }, + { + "epoch": 0.2811151535662916, + "grad_norm": 0.20698612928390503, + "learning_rate": 0.001, + "loss": 2.1832, + "step": 6645 + }, + { + "epoch": 0.28115745832980793, + "grad_norm": 0.265480101108551, + "learning_rate": 0.001, + "loss": 4.3758, + "step": 6646 + }, + { + "epoch": 0.2811997630933243, + "grad_norm": 0.21881890296936035, + "learning_rate": 0.001, + "loss": 1.9424, + "step": 6647 + }, + { + "epoch": 0.2812420678568407, + "grad_norm": 0.2621309459209442, + "learning_rate": 0.001, + "loss": 3.2508, + "step": 6648 + }, + { + "epoch": 0.28128437262035705, + "grad_norm": 0.2738354206085205, + "learning_rate": 0.001, + "loss": 2.7365, + "step": 6649 + }, + { + "epoch": 0.2813266773838734, + "grad_norm": 0.44590675830841064, + "learning_rate": 0.001, + "loss": 2.2821, + "step": 6650 + }, + { + "epoch": 0.2813689821473898, + "grad_norm": 0.25638794898986816, + "learning_rate": 0.001, + "loss": 2.7701, + "step": 6651 + }, + { + "epoch": 0.28141128691090617, + "grad_norm": 0.275796115398407, + "learning_rate": 0.001, + "loss": 2.6728, + "step": 6652 + }, + { + "epoch": 0.2814535916744225, + "grad_norm": 0.2530985176563263, + "learning_rate": 0.001, + "loss": 1.9915, + "step": 6653 + }, + { + "epoch": 0.28149589643793893, + "grad_norm": 0.1915893405675888, + "learning_rate": 0.001, + "loss": 1.9204, + "step": 6654 + }, + { + "epoch": 0.2815382012014553, + "grad_norm": 2.445066213607788, + "learning_rate": 0.001, + "loss": 2.5217, + "step": 6655 + }, + { + "epoch": 0.28158050596497164, + "grad_norm": 3.7785122394561768, + "learning_rate": 0.001, + "loss": 1.6211, + "step": 6656 + }, + { + "epoch": 0.28162281072848805, + "grad_norm": 0.21273264288902283, + "learning_rate": 0.001, + "loss": 1.958, + "step": 6657 + }, + { + "epoch": 0.2816651154920044, + "grad_norm": 0.29316845536231995, + "learning_rate": 0.001, + "loss": 2.8814, + "step": 6658 + }, + { + "epoch": 0.28170742025552076, + "grad_norm": 16.398019790649414, + "learning_rate": 0.001, + "loss": 2.0424, + "step": 6659 + }, + { + "epoch": 0.28174972501903717, + "grad_norm": 0.5399495363235474, + "learning_rate": 0.001, + "loss": 3.4691, + "step": 6660 + }, + { + "epoch": 0.2817920297825535, + "grad_norm": 0.29314282536506653, + "learning_rate": 0.001, + "loss": 1.9568, + "step": 6661 + }, + { + "epoch": 0.2818343345460699, + "grad_norm": 0.27456262707710266, + "learning_rate": 0.001, + "loss": 2.6013, + "step": 6662 + }, + { + "epoch": 0.2818766393095863, + "grad_norm": 0.5269389152526855, + "learning_rate": 0.001, + "loss": 2.4911, + "step": 6663 + }, + { + "epoch": 0.28191894407310264, + "grad_norm": 0.5170745849609375, + "learning_rate": 0.001, + "loss": 3.7857, + "step": 6664 + }, + { + "epoch": 0.281961248836619, + "grad_norm": 0.21741123497486115, + "learning_rate": 0.001, + "loss": 2.2259, + "step": 6665 + }, + { + "epoch": 0.28200355360013535, + "grad_norm": 0.19433678686618805, + "learning_rate": 0.001, + "loss": 2.3927, + "step": 6666 + }, + { + "epoch": 0.28204585836365176, + "grad_norm": 0.2336796671152115, + "learning_rate": 0.001, + "loss": 2.2329, + "step": 6667 + }, + { + "epoch": 0.2820881631271681, + "grad_norm": 0.3010079264640808, + "learning_rate": 0.001, + "loss": 2.8767, + "step": 6668 + }, + { + "epoch": 0.28213046789068447, + "grad_norm": 23.44135856628418, + "learning_rate": 0.001, + "loss": 2.6002, + "step": 6669 + }, + { + "epoch": 0.2821727726542009, + "grad_norm": 0.3416197597980499, + "learning_rate": 0.001, + "loss": 2.3501, + "step": 6670 + }, + { + "epoch": 0.28221507741771723, + "grad_norm": 0.20520153641700745, + "learning_rate": 0.001, + "loss": 1.875, + "step": 6671 + }, + { + "epoch": 0.2822573821812336, + "grad_norm": 0.2823079526424408, + "learning_rate": 0.001, + "loss": 2.699, + "step": 6672 + }, + { + "epoch": 0.28229968694475, + "grad_norm": 0.23430481553077698, + "learning_rate": 0.001, + "loss": 3.0939, + "step": 6673 + }, + { + "epoch": 0.28234199170826635, + "grad_norm": 0.2834431231021881, + "learning_rate": 0.001, + "loss": 2.2496, + "step": 6674 + }, + { + "epoch": 0.2823842964717827, + "grad_norm": 0.5793172717094421, + "learning_rate": 0.001, + "loss": 2.2915, + "step": 6675 + }, + { + "epoch": 0.2824266012352991, + "grad_norm": 0.4027354121208191, + "learning_rate": 0.001, + "loss": 2.1231, + "step": 6676 + }, + { + "epoch": 0.28246890599881547, + "grad_norm": 0.24830132722854614, + "learning_rate": 0.001, + "loss": 1.8969, + "step": 6677 + }, + { + "epoch": 0.2825112107623318, + "grad_norm": 0.27415531873703003, + "learning_rate": 0.001, + "loss": 2.1313, + "step": 6678 + }, + { + "epoch": 0.28255351552584823, + "grad_norm": 0.21020923554897308, + "learning_rate": 0.001, + "loss": 1.6612, + "step": 6679 + }, + { + "epoch": 0.2825958202893646, + "grad_norm": 0.4596908986568451, + "learning_rate": 0.001, + "loss": 3.1641, + "step": 6680 + }, + { + "epoch": 0.28263812505288094, + "grad_norm": 0.2096545398235321, + "learning_rate": 0.001, + "loss": 2.3925, + "step": 6681 + }, + { + "epoch": 0.28268042981639735, + "grad_norm": 0.21853958070278168, + "learning_rate": 0.001, + "loss": 1.7605, + "step": 6682 + }, + { + "epoch": 0.2827227345799137, + "grad_norm": 0.2005593180656433, + "learning_rate": 0.001, + "loss": 2.0397, + "step": 6683 + }, + { + "epoch": 0.28276503934343006, + "grad_norm": 9.324366569519043, + "learning_rate": 0.001, + "loss": 1.8011, + "step": 6684 + }, + { + "epoch": 0.28280734410694647, + "grad_norm": 0.23955844342708588, + "learning_rate": 0.001, + "loss": 2.4135, + "step": 6685 + }, + { + "epoch": 0.2828496488704628, + "grad_norm": 0.240536168217659, + "learning_rate": 0.001, + "loss": 2.2006, + "step": 6686 + }, + { + "epoch": 0.2828919536339792, + "grad_norm": 0.2282586544752121, + "learning_rate": 0.001, + "loss": 2.4402, + "step": 6687 + }, + { + "epoch": 0.28293425839749553, + "grad_norm": 0.23527540266513824, + "learning_rate": 0.001, + "loss": 2.5282, + "step": 6688 + }, + { + "epoch": 0.28297656316101194, + "grad_norm": 0.2016860693693161, + "learning_rate": 0.001, + "loss": 2.9175, + "step": 6689 + }, + { + "epoch": 0.2830188679245283, + "grad_norm": 0.46961989998817444, + "learning_rate": 0.001, + "loss": 1.9682, + "step": 6690 + }, + { + "epoch": 0.28306117268804465, + "grad_norm": 3.572152614593506, + "learning_rate": 0.001, + "loss": 2.3511, + "step": 6691 + }, + { + "epoch": 0.28310347745156106, + "grad_norm": 0.18426565825939178, + "learning_rate": 0.001, + "loss": 2.563, + "step": 6692 + }, + { + "epoch": 0.2831457822150774, + "grad_norm": 0.20007674396038055, + "learning_rate": 0.001, + "loss": 2.182, + "step": 6693 + }, + { + "epoch": 0.28318808697859377, + "grad_norm": 0.2859644591808319, + "learning_rate": 0.001, + "loss": 2.4287, + "step": 6694 + }, + { + "epoch": 0.2832303917421102, + "grad_norm": 0.2305973470211029, + "learning_rate": 0.001, + "loss": 2.3868, + "step": 6695 + }, + { + "epoch": 0.28327269650562653, + "grad_norm": 0.21478715538978577, + "learning_rate": 0.001, + "loss": 2.2786, + "step": 6696 + }, + { + "epoch": 0.2833150012691429, + "grad_norm": 0.2607964277267456, + "learning_rate": 0.001, + "loss": 2.7508, + "step": 6697 + }, + { + "epoch": 0.2833573060326593, + "grad_norm": 0.2294250726699829, + "learning_rate": 0.001, + "loss": 1.9693, + "step": 6698 + }, + { + "epoch": 0.28339961079617565, + "grad_norm": 0.382492333650589, + "learning_rate": 0.001, + "loss": 3.9725, + "step": 6699 + }, + { + "epoch": 0.283441915559692, + "grad_norm": 0.2415558397769928, + "learning_rate": 0.001, + "loss": 2.6939, + "step": 6700 + }, + { + "epoch": 0.2834842203232084, + "grad_norm": 0.18394537270069122, + "learning_rate": 0.001, + "loss": 2.3632, + "step": 6701 + }, + { + "epoch": 0.28352652508672477, + "grad_norm": 0.2569441795349121, + "learning_rate": 0.001, + "loss": 2.3462, + "step": 6702 + }, + { + "epoch": 0.2835688298502411, + "grad_norm": 0.19975893199443817, + "learning_rate": 0.001, + "loss": 1.6878, + "step": 6703 + }, + { + "epoch": 0.28361113461375753, + "grad_norm": 2.8916850090026855, + "learning_rate": 0.001, + "loss": 1.7501, + "step": 6704 + }, + { + "epoch": 0.2836534393772739, + "grad_norm": 0.24953538179397583, + "learning_rate": 0.001, + "loss": 2.7791, + "step": 6705 + }, + { + "epoch": 0.28369574414079024, + "grad_norm": 0.6636295318603516, + "learning_rate": 0.001, + "loss": 2.2578, + "step": 6706 + }, + { + "epoch": 0.28373804890430665, + "grad_norm": 2.3027749061584473, + "learning_rate": 0.001, + "loss": 1.9202, + "step": 6707 + }, + { + "epoch": 0.283780353667823, + "grad_norm": 0.17140471935272217, + "learning_rate": 0.001, + "loss": 1.9173, + "step": 6708 + }, + { + "epoch": 0.28382265843133936, + "grad_norm": 0.1749950796365738, + "learning_rate": 0.001, + "loss": 1.5205, + "step": 6709 + }, + { + "epoch": 0.28386496319485577, + "grad_norm": 0.8592566251754761, + "learning_rate": 0.001, + "loss": 3.4644, + "step": 6710 + }, + { + "epoch": 0.2839072679583721, + "grad_norm": 0.17556998133659363, + "learning_rate": 0.001, + "loss": 1.9657, + "step": 6711 + }, + { + "epoch": 0.2839495727218885, + "grad_norm": 0.2964968681335449, + "learning_rate": 0.001, + "loss": 2.8859, + "step": 6712 + }, + { + "epoch": 0.28399187748540483, + "grad_norm": 0.6098200678825378, + "learning_rate": 0.001, + "loss": 1.7954, + "step": 6713 + }, + { + "epoch": 0.28403418224892124, + "grad_norm": 0.43372607231140137, + "learning_rate": 0.001, + "loss": 3.3326, + "step": 6714 + }, + { + "epoch": 0.2840764870124376, + "grad_norm": 0.4292704463005066, + "learning_rate": 0.001, + "loss": 2.5526, + "step": 6715 + }, + { + "epoch": 0.28411879177595395, + "grad_norm": 0.21659818291664124, + "learning_rate": 0.001, + "loss": 1.9127, + "step": 6716 + }, + { + "epoch": 0.28416109653947036, + "grad_norm": 0.4214901030063629, + "learning_rate": 0.001, + "loss": 1.7826, + "step": 6717 + }, + { + "epoch": 0.2842034013029867, + "grad_norm": 0.3387224078178406, + "learning_rate": 0.001, + "loss": 2.7385, + "step": 6718 + }, + { + "epoch": 0.28424570606650307, + "grad_norm": 0.5583015084266663, + "learning_rate": 0.001, + "loss": 3.1142, + "step": 6719 + }, + { + "epoch": 0.2842880108300195, + "grad_norm": 0.19958928227424622, + "learning_rate": 0.001, + "loss": 3.1751, + "step": 6720 + }, + { + "epoch": 0.28433031559353583, + "grad_norm": 0.22173653542995453, + "learning_rate": 0.001, + "loss": 3.0606, + "step": 6721 + }, + { + "epoch": 0.2843726203570522, + "grad_norm": 0.24500377476215363, + "learning_rate": 0.001, + "loss": 2.0964, + "step": 6722 + }, + { + "epoch": 0.2844149251205686, + "grad_norm": 0.21380500495433807, + "learning_rate": 0.001, + "loss": 2.1937, + "step": 6723 + }, + { + "epoch": 0.28445722988408495, + "grad_norm": 0.21296393871307373, + "learning_rate": 0.001, + "loss": 2.1262, + "step": 6724 + }, + { + "epoch": 0.2844995346476013, + "grad_norm": 0.19265830516815186, + "learning_rate": 0.001, + "loss": 2.587, + "step": 6725 + }, + { + "epoch": 0.2845418394111177, + "grad_norm": 0.19577600061893463, + "learning_rate": 0.001, + "loss": 1.7467, + "step": 6726 + }, + { + "epoch": 0.28458414417463407, + "grad_norm": 0.25337207317352295, + "learning_rate": 0.001, + "loss": 2.1288, + "step": 6727 + }, + { + "epoch": 0.2846264489381504, + "grad_norm": 0.4300183653831482, + "learning_rate": 0.001, + "loss": 2.7892, + "step": 6728 + }, + { + "epoch": 0.28466875370166683, + "grad_norm": 0.8722933530807495, + "learning_rate": 0.001, + "loss": 2.3566, + "step": 6729 + }, + { + "epoch": 0.2847110584651832, + "grad_norm": 0.17936961352825165, + "learning_rate": 0.001, + "loss": 2.3686, + "step": 6730 + }, + { + "epoch": 0.28475336322869954, + "grad_norm": 0.18349401652812958, + "learning_rate": 0.001, + "loss": 1.7784, + "step": 6731 + }, + { + "epoch": 0.28479566799221595, + "grad_norm": 1.0016671419143677, + "learning_rate": 0.001, + "loss": 1.9157, + "step": 6732 + }, + { + "epoch": 0.2848379727557323, + "grad_norm": 0.14717590808868408, + "learning_rate": 0.001, + "loss": 1.3068, + "step": 6733 + }, + { + "epoch": 0.28488027751924866, + "grad_norm": 4.4579243659973145, + "learning_rate": 0.001, + "loss": 2.4236, + "step": 6734 + }, + { + "epoch": 0.284922582282765, + "grad_norm": 0.2059265822172165, + "learning_rate": 0.001, + "loss": 1.9139, + "step": 6735 + }, + { + "epoch": 0.2849648870462814, + "grad_norm": 0.2114630937576294, + "learning_rate": 0.001, + "loss": 3.1348, + "step": 6736 + }, + { + "epoch": 0.2850071918097978, + "grad_norm": 0.2629503011703491, + "learning_rate": 0.001, + "loss": 2.6543, + "step": 6737 + }, + { + "epoch": 0.28504949657331413, + "grad_norm": 2.027836799621582, + "learning_rate": 0.001, + "loss": 2.1327, + "step": 6738 + }, + { + "epoch": 0.28509180133683054, + "grad_norm": 0.9196799993515015, + "learning_rate": 0.001, + "loss": 2.5672, + "step": 6739 + }, + { + "epoch": 0.2851341061003469, + "grad_norm": 5.23358154296875, + "learning_rate": 0.001, + "loss": 2.0236, + "step": 6740 + }, + { + "epoch": 0.28517641086386325, + "grad_norm": 0.17935171723365784, + "learning_rate": 0.001, + "loss": 2.6343, + "step": 6741 + }, + { + "epoch": 0.28521871562737966, + "grad_norm": 1.6367027759552002, + "learning_rate": 0.001, + "loss": 1.7514, + "step": 6742 + }, + { + "epoch": 0.285261020390896, + "grad_norm": 0.20048488676548004, + "learning_rate": 0.001, + "loss": 2.6243, + "step": 6743 + }, + { + "epoch": 0.28530332515441237, + "grad_norm": 0.37594085931777954, + "learning_rate": 0.001, + "loss": 3.2875, + "step": 6744 + }, + { + "epoch": 0.2853456299179288, + "grad_norm": 0.18674291670322418, + "learning_rate": 0.001, + "loss": 1.8737, + "step": 6745 + }, + { + "epoch": 0.28538793468144513, + "grad_norm": 0.40229612588882446, + "learning_rate": 0.001, + "loss": 2.3867, + "step": 6746 + }, + { + "epoch": 0.2854302394449615, + "grad_norm": 2.3481996059417725, + "learning_rate": 0.001, + "loss": 3.3213, + "step": 6747 + }, + { + "epoch": 0.2854725442084779, + "grad_norm": 0.23974232375621796, + "learning_rate": 0.001, + "loss": 3.0777, + "step": 6748 + }, + { + "epoch": 0.28551484897199425, + "grad_norm": 0.22430221736431122, + "learning_rate": 0.001, + "loss": 2.5951, + "step": 6749 + }, + { + "epoch": 0.2855571537355106, + "grad_norm": 1.6026118993759155, + "learning_rate": 0.001, + "loss": 2.3724, + "step": 6750 + }, + { + "epoch": 0.285599458499027, + "grad_norm": 2.978184938430786, + "learning_rate": 0.001, + "loss": 2.1911, + "step": 6751 + }, + { + "epoch": 0.28564176326254337, + "grad_norm": 11.393836975097656, + "learning_rate": 0.001, + "loss": 3.553, + "step": 6752 + }, + { + "epoch": 0.2856840680260597, + "grad_norm": 0.254300057888031, + "learning_rate": 0.001, + "loss": 2.9947, + "step": 6753 + }, + { + "epoch": 0.28572637278957613, + "grad_norm": 0.5926849246025085, + "learning_rate": 0.001, + "loss": 2.2195, + "step": 6754 + }, + { + "epoch": 0.2857686775530925, + "grad_norm": 6.964447975158691, + "learning_rate": 0.001, + "loss": 2.2015, + "step": 6755 + }, + { + "epoch": 0.28581098231660884, + "grad_norm": 4.038476943969727, + "learning_rate": 0.001, + "loss": 2.186, + "step": 6756 + }, + { + "epoch": 0.2858532870801252, + "grad_norm": 3.579437494277954, + "learning_rate": 0.001, + "loss": 2.3797, + "step": 6757 + }, + { + "epoch": 0.2858955918436416, + "grad_norm": 4.091229438781738, + "learning_rate": 0.001, + "loss": 2.5496, + "step": 6758 + }, + { + "epoch": 0.28593789660715796, + "grad_norm": 3.153745412826538, + "learning_rate": 0.001, + "loss": 2.2255, + "step": 6759 + }, + { + "epoch": 0.2859802013706743, + "grad_norm": 1.0637880563735962, + "learning_rate": 0.001, + "loss": 2.5076, + "step": 6760 + }, + { + "epoch": 0.2860225061341907, + "grad_norm": 1.7310835123062134, + "learning_rate": 0.001, + "loss": 2.6743, + "step": 6761 + }, + { + "epoch": 0.2860648108977071, + "grad_norm": 0.3798615336418152, + "learning_rate": 0.001, + "loss": 2.1774, + "step": 6762 + }, + { + "epoch": 0.28610711566122343, + "grad_norm": 0.35704922676086426, + "learning_rate": 0.001, + "loss": 1.6369, + "step": 6763 + }, + { + "epoch": 0.28614942042473984, + "grad_norm": 0.22196084260940552, + "learning_rate": 0.001, + "loss": 2.2773, + "step": 6764 + }, + { + "epoch": 0.2861917251882562, + "grad_norm": 0.2564329504966736, + "learning_rate": 0.001, + "loss": 2.0616, + "step": 6765 + }, + { + "epoch": 0.28623402995177255, + "grad_norm": 0.662453830242157, + "learning_rate": 0.001, + "loss": 2.425, + "step": 6766 + }, + { + "epoch": 0.28627633471528896, + "grad_norm": 0.275669127702713, + "learning_rate": 0.001, + "loss": 2.1394, + "step": 6767 + }, + { + "epoch": 0.2863186394788053, + "grad_norm": 1.1164103746414185, + "learning_rate": 0.001, + "loss": 3.1801, + "step": 6768 + }, + { + "epoch": 0.28636094424232167, + "grad_norm": 0.28561368584632874, + "learning_rate": 0.001, + "loss": 3.4721, + "step": 6769 + }, + { + "epoch": 0.2864032490058381, + "grad_norm": 0.44134315848350525, + "learning_rate": 0.001, + "loss": 2.4825, + "step": 6770 + }, + { + "epoch": 0.28644555376935443, + "grad_norm": 0.6540394425392151, + "learning_rate": 0.001, + "loss": 2.4181, + "step": 6771 + }, + { + "epoch": 0.2864878585328708, + "grad_norm": 0.9629650712013245, + "learning_rate": 0.001, + "loss": 1.801, + "step": 6772 + }, + { + "epoch": 0.2865301632963872, + "grad_norm": 1.2992072105407715, + "learning_rate": 0.001, + "loss": 2.4131, + "step": 6773 + }, + { + "epoch": 0.28657246805990355, + "grad_norm": 0.8524438142776489, + "learning_rate": 0.001, + "loss": 2.1917, + "step": 6774 + }, + { + "epoch": 0.2866147728234199, + "grad_norm": 18.05820083618164, + "learning_rate": 0.001, + "loss": 3.5782, + "step": 6775 + }, + { + "epoch": 0.2866570775869363, + "grad_norm": 0.2133456915616989, + "learning_rate": 0.001, + "loss": 2.8237, + "step": 6776 + }, + { + "epoch": 0.28669938235045267, + "grad_norm": 29.985143661499023, + "learning_rate": 0.001, + "loss": 1.9714, + "step": 6777 + }, + { + "epoch": 0.286741687113969, + "grad_norm": 0.23235024511814117, + "learning_rate": 0.001, + "loss": 2.2575, + "step": 6778 + }, + { + "epoch": 0.2867839918774854, + "grad_norm": 0.2286916971206665, + "learning_rate": 0.001, + "loss": 2.4285, + "step": 6779 + }, + { + "epoch": 0.2868262966410018, + "grad_norm": 0.20938080549240112, + "learning_rate": 0.001, + "loss": 1.8264, + "step": 6780 + }, + { + "epoch": 0.28686860140451814, + "grad_norm": 0.1856824904680252, + "learning_rate": 0.001, + "loss": 2.2964, + "step": 6781 + }, + { + "epoch": 0.2869109061680345, + "grad_norm": 0.18935956060886383, + "learning_rate": 0.001, + "loss": 2.8474, + "step": 6782 + }, + { + "epoch": 0.2869532109315509, + "grad_norm": 0.24744349718093872, + "learning_rate": 0.001, + "loss": 2.2822, + "step": 6783 + }, + { + "epoch": 0.28699551569506726, + "grad_norm": 0.26295721530914307, + "learning_rate": 0.001, + "loss": 2.3905, + "step": 6784 + }, + { + "epoch": 0.2870378204585836, + "grad_norm": 0.23481005430221558, + "learning_rate": 0.001, + "loss": 2.0109, + "step": 6785 + }, + { + "epoch": 0.2870801252221, + "grad_norm": 0.17823268473148346, + "learning_rate": 0.001, + "loss": 2.6713, + "step": 6786 + }, + { + "epoch": 0.2871224299856164, + "grad_norm": 0.28128448128700256, + "learning_rate": 0.001, + "loss": 2.5553, + "step": 6787 + }, + { + "epoch": 0.28716473474913273, + "grad_norm": 0.23149077594280243, + "learning_rate": 0.001, + "loss": 1.8728, + "step": 6788 + }, + { + "epoch": 0.28720703951264914, + "grad_norm": 0.21473278105258942, + "learning_rate": 0.001, + "loss": 2.493, + "step": 6789 + }, + { + "epoch": 0.2872493442761655, + "grad_norm": 0.19875210523605347, + "learning_rate": 0.001, + "loss": 1.7056, + "step": 6790 + }, + { + "epoch": 0.28729164903968185, + "grad_norm": 0.18944133818149567, + "learning_rate": 0.001, + "loss": 2.243, + "step": 6791 + }, + { + "epoch": 0.28733395380319826, + "grad_norm": 0.17879356443881989, + "learning_rate": 0.001, + "loss": 1.9253, + "step": 6792 + }, + { + "epoch": 0.2873762585667146, + "grad_norm": 0.39051762223243713, + "learning_rate": 0.001, + "loss": 2.477, + "step": 6793 + }, + { + "epoch": 0.28741856333023097, + "grad_norm": 0.2168491631746292, + "learning_rate": 0.001, + "loss": 2.2378, + "step": 6794 + }, + { + "epoch": 0.2874608680937474, + "grad_norm": 0.20587210357189178, + "learning_rate": 0.001, + "loss": 2.9123, + "step": 6795 + }, + { + "epoch": 0.28750317285726373, + "grad_norm": 0.16420647501945496, + "learning_rate": 0.001, + "loss": 1.4775, + "step": 6796 + }, + { + "epoch": 0.2875454776207801, + "grad_norm": 0.18433904647827148, + "learning_rate": 0.001, + "loss": 2.6196, + "step": 6797 + }, + { + "epoch": 0.2875877823842965, + "grad_norm": 0.18781892955303192, + "learning_rate": 0.001, + "loss": 2.1033, + "step": 6798 + }, + { + "epoch": 0.28763008714781285, + "grad_norm": 0.17372409999370575, + "learning_rate": 0.001, + "loss": 2.12, + "step": 6799 + }, + { + "epoch": 0.2876723919113292, + "grad_norm": 0.18070591986179352, + "learning_rate": 0.001, + "loss": 2.02, + "step": 6800 + }, + { + "epoch": 0.28771469667484556, + "grad_norm": 0.1879878044128418, + "learning_rate": 0.001, + "loss": 1.6728, + "step": 6801 + }, + { + "epoch": 0.28775700143836197, + "grad_norm": 0.18465609848499298, + "learning_rate": 0.001, + "loss": 2.1977, + "step": 6802 + }, + { + "epoch": 0.2877993062018783, + "grad_norm": 0.19959770143032074, + "learning_rate": 0.001, + "loss": 2.1808, + "step": 6803 + }, + { + "epoch": 0.2878416109653947, + "grad_norm": 0.2111247032880783, + "learning_rate": 0.001, + "loss": 2.2085, + "step": 6804 + }, + { + "epoch": 0.2878839157289111, + "grad_norm": 0.25699108839035034, + "learning_rate": 0.001, + "loss": 2.784, + "step": 6805 + }, + { + "epoch": 0.28792622049242744, + "grad_norm": 0.17606748640537262, + "learning_rate": 0.001, + "loss": 1.9087, + "step": 6806 + }, + { + "epoch": 0.2879685252559438, + "grad_norm": 0.17687219381332397, + "learning_rate": 0.001, + "loss": 1.848, + "step": 6807 + }, + { + "epoch": 0.2880108300194602, + "grad_norm": 0.22173765301704407, + "learning_rate": 0.001, + "loss": 2.3972, + "step": 6808 + }, + { + "epoch": 0.28805313478297656, + "grad_norm": 0.18182872235774994, + "learning_rate": 0.001, + "loss": 1.7431, + "step": 6809 + }, + { + "epoch": 0.2880954395464929, + "grad_norm": 0.25314095616340637, + "learning_rate": 0.001, + "loss": 1.7966, + "step": 6810 + }, + { + "epoch": 0.2881377443100093, + "grad_norm": 0.20727166533470154, + "learning_rate": 0.001, + "loss": 2.3798, + "step": 6811 + }, + { + "epoch": 0.2881800490735257, + "grad_norm": 0.9613415002822876, + "learning_rate": 0.001, + "loss": 2.2462, + "step": 6812 + }, + { + "epoch": 0.28822235383704203, + "grad_norm": 1.1291375160217285, + "learning_rate": 0.001, + "loss": 2.3434, + "step": 6813 + }, + { + "epoch": 0.28826465860055844, + "grad_norm": 0.1664966493844986, + "learning_rate": 0.001, + "loss": 1.8012, + "step": 6814 + }, + { + "epoch": 0.2883069633640748, + "grad_norm": 0.1893632560968399, + "learning_rate": 0.001, + "loss": 1.7286, + "step": 6815 + }, + { + "epoch": 0.28834926812759115, + "grad_norm": 0.2506442070007324, + "learning_rate": 0.001, + "loss": 2.2874, + "step": 6816 + }, + { + "epoch": 0.28839157289110756, + "grad_norm": 0.20881643891334534, + "learning_rate": 0.001, + "loss": 2.5897, + "step": 6817 + }, + { + "epoch": 0.2884338776546239, + "grad_norm": 3.2081472873687744, + "learning_rate": 0.001, + "loss": 3.7027, + "step": 6818 + }, + { + "epoch": 0.28847618241814027, + "grad_norm": 0.17766347527503967, + "learning_rate": 0.001, + "loss": 2.7331, + "step": 6819 + }, + { + "epoch": 0.2885184871816567, + "grad_norm": 0.1828794777393341, + "learning_rate": 0.001, + "loss": 1.878, + "step": 6820 + }, + { + "epoch": 0.28856079194517303, + "grad_norm": 0.2575604021549225, + "learning_rate": 0.001, + "loss": 2.9143, + "step": 6821 + }, + { + "epoch": 0.2886030967086894, + "grad_norm": 0.21907007694244385, + "learning_rate": 0.001, + "loss": 2.0439, + "step": 6822 + }, + { + "epoch": 0.28864540147220574, + "grad_norm": 0.19140586256980896, + "learning_rate": 0.001, + "loss": 1.9638, + "step": 6823 + }, + { + "epoch": 0.28868770623572215, + "grad_norm": 0.829940915107727, + "learning_rate": 0.001, + "loss": 2.1286, + "step": 6824 + }, + { + "epoch": 0.2887300109992385, + "grad_norm": 0.1640363186597824, + "learning_rate": 0.001, + "loss": 1.6881, + "step": 6825 + }, + { + "epoch": 0.28877231576275486, + "grad_norm": 0.22927752137184143, + "learning_rate": 0.001, + "loss": 2.1967, + "step": 6826 + }, + { + "epoch": 0.28881462052627127, + "grad_norm": 0.21113528311252594, + "learning_rate": 0.001, + "loss": 2.4409, + "step": 6827 + }, + { + "epoch": 0.2888569252897876, + "grad_norm": 0.1978234350681305, + "learning_rate": 0.001, + "loss": 2.0009, + "step": 6828 + }, + { + "epoch": 0.288899230053304, + "grad_norm": 0.43694573640823364, + "learning_rate": 0.001, + "loss": 2.7192, + "step": 6829 + }, + { + "epoch": 0.2889415348168204, + "grad_norm": 0.20024238526821136, + "learning_rate": 0.001, + "loss": 2.2269, + "step": 6830 + }, + { + "epoch": 0.28898383958033674, + "grad_norm": 0.17758913338184357, + "learning_rate": 0.001, + "loss": 1.9114, + "step": 6831 + }, + { + "epoch": 0.2890261443438531, + "grad_norm": 0.2091837078332901, + "learning_rate": 0.001, + "loss": 2.1373, + "step": 6832 + }, + { + "epoch": 0.2890684491073695, + "grad_norm": 0.28614622354507446, + "learning_rate": 0.001, + "loss": 2.3553, + "step": 6833 + }, + { + "epoch": 0.28911075387088586, + "grad_norm": 0.22399480640888214, + "learning_rate": 0.001, + "loss": 2.0642, + "step": 6834 + }, + { + "epoch": 0.2891530586344022, + "grad_norm": 0.24540039896965027, + "learning_rate": 0.001, + "loss": 1.8619, + "step": 6835 + }, + { + "epoch": 0.2891953633979186, + "grad_norm": 0.6220614314079285, + "learning_rate": 0.001, + "loss": 1.9019, + "step": 6836 + }, + { + "epoch": 0.289237668161435, + "grad_norm": 0.15628793835639954, + "learning_rate": 0.001, + "loss": 2.5514, + "step": 6837 + }, + { + "epoch": 0.28927997292495133, + "grad_norm": 2.0576629638671875, + "learning_rate": 0.001, + "loss": 2.5113, + "step": 6838 + }, + { + "epoch": 0.28932227768846774, + "grad_norm": 0.19702525436878204, + "learning_rate": 0.001, + "loss": 2.2104, + "step": 6839 + }, + { + "epoch": 0.2893645824519841, + "grad_norm": 0.20886345207691193, + "learning_rate": 0.001, + "loss": 3.0677, + "step": 6840 + }, + { + "epoch": 0.28940688721550045, + "grad_norm": 0.18030396103858948, + "learning_rate": 0.001, + "loss": 2.0693, + "step": 6841 + }, + { + "epoch": 0.28944919197901686, + "grad_norm": 0.16617818176746368, + "learning_rate": 0.001, + "loss": 1.7641, + "step": 6842 + }, + { + "epoch": 0.2894914967425332, + "grad_norm": 0.22310234606266022, + "learning_rate": 0.001, + "loss": 1.9379, + "step": 6843 + }, + { + "epoch": 0.28953380150604957, + "grad_norm": 0.19781459867954254, + "learning_rate": 0.001, + "loss": 1.9957, + "step": 6844 + }, + { + "epoch": 0.289576106269566, + "grad_norm": 0.9519227743148804, + "learning_rate": 0.001, + "loss": 2.2693, + "step": 6845 + }, + { + "epoch": 0.28961841103308233, + "grad_norm": 0.69413161277771, + "learning_rate": 0.001, + "loss": 2.4312, + "step": 6846 + }, + { + "epoch": 0.2896607157965987, + "grad_norm": 0.23835915327072144, + "learning_rate": 0.001, + "loss": 2.2305, + "step": 6847 + }, + { + "epoch": 0.28970302056011504, + "grad_norm": 0.16248169541358948, + "learning_rate": 0.001, + "loss": 1.9366, + "step": 6848 + }, + { + "epoch": 0.28974532532363145, + "grad_norm": 1.342553973197937, + "learning_rate": 0.001, + "loss": 2.4784, + "step": 6849 + }, + { + "epoch": 0.2897876300871478, + "grad_norm": 0.2539532482624054, + "learning_rate": 0.001, + "loss": 2.0127, + "step": 6850 + }, + { + "epoch": 0.28982993485066416, + "grad_norm": 0.3332529664039612, + "learning_rate": 0.001, + "loss": 2.0516, + "step": 6851 + }, + { + "epoch": 0.28987223961418057, + "grad_norm": 0.25316402316093445, + "learning_rate": 0.001, + "loss": 2.1151, + "step": 6852 + }, + { + "epoch": 0.2899145443776969, + "grad_norm": 0.2206842005252838, + "learning_rate": 0.001, + "loss": 2.1286, + "step": 6853 + }, + { + "epoch": 0.2899568491412133, + "grad_norm": 0.18992824852466583, + "learning_rate": 0.001, + "loss": 2.1739, + "step": 6854 + }, + { + "epoch": 0.2899991539047297, + "grad_norm": 0.22509132325649261, + "learning_rate": 0.001, + "loss": 1.8315, + "step": 6855 + }, + { + "epoch": 0.29004145866824604, + "grad_norm": 0.22096151113510132, + "learning_rate": 0.001, + "loss": 2.0061, + "step": 6856 + }, + { + "epoch": 0.2900837634317624, + "grad_norm": 0.27027440071105957, + "learning_rate": 0.001, + "loss": 2.488, + "step": 6857 + }, + { + "epoch": 0.2901260681952788, + "grad_norm": 0.2823043465614319, + "learning_rate": 0.001, + "loss": 2.5708, + "step": 6858 + }, + { + "epoch": 0.29016837295879516, + "grad_norm": 0.2073088437318802, + "learning_rate": 0.001, + "loss": 2.6952, + "step": 6859 + }, + { + "epoch": 0.2902106777223115, + "grad_norm": 0.18988396227359772, + "learning_rate": 0.001, + "loss": 2.1142, + "step": 6860 + }, + { + "epoch": 0.2902529824858279, + "grad_norm": 0.1997257024049759, + "learning_rate": 0.001, + "loss": 2.0511, + "step": 6861 + }, + { + "epoch": 0.2902952872493443, + "grad_norm": 0.17761564254760742, + "learning_rate": 0.001, + "loss": 1.7944, + "step": 6862 + }, + { + "epoch": 0.29033759201286063, + "grad_norm": 0.258060485124588, + "learning_rate": 0.001, + "loss": 3.3614, + "step": 6863 + }, + { + "epoch": 0.29037989677637704, + "grad_norm": 0.17017389833927155, + "learning_rate": 0.001, + "loss": 1.5792, + "step": 6864 + }, + { + "epoch": 0.2904222015398934, + "grad_norm": 0.4084915518760681, + "learning_rate": 0.001, + "loss": 1.8688, + "step": 6865 + }, + { + "epoch": 0.29046450630340975, + "grad_norm": 0.2108771950006485, + "learning_rate": 0.001, + "loss": 2.1209, + "step": 6866 + }, + { + "epoch": 0.29050681106692616, + "grad_norm": 0.42666342854499817, + "learning_rate": 0.001, + "loss": 2.7563, + "step": 6867 + }, + { + "epoch": 0.2905491158304425, + "grad_norm": 0.18676477670669556, + "learning_rate": 0.001, + "loss": 2.3573, + "step": 6868 + }, + { + "epoch": 0.29059142059395887, + "grad_norm": 0.586787760257721, + "learning_rate": 0.001, + "loss": 2.2616, + "step": 6869 + }, + { + "epoch": 0.2906337253574752, + "grad_norm": 2.2827725410461426, + "learning_rate": 0.001, + "loss": 1.948, + "step": 6870 + }, + { + "epoch": 0.29067603012099164, + "grad_norm": 0.1944393664598465, + "learning_rate": 0.001, + "loss": 2.7219, + "step": 6871 + }, + { + "epoch": 0.290718334884508, + "grad_norm": 2.946514844894409, + "learning_rate": 0.001, + "loss": 2.3146, + "step": 6872 + }, + { + "epoch": 0.29076063964802434, + "grad_norm": 0.410149484872818, + "learning_rate": 0.001, + "loss": 2.6918, + "step": 6873 + }, + { + "epoch": 0.29080294441154075, + "grad_norm": 0.18303263187408447, + "learning_rate": 0.001, + "loss": 1.4522, + "step": 6874 + }, + { + "epoch": 0.2908452491750571, + "grad_norm": 0.8593881726264954, + "learning_rate": 0.001, + "loss": 2.6251, + "step": 6875 + }, + { + "epoch": 0.29088755393857346, + "grad_norm": 2.545989751815796, + "learning_rate": 0.001, + "loss": 2.0312, + "step": 6876 + }, + { + "epoch": 0.29092985870208987, + "grad_norm": 0.30340850353240967, + "learning_rate": 0.001, + "loss": 2.2144, + "step": 6877 + }, + { + "epoch": 0.2909721634656062, + "grad_norm": 0.23748478293418884, + "learning_rate": 0.001, + "loss": 3.4675, + "step": 6878 + }, + { + "epoch": 0.2910144682291226, + "grad_norm": 18.90664291381836, + "learning_rate": 0.001, + "loss": 2.7135, + "step": 6879 + }, + { + "epoch": 0.291056772992639, + "grad_norm": 0.2496711015701294, + "learning_rate": 0.001, + "loss": 2.5996, + "step": 6880 + }, + { + "epoch": 0.29109907775615534, + "grad_norm": 1.5730305910110474, + "learning_rate": 0.001, + "loss": 3.1332, + "step": 6881 + }, + { + "epoch": 0.2911413825196717, + "grad_norm": 0.4019278585910797, + "learning_rate": 0.001, + "loss": 1.94, + "step": 6882 + }, + { + "epoch": 0.2911836872831881, + "grad_norm": 2.1472578048706055, + "learning_rate": 0.001, + "loss": 2.5725, + "step": 6883 + }, + { + "epoch": 0.29122599204670446, + "grad_norm": 0.42344650626182556, + "learning_rate": 0.001, + "loss": 2.5348, + "step": 6884 + }, + { + "epoch": 0.2912682968102208, + "grad_norm": 0.7056704759597778, + "learning_rate": 0.001, + "loss": 2.6225, + "step": 6885 + }, + { + "epoch": 0.2913106015737372, + "grad_norm": 0.6706409454345703, + "learning_rate": 0.001, + "loss": 2.3904, + "step": 6886 + }, + { + "epoch": 0.2913529063372536, + "grad_norm": 0.35459625720977783, + "learning_rate": 0.001, + "loss": 2.1607, + "step": 6887 + }, + { + "epoch": 0.29139521110076994, + "grad_norm": 0.3938932418823242, + "learning_rate": 0.001, + "loss": 2.4787, + "step": 6888 + }, + { + "epoch": 0.29143751586428635, + "grad_norm": 0.2852618992328644, + "learning_rate": 0.001, + "loss": 2.3739, + "step": 6889 + }, + { + "epoch": 0.2914798206278027, + "grad_norm": 1.7088735103607178, + "learning_rate": 0.001, + "loss": 2.6956, + "step": 6890 + }, + { + "epoch": 0.29152212539131905, + "grad_norm": 0.21869906783103943, + "learning_rate": 0.001, + "loss": 2.1744, + "step": 6891 + }, + { + "epoch": 0.2915644301548354, + "grad_norm": 0.40579238533973694, + "learning_rate": 0.001, + "loss": 2.3586, + "step": 6892 + }, + { + "epoch": 0.2916067349183518, + "grad_norm": 0.26919931173324585, + "learning_rate": 0.001, + "loss": 1.6718, + "step": 6893 + }, + { + "epoch": 0.29164903968186817, + "grad_norm": 0.33402884006500244, + "learning_rate": 0.001, + "loss": 3.2099, + "step": 6894 + }, + { + "epoch": 0.2916913444453845, + "grad_norm": 5.1144490242004395, + "learning_rate": 0.001, + "loss": 2.5433, + "step": 6895 + }, + { + "epoch": 0.29173364920890094, + "grad_norm": 27.068161010742188, + "learning_rate": 0.001, + "loss": 2.5207, + "step": 6896 + }, + { + "epoch": 0.2917759539724173, + "grad_norm": 1.7865883111953735, + "learning_rate": 0.001, + "loss": 3.3911, + "step": 6897 + }, + { + "epoch": 0.29181825873593364, + "grad_norm": 2.192460298538208, + "learning_rate": 0.001, + "loss": 2.4101, + "step": 6898 + }, + { + "epoch": 0.29186056349945005, + "grad_norm": 0.3550018072128296, + "learning_rate": 0.001, + "loss": 2.3135, + "step": 6899 + }, + { + "epoch": 0.2919028682629664, + "grad_norm": 0.24947704374790192, + "learning_rate": 0.001, + "loss": 3.6274, + "step": 6900 + }, + { + "epoch": 0.29194517302648276, + "grad_norm": 0.23705358803272247, + "learning_rate": 0.001, + "loss": 3.0381, + "step": 6901 + }, + { + "epoch": 0.2919874777899992, + "grad_norm": 1.2429789304733276, + "learning_rate": 0.001, + "loss": 2.1528, + "step": 6902 + }, + { + "epoch": 0.2920297825535155, + "grad_norm": 0.28556519746780396, + "learning_rate": 0.001, + "loss": 1.9076, + "step": 6903 + }, + { + "epoch": 0.2920720873170319, + "grad_norm": 3.0117995738983154, + "learning_rate": 0.001, + "loss": 2.4768, + "step": 6904 + }, + { + "epoch": 0.2921143920805483, + "grad_norm": 0.43369370698928833, + "learning_rate": 0.001, + "loss": 1.9007, + "step": 6905 + }, + { + "epoch": 0.29215669684406465, + "grad_norm": 0.2250383198261261, + "learning_rate": 0.001, + "loss": 2.0614, + "step": 6906 + }, + { + "epoch": 0.292199001607581, + "grad_norm": 0.26837220788002014, + "learning_rate": 0.001, + "loss": 2.5939, + "step": 6907 + }, + { + "epoch": 0.2922413063710974, + "grad_norm": 0.20127150416374207, + "learning_rate": 0.001, + "loss": 2.2609, + "step": 6908 + }, + { + "epoch": 0.29228361113461376, + "grad_norm": 0.9822574257850647, + "learning_rate": 0.001, + "loss": 1.8011, + "step": 6909 + }, + { + "epoch": 0.2923259158981301, + "grad_norm": 0.2298155575990677, + "learning_rate": 0.001, + "loss": 1.9496, + "step": 6910 + }, + { + "epoch": 0.2923682206616465, + "grad_norm": 1.8333818912506104, + "learning_rate": 0.001, + "loss": 2.3752, + "step": 6911 + }, + { + "epoch": 0.2924105254251629, + "grad_norm": 0.18054918944835663, + "learning_rate": 0.001, + "loss": 2.273, + "step": 6912 + }, + { + "epoch": 0.29245283018867924, + "grad_norm": 0.22334930300712585, + "learning_rate": 0.001, + "loss": 1.8855, + "step": 6913 + }, + { + "epoch": 0.2924951349521956, + "grad_norm": 0.22284521162509918, + "learning_rate": 0.001, + "loss": 1.9008, + "step": 6914 + }, + { + "epoch": 0.292537439715712, + "grad_norm": 0.18738651275634766, + "learning_rate": 0.001, + "loss": 3.3555, + "step": 6915 + }, + { + "epoch": 0.29257974447922835, + "grad_norm": 0.23615574836730957, + "learning_rate": 0.001, + "loss": 2.1617, + "step": 6916 + }, + { + "epoch": 0.2926220492427447, + "grad_norm": 0.19093096256256104, + "learning_rate": 0.001, + "loss": 2.0053, + "step": 6917 + }, + { + "epoch": 0.2926643540062611, + "grad_norm": 0.22282744944095612, + "learning_rate": 0.001, + "loss": 2.3443, + "step": 6918 + }, + { + "epoch": 0.2927066587697775, + "grad_norm": 0.35610735416412354, + "learning_rate": 0.001, + "loss": 2.5339, + "step": 6919 + }, + { + "epoch": 0.2927489635332938, + "grad_norm": 0.2231018990278244, + "learning_rate": 0.001, + "loss": 2.6499, + "step": 6920 + }, + { + "epoch": 0.29279126829681024, + "grad_norm": 9.339323043823242, + "learning_rate": 0.001, + "loss": 2.2003, + "step": 6921 + }, + { + "epoch": 0.2928335730603266, + "grad_norm": 0.27400538325309753, + "learning_rate": 0.001, + "loss": 2.139, + "step": 6922 + }, + { + "epoch": 0.29287587782384294, + "grad_norm": 1.792051076889038, + "learning_rate": 0.001, + "loss": 3.2322, + "step": 6923 + }, + { + "epoch": 0.29291818258735935, + "grad_norm": 0.20132683217525482, + "learning_rate": 0.001, + "loss": 1.7977, + "step": 6924 + }, + { + "epoch": 0.2929604873508757, + "grad_norm": 0.22404009103775024, + "learning_rate": 0.001, + "loss": 2.1195, + "step": 6925 + }, + { + "epoch": 0.29300279211439206, + "grad_norm": 0.32715481519699097, + "learning_rate": 0.001, + "loss": 2.158, + "step": 6926 + }, + { + "epoch": 0.2930450968779085, + "grad_norm": 0.4097978472709656, + "learning_rate": 0.001, + "loss": 2.2345, + "step": 6927 + }, + { + "epoch": 0.2930874016414248, + "grad_norm": 15.934835433959961, + "learning_rate": 0.001, + "loss": 2.2525, + "step": 6928 + }, + { + "epoch": 0.2931297064049412, + "grad_norm": 0.20492605865001678, + "learning_rate": 0.001, + "loss": 2.2868, + "step": 6929 + }, + { + "epoch": 0.2931720111684576, + "grad_norm": 0.22894705832004547, + "learning_rate": 0.001, + "loss": 3.1435, + "step": 6930 + }, + { + "epoch": 0.29321431593197395, + "grad_norm": 0.19995182752609253, + "learning_rate": 0.001, + "loss": 2.3582, + "step": 6931 + }, + { + "epoch": 0.2932566206954903, + "grad_norm": 13.218109130859375, + "learning_rate": 0.001, + "loss": 4.1725, + "step": 6932 + }, + { + "epoch": 0.2932989254590067, + "grad_norm": 151.23451232910156, + "learning_rate": 0.001, + "loss": 1.6978, + "step": 6933 + }, + { + "epoch": 0.29334123022252306, + "grad_norm": 0.6989339590072632, + "learning_rate": 0.001, + "loss": 1.5756, + "step": 6934 + }, + { + "epoch": 0.2933835349860394, + "grad_norm": 11.406463623046875, + "learning_rate": 0.001, + "loss": 2.1872, + "step": 6935 + }, + { + "epoch": 0.2934258397495558, + "grad_norm": 0.19447572529315948, + "learning_rate": 0.001, + "loss": 1.9257, + "step": 6936 + }, + { + "epoch": 0.2934681445130722, + "grad_norm": 0.7553207874298096, + "learning_rate": 0.001, + "loss": 2.0954, + "step": 6937 + }, + { + "epoch": 0.29351044927658854, + "grad_norm": 0.6458339095115662, + "learning_rate": 0.001, + "loss": 1.9439, + "step": 6938 + }, + { + "epoch": 0.2935527540401049, + "grad_norm": 4.88887357711792, + "learning_rate": 0.001, + "loss": 2.7551, + "step": 6939 + }, + { + "epoch": 0.2935950588036213, + "grad_norm": 0.16294671595096588, + "learning_rate": 0.001, + "loss": 2.1921, + "step": 6940 + }, + { + "epoch": 0.29363736356713765, + "grad_norm": 0.1700448840856552, + "learning_rate": 0.001, + "loss": 2.3544, + "step": 6941 + }, + { + "epoch": 0.293679668330654, + "grad_norm": 0.9373911023139954, + "learning_rate": 0.001, + "loss": 3.519, + "step": 6942 + }, + { + "epoch": 0.2937219730941704, + "grad_norm": 0.16765141487121582, + "learning_rate": 0.001, + "loss": 1.753, + "step": 6943 + }, + { + "epoch": 0.2937642778576868, + "grad_norm": 0.785889744758606, + "learning_rate": 0.001, + "loss": 2.0261, + "step": 6944 + }, + { + "epoch": 0.2938065826212031, + "grad_norm": 0.36694565415382385, + "learning_rate": 0.001, + "loss": 2.8762, + "step": 6945 + }, + { + "epoch": 0.29384888738471954, + "grad_norm": 2.4932944774627686, + "learning_rate": 0.001, + "loss": 3.5164, + "step": 6946 + }, + { + "epoch": 0.2938911921482359, + "grad_norm": 0.3365854024887085, + "learning_rate": 0.001, + "loss": 2.5705, + "step": 6947 + }, + { + "epoch": 0.29393349691175225, + "grad_norm": 0.18186473846435547, + "learning_rate": 0.001, + "loss": 2.2288, + "step": 6948 + }, + { + "epoch": 0.29397580167526866, + "grad_norm": 0.18802011013031006, + "learning_rate": 0.001, + "loss": 1.6162, + "step": 6949 + }, + { + "epoch": 0.294018106438785, + "grad_norm": 0.6282293796539307, + "learning_rate": 0.001, + "loss": 2.0641, + "step": 6950 + }, + { + "epoch": 0.29406041120230136, + "grad_norm": 0.18980659544467926, + "learning_rate": 0.001, + "loss": 2.1692, + "step": 6951 + }, + { + "epoch": 0.2941027159658178, + "grad_norm": 0.25274673104286194, + "learning_rate": 0.001, + "loss": 2.1386, + "step": 6952 + }, + { + "epoch": 0.29414502072933413, + "grad_norm": 1.6487905979156494, + "learning_rate": 0.001, + "loss": 2.0647, + "step": 6953 + }, + { + "epoch": 0.2941873254928505, + "grad_norm": 0.38848909735679626, + "learning_rate": 0.001, + "loss": 1.3862, + "step": 6954 + }, + { + "epoch": 0.2942296302563669, + "grad_norm": 0.5868200063705444, + "learning_rate": 0.001, + "loss": 2.8731, + "step": 6955 + }, + { + "epoch": 0.29427193501988325, + "grad_norm": 117.72666931152344, + "learning_rate": 0.001, + "loss": 3.0126, + "step": 6956 + }, + { + "epoch": 0.2943142397833996, + "grad_norm": 0.19856488704681396, + "learning_rate": 0.001, + "loss": 2.3552, + "step": 6957 + }, + { + "epoch": 0.294356544546916, + "grad_norm": 0.29041945934295654, + "learning_rate": 0.001, + "loss": 3.019, + "step": 6958 + }, + { + "epoch": 0.29439884931043236, + "grad_norm": 0.22051405906677246, + "learning_rate": 0.001, + "loss": 2.6603, + "step": 6959 + }, + { + "epoch": 0.2944411540739487, + "grad_norm": 0.1814699023962021, + "learning_rate": 0.001, + "loss": 1.5751, + "step": 6960 + }, + { + "epoch": 0.2944834588374651, + "grad_norm": 8.469980239868164, + "learning_rate": 0.001, + "loss": 1.8405, + "step": 6961 + }, + { + "epoch": 0.2945257636009815, + "grad_norm": 0.3591289222240448, + "learning_rate": 0.001, + "loss": 1.9221, + "step": 6962 + }, + { + "epoch": 0.29456806836449784, + "grad_norm": 0.23890052735805511, + "learning_rate": 0.001, + "loss": 2.0497, + "step": 6963 + }, + { + "epoch": 0.2946103731280142, + "grad_norm": 0.29208847880363464, + "learning_rate": 0.001, + "loss": 2.6245, + "step": 6964 + }, + { + "epoch": 0.2946526778915306, + "grad_norm": 0.19358113408088684, + "learning_rate": 0.001, + "loss": 2.2433, + "step": 6965 + }, + { + "epoch": 0.29469498265504696, + "grad_norm": 0.24694213271141052, + "learning_rate": 0.001, + "loss": 1.7421, + "step": 6966 + }, + { + "epoch": 0.2947372874185633, + "grad_norm": 0.22472073137760162, + "learning_rate": 0.001, + "loss": 2.1198, + "step": 6967 + }, + { + "epoch": 0.2947795921820797, + "grad_norm": 0.3076910376548767, + "learning_rate": 0.001, + "loss": 1.9558, + "step": 6968 + }, + { + "epoch": 0.2948218969455961, + "grad_norm": 0.21519528329372406, + "learning_rate": 0.001, + "loss": 3.0599, + "step": 6969 + }, + { + "epoch": 0.29486420170911243, + "grad_norm": 0.2277740240097046, + "learning_rate": 0.001, + "loss": 2.8818, + "step": 6970 + }, + { + "epoch": 0.29490650647262884, + "grad_norm": 4.881925106048584, + "learning_rate": 0.001, + "loss": 1.9913, + "step": 6971 + }, + { + "epoch": 0.2949488112361452, + "grad_norm": 0.2809251844882965, + "learning_rate": 0.001, + "loss": 2.8118, + "step": 6972 + }, + { + "epoch": 0.29499111599966155, + "grad_norm": 0.322844535112381, + "learning_rate": 0.001, + "loss": 3.4413, + "step": 6973 + }, + { + "epoch": 0.29503342076317796, + "grad_norm": 0.2614366114139557, + "learning_rate": 0.001, + "loss": 2.5353, + "step": 6974 + }, + { + "epoch": 0.2950757255266943, + "grad_norm": 0.23935487866401672, + "learning_rate": 0.001, + "loss": 2.6036, + "step": 6975 + }, + { + "epoch": 0.29511803029021066, + "grad_norm": 0.2764032185077667, + "learning_rate": 0.001, + "loss": 2.577, + "step": 6976 + }, + { + "epoch": 0.2951603350537271, + "grad_norm": 0.2084626853466034, + "learning_rate": 0.001, + "loss": 2.0175, + "step": 6977 + }, + { + "epoch": 0.29520263981724343, + "grad_norm": 0.39110100269317627, + "learning_rate": 0.001, + "loss": 3.6172, + "step": 6978 + }, + { + "epoch": 0.2952449445807598, + "grad_norm": 0.5247201919555664, + "learning_rate": 0.001, + "loss": 3.2458, + "step": 6979 + }, + { + "epoch": 0.2952872493442762, + "grad_norm": 0.2991565465927124, + "learning_rate": 0.001, + "loss": 3.4346, + "step": 6980 + }, + { + "epoch": 0.29532955410779255, + "grad_norm": 0.218788743019104, + "learning_rate": 0.001, + "loss": 2.654, + "step": 6981 + }, + { + "epoch": 0.2953718588713089, + "grad_norm": 0.19704671204090118, + "learning_rate": 0.001, + "loss": 2.1183, + "step": 6982 + }, + { + "epoch": 0.29541416363482526, + "grad_norm": 0.1847701370716095, + "learning_rate": 0.001, + "loss": 2.2644, + "step": 6983 + }, + { + "epoch": 0.29545646839834167, + "grad_norm": 0.20297656953334808, + "learning_rate": 0.001, + "loss": 1.8673, + "step": 6984 + }, + { + "epoch": 0.295498773161858, + "grad_norm": 1.8230969905853271, + "learning_rate": 0.001, + "loss": 1.7627, + "step": 6985 + }, + { + "epoch": 0.2955410779253744, + "grad_norm": 0.22176900506019592, + "learning_rate": 0.001, + "loss": 2.2876, + "step": 6986 + }, + { + "epoch": 0.2955833826888908, + "grad_norm": 0.1842503547668457, + "learning_rate": 0.001, + "loss": 2.764, + "step": 6987 + }, + { + "epoch": 0.29562568745240714, + "grad_norm": 0.2775016725063324, + "learning_rate": 0.001, + "loss": 1.8702, + "step": 6988 + }, + { + "epoch": 0.2956679922159235, + "grad_norm": 0.21142591536045074, + "learning_rate": 0.001, + "loss": 3.3746, + "step": 6989 + }, + { + "epoch": 0.2957102969794399, + "grad_norm": 0.2473433017730713, + "learning_rate": 0.001, + "loss": 2.1965, + "step": 6990 + }, + { + "epoch": 0.29575260174295626, + "grad_norm": 0.2217579185962677, + "learning_rate": 0.001, + "loss": 3.064, + "step": 6991 + }, + { + "epoch": 0.2957949065064726, + "grad_norm": 0.2861359417438507, + "learning_rate": 0.001, + "loss": 2.5936, + "step": 6992 + }, + { + "epoch": 0.295837211269989, + "grad_norm": 0.6411170363426208, + "learning_rate": 0.001, + "loss": 2.0748, + "step": 6993 + }, + { + "epoch": 0.2958795160335054, + "grad_norm": 0.20746761560440063, + "learning_rate": 0.001, + "loss": 1.8649, + "step": 6994 + }, + { + "epoch": 0.29592182079702173, + "grad_norm": 0.19803252816200256, + "learning_rate": 0.001, + "loss": 2.6345, + "step": 6995 + }, + { + "epoch": 0.29596412556053814, + "grad_norm": 0.19498823583126068, + "learning_rate": 0.001, + "loss": 2.8652, + "step": 6996 + }, + { + "epoch": 0.2960064303240545, + "grad_norm": 0.18126071989536285, + "learning_rate": 0.001, + "loss": 1.7674, + "step": 6997 + }, + { + "epoch": 0.29604873508757085, + "grad_norm": 0.17771077156066895, + "learning_rate": 0.001, + "loss": 1.8077, + "step": 6998 + }, + { + "epoch": 0.29609103985108726, + "grad_norm": 0.2599290609359741, + "learning_rate": 0.001, + "loss": 2.336, + "step": 6999 + }, + { + "epoch": 0.2961333446146036, + "grad_norm": 0.3005380928516388, + "learning_rate": 0.001, + "loss": 2.4787, + "step": 7000 + }, + { + "epoch": 0.29617564937811997, + "grad_norm": 0.18659712374210358, + "learning_rate": 0.001, + "loss": 2.1932, + "step": 7001 + }, + { + "epoch": 0.2962179541416364, + "grad_norm": 0.19324898719787598, + "learning_rate": 0.001, + "loss": 2.0008, + "step": 7002 + }, + { + "epoch": 0.29626025890515273, + "grad_norm": 0.3487750291824341, + "learning_rate": 0.001, + "loss": 2.2476, + "step": 7003 + }, + { + "epoch": 0.2963025636686691, + "grad_norm": 0.20598524808883667, + "learning_rate": 0.001, + "loss": 2.0175, + "step": 7004 + }, + { + "epoch": 0.29634486843218544, + "grad_norm": 0.20581454038619995, + "learning_rate": 0.001, + "loss": 2.5366, + "step": 7005 + }, + { + "epoch": 0.29638717319570185, + "grad_norm": 0.19093303382396698, + "learning_rate": 0.001, + "loss": 2.4369, + "step": 7006 + }, + { + "epoch": 0.2964294779592182, + "grad_norm": 0.1642286777496338, + "learning_rate": 0.001, + "loss": 2.6144, + "step": 7007 + }, + { + "epoch": 0.29647178272273456, + "grad_norm": 0.5392095446586609, + "learning_rate": 0.001, + "loss": 2.2483, + "step": 7008 + }, + { + "epoch": 0.29651408748625097, + "grad_norm": 0.22686930000782013, + "learning_rate": 0.001, + "loss": 1.9133, + "step": 7009 + }, + { + "epoch": 0.2965563922497673, + "grad_norm": 0.22740709781646729, + "learning_rate": 0.001, + "loss": 1.4307, + "step": 7010 + }, + { + "epoch": 0.2965986970132837, + "grad_norm": 0.19837690889835358, + "learning_rate": 0.001, + "loss": 1.8655, + "step": 7011 + }, + { + "epoch": 0.2966410017768001, + "grad_norm": 0.32683438062667847, + "learning_rate": 0.001, + "loss": 2.1361, + "step": 7012 + }, + { + "epoch": 0.29668330654031644, + "grad_norm": 4.837608814239502, + "learning_rate": 0.001, + "loss": 3.105, + "step": 7013 + }, + { + "epoch": 0.2967256113038328, + "grad_norm": 0.1853974610567093, + "learning_rate": 0.001, + "loss": 1.8663, + "step": 7014 + }, + { + "epoch": 0.2967679160673492, + "grad_norm": 3.4521446228027344, + "learning_rate": 0.001, + "loss": 2.0969, + "step": 7015 + }, + { + "epoch": 0.29681022083086556, + "grad_norm": 11.476473808288574, + "learning_rate": 0.001, + "loss": 2.1661, + "step": 7016 + }, + { + "epoch": 0.2968525255943819, + "grad_norm": 0.2935603857040405, + "learning_rate": 0.001, + "loss": 2.8407, + "step": 7017 + }, + { + "epoch": 0.2968948303578983, + "grad_norm": 0.22967737913131714, + "learning_rate": 0.001, + "loss": 2.347, + "step": 7018 + }, + { + "epoch": 0.2969371351214147, + "grad_norm": 0.24542132019996643, + "learning_rate": 0.001, + "loss": 2.7476, + "step": 7019 + }, + { + "epoch": 0.29697943988493103, + "grad_norm": 0.3539610207080841, + "learning_rate": 0.001, + "loss": 3.4266, + "step": 7020 + }, + { + "epoch": 0.29702174464844744, + "grad_norm": 2.1936635971069336, + "learning_rate": 0.001, + "loss": 2.4714, + "step": 7021 + }, + { + "epoch": 0.2970640494119638, + "grad_norm": 36.311824798583984, + "learning_rate": 0.001, + "loss": 2.2848, + "step": 7022 + }, + { + "epoch": 0.29710635417548015, + "grad_norm": 0.7815760970115662, + "learning_rate": 0.001, + "loss": 3.0078, + "step": 7023 + }, + { + "epoch": 0.29714865893899656, + "grad_norm": 0.25058576464653015, + "learning_rate": 0.001, + "loss": 1.7609, + "step": 7024 + }, + { + "epoch": 0.2971909637025129, + "grad_norm": 0.32843679189682007, + "learning_rate": 0.001, + "loss": 2.2564, + "step": 7025 + }, + { + "epoch": 0.29723326846602927, + "grad_norm": 0.31752750277519226, + "learning_rate": 0.001, + "loss": 3.8093, + "step": 7026 + }, + { + "epoch": 0.2972755732295456, + "grad_norm": 0.3511388301849365, + "learning_rate": 0.001, + "loss": 2.9753, + "step": 7027 + }, + { + "epoch": 0.29731787799306203, + "grad_norm": 0.18950346112251282, + "learning_rate": 0.001, + "loss": 1.8184, + "step": 7028 + }, + { + "epoch": 0.2973601827565784, + "grad_norm": 1.3205153942108154, + "learning_rate": 0.001, + "loss": 2.4554, + "step": 7029 + }, + { + "epoch": 0.29740248752009474, + "grad_norm": 0.5435582995414734, + "learning_rate": 0.001, + "loss": 2.9064, + "step": 7030 + }, + { + "epoch": 0.29744479228361115, + "grad_norm": 0.6610992550849915, + "learning_rate": 0.001, + "loss": 2.4811, + "step": 7031 + }, + { + "epoch": 0.2974870970471275, + "grad_norm": 0.43292543292045593, + "learning_rate": 0.001, + "loss": 2.1861, + "step": 7032 + }, + { + "epoch": 0.29752940181064386, + "grad_norm": 0.29178565740585327, + "learning_rate": 0.001, + "loss": 2.0465, + "step": 7033 + }, + { + "epoch": 0.29757170657416027, + "grad_norm": 0.4427778720855713, + "learning_rate": 0.001, + "loss": 2.1454, + "step": 7034 + }, + { + "epoch": 0.2976140113376766, + "grad_norm": 0.36568501591682434, + "learning_rate": 0.001, + "loss": 2.513, + "step": 7035 + }, + { + "epoch": 0.297656316101193, + "grad_norm": 0.7165125608444214, + "learning_rate": 0.001, + "loss": 1.9268, + "step": 7036 + }, + { + "epoch": 0.2976986208647094, + "grad_norm": 0.23342393338680267, + "learning_rate": 0.001, + "loss": 2.8504, + "step": 7037 + }, + { + "epoch": 0.29774092562822574, + "grad_norm": 0.24309587478637695, + "learning_rate": 0.001, + "loss": 2.6698, + "step": 7038 + }, + { + "epoch": 0.2977832303917421, + "grad_norm": 1.0469741821289062, + "learning_rate": 0.001, + "loss": 4.0021, + "step": 7039 + }, + { + "epoch": 0.2978255351552585, + "grad_norm": 4.104727268218994, + "learning_rate": 0.001, + "loss": 2.4115, + "step": 7040 + }, + { + "epoch": 0.29786783991877486, + "grad_norm": 2.293821096420288, + "learning_rate": 0.001, + "loss": 2.3395, + "step": 7041 + }, + { + "epoch": 0.2979101446822912, + "grad_norm": 0.2298813909292221, + "learning_rate": 0.001, + "loss": 1.913, + "step": 7042 + }, + { + "epoch": 0.2979524494458076, + "grad_norm": 0.5691928267478943, + "learning_rate": 0.001, + "loss": 2.0014, + "step": 7043 + }, + { + "epoch": 0.297994754209324, + "grad_norm": 0.4775361120700836, + "learning_rate": 0.001, + "loss": 2.5362, + "step": 7044 + }, + { + "epoch": 0.29803705897284033, + "grad_norm": 0.364666223526001, + "learning_rate": 0.001, + "loss": 2.8652, + "step": 7045 + }, + { + "epoch": 0.29807936373635674, + "grad_norm": 0.9387134313583374, + "learning_rate": 0.001, + "loss": 2.1165, + "step": 7046 + }, + { + "epoch": 0.2981216684998731, + "grad_norm": 0.2410208135843277, + "learning_rate": 0.001, + "loss": 3.0745, + "step": 7047 + }, + { + "epoch": 0.29816397326338945, + "grad_norm": 0.2509887218475342, + "learning_rate": 0.001, + "loss": 1.7976, + "step": 7048 + }, + { + "epoch": 0.2982062780269058, + "grad_norm": 1.8324601650238037, + "learning_rate": 0.001, + "loss": 2.6896, + "step": 7049 + }, + { + "epoch": 0.2982485827904222, + "grad_norm": 0.342830091714859, + "learning_rate": 0.001, + "loss": 2.2083, + "step": 7050 + }, + { + "epoch": 0.29829088755393857, + "grad_norm": 0.263128399848938, + "learning_rate": 0.001, + "loss": 2.8128, + "step": 7051 + }, + { + "epoch": 0.2983331923174549, + "grad_norm": 0.2615799903869629, + "learning_rate": 0.001, + "loss": 1.9154, + "step": 7052 + }, + { + "epoch": 0.29837549708097133, + "grad_norm": 0.3741340935230255, + "learning_rate": 0.001, + "loss": 2.8024, + "step": 7053 + }, + { + "epoch": 0.2984178018444877, + "grad_norm": 0.21344539523124695, + "learning_rate": 0.001, + "loss": 2.0821, + "step": 7054 + }, + { + "epoch": 0.29846010660800404, + "grad_norm": 7.319907188415527, + "learning_rate": 0.001, + "loss": 2.5422, + "step": 7055 + }, + { + "epoch": 0.29850241137152045, + "grad_norm": 0.1762029081583023, + "learning_rate": 0.001, + "loss": 1.8821, + "step": 7056 + }, + { + "epoch": 0.2985447161350368, + "grad_norm": 0.21324655413627625, + "learning_rate": 0.001, + "loss": 1.4527, + "step": 7057 + }, + { + "epoch": 0.29858702089855316, + "grad_norm": 0.35666510462760925, + "learning_rate": 0.001, + "loss": 3.0456, + "step": 7058 + }, + { + "epoch": 0.29862932566206957, + "grad_norm": 0.21469701826572418, + "learning_rate": 0.001, + "loss": 2.0336, + "step": 7059 + }, + { + "epoch": 0.2986716304255859, + "grad_norm": 0.39980030059814453, + "learning_rate": 0.001, + "loss": 3.3548, + "step": 7060 + }, + { + "epoch": 0.2987139351891023, + "grad_norm": 0.23621757328510284, + "learning_rate": 0.001, + "loss": 2.3665, + "step": 7061 + }, + { + "epoch": 0.2987562399526187, + "grad_norm": 0.26678037643432617, + "learning_rate": 0.001, + "loss": 2.1806, + "step": 7062 + }, + { + "epoch": 0.29879854471613504, + "grad_norm": 0.5849965810775757, + "learning_rate": 0.001, + "loss": 1.8085, + "step": 7063 + }, + { + "epoch": 0.2988408494796514, + "grad_norm": 0.4421706199645996, + "learning_rate": 0.001, + "loss": 2.1855, + "step": 7064 + }, + { + "epoch": 0.2988831542431678, + "grad_norm": 0.2031574845314026, + "learning_rate": 0.001, + "loss": 1.6082, + "step": 7065 + }, + { + "epoch": 0.29892545900668416, + "grad_norm": 0.24453617632389069, + "learning_rate": 0.001, + "loss": 1.9515, + "step": 7066 + }, + { + "epoch": 0.2989677637702005, + "grad_norm": 0.28984296321868896, + "learning_rate": 0.001, + "loss": 2.3851, + "step": 7067 + }, + { + "epoch": 0.2990100685337169, + "grad_norm": 0.22082221508026123, + "learning_rate": 0.001, + "loss": 1.8154, + "step": 7068 + }, + { + "epoch": 0.2990523732972333, + "grad_norm": 0.20017731189727783, + "learning_rate": 0.001, + "loss": 1.9377, + "step": 7069 + }, + { + "epoch": 0.29909467806074963, + "grad_norm": 0.1992572695016861, + "learning_rate": 0.001, + "loss": 1.5338, + "step": 7070 + }, + { + "epoch": 0.29913698282426604, + "grad_norm": 3.3462400436401367, + "learning_rate": 0.001, + "loss": 2.8609, + "step": 7071 + }, + { + "epoch": 0.2991792875877824, + "grad_norm": 0.1810547411441803, + "learning_rate": 0.001, + "loss": 2.7506, + "step": 7072 + }, + { + "epoch": 0.29922159235129875, + "grad_norm": 0.351933091878891, + "learning_rate": 0.001, + "loss": 1.9723, + "step": 7073 + }, + { + "epoch": 0.2992638971148151, + "grad_norm": 0.2719806134700775, + "learning_rate": 0.001, + "loss": 2.5537, + "step": 7074 + }, + { + "epoch": 0.2993062018783315, + "grad_norm": 1.2375730276107788, + "learning_rate": 0.001, + "loss": 2.7143, + "step": 7075 + }, + { + "epoch": 0.29934850664184787, + "grad_norm": 0.27189958095550537, + "learning_rate": 0.001, + "loss": 2.8949, + "step": 7076 + }, + { + "epoch": 0.2993908114053642, + "grad_norm": 2.186654567718506, + "learning_rate": 0.001, + "loss": 3.3108, + "step": 7077 + }, + { + "epoch": 0.29943311616888063, + "grad_norm": 0.2271018624305725, + "learning_rate": 0.001, + "loss": 1.9664, + "step": 7078 + }, + { + "epoch": 0.299475420932397, + "grad_norm": 0.20025469362735748, + "learning_rate": 0.001, + "loss": 2.14, + "step": 7079 + }, + { + "epoch": 0.29951772569591334, + "grad_norm": 1.0243349075317383, + "learning_rate": 0.001, + "loss": 1.9008, + "step": 7080 + }, + { + "epoch": 0.29956003045942975, + "grad_norm": 0.27229562401771545, + "learning_rate": 0.001, + "loss": 3.3795, + "step": 7081 + }, + { + "epoch": 0.2996023352229461, + "grad_norm": 0.20607994496822357, + "learning_rate": 0.001, + "loss": 2.6859, + "step": 7082 + }, + { + "epoch": 0.29964463998646246, + "grad_norm": 0.18946023285388947, + "learning_rate": 0.001, + "loss": 1.9361, + "step": 7083 + }, + { + "epoch": 0.29968694474997887, + "grad_norm": 0.22694577276706696, + "learning_rate": 0.001, + "loss": 2.872, + "step": 7084 + }, + { + "epoch": 0.2997292495134952, + "grad_norm": 0.19697017967700958, + "learning_rate": 0.001, + "loss": 1.8643, + "step": 7085 + }, + { + "epoch": 0.2997715542770116, + "grad_norm": 0.20350822806358337, + "learning_rate": 0.001, + "loss": 1.8414, + "step": 7086 + }, + { + "epoch": 0.299813859040528, + "grad_norm": 1.2988654375076294, + "learning_rate": 0.001, + "loss": 2.5616, + "step": 7087 + }, + { + "epoch": 0.29985616380404434, + "grad_norm": 7.280746936798096, + "learning_rate": 0.001, + "loss": 2.5211, + "step": 7088 + }, + { + "epoch": 0.2998984685675607, + "grad_norm": 0.9409200549125671, + "learning_rate": 0.001, + "loss": 1.8807, + "step": 7089 + }, + { + "epoch": 0.2999407733310771, + "grad_norm": 0.20737378299236298, + "learning_rate": 0.001, + "loss": 2.2614, + "step": 7090 + }, + { + "epoch": 0.29998307809459346, + "grad_norm": 0.19399072229862213, + "learning_rate": 0.001, + "loss": 1.5657, + "step": 7091 + }, + { + "epoch": 0.3000253828581098, + "grad_norm": 2.7120447158813477, + "learning_rate": 0.001, + "loss": 3.1643, + "step": 7092 + }, + { + "epoch": 0.3000676876216262, + "grad_norm": 0.23883989453315735, + "learning_rate": 0.001, + "loss": 1.4332, + "step": 7093 + }, + { + "epoch": 0.3001099923851426, + "grad_norm": 0.24009618163108826, + "learning_rate": 0.001, + "loss": 2.0072, + "step": 7094 + }, + { + "epoch": 0.30015229714865893, + "grad_norm": 0.27275872230529785, + "learning_rate": 0.001, + "loss": 1.8004, + "step": 7095 + }, + { + "epoch": 0.3001946019121753, + "grad_norm": 0.3240390419960022, + "learning_rate": 0.001, + "loss": 2.4354, + "step": 7096 + }, + { + "epoch": 0.3002369066756917, + "grad_norm": 0.2155575454235077, + "learning_rate": 0.001, + "loss": 2.4924, + "step": 7097 + }, + { + "epoch": 0.30027921143920805, + "grad_norm": 0.26140278577804565, + "learning_rate": 0.001, + "loss": 3.0696, + "step": 7098 + }, + { + "epoch": 0.3003215162027244, + "grad_norm": 0.20411476492881775, + "learning_rate": 0.001, + "loss": 2.8706, + "step": 7099 + }, + { + "epoch": 0.3003638209662408, + "grad_norm": 0.26278483867645264, + "learning_rate": 0.001, + "loss": 2.6213, + "step": 7100 + }, + { + "epoch": 0.30040612572975717, + "grad_norm": 0.23542357981204987, + "learning_rate": 0.001, + "loss": 2.4739, + "step": 7101 + }, + { + "epoch": 0.3004484304932735, + "grad_norm": 1.478679895401001, + "learning_rate": 0.001, + "loss": 2.8704, + "step": 7102 + }, + { + "epoch": 0.30049073525678993, + "grad_norm": 0.22880683839321136, + "learning_rate": 0.001, + "loss": 2.2796, + "step": 7103 + }, + { + "epoch": 0.3005330400203063, + "grad_norm": 0.22094206511974335, + "learning_rate": 0.001, + "loss": 2.1502, + "step": 7104 + }, + { + "epoch": 0.30057534478382264, + "grad_norm": 0.2112313210964203, + "learning_rate": 0.001, + "loss": 2.5717, + "step": 7105 + }, + { + "epoch": 0.30061764954733905, + "grad_norm": 0.19635865092277527, + "learning_rate": 0.001, + "loss": 1.7046, + "step": 7106 + }, + { + "epoch": 0.3006599543108554, + "grad_norm": 0.42038843035697937, + "learning_rate": 0.001, + "loss": 3.0465, + "step": 7107 + }, + { + "epoch": 0.30070225907437176, + "grad_norm": 0.2118614763021469, + "learning_rate": 0.001, + "loss": 2.0218, + "step": 7108 + }, + { + "epoch": 0.30074456383788817, + "grad_norm": 0.1884838491678238, + "learning_rate": 0.001, + "loss": 1.6187, + "step": 7109 + }, + { + "epoch": 0.3007868686014045, + "grad_norm": 1.542157530784607, + "learning_rate": 0.001, + "loss": 2.7272, + "step": 7110 + }, + { + "epoch": 0.3008291733649209, + "grad_norm": 0.19439256191253662, + "learning_rate": 0.001, + "loss": 1.8408, + "step": 7111 + }, + { + "epoch": 0.3008714781284373, + "grad_norm": 0.21478168666362762, + "learning_rate": 0.001, + "loss": 2.5348, + "step": 7112 + }, + { + "epoch": 0.30091378289195364, + "grad_norm": 0.2651658058166504, + "learning_rate": 0.001, + "loss": 2.2955, + "step": 7113 + }, + { + "epoch": 0.30095608765547, + "grad_norm": 0.20375876128673553, + "learning_rate": 0.001, + "loss": 2.7385, + "step": 7114 + }, + { + "epoch": 0.3009983924189864, + "grad_norm": 1.0855168104171753, + "learning_rate": 0.001, + "loss": 1.7983, + "step": 7115 + }, + { + "epoch": 0.30104069718250276, + "grad_norm": 1.9149056673049927, + "learning_rate": 0.001, + "loss": 2.4025, + "step": 7116 + }, + { + "epoch": 0.3010830019460191, + "grad_norm": 0.2241235226392746, + "learning_rate": 0.001, + "loss": 2.6799, + "step": 7117 + }, + { + "epoch": 0.30112530670953547, + "grad_norm": 0.21922898292541504, + "learning_rate": 0.001, + "loss": 2.2658, + "step": 7118 + }, + { + "epoch": 0.3011676114730519, + "grad_norm": 0.2675611078739166, + "learning_rate": 0.001, + "loss": 1.4285, + "step": 7119 + }, + { + "epoch": 0.30120991623656823, + "grad_norm": 0.5068369507789612, + "learning_rate": 0.001, + "loss": 2.1726, + "step": 7120 + }, + { + "epoch": 0.3012522210000846, + "grad_norm": 0.2993006110191345, + "learning_rate": 0.001, + "loss": 2.4697, + "step": 7121 + }, + { + "epoch": 0.301294525763601, + "grad_norm": 0.31761422753334045, + "learning_rate": 0.001, + "loss": 2.6937, + "step": 7122 + }, + { + "epoch": 0.30133683052711735, + "grad_norm": 0.2203313410282135, + "learning_rate": 0.001, + "loss": 2.4712, + "step": 7123 + }, + { + "epoch": 0.3013791352906337, + "grad_norm": 0.2073364555835724, + "learning_rate": 0.001, + "loss": 2.3447, + "step": 7124 + }, + { + "epoch": 0.3014214400541501, + "grad_norm": 0.27055227756500244, + "learning_rate": 0.001, + "loss": 1.9593, + "step": 7125 + }, + { + "epoch": 0.30146374481766647, + "grad_norm": 0.19267384707927704, + "learning_rate": 0.001, + "loss": 2.5415, + "step": 7126 + }, + { + "epoch": 0.3015060495811828, + "grad_norm": 0.21630926430225372, + "learning_rate": 0.001, + "loss": 3.0923, + "step": 7127 + }, + { + "epoch": 0.30154835434469923, + "grad_norm": 0.17075297236442566, + "learning_rate": 0.001, + "loss": 1.9153, + "step": 7128 + }, + { + "epoch": 0.3015906591082156, + "grad_norm": 2.4271461963653564, + "learning_rate": 0.001, + "loss": 2.0476, + "step": 7129 + }, + { + "epoch": 0.30163296387173194, + "grad_norm": 0.1639959067106247, + "learning_rate": 0.001, + "loss": 1.7935, + "step": 7130 + }, + { + "epoch": 0.30167526863524835, + "grad_norm": 0.17364169657230377, + "learning_rate": 0.001, + "loss": 2.065, + "step": 7131 + }, + { + "epoch": 0.3017175733987647, + "grad_norm": 0.19003529846668243, + "learning_rate": 0.001, + "loss": 2.4537, + "step": 7132 + }, + { + "epoch": 0.30175987816228106, + "grad_norm": 6.679607391357422, + "learning_rate": 0.001, + "loss": 2.5314, + "step": 7133 + }, + { + "epoch": 0.30180218292579747, + "grad_norm": 0.20183472335338593, + "learning_rate": 0.001, + "loss": 2.6579, + "step": 7134 + }, + { + "epoch": 0.3018444876893138, + "grad_norm": 2.5921237468719482, + "learning_rate": 0.001, + "loss": 2.0, + "step": 7135 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 0.23701635003089905, + "learning_rate": 0.001, + "loss": 2.559, + "step": 7136 + }, + { + "epoch": 0.3019290972163466, + "grad_norm": 0.5344852805137634, + "learning_rate": 0.001, + "loss": 2.0646, + "step": 7137 + }, + { + "epoch": 0.30197140197986294, + "grad_norm": 0.20276984572410583, + "learning_rate": 0.001, + "loss": 2.3369, + "step": 7138 + }, + { + "epoch": 0.3020137067433793, + "grad_norm": 0.23550361394882202, + "learning_rate": 0.001, + "loss": 2.3764, + "step": 7139 + }, + { + "epoch": 0.30205601150689565, + "grad_norm": 0.1865498572587967, + "learning_rate": 0.001, + "loss": 1.8937, + "step": 7140 + }, + { + "epoch": 0.30209831627041206, + "grad_norm": 0.18712198734283447, + "learning_rate": 0.001, + "loss": 2.3899, + "step": 7141 + }, + { + "epoch": 0.3021406210339284, + "grad_norm": 1.8322774171829224, + "learning_rate": 0.001, + "loss": 1.7824, + "step": 7142 + }, + { + "epoch": 0.30218292579744477, + "grad_norm": 1.2885193824768066, + "learning_rate": 0.001, + "loss": 1.8025, + "step": 7143 + }, + { + "epoch": 0.3022252305609612, + "grad_norm": 1.9234724044799805, + "learning_rate": 0.001, + "loss": 3.0237, + "step": 7144 + }, + { + "epoch": 0.30226753532447753, + "grad_norm": 0.21429021656513214, + "learning_rate": 0.001, + "loss": 2.3753, + "step": 7145 + }, + { + "epoch": 0.3023098400879939, + "grad_norm": 0.1841968446969986, + "learning_rate": 0.001, + "loss": 1.9292, + "step": 7146 + }, + { + "epoch": 0.3023521448515103, + "grad_norm": 1.093137264251709, + "learning_rate": 0.001, + "loss": 1.7962, + "step": 7147 + }, + { + "epoch": 0.30239444961502665, + "grad_norm": 0.3102741539478302, + "learning_rate": 0.001, + "loss": 2.7395, + "step": 7148 + }, + { + "epoch": 0.302436754378543, + "grad_norm": 1.5967626571655273, + "learning_rate": 0.001, + "loss": 2.026, + "step": 7149 + }, + { + "epoch": 0.3024790591420594, + "grad_norm": 0.1948966681957245, + "learning_rate": 0.001, + "loss": 2.3974, + "step": 7150 + }, + { + "epoch": 0.30252136390557577, + "grad_norm": 5.692415714263916, + "learning_rate": 0.001, + "loss": 1.7977, + "step": 7151 + }, + { + "epoch": 0.3025636686690921, + "grad_norm": 0.3406045436859131, + "learning_rate": 0.001, + "loss": 3.7244, + "step": 7152 + }, + { + "epoch": 0.30260597343260853, + "grad_norm": 0.19864249229431152, + "learning_rate": 0.001, + "loss": 2.0721, + "step": 7153 + }, + { + "epoch": 0.3026482781961249, + "grad_norm": 0.22038595378398895, + "learning_rate": 0.001, + "loss": 1.618, + "step": 7154 + }, + { + "epoch": 0.30269058295964124, + "grad_norm": 0.17631886899471283, + "learning_rate": 0.001, + "loss": 2.1884, + "step": 7155 + }, + { + "epoch": 0.30273288772315765, + "grad_norm": 0.1953791379928589, + "learning_rate": 0.001, + "loss": 1.4864, + "step": 7156 + }, + { + "epoch": 0.302775192486674, + "grad_norm": 0.3843629062175751, + "learning_rate": 0.001, + "loss": 2.1935, + "step": 7157 + }, + { + "epoch": 0.30281749725019036, + "grad_norm": 0.562131941318512, + "learning_rate": 0.001, + "loss": 2.6914, + "step": 7158 + }, + { + "epoch": 0.30285980201370677, + "grad_norm": 0.30789318680763245, + "learning_rate": 0.001, + "loss": 3.5069, + "step": 7159 + }, + { + "epoch": 0.3029021067772231, + "grad_norm": 5.395096778869629, + "learning_rate": 0.001, + "loss": 2.2879, + "step": 7160 + }, + { + "epoch": 0.3029444115407395, + "grad_norm": 0.22398367524147034, + "learning_rate": 0.001, + "loss": 2.5926, + "step": 7161 + }, + { + "epoch": 0.30298671630425583, + "grad_norm": 0.3233366012573242, + "learning_rate": 0.001, + "loss": 2.6733, + "step": 7162 + }, + { + "epoch": 0.30302902106777224, + "grad_norm": 0.20136740803718567, + "learning_rate": 0.001, + "loss": 1.6234, + "step": 7163 + }, + { + "epoch": 0.3030713258312886, + "grad_norm": 0.20825202763080597, + "learning_rate": 0.001, + "loss": 2.3597, + "step": 7164 + }, + { + "epoch": 0.30311363059480495, + "grad_norm": 1.0632126331329346, + "learning_rate": 0.001, + "loss": 1.722, + "step": 7165 + }, + { + "epoch": 0.30315593535832136, + "grad_norm": 0.22308948636054993, + "learning_rate": 0.001, + "loss": 2.4832, + "step": 7166 + }, + { + "epoch": 0.3031982401218377, + "grad_norm": 0.22294947504997253, + "learning_rate": 0.001, + "loss": 2.4689, + "step": 7167 + }, + { + "epoch": 0.30324054488535407, + "grad_norm": 0.18253259360790253, + "learning_rate": 0.001, + "loss": 2.1326, + "step": 7168 + }, + { + "epoch": 0.3032828496488705, + "grad_norm": 2.8974242210388184, + "learning_rate": 0.001, + "loss": 2.3822, + "step": 7169 + }, + { + "epoch": 0.30332515441238683, + "grad_norm": 0.19481533765792847, + "learning_rate": 0.001, + "loss": 2.0672, + "step": 7170 + }, + { + "epoch": 0.3033674591759032, + "grad_norm": 0.42890578508377075, + "learning_rate": 0.001, + "loss": 2.517, + "step": 7171 + }, + { + "epoch": 0.3034097639394196, + "grad_norm": 0.24793508648872375, + "learning_rate": 0.001, + "loss": 1.7478, + "step": 7172 + }, + { + "epoch": 0.30345206870293595, + "grad_norm": 0.1969098150730133, + "learning_rate": 0.001, + "loss": 1.9163, + "step": 7173 + }, + { + "epoch": 0.3034943734664523, + "grad_norm": 0.37608563899993896, + "learning_rate": 0.001, + "loss": 2.0317, + "step": 7174 + }, + { + "epoch": 0.3035366782299687, + "grad_norm": 0.30871760845184326, + "learning_rate": 0.001, + "loss": 1.9419, + "step": 7175 + }, + { + "epoch": 0.30357898299348507, + "grad_norm": 0.18646669387817383, + "learning_rate": 0.001, + "loss": 1.7309, + "step": 7176 + }, + { + "epoch": 0.3036212877570014, + "grad_norm": 0.17259854078292847, + "learning_rate": 0.001, + "loss": 1.7845, + "step": 7177 + }, + { + "epoch": 0.30366359252051783, + "grad_norm": 0.17856760323047638, + "learning_rate": 0.001, + "loss": 2.4272, + "step": 7178 + }, + { + "epoch": 0.3037058972840342, + "grad_norm": 2.3033905029296875, + "learning_rate": 0.001, + "loss": 2.732, + "step": 7179 + }, + { + "epoch": 0.30374820204755054, + "grad_norm": 0.4790777266025543, + "learning_rate": 0.001, + "loss": 2.2848, + "step": 7180 + }, + { + "epoch": 0.30379050681106695, + "grad_norm": 0.2084704339504242, + "learning_rate": 0.001, + "loss": 3.398, + "step": 7181 + }, + { + "epoch": 0.3038328115745833, + "grad_norm": 0.3605867028236389, + "learning_rate": 0.001, + "loss": 3.1206, + "step": 7182 + }, + { + "epoch": 0.30387511633809966, + "grad_norm": 0.17509806156158447, + "learning_rate": 0.001, + "loss": 1.7837, + "step": 7183 + }, + { + "epoch": 0.303917421101616, + "grad_norm": 0.22098810970783234, + "learning_rate": 0.001, + "loss": 2.0828, + "step": 7184 + }, + { + "epoch": 0.3039597258651324, + "grad_norm": 3.9246859550476074, + "learning_rate": 0.001, + "loss": 3.2388, + "step": 7185 + }, + { + "epoch": 0.3040020306286488, + "grad_norm": 0.1589023768901825, + "learning_rate": 0.001, + "loss": 2.3912, + "step": 7186 + }, + { + "epoch": 0.30404433539216513, + "grad_norm": 0.20874576270580292, + "learning_rate": 0.001, + "loss": 2.8516, + "step": 7187 + }, + { + "epoch": 0.30408664015568154, + "grad_norm": 1.3450440168380737, + "learning_rate": 0.001, + "loss": 1.8097, + "step": 7188 + }, + { + "epoch": 0.3041289449191979, + "grad_norm": 1.1989301443099976, + "learning_rate": 0.001, + "loss": 1.8467, + "step": 7189 + }, + { + "epoch": 0.30417124968271425, + "grad_norm": 0.21062646806240082, + "learning_rate": 0.001, + "loss": 2.1306, + "step": 7190 + }, + { + "epoch": 0.30421355444623066, + "grad_norm": 4.8882012367248535, + "learning_rate": 0.001, + "loss": 2.5283, + "step": 7191 + }, + { + "epoch": 0.304255859209747, + "grad_norm": 0.22382013499736786, + "learning_rate": 0.001, + "loss": 2.0692, + "step": 7192 + }, + { + "epoch": 0.30429816397326337, + "grad_norm": 0.6270066499710083, + "learning_rate": 0.001, + "loss": 1.8903, + "step": 7193 + }, + { + "epoch": 0.3043404687367798, + "grad_norm": 0.21127991378307343, + "learning_rate": 0.001, + "loss": 3.1819, + "step": 7194 + }, + { + "epoch": 0.30438277350029613, + "grad_norm": 0.20672987401485443, + "learning_rate": 0.001, + "loss": 3.1049, + "step": 7195 + }, + { + "epoch": 0.3044250782638125, + "grad_norm": 0.5075833201408386, + "learning_rate": 0.001, + "loss": 1.8776, + "step": 7196 + }, + { + "epoch": 0.3044673830273289, + "grad_norm": 0.2034783810377121, + "learning_rate": 0.001, + "loss": 1.9582, + "step": 7197 + }, + { + "epoch": 0.30450968779084525, + "grad_norm": 0.2227412313222885, + "learning_rate": 0.001, + "loss": 2.315, + "step": 7198 + }, + { + "epoch": 0.3045519925543616, + "grad_norm": 0.38511255383491516, + "learning_rate": 0.001, + "loss": 2.3305, + "step": 7199 + }, + { + "epoch": 0.304594297317878, + "grad_norm": 0.48111987113952637, + "learning_rate": 0.001, + "loss": 2.6835, + "step": 7200 + }, + { + "epoch": 0.30463660208139437, + "grad_norm": 0.39257773756980896, + "learning_rate": 0.001, + "loss": 2.4686, + "step": 7201 + }, + { + "epoch": 0.3046789068449107, + "grad_norm": 0.24036632478237152, + "learning_rate": 0.001, + "loss": 3.5692, + "step": 7202 + }, + { + "epoch": 0.30472121160842713, + "grad_norm": 1.5374091863632202, + "learning_rate": 0.001, + "loss": 2.1117, + "step": 7203 + }, + { + "epoch": 0.3047635163719435, + "grad_norm": 0.21697300672531128, + "learning_rate": 0.001, + "loss": 3.4893, + "step": 7204 + }, + { + "epoch": 0.30480582113545984, + "grad_norm": 0.2033626139163971, + "learning_rate": 0.001, + "loss": 2.6569, + "step": 7205 + }, + { + "epoch": 0.30484812589897625, + "grad_norm": 0.5761461853981018, + "learning_rate": 0.001, + "loss": 2.5943, + "step": 7206 + }, + { + "epoch": 0.3048904306624926, + "grad_norm": 0.17697875201702118, + "learning_rate": 0.001, + "loss": 1.7242, + "step": 7207 + }, + { + "epoch": 0.30493273542600896, + "grad_norm": 0.20617154240608215, + "learning_rate": 0.001, + "loss": 1.9209, + "step": 7208 + }, + { + "epoch": 0.3049750401895253, + "grad_norm": 1.6445116996765137, + "learning_rate": 0.001, + "loss": 2.3285, + "step": 7209 + }, + { + "epoch": 0.3050173449530417, + "grad_norm": 0.2126307338476181, + "learning_rate": 0.001, + "loss": 2.9291, + "step": 7210 + }, + { + "epoch": 0.3050596497165581, + "grad_norm": 0.2812674939632416, + "learning_rate": 0.001, + "loss": 2.0021, + "step": 7211 + }, + { + "epoch": 0.30510195448007443, + "grad_norm": 0.19099974632263184, + "learning_rate": 0.001, + "loss": 1.9334, + "step": 7212 + }, + { + "epoch": 0.30514425924359084, + "grad_norm": 0.23043900728225708, + "learning_rate": 0.001, + "loss": 3.7938, + "step": 7213 + }, + { + "epoch": 0.3051865640071072, + "grad_norm": 0.8108448386192322, + "learning_rate": 0.001, + "loss": 2.4988, + "step": 7214 + }, + { + "epoch": 0.30522886877062355, + "grad_norm": 0.22695882618427277, + "learning_rate": 0.001, + "loss": 2.3641, + "step": 7215 + }, + { + "epoch": 0.30527117353413996, + "grad_norm": 0.2140110582113266, + "learning_rate": 0.001, + "loss": 2.7109, + "step": 7216 + }, + { + "epoch": 0.3053134782976563, + "grad_norm": 0.18746429681777954, + "learning_rate": 0.001, + "loss": 1.5037, + "step": 7217 + }, + { + "epoch": 0.30535578306117267, + "grad_norm": 0.5185785293579102, + "learning_rate": 0.001, + "loss": 2.1417, + "step": 7218 + }, + { + "epoch": 0.3053980878246891, + "grad_norm": 0.22843578457832336, + "learning_rate": 0.001, + "loss": 2.9488, + "step": 7219 + }, + { + "epoch": 0.30544039258820543, + "grad_norm": 0.22175458073616028, + "learning_rate": 0.001, + "loss": 1.8912, + "step": 7220 + }, + { + "epoch": 0.3054826973517218, + "grad_norm": 0.3052138388156891, + "learning_rate": 0.001, + "loss": 2.7883, + "step": 7221 + }, + { + "epoch": 0.3055250021152382, + "grad_norm": 0.20926591753959656, + "learning_rate": 0.001, + "loss": 2.2513, + "step": 7222 + }, + { + "epoch": 0.30556730687875455, + "grad_norm": 0.5725785493850708, + "learning_rate": 0.001, + "loss": 2.0737, + "step": 7223 + }, + { + "epoch": 0.3056096116422709, + "grad_norm": 0.2553465962409973, + "learning_rate": 0.001, + "loss": 2.8104, + "step": 7224 + }, + { + "epoch": 0.3056519164057873, + "grad_norm": 0.17359638214111328, + "learning_rate": 0.001, + "loss": 1.9327, + "step": 7225 + }, + { + "epoch": 0.30569422116930367, + "grad_norm": 0.38395628333091736, + "learning_rate": 0.001, + "loss": 3.3217, + "step": 7226 + }, + { + "epoch": 0.30573652593282, + "grad_norm": 0.4329422414302826, + "learning_rate": 0.001, + "loss": 3.0564, + "step": 7227 + }, + { + "epoch": 0.30577883069633643, + "grad_norm": 0.19598042964935303, + "learning_rate": 0.001, + "loss": 2.1527, + "step": 7228 + }, + { + "epoch": 0.3058211354598528, + "grad_norm": 4.16204309463501, + "learning_rate": 0.001, + "loss": 2.1645, + "step": 7229 + }, + { + "epoch": 0.30586344022336914, + "grad_norm": 0.9371064901351929, + "learning_rate": 0.001, + "loss": 2.5554, + "step": 7230 + }, + { + "epoch": 0.3059057449868855, + "grad_norm": 89.23577117919922, + "learning_rate": 0.001, + "loss": 1.957, + "step": 7231 + }, + { + "epoch": 0.3059480497504019, + "grad_norm": 2.158701181411743, + "learning_rate": 0.001, + "loss": 2.8319, + "step": 7232 + }, + { + "epoch": 0.30599035451391826, + "grad_norm": 0.17827636003494263, + "learning_rate": 0.001, + "loss": 2.1969, + "step": 7233 + }, + { + "epoch": 0.3060326592774346, + "grad_norm": 0.21226316690444946, + "learning_rate": 0.001, + "loss": 1.4754, + "step": 7234 + }, + { + "epoch": 0.306074964040951, + "grad_norm": 0.6273579001426697, + "learning_rate": 0.001, + "loss": 1.9398, + "step": 7235 + }, + { + "epoch": 0.3061172688044674, + "grad_norm": 3.001936197280884, + "learning_rate": 0.001, + "loss": 2.9792, + "step": 7236 + }, + { + "epoch": 0.30615957356798373, + "grad_norm": 0.8433371186256409, + "learning_rate": 0.001, + "loss": 2.1398, + "step": 7237 + }, + { + "epoch": 0.30620187833150014, + "grad_norm": 0.518166720867157, + "learning_rate": 0.001, + "loss": 2.1598, + "step": 7238 + }, + { + "epoch": 0.3062441830950165, + "grad_norm": 0.25381919741630554, + "learning_rate": 0.001, + "loss": 2.3477, + "step": 7239 + }, + { + "epoch": 0.30628648785853285, + "grad_norm": 0.3235541880130768, + "learning_rate": 0.001, + "loss": 2.4599, + "step": 7240 + }, + { + "epoch": 0.30632879262204926, + "grad_norm": 0.46788913011550903, + "learning_rate": 0.001, + "loss": 2.0962, + "step": 7241 + }, + { + "epoch": 0.3063710973855656, + "grad_norm": 0.26095959544181824, + "learning_rate": 0.001, + "loss": 3.9055, + "step": 7242 + }, + { + "epoch": 0.30641340214908197, + "grad_norm": 0.44010379910469055, + "learning_rate": 0.001, + "loss": 2.2192, + "step": 7243 + }, + { + "epoch": 0.3064557069125984, + "grad_norm": 0.24772557616233826, + "learning_rate": 0.001, + "loss": 1.9465, + "step": 7244 + }, + { + "epoch": 0.30649801167611473, + "grad_norm": 0.22386042773723602, + "learning_rate": 0.001, + "loss": 2.0204, + "step": 7245 + }, + { + "epoch": 0.3065403164396311, + "grad_norm": 0.1983170062303543, + "learning_rate": 0.001, + "loss": 1.712, + "step": 7246 + }, + { + "epoch": 0.3065826212031475, + "grad_norm": 0.2919590175151825, + "learning_rate": 0.001, + "loss": 2.2712, + "step": 7247 + }, + { + "epoch": 0.30662492596666385, + "grad_norm": 1.4634429216384888, + "learning_rate": 0.001, + "loss": 1.5808, + "step": 7248 + }, + { + "epoch": 0.3066672307301802, + "grad_norm": 0.24671746790409088, + "learning_rate": 0.001, + "loss": 3.1394, + "step": 7249 + }, + { + "epoch": 0.3067095354936966, + "grad_norm": 4.342376708984375, + "learning_rate": 0.001, + "loss": 1.5847, + "step": 7250 + }, + { + "epoch": 0.30675184025721297, + "grad_norm": 0.6360762715339661, + "learning_rate": 0.001, + "loss": 2.1356, + "step": 7251 + }, + { + "epoch": 0.3067941450207293, + "grad_norm": 0.19539184868335724, + "learning_rate": 0.001, + "loss": 3.3058, + "step": 7252 + }, + { + "epoch": 0.3068364497842457, + "grad_norm": 0.22951632738113403, + "learning_rate": 0.001, + "loss": 2.0753, + "step": 7253 + }, + { + "epoch": 0.3068787545477621, + "grad_norm": 0.3070700764656067, + "learning_rate": 0.001, + "loss": 1.9365, + "step": 7254 + }, + { + "epoch": 0.30692105931127844, + "grad_norm": 0.36672693490982056, + "learning_rate": 0.001, + "loss": 3.0584, + "step": 7255 + }, + { + "epoch": 0.3069633640747948, + "grad_norm": 0.8749989867210388, + "learning_rate": 0.001, + "loss": 1.9865, + "step": 7256 + }, + { + "epoch": 0.3070056688383112, + "grad_norm": 0.4010057747364044, + "learning_rate": 0.001, + "loss": 2.2224, + "step": 7257 + }, + { + "epoch": 0.30704797360182756, + "grad_norm": 0.19571296870708466, + "learning_rate": 0.001, + "loss": 1.8307, + "step": 7258 + }, + { + "epoch": 0.3070902783653439, + "grad_norm": 0.6532837748527527, + "learning_rate": 0.001, + "loss": 2.183, + "step": 7259 + }, + { + "epoch": 0.3071325831288603, + "grad_norm": 0.22160778939723969, + "learning_rate": 0.001, + "loss": 2.2073, + "step": 7260 + }, + { + "epoch": 0.3071748878923767, + "grad_norm": 0.24020320177078247, + "learning_rate": 0.001, + "loss": 2.6333, + "step": 7261 + }, + { + "epoch": 0.30721719265589303, + "grad_norm": 0.1901521235704422, + "learning_rate": 0.001, + "loss": 2.0718, + "step": 7262 + }, + { + "epoch": 0.30725949741940944, + "grad_norm": 0.19890253245830536, + "learning_rate": 0.001, + "loss": 1.6972, + "step": 7263 + }, + { + "epoch": 0.3073018021829258, + "grad_norm": 0.17532768845558167, + "learning_rate": 0.001, + "loss": 2.0171, + "step": 7264 + }, + { + "epoch": 0.30734410694644215, + "grad_norm": 0.21217653155326843, + "learning_rate": 0.001, + "loss": 3.6026, + "step": 7265 + }, + { + "epoch": 0.30738641170995856, + "grad_norm": 0.23952527344226837, + "learning_rate": 0.001, + "loss": 1.9304, + "step": 7266 + }, + { + "epoch": 0.3074287164734749, + "grad_norm": 0.2269655019044876, + "learning_rate": 0.001, + "loss": 2.0697, + "step": 7267 + }, + { + "epoch": 0.30747102123699127, + "grad_norm": 0.21532459557056427, + "learning_rate": 0.001, + "loss": 2.6024, + "step": 7268 + }, + { + "epoch": 0.3075133260005077, + "grad_norm": 0.20854906737804413, + "learning_rate": 0.001, + "loss": 2.4159, + "step": 7269 + }, + { + "epoch": 0.30755563076402403, + "grad_norm": 0.2222970575094223, + "learning_rate": 0.001, + "loss": 3.1089, + "step": 7270 + }, + { + "epoch": 0.3075979355275404, + "grad_norm": 0.8760474920272827, + "learning_rate": 0.001, + "loss": 1.7882, + "step": 7271 + }, + { + "epoch": 0.3076402402910568, + "grad_norm": 0.18725240230560303, + "learning_rate": 0.001, + "loss": 2.9421, + "step": 7272 + }, + { + "epoch": 0.30768254505457315, + "grad_norm": 0.9830304384231567, + "learning_rate": 0.001, + "loss": 1.7632, + "step": 7273 + }, + { + "epoch": 0.3077248498180895, + "grad_norm": 0.2174868881702423, + "learning_rate": 0.001, + "loss": 3.1616, + "step": 7274 + }, + { + "epoch": 0.30776715458160586, + "grad_norm": 0.2975056767463684, + "learning_rate": 0.001, + "loss": 2.1744, + "step": 7275 + }, + { + "epoch": 0.30780945934512227, + "grad_norm": 0.23097103834152222, + "learning_rate": 0.001, + "loss": 2.6325, + "step": 7276 + }, + { + "epoch": 0.3078517641086386, + "grad_norm": 0.20248940587043762, + "learning_rate": 0.001, + "loss": 2.3101, + "step": 7277 + }, + { + "epoch": 0.307894068872155, + "grad_norm": 0.21180301904678345, + "learning_rate": 0.001, + "loss": 2.1885, + "step": 7278 + }, + { + "epoch": 0.3079363736356714, + "grad_norm": 0.20690077543258667, + "learning_rate": 0.001, + "loss": 2.6467, + "step": 7279 + }, + { + "epoch": 0.30797867839918774, + "grad_norm": 0.19214750826358795, + "learning_rate": 0.001, + "loss": 2.5726, + "step": 7280 + }, + { + "epoch": 0.3080209831627041, + "grad_norm": 0.6494995355606079, + "learning_rate": 0.001, + "loss": 2.6384, + "step": 7281 + }, + { + "epoch": 0.3080632879262205, + "grad_norm": 0.19168926775455475, + "learning_rate": 0.001, + "loss": 2.1541, + "step": 7282 + }, + { + "epoch": 0.30810559268973686, + "grad_norm": 0.2198658436536789, + "learning_rate": 0.001, + "loss": 2.046, + "step": 7283 + }, + { + "epoch": 0.3081478974532532, + "grad_norm": 0.23235653340816498, + "learning_rate": 0.001, + "loss": 2.2637, + "step": 7284 + }, + { + "epoch": 0.3081902022167696, + "grad_norm": 0.48129093647003174, + "learning_rate": 0.001, + "loss": 2.2735, + "step": 7285 + }, + { + "epoch": 0.308232506980286, + "grad_norm": 0.1928509920835495, + "learning_rate": 0.001, + "loss": 2.4181, + "step": 7286 + }, + { + "epoch": 0.30827481174380233, + "grad_norm": 0.21180304884910583, + "learning_rate": 0.001, + "loss": 1.9391, + "step": 7287 + }, + { + "epoch": 0.30831711650731874, + "grad_norm": 0.5490750074386597, + "learning_rate": 0.001, + "loss": 2.0321, + "step": 7288 + }, + { + "epoch": 0.3083594212708351, + "grad_norm": 0.19193410873413086, + "learning_rate": 0.001, + "loss": 2.4427, + "step": 7289 + }, + { + "epoch": 0.30840172603435145, + "grad_norm": 0.1852174997329712, + "learning_rate": 0.001, + "loss": 1.7946, + "step": 7290 + }, + { + "epoch": 0.30844403079786786, + "grad_norm": 0.1935269683599472, + "learning_rate": 0.001, + "loss": 2.5008, + "step": 7291 + }, + { + "epoch": 0.3084863355613842, + "grad_norm": 0.5893869400024414, + "learning_rate": 0.001, + "loss": 1.7927, + "step": 7292 + }, + { + "epoch": 0.30852864032490057, + "grad_norm": 0.4908425807952881, + "learning_rate": 0.001, + "loss": 2.8926, + "step": 7293 + }, + { + "epoch": 0.308570945088417, + "grad_norm": 0.20681232213974, + "learning_rate": 0.001, + "loss": 1.8814, + "step": 7294 + }, + { + "epoch": 0.30861324985193334, + "grad_norm": 1.3799690008163452, + "learning_rate": 0.001, + "loss": 2.032, + "step": 7295 + }, + { + "epoch": 0.3086555546154497, + "grad_norm": 0.26583728194236755, + "learning_rate": 0.001, + "loss": 2.3084, + "step": 7296 + }, + { + "epoch": 0.30869785937896604, + "grad_norm": 0.21364301443099976, + "learning_rate": 0.001, + "loss": 2.5819, + "step": 7297 + }, + { + "epoch": 0.30874016414248245, + "grad_norm": 0.1767185777425766, + "learning_rate": 0.001, + "loss": 2.2752, + "step": 7298 + }, + { + "epoch": 0.3087824689059988, + "grad_norm": 0.19230739772319794, + "learning_rate": 0.001, + "loss": 2.4451, + "step": 7299 + }, + { + "epoch": 0.30882477366951516, + "grad_norm": 0.19486257433891296, + "learning_rate": 0.001, + "loss": 2.663, + "step": 7300 + }, + { + "epoch": 0.30886707843303157, + "grad_norm": 0.23837964236736298, + "learning_rate": 0.001, + "loss": 2.2232, + "step": 7301 + }, + { + "epoch": 0.3089093831965479, + "grad_norm": 0.2171543687582016, + "learning_rate": 0.001, + "loss": 2.9941, + "step": 7302 + }, + { + "epoch": 0.3089516879600643, + "grad_norm": 0.21793997287750244, + "learning_rate": 0.001, + "loss": 2.36, + "step": 7303 + }, + { + "epoch": 0.3089939927235807, + "grad_norm": 0.22793272137641907, + "learning_rate": 0.001, + "loss": 1.9991, + "step": 7304 + }, + { + "epoch": 0.30903629748709704, + "grad_norm": 0.23069630563259125, + "learning_rate": 0.001, + "loss": 1.94, + "step": 7305 + }, + { + "epoch": 0.3090786022506134, + "grad_norm": 0.25365108251571655, + "learning_rate": 0.001, + "loss": 2.1211, + "step": 7306 + }, + { + "epoch": 0.3091209070141298, + "grad_norm": 0.981309175491333, + "learning_rate": 0.001, + "loss": 2.3867, + "step": 7307 + }, + { + "epoch": 0.30916321177764616, + "grad_norm": 0.2560966908931732, + "learning_rate": 0.001, + "loss": 3.7126, + "step": 7308 + }, + { + "epoch": 0.3092055165411625, + "grad_norm": 0.18280810117721558, + "learning_rate": 0.001, + "loss": 1.8863, + "step": 7309 + }, + { + "epoch": 0.3092478213046789, + "grad_norm": 1.1143485307693481, + "learning_rate": 0.001, + "loss": 2.3522, + "step": 7310 + }, + { + "epoch": 0.3092901260681953, + "grad_norm": 0.42736926674842834, + "learning_rate": 0.001, + "loss": 2.6357, + "step": 7311 + }, + { + "epoch": 0.30933243083171164, + "grad_norm": 0.3606073260307312, + "learning_rate": 0.001, + "loss": 3.6859, + "step": 7312 + }, + { + "epoch": 0.30937473559522805, + "grad_norm": 0.5770615339279175, + "learning_rate": 0.001, + "loss": 2.6721, + "step": 7313 + }, + { + "epoch": 0.3094170403587444, + "grad_norm": 0.3912082314491272, + "learning_rate": 0.001, + "loss": 2.4154, + "step": 7314 + }, + { + "epoch": 0.30945934512226075, + "grad_norm": 0.21277593076229095, + "learning_rate": 0.001, + "loss": 1.8694, + "step": 7315 + }, + { + "epoch": 0.30950164988577716, + "grad_norm": 0.17572276294231415, + "learning_rate": 0.001, + "loss": 2.1676, + "step": 7316 + }, + { + "epoch": 0.3095439546492935, + "grad_norm": 0.20815758407115936, + "learning_rate": 0.001, + "loss": 3.3823, + "step": 7317 + }, + { + "epoch": 0.30958625941280987, + "grad_norm": 0.745972752571106, + "learning_rate": 0.001, + "loss": 2.1945, + "step": 7318 + }, + { + "epoch": 0.3096285641763263, + "grad_norm": 0.20606304705142975, + "learning_rate": 0.001, + "loss": 1.9767, + "step": 7319 + }, + { + "epoch": 0.30967086893984264, + "grad_norm": 0.19630566239356995, + "learning_rate": 0.001, + "loss": 1.8691, + "step": 7320 + }, + { + "epoch": 0.309713173703359, + "grad_norm": 0.48055750131607056, + "learning_rate": 0.001, + "loss": 1.9261, + "step": 7321 + }, + { + "epoch": 0.30975547846687534, + "grad_norm": 2.7202658653259277, + "learning_rate": 0.001, + "loss": 2.2182, + "step": 7322 + }, + { + "epoch": 0.30979778323039175, + "grad_norm": 0.5181911587715149, + "learning_rate": 0.001, + "loss": 4.2096, + "step": 7323 + }, + { + "epoch": 0.3098400879939081, + "grad_norm": 0.17982327938079834, + "learning_rate": 0.001, + "loss": 1.9361, + "step": 7324 + }, + { + "epoch": 0.30988239275742446, + "grad_norm": 0.19139717519283295, + "learning_rate": 0.001, + "loss": 2.3097, + "step": 7325 + }, + { + "epoch": 0.3099246975209409, + "grad_norm": 0.2525288462638855, + "learning_rate": 0.001, + "loss": 2.176, + "step": 7326 + }, + { + "epoch": 0.3099670022844572, + "grad_norm": 0.3544459640979767, + "learning_rate": 0.001, + "loss": 3.4766, + "step": 7327 + }, + { + "epoch": 0.3100093070479736, + "grad_norm": 0.20294804871082306, + "learning_rate": 0.001, + "loss": 1.8903, + "step": 7328 + }, + { + "epoch": 0.31005161181149, + "grad_norm": 2.3171162605285645, + "learning_rate": 0.001, + "loss": 2.4474, + "step": 7329 + }, + { + "epoch": 0.31009391657500635, + "grad_norm": 0.22611065208911896, + "learning_rate": 0.001, + "loss": 2.5749, + "step": 7330 + }, + { + "epoch": 0.3101362213385227, + "grad_norm": 1.3491861820220947, + "learning_rate": 0.001, + "loss": 1.9441, + "step": 7331 + }, + { + "epoch": 0.3101785261020391, + "grad_norm": 0.22623848915100098, + "learning_rate": 0.001, + "loss": 2.951, + "step": 7332 + }, + { + "epoch": 0.31022083086555546, + "grad_norm": 1.876600742340088, + "learning_rate": 0.001, + "loss": 1.9068, + "step": 7333 + }, + { + "epoch": 0.3102631356290718, + "grad_norm": 3.5861880779266357, + "learning_rate": 0.001, + "loss": 2.7255, + "step": 7334 + }, + { + "epoch": 0.3103054403925882, + "grad_norm": 0.2411908656358719, + "learning_rate": 0.001, + "loss": 1.5527, + "step": 7335 + }, + { + "epoch": 0.3103477451561046, + "grad_norm": 1.0028561353683472, + "learning_rate": 0.001, + "loss": 2.5271, + "step": 7336 + }, + { + "epoch": 0.31039004991962094, + "grad_norm": 0.23291008174419403, + "learning_rate": 0.001, + "loss": 2.0163, + "step": 7337 + }, + { + "epoch": 0.31043235468313735, + "grad_norm": 0.2906794846057892, + "learning_rate": 0.001, + "loss": 2.2704, + "step": 7338 + }, + { + "epoch": 0.3104746594466537, + "grad_norm": 0.7202196717262268, + "learning_rate": 0.001, + "loss": 2.9688, + "step": 7339 + }, + { + "epoch": 0.31051696421017005, + "grad_norm": 0.2299274504184723, + "learning_rate": 0.001, + "loss": 1.9663, + "step": 7340 + }, + { + "epoch": 0.31055926897368646, + "grad_norm": 0.25570148229599, + "learning_rate": 0.001, + "loss": 2.3401, + "step": 7341 + }, + { + "epoch": 0.3106015737372028, + "grad_norm": 0.18588940799236298, + "learning_rate": 0.001, + "loss": 2.0283, + "step": 7342 + }, + { + "epoch": 0.3106438785007192, + "grad_norm": 2.1442577838897705, + "learning_rate": 0.001, + "loss": 2.3488, + "step": 7343 + }, + { + "epoch": 0.3106861832642355, + "grad_norm": 2.738429069519043, + "learning_rate": 0.001, + "loss": 1.9623, + "step": 7344 + }, + { + "epoch": 0.31072848802775194, + "grad_norm": 0.20081764459609985, + "learning_rate": 0.001, + "loss": 2.3359, + "step": 7345 + }, + { + "epoch": 0.3107707927912683, + "grad_norm": 0.44594720005989075, + "learning_rate": 0.001, + "loss": 2.2799, + "step": 7346 + }, + { + "epoch": 0.31081309755478465, + "grad_norm": 0.26621145009994507, + "learning_rate": 0.001, + "loss": 3.0893, + "step": 7347 + }, + { + "epoch": 0.31085540231830106, + "grad_norm": 1.0780677795410156, + "learning_rate": 0.001, + "loss": 2.2325, + "step": 7348 + }, + { + "epoch": 0.3108977070818174, + "grad_norm": 0.33734557032585144, + "learning_rate": 0.001, + "loss": 1.9325, + "step": 7349 + }, + { + "epoch": 0.31094001184533376, + "grad_norm": 0.7444893717765808, + "learning_rate": 0.001, + "loss": 2.5934, + "step": 7350 + }, + { + "epoch": 0.3109823166088502, + "grad_norm": 0.2100495994091034, + "learning_rate": 0.001, + "loss": 2.7241, + "step": 7351 + }, + { + "epoch": 0.3110246213723665, + "grad_norm": 0.2134474217891693, + "learning_rate": 0.001, + "loss": 3.2037, + "step": 7352 + }, + { + "epoch": 0.3110669261358829, + "grad_norm": 0.21119053661823273, + "learning_rate": 0.001, + "loss": 2.3662, + "step": 7353 + }, + { + "epoch": 0.3111092308993993, + "grad_norm": 2.0805516242980957, + "learning_rate": 0.001, + "loss": 2.2039, + "step": 7354 + }, + { + "epoch": 0.31115153566291565, + "grad_norm": 0.245794877409935, + "learning_rate": 0.001, + "loss": 2.1602, + "step": 7355 + }, + { + "epoch": 0.311193840426432, + "grad_norm": 0.25053372979164124, + "learning_rate": 0.001, + "loss": 2.8372, + "step": 7356 + }, + { + "epoch": 0.3112361451899484, + "grad_norm": 0.2969318628311157, + "learning_rate": 0.001, + "loss": 2.893, + "step": 7357 + }, + { + "epoch": 0.31127844995346476, + "grad_norm": 0.20529517531394958, + "learning_rate": 0.001, + "loss": 1.6362, + "step": 7358 + }, + { + "epoch": 0.3113207547169811, + "grad_norm": 0.43395882844924927, + "learning_rate": 0.001, + "loss": 3.1913, + "step": 7359 + }, + { + "epoch": 0.31136305948049753, + "grad_norm": 0.18605674803256989, + "learning_rate": 0.001, + "loss": 2.0649, + "step": 7360 + }, + { + "epoch": 0.3114053642440139, + "grad_norm": 16.036762237548828, + "learning_rate": 0.001, + "loss": 1.9725, + "step": 7361 + }, + { + "epoch": 0.31144766900753024, + "grad_norm": 0.7822217345237732, + "learning_rate": 0.001, + "loss": 2.0036, + "step": 7362 + }, + { + "epoch": 0.31148997377104665, + "grad_norm": 2.8043062686920166, + "learning_rate": 0.001, + "loss": 2.5385, + "step": 7363 + }, + { + "epoch": 0.311532278534563, + "grad_norm": 0.18811993300914764, + "learning_rate": 0.001, + "loss": 1.9102, + "step": 7364 + }, + { + "epoch": 0.31157458329807936, + "grad_norm": 0.9021945595741272, + "learning_rate": 0.001, + "loss": 2.5355, + "step": 7365 + }, + { + "epoch": 0.3116168880615957, + "grad_norm": 0.247837632894516, + "learning_rate": 0.001, + "loss": 1.8979, + "step": 7366 + }, + { + "epoch": 0.3116591928251121, + "grad_norm": 0.4068886637687683, + "learning_rate": 0.001, + "loss": 2.5863, + "step": 7367 + }, + { + "epoch": 0.3117014975886285, + "grad_norm": 0.3906690776348114, + "learning_rate": 0.001, + "loss": 1.8295, + "step": 7368 + }, + { + "epoch": 0.3117438023521448, + "grad_norm": 0.18249556422233582, + "learning_rate": 0.001, + "loss": 2.2368, + "step": 7369 + }, + { + "epoch": 0.31178610711566124, + "grad_norm": 0.22424672544002533, + "learning_rate": 0.001, + "loss": 1.736, + "step": 7370 + }, + { + "epoch": 0.3118284118791776, + "grad_norm": 1.013100504875183, + "learning_rate": 0.001, + "loss": 2.3112, + "step": 7371 + }, + { + "epoch": 0.31187071664269395, + "grad_norm": 0.5906060338020325, + "learning_rate": 0.001, + "loss": 2.3438, + "step": 7372 + }, + { + "epoch": 0.31191302140621036, + "grad_norm": 0.22655218839645386, + "learning_rate": 0.001, + "loss": 1.9258, + "step": 7373 + }, + { + "epoch": 0.3119553261697267, + "grad_norm": 0.8316282629966736, + "learning_rate": 0.001, + "loss": 2.0015, + "step": 7374 + }, + { + "epoch": 0.31199763093324306, + "grad_norm": 0.3435095548629761, + "learning_rate": 0.001, + "loss": 2.3837, + "step": 7375 + }, + { + "epoch": 0.3120399356967595, + "grad_norm": 1.1091632843017578, + "learning_rate": 0.001, + "loss": 2.3188, + "step": 7376 + }, + { + "epoch": 0.31208224046027583, + "grad_norm": 0.35905349254608154, + "learning_rate": 0.001, + "loss": 2.6176, + "step": 7377 + }, + { + "epoch": 0.3121245452237922, + "grad_norm": 0.2939832806587219, + "learning_rate": 0.001, + "loss": 2.6581, + "step": 7378 + }, + { + "epoch": 0.3121668499873086, + "grad_norm": 0.21034960448741913, + "learning_rate": 0.001, + "loss": 1.7465, + "step": 7379 + }, + { + "epoch": 0.31220915475082495, + "grad_norm": 0.23007825016975403, + "learning_rate": 0.001, + "loss": 2.6893, + "step": 7380 + }, + { + "epoch": 0.3122514595143413, + "grad_norm": 0.1853979229927063, + "learning_rate": 0.001, + "loss": 1.961, + "step": 7381 + }, + { + "epoch": 0.3122937642778577, + "grad_norm": 4.700438499450684, + "learning_rate": 0.001, + "loss": 1.9521, + "step": 7382 + }, + { + "epoch": 0.31233606904137406, + "grad_norm": 0.210820272564888, + "learning_rate": 0.001, + "loss": 2.2501, + "step": 7383 + }, + { + "epoch": 0.3123783738048904, + "grad_norm": 0.2408655881881714, + "learning_rate": 0.001, + "loss": 2.5988, + "step": 7384 + }, + { + "epoch": 0.31242067856840683, + "grad_norm": 7.633557319641113, + "learning_rate": 0.001, + "loss": 2.0167, + "step": 7385 + }, + { + "epoch": 0.3124629833319232, + "grad_norm": 0.8851701617240906, + "learning_rate": 0.001, + "loss": 2.1323, + "step": 7386 + }, + { + "epoch": 0.31250528809543954, + "grad_norm": 0.4367370009422302, + "learning_rate": 0.001, + "loss": 2.7794, + "step": 7387 + }, + { + "epoch": 0.3125475928589559, + "grad_norm": 0.24811141192913055, + "learning_rate": 0.001, + "loss": 1.8663, + "step": 7388 + }, + { + "epoch": 0.3125898976224723, + "grad_norm": 0.24149896204471588, + "learning_rate": 0.001, + "loss": 2.0688, + "step": 7389 + }, + { + "epoch": 0.31263220238598866, + "grad_norm": 0.17393232882022858, + "learning_rate": 0.001, + "loss": 2.2648, + "step": 7390 + }, + { + "epoch": 0.312674507149505, + "grad_norm": 0.6203790903091431, + "learning_rate": 0.001, + "loss": 2.4404, + "step": 7391 + }, + { + "epoch": 0.3127168119130214, + "grad_norm": 0.21590745449066162, + "learning_rate": 0.001, + "loss": 2.0125, + "step": 7392 + }, + { + "epoch": 0.3127591166765378, + "grad_norm": 0.2051093429327011, + "learning_rate": 0.001, + "loss": 1.9774, + "step": 7393 + }, + { + "epoch": 0.31280142144005413, + "grad_norm": 0.9935820698738098, + "learning_rate": 0.001, + "loss": 1.9072, + "step": 7394 + }, + { + "epoch": 0.31284372620357054, + "grad_norm": 0.18654446303844452, + "learning_rate": 0.001, + "loss": 2.4551, + "step": 7395 + }, + { + "epoch": 0.3128860309670869, + "grad_norm": 0.19215311110019684, + "learning_rate": 0.001, + "loss": 1.9348, + "step": 7396 + }, + { + "epoch": 0.31292833573060325, + "grad_norm": 0.18548458814620972, + "learning_rate": 0.001, + "loss": 1.9123, + "step": 7397 + }, + { + "epoch": 0.31297064049411966, + "grad_norm": 0.2123180329799652, + "learning_rate": 0.001, + "loss": 2.5744, + "step": 7398 + }, + { + "epoch": 0.313012945257636, + "grad_norm": 0.20290523767471313, + "learning_rate": 0.001, + "loss": 2.0443, + "step": 7399 + }, + { + "epoch": 0.31305525002115236, + "grad_norm": 0.2106354683637619, + "learning_rate": 0.001, + "loss": 2.2838, + "step": 7400 + }, + { + "epoch": 0.3130975547846688, + "grad_norm": 0.9356090426445007, + "learning_rate": 0.001, + "loss": 2.6152, + "step": 7401 + }, + { + "epoch": 0.31313985954818513, + "grad_norm": 0.190611332654953, + "learning_rate": 0.001, + "loss": 3.1069, + "step": 7402 + }, + { + "epoch": 0.3131821643117015, + "grad_norm": 0.20556482672691345, + "learning_rate": 0.001, + "loss": 1.7953, + "step": 7403 + }, + { + "epoch": 0.3132244690752179, + "grad_norm": 0.2476384937763214, + "learning_rate": 0.001, + "loss": 1.7635, + "step": 7404 + }, + { + "epoch": 0.31326677383873425, + "grad_norm": 0.2387237846851349, + "learning_rate": 0.001, + "loss": 1.9521, + "step": 7405 + }, + { + "epoch": 0.3133090786022506, + "grad_norm": 1.1636114120483398, + "learning_rate": 0.001, + "loss": 1.8156, + "step": 7406 + }, + { + "epoch": 0.313351383365767, + "grad_norm": 0.36299633979797363, + "learning_rate": 0.001, + "loss": 1.984, + "step": 7407 + }, + { + "epoch": 0.31339368812928337, + "grad_norm": 0.20854444801807404, + "learning_rate": 0.001, + "loss": 2.8094, + "step": 7408 + }, + { + "epoch": 0.3134359928927997, + "grad_norm": 0.20307183265686035, + "learning_rate": 0.001, + "loss": 2.0501, + "step": 7409 + }, + { + "epoch": 0.3134782976563161, + "grad_norm": 0.37564030289649963, + "learning_rate": 0.001, + "loss": 1.8919, + "step": 7410 + }, + { + "epoch": 0.3135206024198325, + "grad_norm": 0.4626857340335846, + "learning_rate": 0.001, + "loss": 3.2498, + "step": 7411 + }, + { + "epoch": 0.31356290718334884, + "grad_norm": 1.1579762697219849, + "learning_rate": 0.001, + "loss": 2.1231, + "step": 7412 + }, + { + "epoch": 0.3136052119468652, + "grad_norm": 0.17268231511116028, + "learning_rate": 0.001, + "loss": 2.8643, + "step": 7413 + }, + { + "epoch": 0.3136475167103816, + "grad_norm": 0.2048521339893341, + "learning_rate": 0.001, + "loss": 3.5636, + "step": 7414 + }, + { + "epoch": 0.31368982147389796, + "grad_norm": 0.3608675003051758, + "learning_rate": 0.001, + "loss": 3.2036, + "step": 7415 + }, + { + "epoch": 0.3137321262374143, + "grad_norm": 0.24735359847545624, + "learning_rate": 0.001, + "loss": 2.1055, + "step": 7416 + }, + { + "epoch": 0.3137744310009307, + "grad_norm": 0.6630021333694458, + "learning_rate": 0.001, + "loss": 3.1071, + "step": 7417 + }, + { + "epoch": 0.3138167357644471, + "grad_norm": 0.19361117482185364, + "learning_rate": 0.001, + "loss": 2.294, + "step": 7418 + }, + { + "epoch": 0.31385904052796343, + "grad_norm": 0.26215800642967224, + "learning_rate": 0.001, + "loss": 2.9492, + "step": 7419 + }, + { + "epoch": 0.31390134529147984, + "grad_norm": 0.20346727967262268, + "learning_rate": 0.001, + "loss": 2.087, + "step": 7420 + }, + { + "epoch": 0.3139436500549962, + "grad_norm": 0.3235960304737091, + "learning_rate": 0.001, + "loss": 2.0558, + "step": 7421 + }, + { + "epoch": 0.31398595481851255, + "grad_norm": 0.2161642611026764, + "learning_rate": 0.001, + "loss": 2.1842, + "step": 7422 + }, + { + "epoch": 0.31402825958202896, + "grad_norm": 0.19181302189826965, + "learning_rate": 0.001, + "loss": 1.9424, + "step": 7423 + }, + { + "epoch": 0.3140705643455453, + "grad_norm": 19.93857765197754, + "learning_rate": 0.001, + "loss": 2.7717, + "step": 7424 + }, + { + "epoch": 0.31411286910906167, + "grad_norm": 0.2410607635974884, + "learning_rate": 0.001, + "loss": 2.7044, + "step": 7425 + }, + { + "epoch": 0.3141551738725781, + "grad_norm": 1.307002067565918, + "learning_rate": 0.001, + "loss": 2.1455, + "step": 7426 + }, + { + "epoch": 0.31419747863609443, + "grad_norm": 0.4844810962677002, + "learning_rate": 0.001, + "loss": 2.3331, + "step": 7427 + }, + { + "epoch": 0.3142397833996108, + "grad_norm": 0.28429657220840454, + "learning_rate": 0.001, + "loss": 2.89, + "step": 7428 + }, + { + "epoch": 0.3142820881631272, + "grad_norm": 0.8632631897926331, + "learning_rate": 0.001, + "loss": 1.8695, + "step": 7429 + }, + { + "epoch": 0.31432439292664355, + "grad_norm": 0.3193078637123108, + "learning_rate": 0.001, + "loss": 3.2732, + "step": 7430 + }, + { + "epoch": 0.3143666976901599, + "grad_norm": 0.20497079193592072, + "learning_rate": 0.001, + "loss": 2.1384, + "step": 7431 + }, + { + "epoch": 0.31440900245367626, + "grad_norm": 0.2516421973705292, + "learning_rate": 0.001, + "loss": 2.2816, + "step": 7432 + }, + { + "epoch": 0.31445130721719267, + "grad_norm": 1.0141093730926514, + "learning_rate": 0.001, + "loss": 2.6545, + "step": 7433 + }, + { + "epoch": 0.314493611980709, + "grad_norm": 0.18999618291854858, + "learning_rate": 0.001, + "loss": 2.0399, + "step": 7434 + }, + { + "epoch": 0.3145359167442254, + "grad_norm": 0.456870436668396, + "learning_rate": 0.001, + "loss": 2.8804, + "step": 7435 + }, + { + "epoch": 0.3145782215077418, + "grad_norm": 0.46013543009757996, + "learning_rate": 0.001, + "loss": 2.5783, + "step": 7436 + }, + { + "epoch": 0.31462052627125814, + "grad_norm": 0.20753584802150726, + "learning_rate": 0.001, + "loss": 2.4055, + "step": 7437 + }, + { + "epoch": 0.3146628310347745, + "grad_norm": 0.23144669830799103, + "learning_rate": 0.001, + "loss": 2.9632, + "step": 7438 + }, + { + "epoch": 0.3147051357982909, + "grad_norm": 0.3479333817958832, + "learning_rate": 0.001, + "loss": 1.792, + "step": 7439 + }, + { + "epoch": 0.31474744056180726, + "grad_norm": 4.283021926879883, + "learning_rate": 0.001, + "loss": 2.1915, + "step": 7440 + }, + { + "epoch": 0.3147897453253236, + "grad_norm": 7.6823649406433105, + "learning_rate": 0.001, + "loss": 2.5411, + "step": 7441 + }, + { + "epoch": 0.31483205008884, + "grad_norm": 0.6836051344871521, + "learning_rate": 0.001, + "loss": 2.3475, + "step": 7442 + }, + { + "epoch": 0.3148743548523564, + "grad_norm": 1.2467254400253296, + "learning_rate": 0.001, + "loss": 2.448, + "step": 7443 + }, + { + "epoch": 0.31491665961587273, + "grad_norm": 0.4439103603363037, + "learning_rate": 0.001, + "loss": 2.1324, + "step": 7444 + }, + { + "epoch": 0.31495896437938914, + "grad_norm": 0.23518876731395721, + "learning_rate": 0.001, + "loss": 2.3276, + "step": 7445 + }, + { + "epoch": 0.3150012691429055, + "grad_norm": 0.5438497066497803, + "learning_rate": 0.001, + "loss": 1.9945, + "step": 7446 + }, + { + "epoch": 0.31504357390642185, + "grad_norm": 1.6677125692367554, + "learning_rate": 0.001, + "loss": 1.5225, + "step": 7447 + }, + { + "epoch": 0.31508587866993826, + "grad_norm": 0.20245139300823212, + "learning_rate": 0.001, + "loss": 1.973, + "step": 7448 + }, + { + "epoch": 0.3151281834334546, + "grad_norm": 0.24195541441440582, + "learning_rate": 0.001, + "loss": 1.9298, + "step": 7449 + }, + { + "epoch": 0.31517048819697097, + "grad_norm": 0.21246738731861115, + "learning_rate": 0.001, + "loss": 2.0905, + "step": 7450 + }, + { + "epoch": 0.3152127929604874, + "grad_norm": 0.2219218611717224, + "learning_rate": 0.001, + "loss": 1.9244, + "step": 7451 + }, + { + "epoch": 0.31525509772400373, + "grad_norm": 0.20514735579490662, + "learning_rate": 0.001, + "loss": 2.4173, + "step": 7452 + }, + { + "epoch": 0.3152974024875201, + "grad_norm": 0.1975151151418686, + "learning_rate": 0.001, + "loss": 2.7215, + "step": 7453 + }, + { + "epoch": 0.3153397072510365, + "grad_norm": 0.2547701299190521, + "learning_rate": 0.001, + "loss": 2.3066, + "step": 7454 + }, + { + "epoch": 0.31538201201455285, + "grad_norm": 0.2303609549999237, + "learning_rate": 0.001, + "loss": 2.6705, + "step": 7455 + }, + { + "epoch": 0.3154243167780692, + "grad_norm": 0.2129909247159958, + "learning_rate": 0.001, + "loss": 1.9109, + "step": 7456 + }, + { + "epoch": 0.31546662154158556, + "grad_norm": 0.2366720288991928, + "learning_rate": 0.001, + "loss": 2.8221, + "step": 7457 + }, + { + "epoch": 0.31550892630510197, + "grad_norm": 0.23550964891910553, + "learning_rate": 0.001, + "loss": 3.5912, + "step": 7458 + }, + { + "epoch": 0.3155512310686183, + "grad_norm": 0.21231484413146973, + "learning_rate": 0.001, + "loss": 1.7912, + "step": 7459 + }, + { + "epoch": 0.3155935358321347, + "grad_norm": 0.22212854027748108, + "learning_rate": 0.001, + "loss": 2.5323, + "step": 7460 + }, + { + "epoch": 0.3156358405956511, + "grad_norm": 1.3847850561141968, + "learning_rate": 0.001, + "loss": 2.2599, + "step": 7461 + }, + { + "epoch": 0.31567814535916744, + "grad_norm": 0.6198214292526245, + "learning_rate": 0.001, + "loss": 2.3198, + "step": 7462 + }, + { + "epoch": 0.3157204501226838, + "grad_norm": 0.20347964763641357, + "learning_rate": 0.001, + "loss": 1.7696, + "step": 7463 + }, + { + "epoch": 0.3157627548862002, + "grad_norm": 0.3482247292995453, + "learning_rate": 0.001, + "loss": 1.8097, + "step": 7464 + }, + { + "epoch": 0.31580505964971656, + "grad_norm": 0.22890900075435638, + "learning_rate": 0.001, + "loss": 2.2547, + "step": 7465 + }, + { + "epoch": 0.3158473644132329, + "grad_norm": 0.21213634312152863, + "learning_rate": 0.001, + "loss": 2.1473, + "step": 7466 + }, + { + "epoch": 0.3158896691767493, + "grad_norm": 0.2835163176059723, + "learning_rate": 0.001, + "loss": 2.5594, + "step": 7467 + }, + { + "epoch": 0.3159319739402657, + "grad_norm": 0.22125250101089478, + "learning_rate": 0.001, + "loss": 2.9808, + "step": 7468 + }, + { + "epoch": 0.31597427870378203, + "grad_norm": 2.557072401046753, + "learning_rate": 0.001, + "loss": 3.2332, + "step": 7469 + }, + { + "epoch": 0.31601658346729844, + "grad_norm": 4.832421779632568, + "learning_rate": 0.001, + "loss": 2.2446, + "step": 7470 + }, + { + "epoch": 0.3160588882308148, + "grad_norm": 0.19227334856987, + "learning_rate": 0.001, + "loss": 2.5917, + "step": 7471 + }, + { + "epoch": 0.31610119299433115, + "grad_norm": 0.23523429036140442, + "learning_rate": 0.001, + "loss": 2.5136, + "step": 7472 + }, + { + "epoch": 0.31614349775784756, + "grad_norm": 0.22452674806118011, + "learning_rate": 0.001, + "loss": 2.0461, + "step": 7473 + }, + { + "epoch": 0.3161858025213639, + "grad_norm": 0.43354332447052, + "learning_rate": 0.001, + "loss": 3.0801, + "step": 7474 + }, + { + "epoch": 0.31622810728488027, + "grad_norm": 0.21420817077159882, + "learning_rate": 0.001, + "loss": 1.6362, + "step": 7475 + }, + { + "epoch": 0.3162704120483967, + "grad_norm": 5.511373043060303, + "learning_rate": 0.001, + "loss": 2.1619, + "step": 7476 + }, + { + "epoch": 0.31631271681191303, + "grad_norm": 0.647841215133667, + "learning_rate": 0.001, + "loss": 2.2618, + "step": 7477 + }, + { + "epoch": 0.3163550215754294, + "grad_norm": 0.4985596537590027, + "learning_rate": 0.001, + "loss": 2.6925, + "step": 7478 + }, + { + "epoch": 0.31639732633894574, + "grad_norm": 0.6579802632331848, + "learning_rate": 0.001, + "loss": 2.7019, + "step": 7479 + }, + { + "epoch": 0.31643963110246215, + "grad_norm": 2.1017627716064453, + "learning_rate": 0.001, + "loss": 2.4639, + "step": 7480 + }, + { + "epoch": 0.3164819358659785, + "grad_norm": 0.5756150484085083, + "learning_rate": 0.001, + "loss": 1.7458, + "step": 7481 + }, + { + "epoch": 0.31652424062949486, + "grad_norm": 0.24982336163520813, + "learning_rate": 0.001, + "loss": 3.0768, + "step": 7482 + }, + { + "epoch": 0.31656654539301127, + "grad_norm": 0.5327016711235046, + "learning_rate": 0.001, + "loss": 2.1362, + "step": 7483 + }, + { + "epoch": 0.3166088501565276, + "grad_norm": 7.926521301269531, + "learning_rate": 0.001, + "loss": 2.0401, + "step": 7484 + }, + { + "epoch": 0.316651154920044, + "grad_norm": 0.2336919903755188, + "learning_rate": 0.001, + "loss": 2.5418, + "step": 7485 + }, + { + "epoch": 0.3166934596835604, + "grad_norm": 1.4902650117874146, + "learning_rate": 0.001, + "loss": 2.053, + "step": 7486 + }, + { + "epoch": 0.31673576444707674, + "grad_norm": 0.3811624348163605, + "learning_rate": 0.001, + "loss": 1.8232, + "step": 7487 + }, + { + "epoch": 0.3167780692105931, + "grad_norm": 0.37704822421073914, + "learning_rate": 0.001, + "loss": 2.9608, + "step": 7488 + }, + { + "epoch": 0.3168203739741095, + "grad_norm": 0.35828453302383423, + "learning_rate": 0.001, + "loss": 1.9027, + "step": 7489 + }, + { + "epoch": 0.31686267873762586, + "grad_norm": 0.22911661863327026, + "learning_rate": 0.001, + "loss": 2.7649, + "step": 7490 + }, + { + "epoch": 0.3169049835011422, + "grad_norm": 0.2562866806983948, + "learning_rate": 0.001, + "loss": 2.4122, + "step": 7491 + }, + { + "epoch": 0.3169472882646586, + "grad_norm": 0.24646058678627014, + "learning_rate": 0.001, + "loss": 2.0809, + "step": 7492 + }, + { + "epoch": 0.316989593028175, + "grad_norm": 1.1058540344238281, + "learning_rate": 0.001, + "loss": 2.0834, + "step": 7493 + }, + { + "epoch": 0.31703189779169133, + "grad_norm": 0.20626600086688995, + "learning_rate": 0.001, + "loss": 2.2533, + "step": 7494 + }, + { + "epoch": 0.31707420255520774, + "grad_norm": 0.32516318559646606, + "learning_rate": 0.001, + "loss": 3.1424, + "step": 7495 + }, + { + "epoch": 0.3171165073187241, + "grad_norm": 0.2439742237329483, + "learning_rate": 0.001, + "loss": 2.2094, + "step": 7496 + }, + { + "epoch": 0.31715881208224045, + "grad_norm": 0.34792065620422363, + "learning_rate": 0.001, + "loss": 1.9122, + "step": 7497 + }, + { + "epoch": 0.31720111684575686, + "grad_norm": 0.23733051121234894, + "learning_rate": 0.001, + "loss": 1.936, + "step": 7498 + }, + { + "epoch": 0.3172434216092732, + "grad_norm": 0.24519434571266174, + "learning_rate": 0.001, + "loss": 2.036, + "step": 7499 + }, + { + "epoch": 0.31728572637278957, + "grad_norm": 0.4781532883644104, + "learning_rate": 0.001, + "loss": 3.1208, + "step": 7500 + }, + { + "epoch": 0.3173280311363059, + "grad_norm": 0.19272254407405853, + "learning_rate": 0.001, + "loss": 2.552, + "step": 7501 + }, + { + "epoch": 0.31737033589982233, + "grad_norm": 0.32179877161979675, + "learning_rate": 0.001, + "loss": 2.8888, + "step": 7502 + }, + { + "epoch": 0.3174126406633387, + "grad_norm": 0.7427499890327454, + "learning_rate": 0.001, + "loss": 2.615, + "step": 7503 + }, + { + "epoch": 0.31745494542685504, + "grad_norm": 0.31697311997413635, + "learning_rate": 0.001, + "loss": 2.3646, + "step": 7504 + }, + { + "epoch": 0.31749725019037145, + "grad_norm": 0.16490371525287628, + "learning_rate": 0.001, + "loss": 2.6595, + "step": 7505 + }, + { + "epoch": 0.3175395549538878, + "grad_norm": 0.2627149224281311, + "learning_rate": 0.001, + "loss": 4.131, + "step": 7506 + }, + { + "epoch": 0.31758185971740416, + "grad_norm": 0.8162776827812195, + "learning_rate": 0.001, + "loss": 1.8456, + "step": 7507 + }, + { + "epoch": 0.31762416448092057, + "grad_norm": 0.3852151036262512, + "learning_rate": 0.001, + "loss": 3.4449, + "step": 7508 + }, + { + "epoch": 0.3176664692444369, + "grad_norm": 1.2743273973464966, + "learning_rate": 0.001, + "loss": 2.073, + "step": 7509 + }, + { + "epoch": 0.3177087740079533, + "grad_norm": 0.23382671177387238, + "learning_rate": 0.001, + "loss": 1.7473, + "step": 7510 + }, + { + "epoch": 0.3177510787714697, + "grad_norm": 0.25078028440475464, + "learning_rate": 0.001, + "loss": 2.8671, + "step": 7511 + }, + { + "epoch": 0.31779338353498604, + "grad_norm": 0.18183058500289917, + "learning_rate": 0.001, + "loss": 2.3517, + "step": 7512 + }, + { + "epoch": 0.3178356882985024, + "grad_norm": 0.20492641627788544, + "learning_rate": 0.001, + "loss": 1.6892, + "step": 7513 + }, + { + "epoch": 0.3178779930620188, + "grad_norm": 0.19157011806964874, + "learning_rate": 0.001, + "loss": 1.9524, + "step": 7514 + }, + { + "epoch": 0.31792029782553516, + "grad_norm": 0.20687264204025269, + "learning_rate": 0.001, + "loss": 2.091, + "step": 7515 + }, + { + "epoch": 0.3179626025890515, + "grad_norm": 0.3631476163864136, + "learning_rate": 0.001, + "loss": 2.5191, + "step": 7516 + }, + { + "epoch": 0.3180049073525679, + "grad_norm": 2.8174641132354736, + "learning_rate": 0.001, + "loss": 2.2289, + "step": 7517 + }, + { + "epoch": 0.3180472121160843, + "grad_norm": 0.21862949430942535, + "learning_rate": 0.001, + "loss": 2.8965, + "step": 7518 + }, + { + "epoch": 0.31808951687960063, + "grad_norm": 1.085720419883728, + "learning_rate": 0.001, + "loss": 2.622, + "step": 7519 + }, + { + "epoch": 0.31813182164311704, + "grad_norm": 0.18160761892795563, + "learning_rate": 0.001, + "loss": 2.2491, + "step": 7520 + }, + { + "epoch": 0.3181741264066334, + "grad_norm": 0.8975410461425781, + "learning_rate": 0.001, + "loss": 2.8174, + "step": 7521 + }, + { + "epoch": 0.31821643117014975, + "grad_norm": 1.3470760583877563, + "learning_rate": 0.001, + "loss": 1.7925, + "step": 7522 + }, + { + "epoch": 0.3182587359336661, + "grad_norm": 1.8409106731414795, + "learning_rate": 0.001, + "loss": 2.8657, + "step": 7523 + }, + { + "epoch": 0.3183010406971825, + "grad_norm": 0.17481787502765656, + "learning_rate": 0.001, + "loss": 2.4448, + "step": 7524 + }, + { + "epoch": 0.31834334546069887, + "grad_norm": 0.2951485514640808, + "learning_rate": 0.001, + "loss": 2.678, + "step": 7525 + }, + { + "epoch": 0.3183856502242152, + "grad_norm": 0.22341516613960266, + "learning_rate": 0.001, + "loss": 2.1617, + "step": 7526 + }, + { + "epoch": 0.31842795498773163, + "grad_norm": 0.2601909935474396, + "learning_rate": 0.001, + "loss": 2.1188, + "step": 7527 + }, + { + "epoch": 0.318470259751248, + "grad_norm": 0.2952847480773926, + "learning_rate": 0.001, + "loss": 2.1086, + "step": 7528 + }, + { + "epoch": 0.31851256451476434, + "grad_norm": 0.4443856477737427, + "learning_rate": 0.001, + "loss": 3.2807, + "step": 7529 + }, + { + "epoch": 0.31855486927828075, + "grad_norm": 0.235245943069458, + "learning_rate": 0.001, + "loss": 2.9082, + "step": 7530 + }, + { + "epoch": 0.3185971740417971, + "grad_norm": 1.0847169160842896, + "learning_rate": 0.001, + "loss": 2.0038, + "step": 7531 + }, + { + "epoch": 0.31863947880531346, + "grad_norm": 0.245633065700531, + "learning_rate": 0.001, + "loss": 2.6333, + "step": 7532 + }, + { + "epoch": 0.31868178356882987, + "grad_norm": 0.2640596330165863, + "learning_rate": 0.001, + "loss": 3.2857, + "step": 7533 + }, + { + "epoch": 0.3187240883323462, + "grad_norm": 0.5016381740570068, + "learning_rate": 0.001, + "loss": 3.0914, + "step": 7534 + }, + { + "epoch": 0.3187663930958626, + "grad_norm": 0.19767212867736816, + "learning_rate": 0.001, + "loss": 1.7764, + "step": 7535 + }, + { + "epoch": 0.318808697859379, + "grad_norm": 0.2402283400297165, + "learning_rate": 0.001, + "loss": 2.7833, + "step": 7536 + }, + { + "epoch": 0.31885100262289534, + "grad_norm": 4.479025363922119, + "learning_rate": 0.001, + "loss": 2.2062, + "step": 7537 + }, + { + "epoch": 0.3188933073864117, + "grad_norm": 0.584766685962677, + "learning_rate": 0.001, + "loss": 1.7729, + "step": 7538 + }, + { + "epoch": 0.3189356121499281, + "grad_norm": 0.5840973258018494, + "learning_rate": 0.001, + "loss": 1.8462, + "step": 7539 + }, + { + "epoch": 0.31897791691344446, + "grad_norm": 0.2561521828174591, + "learning_rate": 0.001, + "loss": 2.3039, + "step": 7540 + }, + { + "epoch": 0.3190202216769608, + "grad_norm": 4.742803573608398, + "learning_rate": 0.001, + "loss": 2.0092, + "step": 7541 + }, + { + "epoch": 0.3190625264404772, + "grad_norm": 0.24484741687774658, + "learning_rate": 0.001, + "loss": 2.2363, + "step": 7542 + }, + { + "epoch": 0.3191048312039936, + "grad_norm": 0.70278000831604, + "learning_rate": 0.001, + "loss": 3.5423, + "step": 7543 + }, + { + "epoch": 0.31914713596750993, + "grad_norm": 0.22903594374656677, + "learning_rate": 0.001, + "loss": 1.8414, + "step": 7544 + }, + { + "epoch": 0.3191894407310263, + "grad_norm": 0.5062339901924133, + "learning_rate": 0.001, + "loss": 2.1526, + "step": 7545 + }, + { + "epoch": 0.3192317454945427, + "grad_norm": 0.2158339023590088, + "learning_rate": 0.001, + "loss": 1.9886, + "step": 7546 + }, + { + "epoch": 0.31927405025805905, + "grad_norm": 1.0523641109466553, + "learning_rate": 0.001, + "loss": 2.6195, + "step": 7547 + }, + { + "epoch": 0.3193163550215754, + "grad_norm": 0.2197253257036209, + "learning_rate": 0.001, + "loss": 1.5022, + "step": 7548 + }, + { + "epoch": 0.3193586597850918, + "grad_norm": 2.6306278705596924, + "learning_rate": 0.001, + "loss": 2.143, + "step": 7549 + }, + { + "epoch": 0.31940096454860817, + "grad_norm": 0.7273126840591431, + "learning_rate": 0.001, + "loss": 2.4476, + "step": 7550 + }, + { + "epoch": 0.3194432693121245, + "grad_norm": 0.29238003492355347, + "learning_rate": 0.001, + "loss": 1.9445, + "step": 7551 + }, + { + "epoch": 0.31948557407564093, + "grad_norm": 0.6033421158790588, + "learning_rate": 0.001, + "loss": 2.5735, + "step": 7552 + }, + { + "epoch": 0.3195278788391573, + "grad_norm": 0.723403811454773, + "learning_rate": 0.001, + "loss": 2.9264, + "step": 7553 + }, + { + "epoch": 0.31957018360267364, + "grad_norm": 0.3378523886203766, + "learning_rate": 0.001, + "loss": 2.0737, + "step": 7554 + }, + { + "epoch": 0.31961248836619005, + "grad_norm": 0.2882983684539795, + "learning_rate": 0.001, + "loss": 3.3238, + "step": 7555 + }, + { + "epoch": 0.3196547931297064, + "grad_norm": 1.7969506978988647, + "learning_rate": 0.001, + "loss": 2.2955, + "step": 7556 + }, + { + "epoch": 0.31969709789322276, + "grad_norm": 1.0519235134124756, + "learning_rate": 0.001, + "loss": 2.9223, + "step": 7557 + }, + { + "epoch": 0.31973940265673917, + "grad_norm": 0.26675179600715637, + "learning_rate": 0.001, + "loss": 3.1042, + "step": 7558 + }, + { + "epoch": 0.3197817074202555, + "grad_norm": 0.210018128156662, + "learning_rate": 0.001, + "loss": 2.4462, + "step": 7559 + }, + { + "epoch": 0.3198240121837719, + "grad_norm": 0.24346289038658142, + "learning_rate": 0.001, + "loss": 2.5506, + "step": 7560 + }, + { + "epoch": 0.3198663169472883, + "grad_norm": 0.2894313633441925, + "learning_rate": 0.001, + "loss": 2.6206, + "step": 7561 + }, + { + "epoch": 0.31990862171080464, + "grad_norm": 0.23805563151836395, + "learning_rate": 0.001, + "loss": 2.1749, + "step": 7562 + }, + { + "epoch": 0.319950926474321, + "grad_norm": 0.2397831827402115, + "learning_rate": 0.001, + "loss": 2.7243, + "step": 7563 + }, + { + "epoch": 0.3199932312378374, + "grad_norm": 0.23759059607982635, + "learning_rate": 0.001, + "loss": 1.9441, + "step": 7564 + }, + { + "epoch": 0.32003553600135376, + "grad_norm": 1.2827054262161255, + "learning_rate": 0.001, + "loss": 2.5177, + "step": 7565 + }, + { + "epoch": 0.3200778407648701, + "grad_norm": 0.3205539882183075, + "learning_rate": 0.001, + "loss": 3.2377, + "step": 7566 + }, + { + "epoch": 0.3201201455283865, + "grad_norm": 0.3002196252346039, + "learning_rate": 0.001, + "loss": 2.2226, + "step": 7567 + }, + { + "epoch": 0.3201624502919029, + "grad_norm": 0.35749757289886475, + "learning_rate": 0.001, + "loss": 2.0588, + "step": 7568 + }, + { + "epoch": 0.32020475505541923, + "grad_norm": 0.4479646384716034, + "learning_rate": 0.001, + "loss": 2.8293, + "step": 7569 + }, + { + "epoch": 0.3202470598189356, + "grad_norm": 1.4561418294906616, + "learning_rate": 0.001, + "loss": 1.874, + "step": 7570 + }, + { + "epoch": 0.320289364582452, + "grad_norm": 0.25509679317474365, + "learning_rate": 0.001, + "loss": 1.9602, + "step": 7571 + }, + { + "epoch": 0.32033166934596835, + "grad_norm": 0.33940833806991577, + "learning_rate": 0.001, + "loss": 3.8938, + "step": 7572 + }, + { + "epoch": 0.3203739741094847, + "grad_norm": 0.23563840985298157, + "learning_rate": 0.001, + "loss": 1.9939, + "step": 7573 + }, + { + "epoch": 0.3204162788730011, + "grad_norm": 0.5354148745536804, + "learning_rate": 0.001, + "loss": 2.3731, + "step": 7574 + }, + { + "epoch": 0.32045858363651747, + "grad_norm": 0.6357294917106628, + "learning_rate": 0.001, + "loss": 2.4846, + "step": 7575 + }, + { + "epoch": 0.3205008884000338, + "grad_norm": 0.24302570521831512, + "learning_rate": 0.001, + "loss": 2.7975, + "step": 7576 + }, + { + "epoch": 0.32054319316355023, + "grad_norm": 0.46470755338668823, + "learning_rate": 0.001, + "loss": 2.1249, + "step": 7577 + }, + { + "epoch": 0.3205854979270666, + "grad_norm": 0.24483340978622437, + "learning_rate": 0.001, + "loss": 3.0459, + "step": 7578 + }, + { + "epoch": 0.32062780269058294, + "grad_norm": 0.22136029601097107, + "learning_rate": 0.001, + "loss": 1.8347, + "step": 7579 + }, + { + "epoch": 0.32067010745409935, + "grad_norm": 7.803838729858398, + "learning_rate": 0.001, + "loss": 1.7036, + "step": 7580 + }, + { + "epoch": 0.3207124122176157, + "grad_norm": 0.2764716148376465, + "learning_rate": 0.001, + "loss": 2.6748, + "step": 7581 + }, + { + "epoch": 0.32075471698113206, + "grad_norm": 0.44170138239860535, + "learning_rate": 0.001, + "loss": 2.1851, + "step": 7582 + }, + { + "epoch": 0.32079702174464847, + "grad_norm": 0.6049759387969971, + "learning_rate": 0.001, + "loss": 2.012, + "step": 7583 + }, + { + "epoch": 0.3208393265081648, + "grad_norm": 0.18830320239067078, + "learning_rate": 0.001, + "loss": 2.1773, + "step": 7584 + }, + { + "epoch": 0.3208816312716812, + "grad_norm": 0.3015429377555847, + "learning_rate": 0.001, + "loss": 2.196, + "step": 7585 + }, + { + "epoch": 0.3209239360351976, + "grad_norm": 0.20359709858894348, + "learning_rate": 0.001, + "loss": 2.4333, + "step": 7586 + }, + { + "epoch": 0.32096624079871394, + "grad_norm": 0.4667384922504425, + "learning_rate": 0.001, + "loss": 2.0174, + "step": 7587 + }, + { + "epoch": 0.3210085455622303, + "grad_norm": 1.0664136409759521, + "learning_rate": 0.001, + "loss": 2.0681, + "step": 7588 + }, + { + "epoch": 0.3210508503257467, + "grad_norm": 0.37264055013656616, + "learning_rate": 0.001, + "loss": 2.3549, + "step": 7589 + }, + { + "epoch": 0.32109315508926306, + "grad_norm": 0.2381688803434372, + "learning_rate": 0.001, + "loss": 2.0513, + "step": 7590 + }, + { + "epoch": 0.3211354598527794, + "grad_norm": 1.4251641035079956, + "learning_rate": 0.001, + "loss": 2.5208, + "step": 7591 + }, + { + "epoch": 0.32117776461629577, + "grad_norm": 0.6149337291717529, + "learning_rate": 0.001, + "loss": 2.0792, + "step": 7592 + }, + { + "epoch": 0.3212200693798122, + "grad_norm": 0.18450385332107544, + "learning_rate": 0.001, + "loss": 3.026, + "step": 7593 + }, + { + "epoch": 0.32126237414332853, + "grad_norm": 0.26451027393341064, + "learning_rate": 0.001, + "loss": 2.5821, + "step": 7594 + }, + { + "epoch": 0.3213046789068449, + "grad_norm": 0.2782851755619049, + "learning_rate": 0.001, + "loss": 2.0855, + "step": 7595 + }, + { + "epoch": 0.3213469836703613, + "grad_norm": 0.20386971533298492, + "learning_rate": 0.001, + "loss": 1.6014, + "step": 7596 + }, + { + "epoch": 0.32138928843387765, + "grad_norm": 1.3683080673217773, + "learning_rate": 0.001, + "loss": 3.028, + "step": 7597 + }, + { + "epoch": 0.321431593197394, + "grad_norm": 1.1451998949050903, + "learning_rate": 0.001, + "loss": 2.1981, + "step": 7598 + }, + { + "epoch": 0.3214738979609104, + "grad_norm": 0.43921521306037903, + "learning_rate": 0.001, + "loss": 2.2234, + "step": 7599 + }, + { + "epoch": 0.32151620272442677, + "grad_norm": 0.2476237416267395, + "learning_rate": 0.001, + "loss": 3.1997, + "step": 7600 + }, + { + "epoch": 0.3215585074879431, + "grad_norm": 0.21488797664642334, + "learning_rate": 0.001, + "loss": 2.2913, + "step": 7601 + }, + { + "epoch": 0.32160081225145953, + "grad_norm": 0.2706213891506195, + "learning_rate": 0.001, + "loss": 2.4037, + "step": 7602 + }, + { + "epoch": 0.3216431170149759, + "grad_norm": 0.23860187828540802, + "learning_rate": 0.001, + "loss": 2.116, + "step": 7603 + }, + { + "epoch": 0.32168542177849224, + "grad_norm": 0.23283398151397705, + "learning_rate": 0.001, + "loss": 2.6682, + "step": 7604 + }, + { + "epoch": 0.32172772654200865, + "grad_norm": 0.4454127550125122, + "learning_rate": 0.001, + "loss": 2.2326, + "step": 7605 + }, + { + "epoch": 0.321770031305525, + "grad_norm": 0.2188045084476471, + "learning_rate": 0.001, + "loss": 2.3459, + "step": 7606 + }, + { + "epoch": 0.32181233606904136, + "grad_norm": 0.2107754945755005, + "learning_rate": 0.001, + "loss": 1.8558, + "step": 7607 + }, + { + "epoch": 0.32185464083255777, + "grad_norm": 0.24413500726222992, + "learning_rate": 0.001, + "loss": 2.6748, + "step": 7608 + }, + { + "epoch": 0.3218969455960741, + "grad_norm": 0.1848825365304947, + "learning_rate": 0.001, + "loss": 1.7373, + "step": 7609 + }, + { + "epoch": 0.3219392503595905, + "grad_norm": 0.4209091365337372, + "learning_rate": 0.001, + "loss": 2.9954, + "step": 7610 + }, + { + "epoch": 0.3219815551231069, + "grad_norm": 1.6776453256607056, + "learning_rate": 0.001, + "loss": 3.9771, + "step": 7611 + }, + { + "epoch": 0.32202385988662324, + "grad_norm": 0.21313358843326569, + "learning_rate": 0.001, + "loss": 2.2837, + "step": 7612 + }, + { + "epoch": 0.3220661646501396, + "grad_norm": 0.4032410979270935, + "learning_rate": 0.001, + "loss": 2.0786, + "step": 7613 + }, + { + "epoch": 0.32210846941365595, + "grad_norm": 0.18987895548343658, + "learning_rate": 0.001, + "loss": 2.3311, + "step": 7614 + }, + { + "epoch": 0.32215077417717236, + "grad_norm": 7.441311359405518, + "learning_rate": 0.001, + "loss": 2.2271, + "step": 7615 + }, + { + "epoch": 0.3221930789406887, + "grad_norm": 0.20409175753593445, + "learning_rate": 0.001, + "loss": 2.9311, + "step": 7616 + }, + { + "epoch": 0.32223538370420507, + "grad_norm": 0.19200804829597473, + "learning_rate": 0.001, + "loss": 1.5997, + "step": 7617 + }, + { + "epoch": 0.3222776884677215, + "grad_norm": 0.2112921178340912, + "learning_rate": 0.001, + "loss": 1.853, + "step": 7618 + }, + { + "epoch": 0.32231999323123783, + "grad_norm": 0.5670382976531982, + "learning_rate": 0.001, + "loss": 2.1103, + "step": 7619 + }, + { + "epoch": 0.3223622979947542, + "grad_norm": 1.1795889139175415, + "learning_rate": 0.001, + "loss": 1.5708, + "step": 7620 + }, + { + "epoch": 0.3224046027582706, + "grad_norm": 0.19757835566997528, + "learning_rate": 0.001, + "loss": 2.6667, + "step": 7621 + }, + { + "epoch": 0.32244690752178695, + "grad_norm": 0.3224509060382843, + "learning_rate": 0.001, + "loss": 2.1601, + "step": 7622 + }, + { + "epoch": 0.3224892122853033, + "grad_norm": 3.6661248207092285, + "learning_rate": 0.001, + "loss": 2.2917, + "step": 7623 + }, + { + "epoch": 0.3225315170488197, + "grad_norm": 0.26291152834892273, + "learning_rate": 0.001, + "loss": 2.5686, + "step": 7624 + }, + { + "epoch": 0.32257382181233607, + "grad_norm": 0.2524521052837372, + "learning_rate": 0.001, + "loss": 2.3755, + "step": 7625 + }, + { + "epoch": 0.3226161265758524, + "grad_norm": 0.22929400205612183, + "learning_rate": 0.001, + "loss": 2.6761, + "step": 7626 + }, + { + "epoch": 0.32265843133936883, + "grad_norm": 0.2434311956167221, + "learning_rate": 0.001, + "loss": 2.5041, + "step": 7627 + }, + { + "epoch": 0.3227007361028852, + "grad_norm": 0.3644380569458008, + "learning_rate": 0.001, + "loss": 2.0181, + "step": 7628 + }, + { + "epoch": 0.32274304086640154, + "grad_norm": 0.8881004452705383, + "learning_rate": 0.001, + "loss": 2.3105, + "step": 7629 + }, + { + "epoch": 0.32278534562991795, + "grad_norm": 1.9201459884643555, + "learning_rate": 0.001, + "loss": 2.3565, + "step": 7630 + }, + { + "epoch": 0.3228276503934343, + "grad_norm": 0.22108227014541626, + "learning_rate": 0.001, + "loss": 1.9022, + "step": 7631 + }, + { + "epoch": 0.32286995515695066, + "grad_norm": 0.5766029953956604, + "learning_rate": 0.001, + "loss": 1.7875, + "step": 7632 + }, + { + "epoch": 0.32291225992046707, + "grad_norm": 0.23156306147575378, + "learning_rate": 0.001, + "loss": 2.9351, + "step": 7633 + }, + { + "epoch": 0.3229545646839834, + "grad_norm": 0.20917218923568726, + "learning_rate": 0.001, + "loss": 1.7399, + "step": 7634 + }, + { + "epoch": 0.3229968694474998, + "grad_norm": 0.2092711329460144, + "learning_rate": 0.001, + "loss": 2.2154, + "step": 7635 + }, + { + "epoch": 0.32303917421101613, + "grad_norm": 1.8786296844482422, + "learning_rate": 0.001, + "loss": 1.755, + "step": 7636 + }, + { + "epoch": 0.32308147897453254, + "grad_norm": 5.879979133605957, + "learning_rate": 0.001, + "loss": 2.5399, + "step": 7637 + }, + { + "epoch": 0.3231237837380489, + "grad_norm": 1.0984258651733398, + "learning_rate": 0.001, + "loss": 2.2887, + "step": 7638 + }, + { + "epoch": 0.32316608850156525, + "grad_norm": 0.2462794929742813, + "learning_rate": 0.001, + "loss": 1.8517, + "step": 7639 + }, + { + "epoch": 0.32320839326508166, + "grad_norm": 0.22659048438072205, + "learning_rate": 0.001, + "loss": 1.6135, + "step": 7640 + }, + { + "epoch": 0.323250698028598, + "grad_norm": 0.2993631064891815, + "learning_rate": 0.001, + "loss": 1.9505, + "step": 7641 + }, + { + "epoch": 0.32329300279211437, + "grad_norm": 0.43561097979545593, + "learning_rate": 0.001, + "loss": 2.3488, + "step": 7642 + }, + { + "epoch": 0.3233353075556308, + "grad_norm": 0.2653340995311737, + "learning_rate": 0.001, + "loss": 3.7175, + "step": 7643 + }, + { + "epoch": 0.32337761231914713, + "grad_norm": 0.2567252218723297, + "learning_rate": 0.001, + "loss": 2.252, + "step": 7644 + }, + { + "epoch": 0.3234199170826635, + "grad_norm": 0.2634236514568329, + "learning_rate": 0.001, + "loss": 2.5914, + "step": 7645 + }, + { + "epoch": 0.3234622218461799, + "grad_norm": 0.21972741186618805, + "learning_rate": 0.001, + "loss": 1.9794, + "step": 7646 + }, + { + "epoch": 0.32350452660969625, + "grad_norm": 0.37712669372558594, + "learning_rate": 0.001, + "loss": 2.7509, + "step": 7647 + }, + { + "epoch": 0.3235468313732126, + "grad_norm": 0.7490605711936951, + "learning_rate": 0.001, + "loss": 2.8407, + "step": 7648 + }, + { + "epoch": 0.323589136136729, + "grad_norm": 0.38894638419151306, + "learning_rate": 0.001, + "loss": 2.2041, + "step": 7649 + }, + { + "epoch": 0.32363144090024537, + "grad_norm": 0.4028621315956116, + "learning_rate": 0.001, + "loss": 2.0586, + "step": 7650 + }, + { + "epoch": 0.3236737456637617, + "grad_norm": 2.477647542953491, + "learning_rate": 0.001, + "loss": 3.4487, + "step": 7651 + }, + { + "epoch": 0.32371605042727813, + "grad_norm": 0.26158443093299866, + "learning_rate": 0.001, + "loss": 2.305, + "step": 7652 + }, + { + "epoch": 0.3237583551907945, + "grad_norm": 0.2559559643268585, + "learning_rate": 0.001, + "loss": 2.3287, + "step": 7653 + }, + { + "epoch": 0.32380065995431084, + "grad_norm": 0.2276749461889267, + "learning_rate": 0.001, + "loss": 2.2801, + "step": 7654 + }, + { + "epoch": 0.32384296471782725, + "grad_norm": 0.2165883332490921, + "learning_rate": 0.001, + "loss": 2.4184, + "step": 7655 + }, + { + "epoch": 0.3238852694813436, + "grad_norm": 1.6801161766052246, + "learning_rate": 0.001, + "loss": 2.2247, + "step": 7656 + }, + { + "epoch": 0.32392757424485996, + "grad_norm": 0.23615668714046478, + "learning_rate": 0.001, + "loss": 3.3127, + "step": 7657 + }, + { + "epoch": 0.3239698790083763, + "grad_norm": 0.20711363852024078, + "learning_rate": 0.001, + "loss": 2.502, + "step": 7658 + }, + { + "epoch": 0.3240121837718927, + "grad_norm": 0.20527245104312897, + "learning_rate": 0.001, + "loss": 1.7657, + "step": 7659 + }, + { + "epoch": 0.3240544885354091, + "grad_norm": 0.18716438114643097, + "learning_rate": 0.001, + "loss": 2.4437, + "step": 7660 + }, + { + "epoch": 0.32409679329892543, + "grad_norm": 0.25125744938850403, + "learning_rate": 0.001, + "loss": 2.0741, + "step": 7661 + }, + { + "epoch": 0.32413909806244184, + "grad_norm": 0.1993718147277832, + "learning_rate": 0.001, + "loss": 3.3968, + "step": 7662 + }, + { + "epoch": 0.3241814028259582, + "grad_norm": 0.1872645914554596, + "learning_rate": 0.001, + "loss": 2.0158, + "step": 7663 + }, + { + "epoch": 0.32422370758947455, + "grad_norm": 0.19530023634433746, + "learning_rate": 0.001, + "loss": 1.9639, + "step": 7664 + }, + { + "epoch": 0.32426601235299096, + "grad_norm": 0.2583797872066498, + "learning_rate": 0.001, + "loss": 1.7254, + "step": 7665 + }, + { + "epoch": 0.3243083171165073, + "grad_norm": 2.2203147411346436, + "learning_rate": 0.001, + "loss": 2.6183, + "step": 7666 + }, + { + "epoch": 0.32435062188002367, + "grad_norm": 0.19957537949085236, + "learning_rate": 0.001, + "loss": 2.202, + "step": 7667 + }, + { + "epoch": 0.3243929266435401, + "grad_norm": 0.2118930220603943, + "learning_rate": 0.001, + "loss": 3.7787, + "step": 7668 + }, + { + "epoch": 0.32443523140705643, + "grad_norm": 0.21599611639976501, + "learning_rate": 0.001, + "loss": 2.5277, + "step": 7669 + }, + { + "epoch": 0.3244775361705728, + "grad_norm": 3.4629387855529785, + "learning_rate": 0.001, + "loss": 1.6031, + "step": 7670 + }, + { + "epoch": 0.3245198409340892, + "grad_norm": 0.29955390095710754, + "learning_rate": 0.001, + "loss": 1.6658, + "step": 7671 + }, + { + "epoch": 0.32456214569760555, + "grad_norm": 0.20181111991405487, + "learning_rate": 0.001, + "loss": 1.9581, + "step": 7672 + }, + { + "epoch": 0.3246044504611219, + "grad_norm": 0.8020063638687134, + "learning_rate": 0.001, + "loss": 2.6512, + "step": 7673 + }, + { + "epoch": 0.3246467552246383, + "grad_norm": 0.17198266088962555, + "learning_rate": 0.001, + "loss": 2.0142, + "step": 7674 + }, + { + "epoch": 0.32468905998815467, + "grad_norm": 0.25351399183273315, + "learning_rate": 0.001, + "loss": 2.301, + "step": 7675 + }, + { + "epoch": 0.324731364751671, + "grad_norm": 0.28780749440193176, + "learning_rate": 0.001, + "loss": 1.8023, + "step": 7676 + }, + { + "epoch": 0.32477366951518744, + "grad_norm": 8.679863929748535, + "learning_rate": 0.001, + "loss": 2.0141, + "step": 7677 + }, + { + "epoch": 0.3248159742787038, + "grad_norm": 1.8151581287384033, + "learning_rate": 0.001, + "loss": 1.7588, + "step": 7678 + }, + { + "epoch": 0.32485827904222014, + "grad_norm": 0.19262565672397614, + "learning_rate": 0.001, + "loss": 1.7654, + "step": 7679 + }, + { + "epoch": 0.32490058380573655, + "grad_norm": 0.2888406217098236, + "learning_rate": 0.001, + "loss": 2.3853, + "step": 7680 + }, + { + "epoch": 0.3249428885692529, + "grad_norm": 0.5525591373443604, + "learning_rate": 0.001, + "loss": 2.9946, + "step": 7681 + }, + { + "epoch": 0.32498519333276926, + "grad_norm": 0.319832444190979, + "learning_rate": 0.001, + "loss": 2.0593, + "step": 7682 + }, + { + "epoch": 0.3250274980962856, + "grad_norm": 0.3268459737300873, + "learning_rate": 0.001, + "loss": 2.1757, + "step": 7683 + }, + { + "epoch": 0.325069802859802, + "grad_norm": 2.9600412845611572, + "learning_rate": 0.001, + "loss": 2.3893, + "step": 7684 + }, + { + "epoch": 0.3251121076233184, + "grad_norm": 0.9362667202949524, + "learning_rate": 0.001, + "loss": 2.2399, + "step": 7685 + }, + { + "epoch": 0.32515441238683473, + "grad_norm": 0.509442150592804, + "learning_rate": 0.001, + "loss": 2.1812, + "step": 7686 + }, + { + "epoch": 0.32519671715035114, + "grad_norm": 0.6412566900253296, + "learning_rate": 0.001, + "loss": 2.117, + "step": 7687 + }, + { + "epoch": 0.3252390219138675, + "grad_norm": 0.25562146306037903, + "learning_rate": 0.001, + "loss": 2.2778, + "step": 7688 + }, + { + "epoch": 0.32528132667738385, + "grad_norm": 0.20876967906951904, + "learning_rate": 0.001, + "loss": 1.7797, + "step": 7689 + }, + { + "epoch": 0.32532363144090026, + "grad_norm": 0.2410915642976761, + "learning_rate": 0.001, + "loss": 1.6797, + "step": 7690 + }, + { + "epoch": 0.3253659362044166, + "grad_norm": 0.26328933238983154, + "learning_rate": 0.001, + "loss": 2.514, + "step": 7691 + }, + { + "epoch": 0.32540824096793297, + "grad_norm": 0.23529529571533203, + "learning_rate": 0.001, + "loss": 2.1006, + "step": 7692 + }, + { + "epoch": 0.3254505457314494, + "grad_norm": 0.31655412912368774, + "learning_rate": 0.001, + "loss": 2.7627, + "step": 7693 + }, + { + "epoch": 0.32549285049496574, + "grad_norm": 0.6058570742607117, + "learning_rate": 0.001, + "loss": 2.5461, + "step": 7694 + }, + { + "epoch": 0.3255351552584821, + "grad_norm": 0.21855738759040833, + "learning_rate": 0.001, + "loss": 1.8018, + "step": 7695 + }, + { + "epoch": 0.3255774600219985, + "grad_norm": 0.22689220309257507, + "learning_rate": 0.001, + "loss": 2.122, + "step": 7696 + }, + { + "epoch": 0.32561976478551485, + "grad_norm": 0.22445081174373627, + "learning_rate": 0.001, + "loss": 1.8022, + "step": 7697 + }, + { + "epoch": 0.3256620695490312, + "grad_norm": 0.2299547791481018, + "learning_rate": 0.001, + "loss": 2.2955, + "step": 7698 + }, + { + "epoch": 0.3257043743125476, + "grad_norm": 0.5000811219215393, + "learning_rate": 0.001, + "loss": 2.6577, + "step": 7699 + }, + { + "epoch": 0.32574667907606397, + "grad_norm": 2.1569273471832275, + "learning_rate": 0.001, + "loss": 2.2027, + "step": 7700 + }, + { + "epoch": 0.3257889838395803, + "grad_norm": 0.25455546379089355, + "learning_rate": 0.001, + "loss": 2.4252, + "step": 7701 + }, + { + "epoch": 0.32583128860309674, + "grad_norm": 0.21060289442539215, + "learning_rate": 0.001, + "loss": 2.2641, + "step": 7702 + }, + { + "epoch": 0.3258735933666131, + "grad_norm": 0.1968967467546463, + "learning_rate": 0.001, + "loss": 2.5999, + "step": 7703 + }, + { + "epoch": 0.32591589813012944, + "grad_norm": 0.31110793352127075, + "learning_rate": 0.001, + "loss": 2.2596, + "step": 7704 + }, + { + "epoch": 0.3259582028936458, + "grad_norm": 2.084595203399658, + "learning_rate": 0.001, + "loss": 2.7048, + "step": 7705 + }, + { + "epoch": 0.3260005076571622, + "grad_norm": 0.22130149602890015, + "learning_rate": 0.001, + "loss": 1.5586, + "step": 7706 + }, + { + "epoch": 0.32604281242067856, + "grad_norm": 0.2199217677116394, + "learning_rate": 0.001, + "loss": 1.9871, + "step": 7707 + }, + { + "epoch": 0.3260851171841949, + "grad_norm": 0.19622403383255005, + "learning_rate": 0.001, + "loss": 1.8239, + "step": 7708 + }, + { + "epoch": 0.3261274219477113, + "grad_norm": 0.23500365018844604, + "learning_rate": 0.001, + "loss": 2.7668, + "step": 7709 + }, + { + "epoch": 0.3261697267112277, + "grad_norm": 0.4715394973754883, + "learning_rate": 0.001, + "loss": 3.1356, + "step": 7710 + }, + { + "epoch": 0.32621203147474404, + "grad_norm": 0.6608024835586548, + "learning_rate": 0.001, + "loss": 2.1273, + "step": 7711 + }, + { + "epoch": 0.32625433623826045, + "grad_norm": 0.2573659420013428, + "learning_rate": 0.001, + "loss": 2.0172, + "step": 7712 + }, + { + "epoch": 0.3262966410017768, + "grad_norm": 3.757869243621826, + "learning_rate": 0.001, + "loss": 2.2053, + "step": 7713 + }, + { + "epoch": 0.32633894576529315, + "grad_norm": 0.36734887957572937, + "learning_rate": 0.001, + "loss": 2.188, + "step": 7714 + }, + { + "epoch": 0.32638125052880956, + "grad_norm": 0.8497750163078308, + "learning_rate": 0.001, + "loss": 3.191, + "step": 7715 + }, + { + "epoch": 0.3264235552923259, + "grad_norm": 1.0639852285385132, + "learning_rate": 0.001, + "loss": 2.1099, + "step": 7716 + }, + { + "epoch": 0.32646586005584227, + "grad_norm": 0.19022592902183533, + "learning_rate": 0.001, + "loss": 1.6883, + "step": 7717 + }, + { + "epoch": 0.3265081648193587, + "grad_norm": 0.19042854011058807, + "learning_rate": 0.001, + "loss": 2.3408, + "step": 7718 + }, + { + "epoch": 0.32655046958287504, + "grad_norm": 0.2183213233947754, + "learning_rate": 0.001, + "loss": 2.0902, + "step": 7719 + }, + { + "epoch": 0.3265927743463914, + "grad_norm": 0.2381005436182022, + "learning_rate": 0.001, + "loss": 2.5141, + "step": 7720 + }, + { + "epoch": 0.3266350791099078, + "grad_norm": 0.2116747945547104, + "learning_rate": 0.001, + "loss": 1.8768, + "step": 7721 + }, + { + "epoch": 0.32667738387342415, + "grad_norm": 0.43030673265457153, + "learning_rate": 0.001, + "loss": 3.133, + "step": 7722 + }, + { + "epoch": 0.3267196886369405, + "grad_norm": 3.445666551589966, + "learning_rate": 0.001, + "loss": 1.8229, + "step": 7723 + }, + { + "epoch": 0.3267619934004569, + "grad_norm": 1.1962391138076782, + "learning_rate": 0.001, + "loss": 2.3559, + "step": 7724 + }, + { + "epoch": 0.3268042981639733, + "grad_norm": 0.4430636763572693, + "learning_rate": 0.001, + "loss": 2.3892, + "step": 7725 + }, + { + "epoch": 0.3268466029274896, + "grad_norm": 0.21626804769039154, + "learning_rate": 0.001, + "loss": 2.156, + "step": 7726 + }, + { + "epoch": 0.326888907691006, + "grad_norm": 0.21447373926639557, + "learning_rate": 0.001, + "loss": 1.9008, + "step": 7727 + }, + { + "epoch": 0.3269312124545224, + "grad_norm": 0.2664428949356079, + "learning_rate": 0.001, + "loss": 2.1011, + "step": 7728 + }, + { + "epoch": 0.32697351721803875, + "grad_norm": 0.19551439583301544, + "learning_rate": 0.001, + "loss": 2.5206, + "step": 7729 + }, + { + "epoch": 0.3270158219815551, + "grad_norm": 0.1879674643278122, + "learning_rate": 0.001, + "loss": 1.8628, + "step": 7730 + }, + { + "epoch": 0.3270581267450715, + "grad_norm": 0.20081692934036255, + "learning_rate": 0.001, + "loss": 1.8979, + "step": 7731 + }, + { + "epoch": 0.32710043150858786, + "grad_norm": 0.178413525223732, + "learning_rate": 0.001, + "loss": 2.3565, + "step": 7732 + }, + { + "epoch": 0.3271427362721042, + "grad_norm": 0.384456992149353, + "learning_rate": 0.001, + "loss": 2.2038, + "step": 7733 + }, + { + "epoch": 0.3271850410356206, + "grad_norm": 0.29158031940460205, + "learning_rate": 0.001, + "loss": 2.7088, + "step": 7734 + }, + { + "epoch": 0.327227345799137, + "grad_norm": 0.17012235522270203, + "learning_rate": 0.001, + "loss": 1.5478, + "step": 7735 + }, + { + "epoch": 0.32726965056265334, + "grad_norm": 0.39595428109169006, + "learning_rate": 0.001, + "loss": 2.7015, + "step": 7736 + }, + { + "epoch": 0.32731195532616975, + "grad_norm": 0.58181232213974, + "learning_rate": 0.001, + "loss": 1.7588, + "step": 7737 + }, + { + "epoch": 0.3273542600896861, + "grad_norm": 0.20064769685268402, + "learning_rate": 0.001, + "loss": 1.8287, + "step": 7738 + }, + { + "epoch": 0.32739656485320245, + "grad_norm": 0.2265099585056305, + "learning_rate": 0.001, + "loss": 2.289, + "step": 7739 + }, + { + "epoch": 0.32743886961671886, + "grad_norm": 0.2981756031513214, + "learning_rate": 0.001, + "loss": 2.1712, + "step": 7740 + }, + { + "epoch": 0.3274811743802352, + "grad_norm": 1.7779628038406372, + "learning_rate": 0.001, + "loss": 3.4488, + "step": 7741 + }, + { + "epoch": 0.3275234791437516, + "grad_norm": 0.2585592269897461, + "learning_rate": 0.001, + "loss": 2.4738, + "step": 7742 + }, + { + "epoch": 0.327565783907268, + "grad_norm": 0.9747766256332397, + "learning_rate": 0.001, + "loss": 3.1085, + "step": 7743 + }, + { + "epoch": 0.32760808867078434, + "grad_norm": 6.341303825378418, + "learning_rate": 0.001, + "loss": 2.3515, + "step": 7744 + }, + { + "epoch": 0.3276503934343007, + "grad_norm": 0.28042072057724, + "learning_rate": 0.001, + "loss": 1.8256, + "step": 7745 + }, + { + "epoch": 0.3276926981978171, + "grad_norm": 0.21017926931381226, + "learning_rate": 0.001, + "loss": 2.2271, + "step": 7746 + }, + { + "epoch": 0.32773500296133345, + "grad_norm": 0.259328156709671, + "learning_rate": 0.001, + "loss": 2.6266, + "step": 7747 + }, + { + "epoch": 0.3277773077248498, + "grad_norm": 0.16318392753601074, + "learning_rate": 0.001, + "loss": 1.8191, + "step": 7748 + }, + { + "epoch": 0.32781961248836616, + "grad_norm": 0.27874764800071716, + "learning_rate": 0.001, + "loss": 1.9758, + "step": 7749 + }, + { + "epoch": 0.3278619172518826, + "grad_norm": 0.2953718304634094, + "learning_rate": 0.001, + "loss": 3.011, + "step": 7750 + }, + { + "epoch": 0.3279042220153989, + "grad_norm": 0.3343517780303955, + "learning_rate": 0.001, + "loss": 2.1521, + "step": 7751 + }, + { + "epoch": 0.3279465267789153, + "grad_norm": 0.25767582654953003, + "learning_rate": 0.001, + "loss": 2.672, + "step": 7752 + }, + { + "epoch": 0.3279888315424317, + "grad_norm": 0.21539786458015442, + "learning_rate": 0.001, + "loss": 2.3109, + "step": 7753 + }, + { + "epoch": 0.32803113630594805, + "grad_norm": 0.22561384737491608, + "learning_rate": 0.001, + "loss": 2.5035, + "step": 7754 + }, + { + "epoch": 0.3280734410694644, + "grad_norm": 0.2779701054096222, + "learning_rate": 0.001, + "loss": 2.5272, + "step": 7755 + }, + { + "epoch": 0.3281157458329808, + "grad_norm": 0.19811490178108215, + "learning_rate": 0.001, + "loss": 1.7952, + "step": 7756 + }, + { + "epoch": 0.32815805059649716, + "grad_norm": 0.20879384875297546, + "learning_rate": 0.001, + "loss": 1.2741, + "step": 7757 + }, + { + "epoch": 0.3282003553600135, + "grad_norm": 0.8248985409736633, + "learning_rate": 0.001, + "loss": 1.9017, + "step": 7758 + }, + { + "epoch": 0.32824266012352993, + "grad_norm": 0.20458753407001495, + "learning_rate": 0.001, + "loss": 1.9238, + "step": 7759 + }, + { + "epoch": 0.3282849648870463, + "grad_norm": 0.32681992650032043, + "learning_rate": 0.001, + "loss": 3.2441, + "step": 7760 + }, + { + "epoch": 0.32832726965056264, + "grad_norm": 6.789393424987793, + "learning_rate": 0.001, + "loss": 2.4838, + "step": 7761 + }, + { + "epoch": 0.32836957441407905, + "grad_norm": 0.49927785992622375, + "learning_rate": 0.001, + "loss": 2.888, + "step": 7762 + }, + { + "epoch": 0.3284118791775954, + "grad_norm": 2.3099589347839355, + "learning_rate": 0.001, + "loss": 2.6976, + "step": 7763 + }, + { + "epoch": 0.32845418394111175, + "grad_norm": 2.1838667392730713, + "learning_rate": 0.001, + "loss": 2.2512, + "step": 7764 + }, + { + "epoch": 0.32849648870462816, + "grad_norm": 0.39657530188560486, + "learning_rate": 0.001, + "loss": 2.7728, + "step": 7765 + }, + { + "epoch": 0.3285387934681445, + "grad_norm": 0.2155158668756485, + "learning_rate": 0.001, + "loss": 2.2733, + "step": 7766 + }, + { + "epoch": 0.3285810982316609, + "grad_norm": 0.46449387073516846, + "learning_rate": 0.001, + "loss": 3.3834, + "step": 7767 + }, + { + "epoch": 0.3286234029951773, + "grad_norm": 0.20302799344062805, + "learning_rate": 0.001, + "loss": 2.1242, + "step": 7768 + }, + { + "epoch": 0.32866570775869364, + "grad_norm": 0.1897967904806137, + "learning_rate": 0.001, + "loss": 2.2495, + "step": 7769 + }, + { + "epoch": 0.32870801252221, + "grad_norm": 0.17885787785053253, + "learning_rate": 0.001, + "loss": 1.9588, + "step": 7770 + }, + { + "epoch": 0.32875031728572635, + "grad_norm": 0.25244593620300293, + "learning_rate": 0.001, + "loss": 2.8224, + "step": 7771 + }, + { + "epoch": 0.32879262204924276, + "grad_norm": 0.17659065127372742, + "learning_rate": 0.001, + "loss": 1.5998, + "step": 7772 + }, + { + "epoch": 0.3288349268127591, + "grad_norm": 0.19181221723556519, + "learning_rate": 0.001, + "loss": 1.8639, + "step": 7773 + }, + { + "epoch": 0.32887723157627546, + "grad_norm": 0.3274793028831482, + "learning_rate": 0.001, + "loss": 2.3154, + "step": 7774 + }, + { + "epoch": 0.3289195363397919, + "grad_norm": 0.21050041913986206, + "learning_rate": 0.001, + "loss": 2.1191, + "step": 7775 + }, + { + "epoch": 0.32896184110330823, + "grad_norm": 0.17837364971637726, + "learning_rate": 0.001, + "loss": 1.7701, + "step": 7776 + }, + { + "epoch": 0.3290041458668246, + "grad_norm": 0.6773764491081238, + "learning_rate": 0.001, + "loss": 2.6309, + "step": 7777 + }, + { + "epoch": 0.329046450630341, + "grad_norm": 2.140749216079712, + "learning_rate": 0.001, + "loss": 3.2878, + "step": 7778 + }, + { + "epoch": 0.32908875539385735, + "grad_norm": 4.552440166473389, + "learning_rate": 0.001, + "loss": 2.2279, + "step": 7779 + }, + { + "epoch": 0.3291310601573737, + "grad_norm": 0.37157878279685974, + "learning_rate": 0.001, + "loss": 2.3027, + "step": 7780 + }, + { + "epoch": 0.3291733649208901, + "grad_norm": 0.25460559129714966, + "learning_rate": 0.001, + "loss": 2.0759, + "step": 7781 + }, + { + "epoch": 0.32921566968440646, + "grad_norm": 1.6842424869537354, + "learning_rate": 0.001, + "loss": 2.857, + "step": 7782 + }, + { + "epoch": 0.3292579744479228, + "grad_norm": 0.2335149198770523, + "learning_rate": 0.001, + "loss": 1.7248, + "step": 7783 + }, + { + "epoch": 0.32930027921143923, + "grad_norm": 0.1995423585176468, + "learning_rate": 0.001, + "loss": 1.7434, + "step": 7784 + }, + { + "epoch": 0.3293425839749556, + "grad_norm": 0.2805491089820862, + "learning_rate": 0.001, + "loss": 2.6196, + "step": 7785 + }, + { + "epoch": 0.32938488873847194, + "grad_norm": 0.7963484525680542, + "learning_rate": 0.001, + "loss": 2.7678, + "step": 7786 + }, + { + "epoch": 0.32942719350198835, + "grad_norm": 0.2746239900588989, + "learning_rate": 0.001, + "loss": 2.0567, + "step": 7787 + }, + { + "epoch": 0.3294694982655047, + "grad_norm": 0.6149387955665588, + "learning_rate": 0.001, + "loss": 2.4013, + "step": 7788 + }, + { + "epoch": 0.32951180302902106, + "grad_norm": 0.5365425944328308, + "learning_rate": 0.001, + "loss": 2.2887, + "step": 7789 + }, + { + "epoch": 0.32955410779253747, + "grad_norm": 0.22529155015945435, + "learning_rate": 0.001, + "loss": 2.5844, + "step": 7790 + }, + { + "epoch": 0.3295964125560538, + "grad_norm": 0.28875237703323364, + "learning_rate": 0.001, + "loss": 1.7827, + "step": 7791 + }, + { + "epoch": 0.3296387173195702, + "grad_norm": 0.20266857743263245, + "learning_rate": 0.001, + "loss": 2.5674, + "step": 7792 + }, + { + "epoch": 0.32968102208308653, + "grad_norm": 0.16716927289962769, + "learning_rate": 0.001, + "loss": 2.2583, + "step": 7793 + }, + { + "epoch": 0.32972332684660294, + "grad_norm": 0.25711336731910706, + "learning_rate": 0.001, + "loss": 2.9724, + "step": 7794 + }, + { + "epoch": 0.3297656316101193, + "grad_norm": 4.552552700042725, + "learning_rate": 0.001, + "loss": 2.6996, + "step": 7795 + }, + { + "epoch": 0.32980793637363565, + "grad_norm": 0.5906350612640381, + "learning_rate": 0.001, + "loss": 2.1236, + "step": 7796 + }, + { + "epoch": 0.32985024113715206, + "grad_norm": 0.2682677209377289, + "learning_rate": 0.001, + "loss": 2.4385, + "step": 7797 + }, + { + "epoch": 0.3298925459006684, + "grad_norm": 0.25783124566078186, + "learning_rate": 0.001, + "loss": 2.3089, + "step": 7798 + }, + { + "epoch": 0.32993485066418476, + "grad_norm": 0.18664343655109406, + "learning_rate": 0.001, + "loss": 2.0034, + "step": 7799 + }, + { + "epoch": 0.3299771554277012, + "grad_norm": 0.1630748063325882, + "learning_rate": 0.001, + "loss": 1.6401, + "step": 7800 + }, + { + "epoch": 0.33001946019121753, + "grad_norm": 0.30394893884658813, + "learning_rate": 0.001, + "loss": 2.3112, + "step": 7801 + }, + { + "epoch": 0.3300617649547339, + "grad_norm": 0.19138090312480927, + "learning_rate": 0.001, + "loss": 2.5035, + "step": 7802 + }, + { + "epoch": 0.3301040697182503, + "grad_norm": 0.16754819452762604, + "learning_rate": 0.001, + "loss": 1.6879, + "step": 7803 + }, + { + "epoch": 0.33014637448176665, + "grad_norm": 0.8580173850059509, + "learning_rate": 0.001, + "loss": 2.4691, + "step": 7804 + }, + { + "epoch": 0.330188679245283, + "grad_norm": 0.176096573472023, + "learning_rate": 0.001, + "loss": 1.8854, + "step": 7805 + }, + { + "epoch": 0.3302309840087994, + "grad_norm": 0.20934566855430603, + "learning_rate": 0.001, + "loss": 2.0659, + "step": 7806 + }, + { + "epoch": 0.33027328877231577, + "grad_norm": 5.532082557678223, + "learning_rate": 0.001, + "loss": 1.759, + "step": 7807 + }, + { + "epoch": 0.3303155935358321, + "grad_norm": 0.19679847359657288, + "learning_rate": 0.001, + "loss": 2.3835, + "step": 7808 + }, + { + "epoch": 0.33035789829934853, + "grad_norm": 0.19240596890449524, + "learning_rate": 0.001, + "loss": 2.0052, + "step": 7809 + }, + { + "epoch": 0.3304002030628649, + "grad_norm": 0.19509436190128326, + "learning_rate": 0.001, + "loss": 2.426, + "step": 7810 + }, + { + "epoch": 0.33044250782638124, + "grad_norm": 0.19828227162361145, + "learning_rate": 0.001, + "loss": 2.1933, + "step": 7811 + }, + { + "epoch": 0.33048481258989765, + "grad_norm": 2.229938507080078, + "learning_rate": 0.001, + "loss": 1.7994, + "step": 7812 + }, + { + "epoch": 0.330527117353414, + "grad_norm": 0.2188158929347992, + "learning_rate": 0.001, + "loss": 1.8493, + "step": 7813 + }, + { + "epoch": 0.33056942211693036, + "grad_norm": 0.21723762154579163, + "learning_rate": 0.001, + "loss": 1.8718, + "step": 7814 + }, + { + "epoch": 0.33061172688044677, + "grad_norm": 0.5078144669532776, + "learning_rate": 0.001, + "loss": 2.1031, + "step": 7815 + }, + { + "epoch": 0.3306540316439631, + "grad_norm": 0.24922381341457367, + "learning_rate": 0.001, + "loss": 2.1853, + "step": 7816 + }, + { + "epoch": 0.3306963364074795, + "grad_norm": 0.19677498936653137, + "learning_rate": 0.001, + "loss": 2.4259, + "step": 7817 + }, + { + "epoch": 0.33073864117099583, + "grad_norm": 0.2011793553829193, + "learning_rate": 0.001, + "loss": 3.2597, + "step": 7818 + }, + { + "epoch": 0.33078094593451224, + "grad_norm": 0.3408401608467102, + "learning_rate": 0.001, + "loss": 2.3159, + "step": 7819 + }, + { + "epoch": 0.3308232506980286, + "grad_norm": 0.1871691644191742, + "learning_rate": 0.001, + "loss": 2.5773, + "step": 7820 + }, + { + "epoch": 0.33086555546154495, + "grad_norm": 0.19966411590576172, + "learning_rate": 0.001, + "loss": 1.4954, + "step": 7821 + }, + { + "epoch": 0.33090786022506136, + "grad_norm": 0.20927104353904724, + "learning_rate": 0.001, + "loss": 2.0749, + "step": 7822 + }, + { + "epoch": 0.3309501649885777, + "grad_norm": 0.2717510759830475, + "learning_rate": 0.001, + "loss": 2.9177, + "step": 7823 + }, + { + "epoch": 0.33099246975209407, + "grad_norm": 0.2813604176044464, + "learning_rate": 0.001, + "loss": 1.8495, + "step": 7824 + }, + { + "epoch": 0.3310347745156105, + "grad_norm": 0.18087762594223022, + "learning_rate": 0.001, + "loss": 2.747, + "step": 7825 + }, + { + "epoch": 0.33107707927912683, + "grad_norm": 29.72042465209961, + "learning_rate": 0.001, + "loss": 3.0675, + "step": 7826 + }, + { + "epoch": 0.3311193840426432, + "grad_norm": 0.2071724832057953, + "learning_rate": 0.001, + "loss": 2.6322, + "step": 7827 + }, + { + "epoch": 0.3311616888061596, + "grad_norm": 0.208586648106575, + "learning_rate": 0.001, + "loss": 2.4795, + "step": 7828 + }, + { + "epoch": 0.33120399356967595, + "grad_norm": 0.19940099120140076, + "learning_rate": 0.001, + "loss": 2.5773, + "step": 7829 + }, + { + "epoch": 0.3312462983331923, + "grad_norm": 2.280409097671509, + "learning_rate": 0.001, + "loss": 2.3525, + "step": 7830 + }, + { + "epoch": 0.3312886030967087, + "grad_norm": 0.977155864238739, + "learning_rate": 0.001, + "loss": 1.8458, + "step": 7831 + }, + { + "epoch": 0.33133090786022507, + "grad_norm": 0.33999037742614746, + "learning_rate": 0.001, + "loss": 1.8913, + "step": 7832 + }, + { + "epoch": 0.3313732126237414, + "grad_norm": 0.23237483203411102, + "learning_rate": 0.001, + "loss": 2.69, + "step": 7833 + }, + { + "epoch": 0.33141551738725783, + "grad_norm": 0.1733536273241043, + "learning_rate": 0.001, + "loss": 2.289, + "step": 7834 + }, + { + "epoch": 0.3314578221507742, + "grad_norm": 0.19590957462787628, + "learning_rate": 0.001, + "loss": 2.3374, + "step": 7835 + }, + { + "epoch": 0.33150012691429054, + "grad_norm": 0.22737683355808258, + "learning_rate": 0.001, + "loss": 2.8243, + "step": 7836 + }, + { + "epoch": 0.33154243167780695, + "grad_norm": 0.22133977711200714, + "learning_rate": 0.001, + "loss": 1.9613, + "step": 7837 + }, + { + "epoch": 0.3315847364413233, + "grad_norm": 0.24154724180698395, + "learning_rate": 0.001, + "loss": 2.0886, + "step": 7838 + }, + { + "epoch": 0.33162704120483966, + "grad_norm": 0.1789299100637436, + "learning_rate": 0.001, + "loss": 2.6418, + "step": 7839 + }, + { + "epoch": 0.331669345968356, + "grad_norm": 0.27288487553596497, + "learning_rate": 0.001, + "loss": 2.4128, + "step": 7840 + }, + { + "epoch": 0.3317116507318724, + "grad_norm": 0.7873362302780151, + "learning_rate": 0.001, + "loss": 1.8761, + "step": 7841 + }, + { + "epoch": 0.3317539554953888, + "grad_norm": 0.32761895656585693, + "learning_rate": 0.001, + "loss": 2.2242, + "step": 7842 + }, + { + "epoch": 0.33179626025890513, + "grad_norm": 2.1952366828918457, + "learning_rate": 0.001, + "loss": 2.0018, + "step": 7843 + }, + { + "epoch": 0.33183856502242154, + "grad_norm": 0.18359605967998505, + "learning_rate": 0.001, + "loss": 1.7927, + "step": 7844 + }, + { + "epoch": 0.3318808697859379, + "grad_norm": 0.9010672569274902, + "learning_rate": 0.001, + "loss": 2.9856, + "step": 7845 + }, + { + "epoch": 0.33192317454945425, + "grad_norm": 0.23249398171901703, + "learning_rate": 0.001, + "loss": 2.6271, + "step": 7846 + }, + { + "epoch": 0.33196547931297066, + "grad_norm": 0.5741890072822571, + "learning_rate": 0.001, + "loss": 3.903, + "step": 7847 + }, + { + "epoch": 0.332007784076487, + "grad_norm": 0.5828627347946167, + "learning_rate": 0.001, + "loss": 2.3399, + "step": 7848 + }, + { + "epoch": 0.33205008884000337, + "grad_norm": 0.2302628457546234, + "learning_rate": 0.001, + "loss": 2.6404, + "step": 7849 + }, + { + "epoch": 0.3320923936035198, + "grad_norm": 1.7967714071273804, + "learning_rate": 0.001, + "loss": 2.3163, + "step": 7850 + }, + { + "epoch": 0.33213469836703613, + "grad_norm": 36.19169616699219, + "learning_rate": 0.001, + "loss": 2.172, + "step": 7851 + }, + { + "epoch": 0.3321770031305525, + "grad_norm": 4.201509952545166, + "learning_rate": 0.001, + "loss": 2.2538, + "step": 7852 + }, + { + "epoch": 0.3322193078940689, + "grad_norm": 0.23143364489078522, + "learning_rate": 0.001, + "loss": 3.6801, + "step": 7853 + }, + { + "epoch": 0.33226161265758525, + "grad_norm": 0.4308498203754425, + "learning_rate": 0.001, + "loss": 2.6071, + "step": 7854 + }, + { + "epoch": 0.3323039174211016, + "grad_norm": 0.1866970956325531, + "learning_rate": 0.001, + "loss": 3.0409, + "step": 7855 + }, + { + "epoch": 0.332346222184618, + "grad_norm": 0.20423516631126404, + "learning_rate": 0.001, + "loss": 2.1872, + "step": 7856 + }, + { + "epoch": 0.33238852694813437, + "grad_norm": 0.23156633973121643, + "learning_rate": 0.001, + "loss": 1.8005, + "step": 7857 + }, + { + "epoch": 0.3324308317116507, + "grad_norm": 0.29893800616264343, + "learning_rate": 0.001, + "loss": 1.9907, + "step": 7858 + }, + { + "epoch": 0.33247313647516713, + "grad_norm": 0.23436959087848663, + "learning_rate": 0.001, + "loss": 2.8521, + "step": 7859 + }, + { + "epoch": 0.3325154412386835, + "grad_norm": 0.1933186650276184, + "learning_rate": 0.001, + "loss": 1.6054, + "step": 7860 + }, + { + "epoch": 0.33255774600219984, + "grad_norm": 0.2026066929101944, + "learning_rate": 0.001, + "loss": 2.026, + "step": 7861 + }, + { + "epoch": 0.3326000507657162, + "grad_norm": 0.21507464349269867, + "learning_rate": 0.001, + "loss": 2.9487, + "step": 7862 + }, + { + "epoch": 0.3326423555292326, + "grad_norm": 0.2192848175764084, + "learning_rate": 0.001, + "loss": 1.9447, + "step": 7863 + }, + { + "epoch": 0.33268466029274896, + "grad_norm": 2.8954203128814697, + "learning_rate": 0.001, + "loss": 1.8237, + "step": 7864 + }, + { + "epoch": 0.3327269650562653, + "grad_norm": 0.30463677644729614, + "learning_rate": 0.001, + "loss": 2.1563, + "step": 7865 + }, + { + "epoch": 0.3327692698197817, + "grad_norm": 0.45512622594833374, + "learning_rate": 0.001, + "loss": 2.8004, + "step": 7866 + }, + { + "epoch": 0.3328115745832981, + "grad_norm": 0.32765403389930725, + "learning_rate": 0.001, + "loss": 2.4544, + "step": 7867 + }, + { + "epoch": 0.33285387934681443, + "grad_norm": 0.21778345108032227, + "learning_rate": 0.001, + "loss": 2.2046, + "step": 7868 + }, + { + "epoch": 0.33289618411033084, + "grad_norm": 0.2558639645576477, + "learning_rate": 0.001, + "loss": 3.3903, + "step": 7869 + }, + { + "epoch": 0.3329384888738472, + "grad_norm": 0.27127140760421753, + "learning_rate": 0.001, + "loss": 2.1592, + "step": 7870 + }, + { + "epoch": 0.33298079363736355, + "grad_norm": 0.2440665364265442, + "learning_rate": 0.001, + "loss": 3.0212, + "step": 7871 + }, + { + "epoch": 0.33302309840087996, + "grad_norm": 0.23537136614322662, + "learning_rate": 0.001, + "loss": 2.1113, + "step": 7872 + }, + { + "epoch": 0.3330654031643963, + "grad_norm": 0.5498445630073547, + "learning_rate": 0.001, + "loss": 2.9246, + "step": 7873 + }, + { + "epoch": 0.33310770792791267, + "grad_norm": 0.19727638363838196, + "learning_rate": 0.001, + "loss": 1.9724, + "step": 7874 + }, + { + "epoch": 0.3331500126914291, + "grad_norm": 0.19722647964954376, + "learning_rate": 0.001, + "loss": 1.92, + "step": 7875 + }, + { + "epoch": 0.33319231745494543, + "grad_norm": 1.0840023756027222, + "learning_rate": 0.001, + "loss": 2.243, + "step": 7876 + }, + { + "epoch": 0.3332346222184618, + "grad_norm": 1.0607739686965942, + "learning_rate": 0.001, + "loss": 2.2591, + "step": 7877 + }, + { + "epoch": 0.3332769269819782, + "grad_norm": 0.19976572692394257, + "learning_rate": 0.001, + "loss": 2.5023, + "step": 7878 + }, + { + "epoch": 0.33331923174549455, + "grad_norm": 0.20749805867671967, + "learning_rate": 0.001, + "loss": 2.4522, + "step": 7879 + }, + { + "epoch": 0.3333615365090109, + "grad_norm": 0.1775732934474945, + "learning_rate": 0.001, + "loss": 2.4282, + "step": 7880 + }, + { + "epoch": 0.3334038412725273, + "grad_norm": 0.6955072283744812, + "learning_rate": 0.001, + "loss": 1.5459, + "step": 7881 + }, + { + "epoch": 0.33344614603604367, + "grad_norm": 2.1425440311431885, + "learning_rate": 0.001, + "loss": 1.8967, + "step": 7882 + }, + { + "epoch": 0.33348845079956, + "grad_norm": 3.9106364250183105, + "learning_rate": 0.001, + "loss": 2.2198, + "step": 7883 + }, + { + "epoch": 0.3335307555630764, + "grad_norm": 6.036744594573975, + "learning_rate": 0.001, + "loss": 2.4918, + "step": 7884 + }, + { + "epoch": 0.3335730603265928, + "grad_norm": 0.19927839934825897, + "learning_rate": 0.001, + "loss": 2.5132, + "step": 7885 + }, + { + "epoch": 0.33361536509010914, + "grad_norm": 1.190531849861145, + "learning_rate": 0.001, + "loss": 1.8756, + "step": 7886 + }, + { + "epoch": 0.3336576698536255, + "grad_norm": 0.2305949628353119, + "learning_rate": 0.001, + "loss": 3.0, + "step": 7887 + }, + { + "epoch": 0.3336999746171419, + "grad_norm": 0.26107683777809143, + "learning_rate": 0.001, + "loss": 2.8662, + "step": 7888 + }, + { + "epoch": 0.33374227938065826, + "grad_norm": 0.3804430365562439, + "learning_rate": 0.001, + "loss": 3.1798, + "step": 7889 + }, + { + "epoch": 0.3337845841441746, + "grad_norm": 0.32322415709495544, + "learning_rate": 0.001, + "loss": 2.1622, + "step": 7890 + }, + { + "epoch": 0.333826888907691, + "grad_norm": 1.6569503545761108, + "learning_rate": 0.001, + "loss": 2.8064, + "step": 7891 + }, + { + "epoch": 0.3338691936712074, + "grad_norm": 0.975721001625061, + "learning_rate": 0.001, + "loss": 2.7777, + "step": 7892 + }, + { + "epoch": 0.33391149843472373, + "grad_norm": 1.4115006923675537, + "learning_rate": 0.001, + "loss": 2.3504, + "step": 7893 + }, + { + "epoch": 0.33395380319824014, + "grad_norm": 0.6836025714874268, + "learning_rate": 0.001, + "loss": 2.1942, + "step": 7894 + }, + { + "epoch": 0.3339961079617565, + "grad_norm": 0.35912907123565674, + "learning_rate": 0.001, + "loss": 3.1702, + "step": 7895 + }, + { + "epoch": 0.33403841272527285, + "grad_norm": 1.0951977968215942, + "learning_rate": 0.001, + "loss": 3.3936, + "step": 7896 + }, + { + "epoch": 0.33408071748878926, + "grad_norm": 0.24415375292301178, + "learning_rate": 0.001, + "loss": 2.4679, + "step": 7897 + }, + { + "epoch": 0.3341230222523056, + "grad_norm": 0.23192666471004486, + "learning_rate": 0.001, + "loss": 1.486, + "step": 7898 + }, + { + "epoch": 0.33416532701582197, + "grad_norm": 0.3416637182235718, + "learning_rate": 0.001, + "loss": 1.8632, + "step": 7899 + }, + { + "epoch": 0.3342076317793384, + "grad_norm": 0.23700256645679474, + "learning_rate": 0.001, + "loss": 2.0956, + "step": 7900 + }, + { + "epoch": 0.33424993654285473, + "grad_norm": 0.2494289129972458, + "learning_rate": 0.001, + "loss": 2.3394, + "step": 7901 + }, + { + "epoch": 0.3342922413063711, + "grad_norm": 0.2167077511548996, + "learning_rate": 0.001, + "loss": 2.3218, + "step": 7902 + }, + { + "epoch": 0.3343345460698875, + "grad_norm": 1.0088181495666504, + "learning_rate": 0.001, + "loss": 2.4546, + "step": 7903 + }, + { + "epoch": 0.33437685083340385, + "grad_norm": 0.45194211602211, + "learning_rate": 0.001, + "loss": 1.9015, + "step": 7904 + }, + { + "epoch": 0.3344191555969202, + "grad_norm": 0.20241999626159668, + "learning_rate": 0.001, + "loss": 1.8324, + "step": 7905 + }, + { + "epoch": 0.33446146036043656, + "grad_norm": 0.26756224036216736, + "learning_rate": 0.001, + "loss": 3.2489, + "step": 7906 + }, + { + "epoch": 0.33450376512395297, + "grad_norm": 0.49798786640167236, + "learning_rate": 0.001, + "loss": 3.2647, + "step": 7907 + }, + { + "epoch": 0.3345460698874693, + "grad_norm": 0.2664455473423004, + "learning_rate": 0.001, + "loss": 2.7186, + "step": 7908 + }, + { + "epoch": 0.3345883746509857, + "grad_norm": 0.3100428879261017, + "learning_rate": 0.001, + "loss": 2.275, + "step": 7909 + }, + { + "epoch": 0.3346306794145021, + "grad_norm": 0.19534626603126526, + "learning_rate": 0.001, + "loss": 2.7456, + "step": 7910 + }, + { + "epoch": 0.33467298417801844, + "grad_norm": 0.6475488543510437, + "learning_rate": 0.001, + "loss": 2.2076, + "step": 7911 + }, + { + "epoch": 0.3347152889415348, + "grad_norm": 0.3301447927951813, + "learning_rate": 0.001, + "loss": 2.3495, + "step": 7912 + }, + { + "epoch": 0.3347575937050512, + "grad_norm": 2.3587558269500732, + "learning_rate": 0.001, + "loss": 2.3851, + "step": 7913 + }, + { + "epoch": 0.33479989846856756, + "grad_norm": 0.2016088366508484, + "learning_rate": 0.001, + "loss": 2.1589, + "step": 7914 + }, + { + "epoch": 0.3348422032320839, + "grad_norm": 0.2044316530227661, + "learning_rate": 0.001, + "loss": 3.0746, + "step": 7915 + }, + { + "epoch": 0.3348845079956003, + "grad_norm": 0.19156186282634735, + "learning_rate": 0.001, + "loss": 2.1999, + "step": 7916 + }, + { + "epoch": 0.3349268127591167, + "grad_norm": 0.23087522387504578, + "learning_rate": 0.001, + "loss": 3.6112, + "step": 7917 + }, + { + "epoch": 0.33496911752263303, + "grad_norm": 0.1780395656824112, + "learning_rate": 0.001, + "loss": 2.6512, + "step": 7918 + }, + { + "epoch": 0.33501142228614944, + "grad_norm": 0.20307819545269012, + "learning_rate": 0.001, + "loss": 1.8293, + "step": 7919 + }, + { + "epoch": 0.3350537270496658, + "grad_norm": 0.735317587852478, + "learning_rate": 0.001, + "loss": 1.861, + "step": 7920 + }, + { + "epoch": 0.33509603181318215, + "grad_norm": 0.2308008074760437, + "learning_rate": 0.001, + "loss": 2.4375, + "step": 7921 + }, + { + "epoch": 0.33513833657669856, + "grad_norm": 0.2518015205860138, + "learning_rate": 0.001, + "loss": 2.1128, + "step": 7922 + }, + { + "epoch": 0.3351806413402149, + "grad_norm": 0.22979722917079926, + "learning_rate": 0.001, + "loss": 1.8036, + "step": 7923 + }, + { + "epoch": 0.33522294610373127, + "grad_norm": 0.20202603936195374, + "learning_rate": 0.001, + "loss": 2.629, + "step": 7924 + }, + { + "epoch": 0.3352652508672477, + "grad_norm": 0.19163471460342407, + "learning_rate": 0.001, + "loss": 2.8881, + "step": 7925 + }, + { + "epoch": 0.33530755563076403, + "grad_norm": 0.17413891851902008, + "learning_rate": 0.001, + "loss": 1.7156, + "step": 7926 + }, + { + "epoch": 0.3353498603942804, + "grad_norm": 0.3942449390888214, + "learning_rate": 0.001, + "loss": 1.8654, + "step": 7927 + }, + { + "epoch": 0.3353921651577968, + "grad_norm": 20.90395164489746, + "learning_rate": 0.001, + "loss": 1.722, + "step": 7928 + }, + { + "epoch": 0.33543446992131315, + "grad_norm": 2.177119016647339, + "learning_rate": 0.001, + "loss": 1.8256, + "step": 7929 + }, + { + "epoch": 0.3354767746848295, + "grad_norm": 0.31585386395454407, + "learning_rate": 0.001, + "loss": 2.4039, + "step": 7930 + }, + { + "epoch": 0.33551907944834586, + "grad_norm": 0.3052481710910797, + "learning_rate": 0.001, + "loss": 2.1275, + "step": 7931 + }, + { + "epoch": 0.33556138421186227, + "grad_norm": 0.23809614777565002, + "learning_rate": 0.001, + "loss": 1.7413, + "step": 7932 + }, + { + "epoch": 0.3356036889753786, + "grad_norm": 2.0751454830169678, + "learning_rate": 0.001, + "loss": 2.132, + "step": 7933 + }, + { + "epoch": 0.335645993738895, + "grad_norm": 0.6498151421546936, + "learning_rate": 0.001, + "loss": 2.0706, + "step": 7934 + }, + { + "epoch": 0.3356882985024114, + "grad_norm": 0.26023566722869873, + "learning_rate": 0.001, + "loss": 2.4264, + "step": 7935 + }, + { + "epoch": 0.33573060326592774, + "grad_norm": 0.203081876039505, + "learning_rate": 0.001, + "loss": 2.4726, + "step": 7936 + }, + { + "epoch": 0.3357729080294441, + "grad_norm": 0.2829737961292267, + "learning_rate": 0.001, + "loss": 3.3616, + "step": 7937 + }, + { + "epoch": 0.3358152127929605, + "grad_norm": 1.6824712753295898, + "learning_rate": 0.001, + "loss": 1.6086, + "step": 7938 + }, + { + "epoch": 0.33585751755647686, + "grad_norm": 0.19478239119052887, + "learning_rate": 0.001, + "loss": 2.579, + "step": 7939 + }, + { + "epoch": 0.3358998223199932, + "grad_norm": 0.22429609298706055, + "learning_rate": 0.001, + "loss": 2.8092, + "step": 7940 + }, + { + "epoch": 0.3359421270835096, + "grad_norm": 0.16416104137897491, + "learning_rate": 0.001, + "loss": 2.2778, + "step": 7941 + }, + { + "epoch": 0.335984431847026, + "grad_norm": 0.27525100111961365, + "learning_rate": 0.001, + "loss": 2.1074, + "step": 7942 + }, + { + "epoch": 0.33602673661054233, + "grad_norm": 0.219032421708107, + "learning_rate": 0.001, + "loss": 2.4779, + "step": 7943 + }, + { + "epoch": 0.33606904137405874, + "grad_norm": 0.7257388830184937, + "learning_rate": 0.001, + "loss": 1.9499, + "step": 7944 + }, + { + "epoch": 0.3361113461375751, + "grad_norm": 0.8192842602729797, + "learning_rate": 0.001, + "loss": 1.9096, + "step": 7945 + }, + { + "epoch": 0.33615365090109145, + "grad_norm": 0.1924901306629181, + "learning_rate": 0.001, + "loss": 1.9039, + "step": 7946 + }, + { + "epoch": 0.33619595566460786, + "grad_norm": 0.18553310632705688, + "learning_rate": 0.001, + "loss": 1.9586, + "step": 7947 + }, + { + "epoch": 0.3362382604281242, + "grad_norm": 0.41920140385627747, + "learning_rate": 0.001, + "loss": 2.1416, + "step": 7948 + }, + { + "epoch": 0.33628056519164057, + "grad_norm": 0.21451827883720398, + "learning_rate": 0.001, + "loss": 2.4585, + "step": 7949 + }, + { + "epoch": 0.336322869955157, + "grad_norm": 0.2012207955121994, + "learning_rate": 0.001, + "loss": 2.3316, + "step": 7950 + }, + { + "epoch": 0.33636517471867333, + "grad_norm": 0.1847204715013504, + "learning_rate": 0.001, + "loss": 2.8531, + "step": 7951 + }, + { + "epoch": 0.3364074794821897, + "grad_norm": 0.2382209748029709, + "learning_rate": 0.001, + "loss": 2.1404, + "step": 7952 + }, + { + "epoch": 0.33644978424570604, + "grad_norm": 0.19776912033557892, + "learning_rate": 0.001, + "loss": 2.2637, + "step": 7953 + }, + { + "epoch": 0.33649208900922245, + "grad_norm": 0.3410530388355255, + "learning_rate": 0.001, + "loss": 2.5895, + "step": 7954 + }, + { + "epoch": 0.3365343937727388, + "grad_norm": 0.4076724648475647, + "learning_rate": 0.001, + "loss": 2.3355, + "step": 7955 + }, + { + "epoch": 0.33657669853625516, + "grad_norm": 0.24683041870594025, + "learning_rate": 0.001, + "loss": 2.0245, + "step": 7956 + }, + { + "epoch": 0.33661900329977157, + "grad_norm": 0.1744687408208847, + "learning_rate": 0.001, + "loss": 2.8672, + "step": 7957 + }, + { + "epoch": 0.3366613080632879, + "grad_norm": 0.1615762561559677, + "learning_rate": 0.001, + "loss": 2.8006, + "step": 7958 + }, + { + "epoch": 0.3367036128268043, + "grad_norm": 6.103714942932129, + "learning_rate": 0.001, + "loss": 2.3797, + "step": 7959 + }, + { + "epoch": 0.3367459175903207, + "grad_norm": 0.23423656821250916, + "learning_rate": 0.001, + "loss": 2.1547, + "step": 7960 + }, + { + "epoch": 0.33678822235383704, + "grad_norm": 0.36211827397346497, + "learning_rate": 0.001, + "loss": 2.6871, + "step": 7961 + }, + { + "epoch": 0.3368305271173534, + "grad_norm": 0.2331177294254303, + "learning_rate": 0.001, + "loss": 2.6157, + "step": 7962 + }, + { + "epoch": 0.3368728318808698, + "grad_norm": 0.18622010946273804, + "learning_rate": 0.001, + "loss": 2.4594, + "step": 7963 + }, + { + "epoch": 0.33691513664438616, + "grad_norm": 0.26407673954963684, + "learning_rate": 0.001, + "loss": 2.1465, + "step": 7964 + }, + { + "epoch": 0.3369574414079025, + "grad_norm": 0.20621120929718018, + "learning_rate": 0.001, + "loss": 1.7513, + "step": 7965 + }, + { + "epoch": 0.3369997461714189, + "grad_norm": 0.2361491471529007, + "learning_rate": 0.001, + "loss": 1.8199, + "step": 7966 + }, + { + "epoch": 0.3370420509349353, + "grad_norm": 0.20442907512187958, + "learning_rate": 0.001, + "loss": 3.1949, + "step": 7967 + }, + { + "epoch": 0.33708435569845163, + "grad_norm": 0.23592479526996613, + "learning_rate": 0.001, + "loss": 1.7339, + "step": 7968 + }, + { + "epoch": 0.33712666046196804, + "grad_norm": 19.943355560302734, + "learning_rate": 0.001, + "loss": 1.6594, + "step": 7969 + }, + { + "epoch": 0.3371689652254844, + "grad_norm": 4.2921223640441895, + "learning_rate": 0.001, + "loss": 2.8318, + "step": 7970 + }, + { + "epoch": 0.33721126998900075, + "grad_norm": 0.3829931616783142, + "learning_rate": 0.001, + "loss": 2.4734, + "step": 7971 + }, + { + "epoch": 0.33725357475251716, + "grad_norm": 1.2138727903366089, + "learning_rate": 0.001, + "loss": 3.164, + "step": 7972 + }, + { + "epoch": 0.3372958795160335, + "grad_norm": 0.18562185764312744, + "learning_rate": 0.001, + "loss": 2.0124, + "step": 7973 + }, + { + "epoch": 0.33733818427954987, + "grad_norm": 1.363898754119873, + "learning_rate": 0.001, + "loss": 2.4757, + "step": 7974 + }, + { + "epoch": 0.3373804890430662, + "grad_norm": 0.16648072004318237, + "learning_rate": 0.001, + "loss": 2.2298, + "step": 7975 + }, + { + "epoch": 0.33742279380658263, + "grad_norm": 0.27364203333854675, + "learning_rate": 0.001, + "loss": 2.1235, + "step": 7976 + }, + { + "epoch": 0.337465098570099, + "grad_norm": 0.2676752209663391, + "learning_rate": 0.001, + "loss": 2.7993, + "step": 7977 + }, + { + "epoch": 0.33750740333361534, + "grad_norm": 0.21944323182106018, + "learning_rate": 0.001, + "loss": 2.7508, + "step": 7978 + }, + { + "epoch": 0.33754970809713175, + "grad_norm": 0.18799695372581482, + "learning_rate": 0.001, + "loss": 1.9509, + "step": 7979 + }, + { + "epoch": 0.3375920128606481, + "grad_norm": 0.17753717303276062, + "learning_rate": 0.001, + "loss": 2.3597, + "step": 7980 + }, + { + "epoch": 0.33763431762416446, + "grad_norm": 0.22551682591438293, + "learning_rate": 0.001, + "loss": 1.6755, + "step": 7981 + }, + { + "epoch": 0.33767662238768087, + "grad_norm": 0.21229778230190277, + "learning_rate": 0.001, + "loss": 2.2954, + "step": 7982 + }, + { + "epoch": 0.3377189271511972, + "grad_norm": 0.21653328835964203, + "learning_rate": 0.001, + "loss": 2.2161, + "step": 7983 + }, + { + "epoch": 0.3377612319147136, + "grad_norm": 0.17863360047340393, + "learning_rate": 0.001, + "loss": 2.0211, + "step": 7984 + }, + { + "epoch": 0.33780353667823, + "grad_norm": 0.3650844693183899, + "learning_rate": 0.001, + "loss": 2.7883, + "step": 7985 + }, + { + "epoch": 0.33784584144174634, + "grad_norm": 0.31294921040534973, + "learning_rate": 0.001, + "loss": 2.2396, + "step": 7986 + }, + { + "epoch": 0.3378881462052627, + "grad_norm": 0.8418748378753662, + "learning_rate": 0.001, + "loss": 1.9066, + "step": 7987 + }, + { + "epoch": 0.3379304509687791, + "grad_norm": 1.4910551309585571, + "learning_rate": 0.001, + "loss": 1.5615, + "step": 7988 + }, + { + "epoch": 0.33797275573229546, + "grad_norm": 0.7362372875213623, + "learning_rate": 0.001, + "loss": 2.7537, + "step": 7989 + }, + { + "epoch": 0.3380150604958118, + "grad_norm": 0.27851244807243347, + "learning_rate": 0.001, + "loss": 2.1922, + "step": 7990 + }, + { + "epoch": 0.3380573652593282, + "grad_norm": 0.18572275340557098, + "learning_rate": 0.001, + "loss": 1.9722, + "step": 7991 + }, + { + "epoch": 0.3380996700228446, + "grad_norm": 0.32508203387260437, + "learning_rate": 0.001, + "loss": 2.7969, + "step": 7992 + }, + { + "epoch": 0.33814197478636093, + "grad_norm": 0.22849464416503906, + "learning_rate": 0.001, + "loss": 1.5679, + "step": 7993 + }, + { + "epoch": 0.33818427954987734, + "grad_norm": 0.18476389348506927, + "learning_rate": 0.001, + "loss": 2.1825, + "step": 7994 + }, + { + "epoch": 0.3382265843133937, + "grad_norm": 0.32618439197540283, + "learning_rate": 0.001, + "loss": 1.9877, + "step": 7995 + }, + { + "epoch": 0.33826888907691005, + "grad_norm": 0.20857444405555725, + "learning_rate": 0.001, + "loss": 1.8781, + "step": 7996 + }, + { + "epoch": 0.3383111938404264, + "grad_norm": 0.1851375550031662, + "learning_rate": 0.001, + "loss": 1.5724, + "step": 7997 + }, + { + "epoch": 0.3383534986039428, + "grad_norm": 0.5002090930938721, + "learning_rate": 0.001, + "loss": 2.1482, + "step": 7998 + }, + { + "epoch": 0.33839580336745917, + "grad_norm": 0.18497441709041595, + "learning_rate": 0.001, + "loss": 2.2619, + "step": 7999 + }, + { + "epoch": 0.3384381081309755, + "grad_norm": 0.2192855030298233, + "learning_rate": 0.001, + "loss": 2.4961, + "step": 8000 + }, + { + "epoch": 0.33848041289449193, + "grad_norm": 3.4979212284088135, + "learning_rate": 0.001, + "loss": 2.0733, + "step": 8001 + }, + { + "epoch": 0.3385227176580083, + "grad_norm": 0.2794019877910614, + "learning_rate": 0.001, + "loss": 2.9049, + "step": 8002 + }, + { + "epoch": 0.33856502242152464, + "grad_norm": 0.26850298047065735, + "learning_rate": 0.001, + "loss": 2.1762, + "step": 8003 + }, + { + "epoch": 0.33860732718504105, + "grad_norm": 0.19548161327838898, + "learning_rate": 0.001, + "loss": 1.9791, + "step": 8004 + }, + { + "epoch": 0.3386496319485574, + "grad_norm": 0.20981860160827637, + "learning_rate": 0.001, + "loss": 2.0878, + "step": 8005 + }, + { + "epoch": 0.33869193671207376, + "grad_norm": 0.3214268088340759, + "learning_rate": 0.001, + "loss": 1.628, + "step": 8006 + }, + { + "epoch": 0.33873424147559017, + "grad_norm": 0.2680968940258026, + "learning_rate": 0.001, + "loss": 1.6191, + "step": 8007 + }, + { + "epoch": 0.3387765462391065, + "grad_norm": 5.1096930503845215, + "learning_rate": 0.001, + "loss": 2.2251, + "step": 8008 + }, + { + "epoch": 0.3388188510026229, + "grad_norm": 0.2013072371482849, + "learning_rate": 0.001, + "loss": 2.0241, + "step": 8009 + }, + { + "epoch": 0.3388611557661393, + "grad_norm": 0.2397328019142151, + "learning_rate": 0.001, + "loss": 1.7325, + "step": 8010 + }, + { + "epoch": 0.33890346052965564, + "grad_norm": 0.20630115270614624, + "learning_rate": 0.001, + "loss": 1.7987, + "step": 8011 + }, + { + "epoch": 0.338945765293172, + "grad_norm": 0.3187551200389862, + "learning_rate": 0.001, + "loss": 2.6951, + "step": 8012 + }, + { + "epoch": 0.3389880700566884, + "grad_norm": 0.2622925937175751, + "learning_rate": 0.001, + "loss": 1.9729, + "step": 8013 + }, + { + "epoch": 0.33903037482020476, + "grad_norm": 0.20265188813209534, + "learning_rate": 0.001, + "loss": 3.4582, + "step": 8014 + }, + { + "epoch": 0.3390726795837211, + "grad_norm": 0.44326838850975037, + "learning_rate": 0.001, + "loss": 2.0444, + "step": 8015 + }, + { + "epoch": 0.3391149843472375, + "grad_norm": 6.31563663482666, + "learning_rate": 0.001, + "loss": 1.9386, + "step": 8016 + }, + { + "epoch": 0.3391572891107539, + "grad_norm": 0.7522348761558533, + "learning_rate": 0.001, + "loss": 2.9402, + "step": 8017 + }, + { + "epoch": 0.33919959387427023, + "grad_norm": 0.27529436349868774, + "learning_rate": 0.001, + "loss": 1.899, + "step": 8018 + }, + { + "epoch": 0.3392418986377866, + "grad_norm": 0.304376482963562, + "learning_rate": 0.001, + "loss": 2.4614, + "step": 8019 + }, + { + "epoch": 0.339284203401303, + "grad_norm": 0.2474507987499237, + "learning_rate": 0.001, + "loss": 2.5785, + "step": 8020 + }, + { + "epoch": 0.33932650816481935, + "grad_norm": 0.27842018008232117, + "learning_rate": 0.001, + "loss": 2.2808, + "step": 8021 + }, + { + "epoch": 0.3393688129283357, + "grad_norm": 0.2625916302204132, + "learning_rate": 0.001, + "loss": 2.1915, + "step": 8022 + }, + { + "epoch": 0.3394111176918521, + "grad_norm": 0.2093726396560669, + "learning_rate": 0.001, + "loss": 2.1816, + "step": 8023 + }, + { + "epoch": 0.33945342245536847, + "grad_norm": 0.2079465538263321, + "learning_rate": 0.001, + "loss": 1.8849, + "step": 8024 + }, + { + "epoch": 0.3394957272188848, + "grad_norm": 0.7054566740989685, + "learning_rate": 0.001, + "loss": 1.8009, + "step": 8025 + }, + { + "epoch": 0.33953803198240123, + "grad_norm": 0.24773012101650238, + "learning_rate": 0.001, + "loss": 3.0748, + "step": 8026 + }, + { + "epoch": 0.3395803367459176, + "grad_norm": 0.23611260950565338, + "learning_rate": 0.001, + "loss": 2.0335, + "step": 8027 + }, + { + "epoch": 0.33962264150943394, + "grad_norm": 1.1430414915084839, + "learning_rate": 0.001, + "loss": 3.0462, + "step": 8028 + }, + { + "epoch": 0.33966494627295035, + "grad_norm": 0.22795584797859192, + "learning_rate": 0.001, + "loss": 2.0713, + "step": 8029 + }, + { + "epoch": 0.3397072510364667, + "grad_norm": 0.2508586347103119, + "learning_rate": 0.001, + "loss": 3.3238, + "step": 8030 + }, + { + "epoch": 0.33974955579998306, + "grad_norm": 0.24516475200653076, + "learning_rate": 0.001, + "loss": 1.9439, + "step": 8031 + }, + { + "epoch": 0.33979186056349947, + "grad_norm": 0.1642148196697235, + "learning_rate": 0.001, + "loss": 2.5945, + "step": 8032 + }, + { + "epoch": 0.3398341653270158, + "grad_norm": 2.4271435737609863, + "learning_rate": 0.001, + "loss": 2.6635, + "step": 8033 + }, + { + "epoch": 0.3398764700905322, + "grad_norm": 0.8231348991394043, + "learning_rate": 0.001, + "loss": 2.5651, + "step": 8034 + }, + { + "epoch": 0.3399187748540486, + "grad_norm": 0.23157507181167603, + "learning_rate": 0.001, + "loss": 2.0043, + "step": 8035 + }, + { + "epoch": 0.33996107961756494, + "grad_norm": 0.24356773495674133, + "learning_rate": 0.001, + "loss": 2.0861, + "step": 8036 + }, + { + "epoch": 0.3400033843810813, + "grad_norm": 0.23287416994571686, + "learning_rate": 0.001, + "loss": 2.1728, + "step": 8037 + }, + { + "epoch": 0.3400456891445977, + "grad_norm": 0.1699216365814209, + "learning_rate": 0.001, + "loss": 1.8013, + "step": 8038 + }, + { + "epoch": 0.34008799390811406, + "grad_norm": 0.19102227687835693, + "learning_rate": 0.001, + "loss": 2.2718, + "step": 8039 + }, + { + "epoch": 0.3401302986716304, + "grad_norm": 0.2611904442310333, + "learning_rate": 0.001, + "loss": 3.0925, + "step": 8040 + }, + { + "epoch": 0.34017260343514677, + "grad_norm": 0.8404667973518372, + "learning_rate": 0.001, + "loss": 1.8287, + "step": 8041 + }, + { + "epoch": 0.3402149081986632, + "grad_norm": 0.3977033793926239, + "learning_rate": 0.001, + "loss": 2.1804, + "step": 8042 + }, + { + "epoch": 0.34025721296217953, + "grad_norm": 0.21507540345191956, + "learning_rate": 0.001, + "loss": 2.6397, + "step": 8043 + }, + { + "epoch": 0.3402995177256959, + "grad_norm": 0.22415781021118164, + "learning_rate": 0.001, + "loss": 2.7939, + "step": 8044 + }, + { + "epoch": 0.3403418224892123, + "grad_norm": 2.2323434352874756, + "learning_rate": 0.001, + "loss": 2.3274, + "step": 8045 + }, + { + "epoch": 0.34038412725272865, + "grad_norm": 5.563083171844482, + "learning_rate": 0.001, + "loss": 2.1133, + "step": 8046 + }, + { + "epoch": 0.340426432016245, + "grad_norm": 0.27088257670402527, + "learning_rate": 0.001, + "loss": 2.0518, + "step": 8047 + }, + { + "epoch": 0.3404687367797614, + "grad_norm": 1.1499382257461548, + "learning_rate": 0.001, + "loss": 2.057, + "step": 8048 + }, + { + "epoch": 0.34051104154327777, + "grad_norm": 0.2051892727613449, + "learning_rate": 0.001, + "loss": 2.125, + "step": 8049 + }, + { + "epoch": 0.3405533463067941, + "grad_norm": 0.3838144540786743, + "learning_rate": 0.001, + "loss": 2.1897, + "step": 8050 + }, + { + "epoch": 0.34059565107031053, + "grad_norm": 0.28759241104125977, + "learning_rate": 0.001, + "loss": 1.9844, + "step": 8051 + }, + { + "epoch": 0.3406379558338269, + "grad_norm": 0.2020261138677597, + "learning_rate": 0.001, + "loss": 2.8377, + "step": 8052 + }, + { + "epoch": 0.34068026059734324, + "grad_norm": 0.24153375625610352, + "learning_rate": 0.001, + "loss": 2.1826, + "step": 8053 + }, + { + "epoch": 0.34072256536085965, + "grad_norm": 0.43648314476013184, + "learning_rate": 0.001, + "loss": 2.8549, + "step": 8054 + }, + { + "epoch": 0.340764870124376, + "grad_norm": 0.19116896390914917, + "learning_rate": 0.001, + "loss": 3.6501, + "step": 8055 + }, + { + "epoch": 0.34080717488789236, + "grad_norm": 7.185187816619873, + "learning_rate": 0.001, + "loss": 2.6377, + "step": 8056 + }, + { + "epoch": 0.34084947965140877, + "grad_norm": 0.18975602090358734, + "learning_rate": 0.001, + "loss": 2.1213, + "step": 8057 + }, + { + "epoch": 0.3408917844149251, + "grad_norm": 0.2663729190826416, + "learning_rate": 0.001, + "loss": 2.6091, + "step": 8058 + }, + { + "epoch": 0.3409340891784415, + "grad_norm": 0.20250354707241058, + "learning_rate": 0.001, + "loss": 2.103, + "step": 8059 + }, + { + "epoch": 0.3409763939419579, + "grad_norm": 0.20965036749839783, + "learning_rate": 0.001, + "loss": 3.2814, + "step": 8060 + }, + { + "epoch": 0.34101869870547424, + "grad_norm": 0.2456333339214325, + "learning_rate": 0.001, + "loss": 3.3578, + "step": 8061 + }, + { + "epoch": 0.3410610034689906, + "grad_norm": 0.18389590084552765, + "learning_rate": 0.001, + "loss": 2.5921, + "step": 8062 + }, + { + "epoch": 0.341103308232507, + "grad_norm": 0.23083733022212982, + "learning_rate": 0.001, + "loss": 2.6869, + "step": 8063 + }, + { + "epoch": 0.34114561299602336, + "grad_norm": 0.6850119829177856, + "learning_rate": 0.001, + "loss": 1.5674, + "step": 8064 + }, + { + "epoch": 0.3411879177595397, + "grad_norm": 0.2017279416322708, + "learning_rate": 0.001, + "loss": 2.0718, + "step": 8065 + }, + { + "epoch": 0.34123022252305607, + "grad_norm": 0.18653610348701477, + "learning_rate": 0.001, + "loss": 2.4179, + "step": 8066 + }, + { + "epoch": 0.3412725272865725, + "grad_norm": 0.3980189263820648, + "learning_rate": 0.001, + "loss": 2.1241, + "step": 8067 + }, + { + "epoch": 0.34131483205008883, + "grad_norm": 0.7478682398796082, + "learning_rate": 0.001, + "loss": 1.8917, + "step": 8068 + }, + { + "epoch": 0.3413571368136052, + "grad_norm": 0.1957026720046997, + "learning_rate": 0.001, + "loss": 2.3097, + "step": 8069 + }, + { + "epoch": 0.3413994415771216, + "grad_norm": 0.1947687864303589, + "learning_rate": 0.001, + "loss": 1.8799, + "step": 8070 + }, + { + "epoch": 0.34144174634063795, + "grad_norm": 0.216701477766037, + "learning_rate": 0.001, + "loss": 3.1663, + "step": 8071 + }, + { + "epoch": 0.3414840511041543, + "grad_norm": 0.2218198925256729, + "learning_rate": 0.001, + "loss": 2.1391, + "step": 8072 + }, + { + "epoch": 0.3415263558676707, + "grad_norm": 0.27254411578178406, + "learning_rate": 0.001, + "loss": 2.6124, + "step": 8073 + }, + { + "epoch": 0.34156866063118707, + "grad_norm": 0.27166926860809326, + "learning_rate": 0.001, + "loss": 1.6906, + "step": 8074 + }, + { + "epoch": 0.3416109653947034, + "grad_norm": 0.2239091545343399, + "learning_rate": 0.001, + "loss": 1.8461, + "step": 8075 + }, + { + "epoch": 0.34165327015821984, + "grad_norm": 0.8380588889122009, + "learning_rate": 0.001, + "loss": 2.1581, + "step": 8076 + }, + { + "epoch": 0.3416955749217362, + "grad_norm": 0.39979785680770874, + "learning_rate": 0.001, + "loss": 2.3237, + "step": 8077 + }, + { + "epoch": 0.34173787968525254, + "grad_norm": 0.25030001997947693, + "learning_rate": 0.001, + "loss": 1.9951, + "step": 8078 + }, + { + "epoch": 0.34178018444876895, + "grad_norm": 0.22300481796264648, + "learning_rate": 0.001, + "loss": 2.2102, + "step": 8079 + }, + { + "epoch": 0.3418224892122853, + "grad_norm": 0.1912737488746643, + "learning_rate": 0.001, + "loss": 2.1506, + "step": 8080 + }, + { + "epoch": 0.34186479397580166, + "grad_norm": 2.661966562271118, + "learning_rate": 0.001, + "loss": 1.5773, + "step": 8081 + }, + { + "epoch": 0.34190709873931807, + "grad_norm": 0.2479306310415268, + "learning_rate": 0.001, + "loss": 2.8477, + "step": 8082 + }, + { + "epoch": 0.3419494035028344, + "grad_norm": 0.43052566051483154, + "learning_rate": 0.001, + "loss": 3.5951, + "step": 8083 + }, + { + "epoch": 0.3419917082663508, + "grad_norm": 0.2106366604566574, + "learning_rate": 0.001, + "loss": 2.7114, + "step": 8084 + }, + { + "epoch": 0.3420340130298672, + "grad_norm": 1.2527594566345215, + "learning_rate": 0.001, + "loss": 2.2984, + "step": 8085 + }, + { + "epoch": 0.34207631779338354, + "grad_norm": 4.545886993408203, + "learning_rate": 0.001, + "loss": 1.636, + "step": 8086 + }, + { + "epoch": 0.3421186225568999, + "grad_norm": 0.3907563090324402, + "learning_rate": 0.001, + "loss": 3.0252, + "step": 8087 + }, + { + "epoch": 0.34216092732041625, + "grad_norm": 0.24621452391147614, + "learning_rate": 0.001, + "loss": 1.6867, + "step": 8088 + }, + { + "epoch": 0.34220323208393266, + "grad_norm": 0.26306939125061035, + "learning_rate": 0.001, + "loss": 1.9305, + "step": 8089 + }, + { + "epoch": 0.342245536847449, + "grad_norm": 0.26752984523773193, + "learning_rate": 0.001, + "loss": 2.902, + "step": 8090 + }, + { + "epoch": 0.34228784161096537, + "grad_norm": 0.2583836019039154, + "learning_rate": 0.001, + "loss": 3.4137, + "step": 8091 + }, + { + "epoch": 0.3423301463744818, + "grad_norm": 1.7589455842971802, + "learning_rate": 0.001, + "loss": 2.0823, + "step": 8092 + }, + { + "epoch": 0.34237245113799814, + "grad_norm": 3.6824162006378174, + "learning_rate": 0.001, + "loss": 2.4873, + "step": 8093 + }, + { + "epoch": 0.3424147559015145, + "grad_norm": 0.45065632462501526, + "learning_rate": 0.001, + "loss": 2.184, + "step": 8094 + }, + { + "epoch": 0.3424570606650309, + "grad_norm": 0.22970075905323029, + "learning_rate": 0.001, + "loss": 1.886, + "step": 8095 + }, + { + "epoch": 0.34249936542854725, + "grad_norm": 0.2680737376213074, + "learning_rate": 0.001, + "loss": 1.9278, + "step": 8096 + }, + { + "epoch": 0.3425416701920636, + "grad_norm": 0.39559051394462585, + "learning_rate": 0.001, + "loss": 2.6823, + "step": 8097 + }, + { + "epoch": 0.34258397495558, + "grad_norm": 0.23845720291137695, + "learning_rate": 0.001, + "loss": 2.6721, + "step": 8098 + }, + { + "epoch": 0.34262627971909637, + "grad_norm": 0.6681344509124756, + "learning_rate": 0.001, + "loss": 3.0333, + "step": 8099 + }, + { + "epoch": 0.3426685844826127, + "grad_norm": 0.935767650604248, + "learning_rate": 0.001, + "loss": 3.4422, + "step": 8100 + }, + { + "epoch": 0.34271088924612914, + "grad_norm": 0.21342290937900543, + "learning_rate": 0.001, + "loss": 2.4125, + "step": 8101 + }, + { + "epoch": 0.3427531940096455, + "grad_norm": 0.7626002430915833, + "learning_rate": 0.001, + "loss": 3.0488, + "step": 8102 + }, + { + "epoch": 0.34279549877316184, + "grad_norm": 0.991487979888916, + "learning_rate": 0.001, + "loss": 2.2147, + "step": 8103 + }, + { + "epoch": 0.34283780353667825, + "grad_norm": 0.19054241478443146, + "learning_rate": 0.001, + "loss": 1.7796, + "step": 8104 + }, + { + "epoch": 0.3428801083001946, + "grad_norm": 2.831486701965332, + "learning_rate": 0.001, + "loss": 2.9814, + "step": 8105 + }, + { + "epoch": 0.34292241306371096, + "grad_norm": 0.27352792024612427, + "learning_rate": 0.001, + "loss": 1.866, + "step": 8106 + }, + { + "epoch": 0.34296471782722737, + "grad_norm": 0.24455174803733826, + "learning_rate": 0.001, + "loss": 2.2868, + "step": 8107 + }, + { + "epoch": 0.3430070225907437, + "grad_norm": 0.21650566160678864, + "learning_rate": 0.001, + "loss": 2.3255, + "step": 8108 + }, + { + "epoch": 0.3430493273542601, + "grad_norm": 0.18031521141529083, + "learning_rate": 0.001, + "loss": 2.1578, + "step": 8109 + }, + { + "epoch": 0.34309163211777643, + "grad_norm": 0.18996316194534302, + "learning_rate": 0.001, + "loss": 1.8919, + "step": 8110 + }, + { + "epoch": 0.34313393688129284, + "grad_norm": 0.18360304832458496, + "learning_rate": 0.001, + "loss": 1.61, + "step": 8111 + }, + { + "epoch": 0.3431762416448092, + "grad_norm": 0.44647422432899475, + "learning_rate": 0.001, + "loss": 2.3315, + "step": 8112 + }, + { + "epoch": 0.34321854640832555, + "grad_norm": 0.937113344669342, + "learning_rate": 0.001, + "loss": 3.0726, + "step": 8113 + }, + { + "epoch": 0.34326085117184196, + "grad_norm": 0.2121163010597229, + "learning_rate": 0.001, + "loss": 2.0622, + "step": 8114 + }, + { + "epoch": 0.3433031559353583, + "grad_norm": 0.21113400161266327, + "learning_rate": 0.001, + "loss": 2.1562, + "step": 8115 + }, + { + "epoch": 0.34334546069887467, + "grad_norm": 0.22805431485176086, + "learning_rate": 0.001, + "loss": 1.9647, + "step": 8116 + }, + { + "epoch": 0.3433877654623911, + "grad_norm": 0.18042448163032532, + "learning_rate": 0.001, + "loss": 2.1456, + "step": 8117 + }, + { + "epoch": 0.34343007022590744, + "grad_norm": 0.33136382699012756, + "learning_rate": 0.001, + "loss": 3.6987, + "step": 8118 + }, + { + "epoch": 0.3434723749894238, + "grad_norm": 0.865990936756134, + "learning_rate": 0.001, + "loss": 2.1322, + "step": 8119 + }, + { + "epoch": 0.3435146797529402, + "grad_norm": 0.2049923986196518, + "learning_rate": 0.001, + "loss": 2.1151, + "step": 8120 + }, + { + "epoch": 0.34355698451645655, + "grad_norm": 0.41140925884246826, + "learning_rate": 0.001, + "loss": 2.1526, + "step": 8121 + }, + { + "epoch": 0.3435992892799729, + "grad_norm": 0.2289661467075348, + "learning_rate": 0.001, + "loss": 2.3287, + "step": 8122 + }, + { + "epoch": 0.3436415940434893, + "grad_norm": 0.8169209957122803, + "learning_rate": 0.001, + "loss": 3.6942, + "step": 8123 + }, + { + "epoch": 0.34368389880700567, + "grad_norm": 0.21063531935214996, + "learning_rate": 0.001, + "loss": 2.1003, + "step": 8124 + }, + { + "epoch": 0.343726203570522, + "grad_norm": 0.20649611949920654, + "learning_rate": 0.001, + "loss": 1.8405, + "step": 8125 + }, + { + "epoch": 0.34376850833403844, + "grad_norm": 0.20465020835399628, + "learning_rate": 0.001, + "loss": 2.3314, + "step": 8126 + }, + { + "epoch": 0.3438108130975548, + "grad_norm": 0.20881281793117523, + "learning_rate": 0.001, + "loss": 2.8126, + "step": 8127 + }, + { + "epoch": 0.34385311786107114, + "grad_norm": 0.45468512177467346, + "learning_rate": 0.001, + "loss": 2.2817, + "step": 8128 + }, + { + "epoch": 0.34389542262458755, + "grad_norm": 0.4071137011051178, + "learning_rate": 0.001, + "loss": 2.9757, + "step": 8129 + }, + { + "epoch": 0.3439377273881039, + "grad_norm": 1.8788225650787354, + "learning_rate": 0.001, + "loss": 3.8213, + "step": 8130 + }, + { + "epoch": 0.34398003215162026, + "grad_norm": 0.18839234113693237, + "learning_rate": 0.001, + "loss": 2.2812, + "step": 8131 + }, + { + "epoch": 0.3440223369151366, + "grad_norm": 2.463320016860962, + "learning_rate": 0.001, + "loss": 2.9136, + "step": 8132 + }, + { + "epoch": 0.344064641678653, + "grad_norm": 0.2248656004667282, + "learning_rate": 0.001, + "loss": 3.0937, + "step": 8133 + }, + { + "epoch": 0.3441069464421694, + "grad_norm": 1.793042540550232, + "learning_rate": 0.001, + "loss": 3.3005, + "step": 8134 + }, + { + "epoch": 0.34414925120568574, + "grad_norm": 0.17570704221725464, + "learning_rate": 0.001, + "loss": 3.2353, + "step": 8135 + }, + { + "epoch": 0.34419155596920215, + "grad_norm": 1.5945035219192505, + "learning_rate": 0.001, + "loss": 1.9676, + "step": 8136 + }, + { + "epoch": 0.3442338607327185, + "grad_norm": 0.18541212379932404, + "learning_rate": 0.001, + "loss": 2.5174, + "step": 8137 + }, + { + "epoch": 0.34427616549623485, + "grad_norm": 0.1724303811788559, + "learning_rate": 0.001, + "loss": 2.6184, + "step": 8138 + }, + { + "epoch": 0.34431847025975126, + "grad_norm": 0.9634104371070862, + "learning_rate": 0.001, + "loss": 2.0628, + "step": 8139 + }, + { + "epoch": 0.3443607750232676, + "grad_norm": 0.23390017449855804, + "learning_rate": 0.001, + "loss": 2.6646, + "step": 8140 + }, + { + "epoch": 0.34440307978678397, + "grad_norm": 0.19771607220172882, + "learning_rate": 0.001, + "loss": 2.321, + "step": 8141 + }, + { + "epoch": 0.3444453845503004, + "grad_norm": 0.21361044049263, + "learning_rate": 0.001, + "loss": 1.8798, + "step": 8142 + }, + { + "epoch": 0.34448768931381674, + "grad_norm": 0.20966002345085144, + "learning_rate": 0.001, + "loss": 1.9735, + "step": 8143 + }, + { + "epoch": 0.3445299940773331, + "grad_norm": 0.24883241951465607, + "learning_rate": 0.001, + "loss": 2.6098, + "step": 8144 + }, + { + "epoch": 0.3445722988408495, + "grad_norm": 1.5156461000442505, + "learning_rate": 0.001, + "loss": 1.8247, + "step": 8145 + }, + { + "epoch": 0.34461460360436585, + "grad_norm": 0.18546625971794128, + "learning_rate": 0.001, + "loss": 1.8604, + "step": 8146 + }, + { + "epoch": 0.3446569083678822, + "grad_norm": 0.18728595972061157, + "learning_rate": 0.001, + "loss": 2.6729, + "step": 8147 + }, + { + "epoch": 0.3446992131313986, + "grad_norm": 0.17871029675006866, + "learning_rate": 0.001, + "loss": 2.6175, + "step": 8148 + }, + { + "epoch": 0.344741517894915, + "grad_norm": 0.1993844360113144, + "learning_rate": 0.001, + "loss": 2.3318, + "step": 8149 + }, + { + "epoch": 0.3447838226584313, + "grad_norm": 0.3129980266094208, + "learning_rate": 0.001, + "loss": 2.1132, + "step": 8150 + }, + { + "epoch": 0.34482612742194774, + "grad_norm": 0.2431798130273819, + "learning_rate": 0.001, + "loss": 1.7761, + "step": 8151 + }, + { + "epoch": 0.3448684321854641, + "grad_norm": 0.33781230449676514, + "learning_rate": 0.001, + "loss": 4.0398, + "step": 8152 + }, + { + "epoch": 0.34491073694898045, + "grad_norm": 1.1244258880615234, + "learning_rate": 0.001, + "loss": 3.1731, + "step": 8153 + }, + { + "epoch": 0.3449530417124968, + "grad_norm": 0.1972734034061432, + "learning_rate": 0.001, + "loss": 3.1912, + "step": 8154 + }, + { + "epoch": 0.3449953464760132, + "grad_norm": 0.19920934736728668, + "learning_rate": 0.001, + "loss": 2.0368, + "step": 8155 + }, + { + "epoch": 0.34503765123952956, + "grad_norm": 0.2313353568315506, + "learning_rate": 0.001, + "loss": 1.7179, + "step": 8156 + }, + { + "epoch": 0.3450799560030459, + "grad_norm": 0.3713974356651306, + "learning_rate": 0.001, + "loss": 2.0866, + "step": 8157 + }, + { + "epoch": 0.3451222607665623, + "grad_norm": 0.21588067710399628, + "learning_rate": 0.001, + "loss": 1.7316, + "step": 8158 + }, + { + "epoch": 0.3451645655300787, + "grad_norm": 0.3257901668548584, + "learning_rate": 0.001, + "loss": 1.9901, + "step": 8159 + }, + { + "epoch": 0.34520687029359504, + "grad_norm": 0.21106848120689392, + "learning_rate": 0.001, + "loss": 3.0461, + "step": 8160 + }, + { + "epoch": 0.34524917505711145, + "grad_norm": 0.6497496962547302, + "learning_rate": 0.001, + "loss": 3.1958, + "step": 8161 + }, + { + "epoch": 0.3452914798206278, + "grad_norm": 0.17610898613929749, + "learning_rate": 0.001, + "loss": 2.2529, + "step": 8162 + }, + { + "epoch": 0.34533378458414415, + "grad_norm": 0.19626934826374054, + "learning_rate": 0.001, + "loss": 1.6347, + "step": 8163 + }, + { + "epoch": 0.34537608934766056, + "grad_norm": 0.21596650779247284, + "learning_rate": 0.001, + "loss": 3.0951, + "step": 8164 + }, + { + "epoch": 0.3454183941111769, + "grad_norm": 0.635787308216095, + "learning_rate": 0.001, + "loss": 2.1464, + "step": 8165 + }, + { + "epoch": 0.3454606988746933, + "grad_norm": 0.17580640316009521, + "learning_rate": 0.001, + "loss": 1.8562, + "step": 8166 + }, + { + "epoch": 0.3455030036382097, + "grad_norm": 0.35619786381721497, + "learning_rate": 0.001, + "loss": 2.0581, + "step": 8167 + }, + { + "epoch": 0.34554530840172604, + "grad_norm": 0.19072014093399048, + "learning_rate": 0.001, + "loss": 2.9908, + "step": 8168 + }, + { + "epoch": 0.3455876131652424, + "grad_norm": 0.21359659731388092, + "learning_rate": 0.001, + "loss": 1.9315, + "step": 8169 + }, + { + "epoch": 0.3456299179287588, + "grad_norm": 0.197892963886261, + "learning_rate": 0.001, + "loss": 2.3538, + "step": 8170 + }, + { + "epoch": 0.34567222269227516, + "grad_norm": 0.18982310593128204, + "learning_rate": 0.001, + "loss": 2.0689, + "step": 8171 + }, + { + "epoch": 0.3457145274557915, + "grad_norm": 0.19009721279144287, + "learning_rate": 0.001, + "loss": 2.123, + "step": 8172 + }, + { + "epoch": 0.3457568322193079, + "grad_norm": 0.5255012512207031, + "learning_rate": 0.001, + "loss": 2.5506, + "step": 8173 + }, + { + "epoch": 0.3457991369828243, + "grad_norm": 1.7404595613479614, + "learning_rate": 0.001, + "loss": 2.4197, + "step": 8174 + }, + { + "epoch": 0.3458414417463406, + "grad_norm": 0.2141122967004776, + "learning_rate": 0.001, + "loss": 2.2982, + "step": 8175 + }, + { + "epoch": 0.34588374650985704, + "grad_norm": 0.21013054251670837, + "learning_rate": 0.001, + "loss": 3.148, + "step": 8176 + }, + { + "epoch": 0.3459260512733734, + "grad_norm": 0.2299475222826004, + "learning_rate": 0.001, + "loss": 2.7599, + "step": 8177 + }, + { + "epoch": 0.34596835603688975, + "grad_norm": 0.735609769821167, + "learning_rate": 0.001, + "loss": 1.6265, + "step": 8178 + }, + { + "epoch": 0.3460106608004061, + "grad_norm": 0.3593994677066803, + "learning_rate": 0.001, + "loss": 2.6944, + "step": 8179 + }, + { + "epoch": 0.3460529655639225, + "grad_norm": 0.22775153815746307, + "learning_rate": 0.001, + "loss": 2.2529, + "step": 8180 + }, + { + "epoch": 0.34609527032743886, + "grad_norm": 0.2560899257659912, + "learning_rate": 0.001, + "loss": 1.9796, + "step": 8181 + }, + { + "epoch": 0.3461375750909552, + "grad_norm": 0.20973028242588043, + "learning_rate": 0.001, + "loss": 2.9659, + "step": 8182 + }, + { + "epoch": 0.34617987985447163, + "grad_norm": 2.2904813289642334, + "learning_rate": 0.001, + "loss": 1.5693, + "step": 8183 + }, + { + "epoch": 0.346222184617988, + "grad_norm": 0.2068173885345459, + "learning_rate": 0.001, + "loss": 2.2342, + "step": 8184 + }, + { + "epoch": 0.34626448938150434, + "grad_norm": 0.6102269291877747, + "learning_rate": 0.001, + "loss": 2.1191, + "step": 8185 + }, + { + "epoch": 0.34630679414502075, + "grad_norm": 0.23791219294071198, + "learning_rate": 0.001, + "loss": 1.873, + "step": 8186 + }, + { + "epoch": 0.3463490989085371, + "grad_norm": 0.2418316751718521, + "learning_rate": 0.001, + "loss": 2.0798, + "step": 8187 + }, + { + "epoch": 0.34639140367205346, + "grad_norm": 0.944127082824707, + "learning_rate": 0.001, + "loss": 2.1407, + "step": 8188 + }, + { + "epoch": 0.34643370843556986, + "grad_norm": 0.20907090604305267, + "learning_rate": 0.001, + "loss": 2.5714, + "step": 8189 + }, + { + "epoch": 0.3464760131990862, + "grad_norm": 0.26245564222335815, + "learning_rate": 0.001, + "loss": 2.214, + "step": 8190 + }, + { + "epoch": 0.3465183179626026, + "grad_norm": 0.2782506048679352, + "learning_rate": 0.001, + "loss": 1.9815, + "step": 8191 + }, + { + "epoch": 0.346560622726119, + "grad_norm": 0.2448754757642746, + "learning_rate": 0.001, + "loss": 2.5002, + "step": 8192 + }, + { + "epoch": 0.34660292748963534, + "grad_norm": 0.21422387659549713, + "learning_rate": 0.001, + "loss": 2.2284, + "step": 8193 + }, + { + "epoch": 0.3466452322531517, + "grad_norm": 0.19360801577568054, + "learning_rate": 0.001, + "loss": 1.8096, + "step": 8194 + }, + { + "epoch": 0.3466875370166681, + "grad_norm": 0.17713648080825806, + "learning_rate": 0.001, + "loss": 2.179, + "step": 8195 + }, + { + "epoch": 0.34672984178018446, + "grad_norm": 0.23182488977909088, + "learning_rate": 0.001, + "loss": 1.9, + "step": 8196 + }, + { + "epoch": 0.3467721465437008, + "grad_norm": 0.6592618227005005, + "learning_rate": 0.001, + "loss": 2.6549, + "step": 8197 + }, + { + "epoch": 0.3468144513072172, + "grad_norm": 0.20647495985031128, + "learning_rate": 0.001, + "loss": 2.7395, + "step": 8198 + }, + { + "epoch": 0.3468567560707336, + "grad_norm": 18.073009490966797, + "learning_rate": 0.001, + "loss": 2.7044, + "step": 8199 + }, + { + "epoch": 0.34689906083424993, + "grad_norm": 0.1915636509656906, + "learning_rate": 0.001, + "loss": 2.1236, + "step": 8200 + }, + { + "epoch": 0.3469413655977663, + "grad_norm": 0.18017853796482086, + "learning_rate": 0.001, + "loss": 1.8282, + "step": 8201 + }, + { + "epoch": 0.3469836703612827, + "grad_norm": 1.731067419052124, + "learning_rate": 0.001, + "loss": 2.718, + "step": 8202 + }, + { + "epoch": 0.34702597512479905, + "grad_norm": 0.2168511152267456, + "learning_rate": 0.001, + "loss": 2.0306, + "step": 8203 + }, + { + "epoch": 0.3470682798883154, + "grad_norm": 0.23651044070720673, + "learning_rate": 0.001, + "loss": 2.4044, + "step": 8204 + }, + { + "epoch": 0.3471105846518318, + "grad_norm": 0.2720677852630615, + "learning_rate": 0.001, + "loss": 1.8696, + "step": 8205 + }, + { + "epoch": 0.34715288941534816, + "grad_norm": 0.24406278133392334, + "learning_rate": 0.001, + "loss": 1.9719, + "step": 8206 + }, + { + "epoch": 0.3471951941788645, + "grad_norm": 0.2830585241317749, + "learning_rate": 0.001, + "loss": 2.2669, + "step": 8207 + }, + { + "epoch": 0.34723749894238093, + "grad_norm": 0.9110187292098999, + "learning_rate": 0.001, + "loss": 2.46, + "step": 8208 + }, + { + "epoch": 0.3472798037058973, + "grad_norm": 0.23988114297389984, + "learning_rate": 0.001, + "loss": 2.1952, + "step": 8209 + }, + { + "epoch": 0.34732210846941364, + "grad_norm": 0.23413024842739105, + "learning_rate": 0.001, + "loss": 1.7244, + "step": 8210 + }, + { + "epoch": 0.34736441323293005, + "grad_norm": 0.17257529497146606, + "learning_rate": 0.001, + "loss": 1.3189, + "step": 8211 + }, + { + "epoch": 0.3474067179964464, + "grad_norm": 0.3271627128124237, + "learning_rate": 0.001, + "loss": 2.999, + "step": 8212 + }, + { + "epoch": 0.34744902275996276, + "grad_norm": 0.4803444445133209, + "learning_rate": 0.001, + "loss": 3.5374, + "step": 8213 + }, + { + "epoch": 0.34749132752347917, + "grad_norm": 0.20940083265304565, + "learning_rate": 0.001, + "loss": 3.9058, + "step": 8214 + }, + { + "epoch": 0.3475336322869955, + "grad_norm": 2.5996439456939697, + "learning_rate": 0.001, + "loss": 2.0939, + "step": 8215 + }, + { + "epoch": 0.3475759370505119, + "grad_norm": 3.1301825046539307, + "learning_rate": 0.001, + "loss": 3.033, + "step": 8216 + }, + { + "epoch": 0.3476182418140283, + "grad_norm": 0.2768004834651947, + "learning_rate": 0.001, + "loss": 2.4367, + "step": 8217 + }, + { + "epoch": 0.34766054657754464, + "grad_norm": 0.2278338372707367, + "learning_rate": 0.001, + "loss": 2.7675, + "step": 8218 + }, + { + "epoch": 0.347702851341061, + "grad_norm": 59.52205276489258, + "learning_rate": 0.001, + "loss": 1.8877, + "step": 8219 + }, + { + "epoch": 0.3477451561045774, + "grad_norm": 0.4094898998737335, + "learning_rate": 0.001, + "loss": 3.3673, + "step": 8220 + }, + { + "epoch": 0.34778746086809376, + "grad_norm": 0.24076704680919647, + "learning_rate": 0.001, + "loss": 1.7683, + "step": 8221 + }, + { + "epoch": 0.3478297656316101, + "grad_norm": 0.2221417874097824, + "learning_rate": 0.001, + "loss": 2.8931, + "step": 8222 + }, + { + "epoch": 0.34787207039512646, + "grad_norm": 0.2613305151462555, + "learning_rate": 0.001, + "loss": 2.6144, + "step": 8223 + }, + { + "epoch": 0.3479143751586429, + "grad_norm": 0.22772428393363953, + "learning_rate": 0.001, + "loss": 2.4166, + "step": 8224 + }, + { + "epoch": 0.34795667992215923, + "grad_norm": 0.21340018510818481, + "learning_rate": 0.001, + "loss": 1.5321, + "step": 8225 + }, + { + "epoch": 0.3479989846856756, + "grad_norm": 0.24845480918884277, + "learning_rate": 0.001, + "loss": 1.9605, + "step": 8226 + }, + { + "epoch": 0.348041289449192, + "grad_norm": 0.24793021380901337, + "learning_rate": 0.001, + "loss": 3.3195, + "step": 8227 + }, + { + "epoch": 0.34808359421270835, + "grad_norm": 3.903813123703003, + "learning_rate": 0.001, + "loss": 2.8211, + "step": 8228 + }, + { + "epoch": 0.3481258989762247, + "grad_norm": 0.21152375638484955, + "learning_rate": 0.001, + "loss": 2.4791, + "step": 8229 + }, + { + "epoch": 0.3481682037397411, + "grad_norm": 0.8900184631347656, + "learning_rate": 0.001, + "loss": 2.0313, + "step": 8230 + }, + { + "epoch": 0.34821050850325747, + "grad_norm": 0.2712019383907318, + "learning_rate": 0.001, + "loss": 2.3294, + "step": 8231 + }, + { + "epoch": 0.3482528132667738, + "grad_norm": 5.551436424255371, + "learning_rate": 0.001, + "loss": 2.4235, + "step": 8232 + }, + { + "epoch": 0.34829511803029023, + "grad_norm": 0.28290316462516785, + "learning_rate": 0.001, + "loss": 2.4363, + "step": 8233 + }, + { + "epoch": 0.3483374227938066, + "grad_norm": 0.7481885552406311, + "learning_rate": 0.001, + "loss": 1.7503, + "step": 8234 + }, + { + "epoch": 0.34837972755732294, + "grad_norm": 0.2814173400402069, + "learning_rate": 0.001, + "loss": 3.2661, + "step": 8235 + }, + { + "epoch": 0.34842203232083935, + "grad_norm": 1.8952068090438843, + "learning_rate": 0.001, + "loss": 3.6599, + "step": 8236 + }, + { + "epoch": 0.3484643370843557, + "grad_norm": 1.625000238418579, + "learning_rate": 0.001, + "loss": 3.2106, + "step": 8237 + }, + { + "epoch": 0.34850664184787206, + "grad_norm": 0.29325634241104126, + "learning_rate": 0.001, + "loss": 1.8171, + "step": 8238 + }, + { + "epoch": 0.34854894661138847, + "grad_norm": 0.23248402774333954, + "learning_rate": 0.001, + "loss": 1.9535, + "step": 8239 + }, + { + "epoch": 0.3485912513749048, + "grad_norm": 0.36317363381385803, + "learning_rate": 0.001, + "loss": 2.1191, + "step": 8240 + }, + { + "epoch": 0.3486335561384212, + "grad_norm": 7.670083522796631, + "learning_rate": 0.001, + "loss": 2.581, + "step": 8241 + }, + { + "epoch": 0.3486758609019376, + "grad_norm": 0.2423803061246872, + "learning_rate": 0.001, + "loss": 2.7115, + "step": 8242 + }, + { + "epoch": 0.34871816566545394, + "grad_norm": 0.21461081504821777, + "learning_rate": 0.001, + "loss": 2.5597, + "step": 8243 + }, + { + "epoch": 0.3487604704289703, + "grad_norm": 0.28708726167678833, + "learning_rate": 0.001, + "loss": 3.2198, + "step": 8244 + }, + { + "epoch": 0.34880277519248665, + "grad_norm": 6.893510341644287, + "learning_rate": 0.001, + "loss": 2.521, + "step": 8245 + }, + { + "epoch": 0.34884507995600306, + "grad_norm": 0.19617153704166412, + "learning_rate": 0.001, + "loss": 1.9453, + "step": 8246 + }, + { + "epoch": 0.3488873847195194, + "grad_norm": 0.20679624378681183, + "learning_rate": 0.001, + "loss": 2.4901, + "step": 8247 + }, + { + "epoch": 0.34892968948303577, + "grad_norm": 0.28653019666671753, + "learning_rate": 0.001, + "loss": 2.3008, + "step": 8248 + }, + { + "epoch": 0.3489719942465522, + "grad_norm": 0.7451533675193787, + "learning_rate": 0.001, + "loss": 3.0218, + "step": 8249 + }, + { + "epoch": 0.34901429901006853, + "grad_norm": 0.3160172998905182, + "learning_rate": 0.001, + "loss": 2.5261, + "step": 8250 + }, + { + "epoch": 0.3490566037735849, + "grad_norm": 0.20035721361637115, + "learning_rate": 0.001, + "loss": 2.3146, + "step": 8251 + }, + { + "epoch": 0.3490989085371013, + "grad_norm": 0.19196628034114838, + "learning_rate": 0.001, + "loss": 3.0744, + "step": 8252 + }, + { + "epoch": 0.34914121330061765, + "grad_norm": 0.2677168846130371, + "learning_rate": 0.001, + "loss": 2.6725, + "step": 8253 + }, + { + "epoch": 0.349183518064134, + "grad_norm": 0.210031196475029, + "learning_rate": 0.001, + "loss": 2.6667, + "step": 8254 + }, + { + "epoch": 0.3492258228276504, + "grad_norm": 0.2728026211261749, + "learning_rate": 0.001, + "loss": 1.6813, + "step": 8255 + }, + { + "epoch": 0.34926812759116677, + "grad_norm": 0.2557034492492676, + "learning_rate": 0.001, + "loss": 2.1864, + "step": 8256 + }, + { + "epoch": 0.3493104323546831, + "grad_norm": 0.7835190892219543, + "learning_rate": 0.001, + "loss": 3.1137, + "step": 8257 + }, + { + "epoch": 0.34935273711819953, + "grad_norm": 0.28121432662010193, + "learning_rate": 0.001, + "loss": 2.9322, + "step": 8258 + }, + { + "epoch": 0.3493950418817159, + "grad_norm": 0.207332044839859, + "learning_rate": 0.001, + "loss": 2.1749, + "step": 8259 + }, + { + "epoch": 0.34943734664523224, + "grad_norm": 0.5446605682373047, + "learning_rate": 0.001, + "loss": 2.0835, + "step": 8260 + }, + { + "epoch": 0.34947965140874865, + "grad_norm": 0.17694032192230225, + "learning_rate": 0.001, + "loss": 1.9013, + "step": 8261 + }, + { + "epoch": 0.349521956172265, + "grad_norm": 0.546602725982666, + "learning_rate": 0.001, + "loss": 2.1906, + "step": 8262 + }, + { + "epoch": 0.34956426093578136, + "grad_norm": 0.26002398133277893, + "learning_rate": 0.001, + "loss": 2.5833, + "step": 8263 + }, + { + "epoch": 0.34960656569929777, + "grad_norm": 0.21473874151706696, + "learning_rate": 0.001, + "loss": 1.9894, + "step": 8264 + }, + { + "epoch": 0.3496488704628141, + "grad_norm": 0.22110356390476227, + "learning_rate": 0.001, + "loss": 1.663, + "step": 8265 + }, + { + "epoch": 0.3496911752263305, + "grad_norm": 0.8616834878921509, + "learning_rate": 0.001, + "loss": 2.6967, + "step": 8266 + }, + { + "epoch": 0.34973347998984683, + "grad_norm": 0.2699930965900421, + "learning_rate": 0.001, + "loss": 3.416, + "step": 8267 + }, + { + "epoch": 0.34977578475336324, + "grad_norm": 0.27595359086990356, + "learning_rate": 0.001, + "loss": 1.9483, + "step": 8268 + }, + { + "epoch": 0.3498180895168796, + "grad_norm": 0.9030215740203857, + "learning_rate": 0.001, + "loss": 1.9612, + "step": 8269 + }, + { + "epoch": 0.34986039428039595, + "grad_norm": 0.21707811951637268, + "learning_rate": 0.001, + "loss": 1.8743, + "step": 8270 + }, + { + "epoch": 0.34990269904391236, + "grad_norm": 0.22298790514469147, + "learning_rate": 0.001, + "loss": 2.299, + "step": 8271 + }, + { + "epoch": 0.3499450038074287, + "grad_norm": 0.4610617756843567, + "learning_rate": 0.001, + "loss": 2.5806, + "step": 8272 + }, + { + "epoch": 0.34998730857094507, + "grad_norm": 0.16975651681423187, + "learning_rate": 0.001, + "loss": 1.8847, + "step": 8273 + }, + { + "epoch": 0.3500296133344615, + "grad_norm": 0.1778561770915985, + "learning_rate": 0.001, + "loss": 1.9668, + "step": 8274 + }, + { + "epoch": 0.35007191809797783, + "grad_norm": 0.1916327327489853, + "learning_rate": 0.001, + "loss": 2.3812, + "step": 8275 + }, + { + "epoch": 0.3501142228614942, + "grad_norm": 0.18574245274066925, + "learning_rate": 0.001, + "loss": 2.6713, + "step": 8276 + }, + { + "epoch": 0.3501565276250106, + "grad_norm": 0.17055678367614746, + "learning_rate": 0.001, + "loss": 2.7099, + "step": 8277 + }, + { + "epoch": 0.35019883238852695, + "grad_norm": 3.7420198917388916, + "learning_rate": 0.001, + "loss": 2.2635, + "step": 8278 + }, + { + "epoch": 0.3502411371520433, + "grad_norm": 0.19405874609947205, + "learning_rate": 0.001, + "loss": 2.8745, + "step": 8279 + }, + { + "epoch": 0.3502834419155597, + "grad_norm": 0.43192586302757263, + "learning_rate": 0.001, + "loss": 2.0631, + "step": 8280 + }, + { + "epoch": 0.35032574667907607, + "grad_norm": 0.19240808486938477, + "learning_rate": 0.001, + "loss": 2.2849, + "step": 8281 + }, + { + "epoch": 0.3503680514425924, + "grad_norm": 0.17617124319076538, + "learning_rate": 0.001, + "loss": 1.8595, + "step": 8282 + }, + { + "epoch": 0.35041035620610883, + "grad_norm": 4.70356559753418, + "learning_rate": 0.001, + "loss": 2.568, + "step": 8283 + }, + { + "epoch": 0.3504526609696252, + "grad_norm": 0.2642245292663574, + "learning_rate": 0.001, + "loss": 3.0254, + "step": 8284 + }, + { + "epoch": 0.35049496573314154, + "grad_norm": 0.1709575653076172, + "learning_rate": 0.001, + "loss": 2.6415, + "step": 8285 + }, + { + "epoch": 0.35053727049665795, + "grad_norm": 0.2094145566225052, + "learning_rate": 0.001, + "loss": 2.4383, + "step": 8286 + }, + { + "epoch": 0.3505795752601743, + "grad_norm": 0.18870730698108673, + "learning_rate": 0.001, + "loss": 1.6139, + "step": 8287 + }, + { + "epoch": 0.35062188002369066, + "grad_norm": 0.23713693022727966, + "learning_rate": 0.001, + "loss": 1.9205, + "step": 8288 + }, + { + "epoch": 0.35066418478720707, + "grad_norm": 0.1588599532842636, + "learning_rate": 0.001, + "loss": 1.5372, + "step": 8289 + }, + { + "epoch": 0.3507064895507234, + "grad_norm": 7.636360168457031, + "learning_rate": 0.001, + "loss": 2.0058, + "step": 8290 + }, + { + "epoch": 0.3507487943142398, + "grad_norm": 0.8418156504631042, + "learning_rate": 0.001, + "loss": 2.6797, + "step": 8291 + }, + { + "epoch": 0.35079109907775613, + "grad_norm": 0.2791912853717804, + "learning_rate": 0.001, + "loss": 3.1777, + "step": 8292 + }, + { + "epoch": 0.35083340384127254, + "grad_norm": 0.22210825979709625, + "learning_rate": 0.001, + "loss": 2.0927, + "step": 8293 + }, + { + "epoch": 0.3508757086047889, + "grad_norm": 2.1510884761810303, + "learning_rate": 0.001, + "loss": 2.077, + "step": 8294 + }, + { + "epoch": 0.35091801336830525, + "grad_norm": 0.21856360137462616, + "learning_rate": 0.001, + "loss": 2.7959, + "step": 8295 + }, + { + "epoch": 0.35096031813182166, + "grad_norm": 0.2667442560195923, + "learning_rate": 0.001, + "loss": 2.2033, + "step": 8296 + }, + { + "epoch": 0.351002622895338, + "grad_norm": 0.24852122366428375, + "learning_rate": 0.001, + "loss": 2.1141, + "step": 8297 + }, + { + "epoch": 0.35104492765885437, + "grad_norm": 1.397088885307312, + "learning_rate": 0.001, + "loss": 1.803, + "step": 8298 + }, + { + "epoch": 0.3510872324223708, + "grad_norm": 17.205541610717773, + "learning_rate": 0.001, + "loss": 2.371, + "step": 8299 + }, + { + "epoch": 0.35112953718588713, + "grad_norm": 0.2123159021139145, + "learning_rate": 0.001, + "loss": 3.3105, + "step": 8300 + }, + { + "epoch": 0.3511718419494035, + "grad_norm": 2.099641799926758, + "learning_rate": 0.001, + "loss": 2.1354, + "step": 8301 + }, + { + "epoch": 0.3512141467129199, + "grad_norm": 6.61326789855957, + "learning_rate": 0.001, + "loss": 2.2409, + "step": 8302 + }, + { + "epoch": 0.35125645147643625, + "grad_norm": 2.718135118484497, + "learning_rate": 0.001, + "loss": 2.2279, + "step": 8303 + }, + { + "epoch": 0.3512987562399526, + "grad_norm": 1.8631720542907715, + "learning_rate": 0.001, + "loss": 3.8234, + "step": 8304 + }, + { + "epoch": 0.351341061003469, + "grad_norm": 20.797183990478516, + "learning_rate": 0.001, + "loss": 3.4096, + "step": 8305 + }, + { + "epoch": 0.35138336576698537, + "grad_norm": 0.22008393704891205, + "learning_rate": 0.001, + "loss": 1.8032, + "step": 8306 + }, + { + "epoch": 0.3514256705305017, + "grad_norm": 0.26470401883125305, + "learning_rate": 0.001, + "loss": 2.6094, + "step": 8307 + }, + { + "epoch": 0.35146797529401813, + "grad_norm": 0.19725409150123596, + "learning_rate": 0.001, + "loss": 1.5124, + "step": 8308 + }, + { + "epoch": 0.3515102800575345, + "grad_norm": 0.3234120011329651, + "learning_rate": 0.001, + "loss": 2.1466, + "step": 8309 + }, + { + "epoch": 0.35155258482105084, + "grad_norm": 0.32287687063217163, + "learning_rate": 0.001, + "loss": 2.0659, + "step": 8310 + }, + { + "epoch": 0.35159488958456725, + "grad_norm": 0.2961645722389221, + "learning_rate": 0.001, + "loss": 2.6533, + "step": 8311 + }, + { + "epoch": 0.3516371943480836, + "grad_norm": 2.627986192703247, + "learning_rate": 0.001, + "loss": 2.3641, + "step": 8312 + }, + { + "epoch": 0.35167949911159996, + "grad_norm": 0.374530166387558, + "learning_rate": 0.001, + "loss": 2.9414, + "step": 8313 + }, + { + "epoch": 0.3517218038751163, + "grad_norm": 0.19504281878471375, + "learning_rate": 0.001, + "loss": 1.7939, + "step": 8314 + }, + { + "epoch": 0.3517641086386327, + "grad_norm": 0.6185072064399719, + "learning_rate": 0.001, + "loss": 2.1054, + "step": 8315 + }, + { + "epoch": 0.3518064134021491, + "grad_norm": 0.4495874345302582, + "learning_rate": 0.001, + "loss": 1.7725, + "step": 8316 + }, + { + "epoch": 0.35184871816566543, + "grad_norm": 0.2591138780117035, + "learning_rate": 0.001, + "loss": 2.4438, + "step": 8317 + }, + { + "epoch": 0.35189102292918184, + "grad_norm": 0.2160710096359253, + "learning_rate": 0.001, + "loss": 1.9199, + "step": 8318 + }, + { + "epoch": 0.3519333276926982, + "grad_norm": 0.22822026908397675, + "learning_rate": 0.001, + "loss": 2.8699, + "step": 8319 + }, + { + "epoch": 0.35197563245621455, + "grad_norm": 0.23242418467998505, + "learning_rate": 0.001, + "loss": 2.9319, + "step": 8320 + }, + { + "epoch": 0.35201793721973096, + "grad_norm": 59.025177001953125, + "learning_rate": 0.001, + "loss": 2.2009, + "step": 8321 + }, + { + "epoch": 0.3520602419832473, + "grad_norm": 0.3726635277271271, + "learning_rate": 0.001, + "loss": 1.9791, + "step": 8322 + }, + { + "epoch": 0.35210254674676367, + "grad_norm": 0.2542991638183594, + "learning_rate": 0.001, + "loss": 3.2745, + "step": 8323 + }, + { + "epoch": 0.3521448515102801, + "grad_norm": 0.25025397539138794, + "learning_rate": 0.001, + "loss": 1.8405, + "step": 8324 + }, + { + "epoch": 0.35218715627379643, + "grad_norm": 0.2506003677845001, + "learning_rate": 0.001, + "loss": 3.0609, + "step": 8325 + }, + { + "epoch": 0.3522294610373128, + "grad_norm": 0.26360347867012024, + "learning_rate": 0.001, + "loss": 2.1514, + "step": 8326 + }, + { + "epoch": 0.3522717658008292, + "grad_norm": 0.23163963854312897, + "learning_rate": 0.001, + "loss": 2.0993, + "step": 8327 + }, + { + "epoch": 0.35231407056434555, + "grad_norm": 18.21137046813965, + "learning_rate": 0.001, + "loss": 2.8873, + "step": 8328 + }, + { + "epoch": 0.3523563753278619, + "grad_norm": 0.21554243564605713, + "learning_rate": 0.001, + "loss": 2.7229, + "step": 8329 + }, + { + "epoch": 0.3523986800913783, + "grad_norm": 0.26712241768836975, + "learning_rate": 0.001, + "loss": 2.5294, + "step": 8330 + }, + { + "epoch": 0.35244098485489467, + "grad_norm": 0.2301950454711914, + "learning_rate": 0.001, + "loss": 2.3371, + "step": 8331 + }, + { + "epoch": 0.352483289618411, + "grad_norm": 0.19002433121204376, + "learning_rate": 0.001, + "loss": 2.637, + "step": 8332 + }, + { + "epoch": 0.35252559438192743, + "grad_norm": 0.25531867146492004, + "learning_rate": 0.001, + "loss": 2.001, + "step": 8333 + }, + { + "epoch": 0.3525678991454438, + "grad_norm": 0.9937705397605896, + "learning_rate": 0.001, + "loss": 1.9568, + "step": 8334 + }, + { + "epoch": 0.35261020390896014, + "grad_norm": 0.3505503833293915, + "learning_rate": 0.001, + "loss": 2.2035, + "step": 8335 + }, + { + "epoch": 0.3526525086724765, + "grad_norm": 0.2179388403892517, + "learning_rate": 0.001, + "loss": 2.0458, + "step": 8336 + }, + { + "epoch": 0.3526948134359929, + "grad_norm": 0.17621107399463654, + "learning_rate": 0.001, + "loss": 1.9847, + "step": 8337 + }, + { + "epoch": 0.35273711819950926, + "grad_norm": 0.34067684412002563, + "learning_rate": 0.001, + "loss": 2.064, + "step": 8338 + }, + { + "epoch": 0.3527794229630256, + "grad_norm": 0.24471262097358704, + "learning_rate": 0.001, + "loss": 3.5153, + "step": 8339 + }, + { + "epoch": 0.352821727726542, + "grad_norm": 0.37908583879470825, + "learning_rate": 0.001, + "loss": 2.4995, + "step": 8340 + }, + { + "epoch": 0.3528640324900584, + "grad_norm": 0.453393816947937, + "learning_rate": 0.001, + "loss": 2.0838, + "step": 8341 + }, + { + "epoch": 0.35290633725357473, + "grad_norm": 0.18824003636837006, + "learning_rate": 0.001, + "loss": 2.6714, + "step": 8342 + }, + { + "epoch": 0.35294864201709114, + "grad_norm": 0.16498175263404846, + "learning_rate": 0.001, + "loss": 1.8878, + "step": 8343 + }, + { + "epoch": 0.3529909467806075, + "grad_norm": 0.18404828011989594, + "learning_rate": 0.001, + "loss": 2.5525, + "step": 8344 + }, + { + "epoch": 0.35303325154412385, + "grad_norm": 0.19402121007442474, + "learning_rate": 0.001, + "loss": 1.3525, + "step": 8345 + }, + { + "epoch": 0.35307555630764026, + "grad_norm": 0.3609767258167267, + "learning_rate": 0.001, + "loss": 2.2392, + "step": 8346 + }, + { + "epoch": 0.3531178610711566, + "grad_norm": 0.17331457138061523, + "learning_rate": 0.001, + "loss": 1.9563, + "step": 8347 + }, + { + "epoch": 0.35316016583467297, + "grad_norm": 0.1629885584115982, + "learning_rate": 0.001, + "loss": 2.2316, + "step": 8348 + }, + { + "epoch": 0.3532024705981894, + "grad_norm": 0.4724940061569214, + "learning_rate": 0.001, + "loss": 2.8375, + "step": 8349 + }, + { + "epoch": 0.35324477536170573, + "grad_norm": 0.19430530071258545, + "learning_rate": 0.001, + "loss": 2.5702, + "step": 8350 + }, + { + "epoch": 0.3532870801252221, + "grad_norm": 0.2884927988052368, + "learning_rate": 0.001, + "loss": 1.8518, + "step": 8351 + }, + { + "epoch": 0.3533293848887385, + "grad_norm": 0.30345895886421204, + "learning_rate": 0.001, + "loss": 2.2031, + "step": 8352 + }, + { + "epoch": 0.35337168965225485, + "grad_norm": 0.1875777542591095, + "learning_rate": 0.001, + "loss": 1.9054, + "step": 8353 + }, + { + "epoch": 0.3534139944157712, + "grad_norm": 0.17914628982543945, + "learning_rate": 0.001, + "loss": 2.1172, + "step": 8354 + }, + { + "epoch": 0.3534562991792876, + "grad_norm": 0.223419189453125, + "learning_rate": 0.001, + "loss": 2.0011, + "step": 8355 + }, + { + "epoch": 0.35349860394280397, + "grad_norm": 1.009746789932251, + "learning_rate": 0.001, + "loss": 3.5386, + "step": 8356 + }, + { + "epoch": 0.3535409087063203, + "grad_norm": 0.17974236607551575, + "learning_rate": 0.001, + "loss": 2.8207, + "step": 8357 + }, + { + "epoch": 0.3535832134698367, + "grad_norm": 0.20830687880516052, + "learning_rate": 0.001, + "loss": 2.6878, + "step": 8358 + }, + { + "epoch": 0.3536255182333531, + "grad_norm": 0.19284899532794952, + "learning_rate": 0.001, + "loss": 1.9765, + "step": 8359 + }, + { + "epoch": 0.35366782299686944, + "grad_norm": 1.829329252243042, + "learning_rate": 0.001, + "loss": 1.6846, + "step": 8360 + }, + { + "epoch": 0.3537101277603858, + "grad_norm": 0.21594662964344025, + "learning_rate": 0.001, + "loss": 1.8569, + "step": 8361 + }, + { + "epoch": 0.3537524325239022, + "grad_norm": 4.540489673614502, + "learning_rate": 0.001, + "loss": 2.6528, + "step": 8362 + }, + { + "epoch": 0.35379473728741856, + "grad_norm": 0.24919438362121582, + "learning_rate": 0.001, + "loss": 3.6536, + "step": 8363 + }, + { + "epoch": 0.3538370420509349, + "grad_norm": 0.21543803811073303, + "learning_rate": 0.001, + "loss": 2.3457, + "step": 8364 + }, + { + "epoch": 0.3538793468144513, + "grad_norm": 0.18875446915626526, + "learning_rate": 0.001, + "loss": 1.6829, + "step": 8365 + }, + { + "epoch": 0.3539216515779677, + "grad_norm": 0.18854385614395142, + "learning_rate": 0.001, + "loss": 2.7787, + "step": 8366 + }, + { + "epoch": 0.35396395634148403, + "grad_norm": 0.23087264597415924, + "learning_rate": 0.001, + "loss": 2.1272, + "step": 8367 + }, + { + "epoch": 0.35400626110500044, + "grad_norm": 0.16182763874530792, + "learning_rate": 0.001, + "loss": 1.4934, + "step": 8368 + }, + { + "epoch": 0.3540485658685168, + "grad_norm": 0.19627897441387177, + "learning_rate": 0.001, + "loss": 2.9024, + "step": 8369 + }, + { + "epoch": 0.35409087063203315, + "grad_norm": 0.18524384498596191, + "learning_rate": 0.001, + "loss": 2.1512, + "step": 8370 + }, + { + "epoch": 0.35413317539554956, + "grad_norm": 0.1795620322227478, + "learning_rate": 0.001, + "loss": 2.0742, + "step": 8371 + }, + { + "epoch": 0.3541754801590659, + "grad_norm": 0.20785008370876312, + "learning_rate": 0.001, + "loss": 1.6379, + "step": 8372 + }, + { + "epoch": 0.35421778492258227, + "grad_norm": 0.20255346596240997, + "learning_rate": 0.001, + "loss": 2.1128, + "step": 8373 + }, + { + "epoch": 0.3542600896860987, + "grad_norm": 0.22817464172840118, + "learning_rate": 0.001, + "loss": 2.0911, + "step": 8374 + }, + { + "epoch": 0.35430239444961503, + "grad_norm": 1.5379173755645752, + "learning_rate": 0.001, + "loss": 2.2441, + "step": 8375 + }, + { + "epoch": 0.3543446992131314, + "grad_norm": 1.9879519939422607, + "learning_rate": 0.001, + "loss": 1.7536, + "step": 8376 + }, + { + "epoch": 0.3543870039766478, + "grad_norm": 0.8693563938140869, + "learning_rate": 0.001, + "loss": 2.0423, + "step": 8377 + }, + { + "epoch": 0.35442930874016415, + "grad_norm": 0.20398443937301636, + "learning_rate": 0.001, + "loss": 1.926, + "step": 8378 + }, + { + "epoch": 0.3544716135036805, + "grad_norm": 1.420688509941101, + "learning_rate": 0.001, + "loss": 2.1442, + "step": 8379 + }, + { + "epoch": 0.35451391826719686, + "grad_norm": 8.911663055419922, + "learning_rate": 0.001, + "loss": 1.6632, + "step": 8380 + }, + { + "epoch": 0.35455622303071327, + "grad_norm": 0.2737337052822113, + "learning_rate": 0.001, + "loss": 1.9327, + "step": 8381 + }, + { + "epoch": 0.3545985277942296, + "grad_norm": 0.20059816539287567, + "learning_rate": 0.001, + "loss": 2.094, + "step": 8382 + }, + { + "epoch": 0.354640832557746, + "grad_norm": 0.21529340744018555, + "learning_rate": 0.001, + "loss": 1.811, + "step": 8383 + }, + { + "epoch": 0.3546831373212624, + "grad_norm": 0.21227684617042542, + "learning_rate": 0.001, + "loss": 1.9585, + "step": 8384 + }, + { + "epoch": 0.35472544208477874, + "grad_norm": 0.18570013344287872, + "learning_rate": 0.001, + "loss": 2.5025, + "step": 8385 + }, + { + "epoch": 0.3547677468482951, + "grad_norm": 0.21961857378482819, + "learning_rate": 0.001, + "loss": 1.7425, + "step": 8386 + }, + { + "epoch": 0.3548100516118115, + "grad_norm": 8.055773735046387, + "learning_rate": 0.001, + "loss": 2.41, + "step": 8387 + }, + { + "epoch": 0.35485235637532786, + "grad_norm": 0.33439183235168457, + "learning_rate": 0.001, + "loss": 2.141, + "step": 8388 + }, + { + "epoch": 0.3548946611388442, + "grad_norm": 0.2350430190563202, + "learning_rate": 0.001, + "loss": 2.0489, + "step": 8389 + }, + { + "epoch": 0.3549369659023606, + "grad_norm": 0.2935107946395874, + "learning_rate": 0.001, + "loss": 1.439, + "step": 8390 + }, + { + "epoch": 0.354979270665877, + "grad_norm": 2.349884033203125, + "learning_rate": 0.001, + "loss": 2.6664, + "step": 8391 + }, + { + "epoch": 0.35502157542939333, + "grad_norm": 2.4287772178649902, + "learning_rate": 0.001, + "loss": 2.796, + "step": 8392 + }, + { + "epoch": 0.35506388019290974, + "grad_norm": 0.24385391175746918, + "learning_rate": 0.001, + "loss": 1.9699, + "step": 8393 + }, + { + "epoch": 0.3551061849564261, + "grad_norm": 0.21865463256835938, + "learning_rate": 0.001, + "loss": 1.4755, + "step": 8394 + }, + { + "epoch": 0.35514848971994245, + "grad_norm": 0.221740260720253, + "learning_rate": 0.001, + "loss": 2.6338, + "step": 8395 + }, + { + "epoch": 0.35519079448345886, + "grad_norm": 0.212009459733963, + "learning_rate": 0.001, + "loss": 1.8636, + "step": 8396 + }, + { + "epoch": 0.3552330992469752, + "grad_norm": 0.42118605971336365, + "learning_rate": 0.001, + "loss": 1.8155, + "step": 8397 + }, + { + "epoch": 0.35527540401049157, + "grad_norm": 0.19747601449489594, + "learning_rate": 0.001, + "loss": 2.4585, + "step": 8398 + }, + { + "epoch": 0.355317708774008, + "grad_norm": 0.21434566378593445, + "learning_rate": 0.001, + "loss": 3.2472, + "step": 8399 + }, + { + "epoch": 0.35536001353752433, + "grad_norm": 0.1856713443994522, + "learning_rate": 0.001, + "loss": 1.8753, + "step": 8400 + }, + { + "epoch": 0.3554023183010407, + "grad_norm": 0.215153768658638, + "learning_rate": 0.001, + "loss": 1.7379, + "step": 8401 + }, + { + "epoch": 0.35544462306455704, + "grad_norm": 0.16858762502670288, + "learning_rate": 0.001, + "loss": 1.8505, + "step": 8402 + }, + { + "epoch": 0.35548692782807345, + "grad_norm": 0.23868021368980408, + "learning_rate": 0.001, + "loss": 3.6215, + "step": 8403 + }, + { + "epoch": 0.3555292325915898, + "grad_norm": 0.21436399221420288, + "learning_rate": 0.001, + "loss": 1.5321, + "step": 8404 + }, + { + "epoch": 0.35557153735510616, + "grad_norm": 0.3087001442909241, + "learning_rate": 0.001, + "loss": 2.6408, + "step": 8405 + }, + { + "epoch": 0.35561384211862257, + "grad_norm": 0.20392103493213654, + "learning_rate": 0.001, + "loss": 3.0366, + "step": 8406 + }, + { + "epoch": 0.3556561468821389, + "grad_norm": 1.7128150463104248, + "learning_rate": 0.001, + "loss": 3.0189, + "step": 8407 + }, + { + "epoch": 0.3556984516456553, + "grad_norm": 0.2017560452222824, + "learning_rate": 0.001, + "loss": 2.2836, + "step": 8408 + }, + { + "epoch": 0.3557407564091717, + "grad_norm": 0.2031020075082779, + "learning_rate": 0.001, + "loss": 2.3527, + "step": 8409 + }, + { + "epoch": 0.35578306117268804, + "grad_norm": 0.19620314240455627, + "learning_rate": 0.001, + "loss": 2.9028, + "step": 8410 + }, + { + "epoch": 0.3558253659362044, + "grad_norm": 0.23667864501476288, + "learning_rate": 0.001, + "loss": 1.8299, + "step": 8411 + }, + { + "epoch": 0.3558676706997208, + "grad_norm": 0.18068136274814606, + "learning_rate": 0.001, + "loss": 2.8935, + "step": 8412 + }, + { + "epoch": 0.35590997546323716, + "grad_norm": 0.9692279696464539, + "learning_rate": 0.001, + "loss": 2.2835, + "step": 8413 + }, + { + "epoch": 0.3559522802267535, + "grad_norm": 0.1930302083492279, + "learning_rate": 0.001, + "loss": 1.8272, + "step": 8414 + }, + { + "epoch": 0.3559945849902699, + "grad_norm": 0.49082469940185547, + "learning_rate": 0.001, + "loss": 3.0818, + "step": 8415 + }, + { + "epoch": 0.3560368897537863, + "grad_norm": 0.17451566457748413, + "learning_rate": 0.001, + "loss": 1.9013, + "step": 8416 + }, + { + "epoch": 0.35607919451730263, + "grad_norm": 0.3995453715324402, + "learning_rate": 0.001, + "loss": 2.0403, + "step": 8417 + }, + { + "epoch": 0.35612149928081904, + "grad_norm": 0.18853165209293365, + "learning_rate": 0.001, + "loss": 1.9157, + "step": 8418 + }, + { + "epoch": 0.3561638040443354, + "grad_norm": 0.16779901087284088, + "learning_rate": 0.001, + "loss": 2.425, + "step": 8419 + }, + { + "epoch": 0.35620610880785175, + "grad_norm": 0.20070673525333405, + "learning_rate": 0.001, + "loss": 2.0243, + "step": 8420 + }, + { + "epoch": 0.35624841357136816, + "grad_norm": 0.26167789101600647, + "learning_rate": 0.001, + "loss": 2.3683, + "step": 8421 + }, + { + "epoch": 0.3562907183348845, + "grad_norm": 0.39837929606437683, + "learning_rate": 0.001, + "loss": 3.0087, + "step": 8422 + }, + { + "epoch": 0.35633302309840087, + "grad_norm": 0.1800961196422577, + "learning_rate": 0.001, + "loss": 2.4717, + "step": 8423 + }, + { + "epoch": 0.3563753278619173, + "grad_norm": 0.2057555615901947, + "learning_rate": 0.001, + "loss": 2.3249, + "step": 8424 + }, + { + "epoch": 0.35641763262543363, + "grad_norm": 0.17443668842315674, + "learning_rate": 0.001, + "loss": 1.6971, + "step": 8425 + }, + { + "epoch": 0.35645993738895, + "grad_norm": 0.21079231798648834, + "learning_rate": 0.001, + "loss": 2.3975, + "step": 8426 + }, + { + "epoch": 0.35650224215246634, + "grad_norm": 3.70257830619812, + "learning_rate": 0.001, + "loss": 2.5433, + "step": 8427 + }, + { + "epoch": 0.35654454691598275, + "grad_norm": 0.17566591501235962, + "learning_rate": 0.001, + "loss": 1.8831, + "step": 8428 + }, + { + "epoch": 0.3565868516794991, + "grad_norm": 0.4361303746700287, + "learning_rate": 0.001, + "loss": 2.4334, + "step": 8429 + }, + { + "epoch": 0.35662915644301546, + "grad_norm": 0.27633586525917053, + "learning_rate": 0.001, + "loss": 3.0307, + "step": 8430 + }, + { + "epoch": 0.35667146120653187, + "grad_norm": 0.21895642578601837, + "learning_rate": 0.001, + "loss": 2.015, + "step": 8431 + }, + { + "epoch": 0.3567137659700482, + "grad_norm": 0.15661796927452087, + "learning_rate": 0.001, + "loss": 1.8659, + "step": 8432 + }, + { + "epoch": 0.3567560707335646, + "grad_norm": 0.19141338765621185, + "learning_rate": 0.001, + "loss": 2.7277, + "step": 8433 + }, + { + "epoch": 0.356798375497081, + "grad_norm": 0.2092185765504837, + "learning_rate": 0.001, + "loss": 2.2695, + "step": 8434 + }, + { + "epoch": 0.35684068026059734, + "grad_norm": 0.19185788929462433, + "learning_rate": 0.001, + "loss": 2.2118, + "step": 8435 + }, + { + "epoch": 0.3568829850241137, + "grad_norm": 0.20594419538974762, + "learning_rate": 0.001, + "loss": 2.4456, + "step": 8436 + }, + { + "epoch": 0.3569252897876301, + "grad_norm": 0.843341588973999, + "learning_rate": 0.001, + "loss": 2.6461, + "step": 8437 + }, + { + "epoch": 0.35696759455114646, + "grad_norm": 0.18174798786640167, + "learning_rate": 0.001, + "loss": 2.0485, + "step": 8438 + }, + { + "epoch": 0.3570098993146628, + "grad_norm": 0.6228585839271545, + "learning_rate": 0.001, + "loss": 2.8919, + "step": 8439 + }, + { + "epoch": 0.3570522040781792, + "grad_norm": 0.21685943007469177, + "learning_rate": 0.001, + "loss": 2.1506, + "step": 8440 + }, + { + "epoch": 0.3570945088416956, + "grad_norm": 0.16078568994998932, + "learning_rate": 0.001, + "loss": 2.6238, + "step": 8441 + }, + { + "epoch": 0.35713681360521193, + "grad_norm": 0.2162536233663559, + "learning_rate": 0.001, + "loss": 2.6207, + "step": 8442 + }, + { + "epoch": 0.35717911836872834, + "grad_norm": 0.20838353037834167, + "learning_rate": 0.001, + "loss": 2.4904, + "step": 8443 + }, + { + "epoch": 0.3572214231322447, + "grad_norm": 0.20165249705314636, + "learning_rate": 0.001, + "loss": 3.4246, + "step": 8444 + }, + { + "epoch": 0.35726372789576105, + "grad_norm": 0.15616677701473236, + "learning_rate": 0.001, + "loss": 2.2008, + "step": 8445 + }, + { + "epoch": 0.35730603265927746, + "grad_norm": 0.48721373081207275, + "learning_rate": 0.001, + "loss": 1.8402, + "step": 8446 + }, + { + "epoch": 0.3573483374227938, + "grad_norm": 0.23544834554195404, + "learning_rate": 0.001, + "loss": 2.1721, + "step": 8447 + }, + { + "epoch": 0.35739064218631017, + "grad_norm": 27.898847579956055, + "learning_rate": 0.001, + "loss": 2.5454, + "step": 8448 + }, + { + "epoch": 0.3574329469498265, + "grad_norm": 0.23525142669677734, + "learning_rate": 0.001, + "loss": 3.1336, + "step": 8449 + }, + { + "epoch": 0.35747525171334293, + "grad_norm": 0.16481630504131317, + "learning_rate": 0.001, + "loss": 1.871, + "step": 8450 + }, + { + "epoch": 0.3575175564768593, + "grad_norm": 0.20045652985572815, + "learning_rate": 0.001, + "loss": 2.5494, + "step": 8451 + }, + { + "epoch": 0.35755986124037564, + "grad_norm": 0.4051211476325989, + "learning_rate": 0.001, + "loss": 2.1058, + "step": 8452 + }, + { + "epoch": 0.35760216600389205, + "grad_norm": 1.801464557647705, + "learning_rate": 0.001, + "loss": 2.0417, + "step": 8453 + }, + { + "epoch": 0.3576444707674084, + "grad_norm": 0.22075815498828888, + "learning_rate": 0.001, + "loss": 2.2037, + "step": 8454 + }, + { + "epoch": 0.35768677553092476, + "grad_norm": 0.20504413545131683, + "learning_rate": 0.001, + "loss": 2.048, + "step": 8455 + }, + { + "epoch": 0.35772908029444117, + "grad_norm": 0.20746447145938873, + "learning_rate": 0.001, + "loss": 2.2803, + "step": 8456 + }, + { + "epoch": 0.3577713850579575, + "grad_norm": 0.47997429966926575, + "learning_rate": 0.001, + "loss": 2.247, + "step": 8457 + }, + { + "epoch": 0.3578136898214739, + "grad_norm": 0.24447347223758698, + "learning_rate": 0.001, + "loss": 1.9872, + "step": 8458 + }, + { + "epoch": 0.3578559945849903, + "grad_norm": 0.2988087832927704, + "learning_rate": 0.001, + "loss": 3.0362, + "step": 8459 + }, + { + "epoch": 0.35789829934850664, + "grad_norm": 1.136828064918518, + "learning_rate": 0.001, + "loss": 2.3112, + "step": 8460 + }, + { + "epoch": 0.357940604112023, + "grad_norm": 0.4372611343860626, + "learning_rate": 0.001, + "loss": 1.7607, + "step": 8461 + }, + { + "epoch": 0.3579829088755394, + "grad_norm": 0.18738792836666107, + "learning_rate": 0.001, + "loss": 1.9763, + "step": 8462 + }, + { + "epoch": 0.35802521363905576, + "grad_norm": 0.4911549985408783, + "learning_rate": 0.001, + "loss": 3.0154, + "step": 8463 + }, + { + "epoch": 0.3580675184025721, + "grad_norm": 0.21335723996162415, + "learning_rate": 0.001, + "loss": 3.5891, + "step": 8464 + }, + { + "epoch": 0.3581098231660885, + "grad_norm": 0.9935727715492249, + "learning_rate": 0.001, + "loss": 2.0077, + "step": 8465 + }, + { + "epoch": 0.3581521279296049, + "grad_norm": 0.16692140698432922, + "learning_rate": 0.001, + "loss": 1.725, + "step": 8466 + }, + { + "epoch": 0.35819443269312123, + "grad_norm": 3.184319496154785, + "learning_rate": 0.001, + "loss": 1.9069, + "step": 8467 + }, + { + "epoch": 0.35823673745663764, + "grad_norm": 0.2190055400133133, + "learning_rate": 0.001, + "loss": 2.3108, + "step": 8468 + }, + { + "epoch": 0.358279042220154, + "grad_norm": 2.8575377464294434, + "learning_rate": 0.001, + "loss": 1.8183, + "step": 8469 + }, + { + "epoch": 0.35832134698367035, + "grad_norm": 0.27034443616867065, + "learning_rate": 0.001, + "loss": 1.9055, + "step": 8470 + }, + { + "epoch": 0.3583636517471867, + "grad_norm": 0.2202761024236679, + "learning_rate": 0.001, + "loss": 2.4092, + "step": 8471 + }, + { + "epoch": 0.3584059565107031, + "grad_norm": 1.0408316850662231, + "learning_rate": 0.001, + "loss": 2.3927, + "step": 8472 + }, + { + "epoch": 0.35844826127421947, + "grad_norm": 0.26200470328330994, + "learning_rate": 0.001, + "loss": 2.256, + "step": 8473 + }, + { + "epoch": 0.3584905660377358, + "grad_norm": 0.23284506797790527, + "learning_rate": 0.001, + "loss": 2.4034, + "step": 8474 + }, + { + "epoch": 0.35853287080125223, + "grad_norm": 0.20646698772907257, + "learning_rate": 0.001, + "loss": 1.829, + "step": 8475 + }, + { + "epoch": 0.3585751755647686, + "grad_norm": 0.25378456711769104, + "learning_rate": 0.001, + "loss": 2.9985, + "step": 8476 + }, + { + "epoch": 0.35861748032828494, + "grad_norm": 0.31622862815856934, + "learning_rate": 0.001, + "loss": 2.0001, + "step": 8477 + }, + { + "epoch": 0.35865978509180135, + "grad_norm": 0.21043705940246582, + "learning_rate": 0.001, + "loss": 2.5147, + "step": 8478 + }, + { + "epoch": 0.3587020898553177, + "grad_norm": 0.3560662865638733, + "learning_rate": 0.001, + "loss": 2.2411, + "step": 8479 + }, + { + "epoch": 0.35874439461883406, + "grad_norm": 0.4322994649410248, + "learning_rate": 0.001, + "loss": 3.4217, + "step": 8480 + }, + { + "epoch": 0.35878669938235047, + "grad_norm": 0.7399798631668091, + "learning_rate": 0.001, + "loss": 2.408, + "step": 8481 + }, + { + "epoch": 0.3588290041458668, + "grad_norm": 0.22962254285812378, + "learning_rate": 0.001, + "loss": 2.7897, + "step": 8482 + }, + { + "epoch": 0.3588713089093832, + "grad_norm": 0.20676597952842712, + "learning_rate": 0.001, + "loss": 2.0482, + "step": 8483 + }, + { + "epoch": 0.3589136136728996, + "grad_norm": 0.1842261403799057, + "learning_rate": 0.001, + "loss": 2.6388, + "step": 8484 + }, + { + "epoch": 0.35895591843641594, + "grad_norm": 0.28361448645591736, + "learning_rate": 0.001, + "loss": 2.6798, + "step": 8485 + }, + { + "epoch": 0.3589982231999323, + "grad_norm": 0.910611093044281, + "learning_rate": 0.001, + "loss": 2.1683, + "step": 8486 + }, + { + "epoch": 0.3590405279634487, + "grad_norm": 0.7657113671302795, + "learning_rate": 0.001, + "loss": 2.3814, + "step": 8487 + }, + { + "epoch": 0.35908283272696506, + "grad_norm": 0.29642629623413086, + "learning_rate": 0.001, + "loss": 1.888, + "step": 8488 + }, + { + "epoch": 0.3591251374904814, + "grad_norm": 0.1811065673828125, + "learning_rate": 0.001, + "loss": 3.645, + "step": 8489 + }, + { + "epoch": 0.3591674422539978, + "grad_norm": 0.2280883491039276, + "learning_rate": 0.001, + "loss": 2.1726, + "step": 8490 + }, + { + "epoch": 0.3592097470175142, + "grad_norm": 0.19528071582317352, + "learning_rate": 0.001, + "loss": 2.1867, + "step": 8491 + }, + { + "epoch": 0.35925205178103053, + "grad_norm": 0.24748535454273224, + "learning_rate": 0.001, + "loss": 2.0032, + "step": 8492 + }, + { + "epoch": 0.3592943565445469, + "grad_norm": 0.21031174063682556, + "learning_rate": 0.001, + "loss": 2.0499, + "step": 8493 + }, + { + "epoch": 0.3593366613080633, + "grad_norm": 0.22891457378864288, + "learning_rate": 0.001, + "loss": 3.2536, + "step": 8494 + }, + { + "epoch": 0.35937896607157965, + "grad_norm": 0.18080003559589386, + "learning_rate": 0.001, + "loss": 2.1618, + "step": 8495 + }, + { + "epoch": 0.359421270835096, + "grad_norm": 0.3403954803943634, + "learning_rate": 0.001, + "loss": 1.625, + "step": 8496 + }, + { + "epoch": 0.3594635755986124, + "grad_norm": 0.1686491221189499, + "learning_rate": 0.001, + "loss": 3.029, + "step": 8497 + }, + { + "epoch": 0.35950588036212877, + "grad_norm": 0.3256143629550934, + "learning_rate": 0.001, + "loss": 2.1193, + "step": 8498 + }, + { + "epoch": 0.3595481851256451, + "grad_norm": 7.151230335235596, + "learning_rate": 0.001, + "loss": 3.4917, + "step": 8499 + }, + { + "epoch": 0.35959048988916154, + "grad_norm": 0.2594936788082123, + "learning_rate": 0.001, + "loss": 2.7958, + "step": 8500 + }, + { + "epoch": 0.3596327946526779, + "grad_norm": 1.2051527500152588, + "learning_rate": 0.001, + "loss": 2.0397, + "step": 8501 + }, + { + "epoch": 0.35967509941619424, + "grad_norm": 4.79257345199585, + "learning_rate": 0.001, + "loss": 2.5444, + "step": 8502 + }, + { + "epoch": 0.35971740417971065, + "grad_norm": 0.1828813999891281, + "learning_rate": 0.001, + "loss": 2.5241, + "step": 8503 + }, + { + "epoch": 0.359759708943227, + "grad_norm": 0.22619538009166718, + "learning_rate": 0.001, + "loss": 2.5545, + "step": 8504 + }, + { + "epoch": 0.35980201370674336, + "grad_norm": 0.16107860207557678, + "learning_rate": 0.001, + "loss": 2.2485, + "step": 8505 + }, + { + "epoch": 0.35984431847025977, + "grad_norm": 0.22678741812705994, + "learning_rate": 0.001, + "loss": 2.5525, + "step": 8506 + }, + { + "epoch": 0.3598866232337761, + "grad_norm": 0.20380182564258575, + "learning_rate": 0.001, + "loss": 2.495, + "step": 8507 + }, + { + "epoch": 0.3599289279972925, + "grad_norm": 0.32149720191955566, + "learning_rate": 0.001, + "loss": 2.2425, + "step": 8508 + }, + { + "epoch": 0.3599712327608089, + "grad_norm": 0.17898282408714294, + "learning_rate": 0.001, + "loss": 2.3266, + "step": 8509 + }, + { + "epoch": 0.36001353752432524, + "grad_norm": 0.742031455039978, + "learning_rate": 0.001, + "loss": 2.1324, + "step": 8510 + }, + { + "epoch": 0.3600558422878416, + "grad_norm": 1.0595115423202515, + "learning_rate": 0.001, + "loss": 2.9696, + "step": 8511 + }, + { + "epoch": 0.360098147051358, + "grad_norm": 0.22999782860279083, + "learning_rate": 0.001, + "loss": 3.4851, + "step": 8512 + }, + { + "epoch": 0.36014045181487436, + "grad_norm": 0.2157551646232605, + "learning_rate": 0.001, + "loss": 2.5969, + "step": 8513 + }, + { + "epoch": 0.3601827565783907, + "grad_norm": 0.8261212706565857, + "learning_rate": 0.001, + "loss": 2.3355, + "step": 8514 + }, + { + "epoch": 0.36022506134190707, + "grad_norm": 0.40030571818351746, + "learning_rate": 0.001, + "loss": 2.4083, + "step": 8515 + }, + { + "epoch": 0.3602673661054235, + "grad_norm": 0.23546592891216278, + "learning_rate": 0.001, + "loss": 2.1861, + "step": 8516 + }, + { + "epoch": 0.36030967086893984, + "grad_norm": 0.20725642144680023, + "learning_rate": 0.001, + "loss": 2.3907, + "step": 8517 + }, + { + "epoch": 0.3603519756324562, + "grad_norm": 0.28327709436416626, + "learning_rate": 0.001, + "loss": 3.0746, + "step": 8518 + }, + { + "epoch": 0.3603942803959726, + "grad_norm": 0.4538307189941406, + "learning_rate": 0.001, + "loss": 2.6733, + "step": 8519 + }, + { + "epoch": 0.36043658515948895, + "grad_norm": 0.20344911515712738, + "learning_rate": 0.001, + "loss": 2.7097, + "step": 8520 + }, + { + "epoch": 0.3604788899230053, + "grad_norm": 0.2238440215587616, + "learning_rate": 0.001, + "loss": 2.2248, + "step": 8521 + }, + { + "epoch": 0.3605211946865217, + "grad_norm": 0.2143414467573166, + "learning_rate": 0.001, + "loss": 2.6659, + "step": 8522 + }, + { + "epoch": 0.36056349945003807, + "grad_norm": 0.21883174777030945, + "learning_rate": 0.001, + "loss": 2.8158, + "step": 8523 + }, + { + "epoch": 0.3606058042135544, + "grad_norm": 1.42238450050354, + "learning_rate": 0.001, + "loss": 2.9731, + "step": 8524 + }, + { + "epoch": 0.36064810897707084, + "grad_norm": 4.848227500915527, + "learning_rate": 0.001, + "loss": 2.352, + "step": 8525 + }, + { + "epoch": 0.3606904137405872, + "grad_norm": 0.7891883850097656, + "learning_rate": 0.001, + "loss": 2.0768, + "step": 8526 + }, + { + "epoch": 0.36073271850410354, + "grad_norm": 84.44062042236328, + "learning_rate": 0.001, + "loss": 3.0048, + "step": 8527 + }, + { + "epoch": 0.36077502326761995, + "grad_norm": 0.2623269855976105, + "learning_rate": 0.001, + "loss": 2.887, + "step": 8528 + }, + { + "epoch": 0.3608173280311363, + "grad_norm": 0.2041042447090149, + "learning_rate": 0.001, + "loss": 1.7496, + "step": 8529 + }, + { + "epoch": 0.36085963279465266, + "grad_norm": 0.25694215297698975, + "learning_rate": 0.001, + "loss": 2.0294, + "step": 8530 + }, + { + "epoch": 0.3609019375581691, + "grad_norm": 0.4767371416091919, + "learning_rate": 0.001, + "loss": 1.9393, + "step": 8531 + }, + { + "epoch": 0.3609442423216854, + "grad_norm": 0.5682966113090515, + "learning_rate": 0.001, + "loss": 2.401, + "step": 8532 + }, + { + "epoch": 0.3609865470852018, + "grad_norm": 0.754736065864563, + "learning_rate": 0.001, + "loss": 1.53, + "step": 8533 + }, + { + "epoch": 0.3610288518487182, + "grad_norm": 0.19843953847885132, + "learning_rate": 0.001, + "loss": 2.3422, + "step": 8534 + }, + { + "epoch": 0.36107115661223455, + "grad_norm": 0.6914002299308777, + "learning_rate": 0.001, + "loss": 2.5332, + "step": 8535 + }, + { + "epoch": 0.3611134613757509, + "grad_norm": 0.9868711829185486, + "learning_rate": 0.001, + "loss": 3.0186, + "step": 8536 + }, + { + "epoch": 0.3611557661392673, + "grad_norm": 0.9456235766410828, + "learning_rate": 0.001, + "loss": 1.7753, + "step": 8537 + }, + { + "epoch": 0.36119807090278366, + "grad_norm": 0.3088882565498352, + "learning_rate": 0.001, + "loss": 2.0363, + "step": 8538 + }, + { + "epoch": 0.3612403756663, + "grad_norm": 0.8840659856796265, + "learning_rate": 0.001, + "loss": 3.2643, + "step": 8539 + }, + { + "epoch": 0.36128268042981637, + "grad_norm": 1.7744220495224, + "learning_rate": 0.001, + "loss": 1.981, + "step": 8540 + }, + { + "epoch": 0.3613249851933328, + "grad_norm": 0.8132971525192261, + "learning_rate": 0.001, + "loss": 3.234, + "step": 8541 + }, + { + "epoch": 0.36136728995684914, + "grad_norm": 0.26763689517974854, + "learning_rate": 0.001, + "loss": 2.3408, + "step": 8542 + }, + { + "epoch": 0.3614095947203655, + "grad_norm": 0.18884502351284027, + "learning_rate": 0.001, + "loss": 3.1137, + "step": 8543 + }, + { + "epoch": 0.3614518994838819, + "grad_norm": 4.647204399108887, + "learning_rate": 0.001, + "loss": 2.6148, + "step": 8544 + }, + { + "epoch": 0.36149420424739825, + "grad_norm": 3.6507856845855713, + "learning_rate": 0.001, + "loss": 3.341, + "step": 8545 + }, + { + "epoch": 0.3615365090109146, + "grad_norm": 0.21972277760505676, + "learning_rate": 0.001, + "loss": 1.8723, + "step": 8546 + }, + { + "epoch": 0.361578813774431, + "grad_norm": 0.3465214967727661, + "learning_rate": 0.001, + "loss": 2.1504, + "step": 8547 + }, + { + "epoch": 0.3616211185379474, + "grad_norm": 0.40754181146621704, + "learning_rate": 0.001, + "loss": 2.694, + "step": 8548 + }, + { + "epoch": 0.3616634233014637, + "grad_norm": 0.20011767745018005, + "learning_rate": 0.001, + "loss": 2.4816, + "step": 8549 + }, + { + "epoch": 0.36170572806498014, + "grad_norm": 0.1964236944913864, + "learning_rate": 0.001, + "loss": 1.6528, + "step": 8550 + }, + { + "epoch": 0.3617480328284965, + "grad_norm": 0.17661263048648834, + "learning_rate": 0.001, + "loss": 1.9885, + "step": 8551 + }, + { + "epoch": 0.36179033759201285, + "grad_norm": 0.7868698835372925, + "learning_rate": 0.001, + "loss": 2.6136, + "step": 8552 + }, + { + "epoch": 0.36183264235552925, + "grad_norm": 0.23645655810832977, + "learning_rate": 0.001, + "loss": 2.408, + "step": 8553 + }, + { + "epoch": 0.3618749471190456, + "grad_norm": 0.18343326449394226, + "learning_rate": 0.001, + "loss": 1.5529, + "step": 8554 + }, + { + "epoch": 0.36191725188256196, + "grad_norm": 0.18939687311649323, + "learning_rate": 0.001, + "loss": 2.5676, + "step": 8555 + }, + { + "epoch": 0.3619595566460784, + "grad_norm": 0.7980564832687378, + "learning_rate": 0.001, + "loss": 2.4449, + "step": 8556 + }, + { + "epoch": 0.3620018614095947, + "grad_norm": 0.8566358089447021, + "learning_rate": 0.001, + "loss": 3.1181, + "step": 8557 + }, + { + "epoch": 0.3620441661731111, + "grad_norm": 0.24437959492206573, + "learning_rate": 0.001, + "loss": 2.1782, + "step": 8558 + }, + { + "epoch": 0.3620864709366275, + "grad_norm": 0.22951102256774902, + "learning_rate": 0.001, + "loss": 2.0025, + "step": 8559 + }, + { + "epoch": 0.36212877570014385, + "grad_norm": 0.5679842233657837, + "learning_rate": 0.001, + "loss": 2.8188, + "step": 8560 + }, + { + "epoch": 0.3621710804636602, + "grad_norm": 0.2738345265388489, + "learning_rate": 0.001, + "loss": 2.9867, + "step": 8561 + }, + { + "epoch": 0.36221338522717655, + "grad_norm": 0.6427478790283203, + "learning_rate": 0.001, + "loss": 2.0955, + "step": 8562 + }, + { + "epoch": 0.36225568999069296, + "grad_norm": 0.17962263524532318, + "learning_rate": 0.001, + "loss": 1.8587, + "step": 8563 + }, + { + "epoch": 0.3622979947542093, + "grad_norm": 1.4514254331588745, + "learning_rate": 0.001, + "loss": 1.7745, + "step": 8564 + }, + { + "epoch": 0.3623402995177257, + "grad_norm": 0.2207036018371582, + "learning_rate": 0.001, + "loss": 1.6446, + "step": 8565 + }, + { + "epoch": 0.3623826042812421, + "grad_norm": 0.3202604055404663, + "learning_rate": 0.001, + "loss": 2.5328, + "step": 8566 + }, + { + "epoch": 0.36242490904475844, + "grad_norm": 0.6351190209388733, + "learning_rate": 0.001, + "loss": 2.7094, + "step": 8567 + }, + { + "epoch": 0.3624672138082748, + "grad_norm": 0.21355757117271423, + "learning_rate": 0.001, + "loss": 2.6323, + "step": 8568 + }, + { + "epoch": 0.3625095185717912, + "grad_norm": 0.20924630761146545, + "learning_rate": 0.001, + "loss": 1.6898, + "step": 8569 + }, + { + "epoch": 0.36255182333530755, + "grad_norm": 0.4818645417690277, + "learning_rate": 0.001, + "loss": 1.6882, + "step": 8570 + }, + { + "epoch": 0.3625941280988239, + "grad_norm": 0.2274334728717804, + "learning_rate": 0.001, + "loss": 3.8046, + "step": 8571 + }, + { + "epoch": 0.3626364328623403, + "grad_norm": 4.399014949798584, + "learning_rate": 0.001, + "loss": 2.6037, + "step": 8572 + }, + { + "epoch": 0.3626787376258567, + "grad_norm": 0.20539069175720215, + "learning_rate": 0.001, + "loss": 1.8258, + "step": 8573 + }, + { + "epoch": 0.362721042389373, + "grad_norm": 0.3310551345348358, + "learning_rate": 0.001, + "loss": 3.1566, + "step": 8574 + }, + { + "epoch": 0.36276334715288944, + "grad_norm": 0.2262580245733261, + "learning_rate": 0.001, + "loss": 2.6769, + "step": 8575 + }, + { + "epoch": 0.3628056519164058, + "grad_norm": 0.20004798471927643, + "learning_rate": 0.001, + "loss": 1.7463, + "step": 8576 + }, + { + "epoch": 0.36284795667992215, + "grad_norm": 0.17591601610183716, + "learning_rate": 0.001, + "loss": 1.5559, + "step": 8577 + }, + { + "epoch": 0.36289026144343856, + "grad_norm": 0.17528226971626282, + "learning_rate": 0.001, + "loss": 1.9886, + "step": 8578 + }, + { + "epoch": 0.3629325662069549, + "grad_norm": 0.3700055181980133, + "learning_rate": 0.001, + "loss": 2.4776, + "step": 8579 + }, + { + "epoch": 0.36297487097047126, + "grad_norm": 0.17485986649990082, + "learning_rate": 0.001, + "loss": 2.0908, + "step": 8580 + }, + { + "epoch": 0.3630171757339877, + "grad_norm": 9.299296379089355, + "learning_rate": 0.001, + "loss": 2.3967, + "step": 8581 + }, + { + "epoch": 0.36305948049750403, + "grad_norm": 0.20761297643184662, + "learning_rate": 0.001, + "loss": 2.0552, + "step": 8582 + }, + { + "epoch": 0.3631017852610204, + "grad_norm": 0.19602414965629578, + "learning_rate": 0.001, + "loss": 2.1057, + "step": 8583 + }, + { + "epoch": 0.36314409002453674, + "grad_norm": 1.7627781629562378, + "learning_rate": 0.001, + "loss": 2.5504, + "step": 8584 + }, + { + "epoch": 0.36318639478805315, + "grad_norm": 1.635079026222229, + "learning_rate": 0.001, + "loss": 1.8716, + "step": 8585 + }, + { + "epoch": 0.3632286995515695, + "grad_norm": 2.4154467582702637, + "learning_rate": 0.001, + "loss": 2.5212, + "step": 8586 + }, + { + "epoch": 0.36327100431508585, + "grad_norm": 0.1748884916305542, + "learning_rate": 0.001, + "loss": 2.1198, + "step": 8587 + }, + { + "epoch": 0.36331330907860226, + "grad_norm": 1.158073902130127, + "learning_rate": 0.001, + "loss": 2.5192, + "step": 8588 + }, + { + "epoch": 0.3633556138421186, + "grad_norm": 0.24196279048919678, + "learning_rate": 0.001, + "loss": 2.1299, + "step": 8589 + }, + { + "epoch": 0.363397918605635, + "grad_norm": 0.21735741198062897, + "learning_rate": 0.001, + "loss": 2.2981, + "step": 8590 + }, + { + "epoch": 0.3634402233691514, + "grad_norm": 0.22882144153118134, + "learning_rate": 0.001, + "loss": 2.1787, + "step": 8591 + }, + { + "epoch": 0.36348252813266774, + "grad_norm": 0.22966794669628143, + "learning_rate": 0.001, + "loss": 2.0843, + "step": 8592 + }, + { + "epoch": 0.3635248328961841, + "grad_norm": 3.5307583808898926, + "learning_rate": 0.001, + "loss": 2.6233, + "step": 8593 + }, + { + "epoch": 0.3635671376597005, + "grad_norm": 0.1808539777994156, + "learning_rate": 0.001, + "loss": 2.447, + "step": 8594 + }, + { + "epoch": 0.36360944242321686, + "grad_norm": 0.191551074385643, + "learning_rate": 0.001, + "loss": 1.9306, + "step": 8595 + }, + { + "epoch": 0.3636517471867332, + "grad_norm": 0.2742213308811188, + "learning_rate": 0.001, + "loss": 2.5281, + "step": 8596 + }, + { + "epoch": 0.3636940519502496, + "grad_norm": 3.3009042739868164, + "learning_rate": 0.001, + "loss": 3.4417, + "step": 8597 + }, + { + "epoch": 0.363736356713766, + "grad_norm": 0.22634568810462952, + "learning_rate": 0.001, + "loss": 1.9962, + "step": 8598 + }, + { + "epoch": 0.36377866147728233, + "grad_norm": 0.3063672184944153, + "learning_rate": 0.001, + "loss": 2.1643, + "step": 8599 + }, + { + "epoch": 0.36382096624079874, + "grad_norm": 0.4341367781162262, + "learning_rate": 0.001, + "loss": 2.8908, + "step": 8600 + }, + { + "epoch": 0.3638632710043151, + "grad_norm": 0.2267148345708847, + "learning_rate": 0.001, + "loss": 3.3518, + "step": 8601 + }, + { + "epoch": 0.36390557576783145, + "grad_norm": 0.235674649477005, + "learning_rate": 0.001, + "loss": 2.0174, + "step": 8602 + }, + { + "epoch": 0.36394788053134786, + "grad_norm": 0.1691252589225769, + "learning_rate": 0.001, + "loss": 1.7468, + "step": 8603 + }, + { + "epoch": 0.3639901852948642, + "grad_norm": 0.4646351933479309, + "learning_rate": 0.001, + "loss": 2.9241, + "step": 8604 + }, + { + "epoch": 0.36403249005838056, + "grad_norm": 0.18952451646327972, + "learning_rate": 0.001, + "loss": 1.9208, + "step": 8605 + }, + { + "epoch": 0.3640747948218969, + "grad_norm": 1.2215399742126465, + "learning_rate": 0.001, + "loss": 2.0738, + "step": 8606 + }, + { + "epoch": 0.36411709958541333, + "grad_norm": 0.1823042780160904, + "learning_rate": 0.001, + "loss": 2.0435, + "step": 8607 + }, + { + "epoch": 0.3641594043489297, + "grad_norm": 0.18429160118103027, + "learning_rate": 0.001, + "loss": 2.1176, + "step": 8608 + }, + { + "epoch": 0.36420170911244604, + "grad_norm": 0.30278459191322327, + "learning_rate": 0.001, + "loss": 2.0551, + "step": 8609 + }, + { + "epoch": 0.36424401387596245, + "grad_norm": 0.27430135011672974, + "learning_rate": 0.001, + "loss": 2.7956, + "step": 8610 + }, + { + "epoch": 0.3642863186394788, + "grad_norm": 0.7540306448936462, + "learning_rate": 0.001, + "loss": 2.312, + "step": 8611 + }, + { + "epoch": 0.36432862340299516, + "grad_norm": 0.24535398185253143, + "learning_rate": 0.001, + "loss": 2.2725, + "step": 8612 + }, + { + "epoch": 0.36437092816651157, + "grad_norm": 0.39230287075042725, + "learning_rate": 0.001, + "loss": 2.2182, + "step": 8613 + }, + { + "epoch": 0.3644132329300279, + "grad_norm": 0.19538423418998718, + "learning_rate": 0.001, + "loss": 2.1788, + "step": 8614 + }, + { + "epoch": 0.3644555376935443, + "grad_norm": 0.40475207567214966, + "learning_rate": 0.001, + "loss": 2.1257, + "step": 8615 + }, + { + "epoch": 0.3644978424570607, + "grad_norm": 0.27652233839035034, + "learning_rate": 0.001, + "loss": 3.0038, + "step": 8616 + }, + { + "epoch": 0.36454014722057704, + "grad_norm": 0.19666795432567596, + "learning_rate": 0.001, + "loss": 1.4796, + "step": 8617 + }, + { + "epoch": 0.3645824519840934, + "grad_norm": 0.8510257601737976, + "learning_rate": 0.001, + "loss": 2.5566, + "step": 8618 + }, + { + "epoch": 0.3646247567476098, + "grad_norm": 0.2172936201095581, + "learning_rate": 0.001, + "loss": 2.1356, + "step": 8619 + }, + { + "epoch": 0.36466706151112616, + "grad_norm": 0.20282861590385437, + "learning_rate": 0.001, + "loss": 1.5411, + "step": 8620 + }, + { + "epoch": 0.3647093662746425, + "grad_norm": 0.1585301011800766, + "learning_rate": 0.001, + "loss": 1.3001, + "step": 8621 + }, + { + "epoch": 0.3647516710381589, + "grad_norm": 0.20633943378925323, + "learning_rate": 0.001, + "loss": 1.869, + "step": 8622 + }, + { + "epoch": 0.3647939758016753, + "grad_norm": 0.24952706694602966, + "learning_rate": 0.001, + "loss": 2.2658, + "step": 8623 + }, + { + "epoch": 0.36483628056519163, + "grad_norm": 0.24051597714424133, + "learning_rate": 0.001, + "loss": 2.0523, + "step": 8624 + }, + { + "epoch": 0.36487858532870804, + "grad_norm": 1.4806382656097412, + "learning_rate": 0.001, + "loss": 3.0184, + "step": 8625 + }, + { + "epoch": 0.3649208900922244, + "grad_norm": 0.2159508466720581, + "learning_rate": 0.001, + "loss": 1.995, + "step": 8626 + }, + { + "epoch": 0.36496319485574075, + "grad_norm": 0.5792630314826965, + "learning_rate": 0.001, + "loss": 3.4955, + "step": 8627 + }, + { + "epoch": 0.3650054996192571, + "grad_norm": 0.18533584475517273, + "learning_rate": 0.001, + "loss": 1.815, + "step": 8628 + }, + { + "epoch": 0.3650478043827735, + "grad_norm": 0.2772461175918579, + "learning_rate": 0.001, + "loss": 2.1131, + "step": 8629 + }, + { + "epoch": 0.36509010914628987, + "grad_norm": 0.27100545167922974, + "learning_rate": 0.001, + "loss": 2.2957, + "step": 8630 + }, + { + "epoch": 0.3651324139098062, + "grad_norm": 0.2463526576757431, + "learning_rate": 0.001, + "loss": 3.0297, + "step": 8631 + }, + { + "epoch": 0.36517471867332263, + "grad_norm": 1.035469889640808, + "learning_rate": 0.001, + "loss": 1.6165, + "step": 8632 + }, + { + "epoch": 0.365217023436839, + "grad_norm": 0.23822209239006042, + "learning_rate": 0.001, + "loss": 2.0529, + "step": 8633 + }, + { + "epoch": 0.36525932820035534, + "grad_norm": 0.20233659446239471, + "learning_rate": 0.001, + "loss": 1.6715, + "step": 8634 + }, + { + "epoch": 0.36530163296387175, + "grad_norm": 0.48247143626213074, + "learning_rate": 0.001, + "loss": 2.6857, + "step": 8635 + }, + { + "epoch": 0.3653439377273881, + "grad_norm": 0.19231975078582764, + "learning_rate": 0.001, + "loss": 1.9427, + "step": 8636 + }, + { + "epoch": 0.36538624249090446, + "grad_norm": 0.7569523453712463, + "learning_rate": 0.001, + "loss": 2.3201, + "step": 8637 + }, + { + "epoch": 0.36542854725442087, + "grad_norm": 0.27033358812332153, + "learning_rate": 0.001, + "loss": 2.4324, + "step": 8638 + }, + { + "epoch": 0.3654708520179372, + "grad_norm": 0.18519775569438934, + "learning_rate": 0.001, + "loss": 1.9692, + "step": 8639 + }, + { + "epoch": 0.3655131567814536, + "grad_norm": 0.25175315141677856, + "learning_rate": 0.001, + "loss": 3.5475, + "step": 8640 + }, + { + "epoch": 0.36555546154497, + "grad_norm": 0.18655212223529816, + "learning_rate": 0.001, + "loss": 1.7191, + "step": 8641 + }, + { + "epoch": 0.36559776630848634, + "grad_norm": 0.21611282229423523, + "learning_rate": 0.001, + "loss": 2.2513, + "step": 8642 + }, + { + "epoch": 0.3656400710720027, + "grad_norm": 0.21771200001239777, + "learning_rate": 0.001, + "loss": 1.6885, + "step": 8643 + }, + { + "epoch": 0.3656823758355191, + "grad_norm": 0.19719097018241882, + "learning_rate": 0.001, + "loss": 2.629, + "step": 8644 + }, + { + "epoch": 0.36572468059903546, + "grad_norm": 11.389663696289062, + "learning_rate": 0.001, + "loss": 2.8977, + "step": 8645 + }, + { + "epoch": 0.3657669853625518, + "grad_norm": 0.15473631024360657, + "learning_rate": 0.001, + "loss": 1.5634, + "step": 8646 + }, + { + "epoch": 0.3658092901260682, + "grad_norm": 0.4173217713832855, + "learning_rate": 0.001, + "loss": 2.0252, + "step": 8647 + }, + { + "epoch": 0.3658515948895846, + "grad_norm": 1.9161365032196045, + "learning_rate": 0.001, + "loss": 3.082, + "step": 8648 + }, + { + "epoch": 0.36589389965310093, + "grad_norm": 0.1819329559803009, + "learning_rate": 0.001, + "loss": 2.5534, + "step": 8649 + }, + { + "epoch": 0.3659362044166173, + "grad_norm": 0.2493087202310562, + "learning_rate": 0.001, + "loss": 2.2305, + "step": 8650 + }, + { + "epoch": 0.3659785091801337, + "grad_norm": 0.20480546355247498, + "learning_rate": 0.001, + "loss": 2.044, + "step": 8651 + }, + { + "epoch": 0.36602081394365005, + "grad_norm": 0.22163717448711395, + "learning_rate": 0.001, + "loss": 1.9366, + "step": 8652 + }, + { + "epoch": 0.3660631187071664, + "grad_norm": 0.187017560005188, + "learning_rate": 0.001, + "loss": 1.992, + "step": 8653 + }, + { + "epoch": 0.3661054234706828, + "grad_norm": 0.21020790934562683, + "learning_rate": 0.001, + "loss": 2.3122, + "step": 8654 + }, + { + "epoch": 0.36614772823419917, + "grad_norm": 0.19258715212345123, + "learning_rate": 0.001, + "loss": 3.0791, + "step": 8655 + }, + { + "epoch": 0.3661900329977155, + "grad_norm": 0.18852315843105316, + "learning_rate": 0.001, + "loss": 2.7449, + "step": 8656 + }, + { + "epoch": 0.36623233776123193, + "grad_norm": 0.16643263399600983, + "learning_rate": 0.001, + "loss": 1.8381, + "step": 8657 + }, + { + "epoch": 0.3662746425247483, + "grad_norm": 0.17222201824188232, + "learning_rate": 0.001, + "loss": 1.7572, + "step": 8658 + }, + { + "epoch": 0.36631694728826464, + "grad_norm": 0.1717625856399536, + "learning_rate": 0.001, + "loss": 2.1745, + "step": 8659 + }, + { + "epoch": 0.36635925205178105, + "grad_norm": 0.22495992481708527, + "learning_rate": 0.001, + "loss": 2.179, + "step": 8660 + }, + { + "epoch": 0.3664015568152974, + "grad_norm": 2.0256707668304443, + "learning_rate": 0.001, + "loss": 3.8565, + "step": 8661 + }, + { + "epoch": 0.36644386157881376, + "grad_norm": 0.20122158527374268, + "learning_rate": 0.001, + "loss": 2.0525, + "step": 8662 + }, + { + "epoch": 0.36648616634233017, + "grad_norm": 0.2111087143421173, + "learning_rate": 0.001, + "loss": 2.4875, + "step": 8663 + }, + { + "epoch": 0.3665284711058465, + "grad_norm": 0.6038394570350647, + "learning_rate": 0.001, + "loss": 2.5007, + "step": 8664 + }, + { + "epoch": 0.3665707758693629, + "grad_norm": 0.18492920696735382, + "learning_rate": 0.001, + "loss": 1.9992, + "step": 8665 + }, + { + "epoch": 0.3666130806328793, + "grad_norm": 0.39475682377815247, + "learning_rate": 0.001, + "loss": 3.2232, + "step": 8666 + }, + { + "epoch": 0.36665538539639564, + "grad_norm": 0.2003537118434906, + "learning_rate": 0.001, + "loss": 1.8457, + "step": 8667 + }, + { + "epoch": 0.366697690159912, + "grad_norm": 0.20579084753990173, + "learning_rate": 0.001, + "loss": 2.3453, + "step": 8668 + }, + { + "epoch": 0.3667399949234284, + "grad_norm": 0.3103180527687073, + "learning_rate": 0.001, + "loss": 2.771, + "step": 8669 + }, + { + "epoch": 0.36678229968694476, + "grad_norm": 0.18858268857002258, + "learning_rate": 0.001, + "loss": 1.9894, + "step": 8670 + }, + { + "epoch": 0.3668246044504611, + "grad_norm": 0.2262255698442459, + "learning_rate": 0.001, + "loss": 1.9428, + "step": 8671 + }, + { + "epoch": 0.3668669092139775, + "grad_norm": 0.4273422062397003, + "learning_rate": 0.001, + "loss": 3.7103, + "step": 8672 + }, + { + "epoch": 0.3669092139774939, + "grad_norm": 0.22894766926765442, + "learning_rate": 0.001, + "loss": 2.0784, + "step": 8673 + }, + { + "epoch": 0.36695151874101023, + "grad_norm": 0.248250812292099, + "learning_rate": 0.001, + "loss": 1.7906, + "step": 8674 + }, + { + "epoch": 0.3669938235045266, + "grad_norm": 0.307745099067688, + "learning_rate": 0.001, + "loss": 3.2343, + "step": 8675 + }, + { + "epoch": 0.367036128268043, + "grad_norm": 0.21414296329021454, + "learning_rate": 0.001, + "loss": 3.1725, + "step": 8676 + }, + { + "epoch": 0.36707843303155935, + "grad_norm": 1.1623830795288086, + "learning_rate": 0.001, + "loss": 1.9302, + "step": 8677 + }, + { + "epoch": 0.3671207377950757, + "grad_norm": 0.218984916806221, + "learning_rate": 0.001, + "loss": 2.4448, + "step": 8678 + }, + { + "epoch": 0.3671630425585921, + "grad_norm": 0.24998073279857635, + "learning_rate": 0.001, + "loss": 2.1159, + "step": 8679 + }, + { + "epoch": 0.36720534732210847, + "grad_norm": 0.19620297849178314, + "learning_rate": 0.001, + "loss": 2.264, + "step": 8680 + }, + { + "epoch": 0.3672476520856248, + "grad_norm": 0.1893426477909088, + "learning_rate": 0.001, + "loss": 1.677, + "step": 8681 + }, + { + "epoch": 0.36728995684914123, + "grad_norm": 0.22591541707515717, + "learning_rate": 0.001, + "loss": 3.0901, + "step": 8682 + }, + { + "epoch": 0.3673322616126576, + "grad_norm": 0.3786305785179138, + "learning_rate": 0.001, + "loss": 3.0062, + "step": 8683 + }, + { + "epoch": 0.36737456637617394, + "grad_norm": 5.867971897125244, + "learning_rate": 0.001, + "loss": 2.209, + "step": 8684 + }, + { + "epoch": 0.36741687113969035, + "grad_norm": 44.805763244628906, + "learning_rate": 0.001, + "loss": 2.1827, + "step": 8685 + }, + { + "epoch": 0.3674591759032067, + "grad_norm": 0.18080832064151764, + "learning_rate": 0.001, + "loss": 1.9619, + "step": 8686 + }, + { + "epoch": 0.36750148066672306, + "grad_norm": 0.22980724275112152, + "learning_rate": 0.001, + "loss": 2.2155, + "step": 8687 + }, + { + "epoch": 0.36754378543023947, + "grad_norm": 0.2389446198940277, + "learning_rate": 0.001, + "loss": 2.045, + "step": 8688 + }, + { + "epoch": 0.3675860901937558, + "grad_norm": 0.22380129992961884, + "learning_rate": 0.001, + "loss": 2.716, + "step": 8689 + }, + { + "epoch": 0.3676283949572722, + "grad_norm": 0.3434034585952759, + "learning_rate": 0.001, + "loss": 3.1835, + "step": 8690 + }, + { + "epoch": 0.3676706997207886, + "grad_norm": 0.237090066075325, + "learning_rate": 0.001, + "loss": 1.9271, + "step": 8691 + }, + { + "epoch": 0.36771300448430494, + "grad_norm": 0.22618965804576874, + "learning_rate": 0.001, + "loss": 2.1982, + "step": 8692 + }, + { + "epoch": 0.3677553092478213, + "grad_norm": 0.1993749737739563, + "learning_rate": 0.001, + "loss": 4.0091, + "step": 8693 + }, + { + "epoch": 0.3677976140113377, + "grad_norm": 2.4615116119384766, + "learning_rate": 0.001, + "loss": 2.3653, + "step": 8694 + }, + { + "epoch": 0.36783991877485406, + "grad_norm": 0.5453082323074341, + "learning_rate": 0.001, + "loss": 3.2241, + "step": 8695 + }, + { + "epoch": 0.3678822235383704, + "grad_norm": 0.22832508385181427, + "learning_rate": 0.001, + "loss": 2.1284, + "step": 8696 + }, + { + "epoch": 0.36792452830188677, + "grad_norm": 0.22793933749198914, + "learning_rate": 0.001, + "loss": 2.7905, + "step": 8697 + }, + { + "epoch": 0.3679668330654032, + "grad_norm": 0.2116621732711792, + "learning_rate": 0.001, + "loss": 3.0399, + "step": 8698 + }, + { + "epoch": 0.36800913782891953, + "grad_norm": 0.26181337237358093, + "learning_rate": 0.001, + "loss": 1.625, + "step": 8699 + }, + { + "epoch": 0.3680514425924359, + "grad_norm": 0.228001207113266, + "learning_rate": 0.001, + "loss": 3.5122, + "step": 8700 + }, + { + "epoch": 0.3680937473559523, + "grad_norm": 0.1740439236164093, + "learning_rate": 0.001, + "loss": 2.094, + "step": 8701 + }, + { + "epoch": 0.36813605211946865, + "grad_norm": 0.19469280540943146, + "learning_rate": 0.001, + "loss": 2.2975, + "step": 8702 + }, + { + "epoch": 0.368178356882985, + "grad_norm": 0.1703725904226303, + "learning_rate": 0.001, + "loss": 3.327, + "step": 8703 + }, + { + "epoch": 0.3682206616465014, + "grad_norm": 0.20927943289279938, + "learning_rate": 0.001, + "loss": 1.7857, + "step": 8704 + }, + { + "epoch": 0.36826296641001777, + "grad_norm": 0.15202440321445465, + "learning_rate": 0.001, + "loss": 2.1899, + "step": 8705 + }, + { + "epoch": 0.3683052711735341, + "grad_norm": 0.17597658932209015, + "learning_rate": 0.001, + "loss": 1.7741, + "step": 8706 + }, + { + "epoch": 0.36834757593705053, + "grad_norm": 0.16804081201553345, + "learning_rate": 0.001, + "loss": 2.0558, + "step": 8707 + }, + { + "epoch": 0.3683898807005669, + "grad_norm": 0.18276292085647583, + "learning_rate": 0.001, + "loss": 2.2216, + "step": 8708 + }, + { + "epoch": 0.36843218546408324, + "grad_norm": 0.2148251235485077, + "learning_rate": 0.001, + "loss": 1.9215, + "step": 8709 + }, + { + "epoch": 0.36847449022759965, + "grad_norm": 0.9029409289360046, + "learning_rate": 0.001, + "loss": 1.6041, + "step": 8710 + }, + { + "epoch": 0.368516794991116, + "grad_norm": 0.19321629405021667, + "learning_rate": 0.001, + "loss": 1.8104, + "step": 8711 + }, + { + "epoch": 0.36855909975463236, + "grad_norm": 0.177422434091568, + "learning_rate": 0.001, + "loss": 1.8768, + "step": 8712 + }, + { + "epoch": 0.36860140451814877, + "grad_norm": 0.2311866581439972, + "learning_rate": 0.001, + "loss": 2.567, + "step": 8713 + }, + { + "epoch": 0.3686437092816651, + "grad_norm": 0.2138138860464096, + "learning_rate": 0.001, + "loss": 2.0323, + "step": 8714 + }, + { + "epoch": 0.3686860140451815, + "grad_norm": 0.17242108285427094, + "learning_rate": 0.001, + "loss": 2.3389, + "step": 8715 + }, + { + "epoch": 0.3687283188086979, + "grad_norm": 3.463766098022461, + "learning_rate": 0.001, + "loss": 1.9939, + "step": 8716 + }, + { + "epoch": 0.36877062357221424, + "grad_norm": 0.29998794198036194, + "learning_rate": 0.001, + "loss": 2.0793, + "step": 8717 + }, + { + "epoch": 0.3688129283357306, + "grad_norm": 0.22262133657932281, + "learning_rate": 0.001, + "loss": 2.7188, + "step": 8718 + }, + { + "epoch": 0.36885523309924695, + "grad_norm": 0.28944888710975647, + "learning_rate": 0.001, + "loss": 1.7407, + "step": 8719 + }, + { + "epoch": 0.36889753786276336, + "grad_norm": 0.20619124174118042, + "learning_rate": 0.001, + "loss": 1.8345, + "step": 8720 + }, + { + "epoch": 0.3689398426262797, + "grad_norm": 0.1661635786294937, + "learning_rate": 0.001, + "loss": 1.586, + "step": 8721 + }, + { + "epoch": 0.36898214738979607, + "grad_norm": 0.222077414393425, + "learning_rate": 0.001, + "loss": 3.3912, + "step": 8722 + }, + { + "epoch": 0.3690244521533125, + "grad_norm": 0.19446569681167603, + "learning_rate": 0.001, + "loss": 2.499, + "step": 8723 + }, + { + "epoch": 0.36906675691682883, + "grad_norm": 1.546207070350647, + "learning_rate": 0.001, + "loss": 1.9862, + "step": 8724 + }, + { + "epoch": 0.3691090616803452, + "grad_norm": 0.22588209807872772, + "learning_rate": 0.001, + "loss": 3.0398, + "step": 8725 + }, + { + "epoch": 0.3691513664438616, + "grad_norm": 0.22056500613689423, + "learning_rate": 0.001, + "loss": 2.7724, + "step": 8726 + }, + { + "epoch": 0.36919367120737795, + "grad_norm": 10.859556198120117, + "learning_rate": 0.001, + "loss": 1.9629, + "step": 8727 + }, + { + "epoch": 0.3692359759708943, + "grad_norm": 0.22937330603599548, + "learning_rate": 0.001, + "loss": 2.0512, + "step": 8728 + }, + { + "epoch": 0.3692782807344107, + "grad_norm": 2.563246965408325, + "learning_rate": 0.001, + "loss": 2.3327, + "step": 8729 + }, + { + "epoch": 0.36932058549792707, + "grad_norm": 0.31085848808288574, + "learning_rate": 0.001, + "loss": 2.0146, + "step": 8730 + }, + { + "epoch": 0.3693628902614434, + "grad_norm": 0.22053919732570648, + "learning_rate": 0.001, + "loss": 1.9071, + "step": 8731 + }, + { + "epoch": 0.36940519502495983, + "grad_norm": 0.3156310021877289, + "learning_rate": 0.001, + "loss": 1.7969, + "step": 8732 + }, + { + "epoch": 0.3694474997884762, + "grad_norm": 0.24050398170948029, + "learning_rate": 0.001, + "loss": 2.6647, + "step": 8733 + }, + { + "epoch": 0.36948980455199254, + "grad_norm": 0.18906262516975403, + "learning_rate": 0.001, + "loss": 1.7419, + "step": 8734 + }, + { + "epoch": 0.36953210931550895, + "grad_norm": 0.17383328080177307, + "learning_rate": 0.001, + "loss": 2.0027, + "step": 8735 + }, + { + "epoch": 0.3695744140790253, + "grad_norm": 0.19866390526294708, + "learning_rate": 0.001, + "loss": 2.1751, + "step": 8736 + }, + { + "epoch": 0.36961671884254166, + "grad_norm": 0.2835475504398346, + "learning_rate": 0.001, + "loss": 1.9432, + "step": 8737 + }, + { + "epoch": 0.36965902360605807, + "grad_norm": 0.9217908978462219, + "learning_rate": 0.001, + "loss": 1.6859, + "step": 8738 + }, + { + "epoch": 0.3697013283695744, + "grad_norm": 0.2083401083946228, + "learning_rate": 0.001, + "loss": 2.3959, + "step": 8739 + }, + { + "epoch": 0.3697436331330908, + "grad_norm": 0.19344541430473328, + "learning_rate": 0.001, + "loss": 1.929, + "step": 8740 + }, + { + "epoch": 0.36978593789660713, + "grad_norm": 0.2091798186302185, + "learning_rate": 0.001, + "loss": 1.8886, + "step": 8741 + }, + { + "epoch": 0.36982824266012354, + "grad_norm": 0.2184278666973114, + "learning_rate": 0.001, + "loss": 2.0926, + "step": 8742 + }, + { + "epoch": 0.3698705474236399, + "grad_norm": 0.2625698149204254, + "learning_rate": 0.001, + "loss": 2.3432, + "step": 8743 + }, + { + "epoch": 0.36991285218715625, + "grad_norm": 0.2139061689376831, + "learning_rate": 0.001, + "loss": 2.6096, + "step": 8744 + }, + { + "epoch": 0.36995515695067266, + "grad_norm": 0.18271711468696594, + "learning_rate": 0.001, + "loss": 2.6203, + "step": 8745 + }, + { + "epoch": 0.369997461714189, + "grad_norm": 0.16942210495471954, + "learning_rate": 0.001, + "loss": 2.8279, + "step": 8746 + }, + { + "epoch": 0.37003976647770537, + "grad_norm": 0.19270247220993042, + "learning_rate": 0.001, + "loss": 1.8951, + "step": 8747 + }, + { + "epoch": 0.3700820712412218, + "grad_norm": 0.19946153461933136, + "learning_rate": 0.001, + "loss": 2.6455, + "step": 8748 + }, + { + "epoch": 0.37012437600473813, + "grad_norm": 0.21664415299892426, + "learning_rate": 0.001, + "loss": 2.2819, + "step": 8749 + }, + { + "epoch": 0.3701666807682545, + "grad_norm": 0.19284196197986603, + "learning_rate": 0.001, + "loss": 2.4914, + "step": 8750 + }, + { + "epoch": 0.3702089855317709, + "grad_norm": 0.18668252229690552, + "learning_rate": 0.001, + "loss": 2.0608, + "step": 8751 + }, + { + "epoch": 0.37025129029528725, + "grad_norm": 1.1074740886688232, + "learning_rate": 0.001, + "loss": 2.6925, + "step": 8752 + }, + { + "epoch": 0.3702935950588036, + "grad_norm": 0.20908206701278687, + "learning_rate": 0.001, + "loss": 2.147, + "step": 8753 + }, + { + "epoch": 0.37033589982232, + "grad_norm": 0.19691090285778046, + "learning_rate": 0.001, + "loss": 2.0874, + "step": 8754 + }, + { + "epoch": 0.37037820458583637, + "grad_norm": 0.25301340222358704, + "learning_rate": 0.001, + "loss": 2.1522, + "step": 8755 + }, + { + "epoch": 0.3704205093493527, + "grad_norm": 0.1993921846151352, + "learning_rate": 0.001, + "loss": 2.3689, + "step": 8756 + }, + { + "epoch": 0.37046281411286913, + "grad_norm": 0.5521764755249023, + "learning_rate": 0.001, + "loss": 2.1797, + "step": 8757 + }, + { + "epoch": 0.3705051188763855, + "grad_norm": 0.19061268866062164, + "learning_rate": 0.001, + "loss": 2.8067, + "step": 8758 + }, + { + "epoch": 0.37054742363990184, + "grad_norm": 0.2150365263223648, + "learning_rate": 0.001, + "loss": 1.9172, + "step": 8759 + }, + { + "epoch": 0.37058972840341825, + "grad_norm": 0.18955853581428528, + "learning_rate": 0.001, + "loss": 2.3987, + "step": 8760 + }, + { + "epoch": 0.3706320331669346, + "grad_norm": 0.18300773203372955, + "learning_rate": 0.001, + "loss": 2.123, + "step": 8761 + }, + { + "epoch": 0.37067433793045096, + "grad_norm": 0.1935848891735077, + "learning_rate": 0.001, + "loss": 2.1818, + "step": 8762 + }, + { + "epoch": 0.3707166426939673, + "grad_norm": 0.20884737372398376, + "learning_rate": 0.001, + "loss": 2.2408, + "step": 8763 + }, + { + "epoch": 0.3707589474574837, + "grad_norm": 0.915891170501709, + "learning_rate": 0.001, + "loss": 2.0213, + "step": 8764 + }, + { + "epoch": 0.3708012522210001, + "grad_norm": 18.33778190612793, + "learning_rate": 0.001, + "loss": 2.0037, + "step": 8765 + }, + { + "epoch": 0.37084355698451643, + "grad_norm": 1.2620832920074463, + "learning_rate": 0.001, + "loss": 2.4491, + "step": 8766 + }, + { + "epoch": 0.37088586174803284, + "grad_norm": 0.18721136450767517, + "learning_rate": 0.001, + "loss": 1.9352, + "step": 8767 + }, + { + "epoch": 0.3709281665115492, + "grad_norm": 0.1763981133699417, + "learning_rate": 0.001, + "loss": 2.1145, + "step": 8768 + }, + { + "epoch": 0.37097047127506555, + "grad_norm": 0.20833328366279602, + "learning_rate": 0.001, + "loss": 2.5553, + "step": 8769 + }, + { + "epoch": 0.37101277603858196, + "grad_norm": 0.37538349628448486, + "learning_rate": 0.001, + "loss": 1.9705, + "step": 8770 + }, + { + "epoch": 0.3710550808020983, + "grad_norm": 0.1779465526342392, + "learning_rate": 0.001, + "loss": 1.8611, + "step": 8771 + }, + { + "epoch": 0.37109738556561467, + "grad_norm": 0.18236777186393738, + "learning_rate": 0.001, + "loss": 1.5463, + "step": 8772 + }, + { + "epoch": 0.3711396903291311, + "grad_norm": 0.3060573935508728, + "learning_rate": 0.001, + "loss": 2.9194, + "step": 8773 + }, + { + "epoch": 0.37118199509264743, + "grad_norm": 0.20852862298488617, + "learning_rate": 0.001, + "loss": 2.5415, + "step": 8774 + }, + { + "epoch": 0.3712242998561638, + "grad_norm": 0.48245781660079956, + "learning_rate": 0.001, + "loss": 1.5005, + "step": 8775 + }, + { + "epoch": 0.3712666046196802, + "grad_norm": 0.31515905261039734, + "learning_rate": 0.001, + "loss": 3.4687, + "step": 8776 + }, + { + "epoch": 0.37130890938319655, + "grad_norm": 0.2978445887565613, + "learning_rate": 0.001, + "loss": 2.7744, + "step": 8777 + }, + { + "epoch": 0.3713512141467129, + "grad_norm": 0.18172970414161682, + "learning_rate": 0.001, + "loss": 1.9334, + "step": 8778 + }, + { + "epoch": 0.3713935189102293, + "grad_norm": 0.18180030584335327, + "learning_rate": 0.001, + "loss": 1.6701, + "step": 8779 + }, + { + "epoch": 0.37143582367374567, + "grad_norm": 0.19042405486106873, + "learning_rate": 0.001, + "loss": 2.0225, + "step": 8780 + }, + { + "epoch": 0.371478128437262, + "grad_norm": 0.1896408349275589, + "learning_rate": 0.001, + "loss": 1.7211, + "step": 8781 + }, + { + "epoch": 0.37152043320077843, + "grad_norm": 0.21694940328598022, + "learning_rate": 0.001, + "loss": 2.1461, + "step": 8782 + }, + { + "epoch": 0.3715627379642948, + "grad_norm": 0.2233157753944397, + "learning_rate": 0.001, + "loss": 2.1678, + "step": 8783 + }, + { + "epoch": 0.37160504272781114, + "grad_norm": 0.2073362171649933, + "learning_rate": 0.001, + "loss": 1.9724, + "step": 8784 + }, + { + "epoch": 0.37164734749132755, + "grad_norm": 0.3852141201496124, + "learning_rate": 0.001, + "loss": 2.5631, + "step": 8785 + }, + { + "epoch": 0.3716896522548439, + "grad_norm": 0.18070602416992188, + "learning_rate": 0.001, + "loss": 1.6071, + "step": 8786 + }, + { + "epoch": 0.37173195701836026, + "grad_norm": 0.16367939114570618, + "learning_rate": 0.001, + "loss": 2.2925, + "step": 8787 + }, + { + "epoch": 0.3717742617818766, + "grad_norm": 0.1724989265203476, + "learning_rate": 0.001, + "loss": 3.1093, + "step": 8788 + }, + { + "epoch": 0.371816566545393, + "grad_norm": 0.16434898972511292, + "learning_rate": 0.001, + "loss": 2.4629, + "step": 8789 + }, + { + "epoch": 0.3718588713089094, + "grad_norm": 0.2104768455028534, + "learning_rate": 0.001, + "loss": 2.8752, + "step": 8790 + }, + { + "epoch": 0.37190117607242573, + "grad_norm": 0.20089322328567505, + "learning_rate": 0.001, + "loss": 2.4023, + "step": 8791 + }, + { + "epoch": 0.37194348083594214, + "grad_norm": 0.2389354258775711, + "learning_rate": 0.001, + "loss": 2.2687, + "step": 8792 + }, + { + "epoch": 0.3719857855994585, + "grad_norm": 0.47742322087287903, + "learning_rate": 0.001, + "loss": 1.9946, + "step": 8793 + }, + { + "epoch": 0.37202809036297485, + "grad_norm": 0.1540321409702301, + "learning_rate": 0.001, + "loss": 1.7913, + "step": 8794 + }, + { + "epoch": 0.37207039512649126, + "grad_norm": 0.17690719664096832, + "learning_rate": 0.001, + "loss": 1.9047, + "step": 8795 + }, + { + "epoch": 0.3721126998900076, + "grad_norm": 0.23479756712913513, + "learning_rate": 0.001, + "loss": 3.3659, + "step": 8796 + }, + { + "epoch": 0.37215500465352397, + "grad_norm": 0.89059978723526, + "learning_rate": 0.001, + "loss": 3.5161, + "step": 8797 + }, + { + "epoch": 0.3721973094170404, + "grad_norm": 5.015807628631592, + "learning_rate": 0.001, + "loss": 2.5261, + "step": 8798 + }, + { + "epoch": 0.37223961418055673, + "grad_norm": 0.21967126429080963, + "learning_rate": 0.001, + "loss": 2.07, + "step": 8799 + }, + { + "epoch": 0.3722819189440731, + "grad_norm": 0.24731267988681793, + "learning_rate": 0.001, + "loss": 1.9614, + "step": 8800 + }, + { + "epoch": 0.3723242237075895, + "grad_norm": 0.20447765290737152, + "learning_rate": 0.001, + "loss": 1.8957, + "step": 8801 + }, + { + "epoch": 0.37236652847110585, + "grad_norm": 0.2456209510564804, + "learning_rate": 0.001, + "loss": 1.9937, + "step": 8802 + }, + { + "epoch": 0.3724088332346222, + "grad_norm": 0.22434556484222412, + "learning_rate": 0.001, + "loss": 1.5603, + "step": 8803 + }, + { + "epoch": 0.3724511379981386, + "grad_norm": 0.45701247453689575, + "learning_rate": 0.001, + "loss": 2.2196, + "step": 8804 + }, + { + "epoch": 0.37249344276165497, + "grad_norm": 0.2302723377943039, + "learning_rate": 0.001, + "loss": 1.9974, + "step": 8805 + }, + { + "epoch": 0.3725357475251713, + "grad_norm": 0.28021976351737976, + "learning_rate": 0.001, + "loss": 3.7377, + "step": 8806 + }, + { + "epoch": 0.37257805228868773, + "grad_norm": 0.8244222402572632, + "learning_rate": 0.001, + "loss": 2.1969, + "step": 8807 + }, + { + "epoch": 0.3726203570522041, + "grad_norm": 0.20261800289154053, + "learning_rate": 0.001, + "loss": 3.6426, + "step": 8808 + }, + { + "epoch": 0.37266266181572044, + "grad_norm": 0.19825433194637299, + "learning_rate": 0.001, + "loss": 1.9425, + "step": 8809 + }, + { + "epoch": 0.3727049665792368, + "grad_norm": 1.1609710454940796, + "learning_rate": 0.001, + "loss": 2.1458, + "step": 8810 + }, + { + "epoch": 0.3727472713427532, + "grad_norm": 0.5567569136619568, + "learning_rate": 0.001, + "loss": 1.7775, + "step": 8811 + }, + { + "epoch": 0.37278957610626956, + "grad_norm": 0.2688750922679901, + "learning_rate": 0.001, + "loss": 2.358, + "step": 8812 + }, + { + "epoch": 0.3728318808697859, + "grad_norm": 0.33358681201934814, + "learning_rate": 0.001, + "loss": 2.8446, + "step": 8813 + }, + { + "epoch": 0.3728741856333023, + "grad_norm": 0.20433993637561798, + "learning_rate": 0.001, + "loss": 2.3362, + "step": 8814 + }, + { + "epoch": 0.3729164903968187, + "grad_norm": 0.19342532753944397, + "learning_rate": 0.001, + "loss": 2.8176, + "step": 8815 + }, + { + "epoch": 0.37295879516033503, + "grad_norm": 0.23268815875053406, + "learning_rate": 0.001, + "loss": 1.9077, + "step": 8816 + }, + { + "epoch": 0.37300109992385144, + "grad_norm": 0.23792216181755066, + "learning_rate": 0.001, + "loss": 2.4612, + "step": 8817 + }, + { + "epoch": 0.3730434046873678, + "grad_norm": 179.4039764404297, + "learning_rate": 0.001, + "loss": 3.1672, + "step": 8818 + }, + { + "epoch": 0.37308570945088415, + "grad_norm": 0.29416170716285706, + "learning_rate": 0.001, + "loss": 2.0936, + "step": 8819 + }, + { + "epoch": 0.37312801421440056, + "grad_norm": 0.2061353474855423, + "learning_rate": 0.001, + "loss": 3.3697, + "step": 8820 + }, + { + "epoch": 0.3731703189779169, + "grad_norm": 0.22392816841602325, + "learning_rate": 0.001, + "loss": 1.8806, + "step": 8821 + }, + { + "epoch": 0.37321262374143327, + "grad_norm": 0.5261951088905334, + "learning_rate": 0.001, + "loss": 2.6985, + "step": 8822 + }, + { + "epoch": 0.3732549285049497, + "grad_norm": 0.9621092081069946, + "learning_rate": 0.001, + "loss": 2.2439, + "step": 8823 + }, + { + "epoch": 0.37329723326846603, + "grad_norm": 0.20049139857292175, + "learning_rate": 0.001, + "loss": 2.2026, + "step": 8824 + }, + { + "epoch": 0.3733395380319824, + "grad_norm": 1.3945945501327515, + "learning_rate": 0.001, + "loss": 2.6688, + "step": 8825 + }, + { + "epoch": 0.3733818427954988, + "grad_norm": 0.28076982498168945, + "learning_rate": 0.001, + "loss": 2.9429, + "step": 8826 + }, + { + "epoch": 0.37342414755901515, + "grad_norm": 0.45473167300224304, + "learning_rate": 0.001, + "loss": 2.7468, + "step": 8827 + }, + { + "epoch": 0.3734664523225315, + "grad_norm": 1.4571759700775146, + "learning_rate": 0.001, + "loss": 1.6316, + "step": 8828 + }, + { + "epoch": 0.3735087570860479, + "grad_norm": 1.2502529621124268, + "learning_rate": 0.001, + "loss": 3.0181, + "step": 8829 + }, + { + "epoch": 0.37355106184956427, + "grad_norm": 0.26605066657066345, + "learning_rate": 0.001, + "loss": 2.5819, + "step": 8830 + }, + { + "epoch": 0.3735933666130806, + "grad_norm": 0.4681839644908905, + "learning_rate": 0.001, + "loss": 2.5359, + "step": 8831 + }, + { + "epoch": 0.373635671376597, + "grad_norm": 0.24733616411685944, + "learning_rate": 0.001, + "loss": 3.0378, + "step": 8832 + }, + { + "epoch": 0.3736779761401134, + "grad_norm": 0.20918242633342743, + "learning_rate": 0.001, + "loss": 2.6718, + "step": 8833 + }, + { + "epoch": 0.37372028090362974, + "grad_norm": 0.7425049543380737, + "learning_rate": 0.001, + "loss": 1.8763, + "step": 8834 + }, + { + "epoch": 0.3737625856671461, + "grad_norm": 0.1861983686685562, + "learning_rate": 0.001, + "loss": 2.6386, + "step": 8835 + }, + { + "epoch": 0.3738048904306625, + "grad_norm": 0.22254261374473572, + "learning_rate": 0.001, + "loss": 2.9136, + "step": 8836 + }, + { + "epoch": 0.37384719519417886, + "grad_norm": 0.22637084126472473, + "learning_rate": 0.001, + "loss": 2.5875, + "step": 8837 + }, + { + "epoch": 0.3738894999576952, + "grad_norm": 0.2065073400735855, + "learning_rate": 0.001, + "loss": 2.6558, + "step": 8838 + }, + { + "epoch": 0.3739318047212116, + "grad_norm": 0.18488679826259613, + "learning_rate": 0.001, + "loss": 2.429, + "step": 8839 + }, + { + "epoch": 0.373974109484728, + "grad_norm": 0.22156591713428497, + "learning_rate": 0.001, + "loss": 1.8497, + "step": 8840 + }, + { + "epoch": 0.37401641424824433, + "grad_norm": 0.1818188726902008, + "learning_rate": 0.001, + "loss": 2.1918, + "step": 8841 + }, + { + "epoch": 0.37405871901176074, + "grad_norm": 0.1845444142818451, + "learning_rate": 0.001, + "loss": 1.7402, + "step": 8842 + }, + { + "epoch": 0.3741010237752771, + "grad_norm": 0.17082853615283966, + "learning_rate": 0.001, + "loss": 1.9857, + "step": 8843 + }, + { + "epoch": 0.37414332853879345, + "grad_norm": 0.1782820224761963, + "learning_rate": 0.001, + "loss": 2.0385, + "step": 8844 + }, + { + "epoch": 0.37418563330230986, + "grad_norm": 1.1810798645019531, + "learning_rate": 0.001, + "loss": 2.1752, + "step": 8845 + }, + { + "epoch": 0.3742279380658262, + "grad_norm": 0.19883759319782257, + "learning_rate": 0.001, + "loss": 2.543, + "step": 8846 + }, + { + "epoch": 0.37427024282934257, + "grad_norm": 0.47685497999191284, + "learning_rate": 0.001, + "loss": 2.2209, + "step": 8847 + }, + { + "epoch": 0.374312547592859, + "grad_norm": 0.6518121957778931, + "learning_rate": 0.001, + "loss": 2.8904, + "step": 8848 + }, + { + "epoch": 0.37435485235637533, + "grad_norm": 0.1763034462928772, + "learning_rate": 0.001, + "loss": 2.5656, + "step": 8849 + }, + { + "epoch": 0.3743971571198917, + "grad_norm": 13.006291389465332, + "learning_rate": 0.001, + "loss": 1.7881, + "step": 8850 + }, + { + "epoch": 0.3744394618834081, + "grad_norm": 0.25351783633232117, + "learning_rate": 0.001, + "loss": 2.2294, + "step": 8851 + }, + { + "epoch": 0.37448176664692445, + "grad_norm": 10.508666038513184, + "learning_rate": 0.001, + "loss": 2.6075, + "step": 8852 + }, + { + "epoch": 0.3745240714104408, + "grad_norm": 0.23762452602386475, + "learning_rate": 0.001, + "loss": 2.0531, + "step": 8853 + }, + { + "epoch": 0.37456637617395716, + "grad_norm": 0.21458260715007782, + "learning_rate": 0.001, + "loss": 1.6651, + "step": 8854 + }, + { + "epoch": 0.37460868093747357, + "grad_norm": 0.19314922392368317, + "learning_rate": 0.001, + "loss": 2.2319, + "step": 8855 + }, + { + "epoch": 0.3746509857009899, + "grad_norm": 0.21474038064479828, + "learning_rate": 0.001, + "loss": 1.684, + "step": 8856 + }, + { + "epoch": 0.3746932904645063, + "grad_norm": 0.9535976648330688, + "learning_rate": 0.001, + "loss": 2.1219, + "step": 8857 + }, + { + "epoch": 0.3747355952280227, + "grad_norm": 0.3602615296840668, + "learning_rate": 0.001, + "loss": 2.2903, + "step": 8858 + }, + { + "epoch": 0.37477789999153904, + "grad_norm": 0.8784622550010681, + "learning_rate": 0.001, + "loss": 2.7722, + "step": 8859 + }, + { + "epoch": 0.3748202047550554, + "grad_norm": 0.21422924101352692, + "learning_rate": 0.001, + "loss": 2.2356, + "step": 8860 + }, + { + "epoch": 0.3748625095185718, + "grad_norm": 0.3022599220275879, + "learning_rate": 0.001, + "loss": 2.8531, + "step": 8861 + }, + { + "epoch": 0.37490481428208816, + "grad_norm": 0.2399420291185379, + "learning_rate": 0.001, + "loss": 2.1652, + "step": 8862 + }, + { + "epoch": 0.3749471190456045, + "grad_norm": 0.24375419318675995, + "learning_rate": 0.001, + "loss": 1.7758, + "step": 8863 + }, + { + "epoch": 0.3749894238091209, + "grad_norm": 0.2895803451538086, + "learning_rate": 0.001, + "loss": 1.9264, + "step": 8864 + }, + { + "epoch": 0.3750317285726373, + "grad_norm": 0.1909046471118927, + "learning_rate": 0.001, + "loss": 2.2194, + "step": 8865 + }, + { + "epoch": 0.37507403333615363, + "grad_norm": 0.2433093786239624, + "learning_rate": 0.001, + "loss": 2.556, + "step": 8866 + }, + { + "epoch": 0.37511633809967004, + "grad_norm": 0.20016717910766602, + "learning_rate": 0.001, + "loss": 1.7043, + "step": 8867 + }, + { + "epoch": 0.3751586428631864, + "grad_norm": 0.25574326515197754, + "learning_rate": 0.001, + "loss": 2.0388, + "step": 8868 + }, + { + "epoch": 0.37520094762670275, + "grad_norm": 0.2795771062374115, + "learning_rate": 0.001, + "loss": 1.9324, + "step": 8869 + }, + { + "epoch": 0.37524325239021916, + "grad_norm": 0.26534348726272583, + "learning_rate": 0.001, + "loss": 2.8957, + "step": 8870 + }, + { + "epoch": 0.3752855571537355, + "grad_norm": 0.2329631894826889, + "learning_rate": 0.001, + "loss": 2.7308, + "step": 8871 + }, + { + "epoch": 0.37532786191725187, + "grad_norm": 0.21924613416194916, + "learning_rate": 0.001, + "loss": 2.1649, + "step": 8872 + }, + { + "epoch": 0.3753701666807683, + "grad_norm": 1.5583733320236206, + "learning_rate": 0.001, + "loss": 2.8169, + "step": 8873 + }, + { + "epoch": 0.37541247144428463, + "grad_norm": 0.2518419027328491, + "learning_rate": 0.001, + "loss": 2.3901, + "step": 8874 + }, + { + "epoch": 0.375454776207801, + "grad_norm": 0.21858522295951843, + "learning_rate": 0.001, + "loss": 2.339, + "step": 8875 + }, + { + "epoch": 0.37549708097131734, + "grad_norm": 0.4082587957382202, + "learning_rate": 0.001, + "loss": 2.3889, + "step": 8876 + }, + { + "epoch": 0.37553938573483375, + "grad_norm": 0.20391131937503815, + "learning_rate": 0.001, + "loss": 3.0828, + "step": 8877 + }, + { + "epoch": 0.3755816904983501, + "grad_norm": 0.21462643146514893, + "learning_rate": 0.001, + "loss": 2.1877, + "step": 8878 + }, + { + "epoch": 0.37562399526186646, + "grad_norm": 0.2579302489757538, + "learning_rate": 0.001, + "loss": 2.0629, + "step": 8879 + }, + { + "epoch": 0.37566630002538287, + "grad_norm": 0.17081435024738312, + "learning_rate": 0.001, + "loss": 2.2482, + "step": 8880 + }, + { + "epoch": 0.3757086047888992, + "grad_norm": 0.21092277765274048, + "learning_rate": 0.001, + "loss": 2.7822, + "step": 8881 + }, + { + "epoch": 0.3757509095524156, + "grad_norm": 0.5651504397392273, + "learning_rate": 0.001, + "loss": 2.5021, + "step": 8882 + }, + { + "epoch": 0.375793214315932, + "grad_norm": 0.23301394283771515, + "learning_rate": 0.001, + "loss": 2.0126, + "step": 8883 + }, + { + "epoch": 0.37583551907944834, + "grad_norm": 2.354231595993042, + "learning_rate": 0.001, + "loss": 2.4646, + "step": 8884 + }, + { + "epoch": 0.3758778238429647, + "grad_norm": 2.867743492126465, + "learning_rate": 0.001, + "loss": 2.1439, + "step": 8885 + }, + { + "epoch": 0.3759201286064811, + "grad_norm": 1.451896071434021, + "learning_rate": 0.001, + "loss": 2.6435, + "step": 8886 + }, + { + "epoch": 0.37596243336999746, + "grad_norm": 0.2039012610912323, + "learning_rate": 0.001, + "loss": 2.3304, + "step": 8887 + }, + { + "epoch": 0.3760047381335138, + "grad_norm": 0.4442058801651001, + "learning_rate": 0.001, + "loss": 2.9109, + "step": 8888 + }, + { + "epoch": 0.3760470428970302, + "grad_norm": 0.2005094289779663, + "learning_rate": 0.001, + "loss": 3.5619, + "step": 8889 + }, + { + "epoch": 0.3760893476605466, + "grad_norm": 3.216552495956421, + "learning_rate": 0.001, + "loss": 2.8794, + "step": 8890 + }, + { + "epoch": 0.37613165242406293, + "grad_norm": 0.23342199623584747, + "learning_rate": 0.001, + "loss": 2.6378, + "step": 8891 + }, + { + "epoch": 0.37617395718757934, + "grad_norm": 0.23157887160778046, + "learning_rate": 0.001, + "loss": 2.9445, + "step": 8892 + }, + { + "epoch": 0.3762162619510957, + "grad_norm": 0.2705920338630676, + "learning_rate": 0.001, + "loss": 2.2957, + "step": 8893 + }, + { + "epoch": 0.37625856671461205, + "grad_norm": 0.28149649500846863, + "learning_rate": 0.001, + "loss": 1.7105, + "step": 8894 + }, + { + "epoch": 0.37630087147812846, + "grad_norm": 1.1188766956329346, + "learning_rate": 0.001, + "loss": 2.2677, + "step": 8895 + }, + { + "epoch": 0.3763431762416448, + "grad_norm": 0.8402219414710999, + "learning_rate": 0.001, + "loss": 3.4824, + "step": 8896 + }, + { + "epoch": 0.37638548100516117, + "grad_norm": 0.2254517376422882, + "learning_rate": 0.001, + "loss": 2.4484, + "step": 8897 + }, + { + "epoch": 0.3764277857686776, + "grad_norm": 0.18616388738155365, + "learning_rate": 0.001, + "loss": 1.8755, + "step": 8898 + }, + { + "epoch": 0.37647009053219394, + "grad_norm": 0.16204172372817993, + "learning_rate": 0.001, + "loss": 2.3035, + "step": 8899 + }, + { + "epoch": 0.3765123952957103, + "grad_norm": 0.655873715877533, + "learning_rate": 0.001, + "loss": 2.7063, + "step": 8900 + }, + { + "epoch": 0.37655470005922664, + "grad_norm": 0.31336209177970886, + "learning_rate": 0.001, + "loss": 2.2015, + "step": 8901 + }, + { + "epoch": 0.37659700482274305, + "grad_norm": 0.21138517558574677, + "learning_rate": 0.001, + "loss": 2.0808, + "step": 8902 + }, + { + "epoch": 0.3766393095862594, + "grad_norm": 0.32678109407424927, + "learning_rate": 0.001, + "loss": 3.5257, + "step": 8903 + }, + { + "epoch": 0.37668161434977576, + "grad_norm": 0.17282311618328094, + "learning_rate": 0.001, + "loss": 1.6933, + "step": 8904 + }, + { + "epoch": 0.37672391911329217, + "grad_norm": 0.19820040464401245, + "learning_rate": 0.001, + "loss": 1.8809, + "step": 8905 + }, + { + "epoch": 0.3767662238768085, + "grad_norm": 0.23431384563446045, + "learning_rate": 0.001, + "loss": 1.9761, + "step": 8906 + }, + { + "epoch": 0.3768085286403249, + "grad_norm": 0.275423139333725, + "learning_rate": 0.001, + "loss": 2.9449, + "step": 8907 + }, + { + "epoch": 0.3768508334038413, + "grad_norm": 0.4699144661426544, + "learning_rate": 0.001, + "loss": 1.8988, + "step": 8908 + }, + { + "epoch": 0.37689313816735764, + "grad_norm": 0.17784275114536285, + "learning_rate": 0.001, + "loss": 2.1766, + "step": 8909 + }, + { + "epoch": 0.376935442930874, + "grad_norm": 0.25600793957710266, + "learning_rate": 0.001, + "loss": 2.2169, + "step": 8910 + }, + { + "epoch": 0.3769777476943904, + "grad_norm": 0.19872885942459106, + "learning_rate": 0.001, + "loss": 1.8248, + "step": 8911 + }, + { + "epoch": 0.37702005245790676, + "grad_norm": 0.6820826530456543, + "learning_rate": 0.001, + "loss": 2.1578, + "step": 8912 + }, + { + "epoch": 0.3770623572214231, + "grad_norm": 0.17017528414726257, + "learning_rate": 0.001, + "loss": 3.678, + "step": 8913 + }, + { + "epoch": 0.3771046619849395, + "grad_norm": 0.1723102331161499, + "learning_rate": 0.001, + "loss": 3.2953, + "step": 8914 + }, + { + "epoch": 0.3771469667484559, + "grad_norm": 0.17692267894744873, + "learning_rate": 0.001, + "loss": 2.2835, + "step": 8915 + }, + { + "epoch": 0.37718927151197224, + "grad_norm": 0.18386520445346832, + "learning_rate": 0.001, + "loss": 1.7949, + "step": 8916 + }, + { + "epoch": 0.37723157627548864, + "grad_norm": 0.17489704489707947, + "learning_rate": 0.001, + "loss": 2.8251, + "step": 8917 + }, + { + "epoch": 0.377273881039005, + "grad_norm": 0.18046604096889496, + "learning_rate": 0.001, + "loss": 1.9294, + "step": 8918 + }, + { + "epoch": 0.37731618580252135, + "grad_norm": 0.19288641214370728, + "learning_rate": 0.001, + "loss": 2.1967, + "step": 8919 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 20.963592529296875, + "learning_rate": 0.001, + "loss": 1.7886, + "step": 8920 + }, + { + "epoch": 0.3774007953295541, + "grad_norm": 0.20476023852825165, + "learning_rate": 0.001, + "loss": 2.0257, + "step": 8921 + }, + { + "epoch": 0.37744310009307047, + "grad_norm": 0.17102450132369995, + "learning_rate": 0.001, + "loss": 3.2415, + "step": 8922 + }, + { + "epoch": 0.3774854048565868, + "grad_norm": 0.24836905300617218, + "learning_rate": 0.001, + "loss": 1.6505, + "step": 8923 + }, + { + "epoch": 0.37752770962010324, + "grad_norm": 0.2269943505525589, + "learning_rate": 0.001, + "loss": 1.9437, + "step": 8924 + }, + { + "epoch": 0.3775700143836196, + "grad_norm": 0.18366478383541107, + "learning_rate": 0.001, + "loss": 1.9654, + "step": 8925 + }, + { + "epoch": 0.37761231914713594, + "grad_norm": 0.19033034145832062, + "learning_rate": 0.001, + "loss": 1.7305, + "step": 8926 + }, + { + "epoch": 0.37765462391065235, + "grad_norm": 0.16039331257343292, + "learning_rate": 0.001, + "loss": 2.4644, + "step": 8927 + }, + { + "epoch": 0.3776969286741687, + "grad_norm": 0.15948708355426788, + "learning_rate": 0.001, + "loss": 3.5417, + "step": 8928 + }, + { + "epoch": 0.37773923343768506, + "grad_norm": 0.16082708537578583, + "learning_rate": 0.001, + "loss": 2.1491, + "step": 8929 + }, + { + "epoch": 0.37778153820120147, + "grad_norm": 0.21579056978225708, + "learning_rate": 0.001, + "loss": 2.8686, + "step": 8930 + }, + { + "epoch": 0.3778238429647178, + "grad_norm": 0.18891681730747223, + "learning_rate": 0.001, + "loss": 2.1063, + "step": 8931 + }, + { + "epoch": 0.3778661477282342, + "grad_norm": 0.2987871468067169, + "learning_rate": 0.001, + "loss": 2.1846, + "step": 8932 + }, + { + "epoch": 0.3779084524917506, + "grad_norm": 0.19955849647521973, + "learning_rate": 0.001, + "loss": 3.7915, + "step": 8933 + }, + { + "epoch": 0.37795075725526694, + "grad_norm": 0.4888962507247925, + "learning_rate": 0.001, + "loss": 1.5937, + "step": 8934 + }, + { + "epoch": 0.3779930620187833, + "grad_norm": 0.23087970912456512, + "learning_rate": 0.001, + "loss": 2.2461, + "step": 8935 + }, + { + "epoch": 0.3780353667822997, + "grad_norm": 0.48626619577407837, + "learning_rate": 0.001, + "loss": 2.6076, + "step": 8936 + }, + { + "epoch": 0.37807767154581606, + "grad_norm": 0.21073201298713684, + "learning_rate": 0.001, + "loss": 1.6726, + "step": 8937 + }, + { + "epoch": 0.3781199763093324, + "grad_norm": 0.166831374168396, + "learning_rate": 0.001, + "loss": 1.709, + "step": 8938 + }, + { + "epoch": 0.3781622810728488, + "grad_norm": 0.19128407537937164, + "learning_rate": 0.001, + "loss": 2.1098, + "step": 8939 + }, + { + "epoch": 0.3782045858363652, + "grad_norm": 0.19305849075317383, + "learning_rate": 0.001, + "loss": 1.8852, + "step": 8940 + }, + { + "epoch": 0.37824689059988154, + "grad_norm": 0.18565382063388824, + "learning_rate": 0.001, + "loss": 2.1476, + "step": 8941 + }, + { + "epoch": 0.37828919536339795, + "grad_norm": 0.25521063804626465, + "learning_rate": 0.001, + "loss": 2.7811, + "step": 8942 + }, + { + "epoch": 0.3783315001269143, + "grad_norm": 0.49443572759628296, + "learning_rate": 0.001, + "loss": 1.8314, + "step": 8943 + }, + { + "epoch": 0.37837380489043065, + "grad_norm": 0.24259395897388458, + "learning_rate": 0.001, + "loss": 1.8752, + "step": 8944 + }, + { + "epoch": 0.378416109653947, + "grad_norm": 0.21471655368804932, + "learning_rate": 0.001, + "loss": 1.8431, + "step": 8945 + }, + { + "epoch": 0.3784584144174634, + "grad_norm": 0.34845396876335144, + "learning_rate": 0.001, + "loss": 2.63, + "step": 8946 + }, + { + "epoch": 0.37850071918097977, + "grad_norm": 0.4037695825099945, + "learning_rate": 0.001, + "loss": 2.641, + "step": 8947 + }, + { + "epoch": 0.3785430239444961, + "grad_norm": 0.1841561496257782, + "learning_rate": 0.001, + "loss": 2.0631, + "step": 8948 + }, + { + "epoch": 0.37858532870801254, + "grad_norm": 0.5983163714408875, + "learning_rate": 0.001, + "loss": 2.3617, + "step": 8949 + }, + { + "epoch": 0.3786276334715289, + "grad_norm": 0.1743028163909912, + "learning_rate": 0.001, + "loss": 2.2375, + "step": 8950 + }, + { + "epoch": 0.37866993823504524, + "grad_norm": 0.24144525825977325, + "learning_rate": 0.001, + "loss": 1.9207, + "step": 8951 + }, + { + "epoch": 0.37871224299856165, + "grad_norm": 0.19788867235183716, + "learning_rate": 0.001, + "loss": 2.8912, + "step": 8952 + }, + { + "epoch": 0.378754547762078, + "grad_norm": 0.2542145550251007, + "learning_rate": 0.001, + "loss": 1.792, + "step": 8953 + }, + { + "epoch": 0.37879685252559436, + "grad_norm": 0.19751232862472534, + "learning_rate": 0.001, + "loss": 2.1323, + "step": 8954 + }, + { + "epoch": 0.3788391572891108, + "grad_norm": 0.3706015944480896, + "learning_rate": 0.001, + "loss": 2.3522, + "step": 8955 + }, + { + "epoch": 0.3788814620526271, + "grad_norm": 0.20275190472602844, + "learning_rate": 0.001, + "loss": 1.8339, + "step": 8956 + }, + { + "epoch": 0.3789237668161435, + "grad_norm": 0.18066799640655518, + "learning_rate": 0.001, + "loss": 1.6604, + "step": 8957 + }, + { + "epoch": 0.3789660715796599, + "grad_norm": 0.20664532482624054, + "learning_rate": 0.001, + "loss": 2.1049, + "step": 8958 + }, + { + "epoch": 0.37900837634317625, + "grad_norm": 0.15460434556007385, + "learning_rate": 0.001, + "loss": 2.0694, + "step": 8959 + }, + { + "epoch": 0.3790506811066926, + "grad_norm": 0.177540585398674, + "learning_rate": 0.001, + "loss": 2.5301, + "step": 8960 + }, + { + "epoch": 0.379092985870209, + "grad_norm": 0.21939094364643097, + "learning_rate": 0.001, + "loss": 2.1656, + "step": 8961 + }, + { + "epoch": 0.37913529063372536, + "grad_norm": 0.17154373228549957, + "learning_rate": 0.001, + "loss": 3.5702, + "step": 8962 + }, + { + "epoch": 0.3791775953972417, + "grad_norm": 1.2005940675735474, + "learning_rate": 0.001, + "loss": 3.8547, + "step": 8963 + }, + { + "epoch": 0.3792199001607581, + "grad_norm": 0.20012064278125763, + "learning_rate": 0.001, + "loss": 2.1547, + "step": 8964 + }, + { + "epoch": 0.3792622049242745, + "grad_norm": 0.2005445957183838, + "learning_rate": 0.001, + "loss": 1.9539, + "step": 8965 + }, + { + "epoch": 0.37930450968779084, + "grad_norm": 0.17973119020462036, + "learning_rate": 0.001, + "loss": 2.0098, + "step": 8966 + }, + { + "epoch": 0.3793468144513072, + "grad_norm": 0.16720591485500336, + "learning_rate": 0.001, + "loss": 1.3429, + "step": 8967 + }, + { + "epoch": 0.3793891192148236, + "grad_norm": 1.0415222644805908, + "learning_rate": 0.001, + "loss": 2.3825, + "step": 8968 + }, + { + "epoch": 0.37943142397833995, + "grad_norm": 0.19271378219127655, + "learning_rate": 0.001, + "loss": 1.7486, + "step": 8969 + }, + { + "epoch": 0.3794737287418563, + "grad_norm": 0.26010555028915405, + "learning_rate": 0.001, + "loss": 2.988, + "step": 8970 + }, + { + "epoch": 0.3795160335053727, + "grad_norm": 1.41874361038208, + "learning_rate": 0.001, + "loss": 2.4324, + "step": 8971 + }, + { + "epoch": 0.3795583382688891, + "grad_norm": 0.18716134130954742, + "learning_rate": 0.001, + "loss": 2.2786, + "step": 8972 + }, + { + "epoch": 0.3796006430324054, + "grad_norm": 0.21621747314929962, + "learning_rate": 0.001, + "loss": 2.0158, + "step": 8973 + }, + { + "epoch": 0.37964294779592184, + "grad_norm": 0.21100404858589172, + "learning_rate": 0.001, + "loss": 2.6913, + "step": 8974 + }, + { + "epoch": 0.3796852525594382, + "grad_norm": 0.23763734102249146, + "learning_rate": 0.001, + "loss": 2.5987, + "step": 8975 + }, + { + "epoch": 0.37972755732295455, + "grad_norm": 0.24332080781459808, + "learning_rate": 0.001, + "loss": 2.6919, + "step": 8976 + }, + { + "epoch": 0.37976986208647096, + "grad_norm": 0.27417856454849243, + "learning_rate": 0.001, + "loss": 2.2423, + "step": 8977 + }, + { + "epoch": 0.3798121668499873, + "grad_norm": 0.19818872213363647, + "learning_rate": 0.001, + "loss": 2.0502, + "step": 8978 + }, + { + "epoch": 0.37985447161350366, + "grad_norm": 0.16945703327655792, + "learning_rate": 0.001, + "loss": 2.848, + "step": 8979 + }, + { + "epoch": 0.3798967763770201, + "grad_norm": 0.15949320793151855, + "learning_rate": 0.001, + "loss": 1.7612, + "step": 8980 + }, + { + "epoch": 0.3799390811405364, + "grad_norm": 0.17047828435897827, + "learning_rate": 0.001, + "loss": 1.9685, + "step": 8981 + }, + { + "epoch": 0.3799813859040528, + "grad_norm": 0.19749274849891663, + "learning_rate": 0.001, + "loss": 2.5775, + "step": 8982 + }, + { + "epoch": 0.3800236906675692, + "grad_norm": 0.19193464517593384, + "learning_rate": 0.001, + "loss": 2.1396, + "step": 8983 + }, + { + "epoch": 0.38006599543108555, + "grad_norm": 2.228489398956299, + "learning_rate": 0.001, + "loss": 2.439, + "step": 8984 + }, + { + "epoch": 0.3801083001946019, + "grad_norm": 0.18555498123168945, + "learning_rate": 0.001, + "loss": 2.0835, + "step": 8985 + }, + { + "epoch": 0.3801506049581183, + "grad_norm": 0.4127380847930908, + "learning_rate": 0.001, + "loss": 1.7705, + "step": 8986 + }, + { + "epoch": 0.38019290972163466, + "grad_norm": 0.2189057320356369, + "learning_rate": 0.001, + "loss": 2.3658, + "step": 8987 + }, + { + "epoch": 0.380235214485151, + "grad_norm": 0.8270202279090881, + "learning_rate": 0.001, + "loss": 2.0476, + "step": 8988 + }, + { + "epoch": 0.3802775192486674, + "grad_norm": 0.2435636818408966, + "learning_rate": 0.001, + "loss": 2.236, + "step": 8989 + }, + { + "epoch": 0.3803198240121838, + "grad_norm": 0.40932199358940125, + "learning_rate": 0.001, + "loss": 2.5558, + "step": 8990 + }, + { + "epoch": 0.38036212877570014, + "grad_norm": 0.5411979556083679, + "learning_rate": 0.001, + "loss": 2.7722, + "step": 8991 + }, + { + "epoch": 0.3804044335392165, + "grad_norm": 0.28621330857276917, + "learning_rate": 0.001, + "loss": 1.9874, + "step": 8992 + }, + { + "epoch": 0.3804467383027329, + "grad_norm": 0.20250999927520752, + "learning_rate": 0.001, + "loss": 2.4956, + "step": 8993 + }, + { + "epoch": 0.38048904306624926, + "grad_norm": 0.7544026970863342, + "learning_rate": 0.001, + "loss": 2.3332, + "step": 8994 + }, + { + "epoch": 0.3805313478297656, + "grad_norm": 0.20567587018013, + "learning_rate": 0.001, + "loss": 1.698, + "step": 8995 + }, + { + "epoch": 0.380573652593282, + "grad_norm": 0.20252707600593567, + "learning_rate": 0.001, + "loss": 1.5407, + "step": 8996 + }, + { + "epoch": 0.3806159573567984, + "grad_norm": 0.3609828054904938, + "learning_rate": 0.001, + "loss": 3.2147, + "step": 8997 + }, + { + "epoch": 0.3806582621203147, + "grad_norm": 0.18369510769844055, + "learning_rate": 0.001, + "loss": 1.6158, + "step": 8998 + }, + { + "epoch": 0.38070056688383114, + "grad_norm": 3.380438804626465, + "learning_rate": 0.001, + "loss": 2.0616, + "step": 8999 + }, + { + "epoch": 0.3807428716473475, + "grad_norm": 0.17703545093536377, + "learning_rate": 0.001, + "loss": 2.9286, + "step": 9000 + }, + { + "epoch": 0.38078517641086385, + "grad_norm": 0.601148247718811, + "learning_rate": 0.001, + "loss": 2.1041, + "step": 9001 + }, + { + "epoch": 0.38082748117438026, + "grad_norm": 0.1904536634683609, + "learning_rate": 0.001, + "loss": 2.2669, + "step": 9002 + }, + { + "epoch": 0.3808697859378966, + "grad_norm": 0.5615403056144714, + "learning_rate": 0.001, + "loss": 2.1864, + "step": 9003 + }, + { + "epoch": 0.38091209070141296, + "grad_norm": 0.18499788641929626, + "learning_rate": 0.001, + "loss": 1.82, + "step": 9004 + }, + { + "epoch": 0.3809543954649294, + "grad_norm": 0.22329087555408478, + "learning_rate": 0.001, + "loss": 1.9953, + "step": 9005 + }, + { + "epoch": 0.38099670022844573, + "grad_norm": 0.21240036189556122, + "learning_rate": 0.001, + "loss": 2.3098, + "step": 9006 + }, + { + "epoch": 0.3810390049919621, + "grad_norm": 0.171017587184906, + "learning_rate": 0.001, + "loss": 2.0382, + "step": 9007 + }, + { + "epoch": 0.3810813097554785, + "grad_norm": 0.22148646414279938, + "learning_rate": 0.001, + "loss": 2.1392, + "step": 9008 + }, + { + "epoch": 0.38112361451899485, + "grad_norm": 0.18616536259651184, + "learning_rate": 0.001, + "loss": 1.8662, + "step": 9009 + }, + { + "epoch": 0.3811659192825112, + "grad_norm": 10.153366088867188, + "learning_rate": 0.001, + "loss": 2.767, + "step": 9010 + }, + { + "epoch": 0.38120822404602756, + "grad_norm": 0.19627103209495544, + "learning_rate": 0.001, + "loss": 1.7014, + "step": 9011 + }, + { + "epoch": 0.38125052880954396, + "grad_norm": 0.2045442759990692, + "learning_rate": 0.001, + "loss": 1.4853, + "step": 9012 + }, + { + "epoch": 0.3812928335730603, + "grad_norm": 0.8567882180213928, + "learning_rate": 0.001, + "loss": 2.0106, + "step": 9013 + }, + { + "epoch": 0.3813351383365767, + "grad_norm": 0.21957813203334808, + "learning_rate": 0.001, + "loss": 2.1993, + "step": 9014 + }, + { + "epoch": 0.3813774431000931, + "grad_norm": 0.20184127986431122, + "learning_rate": 0.001, + "loss": 2.0607, + "step": 9015 + }, + { + "epoch": 0.38141974786360944, + "grad_norm": 1.3871818780899048, + "learning_rate": 0.001, + "loss": 1.851, + "step": 9016 + }, + { + "epoch": 0.3814620526271258, + "grad_norm": 0.26643404364585876, + "learning_rate": 0.001, + "loss": 2.1513, + "step": 9017 + }, + { + "epoch": 0.3815043573906422, + "grad_norm": 0.3258124887943268, + "learning_rate": 0.001, + "loss": 2.6778, + "step": 9018 + }, + { + "epoch": 0.38154666215415856, + "grad_norm": 0.2190716564655304, + "learning_rate": 0.001, + "loss": 2.1382, + "step": 9019 + }, + { + "epoch": 0.3815889669176749, + "grad_norm": 0.21376149356365204, + "learning_rate": 0.001, + "loss": 1.9718, + "step": 9020 + }, + { + "epoch": 0.3816312716811913, + "grad_norm": 0.22805331647396088, + "learning_rate": 0.001, + "loss": 2.3052, + "step": 9021 + }, + { + "epoch": 0.3816735764447077, + "grad_norm": 0.18218421936035156, + "learning_rate": 0.001, + "loss": 2.7028, + "step": 9022 + }, + { + "epoch": 0.38171588120822403, + "grad_norm": 0.19406476616859436, + "learning_rate": 0.001, + "loss": 2.589, + "step": 9023 + }, + { + "epoch": 0.38175818597174044, + "grad_norm": 5.224491119384766, + "learning_rate": 0.001, + "loss": 2.0287, + "step": 9024 + }, + { + "epoch": 0.3818004907352568, + "grad_norm": 0.2605280578136444, + "learning_rate": 0.001, + "loss": 2.3784, + "step": 9025 + }, + { + "epoch": 0.38184279549877315, + "grad_norm": 2.6531484127044678, + "learning_rate": 0.001, + "loss": 2.6587, + "step": 9026 + }, + { + "epoch": 0.38188510026228956, + "grad_norm": 0.19211426377296448, + "learning_rate": 0.001, + "loss": 2.5609, + "step": 9027 + }, + { + "epoch": 0.3819274050258059, + "grad_norm": 0.31936317682266235, + "learning_rate": 0.001, + "loss": 2.2436, + "step": 9028 + }, + { + "epoch": 0.38196970978932226, + "grad_norm": 0.27096110582351685, + "learning_rate": 0.001, + "loss": 2.763, + "step": 9029 + }, + { + "epoch": 0.3820120145528387, + "grad_norm": 0.27728742361068726, + "learning_rate": 0.001, + "loss": 2.2142, + "step": 9030 + }, + { + "epoch": 0.38205431931635503, + "grad_norm": 0.23881641030311584, + "learning_rate": 0.001, + "loss": 2.6463, + "step": 9031 + }, + { + "epoch": 0.3820966240798714, + "grad_norm": 1.6963469982147217, + "learning_rate": 0.001, + "loss": 1.7634, + "step": 9032 + }, + { + "epoch": 0.3821389288433878, + "grad_norm": 0.9256619215011597, + "learning_rate": 0.001, + "loss": 2.8096, + "step": 9033 + }, + { + "epoch": 0.38218123360690415, + "grad_norm": 0.23679040372371674, + "learning_rate": 0.001, + "loss": 2.5447, + "step": 9034 + }, + { + "epoch": 0.3822235383704205, + "grad_norm": 0.266876220703125, + "learning_rate": 0.001, + "loss": 2.1042, + "step": 9035 + }, + { + "epoch": 0.38226584313393686, + "grad_norm": 0.47037845849990845, + "learning_rate": 0.001, + "loss": 4.1207, + "step": 9036 + }, + { + "epoch": 0.38230814789745327, + "grad_norm": 0.3769329786300659, + "learning_rate": 0.001, + "loss": 3.3571, + "step": 9037 + }, + { + "epoch": 0.3823504526609696, + "grad_norm": 1.5549075603485107, + "learning_rate": 0.001, + "loss": 3.0967, + "step": 9038 + }, + { + "epoch": 0.382392757424486, + "grad_norm": 0.48421478271484375, + "learning_rate": 0.001, + "loss": 2.0338, + "step": 9039 + }, + { + "epoch": 0.3824350621880024, + "grad_norm": 0.22943010926246643, + "learning_rate": 0.001, + "loss": 2.5297, + "step": 9040 + }, + { + "epoch": 0.38247736695151874, + "grad_norm": 0.2692186236381531, + "learning_rate": 0.001, + "loss": 2.6332, + "step": 9041 + }, + { + "epoch": 0.3825196717150351, + "grad_norm": 0.31615620851516724, + "learning_rate": 0.001, + "loss": 2.1124, + "step": 9042 + }, + { + "epoch": 0.3825619764785515, + "grad_norm": 0.2741483449935913, + "learning_rate": 0.001, + "loss": 1.9714, + "step": 9043 + }, + { + "epoch": 0.38260428124206786, + "grad_norm": 0.21158187091350555, + "learning_rate": 0.001, + "loss": 1.8752, + "step": 9044 + }, + { + "epoch": 0.3826465860055842, + "grad_norm": 9.120562553405762, + "learning_rate": 0.001, + "loss": 3.4247, + "step": 9045 + }, + { + "epoch": 0.3826888907691006, + "grad_norm": 0.3110532760620117, + "learning_rate": 0.001, + "loss": 2.6979, + "step": 9046 + }, + { + "epoch": 0.382731195532617, + "grad_norm": 0.4028329849243164, + "learning_rate": 0.001, + "loss": 3.1487, + "step": 9047 + }, + { + "epoch": 0.38277350029613333, + "grad_norm": 13.722456932067871, + "learning_rate": 0.001, + "loss": 3.2165, + "step": 9048 + }, + { + "epoch": 0.38281580505964974, + "grad_norm": 0.2979806661605835, + "learning_rate": 0.001, + "loss": 1.7009, + "step": 9049 + }, + { + "epoch": 0.3828581098231661, + "grad_norm": 0.2089688926935196, + "learning_rate": 0.001, + "loss": 1.9347, + "step": 9050 + }, + { + "epoch": 0.38290041458668245, + "grad_norm": 0.19975943863391876, + "learning_rate": 0.001, + "loss": 2.757, + "step": 9051 + }, + { + "epoch": 0.38294271935019886, + "grad_norm": 0.1705322414636612, + "learning_rate": 0.001, + "loss": 1.8747, + "step": 9052 + }, + { + "epoch": 0.3829850241137152, + "grad_norm": 0.2010183334350586, + "learning_rate": 0.001, + "loss": 1.6273, + "step": 9053 + }, + { + "epoch": 0.38302732887723157, + "grad_norm": 9.024285316467285, + "learning_rate": 0.001, + "loss": 1.99, + "step": 9054 + }, + { + "epoch": 0.383069633640748, + "grad_norm": 0.34857797622680664, + "learning_rate": 0.001, + "loss": 2.5817, + "step": 9055 + }, + { + "epoch": 0.38311193840426433, + "grad_norm": 0.5103548169136047, + "learning_rate": 0.001, + "loss": 2.021, + "step": 9056 + }, + { + "epoch": 0.3831542431677807, + "grad_norm": 2.387510299682617, + "learning_rate": 0.001, + "loss": 2.2303, + "step": 9057 + }, + { + "epoch": 0.38319654793129704, + "grad_norm": 0.2850176990032196, + "learning_rate": 0.001, + "loss": 2.4001, + "step": 9058 + }, + { + "epoch": 0.38323885269481345, + "grad_norm": 0.6285756826400757, + "learning_rate": 0.001, + "loss": 1.8074, + "step": 9059 + }, + { + "epoch": 0.3832811574583298, + "grad_norm": 0.5569390654563904, + "learning_rate": 0.001, + "loss": 2.4452, + "step": 9060 + }, + { + "epoch": 0.38332346222184616, + "grad_norm": 5.331487655639648, + "learning_rate": 0.001, + "loss": 2.7975, + "step": 9061 + }, + { + "epoch": 0.38336576698536257, + "grad_norm": 0.30871254205703735, + "learning_rate": 0.001, + "loss": 2.3949, + "step": 9062 + }, + { + "epoch": 0.3834080717488789, + "grad_norm": 0.3623650074005127, + "learning_rate": 0.001, + "loss": 2.5391, + "step": 9063 + }, + { + "epoch": 0.3834503765123953, + "grad_norm": 0.22876878082752228, + "learning_rate": 0.001, + "loss": 2.3655, + "step": 9064 + }, + { + "epoch": 0.3834926812759117, + "grad_norm": 0.1963501125574112, + "learning_rate": 0.001, + "loss": 2.2311, + "step": 9065 + }, + { + "epoch": 0.38353498603942804, + "grad_norm": 0.8837391138076782, + "learning_rate": 0.001, + "loss": 2.8312, + "step": 9066 + }, + { + "epoch": 0.3835772908029444, + "grad_norm": 0.24529825150966644, + "learning_rate": 0.001, + "loss": 2.4636, + "step": 9067 + }, + { + "epoch": 0.3836195955664608, + "grad_norm": 0.21412113308906555, + "learning_rate": 0.001, + "loss": 2.0917, + "step": 9068 + }, + { + "epoch": 0.38366190032997716, + "grad_norm": 0.23029856383800507, + "learning_rate": 0.001, + "loss": 1.8907, + "step": 9069 + }, + { + "epoch": 0.3837042050934935, + "grad_norm": 1.920670509338379, + "learning_rate": 0.001, + "loss": 2.3119, + "step": 9070 + }, + { + "epoch": 0.3837465098570099, + "grad_norm": 0.18355774879455566, + "learning_rate": 0.001, + "loss": 2.456, + "step": 9071 + }, + { + "epoch": 0.3837888146205263, + "grad_norm": 3.794325351715088, + "learning_rate": 0.001, + "loss": 2.9978, + "step": 9072 + }, + { + "epoch": 0.38383111938404263, + "grad_norm": 0.41664114594459534, + "learning_rate": 0.001, + "loss": 3.2798, + "step": 9073 + }, + { + "epoch": 0.38387342414755904, + "grad_norm": 5.11715030670166, + "learning_rate": 0.001, + "loss": 1.6814, + "step": 9074 + }, + { + "epoch": 0.3839157289110754, + "grad_norm": 0.2116970419883728, + "learning_rate": 0.001, + "loss": 2.3122, + "step": 9075 + }, + { + "epoch": 0.38395803367459175, + "grad_norm": 0.6905900239944458, + "learning_rate": 0.001, + "loss": 2.5548, + "step": 9076 + }, + { + "epoch": 0.38400033843810816, + "grad_norm": 0.18607500195503235, + "learning_rate": 0.001, + "loss": 2.4136, + "step": 9077 + }, + { + "epoch": 0.3840426432016245, + "grad_norm": 0.18247579038143158, + "learning_rate": 0.001, + "loss": 1.8635, + "step": 9078 + }, + { + "epoch": 0.38408494796514087, + "grad_norm": 0.17803490161895752, + "learning_rate": 0.001, + "loss": 1.9766, + "step": 9079 + }, + { + "epoch": 0.3841272527286572, + "grad_norm": 0.898413896560669, + "learning_rate": 0.001, + "loss": 2.7229, + "step": 9080 + }, + { + "epoch": 0.38416955749217363, + "grad_norm": 0.25650283694267273, + "learning_rate": 0.001, + "loss": 2.6571, + "step": 9081 + }, + { + "epoch": 0.38421186225569, + "grad_norm": 0.18835589289665222, + "learning_rate": 0.001, + "loss": 2.0706, + "step": 9082 + }, + { + "epoch": 0.38425416701920634, + "grad_norm": 0.2816076874732971, + "learning_rate": 0.001, + "loss": 3.3339, + "step": 9083 + }, + { + "epoch": 0.38429647178272275, + "grad_norm": 0.22398167848587036, + "learning_rate": 0.001, + "loss": 2.4135, + "step": 9084 + }, + { + "epoch": 0.3843387765462391, + "grad_norm": 0.906223714351654, + "learning_rate": 0.001, + "loss": 2.5593, + "step": 9085 + }, + { + "epoch": 0.38438108130975546, + "grad_norm": 0.3350967764854431, + "learning_rate": 0.001, + "loss": 2.6067, + "step": 9086 + }, + { + "epoch": 0.38442338607327187, + "grad_norm": 0.21072322130203247, + "learning_rate": 0.001, + "loss": 2.4582, + "step": 9087 + }, + { + "epoch": 0.3844656908367882, + "grad_norm": 0.26963114738464355, + "learning_rate": 0.001, + "loss": 2.1451, + "step": 9088 + }, + { + "epoch": 0.3845079956003046, + "grad_norm": 0.1972511261701584, + "learning_rate": 0.001, + "loss": 2.2775, + "step": 9089 + }, + { + "epoch": 0.384550300363821, + "grad_norm": 0.19861550629138947, + "learning_rate": 0.001, + "loss": 2.1255, + "step": 9090 + }, + { + "epoch": 0.38459260512733734, + "grad_norm": 0.22689582407474518, + "learning_rate": 0.001, + "loss": 1.9905, + "step": 9091 + }, + { + "epoch": 0.3846349098908537, + "grad_norm": 0.5211851596832275, + "learning_rate": 0.001, + "loss": 1.9154, + "step": 9092 + }, + { + "epoch": 0.3846772146543701, + "grad_norm": 0.20935285091400146, + "learning_rate": 0.001, + "loss": 2.0418, + "step": 9093 + }, + { + "epoch": 0.38471951941788646, + "grad_norm": 0.20221330225467682, + "learning_rate": 0.001, + "loss": 1.9986, + "step": 9094 + }, + { + "epoch": 0.3847618241814028, + "grad_norm": 0.22481901943683624, + "learning_rate": 0.001, + "loss": 2.7436, + "step": 9095 + }, + { + "epoch": 0.3848041289449192, + "grad_norm": 0.20071500539779663, + "learning_rate": 0.001, + "loss": 2.0842, + "step": 9096 + }, + { + "epoch": 0.3848464337084356, + "grad_norm": 1.3057656288146973, + "learning_rate": 0.001, + "loss": 2.072, + "step": 9097 + }, + { + "epoch": 0.38488873847195193, + "grad_norm": 0.1865609586238861, + "learning_rate": 0.001, + "loss": 1.8298, + "step": 9098 + }, + { + "epoch": 0.38493104323546834, + "grad_norm": 0.3449401557445526, + "learning_rate": 0.001, + "loss": 3.089, + "step": 9099 + }, + { + "epoch": 0.3849733479989847, + "grad_norm": 2.8522613048553467, + "learning_rate": 0.001, + "loss": 2.6214, + "step": 9100 + }, + { + "epoch": 0.38501565276250105, + "grad_norm": 4.267495632171631, + "learning_rate": 0.001, + "loss": 2.3361, + "step": 9101 + }, + { + "epoch": 0.3850579575260174, + "grad_norm": 0.6872550249099731, + "learning_rate": 0.001, + "loss": 2.6345, + "step": 9102 + }, + { + "epoch": 0.3851002622895338, + "grad_norm": 0.21168558299541473, + "learning_rate": 0.001, + "loss": 2.5256, + "step": 9103 + }, + { + "epoch": 0.38514256705305017, + "grad_norm": 0.2972700297832489, + "learning_rate": 0.001, + "loss": 3.0594, + "step": 9104 + }, + { + "epoch": 0.3851848718165665, + "grad_norm": 0.42669767141342163, + "learning_rate": 0.001, + "loss": 2.4855, + "step": 9105 + }, + { + "epoch": 0.38522717658008293, + "grad_norm": 4.5662126541137695, + "learning_rate": 0.001, + "loss": 3.0319, + "step": 9106 + }, + { + "epoch": 0.3852694813435993, + "grad_norm": 3.4174654483795166, + "learning_rate": 0.001, + "loss": 1.9545, + "step": 9107 + }, + { + "epoch": 0.38531178610711564, + "grad_norm": 0.5192535519599915, + "learning_rate": 0.001, + "loss": 2.3336, + "step": 9108 + }, + { + "epoch": 0.38535409087063205, + "grad_norm": 1.6883811950683594, + "learning_rate": 0.001, + "loss": 2.6045, + "step": 9109 + }, + { + "epoch": 0.3853963956341484, + "grad_norm": 0.24284732341766357, + "learning_rate": 0.001, + "loss": 2.2217, + "step": 9110 + }, + { + "epoch": 0.38543870039766476, + "grad_norm": 0.2053961157798767, + "learning_rate": 0.001, + "loss": 2.6769, + "step": 9111 + }, + { + "epoch": 0.38548100516118117, + "grad_norm": 0.32398176193237305, + "learning_rate": 0.001, + "loss": 2.1218, + "step": 9112 + }, + { + "epoch": 0.3855233099246975, + "grad_norm": 0.1831725686788559, + "learning_rate": 0.001, + "loss": 1.6788, + "step": 9113 + }, + { + "epoch": 0.3855656146882139, + "grad_norm": 0.4239247739315033, + "learning_rate": 0.001, + "loss": 3.2968, + "step": 9114 + }, + { + "epoch": 0.3856079194517303, + "grad_norm": 0.20434336364269257, + "learning_rate": 0.001, + "loss": 1.5334, + "step": 9115 + }, + { + "epoch": 0.38565022421524664, + "grad_norm": 0.18700295686721802, + "learning_rate": 0.001, + "loss": 2.1835, + "step": 9116 + }, + { + "epoch": 0.385692528978763, + "grad_norm": 0.33566123247146606, + "learning_rate": 0.001, + "loss": 2.4534, + "step": 9117 + }, + { + "epoch": 0.3857348337422794, + "grad_norm": 0.4300113022327423, + "learning_rate": 0.001, + "loss": 2.7559, + "step": 9118 + }, + { + "epoch": 0.38577713850579576, + "grad_norm": 0.21322444081306458, + "learning_rate": 0.001, + "loss": 1.8912, + "step": 9119 + }, + { + "epoch": 0.3858194432693121, + "grad_norm": 0.25216782093048096, + "learning_rate": 0.001, + "loss": 2.3039, + "step": 9120 + }, + { + "epoch": 0.3858617480328285, + "grad_norm": 0.3035780191421509, + "learning_rate": 0.001, + "loss": 2.1976, + "step": 9121 + }, + { + "epoch": 0.3859040527963449, + "grad_norm": 0.23805847764015198, + "learning_rate": 0.001, + "loss": 2.2747, + "step": 9122 + }, + { + "epoch": 0.38594635755986123, + "grad_norm": 4.429523944854736, + "learning_rate": 0.001, + "loss": 3.2502, + "step": 9123 + }, + { + "epoch": 0.3859886623233776, + "grad_norm": 0.2643257677555084, + "learning_rate": 0.001, + "loss": 3.0433, + "step": 9124 + }, + { + "epoch": 0.386030967086894, + "grad_norm": 0.1765228807926178, + "learning_rate": 0.001, + "loss": 1.8067, + "step": 9125 + }, + { + "epoch": 0.38607327185041035, + "grad_norm": 2.101315975189209, + "learning_rate": 0.001, + "loss": 2.885, + "step": 9126 + }, + { + "epoch": 0.3861155766139267, + "grad_norm": 0.20153525471687317, + "learning_rate": 0.001, + "loss": 1.8812, + "step": 9127 + }, + { + "epoch": 0.3861578813774431, + "grad_norm": 0.2016594409942627, + "learning_rate": 0.001, + "loss": 1.6079, + "step": 9128 + }, + { + "epoch": 0.38620018614095947, + "grad_norm": 0.23051926493644714, + "learning_rate": 0.001, + "loss": 2.3102, + "step": 9129 + }, + { + "epoch": 0.3862424909044758, + "grad_norm": 0.20287853479385376, + "learning_rate": 0.001, + "loss": 2.5548, + "step": 9130 + }, + { + "epoch": 0.38628479566799223, + "grad_norm": 0.2215650975704193, + "learning_rate": 0.001, + "loss": 3.1022, + "step": 9131 + }, + { + "epoch": 0.3863271004315086, + "grad_norm": 14.681644439697266, + "learning_rate": 0.001, + "loss": 2.5631, + "step": 9132 + }, + { + "epoch": 0.38636940519502494, + "grad_norm": 0.21271252632141113, + "learning_rate": 0.001, + "loss": 1.8983, + "step": 9133 + }, + { + "epoch": 0.38641170995854135, + "grad_norm": 0.5988287329673767, + "learning_rate": 0.001, + "loss": 2.0374, + "step": 9134 + }, + { + "epoch": 0.3864540147220577, + "grad_norm": 0.5086121559143066, + "learning_rate": 0.001, + "loss": 2.1648, + "step": 9135 + }, + { + "epoch": 0.38649631948557406, + "grad_norm": 0.28043192625045776, + "learning_rate": 0.001, + "loss": 2.6912, + "step": 9136 + }, + { + "epoch": 0.38653862424909047, + "grad_norm": 1.1043363809585571, + "learning_rate": 0.001, + "loss": 2.9766, + "step": 9137 + }, + { + "epoch": 0.3865809290126068, + "grad_norm": 0.42361801862716675, + "learning_rate": 0.001, + "loss": 2.2055, + "step": 9138 + }, + { + "epoch": 0.3866232337761232, + "grad_norm": 0.7368301749229431, + "learning_rate": 0.001, + "loss": 3.197, + "step": 9139 + }, + { + "epoch": 0.3866655385396396, + "grad_norm": 0.4298476278781891, + "learning_rate": 0.001, + "loss": 2.3723, + "step": 9140 + }, + { + "epoch": 0.38670784330315594, + "grad_norm": 0.20373845100402832, + "learning_rate": 0.001, + "loss": 2.2679, + "step": 9141 + }, + { + "epoch": 0.3867501480666723, + "grad_norm": 0.18816867470741272, + "learning_rate": 0.001, + "loss": 2.195, + "step": 9142 + }, + { + "epoch": 0.3867924528301887, + "grad_norm": 0.24059052765369415, + "learning_rate": 0.001, + "loss": 2.0657, + "step": 9143 + }, + { + "epoch": 0.38683475759370506, + "grad_norm": 0.19671255350112915, + "learning_rate": 0.001, + "loss": 3.1763, + "step": 9144 + }, + { + "epoch": 0.3868770623572214, + "grad_norm": 0.5776639580726624, + "learning_rate": 0.001, + "loss": 1.9861, + "step": 9145 + }, + { + "epoch": 0.3869193671207378, + "grad_norm": 0.8698708415031433, + "learning_rate": 0.001, + "loss": 2.8534, + "step": 9146 + }, + { + "epoch": 0.3869616718842542, + "grad_norm": 4.098544597625732, + "learning_rate": 0.001, + "loss": 1.9432, + "step": 9147 + }, + { + "epoch": 0.38700397664777053, + "grad_norm": 0.20377150177955627, + "learning_rate": 0.001, + "loss": 2.981, + "step": 9148 + }, + { + "epoch": 0.3870462814112869, + "grad_norm": 0.17152920365333557, + "learning_rate": 0.001, + "loss": 1.976, + "step": 9149 + }, + { + "epoch": 0.3870885861748033, + "grad_norm": 0.19490063190460205, + "learning_rate": 0.001, + "loss": 2.7471, + "step": 9150 + }, + { + "epoch": 0.38713089093831965, + "grad_norm": 0.21407170593738556, + "learning_rate": 0.001, + "loss": 2.155, + "step": 9151 + }, + { + "epoch": 0.387173195701836, + "grad_norm": 2.212458610534668, + "learning_rate": 0.001, + "loss": 1.773, + "step": 9152 + }, + { + "epoch": 0.3872155004653524, + "grad_norm": 0.15391570329666138, + "learning_rate": 0.001, + "loss": 2.4875, + "step": 9153 + }, + { + "epoch": 0.38725780522886877, + "grad_norm": 0.15746274590492249, + "learning_rate": 0.001, + "loss": 2.0196, + "step": 9154 + }, + { + "epoch": 0.3873001099923851, + "grad_norm": 0.2026894986629486, + "learning_rate": 0.001, + "loss": 2.7989, + "step": 9155 + }, + { + "epoch": 0.38734241475590153, + "grad_norm": 0.1813724935054779, + "learning_rate": 0.001, + "loss": 2.718, + "step": 9156 + }, + { + "epoch": 0.3873847195194179, + "grad_norm": 0.17108535766601562, + "learning_rate": 0.001, + "loss": 1.7214, + "step": 9157 + }, + { + "epoch": 0.38742702428293424, + "grad_norm": 0.1782073825597763, + "learning_rate": 0.001, + "loss": 2.4506, + "step": 9158 + }, + { + "epoch": 0.38746932904645065, + "grad_norm": 0.46579709649086, + "learning_rate": 0.001, + "loss": 1.7336, + "step": 9159 + }, + { + "epoch": 0.387511633809967, + "grad_norm": 0.19454286992549896, + "learning_rate": 0.001, + "loss": 1.7805, + "step": 9160 + }, + { + "epoch": 0.38755393857348336, + "grad_norm": 0.18374571204185486, + "learning_rate": 0.001, + "loss": 1.6501, + "step": 9161 + }, + { + "epoch": 0.38759624333699977, + "grad_norm": 0.23796077072620392, + "learning_rate": 0.001, + "loss": 2.6949, + "step": 9162 + }, + { + "epoch": 0.3876385481005161, + "grad_norm": 0.3394874036312103, + "learning_rate": 0.001, + "loss": 3.0172, + "step": 9163 + }, + { + "epoch": 0.3876808528640325, + "grad_norm": 0.20546482503414154, + "learning_rate": 0.001, + "loss": 1.9047, + "step": 9164 + }, + { + "epoch": 0.3877231576275489, + "grad_norm": 0.23391731083393097, + "learning_rate": 0.001, + "loss": 2.7176, + "step": 9165 + }, + { + "epoch": 0.38776546239106524, + "grad_norm": 0.2764662206172943, + "learning_rate": 0.001, + "loss": 2.2979, + "step": 9166 + }, + { + "epoch": 0.3878077671545816, + "grad_norm": 0.17653588950634003, + "learning_rate": 0.001, + "loss": 2.3402, + "step": 9167 + }, + { + "epoch": 0.387850071918098, + "grad_norm": 0.4383090138435364, + "learning_rate": 0.001, + "loss": 2.0904, + "step": 9168 + }, + { + "epoch": 0.38789237668161436, + "grad_norm": 0.20846538245677948, + "learning_rate": 0.001, + "loss": 3.2569, + "step": 9169 + }, + { + "epoch": 0.3879346814451307, + "grad_norm": 0.6603441834449768, + "learning_rate": 0.001, + "loss": 2.309, + "step": 9170 + }, + { + "epoch": 0.38797698620864707, + "grad_norm": 0.18814191222190857, + "learning_rate": 0.001, + "loss": 1.4408, + "step": 9171 + }, + { + "epoch": 0.3880192909721635, + "grad_norm": 3.1111462116241455, + "learning_rate": 0.001, + "loss": 2.6072, + "step": 9172 + }, + { + "epoch": 0.38806159573567983, + "grad_norm": 0.2014855295419693, + "learning_rate": 0.001, + "loss": 1.6812, + "step": 9173 + }, + { + "epoch": 0.3881039004991962, + "grad_norm": 0.24328027665615082, + "learning_rate": 0.001, + "loss": 2.088, + "step": 9174 + }, + { + "epoch": 0.3881462052627126, + "grad_norm": 0.2333800196647644, + "learning_rate": 0.001, + "loss": 2.3285, + "step": 9175 + }, + { + "epoch": 0.38818851002622895, + "grad_norm": 0.21624669432640076, + "learning_rate": 0.001, + "loss": 2.7369, + "step": 9176 + }, + { + "epoch": 0.3882308147897453, + "grad_norm": 0.21028754115104675, + "learning_rate": 0.001, + "loss": 2.5539, + "step": 9177 + }, + { + "epoch": 0.3882731195532617, + "grad_norm": 0.26059690117836, + "learning_rate": 0.001, + "loss": 2.5894, + "step": 9178 + }, + { + "epoch": 0.38831542431677807, + "grad_norm": 0.20622827112674713, + "learning_rate": 0.001, + "loss": 2.2577, + "step": 9179 + }, + { + "epoch": 0.3883577290802944, + "grad_norm": 0.2566535472869873, + "learning_rate": 0.001, + "loss": 2.6067, + "step": 9180 + }, + { + "epoch": 0.38840003384381083, + "grad_norm": 0.1649394929409027, + "learning_rate": 0.001, + "loss": 1.9004, + "step": 9181 + }, + { + "epoch": 0.3884423386073272, + "grad_norm": 0.211260125041008, + "learning_rate": 0.001, + "loss": 2.304, + "step": 9182 + }, + { + "epoch": 0.38848464337084354, + "grad_norm": 5.116180896759033, + "learning_rate": 0.001, + "loss": 2.774, + "step": 9183 + }, + { + "epoch": 0.38852694813435995, + "grad_norm": 0.3661218583583832, + "learning_rate": 0.001, + "loss": 1.7813, + "step": 9184 + }, + { + "epoch": 0.3885692528978763, + "grad_norm": 0.2053448110818863, + "learning_rate": 0.001, + "loss": 2.1452, + "step": 9185 + }, + { + "epoch": 0.38861155766139266, + "grad_norm": 0.16580262780189514, + "learning_rate": 0.001, + "loss": 1.8883, + "step": 9186 + }, + { + "epoch": 0.38865386242490907, + "grad_norm": 0.1831020563840866, + "learning_rate": 0.001, + "loss": 2.1391, + "step": 9187 + }, + { + "epoch": 0.3886961671884254, + "grad_norm": 1.015125036239624, + "learning_rate": 0.001, + "loss": 1.8072, + "step": 9188 + }, + { + "epoch": 0.3887384719519418, + "grad_norm": 0.22336459159851074, + "learning_rate": 0.001, + "loss": 2.1674, + "step": 9189 + }, + { + "epoch": 0.3887807767154582, + "grad_norm": 0.20443737506866455, + "learning_rate": 0.001, + "loss": 2.1295, + "step": 9190 + }, + { + "epoch": 0.38882308147897454, + "grad_norm": 0.2641700208187103, + "learning_rate": 0.001, + "loss": 1.5808, + "step": 9191 + }, + { + "epoch": 0.3888653862424909, + "grad_norm": 0.5966519117355347, + "learning_rate": 0.001, + "loss": 1.7853, + "step": 9192 + }, + { + "epoch": 0.38890769100600725, + "grad_norm": 0.6837995648384094, + "learning_rate": 0.001, + "loss": 2.0421, + "step": 9193 + }, + { + "epoch": 0.38894999576952366, + "grad_norm": 0.20334653556346893, + "learning_rate": 0.001, + "loss": 1.995, + "step": 9194 + }, + { + "epoch": 0.38899230053304, + "grad_norm": 1.7629520893096924, + "learning_rate": 0.001, + "loss": 1.6059, + "step": 9195 + }, + { + "epoch": 0.38903460529655637, + "grad_norm": 0.4265093505382538, + "learning_rate": 0.001, + "loss": 2.4376, + "step": 9196 + }, + { + "epoch": 0.3890769100600728, + "grad_norm": 0.47653642296791077, + "learning_rate": 0.001, + "loss": 2.4451, + "step": 9197 + }, + { + "epoch": 0.38911921482358913, + "grad_norm": 0.43369871377944946, + "learning_rate": 0.001, + "loss": 1.8053, + "step": 9198 + }, + { + "epoch": 0.3891615195871055, + "grad_norm": 0.19080866873264313, + "learning_rate": 0.001, + "loss": 1.9405, + "step": 9199 + }, + { + "epoch": 0.3892038243506219, + "grad_norm": 0.22995217144489288, + "learning_rate": 0.001, + "loss": 2.9559, + "step": 9200 + }, + { + "epoch": 0.38924612911413825, + "grad_norm": 0.21692658960819244, + "learning_rate": 0.001, + "loss": 3.0938, + "step": 9201 + }, + { + "epoch": 0.3892884338776546, + "grad_norm": 0.6913968920707703, + "learning_rate": 0.001, + "loss": 2.2379, + "step": 9202 + }, + { + "epoch": 0.389330738641171, + "grad_norm": 0.34403377771377563, + "learning_rate": 0.001, + "loss": 2.1844, + "step": 9203 + }, + { + "epoch": 0.38937304340468737, + "grad_norm": 1.5377286672592163, + "learning_rate": 0.001, + "loss": 3.3703, + "step": 9204 + }, + { + "epoch": 0.3894153481682037, + "grad_norm": 0.212476909160614, + "learning_rate": 0.001, + "loss": 2.0148, + "step": 9205 + }, + { + "epoch": 0.38945765293172013, + "grad_norm": 0.8234764933586121, + "learning_rate": 0.001, + "loss": 2.8544, + "step": 9206 + }, + { + "epoch": 0.3894999576952365, + "grad_norm": 0.1971426010131836, + "learning_rate": 0.001, + "loss": 2.0184, + "step": 9207 + }, + { + "epoch": 0.38954226245875284, + "grad_norm": 0.23782867193222046, + "learning_rate": 0.001, + "loss": 2.1397, + "step": 9208 + }, + { + "epoch": 0.38958456722226925, + "grad_norm": 0.7499661445617676, + "learning_rate": 0.001, + "loss": 1.8995, + "step": 9209 + }, + { + "epoch": 0.3896268719857856, + "grad_norm": 15.050933837890625, + "learning_rate": 0.001, + "loss": 1.6647, + "step": 9210 + }, + { + "epoch": 0.38966917674930196, + "grad_norm": 0.3999849259853363, + "learning_rate": 0.001, + "loss": 2.5121, + "step": 9211 + }, + { + "epoch": 0.38971148151281837, + "grad_norm": 0.35282769799232483, + "learning_rate": 0.001, + "loss": 1.9899, + "step": 9212 + }, + { + "epoch": 0.3897537862763347, + "grad_norm": 0.534419059753418, + "learning_rate": 0.001, + "loss": 2.1861, + "step": 9213 + }, + { + "epoch": 0.3897960910398511, + "grad_norm": 0.2646177411079407, + "learning_rate": 0.001, + "loss": 2.0317, + "step": 9214 + }, + { + "epoch": 0.38983839580336743, + "grad_norm": 0.20612642168998718, + "learning_rate": 0.001, + "loss": 1.835, + "step": 9215 + }, + { + "epoch": 0.38988070056688384, + "grad_norm": 0.25167348980903625, + "learning_rate": 0.001, + "loss": 2.4175, + "step": 9216 + }, + { + "epoch": 0.3899230053304002, + "grad_norm": 0.21403273940086365, + "learning_rate": 0.001, + "loss": 2.954, + "step": 9217 + }, + { + "epoch": 0.38996531009391655, + "grad_norm": 0.17985409498214722, + "learning_rate": 0.001, + "loss": 1.8319, + "step": 9218 + }, + { + "epoch": 0.39000761485743296, + "grad_norm": 2.4308388233184814, + "learning_rate": 0.001, + "loss": 2.0903, + "step": 9219 + }, + { + "epoch": 0.3900499196209493, + "grad_norm": 0.2107587307691574, + "learning_rate": 0.001, + "loss": 2.7956, + "step": 9220 + }, + { + "epoch": 0.39009222438446567, + "grad_norm": 1.1186227798461914, + "learning_rate": 0.001, + "loss": 2.1013, + "step": 9221 + }, + { + "epoch": 0.3901345291479821, + "grad_norm": 0.2170724719762802, + "learning_rate": 0.001, + "loss": 2.1852, + "step": 9222 + }, + { + "epoch": 0.39017683391149843, + "grad_norm": 0.1864577829837799, + "learning_rate": 0.001, + "loss": 1.4238, + "step": 9223 + }, + { + "epoch": 0.3902191386750148, + "grad_norm": 0.23575836420059204, + "learning_rate": 0.001, + "loss": 2.6753, + "step": 9224 + }, + { + "epoch": 0.3902614434385312, + "grad_norm": 0.21837228536605835, + "learning_rate": 0.001, + "loss": 2.4262, + "step": 9225 + }, + { + "epoch": 0.39030374820204755, + "grad_norm": 4.643385410308838, + "learning_rate": 0.001, + "loss": 2.1914, + "step": 9226 + }, + { + "epoch": 0.3903460529655639, + "grad_norm": 0.25850534439086914, + "learning_rate": 0.001, + "loss": 2.254, + "step": 9227 + }, + { + "epoch": 0.3903883577290803, + "grad_norm": 0.4413652718067169, + "learning_rate": 0.001, + "loss": 4.012, + "step": 9228 + }, + { + "epoch": 0.39043066249259667, + "grad_norm": 0.20437230169773102, + "learning_rate": 0.001, + "loss": 2.7935, + "step": 9229 + }, + { + "epoch": 0.390472967256113, + "grad_norm": 0.2475084662437439, + "learning_rate": 0.001, + "loss": 2.2118, + "step": 9230 + }, + { + "epoch": 0.39051527201962943, + "grad_norm": 0.7936202883720398, + "learning_rate": 0.001, + "loss": 2.0084, + "step": 9231 + }, + { + "epoch": 0.3905575767831458, + "grad_norm": 0.21864911913871765, + "learning_rate": 0.001, + "loss": 2.0875, + "step": 9232 + }, + { + "epoch": 0.39059988154666214, + "grad_norm": 0.25647038221359253, + "learning_rate": 0.001, + "loss": 2.1307, + "step": 9233 + }, + { + "epoch": 0.39064218631017855, + "grad_norm": 1.2627050876617432, + "learning_rate": 0.001, + "loss": 1.862, + "step": 9234 + }, + { + "epoch": 0.3906844910736949, + "grad_norm": 0.18225641548633575, + "learning_rate": 0.001, + "loss": 2.1382, + "step": 9235 + }, + { + "epoch": 0.39072679583721126, + "grad_norm": 0.1784055531024933, + "learning_rate": 0.001, + "loss": 2.4409, + "step": 9236 + }, + { + "epoch": 0.3907691006007276, + "grad_norm": 1.4898735284805298, + "learning_rate": 0.001, + "loss": 2.1492, + "step": 9237 + }, + { + "epoch": 0.390811405364244, + "grad_norm": 5.816760540008545, + "learning_rate": 0.001, + "loss": 2.0967, + "step": 9238 + }, + { + "epoch": 0.3908537101277604, + "grad_norm": 0.18301312625408173, + "learning_rate": 0.001, + "loss": 1.8928, + "step": 9239 + }, + { + "epoch": 0.39089601489127673, + "grad_norm": 0.21848469972610474, + "learning_rate": 0.001, + "loss": 2.2617, + "step": 9240 + }, + { + "epoch": 0.39093831965479314, + "grad_norm": 0.23523452877998352, + "learning_rate": 0.001, + "loss": 1.8523, + "step": 9241 + }, + { + "epoch": 0.3909806244183095, + "grad_norm": 0.28411930799484253, + "learning_rate": 0.001, + "loss": 2.5036, + "step": 9242 + }, + { + "epoch": 0.39102292918182585, + "grad_norm": 0.39887842535972595, + "learning_rate": 0.001, + "loss": 3.1872, + "step": 9243 + }, + { + "epoch": 0.39106523394534226, + "grad_norm": 0.26545318961143494, + "learning_rate": 0.001, + "loss": 2.1511, + "step": 9244 + }, + { + "epoch": 0.3911075387088586, + "grad_norm": 0.3136458992958069, + "learning_rate": 0.001, + "loss": 2.8136, + "step": 9245 + }, + { + "epoch": 0.39114984347237497, + "grad_norm": 0.2994268238544464, + "learning_rate": 0.001, + "loss": 2.0385, + "step": 9246 + }, + { + "epoch": 0.3911921482358914, + "grad_norm": 0.21520791947841644, + "learning_rate": 0.001, + "loss": 3.8089, + "step": 9247 + }, + { + "epoch": 0.39123445299940773, + "grad_norm": 0.18887737393379211, + "learning_rate": 0.001, + "loss": 1.6818, + "step": 9248 + }, + { + "epoch": 0.3912767577629241, + "grad_norm": 0.23042213916778564, + "learning_rate": 0.001, + "loss": 2.0132, + "step": 9249 + }, + { + "epoch": 0.3913190625264405, + "grad_norm": 8.303719520568848, + "learning_rate": 0.001, + "loss": 1.782, + "step": 9250 + }, + { + "epoch": 0.39136136728995685, + "grad_norm": 0.25407400727272034, + "learning_rate": 0.001, + "loss": 3.071, + "step": 9251 + }, + { + "epoch": 0.3914036720534732, + "grad_norm": 0.19759856164455414, + "learning_rate": 0.001, + "loss": 1.8951, + "step": 9252 + }, + { + "epoch": 0.3914459768169896, + "grad_norm": 0.3458765745162964, + "learning_rate": 0.001, + "loss": 3.0209, + "step": 9253 + }, + { + "epoch": 0.39148828158050597, + "grad_norm": 1.5416405200958252, + "learning_rate": 0.001, + "loss": 2.7139, + "step": 9254 + }, + { + "epoch": 0.3915305863440223, + "grad_norm": 0.21925224363803864, + "learning_rate": 0.001, + "loss": 2.1197, + "step": 9255 + }, + { + "epoch": 0.39157289110753873, + "grad_norm": 0.6360104084014893, + "learning_rate": 0.001, + "loss": 1.9278, + "step": 9256 + }, + { + "epoch": 0.3916151958710551, + "grad_norm": 0.17912402749061584, + "learning_rate": 0.001, + "loss": 2.8046, + "step": 9257 + }, + { + "epoch": 0.39165750063457144, + "grad_norm": 0.22755807638168335, + "learning_rate": 0.001, + "loss": 2.0692, + "step": 9258 + }, + { + "epoch": 0.3916998053980878, + "grad_norm": 0.3106396794319153, + "learning_rate": 0.001, + "loss": 2.5425, + "step": 9259 + }, + { + "epoch": 0.3917421101616042, + "grad_norm": 1.9510068893432617, + "learning_rate": 0.001, + "loss": 1.7647, + "step": 9260 + }, + { + "epoch": 0.39178441492512056, + "grad_norm": 0.20490862429141998, + "learning_rate": 0.001, + "loss": 2.5582, + "step": 9261 + }, + { + "epoch": 0.3918267196886369, + "grad_norm": 0.18178889155387878, + "learning_rate": 0.001, + "loss": 2.8619, + "step": 9262 + }, + { + "epoch": 0.3918690244521533, + "grad_norm": 0.22644934058189392, + "learning_rate": 0.001, + "loss": 2.4013, + "step": 9263 + }, + { + "epoch": 0.3919113292156697, + "grad_norm": 0.2479361891746521, + "learning_rate": 0.001, + "loss": 2.4039, + "step": 9264 + }, + { + "epoch": 0.39195363397918603, + "grad_norm": 0.20470456779003143, + "learning_rate": 0.001, + "loss": 2.086, + "step": 9265 + }, + { + "epoch": 0.39199593874270244, + "grad_norm": 0.3624879717826843, + "learning_rate": 0.001, + "loss": 1.9208, + "step": 9266 + }, + { + "epoch": 0.3920382435062188, + "grad_norm": 0.3792955279350281, + "learning_rate": 0.001, + "loss": 2.912, + "step": 9267 + }, + { + "epoch": 0.39208054826973515, + "grad_norm": 0.3835373818874359, + "learning_rate": 0.001, + "loss": 3.1838, + "step": 9268 + }, + { + "epoch": 0.39212285303325156, + "grad_norm": 0.25581711530685425, + "learning_rate": 0.001, + "loss": 2.0758, + "step": 9269 + }, + { + "epoch": 0.3921651577967679, + "grad_norm": 0.20404990017414093, + "learning_rate": 0.001, + "loss": 1.8675, + "step": 9270 + }, + { + "epoch": 0.39220746256028427, + "grad_norm": 0.22453588247299194, + "learning_rate": 0.001, + "loss": 2.3676, + "step": 9271 + }, + { + "epoch": 0.3922497673238007, + "grad_norm": 0.2473563551902771, + "learning_rate": 0.001, + "loss": 2.179, + "step": 9272 + }, + { + "epoch": 0.39229207208731703, + "grad_norm": 0.24559181928634644, + "learning_rate": 0.001, + "loss": 3.4248, + "step": 9273 + }, + { + "epoch": 0.3923343768508334, + "grad_norm": 0.2277313470840454, + "learning_rate": 0.001, + "loss": 2.0933, + "step": 9274 + }, + { + "epoch": 0.3923766816143498, + "grad_norm": 0.19083429872989655, + "learning_rate": 0.001, + "loss": 2.4522, + "step": 9275 + }, + { + "epoch": 0.39241898637786615, + "grad_norm": 0.2212873101234436, + "learning_rate": 0.001, + "loss": 2.3474, + "step": 9276 + }, + { + "epoch": 0.3924612911413825, + "grad_norm": 0.1985638290643692, + "learning_rate": 0.001, + "loss": 2.283, + "step": 9277 + }, + { + "epoch": 0.3925035959048989, + "grad_norm": 1.7838554382324219, + "learning_rate": 0.001, + "loss": 2.1613, + "step": 9278 + }, + { + "epoch": 0.39254590066841527, + "grad_norm": 0.4087802469730377, + "learning_rate": 0.001, + "loss": 1.7939, + "step": 9279 + }, + { + "epoch": 0.3925882054319316, + "grad_norm": 0.17536307871341705, + "learning_rate": 0.001, + "loss": 1.9135, + "step": 9280 + }, + { + "epoch": 0.39263051019544803, + "grad_norm": 0.6189838647842407, + "learning_rate": 0.001, + "loss": 3.2527, + "step": 9281 + }, + { + "epoch": 0.3926728149589644, + "grad_norm": 0.2154485583305359, + "learning_rate": 0.001, + "loss": 1.8292, + "step": 9282 + }, + { + "epoch": 0.39271511972248074, + "grad_norm": 0.3119647204875946, + "learning_rate": 0.001, + "loss": 3.5083, + "step": 9283 + }, + { + "epoch": 0.3927574244859971, + "grad_norm": 0.28993380069732666, + "learning_rate": 0.001, + "loss": 3.3037, + "step": 9284 + }, + { + "epoch": 0.3927997292495135, + "grad_norm": 0.2555803954601288, + "learning_rate": 0.001, + "loss": 3.6532, + "step": 9285 + }, + { + "epoch": 0.39284203401302986, + "grad_norm": 0.22608381509780884, + "learning_rate": 0.001, + "loss": 2.7748, + "step": 9286 + }, + { + "epoch": 0.3928843387765462, + "grad_norm": 0.2289711982011795, + "learning_rate": 0.001, + "loss": 2.0334, + "step": 9287 + }, + { + "epoch": 0.3929266435400626, + "grad_norm": 0.7672654390335083, + "learning_rate": 0.001, + "loss": 1.7421, + "step": 9288 + }, + { + "epoch": 0.392968948303579, + "grad_norm": 0.210458442568779, + "learning_rate": 0.001, + "loss": 2.2449, + "step": 9289 + }, + { + "epoch": 0.39301125306709533, + "grad_norm": 0.4783534109592438, + "learning_rate": 0.001, + "loss": 1.7884, + "step": 9290 + }, + { + "epoch": 0.39305355783061174, + "grad_norm": 0.16665317118167877, + "learning_rate": 0.001, + "loss": 1.8335, + "step": 9291 + }, + { + "epoch": 0.3930958625941281, + "grad_norm": 0.6642040610313416, + "learning_rate": 0.001, + "loss": 2.3598, + "step": 9292 + }, + { + "epoch": 0.39313816735764445, + "grad_norm": 0.16919787228107452, + "learning_rate": 0.001, + "loss": 2.0728, + "step": 9293 + }, + { + "epoch": 0.39318047212116086, + "grad_norm": 0.17005786299705505, + "learning_rate": 0.001, + "loss": 1.8163, + "step": 9294 + }, + { + "epoch": 0.3932227768846772, + "grad_norm": 0.29996180534362793, + "learning_rate": 0.001, + "loss": 2.3236, + "step": 9295 + }, + { + "epoch": 0.39326508164819357, + "grad_norm": 0.19820648431777954, + "learning_rate": 0.001, + "loss": 2.0907, + "step": 9296 + }, + { + "epoch": 0.39330738641171, + "grad_norm": 1.3484530448913574, + "learning_rate": 0.001, + "loss": 1.6248, + "step": 9297 + }, + { + "epoch": 0.39334969117522633, + "grad_norm": 0.38515207171440125, + "learning_rate": 0.001, + "loss": 1.6151, + "step": 9298 + }, + { + "epoch": 0.3933919959387427, + "grad_norm": 0.2751673758029938, + "learning_rate": 0.001, + "loss": 1.8327, + "step": 9299 + }, + { + "epoch": 0.3934343007022591, + "grad_norm": 0.1867271363735199, + "learning_rate": 0.001, + "loss": 2.3518, + "step": 9300 + }, + { + "epoch": 0.39347660546577545, + "grad_norm": 1.881629467010498, + "learning_rate": 0.001, + "loss": 2.6431, + "step": 9301 + }, + { + "epoch": 0.3935189102292918, + "grad_norm": 0.1862645000219345, + "learning_rate": 0.001, + "loss": 2.3873, + "step": 9302 + }, + { + "epoch": 0.3935612149928082, + "grad_norm": 4.848597526550293, + "learning_rate": 0.001, + "loss": 2.0754, + "step": 9303 + }, + { + "epoch": 0.39360351975632457, + "grad_norm": 0.8090160489082336, + "learning_rate": 0.001, + "loss": 2.0008, + "step": 9304 + }, + { + "epoch": 0.3936458245198409, + "grad_norm": 60.91786575317383, + "learning_rate": 0.001, + "loss": 2.1857, + "step": 9305 + }, + { + "epoch": 0.3936881292833573, + "grad_norm": 0.5015795826911926, + "learning_rate": 0.001, + "loss": 2.5673, + "step": 9306 + }, + { + "epoch": 0.3937304340468737, + "grad_norm": 0.21446509659290314, + "learning_rate": 0.001, + "loss": 2.0362, + "step": 9307 + }, + { + "epoch": 0.39377273881039004, + "grad_norm": 1.1446090936660767, + "learning_rate": 0.001, + "loss": 1.639, + "step": 9308 + }, + { + "epoch": 0.3938150435739064, + "grad_norm": 0.2021520733833313, + "learning_rate": 0.001, + "loss": 3.2664, + "step": 9309 + }, + { + "epoch": 0.3938573483374228, + "grad_norm": 0.2120133638381958, + "learning_rate": 0.001, + "loss": 2.2566, + "step": 9310 + }, + { + "epoch": 0.39389965310093916, + "grad_norm": 0.226138174533844, + "learning_rate": 0.001, + "loss": 1.8226, + "step": 9311 + }, + { + "epoch": 0.3939419578644555, + "grad_norm": 4.0954694747924805, + "learning_rate": 0.001, + "loss": 2.0812, + "step": 9312 + }, + { + "epoch": 0.3939842626279719, + "grad_norm": 0.37310054898262024, + "learning_rate": 0.001, + "loss": 3.0503, + "step": 9313 + }, + { + "epoch": 0.3940265673914883, + "grad_norm": 0.6909289360046387, + "learning_rate": 0.001, + "loss": 2.0039, + "step": 9314 + }, + { + "epoch": 0.39406887215500463, + "grad_norm": 0.23392127454280853, + "learning_rate": 0.001, + "loss": 1.9528, + "step": 9315 + }, + { + "epoch": 0.39411117691852104, + "grad_norm": 1.0609430074691772, + "learning_rate": 0.001, + "loss": 2.6997, + "step": 9316 + }, + { + "epoch": 0.3941534816820374, + "grad_norm": 0.2690190374851227, + "learning_rate": 0.001, + "loss": 2.5265, + "step": 9317 + }, + { + "epoch": 0.39419578644555375, + "grad_norm": 0.19583603739738464, + "learning_rate": 0.001, + "loss": 2.9415, + "step": 9318 + }, + { + "epoch": 0.39423809120907016, + "grad_norm": 0.2094552218914032, + "learning_rate": 0.001, + "loss": 2.0034, + "step": 9319 + }, + { + "epoch": 0.3942803959725865, + "grad_norm": 0.33723655343055725, + "learning_rate": 0.001, + "loss": 1.8156, + "step": 9320 + }, + { + "epoch": 0.39432270073610287, + "grad_norm": 0.2463519275188446, + "learning_rate": 0.001, + "loss": 2.1634, + "step": 9321 + }, + { + "epoch": 0.3943650054996193, + "grad_norm": 2.194249153137207, + "learning_rate": 0.001, + "loss": 1.8768, + "step": 9322 + }, + { + "epoch": 0.39440731026313564, + "grad_norm": 0.2177824079990387, + "learning_rate": 0.001, + "loss": 1.7509, + "step": 9323 + }, + { + "epoch": 0.394449615026652, + "grad_norm": 0.2521611452102661, + "learning_rate": 0.001, + "loss": 2.2049, + "step": 9324 + }, + { + "epoch": 0.3944919197901684, + "grad_norm": 0.5440561175346375, + "learning_rate": 0.001, + "loss": 2.5381, + "step": 9325 + }, + { + "epoch": 0.39453422455368475, + "grad_norm": 0.22166214883327484, + "learning_rate": 0.001, + "loss": 1.8115, + "step": 9326 + }, + { + "epoch": 0.3945765293172011, + "grad_norm": 0.8539856672286987, + "learning_rate": 0.001, + "loss": 3.4899, + "step": 9327 + }, + { + "epoch": 0.39461883408071746, + "grad_norm": 0.34720534086227417, + "learning_rate": 0.001, + "loss": 1.8842, + "step": 9328 + }, + { + "epoch": 0.39466113884423387, + "grad_norm": 0.2507597804069519, + "learning_rate": 0.001, + "loss": 2.0993, + "step": 9329 + }, + { + "epoch": 0.3947034436077502, + "grad_norm": 17.17608642578125, + "learning_rate": 0.001, + "loss": 2.5217, + "step": 9330 + }, + { + "epoch": 0.3947457483712666, + "grad_norm": 0.21708784997463226, + "learning_rate": 0.001, + "loss": 3.1434, + "step": 9331 + }, + { + "epoch": 0.394788053134783, + "grad_norm": 8.15025520324707, + "learning_rate": 0.001, + "loss": 1.6658, + "step": 9332 + }, + { + "epoch": 0.39483035789829934, + "grad_norm": 0.1978793740272522, + "learning_rate": 0.001, + "loss": 1.6641, + "step": 9333 + }, + { + "epoch": 0.3948726626618157, + "grad_norm": 0.22561314702033997, + "learning_rate": 0.001, + "loss": 1.9712, + "step": 9334 + }, + { + "epoch": 0.3949149674253321, + "grad_norm": 8.708436012268066, + "learning_rate": 0.001, + "loss": 1.9502, + "step": 9335 + }, + { + "epoch": 0.39495727218884846, + "grad_norm": 0.2591575086116791, + "learning_rate": 0.001, + "loss": 2.4213, + "step": 9336 + }, + { + "epoch": 0.3949995769523648, + "grad_norm": 0.22696593403816223, + "learning_rate": 0.001, + "loss": 1.8231, + "step": 9337 + }, + { + "epoch": 0.3950418817158812, + "grad_norm": 0.4304518401622772, + "learning_rate": 0.001, + "loss": 1.6622, + "step": 9338 + }, + { + "epoch": 0.3950841864793976, + "grad_norm": 0.27557769417762756, + "learning_rate": 0.001, + "loss": 2.2719, + "step": 9339 + }, + { + "epoch": 0.39512649124291394, + "grad_norm": 0.2860799729824066, + "learning_rate": 0.001, + "loss": 1.9818, + "step": 9340 + }, + { + "epoch": 0.39516879600643035, + "grad_norm": 119.59538269042969, + "learning_rate": 0.001, + "loss": 1.7143, + "step": 9341 + }, + { + "epoch": 0.3952111007699467, + "grad_norm": 0.20414665341377258, + "learning_rate": 0.001, + "loss": 3.6217, + "step": 9342 + }, + { + "epoch": 0.39525340553346305, + "grad_norm": 0.2606525123119354, + "learning_rate": 0.001, + "loss": 2.0212, + "step": 9343 + }, + { + "epoch": 0.39529571029697946, + "grad_norm": 0.2075478881597519, + "learning_rate": 0.001, + "loss": 2.1644, + "step": 9344 + }, + { + "epoch": 0.3953380150604958, + "grad_norm": 0.24295449256896973, + "learning_rate": 0.001, + "loss": 2.2285, + "step": 9345 + }, + { + "epoch": 0.39538031982401217, + "grad_norm": 0.22193071246147156, + "learning_rate": 0.001, + "loss": 1.8047, + "step": 9346 + }, + { + "epoch": 0.3954226245875286, + "grad_norm": 0.21409432590007782, + "learning_rate": 0.001, + "loss": 2.1791, + "step": 9347 + }, + { + "epoch": 0.39546492935104494, + "grad_norm": 2.0014984607696533, + "learning_rate": 0.001, + "loss": 2.5374, + "step": 9348 + }, + { + "epoch": 0.3955072341145613, + "grad_norm": 5.059160232543945, + "learning_rate": 0.001, + "loss": 2.2516, + "step": 9349 + }, + { + "epoch": 0.39554953887807764, + "grad_norm": 0.1997808814048767, + "learning_rate": 0.001, + "loss": 2.2374, + "step": 9350 + }, + { + "epoch": 0.39559184364159405, + "grad_norm": 0.19376425445079803, + "learning_rate": 0.001, + "loss": 1.8633, + "step": 9351 + }, + { + "epoch": 0.3956341484051104, + "grad_norm": 0.8050264716148376, + "learning_rate": 0.001, + "loss": 2.5749, + "step": 9352 + }, + { + "epoch": 0.39567645316862676, + "grad_norm": 0.3786727786064148, + "learning_rate": 0.001, + "loss": 3.1153, + "step": 9353 + }, + { + "epoch": 0.3957187579321432, + "grad_norm": 0.2291092574596405, + "learning_rate": 0.001, + "loss": 1.8893, + "step": 9354 + }, + { + "epoch": 0.3957610626956595, + "grad_norm": 0.49110499024391174, + "learning_rate": 0.001, + "loss": 2.2142, + "step": 9355 + }, + { + "epoch": 0.3958033674591759, + "grad_norm": 0.18595391511917114, + "learning_rate": 0.001, + "loss": 1.9032, + "step": 9356 + }, + { + "epoch": 0.3958456722226923, + "grad_norm": 2.1453158855438232, + "learning_rate": 0.001, + "loss": 2.6647, + "step": 9357 + }, + { + "epoch": 0.39588797698620865, + "grad_norm": 0.33591386675834656, + "learning_rate": 0.001, + "loss": 2.0517, + "step": 9358 + }, + { + "epoch": 0.395930281749725, + "grad_norm": 1.8636958599090576, + "learning_rate": 0.001, + "loss": 2.7847, + "step": 9359 + }, + { + "epoch": 0.3959725865132414, + "grad_norm": 0.20229916274547577, + "learning_rate": 0.001, + "loss": 2.2288, + "step": 9360 + }, + { + "epoch": 0.39601489127675776, + "grad_norm": 0.2586257755756378, + "learning_rate": 0.001, + "loss": 1.8981, + "step": 9361 + }, + { + "epoch": 0.3960571960402741, + "grad_norm": 0.340988427400589, + "learning_rate": 0.001, + "loss": 2.7096, + "step": 9362 + }, + { + "epoch": 0.3960995008037905, + "grad_norm": 0.21785105764865875, + "learning_rate": 0.001, + "loss": 1.8415, + "step": 9363 + }, + { + "epoch": 0.3961418055673069, + "grad_norm": 0.7364193201065063, + "learning_rate": 0.001, + "loss": 2.9645, + "step": 9364 + }, + { + "epoch": 0.39618411033082324, + "grad_norm": 0.32688039541244507, + "learning_rate": 0.001, + "loss": 2.9784, + "step": 9365 + }, + { + "epoch": 0.39622641509433965, + "grad_norm": 0.2986684739589691, + "learning_rate": 0.001, + "loss": 3.2292, + "step": 9366 + }, + { + "epoch": 0.396268719857856, + "grad_norm": 2.457265853881836, + "learning_rate": 0.001, + "loss": 1.9641, + "step": 9367 + }, + { + "epoch": 0.39631102462137235, + "grad_norm": 0.28799575567245483, + "learning_rate": 0.001, + "loss": 2.2059, + "step": 9368 + }, + { + "epoch": 0.39635332938488876, + "grad_norm": 0.7766205668449402, + "learning_rate": 0.001, + "loss": 3.7114, + "step": 9369 + }, + { + "epoch": 0.3963956341484051, + "grad_norm": 0.7524096369743347, + "learning_rate": 0.001, + "loss": 2.6579, + "step": 9370 + }, + { + "epoch": 0.3964379389119215, + "grad_norm": 1.014512062072754, + "learning_rate": 0.001, + "loss": 2.8398, + "step": 9371 + }, + { + "epoch": 0.3964802436754378, + "grad_norm": 0.22580230236053467, + "learning_rate": 0.001, + "loss": 1.9793, + "step": 9372 + }, + { + "epoch": 0.39652254843895424, + "grad_norm": 0.24192282557487488, + "learning_rate": 0.001, + "loss": 2.111, + "step": 9373 + }, + { + "epoch": 0.3965648532024706, + "grad_norm": 1.9858895540237427, + "learning_rate": 0.001, + "loss": 2.8711, + "step": 9374 + }, + { + "epoch": 0.39660715796598695, + "grad_norm": 0.21744923293590546, + "learning_rate": 0.001, + "loss": 1.9478, + "step": 9375 + }, + { + "epoch": 0.39664946272950335, + "grad_norm": 1.180682897567749, + "learning_rate": 0.001, + "loss": 1.7087, + "step": 9376 + }, + { + "epoch": 0.3966917674930197, + "grad_norm": 0.7958380579948425, + "learning_rate": 0.001, + "loss": 3.7644, + "step": 9377 + }, + { + "epoch": 0.39673407225653606, + "grad_norm": 1.155543565750122, + "learning_rate": 0.001, + "loss": 2.2873, + "step": 9378 + }, + { + "epoch": 0.3967763770200525, + "grad_norm": 0.2220691293478012, + "learning_rate": 0.001, + "loss": 2.2881, + "step": 9379 + }, + { + "epoch": 0.3968186817835688, + "grad_norm": 2.1947951316833496, + "learning_rate": 0.001, + "loss": 2.9223, + "step": 9380 + }, + { + "epoch": 0.3968609865470852, + "grad_norm": 0.404043048620224, + "learning_rate": 0.001, + "loss": 2.0489, + "step": 9381 + }, + { + "epoch": 0.3969032913106016, + "grad_norm": 0.22369450330734253, + "learning_rate": 0.001, + "loss": 2.169, + "step": 9382 + }, + { + "epoch": 0.39694559607411795, + "grad_norm": 0.2130047082901001, + "learning_rate": 0.001, + "loss": 2.0125, + "step": 9383 + }, + { + "epoch": 0.3969879008376343, + "grad_norm": 0.42711809277534485, + "learning_rate": 0.001, + "loss": 2.6084, + "step": 9384 + }, + { + "epoch": 0.3970302056011507, + "grad_norm": 0.31725093722343445, + "learning_rate": 0.001, + "loss": 2.2298, + "step": 9385 + }, + { + "epoch": 0.39707251036466706, + "grad_norm": 0.2912001311779022, + "learning_rate": 0.001, + "loss": 2.0143, + "step": 9386 + }, + { + "epoch": 0.3971148151281834, + "grad_norm": 0.27698057889938354, + "learning_rate": 0.001, + "loss": 2.3999, + "step": 9387 + }, + { + "epoch": 0.39715711989169983, + "grad_norm": 0.7280207276344299, + "learning_rate": 0.001, + "loss": 2.1915, + "step": 9388 + }, + { + "epoch": 0.3971994246552162, + "grad_norm": 1.9393442869186401, + "learning_rate": 0.001, + "loss": 2.6316, + "step": 9389 + }, + { + "epoch": 0.39724172941873254, + "grad_norm": 0.20909947156906128, + "learning_rate": 0.001, + "loss": 2.5462, + "step": 9390 + }, + { + "epoch": 0.39728403418224895, + "grad_norm": 2.941432237625122, + "learning_rate": 0.001, + "loss": 1.9705, + "step": 9391 + }, + { + "epoch": 0.3973263389457653, + "grad_norm": 0.18532417714595795, + "learning_rate": 0.001, + "loss": 2.9728, + "step": 9392 + }, + { + "epoch": 0.39736864370928165, + "grad_norm": 0.1898314207792282, + "learning_rate": 0.001, + "loss": 2.2554, + "step": 9393 + }, + { + "epoch": 0.39741094847279806, + "grad_norm": 2.149635076522827, + "learning_rate": 0.001, + "loss": 2.0937, + "step": 9394 + }, + { + "epoch": 0.3974532532363144, + "grad_norm": 0.22059142589569092, + "learning_rate": 0.001, + "loss": 2.1177, + "step": 9395 + }, + { + "epoch": 0.3974955579998308, + "grad_norm": 0.38471415638923645, + "learning_rate": 0.001, + "loss": 1.9792, + "step": 9396 + }, + { + "epoch": 0.3975378627633471, + "grad_norm": 0.6079134941101074, + "learning_rate": 0.001, + "loss": 2.0545, + "step": 9397 + }, + { + "epoch": 0.39758016752686354, + "grad_norm": 0.5451988577842712, + "learning_rate": 0.001, + "loss": 2.1774, + "step": 9398 + }, + { + "epoch": 0.3976224722903799, + "grad_norm": 0.20046019554138184, + "learning_rate": 0.001, + "loss": 1.909, + "step": 9399 + }, + { + "epoch": 0.39766477705389625, + "grad_norm": 0.7716697454452515, + "learning_rate": 0.001, + "loss": 2.4313, + "step": 9400 + }, + { + "epoch": 0.39770708181741266, + "grad_norm": 0.3297424018383026, + "learning_rate": 0.001, + "loss": 2.0079, + "step": 9401 + }, + { + "epoch": 0.397749386580929, + "grad_norm": 0.1952991932630539, + "learning_rate": 0.001, + "loss": 2.1102, + "step": 9402 + }, + { + "epoch": 0.39779169134444536, + "grad_norm": 0.19148588180541992, + "learning_rate": 0.001, + "loss": 1.9686, + "step": 9403 + }, + { + "epoch": 0.3978339961079618, + "grad_norm": 0.2454746812582016, + "learning_rate": 0.001, + "loss": 2.5814, + "step": 9404 + }, + { + "epoch": 0.39787630087147813, + "grad_norm": 1.7941993474960327, + "learning_rate": 0.001, + "loss": 2.0305, + "step": 9405 + }, + { + "epoch": 0.3979186056349945, + "grad_norm": 15.885468482971191, + "learning_rate": 0.001, + "loss": 2.1969, + "step": 9406 + }, + { + "epoch": 0.3979609103985109, + "grad_norm": 0.22626948356628418, + "learning_rate": 0.001, + "loss": 2.3471, + "step": 9407 + }, + { + "epoch": 0.39800321516202725, + "grad_norm": 0.3605894446372986, + "learning_rate": 0.001, + "loss": 2.5423, + "step": 9408 + }, + { + "epoch": 0.3980455199255436, + "grad_norm": 0.19169077277183533, + "learning_rate": 0.001, + "loss": 1.6805, + "step": 9409 + }, + { + "epoch": 0.39808782468906, + "grad_norm": 0.32516393065452576, + "learning_rate": 0.001, + "loss": 2.1722, + "step": 9410 + }, + { + "epoch": 0.39813012945257636, + "grad_norm": 0.7686219215393066, + "learning_rate": 0.001, + "loss": 2.8895, + "step": 9411 + }, + { + "epoch": 0.3981724342160927, + "grad_norm": 0.2653716802597046, + "learning_rate": 0.001, + "loss": 1.8654, + "step": 9412 + }, + { + "epoch": 0.39821473897960913, + "grad_norm": 3.2883455753326416, + "learning_rate": 0.001, + "loss": 2.0769, + "step": 9413 + }, + { + "epoch": 0.3982570437431255, + "grad_norm": 10.234563827514648, + "learning_rate": 0.001, + "loss": 2.4105, + "step": 9414 + }, + { + "epoch": 0.39829934850664184, + "grad_norm": 0.2494221329689026, + "learning_rate": 0.001, + "loss": 1.6076, + "step": 9415 + }, + { + "epoch": 0.39834165327015825, + "grad_norm": 0.28424543142318726, + "learning_rate": 0.001, + "loss": 2.3292, + "step": 9416 + }, + { + "epoch": 0.3983839580336746, + "grad_norm": 0.23830546438694, + "learning_rate": 0.001, + "loss": 2.6482, + "step": 9417 + }, + { + "epoch": 0.39842626279719096, + "grad_norm": 3.2902987003326416, + "learning_rate": 0.001, + "loss": 2.8442, + "step": 9418 + }, + { + "epoch": 0.3984685675607073, + "grad_norm": 1.0926035642623901, + "learning_rate": 0.001, + "loss": 1.8991, + "step": 9419 + }, + { + "epoch": 0.3985108723242237, + "grad_norm": 0.6159302592277527, + "learning_rate": 0.001, + "loss": 3.9763, + "step": 9420 + }, + { + "epoch": 0.3985531770877401, + "grad_norm": 0.40966638922691345, + "learning_rate": 0.001, + "loss": 3.1978, + "step": 9421 + }, + { + "epoch": 0.39859548185125643, + "grad_norm": 1.2507386207580566, + "learning_rate": 0.001, + "loss": 2.0906, + "step": 9422 + }, + { + "epoch": 0.39863778661477284, + "grad_norm": 0.210091695189476, + "learning_rate": 0.001, + "loss": 2.0291, + "step": 9423 + }, + { + "epoch": 0.3986800913782892, + "grad_norm": 0.6132796406745911, + "learning_rate": 0.001, + "loss": 1.9906, + "step": 9424 + }, + { + "epoch": 0.39872239614180555, + "grad_norm": 0.20599402487277985, + "learning_rate": 0.001, + "loss": 1.9661, + "step": 9425 + }, + { + "epoch": 0.39876470090532196, + "grad_norm": 75.12084197998047, + "learning_rate": 0.001, + "loss": 2.2098, + "step": 9426 + }, + { + "epoch": 0.3988070056688383, + "grad_norm": 0.19137638807296753, + "learning_rate": 0.001, + "loss": 2.4808, + "step": 9427 + }, + { + "epoch": 0.39884931043235466, + "grad_norm": 0.4978329539299011, + "learning_rate": 0.001, + "loss": 2.0099, + "step": 9428 + }, + { + "epoch": 0.3988916151958711, + "grad_norm": 1.2876667976379395, + "learning_rate": 0.001, + "loss": 2.6079, + "step": 9429 + }, + { + "epoch": 0.39893391995938743, + "grad_norm": 8.302372932434082, + "learning_rate": 0.001, + "loss": 2.3667, + "step": 9430 + }, + { + "epoch": 0.3989762247229038, + "grad_norm": 0.2350839525461197, + "learning_rate": 0.001, + "loss": 2.1824, + "step": 9431 + }, + { + "epoch": 0.3990185294864202, + "grad_norm": 7.219992160797119, + "learning_rate": 0.001, + "loss": 2.0594, + "step": 9432 + }, + { + "epoch": 0.39906083424993655, + "grad_norm": 0.8567195534706116, + "learning_rate": 0.001, + "loss": 2.2847, + "step": 9433 + }, + { + "epoch": 0.3991031390134529, + "grad_norm": 0.2659953236579895, + "learning_rate": 0.001, + "loss": 2.0949, + "step": 9434 + }, + { + "epoch": 0.3991454437769693, + "grad_norm": 0.27466630935668945, + "learning_rate": 0.001, + "loss": 2.9983, + "step": 9435 + }, + { + "epoch": 0.39918774854048567, + "grad_norm": 0.22787296772003174, + "learning_rate": 0.001, + "loss": 3.1284, + "step": 9436 + }, + { + "epoch": 0.399230053304002, + "grad_norm": 0.6802979111671448, + "learning_rate": 0.001, + "loss": 2.2057, + "step": 9437 + }, + { + "epoch": 0.39927235806751843, + "grad_norm": 0.22264288365840912, + "learning_rate": 0.001, + "loss": 3.2098, + "step": 9438 + }, + { + "epoch": 0.3993146628310348, + "grad_norm": 0.20934641361236572, + "learning_rate": 0.001, + "loss": 2.5188, + "step": 9439 + }, + { + "epoch": 0.39935696759455114, + "grad_norm": 0.3321964144706726, + "learning_rate": 0.001, + "loss": 2.8836, + "step": 9440 + }, + { + "epoch": 0.3993992723580675, + "grad_norm": 0.7503458857536316, + "learning_rate": 0.001, + "loss": 2.3137, + "step": 9441 + }, + { + "epoch": 0.3994415771215839, + "grad_norm": 0.20252926647663116, + "learning_rate": 0.001, + "loss": 3.1602, + "step": 9442 + }, + { + "epoch": 0.39948388188510026, + "grad_norm": 0.2206176519393921, + "learning_rate": 0.001, + "loss": 4.0381, + "step": 9443 + }, + { + "epoch": 0.3995261866486166, + "grad_norm": 0.16592377424240112, + "learning_rate": 0.001, + "loss": 2.0148, + "step": 9444 + }, + { + "epoch": 0.399568491412133, + "grad_norm": 0.19829799234867096, + "learning_rate": 0.001, + "loss": 1.8411, + "step": 9445 + }, + { + "epoch": 0.3996107961756494, + "grad_norm": 0.18881183862686157, + "learning_rate": 0.001, + "loss": 1.9147, + "step": 9446 + }, + { + "epoch": 0.39965310093916573, + "grad_norm": 0.24857249855995178, + "learning_rate": 0.001, + "loss": 2.3787, + "step": 9447 + }, + { + "epoch": 0.39969540570268214, + "grad_norm": 0.2752115726470947, + "learning_rate": 0.001, + "loss": 1.9453, + "step": 9448 + }, + { + "epoch": 0.3997377104661985, + "grad_norm": 0.1992688775062561, + "learning_rate": 0.001, + "loss": 2.4652, + "step": 9449 + }, + { + "epoch": 0.39978001522971485, + "grad_norm": 0.20093099772930145, + "learning_rate": 0.001, + "loss": 1.8111, + "step": 9450 + }, + { + "epoch": 0.39982231999323126, + "grad_norm": 0.1518988311290741, + "learning_rate": 0.001, + "loss": 1.7827, + "step": 9451 + }, + { + "epoch": 0.3998646247567476, + "grad_norm": 0.20735451579093933, + "learning_rate": 0.001, + "loss": 2.1059, + "step": 9452 + }, + { + "epoch": 0.39990692952026397, + "grad_norm": 0.2454904466867447, + "learning_rate": 0.001, + "loss": 2.5582, + "step": 9453 + }, + { + "epoch": 0.3999492342837804, + "grad_norm": 0.8346079587936401, + "learning_rate": 0.001, + "loss": 3.135, + "step": 9454 + }, + { + "epoch": 0.39999153904729673, + "grad_norm": 0.1993485987186432, + "learning_rate": 0.001, + "loss": 2.1025, + "step": 9455 + }, + { + "epoch": 0.4000338438108131, + "grad_norm": 0.4179655909538269, + "learning_rate": 0.001, + "loss": 2.0312, + "step": 9456 + }, + { + "epoch": 0.4000761485743295, + "grad_norm": 0.17672595381736755, + "learning_rate": 0.001, + "loss": 2.5397, + "step": 9457 + }, + { + "epoch": 0.40011845333784585, + "grad_norm": 0.24849601089954376, + "learning_rate": 0.001, + "loss": 2.4992, + "step": 9458 + }, + { + "epoch": 0.4001607581013622, + "grad_norm": 0.2164924442768097, + "learning_rate": 0.001, + "loss": 2.0831, + "step": 9459 + }, + { + "epoch": 0.4002030628648786, + "grad_norm": 3.8473236560821533, + "learning_rate": 0.001, + "loss": 2.1948, + "step": 9460 + }, + { + "epoch": 0.40024536762839497, + "grad_norm": 0.24513396620750427, + "learning_rate": 0.001, + "loss": 1.9716, + "step": 9461 + }, + { + "epoch": 0.4002876723919113, + "grad_norm": 0.6947419047355652, + "learning_rate": 0.001, + "loss": 2.8307, + "step": 9462 + }, + { + "epoch": 0.4003299771554277, + "grad_norm": 0.20410794019699097, + "learning_rate": 0.001, + "loss": 3.0127, + "step": 9463 + }, + { + "epoch": 0.4003722819189441, + "grad_norm": 0.18092742562294006, + "learning_rate": 0.001, + "loss": 1.9074, + "step": 9464 + }, + { + "epoch": 0.40041458668246044, + "grad_norm": 0.17439773678779602, + "learning_rate": 0.001, + "loss": 1.8441, + "step": 9465 + }, + { + "epoch": 0.4004568914459768, + "grad_norm": 0.5184692144393921, + "learning_rate": 0.001, + "loss": 2.8331, + "step": 9466 + }, + { + "epoch": 0.4004991962094932, + "grad_norm": 0.19530414044857025, + "learning_rate": 0.001, + "loss": 2.3759, + "step": 9467 + }, + { + "epoch": 0.40054150097300956, + "grad_norm": 0.5446052551269531, + "learning_rate": 0.001, + "loss": 2.5317, + "step": 9468 + }, + { + "epoch": 0.4005838057365259, + "grad_norm": 1.0706979036331177, + "learning_rate": 0.001, + "loss": 3.0557, + "step": 9469 + }, + { + "epoch": 0.4006261105000423, + "grad_norm": 0.25208720564842224, + "learning_rate": 0.001, + "loss": 2.2757, + "step": 9470 + }, + { + "epoch": 0.4006684152635587, + "grad_norm": 0.1785879284143448, + "learning_rate": 0.001, + "loss": 1.8306, + "step": 9471 + }, + { + "epoch": 0.40071072002707503, + "grad_norm": 0.20661844313144684, + "learning_rate": 0.001, + "loss": 2.9331, + "step": 9472 + }, + { + "epoch": 0.40075302479059144, + "grad_norm": 0.24064034223556519, + "learning_rate": 0.001, + "loss": 2.2004, + "step": 9473 + }, + { + "epoch": 0.4007953295541078, + "grad_norm": 28.610883712768555, + "learning_rate": 0.001, + "loss": 2.2863, + "step": 9474 + }, + { + "epoch": 0.40083763431762415, + "grad_norm": 0.6039111614227295, + "learning_rate": 0.001, + "loss": 2.4287, + "step": 9475 + }, + { + "epoch": 0.40087993908114056, + "grad_norm": 0.18513540923595428, + "learning_rate": 0.001, + "loss": 2.2021, + "step": 9476 + }, + { + "epoch": 0.4009222438446569, + "grad_norm": 1.6514339447021484, + "learning_rate": 0.001, + "loss": 1.9221, + "step": 9477 + }, + { + "epoch": 0.40096454860817327, + "grad_norm": 10.18622875213623, + "learning_rate": 0.001, + "loss": 2.3673, + "step": 9478 + }, + { + "epoch": 0.4010068533716897, + "grad_norm": 1.0164726972579956, + "learning_rate": 0.001, + "loss": 1.8288, + "step": 9479 + }, + { + "epoch": 0.40104915813520603, + "grad_norm": 0.5969464778900146, + "learning_rate": 0.001, + "loss": 2.2311, + "step": 9480 + }, + { + "epoch": 0.4010914628987224, + "grad_norm": 0.22844240069389343, + "learning_rate": 0.001, + "loss": 2.826, + "step": 9481 + }, + { + "epoch": 0.4011337676622388, + "grad_norm": 0.3800431191921234, + "learning_rate": 0.001, + "loss": 2.7672, + "step": 9482 + }, + { + "epoch": 0.40117607242575515, + "grad_norm": 0.4591025114059448, + "learning_rate": 0.001, + "loss": 2.5813, + "step": 9483 + }, + { + "epoch": 0.4012183771892715, + "grad_norm": 0.21790550649166107, + "learning_rate": 0.001, + "loss": 1.8184, + "step": 9484 + }, + { + "epoch": 0.40126068195278786, + "grad_norm": 0.7299678325653076, + "learning_rate": 0.001, + "loss": 1.9012, + "step": 9485 + }, + { + "epoch": 0.40130298671630427, + "grad_norm": 1.0232609510421753, + "learning_rate": 0.001, + "loss": 3.1905, + "step": 9486 + }, + { + "epoch": 0.4013452914798206, + "grad_norm": 0.1979575753211975, + "learning_rate": 0.001, + "loss": 1.8858, + "step": 9487 + }, + { + "epoch": 0.401387596243337, + "grad_norm": 1.3063571453094482, + "learning_rate": 0.001, + "loss": 1.5222, + "step": 9488 + }, + { + "epoch": 0.4014299010068534, + "grad_norm": 0.2541360855102539, + "learning_rate": 0.001, + "loss": 2.1482, + "step": 9489 + }, + { + "epoch": 0.40147220577036974, + "grad_norm": 2.719597816467285, + "learning_rate": 0.001, + "loss": 2.9177, + "step": 9490 + }, + { + "epoch": 0.4015145105338861, + "grad_norm": 0.18410396575927734, + "learning_rate": 0.001, + "loss": 1.3939, + "step": 9491 + }, + { + "epoch": 0.4015568152974025, + "grad_norm": 0.30777132511138916, + "learning_rate": 0.001, + "loss": 2.4803, + "step": 9492 + }, + { + "epoch": 0.40159912006091886, + "grad_norm": 0.5091328024864197, + "learning_rate": 0.001, + "loss": 2.1258, + "step": 9493 + }, + { + "epoch": 0.4016414248244352, + "grad_norm": 0.5316731929779053, + "learning_rate": 0.001, + "loss": 3.1333, + "step": 9494 + }, + { + "epoch": 0.4016837295879516, + "grad_norm": 0.5768591165542603, + "learning_rate": 0.001, + "loss": 2.1632, + "step": 9495 + }, + { + "epoch": 0.401726034351468, + "grad_norm": 0.2193114310503006, + "learning_rate": 0.001, + "loss": 2.4664, + "step": 9496 + }, + { + "epoch": 0.40176833911498433, + "grad_norm": 0.37915632128715515, + "learning_rate": 0.001, + "loss": 4.2133, + "step": 9497 + }, + { + "epoch": 0.40181064387850074, + "grad_norm": 0.18758665025234222, + "learning_rate": 0.001, + "loss": 1.8454, + "step": 9498 + }, + { + "epoch": 0.4018529486420171, + "grad_norm": 0.24519726634025574, + "learning_rate": 0.001, + "loss": 2.1569, + "step": 9499 + }, + { + "epoch": 0.40189525340553345, + "grad_norm": 0.2464457005262375, + "learning_rate": 0.001, + "loss": 2.0278, + "step": 9500 + }, + { + "epoch": 0.40193755816904986, + "grad_norm": 0.21936391294002533, + "learning_rate": 0.001, + "loss": 2.2305, + "step": 9501 + }, + { + "epoch": 0.4019798629325662, + "grad_norm": 0.248738095164299, + "learning_rate": 0.001, + "loss": 2.2684, + "step": 9502 + }, + { + "epoch": 0.40202216769608257, + "grad_norm": 0.25540632009506226, + "learning_rate": 0.001, + "loss": 3.4357, + "step": 9503 + }, + { + "epoch": 0.402064472459599, + "grad_norm": 1.870626449584961, + "learning_rate": 0.001, + "loss": 2.1011, + "step": 9504 + }, + { + "epoch": 0.40210677722311533, + "grad_norm": 0.23823359608650208, + "learning_rate": 0.001, + "loss": 1.7739, + "step": 9505 + }, + { + "epoch": 0.4021490819866317, + "grad_norm": 0.678466260433197, + "learning_rate": 0.001, + "loss": 1.9031, + "step": 9506 + }, + { + "epoch": 0.4021913867501481, + "grad_norm": 0.4503398537635803, + "learning_rate": 0.001, + "loss": 1.8608, + "step": 9507 + }, + { + "epoch": 0.40223369151366445, + "grad_norm": 0.528942346572876, + "learning_rate": 0.001, + "loss": 1.7426, + "step": 9508 + }, + { + "epoch": 0.4022759962771808, + "grad_norm": 0.2618485391139984, + "learning_rate": 0.001, + "loss": 2.144, + "step": 9509 + }, + { + "epoch": 0.40231830104069716, + "grad_norm": 4.79293155670166, + "learning_rate": 0.001, + "loss": 2.7917, + "step": 9510 + }, + { + "epoch": 0.40236060580421357, + "grad_norm": 0.18239739537239075, + "learning_rate": 0.001, + "loss": 2.5635, + "step": 9511 + }, + { + "epoch": 0.4024029105677299, + "grad_norm": 0.7269040942192078, + "learning_rate": 0.001, + "loss": 2.3232, + "step": 9512 + }, + { + "epoch": 0.4024452153312463, + "grad_norm": 0.18123169243335724, + "learning_rate": 0.001, + "loss": 1.7372, + "step": 9513 + }, + { + "epoch": 0.4024875200947627, + "grad_norm": 0.19387836754322052, + "learning_rate": 0.001, + "loss": 2.0364, + "step": 9514 + }, + { + "epoch": 0.40252982485827904, + "grad_norm": 0.18925026059150696, + "learning_rate": 0.001, + "loss": 1.7467, + "step": 9515 + }, + { + "epoch": 0.4025721296217954, + "grad_norm": 0.3890759348869324, + "learning_rate": 0.001, + "loss": 2.7014, + "step": 9516 + }, + { + "epoch": 0.4026144343853118, + "grad_norm": 0.3346877098083496, + "learning_rate": 0.001, + "loss": 3.0175, + "step": 9517 + }, + { + "epoch": 0.40265673914882816, + "grad_norm": 0.3895154893398285, + "learning_rate": 0.001, + "loss": 2.8014, + "step": 9518 + }, + { + "epoch": 0.4026990439123445, + "grad_norm": 3.064667224884033, + "learning_rate": 0.001, + "loss": 2.5548, + "step": 9519 + }, + { + "epoch": 0.4027413486758609, + "grad_norm": 0.25531595945358276, + "learning_rate": 0.001, + "loss": 1.9813, + "step": 9520 + }, + { + "epoch": 0.4027836534393773, + "grad_norm": 0.23096615076065063, + "learning_rate": 0.001, + "loss": 2.8028, + "step": 9521 + }, + { + "epoch": 0.40282595820289363, + "grad_norm": 0.22149285674095154, + "learning_rate": 0.001, + "loss": 2.2017, + "step": 9522 + }, + { + "epoch": 0.40286826296641004, + "grad_norm": 0.32118022441864014, + "learning_rate": 0.001, + "loss": 3.6867, + "step": 9523 + }, + { + "epoch": 0.4029105677299264, + "grad_norm": 0.2108854353427887, + "learning_rate": 0.001, + "loss": 1.8694, + "step": 9524 + }, + { + "epoch": 0.40295287249344275, + "grad_norm": 0.9841258525848389, + "learning_rate": 0.001, + "loss": 3.0174, + "step": 9525 + }, + { + "epoch": 0.40299517725695916, + "grad_norm": 0.5012634992599487, + "learning_rate": 0.001, + "loss": 2.83, + "step": 9526 + }, + { + "epoch": 0.4030374820204755, + "grad_norm": 0.27034950256347656, + "learning_rate": 0.001, + "loss": 2.4939, + "step": 9527 + }, + { + "epoch": 0.40307978678399187, + "grad_norm": 0.18938782811164856, + "learning_rate": 0.001, + "loss": 2.5085, + "step": 9528 + }, + { + "epoch": 0.4031220915475083, + "grad_norm": 0.1947273164987564, + "learning_rate": 0.001, + "loss": 2.1869, + "step": 9529 + }, + { + "epoch": 0.40316439631102463, + "grad_norm": 0.26790064573287964, + "learning_rate": 0.001, + "loss": 2.4795, + "step": 9530 + }, + { + "epoch": 0.403206701074541, + "grad_norm": 0.19423073530197144, + "learning_rate": 0.001, + "loss": 2.0744, + "step": 9531 + }, + { + "epoch": 0.40324900583805734, + "grad_norm": 0.5601986646652222, + "learning_rate": 0.001, + "loss": 2.2896, + "step": 9532 + }, + { + "epoch": 0.40329131060157375, + "grad_norm": 1.604595422744751, + "learning_rate": 0.001, + "loss": 2.3243, + "step": 9533 + }, + { + "epoch": 0.4033336153650901, + "grad_norm": 0.31608372926712036, + "learning_rate": 0.001, + "loss": 2.3288, + "step": 9534 + }, + { + "epoch": 0.40337592012860646, + "grad_norm": 0.28224605321884155, + "learning_rate": 0.001, + "loss": 3.0434, + "step": 9535 + }, + { + "epoch": 0.40341822489212287, + "grad_norm": 0.21669940650463104, + "learning_rate": 0.001, + "loss": 2.5078, + "step": 9536 + }, + { + "epoch": 0.4034605296556392, + "grad_norm": 0.34247350692749023, + "learning_rate": 0.001, + "loss": 2.1365, + "step": 9537 + }, + { + "epoch": 0.4035028344191556, + "grad_norm": 0.3081550598144531, + "learning_rate": 0.001, + "loss": 1.9531, + "step": 9538 + }, + { + "epoch": 0.403545139182672, + "grad_norm": 1.4701812267303467, + "learning_rate": 0.001, + "loss": 2.1148, + "step": 9539 + }, + { + "epoch": 0.40358744394618834, + "grad_norm": 0.21030528843402863, + "learning_rate": 0.001, + "loss": 2.2116, + "step": 9540 + }, + { + "epoch": 0.4036297487097047, + "grad_norm": 0.22468726336956024, + "learning_rate": 0.001, + "loss": 2.0276, + "step": 9541 + }, + { + "epoch": 0.4036720534732211, + "grad_norm": 0.2096341848373413, + "learning_rate": 0.001, + "loss": 2.1614, + "step": 9542 + }, + { + "epoch": 0.40371435823673746, + "grad_norm": 0.17588932812213898, + "learning_rate": 0.001, + "loss": 1.7986, + "step": 9543 + }, + { + "epoch": 0.4037566630002538, + "grad_norm": 0.21680901944637299, + "learning_rate": 0.001, + "loss": 2.815, + "step": 9544 + }, + { + "epoch": 0.4037989677637702, + "grad_norm": 0.16149887442588806, + "learning_rate": 0.001, + "loss": 1.4867, + "step": 9545 + }, + { + "epoch": 0.4038412725272866, + "grad_norm": 0.25393468141555786, + "learning_rate": 0.001, + "loss": 2.2976, + "step": 9546 + }, + { + "epoch": 0.40388357729080293, + "grad_norm": 0.26233553886413574, + "learning_rate": 0.001, + "loss": 2.9283, + "step": 9547 + }, + { + "epoch": 0.40392588205431934, + "grad_norm": 0.21052797138690948, + "learning_rate": 0.001, + "loss": 2.1164, + "step": 9548 + }, + { + "epoch": 0.4039681868178357, + "grad_norm": 0.220393568277359, + "learning_rate": 0.001, + "loss": 2.5153, + "step": 9549 + }, + { + "epoch": 0.40401049158135205, + "grad_norm": 0.25661540031433105, + "learning_rate": 0.001, + "loss": 1.9917, + "step": 9550 + }, + { + "epoch": 0.40405279634486846, + "grad_norm": 2.5997157096862793, + "learning_rate": 0.001, + "loss": 2.2229, + "step": 9551 + }, + { + "epoch": 0.4040951011083848, + "grad_norm": 0.198783740401268, + "learning_rate": 0.001, + "loss": 2.0679, + "step": 9552 + }, + { + "epoch": 0.40413740587190117, + "grad_norm": 0.18976983428001404, + "learning_rate": 0.001, + "loss": 2.141, + "step": 9553 + }, + { + "epoch": 0.4041797106354175, + "grad_norm": 0.3730868101119995, + "learning_rate": 0.001, + "loss": 2.0335, + "step": 9554 + }, + { + "epoch": 0.40422201539893393, + "grad_norm": 0.1948307752609253, + "learning_rate": 0.001, + "loss": 1.7714, + "step": 9555 + }, + { + "epoch": 0.4042643201624503, + "grad_norm": 0.2243911772966385, + "learning_rate": 0.001, + "loss": 2.2354, + "step": 9556 + }, + { + "epoch": 0.40430662492596664, + "grad_norm": 0.24743397533893585, + "learning_rate": 0.001, + "loss": 2.2232, + "step": 9557 + }, + { + "epoch": 0.40434892968948305, + "grad_norm": 0.3496690094470978, + "learning_rate": 0.001, + "loss": 1.7223, + "step": 9558 + }, + { + "epoch": 0.4043912344529994, + "grad_norm": 0.3667478561401367, + "learning_rate": 0.001, + "loss": 2.9957, + "step": 9559 + }, + { + "epoch": 0.40443353921651576, + "grad_norm": 0.32227084040641785, + "learning_rate": 0.001, + "loss": 2.0967, + "step": 9560 + }, + { + "epoch": 0.40447584398003217, + "grad_norm": 0.18110904097557068, + "learning_rate": 0.001, + "loss": 1.6972, + "step": 9561 + }, + { + "epoch": 0.4045181487435485, + "grad_norm": 0.19031678140163422, + "learning_rate": 0.001, + "loss": 2.3908, + "step": 9562 + }, + { + "epoch": 0.4045604535070649, + "grad_norm": 0.17075388133525848, + "learning_rate": 0.001, + "loss": 1.9596, + "step": 9563 + }, + { + "epoch": 0.4046027582705813, + "grad_norm": 0.20088161528110504, + "learning_rate": 0.001, + "loss": 2.8288, + "step": 9564 + }, + { + "epoch": 0.40464506303409764, + "grad_norm": 0.19151663780212402, + "learning_rate": 0.001, + "loss": 1.8171, + "step": 9565 + }, + { + "epoch": 0.404687367797614, + "grad_norm": 0.19099347293376923, + "learning_rate": 0.001, + "loss": 2.3548, + "step": 9566 + }, + { + "epoch": 0.4047296725611304, + "grad_norm": 0.2712372839450836, + "learning_rate": 0.001, + "loss": 2.8372, + "step": 9567 + }, + { + "epoch": 0.40477197732464676, + "grad_norm": 0.17886675894260406, + "learning_rate": 0.001, + "loss": 1.9507, + "step": 9568 + }, + { + "epoch": 0.4048142820881631, + "grad_norm": 46.942535400390625, + "learning_rate": 0.001, + "loss": 2.8675, + "step": 9569 + }, + { + "epoch": 0.4048565868516795, + "grad_norm": 0.2914670407772064, + "learning_rate": 0.001, + "loss": 1.7127, + "step": 9570 + }, + { + "epoch": 0.4048988916151959, + "grad_norm": 0.1782679408788681, + "learning_rate": 0.001, + "loss": 1.7707, + "step": 9571 + }, + { + "epoch": 0.40494119637871223, + "grad_norm": 0.446181982755661, + "learning_rate": 0.001, + "loss": 2.3157, + "step": 9572 + }, + { + "epoch": 0.40498350114222864, + "grad_norm": 1.071265697479248, + "learning_rate": 0.001, + "loss": 2.153, + "step": 9573 + }, + { + "epoch": 0.405025805905745, + "grad_norm": 0.2291143536567688, + "learning_rate": 0.001, + "loss": 1.8002, + "step": 9574 + }, + { + "epoch": 0.40506811066926135, + "grad_norm": 0.21641968190670013, + "learning_rate": 0.001, + "loss": 2.2813, + "step": 9575 + }, + { + "epoch": 0.4051104154327777, + "grad_norm": 3.91137957572937, + "learning_rate": 0.001, + "loss": 2.7253, + "step": 9576 + }, + { + "epoch": 0.4051527201962941, + "grad_norm": 0.15815329551696777, + "learning_rate": 0.001, + "loss": 1.5105, + "step": 9577 + }, + { + "epoch": 0.40519502495981047, + "grad_norm": 0.43796873092651367, + "learning_rate": 0.001, + "loss": 2.2207, + "step": 9578 + }, + { + "epoch": 0.4052373297233268, + "grad_norm": 0.17552462220191956, + "learning_rate": 0.001, + "loss": 2.1559, + "step": 9579 + }, + { + "epoch": 0.40527963448684323, + "grad_norm": 0.2455400824546814, + "learning_rate": 0.001, + "loss": 1.8954, + "step": 9580 + }, + { + "epoch": 0.4053219392503596, + "grad_norm": 0.9045276045799255, + "learning_rate": 0.001, + "loss": 2.028, + "step": 9581 + }, + { + "epoch": 0.40536424401387594, + "grad_norm": 0.16627678275108337, + "learning_rate": 0.001, + "loss": 1.7033, + "step": 9582 + }, + { + "epoch": 0.40540654877739235, + "grad_norm": 0.23107825219631195, + "learning_rate": 0.001, + "loss": 3.9189, + "step": 9583 + }, + { + "epoch": 0.4054488535409087, + "grad_norm": 0.5262654423713684, + "learning_rate": 0.001, + "loss": 2.7847, + "step": 9584 + }, + { + "epoch": 0.40549115830442506, + "grad_norm": 0.3403107225894928, + "learning_rate": 0.001, + "loss": 2.8312, + "step": 9585 + }, + { + "epoch": 0.40553346306794147, + "grad_norm": 0.19533272087574005, + "learning_rate": 0.001, + "loss": 2.0224, + "step": 9586 + }, + { + "epoch": 0.4055757678314578, + "grad_norm": 0.2007928043603897, + "learning_rate": 0.001, + "loss": 1.8877, + "step": 9587 + }, + { + "epoch": 0.4056180725949742, + "grad_norm": 0.2459113597869873, + "learning_rate": 0.001, + "loss": 2.3136, + "step": 9588 + }, + { + "epoch": 0.4056603773584906, + "grad_norm": 0.17115989327430725, + "learning_rate": 0.001, + "loss": 2.79, + "step": 9589 + }, + { + "epoch": 0.40570268212200694, + "grad_norm": 1.789944052696228, + "learning_rate": 0.001, + "loss": 2.5636, + "step": 9590 + }, + { + "epoch": 0.4057449868855233, + "grad_norm": 0.5894603133201599, + "learning_rate": 0.001, + "loss": 2.3501, + "step": 9591 + }, + { + "epoch": 0.4057872916490397, + "grad_norm": 0.24456271529197693, + "learning_rate": 0.001, + "loss": 2.8854, + "step": 9592 + }, + { + "epoch": 0.40582959641255606, + "grad_norm": 0.15908919274806976, + "learning_rate": 0.001, + "loss": 1.5415, + "step": 9593 + }, + { + "epoch": 0.4058719011760724, + "grad_norm": 0.17225715517997742, + "learning_rate": 0.001, + "loss": 2.4391, + "step": 9594 + }, + { + "epoch": 0.4059142059395888, + "grad_norm": 0.17913804948329926, + "learning_rate": 0.001, + "loss": 1.4686, + "step": 9595 + }, + { + "epoch": 0.4059565107031052, + "grad_norm": 0.4248640239238739, + "learning_rate": 0.001, + "loss": 2.7928, + "step": 9596 + }, + { + "epoch": 0.40599881546662153, + "grad_norm": 1.1703771352767944, + "learning_rate": 0.001, + "loss": 2.104, + "step": 9597 + }, + { + "epoch": 0.4060411202301379, + "grad_norm": 0.18261665105819702, + "learning_rate": 0.001, + "loss": 1.6789, + "step": 9598 + }, + { + "epoch": 0.4060834249936543, + "grad_norm": 1.6488425731658936, + "learning_rate": 0.001, + "loss": 2.1651, + "step": 9599 + }, + { + "epoch": 0.40612572975717065, + "grad_norm": 1.800002098083496, + "learning_rate": 0.001, + "loss": 2.8898, + "step": 9600 + }, + { + "epoch": 0.406168034520687, + "grad_norm": 2.6963353157043457, + "learning_rate": 0.001, + "loss": 2.6452, + "step": 9601 + }, + { + "epoch": 0.4062103392842034, + "grad_norm": 0.2065376192331314, + "learning_rate": 0.001, + "loss": 1.9318, + "step": 9602 + }, + { + "epoch": 0.40625264404771977, + "grad_norm": 0.9833797812461853, + "learning_rate": 0.001, + "loss": 2.7088, + "step": 9603 + }, + { + "epoch": 0.4062949488112361, + "grad_norm": 0.332534521818161, + "learning_rate": 0.001, + "loss": 1.9894, + "step": 9604 + }, + { + "epoch": 0.40633725357475253, + "grad_norm": 0.2684883177280426, + "learning_rate": 0.001, + "loss": 2.1797, + "step": 9605 + }, + { + "epoch": 0.4063795583382689, + "grad_norm": 1.313679814338684, + "learning_rate": 0.001, + "loss": 3.2259, + "step": 9606 + }, + { + "epoch": 0.40642186310178524, + "grad_norm": 0.4700590968132019, + "learning_rate": 0.001, + "loss": 1.8352, + "step": 9607 + }, + { + "epoch": 0.40646416786530165, + "grad_norm": 0.29726094007492065, + "learning_rate": 0.001, + "loss": 2.2151, + "step": 9608 + }, + { + "epoch": 0.406506472628818, + "grad_norm": 0.20610110461711884, + "learning_rate": 0.001, + "loss": 1.5279, + "step": 9609 + }, + { + "epoch": 0.40654877739233436, + "grad_norm": 0.19683901965618134, + "learning_rate": 0.001, + "loss": 2.1613, + "step": 9610 + }, + { + "epoch": 0.40659108215585077, + "grad_norm": 0.37825819849967957, + "learning_rate": 0.001, + "loss": 3.1917, + "step": 9611 + }, + { + "epoch": 0.4066333869193671, + "grad_norm": 0.18482539057731628, + "learning_rate": 0.001, + "loss": 2.0728, + "step": 9612 + }, + { + "epoch": 0.4066756916828835, + "grad_norm": 0.2475501000881195, + "learning_rate": 0.001, + "loss": 2.9863, + "step": 9613 + }, + { + "epoch": 0.4067179964463999, + "grad_norm": 0.14848104119300842, + "learning_rate": 0.001, + "loss": 2.2224, + "step": 9614 + }, + { + "epoch": 0.40676030120991624, + "grad_norm": 0.4939437806606293, + "learning_rate": 0.001, + "loss": 2.6319, + "step": 9615 + }, + { + "epoch": 0.4068026059734326, + "grad_norm": 1.2136601209640503, + "learning_rate": 0.001, + "loss": 2.591, + "step": 9616 + }, + { + "epoch": 0.406844910736949, + "grad_norm": 1.4348440170288086, + "learning_rate": 0.001, + "loss": 2.2057, + "step": 9617 + }, + { + "epoch": 0.40688721550046536, + "grad_norm": 0.2134755551815033, + "learning_rate": 0.001, + "loss": 1.9658, + "step": 9618 + }, + { + "epoch": 0.4069295202639817, + "grad_norm": 1.9099276065826416, + "learning_rate": 0.001, + "loss": 2.0009, + "step": 9619 + }, + { + "epoch": 0.40697182502749807, + "grad_norm": 0.2304713875055313, + "learning_rate": 0.001, + "loss": 2.0757, + "step": 9620 + }, + { + "epoch": 0.4070141297910145, + "grad_norm": 0.2379099726676941, + "learning_rate": 0.001, + "loss": 2.3855, + "step": 9621 + }, + { + "epoch": 0.40705643455453083, + "grad_norm": 0.2802619934082031, + "learning_rate": 0.001, + "loss": 2.562, + "step": 9622 + }, + { + "epoch": 0.4070987393180472, + "grad_norm": 0.2311367690563202, + "learning_rate": 0.001, + "loss": 3.7875, + "step": 9623 + }, + { + "epoch": 0.4071410440815636, + "grad_norm": 0.4283418655395508, + "learning_rate": 0.001, + "loss": 1.7183, + "step": 9624 + }, + { + "epoch": 0.40718334884507995, + "grad_norm": 0.31401526927948, + "learning_rate": 0.001, + "loss": 2.5655, + "step": 9625 + }, + { + "epoch": 0.4072256536085963, + "grad_norm": 0.22191263735294342, + "learning_rate": 0.001, + "loss": 2.251, + "step": 9626 + }, + { + "epoch": 0.4072679583721127, + "grad_norm": 0.23123349249362946, + "learning_rate": 0.001, + "loss": 2.5949, + "step": 9627 + }, + { + "epoch": 0.40731026313562907, + "grad_norm": 0.20630641281604767, + "learning_rate": 0.001, + "loss": 2.8717, + "step": 9628 + }, + { + "epoch": 0.4073525678991454, + "grad_norm": 1.072459101676941, + "learning_rate": 0.001, + "loss": 2.4395, + "step": 9629 + }, + { + "epoch": 0.40739487266266183, + "grad_norm": 0.22625629603862762, + "learning_rate": 0.001, + "loss": 2.2453, + "step": 9630 + }, + { + "epoch": 0.4074371774261782, + "grad_norm": 1.8554446697235107, + "learning_rate": 0.001, + "loss": 2.1412, + "step": 9631 + }, + { + "epoch": 0.40747948218969454, + "grad_norm": 0.2199331372976303, + "learning_rate": 0.001, + "loss": 1.5662, + "step": 9632 + }, + { + "epoch": 0.40752178695321095, + "grad_norm": 1.69846773147583, + "learning_rate": 0.001, + "loss": 2.1245, + "step": 9633 + }, + { + "epoch": 0.4075640917167273, + "grad_norm": 0.22622716426849365, + "learning_rate": 0.001, + "loss": 2.1381, + "step": 9634 + }, + { + "epoch": 0.40760639648024366, + "grad_norm": 0.21170629560947418, + "learning_rate": 0.001, + "loss": 2.0326, + "step": 9635 + }, + { + "epoch": 0.40764870124376007, + "grad_norm": 0.26057133078575134, + "learning_rate": 0.001, + "loss": 1.9696, + "step": 9636 + }, + { + "epoch": 0.4076910060072764, + "grad_norm": 5.649323463439941, + "learning_rate": 0.001, + "loss": 2.8521, + "step": 9637 + }, + { + "epoch": 0.4077333107707928, + "grad_norm": 0.32760143280029297, + "learning_rate": 0.001, + "loss": 3.0351, + "step": 9638 + }, + { + "epoch": 0.4077756155343092, + "grad_norm": 1.6153209209442139, + "learning_rate": 0.001, + "loss": 1.9322, + "step": 9639 + }, + { + "epoch": 0.40781792029782554, + "grad_norm": 1.4140678644180298, + "learning_rate": 0.001, + "loss": 2.2613, + "step": 9640 + }, + { + "epoch": 0.4078602250613419, + "grad_norm": 0.33353015780448914, + "learning_rate": 0.001, + "loss": 3.0011, + "step": 9641 + }, + { + "epoch": 0.4079025298248583, + "grad_norm": 27.971038818359375, + "learning_rate": 0.001, + "loss": 2.6894, + "step": 9642 + }, + { + "epoch": 0.40794483458837466, + "grad_norm": 0.9342984557151794, + "learning_rate": 0.001, + "loss": 2.7896, + "step": 9643 + }, + { + "epoch": 0.407987139351891, + "grad_norm": 2.035511016845703, + "learning_rate": 0.001, + "loss": 2.9743, + "step": 9644 + }, + { + "epoch": 0.40802944411540737, + "grad_norm": 0.682000994682312, + "learning_rate": 0.001, + "loss": 2.2909, + "step": 9645 + }, + { + "epoch": 0.4080717488789238, + "grad_norm": 6.955329418182373, + "learning_rate": 0.001, + "loss": 1.8326, + "step": 9646 + }, + { + "epoch": 0.40811405364244013, + "grad_norm": 0.23917068541049957, + "learning_rate": 0.001, + "loss": 2.0451, + "step": 9647 + }, + { + "epoch": 0.4081563584059565, + "grad_norm": 0.3635198473930359, + "learning_rate": 0.001, + "loss": 3.2204, + "step": 9648 + }, + { + "epoch": 0.4081986631694729, + "grad_norm": 0.5178560614585876, + "learning_rate": 0.001, + "loss": 3.7043, + "step": 9649 + }, + { + "epoch": 0.40824096793298925, + "grad_norm": 0.21071864664554596, + "learning_rate": 0.001, + "loss": 2.7752, + "step": 9650 + }, + { + "epoch": 0.4082832726965056, + "grad_norm": 0.22183898091316223, + "learning_rate": 0.001, + "loss": 2.269, + "step": 9651 + }, + { + "epoch": 0.408325577460022, + "grad_norm": 0.19146625697612762, + "learning_rate": 0.001, + "loss": 2.5297, + "step": 9652 + }, + { + "epoch": 0.40836788222353837, + "grad_norm": 0.2734749913215637, + "learning_rate": 0.001, + "loss": 2.3894, + "step": 9653 + }, + { + "epoch": 0.4084101869870547, + "grad_norm": 0.24852819740772247, + "learning_rate": 0.001, + "loss": 2.2241, + "step": 9654 + }, + { + "epoch": 0.40845249175057113, + "grad_norm": 0.2272268384695053, + "learning_rate": 0.001, + "loss": 2.39, + "step": 9655 + }, + { + "epoch": 0.4084947965140875, + "grad_norm": 0.5135742425918579, + "learning_rate": 0.001, + "loss": 3.2057, + "step": 9656 + }, + { + "epoch": 0.40853710127760384, + "grad_norm": 0.4296305477619171, + "learning_rate": 0.001, + "loss": 2.3507, + "step": 9657 + }, + { + "epoch": 0.40857940604112025, + "grad_norm": 0.17050939798355103, + "learning_rate": 0.001, + "loss": 1.5197, + "step": 9658 + }, + { + "epoch": 0.4086217108046366, + "grad_norm": 4.443729877471924, + "learning_rate": 0.001, + "loss": 2.8856, + "step": 9659 + }, + { + "epoch": 0.40866401556815296, + "grad_norm": 0.18589919805526733, + "learning_rate": 0.001, + "loss": 1.711, + "step": 9660 + }, + { + "epoch": 0.40870632033166937, + "grad_norm": 0.9820543527603149, + "learning_rate": 0.001, + "loss": 2.7134, + "step": 9661 + }, + { + "epoch": 0.4087486250951857, + "grad_norm": 3.852842092514038, + "learning_rate": 0.001, + "loss": 1.894, + "step": 9662 + }, + { + "epoch": 0.4087909298587021, + "grad_norm": 0.6740947365760803, + "learning_rate": 0.001, + "loss": 1.9484, + "step": 9663 + }, + { + "epoch": 0.4088332346222185, + "grad_norm": 0.22364242374897003, + "learning_rate": 0.001, + "loss": 2.7015, + "step": 9664 + }, + { + "epoch": 0.40887553938573484, + "grad_norm": 0.283769428730011, + "learning_rate": 0.001, + "loss": 2.3642, + "step": 9665 + }, + { + "epoch": 0.4089178441492512, + "grad_norm": 0.9470567107200623, + "learning_rate": 0.001, + "loss": 2.4213, + "step": 9666 + }, + { + "epoch": 0.40896014891276755, + "grad_norm": 0.29617589712142944, + "learning_rate": 0.001, + "loss": 2.9135, + "step": 9667 + }, + { + "epoch": 0.40900245367628396, + "grad_norm": 0.20563291013240814, + "learning_rate": 0.001, + "loss": 2.1679, + "step": 9668 + }, + { + "epoch": 0.4090447584398003, + "grad_norm": 0.18204866349697113, + "learning_rate": 0.001, + "loss": 2.6882, + "step": 9669 + }, + { + "epoch": 0.40908706320331667, + "grad_norm": 0.2885321080684662, + "learning_rate": 0.001, + "loss": 2.095, + "step": 9670 + }, + { + "epoch": 0.4091293679668331, + "grad_norm": 2.8684329986572266, + "learning_rate": 0.001, + "loss": 2.4832, + "step": 9671 + }, + { + "epoch": 0.40917167273034943, + "grad_norm": 0.32784703373908997, + "learning_rate": 0.001, + "loss": 2.5857, + "step": 9672 + }, + { + "epoch": 0.4092139774938658, + "grad_norm": 0.7078139185905457, + "learning_rate": 0.001, + "loss": 1.8139, + "step": 9673 + }, + { + "epoch": 0.4092562822573822, + "grad_norm": 0.25039342045783997, + "learning_rate": 0.001, + "loss": 3.3178, + "step": 9674 + }, + { + "epoch": 0.40929858702089855, + "grad_norm": 0.17222987115383148, + "learning_rate": 0.001, + "loss": 2.2217, + "step": 9675 + }, + { + "epoch": 0.4093408917844149, + "grad_norm": 0.15359899401664734, + "learning_rate": 0.001, + "loss": 2.2061, + "step": 9676 + }, + { + "epoch": 0.4093831965479313, + "grad_norm": 0.2080516666173935, + "learning_rate": 0.001, + "loss": 2.7983, + "step": 9677 + }, + { + "epoch": 0.40942550131144767, + "grad_norm": 0.17450571060180664, + "learning_rate": 0.001, + "loss": 1.8784, + "step": 9678 + }, + { + "epoch": 0.409467806074964, + "grad_norm": 0.19042570888996124, + "learning_rate": 0.001, + "loss": 1.8635, + "step": 9679 + }, + { + "epoch": 0.40951011083848043, + "grad_norm": 0.17510387301445007, + "learning_rate": 0.001, + "loss": 2.8996, + "step": 9680 + }, + { + "epoch": 0.4095524156019968, + "grad_norm": 0.7525598406791687, + "learning_rate": 0.001, + "loss": 2.7079, + "step": 9681 + }, + { + "epoch": 0.40959472036551314, + "grad_norm": 1.063751459121704, + "learning_rate": 0.001, + "loss": 2.0613, + "step": 9682 + }, + { + "epoch": 0.40963702512902955, + "grad_norm": 0.35080280900001526, + "learning_rate": 0.001, + "loss": 4.928, + "step": 9683 + }, + { + "epoch": 0.4096793298925459, + "grad_norm": 0.2547611594200134, + "learning_rate": 0.001, + "loss": 2.8568, + "step": 9684 + }, + { + "epoch": 0.40972163465606226, + "grad_norm": 0.23505434393882751, + "learning_rate": 0.001, + "loss": 2.4675, + "step": 9685 + }, + { + "epoch": 0.40976393941957867, + "grad_norm": 0.2235456109046936, + "learning_rate": 0.001, + "loss": 2.3584, + "step": 9686 + }, + { + "epoch": 0.409806244183095, + "grad_norm": 0.26593148708343506, + "learning_rate": 0.001, + "loss": 2.3399, + "step": 9687 + }, + { + "epoch": 0.4098485489466114, + "grad_norm": 0.38054850697517395, + "learning_rate": 0.001, + "loss": 1.9254, + "step": 9688 + }, + { + "epoch": 0.40989085371012773, + "grad_norm": 0.18785469233989716, + "learning_rate": 0.001, + "loss": 2.3267, + "step": 9689 + }, + { + "epoch": 0.40993315847364414, + "grad_norm": 0.19680415093898773, + "learning_rate": 0.001, + "loss": 2.1284, + "step": 9690 + }, + { + "epoch": 0.4099754632371605, + "grad_norm": 0.8819651007652283, + "learning_rate": 0.001, + "loss": 2.2144, + "step": 9691 + }, + { + "epoch": 0.41001776800067685, + "grad_norm": 0.36599695682525635, + "learning_rate": 0.001, + "loss": 2.5738, + "step": 9692 + }, + { + "epoch": 0.41006007276419326, + "grad_norm": 1.350098967552185, + "learning_rate": 0.001, + "loss": 3.2411, + "step": 9693 + }, + { + "epoch": 0.4101023775277096, + "grad_norm": 0.252407968044281, + "learning_rate": 0.001, + "loss": 3.065, + "step": 9694 + }, + { + "epoch": 0.41014468229122597, + "grad_norm": 5.906357765197754, + "learning_rate": 0.001, + "loss": 1.996, + "step": 9695 + }, + { + "epoch": 0.4101869870547424, + "grad_norm": 0.30286088585853577, + "learning_rate": 0.001, + "loss": 2.4888, + "step": 9696 + }, + { + "epoch": 0.41022929181825873, + "grad_norm": 11.067898750305176, + "learning_rate": 0.001, + "loss": 2.2548, + "step": 9697 + }, + { + "epoch": 0.4102715965817751, + "grad_norm": 0.21052215993404388, + "learning_rate": 0.001, + "loss": 3.1103, + "step": 9698 + }, + { + "epoch": 0.4103139013452915, + "grad_norm": 0.24739009141921997, + "learning_rate": 0.001, + "loss": 3.126, + "step": 9699 + }, + { + "epoch": 0.41035620610880785, + "grad_norm": 0.22047722339630127, + "learning_rate": 0.001, + "loss": 2.3277, + "step": 9700 + }, + { + "epoch": 0.4103985108723242, + "grad_norm": 0.7261704802513123, + "learning_rate": 0.001, + "loss": 2.8432, + "step": 9701 + }, + { + "epoch": 0.4104408156358406, + "grad_norm": 0.2856643795967102, + "learning_rate": 0.001, + "loss": 1.9512, + "step": 9702 + }, + { + "epoch": 0.41048312039935697, + "grad_norm": 0.49634721875190735, + "learning_rate": 0.001, + "loss": 2.1401, + "step": 9703 + }, + { + "epoch": 0.4105254251628733, + "grad_norm": 0.24824899435043335, + "learning_rate": 0.001, + "loss": 3.2889, + "step": 9704 + }, + { + "epoch": 0.41056772992638974, + "grad_norm": 0.20967476069927216, + "learning_rate": 0.001, + "loss": 2.091, + "step": 9705 + }, + { + "epoch": 0.4106100346899061, + "grad_norm": 0.1925218403339386, + "learning_rate": 0.001, + "loss": 1.8878, + "step": 9706 + }, + { + "epoch": 0.41065233945342244, + "grad_norm": 1.0082390308380127, + "learning_rate": 0.001, + "loss": 1.9122, + "step": 9707 + }, + { + "epoch": 0.41069464421693885, + "grad_norm": 6.59636926651001, + "learning_rate": 0.001, + "loss": 1.9186, + "step": 9708 + }, + { + "epoch": 0.4107369489804552, + "grad_norm": 0.2302708476781845, + "learning_rate": 0.001, + "loss": 2.3975, + "step": 9709 + }, + { + "epoch": 0.41077925374397156, + "grad_norm": 5.976405143737793, + "learning_rate": 0.001, + "loss": 2.8853, + "step": 9710 + }, + { + "epoch": 0.4108215585074879, + "grad_norm": 5.888969421386719, + "learning_rate": 0.001, + "loss": 1.6746, + "step": 9711 + }, + { + "epoch": 0.4108638632710043, + "grad_norm": 2.457515001296997, + "learning_rate": 0.001, + "loss": 3.2117, + "step": 9712 + }, + { + "epoch": 0.4109061680345207, + "grad_norm": 0.266607403755188, + "learning_rate": 0.001, + "loss": 1.9498, + "step": 9713 + }, + { + "epoch": 0.41094847279803703, + "grad_norm": 1.2756965160369873, + "learning_rate": 0.001, + "loss": 2.7778, + "step": 9714 + }, + { + "epoch": 0.41099077756155344, + "grad_norm": 0.4476830065250397, + "learning_rate": 0.001, + "loss": 3.3292, + "step": 9715 + }, + { + "epoch": 0.4110330823250698, + "grad_norm": 0.2495070993900299, + "learning_rate": 0.001, + "loss": 2.3151, + "step": 9716 + }, + { + "epoch": 0.41107538708858615, + "grad_norm": 0.30039656162261963, + "learning_rate": 0.001, + "loss": 1.9914, + "step": 9717 + }, + { + "epoch": 0.41111769185210256, + "grad_norm": 0.47619497776031494, + "learning_rate": 0.001, + "loss": 3.4457, + "step": 9718 + }, + { + "epoch": 0.4111599966156189, + "grad_norm": 0.21117328107357025, + "learning_rate": 0.001, + "loss": 1.7526, + "step": 9719 + }, + { + "epoch": 0.41120230137913527, + "grad_norm": 0.4838405251502991, + "learning_rate": 0.001, + "loss": 2.325, + "step": 9720 + }, + { + "epoch": 0.4112446061426517, + "grad_norm": 6.137927532196045, + "learning_rate": 0.001, + "loss": 3.0532, + "step": 9721 + }, + { + "epoch": 0.41128691090616804, + "grad_norm": 0.5445790886878967, + "learning_rate": 0.001, + "loss": 2.6303, + "step": 9722 + }, + { + "epoch": 0.4113292156696844, + "grad_norm": 0.24159128963947296, + "learning_rate": 0.001, + "loss": 2.4226, + "step": 9723 + }, + { + "epoch": 0.4113715204332008, + "grad_norm": 0.29332977533340454, + "learning_rate": 0.001, + "loss": 2.8964, + "step": 9724 + }, + { + "epoch": 0.41141382519671715, + "grad_norm": 0.6242585182189941, + "learning_rate": 0.001, + "loss": 2.5372, + "step": 9725 + }, + { + "epoch": 0.4114561299602335, + "grad_norm": 0.2993077039718628, + "learning_rate": 0.001, + "loss": 2.4038, + "step": 9726 + }, + { + "epoch": 0.4114984347237499, + "grad_norm": 0.4130201041698456, + "learning_rate": 0.001, + "loss": 2.5595, + "step": 9727 + }, + { + "epoch": 0.41154073948726627, + "grad_norm": 0.3834516406059265, + "learning_rate": 0.001, + "loss": 3.7084, + "step": 9728 + }, + { + "epoch": 0.4115830442507826, + "grad_norm": 0.29505565762519836, + "learning_rate": 0.001, + "loss": 2.1188, + "step": 9729 + }, + { + "epoch": 0.41162534901429904, + "grad_norm": 0.4178231954574585, + "learning_rate": 0.001, + "loss": 2.9802, + "step": 9730 + }, + { + "epoch": 0.4116676537778154, + "grad_norm": 0.2813502252101898, + "learning_rate": 0.001, + "loss": 1.898, + "step": 9731 + }, + { + "epoch": 0.41170995854133174, + "grad_norm": 0.24282923340797424, + "learning_rate": 0.001, + "loss": 3.137, + "step": 9732 + }, + { + "epoch": 0.4117522633048481, + "grad_norm": 0.21911819279193878, + "learning_rate": 0.001, + "loss": 2.452, + "step": 9733 + }, + { + "epoch": 0.4117945680683645, + "grad_norm": 0.6716076731681824, + "learning_rate": 0.001, + "loss": 2.1814, + "step": 9734 + }, + { + "epoch": 0.41183687283188086, + "grad_norm": 0.9324064254760742, + "learning_rate": 0.001, + "loss": 3.5659, + "step": 9735 + }, + { + "epoch": 0.4118791775953972, + "grad_norm": 0.3203246593475342, + "learning_rate": 0.001, + "loss": 2.3815, + "step": 9736 + }, + { + "epoch": 0.4119214823589136, + "grad_norm": 1.0174567699432373, + "learning_rate": 0.001, + "loss": 2.8275, + "step": 9737 + }, + { + "epoch": 0.41196378712243, + "grad_norm": 5.53692626953125, + "learning_rate": 0.001, + "loss": 1.5753, + "step": 9738 + }, + { + "epoch": 0.41200609188594634, + "grad_norm": 0.3196747303009033, + "learning_rate": 0.001, + "loss": 2.2303, + "step": 9739 + }, + { + "epoch": 0.41204839664946274, + "grad_norm": 7.535372257232666, + "learning_rate": 0.001, + "loss": 2.7096, + "step": 9740 + }, + { + "epoch": 0.4120907014129791, + "grad_norm": 0.5855947136878967, + "learning_rate": 0.001, + "loss": 2.4305, + "step": 9741 + }, + { + "epoch": 0.41213300617649545, + "grad_norm": 0.4567291736602783, + "learning_rate": 0.001, + "loss": 2.9843, + "step": 9742 + }, + { + "epoch": 0.41217531094001186, + "grad_norm": 0.276134729385376, + "learning_rate": 0.001, + "loss": 2.6929, + "step": 9743 + }, + { + "epoch": 0.4122176157035282, + "grad_norm": 2.817307472229004, + "learning_rate": 0.001, + "loss": 2.2315, + "step": 9744 + }, + { + "epoch": 0.41225992046704457, + "grad_norm": 1.8684515953063965, + "learning_rate": 0.001, + "loss": 1.7073, + "step": 9745 + }, + { + "epoch": 0.412302225230561, + "grad_norm": 0.2631063759326935, + "learning_rate": 0.001, + "loss": 2.4196, + "step": 9746 + }, + { + "epoch": 0.41234452999407734, + "grad_norm": 0.9968166947364807, + "learning_rate": 0.001, + "loss": 2.3738, + "step": 9747 + }, + { + "epoch": 0.4123868347575937, + "grad_norm": 0.2650030851364136, + "learning_rate": 0.001, + "loss": 2.0034, + "step": 9748 + }, + { + "epoch": 0.4124291395211101, + "grad_norm": 0.18941166996955872, + "learning_rate": 0.001, + "loss": 2.4076, + "step": 9749 + }, + { + "epoch": 0.41247144428462645, + "grad_norm": 16.938201904296875, + "learning_rate": 0.001, + "loss": 2.0145, + "step": 9750 + }, + { + "epoch": 0.4125137490481428, + "grad_norm": 2.8237738609313965, + "learning_rate": 0.001, + "loss": 2.1931, + "step": 9751 + }, + { + "epoch": 0.4125560538116592, + "grad_norm": 0.2568257749080658, + "learning_rate": 0.001, + "loss": 2.2738, + "step": 9752 + }, + { + "epoch": 0.41259835857517557, + "grad_norm": 0.2610342502593994, + "learning_rate": 0.001, + "loss": 2.0419, + "step": 9753 + }, + { + "epoch": 0.4126406633386919, + "grad_norm": 0.22543591260910034, + "learning_rate": 0.001, + "loss": 2.1172, + "step": 9754 + }, + { + "epoch": 0.41268296810220834, + "grad_norm": 1.7136520147323608, + "learning_rate": 0.001, + "loss": 2.7651, + "step": 9755 + }, + { + "epoch": 0.4127252728657247, + "grad_norm": 1.0864278078079224, + "learning_rate": 0.001, + "loss": 2.6813, + "step": 9756 + }, + { + "epoch": 0.41276757762924104, + "grad_norm": 0.16335022449493408, + "learning_rate": 0.001, + "loss": 2.7791, + "step": 9757 + }, + { + "epoch": 0.4128098823927574, + "grad_norm": 0.2040264904499054, + "learning_rate": 0.001, + "loss": 2.0147, + "step": 9758 + }, + { + "epoch": 0.4128521871562738, + "grad_norm": 0.23535273969173431, + "learning_rate": 0.001, + "loss": 2.2457, + "step": 9759 + }, + { + "epoch": 0.41289449191979016, + "grad_norm": 0.2834819257259369, + "learning_rate": 0.001, + "loss": 2.5383, + "step": 9760 + }, + { + "epoch": 0.4129367966833065, + "grad_norm": 0.44039419293403625, + "learning_rate": 0.001, + "loss": 2.2677, + "step": 9761 + }, + { + "epoch": 0.4129791014468229, + "grad_norm": 0.19605425000190735, + "learning_rate": 0.001, + "loss": 2.0914, + "step": 9762 + }, + { + "epoch": 0.4130214062103393, + "grad_norm": 0.3128882050514221, + "learning_rate": 0.001, + "loss": 2.4405, + "step": 9763 + }, + { + "epoch": 0.41306371097385564, + "grad_norm": 0.19780918955802917, + "learning_rate": 0.001, + "loss": 2.9869, + "step": 9764 + }, + { + "epoch": 0.41310601573737205, + "grad_norm": 0.7669022679328918, + "learning_rate": 0.001, + "loss": 2.0915, + "step": 9765 + }, + { + "epoch": 0.4131483205008884, + "grad_norm": 1.3960890769958496, + "learning_rate": 0.001, + "loss": 2.018, + "step": 9766 + }, + { + "epoch": 0.41319062526440475, + "grad_norm": 0.4077872931957245, + "learning_rate": 0.001, + "loss": 2.4418, + "step": 9767 + }, + { + "epoch": 0.41323293002792116, + "grad_norm": 0.22895626723766327, + "learning_rate": 0.001, + "loss": 3.1219, + "step": 9768 + }, + { + "epoch": 0.4132752347914375, + "grad_norm": 0.3246360719203949, + "learning_rate": 0.001, + "loss": 2.5731, + "step": 9769 + }, + { + "epoch": 0.41331753955495387, + "grad_norm": 0.1943444311618805, + "learning_rate": 0.001, + "loss": 2.6833, + "step": 9770 + }, + { + "epoch": 0.4133598443184703, + "grad_norm": 0.23944945633411407, + "learning_rate": 0.001, + "loss": 2.1968, + "step": 9771 + }, + { + "epoch": 0.41340214908198664, + "grad_norm": 0.2529198229312897, + "learning_rate": 0.001, + "loss": 2.5286, + "step": 9772 + }, + { + "epoch": 0.413444453845503, + "grad_norm": 1.6711100339889526, + "learning_rate": 0.001, + "loss": 2.4631, + "step": 9773 + }, + { + "epoch": 0.4134867586090194, + "grad_norm": 0.16065281629562378, + "learning_rate": 0.001, + "loss": 2.8067, + "step": 9774 + }, + { + "epoch": 0.41352906337253575, + "grad_norm": 0.1823013722896576, + "learning_rate": 0.001, + "loss": 2.4533, + "step": 9775 + }, + { + "epoch": 0.4135713681360521, + "grad_norm": 0.5769549012184143, + "learning_rate": 0.001, + "loss": 1.9118, + "step": 9776 + }, + { + "epoch": 0.4136136728995685, + "grad_norm": 0.3120274841785431, + "learning_rate": 0.001, + "loss": 2.6537, + "step": 9777 + }, + { + "epoch": 0.4136559776630849, + "grad_norm": 2.704646110534668, + "learning_rate": 0.001, + "loss": 2.6022, + "step": 9778 + }, + { + "epoch": 0.4136982824266012, + "grad_norm": 3.760073661804199, + "learning_rate": 0.001, + "loss": 2.4984, + "step": 9779 + }, + { + "epoch": 0.4137405871901176, + "grad_norm": 0.3028760254383087, + "learning_rate": 0.001, + "loss": 2.7438, + "step": 9780 + }, + { + "epoch": 0.413782891953634, + "grad_norm": 0.19027100503444672, + "learning_rate": 0.001, + "loss": 2.5773, + "step": 9781 + }, + { + "epoch": 0.41382519671715035, + "grad_norm": 0.703472375869751, + "learning_rate": 0.001, + "loss": 1.811, + "step": 9782 + }, + { + "epoch": 0.4138675014806667, + "grad_norm": 0.26506584882736206, + "learning_rate": 0.001, + "loss": 2.8884, + "step": 9783 + }, + { + "epoch": 0.4139098062441831, + "grad_norm": 0.2612300515174866, + "learning_rate": 0.001, + "loss": 2.7629, + "step": 9784 + }, + { + "epoch": 0.41395211100769946, + "grad_norm": 0.2701547145843506, + "learning_rate": 0.001, + "loss": 2.6545, + "step": 9785 + }, + { + "epoch": 0.4139944157712158, + "grad_norm": 0.5775189995765686, + "learning_rate": 0.001, + "loss": 2.2237, + "step": 9786 + }, + { + "epoch": 0.4140367205347322, + "grad_norm": 0.29414102435112, + "learning_rate": 0.001, + "loss": 2.5268, + "step": 9787 + }, + { + "epoch": 0.4140790252982486, + "grad_norm": 0.3888061046600342, + "learning_rate": 0.001, + "loss": 2.466, + "step": 9788 + }, + { + "epoch": 0.41412133006176494, + "grad_norm": 0.42892327904701233, + "learning_rate": 0.001, + "loss": 2.6945, + "step": 9789 + }, + { + "epoch": 0.41416363482528135, + "grad_norm": 0.3391939103603363, + "learning_rate": 0.001, + "loss": 2.2967, + "step": 9790 + }, + { + "epoch": 0.4142059395887977, + "grad_norm": 0.1967991441488266, + "learning_rate": 0.001, + "loss": 2.2695, + "step": 9791 + }, + { + "epoch": 0.41424824435231405, + "grad_norm": 0.18619079887866974, + "learning_rate": 0.001, + "loss": 2.6969, + "step": 9792 + }, + { + "epoch": 0.41429054911583046, + "grad_norm": 0.21667589247226715, + "learning_rate": 0.001, + "loss": 2.346, + "step": 9793 + }, + { + "epoch": 0.4143328538793468, + "grad_norm": 0.1584491729736328, + "learning_rate": 0.001, + "loss": 1.6244, + "step": 9794 + }, + { + "epoch": 0.4143751586428632, + "grad_norm": 0.16956403851509094, + "learning_rate": 0.001, + "loss": 2.5346, + "step": 9795 + }, + { + "epoch": 0.4144174634063796, + "grad_norm": 0.2773437798023224, + "learning_rate": 0.001, + "loss": 3.2338, + "step": 9796 + }, + { + "epoch": 0.41445976816989594, + "grad_norm": 0.2252512127161026, + "learning_rate": 0.001, + "loss": 2.1501, + "step": 9797 + }, + { + "epoch": 0.4145020729334123, + "grad_norm": 0.9727465510368347, + "learning_rate": 0.001, + "loss": 2.7049, + "step": 9798 + }, + { + "epoch": 0.4145443776969287, + "grad_norm": 0.23791512846946716, + "learning_rate": 0.001, + "loss": 1.7446, + "step": 9799 + }, + { + "epoch": 0.41458668246044506, + "grad_norm": 14.202518463134766, + "learning_rate": 0.001, + "loss": 3.8168, + "step": 9800 + }, + { + "epoch": 0.4146289872239614, + "grad_norm": 0.44175243377685547, + "learning_rate": 0.001, + "loss": 2.1732, + "step": 9801 + }, + { + "epoch": 0.41467129198747776, + "grad_norm": 0.17590700089931488, + "learning_rate": 0.001, + "loss": 1.9734, + "step": 9802 + }, + { + "epoch": 0.4147135967509942, + "grad_norm": 0.18787901103496552, + "learning_rate": 0.001, + "loss": 2.4456, + "step": 9803 + }, + { + "epoch": 0.4147559015145105, + "grad_norm": 0.17629052698612213, + "learning_rate": 0.001, + "loss": 1.8945, + "step": 9804 + }, + { + "epoch": 0.4147982062780269, + "grad_norm": 0.17695337533950806, + "learning_rate": 0.001, + "loss": 2.3594, + "step": 9805 + }, + { + "epoch": 0.4148405110415433, + "grad_norm": 0.24011565744876862, + "learning_rate": 0.001, + "loss": 1.9115, + "step": 9806 + }, + { + "epoch": 0.41488281580505965, + "grad_norm": 3.5047898292541504, + "learning_rate": 0.001, + "loss": 2.0496, + "step": 9807 + }, + { + "epoch": 0.414925120568576, + "grad_norm": 3.9481778144836426, + "learning_rate": 0.001, + "loss": 3.1042, + "step": 9808 + }, + { + "epoch": 0.4149674253320924, + "grad_norm": 0.44774043560028076, + "learning_rate": 0.001, + "loss": 2.4119, + "step": 9809 + }, + { + "epoch": 0.41500973009560876, + "grad_norm": 0.9812766313552856, + "learning_rate": 0.001, + "loss": 2.6236, + "step": 9810 + }, + { + "epoch": 0.4150520348591251, + "grad_norm": 0.47764018177986145, + "learning_rate": 0.001, + "loss": 2.3742, + "step": 9811 + }, + { + "epoch": 0.41509433962264153, + "grad_norm": 0.18392595648765564, + "learning_rate": 0.001, + "loss": 2.4516, + "step": 9812 + }, + { + "epoch": 0.4151366443861579, + "grad_norm": 1.547833800315857, + "learning_rate": 0.001, + "loss": 1.7756, + "step": 9813 + }, + { + "epoch": 0.41517894914967424, + "grad_norm": 0.18931736052036285, + "learning_rate": 0.001, + "loss": 1.7584, + "step": 9814 + }, + { + "epoch": 0.41522125391319065, + "grad_norm": 0.1544259935617447, + "learning_rate": 0.001, + "loss": 1.4929, + "step": 9815 + }, + { + "epoch": 0.415263558676707, + "grad_norm": 0.2300853431224823, + "learning_rate": 0.001, + "loss": 2.7883, + "step": 9816 + }, + { + "epoch": 0.41530586344022336, + "grad_norm": 1.0780434608459473, + "learning_rate": 0.001, + "loss": 2.5039, + "step": 9817 + }, + { + "epoch": 0.41534816820373976, + "grad_norm": 0.5089141130447388, + "learning_rate": 0.001, + "loss": 1.7734, + "step": 9818 + }, + { + "epoch": 0.4153904729672561, + "grad_norm": 3.429790496826172, + "learning_rate": 0.001, + "loss": 2.3779, + "step": 9819 + }, + { + "epoch": 0.4154327777307725, + "grad_norm": 6.695871353149414, + "learning_rate": 0.001, + "loss": 2.3627, + "step": 9820 + }, + { + "epoch": 0.4154750824942889, + "grad_norm": 0.1914941370487213, + "learning_rate": 0.001, + "loss": 2.655, + "step": 9821 + }, + { + "epoch": 0.41551738725780524, + "grad_norm": 0.6268720030784607, + "learning_rate": 0.001, + "loss": 2.0422, + "step": 9822 + }, + { + "epoch": 0.4155596920213216, + "grad_norm": 0.47744235396385193, + "learning_rate": 0.001, + "loss": 2.074, + "step": 9823 + }, + { + "epoch": 0.41560199678483795, + "grad_norm": 0.15372218191623688, + "learning_rate": 0.001, + "loss": 2.065, + "step": 9824 + }, + { + "epoch": 0.41564430154835436, + "grad_norm": 0.4260820746421814, + "learning_rate": 0.001, + "loss": 2.9168, + "step": 9825 + }, + { + "epoch": 0.4156866063118707, + "grad_norm": 0.19325514137744904, + "learning_rate": 0.001, + "loss": 1.8882, + "step": 9826 + }, + { + "epoch": 0.41572891107538706, + "grad_norm": 0.1592322140932083, + "learning_rate": 0.001, + "loss": 2.4256, + "step": 9827 + }, + { + "epoch": 0.4157712158389035, + "grad_norm": 0.17341256141662598, + "learning_rate": 0.001, + "loss": 2.3686, + "step": 9828 + }, + { + "epoch": 0.41581352060241983, + "grad_norm": 0.20238442718982697, + "learning_rate": 0.001, + "loss": 2.2101, + "step": 9829 + }, + { + "epoch": 0.4158558253659362, + "grad_norm": 0.3207574784755707, + "learning_rate": 0.001, + "loss": 2.9256, + "step": 9830 + }, + { + "epoch": 0.4158981301294526, + "grad_norm": 0.1748972088098526, + "learning_rate": 0.001, + "loss": 1.8409, + "step": 9831 + }, + { + "epoch": 0.41594043489296895, + "grad_norm": 0.24550606310367584, + "learning_rate": 0.001, + "loss": 3.3066, + "step": 9832 + }, + { + "epoch": 0.4159827396564853, + "grad_norm": 0.9889479279518127, + "learning_rate": 0.001, + "loss": 2.1071, + "step": 9833 + }, + { + "epoch": 0.4160250444200017, + "grad_norm": 0.20585237443447113, + "learning_rate": 0.001, + "loss": 2.5545, + "step": 9834 + }, + { + "epoch": 0.41606734918351806, + "grad_norm": 0.13945859670639038, + "learning_rate": 0.001, + "loss": 1.4628, + "step": 9835 + }, + { + "epoch": 0.4161096539470344, + "grad_norm": 27.057586669921875, + "learning_rate": 0.001, + "loss": 2.8699, + "step": 9836 + }, + { + "epoch": 0.41615195871055083, + "grad_norm": 0.18365734815597534, + "learning_rate": 0.001, + "loss": 1.5084, + "step": 9837 + }, + { + "epoch": 0.4161942634740672, + "grad_norm": 0.43779316544532776, + "learning_rate": 0.001, + "loss": 2.1716, + "step": 9838 + }, + { + "epoch": 0.41623656823758354, + "grad_norm": 0.7131558060646057, + "learning_rate": 0.001, + "loss": 2.9858, + "step": 9839 + }, + { + "epoch": 0.41627887300109995, + "grad_norm": 3.8856091499328613, + "learning_rate": 0.001, + "loss": 3.9777, + "step": 9840 + }, + { + "epoch": 0.4163211777646163, + "grad_norm": 0.17805153131484985, + "learning_rate": 0.001, + "loss": 1.8416, + "step": 9841 + }, + { + "epoch": 0.41636348252813266, + "grad_norm": 0.5579624772071838, + "learning_rate": 0.001, + "loss": 3.2418, + "step": 9842 + }, + { + "epoch": 0.41640578729164907, + "grad_norm": 0.2010784149169922, + "learning_rate": 0.001, + "loss": 2.5104, + "step": 9843 + }, + { + "epoch": 0.4164480920551654, + "grad_norm": 0.1619592159986496, + "learning_rate": 0.001, + "loss": 1.9748, + "step": 9844 + }, + { + "epoch": 0.4164903968186818, + "grad_norm": 0.40515822172164917, + "learning_rate": 0.001, + "loss": 2.7918, + "step": 9845 + }, + { + "epoch": 0.41653270158219813, + "grad_norm": 0.22229692339897156, + "learning_rate": 0.001, + "loss": 2.0756, + "step": 9846 + }, + { + "epoch": 0.41657500634571454, + "grad_norm": 0.21411965787410736, + "learning_rate": 0.001, + "loss": 2.1632, + "step": 9847 + }, + { + "epoch": 0.4166173111092309, + "grad_norm": 0.21786244213581085, + "learning_rate": 0.001, + "loss": 3.3767, + "step": 9848 + }, + { + "epoch": 0.41665961587274725, + "grad_norm": 0.20544399321079254, + "learning_rate": 0.001, + "loss": 2.1629, + "step": 9849 + }, + { + "epoch": 0.41670192063626366, + "grad_norm": 0.17390744388103485, + "learning_rate": 0.001, + "loss": 2.2281, + "step": 9850 + }, + { + "epoch": 0.41674422539978, + "grad_norm": 0.5404853820800781, + "learning_rate": 0.001, + "loss": 2.8778, + "step": 9851 + }, + { + "epoch": 0.41678653016329636, + "grad_norm": 0.18228615820407867, + "learning_rate": 0.001, + "loss": 2.2249, + "step": 9852 + }, + { + "epoch": 0.4168288349268128, + "grad_norm": 1.0131566524505615, + "learning_rate": 0.001, + "loss": 2.8682, + "step": 9853 + }, + { + "epoch": 0.41687113969032913, + "grad_norm": 0.1611110270023346, + "learning_rate": 0.001, + "loss": 2.1683, + "step": 9854 + }, + { + "epoch": 0.4169134444538455, + "grad_norm": 0.7783699631690979, + "learning_rate": 0.001, + "loss": 2.6472, + "step": 9855 + }, + { + "epoch": 0.4169557492173619, + "grad_norm": 0.15802177786827087, + "learning_rate": 0.001, + "loss": 2.6647, + "step": 9856 + }, + { + "epoch": 0.41699805398087825, + "grad_norm": 0.2226634919643402, + "learning_rate": 0.001, + "loss": 1.9478, + "step": 9857 + }, + { + "epoch": 0.4170403587443946, + "grad_norm": 0.4912348687648773, + "learning_rate": 0.001, + "loss": 1.7005, + "step": 9858 + }, + { + "epoch": 0.417082663507911, + "grad_norm": 0.1934366077184677, + "learning_rate": 0.001, + "loss": 2.5896, + "step": 9859 + }, + { + "epoch": 0.41712496827142737, + "grad_norm": 0.20569314062595367, + "learning_rate": 0.001, + "loss": 2.9554, + "step": 9860 + }, + { + "epoch": 0.4171672730349437, + "grad_norm": 0.18698929250240326, + "learning_rate": 0.001, + "loss": 2.1069, + "step": 9861 + }, + { + "epoch": 0.41720957779846013, + "grad_norm": 0.2676628530025482, + "learning_rate": 0.001, + "loss": 2.5049, + "step": 9862 + }, + { + "epoch": 0.4172518825619765, + "grad_norm": 0.21008531749248505, + "learning_rate": 0.001, + "loss": 2.1362, + "step": 9863 + }, + { + "epoch": 0.41729418732549284, + "grad_norm": 0.19284501671791077, + "learning_rate": 0.001, + "loss": 3.301, + "step": 9864 + }, + { + "epoch": 0.41733649208900925, + "grad_norm": 0.19138003885746002, + "learning_rate": 0.001, + "loss": 2.2343, + "step": 9865 + }, + { + "epoch": 0.4173787968525256, + "grad_norm": 3.3157238960266113, + "learning_rate": 0.001, + "loss": 1.9117, + "step": 9866 + }, + { + "epoch": 0.41742110161604196, + "grad_norm": 0.1964617371559143, + "learning_rate": 0.001, + "loss": 2.8301, + "step": 9867 + }, + { + "epoch": 0.4174634063795583, + "grad_norm": 0.27521830797195435, + "learning_rate": 0.001, + "loss": 1.853, + "step": 9868 + }, + { + "epoch": 0.4175057111430747, + "grad_norm": 0.26007047295570374, + "learning_rate": 0.001, + "loss": 2.6994, + "step": 9869 + }, + { + "epoch": 0.4175480159065911, + "grad_norm": 0.2345419079065323, + "learning_rate": 0.001, + "loss": 3.8451, + "step": 9870 + }, + { + "epoch": 0.41759032067010743, + "grad_norm": 0.5388814210891724, + "learning_rate": 0.001, + "loss": 2.7057, + "step": 9871 + }, + { + "epoch": 0.41763262543362384, + "grad_norm": 0.1674017757177353, + "learning_rate": 0.001, + "loss": 1.9514, + "step": 9872 + }, + { + "epoch": 0.4176749301971402, + "grad_norm": 0.2036234587430954, + "learning_rate": 0.001, + "loss": 2.4295, + "step": 9873 + }, + { + "epoch": 0.41771723496065655, + "grad_norm": 5.006783485412598, + "learning_rate": 0.001, + "loss": 2.2285, + "step": 9874 + }, + { + "epoch": 0.41775953972417296, + "grad_norm": 0.3043411374092102, + "learning_rate": 0.001, + "loss": 2.1786, + "step": 9875 + }, + { + "epoch": 0.4178018444876893, + "grad_norm": 0.18863193690776825, + "learning_rate": 0.001, + "loss": 2.0902, + "step": 9876 + }, + { + "epoch": 0.41784414925120567, + "grad_norm": 0.21312756836414337, + "learning_rate": 0.001, + "loss": 2.418, + "step": 9877 + }, + { + "epoch": 0.4178864540147221, + "grad_norm": 0.25817111134529114, + "learning_rate": 0.001, + "loss": 3.3019, + "step": 9878 + }, + { + "epoch": 0.41792875877823843, + "grad_norm": 0.17878010869026184, + "learning_rate": 0.001, + "loss": 2.1354, + "step": 9879 + }, + { + "epoch": 0.4179710635417548, + "grad_norm": 0.2418394237756729, + "learning_rate": 0.001, + "loss": 2.7489, + "step": 9880 + }, + { + "epoch": 0.4180133683052712, + "grad_norm": 5.557830333709717, + "learning_rate": 0.001, + "loss": 1.8045, + "step": 9881 + }, + { + "epoch": 0.41805567306878755, + "grad_norm": 0.17561772465705872, + "learning_rate": 0.001, + "loss": 2.6087, + "step": 9882 + }, + { + "epoch": 0.4180979778323039, + "grad_norm": 0.1871245950460434, + "learning_rate": 0.001, + "loss": 2.7775, + "step": 9883 + }, + { + "epoch": 0.4181402825958203, + "grad_norm": 0.17754124104976654, + "learning_rate": 0.001, + "loss": 1.7533, + "step": 9884 + }, + { + "epoch": 0.41818258735933667, + "grad_norm": 0.406012624502182, + "learning_rate": 0.001, + "loss": 2.6556, + "step": 9885 + }, + { + "epoch": 0.418224892122853, + "grad_norm": 0.2209833264350891, + "learning_rate": 0.001, + "loss": 2.6001, + "step": 9886 + }, + { + "epoch": 0.41826719688636943, + "grad_norm": 5.273458480834961, + "learning_rate": 0.001, + "loss": 2.443, + "step": 9887 + }, + { + "epoch": 0.4183095016498858, + "grad_norm": 0.18473027646541595, + "learning_rate": 0.001, + "loss": 1.6151, + "step": 9888 + }, + { + "epoch": 0.41835180641340214, + "grad_norm": 3.541088342666626, + "learning_rate": 0.001, + "loss": 2.5512, + "step": 9889 + }, + { + "epoch": 0.41839411117691855, + "grad_norm": 0.17209482192993164, + "learning_rate": 0.001, + "loss": 2.3301, + "step": 9890 + }, + { + "epoch": 0.4184364159404349, + "grad_norm": 0.2227262258529663, + "learning_rate": 0.001, + "loss": 2.3766, + "step": 9891 + }, + { + "epoch": 0.41847872070395126, + "grad_norm": 0.17431975901126862, + "learning_rate": 0.001, + "loss": 1.6998, + "step": 9892 + }, + { + "epoch": 0.4185210254674676, + "grad_norm": 0.3029531240463257, + "learning_rate": 0.001, + "loss": 2.4269, + "step": 9893 + }, + { + "epoch": 0.418563330230984, + "grad_norm": 0.22647684812545776, + "learning_rate": 0.001, + "loss": 1.8381, + "step": 9894 + }, + { + "epoch": 0.4186056349945004, + "grad_norm": 0.21035243570804596, + "learning_rate": 0.001, + "loss": 3.3303, + "step": 9895 + }, + { + "epoch": 0.41864793975801673, + "grad_norm": 1.0283381938934326, + "learning_rate": 0.001, + "loss": 2.258, + "step": 9896 + }, + { + "epoch": 0.41869024452153314, + "grad_norm": 1.2793562412261963, + "learning_rate": 0.001, + "loss": 3.1089, + "step": 9897 + }, + { + "epoch": 0.4187325492850495, + "grad_norm": 0.21786046028137207, + "learning_rate": 0.001, + "loss": 1.7205, + "step": 9898 + }, + { + "epoch": 0.41877485404856585, + "grad_norm": 0.16256938874721527, + "learning_rate": 0.001, + "loss": 2.8039, + "step": 9899 + }, + { + "epoch": 0.41881715881208226, + "grad_norm": 0.4168480634689331, + "learning_rate": 0.001, + "loss": 2.3668, + "step": 9900 + }, + { + "epoch": 0.4188594635755986, + "grad_norm": 0.1868831068277359, + "learning_rate": 0.001, + "loss": 2.6872, + "step": 9901 + }, + { + "epoch": 0.41890176833911497, + "grad_norm": 2.076476573944092, + "learning_rate": 0.001, + "loss": 2.4575, + "step": 9902 + }, + { + "epoch": 0.4189440731026314, + "grad_norm": 0.22399549186229706, + "learning_rate": 0.001, + "loss": 2.7196, + "step": 9903 + }, + { + "epoch": 0.41898637786614773, + "grad_norm": 0.24914605915546417, + "learning_rate": 0.001, + "loss": 1.9792, + "step": 9904 + }, + { + "epoch": 0.4190286826296641, + "grad_norm": 0.265962690114975, + "learning_rate": 0.001, + "loss": 1.5826, + "step": 9905 + }, + { + "epoch": 0.4190709873931805, + "grad_norm": 0.20687235891819, + "learning_rate": 0.001, + "loss": 2.7606, + "step": 9906 + }, + { + "epoch": 0.41911329215669685, + "grad_norm": 0.18030163645744324, + "learning_rate": 0.001, + "loss": 1.7922, + "step": 9907 + }, + { + "epoch": 0.4191555969202132, + "grad_norm": 0.17712494730949402, + "learning_rate": 0.001, + "loss": 1.9416, + "step": 9908 + }, + { + "epoch": 0.4191979016837296, + "grad_norm": 0.2647468149662018, + "learning_rate": 0.001, + "loss": 1.6112, + "step": 9909 + }, + { + "epoch": 0.41924020644724597, + "grad_norm": 10.648910522460938, + "learning_rate": 0.001, + "loss": 2.2037, + "step": 9910 + }, + { + "epoch": 0.4192825112107623, + "grad_norm": 0.22634510695934296, + "learning_rate": 0.001, + "loss": 1.8658, + "step": 9911 + }, + { + "epoch": 0.41932481597427873, + "grad_norm": 0.19005584716796875, + "learning_rate": 0.001, + "loss": 1.784, + "step": 9912 + }, + { + "epoch": 0.4193671207377951, + "grad_norm": 11.0584077835083, + "learning_rate": 0.001, + "loss": 2.403, + "step": 9913 + }, + { + "epoch": 0.41940942550131144, + "grad_norm": 0.18908476829528809, + "learning_rate": 0.001, + "loss": 2.2306, + "step": 9914 + }, + { + "epoch": 0.4194517302648278, + "grad_norm": 0.196416437625885, + "learning_rate": 0.001, + "loss": 2.8057, + "step": 9915 + }, + { + "epoch": 0.4194940350283442, + "grad_norm": 0.37092915177345276, + "learning_rate": 0.001, + "loss": 3.7798, + "step": 9916 + }, + { + "epoch": 0.41953633979186056, + "grad_norm": 0.1975582391023636, + "learning_rate": 0.001, + "loss": 2.7689, + "step": 9917 + }, + { + "epoch": 0.4195786445553769, + "grad_norm": 16.982938766479492, + "learning_rate": 0.001, + "loss": 3.2579, + "step": 9918 + }, + { + "epoch": 0.4196209493188933, + "grad_norm": 0.22323065996170044, + "learning_rate": 0.001, + "loss": 2.1371, + "step": 9919 + }, + { + "epoch": 0.4196632540824097, + "grad_norm": 0.3615620732307434, + "learning_rate": 0.001, + "loss": 1.9409, + "step": 9920 + }, + { + "epoch": 0.41970555884592603, + "grad_norm": 0.7461709976196289, + "learning_rate": 0.001, + "loss": 2.4982, + "step": 9921 + }, + { + "epoch": 0.41974786360944244, + "grad_norm": 0.20795656740665436, + "learning_rate": 0.001, + "loss": 2.3513, + "step": 9922 + }, + { + "epoch": 0.4197901683729588, + "grad_norm": 0.19558289647102356, + "learning_rate": 0.001, + "loss": 2.7092, + "step": 9923 + }, + { + "epoch": 0.41983247313647515, + "grad_norm": 0.2396133542060852, + "learning_rate": 0.001, + "loss": 2.6774, + "step": 9924 + }, + { + "epoch": 0.41987477789999156, + "grad_norm": 0.2055635303258896, + "learning_rate": 0.001, + "loss": 3.1715, + "step": 9925 + }, + { + "epoch": 0.4199170826635079, + "grad_norm": 0.28740987181663513, + "learning_rate": 0.001, + "loss": 2.0135, + "step": 9926 + }, + { + "epoch": 0.41995938742702427, + "grad_norm": 3.5585386753082275, + "learning_rate": 0.001, + "loss": 2.0134, + "step": 9927 + }, + { + "epoch": 0.4200016921905407, + "grad_norm": 0.19014716148376465, + "learning_rate": 0.001, + "loss": 3.6427, + "step": 9928 + }, + { + "epoch": 0.42004399695405703, + "grad_norm": 0.32993394136428833, + "learning_rate": 0.001, + "loss": 1.8829, + "step": 9929 + }, + { + "epoch": 0.4200863017175734, + "grad_norm": 0.22422336041927338, + "learning_rate": 0.001, + "loss": 2.4582, + "step": 9930 + }, + { + "epoch": 0.4201286064810898, + "grad_norm": 1.323824167251587, + "learning_rate": 0.001, + "loss": 3.1731, + "step": 9931 + }, + { + "epoch": 0.42017091124460615, + "grad_norm": 0.17324510216712952, + "learning_rate": 0.001, + "loss": 1.8609, + "step": 9932 + }, + { + "epoch": 0.4202132160081225, + "grad_norm": 1.34519624710083, + "learning_rate": 0.001, + "loss": 2.1091, + "step": 9933 + }, + { + "epoch": 0.4202555207716389, + "grad_norm": 2.9261980056762695, + "learning_rate": 0.001, + "loss": 2.0931, + "step": 9934 + }, + { + "epoch": 0.42029782553515527, + "grad_norm": 0.26872578263282776, + "learning_rate": 0.001, + "loss": 2.4059, + "step": 9935 + }, + { + "epoch": 0.4203401302986716, + "grad_norm": 10.175525665283203, + "learning_rate": 0.001, + "loss": 2.2073, + "step": 9936 + }, + { + "epoch": 0.420382435062188, + "grad_norm": 0.20974968373775482, + "learning_rate": 0.001, + "loss": 2.474, + "step": 9937 + }, + { + "epoch": 0.4204247398257044, + "grad_norm": 2.2708218097686768, + "learning_rate": 0.001, + "loss": 2.441, + "step": 9938 + }, + { + "epoch": 0.42046704458922074, + "grad_norm": 0.1942100077867508, + "learning_rate": 0.001, + "loss": 2.4447, + "step": 9939 + }, + { + "epoch": 0.4205093493527371, + "grad_norm": 0.24804472923278809, + "learning_rate": 0.001, + "loss": 2.365, + "step": 9940 + }, + { + "epoch": 0.4205516541162535, + "grad_norm": 0.21028558909893036, + "learning_rate": 0.001, + "loss": 1.9267, + "step": 9941 + }, + { + "epoch": 0.42059395887976986, + "grad_norm": 0.25556495785713196, + "learning_rate": 0.001, + "loss": 2.5246, + "step": 9942 + }, + { + "epoch": 0.4206362636432862, + "grad_norm": 0.6381620764732361, + "learning_rate": 0.001, + "loss": 2.455, + "step": 9943 + }, + { + "epoch": 0.4206785684068026, + "grad_norm": 0.20870067179203033, + "learning_rate": 0.001, + "loss": 1.8444, + "step": 9944 + }, + { + "epoch": 0.420720873170319, + "grad_norm": 0.4335130751132965, + "learning_rate": 0.001, + "loss": 1.7678, + "step": 9945 + }, + { + "epoch": 0.42076317793383533, + "grad_norm": 0.20234449207782745, + "learning_rate": 0.001, + "loss": 1.6857, + "step": 9946 + }, + { + "epoch": 0.42080548269735174, + "grad_norm": 0.21633018553256989, + "learning_rate": 0.001, + "loss": 2.6976, + "step": 9947 + }, + { + "epoch": 0.4208477874608681, + "grad_norm": 0.20859219133853912, + "learning_rate": 0.001, + "loss": 1.9807, + "step": 9948 + }, + { + "epoch": 0.42089009222438445, + "grad_norm": 0.19663840532302856, + "learning_rate": 0.001, + "loss": 2.1092, + "step": 9949 + }, + { + "epoch": 0.42093239698790086, + "grad_norm": 0.24110959470272064, + "learning_rate": 0.001, + "loss": 2.0922, + "step": 9950 + }, + { + "epoch": 0.4209747017514172, + "grad_norm": 0.18923258781433105, + "learning_rate": 0.001, + "loss": 1.8416, + "step": 9951 + }, + { + "epoch": 0.42101700651493357, + "grad_norm": 0.16037270426750183, + "learning_rate": 0.001, + "loss": 2.9326, + "step": 9952 + }, + { + "epoch": 0.42105931127845, + "grad_norm": 0.15264320373535156, + "learning_rate": 0.001, + "loss": 1.6577, + "step": 9953 + }, + { + "epoch": 0.42110161604196633, + "grad_norm": 4.530941486358643, + "learning_rate": 0.001, + "loss": 2.5381, + "step": 9954 + }, + { + "epoch": 0.4211439208054827, + "grad_norm": 0.17232206463813782, + "learning_rate": 0.001, + "loss": 3.0177, + "step": 9955 + }, + { + "epoch": 0.4211862255689991, + "grad_norm": 0.18628808856010437, + "learning_rate": 0.001, + "loss": 2.0065, + "step": 9956 + }, + { + "epoch": 0.42122853033251545, + "grad_norm": 0.1941269338130951, + "learning_rate": 0.001, + "loss": 3.6867, + "step": 9957 + }, + { + "epoch": 0.4212708350960318, + "grad_norm": 0.18407343327999115, + "learning_rate": 0.001, + "loss": 2.8404, + "step": 9958 + }, + { + "epoch": 0.42131313985954816, + "grad_norm": 0.27422475814819336, + "learning_rate": 0.001, + "loss": 1.9193, + "step": 9959 + }, + { + "epoch": 0.42135544462306457, + "grad_norm": 0.3834380805492401, + "learning_rate": 0.001, + "loss": 1.587, + "step": 9960 + }, + { + "epoch": 0.4213977493865809, + "grad_norm": 0.28580230474472046, + "learning_rate": 0.001, + "loss": 2.604, + "step": 9961 + }, + { + "epoch": 0.4214400541500973, + "grad_norm": 0.7395582795143127, + "learning_rate": 0.001, + "loss": 2.5038, + "step": 9962 + }, + { + "epoch": 0.4214823589136137, + "grad_norm": 0.19765476882457733, + "learning_rate": 0.001, + "loss": 2.0077, + "step": 9963 + }, + { + "epoch": 0.42152466367713004, + "grad_norm": 0.2048027217388153, + "learning_rate": 0.001, + "loss": 2.1928, + "step": 9964 + }, + { + "epoch": 0.4215669684406464, + "grad_norm": 0.1922098696231842, + "learning_rate": 0.001, + "loss": 1.5701, + "step": 9965 + }, + { + "epoch": 0.4216092732041628, + "grad_norm": 0.4606640040874481, + "learning_rate": 0.001, + "loss": 1.8854, + "step": 9966 + }, + { + "epoch": 0.42165157796767916, + "grad_norm": 0.17350491881370544, + "learning_rate": 0.001, + "loss": 3.4361, + "step": 9967 + }, + { + "epoch": 0.4216938827311955, + "grad_norm": 0.17392922937870026, + "learning_rate": 0.001, + "loss": 1.8771, + "step": 9968 + }, + { + "epoch": 0.4217361874947119, + "grad_norm": 0.25780975818634033, + "learning_rate": 0.001, + "loss": 2.3047, + "step": 9969 + }, + { + "epoch": 0.4217784922582283, + "grad_norm": 0.22652190923690796, + "learning_rate": 0.001, + "loss": 3.8245, + "step": 9970 + }, + { + "epoch": 0.42182079702174463, + "grad_norm": 0.1851012408733368, + "learning_rate": 0.001, + "loss": 2.5209, + "step": 9971 + }, + { + "epoch": 0.42186310178526104, + "grad_norm": 0.6701632738113403, + "learning_rate": 0.001, + "loss": 2.6308, + "step": 9972 + }, + { + "epoch": 0.4219054065487774, + "grad_norm": 0.19841192662715912, + "learning_rate": 0.001, + "loss": 3.4615, + "step": 9973 + }, + { + "epoch": 0.42194771131229375, + "grad_norm": 0.18321669101715088, + "learning_rate": 0.001, + "loss": 2.0243, + "step": 9974 + }, + { + "epoch": 0.42199001607581016, + "grad_norm": 0.1863401234149933, + "learning_rate": 0.001, + "loss": 2.2838, + "step": 9975 + }, + { + "epoch": 0.4220323208393265, + "grad_norm": 0.5115291476249695, + "learning_rate": 0.001, + "loss": 2.6378, + "step": 9976 + }, + { + "epoch": 0.42207462560284287, + "grad_norm": 0.20143838226795197, + "learning_rate": 0.001, + "loss": 2.8185, + "step": 9977 + }, + { + "epoch": 0.4221169303663593, + "grad_norm": 0.7708411812782288, + "learning_rate": 0.001, + "loss": 2.9274, + "step": 9978 + }, + { + "epoch": 0.42215923512987563, + "grad_norm": 0.18255001306533813, + "learning_rate": 0.001, + "loss": 2.3446, + "step": 9979 + }, + { + "epoch": 0.422201539893392, + "grad_norm": 2.8448588848114014, + "learning_rate": 0.001, + "loss": 2.0094, + "step": 9980 + }, + { + "epoch": 0.42224384465690834, + "grad_norm": 0.2241508811712265, + "learning_rate": 0.001, + "loss": 2.4382, + "step": 9981 + }, + { + "epoch": 0.42228614942042475, + "grad_norm": 4.693768501281738, + "learning_rate": 0.001, + "loss": 2.416, + "step": 9982 + }, + { + "epoch": 0.4223284541839411, + "grad_norm": 0.18450792133808136, + "learning_rate": 0.001, + "loss": 1.5582, + "step": 9983 + }, + { + "epoch": 0.42237075894745746, + "grad_norm": 0.21260473132133484, + "learning_rate": 0.001, + "loss": 1.7709, + "step": 9984 + }, + { + "epoch": 0.42241306371097387, + "grad_norm": 0.23331928253173828, + "learning_rate": 0.001, + "loss": 2.4449, + "step": 9985 + }, + { + "epoch": 0.4224553684744902, + "grad_norm": 0.38122308254241943, + "learning_rate": 0.001, + "loss": 2.4113, + "step": 9986 + }, + { + "epoch": 0.4224976732380066, + "grad_norm": 0.16446325182914734, + "learning_rate": 0.001, + "loss": 2.0169, + "step": 9987 + }, + { + "epoch": 0.422539978001523, + "grad_norm": 0.1864539235830307, + "learning_rate": 0.001, + "loss": 2.2757, + "step": 9988 + }, + { + "epoch": 0.42258228276503934, + "grad_norm": 0.208041250705719, + "learning_rate": 0.001, + "loss": 1.9249, + "step": 9989 + }, + { + "epoch": 0.4226245875285557, + "grad_norm": 0.6653062105178833, + "learning_rate": 0.001, + "loss": 2.7106, + "step": 9990 + }, + { + "epoch": 0.4226668922920721, + "grad_norm": 0.2309500128030777, + "learning_rate": 0.001, + "loss": 2.502, + "step": 9991 + }, + { + "epoch": 0.42270919705558846, + "grad_norm": 8.24671745300293, + "learning_rate": 0.001, + "loss": 1.6391, + "step": 9992 + }, + { + "epoch": 0.4227515018191048, + "grad_norm": 0.389308363199234, + "learning_rate": 0.001, + "loss": 1.9749, + "step": 9993 + }, + { + "epoch": 0.4227938065826212, + "grad_norm": 0.2275291532278061, + "learning_rate": 0.001, + "loss": 3.4856, + "step": 9994 + }, + { + "epoch": 0.4228361113461376, + "grad_norm": 0.8399461507797241, + "learning_rate": 0.001, + "loss": 3.256, + "step": 9995 + }, + { + "epoch": 0.42287841610965393, + "grad_norm": 0.9205453395843506, + "learning_rate": 0.001, + "loss": 2.8834, + "step": 9996 + }, + { + "epoch": 0.42292072087317034, + "grad_norm": 20.110586166381836, + "learning_rate": 0.001, + "loss": 3.0527, + "step": 9997 + }, + { + "epoch": 0.4229630256366867, + "grad_norm": 0.2495739907026291, + "learning_rate": 0.001, + "loss": 2.4493, + "step": 9998 + }, + { + "epoch": 0.42300533040020305, + "grad_norm": 16.760589599609375, + "learning_rate": 0.001, + "loss": 3.6418, + "step": 9999 + }, + { + "epoch": 0.42304763516371946, + "grad_norm": 0.20367243885993958, + "learning_rate": 0.001, + "loss": 2.063, + "step": 10000 + }, + { + "epoch": 0.4230899399272358, + "grad_norm": 0.2168806493282318, + "learning_rate": 0.001, + "loss": 2.816, + "step": 10001 + }, + { + "epoch": 0.42313224469075217, + "grad_norm": 1.293717384338379, + "learning_rate": 0.001, + "loss": 2.3445, + "step": 10002 + }, + { + "epoch": 0.4231745494542686, + "grad_norm": 139.1713409423828, + "learning_rate": 0.001, + "loss": 3.2346, + "step": 10003 + }, + { + "epoch": 0.42321685421778493, + "grad_norm": 0.39124611020088196, + "learning_rate": 0.001, + "loss": 2.108, + "step": 10004 + }, + { + "epoch": 0.4232591589813013, + "grad_norm": 0.9725698828697205, + "learning_rate": 0.001, + "loss": 2.4943, + "step": 10005 + }, + { + "epoch": 0.42330146374481764, + "grad_norm": 2.8331100940704346, + "learning_rate": 0.001, + "loss": 2.6655, + "step": 10006 + }, + { + "epoch": 0.42334376850833405, + "grad_norm": 0.20734193921089172, + "learning_rate": 0.001, + "loss": 2.3108, + "step": 10007 + }, + { + "epoch": 0.4233860732718504, + "grad_norm": 1.9373948574066162, + "learning_rate": 0.001, + "loss": 1.867, + "step": 10008 + }, + { + "epoch": 0.42342837803536676, + "grad_norm": 1.3012683391571045, + "learning_rate": 0.001, + "loss": 2.6328, + "step": 10009 + }, + { + "epoch": 0.42347068279888317, + "grad_norm": 0.838748037815094, + "learning_rate": 0.001, + "loss": 2.1099, + "step": 10010 + }, + { + "epoch": 0.4235129875623995, + "grad_norm": 0.27665624022483826, + "learning_rate": 0.001, + "loss": 2.0056, + "step": 10011 + }, + { + "epoch": 0.4235552923259159, + "grad_norm": 0.2420514076948166, + "learning_rate": 0.001, + "loss": 1.8592, + "step": 10012 + }, + { + "epoch": 0.4235975970894323, + "grad_norm": 38.479976654052734, + "learning_rate": 0.001, + "loss": 1.7052, + "step": 10013 + }, + { + "epoch": 0.42363990185294864, + "grad_norm": 0.297911673784256, + "learning_rate": 0.001, + "loss": 2.694, + "step": 10014 + }, + { + "epoch": 0.423682206616465, + "grad_norm": 0.2878136932849884, + "learning_rate": 0.001, + "loss": 2.3265, + "step": 10015 + }, + { + "epoch": 0.4237245113799814, + "grad_norm": 0.3122667968273163, + "learning_rate": 0.001, + "loss": 3.1903, + "step": 10016 + }, + { + "epoch": 0.42376681614349776, + "grad_norm": 6.599530220031738, + "learning_rate": 0.001, + "loss": 3.0154, + "step": 10017 + }, + { + "epoch": 0.4238091209070141, + "grad_norm": 0.2963060438632965, + "learning_rate": 0.001, + "loss": 2.9263, + "step": 10018 + }, + { + "epoch": 0.4238514256705305, + "grad_norm": 0.3066433370113373, + "learning_rate": 0.001, + "loss": 2.8319, + "step": 10019 + }, + { + "epoch": 0.4238937304340469, + "grad_norm": 0.5115453004837036, + "learning_rate": 0.001, + "loss": 2.9406, + "step": 10020 + }, + { + "epoch": 0.42393603519756323, + "grad_norm": 0.24337489902973175, + "learning_rate": 0.001, + "loss": 2.2676, + "step": 10021 + }, + { + "epoch": 0.42397833996107964, + "grad_norm": 0.20824651420116425, + "learning_rate": 0.001, + "loss": 3.0662, + "step": 10022 + }, + { + "epoch": 0.424020644724596, + "grad_norm": 0.22388121485710144, + "learning_rate": 0.001, + "loss": 2.5818, + "step": 10023 + }, + { + "epoch": 0.42406294948811235, + "grad_norm": 0.29800713062286377, + "learning_rate": 0.001, + "loss": 3.4957, + "step": 10024 + }, + { + "epoch": 0.42410525425162876, + "grad_norm": 0.23895491659641266, + "learning_rate": 0.001, + "loss": 2.0719, + "step": 10025 + }, + { + "epoch": 0.4241475590151451, + "grad_norm": 0.21432499587535858, + "learning_rate": 0.001, + "loss": 2.6525, + "step": 10026 + }, + { + "epoch": 0.42418986377866147, + "grad_norm": 0.22099368274211884, + "learning_rate": 0.001, + "loss": 2.1917, + "step": 10027 + }, + { + "epoch": 0.4242321685421778, + "grad_norm": 0.18679919838905334, + "learning_rate": 0.001, + "loss": 2.411, + "step": 10028 + }, + { + "epoch": 0.42427447330569423, + "grad_norm": 3.992774486541748, + "learning_rate": 0.001, + "loss": 2.1948, + "step": 10029 + }, + { + "epoch": 0.4243167780692106, + "grad_norm": 0.29614201188087463, + "learning_rate": 0.001, + "loss": 2.3378, + "step": 10030 + }, + { + "epoch": 0.42435908283272694, + "grad_norm": 0.2120267003774643, + "learning_rate": 0.001, + "loss": 1.9312, + "step": 10031 + }, + { + "epoch": 0.42440138759624335, + "grad_norm": 0.39450037479400635, + "learning_rate": 0.001, + "loss": 2.7093, + "step": 10032 + }, + { + "epoch": 0.4244436923597597, + "grad_norm": 0.28339096903800964, + "learning_rate": 0.001, + "loss": 2.5644, + "step": 10033 + }, + { + "epoch": 0.42448599712327606, + "grad_norm": 0.24640879034996033, + "learning_rate": 0.001, + "loss": 2.3985, + "step": 10034 + }, + { + "epoch": 0.42452830188679247, + "grad_norm": 4.143669128417969, + "learning_rate": 0.001, + "loss": 3.2367, + "step": 10035 + }, + { + "epoch": 0.4245706066503088, + "grad_norm": 0.21093952655792236, + "learning_rate": 0.001, + "loss": 2.113, + "step": 10036 + }, + { + "epoch": 0.4246129114138252, + "grad_norm": 0.17772096395492554, + "learning_rate": 0.001, + "loss": 1.6009, + "step": 10037 + }, + { + "epoch": 0.4246552161773416, + "grad_norm": 2.9753193855285645, + "learning_rate": 0.001, + "loss": 2.4802, + "step": 10038 + }, + { + "epoch": 0.42469752094085794, + "grad_norm": 0.23098500072956085, + "learning_rate": 0.001, + "loss": 2.2026, + "step": 10039 + }, + { + "epoch": 0.4247398257043743, + "grad_norm": 4.376840114593506, + "learning_rate": 0.001, + "loss": 2.04, + "step": 10040 + }, + { + "epoch": 0.4247821304678907, + "grad_norm": 1.3142906427383423, + "learning_rate": 0.001, + "loss": 2.0858, + "step": 10041 + }, + { + "epoch": 0.42482443523140706, + "grad_norm": 0.25867971777915955, + "learning_rate": 0.001, + "loss": 2.6186, + "step": 10042 + }, + { + "epoch": 0.4248667399949234, + "grad_norm": 0.8242712616920471, + "learning_rate": 0.001, + "loss": 2.2378, + "step": 10043 + }, + { + "epoch": 0.4249090447584398, + "grad_norm": 0.21803343296051025, + "learning_rate": 0.001, + "loss": 3.163, + "step": 10044 + }, + { + "epoch": 0.4249513495219562, + "grad_norm": 2.3811676502227783, + "learning_rate": 0.001, + "loss": 3.5696, + "step": 10045 + }, + { + "epoch": 0.42499365428547253, + "grad_norm": 0.5477252006530762, + "learning_rate": 0.001, + "loss": 2.5047, + "step": 10046 + }, + { + "epoch": 0.42503595904898894, + "grad_norm": 0.2212408185005188, + "learning_rate": 0.001, + "loss": 2.2622, + "step": 10047 + }, + { + "epoch": 0.4250782638125053, + "grad_norm": 0.40021783113479614, + "learning_rate": 0.001, + "loss": 2.4889, + "step": 10048 + }, + { + "epoch": 0.42512056857602165, + "grad_norm": 0.24213816225528717, + "learning_rate": 0.001, + "loss": 2.1517, + "step": 10049 + }, + { + "epoch": 0.425162873339538, + "grad_norm": 0.3857748210430145, + "learning_rate": 0.001, + "loss": 3.2273, + "step": 10050 + }, + { + "epoch": 0.4252051781030544, + "grad_norm": 4.682729721069336, + "learning_rate": 0.001, + "loss": 2.5041, + "step": 10051 + }, + { + "epoch": 0.42524748286657077, + "grad_norm": 0.7291063070297241, + "learning_rate": 0.001, + "loss": 2.3562, + "step": 10052 + }, + { + "epoch": 0.4252897876300871, + "grad_norm": 1.8503894805908203, + "learning_rate": 0.001, + "loss": 2.678, + "step": 10053 + }, + { + "epoch": 0.42533209239360353, + "grad_norm": 0.2773977220058441, + "learning_rate": 0.001, + "loss": 2.6154, + "step": 10054 + }, + { + "epoch": 0.4253743971571199, + "grad_norm": 0.2241569608449936, + "learning_rate": 0.001, + "loss": 2.2345, + "step": 10055 + }, + { + "epoch": 0.42541670192063624, + "grad_norm": 0.5478442311286926, + "learning_rate": 0.001, + "loss": 3.3277, + "step": 10056 + }, + { + "epoch": 0.42545900668415265, + "grad_norm": 2.7203447818756104, + "learning_rate": 0.001, + "loss": 3.8307, + "step": 10057 + }, + { + "epoch": 0.425501311447669, + "grad_norm": 0.9797614216804504, + "learning_rate": 0.001, + "loss": 2.2508, + "step": 10058 + }, + { + "epoch": 0.42554361621118536, + "grad_norm": 0.4571799039840698, + "learning_rate": 0.001, + "loss": 2.9476, + "step": 10059 + }, + { + "epoch": 0.42558592097470177, + "grad_norm": 0.34742364287376404, + "learning_rate": 0.001, + "loss": 2.4439, + "step": 10060 + }, + { + "epoch": 0.4256282257382181, + "grad_norm": 0.3077751100063324, + "learning_rate": 0.001, + "loss": 2.7386, + "step": 10061 + }, + { + "epoch": 0.4256705305017345, + "grad_norm": 0.34680673480033875, + "learning_rate": 0.001, + "loss": 2.7423, + "step": 10062 + }, + { + "epoch": 0.4257128352652509, + "grad_norm": 0.5962140560150146, + "learning_rate": 0.001, + "loss": 3.1924, + "step": 10063 + }, + { + "epoch": 0.42575514002876724, + "grad_norm": 0.4237496554851532, + "learning_rate": 0.001, + "loss": 2.7427, + "step": 10064 + }, + { + "epoch": 0.4257974447922836, + "grad_norm": 0.3010278046131134, + "learning_rate": 0.001, + "loss": 1.9587, + "step": 10065 + }, + { + "epoch": 0.4258397495558, + "grad_norm": 0.9290482997894287, + "learning_rate": 0.001, + "loss": 2.3115, + "step": 10066 + }, + { + "epoch": 0.42588205431931636, + "grad_norm": 0.18896250426769257, + "learning_rate": 0.001, + "loss": 1.8756, + "step": 10067 + }, + { + "epoch": 0.4259243590828327, + "grad_norm": 0.7285879254341125, + "learning_rate": 0.001, + "loss": 2.1316, + "step": 10068 + }, + { + "epoch": 0.4259666638463491, + "grad_norm": 0.1963866502046585, + "learning_rate": 0.001, + "loss": 2.351, + "step": 10069 + }, + { + "epoch": 0.4260089686098655, + "grad_norm": 5.0258283615112305, + "learning_rate": 0.001, + "loss": 2.4576, + "step": 10070 + }, + { + "epoch": 0.42605127337338183, + "grad_norm": 0.7055137157440186, + "learning_rate": 0.001, + "loss": 2.9171, + "step": 10071 + }, + { + "epoch": 0.4260935781368982, + "grad_norm": 0.2101747691631317, + "learning_rate": 0.001, + "loss": 1.9525, + "step": 10072 + }, + { + "epoch": 0.4261358829004146, + "grad_norm": 0.2830001711845398, + "learning_rate": 0.001, + "loss": 2.4924, + "step": 10073 + }, + { + "epoch": 0.42617818766393095, + "grad_norm": 0.2799762189388275, + "learning_rate": 0.001, + "loss": 3.067, + "step": 10074 + }, + { + "epoch": 0.4262204924274473, + "grad_norm": 0.2741358280181885, + "learning_rate": 0.001, + "loss": 2.3429, + "step": 10075 + }, + { + "epoch": 0.4262627971909637, + "grad_norm": 0.5189446210861206, + "learning_rate": 0.001, + "loss": 3.3767, + "step": 10076 + }, + { + "epoch": 0.42630510195448007, + "grad_norm": 0.2622857391834259, + "learning_rate": 0.001, + "loss": 2.2139, + "step": 10077 + }, + { + "epoch": 0.4263474067179964, + "grad_norm": 0.25797808170318604, + "learning_rate": 0.001, + "loss": 2.4526, + "step": 10078 + }, + { + "epoch": 0.42638971148151283, + "grad_norm": 0.20280535519123077, + "learning_rate": 0.001, + "loss": 2.2921, + "step": 10079 + }, + { + "epoch": 0.4264320162450292, + "grad_norm": 0.18795685470104218, + "learning_rate": 0.001, + "loss": 2.6622, + "step": 10080 + }, + { + "epoch": 0.42647432100854554, + "grad_norm": 1.9836920499801636, + "learning_rate": 0.001, + "loss": 3.0886, + "step": 10081 + }, + { + "epoch": 0.42651662577206195, + "grad_norm": 0.41229698061943054, + "learning_rate": 0.001, + "loss": 2.5115, + "step": 10082 + }, + { + "epoch": 0.4265589305355783, + "grad_norm": 0.17537228763103485, + "learning_rate": 0.001, + "loss": 2.1755, + "step": 10083 + }, + { + "epoch": 0.42660123529909466, + "grad_norm": 0.2055114507675171, + "learning_rate": 0.001, + "loss": 2.0749, + "step": 10084 + }, + { + "epoch": 0.42664354006261107, + "grad_norm": 0.7603585720062256, + "learning_rate": 0.001, + "loss": 3.0483, + "step": 10085 + }, + { + "epoch": 0.4266858448261274, + "grad_norm": 0.15949027240276337, + "learning_rate": 0.001, + "loss": 2.3814, + "step": 10086 + }, + { + "epoch": 0.4267281495896438, + "grad_norm": 0.18160079419612885, + "learning_rate": 0.001, + "loss": 2.764, + "step": 10087 + }, + { + "epoch": 0.4267704543531602, + "grad_norm": 1.1477245092391968, + "learning_rate": 0.001, + "loss": 2.4743, + "step": 10088 + }, + { + "epoch": 0.42681275911667654, + "grad_norm": 0.2228129804134369, + "learning_rate": 0.001, + "loss": 3.238, + "step": 10089 + }, + { + "epoch": 0.4268550638801929, + "grad_norm": 0.7572936415672302, + "learning_rate": 0.001, + "loss": 2.4797, + "step": 10090 + }, + { + "epoch": 0.4268973686437093, + "grad_norm": 0.1875317394733429, + "learning_rate": 0.001, + "loss": 2.2772, + "step": 10091 + }, + { + "epoch": 0.42693967340722566, + "grad_norm": 0.16233602166175842, + "learning_rate": 0.001, + "loss": 1.489, + "step": 10092 + }, + { + "epoch": 0.426981978170742, + "grad_norm": 0.15060947835445404, + "learning_rate": 0.001, + "loss": 1.9052, + "step": 10093 + }, + { + "epoch": 0.42702428293425837, + "grad_norm": 0.28614911437034607, + "learning_rate": 0.001, + "loss": 2.401, + "step": 10094 + }, + { + "epoch": 0.4270665876977748, + "grad_norm": 0.1590694785118103, + "learning_rate": 0.001, + "loss": 2.4475, + "step": 10095 + }, + { + "epoch": 0.42710889246129113, + "grad_norm": 0.2464892566204071, + "learning_rate": 0.001, + "loss": 2.7287, + "step": 10096 + }, + { + "epoch": 0.4271511972248075, + "grad_norm": 0.25172311067581177, + "learning_rate": 0.001, + "loss": 2.6355, + "step": 10097 + }, + { + "epoch": 0.4271935019883239, + "grad_norm": 0.19006308913230896, + "learning_rate": 0.001, + "loss": 1.8972, + "step": 10098 + }, + { + "epoch": 0.42723580675184025, + "grad_norm": 0.17476560175418854, + "learning_rate": 0.001, + "loss": 2.0524, + "step": 10099 + }, + { + "epoch": 0.4272781115153566, + "grad_norm": 0.1643456220626831, + "learning_rate": 0.001, + "loss": 1.7454, + "step": 10100 + }, + { + "epoch": 0.427320416278873, + "grad_norm": 0.1704128235578537, + "learning_rate": 0.001, + "loss": 1.8189, + "step": 10101 + }, + { + "epoch": 0.42736272104238937, + "grad_norm": 0.16339196264743805, + "learning_rate": 0.001, + "loss": 2.1309, + "step": 10102 + }, + { + "epoch": 0.4274050258059057, + "grad_norm": 0.27944785356521606, + "learning_rate": 0.001, + "loss": 2.072, + "step": 10103 + }, + { + "epoch": 0.42744733056942213, + "grad_norm": 0.8265528082847595, + "learning_rate": 0.001, + "loss": 2.8271, + "step": 10104 + }, + { + "epoch": 0.4274896353329385, + "grad_norm": 0.16367796063423157, + "learning_rate": 0.001, + "loss": 2.0271, + "step": 10105 + }, + { + "epoch": 0.42753194009645484, + "grad_norm": 0.17860738933086395, + "learning_rate": 0.001, + "loss": 1.4789, + "step": 10106 + }, + { + "epoch": 0.42757424485997125, + "grad_norm": 0.2383193075656891, + "learning_rate": 0.001, + "loss": 2.6022, + "step": 10107 + }, + { + "epoch": 0.4276165496234876, + "grad_norm": 0.9572615027427673, + "learning_rate": 0.001, + "loss": 3.0684, + "step": 10108 + }, + { + "epoch": 0.42765885438700396, + "grad_norm": 0.31203147768974304, + "learning_rate": 0.001, + "loss": 2.1978, + "step": 10109 + }, + { + "epoch": 0.42770115915052037, + "grad_norm": 0.2197791486978531, + "learning_rate": 0.001, + "loss": 2.5708, + "step": 10110 + }, + { + "epoch": 0.4277434639140367, + "grad_norm": 0.5677201151847839, + "learning_rate": 0.001, + "loss": 1.6285, + "step": 10111 + }, + { + "epoch": 0.4277857686775531, + "grad_norm": 0.18387927114963531, + "learning_rate": 0.001, + "loss": 1.9223, + "step": 10112 + }, + { + "epoch": 0.4278280734410695, + "grad_norm": 0.16864502429962158, + "learning_rate": 0.001, + "loss": 1.8276, + "step": 10113 + }, + { + "epoch": 0.42787037820458584, + "grad_norm": 0.23291771113872528, + "learning_rate": 0.001, + "loss": 3.0319, + "step": 10114 + }, + { + "epoch": 0.4279126829681022, + "grad_norm": 0.18459783494472504, + "learning_rate": 0.001, + "loss": 1.8791, + "step": 10115 + }, + { + "epoch": 0.4279549877316186, + "grad_norm": 1.1795158386230469, + "learning_rate": 0.001, + "loss": 2.6287, + "step": 10116 + }, + { + "epoch": 0.42799729249513496, + "grad_norm": 0.20951585471630096, + "learning_rate": 0.001, + "loss": 1.9405, + "step": 10117 + }, + { + "epoch": 0.4280395972586513, + "grad_norm": 0.31406983733177185, + "learning_rate": 0.001, + "loss": 3.4603, + "step": 10118 + }, + { + "epoch": 0.42808190202216767, + "grad_norm": 0.267755925655365, + "learning_rate": 0.001, + "loss": 3.2455, + "step": 10119 + }, + { + "epoch": 0.4281242067856841, + "grad_norm": 0.21892082691192627, + "learning_rate": 0.001, + "loss": 2.5571, + "step": 10120 + }, + { + "epoch": 0.42816651154920043, + "grad_norm": 0.17393141984939575, + "learning_rate": 0.001, + "loss": 1.6598, + "step": 10121 + }, + { + "epoch": 0.4282088163127168, + "grad_norm": 0.17320716381072998, + "learning_rate": 0.001, + "loss": 2.0091, + "step": 10122 + }, + { + "epoch": 0.4282511210762332, + "grad_norm": 0.1930369734764099, + "learning_rate": 0.001, + "loss": 1.7533, + "step": 10123 + }, + { + "epoch": 0.42829342583974955, + "grad_norm": 0.18624812364578247, + "learning_rate": 0.001, + "loss": 2.0146, + "step": 10124 + }, + { + "epoch": 0.4283357306032659, + "grad_norm": 0.7041241526603699, + "learning_rate": 0.001, + "loss": 2.8163, + "step": 10125 + }, + { + "epoch": 0.4283780353667823, + "grad_norm": 0.2105880081653595, + "learning_rate": 0.001, + "loss": 1.9341, + "step": 10126 + }, + { + "epoch": 0.42842034013029867, + "grad_norm": 0.1799003928899765, + "learning_rate": 0.001, + "loss": 2.2052, + "step": 10127 + }, + { + "epoch": 0.428462644893815, + "grad_norm": 1.5456318855285645, + "learning_rate": 0.001, + "loss": 1.8783, + "step": 10128 + }, + { + "epoch": 0.42850494965733144, + "grad_norm": 0.17844410240650177, + "learning_rate": 0.001, + "loss": 2.3216, + "step": 10129 + }, + { + "epoch": 0.4285472544208478, + "grad_norm": 0.16258619725704193, + "learning_rate": 0.001, + "loss": 2.56, + "step": 10130 + }, + { + "epoch": 0.42858955918436414, + "grad_norm": 0.18964774906635284, + "learning_rate": 0.001, + "loss": 1.9728, + "step": 10131 + }, + { + "epoch": 0.42863186394788055, + "grad_norm": 0.18737366795539856, + "learning_rate": 0.001, + "loss": 2.6866, + "step": 10132 + }, + { + "epoch": 0.4286741687113969, + "grad_norm": 0.19086334109306335, + "learning_rate": 0.001, + "loss": 2.0617, + "step": 10133 + }, + { + "epoch": 0.42871647347491326, + "grad_norm": 0.2464657872915268, + "learning_rate": 0.001, + "loss": 2.9955, + "step": 10134 + }, + { + "epoch": 0.42875877823842967, + "grad_norm": 0.3504067063331604, + "learning_rate": 0.001, + "loss": 2.5686, + "step": 10135 + }, + { + "epoch": 0.428801083001946, + "grad_norm": 0.17571739852428436, + "learning_rate": 0.001, + "loss": 2.1223, + "step": 10136 + }, + { + "epoch": 0.4288433877654624, + "grad_norm": 0.381346195936203, + "learning_rate": 0.001, + "loss": 1.5416, + "step": 10137 + }, + { + "epoch": 0.4288856925289788, + "grad_norm": 0.2737821936607361, + "learning_rate": 0.001, + "loss": 1.7345, + "step": 10138 + }, + { + "epoch": 0.42892799729249514, + "grad_norm": 0.17216061055660248, + "learning_rate": 0.001, + "loss": 1.6129, + "step": 10139 + }, + { + "epoch": 0.4289703020560115, + "grad_norm": 0.20674671232700348, + "learning_rate": 0.001, + "loss": 3.2899, + "step": 10140 + }, + { + "epoch": 0.42901260681952785, + "grad_norm": 1.2608942985534668, + "learning_rate": 0.001, + "loss": 1.8988, + "step": 10141 + }, + { + "epoch": 0.42905491158304426, + "grad_norm": 0.18615379929542542, + "learning_rate": 0.001, + "loss": 1.873, + "step": 10142 + }, + { + "epoch": 0.4290972163465606, + "grad_norm": 4.064700603485107, + "learning_rate": 0.001, + "loss": 2.7955, + "step": 10143 + }, + { + "epoch": 0.42913952111007697, + "grad_norm": 0.2223389595746994, + "learning_rate": 0.001, + "loss": 2.7764, + "step": 10144 + }, + { + "epoch": 0.4291818258735934, + "grad_norm": 0.4159873127937317, + "learning_rate": 0.001, + "loss": 3.8135, + "step": 10145 + }, + { + "epoch": 0.42922413063710974, + "grad_norm": 0.23633842170238495, + "learning_rate": 0.001, + "loss": 3.5594, + "step": 10146 + }, + { + "epoch": 0.4292664354006261, + "grad_norm": 1.0280625820159912, + "learning_rate": 0.001, + "loss": 1.7139, + "step": 10147 + }, + { + "epoch": 0.4293087401641425, + "grad_norm": 0.1797591596841812, + "learning_rate": 0.001, + "loss": 3.0655, + "step": 10148 + }, + { + "epoch": 0.42935104492765885, + "grad_norm": 0.4903676509857178, + "learning_rate": 0.001, + "loss": 1.8549, + "step": 10149 + }, + { + "epoch": 0.4293933496911752, + "grad_norm": 0.15862198173999786, + "learning_rate": 0.001, + "loss": 2.6685, + "step": 10150 + }, + { + "epoch": 0.4294356544546916, + "grad_norm": 0.22118091583251953, + "learning_rate": 0.001, + "loss": 2.2547, + "step": 10151 + }, + { + "epoch": 0.42947795921820797, + "grad_norm": 0.18271955847740173, + "learning_rate": 0.001, + "loss": 2.2298, + "step": 10152 + }, + { + "epoch": 0.4295202639817243, + "grad_norm": 0.170980766415596, + "learning_rate": 0.001, + "loss": 2.8904, + "step": 10153 + }, + { + "epoch": 0.42956256874524074, + "grad_norm": 0.15399718284606934, + "learning_rate": 0.001, + "loss": 2.2953, + "step": 10154 + }, + { + "epoch": 0.4296048735087571, + "grad_norm": 0.17953842878341675, + "learning_rate": 0.001, + "loss": 1.9147, + "step": 10155 + }, + { + "epoch": 0.42964717827227344, + "grad_norm": 0.1702369898557663, + "learning_rate": 0.001, + "loss": 2.2872, + "step": 10156 + }, + { + "epoch": 0.42968948303578985, + "grad_norm": 0.19081392884254456, + "learning_rate": 0.001, + "loss": 2.6647, + "step": 10157 + }, + { + "epoch": 0.4297317877993062, + "grad_norm": 0.1783342808485031, + "learning_rate": 0.001, + "loss": 1.9582, + "step": 10158 + }, + { + "epoch": 0.42977409256282256, + "grad_norm": 0.20043319463729858, + "learning_rate": 0.001, + "loss": 2.3066, + "step": 10159 + }, + { + "epoch": 0.429816397326339, + "grad_norm": 0.17219308018684387, + "learning_rate": 0.001, + "loss": 2.5435, + "step": 10160 + }, + { + "epoch": 0.4298587020898553, + "grad_norm": 0.29747068881988525, + "learning_rate": 0.001, + "loss": 2.9951, + "step": 10161 + }, + { + "epoch": 0.4299010068533717, + "grad_norm": 4.550970077514648, + "learning_rate": 0.001, + "loss": 2.5154, + "step": 10162 + }, + { + "epoch": 0.42994331161688804, + "grad_norm": 0.18527233600616455, + "learning_rate": 0.001, + "loss": 2.0328, + "step": 10163 + }, + { + "epoch": 0.42998561638040445, + "grad_norm": 0.18644128739833832, + "learning_rate": 0.001, + "loss": 2.3224, + "step": 10164 + }, + { + "epoch": 0.4300279211439208, + "grad_norm": 0.2518557906150818, + "learning_rate": 0.001, + "loss": 2.1388, + "step": 10165 + }, + { + "epoch": 0.43007022590743715, + "grad_norm": 0.1625964343547821, + "learning_rate": 0.001, + "loss": 1.8495, + "step": 10166 + }, + { + "epoch": 0.43011253067095356, + "grad_norm": 0.2331293672323227, + "learning_rate": 0.001, + "loss": 2.216, + "step": 10167 + }, + { + "epoch": 0.4301548354344699, + "grad_norm": 0.2668088376522064, + "learning_rate": 0.001, + "loss": 2.3291, + "step": 10168 + }, + { + "epoch": 0.43019714019798627, + "grad_norm": 0.17521047592163086, + "learning_rate": 0.001, + "loss": 1.9921, + "step": 10169 + }, + { + "epoch": 0.4302394449615027, + "grad_norm": 0.19229163229465485, + "learning_rate": 0.001, + "loss": 2.0076, + "step": 10170 + }, + { + "epoch": 0.43028174972501904, + "grad_norm": 0.2229296714067459, + "learning_rate": 0.001, + "loss": 2.5076, + "step": 10171 + }, + { + "epoch": 0.4303240544885354, + "grad_norm": 0.47033241391181946, + "learning_rate": 0.001, + "loss": 2.2927, + "step": 10172 + }, + { + "epoch": 0.4303663592520518, + "grad_norm": 0.1660117357969284, + "learning_rate": 0.001, + "loss": 2.2447, + "step": 10173 + }, + { + "epoch": 0.43040866401556815, + "grad_norm": 2.12497615814209, + "learning_rate": 0.001, + "loss": 2.0389, + "step": 10174 + }, + { + "epoch": 0.4304509687790845, + "grad_norm": 0.18803949654102325, + "learning_rate": 0.001, + "loss": 3.1254, + "step": 10175 + }, + { + "epoch": 0.4304932735426009, + "grad_norm": 0.2218303233385086, + "learning_rate": 0.001, + "loss": 1.7379, + "step": 10176 + }, + { + "epoch": 0.4305355783061173, + "grad_norm": 0.14620518684387207, + "learning_rate": 0.001, + "loss": 1.6227, + "step": 10177 + }, + { + "epoch": 0.4305778830696336, + "grad_norm": 0.17080405354499817, + "learning_rate": 0.001, + "loss": 1.9532, + "step": 10178 + }, + { + "epoch": 0.43062018783315004, + "grad_norm": 2.5071756839752197, + "learning_rate": 0.001, + "loss": 3.2385, + "step": 10179 + }, + { + "epoch": 0.4306624925966664, + "grad_norm": 0.17002911865711212, + "learning_rate": 0.001, + "loss": 3.0736, + "step": 10180 + }, + { + "epoch": 0.43070479736018275, + "grad_norm": 0.1898619830608368, + "learning_rate": 0.001, + "loss": 1.6932, + "step": 10181 + }, + { + "epoch": 0.43074710212369915, + "grad_norm": 0.6082344651222229, + "learning_rate": 0.001, + "loss": 3.6207, + "step": 10182 + }, + { + "epoch": 0.4307894068872155, + "grad_norm": 0.6188779473304749, + "learning_rate": 0.001, + "loss": 1.6398, + "step": 10183 + }, + { + "epoch": 0.43083171165073186, + "grad_norm": 0.7821673154830933, + "learning_rate": 0.001, + "loss": 2.1569, + "step": 10184 + }, + { + "epoch": 0.4308740164142482, + "grad_norm": 0.543019711971283, + "learning_rate": 0.001, + "loss": 1.6907, + "step": 10185 + }, + { + "epoch": 0.4309163211777646, + "grad_norm": 0.5596624612808228, + "learning_rate": 0.001, + "loss": 3.4859, + "step": 10186 + }, + { + "epoch": 0.430958625941281, + "grad_norm": 0.22357861697673798, + "learning_rate": 0.001, + "loss": 1.896, + "step": 10187 + }, + { + "epoch": 0.43100093070479734, + "grad_norm": 0.16170749068260193, + "learning_rate": 0.001, + "loss": 1.8071, + "step": 10188 + }, + { + "epoch": 0.43104323546831375, + "grad_norm": 0.15653282403945923, + "learning_rate": 0.001, + "loss": 1.6408, + "step": 10189 + }, + { + "epoch": 0.4310855402318301, + "grad_norm": 0.22891934216022491, + "learning_rate": 0.001, + "loss": 1.2615, + "step": 10190 + }, + { + "epoch": 0.43112784499534645, + "grad_norm": 0.9978185892105103, + "learning_rate": 0.001, + "loss": 2.0704, + "step": 10191 + }, + { + "epoch": 0.43117014975886286, + "grad_norm": 0.1806502640247345, + "learning_rate": 0.001, + "loss": 2.1097, + "step": 10192 + }, + { + "epoch": 0.4312124545223792, + "grad_norm": 0.16584782302379608, + "learning_rate": 0.001, + "loss": 2.528, + "step": 10193 + }, + { + "epoch": 0.4312547592858956, + "grad_norm": 0.39718106389045715, + "learning_rate": 0.001, + "loss": 2.2111, + "step": 10194 + }, + { + "epoch": 0.431297064049412, + "grad_norm": 0.37005481123924255, + "learning_rate": 0.001, + "loss": 2.8491, + "step": 10195 + }, + { + "epoch": 0.43133936881292834, + "grad_norm": 0.18187114596366882, + "learning_rate": 0.001, + "loss": 2.0542, + "step": 10196 + }, + { + "epoch": 0.4313816735764447, + "grad_norm": 0.21417534351348877, + "learning_rate": 0.001, + "loss": 1.842, + "step": 10197 + }, + { + "epoch": 0.4314239783399611, + "grad_norm": 0.2430461347103119, + "learning_rate": 0.001, + "loss": 3.2218, + "step": 10198 + }, + { + "epoch": 0.43146628310347745, + "grad_norm": 0.1812175065279007, + "learning_rate": 0.001, + "loss": 2.5477, + "step": 10199 + }, + { + "epoch": 0.4315085878669938, + "grad_norm": 0.5557557940483093, + "learning_rate": 0.001, + "loss": 2.0329, + "step": 10200 + }, + { + "epoch": 0.4315508926305102, + "grad_norm": 0.16666504740715027, + "learning_rate": 0.001, + "loss": 1.8955, + "step": 10201 + }, + { + "epoch": 0.4315931973940266, + "grad_norm": 8.631728172302246, + "learning_rate": 0.001, + "loss": 2.2758, + "step": 10202 + }, + { + "epoch": 0.4316355021575429, + "grad_norm": 0.28974688053131104, + "learning_rate": 0.001, + "loss": 1.8011, + "step": 10203 + }, + { + "epoch": 0.43167780692105934, + "grad_norm": 0.17004406452178955, + "learning_rate": 0.001, + "loss": 1.7401, + "step": 10204 + }, + { + "epoch": 0.4317201116845757, + "grad_norm": 0.3253629803657532, + "learning_rate": 0.001, + "loss": 2.3806, + "step": 10205 + }, + { + "epoch": 0.43176241644809205, + "grad_norm": 0.18079756200313568, + "learning_rate": 0.001, + "loss": 1.6866, + "step": 10206 + }, + { + "epoch": 0.4318047212116084, + "grad_norm": 0.16561074554920197, + "learning_rate": 0.001, + "loss": 2.1134, + "step": 10207 + }, + { + "epoch": 0.4318470259751248, + "grad_norm": 0.16736382246017456, + "learning_rate": 0.001, + "loss": 2.5646, + "step": 10208 + }, + { + "epoch": 0.43188933073864116, + "grad_norm": 0.16732652485370636, + "learning_rate": 0.001, + "loss": 1.7875, + "step": 10209 + }, + { + "epoch": 0.4319316355021575, + "grad_norm": 1.6806914806365967, + "learning_rate": 0.001, + "loss": 1.8155, + "step": 10210 + }, + { + "epoch": 0.43197394026567393, + "grad_norm": 0.902766764163971, + "learning_rate": 0.001, + "loss": 2.1906, + "step": 10211 + }, + { + "epoch": 0.4320162450291903, + "grad_norm": 0.22574925422668457, + "learning_rate": 0.001, + "loss": 2.4012, + "step": 10212 + }, + { + "epoch": 0.43205854979270664, + "grad_norm": 0.20272192358970642, + "learning_rate": 0.001, + "loss": 2.7397, + "step": 10213 + }, + { + "epoch": 0.43210085455622305, + "grad_norm": 0.18217982351779938, + "learning_rate": 0.001, + "loss": 1.8363, + "step": 10214 + }, + { + "epoch": 0.4321431593197394, + "grad_norm": 0.32555845379829407, + "learning_rate": 0.001, + "loss": 3.1817, + "step": 10215 + }, + { + "epoch": 0.43218546408325575, + "grad_norm": 4.589909076690674, + "learning_rate": 0.001, + "loss": 2.9862, + "step": 10216 + }, + { + "epoch": 0.43222776884677216, + "grad_norm": 0.24728332459926605, + "learning_rate": 0.001, + "loss": 1.834, + "step": 10217 + }, + { + "epoch": 0.4322700736102885, + "grad_norm": 0.331990510225296, + "learning_rate": 0.001, + "loss": 3.6178, + "step": 10218 + }, + { + "epoch": 0.4323123783738049, + "grad_norm": 0.31489211320877075, + "learning_rate": 0.001, + "loss": 1.8217, + "step": 10219 + }, + { + "epoch": 0.4323546831373213, + "grad_norm": 1.1140625476837158, + "learning_rate": 0.001, + "loss": 2.5692, + "step": 10220 + }, + { + "epoch": 0.43239698790083764, + "grad_norm": 0.1725742071866989, + "learning_rate": 0.001, + "loss": 1.7017, + "step": 10221 + }, + { + "epoch": 0.432439292664354, + "grad_norm": 0.19503003358840942, + "learning_rate": 0.001, + "loss": 2.5324, + "step": 10222 + }, + { + "epoch": 0.4324815974278704, + "grad_norm": 0.26874589920043945, + "learning_rate": 0.001, + "loss": 3.1633, + "step": 10223 + }, + { + "epoch": 0.43252390219138676, + "grad_norm": 0.24442531168460846, + "learning_rate": 0.001, + "loss": 2.4106, + "step": 10224 + }, + { + "epoch": 0.4325662069549031, + "grad_norm": 0.32704654335975647, + "learning_rate": 0.001, + "loss": 3.401, + "step": 10225 + }, + { + "epoch": 0.4326085117184195, + "grad_norm": 0.2951880693435669, + "learning_rate": 0.001, + "loss": 2.9172, + "step": 10226 + }, + { + "epoch": 0.4326508164819359, + "grad_norm": 0.16954240202903748, + "learning_rate": 0.001, + "loss": 1.8404, + "step": 10227 + }, + { + "epoch": 0.43269312124545223, + "grad_norm": 0.1641477644443512, + "learning_rate": 0.001, + "loss": 3.1311, + "step": 10228 + }, + { + "epoch": 0.4327354260089686, + "grad_norm": 0.7109168171882629, + "learning_rate": 0.001, + "loss": 1.9146, + "step": 10229 + }, + { + "epoch": 0.432777730772485, + "grad_norm": 0.21353894472122192, + "learning_rate": 0.001, + "loss": 2.6354, + "step": 10230 + }, + { + "epoch": 0.43282003553600135, + "grad_norm": 0.32388636469841003, + "learning_rate": 0.001, + "loss": 1.8109, + "step": 10231 + }, + { + "epoch": 0.4328623402995177, + "grad_norm": 0.46089649200439453, + "learning_rate": 0.001, + "loss": 3.1302, + "step": 10232 + }, + { + "epoch": 0.4329046450630341, + "grad_norm": 0.28092247247695923, + "learning_rate": 0.001, + "loss": 2.9537, + "step": 10233 + }, + { + "epoch": 0.43294694982655046, + "grad_norm": 0.1795322597026825, + "learning_rate": 0.001, + "loss": 2.0469, + "step": 10234 + }, + { + "epoch": 0.4329892545900668, + "grad_norm": 1.1064038276672363, + "learning_rate": 0.001, + "loss": 2.0708, + "step": 10235 + }, + { + "epoch": 0.43303155935358323, + "grad_norm": 0.1590038537979126, + "learning_rate": 0.001, + "loss": 3.0817, + "step": 10236 + }, + { + "epoch": 0.4330738641170996, + "grad_norm": 0.1741926223039627, + "learning_rate": 0.001, + "loss": 2.1068, + "step": 10237 + }, + { + "epoch": 0.43311616888061594, + "grad_norm": 0.15864573419094086, + "learning_rate": 0.001, + "loss": 2.225, + "step": 10238 + }, + { + "epoch": 0.43315847364413235, + "grad_norm": 0.1976540982723236, + "learning_rate": 0.001, + "loss": 2.0396, + "step": 10239 + }, + { + "epoch": 0.4332007784076487, + "grad_norm": 0.1970503330230713, + "learning_rate": 0.001, + "loss": 2.3251, + "step": 10240 + }, + { + "epoch": 0.43324308317116506, + "grad_norm": 0.19585905969142914, + "learning_rate": 0.001, + "loss": 2.9537, + "step": 10241 + }, + { + "epoch": 0.43328538793468147, + "grad_norm": 0.5313712358474731, + "learning_rate": 0.001, + "loss": 2.066, + "step": 10242 + }, + { + "epoch": 0.4333276926981978, + "grad_norm": 0.3845883905887604, + "learning_rate": 0.001, + "loss": 3.0735, + "step": 10243 + }, + { + "epoch": 0.4333699974617142, + "grad_norm": 4.409493446350098, + "learning_rate": 0.001, + "loss": 2.7471, + "step": 10244 + }, + { + "epoch": 0.4334123022252306, + "grad_norm": 0.15902769565582275, + "learning_rate": 0.001, + "loss": 2.0389, + "step": 10245 + }, + { + "epoch": 0.43345460698874694, + "grad_norm": 0.19496986269950867, + "learning_rate": 0.001, + "loss": 2.2368, + "step": 10246 + }, + { + "epoch": 0.4334969117522633, + "grad_norm": 3.496062755584717, + "learning_rate": 0.001, + "loss": 2.2234, + "step": 10247 + }, + { + "epoch": 0.4335392165157797, + "grad_norm": 0.28134244680404663, + "learning_rate": 0.001, + "loss": 1.4976, + "step": 10248 + }, + { + "epoch": 0.43358152127929606, + "grad_norm": 0.2013697773218155, + "learning_rate": 0.001, + "loss": 1.9873, + "step": 10249 + }, + { + "epoch": 0.4336238260428124, + "grad_norm": 0.42853641510009766, + "learning_rate": 0.001, + "loss": 3.4437, + "step": 10250 + }, + { + "epoch": 0.4336661308063288, + "grad_norm": 2.750063896179199, + "learning_rate": 0.001, + "loss": 2.0177, + "step": 10251 + }, + { + "epoch": 0.4337084355698452, + "grad_norm": 0.21362249553203583, + "learning_rate": 0.001, + "loss": 2.6798, + "step": 10252 + }, + { + "epoch": 0.43375074033336153, + "grad_norm": 0.2875850200653076, + "learning_rate": 0.001, + "loss": 2.2333, + "step": 10253 + }, + { + "epoch": 0.4337930450968779, + "grad_norm": 0.2688775360584259, + "learning_rate": 0.001, + "loss": 2.2072, + "step": 10254 + }, + { + "epoch": 0.4338353498603943, + "grad_norm": 0.3297450840473175, + "learning_rate": 0.001, + "loss": 2.5938, + "step": 10255 + }, + { + "epoch": 0.43387765462391065, + "grad_norm": 0.4499950706958771, + "learning_rate": 0.001, + "loss": 2.221, + "step": 10256 + }, + { + "epoch": 0.433919959387427, + "grad_norm": 0.19243115186691284, + "learning_rate": 0.001, + "loss": 3.2843, + "step": 10257 + }, + { + "epoch": 0.4339622641509434, + "grad_norm": 0.18401968479156494, + "learning_rate": 0.001, + "loss": 1.8309, + "step": 10258 + }, + { + "epoch": 0.43400456891445977, + "grad_norm": 0.18404164910316467, + "learning_rate": 0.001, + "loss": 2.295, + "step": 10259 + }, + { + "epoch": 0.4340468736779761, + "grad_norm": 0.1946217566728592, + "learning_rate": 0.001, + "loss": 2.0939, + "step": 10260 + }, + { + "epoch": 0.43408917844149253, + "grad_norm": 0.1470000147819519, + "learning_rate": 0.001, + "loss": 1.7336, + "step": 10261 + }, + { + "epoch": 0.4341314832050089, + "grad_norm": 0.32551971077919006, + "learning_rate": 0.001, + "loss": 2.4746, + "step": 10262 + }, + { + "epoch": 0.43417378796852524, + "grad_norm": 2.652297019958496, + "learning_rate": 0.001, + "loss": 3.1451, + "step": 10263 + }, + { + "epoch": 0.43421609273204165, + "grad_norm": 0.1553354561328888, + "learning_rate": 0.001, + "loss": 1.976, + "step": 10264 + }, + { + "epoch": 0.434258397495558, + "grad_norm": 0.23101647198200226, + "learning_rate": 0.001, + "loss": 2.6715, + "step": 10265 + }, + { + "epoch": 0.43430070225907436, + "grad_norm": 0.20663489401340485, + "learning_rate": 0.001, + "loss": 1.561, + "step": 10266 + }, + { + "epoch": 0.43434300702259077, + "grad_norm": 0.19089651107788086, + "learning_rate": 0.001, + "loss": 2.8731, + "step": 10267 + }, + { + "epoch": 0.4343853117861071, + "grad_norm": 0.14528456330299377, + "learning_rate": 0.001, + "loss": 2.2012, + "step": 10268 + }, + { + "epoch": 0.4344276165496235, + "grad_norm": 0.1953110247850418, + "learning_rate": 0.001, + "loss": 2.5863, + "step": 10269 + }, + { + "epoch": 0.4344699213131399, + "grad_norm": 0.19322249293327332, + "learning_rate": 0.001, + "loss": 2.5648, + "step": 10270 + }, + { + "epoch": 0.43451222607665624, + "grad_norm": 0.15629512071609497, + "learning_rate": 0.001, + "loss": 2.6854, + "step": 10271 + }, + { + "epoch": 0.4345545308401726, + "grad_norm": 0.17981792986392975, + "learning_rate": 0.001, + "loss": 2.2489, + "step": 10272 + }, + { + "epoch": 0.434596835603689, + "grad_norm": 0.18551938235759735, + "learning_rate": 0.001, + "loss": 2.6762, + "step": 10273 + }, + { + "epoch": 0.43463914036720536, + "grad_norm": 2.313119649887085, + "learning_rate": 0.001, + "loss": 3.5626, + "step": 10274 + }, + { + "epoch": 0.4346814451307217, + "grad_norm": 0.34999650716781616, + "learning_rate": 0.001, + "loss": 2.6788, + "step": 10275 + }, + { + "epoch": 0.43472374989423807, + "grad_norm": 0.36393770575523376, + "learning_rate": 0.001, + "loss": 2.2926, + "step": 10276 + }, + { + "epoch": 0.4347660546577545, + "grad_norm": 0.4789801239967346, + "learning_rate": 0.001, + "loss": 1.7946, + "step": 10277 + }, + { + "epoch": 0.43480835942127083, + "grad_norm": 2.8181819915771484, + "learning_rate": 0.001, + "loss": 2.2287, + "step": 10278 + }, + { + "epoch": 0.4348506641847872, + "grad_norm": 0.5375362038612366, + "learning_rate": 0.001, + "loss": 2.2571, + "step": 10279 + }, + { + "epoch": 0.4348929689483036, + "grad_norm": 0.1631775200366974, + "learning_rate": 0.001, + "loss": 2.6874, + "step": 10280 + }, + { + "epoch": 0.43493527371181995, + "grad_norm": 0.1752665489912033, + "learning_rate": 0.001, + "loss": 2.6751, + "step": 10281 + }, + { + "epoch": 0.4349775784753363, + "grad_norm": 0.28829485177993774, + "learning_rate": 0.001, + "loss": 1.4542, + "step": 10282 + }, + { + "epoch": 0.4350198832388527, + "grad_norm": 0.6851754188537598, + "learning_rate": 0.001, + "loss": 2.094, + "step": 10283 + }, + { + "epoch": 0.43506218800236907, + "grad_norm": 0.1630009263753891, + "learning_rate": 0.001, + "loss": 2.3703, + "step": 10284 + }, + { + "epoch": 0.4351044927658854, + "grad_norm": 0.7183140516281128, + "learning_rate": 0.001, + "loss": 3.3134, + "step": 10285 + }, + { + "epoch": 0.43514679752940183, + "grad_norm": 0.17298482358455658, + "learning_rate": 0.001, + "loss": 2.7842, + "step": 10286 + }, + { + "epoch": 0.4351891022929182, + "grad_norm": 0.16329266130924225, + "learning_rate": 0.001, + "loss": 1.5573, + "step": 10287 + }, + { + "epoch": 0.43523140705643454, + "grad_norm": 0.15913952887058258, + "learning_rate": 0.001, + "loss": 2.8684, + "step": 10288 + }, + { + "epoch": 0.43527371181995095, + "grad_norm": 0.16433869302272797, + "learning_rate": 0.001, + "loss": 1.6984, + "step": 10289 + }, + { + "epoch": 0.4353160165834673, + "grad_norm": 0.1634620726108551, + "learning_rate": 0.001, + "loss": 2.2076, + "step": 10290 + }, + { + "epoch": 0.43535832134698366, + "grad_norm": 0.2353067547082901, + "learning_rate": 0.001, + "loss": 2.8474, + "step": 10291 + }, + { + "epoch": 0.43540062611050007, + "grad_norm": 0.1973012238740921, + "learning_rate": 0.001, + "loss": 2.5835, + "step": 10292 + }, + { + "epoch": 0.4354429308740164, + "grad_norm": 0.16465625166893005, + "learning_rate": 0.001, + "loss": 1.8952, + "step": 10293 + }, + { + "epoch": 0.4354852356375328, + "grad_norm": 1.9926971197128296, + "learning_rate": 0.001, + "loss": 1.7796, + "step": 10294 + }, + { + "epoch": 0.4355275404010492, + "grad_norm": 1.1302298307418823, + "learning_rate": 0.001, + "loss": 1.9359, + "step": 10295 + }, + { + "epoch": 0.43556984516456554, + "grad_norm": 0.2590234577655792, + "learning_rate": 0.001, + "loss": 1.8019, + "step": 10296 + }, + { + "epoch": 0.4356121499280819, + "grad_norm": 0.14725469052791595, + "learning_rate": 0.001, + "loss": 2.6869, + "step": 10297 + }, + { + "epoch": 0.43565445469159825, + "grad_norm": 3.516937255859375, + "learning_rate": 0.001, + "loss": 1.8395, + "step": 10298 + }, + { + "epoch": 0.43569675945511466, + "grad_norm": 0.1837252527475357, + "learning_rate": 0.001, + "loss": 2.3293, + "step": 10299 + }, + { + "epoch": 0.435739064218631, + "grad_norm": 0.15408559143543243, + "learning_rate": 0.001, + "loss": 2.7421, + "step": 10300 + }, + { + "epoch": 0.43578136898214737, + "grad_norm": 2.384270191192627, + "learning_rate": 0.001, + "loss": 2.6062, + "step": 10301 + }, + { + "epoch": 0.4358236737456638, + "grad_norm": 0.22565071284770966, + "learning_rate": 0.001, + "loss": 2.2583, + "step": 10302 + }, + { + "epoch": 0.43586597850918013, + "grad_norm": 0.19346702098846436, + "learning_rate": 0.001, + "loss": 2.36, + "step": 10303 + }, + { + "epoch": 0.4359082832726965, + "grad_norm": 0.15725278854370117, + "learning_rate": 0.001, + "loss": 1.6664, + "step": 10304 + }, + { + "epoch": 0.4359505880362129, + "grad_norm": 0.19711074233055115, + "learning_rate": 0.001, + "loss": 1.736, + "step": 10305 + }, + { + "epoch": 0.43599289279972925, + "grad_norm": 0.2101636379957199, + "learning_rate": 0.001, + "loss": 2.4285, + "step": 10306 + }, + { + "epoch": 0.4360351975632456, + "grad_norm": 0.1750393956899643, + "learning_rate": 0.001, + "loss": 1.6158, + "step": 10307 + }, + { + "epoch": 0.436077502326762, + "grad_norm": 0.1906735897064209, + "learning_rate": 0.001, + "loss": 2.4902, + "step": 10308 + }, + { + "epoch": 0.43611980709027837, + "grad_norm": 1.6329114437103271, + "learning_rate": 0.001, + "loss": 2.1277, + "step": 10309 + }, + { + "epoch": 0.4361621118537947, + "grad_norm": 0.9235885739326477, + "learning_rate": 0.001, + "loss": 1.4367, + "step": 10310 + }, + { + "epoch": 0.43620441661731113, + "grad_norm": 0.14719220995903015, + "learning_rate": 0.001, + "loss": 2.4318, + "step": 10311 + }, + { + "epoch": 0.4362467213808275, + "grad_norm": 0.2758025825023651, + "learning_rate": 0.001, + "loss": 1.9603, + "step": 10312 + }, + { + "epoch": 0.43628902614434384, + "grad_norm": 0.25476765632629395, + "learning_rate": 0.001, + "loss": 4.3574, + "step": 10313 + }, + { + "epoch": 0.43633133090786025, + "grad_norm": 0.180466890335083, + "learning_rate": 0.001, + "loss": 2.5561, + "step": 10314 + }, + { + "epoch": 0.4363736356713766, + "grad_norm": 0.19408752024173737, + "learning_rate": 0.001, + "loss": 2.5026, + "step": 10315 + }, + { + "epoch": 0.43641594043489296, + "grad_norm": 0.25457096099853516, + "learning_rate": 0.001, + "loss": 2.2554, + "step": 10316 + }, + { + "epoch": 0.43645824519840937, + "grad_norm": 0.3525673747062683, + "learning_rate": 0.001, + "loss": 2.2683, + "step": 10317 + }, + { + "epoch": 0.4365005499619257, + "grad_norm": 0.1872408390045166, + "learning_rate": 0.001, + "loss": 2.0554, + "step": 10318 + }, + { + "epoch": 0.4365428547254421, + "grad_norm": 0.31182733178138733, + "learning_rate": 0.001, + "loss": 2.6635, + "step": 10319 + }, + { + "epoch": 0.43658515948895843, + "grad_norm": 0.18361353874206543, + "learning_rate": 0.001, + "loss": 3.2744, + "step": 10320 + }, + { + "epoch": 0.43662746425247484, + "grad_norm": 0.20506177842617035, + "learning_rate": 0.001, + "loss": 2.086, + "step": 10321 + }, + { + "epoch": 0.4366697690159912, + "grad_norm": 0.1834511160850525, + "learning_rate": 0.001, + "loss": 2.1743, + "step": 10322 + }, + { + "epoch": 0.43671207377950755, + "grad_norm": 1.0289106369018555, + "learning_rate": 0.001, + "loss": 2.5491, + "step": 10323 + }, + { + "epoch": 0.43675437854302396, + "grad_norm": 1.9735060930252075, + "learning_rate": 0.001, + "loss": 2.4487, + "step": 10324 + }, + { + "epoch": 0.4367966833065403, + "grad_norm": 0.17474709451198578, + "learning_rate": 0.001, + "loss": 1.7286, + "step": 10325 + }, + { + "epoch": 0.43683898807005667, + "grad_norm": 0.17368879914283752, + "learning_rate": 0.001, + "loss": 2.0555, + "step": 10326 + }, + { + "epoch": 0.4368812928335731, + "grad_norm": 0.2503129839897156, + "learning_rate": 0.001, + "loss": 2.6008, + "step": 10327 + }, + { + "epoch": 0.43692359759708943, + "grad_norm": 0.16766273975372314, + "learning_rate": 0.001, + "loss": 2.033, + "step": 10328 + }, + { + "epoch": 0.4369659023606058, + "grad_norm": 0.19881922006607056, + "learning_rate": 0.001, + "loss": 1.9835, + "step": 10329 + }, + { + "epoch": 0.4370082071241222, + "grad_norm": 2.48319149017334, + "learning_rate": 0.001, + "loss": 2.4763, + "step": 10330 + }, + { + "epoch": 0.43705051188763855, + "grad_norm": 0.19117386639118195, + "learning_rate": 0.001, + "loss": 1.7097, + "step": 10331 + }, + { + "epoch": 0.4370928166511549, + "grad_norm": 0.2317335158586502, + "learning_rate": 0.001, + "loss": 3.0248, + "step": 10332 + }, + { + "epoch": 0.4371351214146713, + "grad_norm": 3.407411813735962, + "learning_rate": 0.001, + "loss": 2.5108, + "step": 10333 + }, + { + "epoch": 0.43717742617818767, + "grad_norm": 0.32746651768684387, + "learning_rate": 0.001, + "loss": 2.6375, + "step": 10334 + }, + { + "epoch": 0.437219730941704, + "grad_norm": 0.4482516348361969, + "learning_rate": 0.001, + "loss": 1.3975, + "step": 10335 + }, + { + "epoch": 0.43726203570522043, + "grad_norm": 0.20397043228149414, + "learning_rate": 0.001, + "loss": 1.8973, + "step": 10336 + }, + { + "epoch": 0.4373043404687368, + "grad_norm": 0.20856736600399017, + "learning_rate": 0.001, + "loss": 2.7255, + "step": 10337 + }, + { + "epoch": 0.43734664523225314, + "grad_norm": 0.4428465664386749, + "learning_rate": 0.001, + "loss": 2.4357, + "step": 10338 + }, + { + "epoch": 0.43738894999576955, + "grad_norm": 0.2584536075592041, + "learning_rate": 0.001, + "loss": 3.332, + "step": 10339 + }, + { + "epoch": 0.4374312547592859, + "grad_norm": 0.2311519831418991, + "learning_rate": 0.001, + "loss": 3.5032, + "step": 10340 + }, + { + "epoch": 0.43747355952280226, + "grad_norm": 0.25852352380752563, + "learning_rate": 0.001, + "loss": 2.6795, + "step": 10341 + }, + { + "epoch": 0.4375158642863186, + "grad_norm": 0.1977977156639099, + "learning_rate": 0.001, + "loss": 2.0136, + "step": 10342 + }, + { + "epoch": 0.437558169049835, + "grad_norm": 0.20671911537647247, + "learning_rate": 0.001, + "loss": 2.0685, + "step": 10343 + }, + { + "epoch": 0.4376004738133514, + "grad_norm": 0.22240790724754333, + "learning_rate": 0.001, + "loss": 2.3391, + "step": 10344 + }, + { + "epoch": 0.43764277857686773, + "grad_norm": 0.9896284937858582, + "learning_rate": 0.001, + "loss": 2.4624, + "step": 10345 + }, + { + "epoch": 0.43768508334038414, + "grad_norm": 0.18324188888072968, + "learning_rate": 0.001, + "loss": 2.2438, + "step": 10346 + }, + { + "epoch": 0.4377273881039005, + "grad_norm": 0.20104867219924927, + "learning_rate": 0.001, + "loss": 2.8267, + "step": 10347 + }, + { + "epoch": 0.43776969286741685, + "grad_norm": 0.26405319571495056, + "learning_rate": 0.001, + "loss": 2.1973, + "step": 10348 + }, + { + "epoch": 0.43781199763093326, + "grad_norm": 0.5061947107315063, + "learning_rate": 0.001, + "loss": 2.5415, + "step": 10349 + }, + { + "epoch": 0.4378543023944496, + "grad_norm": 0.46758270263671875, + "learning_rate": 0.001, + "loss": 3.0251, + "step": 10350 + }, + { + "epoch": 0.43789660715796597, + "grad_norm": 1.8289541006088257, + "learning_rate": 0.001, + "loss": 2.4671, + "step": 10351 + }, + { + "epoch": 0.4379389119214824, + "grad_norm": 2.7915992736816406, + "learning_rate": 0.001, + "loss": 2.0793, + "step": 10352 + }, + { + "epoch": 0.43798121668499873, + "grad_norm": 0.1846030205488205, + "learning_rate": 0.001, + "loss": 1.7337, + "step": 10353 + }, + { + "epoch": 0.4380235214485151, + "grad_norm": 0.8970298171043396, + "learning_rate": 0.001, + "loss": 2.6869, + "step": 10354 + }, + { + "epoch": 0.4380658262120315, + "grad_norm": 0.6228232979774475, + "learning_rate": 0.001, + "loss": 2.245, + "step": 10355 + }, + { + "epoch": 0.43810813097554785, + "grad_norm": 0.18022607266902924, + "learning_rate": 0.001, + "loss": 2.4163, + "step": 10356 + }, + { + "epoch": 0.4381504357390642, + "grad_norm": 0.3440026640892029, + "learning_rate": 0.001, + "loss": 2.0781, + "step": 10357 + }, + { + "epoch": 0.4381927405025806, + "grad_norm": 0.26871904730796814, + "learning_rate": 0.001, + "loss": 3.507, + "step": 10358 + }, + { + "epoch": 0.43823504526609697, + "grad_norm": 0.1928730607032776, + "learning_rate": 0.001, + "loss": 1.6934, + "step": 10359 + }, + { + "epoch": 0.4382773500296133, + "grad_norm": 0.23119626939296722, + "learning_rate": 0.001, + "loss": 1.9447, + "step": 10360 + }, + { + "epoch": 0.43831965479312973, + "grad_norm": 0.20869280397891998, + "learning_rate": 0.001, + "loss": 2.3586, + "step": 10361 + }, + { + "epoch": 0.4383619595566461, + "grad_norm": 0.26448631286621094, + "learning_rate": 0.001, + "loss": 2.1414, + "step": 10362 + }, + { + "epoch": 0.43840426432016244, + "grad_norm": 0.2913392186164856, + "learning_rate": 0.001, + "loss": 2.2654, + "step": 10363 + }, + { + "epoch": 0.43844656908367885, + "grad_norm": 6.033486843109131, + "learning_rate": 0.001, + "loss": 2.6725, + "step": 10364 + }, + { + "epoch": 0.4384888738471952, + "grad_norm": 0.8384542465209961, + "learning_rate": 0.001, + "loss": 1.926, + "step": 10365 + }, + { + "epoch": 0.43853117861071156, + "grad_norm": 0.20631474256515503, + "learning_rate": 0.001, + "loss": 1.8508, + "step": 10366 + }, + { + "epoch": 0.4385734833742279, + "grad_norm": 0.3741599917411804, + "learning_rate": 0.001, + "loss": 2.8569, + "step": 10367 + }, + { + "epoch": 0.4386157881377443, + "grad_norm": 0.19838982820510864, + "learning_rate": 0.001, + "loss": 2.0593, + "step": 10368 + }, + { + "epoch": 0.4386580929012607, + "grad_norm": 0.16171355545520782, + "learning_rate": 0.001, + "loss": 1.7758, + "step": 10369 + }, + { + "epoch": 0.43870039766477703, + "grad_norm": 2.109577178955078, + "learning_rate": 0.001, + "loss": 2.4595, + "step": 10370 + }, + { + "epoch": 0.43874270242829344, + "grad_norm": 0.23121996223926544, + "learning_rate": 0.001, + "loss": 2.1405, + "step": 10371 + }, + { + "epoch": 0.4387850071918098, + "grad_norm": 0.19020533561706543, + "learning_rate": 0.001, + "loss": 2.0821, + "step": 10372 + }, + { + "epoch": 0.43882731195532615, + "grad_norm": 0.3466572165489197, + "learning_rate": 0.001, + "loss": 2.352, + "step": 10373 + }, + { + "epoch": 0.43886961671884256, + "grad_norm": 0.22979247570037842, + "learning_rate": 0.001, + "loss": 1.8425, + "step": 10374 + }, + { + "epoch": 0.4389119214823589, + "grad_norm": 0.2191963642835617, + "learning_rate": 0.001, + "loss": 1.9217, + "step": 10375 + }, + { + "epoch": 0.43895422624587527, + "grad_norm": 0.29709628224372864, + "learning_rate": 0.001, + "loss": 2.0016, + "step": 10376 + }, + { + "epoch": 0.4389965310093917, + "grad_norm": 1.1428967714309692, + "learning_rate": 0.001, + "loss": 2.3209, + "step": 10377 + }, + { + "epoch": 0.43903883577290803, + "grad_norm": 0.21593493223190308, + "learning_rate": 0.001, + "loss": 2.2734, + "step": 10378 + }, + { + "epoch": 0.4390811405364244, + "grad_norm": 0.2102469801902771, + "learning_rate": 0.001, + "loss": 2.5221, + "step": 10379 + }, + { + "epoch": 0.4391234452999408, + "grad_norm": 0.19701038300991058, + "learning_rate": 0.001, + "loss": 2.7192, + "step": 10380 + }, + { + "epoch": 0.43916575006345715, + "grad_norm": 0.6864662170410156, + "learning_rate": 0.001, + "loss": 2.4796, + "step": 10381 + }, + { + "epoch": 0.4392080548269735, + "grad_norm": 0.5476522445678711, + "learning_rate": 0.001, + "loss": 1.725, + "step": 10382 + }, + { + "epoch": 0.4392503595904899, + "grad_norm": 0.19268248975276947, + "learning_rate": 0.001, + "loss": 1.8153, + "step": 10383 + }, + { + "epoch": 0.43929266435400627, + "grad_norm": 0.40893223881721497, + "learning_rate": 0.001, + "loss": 2.7319, + "step": 10384 + }, + { + "epoch": 0.4393349691175226, + "grad_norm": 2.664677143096924, + "learning_rate": 0.001, + "loss": 2.4025, + "step": 10385 + }, + { + "epoch": 0.43937727388103903, + "grad_norm": 0.18634748458862305, + "learning_rate": 0.001, + "loss": 1.9047, + "step": 10386 + }, + { + "epoch": 0.4394195786445554, + "grad_norm": 0.1849360466003418, + "learning_rate": 0.001, + "loss": 2.2008, + "step": 10387 + }, + { + "epoch": 0.43946188340807174, + "grad_norm": 0.15529468655586243, + "learning_rate": 0.001, + "loss": 2.3172, + "step": 10388 + }, + { + "epoch": 0.4395041881715881, + "grad_norm": 0.15574440360069275, + "learning_rate": 0.001, + "loss": 1.7872, + "step": 10389 + }, + { + "epoch": 0.4395464929351045, + "grad_norm": 0.24909093976020813, + "learning_rate": 0.001, + "loss": 2.3089, + "step": 10390 + }, + { + "epoch": 0.43958879769862086, + "grad_norm": 0.27436402440071106, + "learning_rate": 0.001, + "loss": 2.7429, + "step": 10391 + }, + { + "epoch": 0.4396311024621372, + "grad_norm": 0.6394477486610413, + "learning_rate": 0.001, + "loss": 3.1377, + "step": 10392 + }, + { + "epoch": 0.4396734072256536, + "grad_norm": 0.1768225133419037, + "learning_rate": 0.001, + "loss": 1.7332, + "step": 10393 + }, + { + "epoch": 0.43971571198917, + "grad_norm": 0.18364182114601135, + "learning_rate": 0.001, + "loss": 3.3755, + "step": 10394 + }, + { + "epoch": 0.43975801675268633, + "grad_norm": 0.1930709332227707, + "learning_rate": 0.001, + "loss": 2.2478, + "step": 10395 + }, + { + "epoch": 0.43980032151620274, + "grad_norm": 4.500080585479736, + "learning_rate": 0.001, + "loss": 1.6433, + "step": 10396 + }, + { + "epoch": 0.4398426262797191, + "grad_norm": 0.20036698877811432, + "learning_rate": 0.001, + "loss": 2.2089, + "step": 10397 + }, + { + "epoch": 0.43988493104323545, + "grad_norm": 0.19765986502170563, + "learning_rate": 0.001, + "loss": 2.012, + "step": 10398 + }, + { + "epoch": 0.43992723580675186, + "grad_norm": 0.1463608741760254, + "learning_rate": 0.001, + "loss": 1.4149, + "step": 10399 + }, + { + "epoch": 0.4399695405702682, + "grad_norm": 6.610461235046387, + "learning_rate": 0.001, + "loss": 2.9959, + "step": 10400 + }, + { + "epoch": 0.44001184533378457, + "grad_norm": 0.23446570336818695, + "learning_rate": 0.001, + "loss": 2.249, + "step": 10401 + }, + { + "epoch": 0.440054150097301, + "grad_norm": 1.4213615655899048, + "learning_rate": 0.001, + "loss": 2.6993, + "step": 10402 + }, + { + "epoch": 0.44009645486081733, + "grad_norm": 0.6602863073348999, + "learning_rate": 0.001, + "loss": 1.8553, + "step": 10403 + }, + { + "epoch": 0.4401387596243337, + "grad_norm": 0.2995677590370178, + "learning_rate": 0.001, + "loss": 3.3502, + "step": 10404 + }, + { + "epoch": 0.4401810643878501, + "grad_norm": 0.1976313292980194, + "learning_rate": 0.001, + "loss": 2.0083, + "step": 10405 + }, + { + "epoch": 0.44022336915136645, + "grad_norm": 0.2297874391078949, + "learning_rate": 0.001, + "loss": 3.5225, + "step": 10406 + }, + { + "epoch": 0.4402656739148828, + "grad_norm": 0.24849973618984222, + "learning_rate": 0.001, + "loss": 2.3344, + "step": 10407 + }, + { + "epoch": 0.4403079786783992, + "grad_norm": 0.15684276819229126, + "learning_rate": 0.001, + "loss": 3.1674, + "step": 10408 + }, + { + "epoch": 0.44035028344191557, + "grad_norm": 0.24899160861968994, + "learning_rate": 0.001, + "loss": 1.7431, + "step": 10409 + }, + { + "epoch": 0.4403925882054319, + "grad_norm": 0.16840027272701263, + "learning_rate": 0.001, + "loss": 2.1913, + "step": 10410 + }, + { + "epoch": 0.4404348929689483, + "grad_norm": 0.18736068904399872, + "learning_rate": 0.001, + "loss": 2.1678, + "step": 10411 + }, + { + "epoch": 0.4404771977324647, + "grad_norm": 0.19807054102420807, + "learning_rate": 0.001, + "loss": 2.3152, + "step": 10412 + }, + { + "epoch": 0.44051950249598104, + "grad_norm": 0.7695265412330627, + "learning_rate": 0.001, + "loss": 1.9602, + "step": 10413 + }, + { + "epoch": 0.4405618072594974, + "grad_norm": 0.19554787874221802, + "learning_rate": 0.001, + "loss": 1.9298, + "step": 10414 + }, + { + "epoch": 0.4406041120230138, + "grad_norm": 0.22768321633338928, + "learning_rate": 0.001, + "loss": 1.8096, + "step": 10415 + }, + { + "epoch": 0.44064641678653016, + "grad_norm": 0.16684657335281372, + "learning_rate": 0.001, + "loss": 2.6149, + "step": 10416 + }, + { + "epoch": 0.4406887215500465, + "grad_norm": 6.711855888366699, + "learning_rate": 0.001, + "loss": 2.506, + "step": 10417 + }, + { + "epoch": 0.4407310263135629, + "grad_norm": 0.199007049202919, + "learning_rate": 0.001, + "loss": 2.6235, + "step": 10418 + }, + { + "epoch": 0.4407733310770793, + "grad_norm": 4.696216106414795, + "learning_rate": 0.001, + "loss": 2.0736, + "step": 10419 + }, + { + "epoch": 0.44081563584059563, + "grad_norm": 0.18525268137454987, + "learning_rate": 0.001, + "loss": 1.6978, + "step": 10420 + }, + { + "epoch": 0.44085794060411204, + "grad_norm": 0.22784662246704102, + "learning_rate": 0.001, + "loss": 2.0242, + "step": 10421 + }, + { + "epoch": 0.4409002453676284, + "grad_norm": 0.49683111906051636, + "learning_rate": 0.001, + "loss": 3.07, + "step": 10422 + }, + { + "epoch": 0.44094255013114475, + "grad_norm": 0.29818785190582275, + "learning_rate": 0.001, + "loss": 2.0421, + "step": 10423 + }, + { + "epoch": 0.44098485489466116, + "grad_norm": 0.30528533458709717, + "learning_rate": 0.001, + "loss": 2.3055, + "step": 10424 + }, + { + "epoch": 0.4410271596581775, + "grad_norm": 0.17285211384296417, + "learning_rate": 0.001, + "loss": 2.8575, + "step": 10425 + }, + { + "epoch": 0.44106946442169387, + "grad_norm": 0.2074311375617981, + "learning_rate": 0.001, + "loss": 2.9005, + "step": 10426 + }, + { + "epoch": 0.4411117691852103, + "grad_norm": 13.308959007263184, + "learning_rate": 0.001, + "loss": 2.1316, + "step": 10427 + }, + { + "epoch": 0.44115407394872663, + "grad_norm": 8.290491104125977, + "learning_rate": 0.001, + "loss": 3.0784, + "step": 10428 + }, + { + "epoch": 0.441196378712243, + "grad_norm": 6.2142252922058105, + "learning_rate": 0.001, + "loss": 2.8017, + "step": 10429 + }, + { + "epoch": 0.4412386834757594, + "grad_norm": 0.41313034296035767, + "learning_rate": 0.001, + "loss": 2.1858, + "step": 10430 + }, + { + "epoch": 0.44128098823927575, + "grad_norm": 13.561888694763184, + "learning_rate": 0.001, + "loss": 2.6311, + "step": 10431 + }, + { + "epoch": 0.4413232930027921, + "grad_norm": 0.19502195715904236, + "learning_rate": 0.001, + "loss": 2.5145, + "step": 10432 + }, + { + "epoch": 0.44136559776630846, + "grad_norm": 0.2449101358652115, + "learning_rate": 0.001, + "loss": 2.9577, + "step": 10433 + }, + { + "epoch": 0.44140790252982487, + "grad_norm": 0.4394688606262207, + "learning_rate": 0.001, + "loss": 2.156, + "step": 10434 + }, + { + "epoch": 0.4414502072933412, + "grad_norm": 0.9801822900772095, + "learning_rate": 0.001, + "loss": 2.5387, + "step": 10435 + }, + { + "epoch": 0.4414925120568576, + "grad_norm": 0.2750179171562195, + "learning_rate": 0.001, + "loss": 2.1277, + "step": 10436 + }, + { + "epoch": 0.441534816820374, + "grad_norm": 0.17588399350643158, + "learning_rate": 0.001, + "loss": 3.0843, + "step": 10437 + }, + { + "epoch": 0.44157712158389034, + "grad_norm": 0.18611468374729156, + "learning_rate": 0.001, + "loss": 2.0977, + "step": 10438 + }, + { + "epoch": 0.4416194263474067, + "grad_norm": 0.4915761649608612, + "learning_rate": 0.001, + "loss": 2.4029, + "step": 10439 + }, + { + "epoch": 0.4416617311109231, + "grad_norm": 0.2895006835460663, + "learning_rate": 0.001, + "loss": 2.6397, + "step": 10440 + }, + { + "epoch": 0.44170403587443946, + "grad_norm": 0.19709734618663788, + "learning_rate": 0.001, + "loss": 2.1713, + "step": 10441 + }, + { + "epoch": 0.4417463406379558, + "grad_norm": 0.7510543465614319, + "learning_rate": 0.001, + "loss": 2.5878, + "step": 10442 + }, + { + "epoch": 0.4417886454014722, + "grad_norm": 0.18231795728206635, + "learning_rate": 0.001, + "loss": 1.8023, + "step": 10443 + }, + { + "epoch": 0.4418309501649886, + "grad_norm": 0.18479089438915253, + "learning_rate": 0.001, + "loss": 2.1124, + "step": 10444 + }, + { + "epoch": 0.44187325492850493, + "grad_norm": 0.35781130194664, + "learning_rate": 0.001, + "loss": 2.4331, + "step": 10445 + }, + { + "epoch": 0.44191555969202134, + "grad_norm": 0.23527033627033234, + "learning_rate": 0.001, + "loss": 1.5157, + "step": 10446 + }, + { + "epoch": 0.4419578644555377, + "grad_norm": 0.21979889273643494, + "learning_rate": 0.001, + "loss": 2.0873, + "step": 10447 + }, + { + "epoch": 0.44200016921905405, + "grad_norm": 0.2087160348892212, + "learning_rate": 0.001, + "loss": 2.237, + "step": 10448 + }, + { + "epoch": 0.44204247398257046, + "grad_norm": 0.30770057439804077, + "learning_rate": 0.001, + "loss": 2.6998, + "step": 10449 + }, + { + "epoch": 0.4420847787460868, + "grad_norm": 0.27149367332458496, + "learning_rate": 0.001, + "loss": 2.438, + "step": 10450 + }, + { + "epoch": 0.44212708350960317, + "grad_norm": 0.18169645965099335, + "learning_rate": 0.001, + "loss": 1.4572, + "step": 10451 + }, + { + "epoch": 0.4421693882731196, + "grad_norm": 0.1930762231349945, + "learning_rate": 0.001, + "loss": 1.9931, + "step": 10452 + }, + { + "epoch": 0.44221169303663593, + "grad_norm": 1.637730598449707, + "learning_rate": 0.001, + "loss": 3.6527, + "step": 10453 + }, + { + "epoch": 0.4422539978001523, + "grad_norm": 0.19526685774326324, + "learning_rate": 0.001, + "loss": 2.715, + "step": 10454 + }, + { + "epoch": 0.44229630256366864, + "grad_norm": 0.25344404578208923, + "learning_rate": 0.001, + "loss": 2.2599, + "step": 10455 + }, + { + "epoch": 0.44233860732718505, + "grad_norm": 0.4373885989189148, + "learning_rate": 0.001, + "loss": 2.1229, + "step": 10456 + }, + { + "epoch": 0.4423809120907014, + "grad_norm": 0.49542349576950073, + "learning_rate": 0.001, + "loss": 2.0016, + "step": 10457 + }, + { + "epoch": 0.44242321685421776, + "grad_norm": 0.21734249591827393, + "learning_rate": 0.001, + "loss": 2.5501, + "step": 10458 + }, + { + "epoch": 0.44246552161773417, + "grad_norm": 0.37125974893569946, + "learning_rate": 0.001, + "loss": 2.2032, + "step": 10459 + }, + { + "epoch": 0.4425078263812505, + "grad_norm": 1.202412724494934, + "learning_rate": 0.001, + "loss": 2.6156, + "step": 10460 + }, + { + "epoch": 0.4425501311447669, + "grad_norm": 0.2056283950805664, + "learning_rate": 0.001, + "loss": 2.3052, + "step": 10461 + }, + { + "epoch": 0.4425924359082833, + "grad_norm": 0.24419522285461426, + "learning_rate": 0.001, + "loss": 1.9584, + "step": 10462 + }, + { + "epoch": 0.44263474067179964, + "grad_norm": 3.809471845626831, + "learning_rate": 0.001, + "loss": 2.0806, + "step": 10463 + }, + { + "epoch": 0.442677045435316, + "grad_norm": 0.1795853078365326, + "learning_rate": 0.001, + "loss": 1.8472, + "step": 10464 + }, + { + "epoch": 0.4427193501988324, + "grad_norm": 0.21403010189533234, + "learning_rate": 0.001, + "loss": 2.5621, + "step": 10465 + }, + { + "epoch": 0.44276165496234876, + "grad_norm": 0.23772773146629333, + "learning_rate": 0.001, + "loss": 4.0021, + "step": 10466 + }, + { + "epoch": 0.4428039597258651, + "grad_norm": 0.1654462069272995, + "learning_rate": 0.001, + "loss": 2.3355, + "step": 10467 + }, + { + "epoch": 0.4428462644893815, + "grad_norm": 0.16145402193069458, + "learning_rate": 0.001, + "loss": 3.1889, + "step": 10468 + }, + { + "epoch": 0.4428885692528979, + "grad_norm": 0.27223458886146545, + "learning_rate": 0.001, + "loss": 1.7634, + "step": 10469 + }, + { + "epoch": 0.44293087401641423, + "grad_norm": 0.1729728877544403, + "learning_rate": 0.001, + "loss": 1.2275, + "step": 10470 + }, + { + "epoch": 0.44297317877993064, + "grad_norm": 0.17519982159137726, + "learning_rate": 0.001, + "loss": 2.0601, + "step": 10471 + }, + { + "epoch": 0.443015483543447, + "grad_norm": 0.2915792167186737, + "learning_rate": 0.001, + "loss": 3.0095, + "step": 10472 + }, + { + "epoch": 0.44305778830696335, + "grad_norm": 0.212400883436203, + "learning_rate": 0.001, + "loss": 2.7761, + "step": 10473 + }, + { + "epoch": 0.44310009307047976, + "grad_norm": 1.2644349336624146, + "learning_rate": 0.001, + "loss": 2.631, + "step": 10474 + }, + { + "epoch": 0.4431423978339961, + "grad_norm": 0.2543438673019409, + "learning_rate": 0.001, + "loss": 2.9616, + "step": 10475 + }, + { + "epoch": 0.44318470259751247, + "grad_norm": 0.5095729827880859, + "learning_rate": 0.001, + "loss": 2.9212, + "step": 10476 + }, + { + "epoch": 0.4432270073610288, + "grad_norm": 0.22304564714431763, + "learning_rate": 0.001, + "loss": 2.3646, + "step": 10477 + }, + { + "epoch": 0.44326931212454523, + "grad_norm": 0.2589484751224518, + "learning_rate": 0.001, + "loss": 1.8629, + "step": 10478 + }, + { + "epoch": 0.4433116168880616, + "grad_norm": 0.171627938747406, + "learning_rate": 0.001, + "loss": 2.1287, + "step": 10479 + }, + { + "epoch": 0.44335392165157794, + "grad_norm": 0.18898135423660278, + "learning_rate": 0.001, + "loss": 1.9837, + "step": 10480 + }, + { + "epoch": 0.44339622641509435, + "grad_norm": 0.17940890789031982, + "learning_rate": 0.001, + "loss": 1.8571, + "step": 10481 + }, + { + "epoch": 0.4434385311786107, + "grad_norm": 0.3940030634403229, + "learning_rate": 0.001, + "loss": 2.5347, + "step": 10482 + }, + { + "epoch": 0.44348083594212706, + "grad_norm": 0.16100460290908813, + "learning_rate": 0.001, + "loss": 3.2192, + "step": 10483 + }, + { + "epoch": 0.44352314070564347, + "grad_norm": 0.3118045926094055, + "learning_rate": 0.001, + "loss": 2.7277, + "step": 10484 + }, + { + "epoch": 0.4435654454691598, + "grad_norm": 0.38664117455482483, + "learning_rate": 0.001, + "loss": 3.5834, + "step": 10485 + }, + { + "epoch": 0.4436077502326762, + "grad_norm": 0.15630090236663818, + "learning_rate": 0.001, + "loss": 1.5872, + "step": 10486 + }, + { + "epoch": 0.4436500549961926, + "grad_norm": 0.1633961796760559, + "learning_rate": 0.001, + "loss": 2.2324, + "step": 10487 + }, + { + "epoch": 0.44369235975970894, + "grad_norm": 0.2306484431028366, + "learning_rate": 0.001, + "loss": 2.1569, + "step": 10488 + }, + { + "epoch": 0.4437346645232253, + "grad_norm": 0.3159201741218567, + "learning_rate": 0.001, + "loss": 2.2158, + "step": 10489 + }, + { + "epoch": 0.4437769692867417, + "grad_norm": 0.1387176662683487, + "learning_rate": 0.001, + "loss": 1.596, + "step": 10490 + }, + { + "epoch": 0.44381927405025806, + "grad_norm": 0.17673452198505402, + "learning_rate": 0.001, + "loss": 2.1328, + "step": 10491 + }, + { + "epoch": 0.4438615788137744, + "grad_norm": 0.22988873720169067, + "learning_rate": 0.001, + "loss": 1.8124, + "step": 10492 + }, + { + "epoch": 0.4439038835772908, + "grad_norm": 0.1935664266347885, + "learning_rate": 0.001, + "loss": 1.9753, + "step": 10493 + }, + { + "epoch": 0.4439461883408072, + "grad_norm": 0.17069995403289795, + "learning_rate": 0.001, + "loss": 2.0083, + "step": 10494 + }, + { + "epoch": 0.44398849310432353, + "grad_norm": 1.7595449686050415, + "learning_rate": 0.001, + "loss": 2.3677, + "step": 10495 + }, + { + "epoch": 0.44403079786783994, + "grad_norm": 0.2332594096660614, + "learning_rate": 0.001, + "loss": 1.9425, + "step": 10496 + }, + { + "epoch": 0.4440731026313563, + "grad_norm": 3.590059757232666, + "learning_rate": 0.001, + "loss": 2.2229, + "step": 10497 + }, + { + "epoch": 0.44411540739487265, + "grad_norm": 0.1739518791437149, + "learning_rate": 0.001, + "loss": 1.8198, + "step": 10498 + }, + { + "epoch": 0.44415771215838906, + "grad_norm": 0.18792513012886047, + "learning_rate": 0.001, + "loss": 1.8256, + "step": 10499 + }, + { + "epoch": 0.4442000169219054, + "grad_norm": 0.16739851236343384, + "learning_rate": 0.001, + "loss": 1.96, + "step": 10500 + }, + { + "epoch": 0.44424232168542177, + "grad_norm": 0.17911168932914734, + "learning_rate": 0.001, + "loss": 2.0014, + "step": 10501 + }, + { + "epoch": 0.4442846264489381, + "grad_norm": 0.2152254283428192, + "learning_rate": 0.001, + "loss": 2.1837, + "step": 10502 + }, + { + "epoch": 0.44432693121245453, + "grad_norm": 0.16652332246303558, + "learning_rate": 0.001, + "loss": 1.8485, + "step": 10503 + }, + { + "epoch": 0.4443692359759709, + "grad_norm": 0.1486539989709854, + "learning_rate": 0.001, + "loss": 1.8012, + "step": 10504 + }, + { + "epoch": 0.44441154073948724, + "grad_norm": 0.4087856113910675, + "learning_rate": 0.001, + "loss": 1.6707, + "step": 10505 + }, + { + "epoch": 0.44445384550300365, + "grad_norm": 0.20888468623161316, + "learning_rate": 0.001, + "loss": 2.1961, + "step": 10506 + }, + { + "epoch": 0.44449615026652, + "grad_norm": 0.5997106432914734, + "learning_rate": 0.001, + "loss": 2.2202, + "step": 10507 + }, + { + "epoch": 0.44453845503003636, + "grad_norm": 3.278139591217041, + "learning_rate": 0.001, + "loss": 2.1766, + "step": 10508 + }, + { + "epoch": 0.44458075979355277, + "grad_norm": 0.16290752589702606, + "learning_rate": 0.001, + "loss": 3.9578, + "step": 10509 + }, + { + "epoch": 0.4446230645570691, + "grad_norm": 0.8776594996452332, + "learning_rate": 0.001, + "loss": 1.7571, + "step": 10510 + }, + { + "epoch": 0.4446653693205855, + "grad_norm": 0.7716861367225647, + "learning_rate": 0.001, + "loss": 3.2192, + "step": 10511 + }, + { + "epoch": 0.4447076740841019, + "grad_norm": 1.821780800819397, + "learning_rate": 0.001, + "loss": 2.1203, + "step": 10512 + }, + { + "epoch": 0.44474997884761824, + "grad_norm": 4.94421911239624, + "learning_rate": 0.001, + "loss": 2.127, + "step": 10513 + }, + { + "epoch": 0.4447922836111346, + "grad_norm": 0.17742355167865753, + "learning_rate": 0.001, + "loss": 1.7438, + "step": 10514 + }, + { + "epoch": 0.444834588374651, + "grad_norm": 0.18554328382015228, + "learning_rate": 0.001, + "loss": 2.8967, + "step": 10515 + }, + { + "epoch": 0.44487689313816736, + "grad_norm": 0.1628502756357193, + "learning_rate": 0.001, + "loss": 1.8562, + "step": 10516 + }, + { + "epoch": 0.4449191979016837, + "grad_norm": 0.7208130955696106, + "learning_rate": 0.001, + "loss": 2.7975, + "step": 10517 + }, + { + "epoch": 0.4449615026652001, + "grad_norm": 0.6698981523513794, + "learning_rate": 0.001, + "loss": 2.7178, + "step": 10518 + }, + { + "epoch": 0.4450038074287165, + "grad_norm": 0.20229113101959229, + "learning_rate": 0.001, + "loss": 2.9919, + "step": 10519 + }, + { + "epoch": 0.44504611219223283, + "grad_norm": 0.1968151330947876, + "learning_rate": 0.001, + "loss": 3.2183, + "step": 10520 + }, + { + "epoch": 0.44508841695574924, + "grad_norm": 0.225248783826828, + "learning_rate": 0.001, + "loss": 3.2217, + "step": 10521 + }, + { + "epoch": 0.4451307217192656, + "grad_norm": 0.1952434927225113, + "learning_rate": 0.001, + "loss": 2.5245, + "step": 10522 + }, + { + "epoch": 0.44517302648278195, + "grad_norm": 1.0436367988586426, + "learning_rate": 0.001, + "loss": 3.3446, + "step": 10523 + }, + { + "epoch": 0.4452153312462983, + "grad_norm": 0.18723340332508087, + "learning_rate": 0.001, + "loss": 2.3264, + "step": 10524 + }, + { + "epoch": 0.4452576360098147, + "grad_norm": 0.2530616521835327, + "learning_rate": 0.001, + "loss": 2.052, + "step": 10525 + }, + { + "epoch": 0.44529994077333107, + "grad_norm": 0.22954061627388, + "learning_rate": 0.001, + "loss": 2.7963, + "step": 10526 + }, + { + "epoch": 0.4453422455368474, + "grad_norm": 0.18304547667503357, + "learning_rate": 0.001, + "loss": 2.3264, + "step": 10527 + }, + { + "epoch": 0.44538455030036384, + "grad_norm": 0.39500945806503296, + "learning_rate": 0.001, + "loss": 1.9825, + "step": 10528 + }, + { + "epoch": 0.4454268550638802, + "grad_norm": 2.5766665935516357, + "learning_rate": 0.001, + "loss": 2.5105, + "step": 10529 + }, + { + "epoch": 0.44546915982739654, + "grad_norm": 0.23480746150016785, + "learning_rate": 0.001, + "loss": 1.8403, + "step": 10530 + }, + { + "epoch": 0.44551146459091295, + "grad_norm": 0.18920212984085083, + "learning_rate": 0.001, + "loss": 2.8969, + "step": 10531 + }, + { + "epoch": 0.4455537693544293, + "grad_norm": 0.1778506338596344, + "learning_rate": 0.001, + "loss": 2.9153, + "step": 10532 + }, + { + "epoch": 0.44559607411794566, + "grad_norm": 0.28486740589141846, + "learning_rate": 0.001, + "loss": 3.6157, + "step": 10533 + }, + { + "epoch": 0.44563837888146207, + "grad_norm": 0.19953373074531555, + "learning_rate": 0.001, + "loss": 2.1183, + "step": 10534 + }, + { + "epoch": 0.4456806836449784, + "grad_norm": 0.17005318403244019, + "learning_rate": 0.001, + "loss": 3.2538, + "step": 10535 + }, + { + "epoch": 0.4457229884084948, + "grad_norm": 0.2750973105430603, + "learning_rate": 0.001, + "loss": 2.6134, + "step": 10536 + }, + { + "epoch": 0.4457652931720112, + "grad_norm": 0.18177230656147003, + "learning_rate": 0.001, + "loss": 2.8142, + "step": 10537 + }, + { + "epoch": 0.44580759793552754, + "grad_norm": 0.17410710453987122, + "learning_rate": 0.001, + "loss": 2.2946, + "step": 10538 + }, + { + "epoch": 0.4458499026990439, + "grad_norm": 0.42573174834251404, + "learning_rate": 0.001, + "loss": 2.5024, + "step": 10539 + }, + { + "epoch": 0.4458922074625603, + "grad_norm": 0.19186319410800934, + "learning_rate": 0.001, + "loss": 1.8359, + "step": 10540 + }, + { + "epoch": 0.44593451222607666, + "grad_norm": 0.3824551999568939, + "learning_rate": 0.001, + "loss": 2.5268, + "step": 10541 + }, + { + "epoch": 0.445976816989593, + "grad_norm": 0.23018528521060944, + "learning_rate": 0.001, + "loss": 2.834, + "step": 10542 + }, + { + "epoch": 0.4460191217531094, + "grad_norm": 0.1816767454147339, + "learning_rate": 0.001, + "loss": 2.0896, + "step": 10543 + }, + { + "epoch": 0.4460614265166258, + "grad_norm": 0.17159247398376465, + "learning_rate": 0.001, + "loss": 1.84, + "step": 10544 + }, + { + "epoch": 0.44610373128014214, + "grad_norm": 1.8396812677383423, + "learning_rate": 0.001, + "loss": 2.0751, + "step": 10545 + }, + { + "epoch": 0.4461460360436585, + "grad_norm": 0.3504880666732788, + "learning_rate": 0.001, + "loss": 2.3788, + "step": 10546 + }, + { + "epoch": 0.4461883408071749, + "grad_norm": 0.18512053787708282, + "learning_rate": 0.001, + "loss": 1.6916, + "step": 10547 + }, + { + "epoch": 0.44623064557069125, + "grad_norm": 0.18409253656864166, + "learning_rate": 0.001, + "loss": 1.5346, + "step": 10548 + }, + { + "epoch": 0.4462729503342076, + "grad_norm": 2.5886762142181396, + "learning_rate": 0.001, + "loss": 1.8567, + "step": 10549 + }, + { + "epoch": 0.446315255097724, + "grad_norm": 0.6384299993515015, + "learning_rate": 0.001, + "loss": 2.2381, + "step": 10550 + }, + { + "epoch": 0.44635755986124037, + "grad_norm": 2.2755072116851807, + "learning_rate": 0.001, + "loss": 2.2896, + "step": 10551 + }, + { + "epoch": 0.4463998646247567, + "grad_norm": 0.4430360198020935, + "learning_rate": 0.001, + "loss": 3.7153, + "step": 10552 + }, + { + "epoch": 0.44644216938827314, + "grad_norm": 1.0187450647354126, + "learning_rate": 0.001, + "loss": 3.2753, + "step": 10553 + }, + { + "epoch": 0.4464844741517895, + "grad_norm": 0.2957771122455597, + "learning_rate": 0.001, + "loss": 1.7454, + "step": 10554 + }, + { + "epoch": 0.44652677891530584, + "grad_norm": 0.2080879807472229, + "learning_rate": 0.001, + "loss": 2.0611, + "step": 10555 + }, + { + "epoch": 0.44656908367882225, + "grad_norm": 1.7243143320083618, + "learning_rate": 0.001, + "loss": 2.981, + "step": 10556 + }, + { + "epoch": 0.4466113884423386, + "grad_norm": 0.2195415198802948, + "learning_rate": 0.001, + "loss": 1.9373, + "step": 10557 + }, + { + "epoch": 0.44665369320585496, + "grad_norm": 0.19098582863807678, + "learning_rate": 0.001, + "loss": 1.9921, + "step": 10558 + }, + { + "epoch": 0.4466959979693714, + "grad_norm": 0.2138836830854416, + "learning_rate": 0.001, + "loss": 2.2429, + "step": 10559 + }, + { + "epoch": 0.4467383027328877, + "grad_norm": 0.18263570964336395, + "learning_rate": 0.001, + "loss": 2.5905, + "step": 10560 + }, + { + "epoch": 0.4467806074964041, + "grad_norm": 0.20292994379997253, + "learning_rate": 0.001, + "loss": 2.3356, + "step": 10561 + }, + { + "epoch": 0.4468229122599205, + "grad_norm": 0.2716735601425171, + "learning_rate": 0.001, + "loss": 1.7254, + "step": 10562 + }, + { + "epoch": 0.44686521702343684, + "grad_norm": 0.16340197622776031, + "learning_rate": 0.001, + "loss": 1.7591, + "step": 10563 + }, + { + "epoch": 0.4469075217869532, + "grad_norm": 0.2119547724723816, + "learning_rate": 0.001, + "loss": 2.3141, + "step": 10564 + }, + { + "epoch": 0.4469498265504696, + "grad_norm": 0.18071070313453674, + "learning_rate": 0.001, + "loss": 1.8618, + "step": 10565 + }, + { + "epoch": 0.44699213131398596, + "grad_norm": 0.3588818311691284, + "learning_rate": 0.001, + "loss": 3.3872, + "step": 10566 + }, + { + "epoch": 0.4470344360775023, + "grad_norm": 0.19287428259849548, + "learning_rate": 0.001, + "loss": 2.3166, + "step": 10567 + }, + { + "epoch": 0.44707674084101867, + "grad_norm": 0.43625664710998535, + "learning_rate": 0.001, + "loss": 2.8193, + "step": 10568 + }, + { + "epoch": 0.4471190456045351, + "grad_norm": 0.4928484559059143, + "learning_rate": 0.001, + "loss": 1.6347, + "step": 10569 + }, + { + "epoch": 0.44716135036805144, + "grad_norm": 0.685997724533081, + "learning_rate": 0.001, + "loss": 2.317, + "step": 10570 + }, + { + "epoch": 0.4472036551315678, + "grad_norm": 0.15223908424377441, + "learning_rate": 0.001, + "loss": 2.1719, + "step": 10571 + }, + { + "epoch": 0.4472459598950842, + "grad_norm": 0.18542735278606415, + "learning_rate": 0.001, + "loss": 1.5935, + "step": 10572 + }, + { + "epoch": 0.44728826465860055, + "grad_norm": 0.18469227850437164, + "learning_rate": 0.001, + "loss": 2.4737, + "step": 10573 + }, + { + "epoch": 0.4473305694221169, + "grad_norm": 0.24252432584762573, + "learning_rate": 0.001, + "loss": 1.9852, + "step": 10574 + }, + { + "epoch": 0.4473728741856333, + "grad_norm": 0.16159304976463318, + "learning_rate": 0.001, + "loss": 2.6181, + "step": 10575 + }, + { + "epoch": 0.44741517894914967, + "grad_norm": 3.23504900932312, + "learning_rate": 0.001, + "loss": 3.0528, + "step": 10576 + }, + { + "epoch": 0.447457483712666, + "grad_norm": 2.2757198810577393, + "learning_rate": 0.001, + "loss": 3.2706, + "step": 10577 + }, + { + "epoch": 0.44749978847618244, + "grad_norm": 0.22294031083583832, + "learning_rate": 0.001, + "loss": 2.1312, + "step": 10578 + }, + { + "epoch": 0.4475420932396988, + "grad_norm": 0.19114287197589874, + "learning_rate": 0.001, + "loss": 2.8998, + "step": 10579 + }, + { + "epoch": 0.44758439800321514, + "grad_norm": 0.6231216192245483, + "learning_rate": 0.001, + "loss": 3.3509, + "step": 10580 + }, + { + "epoch": 0.44762670276673155, + "grad_norm": 0.18667848408222198, + "learning_rate": 0.001, + "loss": 2.477, + "step": 10581 + }, + { + "epoch": 0.4476690075302479, + "grad_norm": 0.5230020880699158, + "learning_rate": 0.001, + "loss": 3.9596, + "step": 10582 + }, + { + "epoch": 0.44771131229376426, + "grad_norm": 0.2077375054359436, + "learning_rate": 0.001, + "loss": 2.1221, + "step": 10583 + }, + { + "epoch": 0.4477536170572807, + "grad_norm": 0.21029135584831238, + "learning_rate": 0.001, + "loss": 2.7445, + "step": 10584 + }, + { + "epoch": 0.447795921820797, + "grad_norm": 0.30405527353286743, + "learning_rate": 0.001, + "loss": 3.0613, + "step": 10585 + }, + { + "epoch": 0.4478382265843134, + "grad_norm": 0.16450397670269012, + "learning_rate": 0.001, + "loss": 2.518, + "step": 10586 + }, + { + "epoch": 0.4478805313478298, + "grad_norm": 1.3755722045898438, + "learning_rate": 0.001, + "loss": 2.3927, + "step": 10587 + }, + { + "epoch": 0.44792283611134615, + "grad_norm": 0.18358470499515533, + "learning_rate": 0.001, + "loss": 2.7458, + "step": 10588 + }, + { + "epoch": 0.4479651408748625, + "grad_norm": 5.526552200317383, + "learning_rate": 0.001, + "loss": 1.9233, + "step": 10589 + }, + { + "epoch": 0.44800744563837885, + "grad_norm": 0.2043481022119522, + "learning_rate": 0.001, + "loss": 2.0861, + "step": 10590 + }, + { + "epoch": 0.44804975040189526, + "grad_norm": 0.16888974606990814, + "learning_rate": 0.001, + "loss": 3.3986, + "step": 10591 + }, + { + "epoch": 0.4480920551654116, + "grad_norm": 0.208000048995018, + "learning_rate": 0.001, + "loss": 2.8552, + "step": 10592 + }, + { + "epoch": 0.44813435992892797, + "grad_norm": 2.04957914352417, + "learning_rate": 0.001, + "loss": 3.1259, + "step": 10593 + }, + { + "epoch": 0.4481766646924444, + "grad_norm": 0.4959050118923187, + "learning_rate": 0.001, + "loss": 2.161, + "step": 10594 + }, + { + "epoch": 0.44821896945596074, + "grad_norm": 0.20981314778327942, + "learning_rate": 0.001, + "loss": 2.1842, + "step": 10595 + }, + { + "epoch": 0.4482612742194771, + "grad_norm": 0.29932335019111633, + "learning_rate": 0.001, + "loss": 3.8183, + "step": 10596 + }, + { + "epoch": 0.4483035789829935, + "grad_norm": 0.22173571586608887, + "learning_rate": 0.001, + "loss": 2.4446, + "step": 10597 + }, + { + "epoch": 0.44834588374650985, + "grad_norm": 0.19373901188373566, + "learning_rate": 0.001, + "loss": 2.1564, + "step": 10598 + }, + { + "epoch": 0.4483881885100262, + "grad_norm": 0.802295446395874, + "learning_rate": 0.001, + "loss": 1.8698, + "step": 10599 + }, + { + "epoch": 0.4484304932735426, + "grad_norm": 0.1932401806116104, + "learning_rate": 0.001, + "loss": 2.4997, + "step": 10600 + }, + { + "epoch": 0.448472798037059, + "grad_norm": 0.19700828194618225, + "learning_rate": 0.001, + "loss": 2.4923, + "step": 10601 + }, + { + "epoch": 0.4485151028005753, + "grad_norm": 0.338742196559906, + "learning_rate": 0.001, + "loss": 1.6468, + "step": 10602 + }, + { + "epoch": 0.44855740756409174, + "grad_norm": 0.31011953949928284, + "learning_rate": 0.001, + "loss": 1.9476, + "step": 10603 + }, + { + "epoch": 0.4485997123276081, + "grad_norm": 0.1957470327615738, + "learning_rate": 0.001, + "loss": 1.7986, + "step": 10604 + }, + { + "epoch": 0.44864201709112445, + "grad_norm": 0.18266044557094574, + "learning_rate": 0.001, + "loss": 1.9285, + "step": 10605 + }, + { + "epoch": 0.44868432185464086, + "grad_norm": 0.17342445254325867, + "learning_rate": 0.001, + "loss": 2.4775, + "step": 10606 + }, + { + "epoch": 0.4487266266181572, + "grad_norm": 0.17653240263462067, + "learning_rate": 0.001, + "loss": 2.5431, + "step": 10607 + }, + { + "epoch": 0.44876893138167356, + "grad_norm": 0.1600387543439865, + "learning_rate": 0.001, + "loss": 3.0684, + "step": 10608 + }, + { + "epoch": 0.44881123614519, + "grad_norm": 0.20908235013484955, + "learning_rate": 0.001, + "loss": 2.4038, + "step": 10609 + }, + { + "epoch": 0.4488535409087063, + "grad_norm": 0.21198759973049164, + "learning_rate": 0.001, + "loss": 2.5214, + "step": 10610 + }, + { + "epoch": 0.4488958456722227, + "grad_norm": 0.16591830551624298, + "learning_rate": 0.001, + "loss": 2.178, + "step": 10611 + }, + { + "epoch": 0.4489381504357391, + "grad_norm": 3.6654651165008545, + "learning_rate": 0.001, + "loss": 2.2057, + "step": 10612 + }, + { + "epoch": 0.44898045519925545, + "grad_norm": 0.17161937057971954, + "learning_rate": 0.001, + "loss": 2.2148, + "step": 10613 + }, + { + "epoch": 0.4490227599627718, + "grad_norm": 4.128261089324951, + "learning_rate": 0.001, + "loss": 2.2959, + "step": 10614 + }, + { + "epoch": 0.44906506472628815, + "grad_norm": 0.16964125633239746, + "learning_rate": 0.001, + "loss": 2.1389, + "step": 10615 + }, + { + "epoch": 0.44910736948980456, + "grad_norm": 1.7417608499526978, + "learning_rate": 0.001, + "loss": 1.705, + "step": 10616 + }, + { + "epoch": 0.4491496742533209, + "grad_norm": 0.9313798546791077, + "learning_rate": 0.001, + "loss": 2.6124, + "step": 10617 + }, + { + "epoch": 0.4491919790168373, + "grad_norm": 0.7544777393341064, + "learning_rate": 0.001, + "loss": 2.773, + "step": 10618 + }, + { + "epoch": 0.4492342837803537, + "grad_norm": 0.26840439438819885, + "learning_rate": 0.001, + "loss": 2.0986, + "step": 10619 + }, + { + "epoch": 0.44927658854387004, + "grad_norm": 0.18352295458316803, + "learning_rate": 0.001, + "loss": 2.9253, + "step": 10620 + }, + { + "epoch": 0.4493188933073864, + "grad_norm": 0.25015711784362793, + "learning_rate": 0.001, + "loss": 2.9165, + "step": 10621 + }, + { + "epoch": 0.4493611980709028, + "grad_norm": 0.15691161155700684, + "learning_rate": 0.001, + "loss": 1.6252, + "step": 10622 + }, + { + "epoch": 0.44940350283441916, + "grad_norm": 0.2128295749425888, + "learning_rate": 0.001, + "loss": 2.6712, + "step": 10623 + }, + { + "epoch": 0.4494458075979355, + "grad_norm": 2.622805118560791, + "learning_rate": 0.001, + "loss": 1.9613, + "step": 10624 + }, + { + "epoch": 0.4494881123614519, + "grad_norm": 1.922769546508789, + "learning_rate": 0.001, + "loss": 2.0337, + "step": 10625 + }, + { + "epoch": 0.4495304171249683, + "grad_norm": 0.1844593733549118, + "learning_rate": 0.001, + "loss": 2.4377, + "step": 10626 + }, + { + "epoch": 0.4495727218884846, + "grad_norm": 0.36522769927978516, + "learning_rate": 0.001, + "loss": 2.3181, + "step": 10627 + }, + { + "epoch": 0.44961502665200104, + "grad_norm": 0.796236515045166, + "learning_rate": 0.001, + "loss": 2.1124, + "step": 10628 + }, + { + "epoch": 0.4496573314155174, + "grad_norm": 25.66647720336914, + "learning_rate": 0.001, + "loss": 1.9776, + "step": 10629 + }, + { + "epoch": 0.44969963617903375, + "grad_norm": 0.1876109093427658, + "learning_rate": 0.001, + "loss": 3.1408, + "step": 10630 + }, + { + "epoch": 0.44974194094255016, + "grad_norm": 0.2271474301815033, + "learning_rate": 0.001, + "loss": 2.7732, + "step": 10631 + }, + { + "epoch": 0.4497842457060665, + "grad_norm": 0.14895124733448029, + "learning_rate": 0.001, + "loss": 1.4203, + "step": 10632 + }, + { + "epoch": 0.44982655046958286, + "grad_norm": 0.5558005571365356, + "learning_rate": 0.001, + "loss": 2.295, + "step": 10633 + }, + { + "epoch": 0.4498688552330993, + "grad_norm": 0.2572641968727112, + "learning_rate": 0.001, + "loss": 2.5265, + "step": 10634 + }, + { + "epoch": 0.44991115999661563, + "grad_norm": 0.22297467291355133, + "learning_rate": 0.001, + "loss": 2.1774, + "step": 10635 + }, + { + "epoch": 0.449953464760132, + "grad_norm": 1.5451843738555908, + "learning_rate": 0.001, + "loss": 2.925, + "step": 10636 + }, + { + "epoch": 0.44999576952364834, + "grad_norm": 0.1914081871509552, + "learning_rate": 0.001, + "loss": 2.0183, + "step": 10637 + }, + { + "epoch": 0.45003807428716475, + "grad_norm": 0.192905992269516, + "learning_rate": 0.001, + "loss": 1.8431, + "step": 10638 + }, + { + "epoch": 0.4500803790506811, + "grad_norm": 0.1977374255657196, + "learning_rate": 0.001, + "loss": 2.5427, + "step": 10639 + }, + { + "epoch": 0.45012268381419746, + "grad_norm": 0.193924680352211, + "learning_rate": 0.001, + "loss": 2.4878, + "step": 10640 + }, + { + "epoch": 0.45016498857771386, + "grad_norm": 0.19579516351222992, + "learning_rate": 0.001, + "loss": 2.6807, + "step": 10641 + }, + { + "epoch": 0.4502072933412302, + "grad_norm": 0.17491202056407928, + "learning_rate": 0.001, + "loss": 1.4516, + "step": 10642 + }, + { + "epoch": 0.4502495981047466, + "grad_norm": 0.20388638973236084, + "learning_rate": 0.001, + "loss": 1.9766, + "step": 10643 + }, + { + "epoch": 0.450291902868263, + "grad_norm": 6.089061737060547, + "learning_rate": 0.001, + "loss": 1.7728, + "step": 10644 + }, + { + "epoch": 0.45033420763177934, + "grad_norm": 0.19175101816654205, + "learning_rate": 0.001, + "loss": 2.4468, + "step": 10645 + }, + { + "epoch": 0.4503765123952957, + "grad_norm": 0.18164491653442383, + "learning_rate": 0.001, + "loss": 1.6122, + "step": 10646 + }, + { + "epoch": 0.4504188171588121, + "grad_norm": 0.28620779514312744, + "learning_rate": 0.001, + "loss": 2.1983, + "step": 10647 + }, + { + "epoch": 0.45046112192232846, + "grad_norm": 0.17143702507019043, + "learning_rate": 0.001, + "loss": 3.1166, + "step": 10648 + }, + { + "epoch": 0.4505034266858448, + "grad_norm": 0.187702938914299, + "learning_rate": 0.001, + "loss": 2.7673, + "step": 10649 + }, + { + "epoch": 0.4505457314493612, + "grad_norm": 0.18621903657913208, + "learning_rate": 0.001, + "loss": 2.0963, + "step": 10650 + }, + { + "epoch": 0.4505880362128776, + "grad_norm": 0.18782265484333038, + "learning_rate": 0.001, + "loss": 2.6116, + "step": 10651 + }, + { + "epoch": 0.45063034097639393, + "grad_norm": 0.18947267532348633, + "learning_rate": 0.001, + "loss": 2.3372, + "step": 10652 + }, + { + "epoch": 0.45067264573991034, + "grad_norm": 0.6221209764480591, + "learning_rate": 0.001, + "loss": 1.4182, + "step": 10653 + }, + { + "epoch": 0.4507149505034267, + "grad_norm": 0.18391531705856323, + "learning_rate": 0.001, + "loss": 2.4517, + "step": 10654 + }, + { + "epoch": 0.45075725526694305, + "grad_norm": 0.31003570556640625, + "learning_rate": 0.001, + "loss": 2.6419, + "step": 10655 + }, + { + "epoch": 0.45079956003045946, + "grad_norm": 3.465576410293579, + "learning_rate": 0.001, + "loss": 2.7737, + "step": 10656 + }, + { + "epoch": 0.4508418647939758, + "grad_norm": 0.17568989098072052, + "learning_rate": 0.001, + "loss": 1.5883, + "step": 10657 + }, + { + "epoch": 0.45088416955749216, + "grad_norm": 0.18298137187957764, + "learning_rate": 0.001, + "loss": 2.9265, + "step": 10658 + }, + { + "epoch": 0.4509264743210085, + "grad_norm": 13.20496654510498, + "learning_rate": 0.001, + "loss": 2.632, + "step": 10659 + }, + { + "epoch": 0.45096877908452493, + "grad_norm": 0.2141726016998291, + "learning_rate": 0.001, + "loss": 1.5008, + "step": 10660 + }, + { + "epoch": 0.4510110838480413, + "grad_norm": 0.22003300487995148, + "learning_rate": 0.001, + "loss": 2.2004, + "step": 10661 + }, + { + "epoch": 0.45105338861155764, + "grad_norm": 0.4876539409160614, + "learning_rate": 0.001, + "loss": 2.3984, + "step": 10662 + }, + { + "epoch": 0.45109569337507405, + "grad_norm": 0.2729082703590393, + "learning_rate": 0.001, + "loss": 1.7232, + "step": 10663 + }, + { + "epoch": 0.4511379981385904, + "grad_norm": 0.21046875417232513, + "learning_rate": 0.001, + "loss": 1.6638, + "step": 10664 + }, + { + "epoch": 0.45118030290210676, + "grad_norm": 6.246835708618164, + "learning_rate": 0.001, + "loss": 2.2441, + "step": 10665 + }, + { + "epoch": 0.45122260766562317, + "grad_norm": 0.24696436524391174, + "learning_rate": 0.001, + "loss": 2.4485, + "step": 10666 + }, + { + "epoch": 0.4512649124291395, + "grad_norm": 0.20760883390903473, + "learning_rate": 0.001, + "loss": 1.8772, + "step": 10667 + }, + { + "epoch": 0.4513072171926559, + "grad_norm": 0.2544911801815033, + "learning_rate": 0.001, + "loss": 3.1766, + "step": 10668 + }, + { + "epoch": 0.4513495219561723, + "grad_norm": 0.7529020309448242, + "learning_rate": 0.001, + "loss": 3.3731, + "step": 10669 + }, + { + "epoch": 0.45139182671968864, + "grad_norm": 0.23148514330387115, + "learning_rate": 0.001, + "loss": 2.8049, + "step": 10670 + }, + { + "epoch": 0.451434131483205, + "grad_norm": 0.28361719846725464, + "learning_rate": 0.001, + "loss": 2.4099, + "step": 10671 + }, + { + "epoch": 0.4514764362467214, + "grad_norm": 0.23970754444599152, + "learning_rate": 0.001, + "loss": 2.2309, + "step": 10672 + }, + { + "epoch": 0.45151874101023776, + "grad_norm": 0.20568141341209412, + "learning_rate": 0.001, + "loss": 2.9698, + "step": 10673 + }, + { + "epoch": 0.4515610457737541, + "grad_norm": 0.1887006312608719, + "learning_rate": 0.001, + "loss": 1.9361, + "step": 10674 + }, + { + "epoch": 0.4516033505372705, + "grad_norm": 0.31066638231277466, + "learning_rate": 0.001, + "loss": 2.2419, + "step": 10675 + }, + { + "epoch": 0.4516456553007869, + "grad_norm": 0.18941254913806915, + "learning_rate": 0.001, + "loss": 2.7735, + "step": 10676 + }, + { + "epoch": 0.45168796006430323, + "grad_norm": 0.16498833894729614, + "learning_rate": 0.001, + "loss": 3.2135, + "step": 10677 + }, + { + "epoch": 0.45173026482781964, + "grad_norm": 0.18185345828533173, + "learning_rate": 0.001, + "loss": 2.1368, + "step": 10678 + }, + { + "epoch": 0.451772569591336, + "grad_norm": 0.17669661343097687, + "learning_rate": 0.001, + "loss": 2.4485, + "step": 10679 + }, + { + "epoch": 0.45181487435485235, + "grad_norm": 0.22324764728546143, + "learning_rate": 0.001, + "loss": 2.542, + "step": 10680 + }, + { + "epoch": 0.4518571791183687, + "grad_norm": 0.17187261581420898, + "learning_rate": 0.001, + "loss": 2.604, + "step": 10681 + }, + { + "epoch": 0.4518994838818851, + "grad_norm": 0.184734508395195, + "learning_rate": 0.001, + "loss": 2.2802, + "step": 10682 + }, + { + "epoch": 0.45194178864540147, + "grad_norm": 0.14700573682785034, + "learning_rate": 0.001, + "loss": 3.5054, + "step": 10683 + }, + { + "epoch": 0.4519840934089178, + "grad_norm": 2.5030956268310547, + "learning_rate": 0.001, + "loss": 2.5592, + "step": 10684 + }, + { + "epoch": 0.45202639817243423, + "grad_norm": 0.20399565994739532, + "learning_rate": 0.001, + "loss": 3.7827, + "step": 10685 + }, + { + "epoch": 0.4520687029359506, + "grad_norm": 1.0105069875717163, + "learning_rate": 0.001, + "loss": 2.5169, + "step": 10686 + }, + { + "epoch": 0.45211100769946694, + "grad_norm": 0.39226314425468445, + "learning_rate": 0.001, + "loss": 1.9134, + "step": 10687 + }, + { + "epoch": 0.45215331246298335, + "grad_norm": 0.19782952964305878, + "learning_rate": 0.001, + "loss": 3.265, + "step": 10688 + }, + { + "epoch": 0.4521956172264997, + "grad_norm": 1.149984359741211, + "learning_rate": 0.001, + "loss": 2.6159, + "step": 10689 + }, + { + "epoch": 0.45223792199001606, + "grad_norm": 0.13431769609451294, + "learning_rate": 0.001, + "loss": 1.4885, + "step": 10690 + }, + { + "epoch": 0.45228022675353247, + "grad_norm": 0.18614532053470612, + "learning_rate": 0.001, + "loss": 1.9288, + "step": 10691 + }, + { + "epoch": 0.4523225315170488, + "grad_norm": 0.2908714711666107, + "learning_rate": 0.001, + "loss": 3.1537, + "step": 10692 + }, + { + "epoch": 0.4523648362805652, + "grad_norm": 0.21332935988903046, + "learning_rate": 0.001, + "loss": 2.2536, + "step": 10693 + }, + { + "epoch": 0.4524071410440816, + "grad_norm": 0.3098827302455902, + "learning_rate": 0.001, + "loss": 5.137, + "step": 10694 + }, + { + "epoch": 0.45244944580759794, + "grad_norm": 0.2915304899215698, + "learning_rate": 0.001, + "loss": 2.5853, + "step": 10695 + }, + { + "epoch": 0.4524917505711143, + "grad_norm": 1.5301575660705566, + "learning_rate": 0.001, + "loss": 2.4361, + "step": 10696 + }, + { + "epoch": 0.4525340553346307, + "grad_norm": 0.1704961210489273, + "learning_rate": 0.001, + "loss": 2.1001, + "step": 10697 + }, + { + "epoch": 0.45257636009814706, + "grad_norm": 0.3829928934574127, + "learning_rate": 0.001, + "loss": 1.9829, + "step": 10698 + }, + { + "epoch": 0.4526186648616634, + "grad_norm": 0.14519084990024567, + "learning_rate": 0.001, + "loss": 1.8952, + "step": 10699 + }, + { + "epoch": 0.4526609696251798, + "grad_norm": 5.465473651885986, + "learning_rate": 0.001, + "loss": 2.0073, + "step": 10700 + }, + { + "epoch": 0.4527032743886962, + "grad_norm": 0.17210935056209564, + "learning_rate": 0.001, + "loss": 2.0146, + "step": 10701 + }, + { + "epoch": 0.45274557915221253, + "grad_norm": 2.133603811264038, + "learning_rate": 0.001, + "loss": 3.2162, + "step": 10702 + }, + { + "epoch": 0.4527878839157289, + "grad_norm": 0.194070965051651, + "learning_rate": 0.001, + "loss": 2.5802, + "step": 10703 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 0.21091453731060028, + "learning_rate": 0.001, + "loss": 2.4426, + "step": 10704 + }, + { + "epoch": 0.45287249344276165, + "grad_norm": 0.1854954957962036, + "learning_rate": 0.001, + "loss": 1.5763, + "step": 10705 + }, + { + "epoch": 0.452914798206278, + "grad_norm": 0.1935143917798996, + "learning_rate": 0.001, + "loss": 1.7313, + "step": 10706 + }, + { + "epoch": 0.4529571029697944, + "grad_norm": 0.32737401127815247, + "learning_rate": 0.001, + "loss": 2.1124, + "step": 10707 + }, + { + "epoch": 0.45299940773331077, + "grad_norm": 3.649876832962036, + "learning_rate": 0.001, + "loss": 2.0724, + "step": 10708 + }, + { + "epoch": 0.4530417124968271, + "grad_norm": 1.8369063138961792, + "learning_rate": 0.001, + "loss": 1.6074, + "step": 10709 + }, + { + "epoch": 0.45308401726034353, + "grad_norm": 0.22719253599643707, + "learning_rate": 0.001, + "loss": 2.463, + "step": 10710 + }, + { + "epoch": 0.4531263220238599, + "grad_norm": 0.38920852541923523, + "learning_rate": 0.001, + "loss": 1.6571, + "step": 10711 + }, + { + "epoch": 0.45316862678737624, + "grad_norm": 0.18624505400657654, + "learning_rate": 0.001, + "loss": 2.507, + "step": 10712 + }, + { + "epoch": 0.45321093155089265, + "grad_norm": 0.2142438441514969, + "learning_rate": 0.001, + "loss": 2.3202, + "step": 10713 + }, + { + "epoch": 0.453253236314409, + "grad_norm": 1.3076987266540527, + "learning_rate": 0.001, + "loss": 1.6404, + "step": 10714 + }, + { + "epoch": 0.45329554107792536, + "grad_norm": 0.9134403467178345, + "learning_rate": 0.001, + "loss": 2.0657, + "step": 10715 + }, + { + "epoch": 0.45333784584144177, + "grad_norm": 0.3806881904602051, + "learning_rate": 0.001, + "loss": 2.0147, + "step": 10716 + }, + { + "epoch": 0.4533801506049581, + "grad_norm": 0.1724647432565689, + "learning_rate": 0.001, + "loss": 2.2622, + "step": 10717 + }, + { + "epoch": 0.4534224553684745, + "grad_norm": 0.5232478976249695, + "learning_rate": 0.001, + "loss": 3.3054, + "step": 10718 + }, + { + "epoch": 0.4534647601319909, + "grad_norm": 0.16044361889362335, + "learning_rate": 0.001, + "loss": 3.2017, + "step": 10719 + }, + { + "epoch": 0.45350706489550724, + "grad_norm": 0.17905324697494507, + "learning_rate": 0.001, + "loss": 1.8238, + "step": 10720 + }, + { + "epoch": 0.4535493696590236, + "grad_norm": 0.20264795422554016, + "learning_rate": 0.001, + "loss": 1.8288, + "step": 10721 + }, + { + "epoch": 0.45359167442254, + "grad_norm": 42.060760498046875, + "learning_rate": 0.001, + "loss": 3.1013, + "step": 10722 + }, + { + "epoch": 0.45363397918605636, + "grad_norm": 0.8245616555213928, + "learning_rate": 0.001, + "loss": 1.8905, + "step": 10723 + }, + { + "epoch": 0.4536762839495727, + "grad_norm": 934.2753295898438, + "learning_rate": 0.001, + "loss": 3.5141, + "step": 10724 + }, + { + "epoch": 0.4537185887130891, + "grad_norm": 0.4653104245662689, + "learning_rate": 0.001, + "loss": 2.0465, + "step": 10725 + }, + { + "epoch": 0.4537608934766055, + "grad_norm": 0.2089160531759262, + "learning_rate": 0.001, + "loss": 2.6989, + "step": 10726 + }, + { + "epoch": 0.45380319824012183, + "grad_norm": 1.048384428024292, + "learning_rate": 0.001, + "loss": 2.7521, + "step": 10727 + }, + { + "epoch": 0.4538455030036382, + "grad_norm": 0.18600456416606903, + "learning_rate": 0.001, + "loss": 2.0483, + "step": 10728 + }, + { + "epoch": 0.4538878077671546, + "grad_norm": 2.7133305072784424, + "learning_rate": 0.001, + "loss": 2.2104, + "step": 10729 + }, + { + "epoch": 0.45393011253067095, + "grad_norm": 0.19306029379367828, + "learning_rate": 0.001, + "loss": 2.466, + "step": 10730 + }, + { + "epoch": 0.4539724172941873, + "grad_norm": 0.1988273561000824, + "learning_rate": 0.001, + "loss": 2.7235, + "step": 10731 + }, + { + "epoch": 0.4540147220577037, + "grad_norm": 6.111566543579102, + "learning_rate": 0.001, + "loss": 2.7007, + "step": 10732 + }, + { + "epoch": 0.45405702682122007, + "grad_norm": 0.4075400233268738, + "learning_rate": 0.001, + "loss": 3.3743, + "step": 10733 + }, + { + "epoch": 0.4540993315847364, + "grad_norm": 0.1714090257883072, + "learning_rate": 0.001, + "loss": 2.4183, + "step": 10734 + }, + { + "epoch": 0.45414163634825283, + "grad_norm": 0.27098438143730164, + "learning_rate": 0.001, + "loss": 2.5263, + "step": 10735 + }, + { + "epoch": 0.4541839411117692, + "grad_norm": 0.17201238870620728, + "learning_rate": 0.001, + "loss": 2.4768, + "step": 10736 + }, + { + "epoch": 0.45422624587528554, + "grad_norm": 0.23325662314891815, + "learning_rate": 0.001, + "loss": 3.6854, + "step": 10737 + }, + { + "epoch": 0.45426855063880195, + "grad_norm": 0.5161541104316711, + "learning_rate": 0.001, + "loss": 2.359, + "step": 10738 + }, + { + "epoch": 0.4543108554023183, + "grad_norm": 0.16155464947223663, + "learning_rate": 0.001, + "loss": 2.0626, + "step": 10739 + }, + { + "epoch": 0.45435316016583466, + "grad_norm": 0.43314120173454285, + "learning_rate": 0.001, + "loss": 2.4758, + "step": 10740 + }, + { + "epoch": 0.45439546492935107, + "grad_norm": 0.17236952483654022, + "learning_rate": 0.001, + "loss": 2.7448, + "step": 10741 + }, + { + "epoch": 0.4544377696928674, + "grad_norm": 1.6672812700271606, + "learning_rate": 0.001, + "loss": 1.8297, + "step": 10742 + }, + { + "epoch": 0.4544800744563838, + "grad_norm": 0.6431288123130798, + "learning_rate": 0.001, + "loss": 2.5279, + "step": 10743 + }, + { + "epoch": 0.4545223792199002, + "grad_norm": 0.1590161770582199, + "learning_rate": 0.001, + "loss": 1.7153, + "step": 10744 + }, + { + "epoch": 0.45456468398341654, + "grad_norm": 0.1368952840566635, + "learning_rate": 0.001, + "loss": 2.4392, + "step": 10745 + }, + { + "epoch": 0.4546069887469329, + "grad_norm": 0.9663378596305847, + "learning_rate": 0.001, + "loss": 2.5891, + "step": 10746 + }, + { + "epoch": 0.4546492935104493, + "grad_norm": 0.19427025318145752, + "learning_rate": 0.001, + "loss": 1.8954, + "step": 10747 + }, + { + "epoch": 0.45469159827396566, + "grad_norm": 2.1707139015197754, + "learning_rate": 0.001, + "loss": 3.155, + "step": 10748 + }, + { + "epoch": 0.454733903037482, + "grad_norm": 0.20545673370361328, + "learning_rate": 0.001, + "loss": 3.2715, + "step": 10749 + }, + { + "epoch": 0.45477620780099837, + "grad_norm": 0.1690516173839569, + "learning_rate": 0.001, + "loss": 2.7757, + "step": 10750 + }, + { + "epoch": 0.4548185125645148, + "grad_norm": 0.4233834743499756, + "learning_rate": 0.001, + "loss": 2.896, + "step": 10751 + }, + { + "epoch": 0.45486081732803113, + "grad_norm": 0.2022281438112259, + "learning_rate": 0.001, + "loss": 2.3454, + "step": 10752 + }, + { + "epoch": 0.4549031220915475, + "grad_norm": 0.17715947329998016, + "learning_rate": 0.001, + "loss": 2.0872, + "step": 10753 + }, + { + "epoch": 0.4549454268550639, + "grad_norm": 0.19292153418064117, + "learning_rate": 0.001, + "loss": 2.1691, + "step": 10754 + }, + { + "epoch": 0.45498773161858025, + "grad_norm": 1.835961103439331, + "learning_rate": 0.001, + "loss": 1.9661, + "step": 10755 + }, + { + "epoch": 0.4550300363820966, + "grad_norm": 0.16486118733882904, + "learning_rate": 0.001, + "loss": 2.5038, + "step": 10756 + }, + { + "epoch": 0.455072341145613, + "grad_norm": 0.7095552682876587, + "learning_rate": 0.001, + "loss": 2.1318, + "step": 10757 + }, + { + "epoch": 0.45511464590912937, + "grad_norm": 0.8125048875808716, + "learning_rate": 0.001, + "loss": 2.7182, + "step": 10758 + }, + { + "epoch": 0.4551569506726457, + "grad_norm": 6.972851753234863, + "learning_rate": 0.001, + "loss": 2.4598, + "step": 10759 + }, + { + "epoch": 0.45519925543616213, + "grad_norm": 1.0214871168136597, + "learning_rate": 0.001, + "loss": 2.0997, + "step": 10760 + }, + { + "epoch": 0.4552415601996785, + "grad_norm": 8.2625093460083, + "learning_rate": 0.001, + "loss": 2.4566, + "step": 10761 + }, + { + "epoch": 0.45528386496319484, + "grad_norm": 0.16502858698368073, + "learning_rate": 0.001, + "loss": 1.868, + "step": 10762 + }, + { + "epoch": 0.45532616972671125, + "grad_norm": 0.4280339777469635, + "learning_rate": 0.001, + "loss": 1.6946, + "step": 10763 + }, + { + "epoch": 0.4553684744902276, + "grad_norm": 0.23198087513446808, + "learning_rate": 0.001, + "loss": 3.1451, + "step": 10764 + }, + { + "epoch": 0.45541077925374396, + "grad_norm": 0.8714398145675659, + "learning_rate": 0.001, + "loss": 3.1186, + "step": 10765 + }, + { + "epoch": 0.45545308401726037, + "grad_norm": 0.2438887655735016, + "learning_rate": 0.001, + "loss": 2.7754, + "step": 10766 + }, + { + "epoch": 0.4554953887807767, + "grad_norm": 0.3168472349643707, + "learning_rate": 0.001, + "loss": 2.4378, + "step": 10767 + }, + { + "epoch": 0.4555376935442931, + "grad_norm": 0.214552104473114, + "learning_rate": 0.001, + "loss": 2.1332, + "step": 10768 + }, + { + "epoch": 0.4555799983078095, + "grad_norm": 0.18347600102424622, + "learning_rate": 0.001, + "loss": 1.9963, + "step": 10769 + }, + { + "epoch": 0.45562230307132584, + "grad_norm": 0.28068867325782776, + "learning_rate": 0.001, + "loss": 2.335, + "step": 10770 + }, + { + "epoch": 0.4556646078348422, + "grad_norm": 0.21685069799423218, + "learning_rate": 0.001, + "loss": 2.0136, + "step": 10771 + }, + { + "epoch": 0.45570691259835855, + "grad_norm": 0.18479874730110168, + "learning_rate": 0.001, + "loss": 2.1247, + "step": 10772 + }, + { + "epoch": 0.45574921736187496, + "grad_norm": 9.378438949584961, + "learning_rate": 0.001, + "loss": 1.8411, + "step": 10773 + }, + { + "epoch": 0.4557915221253913, + "grad_norm": 0.18619363009929657, + "learning_rate": 0.001, + "loss": 2.4149, + "step": 10774 + }, + { + "epoch": 0.45583382688890767, + "grad_norm": 0.48514658212661743, + "learning_rate": 0.001, + "loss": 1.8157, + "step": 10775 + }, + { + "epoch": 0.4558761316524241, + "grad_norm": 0.21699242293834686, + "learning_rate": 0.001, + "loss": 2.5075, + "step": 10776 + }, + { + "epoch": 0.45591843641594043, + "grad_norm": 0.5351647138595581, + "learning_rate": 0.001, + "loss": 2.0256, + "step": 10777 + }, + { + "epoch": 0.4559607411794568, + "grad_norm": 0.18083517253398895, + "learning_rate": 0.001, + "loss": 1.8436, + "step": 10778 + }, + { + "epoch": 0.4560030459429732, + "grad_norm": 0.16347528994083405, + "learning_rate": 0.001, + "loss": 1.6929, + "step": 10779 + }, + { + "epoch": 0.45604535070648955, + "grad_norm": 0.59482342004776, + "learning_rate": 0.001, + "loss": 2.8624, + "step": 10780 + }, + { + "epoch": 0.4560876554700059, + "grad_norm": 0.28643858432769775, + "learning_rate": 0.001, + "loss": 2.5811, + "step": 10781 + }, + { + "epoch": 0.4561299602335223, + "grad_norm": 0.22716449201107025, + "learning_rate": 0.001, + "loss": 2.7431, + "step": 10782 + }, + { + "epoch": 0.45617226499703867, + "grad_norm": 4.5622477531433105, + "learning_rate": 0.001, + "loss": 1.9823, + "step": 10783 + }, + { + "epoch": 0.456214569760555, + "grad_norm": 0.4299876391887665, + "learning_rate": 0.001, + "loss": 1.8034, + "step": 10784 + }, + { + "epoch": 0.45625687452407143, + "grad_norm": 0.2948051989078522, + "learning_rate": 0.001, + "loss": 2.2761, + "step": 10785 + }, + { + "epoch": 0.4562991792875878, + "grad_norm": 0.16705124080181122, + "learning_rate": 0.001, + "loss": 3.0812, + "step": 10786 + }, + { + "epoch": 0.45634148405110414, + "grad_norm": 0.33032622933387756, + "learning_rate": 0.001, + "loss": 2.9178, + "step": 10787 + }, + { + "epoch": 0.45638378881462055, + "grad_norm": 0.3047260642051697, + "learning_rate": 0.001, + "loss": 2.0885, + "step": 10788 + }, + { + "epoch": 0.4564260935781369, + "grad_norm": 0.2009318619966507, + "learning_rate": 0.001, + "loss": 2.5426, + "step": 10789 + }, + { + "epoch": 0.45646839834165326, + "grad_norm": 0.26665037870407104, + "learning_rate": 0.001, + "loss": 3.0866, + "step": 10790 + }, + { + "epoch": 0.45651070310516967, + "grad_norm": 0.15253359079360962, + "learning_rate": 0.001, + "loss": 1.9167, + "step": 10791 + }, + { + "epoch": 0.456553007868686, + "grad_norm": 0.17726293206214905, + "learning_rate": 0.001, + "loss": 1.9892, + "step": 10792 + }, + { + "epoch": 0.4565953126322024, + "grad_norm": 0.1824486255645752, + "learning_rate": 0.001, + "loss": 2.0296, + "step": 10793 + }, + { + "epoch": 0.45663761739571873, + "grad_norm": 0.7334975004196167, + "learning_rate": 0.001, + "loss": 2.5563, + "step": 10794 + }, + { + "epoch": 0.45667992215923514, + "grad_norm": 0.14659246802330017, + "learning_rate": 0.001, + "loss": 1.8249, + "step": 10795 + }, + { + "epoch": 0.4567222269227515, + "grad_norm": 1.9629788398742676, + "learning_rate": 0.001, + "loss": 3.2285, + "step": 10796 + }, + { + "epoch": 0.45676453168626785, + "grad_norm": 0.18042373657226562, + "learning_rate": 0.001, + "loss": 3.0951, + "step": 10797 + }, + { + "epoch": 0.45680683644978426, + "grad_norm": 0.2109190672636032, + "learning_rate": 0.001, + "loss": 2.0418, + "step": 10798 + }, + { + "epoch": 0.4568491412133006, + "grad_norm": 0.2082575410604477, + "learning_rate": 0.001, + "loss": 2.7988, + "step": 10799 + }, + { + "epoch": 0.45689144597681697, + "grad_norm": 0.14357468485832214, + "learning_rate": 0.001, + "loss": 1.6557, + "step": 10800 + }, + { + "epoch": 0.4569337507403334, + "grad_norm": 0.16632072627544403, + "learning_rate": 0.001, + "loss": 1.9405, + "step": 10801 + }, + { + "epoch": 0.45697605550384973, + "grad_norm": 0.4018622636795044, + "learning_rate": 0.001, + "loss": 2.1912, + "step": 10802 + }, + { + "epoch": 0.4570183602673661, + "grad_norm": 0.19781026244163513, + "learning_rate": 0.001, + "loss": 2.1403, + "step": 10803 + }, + { + "epoch": 0.4570606650308825, + "grad_norm": 0.16085539758205414, + "learning_rate": 0.001, + "loss": 1.8415, + "step": 10804 + }, + { + "epoch": 0.45710296979439885, + "grad_norm": 0.2825770080089569, + "learning_rate": 0.001, + "loss": 3.2822, + "step": 10805 + }, + { + "epoch": 0.4571452745579152, + "grad_norm": 0.15267670154571533, + "learning_rate": 0.001, + "loss": 2.0494, + "step": 10806 + }, + { + "epoch": 0.4571875793214316, + "grad_norm": 1.4387484788894653, + "learning_rate": 0.001, + "loss": 3.0406, + "step": 10807 + }, + { + "epoch": 0.45722988408494797, + "grad_norm": 0.14792972803115845, + "learning_rate": 0.001, + "loss": 1.9604, + "step": 10808 + }, + { + "epoch": 0.4572721888484643, + "grad_norm": 0.17246343195438385, + "learning_rate": 0.001, + "loss": 2.1818, + "step": 10809 + }, + { + "epoch": 0.45731449361198073, + "grad_norm": 0.40068310499191284, + "learning_rate": 0.001, + "loss": 1.7939, + "step": 10810 + }, + { + "epoch": 0.4573567983754971, + "grad_norm": 0.19033183157444, + "learning_rate": 0.001, + "loss": 2.0297, + "step": 10811 + }, + { + "epoch": 0.45739910313901344, + "grad_norm": 0.16076108813285828, + "learning_rate": 0.001, + "loss": 1.9428, + "step": 10812 + }, + { + "epoch": 0.45744140790252985, + "grad_norm": 0.1628408133983612, + "learning_rate": 0.001, + "loss": 2.0893, + "step": 10813 + }, + { + "epoch": 0.4574837126660462, + "grad_norm": 0.29927513003349304, + "learning_rate": 0.001, + "loss": 2.7233, + "step": 10814 + }, + { + "epoch": 0.45752601742956256, + "grad_norm": 0.1823870986700058, + "learning_rate": 0.001, + "loss": 2.4618, + "step": 10815 + }, + { + "epoch": 0.4575683221930789, + "grad_norm": 0.4014187455177307, + "learning_rate": 0.001, + "loss": 2.2749, + "step": 10816 + }, + { + "epoch": 0.4576106269565953, + "grad_norm": 0.17997021973133087, + "learning_rate": 0.001, + "loss": 3.2688, + "step": 10817 + }, + { + "epoch": 0.4576529317201117, + "grad_norm": 0.15339773893356323, + "learning_rate": 0.001, + "loss": 1.9151, + "step": 10818 + }, + { + "epoch": 0.45769523648362803, + "grad_norm": 0.36581626534461975, + "learning_rate": 0.001, + "loss": 1.4648, + "step": 10819 + }, + { + "epoch": 0.45773754124714444, + "grad_norm": 0.17709127068519592, + "learning_rate": 0.001, + "loss": 2.4527, + "step": 10820 + }, + { + "epoch": 0.4577798460106608, + "grad_norm": 0.4405263066291809, + "learning_rate": 0.001, + "loss": 2.7126, + "step": 10821 + }, + { + "epoch": 0.45782215077417715, + "grad_norm": 0.6151844263076782, + "learning_rate": 0.001, + "loss": 2.4587, + "step": 10822 + }, + { + "epoch": 0.45786445553769356, + "grad_norm": 0.18310926854610443, + "learning_rate": 0.001, + "loss": 1.8446, + "step": 10823 + }, + { + "epoch": 0.4579067603012099, + "grad_norm": 0.18017029762268066, + "learning_rate": 0.001, + "loss": 2.2304, + "step": 10824 + }, + { + "epoch": 0.45794906506472627, + "grad_norm": 0.9790677428245544, + "learning_rate": 0.001, + "loss": 3.1814, + "step": 10825 + }, + { + "epoch": 0.4579913698282427, + "grad_norm": 0.20925338566303253, + "learning_rate": 0.001, + "loss": 3.3553, + "step": 10826 + }, + { + "epoch": 0.45803367459175903, + "grad_norm": 0.17660897970199585, + "learning_rate": 0.001, + "loss": 2.1409, + "step": 10827 + }, + { + "epoch": 0.4580759793552754, + "grad_norm": 0.48942068219184875, + "learning_rate": 0.001, + "loss": 2.4424, + "step": 10828 + }, + { + "epoch": 0.4581182841187918, + "grad_norm": 0.2551790475845337, + "learning_rate": 0.001, + "loss": 1.7616, + "step": 10829 + }, + { + "epoch": 0.45816058888230815, + "grad_norm": 0.20451773703098297, + "learning_rate": 0.001, + "loss": 2.0661, + "step": 10830 + }, + { + "epoch": 0.4582028936458245, + "grad_norm": 0.2622445225715637, + "learning_rate": 0.001, + "loss": 1.9234, + "step": 10831 + }, + { + "epoch": 0.4582451984093409, + "grad_norm": 0.16220338642597198, + "learning_rate": 0.001, + "loss": 1.6778, + "step": 10832 + }, + { + "epoch": 0.45828750317285727, + "grad_norm": 0.2827407121658325, + "learning_rate": 0.001, + "loss": 2.8145, + "step": 10833 + }, + { + "epoch": 0.4583298079363736, + "grad_norm": 0.21268029510974884, + "learning_rate": 0.001, + "loss": 2.3485, + "step": 10834 + }, + { + "epoch": 0.45837211269989003, + "grad_norm": 0.23748518526554108, + "learning_rate": 0.001, + "loss": 2.6988, + "step": 10835 + }, + { + "epoch": 0.4584144174634064, + "grad_norm": 0.4688171148300171, + "learning_rate": 0.001, + "loss": 2.2598, + "step": 10836 + }, + { + "epoch": 0.45845672222692274, + "grad_norm": 2.4489855766296387, + "learning_rate": 0.001, + "loss": 2.3562, + "step": 10837 + }, + { + "epoch": 0.4584990269904391, + "grad_norm": 0.1477189064025879, + "learning_rate": 0.001, + "loss": 2.064, + "step": 10838 + }, + { + "epoch": 0.4585413317539555, + "grad_norm": 0.20878483355045319, + "learning_rate": 0.001, + "loss": 2.477, + "step": 10839 + }, + { + "epoch": 0.45858363651747186, + "grad_norm": 0.47598791122436523, + "learning_rate": 0.001, + "loss": 2.7616, + "step": 10840 + }, + { + "epoch": 0.4586259412809882, + "grad_norm": 0.16954365372657776, + "learning_rate": 0.001, + "loss": 3.0694, + "step": 10841 + }, + { + "epoch": 0.4586682460445046, + "grad_norm": 0.25619927048683167, + "learning_rate": 0.001, + "loss": 1.9389, + "step": 10842 + }, + { + "epoch": 0.458710550808021, + "grad_norm": 1.3417062759399414, + "learning_rate": 0.001, + "loss": 2.3684, + "step": 10843 + }, + { + "epoch": 0.45875285557153733, + "grad_norm": 1.6794036626815796, + "learning_rate": 0.001, + "loss": 2.2654, + "step": 10844 + }, + { + "epoch": 0.45879516033505374, + "grad_norm": 0.185557559132576, + "learning_rate": 0.001, + "loss": 2.1782, + "step": 10845 + }, + { + "epoch": 0.4588374650985701, + "grad_norm": 0.1776900589466095, + "learning_rate": 0.001, + "loss": 2.9762, + "step": 10846 + }, + { + "epoch": 0.45887976986208645, + "grad_norm": 0.19009187817573547, + "learning_rate": 0.001, + "loss": 2.3947, + "step": 10847 + }, + { + "epoch": 0.45892207462560286, + "grad_norm": 0.17857715487480164, + "learning_rate": 0.001, + "loss": 1.9718, + "step": 10848 + }, + { + "epoch": 0.4589643793891192, + "grad_norm": 0.19130466878414154, + "learning_rate": 0.001, + "loss": 3.6092, + "step": 10849 + }, + { + "epoch": 0.45900668415263557, + "grad_norm": 1.0953785181045532, + "learning_rate": 0.001, + "loss": 2.8111, + "step": 10850 + }, + { + "epoch": 0.459048988916152, + "grad_norm": 0.1537635773420334, + "learning_rate": 0.001, + "loss": 1.9802, + "step": 10851 + }, + { + "epoch": 0.45909129367966833, + "grad_norm": 0.16897273063659668, + "learning_rate": 0.001, + "loss": 2.0585, + "step": 10852 + }, + { + "epoch": 0.4591335984431847, + "grad_norm": 0.9091619253158569, + "learning_rate": 0.001, + "loss": 2.9428, + "step": 10853 + }, + { + "epoch": 0.4591759032067011, + "grad_norm": 0.16677002608776093, + "learning_rate": 0.001, + "loss": 1.8913, + "step": 10854 + }, + { + "epoch": 0.45921820797021745, + "grad_norm": 0.21832408010959625, + "learning_rate": 0.001, + "loss": 3.3002, + "step": 10855 + }, + { + "epoch": 0.4592605127337338, + "grad_norm": 0.1814076453447342, + "learning_rate": 0.001, + "loss": 1.5207, + "step": 10856 + }, + { + "epoch": 0.4593028174972502, + "grad_norm": 0.17598986625671387, + "learning_rate": 0.001, + "loss": 2.3064, + "step": 10857 + }, + { + "epoch": 0.45934512226076657, + "grad_norm": 0.24575437605381012, + "learning_rate": 0.001, + "loss": 2.6707, + "step": 10858 + }, + { + "epoch": 0.4593874270242829, + "grad_norm": 0.1765861064195633, + "learning_rate": 0.001, + "loss": 1.8149, + "step": 10859 + }, + { + "epoch": 0.45942973178779933, + "grad_norm": 0.7690317034721375, + "learning_rate": 0.001, + "loss": 2.0892, + "step": 10860 + }, + { + "epoch": 0.4594720365513157, + "grad_norm": 0.1727459579706192, + "learning_rate": 0.001, + "loss": 1.9199, + "step": 10861 + }, + { + "epoch": 0.45951434131483204, + "grad_norm": 0.1738310158252716, + "learning_rate": 0.001, + "loss": 2.377, + "step": 10862 + }, + { + "epoch": 0.4595566460783484, + "grad_norm": 12.975968360900879, + "learning_rate": 0.001, + "loss": 2.9698, + "step": 10863 + }, + { + "epoch": 0.4595989508418648, + "grad_norm": 0.23020650446414948, + "learning_rate": 0.001, + "loss": 2.3748, + "step": 10864 + }, + { + "epoch": 0.45964125560538116, + "grad_norm": 0.14543908834457397, + "learning_rate": 0.001, + "loss": 1.7494, + "step": 10865 + }, + { + "epoch": 0.4596835603688975, + "grad_norm": 1.8263896703720093, + "learning_rate": 0.001, + "loss": 2.5401, + "step": 10866 + }, + { + "epoch": 0.4597258651324139, + "grad_norm": 0.23865966498851776, + "learning_rate": 0.001, + "loss": 2.27, + "step": 10867 + }, + { + "epoch": 0.4597681698959303, + "grad_norm": 0.7951701283454895, + "learning_rate": 0.001, + "loss": 1.5266, + "step": 10868 + }, + { + "epoch": 0.45981047465944663, + "grad_norm": 0.18856553733348846, + "learning_rate": 0.001, + "loss": 3.1963, + "step": 10869 + }, + { + "epoch": 0.45985277942296304, + "grad_norm": 0.16450615227222443, + "learning_rate": 0.001, + "loss": 2.5576, + "step": 10870 + }, + { + "epoch": 0.4598950841864794, + "grad_norm": 0.18288759887218475, + "learning_rate": 0.001, + "loss": 1.5935, + "step": 10871 + }, + { + "epoch": 0.45993738894999575, + "grad_norm": 0.16834130883216858, + "learning_rate": 0.001, + "loss": 1.5081, + "step": 10872 + }, + { + "epoch": 0.45997969371351216, + "grad_norm": 0.1692802906036377, + "learning_rate": 0.001, + "loss": 1.8773, + "step": 10873 + }, + { + "epoch": 0.4600219984770285, + "grad_norm": 0.1614452451467514, + "learning_rate": 0.001, + "loss": 2.5756, + "step": 10874 + }, + { + "epoch": 0.46006430324054487, + "grad_norm": 0.18225987255573273, + "learning_rate": 0.001, + "loss": 1.9148, + "step": 10875 + }, + { + "epoch": 0.4601066080040613, + "grad_norm": 0.16945257782936096, + "learning_rate": 0.001, + "loss": 1.9408, + "step": 10876 + }, + { + "epoch": 0.46014891276757763, + "grad_norm": 0.18923348188400269, + "learning_rate": 0.001, + "loss": 3.1651, + "step": 10877 + }, + { + "epoch": 0.460191217531094, + "grad_norm": 0.7960704565048218, + "learning_rate": 0.001, + "loss": 2.0982, + "step": 10878 + }, + { + "epoch": 0.4602335222946104, + "grad_norm": 0.16281262040138245, + "learning_rate": 0.001, + "loss": 2.7911, + "step": 10879 + }, + { + "epoch": 0.46027582705812675, + "grad_norm": 0.15493243932724, + "learning_rate": 0.001, + "loss": 1.7558, + "step": 10880 + }, + { + "epoch": 0.4603181318216431, + "grad_norm": 1.4312360286712646, + "learning_rate": 0.001, + "loss": 2.6279, + "step": 10881 + }, + { + "epoch": 0.4603604365851595, + "grad_norm": 0.18182626366615295, + "learning_rate": 0.001, + "loss": 2.3233, + "step": 10882 + }, + { + "epoch": 0.46040274134867587, + "grad_norm": 0.17061913013458252, + "learning_rate": 0.001, + "loss": 2.0113, + "step": 10883 + }, + { + "epoch": 0.4604450461121922, + "grad_norm": 1.3097604513168335, + "learning_rate": 0.001, + "loss": 2.1427, + "step": 10884 + }, + { + "epoch": 0.4604873508757086, + "grad_norm": 0.1752685308456421, + "learning_rate": 0.001, + "loss": 2.0929, + "step": 10885 + }, + { + "epoch": 0.460529655639225, + "grad_norm": 0.3267187178134918, + "learning_rate": 0.001, + "loss": 2.5284, + "step": 10886 + }, + { + "epoch": 0.46057196040274134, + "grad_norm": 0.17250603437423706, + "learning_rate": 0.001, + "loss": 2.8212, + "step": 10887 + }, + { + "epoch": 0.4606142651662577, + "grad_norm": 1.1300095319747925, + "learning_rate": 0.001, + "loss": 2.0045, + "step": 10888 + }, + { + "epoch": 0.4606565699297741, + "grad_norm": 0.18050777912139893, + "learning_rate": 0.001, + "loss": 2.8097, + "step": 10889 + }, + { + "epoch": 0.46069887469329046, + "grad_norm": 0.16347844898700714, + "learning_rate": 0.001, + "loss": 2.505, + "step": 10890 + }, + { + "epoch": 0.4607411794568068, + "grad_norm": 4.939105033874512, + "learning_rate": 0.001, + "loss": 2.3762, + "step": 10891 + }, + { + "epoch": 0.4607834842203232, + "grad_norm": 13.591337203979492, + "learning_rate": 0.001, + "loss": 2.09, + "step": 10892 + }, + { + "epoch": 0.4608257889838396, + "grad_norm": 0.9323223829269409, + "learning_rate": 0.001, + "loss": 1.6851, + "step": 10893 + }, + { + "epoch": 0.46086809374735593, + "grad_norm": 0.7729269862174988, + "learning_rate": 0.001, + "loss": 1.4762, + "step": 10894 + }, + { + "epoch": 0.46091039851087234, + "grad_norm": 8.235560417175293, + "learning_rate": 0.001, + "loss": 2.5788, + "step": 10895 + }, + { + "epoch": 0.4609527032743887, + "grad_norm": 0.15181410312652588, + "learning_rate": 0.001, + "loss": 1.448, + "step": 10896 + }, + { + "epoch": 0.46099500803790505, + "grad_norm": 0.18965478241443634, + "learning_rate": 0.001, + "loss": 1.7609, + "step": 10897 + }, + { + "epoch": 0.46103731280142146, + "grad_norm": 1.7368152141571045, + "learning_rate": 0.001, + "loss": 3.6529, + "step": 10898 + }, + { + "epoch": 0.4610796175649378, + "grad_norm": 0.22795704007148743, + "learning_rate": 0.001, + "loss": 2.778, + "step": 10899 + }, + { + "epoch": 0.46112192232845417, + "grad_norm": 0.15993206202983856, + "learning_rate": 0.001, + "loss": 1.6612, + "step": 10900 + }, + { + "epoch": 0.4611642270919706, + "grad_norm": 0.1846051812171936, + "learning_rate": 0.001, + "loss": 1.5179, + "step": 10901 + }, + { + "epoch": 0.46120653185548693, + "grad_norm": 0.22482596337795258, + "learning_rate": 0.001, + "loss": 2.6861, + "step": 10902 + }, + { + "epoch": 0.4612488366190033, + "grad_norm": 10.483909606933594, + "learning_rate": 0.001, + "loss": 2.1016, + "step": 10903 + }, + { + "epoch": 0.4612911413825197, + "grad_norm": 0.45602327585220337, + "learning_rate": 0.001, + "loss": 2.7724, + "step": 10904 + }, + { + "epoch": 0.46133344614603605, + "grad_norm": 0.19587081670761108, + "learning_rate": 0.001, + "loss": 1.8976, + "step": 10905 + }, + { + "epoch": 0.4613757509095524, + "grad_norm": 0.22737310826778412, + "learning_rate": 0.001, + "loss": 3.4405, + "step": 10906 + }, + { + "epoch": 0.46141805567306876, + "grad_norm": 0.17420297861099243, + "learning_rate": 0.001, + "loss": 2.4252, + "step": 10907 + }, + { + "epoch": 0.46146036043658517, + "grad_norm": 0.15364967286586761, + "learning_rate": 0.001, + "loss": 1.766, + "step": 10908 + }, + { + "epoch": 0.4615026652001015, + "grad_norm": 1.44530189037323, + "learning_rate": 0.001, + "loss": 2.0774, + "step": 10909 + }, + { + "epoch": 0.4615449699636179, + "grad_norm": 0.16581588983535767, + "learning_rate": 0.001, + "loss": 1.5537, + "step": 10910 + }, + { + "epoch": 0.4615872747271343, + "grad_norm": 0.18303801119327545, + "learning_rate": 0.001, + "loss": 2.2337, + "step": 10911 + }, + { + "epoch": 0.46162957949065064, + "grad_norm": 0.1899944394826889, + "learning_rate": 0.001, + "loss": 1.3449, + "step": 10912 + }, + { + "epoch": 0.461671884254167, + "grad_norm": 1.3561519384384155, + "learning_rate": 0.001, + "loss": 2.3431, + "step": 10913 + }, + { + "epoch": 0.4617141890176834, + "grad_norm": 0.17157615721225739, + "learning_rate": 0.001, + "loss": 2.5933, + "step": 10914 + }, + { + "epoch": 0.46175649378119976, + "grad_norm": 1.1707208156585693, + "learning_rate": 0.001, + "loss": 2.3276, + "step": 10915 + }, + { + "epoch": 0.4617987985447161, + "grad_norm": 0.3634940981864929, + "learning_rate": 0.001, + "loss": 2.8591, + "step": 10916 + }, + { + "epoch": 0.4618411033082325, + "grad_norm": 0.3375161588191986, + "learning_rate": 0.001, + "loss": 2.2901, + "step": 10917 + }, + { + "epoch": 0.4618834080717489, + "grad_norm": 31.233200073242188, + "learning_rate": 0.001, + "loss": 2.6661, + "step": 10918 + }, + { + "epoch": 0.46192571283526523, + "grad_norm": 0.4107927083969116, + "learning_rate": 0.001, + "loss": 2.019, + "step": 10919 + }, + { + "epoch": 0.46196801759878164, + "grad_norm": 0.7090280652046204, + "learning_rate": 0.001, + "loss": 2.5535, + "step": 10920 + }, + { + "epoch": 0.462010322362298, + "grad_norm": 2.35701060295105, + "learning_rate": 0.001, + "loss": 1.7864, + "step": 10921 + }, + { + "epoch": 0.46205262712581435, + "grad_norm": 0.16083607077598572, + "learning_rate": 0.001, + "loss": 1.9362, + "step": 10922 + }, + { + "epoch": 0.46209493188933076, + "grad_norm": 0.20161986351013184, + "learning_rate": 0.001, + "loss": 2.4446, + "step": 10923 + }, + { + "epoch": 0.4621372366528471, + "grad_norm": 7.899982929229736, + "learning_rate": 0.001, + "loss": 3.1712, + "step": 10924 + }, + { + "epoch": 0.46217954141636347, + "grad_norm": 0.15877409279346466, + "learning_rate": 0.001, + "loss": 2.2344, + "step": 10925 + }, + { + "epoch": 0.4622218461798799, + "grad_norm": 0.19421695172786713, + "learning_rate": 0.001, + "loss": 2.574, + "step": 10926 + }, + { + "epoch": 0.46226415094339623, + "grad_norm": 0.5524720549583435, + "learning_rate": 0.001, + "loss": 2.2769, + "step": 10927 + }, + { + "epoch": 0.4623064557069126, + "grad_norm": 0.30834636092185974, + "learning_rate": 0.001, + "loss": 1.5401, + "step": 10928 + }, + { + "epoch": 0.46234876047042894, + "grad_norm": 0.17443600296974182, + "learning_rate": 0.001, + "loss": 3.5821, + "step": 10929 + }, + { + "epoch": 0.46239106523394535, + "grad_norm": 0.17415951192378998, + "learning_rate": 0.001, + "loss": 2.5898, + "step": 10930 + }, + { + "epoch": 0.4624333699974617, + "grad_norm": 0.14136327803134918, + "learning_rate": 0.001, + "loss": 2.4831, + "step": 10931 + }, + { + "epoch": 0.46247567476097806, + "grad_norm": 13.823705673217773, + "learning_rate": 0.001, + "loss": 3.3177, + "step": 10932 + }, + { + "epoch": 0.46251797952449447, + "grad_norm": 0.24503834545612335, + "learning_rate": 0.001, + "loss": 2.119, + "step": 10933 + }, + { + "epoch": 0.4625602842880108, + "grad_norm": 0.24650239944458008, + "learning_rate": 0.001, + "loss": 2.9394, + "step": 10934 + }, + { + "epoch": 0.4626025890515272, + "grad_norm": 0.193034827709198, + "learning_rate": 0.001, + "loss": 2.3874, + "step": 10935 + }, + { + "epoch": 0.4626448938150436, + "grad_norm": 0.16548699140548706, + "learning_rate": 0.001, + "loss": 2.2558, + "step": 10936 + }, + { + "epoch": 0.46268719857855994, + "grad_norm": 0.2028469741344452, + "learning_rate": 0.001, + "loss": 3.2924, + "step": 10937 + }, + { + "epoch": 0.4627295033420763, + "grad_norm": 0.25958749651908875, + "learning_rate": 0.001, + "loss": 1.914, + "step": 10938 + }, + { + "epoch": 0.4627718081055927, + "grad_norm": 0.2172216922044754, + "learning_rate": 0.001, + "loss": 2.3915, + "step": 10939 + }, + { + "epoch": 0.46281411286910906, + "grad_norm": 0.22597426176071167, + "learning_rate": 0.001, + "loss": 2.447, + "step": 10940 + }, + { + "epoch": 0.4628564176326254, + "grad_norm": 0.6521003842353821, + "learning_rate": 0.001, + "loss": 3.1807, + "step": 10941 + }, + { + "epoch": 0.4628987223961418, + "grad_norm": 0.2350475788116455, + "learning_rate": 0.001, + "loss": 2.4876, + "step": 10942 + }, + { + "epoch": 0.4629410271596582, + "grad_norm": 0.15000639855861664, + "learning_rate": 0.001, + "loss": 2.5798, + "step": 10943 + }, + { + "epoch": 0.46298333192317453, + "grad_norm": 13.369725227355957, + "learning_rate": 0.001, + "loss": 2.4465, + "step": 10944 + }, + { + "epoch": 0.46302563668669094, + "grad_norm": 0.21107615530490875, + "learning_rate": 0.001, + "loss": 3.5766, + "step": 10945 + }, + { + "epoch": 0.4630679414502073, + "grad_norm": 0.45922183990478516, + "learning_rate": 0.001, + "loss": 3.318, + "step": 10946 + }, + { + "epoch": 0.46311024621372365, + "grad_norm": 0.15640473365783691, + "learning_rate": 0.001, + "loss": 1.9649, + "step": 10947 + }, + { + "epoch": 0.46315255097724006, + "grad_norm": 0.17740632593631744, + "learning_rate": 0.001, + "loss": 2.4352, + "step": 10948 + }, + { + "epoch": 0.4631948557407564, + "grad_norm": 0.21806718409061432, + "learning_rate": 0.001, + "loss": 3.8975, + "step": 10949 + }, + { + "epoch": 0.46323716050427277, + "grad_norm": 0.17442677915096283, + "learning_rate": 0.001, + "loss": 2.0025, + "step": 10950 + }, + { + "epoch": 0.4632794652677891, + "grad_norm": 0.18725450336933136, + "learning_rate": 0.001, + "loss": 1.6942, + "step": 10951 + }, + { + "epoch": 0.46332177003130554, + "grad_norm": 0.2647855281829834, + "learning_rate": 0.001, + "loss": 1.9792, + "step": 10952 + }, + { + "epoch": 0.4633640747948219, + "grad_norm": 0.22588206827640533, + "learning_rate": 0.001, + "loss": 2.6015, + "step": 10953 + }, + { + "epoch": 0.46340637955833824, + "grad_norm": 0.18170830607414246, + "learning_rate": 0.001, + "loss": 2.4247, + "step": 10954 + }, + { + "epoch": 0.46344868432185465, + "grad_norm": 0.19047954678535461, + "learning_rate": 0.001, + "loss": 2.5552, + "step": 10955 + }, + { + "epoch": 0.463490989085371, + "grad_norm": 0.18378345668315887, + "learning_rate": 0.001, + "loss": 3.1469, + "step": 10956 + }, + { + "epoch": 0.46353329384888736, + "grad_norm": 1.0354962348937988, + "learning_rate": 0.001, + "loss": 3.1662, + "step": 10957 + }, + { + "epoch": 0.46357559861240377, + "grad_norm": 0.16644562780857086, + "learning_rate": 0.001, + "loss": 1.9066, + "step": 10958 + }, + { + "epoch": 0.4636179033759201, + "grad_norm": 0.17657527327537537, + "learning_rate": 0.001, + "loss": 2.6647, + "step": 10959 + }, + { + "epoch": 0.4636602081394365, + "grad_norm": 0.43034595251083374, + "learning_rate": 0.001, + "loss": 2.2832, + "step": 10960 + }, + { + "epoch": 0.4637025129029529, + "grad_norm": 1.4906646013259888, + "learning_rate": 0.001, + "loss": 2.355, + "step": 10961 + }, + { + "epoch": 0.46374481766646924, + "grad_norm": 0.324938029050827, + "learning_rate": 0.001, + "loss": 1.9415, + "step": 10962 + }, + { + "epoch": 0.4637871224299856, + "grad_norm": 1.2835636138916016, + "learning_rate": 0.001, + "loss": 1.9729, + "step": 10963 + }, + { + "epoch": 0.463829427193502, + "grad_norm": 0.1738107055425644, + "learning_rate": 0.001, + "loss": 2.4461, + "step": 10964 + }, + { + "epoch": 0.46387173195701836, + "grad_norm": 0.23237141966819763, + "learning_rate": 0.001, + "loss": 1.4232, + "step": 10965 + }, + { + "epoch": 0.4639140367205347, + "grad_norm": 0.30422407388687134, + "learning_rate": 0.001, + "loss": 2.2135, + "step": 10966 + }, + { + "epoch": 0.4639563414840511, + "grad_norm": 0.17090842127799988, + "learning_rate": 0.001, + "loss": 2.0125, + "step": 10967 + }, + { + "epoch": 0.4639986462475675, + "grad_norm": 0.1509457677602768, + "learning_rate": 0.001, + "loss": 2.2202, + "step": 10968 + }, + { + "epoch": 0.46404095101108384, + "grad_norm": 0.16378089785575867, + "learning_rate": 0.001, + "loss": 2.301, + "step": 10969 + }, + { + "epoch": 0.46408325577460025, + "grad_norm": 6.196132659912109, + "learning_rate": 0.001, + "loss": 2.2314, + "step": 10970 + }, + { + "epoch": 0.4641255605381166, + "grad_norm": 0.44474777579307556, + "learning_rate": 0.001, + "loss": 2.6964, + "step": 10971 + }, + { + "epoch": 0.46416786530163295, + "grad_norm": 0.21871718764305115, + "learning_rate": 0.001, + "loss": 2.0155, + "step": 10972 + }, + { + "epoch": 0.46421017006514936, + "grad_norm": 0.24714459478855133, + "learning_rate": 0.001, + "loss": 1.7315, + "step": 10973 + }, + { + "epoch": 0.4642524748286657, + "grad_norm": 0.2337181717157364, + "learning_rate": 0.001, + "loss": 3.461, + "step": 10974 + }, + { + "epoch": 0.46429477959218207, + "grad_norm": 0.19878096878528595, + "learning_rate": 0.001, + "loss": 2.4174, + "step": 10975 + }, + { + "epoch": 0.4643370843556984, + "grad_norm": 0.19646792113780975, + "learning_rate": 0.001, + "loss": 3.2401, + "step": 10976 + }, + { + "epoch": 0.46437938911921484, + "grad_norm": 0.5441222786903381, + "learning_rate": 0.001, + "loss": 2.0292, + "step": 10977 + }, + { + "epoch": 0.4644216938827312, + "grad_norm": 0.19950753450393677, + "learning_rate": 0.001, + "loss": 2.2161, + "step": 10978 + }, + { + "epoch": 0.46446399864624754, + "grad_norm": 0.301084965467453, + "learning_rate": 0.001, + "loss": 1.6781, + "step": 10979 + }, + { + "epoch": 0.46450630340976395, + "grad_norm": 0.1772916615009308, + "learning_rate": 0.001, + "loss": 1.5019, + "step": 10980 + }, + { + "epoch": 0.4645486081732803, + "grad_norm": 0.170380100607872, + "learning_rate": 0.001, + "loss": 1.8409, + "step": 10981 + }, + { + "epoch": 0.46459091293679666, + "grad_norm": 0.1674899011850357, + "learning_rate": 0.001, + "loss": 1.9776, + "step": 10982 + }, + { + "epoch": 0.4646332177003131, + "grad_norm": 0.20366886258125305, + "learning_rate": 0.001, + "loss": 2.4532, + "step": 10983 + }, + { + "epoch": 0.4646755224638294, + "grad_norm": 0.18768104910850525, + "learning_rate": 0.001, + "loss": 1.868, + "step": 10984 + }, + { + "epoch": 0.4647178272273458, + "grad_norm": 0.19560806453227997, + "learning_rate": 0.001, + "loss": 2.4912, + "step": 10985 + }, + { + "epoch": 0.4647601319908622, + "grad_norm": 0.3951105773448944, + "learning_rate": 0.001, + "loss": 2.1868, + "step": 10986 + }, + { + "epoch": 0.46480243675437855, + "grad_norm": 0.9056179523468018, + "learning_rate": 0.001, + "loss": 2.7611, + "step": 10987 + }, + { + "epoch": 0.4648447415178949, + "grad_norm": 0.1860157996416092, + "learning_rate": 0.001, + "loss": 2.0899, + "step": 10988 + }, + { + "epoch": 0.4648870462814113, + "grad_norm": 0.7894105911254883, + "learning_rate": 0.001, + "loss": 2.2895, + "step": 10989 + }, + { + "epoch": 0.46492935104492766, + "grad_norm": 0.16466152667999268, + "learning_rate": 0.001, + "loss": 2.6064, + "step": 10990 + }, + { + "epoch": 0.464971655808444, + "grad_norm": 0.17930325865745544, + "learning_rate": 0.001, + "loss": 1.7395, + "step": 10991 + }, + { + "epoch": 0.4650139605719604, + "grad_norm": 0.1949310451745987, + "learning_rate": 0.001, + "loss": 2.3786, + "step": 10992 + }, + { + "epoch": 0.4650562653354768, + "grad_norm": 0.17911165952682495, + "learning_rate": 0.001, + "loss": 1.5416, + "step": 10993 + }, + { + "epoch": 0.46509857009899314, + "grad_norm": 0.207133486866951, + "learning_rate": 0.001, + "loss": 2.3997, + "step": 10994 + }, + { + "epoch": 0.46514087486250955, + "grad_norm": 0.1573377251625061, + "learning_rate": 0.001, + "loss": 1.966, + "step": 10995 + }, + { + "epoch": 0.4651831796260259, + "grad_norm": 0.7414722442626953, + "learning_rate": 0.001, + "loss": 3.0765, + "step": 10996 + }, + { + "epoch": 0.46522548438954225, + "grad_norm": 0.17348453402519226, + "learning_rate": 0.001, + "loss": 2.8971, + "step": 10997 + }, + { + "epoch": 0.4652677891530586, + "grad_norm": 1.594288945198059, + "learning_rate": 0.001, + "loss": 2.3042, + "step": 10998 + }, + { + "epoch": 0.465310093916575, + "grad_norm": 0.24077658355236053, + "learning_rate": 0.001, + "loss": 2.2233, + "step": 10999 + }, + { + "epoch": 0.4653523986800914, + "grad_norm": 0.2647017538547516, + "learning_rate": 0.001, + "loss": 1.2681, + "step": 11000 + }, + { + "epoch": 0.4653947034436077, + "grad_norm": 0.15793390572071075, + "learning_rate": 0.001, + "loss": 1.6003, + "step": 11001 + }, + { + "epoch": 0.46543700820712414, + "grad_norm": 0.1997009813785553, + "learning_rate": 0.001, + "loss": 1.7316, + "step": 11002 + }, + { + "epoch": 0.4654793129706405, + "grad_norm": 0.18965008854866028, + "learning_rate": 0.001, + "loss": 2.4437, + "step": 11003 + }, + { + "epoch": 0.46552161773415685, + "grad_norm": 0.24327543377876282, + "learning_rate": 0.001, + "loss": 2.5611, + "step": 11004 + }, + { + "epoch": 0.46556392249767325, + "grad_norm": 6.968198776245117, + "learning_rate": 0.001, + "loss": 2.3886, + "step": 11005 + }, + { + "epoch": 0.4656062272611896, + "grad_norm": 0.2511080503463745, + "learning_rate": 0.001, + "loss": 2.9208, + "step": 11006 + }, + { + "epoch": 0.46564853202470596, + "grad_norm": 0.1881697177886963, + "learning_rate": 0.001, + "loss": 1.5614, + "step": 11007 + }, + { + "epoch": 0.4656908367882224, + "grad_norm": 0.4397967755794525, + "learning_rate": 0.001, + "loss": 3.3356, + "step": 11008 + }, + { + "epoch": 0.4657331415517387, + "grad_norm": 0.2115083485841751, + "learning_rate": 0.001, + "loss": 2.1108, + "step": 11009 + }, + { + "epoch": 0.4657754463152551, + "grad_norm": 0.17709064483642578, + "learning_rate": 0.001, + "loss": 3.367, + "step": 11010 + }, + { + "epoch": 0.4658177510787715, + "grad_norm": 0.263375848531723, + "learning_rate": 0.001, + "loss": 2.0427, + "step": 11011 + }, + { + "epoch": 0.46586005584228785, + "grad_norm": 0.20178444683551788, + "learning_rate": 0.001, + "loss": 2.385, + "step": 11012 + }, + { + "epoch": 0.4659023606058042, + "grad_norm": 0.19885046780109406, + "learning_rate": 0.001, + "loss": 3.0593, + "step": 11013 + }, + { + "epoch": 0.4659446653693206, + "grad_norm": 0.27875369787216187, + "learning_rate": 0.001, + "loss": 3.4957, + "step": 11014 + }, + { + "epoch": 0.46598697013283696, + "grad_norm": 0.21964240074157715, + "learning_rate": 0.001, + "loss": 2.5046, + "step": 11015 + }, + { + "epoch": 0.4660292748963533, + "grad_norm": 0.16891005635261536, + "learning_rate": 0.001, + "loss": 2.9566, + "step": 11016 + }, + { + "epoch": 0.46607157965986973, + "grad_norm": 0.17071029543876648, + "learning_rate": 0.001, + "loss": 2.141, + "step": 11017 + }, + { + "epoch": 0.4661138844233861, + "grad_norm": 17.83469009399414, + "learning_rate": 0.001, + "loss": 2.0753, + "step": 11018 + }, + { + "epoch": 0.46615618918690244, + "grad_norm": 0.20334742963314056, + "learning_rate": 0.001, + "loss": 3.0571, + "step": 11019 + }, + { + "epoch": 0.4661984939504188, + "grad_norm": 0.1513107866048813, + "learning_rate": 0.001, + "loss": 2.1704, + "step": 11020 + }, + { + "epoch": 0.4662407987139352, + "grad_norm": 0.15699905157089233, + "learning_rate": 0.001, + "loss": 2.4362, + "step": 11021 + }, + { + "epoch": 0.46628310347745155, + "grad_norm": 0.25063201785087585, + "learning_rate": 0.001, + "loss": 2.6234, + "step": 11022 + }, + { + "epoch": 0.4663254082409679, + "grad_norm": 3.148904323577881, + "learning_rate": 0.001, + "loss": 3.4478, + "step": 11023 + }, + { + "epoch": 0.4663677130044843, + "grad_norm": 0.22315865755081177, + "learning_rate": 0.001, + "loss": 2.1858, + "step": 11024 + }, + { + "epoch": 0.4664100177680007, + "grad_norm": 0.24025224149227142, + "learning_rate": 0.001, + "loss": 2.9627, + "step": 11025 + }, + { + "epoch": 0.466452322531517, + "grad_norm": 0.2143600881099701, + "learning_rate": 0.001, + "loss": 1.9114, + "step": 11026 + }, + { + "epoch": 0.46649462729503344, + "grad_norm": 0.5619837641716003, + "learning_rate": 0.001, + "loss": 2.6092, + "step": 11027 + }, + { + "epoch": 0.4665369320585498, + "grad_norm": 0.2009797990322113, + "learning_rate": 0.001, + "loss": 1.9033, + "step": 11028 + }, + { + "epoch": 0.46657923682206615, + "grad_norm": 1.4359869956970215, + "learning_rate": 0.001, + "loss": 2.7386, + "step": 11029 + }, + { + "epoch": 0.46662154158558256, + "grad_norm": 0.15855303406715393, + "learning_rate": 0.001, + "loss": 3.3832, + "step": 11030 + }, + { + "epoch": 0.4666638463490989, + "grad_norm": 0.1745101362466812, + "learning_rate": 0.001, + "loss": 2.5427, + "step": 11031 + }, + { + "epoch": 0.46670615111261526, + "grad_norm": 0.20111802220344543, + "learning_rate": 0.001, + "loss": 2.204, + "step": 11032 + }, + { + "epoch": 0.4667484558761317, + "grad_norm": 0.22334951162338257, + "learning_rate": 0.001, + "loss": 2.4836, + "step": 11033 + }, + { + "epoch": 0.46679076063964803, + "grad_norm": 0.17813622951507568, + "learning_rate": 0.001, + "loss": 2.9059, + "step": 11034 + }, + { + "epoch": 0.4668330654031644, + "grad_norm": 0.19768601655960083, + "learning_rate": 0.001, + "loss": 1.7435, + "step": 11035 + }, + { + "epoch": 0.4668753701666808, + "grad_norm": 0.4055560529232025, + "learning_rate": 0.001, + "loss": 1.8149, + "step": 11036 + }, + { + "epoch": 0.46691767493019715, + "grad_norm": 0.14512808620929718, + "learning_rate": 0.001, + "loss": 2.3372, + "step": 11037 + }, + { + "epoch": 0.4669599796937135, + "grad_norm": 0.2370501309633255, + "learning_rate": 0.001, + "loss": 2.0583, + "step": 11038 + }, + { + "epoch": 0.4670022844572299, + "grad_norm": 0.16900447010993958, + "learning_rate": 0.001, + "loss": 1.8195, + "step": 11039 + }, + { + "epoch": 0.46704458922074626, + "grad_norm": 0.16092944145202637, + "learning_rate": 0.001, + "loss": 2.3941, + "step": 11040 + }, + { + "epoch": 0.4670868939842626, + "grad_norm": 0.1543259471654892, + "learning_rate": 0.001, + "loss": 1.7284, + "step": 11041 + }, + { + "epoch": 0.467129198747779, + "grad_norm": 0.16476312279701233, + "learning_rate": 0.001, + "loss": 2.297, + "step": 11042 + }, + { + "epoch": 0.4671715035112954, + "grad_norm": 0.1504514068365097, + "learning_rate": 0.001, + "loss": 2.2147, + "step": 11043 + }, + { + "epoch": 0.46721380827481174, + "grad_norm": 0.1745976358652115, + "learning_rate": 0.001, + "loss": 2.5716, + "step": 11044 + }, + { + "epoch": 0.4672561130383281, + "grad_norm": 0.17237746715545654, + "learning_rate": 0.001, + "loss": 2.1571, + "step": 11045 + }, + { + "epoch": 0.4672984178018445, + "grad_norm": 0.14179447293281555, + "learning_rate": 0.001, + "loss": 1.7693, + "step": 11046 + }, + { + "epoch": 0.46734072256536086, + "grad_norm": 0.2136073112487793, + "learning_rate": 0.001, + "loss": 1.9455, + "step": 11047 + }, + { + "epoch": 0.4673830273288772, + "grad_norm": 0.15894265472888947, + "learning_rate": 0.001, + "loss": 1.9635, + "step": 11048 + }, + { + "epoch": 0.4674253320923936, + "grad_norm": 1.5329841375350952, + "learning_rate": 0.001, + "loss": 2.4782, + "step": 11049 + }, + { + "epoch": 0.46746763685591, + "grad_norm": 0.17458941042423248, + "learning_rate": 0.001, + "loss": 2.1225, + "step": 11050 + }, + { + "epoch": 0.46750994161942633, + "grad_norm": 0.21708954870700836, + "learning_rate": 0.001, + "loss": 2.2734, + "step": 11051 + }, + { + "epoch": 0.46755224638294274, + "grad_norm": 0.7924477458000183, + "learning_rate": 0.001, + "loss": 1.9928, + "step": 11052 + }, + { + "epoch": 0.4675945511464591, + "grad_norm": 0.16856947541236877, + "learning_rate": 0.001, + "loss": 2.1516, + "step": 11053 + }, + { + "epoch": 0.46763685590997545, + "grad_norm": 0.15970207750797272, + "learning_rate": 0.001, + "loss": 2.7033, + "step": 11054 + }, + { + "epoch": 0.46767916067349186, + "grad_norm": 8.105451583862305, + "learning_rate": 0.001, + "loss": 2.0742, + "step": 11055 + }, + { + "epoch": 0.4677214654370082, + "grad_norm": 0.1840921938419342, + "learning_rate": 0.001, + "loss": 1.9796, + "step": 11056 + }, + { + "epoch": 0.46776377020052456, + "grad_norm": 0.15253804624080658, + "learning_rate": 0.001, + "loss": 2.0105, + "step": 11057 + }, + { + "epoch": 0.467806074964041, + "grad_norm": 5.88224458694458, + "learning_rate": 0.001, + "loss": 1.9661, + "step": 11058 + }, + { + "epoch": 0.46784837972755733, + "grad_norm": 0.16145947575569153, + "learning_rate": 0.001, + "loss": 2.1938, + "step": 11059 + }, + { + "epoch": 0.4678906844910737, + "grad_norm": 0.19684232771396637, + "learning_rate": 0.001, + "loss": 1.6197, + "step": 11060 + }, + { + "epoch": 0.4679329892545901, + "grad_norm": 45.53775405883789, + "learning_rate": 0.001, + "loss": 1.5319, + "step": 11061 + }, + { + "epoch": 0.46797529401810645, + "grad_norm": 2.51239275932312, + "learning_rate": 0.001, + "loss": 2.2339, + "step": 11062 + }, + { + "epoch": 0.4680175987816228, + "grad_norm": 0.19354106485843658, + "learning_rate": 0.001, + "loss": 2.2878, + "step": 11063 + }, + { + "epoch": 0.46805990354513916, + "grad_norm": 63.523895263671875, + "learning_rate": 0.001, + "loss": 2.1583, + "step": 11064 + }, + { + "epoch": 0.46810220830865557, + "grad_norm": 0.21123424172401428, + "learning_rate": 0.001, + "loss": 1.6651, + "step": 11065 + }, + { + "epoch": 0.4681445130721719, + "grad_norm": 0.5979301333427429, + "learning_rate": 0.001, + "loss": 1.8025, + "step": 11066 + }, + { + "epoch": 0.4681868178356883, + "grad_norm": 0.15575318038463593, + "learning_rate": 0.001, + "loss": 2.2109, + "step": 11067 + }, + { + "epoch": 0.4682291225992047, + "grad_norm": 0.2060658186674118, + "learning_rate": 0.001, + "loss": 1.9252, + "step": 11068 + }, + { + "epoch": 0.46827142736272104, + "grad_norm": 0.19070769846439362, + "learning_rate": 0.001, + "loss": 2.1284, + "step": 11069 + }, + { + "epoch": 0.4683137321262374, + "grad_norm": 0.14260248839855194, + "learning_rate": 0.001, + "loss": 1.8289, + "step": 11070 + }, + { + "epoch": 0.4683560368897538, + "grad_norm": 0.2190844863653183, + "learning_rate": 0.001, + "loss": 2.1024, + "step": 11071 + }, + { + "epoch": 0.46839834165327016, + "grad_norm": 0.1667780727148056, + "learning_rate": 0.001, + "loss": 2.5757, + "step": 11072 + }, + { + "epoch": 0.4684406464167865, + "grad_norm": 0.19789689779281616, + "learning_rate": 0.001, + "loss": 2.0722, + "step": 11073 + }, + { + "epoch": 0.4684829511803029, + "grad_norm": 0.16835686564445496, + "learning_rate": 0.001, + "loss": 1.8042, + "step": 11074 + }, + { + "epoch": 0.4685252559438193, + "grad_norm": 0.9263663291931152, + "learning_rate": 0.001, + "loss": 2.0782, + "step": 11075 + }, + { + "epoch": 0.46856756070733563, + "grad_norm": 0.21614162623882294, + "learning_rate": 0.001, + "loss": 2.924, + "step": 11076 + }, + { + "epoch": 0.46860986547085204, + "grad_norm": 3.207810163497925, + "learning_rate": 0.001, + "loss": 2.6613, + "step": 11077 + }, + { + "epoch": 0.4686521702343684, + "grad_norm": 0.5324118733406067, + "learning_rate": 0.001, + "loss": 2.3755, + "step": 11078 + }, + { + "epoch": 0.46869447499788475, + "grad_norm": 0.17789721488952637, + "learning_rate": 0.001, + "loss": 2.0421, + "step": 11079 + }, + { + "epoch": 0.46873677976140116, + "grad_norm": 0.17907847464084625, + "learning_rate": 0.001, + "loss": 2.6084, + "step": 11080 + }, + { + "epoch": 0.4687790845249175, + "grad_norm": 0.23135465383529663, + "learning_rate": 0.001, + "loss": 1.7174, + "step": 11081 + }, + { + "epoch": 0.46882138928843387, + "grad_norm": 0.17100471258163452, + "learning_rate": 0.001, + "loss": 2.5313, + "step": 11082 + }, + { + "epoch": 0.4688636940519503, + "grad_norm": 0.16610606014728546, + "learning_rate": 0.001, + "loss": 2.9865, + "step": 11083 + }, + { + "epoch": 0.46890599881546663, + "grad_norm": 0.15432819724082947, + "learning_rate": 0.001, + "loss": 3.2703, + "step": 11084 + }, + { + "epoch": 0.468948303578983, + "grad_norm": 0.18594351410865784, + "learning_rate": 0.001, + "loss": 3.4266, + "step": 11085 + }, + { + "epoch": 0.46899060834249934, + "grad_norm": 6.842674732208252, + "learning_rate": 0.001, + "loss": 2.4358, + "step": 11086 + }, + { + "epoch": 0.46903291310601575, + "grad_norm": 0.17697936296463013, + "learning_rate": 0.001, + "loss": 1.7158, + "step": 11087 + }, + { + "epoch": 0.4690752178695321, + "grad_norm": 0.17671042680740356, + "learning_rate": 0.001, + "loss": 2.4786, + "step": 11088 + }, + { + "epoch": 0.46911752263304846, + "grad_norm": 1.0439887046813965, + "learning_rate": 0.001, + "loss": 2.1289, + "step": 11089 + }, + { + "epoch": 0.46915982739656487, + "grad_norm": 0.21606509387493134, + "learning_rate": 0.001, + "loss": 1.9614, + "step": 11090 + }, + { + "epoch": 0.4692021321600812, + "grad_norm": 0.24721235036849976, + "learning_rate": 0.001, + "loss": 2.8101, + "step": 11091 + }, + { + "epoch": 0.4692444369235976, + "grad_norm": 0.21566319465637207, + "learning_rate": 0.001, + "loss": 2.1605, + "step": 11092 + }, + { + "epoch": 0.469286741687114, + "grad_norm": 0.5193756818771362, + "learning_rate": 0.001, + "loss": 2.787, + "step": 11093 + }, + { + "epoch": 0.46932904645063034, + "grad_norm": 0.20842444896697998, + "learning_rate": 0.001, + "loss": 1.9445, + "step": 11094 + }, + { + "epoch": 0.4693713512141467, + "grad_norm": 0.1922614574432373, + "learning_rate": 0.001, + "loss": 2.0826, + "step": 11095 + }, + { + "epoch": 0.4694136559776631, + "grad_norm": 0.9516851305961609, + "learning_rate": 0.001, + "loss": 1.9346, + "step": 11096 + }, + { + "epoch": 0.46945596074117946, + "grad_norm": 0.18452809751033783, + "learning_rate": 0.001, + "loss": 2.7229, + "step": 11097 + }, + { + "epoch": 0.4694982655046958, + "grad_norm": 0.21180051565170288, + "learning_rate": 0.001, + "loss": 2.497, + "step": 11098 + }, + { + "epoch": 0.4695405702682122, + "grad_norm": 3.082894802093506, + "learning_rate": 0.001, + "loss": 1.9389, + "step": 11099 + }, + { + "epoch": 0.4695828750317286, + "grad_norm": 0.17565667629241943, + "learning_rate": 0.001, + "loss": 1.4547, + "step": 11100 + }, + { + "epoch": 0.46962517979524493, + "grad_norm": 0.2029939591884613, + "learning_rate": 0.001, + "loss": 2.4275, + "step": 11101 + }, + { + "epoch": 0.46966748455876134, + "grad_norm": 0.20764219760894775, + "learning_rate": 0.001, + "loss": 2.1751, + "step": 11102 + }, + { + "epoch": 0.4697097893222777, + "grad_norm": 0.19430102407932281, + "learning_rate": 0.001, + "loss": 1.736, + "step": 11103 + }, + { + "epoch": 0.46975209408579405, + "grad_norm": 0.180740088224411, + "learning_rate": 0.001, + "loss": 1.6614, + "step": 11104 + }, + { + "epoch": 0.46979439884931046, + "grad_norm": 8.314878463745117, + "learning_rate": 0.001, + "loss": 2.2919, + "step": 11105 + }, + { + "epoch": 0.4698367036128268, + "grad_norm": 0.21296004951000214, + "learning_rate": 0.001, + "loss": 2.1662, + "step": 11106 + }, + { + "epoch": 0.46987900837634317, + "grad_norm": 0.21107828617095947, + "learning_rate": 0.001, + "loss": 2.2709, + "step": 11107 + }, + { + "epoch": 0.4699213131398596, + "grad_norm": 0.19116267561912537, + "learning_rate": 0.001, + "loss": 1.8983, + "step": 11108 + }, + { + "epoch": 0.46996361790337593, + "grad_norm": 0.17168934643268585, + "learning_rate": 0.001, + "loss": 1.8434, + "step": 11109 + }, + { + "epoch": 0.4700059226668923, + "grad_norm": 0.18456289172172546, + "learning_rate": 0.001, + "loss": 2.4683, + "step": 11110 + }, + { + "epoch": 0.47004822743040864, + "grad_norm": 0.874439001083374, + "learning_rate": 0.001, + "loss": 3.1001, + "step": 11111 + }, + { + "epoch": 0.47009053219392505, + "grad_norm": 0.16725631058216095, + "learning_rate": 0.001, + "loss": 1.8565, + "step": 11112 + }, + { + "epoch": 0.4701328369574414, + "grad_norm": 0.1976451575756073, + "learning_rate": 0.001, + "loss": 2.5576, + "step": 11113 + }, + { + "epoch": 0.47017514172095776, + "grad_norm": 0.17343725264072418, + "learning_rate": 0.001, + "loss": 2.7862, + "step": 11114 + }, + { + "epoch": 0.47021744648447417, + "grad_norm": 0.22081388533115387, + "learning_rate": 0.001, + "loss": 2.4001, + "step": 11115 + }, + { + "epoch": 0.4702597512479905, + "grad_norm": 0.375754177570343, + "learning_rate": 0.001, + "loss": 2.0034, + "step": 11116 + }, + { + "epoch": 0.4703020560115069, + "grad_norm": 0.20346909761428833, + "learning_rate": 0.001, + "loss": 2.6006, + "step": 11117 + }, + { + "epoch": 0.4703443607750233, + "grad_norm": 0.21501335501670837, + "learning_rate": 0.001, + "loss": 1.8364, + "step": 11118 + }, + { + "epoch": 0.47038666553853964, + "grad_norm": 2.512514352798462, + "learning_rate": 0.001, + "loss": 2.0339, + "step": 11119 + }, + { + "epoch": 0.470428970302056, + "grad_norm": 0.18703396618366241, + "learning_rate": 0.001, + "loss": 2.2686, + "step": 11120 + }, + { + "epoch": 0.4704712750655724, + "grad_norm": 0.2161937803030014, + "learning_rate": 0.001, + "loss": 2.3499, + "step": 11121 + }, + { + "epoch": 0.47051357982908876, + "grad_norm": 0.22765906155109406, + "learning_rate": 0.001, + "loss": 2.2785, + "step": 11122 + }, + { + "epoch": 0.4705558845926051, + "grad_norm": 0.16340266168117523, + "learning_rate": 0.001, + "loss": 2.2066, + "step": 11123 + }, + { + "epoch": 0.4705981893561215, + "grad_norm": 0.17582248151302338, + "learning_rate": 0.001, + "loss": 3.2365, + "step": 11124 + }, + { + "epoch": 0.4706404941196379, + "grad_norm": 2.7968180179595947, + "learning_rate": 0.001, + "loss": 2.1311, + "step": 11125 + }, + { + "epoch": 0.47068279888315423, + "grad_norm": 0.19796128571033478, + "learning_rate": 0.001, + "loss": 2.6811, + "step": 11126 + }, + { + "epoch": 0.47072510364667064, + "grad_norm": 0.3247911334037781, + "learning_rate": 0.001, + "loss": 2.0883, + "step": 11127 + }, + { + "epoch": 0.470767408410187, + "grad_norm": 0.21730183064937592, + "learning_rate": 0.001, + "loss": 2.1616, + "step": 11128 + }, + { + "epoch": 0.47080971317370335, + "grad_norm": 0.226370707154274, + "learning_rate": 0.001, + "loss": 3.4228, + "step": 11129 + }, + { + "epoch": 0.47085201793721976, + "grad_norm": 2.1633243560791016, + "learning_rate": 0.001, + "loss": 2.2012, + "step": 11130 + }, + { + "epoch": 0.4708943227007361, + "grad_norm": 0.2938755750656128, + "learning_rate": 0.001, + "loss": 3.1938, + "step": 11131 + }, + { + "epoch": 0.47093662746425247, + "grad_norm": 0.15652383863925934, + "learning_rate": 0.001, + "loss": 1.8906, + "step": 11132 + }, + { + "epoch": 0.4709789322277688, + "grad_norm": 0.17891067266464233, + "learning_rate": 0.001, + "loss": 3.3451, + "step": 11133 + }, + { + "epoch": 0.47102123699128523, + "grad_norm": 0.47045284509658813, + "learning_rate": 0.001, + "loss": 2.3286, + "step": 11134 + }, + { + "epoch": 0.4710635417548016, + "grad_norm": 0.3203991949558258, + "learning_rate": 0.001, + "loss": 2.0339, + "step": 11135 + }, + { + "epoch": 0.47110584651831794, + "grad_norm": 0.1709599643945694, + "learning_rate": 0.001, + "loss": 1.9023, + "step": 11136 + }, + { + "epoch": 0.47114815128183435, + "grad_norm": 0.2922183871269226, + "learning_rate": 0.001, + "loss": 2.1608, + "step": 11137 + }, + { + "epoch": 0.4711904560453507, + "grad_norm": 8.45193862915039, + "learning_rate": 0.001, + "loss": 1.8432, + "step": 11138 + }, + { + "epoch": 0.47123276080886706, + "grad_norm": 0.18489998579025269, + "learning_rate": 0.001, + "loss": 2.5255, + "step": 11139 + }, + { + "epoch": 0.47127506557238347, + "grad_norm": 0.5812740325927734, + "learning_rate": 0.001, + "loss": 2.3951, + "step": 11140 + }, + { + "epoch": 0.4713173703358998, + "grad_norm": 0.18123511970043182, + "learning_rate": 0.001, + "loss": 1.8583, + "step": 11141 + }, + { + "epoch": 0.4713596750994162, + "grad_norm": 0.22358356416225433, + "learning_rate": 0.001, + "loss": 2.1568, + "step": 11142 + }, + { + "epoch": 0.4714019798629326, + "grad_norm": 0.363231360912323, + "learning_rate": 0.001, + "loss": 1.5782, + "step": 11143 + }, + { + "epoch": 0.47144428462644894, + "grad_norm": 0.2926989793777466, + "learning_rate": 0.001, + "loss": 2.6199, + "step": 11144 + }, + { + "epoch": 0.4714865893899653, + "grad_norm": 0.17961940169334412, + "learning_rate": 0.001, + "loss": 1.9892, + "step": 11145 + }, + { + "epoch": 0.4715288941534817, + "grad_norm": 0.17067760229110718, + "learning_rate": 0.001, + "loss": 2.2848, + "step": 11146 + }, + { + "epoch": 0.47157119891699806, + "grad_norm": 0.16751837730407715, + "learning_rate": 0.001, + "loss": 1.7813, + "step": 11147 + }, + { + "epoch": 0.4716135036805144, + "grad_norm": 0.20085522532463074, + "learning_rate": 0.001, + "loss": 2.9713, + "step": 11148 + }, + { + "epoch": 0.4716558084440308, + "grad_norm": 0.20381022989749908, + "learning_rate": 0.001, + "loss": 3.0651, + "step": 11149 + }, + { + "epoch": 0.4716981132075472, + "grad_norm": 0.2398488074541092, + "learning_rate": 0.001, + "loss": 2.5814, + "step": 11150 + }, + { + "epoch": 0.47174041797106353, + "grad_norm": 1.2999179363250732, + "learning_rate": 0.001, + "loss": 1.6057, + "step": 11151 + }, + { + "epoch": 0.47178272273457994, + "grad_norm": 0.7444708347320557, + "learning_rate": 0.001, + "loss": 2.5191, + "step": 11152 + }, + { + "epoch": 0.4718250274980963, + "grad_norm": 0.16335336863994598, + "learning_rate": 0.001, + "loss": 2.7914, + "step": 11153 + }, + { + "epoch": 0.47186733226161265, + "grad_norm": 0.5560107827186584, + "learning_rate": 0.001, + "loss": 2.2272, + "step": 11154 + }, + { + "epoch": 0.471909637025129, + "grad_norm": 0.16256298124790192, + "learning_rate": 0.001, + "loss": 1.6319, + "step": 11155 + }, + { + "epoch": 0.4719519417886454, + "grad_norm": 0.23074427247047424, + "learning_rate": 0.001, + "loss": 2.5899, + "step": 11156 + }, + { + "epoch": 0.47199424655216177, + "grad_norm": 0.17271144688129425, + "learning_rate": 0.001, + "loss": 2.4049, + "step": 11157 + }, + { + "epoch": 0.4720365513156781, + "grad_norm": 0.21863944828510284, + "learning_rate": 0.001, + "loss": 2.4901, + "step": 11158 + }, + { + "epoch": 0.47207885607919453, + "grad_norm": 0.15925271809101105, + "learning_rate": 0.001, + "loss": 2.034, + "step": 11159 + }, + { + "epoch": 0.4721211608427109, + "grad_norm": 0.1758195459842682, + "learning_rate": 0.001, + "loss": 2.6075, + "step": 11160 + }, + { + "epoch": 0.47216346560622724, + "grad_norm": 0.2449083775281906, + "learning_rate": 0.001, + "loss": 2.4067, + "step": 11161 + }, + { + "epoch": 0.47220577036974365, + "grad_norm": 0.14552642405033112, + "learning_rate": 0.001, + "loss": 1.9266, + "step": 11162 + }, + { + "epoch": 0.47224807513326, + "grad_norm": 0.3583971858024597, + "learning_rate": 0.001, + "loss": 1.9896, + "step": 11163 + }, + { + "epoch": 0.47229037989677636, + "grad_norm": 0.22816874086856842, + "learning_rate": 0.001, + "loss": 2.1876, + "step": 11164 + }, + { + "epoch": 0.47233268466029277, + "grad_norm": 0.18025673925876617, + "learning_rate": 0.001, + "loss": 1.8776, + "step": 11165 + }, + { + "epoch": 0.4723749894238091, + "grad_norm": 0.17543093860149384, + "learning_rate": 0.001, + "loss": 2.6487, + "step": 11166 + }, + { + "epoch": 0.4724172941873255, + "grad_norm": 0.1770016998052597, + "learning_rate": 0.001, + "loss": 3.3751, + "step": 11167 + }, + { + "epoch": 0.4724595989508419, + "grad_norm": 0.1971733570098877, + "learning_rate": 0.001, + "loss": 2.0229, + "step": 11168 + }, + { + "epoch": 0.47250190371435824, + "grad_norm": 0.22783638536930084, + "learning_rate": 0.001, + "loss": 1.7606, + "step": 11169 + }, + { + "epoch": 0.4725442084778746, + "grad_norm": 0.17248298227787018, + "learning_rate": 0.001, + "loss": 1.8782, + "step": 11170 + }, + { + "epoch": 0.472586513241391, + "grad_norm": 0.1579669564962387, + "learning_rate": 0.001, + "loss": 1.4778, + "step": 11171 + }, + { + "epoch": 0.47262881800490736, + "grad_norm": 0.5877176523208618, + "learning_rate": 0.001, + "loss": 1.8328, + "step": 11172 + }, + { + "epoch": 0.4726711227684237, + "grad_norm": 0.2934803366661072, + "learning_rate": 0.001, + "loss": 2.7896, + "step": 11173 + }, + { + "epoch": 0.4727134275319401, + "grad_norm": 0.28287965059280396, + "learning_rate": 0.001, + "loss": 3.311, + "step": 11174 + }, + { + "epoch": 0.4727557322954565, + "grad_norm": 0.1633968949317932, + "learning_rate": 0.001, + "loss": 1.8999, + "step": 11175 + }, + { + "epoch": 0.47279803705897283, + "grad_norm": 0.2607437074184418, + "learning_rate": 0.001, + "loss": 1.9724, + "step": 11176 + }, + { + "epoch": 0.4728403418224892, + "grad_norm": 0.6347095370292664, + "learning_rate": 0.001, + "loss": 2.791, + "step": 11177 + }, + { + "epoch": 0.4728826465860056, + "grad_norm": 0.20074185729026794, + "learning_rate": 0.001, + "loss": 3.0955, + "step": 11178 + }, + { + "epoch": 0.47292495134952195, + "grad_norm": 0.19658905267715454, + "learning_rate": 0.001, + "loss": 2.3828, + "step": 11179 + }, + { + "epoch": 0.4729672561130383, + "grad_norm": 0.14689096808433533, + "learning_rate": 0.001, + "loss": 2.1146, + "step": 11180 + }, + { + "epoch": 0.4730095608765547, + "grad_norm": 0.21568824350833893, + "learning_rate": 0.001, + "loss": 3.265, + "step": 11181 + }, + { + "epoch": 0.47305186564007107, + "grad_norm": 0.20874953269958496, + "learning_rate": 0.001, + "loss": 2.4134, + "step": 11182 + }, + { + "epoch": 0.4730941704035874, + "grad_norm": 5.027991771697998, + "learning_rate": 0.001, + "loss": 3.1703, + "step": 11183 + }, + { + "epoch": 0.47313647516710383, + "grad_norm": 0.1729307919740677, + "learning_rate": 0.001, + "loss": 1.7616, + "step": 11184 + }, + { + "epoch": 0.4731787799306202, + "grad_norm": 0.1840585619211197, + "learning_rate": 0.001, + "loss": 2.9509, + "step": 11185 + }, + { + "epoch": 0.47322108469413654, + "grad_norm": 0.19434517621994019, + "learning_rate": 0.001, + "loss": 2.1666, + "step": 11186 + }, + { + "epoch": 0.47326338945765295, + "grad_norm": 0.1548105627298355, + "learning_rate": 0.001, + "loss": 2.6967, + "step": 11187 + }, + { + "epoch": 0.4733056942211693, + "grad_norm": 0.15446540713310242, + "learning_rate": 0.001, + "loss": 1.9184, + "step": 11188 + }, + { + "epoch": 0.47334799898468566, + "grad_norm": 0.15472517907619476, + "learning_rate": 0.001, + "loss": 1.9038, + "step": 11189 + }, + { + "epoch": 0.47339030374820207, + "grad_norm": 0.29968515038490295, + "learning_rate": 0.001, + "loss": 1.7286, + "step": 11190 + }, + { + "epoch": 0.4734326085117184, + "grad_norm": 3.557898759841919, + "learning_rate": 0.001, + "loss": 3.0507, + "step": 11191 + }, + { + "epoch": 0.4734749132752348, + "grad_norm": 1.0494880676269531, + "learning_rate": 0.001, + "loss": 1.6712, + "step": 11192 + }, + { + "epoch": 0.4735172180387512, + "grad_norm": 0.18932278454303741, + "learning_rate": 0.001, + "loss": 2.3845, + "step": 11193 + }, + { + "epoch": 0.47355952280226754, + "grad_norm": 0.7504813075065613, + "learning_rate": 0.001, + "loss": 2.8903, + "step": 11194 + }, + { + "epoch": 0.4736018275657839, + "grad_norm": 0.2353019416332245, + "learning_rate": 0.001, + "loss": 2.2606, + "step": 11195 + }, + { + "epoch": 0.4736441323293003, + "grad_norm": 0.22455672919750214, + "learning_rate": 0.001, + "loss": 2.4879, + "step": 11196 + }, + { + "epoch": 0.47368643709281666, + "grad_norm": 0.28473368287086487, + "learning_rate": 0.001, + "loss": 2.6755, + "step": 11197 + }, + { + "epoch": 0.473728741856333, + "grad_norm": 0.19479401409626007, + "learning_rate": 0.001, + "loss": 2.784, + "step": 11198 + }, + { + "epoch": 0.47377104661984937, + "grad_norm": 0.17094643414020538, + "learning_rate": 0.001, + "loss": 2.3451, + "step": 11199 + }, + { + "epoch": 0.4738133513833658, + "grad_norm": 0.17563167214393616, + "learning_rate": 0.001, + "loss": 2.3602, + "step": 11200 + }, + { + "epoch": 0.47385565614688213, + "grad_norm": 0.19318290054798126, + "learning_rate": 0.001, + "loss": 1.8098, + "step": 11201 + }, + { + "epoch": 0.4738979609103985, + "grad_norm": 0.1785099357366562, + "learning_rate": 0.001, + "loss": 2.6017, + "step": 11202 + }, + { + "epoch": 0.4739402656739149, + "grad_norm": 0.23971965909004211, + "learning_rate": 0.001, + "loss": 3.1176, + "step": 11203 + }, + { + "epoch": 0.47398257043743125, + "grad_norm": 0.17552334070205688, + "learning_rate": 0.001, + "loss": 2.6471, + "step": 11204 + }, + { + "epoch": 0.4740248752009476, + "grad_norm": 0.309535950422287, + "learning_rate": 0.001, + "loss": 1.5446, + "step": 11205 + }, + { + "epoch": 0.474067179964464, + "grad_norm": 0.23233279585838318, + "learning_rate": 0.001, + "loss": 2.4261, + "step": 11206 + }, + { + "epoch": 0.47410948472798037, + "grad_norm": 0.17341110110282898, + "learning_rate": 0.001, + "loss": 1.5443, + "step": 11207 + }, + { + "epoch": 0.4741517894914967, + "grad_norm": 0.9063680768013, + "learning_rate": 0.001, + "loss": 1.5549, + "step": 11208 + }, + { + "epoch": 0.47419409425501313, + "grad_norm": 0.20930500328540802, + "learning_rate": 0.001, + "loss": 2.9428, + "step": 11209 + }, + { + "epoch": 0.4742363990185295, + "grad_norm": 0.1631225347518921, + "learning_rate": 0.001, + "loss": 2.0326, + "step": 11210 + }, + { + "epoch": 0.47427870378204584, + "grad_norm": 0.3474618196487427, + "learning_rate": 0.001, + "loss": 2.0844, + "step": 11211 + }, + { + "epoch": 0.47432100854556225, + "grad_norm": 0.16792355477809906, + "learning_rate": 0.001, + "loss": 2.1918, + "step": 11212 + }, + { + "epoch": 0.4743633133090786, + "grad_norm": 0.2315523475408554, + "learning_rate": 0.001, + "loss": 2.7786, + "step": 11213 + }, + { + "epoch": 0.47440561807259496, + "grad_norm": 0.21026229858398438, + "learning_rate": 0.001, + "loss": 2.1381, + "step": 11214 + }, + { + "epoch": 0.47444792283611137, + "grad_norm": 0.1558665633201599, + "learning_rate": 0.001, + "loss": 1.828, + "step": 11215 + }, + { + "epoch": 0.4744902275996277, + "grad_norm": 0.19669584929943085, + "learning_rate": 0.001, + "loss": 2.3019, + "step": 11216 + }, + { + "epoch": 0.4745325323631441, + "grad_norm": 0.16526706516742706, + "learning_rate": 0.001, + "loss": 2.1734, + "step": 11217 + }, + { + "epoch": 0.4745748371266605, + "grad_norm": 0.1388012170791626, + "learning_rate": 0.001, + "loss": 2.5851, + "step": 11218 + }, + { + "epoch": 0.47461714189017684, + "grad_norm": 0.20129254460334778, + "learning_rate": 0.001, + "loss": 2.3251, + "step": 11219 + }, + { + "epoch": 0.4746594466536932, + "grad_norm": 0.1600094735622406, + "learning_rate": 0.001, + "loss": 1.8666, + "step": 11220 + }, + { + "epoch": 0.4747017514172096, + "grad_norm": 0.1788060963153839, + "learning_rate": 0.001, + "loss": 3.1572, + "step": 11221 + }, + { + "epoch": 0.47474405618072596, + "grad_norm": 0.34340453147888184, + "learning_rate": 0.001, + "loss": 1.7197, + "step": 11222 + }, + { + "epoch": 0.4747863609442423, + "grad_norm": 0.18429534137248993, + "learning_rate": 0.001, + "loss": 2.1796, + "step": 11223 + }, + { + "epoch": 0.47482866570775867, + "grad_norm": 0.19663597643375397, + "learning_rate": 0.001, + "loss": 2.397, + "step": 11224 + }, + { + "epoch": 0.4748709704712751, + "grad_norm": 0.16480304300785065, + "learning_rate": 0.001, + "loss": 1.901, + "step": 11225 + }, + { + "epoch": 0.47491327523479143, + "grad_norm": 0.6021777391433716, + "learning_rate": 0.001, + "loss": 2.8861, + "step": 11226 + }, + { + "epoch": 0.4749555799983078, + "grad_norm": 0.14651986956596375, + "learning_rate": 0.001, + "loss": 2.2698, + "step": 11227 + }, + { + "epoch": 0.4749978847618242, + "grad_norm": 0.521893322467804, + "learning_rate": 0.001, + "loss": 2.3953, + "step": 11228 + }, + { + "epoch": 0.47504018952534055, + "grad_norm": 0.20987311005592346, + "learning_rate": 0.001, + "loss": 2.5025, + "step": 11229 + }, + { + "epoch": 0.4750824942888569, + "grad_norm": 0.14994093775749207, + "learning_rate": 0.001, + "loss": 1.6545, + "step": 11230 + }, + { + "epoch": 0.4751247990523733, + "grad_norm": 0.47113507986068726, + "learning_rate": 0.001, + "loss": 2.1639, + "step": 11231 + }, + { + "epoch": 0.47516710381588967, + "grad_norm": 0.14544245600700378, + "learning_rate": 0.001, + "loss": 2.5572, + "step": 11232 + }, + { + "epoch": 0.475209408579406, + "grad_norm": 2.366591215133667, + "learning_rate": 0.001, + "loss": 2.1375, + "step": 11233 + }, + { + "epoch": 0.47525171334292243, + "grad_norm": 0.14105114340782166, + "learning_rate": 0.001, + "loss": 2.686, + "step": 11234 + }, + { + "epoch": 0.4752940181064388, + "grad_norm": 0.49145644903182983, + "learning_rate": 0.001, + "loss": 1.8348, + "step": 11235 + }, + { + "epoch": 0.47533632286995514, + "grad_norm": 0.18383541703224182, + "learning_rate": 0.001, + "loss": 1.5315, + "step": 11236 + }, + { + "epoch": 0.47537862763347155, + "grad_norm": 0.17911598086357117, + "learning_rate": 0.001, + "loss": 1.9033, + "step": 11237 + }, + { + "epoch": 0.4754209323969879, + "grad_norm": 3.2058968544006348, + "learning_rate": 0.001, + "loss": 3.8925, + "step": 11238 + }, + { + "epoch": 0.47546323716050426, + "grad_norm": 0.1748027503490448, + "learning_rate": 0.001, + "loss": 2.9772, + "step": 11239 + }, + { + "epoch": 0.47550554192402067, + "grad_norm": 0.16488003730773926, + "learning_rate": 0.001, + "loss": 1.9438, + "step": 11240 + }, + { + "epoch": 0.475547846687537, + "grad_norm": 0.2031308114528656, + "learning_rate": 0.001, + "loss": 1.5888, + "step": 11241 + }, + { + "epoch": 0.4755901514510534, + "grad_norm": 0.2005867063999176, + "learning_rate": 0.001, + "loss": 2.3343, + "step": 11242 + }, + { + "epoch": 0.4756324562145698, + "grad_norm": 5.769382953643799, + "learning_rate": 0.001, + "loss": 1.9595, + "step": 11243 + }, + { + "epoch": 0.47567476097808614, + "grad_norm": 0.16878317296504974, + "learning_rate": 0.001, + "loss": 1.9647, + "step": 11244 + }, + { + "epoch": 0.4757170657416025, + "grad_norm": 0.1860329806804657, + "learning_rate": 0.001, + "loss": 1.8979, + "step": 11245 + }, + { + "epoch": 0.47575937050511885, + "grad_norm": 0.25839003920555115, + "learning_rate": 0.001, + "loss": 2.7318, + "step": 11246 + }, + { + "epoch": 0.47580167526863526, + "grad_norm": 3.520124912261963, + "learning_rate": 0.001, + "loss": 2.6687, + "step": 11247 + }, + { + "epoch": 0.4758439800321516, + "grad_norm": 0.2002669721841812, + "learning_rate": 0.001, + "loss": 3.1077, + "step": 11248 + }, + { + "epoch": 0.47588628479566797, + "grad_norm": 0.31239837408065796, + "learning_rate": 0.001, + "loss": 2.4069, + "step": 11249 + }, + { + "epoch": 0.4759285895591844, + "grad_norm": 0.2110671103000641, + "learning_rate": 0.001, + "loss": 2.3489, + "step": 11250 + }, + { + "epoch": 0.47597089432270073, + "grad_norm": 0.22103223204612732, + "learning_rate": 0.001, + "loss": 2.9316, + "step": 11251 + }, + { + "epoch": 0.4760131990862171, + "grad_norm": 0.7828096747398376, + "learning_rate": 0.001, + "loss": 2.5561, + "step": 11252 + }, + { + "epoch": 0.4760555038497335, + "grad_norm": 0.20706386864185333, + "learning_rate": 0.001, + "loss": 2.9993, + "step": 11253 + }, + { + "epoch": 0.47609780861324985, + "grad_norm": 0.19384732842445374, + "learning_rate": 0.001, + "loss": 1.7905, + "step": 11254 + }, + { + "epoch": 0.4761401133767662, + "grad_norm": 0.25102004408836365, + "learning_rate": 0.001, + "loss": 2.7746, + "step": 11255 + }, + { + "epoch": 0.4761824181402826, + "grad_norm": 0.1942043900489807, + "learning_rate": 0.001, + "loss": 2.215, + "step": 11256 + }, + { + "epoch": 0.47622472290379897, + "grad_norm": 0.2124514877796173, + "learning_rate": 0.001, + "loss": 2.1071, + "step": 11257 + }, + { + "epoch": 0.4762670276673153, + "grad_norm": 0.1634020060300827, + "learning_rate": 0.001, + "loss": 1.5839, + "step": 11258 + }, + { + "epoch": 0.47630933243083173, + "grad_norm": 0.19070233404636383, + "learning_rate": 0.001, + "loss": 3.202, + "step": 11259 + }, + { + "epoch": 0.4763516371943481, + "grad_norm": 0.18325842916965485, + "learning_rate": 0.001, + "loss": 3.1532, + "step": 11260 + }, + { + "epoch": 0.47639394195786444, + "grad_norm": 0.23473717272281647, + "learning_rate": 0.001, + "loss": 2.2308, + "step": 11261 + }, + { + "epoch": 0.47643624672138085, + "grad_norm": 0.17958499491214752, + "learning_rate": 0.001, + "loss": 2.0782, + "step": 11262 + }, + { + "epoch": 0.4764785514848972, + "grad_norm": 0.31303510069847107, + "learning_rate": 0.001, + "loss": 2.5105, + "step": 11263 + }, + { + "epoch": 0.47652085624841356, + "grad_norm": 0.18192079663276672, + "learning_rate": 0.001, + "loss": 1.7983, + "step": 11264 + }, + { + "epoch": 0.47656316101192997, + "grad_norm": 0.31585457921028137, + "learning_rate": 0.001, + "loss": 2.8891, + "step": 11265 + }, + { + "epoch": 0.4766054657754463, + "grad_norm": 0.19254378974437714, + "learning_rate": 0.001, + "loss": 1.9023, + "step": 11266 + }, + { + "epoch": 0.4766477705389627, + "grad_norm": 0.5017719268798828, + "learning_rate": 0.001, + "loss": 1.9545, + "step": 11267 + }, + { + "epoch": 0.47669007530247903, + "grad_norm": 0.1998423933982849, + "learning_rate": 0.001, + "loss": 2.9385, + "step": 11268 + }, + { + "epoch": 0.47673238006599544, + "grad_norm": 0.18288664519786835, + "learning_rate": 0.001, + "loss": 1.5928, + "step": 11269 + }, + { + "epoch": 0.4767746848295118, + "grad_norm": 0.1892680823802948, + "learning_rate": 0.001, + "loss": 2.1967, + "step": 11270 + }, + { + "epoch": 0.47681698959302815, + "grad_norm": 0.7018859386444092, + "learning_rate": 0.001, + "loss": 2.2363, + "step": 11271 + }, + { + "epoch": 0.47685929435654456, + "grad_norm": 0.6510068774223328, + "learning_rate": 0.001, + "loss": 2.1761, + "step": 11272 + }, + { + "epoch": 0.4769015991200609, + "grad_norm": 1.0637061595916748, + "learning_rate": 0.001, + "loss": 1.9906, + "step": 11273 + }, + { + "epoch": 0.47694390388357727, + "grad_norm": 0.15826399624347687, + "learning_rate": 0.001, + "loss": 2.4301, + "step": 11274 + }, + { + "epoch": 0.4769862086470937, + "grad_norm": 0.17290960252285004, + "learning_rate": 0.001, + "loss": 3.1263, + "step": 11275 + }, + { + "epoch": 0.47702851341061003, + "grad_norm": 0.19678086042404175, + "learning_rate": 0.001, + "loss": 2.8927, + "step": 11276 + }, + { + "epoch": 0.4770708181741264, + "grad_norm": 0.19402678310871124, + "learning_rate": 0.001, + "loss": 2.1192, + "step": 11277 + }, + { + "epoch": 0.4771131229376428, + "grad_norm": 0.20166024565696716, + "learning_rate": 0.001, + "loss": 2.2653, + "step": 11278 + }, + { + "epoch": 0.47715542770115915, + "grad_norm": 0.15184548497200012, + "learning_rate": 0.001, + "loss": 3.5353, + "step": 11279 + }, + { + "epoch": 0.4771977324646755, + "grad_norm": 0.15294674038887024, + "learning_rate": 0.001, + "loss": 2.1343, + "step": 11280 + }, + { + "epoch": 0.4772400372281919, + "grad_norm": 0.23221707344055176, + "learning_rate": 0.001, + "loss": 1.9696, + "step": 11281 + }, + { + "epoch": 0.47728234199170827, + "grad_norm": 0.541606605052948, + "learning_rate": 0.001, + "loss": 1.8145, + "step": 11282 + }, + { + "epoch": 0.4773246467552246, + "grad_norm": 12.742988586425781, + "learning_rate": 0.001, + "loss": 2.5281, + "step": 11283 + }, + { + "epoch": 0.47736695151874103, + "grad_norm": 0.15748733282089233, + "learning_rate": 0.001, + "loss": 2.6597, + "step": 11284 + }, + { + "epoch": 0.4774092562822574, + "grad_norm": 0.1852605640888214, + "learning_rate": 0.001, + "loss": 2.1034, + "step": 11285 + }, + { + "epoch": 0.47745156104577374, + "grad_norm": 0.5693913102149963, + "learning_rate": 0.001, + "loss": 3.0643, + "step": 11286 + }, + { + "epoch": 0.47749386580929015, + "grad_norm": 0.570594072341919, + "learning_rate": 0.001, + "loss": 3.6691, + "step": 11287 + }, + { + "epoch": 0.4775361705728065, + "grad_norm": 0.20175811648368835, + "learning_rate": 0.001, + "loss": 2.628, + "step": 11288 + }, + { + "epoch": 0.47757847533632286, + "grad_norm": 0.2098332792520523, + "learning_rate": 0.001, + "loss": 2.3156, + "step": 11289 + }, + { + "epoch": 0.4776207800998392, + "grad_norm": 0.18259800970554352, + "learning_rate": 0.001, + "loss": 3.0061, + "step": 11290 + }, + { + "epoch": 0.4776630848633556, + "grad_norm": 0.17582936584949493, + "learning_rate": 0.001, + "loss": 2.0574, + "step": 11291 + }, + { + "epoch": 0.477705389626872, + "grad_norm": 3.1900362968444824, + "learning_rate": 0.001, + "loss": 2.023, + "step": 11292 + }, + { + "epoch": 0.47774769439038833, + "grad_norm": 0.18563833832740784, + "learning_rate": 0.001, + "loss": 2.3776, + "step": 11293 + }, + { + "epoch": 0.47778999915390474, + "grad_norm": 0.1735265702009201, + "learning_rate": 0.001, + "loss": 3.3993, + "step": 11294 + }, + { + "epoch": 0.4778323039174211, + "grad_norm": 0.17807775735855103, + "learning_rate": 0.001, + "loss": 1.9739, + "step": 11295 + }, + { + "epoch": 0.47787460868093745, + "grad_norm": 0.42924782633781433, + "learning_rate": 0.001, + "loss": 2.0357, + "step": 11296 + }, + { + "epoch": 0.47791691344445386, + "grad_norm": 0.2000165730714798, + "learning_rate": 0.001, + "loss": 2.0919, + "step": 11297 + }, + { + "epoch": 0.4779592182079702, + "grad_norm": 0.14181675016880035, + "learning_rate": 0.001, + "loss": 2.5228, + "step": 11298 + }, + { + "epoch": 0.47800152297148657, + "grad_norm": 5.60988187789917, + "learning_rate": 0.001, + "loss": 1.5685, + "step": 11299 + }, + { + "epoch": 0.478043827735003, + "grad_norm": 0.1896638423204422, + "learning_rate": 0.001, + "loss": 2.7071, + "step": 11300 + }, + { + "epoch": 0.47808613249851933, + "grad_norm": 46.45356750488281, + "learning_rate": 0.001, + "loss": 3.0575, + "step": 11301 + }, + { + "epoch": 0.4781284372620357, + "grad_norm": 0.5738785266876221, + "learning_rate": 0.001, + "loss": 2.3252, + "step": 11302 + }, + { + "epoch": 0.4781707420255521, + "grad_norm": 0.18505975604057312, + "learning_rate": 0.001, + "loss": 2.2331, + "step": 11303 + }, + { + "epoch": 0.47821304678906845, + "grad_norm": 0.18580776453018188, + "learning_rate": 0.001, + "loss": 2.7998, + "step": 11304 + }, + { + "epoch": 0.4782553515525848, + "grad_norm": 2.873171329498291, + "learning_rate": 0.001, + "loss": 3.3834, + "step": 11305 + }, + { + "epoch": 0.4782976563161012, + "grad_norm": 0.17090004682540894, + "learning_rate": 0.001, + "loss": 2.4539, + "step": 11306 + }, + { + "epoch": 0.47833996107961757, + "grad_norm": 0.5887268781661987, + "learning_rate": 0.001, + "loss": 1.9116, + "step": 11307 + }, + { + "epoch": 0.4783822658431339, + "grad_norm": 0.2121281623840332, + "learning_rate": 0.001, + "loss": 2.3149, + "step": 11308 + }, + { + "epoch": 0.47842457060665033, + "grad_norm": 0.31115978956222534, + "learning_rate": 0.001, + "loss": 2.1889, + "step": 11309 + }, + { + "epoch": 0.4784668753701667, + "grad_norm": 0.1875309944152832, + "learning_rate": 0.001, + "loss": 2.1295, + "step": 11310 + }, + { + "epoch": 0.47850918013368304, + "grad_norm": 0.1967727243900299, + "learning_rate": 0.001, + "loss": 1.7845, + "step": 11311 + }, + { + "epoch": 0.4785514848971994, + "grad_norm": 2.0214593410491943, + "learning_rate": 0.001, + "loss": 2.2464, + "step": 11312 + }, + { + "epoch": 0.4785937896607158, + "grad_norm": 0.18990519642829895, + "learning_rate": 0.001, + "loss": 2.0703, + "step": 11313 + }, + { + "epoch": 0.47863609442423216, + "grad_norm": 1.7356641292572021, + "learning_rate": 0.001, + "loss": 2.1795, + "step": 11314 + }, + { + "epoch": 0.4786783991877485, + "grad_norm": 0.220098614692688, + "learning_rate": 0.001, + "loss": 2.4705, + "step": 11315 + }, + { + "epoch": 0.4787207039512649, + "grad_norm": 0.1770542711019516, + "learning_rate": 0.001, + "loss": 2.8367, + "step": 11316 + }, + { + "epoch": 0.4787630087147813, + "grad_norm": 0.22387665510177612, + "learning_rate": 0.001, + "loss": 2.1766, + "step": 11317 + }, + { + "epoch": 0.47880531347829763, + "grad_norm": 0.1825076937675476, + "learning_rate": 0.001, + "loss": 3.0979, + "step": 11318 + }, + { + "epoch": 0.47884761824181404, + "grad_norm": 0.2045036405324936, + "learning_rate": 0.001, + "loss": 2.9486, + "step": 11319 + }, + { + "epoch": 0.4788899230053304, + "grad_norm": 0.20427003502845764, + "learning_rate": 0.001, + "loss": 2.7808, + "step": 11320 + }, + { + "epoch": 0.47893222776884675, + "grad_norm": 0.21618923544883728, + "learning_rate": 0.001, + "loss": 2.3455, + "step": 11321 + }, + { + "epoch": 0.47897453253236316, + "grad_norm": 0.19083094596862793, + "learning_rate": 0.001, + "loss": 2.621, + "step": 11322 + }, + { + "epoch": 0.4790168372958795, + "grad_norm": 10.630045890808105, + "learning_rate": 0.001, + "loss": 2.8354, + "step": 11323 + }, + { + "epoch": 0.47905914205939587, + "grad_norm": 0.20413336157798767, + "learning_rate": 0.001, + "loss": 2.2648, + "step": 11324 + }, + { + "epoch": 0.4791014468229123, + "grad_norm": 0.23747549951076508, + "learning_rate": 0.001, + "loss": 3.3187, + "step": 11325 + }, + { + "epoch": 0.47914375158642863, + "grad_norm": 21.391681671142578, + "learning_rate": 0.001, + "loss": 1.6874, + "step": 11326 + }, + { + "epoch": 0.479186056349945, + "grad_norm": 0.2882399559020996, + "learning_rate": 0.001, + "loss": 2.4637, + "step": 11327 + }, + { + "epoch": 0.4792283611134614, + "grad_norm": 0.330314576625824, + "learning_rate": 0.001, + "loss": 3.253, + "step": 11328 + }, + { + "epoch": 0.47927066587697775, + "grad_norm": 1.4046648740768433, + "learning_rate": 0.001, + "loss": 2.8288, + "step": 11329 + }, + { + "epoch": 0.4793129706404941, + "grad_norm": 0.7811962962150574, + "learning_rate": 0.001, + "loss": 2.6857, + "step": 11330 + }, + { + "epoch": 0.4793552754040105, + "grad_norm": 0.2128724753856659, + "learning_rate": 0.001, + "loss": 2.478, + "step": 11331 + }, + { + "epoch": 0.47939758016752687, + "grad_norm": 0.2501393258571625, + "learning_rate": 0.001, + "loss": 2.4157, + "step": 11332 + }, + { + "epoch": 0.4794398849310432, + "grad_norm": 0.22114041447639465, + "learning_rate": 0.001, + "loss": 1.7828, + "step": 11333 + }, + { + "epoch": 0.47948218969455964, + "grad_norm": 0.1902758628129959, + "learning_rate": 0.001, + "loss": 2.0687, + "step": 11334 + }, + { + "epoch": 0.479524494458076, + "grad_norm": 0.548904538154602, + "learning_rate": 0.001, + "loss": 2.5692, + "step": 11335 + }, + { + "epoch": 0.47956679922159234, + "grad_norm": 0.18059691786766052, + "learning_rate": 0.001, + "loss": 2.0568, + "step": 11336 + }, + { + "epoch": 0.4796091039851087, + "grad_norm": 0.17880599200725555, + "learning_rate": 0.001, + "loss": 2.1907, + "step": 11337 + }, + { + "epoch": 0.4796514087486251, + "grad_norm": 0.3715362548828125, + "learning_rate": 0.001, + "loss": 2.5385, + "step": 11338 + }, + { + "epoch": 0.47969371351214146, + "grad_norm": 0.1764996498823166, + "learning_rate": 0.001, + "loss": 1.8525, + "step": 11339 + }, + { + "epoch": 0.4797360182756578, + "grad_norm": 0.8168947696685791, + "learning_rate": 0.001, + "loss": 2.5725, + "step": 11340 + }, + { + "epoch": 0.4797783230391742, + "grad_norm": 0.161908358335495, + "learning_rate": 0.001, + "loss": 2.0986, + "step": 11341 + }, + { + "epoch": 0.4798206278026906, + "grad_norm": 0.1834360510110855, + "learning_rate": 0.001, + "loss": 2.0397, + "step": 11342 + }, + { + "epoch": 0.47986293256620693, + "grad_norm": 0.17309635877609253, + "learning_rate": 0.001, + "loss": 2.5099, + "step": 11343 + }, + { + "epoch": 0.47990523732972334, + "grad_norm": 0.19647128880023956, + "learning_rate": 0.001, + "loss": 2.0393, + "step": 11344 + }, + { + "epoch": 0.4799475420932397, + "grad_norm": 0.670528769493103, + "learning_rate": 0.001, + "loss": 3.4804, + "step": 11345 + }, + { + "epoch": 0.47998984685675605, + "grad_norm": 0.1826171725988388, + "learning_rate": 0.001, + "loss": 2.0048, + "step": 11346 + }, + { + "epoch": 0.48003215162027246, + "grad_norm": 0.20341645181179047, + "learning_rate": 0.001, + "loss": 2.7362, + "step": 11347 + }, + { + "epoch": 0.4800744563837888, + "grad_norm": 0.2916952967643738, + "learning_rate": 0.001, + "loss": 1.6388, + "step": 11348 + }, + { + "epoch": 0.48011676114730517, + "grad_norm": 0.5783160328865051, + "learning_rate": 0.001, + "loss": 2.3783, + "step": 11349 + }, + { + "epoch": 0.4801590659108216, + "grad_norm": 0.20249702036380768, + "learning_rate": 0.001, + "loss": 2.3376, + "step": 11350 + }, + { + "epoch": 0.48020137067433794, + "grad_norm": 0.18847358226776123, + "learning_rate": 0.001, + "loss": 2.1271, + "step": 11351 + }, + { + "epoch": 0.4802436754378543, + "grad_norm": 2.4059906005859375, + "learning_rate": 0.001, + "loss": 1.7613, + "step": 11352 + }, + { + "epoch": 0.4802859802013707, + "grad_norm": 0.1835888773202896, + "learning_rate": 0.001, + "loss": 2.379, + "step": 11353 + }, + { + "epoch": 0.48032828496488705, + "grad_norm": 0.34716445207595825, + "learning_rate": 0.001, + "loss": 2.0613, + "step": 11354 + }, + { + "epoch": 0.4803705897284034, + "grad_norm": 0.154496431350708, + "learning_rate": 0.001, + "loss": 2.3463, + "step": 11355 + }, + { + "epoch": 0.4804128944919198, + "grad_norm": 0.5748295783996582, + "learning_rate": 0.001, + "loss": 2.2405, + "step": 11356 + }, + { + "epoch": 0.48045519925543617, + "grad_norm": 0.3923312723636627, + "learning_rate": 0.001, + "loss": 2.7319, + "step": 11357 + }, + { + "epoch": 0.4804975040189525, + "grad_norm": 0.5746713280677795, + "learning_rate": 0.001, + "loss": 1.8951, + "step": 11358 + }, + { + "epoch": 0.4805398087824689, + "grad_norm": 1.0330185890197754, + "learning_rate": 0.001, + "loss": 3.1664, + "step": 11359 + }, + { + "epoch": 0.4805821135459853, + "grad_norm": 0.1738303303718567, + "learning_rate": 0.001, + "loss": 2.808, + "step": 11360 + }, + { + "epoch": 0.48062441830950164, + "grad_norm": 8.028282165527344, + "learning_rate": 0.001, + "loss": 1.9271, + "step": 11361 + }, + { + "epoch": 0.480666723073018, + "grad_norm": 0.18833428621292114, + "learning_rate": 0.001, + "loss": 2.1628, + "step": 11362 + }, + { + "epoch": 0.4807090278365344, + "grad_norm": 0.1686827838420868, + "learning_rate": 0.001, + "loss": 1.6737, + "step": 11363 + }, + { + "epoch": 0.48075133260005076, + "grad_norm": 0.18253150582313538, + "learning_rate": 0.001, + "loss": 2.337, + "step": 11364 + }, + { + "epoch": 0.4807936373635671, + "grad_norm": 0.23626413941383362, + "learning_rate": 0.001, + "loss": 3.2853, + "step": 11365 + }, + { + "epoch": 0.4808359421270835, + "grad_norm": 0.718061089515686, + "learning_rate": 0.001, + "loss": 2.8863, + "step": 11366 + }, + { + "epoch": 0.4808782468905999, + "grad_norm": 0.21760138869285583, + "learning_rate": 0.001, + "loss": 2.4297, + "step": 11367 + }, + { + "epoch": 0.48092055165411624, + "grad_norm": 0.19987843930721283, + "learning_rate": 0.001, + "loss": 3.1579, + "step": 11368 + }, + { + "epoch": 0.48096285641763264, + "grad_norm": 0.16361162066459656, + "learning_rate": 0.001, + "loss": 1.5716, + "step": 11369 + }, + { + "epoch": 0.481005161181149, + "grad_norm": 0.169572651386261, + "learning_rate": 0.001, + "loss": 1.8561, + "step": 11370 + }, + { + "epoch": 0.48104746594466535, + "grad_norm": 1.216856598854065, + "learning_rate": 0.001, + "loss": 2.5188, + "step": 11371 + }, + { + "epoch": 0.48108977070818176, + "grad_norm": 0.20964156091213226, + "learning_rate": 0.001, + "loss": 2.9859, + "step": 11372 + }, + { + "epoch": 0.4811320754716981, + "grad_norm": 0.5650727152824402, + "learning_rate": 0.001, + "loss": 3.1159, + "step": 11373 + }, + { + "epoch": 0.48117438023521447, + "grad_norm": 0.2616911232471466, + "learning_rate": 0.001, + "loss": 2.1743, + "step": 11374 + }, + { + "epoch": 0.4812166849987309, + "grad_norm": 0.22309203445911407, + "learning_rate": 0.001, + "loss": 2.7001, + "step": 11375 + }, + { + "epoch": 0.48125898976224724, + "grad_norm": 0.18695230782032013, + "learning_rate": 0.001, + "loss": 2.203, + "step": 11376 + }, + { + "epoch": 0.4813012945257636, + "grad_norm": 0.1963418573141098, + "learning_rate": 0.001, + "loss": 3.2709, + "step": 11377 + }, + { + "epoch": 0.48134359928928, + "grad_norm": 0.16834786534309387, + "learning_rate": 0.001, + "loss": 2.769, + "step": 11378 + }, + { + "epoch": 0.48138590405279635, + "grad_norm": 0.1826411634683609, + "learning_rate": 0.001, + "loss": 1.7105, + "step": 11379 + }, + { + "epoch": 0.4814282088163127, + "grad_norm": 0.43672510981559753, + "learning_rate": 0.001, + "loss": 2.3934, + "step": 11380 + }, + { + "epoch": 0.48147051357982906, + "grad_norm": 0.16782554984092712, + "learning_rate": 0.001, + "loss": 2.589, + "step": 11381 + }, + { + "epoch": 0.4815128183433455, + "grad_norm": 0.16116951406002045, + "learning_rate": 0.001, + "loss": 2.2085, + "step": 11382 + }, + { + "epoch": 0.4815551231068618, + "grad_norm": 0.18043899536132812, + "learning_rate": 0.001, + "loss": 1.7793, + "step": 11383 + }, + { + "epoch": 0.4815974278703782, + "grad_norm": 0.5931171774864197, + "learning_rate": 0.001, + "loss": 1.7696, + "step": 11384 + }, + { + "epoch": 0.4816397326338946, + "grad_norm": 0.4791402816772461, + "learning_rate": 0.001, + "loss": 3.4351, + "step": 11385 + }, + { + "epoch": 0.48168203739741094, + "grad_norm": 0.2714325487613678, + "learning_rate": 0.001, + "loss": 2.2362, + "step": 11386 + }, + { + "epoch": 0.4817243421609273, + "grad_norm": 0.19778069853782654, + "learning_rate": 0.001, + "loss": 2.5545, + "step": 11387 + }, + { + "epoch": 0.4817666469244437, + "grad_norm": 0.208540141582489, + "learning_rate": 0.001, + "loss": 2.9821, + "step": 11388 + }, + { + "epoch": 0.48180895168796006, + "grad_norm": 0.16977502405643463, + "learning_rate": 0.001, + "loss": 1.8149, + "step": 11389 + }, + { + "epoch": 0.4818512564514764, + "grad_norm": 0.22707661986351013, + "learning_rate": 0.001, + "loss": 2.7141, + "step": 11390 + }, + { + "epoch": 0.4818935612149928, + "grad_norm": 0.38424187898635864, + "learning_rate": 0.001, + "loss": 3.1576, + "step": 11391 + }, + { + "epoch": 0.4819358659785092, + "grad_norm": 1.370203971862793, + "learning_rate": 0.001, + "loss": 2.2048, + "step": 11392 + }, + { + "epoch": 0.48197817074202554, + "grad_norm": 0.18375247716903687, + "learning_rate": 0.001, + "loss": 2.3193, + "step": 11393 + }, + { + "epoch": 0.48202047550554195, + "grad_norm": 0.9614766836166382, + "learning_rate": 0.001, + "loss": 1.794, + "step": 11394 + }, + { + "epoch": 0.4820627802690583, + "grad_norm": 0.18902945518493652, + "learning_rate": 0.001, + "loss": 2.1396, + "step": 11395 + }, + { + "epoch": 0.48210508503257465, + "grad_norm": 0.16414183378219604, + "learning_rate": 0.001, + "loss": 1.6427, + "step": 11396 + }, + { + "epoch": 0.48214738979609106, + "grad_norm": 0.16200171411037445, + "learning_rate": 0.001, + "loss": 2.0166, + "step": 11397 + }, + { + "epoch": 0.4821896945596074, + "grad_norm": 3.9230146408081055, + "learning_rate": 0.001, + "loss": 2.5453, + "step": 11398 + }, + { + "epoch": 0.4822319993231238, + "grad_norm": 0.211333766579628, + "learning_rate": 0.001, + "loss": 2.9545, + "step": 11399 + }, + { + "epoch": 0.4822743040866402, + "grad_norm": 0.2078399807214737, + "learning_rate": 0.001, + "loss": 2.085, + "step": 11400 + }, + { + "epoch": 0.48231660885015654, + "grad_norm": 0.1923077553510666, + "learning_rate": 0.001, + "loss": 2.3923, + "step": 11401 + }, + { + "epoch": 0.4823589136136729, + "grad_norm": 1.6450941562652588, + "learning_rate": 0.001, + "loss": 2.4251, + "step": 11402 + }, + { + "epoch": 0.48240121837718924, + "grad_norm": 0.2333688586950302, + "learning_rate": 0.001, + "loss": 2.2939, + "step": 11403 + }, + { + "epoch": 0.48244352314070565, + "grad_norm": 6.017459392547607, + "learning_rate": 0.001, + "loss": 1.9482, + "step": 11404 + }, + { + "epoch": 0.482485827904222, + "grad_norm": 1.64377760887146, + "learning_rate": 0.001, + "loss": 1.9915, + "step": 11405 + }, + { + "epoch": 0.48252813266773836, + "grad_norm": 0.22818098962306976, + "learning_rate": 0.001, + "loss": 3.1697, + "step": 11406 + }, + { + "epoch": 0.4825704374312548, + "grad_norm": 0.20856161415576935, + "learning_rate": 0.001, + "loss": 1.7394, + "step": 11407 + }, + { + "epoch": 0.4826127421947711, + "grad_norm": 0.2079148292541504, + "learning_rate": 0.001, + "loss": 2.85, + "step": 11408 + }, + { + "epoch": 0.4826550469582875, + "grad_norm": 3.2609381675720215, + "learning_rate": 0.001, + "loss": 2.744, + "step": 11409 + }, + { + "epoch": 0.4826973517218039, + "grad_norm": 0.4268743395805359, + "learning_rate": 0.001, + "loss": 3.3981, + "step": 11410 + }, + { + "epoch": 0.48273965648532025, + "grad_norm": 0.260139137506485, + "learning_rate": 0.001, + "loss": 2.0163, + "step": 11411 + }, + { + "epoch": 0.4827819612488366, + "grad_norm": 0.21238379180431366, + "learning_rate": 0.001, + "loss": 2.566, + "step": 11412 + }, + { + "epoch": 0.482824266012353, + "grad_norm": 0.19492459297180176, + "learning_rate": 0.001, + "loss": 2.2858, + "step": 11413 + }, + { + "epoch": 0.48286657077586936, + "grad_norm": 2.3117878437042236, + "learning_rate": 0.001, + "loss": 2.0708, + "step": 11414 + }, + { + "epoch": 0.4829088755393857, + "grad_norm": 0.20078395307064056, + "learning_rate": 0.001, + "loss": 3.0703, + "step": 11415 + }, + { + "epoch": 0.4829511803029021, + "grad_norm": 0.18445464968681335, + "learning_rate": 0.001, + "loss": 2.7102, + "step": 11416 + }, + { + "epoch": 0.4829934850664185, + "grad_norm": 0.2198317050933838, + "learning_rate": 0.001, + "loss": 2.8357, + "step": 11417 + }, + { + "epoch": 0.48303578982993484, + "grad_norm": 2.0135302543640137, + "learning_rate": 0.001, + "loss": 1.8806, + "step": 11418 + }, + { + "epoch": 0.48307809459345125, + "grad_norm": 0.25936415791511536, + "learning_rate": 0.001, + "loss": 3.3516, + "step": 11419 + }, + { + "epoch": 0.4831203993569676, + "grad_norm": 0.2253940850496292, + "learning_rate": 0.001, + "loss": 2.3307, + "step": 11420 + }, + { + "epoch": 0.48316270412048395, + "grad_norm": 0.2977537214756012, + "learning_rate": 0.001, + "loss": 2.1588, + "step": 11421 + }, + { + "epoch": 0.48320500888400036, + "grad_norm": 1.3595587015151978, + "learning_rate": 0.001, + "loss": 2.6004, + "step": 11422 + }, + { + "epoch": 0.4832473136475167, + "grad_norm": 2.3829686641693115, + "learning_rate": 0.001, + "loss": 2.6128, + "step": 11423 + }, + { + "epoch": 0.4832896184110331, + "grad_norm": 0.2193344682455063, + "learning_rate": 0.001, + "loss": 2.9908, + "step": 11424 + }, + { + "epoch": 0.4833319231745494, + "grad_norm": 0.19066233932971954, + "learning_rate": 0.001, + "loss": 2.025, + "step": 11425 + }, + { + "epoch": 0.48337422793806584, + "grad_norm": 0.20256930589675903, + "learning_rate": 0.001, + "loss": 1.9787, + "step": 11426 + }, + { + "epoch": 0.4834165327015822, + "grad_norm": 0.22213421761989594, + "learning_rate": 0.001, + "loss": 1.9955, + "step": 11427 + }, + { + "epoch": 0.48345883746509855, + "grad_norm": 0.2642061412334442, + "learning_rate": 0.001, + "loss": 2.1242, + "step": 11428 + }, + { + "epoch": 0.48350114222861496, + "grad_norm": 0.23516133427619934, + "learning_rate": 0.001, + "loss": 2.2582, + "step": 11429 + }, + { + "epoch": 0.4835434469921313, + "grad_norm": 0.22922705113887787, + "learning_rate": 0.001, + "loss": 2.3993, + "step": 11430 + }, + { + "epoch": 0.48358575175564766, + "grad_norm": 0.2899877727031708, + "learning_rate": 0.001, + "loss": 3.879, + "step": 11431 + }, + { + "epoch": 0.4836280565191641, + "grad_norm": 0.17571720480918884, + "learning_rate": 0.001, + "loss": 2.1495, + "step": 11432 + }, + { + "epoch": 0.4836703612826804, + "grad_norm": 2.013770580291748, + "learning_rate": 0.001, + "loss": 2.7839, + "step": 11433 + }, + { + "epoch": 0.4837126660461968, + "grad_norm": 3.0192947387695312, + "learning_rate": 0.001, + "loss": 2.1995, + "step": 11434 + }, + { + "epoch": 0.4837549708097132, + "grad_norm": 0.20939669013023376, + "learning_rate": 0.001, + "loss": 2.1262, + "step": 11435 + }, + { + "epoch": 0.48379727557322955, + "grad_norm": 0.2533853352069855, + "learning_rate": 0.001, + "loss": 2.8203, + "step": 11436 + }, + { + "epoch": 0.4838395803367459, + "grad_norm": 0.2081190049648285, + "learning_rate": 0.001, + "loss": 2.3127, + "step": 11437 + }, + { + "epoch": 0.4838818851002623, + "grad_norm": 0.1944902241230011, + "learning_rate": 0.001, + "loss": 1.8991, + "step": 11438 + }, + { + "epoch": 0.48392418986377866, + "grad_norm": 0.19556209444999695, + "learning_rate": 0.001, + "loss": 2.3796, + "step": 11439 + }, + { + "epoch": 0.483966494627295, + "grad_norm": 0.27376073598861694, + "learning_rate": 0.001, + "loss": 2.5798, + "step": 11440 + }, + { + "epoch": 0.48400879939081143, + "grad_norm": 0.8901777863502502, + "learning_rate": 0.001, + "loss": 3.6993, + "step": 11441 + }, + { + "epoch": 0.4840511041543278, + "grad_norm": 0.5265764594078064, + "learning_rate": 0.001, + "loss": 1.7463, + "step": 11442 + }, + { + "epoch": 0.48409340891784414, + "grad_norm": 0.4732452630996704, + "learning_rate": 0.001, + "loss": 1.6442, + "step": 11443 + }, + { + "epoch": 0.48413571368136055, + "grad_norm": 0.23689015209674835, + "learning_rate": 0.001, + "loss": 2.367, + "step": 11444 + }, + { + "epoch": 0.4841780184448769, + "grad_norm": 0.1857021003961563, + "learning_rate": 0.001, + "loss": 2.0991, + "step": 11445 + }, + { + "epoch": 0.48422032320839326, + "grad_norm": 9.68715763092041, + "learning_rate": 0.001, + "loss": 1.781, + "step": 11446 + }, + { + "epoch": 0.4842626279719096, + "grad_norm": 0.18469716608524323, + "learning_rate": 0.001, + "loss": 2.3133, + "step": 11447 + }, + { + "epoch": 0.484304932735426, + "grad_norm": 0.195147305727005, + "learning_rate": 0.001, + "loss": 3.6543, + "step": 11448 + }, + { + "epoch": 0.4843472374989424, + "grad_norm": 0.19625887274742126, + "learning_rate": 0.001, + "loss": 1.7704, + "step": 11449 + }, + { + "epoch": 0.4843895422624587, + "grad_norm": 1.0234781503677368, + "learning_rate": 0.001, + "loss": 2.4147, + "step": 11450 + }, + { + "epoch": 0.48443184702597514, + "grad_norm": 1.8966562747955322, + "learning_rate": 0.001, + "loss": 1.4547, + "step": 11451 + }, + { + "epoch": 0.4844741517894915, + "grad_norm": 0.29470208287239075, + "learning_rate": 0.001, + "loss": 2.3858, + "step": 11452 + }, + { + "epoch": 0.48451645655300785, + "grad_norm": 0.4111064672470093, + "learning_rate": 0.001, + "loss": 2.43, + "step": 11453 + }, + { + "epoch": 0.48455876131652426, + "grad_norm": 0.1965835988521576, + "learning_rate": 0.001, + "loss": 1.5909, + "step": 11454 + }, + { + "epoch": 0.4846010660800406, + "grad_norm": 0.19790798425674438, + "learning_rate": 0.001, + "loss": 2.1684, + "step": 11455 + }, + { + "epoch": 0.48464337084355696, + "grad_norm": 0.20888280868530273, + "learning_rate": 0.001, + "loss": 2.1558, + "step": 11456 + }, + { + "epoch": 0.4846856756070734, + "grad_norm": 0.16559502482414246, + "learning_rate": 0.001, + "loss": 1.9507, + "step": 11457 + }, + { + "epoch": 0.48472798037058973, + "grad_norm": 0.1633719652891159, + "learning_rate": 0.001, + "loss": 2.0244, + "step": 11458 + }, + { + "epoch": 0.4847702851341061, + "grad_norm": 0.1771150678396225, + "learning_rate": 0.001, + "loss": 2.4581, + "step": 11459 + }, + { + "epoch": 0.4848125898976225, + "grad_norm": 0.2200511246919632, + "learning_rate": 0.001, + "loss": 2.4251, + "step": 11460 + }, + { + "epoch": 0.48485489466113885, + "grad_norm": 0.17225952446460724, + "learning_rate": 0.001, + "loss": 2.7225, + "step": 11461 + }, + { + "epoch": 0.4848971994246552, + "grad_norm": 0.555501401424408, + "learning_rate": 0.001, + "loss": 2.3918, + "step": 11462 + }, + { + "epoch": 0.4849395041881716, + "grad_norm": 0.16668234765529633, + "learning_rate": 0.001, + "loss": 2.0368, + "step": 11463 + }, + { + "epoch": 0.48498180895168796, + "grad_norm": 0.27358829975128174, + "learning_rate": 0.001, + "loss": 1.9445, + "step": 11464 + }, + { + "epoch": 0.4850241137152043, + "grad_norm": 0.18736232817173004, + "learning_rate": 0.001, + "loss": 2.7039, + "step": 11465 + }, + { + "epoch": 0.48506641847872073, + "grad_norm": 0.15612122416496277, + "learning_rate": 0.001, + "loss": 2.6935, + "step": 11466 + }, + { + "epoch": 0.4851087232422371, + "grad_norm": 2.6563851833343506, + "learning_rate": 0.001, + "loss": 1.7286, + "step": 11467 + }, + { + "epoch": 0.48515102800575344, + "grad_norm": 0.1503165066242218, + "learning_rate": 0.001, + "loss": 2.1244, + "step": 11468 + }, + { + "epoch": 0.48519333276926985, + "grad_norm": 0.19594672322273254, + "learning_rate": 0.001, + "loss": 2.0981, + "step": 11469 + }, + { + "epoch": 0.4852356375327862, + "grad_norm": 0.1888262778520584, + "learning_rate": 0.001, + "loss": 2.8126, + "step": 11470 + }, + { + "epoch": 0.48527794229630256, + "grad_norm": 0.20222362875938416, + "learning_rate": 0.001, + "loss": 2.2918, + "step": 11471 + }, + { + "epoch": 0.4853202470598189, + "grad_norm": 1.1823304891586304, + "learning_rate": 0.001, + "loss": 1.7587, + "step": 11472 + }, + { + "epoch": 0.4853625518233353, + "grad_norm": 0.19512400031089783, + "learning_rate": 0.001, + "loss": 2.1734, + "step": 11473 + }, + { + "epoch": 0.4854048565868517, + "grad_norm": 1.402773380279541, + "learning_rate": 0.001, + "loss": 2.6338, + "step": 11474 + }, + { + "epoch": 0.48544716135036803, + "grad_norm": 0.22815269231796265, + "learning_rate": 0.001, + "loss": 2.9793, + "step": 11475 + }, + { + "epoch": 0.48548946611388444, + "grad_norm": 0.17088262736797333, + "learning_rate": 0.001, + "loss": 2.0868, + "step": 11476 + }, + { + "epoch": 0.4855317708774008, + "grad_norm": 0.16063320636749268, + "learning_rate": 0.001, + "loss": 1.6893, + "step": 11477 + }, + { + "epoch": 0.48557407564091715, + "grad_norm": 0.8555530309677124, + "learning_rate": 0.001, + "loss": 2.3043, + "step": 11478 + }, + { + "epoch": 0.48561638040443356, + "grad_norm": 0.27816876769065857, + "learning_rate": 0.001, + "loss": 3.0965, + "step": 11479 + }, + { + "epoch": 0.4856586851679499, + "grad_norm": 0.7349559664726257, + "learning_rate": 0.001, + "loss": 2.1048, + "step": 11480 + }, + { + "epoch": 0.48570098993146626, + "grad_norm": 0.16931132972240448, + "learning_rate": 0.001, + "loss": 2.0924, + "step": 11481 + }, + { + "epoch": 0.4857432946949827, + "grad_norm": 0.15484954416751862, + "learning_rate": 0.001, + "loss": 1.9129, + "step": 11482 + }, + { + "epoch": 0.48578559945849903, + "grad_norm": 0.3390078842639923, + "learning_rate": 0.001, + "loss": 2.2157, + "step": 11483 + }, + { + "epoch": 0.4858279042220154, + "grad_norm": 0.20084303617477417, + "learning_rate": 0.001, + "loss": 2.4884, + "step": 11484 + }, + { + "epoch": 0.4858702089855318, + "grad_norm": 0.24924889206886292, + "learning_rate": 0.001, + "loss": 2.4044, + "step": 11485 + }, + { + "epoch": 0.48591251374904815, + "grad_norm": 0.19741028547286987, + "learning_rate": 0.001, + "loss": 2.1854, + "step": 11486 + }, + { + "epoch": 0.4859548185125645, + "grad_norm": 0.20189212262630463, + "learning_rate": 0.001, + "loss": 2.4314, + "step": 11487 + }, + { + "epoch": 0.4859971232760809, + "grad_norm": 1.0460984706878662, + "learning_rate": 0.001, + "loss": 2.094, + "step": 11488 + }, + { + "epoch": 0.48603942803959727, + "grad_norm": 2.5869855880737305, + "learning_rate": 0.001, + "loss": 2.3084, + "step": 11489 + }, + { + "epoch": 0.4860817328031136, + "grad_norm": 0.25881245732307434, + "learning_rate": 0.001, + "loss": 3.5865, + "step": 11490 + }, + { + "epoch": 0.48612403756663003, + "grad_norm": 3.5187735557556152, + "learning_rate": 0.001, + "loss": 1.9536, + "step": 11491 + }, + { + "epoch": 0.4861663423301464, + "grad_norm": 0.179852694272995, + "learning_rate": 0.001, + "loss": 2.2011, + "step": 11492 + }, + { + "epoch": 0.48620864709366274, + "grad_norm": 0.37321174144744873, + "learning_rate": 0.001, + "loss": 2.0074, + "step": 11493 + }, + { + "epoch": 0.4862509518571791, + "grad_norm": 0.5720106959342957, + "learning_rate": 0.001, + "loss": 2.0811, + "step": 11494 + }, + { + "epoch": 0.4862932566206955, + "grad_norm": 0.31446918845176697, + "learning_rate": 0.001, + "loss": 2.0816, + "step": 11495 + }, + { + "epoch": 0.48633556138421186, + "grad_norm": 0.24225953221321106, + "learning_rate": 0.001, + "loss": 1.9998, + "step": 11496 + }, + { + "epoch": 0.4863778661477282, + "grad_norm": 0.20280331373214722, + "learning_rate": 0.001, + "loss": 2.222, + "step": 11497 + }, + { + "epoch": 0.4864201709112446, + "grad_norm": 0.1985984891653061, + "learning_rate": 0.001, + "loss": 2.5404, + "step": 11498 + }, + { + "epoch": 0.486462475674761, + "grad_norm": 0.25399044156074524, + "learning_rate": 0.001, + "loss": 2.2409, + "step": 11499 + }, + { + "epoch": 0.48650478043827733, + "grad_norm": 0.376498281955719, + "learning_rate": 0.001, + "loss": 2.584, + "step": 11500 + }, + { + "epoch": 0.48654708520179374, + "grad_norm": 0.15857632458209991, + "learning_rate": 0.001, + "loss": 2.2376, + "step": 11501 + }, + { + "epoch": 0.4865893899653101, + "grad_norm": 0.19442448019981384, + "learning_rate": 0.001, + "loss": 1.911, + "step": 11502 + }, + { + "epoch": 0.48663169472882645, + "grad_norm": 0.28401103615760803, + "learning_rate": 0.001, + "loss": 2.037, + "step": 11503 + }, + { + "epoch": 0.48667399949234286, + "grad_norm": 0.1915339082479477, + "learning_rate": 0.001, + "loss": 2.5886, + "step": 11504 + }, + { + "epoch": 0.4867163042558592, + "grad_norm": 0.13712449371814728, + "learning_rate": 0.001, + "loss": 1.6988, + "step": 11505 + }, + { + "epoch": 0.48675860901937557, + "grad_norm": 0.16903550922870636, + "learning_rate": 0.001, + "loss": 2.3553, + "step": 11506 + }, + { + "epoch": 0.486800913782892, + "grad_norm": 0.3302583396434784, + "learning_rate": 0.001, + "loss": 2.4041, + "step": 11507 + }, + { + "epoch": 0.48684321854640833, + "grad_norm": 0.16169388592243195, + "learning_rate": 0.001, + "loss": 1.5953, + "step": 11508 + }, + { + "epoch": 0.4868855233099247, + "grad_norm": 0.19090786576271057, + "learning_rate": 0.001, + "loss": 2.6609, + "step": 11509 + }, + { + "epoch": 0.4869278280734411, + "grad_norm": 0.2367997020483017, + "learning_rate": 0.001, + "loss": 2.7336, + "step": 11510 + }, + { + "epoch": 0.48697013283695745, + "grad_norm": 29.403594970703125, + "learning_rate": 0.001, + "loss": 2.4013, + "step": 11511 + }, + { + "epoch": 0.4870124376004738, + "grad_norm": 0.26779159903526306, + "learning_rate": 0.001, + "loss": 2.9784, + "step": 11512 + }, + { + "epoch": 0.4870547423639902, + "grad_norm": 0.4845396876335144, + "learning_rate": 0.001, + "loss": 2.1083, + "step": 11513 + }, + { + "epoch": 0.48709704712750657, + "grad_norm": 0.5120859742164612, + "learning_rate": 0.001, + "loss": 1.9852, + "step": 11514 + }, + { + "epoch": 0.4871393518910229, + "grad_norm": 0.1959608644247055, + "learning_rate": 0.001, + "loss": 2.3048, + "step": 11515 + }, + { + "epoch": 0.4871816566545393, + "grad_norm": 5.542116641998291, + "learning_rate": 0.001, + "loss": 2.3137, + "step": 11516 + }, + { + "epoch": 0.4872239614180557, + "grad_norm": 0.26933497190475464, + "learning_rate": 0.001, + "loss": 2.1619, + "step": 11517 + }, + { + "epoch": 0.48726626618157204, + "grad_norm": 0.24525229632854462, + "learning_rate": 0.001, + "loss": 2.0777, + "step": 11518 + }, + { + "epoch": 0.4873085709450884, + "grad_norm": 0.20497070252895355, + "learning_rate": 0.001, + "loss": 2.317, + "step": 11519 + }, + { + "epoch": 0.4873508757086048, + "grad_norm": 0.31247276067733765, + "learning_rate": 0.001, + "loss": 2.4255, + "step": 11520 + }, + { + "epoch": 0.48739318047212116, + "grad_norm": 0.29333776235580444, + "learning_rate": 0.001, + "loss": 1.945, + "step": 11521 + }, + { + "epoch": 0.4874354852356375, + "grad_norm": 1.8194544315338135, + "learning_rate": 0.001, + "loss": 2.4755, + "step": 11522 + }, + { + "epoch": 0.4874777899991539, + "grad_norm": 0.182512104511261, + "learning_rate": 0.001, + "loss": 2.1394, + "step": 11523 + }, + { + "epoch": 0.4875200947626703, + "grad_norm": 0.19571569561958313, + "learning_rate": 0.001, + "loss": 2.2593, + "step": 11524 + }, + { + "epoch": 0.48756239952618663, + "grad_norm": 0.21386699378490448, + "learning_rate": 0.001, + "loss": 1.9803, + "step": 11525 + }, + { + "epoch": 0.48760470428970304, + "grad_norm": 1.0484811067581177, + "learning_rate": 0.001, + "loss": 1.9055, + "step": 11526 + }, + { + "epoch": 0.4876470090532194, + "grad_norm": 0.19381646811962128, + "learning_rate": 0.001, + "loss": 2.8984, + "step": 11527 + }, + { + "epoch": 0.48768931381673575, + "grad_norm": 0.18061847984790802, + "learning_rate": 0.001, + "loss": 2.9683, + "step": 11528 + }, + { + "epoch": 0.48773161858025216, + "grad_norm": 1.0904494524002075, + "learning_rate": 0.001, + "loss": 1.7487, + "step": 11529 + }, + { + "epoch": 0.4877739233437685, + "grad_norm": 0.5851990580558777, + "learning_rate": 0.001, + "loss": 4.4732, + "step": 11530 + }, + { + "epoch": 0.48781622810728487, + "grad_norm": 0.21629658341407776, + "learning_rate": 0.001, + "loss": 3.3176, + "step": 11531 + }, + { + "epoch": 0.4878585328708013, + "grad_norm": 0.1830849051475525, + "learning_rate": 0.001, + "loss": 1.9741, + "step": 11532 + }, + { + "epoch": 0.48790083763431763, + "grad_norm": 0.6447810530662537, + "learning_rate": 0.001, + "loss": 1.6206, + "step": 11533 + }, + { + "epoch": 0.487943142397834, + "grad_norm": 0.22213850915431976, + "learning_rate": 0.001, + "loss": 2.1542, + "step": 11534 + }, + { + "epoch": 0.4879854471613504, + "grad_norm": 0.6052437424659729, + "learning_rate": 0.001, + "loss": 2.0151, + "step": 11535 + }, + { + "epoch": 0.48802775192486675, + "grad_norm": 0.15839864313602448, + "learning_rate": 0.001, + "loss": 1.9729, + "step": 11536 + }, + { + "epoch": 0.4880700566883831, + "grad_norm": 1.3613576889038086, + "learning_rate": 0.001, + "loss": 2.4232, + "step": 11537 + }, + { + "epoch": 0.48811236145189946, + "grad_norm": 0.19417421519756317, + "learning_rate": 0.001, + "loss": 1.7537, + "step": 11538 + }, + { + "epoch": 0.48815466621541587, + "grad_norm": 0.3457423448562622, + "learning_rate": 0.001, + "loss": 2.1222, + "step": 11539 + }, + { + "epoch": 0.4881969709789322, + "grad_norm": 0.20159010589122772, + "learning_rate": 0.001, + "loss": 2.4426, + "step": 11540 + }, + { + "epoch": 0.4882392757424486, + "grad_norm": 0.188156396150589, + "learning_rate": 0.001, + "loss": 2.3599, + "step": 11541 + }, + { + "epoch": 0.488281580505965, + "grad_norm": 0.5900790095329285, + "learning_rate": 0.001, + "loss": 2.0175, + "step": 11542 + }, + { + "epoch": 0.48832388526948134, + "grad_norm": 0.2750971019268036, + "learning_rate": 0.001, + "loss": 2.0602, + "step": 11543 + }, + { + "epoch": 0.4883661900329977, + "grad_norm": 0.1912364661693573, + "learning_rate": 0.001, + "loss": 2.2475, + "step": 11544 + }, + { + "epoch": 0.4884084947965141, + "grad_norm": 0.20477798581123352, + "learning_rate": 0.001, + "loss": 2.7998, + "step": 11545 + }, + { + "epoch": 0.48845079956003046, + "grad_norm": 0.1736663430929184, + "learning_rate": 0.001, + "loss": 2.1871, + "step": 11546 + }, + { + "epoch": 0.4884931043235468, + "grad_norm": 0.1695953756570816, + "learning_rate": 0.001, + "loss": 2.4564, + "step": 11547 + }, + { + "epoch": 0.4885354090870632, + "grad_norm": 0.19503745436668396, + "learning_rate": 0.001, + "loss": 2.5058, + "step": 11548 + }, + { + "epoch": 0.4885777138505796, + "grad_norm": 0.1817982941865921, + "learning_rate": 0.001, + "loss": 2.7636, + "step": 11549 + }, + { + "epoch": 0.48862001861409593, + "grad_norm": 0.18275293707847595, + "learning_rate": 0.001, + "loss": 3.3698, + "step": 11550 + }, + { + "epoch": 0.48866232337761234, + "grad_norm": 5.7995758056640625, + "learning_rate": 0.001, + "loss": 3.3286, + "step": 11551 + }, + { + "epoch": 0.4887046281411287, + "grad_norm": 0.1749723106622696, + "learning_rate": 0.001, + "loss": 2.4678, + "step": 11552 + }, + { + "epoch": 0.48874693290464505, + "grad_norm": 0.22669129073619843, + "learning_rate": 0.001, + "loss": 2.0439, + "step": 11553 + }, + { + "epoch": 0.48878923766816146, + "grad_norm": 0.4565039277076721, + "learning_rate": 0.001, + "loss": 2.2781, + "step": 11554 + }, + { + "epoch": 0.4888315424316778, + "grad_norm": 0.15998007357120514, + "learning_rate": 0.001, + "loss": 2.4679, + "step": 11555 + }, + { + "epoch": 0.48887384719519417, + "grad_norm": 0.17309017479419708, + "learning_rate": 0.001, + "loss": 2.0567, + "step": 11556 + }, + { + "epoch": 0.4889161519587106, + "grad_norm": 0.16389106214046478, + "learning_rate": 0.001, + "loss": 2.0645, + "step": 11557 + }, + { + "epoch": 0.48895845672222693, + "grad_norm": 0.1628233641386032, + "learning_rate": 0.001, + "loss": 2.142, + "step": 11558 + }, + { + "epoch": 0.4890007614857433, + "grad_norm": 0.17292705178260803, + "learning_rate": 0.001, + "loss": 2.4255, + "step": 11559 + }, + { + "epoch": 0.48904306624925964, + "grad_norm": 0.13833680748939514, + "learning_rate": 0.001, + "loss": 1.5506, + "step": 11560 + }, + { + "epoch": 0.48908537101277605, + "grad_norm": 0.1819964051246643, + "learning_rate": 0.001, + "loss": 2.4794, + "step": 11561 + }, + { + "epoch": 0.4891276757762924, + "grad_norm": 0.17787259817123413, + "learning_rate": 0.001, + "loss": 2.5472, + "step": 11562 + }, + { + "epoch": 0.48916998053980876, + "grad_norm": 0.1847681850194931, + "learning_rate": 0.001, + "loss": 1.929, + "step": 11563 + }, + { + "epoch": 0.48921228530332517, + "grad_norm": 0.16343270242214203, + "learning_rate": 0.001, + "loss": 1.9423, + "step": 11564 + }, + { + "epoch": 0.4892545900668415, + "grad_norm": 0.18636071681976318, + "learning_rate": 0.001, + "loss": 1.5396, + "step": 11565 + }, + { + "epoch": 0.4892968948303579, + "grad_norm": 0.3037877678871155, + "learning_rate": 0.001, + "loss": 1.9659, + "step": 11566 + }, + { + "epoch": 0.4893391995938743, + "grad_norm": 5.196141242980957, + "learning_rate": 0.001, + "loss": 2.4534, + "step": 11567 + }, + { + "epoch": 0.48938150435739064, + "grad_norm": 0.16297638416290283, + "learning_rate": 0.001, + "loss": 2.0671, + "step": 11568 + }, + { + "epoch": 0.489423809120907, + "grad_norm": 0.14544682204723358, + "learning_rate": 0.001, + "loss": 2.069, + "step": 11569 + }, + { + "epoch": 0.4894661138844234, + "grad_norm": 0.922217071056366, + "learning_rate": 0.001, + "loss": 2.0644, + "step": 11570 + }, + { + "epoch": 0.48950841864793976, + "grad_norm": 0.13835956156253815, + "learning_rate": 0.001, + "loss": 1.6519, + "step": 11571 + }, + { + "epoch": 0.4895507234114561, + "grad_norm": 0.3947632610797882, + "learning_rate": 0.001, + "loss": 2.2417, + "step": 11572 + }, + { + "epoch": 0.4895930281749725, + "grad_norm": 0.15445981919765472, + "learning_rate": 0.001, + "loss": 2.5499, + "step": 11573 + }, + { + "epoch": 0.4896353329384889, + "grad_norm": 0.17741024494171143, + "learning_rate": 0.001, + "loss": 1.8943, + "step": 11574 + }, + { + "epoch": 0.48967763770200523, + "grad_norm": 0.17265914380550385, + "learning_rate": 0.001, + "loss": 2.0673, + "step": 11575 + }, + { + "epoch": 0.48971994246552164, + "grad_norm": 0.640771210193634, + "learning_rate": 0.001, + "loss": 3.3352, + "step": 11576 + }, + { + "epoch": 0.489762247229038, + "grad_norm": 0.17222177982330322, + "learning_rate": 0.001, + "loss": 1.9422, + "step": 11577 + }, + { + "epoch": 0.48980455199255435, + "grad_norm": 0.18314416706562042, + "learning_rate": 0.001, + "loss": 1.9239, + "step": 11578 + }, + { + "epoch": 0.48984685675607076, + "grad_norm": 0.3542311489582062, + "learning_rate": 0.001, + "loss": 3.8196, + "step": 11579 + }, + { + "epoch": 0.4898891615195871, + "grad_norm": 0.14956578612327576, + "learning_rate": 0.001, + "loss": 1.9715, + "step": 11580 + }, + { + "epoch": 0.48993146628310347, + "grad_norm": 0.17212481796741486, + "learning_rate": 0.001, + "loss": 2.7058, + "step": 11581 + }, + { + "epoch": 0.4899737710466199, + "grad_norm": 0.2047460526227951, + "learning_rate": 0.001, + "loss": 2.4195, + "step": 11582 + }, + { + "epoch": 0.49001607581013623, + "grad_norm": 0.16925156116485596, + "learning_rate": 0.001, + "loss": 1.4925, + "step": 11583 + }, + { + "epoch": 0.4900583805736526, + "grad_norm": 0.17683045566082, + "learning_rate": 0.001, + "loss": 2.1021, + "step": 11584 + }, + { + "epoch": 0.49010068533716894, + "grad_norm": 0.19544382393360138, + "learning_rate": 0.001, + "loss": 2.8976, + "step": 11585 + }, + { + "epoch": 0.49014299010068535, + "grad_norm": 0.17539872229099274, + "learning_rate": 0.001, + "loss": 2.7534, + "step": 11586 + }, + { + "epoch": 0.4901852948642017, + "grad_norm": 0.17753051221370697, + "learning_rate": 0.001, + "loss": 2.2047, + "step": 11587 + }, + { + "epoch": 0.49022759962771806, + "grad_norm": 0.16815239191055298, + "learning_rate": 0.001, + "loss": 1.6325, + "step": 11588 + }, + { + "epoch": 0.49026990439123447, + "grad_norm": 0.2511793375015259, + "learning_rate": 0.001, + "loss": 2.8271, + "step": 11589 + }, + { + "epoch": 0.4903122091547508, + "grad_norm": 3.128819465637207, + "learning_rate": 0.001, + "loss": 1.7008, + "step": 11590 + }, + { + "epoch": 0.4903545139182672, + "grad_norm": 0.2257847785949707, + "learning_rate": 0.001, + "loss": 2.4141, + "step": 11591 + }, + { + "epoch": 0.4903968186817836, + "grad_norm": 1.591920256614685, + "learning_rate": 0.001, + "loss": 1.9112, + "step": 11592 + }, + { + "epoch": 0.49043912344529994, + "grad_norm": 0.2209502011537552, + "learning_rate": 0.001, + "loss": 2.9542, + "step": 11593 + }, + { + "epoch": 0.4904814282088163, + "grad_norm": 0.19603808224201202, + "learning_rate": 0.001, + "loss": 2.5474, + "step": 11594 + }, + { + "epoch": 0.4905237329723327, + "grad_norm": 0.6029075980186462, + "learning_rate": 0.001, + "loss": 1.8916, + "step": 11595 + }, + { + "epoch": 0.49056603773584906, + "grad_norm": 0.1723041832447052, + "learning_rate": 0.001, + "loss": 2.0029, + "step": 11596 + }, + { + "epoch": 0.4906083424993654, + "grad_norm": 0.156670480966568, + "learning_rate": 0.001, + "loss": 1.8345, + "step": 11597 + }, + { + "epoch": 0.4906506472628818, + "grad_norm": 0.2527156174182892, + "learning_rate": 0.001, + "loss": 3.4863, + "step": 11598 + }, + { + "epoch": 0.4906929520263982, + "grad_norm": 0.15961304306983948, + "learning_rate": 0.001, + "loss": 2.3809, + "step": 11599 + }, + { + "epoch": 0.49073525678991453, + "grad_norm": 0.16630440950393677, + "learning_rate": 0.001, + "loss": 2.33, + "step": 11600 + }, + { + "epoch": 0.49077756155343094, + "grad_norm": 0.1635364443063736, + "learning_rate": 0.001, + "loss": 2.1123, + "step": 11601 + }, + { + "epoch": 0.4908198663169473, + "grad_norm": 12.366936683654785, + "learning_rate": 0.001, + "loss": 2.755, + "step": 11602 + }, + { + "epoch": 0.49086217108046365, + "grad_norm": 0.16810069978237152, + "learning_rate": 0.001, + "loss": 2.0871, + "step": 11603 + }, + { + "epoch": 0.49090447584398006, + "grad_norm": 0.16595399379730225, + "learning_rate": 0.001, + "loss": 2.1468, + "step": 11604 + }, + { + "epoch": 0.4909467806074964, + "grad_norm": 0.5384510159492493, + "learning_rate": 0.001, + "loss": 2.1914, + "step": 11605 + }, + { + "epoch": 0.49098908537101277, + "grad_norm": 0.18624725937843323, + "learning_rate": 0.001, + "loss": 3.0921, + "step": 11606 + }, + { + "epoch": 0.4910313901345291, + "grad_norm": 0.18056440353393555, + "learning_rate": 0.001, + "loss": 1.936, + "step": 11607 + }, + { + "epoch": 0.49107369489804553, + "grad_norm": 1.4659069776535034, + "learning_rate": 0.001, + "loss": 2.5418, + "step": 11608 + }, + { + "epoch": 0.4911159996615619, + "grad_norm": 0.23289033770561218, + "learning_rate": 0.001, + "loss": 3.2947, + "step": 11609 + }, + { + "epoch": 0.49115830442507824, + "grad_norm": 0.18133248388767242, + "learning_rate": 0.001, + "loss": 1.8595, + "step": 11610 + }, + { + "epoch": 0.49120060918859465, + "grad_norm": 0.16130225360393524, + "learning_rate": 0.001, + "loss": 2.1366, + "step": 11611 + }, + { + "epoch": 0.491242913952111, + "grad_norm": 0.6706212759017944, + "learning_rate": 0.001, + "loss": 3.0816, + "step": 11612 + }, + { + "epoch": 0.49128521871562736, + "grad_norm": 0.2708061635494232, + "learning_rate": 0.001, + "loss": 2.2324, + "step": 11613 + }, + { + "epoch": 0.49132752347914377, + "grad_norm": 0.1868041753768921, + "learning_rate": 0.001, + "loss": 2.1089, + "step": 11614 + }, + { + "epoch": 0.4913698282426601, + "grad_norm": 0.21697242558002472, + "learning_rate": 0.001, + "loss": 2.0443, + "step": 11615 + }, + { + "epoch": 0.4914121330061765, + "grad_norm": 0.1955055445432663, + "learning_rate": 0.001, + "loss": 2.3835, + "step": 11616 + }, + { + "epoch": 0.4914544377696929, + "grad_norm": 0.18908244371414185, + "learning_rate": 0.001, + "loss": 3.2207, + "step": 11617 + }, + { + "epoch": 0.49149674253320924, + "grad_norm": 0.17205405235290527, + "learning_rate": 0.001, + "loss": 2.0338, + "step": 11618 + }, + { + "epoch": 0.4915390472967256, + "grad_norm": 0.2222311645746231, + "learning_rate": 0.001, + "loss": 2.4758, + "step": 11619 + }, + { + "epoch": 0.491581352060242, + "grad_norm": 0.552980899810791, + "learning_rate": 0.001, + "loss": 1.7828, + "step": 11620 + }, + { + "epoch": 0.49162365682375836, + "grad_norm": 0.17426225543022156, + "learning_rate": 0.001, + "loss": 2.4719, + "step": 11621 + }, + { + "epoch": 0.4916659615872747, + "grad_norm": 0.17539294064044952, + "learning_rate": 0.001, + "loss": 2.2529, + "step": 11622 + }, + { + "epoch": 0.4917082663507911, + "grad_norm": 0.38558903336524963, + "learning_rate": 0.001, + "loss": 1.621, + "step": 11623 + }, + { + "epoch": 0.4917505711143075, + "grad_norm": 12.589545249938965, + "learning_rate": 0.001, + "loss": 3.9498, + "step": 11624 + }, + { + "epoch": 0.49179287587782383, + "grad_norm": 0.18124458193778992, + "learning_rate": 0.001, + "loss": 2.783, + "step": 11625 + }, + { + "epoch": 0.49183518064134024, + "grad_norm": 0.1963767558336258, + "learning_rate": 0.001, + "loss": 1.9605, + "step": 11626 + }, + { + "epoch": 0.4918774854048566, + "grad_norm": 0.6345813274383545, + "learning_rate": 0.001, + "loss": 2.0434, + "step": 11627 + }, + { + "epoch": 0.49191979016837295, + "grad_norm": 0.21551281213760376, + "learning_rate": 0.001, + "loss": 3.1873, + "step": 11628 + }, + { + "epoch": 0.4919620949318893, + "grad_norm": 0.17613928020000458, + "learning_rate": 0.001, + "loss": 2.0574, + "step": 11629 + }, + { + "epoch": 0.4920043996954057, + "grad_norm": 0.20698675513267517, + "learning_rate": 0.001, + "loss": 3.0403, + "step": 11630 + }, + { + "epoch": 0.49204670445892207, + "grad_norm": 0.2309427410364151, + "learning_rate": 0.001, + "loss": 2.8583, + "step": 11631 + }, + { + "epoch": 0.4920890092224384, + "grad_norm": 0.18390582501888275, + "learning_rate": 0.001, + "loss": 1.6705, + "step": 11632 + }, + { + "epoch": 0.49213131398595483, + "grad_norm": 0.2480992078781128, + "learning_rate": 0.001, + "loss": 2.2437, + "step": 11633 + }, + { + "epoch": 0.4921736187494712, + "grad_norm": 0.5748304724693298, + "learning_rate": 0.001, + "loss": 2.3204, + "step": 11634 + }, + { + "epoch": 0.49221592351298754, + "grad_norm": 0.151663139462471, + "learning_rate": 0.001, + "loss": 2.4468, + "step": 11635 + }, + { + "epoch": 0.49225822827650395, + "grad_norm": 0.7663713693618774, + "learning_rate": 0.001, + "loss": 2.0351, + "step": 11636 + }, + { + "epoch": 0.4923005330400203, + "grad_norm": 0.2127770632505417, + "learning_rate": 0.001, + "loss": 2.0751, + "step": 11637 + }, + { + "epoch": 0.49234283780353666, + "grad_norm": 0.6651235222816467, + "learning_rate": 0.001, + "loss": 2.5762, + "step": 11638 + }, + { + "epoch": 0.49238514256705307, + "grad_norm": 0.18715116381645203, + "learning_rate": 0.001, + "loss": 1.8795, + "step": 11639 + }, + { + "epoch": 0.4924274473305694, + "grad_norm": 0.45163238048553467, + "learning_rate": 0.001, + "loss": 2.6674, + "step": 11640 + }, + { + "epoch": 0.4924697520940858, + "grad_norm": 0.2655963599681854, + "learning_rate": 0.001, + "loss": 1.9527, + "step": 11641 + }, + { + "epoch": 0.4925120568576022, + "grad_norm": 0.2013363540172577, + "learning_rate": 0.001, + "loss": 3.415, + "step": 11642 + }, + { + "epoch": 0.49255436162111854, + "grad_norm": 0.20579899847507477, + "learning_rate": 0.001, + "loss": 2.5534, + "step": 11643 + }, + { + "epoch": 0.4925966663846349, + "grad_norm": 0.6283543705940247, + "learning_rate": 0.001, + "loss": 2.4812, + "step": 11644 + }, + { + "epoch": 0.4926389711481513, + "grad_norm": 0.19560718536376953, + "learning_rate": 0.001, + "loss": 1.6794, + "step": 11645 + }, + { + "epoch": 0.49268127591166766, + "grad_norm": 0.20248031616210938, + "learning_rate": 0.001, + "loss": 2.4745, + "step": 11646 + }, + { + "epoch": 0.492723580675184, + "grad_norm": 0.20276713371276855, + "learning_rate": 0.001, + "loss": 2.1166, + "step": 11647 + }, + { + "epoch": 0.4927658854387004, + "grad_norm": 0.21462062001228333, + "learning_rate": 0.001, + "loss": 2.0011, + "step": 11648 + }, + { + "epoch": 0.4928081902022168, + "grad_norm": 0.1634545773267746, + "learning_rate": 0.001, + "loss": 1.8337, + "step": 11649 + }, + { + "epoch": 0.49285049496573313, + "grad_norm": 0.18558084964752197, + "learning_rate": 0.001, + "loss": 1.4515, + "step": 11650 + }, + { + "epoch": 0.4928927997292495, + "grad_norm": 0.21327702701091766, + "learning_rate": 0.001, + "loss": 2.023, + "step": 11651 + }, + { + "epoch": 0.4929351044927659, + "grad_norm": 0.18165309727191925, + "learning_rate": 0.001, + "loss": 2.1323, + "step": 11652 + }, + { + "epoch": 0.49297740925628225, + "grad_norm": 0.16911716759204865, + "learning_rate": 0.001, + "loss": 2.1337, + "step": 11653 + }, + { + "epoch": 0.4930197140197986, + "grad_norm": 0.7974897623062134, + "learning_rate": 0.001, + "loss": 3.1178, + "step": 11654 + }, + { + "epoch": 0.493062018783315, + "grad_norm": 0.2036375254392624, + "learning_rate": 0.001, + "loss": 2.142, + "step": 11655 + }, + { + "epoch": 0.49310432354683137, + "grad_norm": 0.21488189697265625, + "learning_rate": 0.001, + "loss": 2.9894, + "step": 11656 + }, + { + "epoch": 0.4931466283103477, + "grad_norm": 0.1535966992378235, + "learning_rate": 0.001, + "loss": 2.5775, + "step": 11657 + }, + { + "epoch": 0.49318893307386413, + "grad_norm": 0.19664905965328217, + "learning_rate": 0.001, + "loss": 2.1606, + "step": 11658 + }, + { + "epoch": 0.4932312378373805, + "grad_norm": 0.45215147733688354, + "learning_rate": 0.001, + "loss": 1.9469, + "step": 11659 + }, + { + "epoch": 0.49327354260089684, + "grad_norm": 0.17420543730258942, + "learning_rate": 0.001, + "loss": 1.9453, + "step": 11660 + }, + { + "epoch": 0.49331584736441325, + "grad_norm": 0.1706647127866745, + "learning_rate": 0.001, + "loss": 2.4864, + "step": 11661 + }, + { + "epoch": 0.4933581521279296, + "grad_norm": 0.17685231566429138, + "learning_rate": 0.001, + "loss": 2.1576, + "step": 11662 + }, + { + "epoch": 0.49340045689144596, + "grad_norm": 0.14751426875591278, + "learning_rate": 0.001, + "loss": 2.7795, + "step": 11663 + }, + { + "epoch": 0.49344276165496237, + "grad_norm": 0.33146944642066956, + "learning_rate": 0.001, + "loss": 2.1473, + "step": 11664 + }, + { + "epoch": 0.4934850664184787, + "grad_norm": 0.1822434514760971, + "learning_rate": 0.001, + "loss": 2.3431, + "step": 11665 + }, + { + "epoch": 0.4935273711819951, + "grad_norm": 0.17229153215885162, + "learning_rate": 0.001, + "loss": 1.9659, + "step": 11666 + }, + { + "epoch": 0.4935696759455115, + "grad_norm": 0.17021816968917847, + "learning_rate": 0.001, + "loss": 2.4419, + "step": 11667 + }, + { + "epoch": 0.49361198070902784, + "grad_norm": 0.15960778295993805, + "learning_rate": 0.001, + "loss": 1.9672, + "step": 11668 + }, + { + "epoch": 0.4936542854725442, + "grad_norm": 0.28872665762901306, + "learning_rate": 0.001, + "loss": 4.1413, + "step": 11669 + }, + { + "epoch": 0.4936965902360606, + "grad_norm": 0.16721804440021515, + "learning_rate": 0.001, + "loss": 1.9916, + "step": 11670 + }, + { + "epoch": 0.49373889499957696, + "grad_norm": 0.16407166421413422, + "learning_rate": 0.001, + "loss": 1.6337, + "step": 11671 + }, + { + "epoch": 0.4937811997630933, + "grad_norm": 0.16153931617736816, + "learning_rate": 0.001, + "loss": 1.9922, + "step": 11672 + }, + { + "epoch": 0.49382350452660967, + "grad_norm": 0.14842648804187775, + "learning_rate": 0.001, + "loss": 1.9083, + "step": 11673 + }, + { + "epoch": 0.4938658092901261, + "grad_norm": 0.301229327917099, + "learning_rate": 0.001, + "loss": 2.2091, + "step": 11674 + }, + { + "epoch": 0.49390811405364243, + "grad_norm": 0.1671200543642044, + "learning_rate": 0.001, + "loss": 2.6286, + "step": 11675 + }, + { + "epoch": 0.4939504188171588, + "grad_norm": 0.1930837333202362, + "learning_rate": 0.001, + "loss": 2.7532, + "step": 11676 + }, + { + "epoch": 0.4939927235806752, + "grad_norm": 0.1453826129436493, + "learning_rate": 0.001, + "loss": 2.1149, + "step": 11677 + }, + { + "epoch": 0.49403502834419155, + "grad_norm": 0.16736523807048798, + "learning_rate": 0.001, + "loss": 2.0743, + "step": 11678 + }, + { + "epoch": 0.4940773331077079, + "grad_norm": 0.14059995114803314, + "learning_rate": 0.001, + "loss": 1.8672, + "step": 11679 + }, + { + "epoch": 0.4941196378712243, + "grad_norm": 0.8272671699523926, + "learning_rate": 0.001, + "loss": 2.879, + "step": 11680 + }, + { + "epoch": 0.49416194263474067, + "grad_norm": 0.1517970710992813, + "learning_rate": 0.001, + "loss": 2.3603, + "step": 11681 + }, + { + "epoch": 0.494204247398257, + "grad_norm": 0.19491681456565857, + "learning_rate": 0.001, + "loss": 1.7711, + "step": 11682 + }, + { + "epoch": 0.49424655216177343, + "grad_norm": 0.16503435373306274, + "learning_rate": 0.001, + "loss": 2.2452, + "step": 11683 + }, + { + "epoch": 0.4942888569252898, + "grad_norm": 0.2520374059677124, + "learning_rate": 0.001, + "loss": 2.4305, + "step": 11684 + }, + { + "epoch": 0.49433116168880614, + "grad_norm": 0.325879842042923, + "learning_rate": 0.001, + "loss": 1.5981, + "step": 11685 + }, + { + "epoch": 0.49437346645232255, + "grad_norm": 0.17779172956943512, + "learning_rate": 0.001, + "loss": 2.5801, + "step": 11686 + }, + { + "epoch": 0.4944157712158389, + "grad_norm": 0.21204549074172974, + "learning_rate": 0.001, + "loss": 2.7278, + "step": 11687 + }, + { + "epoch": 0.49445807597935526, + "grad_norm": 0.21863149106502533, + "learning_rate": 0.001, + "loss": 2.3487, + "step": 11688 + }, + { + "epoch": 0.49450038074287167, + "grad_norm": 0.2039804309606552, + "learning_rate": 0.001, + "loss": 2.3773, + "step": 11689 + }, + { + "epoch": 0.494542685506388, + "grad_norm": 0.39557844400405884, + "learning_rate": 0.001, + "loss": 1.7114, + "step": 11690 + }, + { + "epoch": 0.4945849902699044, + "grad_norm": 0.28419822454452515, + "learning_rate": 0.001, + "loss": 1.9244, + "step": 11691 + }, + { + "epoch": 0.4946272950334208, + "grad_norm": 3.456125020980835, + "learning_rate": 0.001, + "loss": 2.3652, + "step": 11692 + }, + { + "epoch": 0.49466959979693714, + "grad_norm": 0.1735409051179886, + "learning_rate": 0.001, + "loss": 1.5861, + "step": 11693 + }, + { + "epoch": 0.4947119045604535, + "grad_norm": 0.2236192673444748, + "learning_rate": 0.001, + "loss": 2.8274, + "step": 11694 + }, + { + "epoch": 0.49475420932396985, + "grad_norm": 4.79661226272583, + "learning_rate": 0.001, + "loss": 2.3105, + "step": 11695 + }, + { + "epoch": 0.49479651408748626, + "grad_norm": 0.15469114482402802, + "learning_rate": 0.001, + "loss": 1.5461, + "step": 11696 + }, + { + "epoch": 0.4948388188510026, + "grad_norm": 0.383112370967865, + "learning_rate": 0.001, + "loss": 1.8817, + "step": 11697 + }, + { + "epoch": 0.49488112361451897, + "grad_norm": 0.17388460040092468, + "learning_rate": 0.001, + "loss": 2.0987, + "step": 11698 + }, + { + "epoch": 0.4949234283780354, + "grad_norm": 0.21153829991817474, + "learning_rate": 0.001, + "loss": 2.1188, + "step": 11699 + }, + { + "epoch": 0.49496573314155173, + "grad_norm": 0.19076654314994812, + "learning_rate": 0.001, + "loss": 2.7168, + "step": 11700 + }, + { + "epoch": 0.4950080379050681, + "grad_norm": 0.17775197327136993, + "learning_rate": 0.001, + "loss": 2.3872, + "step": 11701 + }, + { + "epoch": 0.4950503426685845, + "grad_norm": 0.18127349019050598, + "learning_rate": 0.001, + "loss": 2.3911, + "step": 11702 + }, + { + "epoch": 0.49509264743210085, + "grad_norm": 0.20719602704048157, + "learning_rate": 0.001, + "loss": 2.4219, + "step": 11703 + }, + { + "epoch": 0.4951349521956172, + "grad_norm": 1.0996108055114746, + "learning_rate": 0.001, + "loss": 2.0785, + "step": 11704 + }, + { + "epoch": 0.4951772569591336, + "grad_norm": 0.20830421149730682, + "learning_rate": 0.001, + "loss": 2.3977, + "step": 11705 + }, + { + "epoch": 0.49521956172264997, + "grad_norm": 0.16215115785598755, + "learning_rate": 0.001, + "loss": 2.1351, + "step": 11706 + }, + { + "epoch": 0.4952618664861663, + "grad_norm": 0.18560096621513367, + "learning_rate": 0.001, + "loss": 2.045, + "step": 11707 + }, + { + "epoch": 0.49530417124968273, + "grad_norm": 1.037510871887207, + "learning_rate": 0.001, + "loss": 3.0518, + "step": 11708 + }, + { + "epoch": 0.4953464760131991, + "grad_norm": 0.197851300239563, + "learning_rate": 0.001, + "loss": 2.118, + "step": 11709 + }, + { + "epoch": 0.49538878077671544, + "grad_norm": 0.38033074140548706, + "learning_rate": 0.001, + "loss": 1.8406, + "step": 11710 + }, + { + "epoch": 0.49543108554023185, + "grad_norm": 0.17631156742572784, + "learning_rate": 0.001, + "loss": 1.9683, + "step": 11711 + }, + { + "epoch": 0.4954733903037482, + "grad_norm": 51.784454345703125, + "learning_rate": 0.001, + "loss": 2.3855, + "step": 11712 + }, + { + "epoch": 0.49551569506726456, + "grad_norm": 0.14018891751766205, + "learning_rate": 0.001, + "loss": 2.3842, + "step": 11713 + }, + { + "epoch": 0.49555799983078097, + "grad_norm": 0.2898732125759125, + "learning_rate": 0.001, + "loss": 2.951, + "step": 11714 + }, + { + "epoch": 0.4956003045942973, + "grad_norm": 0.1552821397781372, + "learning_rate": 0.001, + "loss": 1.5485, + "step": 11715 + }, + { + "epoch": 0.4956426093578137, + "grad_norm": 0.2228671759366989, + "learning_rate": 0.001, + "loss": 3.3397, + "step": 11716 + }, + { + "epoch": 0.4956849141213301, + "grad_norm": 0.17871029675006866, + "learning_rate": 0.001, + "loss": 2.4764, + "step": 11717 + }, + { + "epoch": 0.49572721888484644, + "grad_norm": 0.15905271470546722, + "learning_rate": 0.001, + "loss": 2.8845, + "step": 11718 + }, + { + "epoch": 0.4957695236483628, + "grad_norm": 0.19031955301761627, + "learning_rate": 0.001, + "loss": 2.0218, + "step": 11719 + }, + { + "epoch": 0.49581182841187915, + "grad_norm": 0.1882801651954651, + "learning_rate": 0.001, + "loss": 2.6585, + "step": 11720 + }, + { + "epoch": 0.49585413317539556, + "grad_norm": 0.19072453677654266, + "learning_rate": 0.001, + "loss": 1.9483, + "step": 11721 + }, + { + "epoch": 0.4958964379389119, + "grad_norm": 0.1833895891904831, + "learning_rate": 0.001, + "loss": 1.8572, + "step": 11722 + }, + { + "epoch": 0.49593874270242827, + "grad_norm": 0.1797013133764267, + "learning_rate": 0.001, + "loss": 1.9646, + "step": 11723 + }, + { + "epoch": 0.4959810474659447, + "grad_norm": 0.15591877698898315, + "learning_rate": 0.001, + "loss": 1.7613, + "step": 11724 + }, + { + "epoch": 0.49602335222946103, + "grad_norm": 0.19704052805900574, + "learning_rate": 0.001, + "loss": 2.5827, + "step": 11725 + }, + { + "epoch": 0.4960656569929774, + "grad_norm": 0.14575530588626862, + "learning_rate": 0.001, + "loss": 1.9924, + "step": 11726 + }, + { + "epoch": 0.4961079617564938, + "grad_norm": 0.27594003081321716, + "learning_rate": 0.001, + "loss": 2.2579, + "step": 11727 + }, + { + "epoch": 0.49615026652001015, + "grad_norm": 0.4626797139644623, + "learning_rate": 0.001, + "loss": 1.6564, + "step": 11728 + }, + { + "epoch": 0.4961925712835265, + "grad_norm": 0.16353465616703033, + "learning_rate": 0.001, + "loss": 2.0006, + "step": 11729 + }, + { + "epoch": 0.4962348760470429, + "grad_norm": 0.12566418945789337, + "learning_rate": 0.001, + "loss": 1.9094, + "step": 11730 + }, + { + "epoch": 0.49627718081055927, + "grad_norm": 0.24055039882659912, + "learning_rate": 0.001, + "loss": 2.1373, + "step": 11731 + }, + { + "epoch": 0.4963194855740756, + "grad_norm": 0.5243616104125977, + "learning_rate": 0.001, + "loss": 1.875, + "step": 11732 + }, + { + "epoch": 0.49636179033759203, + "grad_norm": 0.2162822037935257, + "learning_rate": 0.001, + "loss": 1.5605, + "step": 11733 + }, + { + "epoch": 0.4964040951011084, + "grad_norm": 0.3283466398715973, + "learning_rate": 0.001, + "loss": 1.8846, + "step": 11734 + }, + { + "epoch": 0.49644639986462474, + "grad_norm": 0.20547711849212646, + "learning_rate": 0.001, + "loss": 2.197, + "step": 11735 + }, + { + "epoch": 0.49648870462814115, + "grad_norm": 0.26368358731269836, + "learning_rate": 0.001, + "loss": 3.4798, + "step": 11736 + }, + { + "epoch": 0.4965310093916575, + "grad_norm": 0.6923643350601196, + "learning_rate": 0.001, + "loss": 2.3685, + "step": 11737 + }, + { + "epoch": 0.49657331415517386, + "grad_norm": 0.21621890366077423, + "learning_rate": 0.001, + "loss": 2.4319, + "step": 11738 + }, + { + "epoch": 0.49661561891869027, + "grad_norm": 0.16339389979839325, + "learning_rate": 0.001, + "loss": 2.7695, + "step": 11739 + }, + { + "epoch": 0.4966579236822066, + "grad_norm": 0.16505463421344757, + "learning_rate": 0.001, + "loss": 2.034, + "step": 11740 + }, + { + "epoch": 0.496700228445723, + "grad_norm": 0.1787840873003006, + "learning_rate": 0.001, + "loss": 2.2629, + "step": 11741 + }, + { + "epoch": 0.49674253320923933, + "grad_norm": 0.34870707988739014, + "learning_rate": 0.001, + "loss": 2.0484, + "step": 11742 + }, + { + "epoch": 0.49678483797275574, + "grad_norm": 0.20475776493549347, + "learning_rate": 0.001, + "loss": 2.0176, + "step": 11743 + }, + { + "epoch": 0.4968271427362721, + "grad_norm": 3.398766040802002, + "learning_rate": 0.001, + "loss": 1.8543, + "step": 11744 + }, + { + "epoch": 0.49686944749978845, + "grad_norm": 0.15450245141983032, + "learning_rate": 0.001, + "loss": 2.6702, + "step": 11745 + }, + { + "epoch": 0.49691175226330486, + "grad_norm": 0.58338463306427, + "learning_rate": 0.001, + "loss": 3.001, + "step": 11746 + }, + { + "epoch": 0.4969540570268212, + "grad_norm": 0.1919245570898056, + "learning_rate": 0.001, + "loss": 2.2014, + "step": 11747 + }, + { + "epoch": 0.49699636179033757, + "grad_norm": 0.2006487101316452, + "learning_rate": 0.001, + "loss": 1.9795, + "step": 11748 + }, + { + "epoch": 0.497038666553854, + "grad_norm": 2.63030743598938, + "learning_rate": 0.001, + "loss": 2.3299, + "step": 11749 + }, + { + "epoch": 0.49708097131737033, + "grad_norm": 0.16151180863380432, + "learning_rate": 0.001, + "loss": 1.5329, + "step": 11750 + }, + { + "epoch": 0.4971232760808867, + "grad_norm": 0.2099246382713318, + "learning_rate": 0.001, + "loss": 1.9674, + "step": 11751 + }, + { + "epoch": 0.4971655808444031, + "grad_norm": 0.18836641311645508, + "learning_rate": 0.001, + "loss": 2.001, + "step": 11752 + }, + { + "epoch": 0.49720788560791945, + "grad_norm": 0.17593342065811157, + "learning_rate": 0.001, + "loss": 2.0003, + "step": 11753 + }, + { + "epoch": 0.4972501903714358, + "grad_norm": 0.21019577980041504, + "learning_rate": 0.001, + "loss": 3.3791, + "step": 11754 + }, + { + "epoch": 0.4972924951349522, + "grad_norm": 0.19833973050117493, + "learning_rate": 0.001, + "loss": 1.5016, + "step": 11755 + }, + { + "epoch": 0.49733479989846857, + "grad_norm": 0.1627396196126938, + "learning_rate": 0.001, + "loss": 1.9796, + "step": 11756 + }, + { + "epoch": 0.4973771046619849, + "grad_norm": 0.18876367807388306, + "learning_rate": 0.001, + "loss": 2.0286, + "step": 11757 + }, + { + "epoch": 0.49741940942550134, + "grad_norm": 0.2460365891456604, + "learning_rate": 0.001, + "loss": 2.237, + "step": 11758 + }, + { + "epoch": 0.4974617141890177, + "grad_norm": 0.34945163130760193, + "learning_rate": 0.001, + "loss": 1.7602, + "step": 11759 + }, + { + "epoch": 0.49750401895253404, + "grad_norm": 0.24172699451446533, + "learning_rate": 0.001, + "loss": 3.5869, + "step": 11760 + }, + { + "epoch": 0.49754632371605045, + "grad_norm": 0.19941669702529907, + "learning_rate": 0.001, + "loss": 1.7282, + "step": 11761 + }, + { + "epoch": 0.4975886284795668, + "grad_norm": 0.16542641818523407, + "learning_rate": 0.001, + "loss": 2.2762, + "step": 11762 + }, + { + "epoch": 0.49763093324308316, + "grad_norm": 0.18127164244651794, + "learning_rate": 0.001, + "loss": 1.7135, + "step": 11763 + }, + { + "epoch": 0.4976732380065995, + "grad_norm": 0.16587230563163757, + "learning_rate": 0.001, + "loss": 1.7269, + "step": 11764 + }, + { + "epoch": 0.4977155427701159, + "grad_norm": 0.15559205412864685, + "learning_rate": 0.001, + "loss": 2.0026, + "step": 11765 + }, + { + "epoch": 0.4977578475336323, + "grad_norm": 0.1646786332130432, + "learning_rate": 0.001, + "loss": 2.0569, + "step": 11766 + }, + { + "epoch": 0.49780015229714863, + "grad_norm": 0.1686537265777588, + "learning_rate": 0.001, + "loss": 2.0358, + "step": 11767 + }, + { + "epoch": 0.49784245706066504, + "grad_norm": 0.20397379994392395, + "learning_rate": 0.001, + "loss": 2.5713, + "step": 11768 + }, + { + "epoch": 0.4978847618241814, + "grad_norm": 0.2577773928642273, + "learning_rate": 0.001, + "loss": 1.9993, + "step": 11769 + }, + { + "epoch": 0.49792706658769775, + "grad_norm": 0.1761125773191452, + "learning_rate": 0.001, + "loss": 2.0988, + "step": 11770 + }, + { + "epoch": 0.49796937135121416, + "grad_norm": 0.3869670629501343, + "learning_rate": 0.001, + "loss": 2.7036, + "step": 11771 + }, + { + "epoch": 0.4980116761147305, + "grad_norm": 0.205720916390419, + "learning_rate": 0.001, + "loss": 2.7888, + "step": 11772 + }, + { + "epoch": 0.49805398087824687, + "grad_norm": 0.17144684493541718, + "learning_rate": 0.001, + "loss": 2.9138, + "step": 11773 + }, + { + "epoch": 0.4980962856417633, + "grad_norm": 0.4699496626853943, + "learning_rate": 0.001, + "loss": 2.1829, + "step": 11774 + }, + { + "epoch": 0.49813859040527964, + "grad_norm": 0.19264104962348938, + "learning_rate": 0.001, + "loss": 1.7184, + "step": 11775 + }, + { + "epoch": 0.498180895168796, + "grad_norm": 0.21069301664829254, + "learning_rate": 0.001, + "loss": 1.8648, + "step": 11776 + }, + { + "epoch": 0.4982231999323124, + "grad_norm": 0.17750690877437592, + "learning_rate": 0.001, + "loss": 2.6243, + "step": 11777 + }, + { + "epoch": 0.49826550469582875, + "grad_norm": 0.17451412975788116, + "learning_rate": 0.001, + "loss": 1.9341, + "step": 11778 + }, + { + "epoch": 0.4983078094593451, + "grad_norm": 0.18429610133171082, + "learning_rate": 0.001, + "loss": 1.8176, + "step": 11779 + }, + { + "epoch": 0.4983501142228615, + "grad_norm": 0.18034300208091736, + "learning_rate": 0.001, + "loss": 1.4398, + "step": 11780 + }, + { + "epoch": 0.49839241898637787, + "grad_norm": 0.2214241474866867, + "learning_rate": 0.001, + "loss": 2.8547, + "step": 11781 + }, + { + "epoch": 0.4984347237498942, + "grad_norm": 0.3767451047897339, + "learning_rate": 0.001, + "loss": 1.8602, + "step": 11782 + }, + { + "epoch": 0.49847702851341064, + "grad_norm": 0.16168661415576935, + "learning_rate": 0.001, + "loss": 1.378, + "step": 11783 + }, + { + "epoch": 0.498519333276927, + "grad_norm": 0.19141288101673126, + "learning_rate": 0.001, + "loss": 2.4181, + "step": 11784 + }, + { + "epoch": 0.49856163804044334, + "grad_norm": 0.17355014383792877, + "learning_rate": 0.001, + "loss": 1.8218, + "step": 11785 + }, + { + "epoch": 0.4986039428039597, + "grad_norm": 0.15057973563671112, + "learning_rate": 0.001, + "loss": 2.808, + "step": 11786 + }, + { + "epoch": 0.4986462475674761, + "grad_norm": 1.3171411752700806, + "learning_rate": 0.001, + "loss": 2.5091, + "step": 11787 + }, + { + "epoch": 0.49868855233099246, + "grad_norm": 0.20347653329372406, + "learning_rate": 0.001, + "loss": 2.2627, + "step": 11788 + }, + { + "epoch": 0.4987308570945088, + "grad_norm": 0.189688041806221, + "learning_rate": 0.001, + "loss": 2.2906, + "step": 11789 + }, + { + "epoch": 0.4987731618580252, + "grad_norm": 3.7641372680664062, + "learning_rate": 0.001, + "loss": 2.4856, + "step": 11790 + }, + { + "epoch": 0.4988154666215416, + "grad_norm": 0.2324923574924469, + "learning_rate": 0.001, + "loss": 3.4146, + "step": 11791 + }, + { + "epoch": 0.49885777138505794, + "grad_norm": 0.9900776147842407, + "learning_rate": 0.001, + "loss": 1.8225, + "step": 11792 + }, + { + "epoch": 0.49890007614857435, + "grad_norm": 0.4273008406162262, + "learning_rate": 0.001, + "loss": 2.9934, + "step": 11793 + }, + { + "epoch": 0.4989423809120907, + "grad_norm": 0.14736799895763397, + "learning_rate": 0.001, + "loss": 2.5134, + "step": 11794 + }, + { + "epoch": 0.49898468567560705, + "grad_norm": 2.7909343242645264, + "learning_rate": 0.001, + "loss": 1.9295, + "step": 11795 + }, + { + "epoch": 0.49902699043912346, + "grad_norm": 0.27986812591552734, + "learning_rate": 0.001, + "loss": 3.138, + "step": 11796 + }, + { + "epoch": 0.4990692952026398, + "grad_norm": 0.18478405475616455, + "learning_rate": 0.001, + "loss": 1.9531, + "step": 11797 + }, + { + "epoch": 0.49911159996615617, + "grad_norm": 0.18496938049793243, + "learning_rate": 0.001, + "loss": 2.3311, + "step": 11798 + }, + { + "epoch": 0.4991539047296726, + "grad_norm": 0.18009823560714722, + "learning_rate": 0.001, + "loss": 2.0114, + "step": 11799 + }, + { + "epoch": 0.49919620949318894, + "grad_norm": 0.23303596675395966, + "learning_rate": 0.001, + "loss": 1.9756, + "step": 11800 + }, + { + "epoch": 0.4992385142567053, + "grad_norm": 0.1994001865386963, + "learning_rate": 0.001, + "loss": 2.563, + "step": 11801 + }, + { + "epoch": 0.4992808190202217, + "grad_norm": 0.2099967747926712, + "learning_rate": 0.001, + "loss": 1.5605, + "step": 11802 + }, + { + "epoch": 0.49932312378373805, + "grad_norm": 0.34757867455482483, + "learning_rate": 0.001, + "loss": 3.3986, + "step": 11803 + }, + { + "epoch": 0.4993654285472544, + "grad_norm": 0.15620851516723633, + "learning_rate": 0.001, + "loss": 2.5944, + "step": 11804 + }, + { + "epoch": 0.4994077333107708, + "grad_norm": 0.2044965624809265, + "learning_rate": 0.001, + "loss": 3.5678, + "step": 11805 + }, + { + "epoch": 0.4994500380742872, + "grad_norm": 0.1616724729537964, + "learning_rate": 0.001, + "loss": 2.2451, + "step": 11806 + }, + { + "epoch": 0.4994923428378035, + "grad_norm": 0.21824952960014343, + "learning_rate": 0.001, + "loss": 3.1134, + "step": 11807 + }, + { + "epoch": 0.4995346476013199, + "grad_norm": 0.19981542229652405, + "learning_rate": 0.001, + "loss": 2.2409, + "step": 11808 + }, + { + "epoch": 0.4995769523648363, + "grad_norm": 0.538329541683197, + "learning_rate": 0.001, + "loss": 1.8192, + "step": 11809 + }, + { + "epoch": 0.49961925712835265, + "grad_norm": 0.18705959618091583, + "learning_rate": 0.001, + "loss": 2.367, + "step": 11810 + }, + { + "epoch": 0.499661561891869, + "grad_norm": 1.9119819402694702, + "learning_rate": 0.001, + "loss": 1.9958, + "step": 11811 + }, + { + "epoch": 0.4997038666553854, + "grad_norm": 0.1916486769914627, + "learning_rate": 0.001, + "loss": 2.6751, + "step": 11812 + }, + { + "epoch": 0.49974617141890176, + "grad_norm": 0.17251789569854736, + "learning_rate": 0.001, + "loss": 1.6438, + "step": 11813 + }, + { + "epoch": 0.4997884761824181, + "grad_norm": 0.2549959421157837, + "learning_rate": 0.001, + "loss": 2.6528, + "step": 11814 + }, + { + "epoch": 0.4998307809459345, + "grad_norm": 0.16786809265613556, + "learning_rate": 0.001, + "loss": 1.7551, + "step": 11815 + }, + { + "epoch": 0.4998730857094509, + "grad_norm": 0.22102710604667664, + "learning_rate": 0.001, + "loss": 1.6668, + "step": 11816 + }, + { + "epoch": 0.49991539047296724, + "grad_norm": 0.2043444961309433, + "learning_rate": 0.001, + "loss": 2.0156, + "step": 11817 + }, + { + "epoch": 0.49995769523648365, + "grad_norm": 0.1693449169397354, + "learning_rate": 0.001, + "loss": 1.7542, + "step": 11818 + }, + { + "epoch": 0.5, + "grad_norm": 0.18922848999500275, + "learning_rate": 0.001, + "loss": 3.11, + "step": 11819 + }, + { + "epoch": 0.5000423047635164, + "grad_norm": 0.19555483758449554, + "learning_rate": 0.001, + "loss": 3.5945, + "step": 11820 + }, + { + "epoch": 0.5000846095270327, + "grad_norm": 0.22335830330848694, + "learning_rate": 0.001, + "loss": 3.515, + "step": 11821 + }, + { + "epoch": 0.5001269142905491, + "grad_norm": 0.17894285917282104, + "learning_rate": 0.001, + "loss": 1.6961, + "step": 11822 + }, + { + "epoch": 0.5001692190540655, + "grad_norm": 0.16279856860637665, + "learning_rate": 0.001, + "loss": 2.7065, + "step": 11823 + }, + { + "epoch": 0.5002115238175818, + "grad_norm": 0.2117103487253189, + "learning_rate": 0.001, + "loss": 2.481, + "step": 11824 + }, + { + "epoch": 0.5002538285810982, + "grad_norm": 0.1833728402853012, + "learning_rate": 0.001, + "loss": 2.1502, + "step": 11825 + }, + { + "epoch": 0.5002961333446146, + "grad_norm": 0.1608177125453949, + "learning_rate": 0.001, + "loss": 1.8915, + "step": 11826 + }, + { + "epoch": 0.500338438108131, + "grad_norm": 0.15755756199359894, + "learning_rate": 0.001, + "loss": 2.0134, + "step": 11827 + }, + { + "epoch": 0.5003807428716474, + "grad_norm": 1.6536004543304443, + "learning_rate": 0.001, + "loss": 2.4632, + "step": 11828 + }, + { + "epoch": 0.5004230476351638, + "grad_norm": 0.17885924875736237, + "learning_rate": 0.001, + "loss": 2.8906, + "step": 11829 + }, + { + "epoch": 0.5004653523986801, + "grad_norm": 0.17914816737174988, + "learning_rate": 0.001, + "loss": 1.82, + "step": 11830 + }, + { + "epoch": 0.5005076571621965, + "grad_norm": 0.4052477478981018, + "learning_rate": 0.001, + "loss": 1.7167, + "step": 11831 + }, + { + "epoch": 0.5005499619257129, + "grad_norm": 0.18463678658008575, + "learning_rate": 0.001, + "loss": 2.2511, + "step": 11832 + }, + { + "epoch": 0.5005922666892292, + "grad_norm": 0.194928377866745, + "learning_rate": 0.001, + "loss": 1.9792, + "step": 11833 + }, + { + "epoch": 0.5006345714527456, + "grad_norm": 0.17245490849018097, + "learning_rate": 0.001, + "loss": 1.9821, + "step": 11834 + }, + { + "epoch": 0.500676876216262, + "grad_norm": 0.1762954294681549, + "learning_rate": 0.001, + "loss": 1.6249, + "step": 11835 + }, + { + "epoch": 0.5007191809797783, + "grad_norm": 0.16809524595737457, + "learning_rate": 0.001, + "loss": 1.6515, + "step": 11836 + }, + { + "epoch": 0.5007614857432947, + "grad_norm": 0.2633932828903198, + "learning_rate": 0.001, + "loss": 2.8323, + "step": 11837 + }, + { + "epoch": 0.5008037905068111, + "grad_norm": 1.2055695056915283, + "learning_rate": 0.001, + "loss": 2.7611, + "step": 11838 + }, + { + "epoch": 0.5008460952703274, + "grad_norm": 0.5635671019554138, + "learning_rate": 0.001, + "loss": 2.3381, + "step": 11839 + }, + { + "epoch": 0.5008884000338438, + "grad_norm": 0.22210033237934113, + "learning_rate": 0.001, + "loss": 2.4092, + "step": 11840 + }, + { + "epoch": 0.5009307047973602, + "grad_norm": 0.3583562672138214, + "learning_rate": 0.001, + "loss": 2.3941, + "step": 11841 + }, + { + "epoch": 0.5009730095608765, + "grad_norm": 1.5744633674621582, + "learning_rate": 0.001, + "loss": 2.1961, + "step": 11842 + }, + { + "epoch": 0.501015314324393, + "grad_norm": 0.19284333288669586, + "learning_rate": 0.001, + "loss": 2.6481, + "step": 11843 + }, + { + "epoch": 0.5010576190879092, + "grad_norm": 0.16375435888767242, + "learning_rate": 0.001, + "loss": 3.0202, + "step": 11844 + }, + { + "epoch": 0.5010999238514257, + "grad_norm": 0.1810348629951477, + "learning_rate": 0.001, + "loss": 2.0206, + "step": 11845 + }, + { + "epoch": 0.5011422286149421, + "grad_norm": 0.15742111206054688, + "learning_rate": 0.001, + "loss": 2.0867, + "step": 11846 + }, + { + "epoch": 0.5011845333784584, + "grad_norm": 0.18012715876102448, + "learning_rate": 0.001, + "loss": 1.8844, + "step": 11847 + }, + { + "epoch": 0.5012268381419748, + "grad_norm": 0.21920199692249298, + "learning_rate": 0.001, + "loss": 2.0386, + "step": 11848 + }, + { + "epoch": 0.5012691429054912, + "grad_norm": 0.20530590415000916, + "learning_rate": 0.001, + "loss": 2.7012, + "step": 11849 + }, + { + "epoch": 0.5013114476690075, + "grad_norm": 0.1777116060256958, + "learning_rate": 0.001, + "loss": 2.2618, + "step": 11850 + }, + { + "epoch": 0.5013537524325239, + "grad_norm": 0.9319034814834595, + "learning_rate": 0.001, + "loss": 2.1148, + "step": 11851 + }, + { + "epoch": 0.5013960571960403, + "grad_norm": 0.22064313292503357, + "learning_rate": 0.001, + "loss": 2.2856, + "step": 11852 + }, + { + "epoch": 0.5014383619595566, + "grad_norm": 0.19049854576587677, + "learning_rate": 0.001, + "loss": 2.2153, + "step": 11853 + }, + { + "epoch": 0.501480666723073, + "grad_norm": 0.1864825040102005, + "learning_rate": 0.001, + "loss": 2.225, + "step": 11854 + }, + { + "epoch": 0.5015229714865894, + "grad_norm": 0.20713578164577484, + "learning_rate": 0.001, + "loss": 2.5584, + "step": 11855 + }, + { + "epoch": 0.5015652762501057, + "grad_norm": 0.2419637143611908, + "learning_rate": 0.001, + "loss": 1.9168, + "step": 11856 + }, + { + "epoch": 0.5016075810136221, + "grad_norm": 0.18344347178936005, + "learning_rate": 0.001, + "loss": 1.6852, + "step": 11857 + }, + { + "epoch": 0.5016498857771385, + "grad_norm": 3.606316328048706, + "learning_rate": 0.001, + "loss": 1.7524, + "step": 11858 + }, + { + "epoch": 0.5016921905406548, + "grad_norm": 0.1817534863948822, + "learning_rate": 0.001, + "loss": 1.6807, + "step": 11859 + }, + { + "epoch": 0.5017344953041712, + "grad_norm": 0.22895671427249908, + "learning_rate": 0.001, + "loss": 2.1781, + "step": 11860 + }, + { + "epoch": 0.5017768000676877, + "grad_norm": 0.1826825588941574, + "learning_rate": 0.001, + "loss": 1.8662, + "step": 11861 + }, + { + "epoch": 0.501819104831204, + "grad_norm": 0.1994072049856186, + "learning_rate": 0.001, + "loss": 2.2856, + "step": 11862 + }, + { + "epoch": 0.5018614095947204, + "grad_norm": 0.1609574556350708, + "learning_rate": 0.001, + "loss": 2.5509, + "step": 11863 + }, + { + "epoch": 0.5019037143582368, + "grad_norm": 0.28358885645866394, + "learning_rate": 0.001, + "loss": 2.2666, + "step": 11864 + }, + { + "epoch": 0.5019460191217531, + "grad_norm": 0.19718047976493835, + "learning_rate": 0.001, + "loss": 2.8378, + "step": 11865 + }, + { + "epoch": 0.5019883238852695, + "grad_norm": 0.17806696891784668, + "learning_rate": 0.001, + "loss": 2.5277, + "step": 11866 + }, + { + "epoch": 0.5020306286487859, + "grad_norm": 0.22065989673137665, + "learning_rate": 0.001, + "loss": 3.4721, + "step": 11867 + }, + { + "epoch": 0.5020729334123022, + "grad_norm": 1.0695648193359375, + "learning_rate": 0.001, + "loss": 2.6934, + "step": 11868 + }, + { + "epoch": 0.5021152381758186, + "grad_norm": 0.34497925639152527, + "learning_rate": 0.001, + "loss": 2.5293, + "step": 11869 + }, + { + "epoch": 0.502157542939335, + "grad_norm": 0.2196062207221985, + "learning_rate": 0.001, + "loss": 1.7822, + "step": 11870 + }, + { + "epoch": 0.5021998477028513, + "grad_norm": 1.1649377346038818, + "learning_rate": 0.001, + "loss": 2.6756, + "step": 11871 + }, + { + "epoch": 0.5022421524663677, + "grad_norm": 0.18098509311676025, + "learning_rate": 0.001, + "loss": 2.4486, + "step": 11872 + }, + { + "epoch": 0.5022844572298841, + "grad_norm": 0.21399042010307312, + "learning_rate": 0.001, + "loss": 2.5243, + "step": 11873 + }, + { + "epoch": 0.5023267619934004, + "grad_norm": 0.19336672127246857, + "learning_rate": 0.001, + "loss": 1.3858, + "step": 11874 + }, + { + "epoch": 0.5023690667569168, + "grad_norm": 1.1209372282028198, + "learning_rate": 0.001, + "loss": 2.7417, + "step": 11875 + }, + { + "epoch": 0.5024113715204332, + "grad_norm": 0.1669924110174179, + "learning_rate": 0.001, + "loss": 1.8934, + "step": 11876 + }, + { + "epoch": 0.5024536762839495, + "grad_norm": 0.20557548105716705, + "learning_rate": 0.001, + "loss": 1.9532, + "step": 11877 + }, + { + "epoch": 0.502495981047466, + "grad_norm": 1.1194974184036255, + "learning_rate": 0.001, + "loss": 3.5188, + "step": 11878 + }, + { + "epoch": 0.5025382858109824, + "grad_norm": 1.5623433589935303, + "learning_rate": 0.001, + "loss": 2.5858, + "step": 11879 + }, + { + "epoch": 0.5025805905744987, + "grad_norm": 0.32952699065208435, + "learning_rate": 0.001, + "loss": 2.9636, + "step": 11880 + }, + { + "epoch": 0.5026228953380151, + "grad_norm": 0.18571840226650238, + "learning_rate": 0.001, + "loss": 1.854, + "step": 11881 + }, + { + "epoch": 0.5026652001015315, + "grad_norm": 6.811452865600586, + "learning_rate": 0.001, + "loss": 1.903, + "step": 11882 + }, + { + "epoch": 0.5027075048650478, + "grad_norm": 0.1865660697221756, + "learning_rate": 0.001, + "loss": 2.0409, + "step": 11883 + }, + { + "epoch": 0.5027498096285642, + "grad_norm": 0.17085236310958862, + "learning_rate": 0.001, + "loss": 2.2718, + "step": 11884 + }, + { + "epoch": 0.5027921143920806, + "grad_norm": 0.19499720633029938, + "learning_rate": 0.001, + "loss": 2.1248, + "step": 11885 + }, + { + "epoch": 0.5028344191555969, + "grad_norm": 0.1698610633611679, + "learning_rate": 0.001, + "loss": 1.8143, + "step": 11886 + }, + { + "epoch": 0.5028767239191133, + "grad_norm": 0.18589156866073608, + "learning_rate": 0.001, + "loss": 2.1618, + "step": 11887 + }, + { + "epoch": 0.5029190286826296, + "grad_norm": 0.17679286003112793, + "learning_rate": 0.001, + "loss": 2.1766, + "step": 11888 + }, + { + "epoch": 0.502961333446146, + "grad_norm": 0.18507613241672516, + "learning_rate": 0.001, + "loss": 1.8563, + "step": 11889 + }, + { + "epoch": 0.5030036382096624, + "grad_norm": 0.1703113168478012, + "learning_rate": 0.001, + "loss": 1.8142, + "step": 11890 + }, + { + "epoch": 0.5030459429731787, + "grad_norm": 0.1954764574766159, + "learning_rate": 0.001, + "loss": 2.0398, + "step": 11891 + }, + { + "epoch": 0.5030882477366951, + "grad_norm": 0.1964946985244751, + "learning_rate": 0.001, + "loss": 2.9447, + "step": 11892 + }, + { + "epoch": 0.5031305525002115, + "grad_norm": 0.23403804004192352, + "learning_rate": 0.001, + "loss": 2.4022, + "step": 11893 + }, + { + "epoch": 0.5031728572637278, + "grad_norm": 0.20903852581977844, + "learning_rate": 0.001, + "loss": 2.6347, + "step": 11894 + }, + { + "epoch": 0.5032151620272443, + "grad_norm": 0.14716748893260956, + "learning_rate": 0.001, + "loss": 2.6717, + "step": 11895 + }, + { + "epoch": 0.5032574667907607, + "grad_norm": 0.1724325567483902, + "learning_rate": 0.001, + "loss": 2.4629, + "step": 11896 + }, + { + "epoch": 0.503299771554277, + "grad_norm": 2.00115704536438, + "learning_rate": 0.001, + "loss": 2.8087, + "step": 11897 + }, + { + "epoch": 0.5033420763177934, + "grad_norm": 0.3015700876712799, + "learning_rate": 0.001, + "loss": 1.8208, + "step": 11898 + }, + { + "epoch": 0.5033843810813098, + "grad_norm": 0.18735451996326447, + "learning_rate": 0.001, + "loss": 1.9337, + "step": 11899 + }, + { + "epoch": 0.5034266858448261, + "grad_norm": 0.18726201355457306, + "learning_rate": 0.001, + "loss": 2.8956, + "step": 11900 + }, + { + "epoch": 0.5034689906083425, + "grad_norm": 0.22728313505649567, + "learning_rate": 0.001, + "loss": 2.3571, + "step": 11901 + }, + { + "epoch": 0.5035112953718589, + "grad_norm": 0.3538459241390228, + "learning_rate": 0.001, + "loss": 2.0404, + "step": 11902 + }, + { + "epoch": 0.5035536001353752, + "grad_norm": 0.2191903442144394, + "learning_rate": 0.001, + "loss": 2.8035, + "step": 11903 + }, + { + "epoch": 0.5035959048988916, + "grad_norm": 1.2303228378295898, + "learning_rate": 0.001, + "loss": 2.9012, + "step": 11904 + }, + { + "epoch": 0.503638209662408, + "grad_norm": 0.6738932132720947, + "learning_rate": 0.001, + "loss": 2.0589, + "step": 11905 + }, + { + "epoch": 0.5036805144259243, + "grad_norm": 0.18187807500362396, + "learning_rate": 0.001, + "loss": 2.1458, + "step": 11906 + }, + { + "epoch": 0.5037228191894407, + "grad_norm": 0.20437604188919067, + "learning_rate": 0.001, + "loss": 2.5876, + "step": 11907 + }, + { + "epoch": 0.5037651239529571, + "grad_norm": 0.17892666161060333, + "learning_rate": 0.001, + "loss": 2.4489, + "step": 11908 + }, + { + "epoch": 0.5038074287164734, + "grad_norm": 0.1988033950328827, + "learning_rate": 0.001, + "loss": 2.935, + "step": 11909 + }, + { + "epoch": 0.5038497334799898, + "grad_norm": 0.16450054943561554, + "learning_rate": 0.001, + "loss": 1.6965, + "step": 11910 + }, + { + "epoch": 0.5038920382435063, + "grad_norm": 0.1781654804944992, + "learning_rate": 0.001, + "loss": 2.5796, + "step": 11911 + }, + { + "epoch": 0.5039343430070226, + "grad_norm": 0.17082349956035614, + "learning_rate": 0.001, + "loss": 2.9056, + "step": 11912 + }, + { + "epoch": 0.503976647770539, + "grad_norm": 0.23962025344371796, + "learning_rate": 0.001, + "loss": 2.1747, + "step": 11913 + }, + { + "epoch": 0.5040189525340554, + "grad_norm": 0.18529817461967468, + "learning_rate": 0.001, + "loss": 2.4123, + "step": 11914 + }, + { + "epoch": 0.5040612572975717, + "grad_norm": 0.3135690689086914, + "learning_rate": 0.001, + "loss": 1.7927, + "step": 11915 + }, + { + "epoch": 0.5041035620610881, + "grad_norm": 0.7973864674568176, + "learning_rate": 0.001, + "loss": 1.7801, + "step": 11916 + }, + { + "epoch": 0.5041458668246045, + "grad_norm": 0.25282907485961914, + "learning_rate": 0.001, + "loss": 2.5156, + "step": 11917 + }, + { + "epoch": 0.5041881715881208, + "grad_norm": 0.8695905804634094, + "learning_rate": 0.001, + "loss": 2.0658, + "step": 11918 + }, + { + "epoch": 0.5042304763516372, + "grad_norm": 0.1739739030599594, + "learning_rate": 0.001, + "loss": 1.7482, + "step": 11919 + }, + { + "epoch": 0.5042727811151536, + "grad_norm": 0.639236330986023, + "learning_rate": 0.001, + "loss": 1.906, + "step": 11920 + }, + { + "epoch": 0.5043150858786699, + "grad_norm": 0.2065761834383011, + "learning_rate": 0.001, + "loss": 1.6717, + "step": 11921 + }, + { + "epoch": 0.5043573906421863, + "grad_norm": 0.22129188477993011, + "learning_rate": 0.001, + "loss": 2.7184, + "step": 11922 + }, + { + "epoch": 0.5043996954057027, + "grad_norm": 0.16457290947437286, + "learning_rate": 0.001, + "loss": 1.9362, + "step": 11923 + }, + { + "epoch": 0.504442000169219, + "grad_norm": 0.7113840579986572, + "learning_rate": 0.001, + "loss": 1.3661, + "step": 11924 + }, + { + "epoch": 0.5044843049327354, + "grad_norm": 0.16901074349880219, + "learning_rate": 0.001, + "loss": 2.3126, + "step": 11925 + }, + { + "epoch": 0.5045266096962518, + "grad_norm": 0.2004851996898651, + "learning_rate": 0.001, + "loss": 2.6944, + "step": 11926 + }, + { + "epoch": 0.5045689144597681, + "grad_norm": 0.17636452615261078, + "learning_rate": 0.001, + "loss": 2.3345, + "step": 11927 + }, + { + "epoch": 0.5046112192232846, + "grad_norm": 0.16024649143218994, + "learning_rate": 0.001, + "loss": 1.7696, + "step": 11928 + }, + { + "epoch": 0.504653523986801, + "grad_norm": 0.18146340548992157, + "learning_rate": 0.001, + "loss": 2.6534, + "step": 11929 + }, + { + "epoch": 0.5046958287503173, + "grad_norm": 0.19489388167858124, + "learning_rate": 0.001, + "loss": 2.5024, + "step": 11930 + }, + { + "epoch": 0.5047381335138337, + "grad_norm": 0.5140373706817627, + "learning_rate": 0.001, + "loss": 3.1386, + "step": 11931 + }, + { + "epoch": 0.50478043827735, + "grad_norm": 1.8529261350631714, + "learning_rate": 0.001, + "loss": 2.1581, + "step": 11932 + }, + { + "epoch": 0.5048227430408664, + "grad_norm": 0.16722053289413452, + "learning_rate": 0.001, + "loss": 2.019, + "step": 11933 + }, + { + "epoch": 0.5048650478043828, + "grad_norm": 0.16939833760261536, + "learning_rate": 0.001, + "loss": 1.843, + "step": 11934 + }, + { + "epoch": 0.5049073525678991, + "grad_norm": 0.3722081780433655, + "learning_rate": 0.001, + "loss": 2.8935, + "step": 11935 + }, + { + "epoch": 0.5049496573314155, + "grad_norm": 0.15549683570861816, + "learning_rate": 0.001, + "loss": 1.5844, + "step": 11936 + }, + { + "epoch": 0.5049919620949319, + "grad_norm": 0.19663242995738983, + "learning_rate": 0.001, + "loss": 2.0955, + "step": 11937 + }, + { + "epoch": 0.5050342668584482, + "grad_norm": 0.557640016078949, + "learning_rate": 0.001, + "loss": 3.2234, + "step": 11938 + }, + { + "epoch": 0.5050765716219646, + "grad_norm": 0.24764712154865265, + "learning_rate": 0.001, + "loss": 2.313, + "step": 11939 + }, + { + "epoch": 0.505118876385481, + "grad_norm": 0.22433580458164215, + "learning_rate": 0.001, + "loss": 2.8558, + "step": 11940 + }, + { + "epoch": 0.5051611811489973, + "grad_norm": 0.29622960090637207, + "learning_rate": 0.001, + "loss": 2.7506, + "step": 11941 + }, + { + "epoch": 0.5052034859125137, + "grad_norm": 0.23077332973480225, + "learning_rate": 0.001, + "loss": 1.8445, + "step": 11942 + }, + { + "epoch": 0.5052457906760301, + "grad_norm": 0.16419456899166107, + "learning_rate": 0.001, + "loss": 2.2429, + "step": 11943 + }, + { + "epoch": 0.5052880954395464, + "grad_norm": 1.3198111057281494, + "learning_rate": 0.001, + "loss": 2.769, + "step": 11944 + }, + { + "epoch": 0.5053304002030629, + "grad_norm": 0.32443439960479736, + "learning_rate": 0.001, + "loss": 2.6581, + "step": 11945 + }, + { + "epoch": 0.5053727049665793, + "grad_norm": 0.34906241297721863, + "learning_rate": 0.001, + "loss": 2.769, + "step": 11946 + }, + { + "epoch": 0.5054150097300956, + "grad_norm": 0.1365213692188263, + "learning_rate": 0.001, + "loss": 2.729, + "step": 11947 + }, + { + "epoch": 0.505457314493612, + "grad_norm": 0.3047373294830322, + "learning_rate": 0.001, + "loss": 1.8934, + "step": 11948 + }, + { + "epoch": 0.5054996192571284, + "grad_norm": 0.4273305833339691, + "learning_rate": 0.001, + "loss": 2.3561, + "step": 11949 + }, + { + "epoch": 0.5055419240206447, + "grad_norm": 0.22199836373329163, + "learning_rate": 0.001, + "loss": 2.0293, + "step": 11950 + }, + { + "epoch": 0.5055842287841611, + "grad_norm": 0.3782862424850464, + "learning_rate": 0.001, + "loss": 1.7579, + "step": 11951 + }, + { + "epoch": 0.5056265335476775, + "grad_norm": 0.19411024451255798, + "learning_rate": 0.001, + "loss": 1.4811, + "step": 11952 + }, + { + "epoch": 0.5056688383111938, + "grad_norm": 0.1856779307126999, + "learning_rate": 0.001, + "loss": 2.2312, + "step": 11953 + }, + { + "epoch": 0.5057111430747102, + "grad_norm": 0.17408660054206848, + "learning_rate": 0.001, + "loss": 2.3105, + "step": 11954 + }, + { + "epoch": 0.5057534478382266, + "grad_norm": 0.19032998383045197, + "learning_rate": 0.001, + "loss": 2.088, + "step": 11955 + }, + { + "epoch": 0.5057957526017429, + "grad_norm": 0.18069171905517578, + "learning_rate": 0.001, + "loss": 2.6122, + "step": 11956 + }, + { + "epoch": 0.5058380573652593, + "grad_norm": 0.16551326215267181, + "learning_rate": 0.001, + "loss": 2.1795, + "step": 11957 + }, + { + "epoch": 0.5058803621287757, + "grad_norm": 2.8570401668548584, + "learning_rate": 0.001, + "loss": 1.9708, + "step": 11958 + }, + { + "epoch": 0.505922666892292, + "grad_norm": 0.20645208656787872, + "learning_rate": 0.001, + "loss": 2.3922, + "step": 11959 + }, + { + "epoch": 0.5059649716558084, + "grad_norm": 4.1343817710876465, + "learning_rate": 0.001, + "loss": 2.1069, + "step": 11960 + }, + { + "epoch": 0.5060072764193249, + "grad_norm": 0.17171384394168854, + "learning_rate": 0.001, + "loss": 1.5699, + "step": 11961 + }, + { + "epoch": 0.5060495811828412, + "grad_norm": 0.4710404574871063, + "learning_rate": 0.001, + "loss": 1.9224, + "step": 11962 + }, + { + "epoch": 0.5060918859463576, + "grad_norm": 0.9586241245269775, + "learning_rate": 0.001, + "loss": 2.1856, + "step": 11963 + }, + { + "epoch": 0.506134190709874, + "grad_norm": 0.1798468381166458, + "learning_rate": 0.001, + "loss": 2.836, + "step": 11964 + }, + { + "epoch": 0.5061764954733903, + "grad_norm": 0.2874303162097931, + "learning_rate": 0.001, + "loss": 2.2059, + "step": 11965 + }, + { + "epoch": 0.5062188002369067, + "grad_norm": 0.1568385362625122, + "learning_rate": 0.001, + "loss": 1.7764, + "step": 11966 + }, + { + "epoch": 0.5062611050004231, + "grad_norm": 0.17159751057624817, + "learning_rate": 0.001, + "loss": 2.1703, + "step": 11967 + }, + { + "epoch": 0.5063034097639394, + "grad_norm": 0.20359356701374054, + "learning_rate": 0.001, + "loss": 2.4419, + "step": 11968 + }, + { + "epoch": 0.5063457145274558, + "grad_norm": 0.1408676654100418, + "learning_rate": 0.001, + "loss": 2.12, + "step": 11969 + }, + { + "epoch": 0.5063880192909722, + "grad_norm": 0.16593052446842194, + "learning_rate": 0.001, + "loss": 1.9553, + "step": 11970 + }, + { + "epoch": 0.5064303240544885, + "grad_norm": 0.12770260870456696, + "learning_rate": 0.001, + "loss": 2.1708, + "step": 11971 + }, + { + "epoch": 0.5064726288180049, + "grad_norm": 0.15011616051197052, + "learning_rate": 0.001, + "loss": 2.1359, + "step": 11972 + }, + { + "epoch": 0.5065149335815213, + "grad_norm": 0.47157904505729675, + "learning_rate": 0.001, + "loss": 1.9827, + "step": 11973 + }, + { + "epoch": 0.5065572383450376, + "grad_norm": 0.15799900889396667, + "learning_rate": 0.001, + "loss": 2.2315, + "step": 11974 + }, + { + "epoch": 0.506599543108554, + "grad_norm": 0.20162101089954376, + "learning_rate": 0.001, + "loss": 2.2226, + "step": 11975 + }, + { + "epoch": 0.5066418478720704, + "grad_norm": 0.21195806562900543, + "learning_rate": 0.001, + "loss": 2.3964, + "step": 11976 + }, + { + "epoch": 0.5066841526355867, + "grad_norm": 0.18306732177734375, + "learning_rate": 0.001, + "loss": 3.3474, + "step": 11977 + }, + { + "epoch": 0.5067264573991032, + "grad_norm": 0.15694935619831085, + "learning_rate": 0.001, + "loss": 2.5094, + "step": 11978 + }, + { + "epoch": 0.5067687621626195, + "grad_norm": 0.22060388326644897, + "learning_rate": 0.001, + "loss": 2.6429, + "step": 11979 + }, + { + "epoch": 0.5068110669261359, + "grad_norm": 0.2225179672241211, + "learning_rate": 0.001, + "loss": 1.6857, + "step": 11980 + }, + { + "epoch": 0.5068533716896523, + "grad_norm": 0.19372129440307617, + "learning_rate": 0.001, + "loss": 2.5101, + "step": 11981 + }, + { + "epoch": 0.5068956764531686, + "grad_norm": 0.13902443647384644, + "learning_rate": 0.001, + "loss": 2.4499, + "step": 11982 + }, + { + "epoch": 0.506937981216685, + "grad_norm": 0.15087106823921204, + "learning_rate": 0.001, + "loss": 2.1793, + "step": 11983 + }, + { + "epoch": 0.5069802859802014, + "grad_norm": 0.16260752081871033, + "learning_rate": 0.001, + "loss": 2.0164, + "step": 11984 + }, + { + "epoch": 0.5070225907437177, + "grad_norm": 0.159589946269989, + "learning_rate": 0.001, + "loss": 2.8375, + "step": 11985 + }, + { + "epoch": 0.5070648955072341, + "grad_norm": 0.21032920479774475, + "learning_rate": 0.001, + "loss": 4.2796, + "step": 11986 + }, + { + "epoch": 0.5071072002707505, + "grad_norm": 0.19876354932785034, + "learning_rate": 0.001, + "loss": 2.9237, + "step": 11987 + }, + { + "epoch": 0.5071495050342668, + "grad_norm": 0.35270553827285767, + "learning_rate": 0.001, + "loss": 2.9928, + "step": 11988 + }, + { + "epoch": 0.5071918097977832, + "grad_norm": 0.16005408763885498, + "learning_rate": 0.001, + "loss": 2.1871, + "step": 11989 + }, + { + "epoch": 0.5072341145612996, + "grad_norm": 0.13983991742134094, + "learning_rate": 0.001, + "loss": 2.6865, + "step": 11990 + }, + { + "epoch": 0.5072764193248159, + "grad_norm": 0.17701087892055511, + "learning_rate": 0.001, + "loss": 2.0827, + "step": 11991 + }, + { + "epoch": 0.5073187240883323, + "grad_norm": 0.15667004883289337, + "learning_rate": 0.001, + "loss": 1.466, + "step": 11992 + }, + { + "epoch": 0.5073610288518487, + "grad_norm": 0.22075319290161133, + "learning_rate": 0.001, + "loss": 2.0947, + "step": 11993 + }, + { + "epoch": 0.507403333615365, + "grad_norm": 0.14011307060718536, + "learning_rate": 0.001, + "loss": 2.5497, + "step": 11994 + }, + { + "epoch": 0.5074456383788815, + "grad_norm": 0.1682954877614975, + "learning_rate": 0.001, + "loss": 1.8733, + "step": 11995 + }, + { + "epoch": 0.5074879431423979, + "grad_norm": 30.817886352539062, + "learning_rate": 0.001, + "loss": 2.5975, + "step": 11996 + }, + { + "epoch": 0.5075302479059142, + "grad_norm": 0.17083485424518585, + "learning_rate": 0.001, + "loss": 2.1617, + "step": 11997 + }, + { + "epoch": 0.5075725526694306, + "grad_norm": 0.23582878708839417, + "learning_rate": 0.001, + "loss": 2.6255, + "step": 11998 + }, + { + "epoch": 0.507614857432947, + "grad_norm": 0.16995792090892792, + "learning_rate": 0.001, + "loss": 2.413, + "step": 11999 + }, + { + "epoch": 0.5076571621964633, + "grad_norm": 0.1696588695049286, + "learning_rate": 0.001, + "loss": 2.9008, + "step": 12000 + }, + { + "epoch": 0.5076994669599797, + "grad_norm": 1.6405519247055054, + "learning_rate": 0.001, + "loss": 1.7866, + "step": 12001 + }, + { + "epoch": 0.5077417717234961, + "grad_norm": 0.8979272246360779, + "learning_rate": 0.001, + "loss": 2.6012, + "step": 12002 + }, + { + "epoch": 0.5077840764870124, + "grad_norm": 0.20839910209178925, + "learning_rate": 0.001, + "loss": 2.8517, + "step": 12003 + }, + { + "epoch": 0.5078263812505288, + "grad_norm": 0.2238219529390335, + "learning_rate": 0.001, + "loss": 2.2089, + "step": 12004 + }, + { + "epoch": 0.5078686860140452, + "grad_norm": 0.21982823312282562, + "learning_rate": 0.001, + "loss": 1.9233, + "step": 12005 + }, + { + "epoch": 0.5079109907775615, + "grad_norm": 0.20902688801288605, + "learning_rate": 0.001, + "loss": 2.416, + "step": 12006 + }, + { + "epoch": 0.5079532955410779, + "grad_norm": 0.4519989788532257, + "learning_rate": 0.001, + "loss": 1.7383, + "step": 12007 + }, + { + "epoch": 0.5079956003045943, + "grad_norm": 0.15626175701618195, + "learning_rate": 0.001, + "loss": 3.0287, + "step": 12008 + }, + { + "epoch": 0.5080379050681106, + "grad_norm": 0.1568712294101715, + "learning_rate": 0.001, + "loss": 2.3071, + "step": 12009 + }, + { + "epoch": 0.508080209831627, + "grad_norm": 0.17470332980155945, + "learning_rate": 0.001, + "loss": 1.9627, + "step": 12010 + }, + { + "epoch": 0.5081225145951435, + "grad_norm": 0.29563674330711365, + "learning_rate": 0.001, + "loss": 2.2517, + "step": 12011 + }, + { + "epoch": 0.5081648193586598, + "grad_norm": 0.1537560075521469, + "learning_rate": 0.001, + "loss": 1.5154, + "step": 12012 + }, + { + "epoch": 0.5082071241221762, + "grad_norm": 0.24751636385917664, + "learning_rate": 0.001, + "loss": 2.1838, + "step": 12013 + }, + { + "epoch": 0.5082494288856926, + "grad_norm": 0.2212720662355423, + "learning_rate": 0.001, + "loss": 2.7298, + "step": 12014 + }, + { + "epoch": 0.5082917336492089, + "grad_norm": 0.1655520498752594, + "learning_rate": 0.001, + "loss": 2.1824, + "step": 12015 + }, + { + "epoch": 0.5083340384127253, + "grad_norm": 0.17084214091300964, + "learning_rate": 0.001, + "loss": 2.413, + "step": 12016 + }, + { + "epoch": 0.5083763431762417, + "grad_norm": 0.1613297313451767, + "learning_rate": 0.001, + "loss": 2.0016, + "step": 12017 + }, + { + "epoch": 0.508418647939758, + "grad_norm": 0.28114500641822815, + "learning_rate": 0.001, + "loss": 1.6721, + "step": 12018 + }, + { + "epoch": 0.5084609527032744, + "grad_norm": 0.19233211874961853, + "learning_rate": 0.001, + "loss": 1.9725, + "step": 12019 + }, + { + "epoch": 0.5085032574667908, + "grad_norm": 0.17586541175842285, + "learning_rate": 0.001, + "loss": 1.7528, + "step": 12020 + }, + { + "epoch": 0.5085455622303071, + "grad_norm": 0.1447594314813614, + "learning_rate": 0.001, + "loss": 2.6402, + "step": 12021 + }, + { + "epoch": 0.5085878669938235, + "grad_norm": 0.14185841381549835, + "learning_rate": 0.001, + "loss": 1.6817, + "step": 12022 + }, + { + "epoch": 0.5086301717573398, + "grad_norm": 0.171967551112175, + "learning_rate": 0.001, + "loss": 2.3004, + "step": 12023 + }, + { + "epoch": 0.5086724765208562, + "grad_norm": 0.18312491476535797, + "learning_rate": 0.001, + "loss": 2.5789, + "step": 12024 + }, + { + "epoch": 0.5087147812843726, + "grad_norm": 0.298218697309494, + "learning_rate": 0.001, + "loss": 2.98, + "step": 12025 + }, + { + "epoch": 0.5087570860478889, + "grad_norm": 0.1913619339466095, + "learning_rate": 0.001, + "loss": 3.5224, + "step": 12026 + }, + { + "epoch": 0.5087993908114053, + "grad_norm": 0.18118982017040253, + "learning_rate": 0.001, + "loss": 1.9072, + "step": 12027 + }, + { + "epoch": 0.5088416955749218, + "grad_norm": 0.6203705072402954, + "learning_rate": 0.001, + "loss": 2.6366, + "step": 12028 + }, + { + "epoch": 0.5088840003384381, + "grad_norm": 0.17132526636123657, + "learning_rate": 0.001, + "loss": 2.0598, + "step": 12029 + }, + { + "epoch": 0.5089263051019545, + "grad_norm": 0.18120494484901428, + "learning_rate": 0.001, + "loss": 2.783, + "step": 12030 + }, + { + "epoch": 0.5089686098654709, + "grad_norm": 0.20556822419166565, + "learning_rate": 0.001, + "loss": 2.4337, + "step": 12031 + }, + { + "epoch": 0.5090109146289872, + "grad_norm": 0.16901428997516632, + "learning_rate": 0.001, + "loss": 1.8192, + "step": 12032 + }, + { + "epoch": 0.5090532193925036, + "grad_norm": 0.16724789142608643, + "learning_rate": 0.001, + "loss": 2.1285, + "step": 12033 + }, + { + "epoch": 0.50909552415602, + "grad_norm": 0.15585707128047943, + "learning_rate": 0.001, + "loss": 1.7745, + "step": 12034 + }, + { + "epoch": 0.5091378289195363, + "grad_norm": 0.5937365889549255, + "learning_rate": 0.001, + "loss": 2.4572, + "step": 12035 + }, + { + "epoch": 0.5091801336830527, + "grad_norm": 0.17110083997249603, + "learning_rate": 0.001, + "loss": 2.4326, + "step": 12036 + }, + { + "epoch": 0.5092224384465691, + "grad_norm": 0.18722589313983917, + "learning_rate": 0.001, + "loss": 3.232, + "step": 12037 + }, + { + "epoch": 0.5092647432100854, + "grad_norm": 0.2509753704071045, + "learning_rate": 0.001, + "loss": 2.1435, + "step": 12038 + }, + { + "epoch": 0.5093070479736018, + "grad_norm": 0.1581326723098755, + "learning_rate": 0.001, + "loss": 2.3724, + "step": 12039 + }, + { + "epoch": 0.5093493527371182, + "grad_norm": 0.2504560053348541, + "learning_rate": 0.001, + "loss": 2.3174, + "step": 12040 + }, + { + "epoch": 0.5093916575006345, + "grad_norm": 0.19486159086227417, + "learning_rate": 0.001, + "loss": 2.1135, + "step": 12041 + }, + { + "epoch": 0.5094339622641509, + "grad_norm": 0.16291144490242004, + "learning_rate": 0.001, + "loss": 1.9069, + "step": 12042 + }, + { + "epoch": 0.5094762670276674, + "grad_norm": 0.17306125164031982, + "learning_rate": 0.001, + "loss": 2.8266, + "step": 12043 + }, + { + "epoch": 0.5095185717911836, + "grad_norm": 0.17126202583312988, + "learning_rate": 0.001, + "loss": 3.3905, + "step": 12044 + }, + { + "epoch": 0.5095608765547001, + "grad_norm": 0.18580661714076996, + "learning_rate": 0.001, + "loss": 2.1005, + "step": 12045 + }, + { + "epoch": 0.5096031813182165, + "grad_norm": 1.9326337575912476, + "learning_rate": 0.001, + "loss": 1.8201, + "step": 12046 + }, + { + "epoch": 0.5096454860817328, + "grad_norm": 0.24594885110855103, + "learning_rate": 0.001, + "loss": 2.2209, + "step": 12047 + }, + { + "epoch": 0.5096877908452492, + "grad_norm": 0.14384353160858154, + "learning_rate": 0.001, + "loss": 1.7018, + "step": 12048 + }, + { + "epoch": 0.5097300956087656, + "grad_norm": 0.1911388784646988, + "learning_rate": 0.001, + "loss": 2.9794, + "step": 12049 + }, + { + "epoch": 0.5097724003722819, + "grad_norm": 0.15660209953784943, + "learning_rate": 0.001, + "loss": 2.4576, + "step": 12050 + }, + { + "epoch": 0.5098147051357983, + "grad_norm": 0.27428507804870605, + "learning_rate": 0.001, + "loss": 2.893, + "step": 12051 + }, + { + "epoch": 0.5098570098993147, + "grad_norm": 0.17285604774951935, + "learning_rate": 0.001, + "loss": 1.984, + "step": 12052 + }, + { + "epoch": 0.509899314662831, + "grad_norm": 0.17490775883197784, + "learning_rate": 0.001, + "loss": 2.0082, + "step": 12053 + }, + { + "epoch": 0.5099416194263474, + "grad_norm": 0.1558762937784195, + "learning_rate": 0.001, + "loss": 1.9196, + "step": 12054 + }, + { + "epoch": 0.5099839241898638, + "grad_norm": 0.42359229922294617, + "learning_rate": 0.001, + "loss": 2.7122, + "step": 12055 + }, + { + "epoch": 0.5100262289533801, + "grad_norm": 0.17612966895103455, + "learning_rate": 0.001, + "loss": 1.7214, + "step": 12056 + }, + { + "epoch": 0.5100685337168965, + "grad_norm": 0.3643682599067688, + "learning_rate": 0.001, + "loss": 1.7763, + "step": 12057 + }, + { + "epoch": 0.5101108384804129, + "grad_norm": 0.205791637301445, + "learning_rate": 0.001, + "loss": 2.0399, + "step": 12058 + }, + { + "epoch": 0.5101531432439292, + "grad_norm": 2.9326553344726562, + "learning_rate": 0.001, + "loss": 1.8045, + "step": 12059 + }, + { + "epoch": 0.5101954480074457, + "grad_norm": 0.22265328466892242, + "learning_rate": 0.001, + "loss": 2.1796, + "step": 12060 + }, + { + "epoch": 0.5102377527709621, + "grad_norm": 0.17063187062740326, + "learning_rate": 0.001, + "loss": 2.4523, + "step": 12061 + }, + { + "epoch": 0.5102800575344784, + "grad_norm": 0.18064865469932556, + "learning_rate": 0.001, + "loss": 1.5731, + "step": 12062 + }, + { + "epoch": 0.5103223622979948, + "grad_norm": 2.2805631160736084, + "learning_rate": 0.001, + "loss": 3.7232, + "step": 12063 + }, + { + "epoch": 0.5103646670615112, + "grad_norm": 0.1840154379606247, + "learning_rate": 0.001, + "loss": 1.7874, + "step": 12064 + }, + { + "epoch": 0.5104069718250275, + "grad_norm": 0.16922450065612793, + "learning_rate": 0.001, + "loss": 1.9633, + "step": 12065 + }, + { + "epoch": 0.5104492765885439, + "grad_norm": 0.1468036025762558, + "learning_rate": 0.001, + "loss": 1.8785, + "step": 12066 + }, + { + "epoch": 0.5104915813520603, + "grad_norm": 0.15584182739257812, + "learning_rate": 0.001, + "loss": 2.47, + "step": 12067 + }, + { + "epoch": 0.5105338861155766, + "grad_norm": 0.19544000923633575, + "learning_rate": 0.001, + "loss": 2.2966, + "step": 12068 + }, + { + "epoch": 0.510576190879093, + "grad_norm": 0.15952937304973602, + "learning_rate": 0.001, + "loss": 1.9361, + "step": 12069 + }, + { + "epoch": 0.5106184956426093, + "grad_norm": 2.266671657562256, + "learning_rate": 0.001, + "loss": 3.9638, + "step": 12070 + }, + { + "epoch": 0.5106608004061257, + "grad_norm": 0.20230835676193237, + "learning_rate": 0.001, + "loss": 2.0883, + "step": 12071 + }, + { + "epoch": 0.5107031051696421, + "grad_norm": 0.22591619193553925, + "learning_rate": 0.001, + "loss": 2.8491, + "step": 12072 + }, + { + "epoch": 0.5107454099331584, + "grad_norm": 0.18922626972198486, + "learning_rate": 0.001, + "loss": 1.9407, + "step": 12073 + }, + { + "epoch": 0.5107877146966748, + "grad_norm": 0.17691117525100708, + "learning_rate": 0.001, + "loss": 1.8264, + "step": 12074 + }, + { + "epoch": 0.5108300194601912, + "grad_norm": 0.17793789505958557, + "learning_rate": 0.001, + "loss": 2.0651, + "step": 12075 + }, + { + "epoch": 0.5108723242237075, + "grad_norm": 0.2747514545917511, + "learning_rate": 0.001, + "loss": 1.9631, + "step": 12076 + }, + { + "epoch": 0.510914628987224, + "grad_norm": 0.16500215232372284, + "learning_rate": 0.001, + "loss": 1.5828, + "step": 12077 + }, + { + "epoch": 0.5109569337507404, + "grad_norm": 0.2846604883670807, + "learning_rate": 0.001, + "loss": 3.0458, + "step": 12078 + }, + { + "epoch": 0.5109992385142567, + "grad_norm": 0.18914783000946045, + "learning_rate": 0.001, + "loss": 2.501, + "step": 12079 + }, + { + "epoch": 0.5110415432777731, + "grad_norm": 3.4462015628814697, + "learning_rate": 0.001, + "loss": 1.8007, + "step": 12080 + }, + { + "epoch": 0.5110838480412895, + "grad_norm": 0.22221924364566803, + "learning_rate": 0.001, + "loss": 2.4593, + "step": 12081 + }, + { + "epoch": 0.5111261528048058, + "grad_norm": 0.3206009268760681, + "learning_rate": 0.001, + "loss": 2.1852, + "step": 12082 + }, + { + "epoch": 0.5111684575683222, + "grad_norm": 0.19568482041358948, + "learning_rate": 0.001, + "loss": 2.0723, + "step": 12083 + }, + { + "epoch": 0.5112107623318386, + "grad_norm": 0.13651347160339355, + "learning_rate": 0.001, + "loss": 2.1792, + "step": 12084 + }, + { + "epoch": 0.5112530670953549, + "grad_norm": 0.9014352560043335, + "learning_rate": 0.001, + "loss": 2.1313, + "step": 12085 + }, + { + "epoch": 0.5112953718588713, + "grad_norm": 0.17774808406829834, + "learning_rate": 0.001, + "loss": 2.194, + "step": 12086 + }, + { + "epoch": 0.5113376766223877, + "grad_norm": 0.2228837013244629, + "learning_rate": 0.001, + "loss": 3.0247, + "step": 12087 + }, + { + "epoch": 0.511379981385904, + "grad_norm": 0.20427215099334717, + "learning_rate": 0.001, + "loss": 2.176, + "step": 12088 + }, + { + "epoch": 0.5114222861494204, + "grad_norm": 0.1658029705286026, + "learning_rate": 0.001, + "loss": 2.0688, + "step": 12089 + }, + { + "epoch": 0.5114645909129368, + "grad_norm": 0.19632932543754578, + "learning_rate": 0.001, + "loss": 3.3308, + "step": 12090 + }, + { + "epoch": 0.5115068956764531, + "grad_norm": 0.17053432762622833, + "learning_rate": 0.001, + "loss": 2.1211, + "step": 12091 + }, + { + "epoch": 0.5115492004399695, + "grad_norm": 0.17195382714271545, + "learning_rate": 0.001, + "loss": 1.7132, + "step": 12092 + }, + { + "epoch": 0.511591505203486, + "grad_norm": 0.17372830212116241, + "learning_rate": 0.001, + "loss": 2.7497, + "step": 12093 + }, + { + "epoch": 0.5116338099670023, + "grad_norm": 0.18535274267196655, + "learning_rate": 0.001, + "loss": 2.2781, + "step": 12094 + }, + { + "epoch": 0.5116761147305187, + "grad_norm": 0.1690640151500702, + "learning_rate": 0.001, + "loss": 1.9845, + "step": 12095 + }, + { + "epoch": 0.5117184194940351, + "grad_norm": 0.18286339938640594, + "learning_rate": 0.001, + "loss": 3.2735, + "step": 12096 + }, + { + "epoch": 0.5117607242575514, + "grad_norm": 0.15377508103847504, + "learning_rate": 0.001, + "loss": 2.0673, + "step": 12097 + }, + { + "epoch": 0.5118030290210678, + "grad_norm": 1.7860561609268188, + "learning_rate": 0.001, + "loss": 2.9895, + "step": 12098 + }, + { + "epoch": 0.5118453337845842, + "grad_norm": 0.2976422607898712, + "learning_rate": 0.001, + "loss": 2.5911, + "step": 12099 + }, + { + "epoch": 0.5118876385481005, + "grad_norm": 0.6552213430404663, + "learning_rate": 0.001, + "loss": 3.01, + "step": 12100 + }, + { + "epoch": 0.5119299433116169, + "grad_norm": 0.1674286127090454, + "learning_rate": 0.001, + "loss": 1.5554, + "step": 12101 + }, + { + "epoch": 0.5119722480751333, + "grad_norm": 0.17942678928375244, + "learning_rate": 0.001, + "loss": 2.5887, + "step": 12102 + }, + { + "epoch": 0.5120145528386496, + "grad_norm": 0.3610702455043793, + "learning_rate": 0.001, + "loss": 2.0485, + "step": 12103 + }, + { + "epoch": 0.512056857602166, + "grad_norm": 9.683119773864746, + "learning_rate": 0.001, + "loss": 1.5999, + "step": 12104 + }, + { + "epoch": 0.5120991623656824, + "grad_norm": 0.23564742505550385, + "learning_rate": 0.001, + "loss": 2.6848, + "step": 12105 + }, + { + "epoch": 0.5121414671291987, + "grad_norm": 0.20375941693782806, + "learning_rate": 0.001, + "loss": 2.348, + "step": 12106 + }, + { + "epoch": 0.5121837718927151, + "grad_norm": 0.17690829932689667, + "learning_rate": 0.001, + "loss": 1.8304, + "step": 12107 + }, + { + "epoch": 0.5122260766562315, + "grad_norm": 0.3078082501888275, + "learning_rate": 0.001, + "loss": 2.4127, + "step": 12108 + }, + { + "epoch": 0.5122683814197478, + "grad_norm": 0.17831426858901978, + "learning_rate": 0.001, + "loss": 2.4171, + "step": 12109 + }, + { + "epoch": 0.5123106861832643, + "grad_norm": 0.19849298894405365, + "learning_rate": 0.001, + "loss": 3.024, + "step": 12110 + }, + { + "epoch": 0.5123529909467807, + "grad_norm": 0.25595778226852417, + "learning_rate": 0.001, + "loss": 1.8982, + "step": 12111 + }, + { + "epoch": 0.512395295710297, + "grad_norm": 0.19416412711143494, + "learning_rate": 0.001, + "loss": 2.3878, + "step": 12112 + }, + { + "epoch": 0.5124376004738134, + "grad_norm": 0.5815975666046143, + "learning_rate": 0.001, + "loss": 2.9711, + "step": 12113 + }, + { + "epoch": 0.5124799052373297, + "grad_norm": 0.34129148721694946, + "learning_rate": 0.001, + "loss": 2.6444, + "step": 12114 + }, + { + "epoch": 0.5125222100008461, + "grad_norm": 0.495937705039978, + "learning_rate": 0.001, + "loss": 2.1635, + "step": 12115 + }, + { + "epoch": 0.5125645147643625, + "grad_norm": 0.23061814904212952, + "learning_rate": 0.001, + "loss": 2.387, + "step": 12116 + }, + { + "epoch": 0.5126068195278788, + "grad_norm": 0.28698426485061646, + "learning_rate": 0.001, + "loss": 2.4555, + "step": 12117 + }, + { + "epoch": 0.5126491242913952, + "grad_norm": 0.3059033453464508, + "learning_rate": 0.001, + "loss": 2.024, + "step": 12118 + }, + { + "epoch": 0.5126914290549116, + "grad_norm": 0.24312277138233185, + "learning_rate": 0.001, + "loss": 2.1431, + "step": 12119 + }, + { + "epoch": 0.5127337338184279, + "grad_norm": 0.17924846708774567, + "learning_rate": 0.001, + "loss": 2.322, + "step": 12120 + }, + { + "epoch": 0.5127760385819443, + "grad_norm": 0.15020184218883514, + "learning_rate": 0.001, + "loss": 1.4752, + "step": 12121 + }, + { + "epoch": 0.5128183433454607, + "grad_norm": 3.8008360862731934, + "learning_rate": 0.001, + "loss": 2.591, + "step": 12122 + }, + { + "epoch": 0.512860648108977, + "grad_norm": 0.20697496831417084, + "learning_rate": 0.001, + "loss": 2.5281, + "step": 12123 + }, + { + "epoch": 0.5129029528724934, + "grad_norm": 0.18046455085277557, + "learning_rate": 0.001, + "loss": 1.9783, + "step": 12124 + }, + { + "epoch": 0.5129452576360098, + "grad_norm": 4.528714656829834, + "learning_rate": 0.001, + "loss": 1.7568, + "step": 12125 + }, + { + "epoch": 0.5129875623995261, + "grad_norm": 0.520558774471283, + "learning_rate": 0.001, + "loss": 2.9131, + "step": 12126 + }, + { + "epoch": 0.5130298671630426, + "grad_norm": 0.529538631439209, + "learning_rate": 0.001, + "loss": 2.5839, + "step": 12127 + }, + { + "epoch": 0.513072171926559, + "grad_norm": 0.3380816578865051, + "learning_rate": 0.001, + "loss": 1.9767, + "step": 12128 + }, + { + "epoch": 0.5131144766900753, + "grad_norm": 0.5433550477027893, + "learning_rate": 0.001, + "loss": 2.5798, + "step": 12129 + }, + { + "epoch": 0.5131567814535917, + "grad_norm": 0.17710618674755096, + "learning_rate": 0.001, + "loss": 2.614, + "step": 12130 + }, + { + "epoch": 0.5131990862171081, + "grad_norm": 0.23254965245723724, + "learning_rate": 0.001, + "loss": 2.8047, + "step": 12131 + }, + { + "epoch": 0.5132413909806244, + "grad_norm": 0.1852521151304245, + "learning_rate": 0.001, + "loss": 2.0367, + "step": 12132 + }, + { + "epoch": 0.5132836957441408, + "grad_norm": 0.19175705313682556, + "learning_rate": 0.001, + "loss": 2.0662, + "step": 12133 + }, + { + "epoch": 0.5133260005076572, + "grad_norm": 0.18979766964912415, + "learning_rate": 0.001, + "loss": 2.3063, + "step": 12134 + }, + { + "epoch": 0.5133683052711735, + "grad_norm": 0.1920190006494522, + "learning_rate": 0.001, + "loss": 2.7921, + "step": 12135 + }, + { + "epoch": 0.5134106100346899, + "grad_norm": 0.5775536298751831, + "learning_rate": 0.001, + "loss": 2.0244, + "step": 12136 + }, + { + "epoch": 0.5134529147982063, + "grad_norm": 0.1514301896095276, + "learning_rate": 0.001, + "loss": 1.7011, + "step": 12137 + }, + { + "epoch": 0.5134952195617226, + "grad_norm": 0.15868137776851654, + "learning_rate": 0.001, + "loss": 2.9314, + "step": 12138 + }, + { + "epoch": 0.513537524325239, + "grad_norm": 0.18454696238040924, + "learning_rate": 0.001, + "loss": 2.7521, + "step": 12139 + }, + { + "epoch": 0.5135798290887554, + "grad_norm": 0.21990300714969635, + "learning_rate": 0.001, + "loss": 1.5834, + "step": 12140 + }, + { + "epoch": 0.5136221338522717, + "grad_norm": 0.21516460180282593, + "learning_rate": 0.001, + "loss": 2.0828, + "step": 12141 + }, + { + "epoch": 0.5136644386157881, + "grad_norm": 0.2636964023113251, + "learning_rate": 0.001, + "loss": 2.2502, + "step": 12142 + }, + { + "epoch": 0.5137067433793046, + "grad_norm": 0.21557240188121796, + "learning_rate": 0.001, + "loss": 2.4408, + "step": 12143 + }, + { + "epoch": 0.5137490481428209, + "grad_norm": 8.734152793884277, + "learning_rate": 0.001, + "loss": 2.9185, + "step": 12144 + }, + { + "epoch": 0.5137913529063373, + "grad_norm": 0.32210806012153625, + "learning_rate": 0.001, + "loss": 2.2702, + "step": 12145 + }, + { + "epoch": 0.5138336576698537, + "grad_norm": 0.3446987569332123, + "learning_rate": 0.001, + "loss": 2.9083, + "step": 12146 + }, + { + "epoch": 0.51387596243337, + "grad_norm": 3.7084527015686035, + "learning_rate": 0.001, + "loss": 1.4133, + "step": 12147 + }, + { + "epoch": 0.5139182671968864, + "grad_norm": 3.1599371433258057, + "learning_rate": 0.001, + "loss": 2.0267, + "step": 12148 + }, + { + "epoch": 0.5139605719604028, + "grad_norm": 0.19752854108810425, + "learning_rate": 0.001, + "loss": 2.0721, + "step": 12149 + }, + { + "epoch": 0.5140028767239191, + "grad_norm": 0.49823862314224243, + "learning_rate": 0.001, + "loss": 1.9638, + "step": 12150 + }, + { + "epoch": 0.5140451814874355, + "grad_norm": 0.1748315989971161, + "learning_rate": 0.001, + "loss": 3.1363, + "step": 12151 + }, + { + "epoch": 0.5140874862509519, + "grad_norm": 0.1747720092535019, + "learning_rate": 0.001, + "loss": 2.1459, + "step": 12152 + }, + { + "epoch": 0.5141297910144682, + "grad_norm": 2.5184714794158936, + "learning_rate": 0.001, + "loss": 1.7912, + "step": 12153 + }, + { + "epoch": 0.5141720957779846, + "grad_norm": 0.30171647667884827, + "learning_rate": 0.001, + "loss": 2.5741, + "step": 12154 + }, + { + "epoch": 0.514214400541501, + "grad_norm": 0.19412563741207123, + "learning_rate": 0.001, + "loss": 1.8221, + "step": 12155 + }, + { + "epoch": 0.5142567053050173, + "grad_norm": 0.44253069162368774, + "learning_rate": 0.001, + "loss": 2.7872, + "step": 12156 + }, + { + "epoch": 0.5142990100685337, + "grad_norm": 0.16020557284355164, + "learning_rate": 0.001, + "loss": 1.8939, + "step": 12157 + }, + { + "epoch": 0.51434131483205, + "grad_norm": 0.6392630934715271, + "learning_rate": 0.001, + "loss": 2.4378, + "step": 12158 + }, + { + "epoch": 0.5143836195955664, + "grad_norm": 0.15356384217739105, + "learning_rate": 0.001, + "loss": 2.2201, + "step": 12159 + }, + { + "epoch": 0.5144259243590829, + "grad_norm": 0.2277487814426422, + "learning_rate": 0.001, + "loss": 2.1108, + "step": 12160 + }, + { + "epoch": 0.5144682291225992, + "grad_norm": 0.19493798911571503, + "learning_rate": 0.001, + "loss": 2.7649, + "step": 12161 + }, + { + "epoch": 0.5145105338861156, + "grad_norm": 0.8036705851554871, + "learning_rate": 0.001, + "loss": 2.6234, + "step": 12162 + }, + { + "epoch": 0.514552838649632, + "grad_norm": 0.23683510720729828, + "learning_rate": 0.001, + "loss": 3.3676, + "step": 12163 + }, + { + "epoch": 0.5145951434131483, + "grad_norm": 0.20313666760921478, + "learning_rate": 0.001, + "loss": 2.2018, + "step": 12164 + }, + { + "epoch": 0.5146374481766647, + "grad_norm": 0.16080090403556824, + "learning_rate": 0.001, + "loss": 1.4551, + "step": 12165 + }, + { + "epoch": 0.5146797529401811, + "grad_norm": 0.19314774870872498, + "learning_rate": 0.001, + "loss": 2.5364, + "step": 12166 + }, + { + "epoch": 0.5147220577036974, + "grad_norm": 0.2120058387517929, + "learning_rate": 0.001, + "loss": 2.1422, + "step": 12167 + }, + { + "epoch": 0.5147643624672138, + "grad_norm": 0.15870290994644165, + "learning_rate": 0.001, + "loss": 1.7913, + "step": 12168 + }, + { + "epoch": 0.5148066672307302, + "grad_norm": 1.8567591905593872, + "learning_rate": 0.001, + "loss": 1.8428, + "step": 12169 + }, + { + "epoch": 0.5148489719942465, + "grad_norm": 0.25935545563697815, + "learning_rate": 0.001, + "loss": 2.1154, + "step": 12170 + }, + { + "epoch": 0.5148912767577629, + "grad_norm": 0.19411538541316986, + "learning_rate": 0.001, + "loss": 1.6982, + "step": 12171 + }, + { + "epoch": 0.5149335815212793, + "grad_norm": 0.1572023630142212, + "learning_rate": 0.001, + "loss": 1.9115, + "step": 12172 + }, + { + "epoch": 0.5149758862847956, + "grad_norm": 1.9859943389892578, + "learning_rate": 0.001, + "loss": 2.9479, + "step": 12173 + }, + { + "epoch": 0.515018191048312, + "grad_norm": 0.16157686710357666, + "learning_rate": 0.001, + "loss": 2.1507, + "step": 12174 + }, + { + "epoch": 0.5150604958118284, + "grad_norm": 1.8809139728546143, + "learning_rate": 0.001, + "loss": 2.054, + "step": 12175 + }, + { + "epoch": 0.5151028005753447, + "grad_norm": 0.17030459642410278, + "learning_rate": 0.001, + "loss": 1.8893, + "step": 12176 + }, + { + "epoch": 0.5151451053388612, + "grad_norm": 0.16130401194095612, + "learning_rate": 0.001, + "loss": 2.5668, + "step": 12177 + }, + { + "epoch": 0.5151874101023776, + "grad_norm": 0.19014528393745422, + "learning_rate": 0.001, + "loss": 2.4916, + "step": 12178 + }, + { + "epoch": 0.5152297148658939, + "grad_norm": 0.4937807321548462, + "learning_rate": 0.001, + "loss": 2.3189, + "step": 12179 + }, + { + "epoch": 0.5152720196294103, + "grad_norm": 0.24521322548389435, + "learning_rate": 0.001, + "loss": 1.4842, + "step": 12180 + }, + { + "epoch": 0.5153143243929267, + "grad_norm": 0.17198513448238373, + "learning_rate": 0.001, + "loss": 2.2285, + "step": 12181 + }, + { + "epoch": 0.515356629156443, + "grad_norm": 0.2905202805995941, + "learning_rate": 0.001, + "loss": 3.2654, + "step": 12182 + }, + { + "epoch": 0.5153989339199594, + "grad_norm": 0.1737702488899231, + "learning_rate": 0.001, + "loss": 2.1419, + "step": 12183 + }, + { + "epoch": 0.5154412386834758, + "grad_norm": 0.23815499246120453, + "learning_rate": 0.001, + "loss": 1.6129, + "step": 12184 + }, + { + "epoch": 0.5154835434469921, + "grad_norm": 0.19331566989421844, + "learning_rate": 0.001, + "loss": 2.0372, + "step": 12185 + }, + { + "epoch": 0.5155258482105085, + "grad_norm": 0.20304280519485474, + "learning_rate": 0.001, + "loss": 2.8925, + "step": 12186 + }, + { + "epoch": 0.5155681529740249, + "grad_norm": 0.16830262541770935, + "learning_rate": 0.001, + "loss": 2.4063, + "step": 12187 + }, + { + "epoch": 0.5156104577375412, + "grad_norm": 0.2067635953426361, + "learning_rate": 0.001, + "loss": 2.4644, + "step": 12188 + }, + { + "epoch": 0.5156527625010576, + "grad_norm": 0.26112356781959534, + "learning_rate": 0.001, + "loss": 1.8161, + "step": 12189 + }, + { + "epoch": 0.515695067264574, + "grad_norm": 0.317443311214447, + "learning_rate": 0.001, + "loss": 2.3216, + "step": 12190 + }, + { + "epoch": 0.5157373720280903, + "grad_norm": 0.1734699159860611, + "learning_rate": 0.001, + "loss": 2.4626, + "step": 12191 + }, + { + "epoch": 0.5157796767916067, + "grad_norm": 0.21294571459293365, + "learning_rate": 0.001, + "loss": 3.3039, + "step": 12192 + }, + { + "epoch": 0.5158219815551232, + "grad_norm": 0.6978985071182251, + "learning_rate": 0.001, + "loss": 3.0031, + "step": 12193 + }, + { + "epoch": 0.5158642863186395, + "grad_norm": 0.38809531927108765, + "learning_rate": 0.001, + "loss": 1.5207, + "step": 12194 + }, + { + "epoch": 0.5159065910821559, + "grad_norm": 0.17903557419776917, + "learning_rate": 0.001, + "loss": 3.2565, + "step": 12195 + }, + { + "epoch": 0.5159488958456723, + "grad_norm": 3.361924648284912, + "learning_rate": 0.001, + "loss": 2.2003, + "step": 12196 + }, + { + "epoch": 0.5159912006091886, + "grad_norm": 0.16305619478225708, + "learning_rate": 0.001, + "loss": 1.7818, + "step": 12197 + }, + { + "epoch": 0.516033505372705, + "grad_norm": 2.384248733520508, + "learning_rate": 0.001, + "loss": 2.5049, + "step": 12198 + }, + { + "epoch": 0.5160758101362214, + "grad_norm": 0.1969674527645111, + "learning_rate": 0.001, + "loss": 2.1683, + "step": 12199 + }, + { + "epoch": 0.5161181148997377, + "grad_norm": 0.42641380429267883, + "learning_rate": 0.001, + "loss": 1.9276, + "step": 12200 + }, + { + "epoch": 0.5161604196632541, + "grad_norm": 0.18019993603229523, + "learning_rate": 0.001, + "loss": 2.4077, + "step": 12201 + }, + { + "epoch": 0.5162027244267705, + "grad_norm": 0.2010054886341095, + "learning_rate": 0.001, + "loss": 2.147, + "step": 12202 + }, + { + "epoch": 0.5162450291902868, + "grad_norm": 0.31541866064071655, + "learning_rate": 0.001, + "loss": 2.5211, + "step": 12203 + }, + { + "epoch": 0.5162873339538032, + "grad_norm": 0.16396036744117737, + "learning_rate": 0.001, + "loss": 2.0067, + "step": 12204 + }, + { + "epoch": 0.5163296387173195, + "grad_norm": 0.1641075164079666, + "learning_rate": 0.001, + "loss": 1.8241, + "step": 12205 + }, + { + "epoch": 0.5163719434808359, + "grad_norm": 0.9481202960014343, + "learning_rate": 0.001, + "loss": 2.4869, + "step": 12206 + }, + { + "epoch": 0.5164142482443523, + "grad_norm": 0.17808358371257782, + "learning_rate": 0.001, + "loss": 1.9912, + "step": 12207 + }, + { + "epoch": 0.5164565530078686, + "grad_norm": 0.15392626821994781, + "learning_rate": 0.001, + "loss": 2.3439, + "step": 12208 + }, + { + "epoch": 0.516498857771385, + "grad_norm": 0.1524217277765274, + "learning_rate": 0.001, + "loss": 1.5912, + "step": 12209 + }, + { + "epoch": 0.5165411625349015, + "grad_norm": 0.21630914509296417, + "learning_rate": 0.001, + "loss": 1.8016, + "step": 12210 + }, + { + "epoch": 0.5165834672984178, + "grad_norm": 0.1926201581954956, + "learning_rate": 0.001, + "loss": 2.1263, + "step": 12211 + }, + { + "epoch": 0.5166257720619342, + "grad_norm": 0.4192337691783905, + "learning_rate": 0.001, + "loss": 2.6866, + "step": 12212 + }, + { + "epoch": 0.5166680768254506, + "grad_norm": 2.8745205402374268, + "learning_rate": 0.001, + "loss": 1.7879, + "step": 12213 + }, + { + "epoch": 0.5167103815889669, + "grad_norm": 0.4913157820701599, + "learning_rate": 0.001, + "loss": 1.9727, + "step": 12214 + }, + { + "epoch": 0.5167526863524833, + "grad_norm": 0.2506055533885956, + "learning_rate": 0.001, + "loss": 2.5039, + "step": 12215 + }, + { + "epoch": 0.5167949911159997, + "grad_norm": 0.19540032744407654, + "learning_rate": 0.001, + "loss": 1.7671, + "step": 12216 + }, + { + "epoch": 0.516837295879516, + "grad_norm": 0.174033522605896, + "learning_rate": 0.001, + "loss": 2.5238, + "step": 12217 + }, + { + "epoch": 0.5168796006430324, + "grad_norm": 0.1758839190006256, + "learning_rate": 0.001, + "loss": 1.882, + "step": 12218 + }, + { + "epoch": 0.5169219054065488, + "grad_norm": 0.1755172461271286, + "learning_rate": 0.001, + "loss": 2.0126, + "step": 12219 + }, + { + "epoch": 0.5169642101700651, + "grad_norm": 0.2143266797065735, + "learning_rate": 0.001, + "loss": 2.2329, + "step": 12220 + }, + { + "epoch": 0.5170065149335815, + "grad_norm": 0.20903651416301727, + "learning_rate": 0.001, + "loss": 2.5899, + "step": 12221 + }, + { + "epoch": 0.5170488196970979, + "grad_norm": 0.19313296675682068, + "learning_rate": 0.001, + "loss": 1.7994, + "step": 12222 + }, + { + "epoch": 0.5170911244606142, + "grad_norm": 0.1702556163072586, + "learning_rate": 0.001, + "loss": 1.8909, + "step": 12223 + }, + { + "epoch": 0.5171334292241306, + "grad_norm": 0.17211031913757324, + "learning_rate": 0.001, + "loss": 2.1869, + "step": 12224 + }, + { + "epoch": 0.517175733987647, + "grad_norm": 0.18440312147140503, + "learning_rate": 0.001, + "loss": 2.1057, + "step": 12225 + }, + { + "epoch": 0.5172180387511633, + "grad_norm": 0.18326327204704285, + "learning_rate": 0.001, + "loss": 2.9968, + "step": 12226 + }, + { + "epoch": 0.5172603435146798, + "grad_norm": 0.3492085933685303, + "learning_rate": 0.001, + "loss": 1.8782, + "step": 12227 + }, + { + "epoch": 0.5173026482781962, + "grad_norm": 0.20614147186279297, + "learning_rate": 0.001, + "loss": 1.6796, + "step": 12228 + }, + { + "epoch": 0.5173449530417125, + "grad_norm": 0.18441130220890045, + "learning_rate": 0.001, + "loss": 1.968, + "step": 12229 + }, + { + "epoch": 0.5173872578052289, + "grad_norm": 0.21896757185459137, + "learning_rate": 0.001, + "loss": 2.4805, + "step": 12230 + }, + { + "epoch": 0.5174295625687453, + "grad_norm": 0.3186761736869812, + "learning_rate": 0.001, + "loss": 2.0246, + "step": 12231 + }, + { + "epoch": 0.5174718673322616, + "grad_norm": 0.24206700921058655, + "learning_rate": 0.001, + "loss": 2.3907, + "step": 12232 + }, + { + "epoch": 0.517514172095778, + "grad_norm": 0.19111651182174683, + "learning_rate": 0.001, + "loss": 1.9669, + "step": 12233 + }, + { + "epoch": 0.5175564768592944, + "grad_norm": 0.5492585897445679, + "learning_rate": 0.001, + "loss": 1.8345, + "step": 12234 + }, + { + "epoch": 0.5175987816228107, + "grad_norm": 0.4502498209476471, + "learning_rate": 0.001, + "loss": 2.326, + "step": 12235 + }, + { + "epoch": 0.5176410863863271, + "grad_norm": 0.22445209324359894, + "learning_rate": 0.001, + "loss": 1.5903, + "step": 12236 + }, + { + "epoch": 0.5176833911498435, + "grad_norm": 0.24608655273914337, + "learning_rate": 0.001, + "loss": 3.3088, + "step": 12237 + }, + { + "epoch": 0.5177256959133598, + "grad_norm": 0.25341930985450745, + "learning_rate": 0.001, + "loss": 2.5506, + "step": 12238 + }, + { + "epoch": 0.5177680006768762, + "grad_norm": 0.18000063300132751, + "learning_rate": 0.001, + "loss": 2.4819, + "step": 12239 + }, + { + "epoch": 0.5178103054403926, + "grad_norm": 0.21232323348522186, + "learning_rate": 0.001, + "loss": 2.9278, + "step": 12240 + }, + { + "epoch": 0.5178526102039089, + "grad_norm": 0.14666417241096497, + "learning_rate": 0.001, + "loss": 2.0134, + "step": 12241 + }, + { + "epoch": 0.5178949149674253, + "grad_norm": 0.15804670751094818, + "learning_rate": 0.001, + "loss": 1.7012, + "step": 12242 + }, + { + "epoch": 0.5179372197309418, + "grad_norm": 0.25225335359573364, + "learning_rate": 0.001, + "loss": 2.1588, + "step": 12243 + }, + { + "epoch": 0.517979524494458, + "grad_norm": 0.18607112765312195, + "learning_rate": 0.001, + "loss": 1.8788, + "step": 12244 + }, + { + "epoch": 0.5180218292579745, + "grad_norm": 0.20208168029785156, + "learning_rate": 0.001, + "loss": 1.9131, + "step": 12245 + }, + { + "epoch": 0.5180641340214909, + "grad_norm": 1.9069957733154297, + "learning_rate": 0.001, + "loss": 1.4981, + "step": 12246 + }, + { + "epoch": 0.5181064387850072, + "grad_norm": 1.0007692575454712, + "learning_rate": 0.001, + "loss": 1.9453, + "step": 12247 + }, + { + "epoch": 0.5181487435485236, + "grad_norm": 0.1494317203760147, + "learning_rate": 0.001, + "loss": 1.5441, + "step": 12248 + }, + { + "epoch": 0.5181910483120399, + "grad_norm": 0.19041891396045685, + "learning_rate": 0.001, + "loss": 1.7273, + "step": 12249 + }, + { + "epoch": 0.5182333530755563, + "grad_norm": 0.4171985983848572, + "learning_rate": 0.001, + "loss": 1.4179, + "step": 12250 + }, + { + "epoch": 0.5182756578390727, + "grad_norm": 0.19176238775253296, + "learning_rate": 0.001, + "loss": 2.2115, + "step": 12251 + }, + { + "epoch": 0.518317962602589, + "grad_norm": 1.5357853174209595, + "learning_rate": 0.001, + "loss": 2.3413, + "step": 12252 + }, + { + "epoch": 0.5183602673661054, + "grad_norm": 0.19013294577598572, + "learning_rate": 0.001, + "loss": 2.2362, + "step": 12253 + }, + { + "epoch": 0.5184025721296218, + "grad_norm": 0.19564557075500488, + "learning_rate": 0.001, + "loss": 1.971, + "step": 12254 + }, + { + "epoch": 0.5184448768931381, + "grad_norm": 0.21926617622375488, + "learning_rate": 0.001, + "loss": 2.6159, + "step": 12255 + }, + { + "epoch": 0.5184871816566545, + "grad_norm": 0.17380297183990479, + "learning_rate": 0.001, + "loss": 2.8202, + "step": 12256 + }, + { + "epoch": 0.5185294864201709, + "grad_norm": 0.5753722190856934, + "learning_rate": 0.001, + "loss": 3.7983, + "step": 12257 + }, + { + "epoch": 0.5185717911836872, + "grad_norm": 0.25849559903144836, + "learning_rate": 0.001, + "loss": 2.8161, + "step": 12258 + }, + { + "epoch": 0.5186140959472036, + "grad_norm": 0.18519815802574158, + "learning_rate": 0.001, + "loss": 2.1676, + "step": 12259 + }, + { + "epoch": 0.5186564007107201, + "grad_norm": 0.19013738632202148, + "learning_rate": 0.001, + "loss": 1.5707, + "step": 12260 + }, + { + "epoch": 0.5186987054742364, + "grad_norm": 0.17344141006469727, + "learning_rate": 0.001, + "loss": 2.1209, + "step": 12261 + }, + { + "epoch": 0.5187410102377528, + "grad_norm": 1.2750663757324219, + "learning_rate": 0.001, + "loss": 1.9932, + "step": 12262 + }, + { + "epoch": 0.5187833150012692, + "grad_norm": 0.16777728497982025, + "learning_rate": 0.001, + "loss": 1.5308, + "step": 12263 + }, + { + "epoch": 0.5188256197647855, + "grad_norm": 0.1650819033384323, + "learning_rate": 0.001, + "loss": 2.7328, + "step": 12264 + }, + { + "epoch": 0.5188679245283019, + "grad_norm": 0.14448492228984833, + "learning_rate": 0.001, + "loss": 2.4757, + "step": 12265 + }, + { + "epoch": 0.5189102292918183, + "grad_norm": 0.18940666317939758, + "learning_rate": 0.001, + "loss": 1.5308, + "step": 12266 + }, + { + "epoch": 0.5189525340553346, + "grad_norm": 0.19672076404094696, + "learning_rate": 0.001, + "loss": 1.9485, + "step": 12267 + }, + { + "epoch": 0.518994838818851, + "grad_norm": 0.16662020981311798, + "learning_rate": 0.001, + "loss": 1.9421, + "step": 12268 + }, + { + "epoch": 0.5190371435823674, + "grad_norm": 0.20337186753749847, + "learning_rate": 0.001, + "loss": 2.6949, + "step": 12269 + }, + { + "epoch": 0.5190794483458837, + "grad_norm": 3.0193428993225098, + "learning_rate": 0.001, + "loss": 1.9591, + "step": 12270 + }, + { + "epoch": 0.5191217531094001, + "grad_norm": 0.2497682422399521, + "learning_rate": 0.001, + "loss": 2.3117, + "step": 12271 + }, + { + "epoch": 0.5191640578729165, + "grad_norm": 0.16508491337299347, + "learning_rate": 0.001, + "loss": 2.6591, + "step": 12272 + }, + { + "epoch": 0.5192063626364328, + "grad_norm": 0.17598336935043335, + "learning_rate": 0.001, + "loss": 1.634, + "step": 12273 + }, + { + "epoch": 0.5192486673999492, + "grad_norm": 0.22913537919521332, + "learning_rate": 0.001, + "loss": 1.7489, + "step": 12274 + }, + { + "epoch": 0.5192909721634656, + "grad_norm": 0.19798220694065094, + "learning_rate": 0.001, + "loss": 1.8771, + "step": 12275 + }, + { + "epoch": 0.519333276926982, + "grad_norm": 0.181406632065773, + "learning_rate": 0.001, + "loss": 1.9688, + "step": 12276 + }, + { + "epoch": 0.5193755816904984, + "grad_norm": 0.19419100880622864, + "learning_rate": 0.001, + "loss": 3.1894, + "step": 12277 + }, + { + "epoch": 0.5194178864540148, + "grad_norm": 0.17426498234272003, + "learning_rate": 0.001, + "loss": 2.2271, + "step": 12278 + }, + { + "epoch": 0.5194601912175311, + "grad_norm": 0.2602224349975586, + "learning_rate": 0.001, + "loss": 1.8576, + "step": 12279 + }, + { + "epoch": 0.5195024959810475, + "grad_norm": 0.16941271722316742, + "learning_rate": 0.001, + "loss": 2.652, + "step": 12280 + }, + { + "epoch": 0.5195448007445639, + "grad_norm": 2.4823379516601562, + "learning_rate": 0.001, + "loss": 3.1498, + "step": 12281 + }, + { + "epoch": 0.5195871055080802, + "grad_norm": 0.16280913352966309, + "learning_rate": 0.001, + "loss": 2.2317, + "step": 12282 + }, + { + "epoch": 0.5196294102715966, + "grad_norm": 0.2174593210220337, + "learning_rate": 0.001, + "loss": 2.4013, + "step": 12283 + }, + { + "epoch": 0.519671715035113, + "grad_norm": 0.2575055956840515, + "learning_rate": 0.001, + "loss": 3.4478, + "step": 12284 + }, + { + "epoch": 0.5197140197986293, + "grad_norm": 0.18891672790050507, + "learning_rate": 0.001, + "loss": 3.3149, + "step": 12285 + }, + { + "epoch": 0.5197563245621457, + "grad_norm": 0.17338892817497253, + "learning_rate": 0.001, + "loss": 1.7199, + "step": 12286 + }, + { + "epoch": 0.5197986293256621, + "grad_norm": 0.4391571283340454, + "learning_rate": 0.001, + "loss": 1.8649, + "step": 12287 + }, + { + "epoch": 0.5198409340891784, + "grad_norm": 0.17235347628593445, + "learning_rate": 0.001, + "loss": 2.0456, + "step": 12288 + }, + { + "epoch": 0.5198832388526948, + "grad_norm": 0.2375982701778412, + "learning_rate": 0.001, + "loss": 3.4219, + "step": 12289 + }, + { + "epoch": 0.5199255436162112, + "grad_norm": 0.201025128364563, + "learning_rate": 0.001, + "loss": 1.732, + "step": 12290 + }, + { + "epoch": 0.5199678483797275, + "grad_norm": 0.3689859211444855, + "learning_rate": 0.001, + "loss": 3.6554, + "step": 12291 + }, + { + "epoch": 0.520010153143244, + "grad_norm": 0.4115980267524719, + "learning_rate": 0.001, + "loss": 3.1115, + "step": 12292 + }, + { + "epoch": 0.5200524579067602, + "grad_norm": 0.20673608779907227, + "learning_rate": 0.001, + "loss": 2.0847, + "step": 12293 + }, + { + "epoch": 0.5200947626702767, + "grad_norm": 0.18936006724834442, + "learning_rate": 0.001, + "loss": 1.8575, + "step": 12294 + }, + { + "epoch": 0.5201370674337931, + "grad_norm": 0.18387524783611298, + "learning_rate": 0.001, + "loss": 2.73, + "step": 12295 + }, + { + "epoch": 0.5201793721973094, + "grad_norm": 0.1719491332769394, + "learning_rate": 0.001, + "loss": 2.1344, + "step": 12296 + }, + { + "epoch": 0.5202216769608258, + "grad_norm": 0.18453410267829895, + "learning_rate": 0.001, + "loss": 1.9734, + "step": 12297 + }, + { + "epoch": 0.5202639817243422, + "grad_norm": 0.1657096892595291, + "learning_rate": 0.001, + "loss": 2.1985, + "step": 12298 + }, + { + "epoch": 0.5203062864878585, + "grad_norm": 0.16189830005168915, + "learning_rate": 0.001, + "loss": 1.5063, + "step": 12299 + }, + { + "epoch": 0.5203485912513749, + "grad_norm": 1.7175111770629883, + "learning_rate": 0.001, + "loss": 2.564, + "step": 12300 + }, + { + "epoch": 0.5203908960148913, + "grad_norm": 0.5387628674507141, + "learning_rate": 0.001, + "loss": 2.2738, + "step": 12301 + }, + { + "epoch": 0.5204332007784076, + "grad_norm": 0.16023603081703186, + "learning_rate": 0.001, + "loss": 2.5196, + "step": 12302 + }, + { + "epoch": 0.520475505541924, + "grad_norm": 0.21019423007965088, + "learning_rate": 0.001, + "loss": 1.763, + "step": 12303 + }, + { + "epoch": 0.5205178103054404, + "grad_norm": 0.5185931921005249, + "learning_rate": 0.001, + "loss": 1.9254, + "step": 12304 + }, + { + "epoch": 0.5205601150689567, + "grad_norm": 0.16839724779129028, + "learning_rate": 0.001, + "loss": 1.5031, + "step": 12305 + }, + { + "epoch": 0.5206024198324731, + "grad_norm": 0.22643537819385529, + "learning_rate": 0.001, + "loss": 2.4948, + "step": 12306 + }, + { + "epoch": 0.5206447245959895, + "grad_norm": 0.40977147221565247, + "learning_rate": 0.001, + "loss": 2.2601, + "step": 12307 + }, + { + "epoch": 0.5206870293595058, + "grad_norm": 0.1929093897342682, + "learning_rate": 0.001, + "loss": 1.6832, + "step": 12308 + }, + { + "epoch": 0.5207293341230222, + "grad_norm": 0.1705029159784317, + "learning_rate": 0.001, + "loss": 2.3835, + "step": 12309 + }, + { + "epoch": 0.5207716388865387, + "grad_norm": 0.17721432447433472, + "learning_rate": 0.001, + "loss": 2.0197, + "step": 12310 + }, + { + "epoch": 0.520813943650055, + "grad_norm": 0.17962828278541565, + "learning_rate": 0.001, + "loss": 2.0358, + "step": 12311 + }, + { + "epoch": 0.5208562484135714, + "grad_norm": 0.17918899655342102, + "learning_rate": 0.001, + "loss": 2.6547, + "step": 12312 + }, + { + "epoch": 0.5208985531770878, + "grad_norm": 0.18718057870864868, + "learning_rate": 0.001, + "loss": 2.2771, + "step": 12313 + }, + { + "epoch": 0.5209408579406041, + "grad_norm": 0.21974588930606842, + "learning_rate": 0.001, + "loss": 1.9895, + "step": 12314 + }, + { + "epoch": 0.5209831627041205, + "grad_norm": 0.2035195678472519, + "learning_rate": 0.001, + "loss": 2.0884, + "step": 12315 + }, + { + "epoch": 0.5210254674676369, + "grad_norm": 0.8761468529701233, + "learning_rate": 0.001, + "loss": 3.3274, + "step": 12316 + }, + { + "epoch": 0.5210677722311532, + "grad_norm": 0.20555326342582703, + "learning_rate": 0.001, + "loss": 2.0817, + "step": 12317 + }, + { + "epoch": 0.5211100769946696, + "grad_norm": 0.18637920916080475, + "learning_rate": 0.001, + "loss": 2.6589, + "step": 12318 + }, + { + "epoch": 0.521152381758186, + "grad_norm": 0.1818656623363495, + "learning_rate": 0.001, + "loss": 2.0427, + "step": 12319 + }, + { + "epoch": 0.5211946865217023, + "grad_norm": 2.5001955032348633, + "learning_rate": 0.001, + "loss": 1.729, + "step": 12320 + }, + { + "epoch": 0.5212369912852187, + "grad_norm": 0.3406553566455841, + "learning_rate": 0.001, + "loss": 1.5588, + "step": 12321 + }, + { + "epoch": 0.5212792960487351, + "grad_norm": 0.2558114528656006, + "learning_rate": 0.001, + "loss": 2.0768, + "step": 12322 + }, + { + "epoch": 0.5213216008122514, + "grad_norm": 7.105432033538818, + "learning_rate": 0.001, + "loss": 2.6181, + "step": 12323 + }, + { + "epoch": 0.5213639055757678, + "grad_norm": 0.18078719079494476, + "learning_rate": 0.001, + "loss": 2.1346, + "step": 12324 + }, + { + "epoch": 0.5214062103392842, + "grad_norm": 0.2121170312166214, + "learning_rate": 0.001, + "loss": 2.2811, + "step": 12325 + }, + { + "epoch": 0.5214485151028005, + "grad_norm": 0.21996238827705383, + "learning_rate": 0.001, + "loss": 2.6754, + "step": 12326 + }, + { + "epoch": 0.521490819866317, + "grad_norm": 0.1870933473110199, + "learning_rate": 0.001, + "loss": 2.6883, + "step": 12327 + }, + { + "epoch": 0.5215331246298334, + "grad_norm": 0.2098461240530014, + "learning_rate": 0.001, + "loss": 2.1624, + "step": 12328 + }, + { + "epoch": 0.5215754293933497, + "grad_norm": 0.1886197030544281, + "learning_rate": 0.001, + "loss": 1.961, + "step": 12329 + }, + { + "epoch": 0.5216177341568661, + "grad_norm": 0.2189781665802002, + "learning_rate": 0.001, + "loss": 2.2462, + "step": 12330 + }, + { + "epoch": 0.5216600389203825, + "grad_norm": 0.1763230264186859, + "learning_rate": 0.001, + "loss": 2.1573, + "step": 12331 + }, + { + "epoch": 0.5217023436838988, + "grad_norm": 1.2748078107833862, + "learning_rate": 0.001, + "loss": 2.7384, + "step": 12332 + }, + { + "epoch": 0.5217446484474152, + "grad_norm": 0.18424801528453827, + "learning_rate": 0.001, + "loss": 2.4212, + "step": 12333 + }, + { + "epoch": 0.5217869532109316, + "grad_norm": 0.8379740715026855, + "learning_rate": 0.001, + "loss": 2.8512, + "step": 12334 + }, + { + "epoch": 0.5218292579744479, + "grad_norm": 0.19673675298690796, + "learning_rate": 0.001, + "loss": 1.5325, + "step": 12335 + }, + { + "epoch": 0.5218715627379643, + "grad_norm": 1.2396926879882812, + "learning_rate": 0.001, + "loss": 3.2144, + "step": 12336 + }, + { + "epoch": 0.5219138675014807, + "grad_norm": 0.1718030571937561, + "learning_rate": 0.001, + "loss": 2.4473, + "step": 12337 + }, + { + "epoch": 0.521956172264997, + "grad_norm": 0.1788032203912735, + "learning_rate": 0.001, + "loss": 3.1653, + "step": 12338 + }, + { + "epoch": 0.5219984770285134, + "grad_norm": 0.17052021622657776, + "learning_rate": 0.001, + "loss": 3.2152, + "step": 12339 + }, + { + "epoch": 0.5220407817920297, + "grad_norm": 2.6638343334198, + "learning_rate": 0.001, + "loss": 2.5477, + "step": 12340 + }, + { + "epoch": 0.5220830865555461, + "grad_norm": 0.20238353312015533, + "learning_rate": 0.001, + "loss": 2.1136, + "step": 12341 + }, + { + "epoch": 0.5221253913190625, + "grad_norm": 0.17800480127334595, + "learning_rate": 0.001, + "loss": 1.9329, + "step": 12342 + }, + { + "epoch": 0.5221676960825788, + "grad_norm": 0.1630057841539383, + "learning_rate": 0.001, + "loss": 2.7349, + "step": 12343 + }, + { + "epoch": 0.5222100008460953, + "grad_norm": 0.20257359743118286, + "learning_rate": 0.001, + "loss": 2.3163, + "step": 12344 + }, + { + "epoch": 0.5222523056096117, + "grad_norm": 0.2002708911895752, + "learning_rate": 0.001, + "loss": 1.8127, + "step": 12345 + }, + { + "epoch": 0.522294610373128, + "grad_norm": 0.6013284921646118, + "learning_rate": 0.001, + "loss": 1.9079, + "step": 12346 + }, + { + "epoch": 0.5223369151366444, + "grad_norm": 0.1917499452829361, + "learning_rate": 0.001, + "loss": 2.3516, + "step": 12347 + }, + { + "epoch": 0.5223792199001608, + "grad_norm": 0.27086690068244934, + "learning_rate": 0.001, + "loss": 1.9564, + "step": 12348 + }, + { + "epoch": 0.5224215246636771, + "grad_norm": 0.22160665690898895, + "learning_rate": 0.001, + "loss": 2.3388, + "step": 12349 + }, + { + "epoch": 0.5224638294271935, + "grad_norm": 0.17383019626140594, + "learning_rate": 0.001, + "loss": 2.104, + "step": 12350 + }, + { + "epoch": 0.5225061341907099, + "grad_norm": 12.725953102111816, + "learning_rate": 0.001, + "loss": 1.8924, + "step": 12351 + }, + { + "epoch": 0.5225484389542262, + "grad_norm": 0.20004230737686157, + "learning_rate": 0.001, + "loss": 1.8242, + "step": 12352 + }, + { + "epoch": 0.5225907437177426, + "grad_norm": 0.20954293012619019, + "learning_rate": 0.001, + "loss": 2.2262, + "step": 12353 + }, + { + "epoch": 0.522633048481259, + "grad_norm": 0.2141389548778534, + "learning_rate": 0.001, + "loss": 2.7329, + "step": 12354 + }, + { + "epoch": 0.5226753532447753, + "grad_norm": 0.21340107917785645, + "learning_rate": 0.001, + "loss": 2.121, + "step": 12355 + }, + { + "epoch": 0.5227176580082917, + "grad_norm": 0.17848850786685944, + "learning_rate": 0.001, + "loss": 1.7691, + "step": 12356 + }, + { + "epoch": 0.5227599627718081, + "grad_norm": 0.21579593420028687, + "learning_rate": 0.001, + "loss": 4.1183, + "step": 12357 + }, + { + "epoch": 0.5228022675353244, + "grad_norm": 0.3494706451892853, + "learning_rate": 0.001, + "loss": 2.0613, + "step": 12358 + }, + { + "epoch": 0.5228445722988408, + "grad_norm": 0.1930229514837265, + "learning_rate": 0.001, + "loss": 1.9758, + "step": 12359 + }, + { + "epoch": 0.5228868770623573, + "grad_norm": 0.18557684123516083, + "learning_rate": 0.001, + "loss": 1.7799, + "step": 12360 + }, + { + "epoch": 0.5229291818258736, + "grad_norm": 0.6549309492111206, + "learning_rate": 0.001, + "loss": 2.3321, + "step": 12361 + }, + { + "epoch": 0.52297148658939, + "grad_norm": 0.4236035645008087, + "learning_rate": 0.001, + "loss": 1.793, + "step": 12362 + }, + { + "epoch": 0.5230137913529064, + "grad_norm": 0.21107986569404602, + "learning_rate": 0.001, + "loss": 2.1026, + "step": 12363 + }, + { + "epoch": 0.5230560961164227, + "grad_norm": 0.2254447042942047, + "learning_rate": 0.001, + "loss": 3.0301, + "step": 12364 + }, + { + "epoch": 0.5230984008799391, + "grad_norm": 0.19465897977352142, + "learning_rate": 0.001, + "loss": 2.5741, + "step": 12365 + }, + { + "epoch": 0.5231407056434555, + "grad_norm": 0.2160634696483612, + "learning_rate": 0.001, + "loss": 2.5059, + "step": 12366 + }, + { + "epoch": 0.5231830104069718, + "grad_norm": 0.20978963375091553, + "learning_rate": 0.001, + "loss": 2.9964, + "step": 12367 + }, + { + "epoch": 0.5232253151704882, + "grad_norm": 0.22652214765548706, + "learning_rate": 0.001, + "loss": 2.0517, + "step": 12368 + }, + { + "epoch": 0.5232676199340046, + "grad_norm": 0.20549549162387848, + "learning_rate": 0.001, + "loss": 2.668, + "step": 12369 + }, + { + "epoch": 0.5233099246975209, + "grad_norm": 0.16967639327049255, + "learning_rate": 0.001, + "loss": 2.1324, + "step": 12370 + }, + { + "epoch": 0.5233522294610373, + "grad_norm": 0.21468955278396606, + "learning_rate": 0.001, + "loss": 2.0383, + "step": 12371 + }, + { + "epoch": 0.5233945342245537, + "grad_norm": 0.1573602259159088, + "learning_rate": 0.001, + "loss": 1.9794, + "step": 12372 + }, + { + "epoch": 0.52343683898807, + "grad_norm": 0.8761491775512695, + "learning_rate": 0.001, + "loss": 2.5732, + "step": 12373 + }, + { + "epoch": 0.5234791437515864, + "grad_norm": 0.3122228980064392, + "learning_rate": 0.001, + "loss": 3.2694, + "step": 12374 + }, + { + "epoch": 0.5235214485151028, + "grad_norm": 0.19162549078464508, + "learning_rate": 0.001, + "loss": 2.2557, + "step": 12375 + }, + { + "epoch": 0.5235637532786191, + "grad_norm": 0.22822991013526917, + "learning_rate": 0.001, + "loss": 2.1365, + "step": 12376 + }, + { + "epoch": 0.5236060580421356, + "grad_norm": 0.16961225867271423, + "learning_rate": 0.001, + "loss": 1.7234, + "step": 12377 + }, + { + "epoch": 0.523648362805652, + "grad_norm": 0.1936628669500351, + "learning_rate": 0.001, + "loss": 1.4112, + "step": 12378 + }, + { + "epoch": 0.5236906675691683, + "grad_norm": 0.1944187730550766, + "learning_rate": 0.001, + "loss": 1.7775, + "step": 12379 + }, + { + "epoch": 0.5237329723326847, + "grad_norm": 0.2097603678703308, + "learning_rate": 0.001, + "loss": 2.2723, + "step": 12380 + }, + { + "epoch": 0.5237752770962011, + "grad_norm": 0.4582602381706238, + "learning_rate": 0.001, + "loss": 2.1067, + "step": 12381 + }, + { + "epoch": 0.5238175818597174, + "grad_norm": 0.15353432297706604, + "learning_rate": 0.001, + "loss": 2.1572, + "step": 12382 + }, + { + "epoch": 0.5238598866232338, + "grad_norm": 0.2179485708475113, + "learning_rate": 0.001, + "loss": 2.1966, + "step": 12383 + }, + { + "epoch": 0.5239021913867501, + "grad_norm": 0.2067425549030304, + "learning_rate": 0.001, + "loss": 1.6399, + "step": 12384 + }, + { + "epoch": 0.5239444961502665, + "grad_norm": 0.19505983591079712, + "learning_rate": 0.001, + "loss": 2.661, + "step": 12385 + }, + { + "epoch": 0.5239868009137829, + "grad_norm": 0.17970894277095795, + "learning_rate": 0.001, + "loss": 1.9452, + "step": 12386 + }, + { + "epoch": 0.5240291056772992, + "grad_norm": 1.098179817199707, + "learning_rate": 0.001, + "loss": 2.2812, + "step": 12387 + }, + { + "epoch": 0.5240714104408156, + "grad_norm": 6.329110622406006, + "learning_rate": 0.001, + "loss": 3.141, + "step": 12388 + }, + { + "epoch": 0.524113715204332, + "grad_norm": 0.18318776786327362, + "learning_rate": 0.001, + "loss": 2.4143, + "step": 12389 + }, + { + "epoch": 0.5241560199678483, + "grad_norm": 0.22181005775928497, + "learning_rate": 0.001, + "loss": 2.1807, + "step": 12390 + }, + { + "epoch": 0.5241983247313647, + "grad_norm": 0.17904549837112427, + "learning_rate": 0.001, + "loss": 1.9226, + "step": 12391 + }, + { + "epoch": 0.5242406294948811, + "grad_norm": 0.21022500097751617, + "learning_rate": 0.001, + "loss": 2.4073, + "step": 12392 + }, + { + "epoch": 0.5242829342583974, + "grad_norm": 0.17983204126358032, + "learning_rate": 0.001, + "loss": 2.1061, + "step": 12393 + }, + { + "epoch": 0.5243252390219139, + "grad_norm": 0.29281267523765564, + "learning_rate": 0.001, + "loss": 3.1756, + "step": 12394 + }, + { + "epoch": 0.5243675437854303, + "grad_norm": 1.9023278951644897, + "learning_rate": 0.001, + "loss": 2.1109, + "step": 12395 + }, + { + "epoch": 0.5244098485489466, + "grad_norm": 0.2698245942592621, + "learning_rate": 0.001, + "loss": 1.7295, + "step": 12396 + }, + { + "epoch": 0.524452153312463, + "grad_norm": 0.22318899631500244, + "learning_rate": 0.001, + "loss": 2.1475, + "step": 12397 + }, + { + "epoch": 0.5244944580759794, + "grad_norm": 0.1895742565393448, + "learning_rate": 0.001, + "loss": 2.3827, + "step": 12398 + }, + { + "epoch": 0.5245367628394957, + "grad_norm": 0.22232870757579803, + "learning_rate": 0.001, + "loss": 2.3418, + "step": 12399 + }, + { + "epoch": 0.5245790676030121, + "grad_norm": 0.2738935947418213, + "learning_rate": 0.001, + "loss": 4.0447, + "step": 12400 + }, + { + "epoch": 0.5246213723665285, + "grad_norm": 0.18201978504657745, + "learning_rate": 0.001, + "loss": 2.4426, + "step": 12401 + }, + { + "epoch": 0.5246636771300448, + "grad_norm": 0.26819124817848206, + "learning_rate": 0.001, + "loss": 2.2201, + "step": 12402 + }, + { + "epoch": 0.5247059818935612, + "grad_norm": 0.40238019824028015, + "learning_rate": 0.001, + "loss": 1.9465, + "step": 12403 + }, + { + "epoch": 0.5247482866570776, + "grad_norm": 0.18625056743621826, + "learning_rate": 0.001, + "loss": 2.6789, + "step": 12404 + }, + { + "epoch": 0.5247905914205939, + "grad_norm": 0.19995659589767456, + "learning_rate": 0.001, + "loss": 1.4283, + "step": 12405 + }, + { + "epoch": 0.5248328961841103, + "grad_norm": 0.19528824090957642, + "learning_rate": 0.001, + "loss": 1.6882, + "step": 12406 + }, + { + "epoch": 0.5248752009476267, + "grad_norm": 0.17695745825767517, + "learning_rate": 0.001, + "loss": 1.8934, + "step": 12407 + }, + { + "epoch": 0.524917505711143, + "grad_norm": 0.1534528285264969, + "learning_rate": 0.001, + "loss": 1.7225, + "step": 12408 + }, + { + "epoch": 0.5249598104746594, + "grad_norm": 0.21268439292907715, + "learning_rate": 0.001, + "loss": 2.0355, + "step": 12409 + }, + { + "epoch": 0.5250021152381759, + "grad_norm": 0.18775634467601776, + "learning_rate": 0.001, + "loss": 2.0029, + "step": 12410 + }, + { + "epoch": 0.5250444200016922, + "grad_norm": 1.877071738243103, + "learning_rate": 0.001, + "loss": 2.0333, + "step": 12411 + }, + { + "epoch": 0.5250867247652086, + "grad_norm": 0.19529949128627777, + "learning_rate": 0.001, + "loss": 2.5546, + "step": 12412 + }, + { + "epoch": 0.525129029528725, + "grad_norm": 0.16856685280799866, + "learning_rate": 0.001, + "loss": 2.258, + "step": 12413 + }, + { + "epoch": 0.5251713342922413, + "grad_norm": 0.16465047001838684, + "learning_rate": 0.001, + "loss": 2.2241, + "step": 12414 + }, + { + "epoch": 0.5252136390557577, + "grad_norm": 0.16923588514328003, + "learning_rate": 0.001, + "loss": 1.7148, + "step": 12415 + }, + { + "epoch": 0.5252559438192741, + "grad_norm": 0.15065397322177887, + "learning_rate": 0.001, + "loss": 1.7362, + "step": 12416 + }, + { + "epoch": 0.5252982485827904, + "grad_norm": 0.18638429045677185, + "learning_rate": 0.001, + "loss": 2.3777, + "step": 12417 + }, + { + "epoch": 0.5253405533463068, + "grad_norm": 0.16178961098194122, + "learning_rate": 0.001, + "loss": 2.0762, + "step": 12418 + }, + { + "epoch": 0.5253828581098232, + "grad_norm": 0.1650550216436386, + "learning_rate": 0.001, + "loss": 2.1552, + "step": 12419 + }, + { + "epoch": 0.5254251628733395, + "grad_norm": 1.2916520833969116, + "learning_rate": 0.001, + "loss": 3.1018, + "step": 12420 + }, + { + "epoch": 0.5254674676368559, + "grad_norm": 0.17819340527057648, + "learning_rate": 0.001, + "loss": 2.2599, + "step": 12421 + }, + { + "epoch": 0.5255097724003723, + "grad_norm": 0.18574517965316772, + "learning_rate": 0.001, + "loss": 2.1839, + "step": 12422 + }, + { + "epoch": 0.5255520771638886, + "grad_norm": 0.15550824999809265, + "learning_rate": 0.001, + "loss": 1.4498, + "step": 12423 + }, + { + "epoch": 0.525594381927405, + "grad_norm": 0.2876248061656952, + "learning_rate": 0.001, + "loss": 2.4418, + "step": 12424 + }, + { + "epoch": 0.5256366866909215, + "grad_norm": 0.5879155397415161, + "learning_rate": 0.001, + "loss": 2.641, + "step": 12425 + }, + { + "epoch": 0.5256789914544377, + "grad_norm": 0.29421496391296387, + "learning_rate": 0.001, + "loss": 3.3343, + "step": 12426 + }, + { + "epoch": 0.5257212962179542, + "grad_norm": 0.17267128825187683, + "learning_rate": 0.001, + "loss": 1.9214, + "step": 12427 + }, + { + "epoch": 0.5257636009814705, + "grad_norm": 0.24222494661808014, + "learning_rate": 0.001, + "loss": 2.4036, + "step": 12428 + }, + { + "epoch": 0.5258059057449869, + "grad_norm": 0.171603262424469, + "learning_rate": 0.001, + "loss": 2.1059, + "step": 12429 + }, + { + "epoch": 0.5258482105085033, + "grad_norm": 0.2517464756965637, + "learning_rate": 0.001, + "loss": 2.3314, + "step": 12430 + }, + { + "epoch": 0.5258905152720196, + "grad_norm": 0.19529391825199127, + "learning_rate": 0.001, + "loss": 2.3075, + "step": 12431 + }, + { + "epoch": 0.525932820035536, + "grad_norm": 3.9220962524414062, + "learning_rate": 0.001, + "loss": 2.0059, + "step": 12432 + }, + { + "epoch": 0.5259751247990524, + "grad_norm": 0.2157333493232727, + "learning_rate": 0.001, + "loss": 2.0792, + "step": 12433 + }, + { + "epoch": 0.5260174295625687, + "grad_norm": 0.7744253277778625, + "learning_rate": 0.001, + "loss": 1.6057, + "step": 12434 + }, + { + "epoch": 0.5260597343260851, + "grad_norm": 0.18592911958694458, + "learning_rate": 0.001, + "loss": 1.5647, + "step": 12435 + }, + { + "epoch": 0.5261020390896015, + "grad_norm": 0.15915216505527496, + "learning_rate": 0.001, + "loss": 1.7973, + "step": 12436 + }, + { + "epoch": 0.5261443438531178, + "grad_norm": 26.635772705078125, + "learning_rate": 0.001, + "loss": 2.6575, + "step": 12437 + }, + { + "epoch": 0.5261866486166342, + "grad_norm": 0.17832812666893005, + "learning_rate": 0.001, + "loss": 1.701, + "step": 12438 + }, + { + "epoch": 0.5262289533801506, + "grad_norm": 0.3404991328716278, + "learning_rate": 0.001, + "loss": 1.7622, + "step": 12439 + }, + { + "epoch": 0.5262712581436669, + "grad_norm": 0.17686522006988525, + "learning_rate": 0.001, + "loss": 1.5594, + "step": 12440 + }, + { + "epoch": 0.5263135629071833, + "grad_norm": 0.17130149900913239, + "learning_rate": 0.001, + "loss": 2.038, + "step": 12441 + }, + { + "epoch": 0.5263558676706998, + "grad_norm": 0.7331603169441223, + "learning_rate": 0.001, + "loss": 1.3103, + "step": 12442 + }, + { + "epoch": 0.526398172434216, + "grad_norm": 0.1968250870704651, + "learning_rate": 0.001, + "loss": 1.8041, + "step": 12443 + }, + { + "epoch": 0.5264404771977325, + "grad_norm": 0.2132941633462906, + "learning_rate": 0.001, + "loss": 1.9928, + "step": 12444 + }, + { + "epoch": 0.5264827819612489, + "grad_norm": 0.36532631516456604, + "learning_rate": 0.001, + "loss": 2.7363, + "step": 12445 + }, + { + "epoch": 0.5265250867247652, + "grad_norm": 0.2068646103143692, + "learning_rate": 0.001, + "loss": 1.6601, + "step": 12446 + }, + { + "epoch": 0.5265673914882816, + "grad_norm": 0.20064789056777954, + "learning_rate": 0.001, + "loss": 2.081, + "step": 12447 + }, + { + "epoch": 0.526609696251798, + "grad_norm": 0.23239631950855255, + "learning_rate": 0.001, + "loss": 1.8456, + "step": 12448 + }, + { + "epoch": 0.5266520010153143, + "grad_norm": 0.186894029378891, + "learning_rate": 0.001, + "loss": 2.5725, + "step": 12449 + }, + { + "epoch": 0.5266943057788307, + "grad_norm": 0.16730260848999023, + "learning_rate": 0.001, + "loss": 1.9904, + "step": 12450 + }, + { + "epoch": 0.5267366105423471, + "grad_norm": 0.2060299515724182, + "learning_rate": 0.001, + "loss": 2.014, + "step": 12451 + }, + { + "epoch": 0.5267789153058634, + "grad_norm": 0.5314529538154602, + "learning_rate": 0.001, + "loss": 2.1433, + "step": 12452 + }, + { + "epoch": 0.5268212200693798, + "grad_norm": 0.18238548934459686, + "learning_rate": 0.001, + "loss": 1.9902, + "step": 12453 + }, + { + "epoch": 0.5268635248328962, + "grad_norm": 0.235755056142807, + "learning_rate": 0.001, + "loss": 1.8839, + "step": 12454 + }, + { + "epoch": 0.5269058295964125, + "grad_norm": 0.19867976009845734, + "learning_rate": 0.001, + "loss": 3.061, + "step": 12455 + }, + { + "epoch": 0.5269481343599289, + "grad_norm": 0.1549888551235199, + "learning_rate": 0.001, + "loss": 2.0597, + "step": 12456 + }, + { + "epoch": 0.5269904391234453, + "grad_norm": 0.1728786826133728, + "learning_rate": 0.001, + "loss": 2.26, + "step": 12457 + }, + { + "epoch": 0.5270327438869616, + "grad_norm": 0.20742489397525787, + "learning_rate": 0.001, + "loss": 2.246, + "step": 12458 + }, + { + "epoch": 0.527075048650478, + "grad_norm": 0.15079474449157715, + "learning_rate": 0.001, + "loss": 1.3225, + "step": 12459 + }, + { + "epoch": 0.5271173534139945, + "grad_norm": 0.18330729007720947, + "learning_rate": 0.001, + "loss": 1.6184, + "step": 12460 + }, + { + "epoch": 0.5271596581775108, + "grad_norm": 0.17771340906620026, + "learning_rate": 0.001, + "loss": 2.4973, + "step": 12461 + }, + { + "epoch": 0.5272019629410272, + "grad_norm": 6.8827996253967285, + "learning_rate": 0.001, + "loss": 2.6216, + "step": 12462 + }, + { + "epoch": 0.5272442677045436, + "grad_norm": 0.1736050844192505, + "learning_rate": 0.001, + "loss": 1.9857, + "step": 12463 + }, + { + "epoch": 0.5272865724680599, + "grad_norm": 0.17495718598365784, + "learning_rate": 0.001, + "loss": 1.9657, + "step": 12464 + }, + { + "epoch": 0.5273288772315763, + "grad_norm": 0.16420039534568787, + "learning_rate": 0.001, + "loss": 1.7354, + "step": 12465 + }, + { + "epoch": 0.5273711819950927, + "grad_norm": 0.4817570447921753, + "learning_rate": 0.001, + "loss": 1.7285, + "step": 12466 + }, + { + "epoch": 0.527413486758609, + "grad_norm": 0.17338165640830994, + "learning_rate": 0.001, + "loss": 2.617, + "step": 12467 + }, + { + "epoch": 0.5274557915221254, + "grad_norm": 0.19620858132839203, + "learning_rate": 0.001, + "loss": 3.0908, + "step": 12468 + }, + { + "epoch": 0.5274980962856418, + "grad_norm": 0.2547888457775116, + "learning_rate": 0.001, + "loss": 2.1735, + "step": 12469 + }, + { + "epoch": 0.5275404010491581, + "grad_norm": 0.20097443461418152, + "learning_rate": 0.001, + "loss": 2.0749, + "step": 12470 + }, + { + "epoch": 0.5275827058126745, + "grad_norm": 0.16553282737731934, + "learning_rate": 0.001, + "loss": 2.3331, + "step": 12471 + }, + { + "epoch": 0.5276250105761909, + "grad_norm": 0.3828006982803345, + "learning_rate": 0.001, + "loss": 1.2995, + "step": 12472 + }, + { + "epoch": 0.5276673153397072, + "grad_norm": 0.1714460253715515, + "learning_rate": 0.001, + "loss": 1.5497, + "step": 12473 + }, + { + "epoch": 0.5277096201032236, + "grad_norm": 0.38405609130859375, + "learning_rate": 0.001, + "loss": 2.9096, + "step": 12474 + }, + { + "epoch": 0.5277519248667399, + "grad_norm": 0.19936303794384003, + "learning_rate": 0.001, + "loss": 1.9383, + "step": 12475 + }, + { + "epoch": 0.5277942296302564, + "grad_norm": 0.20733779668807983, + "learning_rate": 0.001, + "loss": 2.1244, + "step": 12476 + }, + { + "epoch": 0.5278365343937728, + "grad_norm": 0.19483071565628052, + "learning_rate": 0.001, + "loss": 2.6053, + "step": 12477 + }, + { + "epoch": 0.5278788391572891, + "grad_norm": 0.1924906224012375, + "learning_rate": 0.001, + "loss": 2.3413, + "step": 12478 + }, + { + "epoch": 0.5279211439208055, + "grad_norm": 0.15672799944877625, + "learning_rate": 0.001, + "loss": 2.4487, + "step": 12479 + }, + { + "epoch": 0.5279634486843219, + "grad_norm": 4.736910820007324, + "learning_rate": 0.001, + "loss": 1.852, + "step": 12480 + }, + { + "epoch": 0.5280057534478382, + "grad_norm": 0.17773562669754028, + "learning_rate": 0.001, + "loss": 2.2705, + "step": 12481 + }, + { + "epoch": 0.5280480582113546, + "grad_norm": 0.17934168875217438, + "learning_rate": 0.001, + "loss": 2.0574, + "step": 12482 + }, + { + "epoch": 0.528090362974871, + "grad_norm": 0.5093991160392761, + "learning_rate": 0.001, + "loss": 3.1038, + "step": 12483 + }, + { + "epoch": 0.5281326677383873, + "grad_norm": 0.18139603734016418, + "learning_rate": 0.001, + "loss": 1.9695, + "step": 12484 + }, + { + "epoch": 0.5281749725019037, + "grad_norm": 0.17985884845256805, + "learning_rate": 0.001, + "loss": 2.5194, + "step": 12485 + }, + { + "epoch": 0.5282172772654201, + "grad_norm": 0.3463243842124939, + "learning_rate": 0.001, + "loss": 2.4281, + "step": 12486 + }, + { + "epoch": 0.5282595820289364, + "grad_norm": 0.21633853018283844, + "learning_rate": 0.001, + "loss": 3.2592, + "step": 12487 + }, + { + "epoch": 0.5283018867924528, + "grad_norm": 0.16759130358695984, + "learning_rate": 0.001, + "loss": 2.6158, + "step": 12488 + }, + { + "epoch": 0.5283441915559692, + "grad_norm": 0.20517773926258087, + "learning_rate": 0.001, + "loss": 1.9567, + "step": 12489 + }, + { + "epoch": 0.5283864963194855, + "grad_norm": 1.2722563743591309, + "learning_rate": 0.001, + "loss": 2.1083, + "step": 12490 + }, + { + "epoch": 0.5284288010830019, + "grad_norm": 0.21564523875713348, + "learning_rate": 0.001, + "loss": 2.731, + "step": 12491 + }, + { + "epoch": 0.5284711058465184, + "grad_norm": 0.18117859959602356, + "learning_rate": 0.001, + "loss": 1.7917, + "step": 12492 + }, + { + "epoch": 0.5285134106100347, + "grad_norm": 0.18480683863162994, + "learning_rate": 0.001, + "loss": 1.7639, + "step": 12493 + }, + { + "epoch": 0.5285557153735511, + "grad_norm": 0.18521404266357422, + "learning_rate": 0.001, + "loss": 1.7627, + "step": 12494 + }, + { + "epoch": 0.5285980201370675, + "grad_norm": 0.15031592547893524, + "learning_rate": 0.001, + "loss": 1.4082, + "step": 12495 + }, + { + "epoch": 0.5286403249005838, + "grad_norm": 0.2322620302438736, + "learning_rate": 0.001, + "loss": 2.1222, + "step": 12496 + }, + { + "epoch": 0.5286826296641002, + "grad_norm": 0.21710190176963806, + "learning_rate": 0.001, + "loss": 2.4748, + "step": 12497 + }, + { + "epoch": 0.5287249344276166, + "grad_norm": 2.9545514583587646, + "learning_rate": 0.001, + "loss": 2.2093, + "step": 12498 + }, + { + "epoch": 0.5287672391911329, + "grad_norm": 0.20980048179626465, + "learning_rate": 0.001, + "loss": 2.3927, + "step": 12499 + }, + { + "epoch": 0.5288095439546493, + "grad_norm": 0.2512090802192688, + "learning_rate": 0.001, + "loss": 1.5181, + "step": 12500 + }, + { + "epoch": 0.5288518487181657, + "grad_norm": 0.19579851627349854, + "learning_rate": 0.001, + "loss": 2.7172, + "step": 12501 + }, + { + "epoch": 0.528894153481682, + "grad_norm": 0.16972453892230988, + "learning_rate": 0.001, + "loss": 2.4574, + "step": 12502 + }, + { + "epoch": 0.5289364582451984, + "grad_norm": 2.3002378940582275, + "learning_rate": 0.001, + "loss": 2.8155, + "step": 12503 + }, + { + "epoch": 0.5289787630087148, + "grad_norm": 0.15065890550613403, + "learning_rate": 0.001, + "loss": 1.7237, + "step": 12504 + }, + { + "epoch": 0.5290210677722311, + "grad_norm": 0.1834668517112732, + "learning_rate": 0.001, + "loss": 2.3786, + "step": 12505 + }, + { + "epoch": 0.5290633725357475, + "grad_norm": 0.37765857577323914, + "learning_rate": 0.001, + "loss": 2.7428, + "step": 12506 + }, + { + "epoch": 0.5291056772992639, + "grad_norm": 0.27641263604164124, + "learning_rate": 0.001, + "loss": 2.5696, + "step": 12507 + }, + { + "epoch": 0.5291479820627802, + "grad_norm": 0.2046346813440323, + "learning_rate": 0.001, + "loss": 2.0424, + "step": 12508 + }, + { + "epoch": 0.5291902868262967, + "grad_norm": 2.7164547443389893, + "learning_rate": 0.001, + "loss": 3.0047, + "step": 12509 + }, + { + "epoch": 0.5292325915898131, + "grad_norm": 0.24318911135196686, + "learning_rate": 0.001, + "loss": 2.8722, + "step": 12510 + }, + { + "epoch": 0.5292748963533294, + "grad_norm": 0.20161695778369904, + "learning_rate": 0.001, + "loss": 3.1377, + "step": 12511 + }, + { + "epoch": 0.5293172011168458, + "grad_norm": 0.18776634335517883, + "learning_rate": 0.001, + "loss": 3.193, + "step": 12512 + }, + { + "epoch": 0.5293595058803622, + "grad_norm": 0.17002810537815094, + "learning_rate": 0.001, + "loss": 2.015, + "step": 12513 + }, + { + "epoch": 0.5294018106438785, + "grad_norm": 0.16018223762512207, + "learning_rate": 0.001, + "loss": 2.0451, + "step": 12514 + }, + { + "epoch": 0.5294441154073949, + "grad_norm": 0.2891288101673126, + "learning_rate": 0.001, + "loss": 1.7617, + "step": 12515 + }, + { + "epoch": 0.5294864201709113, + "grad_norm": 0.19036170840263367, + "learning_rate": 0.001, + "loss": 2.1966, + "step": 12516 + }, + { + "epoch": 0.5295287249344276, + "grad_norm": 0.16855932772159576, + "learning_rate": 0.001, + "loss": 3.1857, + "step": 12517 + }, + { + "epoch": 0.529571029697944, + "grad_norm": 0.1747637689113617, + "learning_rate": 0.001, + "loss": 1.8404, + "step": 12518 + }, + { + "epoch": 0.5296133344614603, + "grad_norm": 0.572921633720398, + "learning_rate": 0.001, + "loss": 2.1041, + "step": 12519 + }, + { + "epoch": 0.5296556392249767, + "grad_norm": 0.17884446680545807, + "learning_rate": 0.001, + "loss": 2.4167, + "step": 12520 + }, + { + "epoch": 0.5296979439884931, + "grad_norm": 0.15950831770896912, + "learning_rate": 0.001, + "loss": 1.9228, + "step": 12521 + }, + { + "epoch": 0.5297402487520094, + "grad_norm": 0.18645338714122772, + "learning_rate": 0.001, + "loss": 1.7911, + "step": 12522 + }, + { + "epoch": 0.5297825535155258, + "grad_norm": 0.1739381104707718, + "learning_rate": 0.001, + "loss": 2.4198, + "step": 12523 + }, + { + "epoch": 0.5298248582790422, + "grad_norm": 0.2655837833881378, + "learning_rate": 0.001, + "loss": 3.0543, + "step": 12524 + }, + { + "epoch": 0.5298671630425585, + "grad_norm": 0.16320541501045227, + "learning_rate": 0.001, + "loss": 1.8807, + "step": 12525 + }, + { + "epoch": 0.529909467806075, + "grad_norm": 0.6725846529006958, + "learning_rate": 0.001, + "loss": 2.298, + "step": 12526 + }, + { + "epoch": 0.5299517725695914, + "grad_norm": 0.18599937856197357, + "learning_rate": 0.001, + "loss": 3.1388, + "step": 12527 + }, + { + "epoch": 0.5299940773331077, + "grad_norm": 0.17061804234981537, + "learning_rate": 0.001, + "loss": 1.4874, + "step": 12528 + }, + { + "epoch": 0.5300363820966241, + "grad_norm": 0.15029898285865784, + "learning_rate": 0.001, + "loss": 1.9614, + "step": 12529 + }, + { + "epoch": 0.5300786868601405, + "grad_norm": 0.16090351343154907, + "learning_rate": 0.001, + "loss": 2.6116, + "step": 12530 + }, + { + "epoch": 0.5301209916236568, + "grad_norm": 0.19355669617652893, + "learning_rate": 0.001, + "loss": 1.8321, + "step": 12531 + }, + { + "epoch": 0.5301632963871732, + "grad_norm": 0.15436525642871857, + "learning_rate": 0.001, + "loss": 2.2976, + "step": 12532 + }, + { + "epoch": 0.5302056011506896, + "grad_norm": 0.20881301164627075, + "learning_rate": 0.001, + "loss": 3.927, + "step": 12533 + }, + { + "epoch": 0.5302479059142059, + "grad_norm": 1.6116613149642944, + "learning_rate": 0.001, + "loss": 2.1104, + "step": 12534 + }, + { + "epoch": 0.5302902106777223, + "grad_norm": 0.4252428710460663, + "learning_rate": 0.001, + "loss": 2.0893, + "step": 12535 + }, + { + "epoch": 0.5303325154412387, + "grad_norm": 3.8983938694000244, + "learning_rate": 0.001, + "loss": 2.5195, + "step": 12536 + }, + { + "epoch": 0.530374820204755, + "grad_norm": 0.23319566249847412, + "learning_rate": 0.001, + "loss": 2.5801, + "step": 12537 + }, + { + "epoch": 0.5304171249682714, + "grad_norm": 2.187912940979004, + "learning_rate": 0.001, + "loss": 1.8214, + "step": 12538 + }, + { + "epoch": 0.5304594297317878, + "grad_norm": 0.18814197182655334, + "learning_rate": 0.001, + "loss": 2.2121, + "step": 12539 + }, + { + "epoch": 0.5305017344953041, + "grad_norm": 0.22966532409191132, + "learning_rate": 0.001, + "loss": 1.7224, + "step": 12540 + }, + { + "epoch": 0.5305440392588205, + "grad_norm": 0.9131950736045837, + "learning_rate": 0.001, + "loss": 1.7284, + "step": 12541 + }, + { + "epoch": 0.530586344022337, + "grad_norm": 0.2272689938545227, + "learning_rate": 0.001, + "loss": 2.2831, + "step": 12542 + }, + { + "epoch": 0.5306286487858533, + "grad_norm": 0.23645274341106415, + "learning_rate": 0.001, + "loss": 1.8332, + "step": 12543 + }, + { + "epoch": 0.5306709535493697, + "grad_norm": 0.2579519748687744, + "learning_rate": 0.001, + "loss": 2.0468, + "step": 12544 + }, + { + "epoch": 0.5307132583128861, + "grad_norm": 0.2276112288236618, + "learning_rate": 0.001, + "loss": 2.3614, + "step": 12545 + }, + { + "epoch": 0.5307555630764024, + "grad_norm": 0.23741212487220764, + "learning_rate": 0.001, + "loss": 2.4542, + "step": 12546 + }, + { + "epoch": 0.5307978678399188, + "grad_norm": 0.2526029050350189, + "learning_rate": 0.001, + "loss": 2.5269, + "step": 12547 + }, + { + "epoch": 0.5308401726034352, + "grad_norm": 0.20093029737472534, + "learning_rate": 0.001, + "loss": 1.6386, + "step": 12548 + }, + { + "epoch": 0.5308824773669515, + "grad_norm": 0.20511755347251892, + "learning_rate": 0.001, + "loss": 2.6388, + "step": 12549 + }, + { + "epoch": 0.5309247821304679, + "grad_norm": 0.2155929058790207, + "learning_rate": 0.001, + "loss": 2.2045, + "step": 12550 + }, + { + "epoch": 0.5309670868939843, + "grad_norm": 0.20179694890975952, + "learning_rate": 0.001, + "loss": 2.0887, + "step": 12551 + }, + { + "epoch": 0.5310093916575006, + "grad_norm": 0.7059963941574097, + "learning_rate": 0.001, + "loss": 1.7561, + "step": 12552 + }, + { + "epoch": 0.531051696421017, + "grad_norm": 0.2252509593963623, + "learning_rate": 0.001, + "loss": 3.4456, + "step": 12553 + }, + { + "epoch": 0.5310940011845334, + "grad_norm": 0.15286624431610107, + "learning_rate": 0.001, + "loss": 2.5038, + "step": 12554 + }, + { + "epoch": 0.5311363059480497, + "grad_norm": 0.17845967411994934, + "learning_rate": 0.001, + "loss": 1.9147, + "step": 12555 + }, + { + "epoch": 0.5311786107115661, + "grad_norm": 0.2328542023897171, + "learning_rate": 0.001, + "loss": 2.0514, + "step": 12556 + }, + { + "epoch": 0.5312209154750825, + "grad_norm": 0.661899745464325, + "learning_rate": 0.001, + "loss": 2.1049, + "step": 12557 + }, + { + "epoch": 0.5312632202385988, + "grad_norm": 0.5505738258361816, + "learning_rate": 0.001, + "loss": 2.063, + "step": 12558 + }, + { + "epoch": 0.5313055250021153, + "grad_norm": 0.14693965017795563, + "learning_rate": 0.001, + "loss": 2.2266, + "step": 12559 + }, + { + "epoch": 0.5313478297656317, + "grad_norm": 0.14494706690311432, + "learning_rate": 0.001, + "loss": 2.0662, + "step": 12560 + }, + { + "epoch": 0.531390134529148, + "grad_norm": 0.4453897178173065, + "learning_rate": 0.001, + "loss": 2.1152, + "step": 12561 + }, + { + "epoch": 0.5314324392926644, + "grad_norm": 0.23559346795082092, + "learning_rate": 0.001, + "loss": 1.9954, + "step": 12562 + }, + { + "epoch": 0.5314747440561808, + "grad_norm": 0.17883579432964325, + "learning_rate": 0.001, + "loss": 2.1284, + "step": 12563 + }, + { + "epoch": 0.5315170488196971, + "grad_norm": 0.16119042038917542, + "learning_rate": 0.001, + "loss": 2.1427, + "step": 12564 + }, + { + "epoch": 0.5315593535832135, + "grad_norm": 0.16220824420452118, + "learning_rate": 0.001, + "loss": 1.5702, + "step": 12565 + }, + { + "epoch": 0.5316016583467298, + "grad_norm": 0.8208723068237305, + "learning_rate": 0.001, + "loss": 2.2647, + "step": 12566 + }, + { + "epoch": 0.5316439631102462, + "grad_norm": 0.17101448774337769, + "learning_rate": 0.001, + "loss": 1.8805, + "step": 12567 + }, + { + "epoch": 0.5316862678737626, + "grad_norm": 0.18567869067192078, + "learning_rate": 0.001, + "loss": 1.8957, + "step": 12568 + }, + { + "epoch": 0.5317285726372789, + "grad_norm": 0.15984505414962769, + "learning_rate": 0.001, + "loss": 2.0752, + "step": 12569 + }, + { + "epoch": 0.5317708774007953, + "grad_norm": 0.1836562305688858, + "learning_rate": 0.001, + "loss": 2.0664, + "step": 12570 + }, + { + "epoch": 0.5318131821643117, + "grad_norm": 0.2100599855184555, + "learning_rate": 0.001, + "loss": 3.0159, + "step": 12571 + }, + { + "epoch": 0.531855486927828, + "grad_norm": 0.16221459209918976, + "learning_rate": 0.001, + "loss": 1.8475, + "step": 12572 + }, + { + "epoch": 0.5318977916913444, + "grad_norm": 0.18409012258052826, + "learning_rate": 0.001, + "loss": 1.9189, + "step": 12573 + }, + { + "epoch": 0.5319400964548608, + "grad_norm": 0.2355443686246872, + "learning_rate": 0.001, + "loss": 2.4434, + "step": 12574 + }, + { + "epoch": 0.5319824012183771, + "grad_norm": 1.7131577730178833, + "learning_rate": 0.001, + "loss": 2.3271, + "step": 12575 + }, + { + "epoch": 0.5320247059818936, + "grad_norm": 0.16390523314476013, + "learning_rate": 0.001, + "loss": 3.1412, + "step": 12576 + }, + { + "epoch": 0.53206701074541, + "grad_norm": 0.17767739295959473, + "learning_rate": 0.001, + "loss": 3.1423, + "step": 12577 + }, + { + "epoch": 0.5321093155089263, + "grad_norm": 0.28065043687820435, + "learning_rate": 0.001, + "loss": 1.9327, + "step": 12578 + }, + { + "epoch": 0.5321516202724427, + "grad_norm": 0.15590040385723114, + "learning_rate": 0.001, + "loss": 3.3144, + "step": 12579 + }, + { + "epoch": 0.5321939250359591, + "grad_norm": 0.18487408757209778, + "learning_rate": 0.001, + "loss": 2.1503, + "step": 12580 + }, + { + "epoch": 0.5322362297994754, + "grad_norm": 0.4396058917045593, + "learning_rate": 0.001, + "loss": 2.4607, + "step": 12581 + }, + { + "epoch": 0.5322785345629918, + "grad_norm": 1.2496229410171509, + "learning_rate": 0.001, + "loss": 2.3387, + "step": 12582 + }, + { + "epoch": 0.5323208393265082, + "grad_norm": 0.19142375886440277, + "learning_rate": 0.001, + "loss": 1.9137, + "step": 12583 + }, + { + "epoch": 0.5323631440900245, + "grad_norm": 1.4793001413345337, + "learning_rate": 0.001, + "loss": 2.1602, + "step": 12584 + }, + { + "epoch": 0.5324054488535409, + "grad_norm": 0.4164004921913147, + "learning_rate": 0.001, + "loss": 2.9867, + "step": 12585 + }, + { + "epoch": 0.5324477536170573, + "grad_norm": 0.17239639163017273, + "learning_rate": 0.001, + "loss": 2.5442, + "step": 12586 + }, + { + "epoch": 0.5324900583805736, + "grad_norm": 0.19600702822208405, + "learning_rate": 0.001, + "loss": 1.9799, + "step": 12587 + }, + { + "epoch": 0.53253236314409, + "grad_norm": 0.1782451719045639, + "learning_rate": 0.001, + "loss": 2.5812, + "step": 12588 + }, + { + "epoch": 0.5325746679076064, + "grad_norm": 0.18545618653297424, + "learning_rate": 0.001, + "loss": 2.3101, + "step": 12589 + }, + { + "epoch": 0.5326169726711227, + "grad_norm": 0.21294978260993958, + "learning_rate": 0.001, + "loss": 1.6784, + "step": 12590 + }, + { + "epoch": 0.5326592774346391, + "grad_norm": 0.16218768060207367, + "learning_rate": 0.001, + "loss": 2.438, + "step": 12591 + }, + { + "epoch": 0.5327015821981556, + "grad_norm": 1.644671082496643, + "learning_rate": 0.001, + "loss": 2.0642, + "step": 12592 + }, + { + "epoch": 0.5327438869616719, + "grad_norm": 0.2141799032688141, + "learning_rate": 0.001, + "loss": 3.1325, + "step": 12593 + }, + { + "epoch": 0.5327861917251883, + "grad_norm": 0.17733773589134216, + "learning_rate": 0.001, + "loss": 2.3464, + "step": 12594 + }, + { + "epoch": 0.5328284964887047, + "grad_norm": 0.1840038150548935, + "learning_rate": 0.001, + "loss": 1.9205, + "step": 12595 + }, + { + "epoch": 0.532870801252221, + "grad_norm": 0.3170630931854248, + "learning_rate": 0.001, + "loss": 2.405, + "step": 12596 + }, + { + "epoch": 0.5329131060157374, + "grad_norm": 0.24258072674274445, + "learning_rate": 0.001, + "loss": 1.855, + "step": 12597 + }, + { + "epoch": 0.5329554107792538, + "grad_norm": 0.9366814494132996, + "learning_rate": 0.001, + "loss": 2.0033, + "step": 12598 + }, + { + "epoch": 0.5329977155427701, + "grad_norm": 0.8258916139602661, + "learning_rate": 0.001, + "loss": 2.5754, + "step": 12599 + }, + { + "epoch": 0.5330400203062865, + "grad_norm": 1.7923530340194702, + "learning_rate": 0.001, + "loss": 2.3026, + "step": 12600 + }, + { + "epoch": 0.5330823250698029, + "grad_norm": 0.5301435589790344, + "learning_rate": 0.001, + "loss": 2.2184, + "step": 12601 + }, + { + "epoch": 0.5331246298333192, + "grad_norm": 0.1785147339105606, + "learning_rate": 0.001, + "loss": 2.0238, + "step": 12602 + }, + { + "epoch": 0.5331669345968356, + "grad_norm": 0.15722912549972534, + "learning_rate": 0.001, + "loss": 1.7873, + "step": 12603 + }, + { + "epoch": 0.533209239360352, + "grad_norm": 0.17702187597751617, + "learning_rate": 0.001, + "loss": 3.0993, + "step": 12604 + }, + { + "epoch": 0.5332515441238683, + "grad_norm": 0.21016950905323029, + "learning_rate": 0.001, + "loss": 2.2791, + "step": 12605 + }, + { + "epoch": 0.5332938488873847, + "grad_norm": 0.19180504977703094, + "learning_rate": 0.001, + "loss": 1.7831, + "step": 12606 + }, + { + "epoch": 0.5333361536509011, + "grad_norm": 0.17286181449890137, + "learning_rate": 0.001, + "loss": 1.94, + "step": 12607 + }, + { + "epoch": 0.5333784584144174, + "grad_norm": 0.25898101925849915, + "learning_rate": 0.001, + "loss": 2.2634, + "step": 12608 + }, + { + "epoch": 0.5334207631779339, + "grad_norm": 0.2569015324115753, + "learning_rate": 0.001, + "loss": 2.2876, + "step": 12609 + }, + { + "epoch": 0.5334630679414502, + "grad_norm": 0.32984477281570435, + "learning_rate": 0.001, + "loss": 2.9021, + "step": 12610 + }, + { + "epoch": 0.5335053727049666, + "grad_norm": 0.2177520990371704, + "learning_rate": 0.001, + "loss": 2.2496, + "step": 12611 + }, + { + "epoch": 0.533547677468483, + "grad_norm": 0.18992137908935547, + "learning_rate": 0.001, + "loss": 2.0016, + "step": 12612 + }, + { + "epoch": 0.5335899822319993, + "grad_norm": 0.20471316576004028, + "learning_rate": 0.001, + "loss": 2.4037, + "step": 12613 + }, + { + "epoch": 0.5336322869955157, + "grad_norm": 0.21054227650165558, + "learning_rate": 0.001, + "loss": 1.9332, + "step": 12614 + }, + { + "epoch": 0.5336745917590321, + "grad_norm": 0.20003412663936615, + "learning_rate": 0.001, + "loss": 2.4779, + "step": 12615 + }, + { + "epoch": 0.5337168965225484, + "grad_norm": 0.17985932528972626, + "learning_rate": 0.001, + "loss": 1.5322, + "step": 12616 + }, + { + "epoch": 0.5337592012860648, + "grad_norm": 2.2871792316436768, + "learning_rate": 0.001, + "loss": 2.4277, + "step": 12617 + }, + { + "epoch": 0.5338015060495812, + "grad_norm": 0.16966228187084198, + "learning_rate": 0.001, + "loss": 2.4963, + "step": 12618 + }, + { + "epoch": 0.5338438108130975, + "grad_norm": 0.19583900272846222, + "learning_rate": 0.001, + "loss": 2.4291, + "step": 12619 + }, + { + "epoch": 0.5338861155766139, + "grad_norm": 0.18535350263118744, + "learning_rate": 0.001, + "loss": 2.1341, + "step": 12620 + }, + { + "epoch": 0.5339284203401303, + "grad_norm": 0.8092800378799438, + "learning_rate": 0.001, + "loss": 2.5207, + "step": 12621 + }, + { + "epoch": 0.5339707251036466, + "grad_norm": 0.19231988489627838, + "learning_rate": 0.001, + "loss": 2.378, + "step": 12622 + }, + { + "epoch": 0.534013029867163, + "grad_norm": 0.22486920654773712, + "learning_rate": 0.001, + "loss": 2.1152, + "step": 12623 + }, + { + "epoch": 0.5340553346306794, + "grad_norm": 0.32526126503944397, + "learning_rate": 0.001, + "loss": 3.1697, + "step": 12624 + }, + { + "epoch": 0.5340976393941957, + "grad_norm": 0.20092487335205078, + "learning_rate": 0.001, + "loss": 2.8525, + "step": 12625 + }, + { + "epoch": 0.5341399441577122, + "grad_norm": 0.2700296640396118, + "learning_rate": 0.001, + "loss": 2.0498, + "step": 12626 + }, + { + "epoch": 0.5341822489212286, + "grad_norm": 0.48722365498542786, + "learning_rate": 0.001, + "loss": 1.8687, + "step": 12627 + }, + { + "epoch": 0.5342245536847449, + "grad_norm": 0.16998623311519623, + "learning_rate": 0.001, + "loss": 2.2888, + "step": 12628 + }, + { + "epoch": 0.5342668584482613, + "grad_norm": 0.15864147245883942, + "learning_rate": 0.001, + "loss": 1.7734, + "step": 12629 + }, + { + "epoch": 0.5343091632117777, + "grad_norm": 0.31583741307258606, + "learning_rate": 0.001, + "loss": 2.4233, + "step": 12630 + }, + { + "epoch": 0.534351467975294, + "grad_norm": 0.19196642935276031, + "learning_rate": 0.001, + "loss": 1.7955, + "step": 12631 + }, + { + "epoch": 0.5343937727388104, + "grad_norm": 0.1647808700799942, + "learning_rate": 0.001, + "loss": 2.5027, + "step": 12632 + }, + { + "epoch": 0.5344360775023268, + "grad_norm": 0.5048976540565491, + "learning_rate": 0.001, + "loss": 2.1264, + "step": 12633 + }, + { + "epoch": 0.5344783822658431, + "grad_norm": 0.22664372622966766, + "learning_rate": 0.001, + "loss": 2.9306, + "step": 12634 + }, + { + "epoch": 0.5345206870293595, + "grad_norm": 0.20378440618515015, + "learning_rate": 0.001, + "loss": 2.8485, + "step": 12635 + }, + { + "epoch": 0.5345629917928759, + "grad_norm": 0.4030454158782959, + "learning_rate": 0.001, + "loss": 2.3177, + "step": 12636 + }, + { + "epoch": 0.5346052965563922, + "grad_norm": 0.7921542525291443, + "learning_rate": 0.001, + "loss": 3.1555, + "step": 12637 + }, + { + "epoch": 0.5346476013199086, + "grad_norm": 0.7897359728813171, + "learning_rate": 0.001, + "loss": 2.6089, + "step": 12638 + }, + { + "epoch": 0.534689906083425, + "grad_norm": 0.2273802012205124, + "learning_rate": 0.001, + "loss": 2.2542, + "step": 12639 + }, + { + "epoch": 0.5347322108469413, + "grad_norm": 0.27411505579948425, + "learning_rate": 0.001, + "loss": 2.8019, + "step": 12640 + }, + { + "epoch": 0.5347745156104577, + "grad_norm": 0.26895782351493835, + "learning_rate": 0.001, + "loss": 2.6258, + "step": 12641 + }, + { + "epoch": 0.5348168203739742, + "grad_norm": 0.17039963603019714, + "learning_rate": 0.001, + "loss": 2.5587, + "step": 12642 + }, + { + "epoch": 0.5348591251374905, + "grad_norm": 0.2543392777442932, + "learning_rate": 0.001, + "loss": 2.3699, + "step": 12643 + }, + { + "epoch": 0.5349014299010069, + "grad_norm": 0.19515450298786163, + "learning_rate": 0.001, + "loss": 2.1001, + "step": 12644 + }, + { + "epoch": 0.5349437346645233, + "grad_norm": 0.16606023907661438, + "learning_rate": 0.001, + "loss": 1.7925, + "step": 12645 + }, + { + "epoch": 0.5349860394280396, + "grad_norm": 0.4345617890357971, + "learning_rate": 0.001, + "loss": 1.9236, + "step": 12646 + }, + { + "epoch": 0.535028344191556, + "grad_norm": 0.18390463292598724, + "learning_rate": 0.001, + "loss": 2.1396, + "step": 12647 + }, + { + "epoch": 0.5350706489550724, + "grad_norm": 0.17461326718330383, + "learning_rate": 0.001, + "loss": 1.7379, + "step": 12648 + }, + { + "epoch": 0.5351129537185887, + "grad_norm": 0.17248356342315674, + "learning_rate": 0.001, + "loss": 2.4956, + "step": 12649 + }, + { + "epoch": 0.5351552584821051, + "grad_norm": 0.17978371679782867, + "learning_rate": 0.001, + "loss": 2.7611, + "step": 12650 + }, + { + "epoch": 0.5351975632456215, + "grad_norm": 0.4586908221244812, + "learning_rate": 0.001, + "loss": 1.4555, + "step": 12651 + }, + { + "epoch": 0.5352398680091378, + "grad_norm": 0.233989417552948, + "learning_rate": 0.001, + "loss": 1.9321, + "step": 12652 + }, + { + "epoch": 0.5352821727726542, + "grad_norm": 0.1826629787683487, + "learning_rate": 0.001, + "loss": 2.3415, + "step": 12653 + }, + { + "epoch": 0.5353244775361705, + "grad_norm": 0.15267974138259888, + "learning_rate": 0.001, + "loss": 1.4848, + "step": 12654 + }, + { + "epoch": 0.5353667822996869, + "grad_norm": 4.387269020080566, + "learning_rate": 0.001, + "loss": 2.716, + "step": 12655 + }, + { + "epoch": 0.5354090870632033, + "grad_norm": 0.26923081278800964, + "learning_rate": 0.001, + "loss": 3.246, + "step": 12656 + }, + { + "epoch": 0.5354513918267196, + "grad_norm": 0.15483248233795166, + "learning_rate": 0.001, + "loss": 1.5195, + "step": 12657 + }, + { + "epoch": 0.535493696590236, + "grad_norm": 0.18110813200473785, + "learning_rate": 0.001, + "loss": 1.7063, + "step": 12658 + }, + { + "epoch": 0.5355360013537525, + "grad_norm": 0.1986570507287979, + "learning_rate": 0.001, + "loss": 2.1137, + "step": 12659 + }, + { + "epoch": 0.5355783061172688, + "grad_norm": 0.16241492331027985, + "learning_rate": 0.001, + "loss": 1.7866, + "step": 12660 + }, + { + "epoch": 0.5356206108807852, + "grad_norm": 0.22161731123924255, + "learning_rate": 0.001, + "loss": 2.3589, + "step": 12661 + }, + { + "epoch": 0.5356629156443016, + "grad_norm": 0.17555823922157288, + "learning_rate": 0.001, + "loss": 1.9163, + "step": 12662 + }, + { + "epoch": 0.5357052204078179, + "grad_norm": 0.20892333984375, + "learning_rate": 0.001, + "loss": 2.7506, + "step": 12663 + }, + { + "epoch": 0.5357475251713343, + "grad_norm": 0.1600860357284546, + "learning_rate": 0.001, + "loss": 1.9907, + "step": 12664 + }, + { + "epoch": 0.5357898299348507, + "grad_norm": 0.20017333328723907, + "learning_rate": 0.001, + "loss": 2.6324, + "step": 12665 + }, + { + "epoch": 0.535832134698367, + "grad_norm": 0.18923135101795197, + "learning_rate": 0.001, + "loss": 2.1908, + "step": 12666 + }, + { + "epoch": 0.5358744394618834, + "grad_norm": 0.24769580364227295, + "learning_rate": 0.001, + "loss": 1.9219, + "step": 12667 + }, + { + "epoch": 0.5359167442253998, + "grad_norm": 0.43653157353401184, + "learning_rate": 0.001, + "loss": 2.1292, + "step": 12668 + }, + { + "epoch": 0.5359590489889161, + "grad_norm": 0.18919190764427185, + "learning_rate": 0.001, + "loss": 2.4091, + "step": 12669 + }, + { + "epoch": 0.5360013537524325, + "grad_norm": 16.16994857788086, + "learning_rate": 0.001, + "loss": 2.1238, + "step": 12670 + }, + { + "epoch": 0.5360436585159489, + "grad_norm": 0.16744960844516754, + "learning_rate": 0.001, + "loss": 2.7491, + "step": 12671 + }, + { + "epoch": 0.5360859632794652, + "grad_norm": 0.1780613660812378, + "learning_rate": 0.001, + "loss": 1.724, + "step": 12672 + }, + { + "epoch": 0.5361282680429816, + "grad_norm": 0.16598299145698547, + "learning_rate": 0.001, + "loss": 1.8861, + "step": 12673 + }, + { + "epoch": 0.536170572806498, + "grad_norm": 0.38400644063949585, + "learning_rate": 0.001, + "loss": 2.247, + "step": 12674 + }, + { + "epoch": 0.5362128775700143, + "grad_norm": 0.24128788709640503, + "learning_rate": 0.001, + "loss": 2.6687, + "step": 12675 + }, + { + "epoch": 0.5362551823335308, + "grad_norm": 0.17561288177967072, + "learning_rate": 0.001, + "loss": 1.8747, + "step": 12676 + }, + { + "epoch": 0.5362974870970472, + "grad_norm": 0.2160193771123886, + "learning_rate": 0.001, + "loss": 1.8482, + "step": 12677 + }, + { + "epoch": 0.5363397918605635, + "grad_norm": 0.2257080227136612, + "learning_rate": 0.001, + "loss": 2.167, + "step": 12678 + }, + { + "epoch": 0.5363820966240799, + "grad_norm": 0.16601672768592834, + "learning_rate": 0.001, + "loss": 1.2827, + "step": 12679 + }, + { + "epoch": 0.5364244013875963, + "grad_norm": 0.20976103842258453, + "learning_rate": 0.001, + "loss": 2.4844, + "step": 12680 + }, + { + "epoch": 0.5364667061511126, + "grad_norm": 0.18916988372802734, + "learning_rate": 0.001, + "loss": 1.8666, + "step": 12681 + }, + { + "epoch": 0.536509010914629, + "grad_norm": 0.20311769843101501, + "learning_rate": 0.001, + "loss": 2.84, + "step": 12682 + }, + { + "epoch": 0.5365513156781454, + "grad_norm": 0.21866144239902496, + "learning_rate": 0.001, + "loss": 2.5776, + "step": 12683 + }, + { + "epoch": 0.5365936204416617, + "grad_norm": 0.2240811288356781, + "learning_rate": 0.001, + "loss": 2.7133, + "step": 12684 + }, + { + "epoch": 0.5366359252051781, + "grad_norm": 0.13089242577552795, + "learning_rate": 0.001, + "loss": 1.3683, + "step": 12685 + }, + { + "epoch": 0.5366782299686945, + "grad_norm": 0.16819548606872559, + "learning_rate": 0.001, + "loss": 2.8725, + "step": 12686 + }, + { + "epoch": 0.5367205347322108, + "grad_norm": 0.2232055813074112, + "learning_rate": 0.001, + "loss": 2.0103, + "step": 12687 + }, + { + "epoch": 0.5367628394957272, + "grad_norm": 0.1957560032606125, + "learning_rate": 0.001, + "loss": 2.4051, + "step": 12688 + }, + { + "epoch": 0.5368051442592436, + "grad_norm": 0.18882183730602264, + "learning_rate": 0.001, + "loss": 2.382, + "step": 12689 + }, + { + "epoch": 0.5368474490227599, + "grad_norm": 0.211518332362175, + "learning_rate": 0.001, + "loss": 2.4219, + "step": 12690 + }, + { + "epoch": 0.5368897537862763, + "grad_norm": 0.4028577506542206, + "learning_rate": 0.001, + "loss": 1.6842, + "step": 12691 + }, + { + "epoch": 0.5369320585497928, + "grad_norm": 0.5534846782684326, + "learning_rate": 0.001, + "loss": 2.8452, + "step": 12692 + }, + { + "epoch": 0.536974363313309, + "grad_norm": 0.6162401437759399, + "learning_rate": 0.001, + "loss": 1.8964, + "step": 12693 + }, + { + "epoch": 0.5370166680768255, + "grad_norm": 0.2093658596277237, + "learning_rate": 0.001, + "loss": 2.2681, + "step": 12694 + }, + { + "epoch": 0.5370589728403419, + "grad_norm": 0.17829060554504395, + "learning_rate": 0.001, + "loss": 1.7986, + "step": 12695 + }, + { + "epoch": 0.5371012776038582, + "grad_norm": 0.29665249586105347, + "learning_rate": 0.001, + "loss": 2.194, + "step": 12696 + }, + { + "epoch": 0.5371435823673746, + "grad_norm": 0.19808493554592133, + "learning_rate": 0.001, + "loss": 2.5451, + "step": 12697 + }, + { + "epoch": 0.537185887130891, + "grad_norm": 0.14953912794589996, + "learning_rate": 0.001, + "loss": 1.6499, + "step": 12698 + }, + { + "epoch": 0.5372281918944073, + "grad_norm": 0.25920218229293823, + "learning_rate": 0.001, + "loss": 2.035, + "step": 12699 + }, + { + "epoch": 0.5372704966579237, + "grad_norm": 0.2867739200592041, + "learning_rate": 0.001, + "loss": 2.2908, + "step": 12700 + }, + { + "epoch": 0.53731280142144, + "grad_norm": 0.17412187159061432, + "learning_rate": 0.001, + "loss": 1.7965, + "step": 12701 + }, + { + "epoch": 0.5373551061849564, + "grad_norm": 0.19530300796031952, + "learning_rate": 0.001, + "loss": 2.733, + "step": 12702 + }, + { + "epoch": 0.5373974109484728, + "grad_norm": 0.8992458581924438, + "learning_rate": 0.001, + "loss": 1.9335, + "step": 12703 + }, + { + "epoch": 0.5374397157119891, + "grad_norm": 0.16157057881355286, + "learning_rate": 0.001, + "loss": 1.4152, + "step": 12704 + }, + { + "epoch": 0.5374820204755055, + "grad_norm": 0.7205559611320496, + "learning_rate": 0.001, + "loss": 1.7546, + "step": 12705 + }, + { + "epoch": 0.5375243252390219, + "grad_norm": 0.16520078480243683, + "learning_rate": 0.001, + "loss": 1.9752, + "step": 12706 + }, + { + "epoch": 0.5375666300025382, + "grad_norm": 1.2817127704620361, + "learning_rate": 0.001, + "loss": 2.4836, + "step": 12707 + }, + { + "epoch": 0.5376089347660546, + "grad_norm": 0.576266348361969, + "learning_rate": 0.001, + "loss": 1.898, + "step": 12708 + }, + { + "epoch": 0.5376512395295711, + "grad_norm": 0.25358226895332336, + "learning_rate": 0.001, + "loss": 2.1221, + "step": 12709 + }, + { + "epoch": 0.5376935442930874, + "grad_norm": 0.23752950131893158, + "learning_rate": 0.001, + "loss": 2.4297, + "step": 12710 + }, + { + "epoch": 0.5377358490566038, + "grad_norm": 0.17121052742004395, + "learning_rate": 0.001, + "loss": 2.2977, + "step": 12711 + }, + { + "epoch": 0.5377781538201202, + "grad_norm": 5.213639736175537, + "learning_rate": 0.001, + "loss": 1.5537, + "step": 12712 + }, + { + "epoch": 0.5378204585836365, + "grad_norm": 0.15050987899303436, + "learning_rate": 0.001, + "loss": 2.1932, + "step": 12713 + }, + { + "epoch": 0.5378627633471529, + "grad_norm": 0.2271626889705658, + "learning_rate": 0.001, + "loss": 2.2467, + "step": 12714 + }, + { + "epoch": 0.5379050681106693, + "grad_norm": 0.20240074396133423, + "learning_rate": 0.001, + "loss": 2.7468, + "step": 12715 + }, + { + "epoch": 0.5379473728741856, + "grad_norm": 4.134634971618652, + "learning_rate": 0.001, + "loss": 1.7431, + "step": 12716 + }, + { + "epoch": 0.537989677637702, + "grad_norm": 0.23283101618289948, + "learning_rate": 0.001, + "loss": 1.8593, + "step": 12717 + }, + { + "epoch": 0.5380319824012184, + "grad_norm": 0.1952308714389801, + "learning_rate": 0.001, + "loss": 2.5033, + "step": 12718 + }, + { + "epoch": 0.5380742871647347, + "grad_norm": 0.17305073142051697, + "learning_rate": 0.001, + "loss": 2.3579, + "step": 12719 + }, + { + "epoch": 0.5381165919282511, + "grad_norm": 0.18969833850860596, + "learning_rate": 0.001, + "loss": 1.8591, + "step": 12720 + }, + { + "epoch": 0.5381588966917675, + "grad_norm": 0.19060935080051422, + "learning_rate": 0.001, + "loss": 1.6669, + "step": 12721 + }, + { + "epoch": 0.5382012014552838, + "grad_norm": 10.8906888961792, + "learning_rate": 0.001, + "loss": 1.7601, + "step": 12722 + }, + { + "epoch": 0.5382435062188002, + "grad_norm": 0.2497670203447342, + "learning_rate": 0.001, + "loss": 2.2148, + "step": 12723 + }, + { + "epoch": 0.5382858109823166, + "grad_norm": 0.2164415568113327, + "learning_rate": 0.001, + "loss": 2.146, + "step": 12724 + }, + { + "epoch": 0.538328115745833, + "grad_norm": 0.2605139911174774, + "learning_rate": 0.001, + "loss": 1.9347, + "step": 12725 + }, + { + "epoch": 0.5383704205093494, + "grad_norm": 23.029006958007812, + "learning_rate": 0.001, + "loss": 1.8033, + "step": 12726 + }, + { + "epoch": 0.5384127252728658, + "grad_norm": 0.16982685029506683, + "learning_rate": 0.001, + "loss": 2.4865, + "step": 12727 + }, + { + "epoch": 0.5384550300363821, + "grad_norm": 0.525914192199707, + "learning_rate": 0.001, + "loss": 1.8129, + "step": 12728 + }, + { + "epoch": 0.5384973347998985, + "grad_norm": 0.15495622158050537, + "learning_rate": 0.001, + "loss": 2.5025, + "step": 12729 + }, + { + "epoch": 0.5385396395634149, + "grad_norm": 0.19135193526744843, + "learning_rate": 0.001, + "loss": 1.598, + "step": 12730 + }, + { + "epoch": 0.5385819443269312, + "grad_norm": 1.3947174549102783, + "learning_rate": 0.001, + "loss": 2.7353, + "step": 12731 + }, + { + "epoch": 0.5386242490904476, + "grad_norm": 0.16519546508789062, + "learning_rate": 0.001, + "loss": 1.5702, + "step": 12732 + }, + { + "epoch": 0.538666553853964, + "grad_norm": 0.19213086366653442, + "learning_rate": 0.001, + "loss": 2.4648, + "step": 12733 + }, + { + "epoch": 0.5387088586174803, + "grad_norm": 1.1840908527374268, + "learning_rate": 0.001, + "loss": 2.0265, + "step": 12734 + }, + { + "epoch": 0.5387511633809967, + "grad_norm": 0.32493284344673157, + "learning_rate": 0.001, + "loss": 2.6341, + "step": 12735 + }, + { + "epoch": 0.5387934681445131, + "grad_norm": 0.1699162721633911, + "learning_rate": 0.001, + "loss": 2.3637, + "step": 12736 + }, + { + "epoch": 0.5388357729080294, + "grad_norm": 0.40613803267478943, + "learning_rate": 0.001, + "loss": 1.9464, + "step": 12737 + }, + { + "epoch": 0.5388780776715458, + "grad_norm": 0.2300586849451065, + "learning_rate": 0.001, + "loss": 1.9922, + "step": 12738 + }, + { + "epoch": 0.5389203824350622, + "grad_norm": 0.23353518545627594, + "learning_rate": 0.001, + "loss": 1.4807, + "step": 12739 + }, + { + "epoch": 0.5389626871985785, + "grad_norm": 0.20910601317882538, + "learning_rate": 0.001, + "loss": 2.0366, + "step": 12740 + }, + { + "epoch": 0.539004991962095, + "grad_norm": 0.17505596578121185, + "learning_rate": 0.001, + "loss": 1.9472, + "step": 12741 + }, + { + "epoch": 0.5390472967256114, + "grad_norm": 0.2388429343700409, + "learning_rate": 0.001, + "loss": 1.8028, + "step": 12742 + }, + { + "epoch": 0.5390896014891277, + "grad_norm": 0.211119145154953, + "learning_rate": 0.001, + "loss": 3.0593, + "step": 12743 + }, + { + "epoch": 0.5391319062526441, + "grad_norm": 0.1894591599702835, + "learning_rate": 0.001, + "loss": 3.2891, + "step": 12744 + }, + { + "epoch": 0.5391742110161604, + "grad_norm": 0.20445877313613892, + "learning_rate": 0.001, + "loss": 1.9584, + "step": 12745 + }, + { + "epoch": 0.5392165157796768, + "grad_norm": 0.2004939168691635, + "learning_rate": 0.001, + "loss": 2.1161, + "step": 12746 + }, + { + "epoch": 0.5392588205431932, + "grad_norm": 0.20584918558597565, + "learning_rate": 0.001, + "loss": 2.124, + "step": 12747 + }, + { + "epoch": 0.5393011253067095, + "grad_norm": 0.4080961048603058, + "learning_rate": 0.001, + "loss": 2.1308, + "step": 12748 + }, + { + "epoch": 0.5393434300702259, + "grad_norm": 0.261042982339859, + "learning_rate": 0.001, + "loss": 2.3502, + "step": 12749 + }, + { + "epoch": 0.5393857348337423, + "grad_norm": 0.18766193091869354, + "learning_rate": 0.001, + "loss": 1.9792, + "step": 12750 + }, + { + "epoch": 0.5394280395972586, + "grad_norm": 0.178992360830307, + "learning_rate": 0.001, + "loss": 1.9826, + "step": 12751 + }, + { + "epoch": 0.539470344360775, + "grad_norm": 0.22362402081489563, + "learning_rate": 0.001, + "loss": 2.6882, + "step": 12752 + }, + { + "epoch": 0.5395126491242914, + "grad_norm": 0.6612523198127747, + "learning_rate": 0.001, + "loss": 2.3889, + "step": 12753 + }, + { + "epoch": 0.5395549538878077, + "grad_norm": 0.24479064345359802, + "learning_rate": 0.001, + "loss": 2.4542, + "step": 12754 + }, + { + "epoch": 0.5395972586513241, + "grad_norm": 0.3169039189815521, + "learning_rate": 0.001, + "loss": 2.1717, + "step": 12755 + }, + { + "epoch": 0.5396395634148405, + "grad_norm": 0.22589461505413055, + "learning_rate": 0.001, + "loss": 3.304, + "step": 12756 + }, + { + "epoch": 0.5396818681783568, + "grad_norm": 0.22644683718681335, + "learning_rate": 0.001, + "loss": 1.7265, + "step": 12757 + }, + { + "epoch": 0.5397241729418732, + "grad_norm": 0.19094765186309814, + "learning_rate": 0.001, + "loss": 1.681, + "step": 12758 + }, + { + "epoch": 0.5397664777053897, + "grad_norm": 0.17020857334136963, + "learning_rate": 0.001, + "loss": 2.3251, + "step": 12759 + }, + { + "epoch": 0.539808782468906, + "grad_norm": 9.065228462219238, + "learning_rate": 0.001, + "loss": 2.7484, + "step": 12760 + }, + { + "epoch": 0.5398510872324224, + "grad_norm": 0.32355996966362, + "learning_rate": 0.001, + "loss": 2.1063, + "step": 12761 + }, + { + "epoch": 0.5398933919959388, + "grad_norm": 0.22074800729751587, + "learning_rate": 0.001, + "loss": 2.3476, + "step": 12762 + }, + { + "epoch": 0.5399356967594551, + "grad_norm": 1.8396928310394287, + "learning_rate": 0.001, + "loss": 2.3019, + "step": 12763 + }, + { + "epoch": 0.5399780015229715, + "grad_norm": 0.18278691172599792, + "learning_rate": 0.001, + "loss": 2.0778, + "step": 12764 + }, + { + "epoch": 0.5400203062864879, + "grad_norm": 0.2510671019554138, + "learning_rate": 0.001, + "loss": 1.8773, + "step": 12765 + }, + { + "epoch": 0.5400626110500042, + "grad_norm": 0.17996838688850403, + "learning_rate": 0.001, + "loss": 2.517, + "step": 12766 + }, + { + "epoch": 0.5401049158135206, + "grad_norm": 0.15889018774032593, + "learning_rate": 0.001, + "loss": 1.7955, + "step": 12767 + }, + { + "epoch": 0.540147220577037, + "grad_norm": 0.18059539794921875, + "learning_rate": 0.001, + "loss": 2.4617, + "step": 12768 + }, + { + "epoch": 0.5401895253405533, + "grad_norm": 1.934308648109436, + "learning_rate": 0.001, + "loss": 2.4551, + "step": 12769 + }, + { + "epoch": 0.5402318301040697, + "grad_norm": 0.764201283454895, + "learning_rate": 0.001, + "loss": 2.2693, + "step": 12770 + }, + { + "epoch": 0.5402741348675861, + "grad_norm": 0.16094563901424408, + "learning_rate": 0.001, + "loss": 3.8751, + "step": 12771 + }, + { + "epoch": 0.5403164396311024, + "grad_norm": 0.20989876985549927, + "learning_rate": 0.001, + "loss": 2.4648, + "step": 12772 + }, + { + "epoch": 0.5403587443946188, + "grad_norm": 0.4561876058578491, + "learning_rate": 0.001, + "loss": 2.0358, + "step": 12773 + }, + { + "epoch": 0.5404010491581352, + "grad_norm": 0.19272848963737488, + "learning_rate": 0.001, + "loss": 2.3598, + "step": 12774 + }, + { + "epoch": 0.5404433539216515, + "grad_norm": 0.1573980748653412, + "learning_rate": 0.001, + "loss": 3.2602, + "step": 12775 + }, + { + "epoch": 0.540485658685168, + "grad_norm": 0.20343995094299316, + "learning_rate": 0.001, + "loss": 2.9864, + "step": 12776 + }, + { + "epoch": 0.5405279634486844, + "grad_norm": 1.2975257635116577, + "learning_rate": 0.001, + "loss": 2.7595, + "step": 12777 + }, + { + "epoch": 0.5405702682122007, + "grad_norm": 0.20765641331672668, + "learning_rate": 0.001, + "loss": 2.8354, + "step": 12778 + }, + { + "epoch": 0.5406125729757171, + "grad_norm": 0.1908862143754959, + "learning_rate": 0.001, + "loss": 1.833, + "step": 12779 + }, + { + "epoch": 0.5406548777392335, + "grad_norm": 0.15055324137210846, + "learning_rate": 0.001, + "loss": 1.4764, + "step": 12780 + }, + { + "epoch": 0.5406971825027498, + "grad_norm": 0.19887371361255646, + "learning_rate": 0.001, + "loss": 2.6805, + "step": 12781 + }, + { + "epoch": 0.5407394872662662, + "grad_norm": 0.19075161218643188, + "learning_rate": 0.001, + "loss": 2.7236, + "step": 12782 + }, + { + "epoch": 0.5407817920297826, + "grad_norm": 0.14533232152462006, + "learning_rate": 0.001, + "loss": 1.5061, + "step": 12783 + }, + { + "epoch": 0.5408240967932989, + "grad_norm": 0.26192668080329895, + "learning_rate": 0.001, + "loss": 2.8406, + "step": 12784 + }, + { + "epoch": 0.5408664015568153, + "grad_norm": 0.1515062302350998, + "learning_rate": 0.001, + "loss": 2.997, + "step": 12785 + }, + { + "epoch": 0.5409087063203317, + "grad_norm": 4.155677795410156, + "learning_rate": 0.001, + "loss": 2.7793, + "step": 12786 + }, + { + "epoch": 0.540951011083848, + "grad_norm": 0.24522151052951813, + "learning_rate": 0.001, + "loss": 1.8206, + "step": 12787 + }, + { + "epoch": 0.5409933158473644, + "grad_norm": 1.2809727191925049, + "learning_rate": 0.001, + "loss": 3.2282, + "step": 12788 + }, + { + "epoch": 0.5410356206108807, + "grad_norm": 0.21048471331596375, + "learning_rate": 0.001, + "loss": 2.7134, + "step": 12789 + }, + { + "epoch": 0.5410779253743971, + "grad_norm": 0.1910332590341568, + "learning_rate": 0.001, + "loss": 1.9444, + "step": 12790 + }, + { + "epoch": 0.5411202301379135, + "grad_norm": 0.20288752019405365, + "learning_rate": 0.001, + "loss": 1.7156, + "step": 12791 + }, + { + "epoch": 0.5411625349014298, + "grad_norm": 0.1752224564552307, + "learning_rate": 0.001, + "loss": 2.0678, + "step": 12792 + }, + { + "epoch": 0.5412048396649463, + "grad_norm": 0.18332380056381226, + "learning_rate": 0.001, + "loss": 1.6786, + "step": 12793 + }, + { + "epoch": 0.5412471444284627, + "grad_norm": 0.16607828438282013, + "learning_rate": 0.001, + "loss": 2.0846, + "step": 12794 + }, + { + "epoch": 0.541289449191979, + "grad_norm": 0.2198660671710968, + "learning_rate": 0.001, + "loss": 2.083, + "step": 12795 + }, + { + "epoch": 0.5413317539554954, + "grad_norm": 0.19361701607704163, + "learning_rate": 0.001, + "loss": 1.8434, + "step": 12796 + }, + { + "epoch": 0.5413740587190118, + "grad_norm": 0.17056864500045776, + "learning_rate": 0.001, + "loss": 1.7502, + "step": 12797 + }, + { + "epoch": 0.5414163634825281, + "grad_norm": 0.2116164267063141, + "learning_rate": 0.001, + "loss": 2.3516, + "step": 12798 + }, + { + "epoch": 0.5414586682460445, + "grad_norm": 0.1802276223897934, + "learning_rate": 0.001, + "loss": 2.6387, + "step": 12799 + }, + { + "epoch": 0.5415009730095609, + "grad_norm": 0.1857595592737198, + "learning_rate": 0.001, + "loss": 2.052, + "step": 12800 + }, + { + "epoch": 0.5415432777730772, + "grad_norm": 0.16181547939777374, + "learning_rate": 0.001, + "loss": 2.9238, + "step": 12801 + }, + { + "epoch": 0.5415855825365936, + "grad_norm": 0.17999279499053955, + "learning_rate": 0.001, + "loss": 2.1358, + "step": 12802 + }, + { + "epoch": 0.54162788730011, + "grad_norm": 0.17646092176437378, + "learning_rate": 0.001, + "loss": 1.9901, + "step": 12803 + }, + { + "epoch": 0.5416701920636263, + "grad_norm": 0.19860303401947021, + "learning_rate": 0.001, + "loss": 2.5992, + "step": 12804 + }, + { + "epoch": 0.5417124968271427, + "grad_norm": 0.43836483359336853, + "learning_rate": 0.001, + "loss": 2.0902, + "step": 12805 + }, + { + "epoch": 0.5417548015906591, + "grad_norm": 0.32996633648872375, + "learning_rate": 0.001, + "loss": 1.8199, + "step": 12806 + }, + { + "epoch": 0.5417971063541754, + "grad_norm": 0.16782180964946747, + "learning_rate": 0.001, + "loss": 1.8684, + "step": 12807 + }, + { + "epoch": 0.5418394111176918, + "grad_norm": 0.29726442694664, + "learning_rate": 0.001, + "loss": 2.1423, + "step": 12808 + }, + { + "epoch": 0.5418817158812083, + "grad_norm": 0.19097116589546204, + "learning_rate": 0.001, + "loss": 3.4195, + "step": 12809 + }, + { + "epoch": 0.5419240206447246, + "grad_norm": 0.41679999232292175, + "learning_rate": 0.001, + "loss": 1.9064, + "step": 12810 + }, + { + "epoch": 0.541966325408241, + "grad_norm": 0.15182755887508392, + "learning_rate": 0.001, + "loss": 2.4639, + "step": 12811 + }, + { + "epoch": 0.5420086301717574, + "grad_norm": 0.16130530834197998, + "learning_rate": 0.001, + "loss": 1.777, + "step": 12812 + }, + { + "epoch": 0.5420509349352737, + "grad_norm": 0.15703986585140228, + "learning_rate": 0.001, + "loss": 2.4515, + "step": 12813 + }, + { + "epoch": 0.5420932396987901, + "grad_norm": 22.553680419921875, + "learning_rate": 0.001, + "loss": 2.077, + "step": 12814 + }, + { + "epoch": 0.5421355444623065, + "grad_norm": 0.17222054302692413, + "learning_rate": 0.001, + "loss": 1.8372, + "step": 12815 + }, + { + "epoch": 0.5421778492258228, + "grad_norm": 0.21091894805431366, + "learning_rate": 0.001, + "loss": 2.3193, + "step": 12816 + }, + { + "epoch": 0.5422201539893392, + "grad_norm": 0.17758257687091827, + "learning_rate": 0.001, + "loss": 2.2889, + "step": 12817 + }, + { + "epoch": 0.5422624587528556, + "grad_norm": 0.2192198932170868, + "learning_rate": 0.001, + "loss": 2.8897, + "step": 12818 + }, + { + "epoch": 0.5423047635163719, + "grad_norm": 0.16406521201133728, + "learning_rate": 0.001, + "loss": 2.1214, + "step": 12819 + }, + { + "epoch": 0.5423470682798883, + "grad_norm": 0.29036909341812134, + "learning_rate": 0.001, + "loss": 1.6994, + "step": 12820 + }, + { + "epoch": 0.5423893730434047, + "grad_norm": 0.16406984627246857, + "learning_rate": 0.001, + "loss": 2.4504, + "step": 12821 + }, + { + "epoch": 0.542431677806921, + "grad_norm": 0.310654878616333, + "learning_rate": 0.001, + "loss": 2.7693, + "step": 12822 + }, + { + "epoch": 0.5424739825704374, + "grad_norm": 0.3856427073478699, + "learning_rate": 0.001, + "loss": 1.8619, + "step": 12823 + }, + { + "epoch": 0.5425162873339539, + "grad_norm": 0.15692728757858276, + "learning_rate": 0.001, + "loss": 2.5887, + "step": 12824 + }, + { + "epoch": 0.5425585920974701, + "grad_norm": 0.1917227804660797, + "learning_rate": 0.001, + "loss": 2.4362, + "step": 12825 + }, + { + "epoch": 0.5426008968609866, + "grad_norm": 0.16880708932876587, + "learning_rate": 0.001, + "loss": 2.6024, + "step": 12826 + }, + { + "epoch": 0.542643201624503, + "grad_norm": 0.22414904832839966, + "learning_rate": 0.001, + "loss": 1.8351, + "step": 12827 + }, + { + "epoch": 0.5426855063880193, + "grad_norm": 0.19142870604991913, + "learning_rate": 0.001, + "loss": 2.1463, + "step": 12828 + }, + { + "epoch": 0.5427278111515357, + "grad_norm": 0.9796238541603088, + "learning_rate": 0.001, + "loss": 2.4191, + "step": 12829 + }, + { + "epoch": 0.5427701159150521, + "grad_norm": 0.18335986137390137, + "learning_rate": 0.001, + "loss": 2.8576, + "step": 12830 + }, + { + "epoch": 0.5428124206785684, + "grad_norm": 0.162856325507164, + "learning_rate": 0.001, + "loss": 1.473, + "step": 12831 + }, + { + "epoch": 0.5428547254420848, + "grad_norm": 0.162824347615242, + "learning_rate": 0.001, + "loss": 1.9001, + "step": 12832 + }, + { + "epoch": 0.5428970302056012, + "grad_norm": 0.1477918028831482, + "learning_rate": 0.001, + "loss": 1.5069, + "step": 12833 + }, + { + "epoch": 0.5429393349691175, + "grad_norm": 0.21265248954296112, + "learning_rate": 0.001, + "loss": 2.1842, + "step": 12834 + }, + { + "epoch": 0.5429816397326339, + "grad_norm": 0.19099631905555725, + "learning_rate": 0.001, + "loss": 1.1751, + "step": 12835 + }, + { + "epoch": 0.5430239444961502, + "grad_norm": 8.667723655700684, + "learning_rate": 0.001, + "loss": 1.6845, + "step": 12836 + }, + { + "epoch": 0.5430662492596666, + "grad_norm": 0.22022154927253723, + "learning_rate": 0.001, + "loss": 2.5801, + "step": 12837 + }, + { + "epoch": 0.543108554023183, + "grad_norm": 0.17374145984649658, + "learning_rate": 0.001, + "loss": 2.4922, + "step": 12838 + }, + { + "epoch": 0.5431508587866993, + "grad_norm": 0.16898944973945618, + "learning_rate": 0.001, + "loss": 1.8262, + "step": 12839 + }, + { + "epoch": 0.5431931635502157, + "grad_norm": 0.18354932963848114, + "learning_rate": 0.001, + "loss": 2.2251, + "step": 12840 + }, + { + "epoch": 0.5432354683137322, + "grad_norm": 0.16196799278259277, + "learning_rate": 0.001, + "loss": 2.6641, + "step": 12841 + }, + { + "epoch": 0.5432777730772484, + "grad_norm": 0.2222384810447693, + "learning_rate": 0.001, + "loss": 2.1457, + "step": 12842 + }, + { + "epoch": 0.5433200778407649, + "grad_norm": 0.19985118508338928, + "learning_rate": 0.001, + "loss": 2.302, + "step": 12843 + }, + { + "epoch": 0.5433623826042813, + "grad_norm": 0.1899694949388504, + "learning_rate": 0.001, + "loss": 3.4242, + "step": 12844 + }, + { + "epoch": 0.5434046873677976, + "grad_norm": 0.19545935094356537, + "learning_rate": 0.001, + "loss": 1.9181, + "step": 12845 + }, + { + "epoch": 0.543446992131314, + "grad_norm": 0.16950981318950653, + "learning_rate": 0.001, + "loss": 2.4149, + "step": 12846 + }, + { + "epoch": 0.5434892968948304, + "grad_norm": 0.19809971749782562, + "learning_rate": 0.001, + "loss": 1.6052, + "step": 12847 + }, + { + "epoch": 0.5435316016583467, + "grad_norm": 0.2023109793663025, + "learning_rate": 0.001, + "loss": 2.1515, + "step": 12848 + }, + { + "epoch": 0.5435739064218631, + "grad_norm": 0.889703631401062, + "learning_rate": 0.001, + "loss": 2.0461, + "step": 12849 + }, + { + "epoch": 0.5436162111853795, + "grad_norm": 0.18343132734298706, + "learning_rate": 0.001, + "loss": 2.4869, + "step": 12850 + }, + { + "epoch": 0.5436585159488958, + "grad_norm": 0.16023565828800201, + "learning_rate": 0.001, + "loss": 2.1514, + "step": 12851 + }, + { + "epoch": 0.5437008207124122, + "grad_norm": 0.16939814388751984, + "learning_rate": 0.001, + "loss": 1.9151, + "step": 12852 + }, + { + "epoch": 0.5437431254759286, + "grad_norm": 0.16399548947811127, + "learning_rate": 0.001, + "loss": 2.4465, + "step": 12853 + }, + { + "epoch": 0.5437854302394449, + "grad_norm": 0.1430288702249527, + "learning_rate": 0.001, + "loss": 1.7558, + "step": 12854 + }, + { + "epoch": 0.5438277350029613, + "grad_norm": 0.5551788210868835, + "learning_rate": 0.001, + "loss": 1.9211, + "step": 12855 + }, + { + "epoch": 0.5438700397664777, + "grad_norm": 0.15507280826568604, + "learning_rate": 0.001, + "loss": 1.4507, + "step": 12856 + }, + { + "epoch": 0.543912344529994, + "grad_norm": 0.27142950892448425, + "learning_rate": 0.001, + "loss": 2.3643, + "step": 12857 + }, + { + "epoch": 0.5439546492935105, + "grad_norm": 0.7331579327583313, + "learning_rate": 0.001, + "loss": 3.3138, + "step": 12858 + }, + { + "epoch": 0.5439969540570269, + "grad_norm": 1.1282596588134766, + "learning_rate": 0.001, + "loss": 2.4492, + "step": 12859 + }, + { + "epoch": 0.5440392588205432, + "grad_norm": 0.1771765947341919, + "learning_rate": 0.001, + "loss": 2.3039, + "step": 12860 + }, + { + "epoch": 0.5440815635840596, + "grad_norm": 0.24000783264636993, + "learning_rate": 0.001, + "loss": 1.6899, + "step": 12861 + }, + { + "epoch": 0.544123868347576, + "grad_norm": 0.23145051300525665, + "learning_rate": 0.001, + "loss": 2.4085, + "step": 12862 + }, + { + "epoch": 0.5441661731110923, + "grad_norm": 0.21521174907684326, + "learning_rate": 0.001, + "loss": 3.2071, + "step": 12863 + }, + { + "epoch": 0.5442084778746087, + "grad_norm": 0.16820016503334045, + "learning_rate": 0.001, + "loss": 2.267, + "step": 12864 + }, + { + "epoch": 0.5442507826381251, + "grad_norm": 0.21583041548728943, + "learning_rate": 0.001, + "loss": 2.1649, + "step": 12865 + }, + { + "epoch": 0.5442930874016414, + "grad_norm": 0.15462155640125275, + "learning_rate": 0.001, + "loss": 2.2514, + "step": 12866 + }, + { + "epoch": 0.5443353921651578, + "grad_norm": 0.30019253492355347, + "learning_rate": 0.001, + "loss": 2.4805, + "step": 12867 + }, + { + "epoch": 0.5443776969286742, + "grad_norm": 0.17040373384952545, + "learning_rate": 0.001, + "loss": 2.1466, + "step": 12868 + }, + { + "epoch": 0.5444200016921905, + "grad_norm": 0.2993798553943634, + "learning_rate": 0.001, + "loss": 1.5786, + "step": 12869 + }, + { + "epoch": 0.5444623064557069, + "grad_norm": 0.17798520624637604, + "learning_rate": 0.001, + "loss": 1.7204, + "step": 12870 + }, + { + "epoch": 0.5445046112192233, + "grad_norm": 0.18030913174152374, + "learning_rate": 0.001, + "loss": 2.9747, + "step": 12871 + }, + { + "epoch": 0.5445469159827396, + "grad_norm": 0.22262489795684814, + "learning_rate": 0.001, + "loss": 2.4943, + "step": 12872 + }, + { + "epoch": 0.544589220746256, + "grad_norm": 0.16953717172145844, + "learning_rate": 0.001, + "loss": 3.0568, + "step": 12873 + }, + { + "epoch": 0.5446315255097725, + "grad_norm": 0.34046247601509094, + "learning_rate": 0.001, + "loss": 2.8593, + "step": 12874 + }, + { + "epoch": 0.5446738302732888, + "grad_norm": 0.1570599377155304, + "learning_rate": 0.001, + "loss": 2.5897, + "step": 12875 + }, + { + "epoch": 0.5447161350368052, + "grad_norm": 0.17176446318626404, + "learning_rate": 0.001, + "loss": 1.9241, + "step": 12876 + }, + { + "epoch": 0.5447584398003216, + "grad_norm": 0.17753027379512787, + "learning_rate": 0.001, + "loss": 2.161, + "step": 12877 + }, + { + "epoch": 0.5448007445638379, + "grad_norm": 0.1912051886320114, + "learning_rate": 0.001, + "loss": 1.8007, + "step": 12878 + }, + { + "epoch": 0.5448430493273543, + "grad_norm": 0.49159884452819824, + "learning_rate": 0.001, + "loss": 1.7072, + "step": 12879 + }, + { + "epoch": 0.5448853540908706, + "grad_norm": 0.23397985100746155, + "learning_rate": 0.001, + "loss": 1.9658, + "step": 12880 + }, + { + "epoch": 0.544927658854387, + "grad_norm": 0.17725947499275208, + "learning_rate": 0.001, + "loss": 1.8988, + "step": 12881 + }, + { + "epoch": 0.5449699636179034, + "grad_norm": 0.14953695237636566, + "learning_rate": 0.001, + "loss": 2.4378, + "step": 12882 + }, + { + "epoch": 0.5450122683814197, + "grad_norm": 0.1612466275691986, + "learning_rate": 0.001, + "loss": 2.0907, + "step": 12883 + }, + { + "epoch": 0.5450545731449361, + "grad_norm": 0.2178923636674881, + "learning_rate": 0.001, + "loss": 2.9344, + "step": 12884 + }, + { + "epoch": 0.5450968779084525, + "grad_norm": 0.19458921253681183, + "learning_rate": 0.001, + "loss": 2.2585, + "step": 12885 + }, + { + "epoch": 0.5451391826719688, + "grad_norm": 0.24837756156921387, + "learning_rate": 0.001, + "loss": 2.0981, + "step": 12886 + }, + { + "epoch": 0.5451814874354852, + "grad_norm": 0.16423627734184265, + "learning_rate": 0.001, + "loss": 1.4668, + "step": 12887 + }, + { + "epoch": 0.5452237921990016, + "grad_norm": 0.17279313504695892, + "learning_rate": 0.001, + "loss": 1.8586, + "step": 12888 + }, + { + "epoch": 0.5452660969625179, + "grad_norm": 0.1528485268354416, + "learning_rate": 0.001, + "loss": 2.2937, + "step": 12889 + }, + { + "epoch": 0.5453084017260343, + "grad_norm": 0.14856481552124023, + "learning_rate": 0.001, + "loss": 2.3376, + "step": 12890 + }, + { + "epoch": 0.5453507064895508, + "grad_norm": 0.1969471573829651, + "learning_rate": 0.001, + "loss": 2.8098, + "step": 12891 + }, + { + "epoch": 0.545393011253067, + "grad_norm": 0.17648574709892273, + "learning_rate": 0.001, + "loss": 2.3516, + "step": 12892 + }, + { + "epoch": 0.5454353160165835, + "grad_norm": 0.2679540812969208, + "learning_rate": 0.001, + "loss": 2.5151, + "step": 12893 + }, + { + "epoch": 0.5454776207800999, + "grad_norm": 0.17323081195354462, + "learning_rate": 0.001, + "loss": 1.472, + "step": 12894 + }, + { + "epoch": 0.5455199255436162, + "grad_norm": 0.7942979335784912, + "learning_rate": 0.001, + "loss": 2.2294, + "step": 12895 + }, + { + "epoch": 0.5455622303071326, + "grad_norm": 0.2997882664203644, + "learning_rate": 0.001, + "loss": 2.7284, + "step": 12896 + }, + { + "epoch": 0.545604535070649, + "grad_norm": 0.1583070456981659, + "learning_rate": 0.001, + "loss": 2.6989, + "step": 12897 + }, + { + "epoch": 0.5456468398341653, + "grad_norm": 0.19625523686408997, + "learning_rate": 0.001, + "loss": 1.7012, + "step": 12898 + }, + { + "epoch": 0.5456891445976817, + "grad_norm": 0.16871203482151031, + "learning_rate": 0.001, + "loss": 1.7467, + "step": 12899 + }, + { + "epoch": 0.5457314493611981, + "grad_norm": 0.17078301310539246, + "learning_rate": 0.001, + "loss": 2.7648, + "step": 12900 + }, + { + "epoch": 0.5457737541247144, + "grad_norm": 0.1832430213689804, + "learning_rate": 0.001, + "loss": 2.109, + "step": 12901 + }, + { + "epoch": 0.5458160588882308, + "grad_norm": 0.18720830976963043, + "learning_rate": 0.001, + "loss": 2.4266, + "step": 12902 + }, + { + "epoch": 0.5458583636517472, + "grad_norm": 0.7275272011756897, + "learning_rate": 0.001, + "loss": 2.3496, + "step": 12903 + }, + { + "epoch": 0.5459006684152635, + "grad_norm": 0.1972072273492813, + "learning_rate": 0.001, + "loss": 1.8186, + "step": 12904 + }, + { + "epoch": 0.5459429731787799, + "grad_norm": 0.17827208340168, + "learning_rate": 0.001, + "loss": 2.7438, + "step": 12905 + }, + { + "epoch": 0.5459852779422963, + "grad_norm": 7.450066089630127, + "learning_rate": 0.001, + "loss": 2.246, + "step": 12906 + }, + { + "epoch": 0.5460275827058126, + "grad_norm": 0.15848639607429504, + "learning_rate": 0.001, + "loss": 1.7138, + "step": 12907 + }, + { + "epoch": 0.546069887469329, + "grad_norm": 0.1964915245771408, + "learning_rate": 0.001, + "loss": 1.9687, + "step": 12908 + }, + { + "epoch": 0.5461121922328455, + "grad_norm": 0.1862536370754242, + "learning_rate": 0.001, + "loss": 1.9072, + "step": 12909 + }, + { + "epoch": 0.5461544969963618, + "grad_norm": 0.18136167526245117, + "learning_rate": 0.001, + "loss": 1.7342, + "step": 12910 + }, + { + "epoch": 0.5461968017598782, + "grad_norm": 0.16859842836856842, + "learning_rate": 0.001, + "loss": 2.6862, + "step": 12911 + }, + { + "epoch": 0.5462391065233946, + "grad_norm": 0.4886167049407959, + "learning_rate": 0.001, + "loss": 2.0441, + "step": 12912 + }, + { + "epoch": 0.5462814112869109, + "grad_norm": 0.25893867015838623, + "learning_rate": 0.001, + "loss": 3.6045, + "step": 12913 + }, + { + "epoch": 0.5463237160504273, + "grad_norm": 0.2573549747467041, + "learning_rate": 0.001, + "loss": 1.9222, + "step": 12914 + }, + { + "epoch": 0.5463660208139437, + "grad_norm": 0.17247262597084045, + "learning_rate": 0.001, + "loss": 2.6157, + "step": 12915 + }, + { + "epoch": 0.54640832557746, + "grad_norm": 2.4677224159240723, + "learning_rate": 0.001, + "loss": 2.0424, + "step": 12916 + }, + { + "epoch": 0.5464506303409764, + "grad_norm": 0.20727023482322693, + "learning_rate": 0.001, + "loss": 1.9061, + "step": 12917 + }, + { + "epoch": 0.5464929351044928, + "grad_norm": 0.1556202620267868, + "learning_rate": 0.001, + "loss": 2.4613, + "step": 12918 + }, + { + "epoch": 0.5465352398680091, + "grad_norm": 0.18038994073867798, + "learning_rate": 0.001, + "loss": 2.001, + "step": 12919 + }, + { + "epoch": 0.5465775446315255, + "grad_norm": 0.20001617074012756, + "learning_rate": 0.001, + "loss": 2.1099, + "step": 12920 + }, + { + "epoch": 0.5466198493950419, + "grad_norm": 0.15489211678504944, + "learning_rate": 0.001, + "loss": 1.7065, + "step": 12921 + }, + { + "epoch": 0.5466621541585582, + "grad_norm": 0.41907620429992676, + "learning_rate": 0.001, + "loss": 2.915, + "step": 12922 + }, + { + "epoch": 0.5467044589220746, + "grad_norm": 0.15193894505500793, + "learning_rate": 0.001, + "loss": 2.4002, + "step": 12923 + }, + { + "epoch": 0.546746763685591, + "grad_norm": 0.16042934358119965, + "learning_rate": 0.001, + "loss": 1.7455, + "step": 12924 + }, + { + "epoch": 0.5467890684491074, + "grad_norm": 8.326644897460938, + "learning_rate": 0.001, + "loss": 2.3913, + "step": 12925 + }, + { + "epoch": 0.5468313732126238, + "grad_norm": 0.22436167299747467, + "learning_rate": 0.001, + "loss": 2.9849, + "step": 12926 + }, + { + "epoch": 0.5468736779761401, + "grad_norm": 0.17266489565372467, + "learning_rate": 0.001, + "loss": 1.7618, + "step": 12927 + }, + { + "epoch": 0.5469159827396565, + "grad_norm": 0.21502335369586945, + "learning_rate": 0.001, + "loss": 1.632, + "step": 12928 + }, + { + "epoch": 0.5469582875031729, + "grad_norm": 0.18782548606395721, + "learning_rate": 0.001, + "loss": 2.7248, + "step": 12929 + }, + { + "epoch": 0.5470005922666892, + "grad_norm": 0.44364723563194275, + "learning_rate": 0.001, + "loss": 2.0572, + "step": 12930 + }, + { + "epoch": 0.5470428970302056, + "grad_norm": 0.22250796854496002, + "learning_rate": 0.001, + "loss": 1.8936, + "step": 12931 + }, + { + "epoch": 0.547085201793722, + "grad_norm": 0.19908872246742249, + "learning_rate": 0.001, + "loss": 2.5246, + "step": 12932 + }, + { + "epoch": 0.5471275065572383, + "grad_norm": 0.15950053930282593, + "learning_rate": 0.001, + "loss": 2.6949, + "step": 12933 + }, + { + "epoch": 0.5471698113207547, + "grad_norm": 0.45760655403137207, + "learning_rate": 0.001, + "loss": 1.5259, + "step": 12934 + }, + { + "epoch": 0.5472121160842711, + "grad_norm": 1.588517427444458, + "learning_rate": 0.001, + "loss": 3.1214, + "step": 12935 + }, + { + "epoch": 0.5472544208477874, + "grad_norm": 0.22262723743915558, + "learning_rate": 0.001, + "loss": 3.0762, + "step": 12936 + }, + { + "epoch": 0.5472967256113038, + "grad_norm": 0.1947306990623474, + "learning_rate": 0.001, + "loss": 2.2629, + "step": 12937 + }, + { + "epoch": 0.5473390303748202, + "grad_norm": 0.31676188111305237, + "learning_rate": 0.001, + "loss": 1.8793, + "step": 12938 + }, + { + "epoch": 0.5473813351383365, + "grad_norm": 0.2185761034488678, + "learning_rate": 0.001, + "loss": 2.5543, + "step": 12939 + }, + { + "epoch": 0.5474236399018529, + "grad_norm": 0.21403570473194122, + "learning_rate": 0.001, + "loss": 2.5567, + "step": 12940 + }, + { + "epoch": 0.5474659446653694, + "grad_norm": 0.23452875018119812, + "learning_rate": 0.001, + "loss": 2.3212, + "step": 12941 + }, + { + "epoch": 0.5475082494288857, + "grad_norm": 0.1822385936975479, + "learning_rate": 0.001, + "loss": 1.6779, + "step": 12942 + }, + { + "epoch": 0.5475505541924021, + "grad_norm": 1.615887999534607, + "learning_rate": 0.001, + "loss": 1.8927, + "step": 12943 + }, + { + "epoch": 0.5475928589559185, + "grad_norm": 0.2374815195798874, + "learning_rate": 0.001, + "loss": 1.8798, + "step": 12944 + }, + { + "epoch": 0.5476351637194348, + "grad_norm": 0.3294655680656433, + "learning_rate": 0.001, + "loss": 3.5583, + "step": 12945 + }, + { + "epoch": 0.5476774684829512, + "grad_norm": 0.17319463193416595, + "learning_rate": 0.001, + "loss": 2.3341, + "step": 12946 + }, + { + "epoch": 0.5477197732464676, + "grad_norm": 0.18420511484146118, + "learning_rate": 0.001, + "loss": 2.6946, + "step": 12947 + }, + { + "epoch": 0.5477620780099839, + "grad_norm": 0.5846876502037048, + "learning_rate": 0.001, + "loss": 2.2856, + "step": 12948 + }, + { + "epoch": 0.5478043827735003, + "grad_norm": 0.23985953629016876, + "learning_rate": 0.001, + "loss": 2.0719, + "step": 12949 + }, + { + "epoch": 0.5478466875370167, + "grad_norm": 0.1828928142786026, + "learning_rate": 0.001, + "loss": 2.1034, + "step": 12950 + }, + { + "epoch": 0.547888992300533, + "grad_norm": 0.2998861074447632, + "learning_rate": 0.001, + "loss": 1.9081, + "step": 12951 + }, + { + "epoch": 0.5479312970640494, + "grad_norm": 0.22397345304489136, + "learning_rate": 0.001, + "loss": 2.3908, + "step": 12952 + }, + { + "epoch": 0.5479736018275658, + "grad_norm": 0.19218890368938446, + "learning_rate": 0.001, + "loss": 2.7397, + "step": 12953 + }, + { + "epoch": 0.5480159065910821, + "grad_norm": 0.1939002424478531, + "learning_rate": 0.001, + "loss": 2.3556, + "step": 12954 + }, + { + "epoch": 0.5480582113545985, + "grad_norm": 0.18285121023654938, + "learning_rate": 0.001, + "loss": 2.6213, + "step": 12955 + }, + { + "epoch": 0.5481005161181149, + "grad_norm": 0.21203845739364624, + "learning_rate": 0.001, + "loss": 2.7509, + "step": 12956 + }, + { + "epoch": 0.5481428208816312, + "grad_norm": 0.306901752948761, + "learning_rate": 0.001, + "loss": 2.286, + "step": 12957 + }, + { + "epoch": 0.5481851256451477, + "grad_norm": 0.18208357691764832, + "learning_rate": 0.001, + "loss": 2.0693, + "step": 12958 + }, + { + "epoch": 0.5482274304086641, + "grad_norm": 0.22899079322814941, + "learning_rate": 0.001, + "loss": 2.8397, + "step": 12959 + }, + { + "epoch": 0.5482697351721804, + "grad_norm": 0.16618044674396515, + "learning_rate": 0.001, + "loss": 2.9579, + "step": 12960 + }, + { + "epoch": 0.5483120399356968, + "grad_norm": 0.17978453636169434, + "learning_rate": 0.001, + "loss": 1.903, + "step": 12961 + }, + { + "epoch": 0.5483543446992132, + "grad_norm": 0.18377989530563354, + "learning_rate": 0.001, + "loss": 2.4393, + "step": 12962 + }, + { + "epoch": 0.5483966494627295, + "grad_norm": 0.21771173179149628, + "learning_rate": 0.001, + "loss": 3.4764, + "step": 12963 + }, + { + "epoch": 0.5484389542262459, + "grad_norm": 0.16839885711669922, + "learning_rate": 0.001, + "loss": 1.7648, + "step": 12964 + }, + { + "epoch": 0.5484812589897623, + "grad_norm": 0.17164848744869232, + "learning_rate": 0.001, + "loss": 1.8755, + "step": 12965 + }, + { + "epoch": 0.5485235637532786, + "grad_norm": 0.25105589628219604, + "learning_rate": 0.001, + "loss": 2.2976, + "step": 12966 + }, + { + "epoch": 0.548565868516795, + "grad_norm": 0.17218393087387085, + "learning_rate": 0.001, + "loss": 1.842, + "step": 12967 + }, + { + "epoch": 0.5486081732803114, + "grad_norm": 0.2835632264614105, + "learning_rate": 0.001, + "loss": 2.5885, + "step": 12968 + }, + { + "epoch": 0.5486504780438277, + "grad_norm": 0.2169799655675888, + "learning_rate": 0.001, + "loss": 2.1249, + "step": 12969 + }, + { + "epoch": 0.5486927828073441, + "grad_norm": 0.28637203574180603, + "learning_rate": 0.001, + "loss": 1.6262, + "step": 12970 + }, + { + "epoch": 0.5487350875708604, + "grad_norm": 0.2180042862892151, + "learning_rate": 0.001, + "loss": 1.9191, + "step": 12971 + }, + { + "epoch": 0.5487773923343768, + "grad_norm": 0.20677872002124786, + "learning_rate": 0.001, + "loss": 3.2069, + "step": 12972 + }, + { + "epoch": 0.5488196970978932, + "grad_norm": 2.7703301906585693, + "learning_rate": 0.001, + "loss": 3.6456, + "step": 12973 + }, + { + "epoch": 0.5488620018614095, + "grad_norm": 0.4277131259441376, + "learning_rate": 0.001, + "loss": 3.3904, + "step": 12974 + }, + { + "epoch": 0.548904306624926, + "grad_norm": 0.19730469584465027, + "learning_rate": 0.001, + "loss": 2.4932, + "step": 12975 + }, + { + "epoch": 0.5489466113884424, + "grad_norm": 7.952031135559082, + "learning_rate": 0.001, + "loss": 1.7476, + "step": 12976 + }, + { + "epoch": 0.5489889161519587, + "grad_norm": 0.3109131157398224, + "learning_rate": 0.001, + "loss": 2.152, + "step": 12977 + }, + { + "epoch": 0.5490312209154751, + "grad_norm": 0.36644262075424194, + "learning_rate": 0.001, + "loss": 1.6802, + "step": 12978 + }, + { + "epoch": 0.5490735256789915, + "grad_norm": 0.24093574285507202, + "learning_rate": 0.001, + "loss": 1.4931, + "step": 12979 + }, + { + "epoch": 0.5491158304425078, + "grad_norm": 0.22954730689525604, + "learning_rate": 0.001, + "loss": 1.6246, + "step": 12980 + }, + { + "epoch": 0.5491581352060242, + "grad_norm": 0.21042773127555847, + "learning_rate": 0.001, + "loss": 2.835, + "step": 12981 + }, + { + "epoch": 0.5492004399695406, + "grad_norm": 0.1865551769733429, + "learning_rate": 0.001, + "loss": 2.28, + "step": 12982 + }, + { + "epoch": 0.5492427447330569, + "grad_norm": 0.15189926326274872, + "learning_rate": 0.001, + "loss": 1.629, + "step": 12983 + }, + { + "epoch": 0.5492850494965733, + "grad_norm": 0.3665443956851959, + "learning_rate": 0.001, + "loss": 2.1248, + "step": 12984 + }, + { + "epoch": 0.5493273542600897, + "grad_norm": 0.13915681838989258, + "learning_rate": 0.001, + "loss": 3.073, + "step": 12985 + }, + { + "epoch": 0.549369659023606, + "grad_norm": 0.1796284019947052, + "learning_rate": 0.001, + "loss": 1.1191, + "step": 12986 + }, + { + "epoch": 0.5494119637871224, + "grad_norm": 0.3686416745185852, + "learning_rate": 0.001, + "loss": 2.0302, + "step": 12987 + }, + { + "epoch": 0.5494542685506388, + "grad_norm": 0.685794472694397, + "learning_rate": 0.001, + "loss": 2.1283, + "step": 12988 + }, + { + "epoch": 0.5494965733141551, + "grad_norm": 0.20203375816345215, + "learning_rate": 0.001, + "loss": 3.4477, + "step": 12989 + }, + { + "epoch": 0.5495388780776715, + "grad_norm": 0.23336085677146912, + "learning_rate": 0.001, + "loss": 2.3422, + "step": 12990 + }, + { + "epoch": 0.549581182841188, + "grad_norm": 0.1661679744720459, + "learning_rate": 0.001, + "loss": 2.7796, + "step": 12991 + }, + { + "epoch": 0.5496234876047043, + "grad_norm": 0.17486825585365295, + "learning_rate": 0.001, + "loss": 2.0278, + "step": 12992 + }, + { + "epoch": 0.5496657923682207, + "grad_norm": 0.1599670946598053, + "learning_rate": 0.001, + "loss": 2.1133, + "step": 12993 + }, + { + "epoch": 0.5497080971317371, + "grad_norm": 0.17942267656326294, + "learning_rate": 0.001, + "loss": 2.227, + "step": 12994 + }, + { + "epoch": 0.5497504018952534, + "grad_norm": 0.18072116374969482, + "learning_rate": 0.001, + "loss": 1.7973, + "step": 12995 + }, + { + "epoch": 0.5497927066587698, + "grad_norm": 0.17400150001049042, + "learning_rate": 0.001, + "loss": 1.676, + "step": 12996 + }, + { + "epoch": 0.5498350114222862, + "grad_norm": 0.2297709584236145, + "learning_rate": 0.001, + "loss": 2.1502, + "step": 12997 + }, + { + "epoch": 0.5498773161858025, + "grad_norm": 0.2160550206899643, + "learning_rate": 0.001, + "loss": 2.0131, + "step": 12998 + }, + { + "epoch": 0.5499196209493189, + "grad_norm": 0.22192339599132538, + "learning_rate": 0.001, + "loss": 3.1413, + "step": 12999 + }, + { + "epoch": 0.5499619257128353, + "grad_norm": 0.2530875504016876, + "learning_rate": 0.001, + "loss": 2.6736, + "step": 13000 + }, + { + "epoch": 0.5500042304763516, + "grad_norm": 0.17102450132369995, + "learning_rate": 0.001, + "loss": 2.3392, + "step": 13001 + }, + { + "epoch": 0.550046535239868, + "grad_norm": 0.17439766228199005, + "learning_rate": 0.001, + "loss": 2.1533, + "step": 13002 + }, + { + "epoch": 0.5500888400033844, + "grad_norm": 0.21379339694976807, + "learning_rate": 0.001, + "loss": 2.4908, + "step": 13003 + }, + { + "epoch": 0.5501311447669007, + "grad_norm": 0.19133144617080688, + "learning_rate": 0.001, + "loss": 1.8931, + "step": 13004 + }, + { + "epoch": 0.5501734495304171, + "grad_norm": 0.1691633015871048, + "learning_rate": 0.001, + "loss": 1.3374, + "step": 13005 + }, + { + "epoch": 0.5502157542939335, + "grad_norm": 0.24116800725460052, + "learning_rate": 0.001, + "loss": 2.4278, + "step": 13006 + }, + { + "epoch": 0.5502580590574498, + "grad_norm": 0.19122330844402313, + "learning_rate": 0.001, + "loss": 2.0226, + "step": 13007 + }, + { + "epoch": 0.5503003638209663, + "grad_norm": 0.15570446848869324, + "learning_rate": 0.001, + "loss": 1.7573, + "step": 13008 + }, + { + "epoch": 0.5503426685844827, + "grad_norm": 0.20239442586898804, + "learning_rate": 0.001, + "loss": 1.9359, + "step": 13009 + }, + { + "epoch": 0.550384973347999, + "grad_norm": 0.1555495709180832, + "learning_rate": 0.001, + "loss": 3.2426, + "step": 13010 + }, + { + "epoch": 0.5504272781115154, + "grad_norm": 0.16427169740200043, + "learning_rate": 0.001, + "loss": 1.7691, + "step": 13011 + }, + { + "epoch": 0.5504695828750318, + "grad_norm": 0.16692587733268738, + "learning_rate": 0.001, + "loss": 2.0761, + "step": 13012 + }, + { + "epoch": 0.5505118876385481, + "grad_norm": 0.1881280094385147, + "learning_rate": 0.001, + "loss": 2.1607, + "step": 13013 + }, + { + "epoch": 0.5505541924020645, + "grad_norm": 0.1543162316083908, + "learning_rate": 0.001, + "loss": 2.4762, + "step": 13014 + }, + { + "epoch": 0.5505964971655808, + "grad_norm": 0.1498258113861084, + "learning_rate": 0.001, + "loss": 2.1568, + "step": 13015 + }, + { + "epoch": 0.5506388019290972, + "grad_norm": 0.15180440247058868, + "learning_rate": 0.001, + "loss": 2.5071, + "step": 13016 + }, + { + "epoch": 0.5506811066926136, + "grad_norm": 0.16765782237052917, + "learning_rate": 0.001, + "loss": 2.642, + "step": 13017 + }, + { + "epoch": 0.5507234114561299, + "grad_norm": 0.16857920587062836, + "learning_rate": 0.001, + "loss": 2.0443, + "step": 13018 + }, + { + "epoch": 0.5507657162196463, + "grad_norm": 0.1566455215215683, + "learning_rate": 0.001, + "loss": 2.5403, + "step": 13019 + }, + { + "epoch": 0.5508080209831627, + "grad_norm": 0.1734788417816162, + "learning_rate": 0.001, + "loss": 2.1921, + "step": 13020 + }, + { + "epoch": 0.550850325746679, + "grad_norm": 0.15811264514923096, + "learning_rate": 0.001, + "loss": 1.4409, + "step": 13021 + }, + { + "epoch": 0.5508926305101954, + "grad_norm": 0.25248953700065613, + "learning_rate": 0.001, + "loss": 1.9164, + "step": 13022 + }, + { + "epoch": 0.5509349352737118, + "grad_norm": 0.19516262412071228, + "learning_rate": 0.001, + "loss": 2.1415, + "step": 13023 + }, + { + "epoch": 0.5509772400372281, + "grad_norm": 0.16840478777885437, + "learning_rate": 0.001, + "loss": 2.2028, + "step": 13024 + }, + { + "epoch": 0.5510195448007446, + "grad_norm": 6.247500896453857, + "learning_rate": 0.001, + "loss": 2.3796, + "step": 13025 + }, + { + "epoch": 0.551061849564261, + "grad_norm": 2.1479499340057373, + "learning_rate": 0.001, + "loss": 2.4607, + "step": 13026 + }, + { + "epoch": 0.5511041543277773, + "grad_norm": 0.1944126933813095, + "learning_rate": 0.001, + "loss": 1.8729, + "step": 13027 + }, + { + "epoch": 0.5511464590912937, + "grad_norm": 0.16638629138469696, + "learning_rate": 0.001, + "loss": 1.7599, + "step": 13028 + }, + { + "epoch": 0.5511887638548101, + "grad_norm": 0.17286452651023865, + "learning_rate": 0.001, + "loss": 3.7786, + "step": 13029 + }, + { + "epoch": 0.5512310686183264, + "grad_norm": 0.18785692751407623, + "learning_rate": 0.001, + "loss": 1.624, + "step": 13030 + }, + { + "epoch": 0.5512733733818428, + "grad_norm": 0.2681181728839874, + "learning_rate": 0.001, + "loss": 2.4162, + "step": 13031 + }, + { + "epoch": 0.5513156781453592, + "grad_norm": 1.382765531539917, + "learning_rate": 0.001, + "loss": 2.8352, + "step": 13032 + }, + { + "epoch": 0.5513579829088755, + "grad_norm": 0.1791725754737854, + "learning_rate": 0.001, + "loss": 1.5965, + "step": 13033 + }, + { + "epoch": 0.5514002876723919, + "grad_norm": 0.22276152670383453, + "learning_rate": 0.001, + "loss": 2.2998, + "step": 13034 + }, + { + "epoch": 0.5514425924359083, + "grad_norm": 0.21398457884788513, + "learning_rate": 0.001, + "loss": 1.873, + "step": 13035 + }, + { + "epoch": 0.5514848971994246, + "grad_norm": 0.19248297810554504, + "learning_rate": 0.001, + "loss": 2.0772, + "step": 13036 + }, + { + "epoch": 0.551527201962941, + "grad_norm": 0.19134873151779175, + "learning_rate": 0.001, + "loss": 2.2879, + "step": 13037 + }, + { + "epoch": 0.5515695067264574, + "grad_norm": 0.2077552080154419, + "learning_rate": 0.001, + "loss": 2.2834, + "step": 13038 + }, + { + "epoch": 0.5516118114899737, + "grad_norm": 0.18088361620903015, + "learning_rate": 0.001, + "loss": 1.7441, + "step": 13039 + }, + { + "epoch": 0.5516541162534901, + "grad_norm": 0.19229553639888763, + "learning_rate": 0.001, + "loss": 2.3027, + "step": 13040 + }, + { + "epoch": 0.5516964210170066, + "grad_norm": 0.20166753232479095, + "learning_rate": 0.001, + "loss": 2.6727, + "step": 13041 + }, + { + "epoch": 0.5517387257805229, + "grad_norm": 0.24196840822696686, + "learning_rate": 0.001, + "loss": 2.5962, + "step": 13042 + }, + { + "epoch": 0.5517810305440393, + "grad_norm": 0.18725115060806274, + "learning_rate": 0.001, + "loss": 2.1676, + "step": 13043 + }, + { + "epoch": 0.5518233353075557, + "grad_norm": 0.20191504061222076, + "learning_rate": 0.001, + "loss": 2.0362, + "step": 13044 + }, + { + "epoch": 0.551865640071072, + "grad_norm": 0.1696188747882843, + "learning_rate": 0.001, + "loss": 2.5537, + "step": 13045 + }, + { + "epoch": 0.5519079448345884, + "grad_norm": 0.28044790029525757, + "learning_rate": 0.001, + "loss": 1.6491, + "step": 13046 + }, + { + "epoch": 0.5519502495981048, + "grad_norm": 0.15965206921100616, + "learning_rate": 0.001, + "loss": 2.5324, + "step": 13047 + }, + { + "epoch": 0.5519925543616211, + "grad_norm": 0.16759942471981049, + "learning_rate": 0.001, + "loss": 2.6668, + "step": 13048 + }, + { + "epoch": 0.5520348591251375, + "grad_norm": 0.16884449124336243, + "learning_rate": 0.001, + "loss": 1.9035, + "step": 13049 + }, + { + "epoch": 0.5520771638886539, + "grad_norm": 0.39007458090782166, + "learning_rate": 0.001, + "loss": 1.7417, + "step": 13050 + }, + { + "epoch": 0.5521194686521702, + "grad_norm": 0.15368978679180145, + "learning_rate": 0.001, + "loss": 3.4138, + "step": 13051 + }, + { + "epoch": 0.5521617734156866, + "grad_norm": 0.443830281496048, + "learning_rate": 0.001, + "loss": 1.5381, + "step": 13052 + }, + { + "epoch": 0.552204078179203, + "grad_norm": 0.29350072145462036, + "learning_rate": 0.001, + "loss": 2.4967, + "step": 13053 + }, + { + "epoch": 0.5522463829427193, + "grad_norm": 0.14899539947509766, + "learning_rate": 0.001, + "loss": 2.6561, + "step": 13054 + }, + { + "epoch": 0.5522886877062357, + "grad_norm": 0.18092837929725647, + "learning_rate": 0.001, + "loss": 1.9984, + "step": 13055 + }, + { + "epoch": 0.5523309924697521, + "grad_norm": 0.21982833743095398, + "learning_rate": 0.001, + "loss": 3.1229, + "step": 13056 + }, + { + "epoch": 0.5523732972332684, + "grad_norm": 1.6211110353469849, + "learning_rate": 0.001, + "loss": 1.9592, + "step": 13057 + }, + { + "epoch": 0.5524156019967849, + "grad_norm": 0.16090841591358185, + "learning_rate": 0.001, + "loss": 2.6147, + "step": 13058 + }, + { + "epoch": 0.5524579067603013, + "grad_norm": 0.20968280732631683, + "learning_rate": 0.001, + "loss": 2.6148, + "step": 13059 + }, + { + "epoch": 0.5525002115238176, + "grad_norm": 0.16014662384986877, + "learning_rate": 0.001, + "loss": 2.2449, + "step": 13060 + }, + { + "epoch": 0.552542516287334, + "grad_norm": 0.4493609070777893, + "learning_rate": 0.001, + "loss": 1.5071, + "step": 13061 + }, + { + "epoch": 0.5525848210508503, + "grad_norm": 0.20664609968662262, + "learning_rate": 0.001, + "loss": 2.2177, + "step": 13062 + }, + { + "epoch": 0.5526271258143667, + "grad_norm": 0.1800384223461151, + "learning_rate": 0.001, + "loss": 1.9051, + "step": 13063 + }, + { + "epoch": 0.5526694305778831, + "grad_norm": 0.16999885439872742, + "learning_rate": 0.001, + "loss": 2.2448, + "step": 13064 + }, + { + "epoch": 0.5527117353413994, + "grad_norm": 0.1717803031206131, + "learning_rate": 0.001, + "loss": 1.679, + "step": 13065 + }, + { + "epoch": 0.5527540401049158, + "grad_norm": 0.3044344186782837, + "learning_rate": 0.001, + "loss": 2.5296, + "step": 13066 + }, + { + "epoch": 0.5527963448684322, + "grad_norm": 0.15595632791519165, + "learning_rate": 0.001, + "loss": 1.7021, + "step": 13067 + }, + { + "epoch": 0.5528386496319485, + "grad_norm": 0.15000808238983154, + "learning_rate": 0.001, + "loss": 1.5892, + "step": 13068 + }, + { + "epoch": 0.5528809543954649, + "grad_norm": 1.946815848350525, + "learning_rate": 0.001, + "loss": 1.4456, + "step": 13069 + }, + { + "epoch": 0.5529232591589813, + "grad_norm": 0.16129404306411743, + "learning_rate": 0.001, + "loss": 1.5363, + "step": 13070 + }, + { + "epoch": 0.5529655639224976, + "grad_norm": 0.22747275233268738, + "learning_rate": 0.001, + "loss": 1.9507, + "step": 13071 + }, + { + "epoch": 0.553007868686014, + "grad_norm": 0.16564899682998657, + "learning_rate": 0.001, + "loss": 2.298, + "step": 13072 + }, + { + "epoch": 0.5530501734495304, + "grad_norm": 0.1645679473876953, + "learning_rate": 0.001, + "loss": 2.7037, + "step": 13073 + }, + { + "epoch": 0.5530924782130467, + "grad_norm": 0.18701903522014618, + "learning_rate": 0.001, + "loss": 1.9455, + "step": 13074 + }, + { + "epoch": 0.5531347829765632, + "grad_norm": 0.16058051586151123, + "learning_rate": 0.001, + "loss": 2.1071, + "step": 13075 + }, + { + "epoch": 0.5531770877400796, + "grad_norm": 0.16248644888401031, + "learning_rate": 0.001, + "loss": 2.7373, + "step": 13076 + }, + { + "epoch": 0.5532193925035959, + "grad_norm": 0.20399704575538635, + "learning_rate": 0.001, + "loss": 2.0442, + "step": 13077 + }, + { + "epoch": 0.5532616972671123, + "grad_norm": 0.14781156182289124, + "learning_rate": 0.001, + "loss": 1.7802, + "step": 13078 + }, + { + "epoch": 0.5533040020306287, + "grad_norm": 0.3947334587574005, + "learning_rate": 0.001, + "loss": 1.9844, + "step": 13079 + }, + { + "epoch": 0.553346306794145, + "grad_norm": 0.16816608607769012, + "learning_rate": 0.001, + "loss": 2.2254, + "step": 13080 + }, + { + "epoch": 0.5533886115576614, + "grad_norm": 0.548812210559845, + "learning_rate": 0.001, + "loss": 2.331, + "step": 13081 + }, + { + "epoch": 0.5534309163211778, + "grad_norm": 0.16253416240215302, + "learning_rate": 0.001, + "loss": 2.1435, + "step": 13082 + }, + { + "epoch": 0.5534732210846941, + "grad_norm": 0.33334457874298096, + "learning_rate": 0.001, + "loss": 1.7767, + "step": 13083 + }, + { + "epoch": 0.5535155258482105, + "grad_norm": 0.7046263813972473, + "learning_rate": 0.001, + "loss": 2.3232, + "step": 13084 + }, + { + "epoch": 0.5535578306117269, + "grad_norm": 0.18305683135986328, + "learning_rate": 0.001, + "loss": 2.4841, + "step": 13085 + }, + { + "epoch": 0.5536001353752432, + "grad_norm": 0.9522390365600586, + "learning_rate": 0.001, + "loss": 3.2129, + "step": 13086 + }, + { + "epoch": 0.5536424401387596, + "grad_norm": 0.16383132338523865, + "learning_rate": 0.001, + "loss": 1.8029, + "step": 13087 + }, + { + "epoch": 0.553684744902276, + "grad_norm": 0.16644108295440674, + "learning_rate": 0.001, + "loss": 1.9539, + "step": 13088 + }, + { + "epoch": 0.5537270496657923, + "grad_norm": 0.16594970226287842, + "learning_rate": 0.001, + "loss": 2.1514, + "step": 13089 + }, + { + "epoch": 0.5537693544293087, + "grad_norm": 0.19843560457229614, + "learning_rate": 0.001, + "loss": 2.751, + "step": 13090 + }, + { + "epoch": 0.5538116591928252, + "grad_norm": 0.166713684797287, + "learning_rate": 0.001, + "loss": 1.9202, + "step": 13091 + }, + { + "epoch": 0.5538539639563415, + "grad_norm": 0.18431521952152252, + "learning_rate": 0.001, + "loss": 2.3885, + "step": 13092 + }, + { + "epoch": 0.5538962687198579, + "grad_norm": 0.16817611455917358, + "learning_rate": 0.001, + "loss": 2.1286, + "step": 13093 + }, + { + "epoch": 0.5539385734833743, + "grad_norm": 0.15801174938678741, + "learning_rate": 0.001, + "loss": 1.8729, + "step": 13094 + }, + { + "epoch": 0.5539808782468906, + "grad_norm": 0.3943849802017212, + "learning_rate": 0.001, + "loss": 2.5361, + "step": 13095 + }, + { + "epoch": 0.554023183010407, + "grad_norm": 0.14820386469364166, + "learning_rate": 0.001, + "loss": 1.7892, + "step": 13096 + }, + { + "epoch": 0.5540654877739234, + "grad_norm": 0.15670961141586304, + "learning_rate": 0.001, + "loss": 2.8515, + "step": 13097 + }, + { + "epoch": 0.5541077925374397, + "grad_norm": 0.1757040172815323, + "learning_rate": 0.001, + "loss": 2.1255, + "step": 13098 + }, + { + "epoch": 0.5541500973009561, + "grad_norm": 0.5804983377456665, + "learning_rate": 0.001, + "loss": 1.6095, + "step": 13099 + }, + { + "epoch": 0.5541924020644725, + "grad_norm": 0.18276961147785187, + "learning_rate": 0.001, + "loss": 1.8125, + "step": 13100 + }, + { + "epoch": 0.5542347068279888, + "grad_norm": 0.24642734229564667, + "learning_rate": 0.001, + "loss": 2.0911, + "step": 13101 + }, + { + "epoch": 0.5542770115915052, + "grad_norm": 0.16965211927890778, + "learning_rate": 0.001, + "loss": 1.8867, + "step": 13102 + }, + { + "epoch": 0.5543193163550216, + "grad_norm": 0.1579209566116333, + "learning_rate": 0.001, + "loss": 1.7393, + "step": 13103 + }, + { + "epoch": 0.5543616211185379, + "grad_norm": 0.2219139188528061, + "learning_rate": 0.001, + "loss": 1.8149, + "step": 13104 + }, + { + "epoch": 0.5544039258820543, + "grad_norm": 0.17188802361488342, + "learning_rate": 0.001, + "loss": 1.8126, + "step": 13105 + }, + { + "epoch": 0.5544462306455706, + "grad_norm": 0.14392749965190887, + "learning_rate": 0.001, + "loss": 2.0954, + "step": 13106 + }, + { + "epoch": 0.554488535409087, + "grad_norm": 0.19999627768993378, + "learning_rate": 0.001, + "loss": 1.8572, + "step": 13107 + }, + { + "epoch": 0.5545308401726035, + "grad_norm": 0.20926088094711304, + "learning_rate": 0.001, + "loss": 3.1452, + "step": 13108 + }, + { + "epoch": 0.5545731449361198, + "grad_norm": 0.19249233603477478, + "learning_rate": 0.001, + "loss": 2.3985, + "step": 13109 + }, + { + "epoch": 0.5546154496996362, + "grad_norm": 0.1850895881652832, + "learning_rate": 0.001, + "loss": 1.7581, + "step": 13110 + }, + { + "epoch": 0.5546577544631526, + "grad_norm": 0.1713310182094574, + "learning_rate": 0.001, + "loss": 3.1031, + "step": 13111 + }, + { + "epoch": 0.5547000592266689, + "grad_norm": 0.21461325883865356, + "learning_rate": 0.001, + "loss": 1.462, + "step": 13112 + }, + { + "epoch": 0.5547423639901853, + "grad_norm": 0.19272612035274506, + "learning_rate": 0.001, + "loss": 2.0605, + "step": 13113 + }, + { + "epoch": 0.5547846687537017, + "grad_norm": 0.5416992902755737, + "learning_rate": 0.001, + "loss": 2.4492, + "step": 13114 + }, + { + "epoch": 0.554826973517218, + "grad_norm": 0.16244624555110931, + "learning_rate": 0.001, + "loss": 1.9301, + "step": 13115 + }, + { + "epoch": 0.5548692782807344, + "grad_norm": 0.1623430848121643, + "learning_rate": 0.001, + "loss": 1.8153, + "step": 13116 + }, + { + "epoch": 0.5549115830442508, + "grad_norm": 0.15452533960342407, + "learning_rate": 0.001, + "loss": 2.2119, + "step": 13117 + }, + { + "epoch": 0.5549538878077671, + "grad_norm": 0.1652129739522934, + "learning_rate": 0.001, + "loss": 2.0223, + "step": 13118 + }, + { + "epoch": 0.5549961925712835, + "grad_norm": 0.19139006733894348, + "learning_rate": 0.001, + "loss": 2.2582, + "step": 13119 + }, + { + "epoch": 0.5550384973347999, + "grad_norm": 0.21414639055728912, + "learning_rate": 0.001, + "loss": 1.978, + "step": 13120 + }, + { + "epoch": 0.5550808020983162, + "grad_norm": 0.18957240879535675, + "learning_rate": 0.001, + "loss": 2.3123, + "step": 13121 + }, + { + "epoch": 0.5551231068618326, + "grad_norm": 0.16492891311645508, + "learning_rate": 0.001, + "loss": 2.2614, + "step": 13122 + }, + { + "epoch": 0.555165411625349, + "grad_norm": 0.17315563559532166, + "learning_rate": 0.001, + "loss": 1.4405, + "step": 13123 + }, + { + "epoch": 0.5552077163888653, + "grad_norm": 0.18722382187843323, + "learning_rate": 0.001, + "loss": 2.4344, + "step": 13124 + }, + { + "epoch": 0.5552500211523818, + "grad_norm": 0.13166561722755432, + "learning_rate": 0.001, + "loss": 2.4014, + "step": 13125 + }, + { + "epoch": 0.5552923259158982, + "grad_norm": 0.22480559349060059, + "learning_rate": 0.001, + "loss": 1.9176, + "step": 13126 + }, + { + "epoch": 0.5553346306794145, + "grad_norm": 0.15770910680294037, + "learning_rate": 0.001, + "loss": 2.2783, + "step": 13127 + }, + { + "epoch": 0.5553769354429309, + "grad_norm": 0.17504677176475525, + "learning_rate": 0.001, + "loss": 1.8905, + "step": 13128 + }, + { + "epoch": 0.5554192402064473, + "grad_norm": 0.18048499524593353, + "learning_rate": 0.001, + "loss": 2.557, + "step": 13129 + }, + { + "epoch": 0.5554615449699636, + "grad_norm": 0.19142648577690125, + "learning_rate": 0.001, + "loss": 1.8773, + "step": 13130 + }, + { + "epoch": 0.55550384973348, + "grad_norm": 0.16051210463047028, + "learning_rate": 0.001, + "loss": 1.8186, + "step": 13131 + }, + { + "epoch": 0.5555461544969964, + "grad_norm": 0.21467453241348267, + "learning_rate": 0.001, + "loss": 2.09, + "step": 13132 + }, + { + "epoch": 0.5555884592605127, + "grad_norm": 0.7346461415290833, + "learning_rate": 0.001, + "loss": 2.0679, + "step": 13133 + }, + { + "epoch": 0.5556307640240291, + "grad_norm": 0.1747002899646759, + "learning_rate": 0.001, + "loss": 1.8083, + "step": 13134 + }, + { + "epoch": 0.5556730687875455, + "grad_norm": 0.17838482558727264, + "learning_rate": 0.001, + "loss": 2.428, + "step": 13135 + }, + { + "epoch": 0.5557153735510618, + "grad_norm": 0.15864208340644836, + "learning_rate": 0.001, + "loss": 2.0233, + "step": 13136 + }, + { + "epoch": 0.5557576783145782, + "grad_norm": 0.27574223279953003, + "learning_rate": 0.001, + "loss": 2.9788, + "step": 13137 + }, + { + "epoch": 0.5557999830780946, + "grad_norm": 0.18483395874500275, + "learning_rate": 0.001, + "loss": 2.0938, + "step": 13138 + }, + { + "epoch": 0.5558422878416109, + "grad_norm": 0.1585381031036377, + "learning_rate": 0.001, + "loss": 1.4932, + "step": 13139 + }, + { + "epoch": 0.5558845926051273, + "grad_norm": 0.19436490535736084, + "learning_rate": 0.001, + "loss": 2.827, + "step": 13140 + }, + { + "epoch": 0.5559268973686438, + "grad_norm": 0.9730831384658813, + "learning_rate": 0.001, + "loss": 1.8887, + "step": 13141 + }, + { + "epoch": 0.5559692021321601, + "grad_norm": 0.16187086701393127, + "learning_rate": 0.001, + "loss": 2.15, + "step": 13142 + }, + { + "epoch": 0.5560115068956765, + "grad_norm": 0.17080490291118622, + "learning_rate": 0.001, + "loss": 2.7272, + "step": 13143 + }, + { + "epoch": 0.5560538116591929, + "grad_norm": 0.4234422445297241, + "learning_rate": 0.001, + "loss": 2.6932, + "step": 13144 + }, + { + "epoch": 0.5560961164227092, + "grad_norm": 0.16992883384227753, + "learning_rate": 0.001, + "loss": 1.6274, + "step": 13145 + }, + { + "epoch": 0.5561384211862256, + "grad_norm": 0.1784851998090744, + "learning_rate": 0.001, + "loss": 1.9868, + "step": 13146 + }, + { + "epoch": 0.556180725949742, + "grad_norm": 0.18666186928749084, + "learning_rate": 0.001, + "loss": 2.8875, + "step": 13147 + }, + { + "epoch": 0.5562230307132583, + "grad_norm": 0.17807726562023163, + "learning_rate": 0.001, + "loss": 1.6625, + "step": 13148 + }, + { + "epoch": 0.5562653354767747, + "grad_norm": 0.14291433990001678, + "learning_rate": 0.001, + "loss": 2.1721, + "step": 13149 + }, + { + "epoch": 0.556307640240291, + "grad_norm": 0.18871572613716125, + "learning_rate": 0.001, + "loss": 1.5161, + "step": 13150 + }, + { + "epoch": 0.5563499450038074, + "grad_norm": 0.2232998013496399, + "learning_rate": 0.001, + "loss": 1.7724, + "step": 13151 + }, + { + "epoch": 0.5563922497673238, + "grad_norm": 0.21239492297172546, + "learning_rate": 0.001, + "loss": 2.0644, + "step": 13152 + }, + { + "epoch": 0.5564345545308401, + "grad_norm": 0.177618145942688, + "learning_rate": 0.001, + "loss": 2.4754, + "step": 13153 + }, + { + "epoch": 0.5564768592943565, + "grad_norm": 0.2010020613670349, + "learning_rate": 0.001, + "loss": 1.8863, + "step": 13154 + }, + { + "epoch": 0.5565191640578729, + "grad_norm": 0.15301641821861267, + "learning_rate": 0.001, + "loss": 2.6225, + "step": 13155 + }, + { + "epoch": 0.5565614688213892, + "grad_norm": 0.16343499720096588, + "learning_rate": 0.001, + "loss": 2.5127, + "step": 13156 + }, + { + "epoch": 0.5566037735849056, + "grad_norm": 0.15531091392040253, + "learning_rate": 0.001, + "loss": 2.0179, + "step": 13157 + }, + { + "epoch": 0.5566460783484221, + "grad_norm": 0.18499448895454407, + "learning_rate": 0.001, + "loss": 1.4872, + "step": 13158 + }, + { + "epoch": 0.5566883831119384, + "grad_norm": 0.21716418862342834, + "learning_rate": 0.001, + "loss": 1.7633, + "step": 13159 + }, + { + "epoch": 0.5567306878754548, + "grad_norm": 0.3633074462413788, + "learning_rate": 0.001, + "loss": 1.7639, + "step": 13160 + }, + { + "epoch": 0.5567729926389712, + "grad_norm": 0.20026670396327972, + "learning_rate": 0.001, + "loss": 1.4217, + "step": 13161 + }, + { + "epoch": 0.5568152974024875, + "grad_norm": 0.18606965243816376, + "learning_rate": 0.001, + "loss": 1.8908, + "step": 13162 + }, + { + "epoch": 0.5568576021660039, + "grad_norm": 0.17996345460414886, + "learning_rate": 0.001, + "loss": 1.7892, + "step": 13163 + }, + { + "epoch": 0.5568999069295203, + "grad_norm": 0.1515737622976303, + "learning_rate": 0.001, + "loss": 1.8319, + "step": 13164 + }, + { + "epoch": 0.5569422116930366, + "grad_norm": 0.21763929724693298, + "learning_rate": 0.001, + "loss": 3.2495, + "step": 13165 + }, + { + "epoch": 0.556984516456553, + "grad_norm": 0.18336108326911926, + "learning_rate": 0.001, + "loss": 1.6638, + "step": 13166 + }, + { + "epoch": 0.5570268212200694, + "grad_norm": 0.1941053867340088, + "learning_rate": 0.001, + "loss": 2.2389, + "step": 13167 + }, + { + "epoch": 0.5570691259835857, + "grad_norm": 0.2293504923582077, + "learning_rate": 0.001, + "loss": 1.7683, + "step": 13168 + }, + { + "epoch": 0.5571114307471021, + "grad_norm": 0.18686215579509735, + "learning_rate": 0.001, + "loss": 1.6258, + "step": 13169 + }, + { + "epoch": 0.5571537355106185, + "grad_norm": 0.17129628360271454, + "learning_rate": 0.001, + "loss": 1.8111, + "step": 13170 + }, + { + "epoch": 0.5571960402741348, + "grad_norm": 0.20387300848960876, + "learning_rate": 0.001, + "loss": 2.2676, + "step": 13171 + }, + { + "epoch": 0.5572383450376512, + "grad_norm": 0.15360292792320251, + "learning_rate": 0.001, + "loss": 1.8776, + "step": 13172 + }, + { + "epoch": 0.5572806498011676, + "grad_norm": 0.20021335780620575, + "learning_rate": 0.001, + "loss": 1.9061, + "step": 13173 + }, + { + "epoch": 0.557322954564684, + "grad_norm": 0.19312505424022675, + "learning_rate": 0.001, + "loss": 1.8143, + "step": 13174 + }, + { + "epoch": 0.5573652593282004, + "grad_norm": 0.19307121634483337, + "learning_rate": 0.001, + "loss": 4.1747, + "step": 13175 + }, + { + "epoch": 0.5574075640917168, + "grad_norm": 0.16115133464336395, + "learning_rate": 0.001, + "loss": 2.7506, + "step": 13176 + }, + { + "epoch": 0.5574498688552331, + "grad_norm": 0.1843666285276413, + "learning_rate": 0.001, + "loss": 2.6517, + "step": 13177 + }, + { + "epoch": 0.5574921736187495, + "grad_norm": 0.21303428709506989, + "learning_rate": 0.001, + "loss": 2.7585, + "step": 13178 + }, + { + "epoch": 0.5575344783822659, + "grad_norm": 0.16230140626430511, + "learning_rate": 0.001, + "loss": 1.822, + "step": 13179 + }, + { + "epoch": 0.5575767831457822, + "grad_norm": 0.20180179178714752, + "learning_rate": 0.001, + "loss": 2.4642, + "step": 13180 + }, + { + "epoch": 0.5576190879092986, + "grad_norm": 0.1860133558511734, + "learning_rate": 0.001, + "loss": 2.4435, + "step": 13181 + }, + { + "epoch": 0.557661392672815, + "grad_norm": 0.33960339426994324, + "learning_rate": 0.001, + "loss": 3.8256, + "step": 13182 + }, + { + "epoch": 0.5577036974363313, + "grad_norm": 0.1435151845216751, + "learning_rate": 0.001, + "loss": 1.4867, + "step": 13183 + }, + { + "epoch": 0.5577460021998477, + "grad_norm": 0.1442885398864746, + "learning_rate": 0.001, + "loss": 3.0296, + "step": 13184 + }, + { + "epoch": 0.5577883069633641, + "grad_norm": 0.3879304528236389, + "learning_rate": 0.001, + "loss": 2.4309, + "step": 13185 + }, + { + "epoch": 0.5578306117268804, + "grad_norm": 0.15823771059513092, + "learning_rate": 0.001, + "loss": 2.0006, + "step": 13186 + }, + { + "epoch": 0.5578729164903968, + "grad_norm": 0.2137526571750641, + "learning_rate": 0.001, + "loss": 2.8213, + "step": 13187 + }, + { + "epoch": 0.5579152212539132, + "grad_norm": 0.2220454066991806, + "learning_rate": 0.001, + "loss": 1.751, + "step": 13188 + }, + { + "epoch": 0.5579575260174295, + "grad_norm": 0.15580366551876068, + "learning_rate": 0.001, + "loss": 3.3767, + "step": 13189 + }, + { + "epoch": 0.557999830780946, + "grad_norm": 0.1863831877708435, + "learning_rate": 0.001, + "loss": 2.8947, + "step": 13190 + }, + { + "epoch": 0.5580421355444624, + "grad_norm": 0.17505452036857605, + "learning_rate": 0.001, + "loss": 2.7388, + "step": 13191 + }, + { + "epoch": 0.5580844403079787, + "grad_norm": 0.5289129018783569, + "learning_rate": 0.001, + "loss": 1.7719, + "step": 13192 + }, + { + "epoch": 0.5581267450714951, + "grad_norm": 0.15774008631706238, + "learning_rate": 0.001, + "loss": 1.9727, + "step": 13193 + }, + { + "epoch": 0.5581690498350115, + "grad_norm": 0.16783976554870605, + "learning_rate": 0.001, + "loss": 1.9035, + "step": 13194 + }, + { + "epoch": 0.5582113545985278, + "grad_norm": 0.19274777173995972, + "learning_rate": 0.001, + "loss": 2.3897, + "step": 13195 + }, + { + "epoch": 0.5582536593620442, + "grad_norm": 0.15515726804733276, + "learning_rate": 0.001, + "loss": 2.1614, + "step": 13196 + }, + { + "epoch": 0.5582959641255605, + "grad_norm": 0.20335707068443298, + "learning_rate": 0.001, + "loss": 2.5368, + "step": 13197 + }, + { + "epoch": 0.5583382688890769, + "grad_norm": 0.17299920320510864, + "learning_rate": 0.001, + "loss": 1.9999, + "step": 13198 + }, + { + "epoch": 0.5583805736525933, + "grad_norm": 0.167136088013649, + "learning_rate": 0.001, + "loss": 1.841, + "step": 13199 + }, + { + "epoch": 0.5584228784161096, + "grad_norm": 0.19228960573673248, + "learning_rate": 0.001, + "loss": 3.3171, + "step": 13200 + }, + { + "epoch": 0.558465183179626, + "grad_norm": 0.13816307485103607, + "learning_rate": 0.001, + "loss": 3.1786, + "step": 13201 + }, + { + "epoch": 0.5585074879431424, + "grad_norm": 1.117954969406128, + "learning_rate": 0.001, + "loss": 2.6146, + "step": 13202 + }, + { + "epoch": 0.5585497927066587, + "grad_norm": 0.1563483625650406, + "learning_rate": 0.001, + "loss": 2.3211, + "step": 13203 + }, + { + "epoch": 0.5585920974701751, + "grad_norm": 0.15023623406887054, + "learning_rate": 0.001, + "loss": 1.7047, + "step": 13204 + }, + { + "epoch": 0.5586344022336915, + "grad_norm": 0.21180753409862518, + "learning_rate": 0.001, + "loss": 3.0361, + "step": 13205 + }, + { + "epoch": 0.5586767069972078, + "grad_norm": 1.6476328372955322, + "learning_rate": 0.001, + "loss": 2.0756, + "step": 13206 + }, + { + "epoch": 0.5587190117607242, + "grad_norm": 0.1770959347486496, + "learning_rate": 0.001, + "loss": 2.0876, + "step": 13207 + }, + { + "epoch": 0.5587613165242407, + "grad_norm": 0.16517281532287598, + "learning_rate": 0.001, + "loss": 2.9517, + "step": 13208 + }, + { + "epoch": 0.558803621287757, + "grad_norm": 0.24339810013771057, + "learning_rate": 0.001, + "loss": 2.227, + "step": 13209 + }, + { + "epoch": 0.5588459260512734, + "grad_norm": 0.2662486732006073, + "learning_rate": 0.001, + "loss": 1.6548, + "step": 13210 + }, + { + "epoch": 0.5588882308147898, + "grad_norm": 2.719054698944092, + "learning_rate": 0.001, + "loss": 2.0465, + "step": 13211 + }, + { + "epoch": 0.5589305355783061, + "grad_norm": 0.4585936367511749, + "learning_rate": 0.001, + "loss": 2.4569, + "step": 13212 + }, + { + "epoch": 0.5589728403418225, + "grad_norm": 0.2304200530052185, + "learning_rate": 0.001, + "loss": 2.2496, + "step": 13213 + }, + { + "epoch": 0.5590151451053389, + "grad_norm": 0.4977359175682068, + "learning_rate": 0.001, + "loss": 2.5334, + "step": 13214 + }, + { + "epoch": 0.5590574498688552, + "grad_norm": 0.1636125147342682, + "learning_rate": 0.001, + "loss": 2.0662, + "step": 13215 + }, + { + "epoch": 0.5590997546323716, + "grad_norm": 0.18210989236831665, + "learning_rate": 0.001, + "loss": 1.9016, + "step": 13216 + }, + { + "epoch": 0.559142059395888, + "grad_norm": 0.5983842611312866, + "learning_rate": 0.001, + "loss": 3.717, + "step": 13217 + }, + { + "epoch": 0.5591843641594043, + "grad_norm": 0.20817320048809052, + "learning_rate": 0.001, + "loss": 1.5124, + "step": 13218 + }, + { + "epoch": 0.5592266689229207, + "grad_norm": 0.16508392989635468, + "learning_rate": 0.001, + "loss": 1.7414, + "step": 13219 + }, + { + "epoch": 0.5592689736864371, + "grad_norm": 0.19145143032073975, + "learning_rate": 0.001, + "loss": 2.4444, + "step": 13220 + }, + { + "epoch": 0.5593112784499534, + "grad_norm": 0.2849757969379425, + "learning_rate": 0.001, + "loss": 4.1417, + "step": 13221 + }, + { + "epoch": 0.5593535832134698, + "grad_norm": 0.20053541660308838, + "learning_rate": 0.001, + "loss": 2.7225, + "step": 13222 + }, + { + "epoch": 0.5593958879769863, + "grad_norm": 0.19015778601169586, + "learning_rate": 0.001, + "loss": 1.9326, + "step": 13223 + }, + { + "epoch": 0.5594381927405025, + "grad_norm": 0.22139129042625427, + "learning_rate": 0.001, + "loss": 2.0288, + "step": 13224 + }, + { + "epoch": 0.559480497504019, + "grad_norm": 0.18793657422065735, + "learning_rate": 0.001, + "loss": 2.234, + "step": 13225 + }, + { + "epoch": 0.5595228022675354, + "grad_norm": 0.3711003363132477, + "learning_rate": 0.001, + "loss": 2.8216, + "step": 13226 + }, + { + "epoch": 0.5595651070310517, + "grad_norm": 0.35137489438056946, + "learning_rate": 0.001, + "loss": 2.3047, + "step": 13227 + }, + { + "epoch": 0.5596074117945681, + "grad_norm": 0.3833855092525482, + "learning_rate": 0.001, + "loss": 2.2983, + "step": 13228 + }, + { + "epoch": 0.5596497165580845, + "grad_norm": 0.19461855292320251, + "learning_rate": 0.001, + "loss": 2.4572, + "step": 13229 + }, + { + "epoch": 0.5596920213216008, + "grad_norm": 0.22493231296539307, + "learning_rate": 0.001, + "loss": 2.1651, + "step": 13230 + }, + { + "epoch": 0.5597343260851172, + "grad_norm": 0.19130484759807587, + "learning_rate": 0.001, + "loss": 2.1431, + "step": 13231 + }, + { + "epoch": 0.5597766308486336, + "grad_norm": 0.1570371687412262, + "learning_rate": 0.001, + "loss": 1.8944, + "step": 13232 + }, + { + "epoch": 0.5598189356121499, + "grad_norm": 0.16099748015403748, + "learning_rate": 0.001, + "loss": 2.277, + "step": 13233 + }, + { + "epoch": 0.5598612403756663, + "grad_norm": 0.18128852546215057, + "learning_rate": 0.001, + "loss": 1.8642, + "step": 13234 + }, + { + "epoch": 0.5599035451391827, + "grad_norm": 0.2672935426235199, + "learning_rate": 0.001, + "loss": 1.5749, + "step": 13235 + }, + { + "epoch": 0.559945849902699, + "grad_norm": 0.1762719750404358, + "learning_rate": 0.001, + "loss": 2.1202, + "step": 13236 + }, + { + "epoch": 0.5599881546662154, + "grad_norm": 0.17224456369876862, + "learning_rate": 0.001, + "loss": 2.2378, + "step": 13237 + }, + { + "epoch": 0.5600304594297318, + "grad_norm": 0.15539632737636566, + "learning_rate": 0.001, + "loss": 2.3729, + "step": 13238 + }, + { + "epoch": 0.5600727641932481, + "grad_norm": 0.16019120812416077, + "learning_rate": 0.001, + "loss": 2.0313, + "step": 13239 + }, + { + "epoch": 0.5601150689567646, + "grad_norm": 0.21855637431144714, + "learning_rate": 0.001, + "loss": 2.3144, + "step": 13240 + }, + { + "epoch": 0.5601573737202808, + "grad_norm": 0.20648352801799774, + "learning_rate": 0.001, + "loss": 1.9184, + "step": 13241 + }, + { + "epoch": 0.5601996784837973, + "grad_norm": 0.1866380274295807, + "learning_rate": 0.001, + "loss": 2.8954, + "step": 13242 + }, + { + "epoch": 0.5602419832473137, + "grad_norm": 4.305062294006348, + "learning_rate": 0.001, + "loss": 2.8966, + "step": 13243 + }, + { + "epoch": 0.56028428801083, + "grad_norm": 0.18245016038417816, + "learning_rate": 0.001, + "loss": 2.472, + "step": 13244 + }, + { + "epoch": 0.5603265927743464, + "grad_norm": 0.15738457441329956, + "learning_rate": 0.001, + "loss": 1.5529, + "step": 13245 + }, + { + "epoch": 0.5603688975378628, + "grad_norm": 0.1629572957754135, + "learning_rate": 0.001, + "loss": 1.5193, + "step": 13246 + }, + { + "epoch": 0.5604112023013791, + "grad_norm": 0.46464836597442627, + "learning_rate": 0.001, + "loss": 3.0418, + "step": 13247 + }, + { + "epoch": 0.5604535070648955, + "grad_norm": 0.2348056435585022, + "learning_rate": 0.001, + "loss": 2.3288, + "step": 13248 + }, + { + "epoch": 0.5604958118284119, + "grad_norm": 0.9263020753860474, + "learning_rate": 0.001, + "loss": 2.1417, + "step": 13249 + }, + { + "epoch": 0.5605381165919282, + "grad_norm": 0.32688605785369873, + "learning_rate": 0.001, + "loss": 2.0048, + "step": 13250 + }, + { + "epoch": 0.5605804213554446, + "grad_norm": 0.226731076836586, + "learning_rate": 0.001, + "loss": 2.6983, + "step": 13251 + }, + { + "epoch": 0.560622726118961, + "grad_norm": 0.40565672516822815, + "learning_rate": 0.001, + "loss": 2.6863, + "step": 13252 + }, + { + "epoch": 0.5606650308824773, + "grad_norm": 0.21357344090938568, + "learning_rate": 0.001, + "loss": 1.5538, + "step": 13253 + }, + { + "epoch": 0.5607073356459937, + "grad_norm": 0.20866970717906952, + "learning_rate": 0.001, + "loss": 1.9627, + "step": 13254 + }, + { + "epoch": 0.5607496404095101, + "grad_norm": 0.20830655097961426, + "learning_rate": 0.001, + "loss": 2.1551, + "step": 13255 + }, + { + "epoch": 0.5607919451730264, + "grad_norm": 0.19615496695041656, + "learning_rate": 0.001, + "loss": 2.1498, + "step": 13256 + }, + { + "epoch": 0.5608342499365429, + "grad_norm": 0.32191988825798035, + "learning_rate": 0.001, + "loss": 3.5976, + "step": 13257 + }, + { + "epoch": 0.5608765547000593, + "grad_norm": 0.22609533369541168, + "learning_rate": 0.001, + "loss": 3.2048, + "step": 13258 + }, + { + "epoch": 0.5609188594635756, + "grad_norm": 0.45731955766677856, + "learning_rate": 0.001, + "loss": 1.5562, + "step": 13259 + }, + { + "epoch": 0.560961164227092, + "grad_norm": 0.19741539657115936, + "learning_rate": 0.001, + "loss": 2.3421, + "step": 13260 + }, + { + "epoch": 0.5610034689906084, + "grad_norm": 0.24196267127990723, + "learning_rate": 0.001, + "loss": 3.386, + "step": 13261 + }, + { + "epoch": 0.5610457737541247, + "grad_norm": 0.24868476390838623, + "learning_rate": 0.001, + "loss": 1.7664, + "step": 13262 + }, + { + "epoch": 0.5610880785176411, + "grad_norm": 0.17724378407001495, + "learning_rate": 0.001, + "loss": 2.038, + "step": 13263 + }, + { + "epoch": 0.5611303832811575, + "grad_norm": 0.17900416254997253, + "learning_rate": 0.001, + "loss": 2.5304, + "step": 13264 + }, + { + "epoch": 0.5611726880446738, + "grad_norm": 0.2126455008983612, + "learning_rate": 0.001, + "loss": 2.1127, + "step": 13265 + }, + { + "epoch": 0.5612149928081902, + "grad_norm": 0.20829962193965912, + "learning_rate": 0.001, + "loss": 2.4063, + "step": 13266 + }, + { + "epoch": 0.5612572975717066, + "grad_norm": 0.22738134860992432, + "learning_rate": 0.001, + "loss": 1.7318, + "step": 13267 + }, + { + "epoch": 0.5612996023352229, + "grad_norm": 0.18738828599452972, + "learning_rate": 0.001, + "loss": 2.0383, + "step": 13268 + }, + { + "epoch": 0.5613419070987393, + "grad_norm": 0.2207876741886139, + "learning_rate": 0.001, + "loss": 2.9998, + "step": 13269 + }, + { + "epoch": 0.5613842118622557, + "grad_norm": 0.1774303913116455, + "learning_rate": 0.001, + "loss": 2.2946, + "step": 13270 + }, + { + "epoch": 0.561426516625772, + "grad_norm": 0.16445757448673248, + "learning_rate": 0.001, + "loss": 2.1301, + "step": 13271 + }, + { + "epoch": 0.5614688213892884, + "grad_norm": 0.1810140758752823, + "learning_rate": 0.001, + "loss": 1.7674, + "step": 13272 + }, + { + "epoch": 0.5615111261528049, + "grad_norm": 0.2058965414762497, + "learning_rate": 0.001, + "loss": 1.5423, + "step": 13273 + }, + { + "epoch": 0.5615534309163212, + "grad_norm": 0.20573607087135315, + "learning_rate": 0.001, + "loss": 3.1317, + "step": 13274 + }, + { + "epoch": 0.5615957356798376, + "grad_norm": 0.2003343552350998, + "learning_rate": 0.001, + "loss": 3.3971, + "step": 13275 + }, + { + "epoch": 0.561638040443354, + "grad_norm": 0.16482414305210114, + "learning_rate": 0.001, + "loss": 2.5649, + "step": 13276 + }, + { + "epoch": 0.5616803452068703, + "grad_norm": 0.16140007972717285, + "learning_rate": 0.001, + "loss": 2.7057, + "step": 13277 + }, + { + "epoch": 0.5617226499703867, + "grad_norm": 2.4391520023345947, + "learning_rate": 0.001, + "loss": 1.9083, + "step": 13278 + }, + { + "epoch": 0.5617649547339031, + "grad_norm": 0.17537763714790344, + "learning_rate": 0.001, + "loss": 2.0083, + "step": 13279 + }, + { + "epoch": 0.5618072594974194, + "grad_norm": 0.17112372815608978, + "learning_rate": 0.001, + "loss": 1.7442, + "step": 13280 + }, + { + "epoch": 0.5618495642609358, + "grad_norm": 0.17031919956207275, + "learning_rate": 0.001, + "loss": 1.8762, + "step": 13281 + }, + { + "epoch": 0.5618918690244522, + "grad_norm": 0.15143336355686188, + "learning_rate": 0.001, + "loss": 2.3816, + "step": 13282 + }, + { + "epoch": 0.5619341737879685, + "grad_norm": 0.15341715514659882, + "learning_rate": 0.001, + "loss": 2.9387, + "step": 13283 + }, + { + "epoch": 0.5619764785514849, + "grad_norm": 0.18952025473117828, + "learning_rate": 0.001, + "loss": 2.3547, + "step": 13284 + }, + { + "epoch": 0.5620187833150013, + "grad_norm": 0.18509429693222046, + "learning_rate": 0.001, + "loss": 2.503, + "step": 13285 + }, + { + "epoch": 0.5620610880785176, + "grad_norm": 0.16015632450580597, + "learning_rate": 0.001, + "loss": 2.008, + "step": 13286 + }, + { + "epoch": 0.562103392842034, + "grad_norm": 0.1822974979877472, + "learning_rate": 0.001, + "loss": 1.8725, + "step": 13287 + }, + { + "epoch": 0.5621456976055503, + "grad_norm": 0.20895791053771973, + "learning_rate": 0.001, + "loss": 3.7383, + "step": 13288 + }, + { + "epoch": 0.5621880023690667, + "grad_norm": 0.2861064076423645, + "learning_rate": 0.001, + "loss": 1.7759, + "step": 13289 + }, + { + "epoch": 0.5622303071325832, + "grad_norm": 0.1581949144601822, + "learning_rate": 0.001, + "loss": 2.9572, + "step": 13290 + }, + { + "epoch": 0.5622726118960995, + "grad_norm": 0.1519918292760849, + "learning_rate": 0.001, + "loss": 1.911, + "step": 13291 + }, + { + "epoch": 0.5623149166596159, + "grad_norm": 1.3050894737243652, + "learning_rate": 0.001, + "loss": 1.6955, + "step": 13292 + }, + { + "epoch": 0.5623572214231323, + "grad_norm": 0.16207565367221832, + "learning_rate": 0.001, + "loss": 1.9604, + "step": 13293 + }, + { + "epoch": 0.5623995261866486, + "grad_norm": 0.21494990587234497, + "learning_rate": 0.001, + "loss": 1.8584, + "step": 13294 + }, + { + "epoch": 0.562441830950165, + "grad_norm": 0.1507209986448288, + "learning_rate": 0.001, + "loss": 2.1614, + "step": 13295 + }, + { + "epoch": 0.5624841357136814, + "grad_norm": 0.5043923854827881, + "learning_rate": 0.001, + "loss": 2.4189, + "step": 13296 + }, + { + "epoch": 0.5625264404771977, + "grad_norm": 0.15872474014759064, + "learning_rate": 0.001, + "loss": 2.1197, + "step": 13297 + }, + { + "epoch": 0.5625687452407141, + "grad_norm": 0.18348495662212372, + "learning_rate": 0.001, + "loss": 1.9253, + "step": 13298 + }, + { + "epoch": 0.5626110500042305, + "grad_norm": 0.22080762684345245, + "learning_rate": 0.001, + "loss": 2.7428, + "step": 13299 + }, + { + "epoch": 0.5626533547677468, + "grad_norm": 0.21116049587726593, + "learning_rate": 0.001, + "loss": 2.4198, + "step": 13300 + }, + { + "epoch": 0.5626956595312632, + "grad_norm": 0.26892927289009094, + "learning_rate": 0.001, + "loss": 2.1072, + "step": 13301 + }, + { + "epoch": 0.5627379642947796, + "grad_norm": 0.17305731773376465, + "learning_rate": 0.001, + "loss": 2.569, + "step": 13302 + }, + { + "epoch": 0.5627802690582959, + "grad_norm": 0.18649710714817047, + "learning_rate": 0.001, + "loss": 3.0606, + "step": 13303 + }, + { + "epoch": 0.5628225738218123, + "grad_norm": 0.15958692133426666, + "learning_rate": 0.001, + "loss": 2.2803, + "step": 13304 + }, + { + "epoch": 0.5628648785853287, + "grad_norm": 0.20183780789375305, + "learning_rate": 0.001, + "loss": 3.1339, + "step": 13305 + }, + { + "epoch": 0.562907183348845, + "grad_norm": 0.23811282217502594, + "learning_rate": 0.001, + "loss": 2.0551, + "step": 13306 + }, + { + "epoch": 0.5629494881123615, + "grad_norm": 0.2011602222919464, + "learning_rate": 0.001, + "loss": 2.1267, + "step": 13307 + }, + { + "epoch": 0.5629917928758779, + "grad_norm": 0.18026520311832428, + "learning_rate": 0.001, + "loss": 2.5384, + "step": 13308 + }, + { + "epoch": 0.5630340976393942, + "grad_norm": 0.19247384369373322, + "learning_rate": 0.001, + "loss": 2.0309, + "step": 13309 + }, + { + "epoch": 0.5630764024029106, + "grad_norm": 0.23704922199249268, + "learning_rate": 0.001, + "loss": 2.4209, + "step": 13310 + }, + { + "epoch": 0.563118707166427, + "grad_norm": 0.19546206295490265, + "learning_rate": 0.001, + "loss": 2.3817, + "step": 13311 + }, + { + "epoch": 0.5631610119299433, + "grad_norm": 0.1445315182209015, + "learning_rate": 0.001, + "loss": 1.8728, + "step": 13312 + }, + { + "epoch": 0.5632033166934597, + "grad_norm": 0.14706316590309143, + "learning_rate": 0.001, + "loss": 1.7439, + "step": 13313 + }, + { + "epoch": 0.5632456214569761, + "grad_norm": 0.19955837726593018, + "learning_rate": 0.001, + "loss": 2.1125, + "step": 13314 + }, + { + "epoch": 0.5632879262204924, + "grad_norm": 0.19193534553050995, + "learning_rate": 0.001, + "loss": 3.1798, + "step": 13315 + }, + { + "epoch": 0.5633302309840088, + "grad_norm": 0.19683898985385895, + "learning_rate": 0.001, + "loss": 1.9713, + "step": 13316 + }, + { + "epoch": 0.5633725357475252, + "grad_norm": 0.182073175907135, + "learning_rate": 0.001, + "loss": 3.2368, + "step": 13317 + }, + { + "epoch": 0.5634148405110415, + "grad_norm": 0.3637740910053253, + "learning_rate": 0.001, + "loss": 2.0353, + "step": 13318 + }, + { + "epoch": 0.5634571452745579, + "grad_norm": 0.1640586256980896, + "learning_rate": 0.001, + "loss": 1.5906, + "step": 13319 + }, + { + "epoch": 0.5634994500380743, + "grad_norm": 0.2922709584236145, + "learning_rate": 0.001, + "loss": 2.5071, + "step": 13320 + }, + { + "epoch": 0.5635417548015906, + "grad_norm": 0.1738395392894745, + "learning_rate": 0.001, + "loss": 1.973, + "step": 13321 + }, + { + "epoch": 0.563584059565107, + "grad_norm": 0.18015141785144806, + "learning_rate": 0.001, + "loss": 1.9577, + "step": 13322 + }, + { + "epoch": 0.5636263643286235, + "grad_norm": 0.17254801094532013, + "learning_rate": 0.001, + "loss": 2.3277, + "step": 13323 + }, + { + "epoch": 0.5636686690921398, + "grad_norm": 0.20676442980766296, + "learning_rate": 0.001, + "loss": 1.898, + "step": 13324 + }, + { + "epoch": 0.5637109738556562, + "grad_norm": 0.19672460854053497, + "learning_rate": 0.001, + "loss": 2.5557, + "step": 13325 + }, + { + "epoch": 0.5637532786191726, + "grad_norm": 0.16016629338264465, + "learning_rate": 0.001, + "loss": 1.7529, + "step": 13326 + }, + { + "epoch": 0.5637955833826889, + "grad_norm": 0.1732088327407837, + "learning_rate": 0.001, + "loss": 1.8662, + "step": 13327 + }, + { + "epoch": 0.5638378881462053, + "grad_norm": 0.1827797144651413, + "learning_rate": 0.001, + "loss": 1.7933, + "step": 13328 + }, + { + "epoch": 0.5638801929097217, + "grad_norm": 0.16698625683784485, + "learning_rate": 0.001, + "loss": 2.7892, + "step": 13329 + }, + { + "epoch": 0.563922497673238, + "grad_norm": 0.1709325909614563, + "learning_rate": 0.001, + "loss": 2.5153, + "step": 13330 + }, + { + "epoch": 0.5639648024367544, + "grad_norm": 0.25048211216926575, + "learning_rate": 0.001, + "loss": 2.6259, + "step": 13331 + }, + { + "epoch": 0.5640071072002707, + "grad_norm": 0.17622262239456177, + "learning_rate": 0.001, + "loss": 2.1055, + "step": 13332 + }, + { + "epoch": 0.5640494119637871, + "grad_norm": 0.166434183716774, + "learning_rate": 0.001, + "loss": 1.8662, + "step": 13333 + }, + { + "epoch": 0.5640917167273035, + "grad_norm": 0.7903834581375122, + "learning_rate": 0.001, + "loss": 1.9703, + "step": 13334 + }, + { + "epoch": 0.5641340214908198, + "grad_norm": 0.15010859072208405, + "learning_rate": 0.001, + "loss": 1.7071, + "step": 13335 + }, + { + "epoch": 0.5641763262543362, + "grad_norm": 0.1890597641468048, + "learning_rate": 0.001, + "loss": 2.995, + "step": 13336 + }, + { + "epoch": 0.5642186310178526, + "grad_norm": 0.19205980002880096, + "learning_rate": 0.001, + "loss": 2.0444, + "step": 13337 + }, + { + "epoch": 0.5642609357813689, + "grad_norm": 0.1756296306848526, + "learning_rate": 0.001, + "loss": 2.3472, + "step": 13338 + }, + { + "epoch": 0.5643032405448853, + "grad_norm": 0.22155724465847015, + "learning_rate": 0.001, + "loss": 3.1698, + "step": 13339 + }, + { + "epoch": 0.5643455453084018, + "grad_norm": 0.18253694474697113, + "learning_rate": 0.001, + "loss": 1.7616, + "step": 13340 + }, + { + "epoch": 0.564387850071918, + "grad_norm": 0.22829587757587433, + "learning_rate": 0.001, + "loss": 1.5545, + "step": 13341 + }, + { + "epoch": 0.5644301548354345, + "grad_norm": 2.0366973876953125, + "learning_rate": 0.001, + "loss": 1.7702, + "step": 13342 + }, + { + "epoch": 0.5644724595989509, + "grad_norm": 0.7380493879318237, + "learning_rate": 0.001, + "loss": 2.1662, + "step": 13343 + }, + { + "epoch": 0.5645147643624672, + "grad_norm": 0.19614994525909424, + "learning_rate": 0.001, + "loss": 2.5599, + "step": 13344 + }, + { + "epoch": 0.5645570691259836, + "grad_norm": 0.39196836948394775, + "learning_rate": 0.001, + "loss": 1.6906, + "step": 13345 + }, + { + "epoch": 0.5645993738895, + "grad_norm": 0.3456530272960663, + "learning_rate": 0.001, + "loss": 1.9598, + "step": 13346 + }, + { + "epoch": 0.5646416786530163, + "grad_norm": 0.20277409255504608, + "learning_rate": 0.001, + "loss": 2.0331, + "step": 13347 + }, + { + "epoch": 0.5646839834165327, + "grad_norm": 0.4468050003051758, + "learning_rate": 0.001, + "loss": 1.8206, + "step": 13348 + }, + { + "epoch": 0.5647262881800491, + "grad_norm": 0.3284309208393097, + "learning_rate": 0.001, + "loss": 2.2595, + "step": 13349 + }, + { + "epoch": 0.5647685929435654, + "grad_norm": 0.16732022166252136, + "learning_rate": 0.001, + "loss": 1.9984, + "step": 13350 + }, + { + "epoch": 0.5648108977070818, + "grad_norm": 3.110480785369873, + "learning_rate": 0.001, + "loss": 2.6128, + "step": 13351 + }, + { + "epoch": 0.5648532024705982, + "grad_norm": 0.18067099153995514, + "learning_rate": 0.001, + "loss": 2.0611, + "step": 13352 + }, + { + "epoch": 0.5648955072341145, + "grad_norm": 0.17390722036361694, + "learning_rate": 0.001, + "loss": 2.0928, + "step": 13353 + }, + { + "epoch": 0.5649378119976309, + "grad_norm": 0.18560457229614258, + "learning_rate": 0.001, + "loss": 3.1778, + "step": 13354 + }, + { + "epoch": 0.5649801167611473, + "grad_norm": 0.18010523915290833, + "learning_rate": 0.001, + "loss": 2.5076, + "step": 13355 + }, + { + "epoch": 0.5650224215246636, + "grad_norm": 0.18280434608459473, + "learning_rate": 0.001, + "loss": 2.2038, + "step": 13356 + }, + { + "epoch": 0.56506472628818, + "grad_norm": 0.9968026280403137, + "learning_rate": 0.001, + "loss": 2.715, + "step": 13357 + }, + { + "epoch": 0.5651070310516965, + "grad_norm": 1.5389236211776733, + "learning_rate": 0.001, + "loss": 2.5267, + "step": 13358 + }, + { + "epoch": 0.5651493358152128, + "grad_norm": 0.8743659257888794, + "learning_rate": 0.001, + "loss": 2.5618, + "step": 13359 + }, + { + "epoch": 0.5651916405787292, + "grad_norm": 0.20591340959072113, + "learning_rate": 0.001, + "loss": 2.3677, + "step": 13360 + }, + { + "epoch": 0.5652339453422456, + "grad_norm": 0.20366302132606506, + "learning_rate": 0.001, + "loss": 2.3497, + "step": 13361 + }, + { + "epoch": 0.5652762501057619, + "grad_norm": 0.18198545277118683, + "learning_rate": 0.001, + "loss": 2.1656, + "step": 13362 + }, + { + "epoch": 0.5653185548692783, + "grad_norm": 0.1899123340845108, + "learning_rate": 0.001, + "loss": 1.8184, + "step": 13363 + }, + { + "epoch": 0.5653608596327947, + "grad_norm": 0.21096497774124146, + "learning_rate": 0.001, + "loss": 2.3129, + "step": 13364 + }, + { + "epoch": 0.565403164396311, + "grad_norm": 0.17829644680023193, + "learning_rate": 0.001, + "loss": 2.4071, + "step": 13365 + }, + { + "epoch": 0.5654454691598274, + "grad_norm": 0.1842295229434967, + "learning_rate": 0.001, + "loss": 3.0018, + "step": 13366 + }, + { + "epoch": 0.5654877739233438, + "grad_norm": 0.2008989155292511, + "learning_rate": 0.001, + "loss": 2.1184, + "step": 13367 + }, + { + "epoch": 0.5655300786868601, + "grad_norm": 0.22309176623821259, + "learning_rate": 0.001, + "loss": 2.2204, + "step": 13368 + }, + { + "epoch": 0.5655723834503765, + "grad_norm": 0.4550027847290039, + "learning_rate": 0.001, + "loss": 2.8608, + "step": 13369 + }, + { + "epoch": 0.5656146882138929, + "grad_norm": 0.16846232116222382, + "learning_rate": 0.001, + "loss": 1.7115, + "step": 13370 + }, + { + "epoch": 0.5656569929774092, + "grad_norm": 0.9170604944229126, + "learning_rate": 0.001, + "loss": 2.3162, + "step": 13371 + }, + { + "epoch": 0.5656992977409256, + "grad_norm": 0.21079829335212708, + "learning_rate": 0.001, + "loss": 2.2757, + "step": 13372 + }, + { + "epoch": 0.565741602504442, + "grad_norm": 0.15155363082885742, + "learning_rate": 0.001, + "loss": 1.9875, + "step": 13373 + }, + { + "epoch": 0.5657839072679584, + "grad_norm": 0.5003759860992432, + "learning_rate": 0.001, + "loss": 2.126, + "step": 13374 + }, + { + "epoch": 0.5658262120314748, + "grad_norm": 0.2025512456893921, + "learning_rate": 0.001, + "loss": 1.8468, + "step": 13375 + }, + { + "epoch": 0.5658685167949911, + "grad_norm": 0.45572540163993835, + "learning_rate": 0.001, + "loss": 2.3299, + "step": 13376 + }, + { + "epoch": 0.5659108215585075, + "grad_norm": 0.17576457560062408, + "learning_rate": 0.001, + "loss": 2.3811, + "step": 13377 + }, + { + "epoch": 0.5659531263220239, + "grad_norm": 0.28992700576782227, + "learning_rate": 0.001, + "loss": 2.4098, + "step": 13378 + }, + { + "epoch": 0.5659954310855402, + "grad_norm": 0.6396428942680359, + "learning_rate": 0.001, + "loss": 2.3187, + "step": 13379 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 0.18657970428466797, + "learning_rate": 0.001, + "loss": 2.3154, + "step": 13380 + }, + { + "epoch": 0.566080040612573, + "grad_norm": 0.4546518623828888, + "learning_rate": 0.001, + "loss": 1.7812, + "step": 13381 + }, + { + "epoch": 0.5661223453760893, + "grad_norm": 0.4422479271888733, + "learning_rate": 0.001, + "loss": 2.0153, + "step": 13382 + }, + { + "epoch": 0.5661646501396057, + "grad_norm": 0.19912591576576233, + "learning_rate": 0.001, + "loss": 1.6749, + "step": 13383 + }, + { + "epoch": 0.5662069549031221, + "grad_norm": 0.20137451589107513, + "learning_rate": 0.001, + "loss": 2.3742, + "step": 13384 + }, + { + "epoch": 0.5662492596666384, + "grad_norm": 0.22245512902736664, + "learning_rate": 0.001, + "loss": 3.0519, + "step": 13385 + }, + { + "epoch": 0.5662915644301548, + "grad_norm": 0.179260715842247, + "learning_rate": 0.001, + "loss": 2.305, + "step": 13386 + }, + { + "epoch": 0.5663338691936712, + "grad_norm": 0.19476951658725739, + "learning_rate": 0.001, + "loss": 2.0913, + "step": 13387 + }, + { + "epoch": 0.5663761739571875, + "grad_norm": 0.1848563849925995, + "learning_rate": 0.001, + "loss": 2.4601, + "step": 13388 + }, + { + "epoch": 0.5664184787207039, + "grad_norm": 0.26256316900253296, + "learning_rate": 0.001, + "loss": 2.8465, + "step": 13389 + }, + { + "epoch": 0.5664607834842204, + "grad_norm": 0.1553989052772522, + "learning_rate": 0.001, + "loss": 1.6542, + "step": 13390 + }, + { + "epoch": 0.5665030882477367, + "grad_norm": 0.17743252217769623, + "learning_rate": 0.001, + "loss": 1.7768, + "step": 13391 + }, + { + "epoch": 0.5665453930112531, + "grad_norm": 0.19140616059303284, + "learning_rate": 0.001, + "loss": 1.461, + "step": 13392 + }, + { + "epoch": 0.5665876977747695, + "grad_norm": 0.19345830380916595, + "learning_rate": 0.001, + "loss": 2.0676, + "step": 13393 + }, + { + "epoch": 0.5666300025382858, + "grad_norm": 0.2207154482603073, + "learning_rate": 0.001, + "loss": 2.4146, + "step": 13394 + }, + { + "epoch": 0.5666723073018022, + "grad_norm": 0.1724683791399002, + "learning_rate": 0.001, + "loss": 2.4437, + "step": 13395 + }, + { + "epoch": 0.5667146120653186, + "grad_norm": 0.16311654448509216, + "learning_rate": 0.001, + "loss": 3.1914, + "step": 13396 + }, + { + "epoch": 0.5667569168288349, + "grad_norm": 0.16985690593719482, + "learning_rate": 0.001, + "loss": 2.3727, + "step": 13397 + }, + { + "epoch": 0.5667992215923513, + "grad_norm": 0.1899961233139038, + "learning_rate": 0.001, + "loss": 2.2874, + "step": 13398 + }, + { + "epoch": 0.5668415263558677, + "grad_norm": 0.18617494404315948, + "learning_rate": 0.001, + "loss": 1.4395, + "step": 13399 + }, + { + "epoch": 0.566883831119384, + "grad_norm": 0.18079866468906403, + "learning_rate": 0.001, + "loss": 3.0455, + "step": 13400 + }, + { + "epoch": 0.5669261358829004, + "grad_norm": 0.18550261855125427, + "learning_rate": 0.001, + "loss": 3.1314, + "step": 13401 + }, + { + "epoch": 0.5669684406464168, + "grad_norm": 0.17485053837299347, + "learning_rate": 0.001, + "loss": 2.1399, + "step": 13402 + }, + { + "epoch": 0.5670107454099331, + "grad_norm": 0.15552884340286255, + "learning_rate": 0.001, + "loss": 1.5707, + "step": 13403 + }, + { + "epoch": 0.5670530501734495, + "grad_norm": 0.14861054718494415, + "learning_rate": 0.001, + "loss": 1.7765, + "step": 13404 + }, + { + "epoch": 0.567095354936966, + "grad_norm": 0.4289642572402954, + "learning_rate": 0.001, + "loss": 1.7516, + "step": 13405 + }, + { + "epoch": 0.5671376597004822, + "grad_norm": 0.14437785744667053, + "learning_rate": 0.001, + "loss": 2.3381, + "step": 13406 + }, + { + "epoch": 0.5671799644639987, + "grad_norm": 0.1592496633529663, + "learning_rate": 0.001, + "loss": 1.7272, + "step": 13407 + }, + { + "epoch": 0.5672222692275151, + "grad_norm": 0.2676011323928833, + "learning_rate": 0.001, + "loss": 2.5001, + "step": 13408 + }, + { + "epoch": 0.5672645739910314, + "grad_norm": 0.1811024695634842, + "learning_rate": 0.001, + "loss": 2.3746, + "step": 13409 + }, + { + "epoch": 0.5673068787545478, + "grad_norm": 0.16653041541576385, + "learning_rate": 0.001, + "loss": 1.9266, + "step": 13410 + }, + { + "epoch": 0.5673491835180642, + "grad_norm": 0.18817272782325745, + "learning_rate": 0.001, + "loss": 2.2231, + "step": 13411 + }, + { + "epoch": 0.5673914882815805, + "grad_norm": 0.381883442401886, + "learning_rate": 0.001, + "loss": 1.8577, + "step": 13412 + }, + { + "epoch": 0.5674337930450969, + "grad_norm": 0.1743682324886322, + "learning_rate": 0.001, + "loss": 2.6966, + "step": 13413 + }, + { + "epoch": 0.5674760978086133, + "grad_norm": 0.18651573359966278, + "learning_rate": 0.001, + "loss": 2.3024, + "step": 13414 + }, + { + "epoch": 0.5675184025721296, + "grad_norm": 2.339829206466675, + "learning_rate": 0.001, + "loss": 2.0363, + "step": 13415 + }, + { + "epoch": 0.567560707335646, + "grad_norm": 0.28542375564575195, + "learning_rate": 0.001, + "loss": 2.1393, + "step": 13416 + }, + { + "epoch": 0.5676030120991624, + "grad_norm": 0.17619076371192932, + "learning_rate": 0.001, + "loss": 1.4569, + "step": 13417 + }, + { + "epoch": 0.5676453168626787, + "grad_norm": 0.171681746840477, + "learning_rate": 0.001, + "loss": 2.2856, + "step": 13418 + }, + { + "epoch": 0.5676876216261951, + "grad_norm": 0.23256142437458038, + "learning_rate": 0.001, + "loss": 1.973, + "step": 13419 + }, + { + "epoch": 0.5677299263897115, + "grad_norm": 0.22075031697750092, + "learning_rate": 0.001, + "loss": 1.9528, + "step": 13420 + }, + { + "epoch": 0.5677722311532278, + "grad_norm": 0.12937584519386292, + "learning_rate": 0.001, + "loss": 2.196, + "step": 13421 + }, + { + "epoch": 0.5678145359167442, + "grad_norm": 0.17965421080589294, + "learning_rate": 0.001, + "loss": 1.5309, + "step": 13422 + }, + { + "epoch": 0.5678568406802605, + "grad_norm": 2.4008188247680664, + "learning_rate": 0.001, + "loss": 2.3106, + "step": 13423 + }, + { + "epoch": 0.567899145443777, + "grad_norm": 0.16604921221733093, + "learning_rate": 0.001, + "loss": 2.2181, + "step": 13424 + }, + { + "epoch": 0.5679414502072934, + "grad_norm": 0.2204952985048294, + "learning_rate": 0.001, + "loss": 2.9946, + "step": 13425 + }, + { + "epoch": 0.5679837549708097, + "grad_norm": 0.2380129098892212, + "learning_rate": 0.001, + "loss": 1.8916, + "step": 13426 + }, + { + "epoch": 0.5680260597343261, + "grad_norm": 0.20674805343151093, + "learning_rate": 0.001, + "loss": 2.224, + "step": 13427 + }, + { + "epoch": 0.5680683644978425, + "grad_norm": 0.2611851990222931, + "learning_rate": 0.001, + "loss": 2.041, + "step": 13428 + }, + { + "epoch": 0.5681106692613588, + "grad_norm": 4.238038063049316, + "learning_rate": 0.001, + "loss": 3.423, + "step": 13429 + }, + { + "epoch": 0.5681529740248752, + "grad_norm": 0.26452019810676575, + "learning_rate": 0.001, + "loss": 2.9858, + "step": 13430 + }, + { + "epoch": 0.5681952787883916, + "grad_norm": 0.2289876937866211, + "learning_rate": 0.001, + "loss": 2.6998, + "step": 13431 + }, + { + "epoch": 0.5682375835519079, + "grad_norm": 0.3570123314857483, + "learning_rate": 0.001, + "loss": 2.1574, + "step": 13432 + }, + { + "epoch": 0.5682798883154243, + "grad_norm": 0.33323732018470764, + "learning_rate": 0.001, + "loss": 1.9346, + "step": 13433 + }, + { + "epoch": 0.5683221930789407, + "grad_norm": 0.2315545380115509, + "learning_rate": 0.001, + "loss": 1.9166, + "step": 13434 + }, + { + "epoch": 0.568364497842457, + "grad_norm": 0.42233508825302124, + "learning_rate": 0.001, + "loss": 2.1505, + "step": 13435 + }, + { + "epoch": 0.5684068026059734, + "grad_norm": 0.28535693883895874, + "learning_rate": 0.001, + "loss": 2.0267, + "step": 13436 + }, + { + "epoch": 0.5684491073694898, + "grad_norm": 0.6629549264907837, + "learning_rate": 0.001, + "loss": 2.3038, + "step": 13437 + }, + { + "epoch": 0.5684914121330061, + "grad_norm": 0.18485631048679352, + "learning_rate": 0.001, + "loss": 2.8315, + "step": 13438 + }, + { + "epoch": 0.5685337168965225, + "grad_norm": 1.2029775381088257, + "learning_rate": 0.001, + "loss": 2.7915, + "step": 13439 + }, + { + "epoch": 0.568576021660039, + "grad_norm": 0.16780279576778412, + "learning_rate": 0.001, + "loss": 2.0964, + "step": 13440 + }, + { + "epoch": 0.5686183264235553, + "grad_norm": 0.4147586524486542, + "learning_rate": 0.001, + "loss": 2.8658, + "step": 13441 + }, + { + "epoch": 0.5686606311870717, + "grad_norm": 0.21042154729366302, + "learning_rate": 0.001, + "loss": 2.2641, + "step": 13442 + }, + { + "epoch": 0.5687029359505881, + "grad_norm": 0.2242194414138794, + "learning_rate": 0.001, + "loss": 2.0489, + "step": 13443 + }, + { + "epoch": 0.5687452407141044, + "grad_norm": 0.27875545620918274, + "learning_rate": 0.001, + "loss": 1.592, + "step": 13444 + }, + { + "epoch": 0.5687875454776208, + "grad_norm": 0.1999659240245819, + "learning_rate": 0.001, + "loss": 2.1463, + "step": 13445 + }, + { + "epoch": 0.5688298502411372, + "grad_norm": 0.15846888720989227, + "learning_rate": 0.001, + "loss": 2.1381, + "step": 13446 + }, + { + "epoch": 0.5688721550046535, + "grad_norm": 0.21760839223861694, + "learning_rate": 0.001, + "loss": 2.4506, + "step": 13447 + }, + { + "epoch": 0.5689144597681699, + "grad_norm": 0.15946030616760254, + "learning_rate": 0.001, + "loss": 2.2288, + "step": 13448 + }, + { + "epoch": 0.5689567645316863, + "grad_norm": 0.21263648569583893, + "learning_rate": 0.001, + "loss": 1.999, + "step": 13449 + }, + { + "epoch": 0.5689990692952026, + "grad_norm": 0.1755702942609787, + "learning_rate": 0.001, + "loss": 1.845, + "step": 13450 + }, + { + "epoch": 0.569041374058719, + "grad_norm": 0.14761324226856232, + "learning_rate": 0.001, + "loss": 1.5729, + "step": 13451 + }, + { + "epoch": 0.5690836788222354, + "grad_norm": 0.16154663264751434, + "learning_rate": 0.001, + "loss": 1.4929, + "step": 13452 + }, + { + "epoch": 0.5691259835857517, + "grad_norm": 0.15619386732578278, + "learning_rate": 0.001, + "loss": 2.8627, + "step": 13453 + }, + { + "epoch": 0.5691682883492681, + "grad_norm": 0.2875896394252777, + "learning_rate": 0.001, + "loss": 3.1241, + "step": 13454 + }, + { + "epoch": 0.5692105931127845, + "grad_norm": 0.20529961585998535, + "learning_rate": 0.001, + "loss": 2.3149, + "step": 13455 + }, + { + "epoch": 0.5692528978763008, + "grad_norm": 0.134473517537117, + "learning_rate": 0.001, + "loss": 1.909, + "step": 13456 + }, + { + "epoch": 0.5692952026398173, + "grad_norm": 0.15408681333065033, + "learning_rate": 0.001, + "loss": 2.2475, + "step": 13457 + }, + { + "epoch": 0.5693375074033337, + "grad_norm": 0.17341335117816925, + "learning_rate": 0.001, + "loss": 2.7923, + "step": 13458 + }, + { + "epoch": 0.56937981216685, + "grad_norm": 0.1393430083990097, + "learning_rate": 0.001, + "loss": 2.5653, + "step": 13459 + }, + { + "epoch": 0.5694221169303664, + "grad_norm": 0.20593148469924927, + "learning_rate": 0.001, + "loss": 2.7352, + "step": 13460 + }, + { + "epoch": 0.5694644216938828, + "grad_norm": 0.19775381684303284, + "learning_rate": 0.001, + "loss": 1.8592, + "step": 13461 + }, + { + "epoch": 0.5695067264573991, + "grad_norm": 0.17005833983421326, + "learning_rate": 0.001, + "loss": 1.5063, + "step": 13462 + }, + { + "epoch": 0.5695490312209155, + "grad_norm": 0.1618228405714035, + "learning_rate": 0.001, + "loss": 2.7133, + "step": 13463 + }, + { + "epoch": 0.5695913359844319, + "grad_norm": 0.19688403606414795, + "learning_rate": 0.001, + "loss": 3.1128, + "step": 13464 + }, + { + "epoch": 0.5696336407479482, + "grad_norm": 0.184058278799057, + "learning_rate": 0.001, + "loss": 2.203, + "step": 13465 + }, + { + "epoch": 0.5696759455114646, + "grad_norm": 0.4234599769115448, + "learning_rate": 0.001, + "loss": 1.4275, + "step": 13466 + }, + { + "epoch": 0.5697182502749809, + "grad_norm": 0.610440194606781, + "learning_rate": 0.001, + "loss": 1.8219, + "step": 13467 + }, + { + "epoch": 0.5697605550384973, + "grad_norm": 0.6861364841461182, + "learning_rate": 0.001, + "loss": 1.8298, + "step": 13468 + }, + { + "epoch": 0.5698028598020137, + "grad_norm": 0.17145289480686188, + "learning_rate": 0.001, + "loss": 2.5483, + "step": 13469 + }, + { + "epoch": 0.56984516456553, + "grad_norm": 0.2071324735879898, + "learning_rate": 0.001, + "loss": 1.5715, + "step": 13470 + }, + { + "epoch": 0.5698874693290464, + "grad_norm": 0.1772976666688919, + "learning_rate": 0.001, + "loss": 2.9208, + "step": 13471 + }, + { + "epoch": 0.5699297740925628, + "grad_norm": 0.14574064314365387, + "learning_rate": 0.001, + "loss": 2.1653, + "step": 13472 + }, + { + "epoch": 0.5699720788560791, + "grad_norm": 0.16666589677333832, + "learning_rate": 0.001, + "loss": 2.4713, + "step": 13473 + }, + { + "epoch": 0.5700143836195956, + "grad_norm": 0.19138103723526, + "learning_rate": 0.001, + "loss": 2.0317, + "step": 13474 + }, + { + "epoch": 0.570056688383112, + "grad_norm": 0.21919706463813782, + "learning_rate": 0.001, + "loss": 1.5688, + "step": 13475 + }, + { + "epoch": 0.5700989931466283, + "grad_norm": 0.6911723613739014, + "learning_rate": 0.001, + "loss": 2.5346, + "step": 13476 + }, + { + "epoch": 0.5701412979101447, + "grad_norm": 1.9358234405517578, + "learning_rate": 0.001, + "loss": 2.5255, + "step": 13477 + }, + { + "epoch": 0.5701836026736611, + "grad_norm": 0.43052294850349426, + "learning_rate": 0.001, + "loss": 2.2194, + "step": 13478 + }, + { + "epoch": 0.5702259074371774, + "grad_norm": 0.1567905694246292, + "learning_rate": 0.001, + "loss": 1.5455, + "step": 13479 + }, + { + "epoch": 0.5702682122006938, + "grad_norm": 0.16955433785915375, + "learning_rate": 0.001, + "loss": 2.2044, + "step": 13480 + }, + { + "epoch": 0.5703105169642102, + "grad_norm": 0.21759982407093048, + "learning_rate": 0.001, + "loss": 1.6764, + "step": 13481 + }, + { + "epoch": 0.5703528217277265, + "grad_norm": 0.26957741379737854, + "learning_rate": 0.001, + "loss": 3.0424, + "step": 13482 + }, + { + "epoch": 0.5703951264912429, + "grad_norm": 0.23791447281837463, + "learning_rate": 0.001, + "loss": 2.1794, + "step": 13483 + }, + { + "epoch": 0.5704374312547593, + "grad_norm": 0.17288313806056976, + "learning_rate": 0.001, + "loss": 2.8003, + "step": 13484 + }, + { + "epoch": 0.5704797360182756, + "grad_norm": 0.186492919921875, + "learning_rate": 0.001, + "loss": 2.5023, + "step": 13485 + }, + { + "epoch": 0.570522040781792, + "grad_norm": 0.17396779358386993, + "learning_rate": 0.001, + "loss": 2.3465, + "step": 13486 + }, + { + "epoch": 0.5705643455453084, + "grad_norm": 0.20612338185310364, + "learning_rate": 0.001, + "loss": 1.9371, + "step": 13487 + }, + { + "epoch": 0.5706066503088247, + "grad_norm": 0.16367042064666748, + "learning_rate": 0.001, + "loss": 1.9677, + "step": 13488 + }, + { + "epoch": 0.5706489550723411, + "grad_norm": 1.0515310764312744, + "learning_rate": 0.001, + "loss": 1.7665, + "step": 13489 + }, + { + "epoch": 0.5706912598358576, + "grad_norm": 0.4732486307621002, + "learning_rate": 0.001, + "loss": 2.2618, + "step": 13490 + }, + { + "epoch": 0.5707335645993739, + "grad_norm": 0.14785124361515045, + "learning_rate": 0.001, + "loss": 1.9295, + "step": 13491 + }, + { + "epoch": 0.5707758693628903, + "grad_norm": 0.1713053435087204, + "learning_rate": 0.001, + "loss": 1.8555, + "step": 13492 + }, + { + "epoch": 0.5708181741264067, + "grad_norm": 0.17723537981510162, + "learning_rate": 0.001, + "loss": 2.1308, + "step": 13493 + }, + { + "epoch": 0.570860478889923, + "grad_norm": 1.3990591764450073, + "learning_rate": 0.001, + "loss": 2.5823, + "step": 13494 + }, + { + "epoch": 0.5709027836534394, + "grad_norm": 0.6561245918273926, + "learning_rate": 0.001, + "loss": 2.6915, + "step": 13495 + }, + { + "epoch": 0.5709450884169558, + "grad_norm": 1.7256704568862915, + "learning_rate": 0.001, + "loss": 1.4257, + "step": 13496 + }, + { + "epoch": 0.5709873931804721, + "grad_norm": 0.22391989827156067, + "learning_rate": 0.001, + "loss": 2.3335, + "step": 13497 + }, + { + "epoch": 0.5710296979439885, + "grad_norm": 0.1710107922554016, + "learning_rate": 0.001, + "loss": 1.8255, + "step": 13498 + }, + { + "epoch": 0.5710720027075049, + "grad_norm": 0.21606822311878204, + "learning_rate": 0.001, + "loss": 2.894, + "step": 13499 + }, + { + "epoch": 0.5711143074710212, + "grad_norm": 0.19133202731609344, + "learning_rate": 0.001, + "loss": 1.5382, + "step": 13500 + }, + { + "epoch": 0.5711566122345376, + "grad_norm": 0.2007291465997696, + "learning_rate": 0.001, + "loss": 2.0458, + "step": 13501 + }, + { + "epoch": 0.571198916998054, + "grad_norm": 0.21422770619392395, + "learning_rate": 0.001, + "loss": 2.758, + "step": 13502 + }, + { + "epoch": 0.5712412217615703, + "grad_norm": 0.1938318908214569, + "learning_rate": 0.001, + "loss": 2.7117, + "step": 13503 + }, + { + "epoch": 0.5712835265250867, + "grad_norm": 0.26207903027534485, + "learning_rate": 0.001, + "loss": 1.8618, + "step": 13504 + }, + { + "epoch": 0.5713258312886031, + "grad_norm": 0.2131361961364746, + "learning_rate": 0.001, + "loss": 1.9149, + "step": 13505 + }, + { + "epoch": 0.5713681360521194, + "grad_norm": 0.1706126481294632, + "learning_rate": 0.001, + "loss": 2.3218, + "step": 13506 + }, + { + "epoch": 0.5714104408156359, + "grad_norm": 0.20523306727409363, + "learning_rate": 0.001, + "loss": 2.2526, + "step": 13507 + }, + { + "epoch": 0.5714527455791523, + "grad_norm": 0.25251221656799316, + "learning_rate": 0.001, + "loss": 2.3534, + "step": 13508 + }, + { + "epoch": 0.5714950503426686, + "grad_norm": 0.3562532663345337, + "learning_rate": 0.001, + "loss": 1.6221, + "step": 13509 + }, + { + "epoch": 0.571537355106185, + "grad_norm": 0.19563472270965576, + "learning_rate": 0.001, + "loss": 2.2984, + "step": 13510 + }, + { + "epoch": 0.5715796598697013, + "grad_norm": 0.19130732119083405, + "learning_rate": 0.001, + "loss": 3.6665, + "step": 13511 + }, + { + "epoch": 0.5716219646332177, + "grad_norm": 0.2187919318675995, + "learning_rate": 0.001, + "loss": 2.7956, + "step": 13512 + }, + { + "epoch": 0.5716642693967341, + "grad_norm": 0.7064900994300842, + "learning_rate": 0.001, + "loss": 1.7473, + "step": 13513 + }, + { + "epoch": 0.5717065741602504, + "grad_norm": 0.17418193817138672, + "learning_rate": 0.001, + "loss": 2.1856, + "step": 13514 + }, + { + "epoch": 0.5717488789237668, + "grad_norm": 0.1919695883989334, + "learning_rate": 0.001, + "loss": 2.6144, + "step": 13515 + }, + { + "epoch": 0.5717911836872832, + "grad_norm": 0.1946498155593872, + "learning_rate": 0.001, + "loss": 1.8036, + "step": 13516 + }, + { + "epoch": 0.5718334884507995, + "grad_norm": 0.18168438971042633, + "learning_rate": 0.001, + "loss": 2.7927, + "step": 13517 + }, + { + "epoch": 0.5718757932143159, + "grad_norm": 0.410906583070755, + "learning_rate": 0.001, + "loss": 2.8412, + "step": 13518 + }, + { + "epoch": 0.5719180979778323, + "grad_norm": 0.3952532708644867, + "learning_rate": 0.001, + "loss": 1.6598, + "step": 13519 + }, + { + "epoch": 0.5719604027413486, + "grad_norm": 0.7694887518882751, + "learning_rate": 0.001, + "loss": 2.2268, + "step": 13520 + }, + { + "epoch": 0.572002707504865, + "grad_norm": 0.2468140423297882, + "learning_rate": 0.001, + "loss": 2.8834, + "step": 13521 + }, + { + "epoch": 0.5720450122683814, + "grad_norm": 0.23892976343631744, + "learning_rate": 0.001, + "loss": 1.4797, + "step": 13522 + }, + { + "epoch": 0.5720873170318977, + "grad_norm": 0.2801859974861145, + "learning_rate": 0.001, + "loss": 2.2426, + "step": 13523 + }, + { + "epoch": 0.5721296217954142, + "grad_norm": 0.1985364854335785, + "learning_rate": 0.001, + "loss": 2.5463, + "step": 13524 + }, + { + "epoch": 0.5721719265589306, + "grad_norm": 0.24735181033611298, + "learning_rate": 0.001, + "loss": 2.4437, + "step": 13525 + }, + { + "epoch": 0.5722142313224469, + "grad_norm": 0.18506476283073425, + "learning_rate": 0.001, + "loss": 1.8858, + "step": 13526 + }, + { + "epoch": 0.5722565360859633, + "grad_norm": 0.2307346612215042, + "learning_rate": 0.001, + "loss": 2.1989, + "step": 13527 + }, + { + "epoch": 0.5722988408494797, + "grad_norm": 0.16953586041927338, + "learning_rate": 0.001, + "loss": 2.0845, + "step": 13528 + }, + { + "epoch": 0.572341145612996, + "grad_norm": 0.18418249487876892, + "learning_rate": 0.001, + "loss": 2.2964, + "step": 13529 + }, + { + "epoch": 0.5723834503765124, + "grad_norm": 0.2580213248729706, + "learning_rate": 0.001, + "loss": 2.1883, + "step": 13530 + }, + { + "epoch": 0.5724257551400288, + "grad_norm": 2.6409804821014404, + "learning_rate": 0.001, + "loss": 2.2781, + "step": 13531 + }, + { + "epoch": 0.5724680599035451, + "grad_norm": 0.15479686856269836, + "learning_rate": 0.001, + "loss": 2.2066, + "step": 13532 + }, + { + "epoch": 0.5725103646670615, + "grad_norm": 0.8504530787467957, + "learning_rate": 0.001, + "loss": 1.5421, + "step": 13533 + }, + { + "epoch": 0.5725526694305779, + "grad_norm": 0.17100943624973297, + "learning_rate": 0.001, + "loss": 3.2645, + "step": 13534 + }, + { + "epoch": 0.5725949741940942, + "grad_norm": 0.6614644527435303, + "learning_rate": 0.001, + "loss": 3.2304, + "step": 13535 + }, + { + "epoch": 0.5726372789576106, + "grad_norm": 1.6653027534484863, + "learning_rate": 0.001, + "loss": 2.8675, + "step": 13536 + }, + { + "epoch": 0.572679583721127, + "grad_norm": 0.16103801131248474, + "learning_rate": 0.001, + "loss": 1.9109, + "step": 13537 + }, + { + "epoch": 0.5727218884846433, + "grad_norm": 0.16726185381412506, + "learning_rate": 0.001, + "loss": 3.5696, + "step": 13538 + }, + { + "epoch": 0.5727641932481597, + "grad_norm": 0.3862936496734619, + "learning_rate": 0.001, + "loss": 2.0125, + "step": 13539 + }, + { + "epoch": 0.5728064980116762, + "grad_norm": 1.980115532875061, + "learning_rate": 0.001, + "loss": 2.5362, + "step": 13540 + }, + { + "epoch": 0.5728488027751925, + "grad_norm": 0.1421581506729126, + "learning_rate": 0.001, + "loss": 2.1459, + "step": 13541 + }, + { + "epoch": 0.5728911075387089, + "grad_norm": 0.18831764161586761, + "learning_rate": 0.001, + "loss": 2.523, + "step": 13542 + }, + { + "epoch": 0.5729334123022253, + "grad_norm": 0.15375645458698273, + "learning_rate": 0.001, + "loss": 1.82, + "step": 13543 + }, + { + "epoch": 0.5729757170657416, + "grad_norm": 0.14795835316181183, + "learning_rate": 0.001, + "loss": 2.3571, + "step": 13544 + }, + { + "epoch": 0.573018021829258, + "grad_norm": 0.15851891040802002, + "learning_rate": 0.001, + "loss": 2.5942, + "step": 13545 + }, + { + "epoch": 0.5730603265927744, + "grad_norm": 0.15028858184814453, + "learning_rate": 0.001, + "loss": 3.2621, + "step": 13546 + }, + { + "epoch": 0.5731026313562907, + "grad_norm": 0.15741689503192902, + "learning_rate": 0.001, + "loss": 1.9997, + "step": 13547 + }, + { + "epoch": 0.5731449361198071, + "grad_norm": 0.1891627013683319, + "learning_rate": 0.001, + "loss": 2.2628, + "step": 13548 + }, + { + "epoch": 0.5731872408833235, + "grad_norm": 0.16026534140110016, + "learning_rate": 0.001, + "loss": 2.6502, + "step": 13549 + }, + { + "epoch": 0.5732295456468398, + "grad_norm": 0.18979184329509735, + "learning_rate": 0.001, + "loss": 2.1462, + "step": 13550 + }, + { + "epoch": 0.5732718504103562, + "grad_norm": 0.17325901985168457, + "learning_rate": 0.001, + "loss": 1.8367, + "step": 13551 + }, + { + "epoch": 0.5733141551738726, + "grad_norm": 0.2838257849216461, + "learning_rate": 0.001, + "loss": 2.2564, + "step": 13552 + }, + { + "epoch": 0.5733564599373889, + "grad_norm": 0.4795995056629181, + "learning_rate": 0.001, + "loss": 1.9584, + "step": 13553 + }, + { + "epoch": 0.5733987647009053, + "grad_norm": 1.5541175603866577, + "learning_rate": 0.001, + "loss": 3.0344, + "step": 13554 + }, + { + "epoch": 0.5734410694644217, + "grad_norm": 0.1850278079509735, + "learning_rate": 0.001, + "loss": 2.1873, + "step": 13555 + }, + { + "epoch": 0.573483374227938, + "grad_norm": 0.20095832645893097, + "learning_rate": 0.001, + "loss": 2.0309, + "step": 13556 + }, + { + "epoch": 0.5735256789914545, + "grad_norm": 0.212057963013649, + "learning_rate": 0.001, + "loss": 2.0179, + "step": 13557 + }, + { + "epoch": 0.5735679837549708, + "grad_norm": 0.18375861644744873, + "learning_rate": 0.001, + "loss": 1.7909, + "step": 13558 + }, + { + "epoch": 0.5736102885184872, + "grad_norm": 0.1902046501636505, + "learning_rate": 0.001, + "loss": 1.7385, + "step": 13559 + }, + { + "epoch": 0.5736525932820036, + "grad_norm": 0.1839946061372757, + "learning_rate": 0.001, + "loss": 1.9316, + "step": 13560 + }, + { + "epoch": 0.5736948980455199, + "grad_norm": 0.16849452257156372, + "learning_rate": 0.001, + "loss": 2.4155, + "step": 13561 + }, + { + "epoch": 0.5737372028090363, + "grad_norm": 0.18684113025665283, + "learning_rate": 0.001, + "loss": 2.9065, + "step": 13562 + }, + { + "epoch": 0.5737795075725527, + "grad_norm": 0.325171560049057, + "learning_rate": 0.001, + "loss": 3.7306, + "step": 13563 + }, + { + "epoch": 0.573821812336069, + "grad_norm": 0.1625150591135025, + "learning_rate": 0.001, + "loss": 1.9709, + "step": 13564 + }, + { + "epoch": 0.5738641170995854, + "grad_norm": 0.5446557402610779, + "learning_rate": 0.001, + "loss": 2.5266, + "step": 13565 + }, + { + "epoch": 0.5739064218631018, + "grad_norm": 0.1913897693157196, + "learning_rate": 0.001, + "loss": 2.5541, + "step": 13566 + }, + { + "epoch": 0.5739487266266181, + "grad_norm": 0.23677568137645721, + "learning_rate": 0.001, + "loss": 3.3124, + "step": 13567 + }, + { + "epoch": 0.5739910313901345, + "grad_norm": 0.18259602785110474, + "learning_rate": 0.001, + "loss": 2.398, + "step": 13568 + }, + { + "epoch": 0.5740333361536509, + "grad_norm": 0.166291281580925, + "learning_rate": 0.001, + "loss": 1.7677, + "step": 13569 + }, + { + "epoch": 0.5740756409171672, + "grad_norm": 0.19032956659793854, + "learning_rate": 0.001, + "loss": 2.0508, + "step": 13570 + }, + { + "epoch": 0.5741179456806836, + "grad_norm": 0.303861528635025, + "learning_rate": 0.001, + "loss": 1.8316, + "step": 13571 + }, + { + "epoch": 0.5741602504442, + "grad_norm": 0.41305580735206604, + "learning_rate": 0.001, + "loss": 2.0014, + "step": 13572 + }, + { + "epoch": 0.5742025552077163, + "grad_norm": 0.1796369105577469, + "learning_rate": 0.001, + "loss": 2.7648, + "step": 13573 + }, + { + "epoch": 0.5742448599712328, + "grad_norm": 0.15992271900177002, + "learning_rate": 0.001, + "loss": 1.8547, + "step": 13574 + }, + { + "epoch": 0.5742871647347492, + "grad_norm": 0.195455402135849, + "learning_rate": 0.001, + "loss": 1.5503, + "step": 13575 + }, + { + "epoch": 0.5743294694982655, + "grad_norm": 0.17384329438209534, + "learning_rate": 0.001, + "loss": 2.1787, + "step": 13576 + }, + { + "epoch": 0.5743717742617819, + "grad_norm": 0.3714596629142761, + "learning_rate": 0.001, + "loss": 2.782, + "step": 13577 + }, + { + "epoch": 0.5744140790252983, + "grad_norm": 0.9288785457611084, + "learning_rate": 0.001, + "loss": 2.5938, + "step": 13578 + }, + { + "epoch": 0.5744563837888146, + "grad_norm": 0.28560522198677063, + "learning_rate": 0.001, + "loss": 2.3935, + "step": 13579 + }, + { + "epoch": 0.574498688552331, + "grad_norm": 0.23299963772296906, + "learning_rate": 0.001, + "loss": 2.4586, + "step": 13580 + }, + { + "epoch": 0.5745409933158474, + "grad_norm": 0.19017741084098816, + "learning_rate": 0.001, + "loss": 2.3019, + "step": 13581 + }, + { + "epoch": 0.5745832980793637, + "grad_norm": 0.18954819440841675, + "learning_rate": 0.001, + "loss": 1.8131, + "step": 13582 + }, + { + "epoch": 0.5746256028428801, + "grad_norm": 0.1803303062915802, + "learning_rate": 0.001, + "loss": 1.9121, + "step": 13583 + }, + { + "epoch": 0.5746679076063965, + "grad_norm": 4.713398456573486, + "learning_rate": 0.001, + "loss": 1.9541, + "step": 13584 + }, + { + "epoch": 0.5747102123699128, + "grad_norm": 0.2334275096654892, + "learning_rate": 0.001, + "loss": 1.7124, + "step": 13585 + }, + { + "epoch": 0.5747525171334292, + "grad_norm": 0.1691213995218277, + "learning_rate": 0.001, + "loss": 1.6664, + "step": 13586 + }, + { + "epoch": 0.5747948218969456, + "grad_norm": 0.21941591799259186, + "learning_rate": 0.001, + "loss": 2.0986, + "step": 13587 + }, + { + "epoch": 0.5748371266604619, + "grad_norm": 0.30102136731147766, + "learning_rate": 0.001, + "loss": 2.5321, + "step": 13588 + }, + { + "epoch": 0.5748794314239783, + "grad_norm": 0.21132823824882507, + "learning_rate": 0.001, + "loss": 2.5585, + "step": 13589 + }, + { + "epoch": 0.5749217361874948, + "grad_norm": 3.1371676921844482, + "learning_rate": 0.001, + "loss": 3.0142, + "step": 13590 + }, + { + "epoch": 0.5749640409510111, + "grad_norm": 0.3118325471878052, + "learning_rate": 0.001, + "loss": 1.7528, + "step": 13591 + }, + { + "epoch": 0.5750063457145275, + "grad_norm": 0.26828131079673767, + "learning_rate": 0.001, + "loss": 2.4065, + "step": 13592 + }, + { + "epoch": 0.5750486504780439, + "grad_norm": 0.21891672909259796, + "learning_rate": 0.001, + "loss": 1.8595, + "step": 13593 + }, + { + "epoch": 0.5750909552415602, + "grad_norm": 0.20417436957359314, + "learning_rate": 0.001, + "loss": 1.8951, + "step": 13594 + }, + { + "epoch": 0.5751332600050766, + "grad_norm": 0.4515736997127533, + "learning_rate": 0.001, + "loss": 2.1664, + "step": 13595 + }, + { + "epoch": 0.575175564768593, + "grad_norm": 0.17579565942287445, + "learning_rate": 0.001, + "loss": 2.4499, + "step": 13596 + }, + { + "epoch": 0.5752178695321093, + "grad_norm": 0.18956947326660156, + "learning_rate": 0.001, + "loss": 1.7775, + "step": 13597 + }, + { + "epoch": 0.5752601742956257, + "grad_norm": 0.18791532516479492, + "learning_rate": 0.001, + "loss": 1.858, + "step": 13598 + }, + { + "epoch": 0.5753024790591421, + "grad_norm": 0.18148577213287354, + "learning_rate": 0.001, + "loss": 1.8695, + "step": 13599 + }, + { + "epoch": 0.5753447838226584, + "grad_norm": 0.16978557407855988, + "learning_rate": 0.001, + "loss": 2.0121, + "step": 13600 + }, + { + "epoch": 0.5753870885861748, + "grad_norm": 0.1755099594593048, + "learning_rate": 0.001, + "loss": 1.9807, + "step": 13601 + }, + { + "epoch": 0.5754293933496911, + "grad_norm": 0.22287164628505707, + "learning_rate": 0.001, + "loss": 2.9232, + "step": 13602 + }, + { + "epoch": 0.5754716981132075, + "grad_norm": 0.2872997224330902, + "learning_rate": 0.001, + "loss": 2.2583, + "step": 13603 + }, + { + "epoch": 0.5755140028767239, + "grad_norm": 0.43780237436294556, + "learning_rate": 0.001, + "loss": 1.995, + "step": 13604 + }, + { + "epoch": 0.5755563076402402, + "grad_norm": 0.20227646827697754, + "learning_rate": 0.001, + "loss": 2.8379, + "step": 13605 + }, + { + "epoch": 0.5755986124037566, + "grad_norm": 1.0031191110610962, + "learning_rate": 0.001, + "loss": 2.8675, + "step": 13606 + }, + { + "epoch": 0.5756409171672731, + "grad_norm": 0.1701931655406952, + "learning_rate": 0.001, + "loss": 2.2968, + "step": 13607 + }, + { + "epoch": 0.5756832219307894, + "grad_norm": 0.7706162929534912, + "learning_rate": 0.001, + "loss": 2.6568, + "step": 13608 + }, + { + "epoch": 0.5757255266943058, + "grad_norm": 0.22482161223888397, + "learning_rate": 0.001, + "loss": 1.7453, + "step": 13609 + }, + { + "epoch": 0.5757678314578222, + "grad_norm": 1.2364606857299805, + "learning_rate": 0.001, + "loss": 1.8909, + "step": 13610 + }, + { + "epoch": 0.5758101362213385, + "grad_norm": 0.154103621840477, + "learning_rate": 0.001, + "loss": 1.7456, + "step": 13611 + }, + { + "epoch": 0.5758524409848549, + "grad_norm": 0.15440334379673004, + "learning_rate": 0.001, + "loss": 2.0969, + "step": 13612 + }, + { + "epoch": 0.5758947457483713, + "grad_norm": 0.21539804339408875, + "learning_rate": 0.001, + "loss": 2.4492, + "step": 13613 + }, + { + "epoch": 0.5759370505118876, + "grad_norm": 0.17441478371620178, + "learning_rate": 0.001, + "loss": 1.9286, + "step": 13614 + }, + { + "epoch": 0.575979355275404, + "grad_norm": 0.16577987372875214, + "learning_rate": 0.001, + "loss": 1.7983, + "step": 13615 + }, + { + "epoch": 0.5760216600389204, + "grad_norm": 0.6091001033782959, + "learning_rate": 0.001, + "loss": 2.4681, + "step": 13616 + }, + { + "epoch": 0.5760639648024367, + "grad_norm": 0.7010548114776611, + "learning_rate": 0.001, + "loss": 2.991, + "step": 13617 + }, + { + "epoch": 0.5761062695659531, + "grad_norm": 0.18760816752910614, + "learning_rate": 0.001, + "loss": 2.7225, + "step": 13618 + }, + { + "epoch": 0.5761485743294695, + "grad_norm": 0.2499050348997116, + "learning_rate": 0.001, + "loss": 1.889, + "step": 13619 + }, + { + "epoch": 0.5761908790929858, + "grad_norm": 0.19262200593948364, + "learning_rate": 0.001, + "loss": 1.9694, + "step": 13620 + }, + { + "epoch": 0.5762331838565022, + "grad_norm": 0.2874251902103424, + "learning_rate": 0.001, + "loss": 2.0702, + "step": 13621 + }, + { + "epoch": 0.5762754886200186, + "grad_norm": 0.2215861827135086, + "learning_rate": 0.001, + "loss": 3.0974, + "step": 13622 + }, + { + "epoch": 0.576317793383535, + "grad_norm": 1.4504274129867554, + "learning_rate": 0.001, + "loss": 2.6667, + "step": 13623 + }, + { + "epoch": 0.5763600981470514, + "grad_norm": 0.3368513286113739, + "learning_rate": 0.001, + "loss": 2.2946, + "step": 13624 + }, + { + "epoch": 0.5764024029105678, + "grad_norm": 0.5865402817726135, + "learning_rate": 0.001, + "loss": 2.1963, + "step": 13625 + }, + { + "epoch": 0.5764447076740841, + "grad_norm": 0.16736286878585815, + "learning_rate": 0.001, + "loss": 2.4853, + "step": 13626 + }, + { + "epoch": 0.5764870124376005, + "grad_norm": 0.18231581151485443, + "learning_rate": 0.001, + "loss": 1.9809, + "step": 13627 + }, + { + "epoch": 0.5765293172011169, + "grad_norm": 0.16837336122989655, + "learning_rate": 0.001, + "loss": 3.887, + "step": 13628 + }, + { + "epoch": 0.5765716219646332, + "grad_norm": 0.4993027150630951, + "learning_rate": 0.001, + "loss": 2.3137, + "step": 13629 + }, + { + "epoch": 0.5766139267281496, + "grad_norm": 0.2104516625404358, + "learning_rate": 0.001, + "loss": 1.6916, + "step": 13630 + }, + { + "epoch": 0.576656231491666, + "grad_norm": 0.20687484741210938, + "learning_rate": 0.001, + "loss": 2.2164, + "step": 13631 + }, + { + "epoch": 0.5766985362551823, + "grad_norm": 0.18873409926891327, + "learning_rate": 0.001, + "loss": 2.7869, + "step": 13632 + }, + { + "epoch": 0.5767408410186987, + "grad_norm": 0.16937416791915894, + "learning_rate": 0.001, + "loss": 1.9121, + "step": 13633 + }, + { + "epoch": 0.5767831457822151, + "grad_norm": 0.15023858845233917, + "learning_rate": 0.001, + "loss": 2.6665, + "step": 13634 + }, + { + "epoch": 0.5768254505457314, + "grad_norm": 0.15277251601219177, + "learning_rate": 0.001, + "loss": 2.0511, + "step": 13635 + }, + { + "epoch": 0.5768677553092478, + "grad_norm": 0.19467462599277496, + "learning_rate": 0.001, + "loss": 1.6275, + "step": 13636 + }, + { + "epoch": 0.5769100600727642, + "grad_norm": 0.1640617549419403, + "learning_rate": 0.001, + "loss": 1.331, + "step": 13637 + }, + { + "epoch": 0.5769523648362805, + "grad_norm": 0.17704744637012482, + "learning_rate": 0.001, + "loss": 2.0327, + "step": 13638 + }, + { + "epoch": 0.576994669599797, + "grad_norm": 0.16801083087921143, + "learning_rate": 0.001, + "loss": 2.6699, + "step": 13639 + }, + { + "epoch": 0.5770369743633134, + "grad_norm": 0.22247737646102905, + "learning_rate": 0.001, + "loss": 1.7275, + "step": 13640 + }, + { + "epoch": 0.5770792791268297, + "grad_norm": 0.2886200249195099, + "learning_rate": 0.001, + "loss": 2.255, + "step": 13641 + }, + { + "epoch": 0.5771215838903461, + "grad_norm": 2.0032057762145996, + "learning_rate": 0.001, + "loss": 1.4168, + "step": 13642 + }, + { + "epoch": 0.5771638886538625, + "grad_norm": 0.1862766593694687, + "learning_rate": 0.001, + "loss": 3.6991, + "step": 13643 + }, + { + "epoch": 0.5772061934173788, + "grad_norm": 0.27933868765830994, + "learning_rate": 0.001, + "loss": 2.5766, + "step": 13644 + }, + { + "epoch": 0.5772484981808952, + "grad_norm": 0.15503652393817902, + "learning_rate": 0.001, + "loss": 1.4434, + "step": 13645 + }, + { + "epoch": 0.5772908029444115, + "grad_norm": 0.1637222021818161, + "learning_rate": 0.001, + "loss": 1.8226, + "step": 13646 + }, + { + "epoch": 0.5773331077079279, + "grad_norm": 0.1900179535150528, + "learning_rate": 0.001, + "loss": 2.8638, + "step": 13647 + }, + { + "epoch": 0.5773754124714443, + "grad_norm": 5.069095134735107, + "learning_rate": 0.001, + "loss": 2.3441, + "step": 13648 + }, + { + "epoch": 0.5774177172349606, + "grad_norm": 0.21330921351909637, + "learning_rate": 0.001, + "loss": 2.6057, + "step": 13649 + }, + { + "epoch": 0.577460021998477, + "grad_norm": 0.3153631389141083, + "learning_rate": 0.001, + "loss": 3.2548, + "step": 13650 + }, + { + "epoch": 0.5775023267619934, + "grad_norm": 0.18381884694099426, + "learning_rate": 0.001, + "loss": 1.7146, + "step": 13651 + }, + { + "epoch": 0.5775446315255097, + "grad_norm": 0.2848173677921295, + "learning_rate": 0.001, + "loss": 1.6744, + "step": 13652 + }, + { + "epoch": 0.5775869362890261, + "grad_norm": 0.16625621914863586, + "learning_rate": 0.001, + "loss": 2.2554, + "step": 13653 + }, + { + "epoch": 0.5776292410525425, + "grad_norm": 0.14194847643375397, + "learning_rate": 0.001, + "loss": 1.6789, + "step": 13654 + }, + { + "epoch": 0.5776715458160588, + "grad_norm": 0.24383410811424255, + "learning_rate": 0.001, + "loss": 2.5261, + "step": 13655 + }, + { + "epoch": 0.5777138505795752, + "grad_norm": 0.18297232687473297, + "learning_rate": 0.001, + "loss": 2.0025, + "step": 13656 + }, + { + "epoch": 0.5777561553430917, + "grad_norm": 0.16804887354373932, + "learning_rate": 0.001, + "loss": 2.0048, + "step": 13657 + }, + { + "epoch": 0.577798460106608, + "grad_norm": 0.20523540675640106, + "learning_rate": 0.001, + "loss": 3.0154, + "step": 13658 + }, + { + "epoch": 0.5778407648701244, + "grad_norm": 0.9247041344642639, + "learning_rate": 0.001, + "loss": 2.0177, + "step": 13659 + }, + { + "epoch": 0.5778830696336408, + "grad_norm": 0.27602866291999817, + "learning_rate": 0.001, + "loss": 1.8518, + "step": 13660 + }, + { + "epoch": 0.5779253743971571, + "grad_norm": 0.1811681091785431, + "learning_rate": 0.001, + "loss": 2.2486, + "step": 13661 + }, + { + "epoch": 0.5779676791606735, + "grad_norm": 1.9261819124221802, + "learning_rate": 0.001, + "loss": 2.2829, + "step": 13662 + }, + { + "epoch": 0.5780099839241899, + "grad_norm": 0.23115941882133484, + "learning_rate": 0.001, + "loss": 2.0086, + "step": 13663 + }, + { + "epoch": 0.5780522886877062, + "grad_norm": 0.20311318337917328, + "learning_rate": 0.001, + "loss": 2.3334, + "step": 13664 + }, + { + "epoch": 0.5780945934512226, + "grad_norm": 0.1666504442691803, + "learning_rate": 0.001, + "loss": 2.0804, + "step": 13665 + }, + { + "epoch": 0.578136898214739, + "grad_norm": 0.20418089628219604, + "learning_rate": 0.001, + "loss": 2.2807, + "step": 13666 + }, + { + "epoch": 0.5781792029782553, + "grad_norm": 0.1729002743959427, + "learning_rate": 0.001, + "loss": 1.6245, + "step": 13667 + }, + { + "epoch": 0.5782215077417717, + "grad_norm": 0.1773560643196106, + "learning_rate": 0.001, + "loss": 1.6606, + "step": 13668 + }, + { + "epoch": 0.5782638125052881, + "grad_norm": 0.17324134707450867, + "learning_rate": 0.001, + "loss": 1.5963, + "step": 13669 + }, + { + "epoch": 0.5783061172688044, + "grad_norm": 0.503576397895813, + "learning_rate": 0.001, + "loss": 2.0075, + "step": 13670 + }, + { + "epoch": 0.5783484220323208, + "grad_norm": 0.15948913991451263, + "learning_rate": 0.001, + "loss": 1.7914, + "step": 13671 + }, + { + "epoch": 0.5783907267958373, + "grad_norm": 0.2122723013162613, + "learning_rate": 0.001, + "loss": 2.4304, + "step": 13672 + }, + { + "epoch": 0.5784330315593535, + "grad_norm": 0.9828127026557922, + "learning_rate": 0.001, + "loss": 2.3654, + "step": 13673 + }, + { + "epoch": 0.57847533632287, + "grad_norm": 0.1700863391160965, + "learning_rate": 0.001, + "loss": 2.0107, + "step": 13674 + }, + { + "epoch": 0.5785176410863864, + "grad_norm": 0.3214263617992401, + "learning_rate": 0.001, + "loss": 2.5081, + "step": 13675 + }, + { + "epoch": 0.5785599458499027, + "grad_norm": 0.21781599521636963, + "learning_rate": 0.001, + "loss": 2.0888, + "step": 13676 + }, + { + "epoch": 0.5786022506134191, + "grad_norm": 0.16866712272167206, + "learning_rate": 0.001, + "loss": 2.407, + "step": 13677 + }, + { + "epoch": 0.5786445553769355, + "grad_norm": 0.15926438570022583, + "learning_rate": 0.001, + "loss": 1.3379, + "step": 13678 + }, + { + "epoch": 0.5786868601404518, + "grad_norm": 0.23510025441646576, + "learning_rate": 0.001, + "loss": 1.7714, + "step": 13679 + }, + { + "epoch": 0.5787291649039682, + "grad_norm": 0.14508269727230072, + "learning_rate": 0.001, + "loss": 2.1146, + "step": 13680 + }, + { + "epoch": 0.5787714696674846, + "grad_norm": 2.956667423248291, + "learning_rate": 0.001, + "loss": 2.8911, + "step": 13681 + }, + { + "epoch": 0.5788137744310009, + "grad_norm": 0.24240057170391083, + "learning_rate": 0.001, + "loss": 3.1488, + "step": 13682 + }, + { + "epoch": 0.5788560791945173, + "grad_norm": 0.16761580109596252, + "learning_rate": 0.001, + "loss": 3.1758, + "step": 13683 + }, + { + "epoch": 0.5788983839580337, + "grad_norm": 0.15230026841163635, + "learning_rate": 0.001, + "loss": 1.5823, + "step": 13684 + }, + { + "epoch": 0.57894068872155, + "grad_norm": 0.14217911660671234, + "learning_rate": 0.001, + "loss": 1.3058, + "step": 13685 + }, + { + "epoch": 0.5789829934850664, + "grad_norm": 0.23708081245422363, + "learning_rate": 0.001, + "loss": 3.0425, + "step": 13686 + }, + { + "epoch": 0.5790252982485828, + "grad_norm": 0.19518715143203735, + "learning_rate": 0.001, + "loss": 2.1443, + "step": 13687 + }, + { + "epoch": 0.5790676030120991, + "grad_norm": 0.2842262089252472, + "learning_rate": 0.001, + "loss": 2.324, + "step": 13688 + }, + { + "epoch": 0.5791099077756156, + "grad_norm": 0.2659071683883667, + "learning_rate": 0.001, + "loss": 2.3996, + "step": 13689 + }, + { + "epoch": 0.579152212539132, + "grad_norm": 0.16496817767620087, + "learning_rate": 0.001, + "loss": 1.9989, + "step": 13690 + }, + { + "epoch": 0.5791945173026483, + "grad_norm": 0.19301360845565796, + "learning_rate": 0.001, + "loss": 3.5109, + "step": 13691 + }, + { + "epoch": 0.5792368220661647, + "grad_norm": 0.3519555628299713, + "learning_rate": 0.001, + "loss": 3.3803, + "step": 13692 + }, + { + "epoch": 0.579279126829681, + "grad_norm": 0.26763901114463806, + "learning_rate": 0.001, + "loss": 2.4265, + "step": 13693 + }, + { + "epoch": 0.5793214315931974, + "grad_norm": 0.1800110638141632, + "learning_rate": 0.001, + "loss": 1.8322, + "step": 13694 + }, + { + "epoch": 0.5793637363567138, + "grad_norm": 0.16732731461524963, + "learning_rate": 0.001, + "loss": 1.8233, + "step": 13695 + }, + { + "epoch": 0.5794060411202301, + "grad_norm": 0.17104704678058624, + "learning_rate": 0.001, + "loss": 2.2859, + "step": 13696 + }, + { + "epoch": 0.5794483458837465, + "grad_norm": 0.18005624413490295, + "learning_rate": 0.001, + "loss": 3.1283, + "step": 13697 + }, + { + "epoch": 0.5794906506472629, + "grad_norm": 0.15262584388256073, + "learning_rate": 0.001, + "loss": 2.445, + "step": 13698 + }, + { + "epoch": 0.5795329554107792, + "grad_norm": 0.13767917454242706, + "learning_rate": 0.001, + "loss": 1.5765, + "step": 13699 + }, + { + "epoch": 0.5795752601742956, + "grad_norm": 0.18696613609790802, + "learning_rate": 0.001, + "loss": 2.0157, + "step": 13700 + }, + { + "epoch": 0.579617564937812, + "grad_norm": 0.21265771985054016, + "learning_rate": 0.001, + "loss": 2.1951, + "step": 13701 + }, + { + "epoch": 0.5796598697013283, + "grad_norm": 0.154225155711174, + "learning_rate": 0.001, + "loss": 2.6487, + "step": 13702 + }, + { + "epoch": 0.5797021744648447, + "grad_norm": 0.3607092797756195, + "learning_rate": 0.001, + "loss": 1.9679, + "step": 13703 + }, + { + "epoch": 0.5797444792283611, + "grad_norm": 0.30119165778160095, + "learning_rate": 0.001, + "loss": 2.7281, + "step": 13704 + }, + { + "epoch": 0.5797867839918774, + "grad_norm": 0.14737650752067566, + "learning_rate": 0.001, + "loss": 1.6039, + "step": 13705 + }, + { + "epoch": 0.5798290887553939, + "grad_norm": 0.1710781455039978, + "learning_rate": 0.001, + "loss": 1.7936, + "step": 13706 + }, + { + "epoch": 0.5798713935189103, + "grad_norm": 0.21716709434986115, + "learning_rate": 0.001, + "loss": 2.3436, + "step": 13707 + }, + { + "epoch": 0.5799136982824266, + "grad_norm": 0.13513155281543732, + "learning_rate": 0.001, + "loss": 2.2419, + "step": 13708 + }, + { + "epoch": 0.579956003045943, + "grad_norm": 0.14529554545879364, + "learning_rate": 0.001, + "loss": 2.1327, + "step": 13709 + }, + { + "epoch": 0.5799983078094594, + "grad_norm": 0.14698375761508942, + "learning_rate": 0.001, + "loss": 1.6831, + "step": 13710 + }, + { + "epoch": 0.5800406125729757, + "grad_norm": 0.1601586788892746, + "learning_rate": 0.001, + "loss": 2.5013, + "step": 13711 + }, + { + "epoch": 0.5800829173364921, + "grad_norm": 2.6871955394744873, + "learning_rate": 0.001, + "loss": 2.3585, + "step": 13712 + }, + { + "epoch": 0.5801252221000085, + "grad_norm": 1.6209683418273926, + "learning_rate": 0.001, + "loss": 2.7192, + "step": 13713 + }, + { + "epoch": 0.5801675268635248, + "grad_norm": 0.4082275927066803, + "learning_rate": 0.001, + "loss": 2.3748, + "step": 13714 + }, + { + "epoch": 0.5802098316270412, + "grad_norm": 0.8170319199562073, + "learning_rate": 0.001, + "loss": 1.9599, + "step": 13715 + }, + { + "epoch": 0.5802521363905576, + "grad_norm": 0.15276405215263367, + "learning_rate": 0.001, + "loss": 3.2077, + "step": 13716 + }, + { + "epoch": 0.5802944411540739, + "grad_norm": 0.16727223992347717, + "learning_rate": 0.001, + "loss": 3.1438, + "step": 13717 + }, + { + "epoch": 0.5803367459175903, + "grad_norm": 0.1725165992975235, + "learning_rate": 0.001, + "loss": 2.3128, + "step": 13718 + }, + { + "epoch": 0.5803790506811067, + "grad_norm": 0.16121014952659607, + "learning_rate": 0.001, + "loss": 2.9876, + "step": 13719 + }, + { + "epoch": 0.580421355444623, + "grad_norm": 0.27277088165283203, + "learning_rate": 0.001, + "loss": 2.1281, + "step": 13720 + }, + { + "epoch": 0.5804636602081394, + "grad_norm": 0.2123079150915146, + "learning_rate": 0.001, + "loss": 2.2785, + "step": 13721 + }, + { + "epoch": 0.5805059649716559, + "grad_norm": 0.7211734056472778, + "learning_rate": 0.001, + "loss": 2.1592, + "step": 13722 + }, + { + "epoch": 0.5805482697351722, + "grad_norm": 0.5063629746437073, + "learning_rate": 0.001, + "loss": 1.605, + "step": 13723 + }, + { + "epoch": 0.5805905744986886, + "grad_norm": 0.15953439474105835, + "learning_rate": 0.001, + "loss": 2.3575, + "step": 13724 + }, + { + "epoch": 0.580632879262205, + "grad_norm": 0.2004745602607727, + "learning_rate": 0.001, + "loss": 2.5765, + "step": 13725 + }, + { + "epoch": 0.5806751840257213, + "grad_norm": 0.17336571216583252, + "learning_rate": 0.001, + "loss": 2.242, + "step": 13726 + }, + { + "epoch": 0.5807174887892377, + "grad_norm": 0.15672898292541504, + "learning_rate": 0.001, + "loss": 1.6338, + "step": 13727 + }, + { + "epoch": 0.5807597935527541, + "grad_norm": 0.16324591636657715, + "learning_rate": 0.001, + "loss": 2.2554, + "step": 13728 + }, + { + "epoch": 0.5808020983162704, + "grad_norm": 0.8709842562675476, + "learning_rate": 0.001, + "loss": 1.8762, + "step": 13729 + }, + { + "epoch": 0.5808444030797868, + "grad_norm": 0.16312599182128906, + "learning_rate": 0.001, + "loss": 1.8523, + "step": 13730 + }, + { + "epoch": 0.5808867078433032, + "grad_norm": 0.6291779279708862, + "learning_rate": 0.001, + "loss": 3.0103, + "step": 13731 + }, + { + "epoch": 0.5809290126068195, + "grad_norm": 0.18601246178150177, + "learning_rate": 0.001, + "loss": 2.3727, + "step": 13732 + }, + { + "epoch": 0.5809713173703359, + "grad_norm": 0.22245633602142334, + "learning_rate": 0.001, + "loss": 1.926, + "step": 13733 + }, + { + "epoch": 0.5810136221338523, + "grad_norm": 0.1883118599653244, + "learning_rate": 0.001, + "loss": 2.1428, + "step": 13734 + }, + { + "epoch": 0.5810559268973686, + "grad_norm": 0.15982557833194733, + "learning_rate": 0.001, + "loss": 2.1728, + "step": 13735 + }, + { + "epoch": 0.581098231660885, + "grad_norm": 0.18497587740421295, + "learning_rate": 0.001, + "loss": 1.7981, + "step": 13736 + }, + { + "epoch": 0.5811405364244013, + "grad_norm": 0.1280389130115509, + "learning_rate": 0.001, + "loss": 1.5147, + "step": 13737 + }, + { + "epoch": 0.5811828411879177, + "grad_norm": 0.2510825991630554, + "learning_rate": 0.001, + "loss": 1.5612, + "step": 13738 + }, + { + "epoch": 0.5812251459514342, + "grad_norm": 0.1742805689573288, + "learning_rate": 0.001, + "loss": 2.0385, + "step": 13739 + }, + { + "epoch": 0.5812674507149505, + "grad_norm": 0.14577478170394897, + "learning_rate": 0.001, + "loss": 2.1657, + "step": 13740 + }, + { + "epoch": 0.5813097554784669, + "grad_norm": 0.17133548855781555, + "learning_rate": 0.001, + "loss": 2.2744, + "step": 13741 + }, + { + "epoch": 0.5813520602419833, + "grad_norm": 0.16263654828071594, + "learning_rate": 0.001, + "loss": 1.8197, + "step": 13742 + }, + { + "epoch": 0.5813943650054996, + "grad_norm": 0.19162042438983917, + "learning_rate": 0.001, + "loss": 1.8052, + "step": 13743 + }, + { + "epoch": 0.581436669769016, + "grad_norm": 0.17110483348369598, + "learning_rate": 0.001, + "loss": 2.1316, + "step": 13744 + }, + { + "epoch": 0.5814789745325324, + "grad_norm": 0.4235318601131439, + "learning_rate": 0.001, + "loss": 2.5278, + "step": 13745 + }, + { + "epoch": 0.5815212792960487, + "grad_norm": 0.15516316890716553, + "learning_rate": 0.001, + "loss": 2.2714, + "step": 13746 + }, + { + "epoch": 0.5815635840595651, + "grad_norm": 1.130749225616455, + "learning_rate": 0.001, + "loss": 1.8972, + "step": 13747 + }, + { + "epoch": 0.5816058888230815, + "grad_norm": 0.41215425729751587, + "learning_rate": 0.001, + "loss": 3.452, + "step": 13748 + }, + { + "epoch": 0.5816481935865978, + "grad_norm": 4.2232441902160645, + "learning_rate": 0.001, + "loss": 1.7344, + "step": 13749 + }, + { + "epoch": 0.5816904983501142, + "grad_norm": 0.19343996047973633, + "learning_rate": 0.001, + "loss": 2.1452, + "step": 13750 + }, + { + "epoch": 0.5817328031136306, + "grad_norm": 36.49468231201172, + "learning_rate": 0.001, + "loss": 1.8784, + "step": 13751 + }, + { + "epoch": 0.5817751078771469, + "grad_norm": 0.2389318197965622, + "learning_rate": 0.001, + "loss": 2.6143, + "step": 13752 + }, + { + "epoch": 0.5818174126406633, + "grad_norm": 0.24423779547214508, + "learning_rate": 0.001, + "loss": 2.6543, + "step": 13753 + }, + { + "epoch": 0.5818597174041797, + "grad_norm": 0.2429848611354828, + "learning_rate": 0.001, + "loss": 2.2033, + "step": 13754 + }, + { + "epoch": 0.581902022167696, + "grad_norm": 0.2506393790245056, + "learning_rate": 0.001, + "loss": 2.3673, + "step": 13755 + }, + { + "epoch": 0.5819443269312125, + "grad_norm": 0.1692809909582138, + "learning_rate": 0.001, + "loss": 2.2108, + "step": 13756 + }, + { + "epoch": 0.5819866316947289, + "grad_norm": 5.47391939163208, + "learning_rate": 0.001, + "loss": 2.5862, + "step": 13757 + }, + { + "epoch": 0.5820289364582452, + "grad_norm": 0.17192316055297852, + "learning_rate": 0.001, + "loss": 2.4285, + "step": 13758 + }, + { + "epoch": 0.5820712412217616, + "grad_norm": 0.22438155114650726, + "learning_rate": 0.001, + "loss": 2.4762, + "step": 13759 + }, + { + "epoch": 0.582113545985278, + "grad_norm": 1.2624096870422363, + "learning_rate": 0.001, + "loss": 2.1616, + "step": 13760 + }, + { + "epoch": 0.5821558507487943, + "grad_norm": 0.202859029173851, + "learning_rate": 0.001, + "loss": 2.0482, + "step": 13761 + }, + { + "epoch": 0.5821981555123107, + "grad_norm": 0.22284016013145447, + "learning_rate": 0.001, + "loss": 2.1997, + "step": 13762 + }, + { + "epoch": 0.5822404602758271, + "grad_norm": 0.20142649114131927, + "learning_rate": 0.001, + "loss": 1.8527, + "step": 13763 + }, + { + "epoch": 0.5822827650393434, + "grad_norm": 0.1934656798839569, + "learning_rate": 0.001, + "loss": 2.7891, + "step": 13764 + }, + { + "epoch": 0.5823250698028598, + "grad_norm": 0.25359898805618286, + "learning_rate": 0.001, + "loss": 2.1556, + "step": 13765 + }, + { + "epoch": 0.5823673745663762, + "grad_norm": 0.43819230794906616, + "learning_rate": 0.001, + "loss": 2.4958, + "step": 13766 + }, + { + "epoch": 0.5824096793298925, + "grad_norm": 0.17032918334007263, + "learning_rate": 0.001, + "loss": 1.8058, + "step": 13767 + }, + { + "epoch": 0.5824519840934089, + "grad_norm": 2.558115005493164, + "learning_rate": 0.001, + "loss": 3.1201, + "step": 13768 + }, + { + "epoch": 0.5824942888569253, + "grad_norm": 0.20715180039405823, + "learning_rate": 0.001, + "loss": 2.2601, + "step": 13769 + }, + { + "epoch": 0.5825365936204416, + "grad_norm": 0.5378583669662476, + "learning_rate": 0.001, + "loss": 2.006, + "step": 13770 + }, + { + "epoch": 0.582578898383958, + "grad_norm": 0.28567495942115784, + "learning_rate": 0.001, + "loss": 1.7223, + "step": 13771 + }, + { + "epoch": 0.5826212031474745, + "grad_norm": 0.18178150057792664, + "learning_rate": 0.001, + "loss": 1.3784, + "step": 13772 + }, + { + "epoch": 0.5826635079109908, + "grad_norm": 0.23406149446964264, + "learning_rate": 0.001, + "loss": 4.444, + "step": 13773 + }, + { + "epoch": 0.5827058126745072, + "grad_norm": 0.18092291057109833, + "learning_rate": 0.001, + "loss": 3.0016, + "step": 13774 + }, + { + "epoch": 0.5827481174380236, + "grad_norm": 0.3098083436489105, + "learning_rate": 0.001, + "loss": 2.7088, + "step": 13775 + }, + { + "epoch": 0.5827904222015399, + "grad_norm": 0.18877199292182922, + "learning_rate": 0.001, + "loss": 1.6503, + "step": 13776 + }, + { + "epoch": 0.5828327269650563, + "grad_norm": 0.2062119096517563, + "learning_rate": 0.001, + "loss": 2.1285, + "step": 13777 + }, + { + "epoch": 0.5828750317285727, + "grad_norm": 0.3584443926811218, + "learning_rate": 0.001, + "loss": 1.7564, + "step": 13778 + }, + { + "epoch": 0.582917336492089, + "grad_norm": 0.17341378331184387, + "learning_rate": 0.001, + "loss": 1.9815, + "step": 13779 + }, + { + "epoch": 0.5829596412556054, + "grad_norm": 0.17184588313102722, + "learning_rate": 0.001, + "loss": 1.8823, + "step": 13780 + }, + { + "epoch": 0.5830019460191218, + "grad_norm": 0.16942469775676727, + "learning_rate": 0.001, + "loss": 1.9993, + "step": 13781 + }, + { + "epoch": 0.5830442507826381, + "grad_norm": 0.17933522164821625, + "learning_rate": 0.001, + "loss": 2.1188, + "step": 13782 + }, + { + "epoch": 0.5830865555461545, + "grad_norm": 0.15469792485237122, + "learning_rate": 0.001, + "loss": 2.1212, + "step": 13783 + }, + { + "epoch": 0.5831288603096708, + "grad_norm": 0.1891220211982727, + "learning_rate": 0.001, + "loss": 3.1775, + "step": 13784 + }, + { + "epoch": 0.5831711650731872, + "grad_norm": 0.1707414835691452, + "learning_rate": 0.001, + "loss": 1.991, + "step": 13785 + }, + { + "epoch": 0.5832134698367036, + "grad_norm": 1.5627321004867554, + "learning_rate": 0.001, + "loss": 1.999, + "step": 13786 + }, + { + "epoch": 0.5832557746002199, + "grad_norm": 0.15185093879699707, + "learning_rate": 0.001, + "loss": 2.8022, + "step": 13787 + }, + { + "epoch": 0.5832980793637363, + "grad_norm": 0.1897820383310318, + "learning_rate": 0.001, + "loss": 2.0066, + "step": 13788 + }, + { + "epoch": 0.5833403841272528, + "grad_norm": 0.4447472393512726, + "learning_rate": 0.001, + "loss": 3.3133, + "step": 13789 + }, + { + "epoch": 0.583382688890769, + "grad_norm": 0.15921324491500854, + "learning_rate": 0.001, + "loss": 2.5172, + "step": 13790 + }, + { + "epoch": 0.5834249936542855, + "grad_norm": 0.22110402584075928, + "learning_rate": 0.001, + "loss": 2.0695, + "step": 13791 + }, + { + "epoch": 0.5834672984178019, + "grad_norm": 0.34887969493865967, + "learning_rate": 0.001, + "loss": 2.6305, + "step": 13792 + }, + { + "epoch": 0.5835096031813182, + "grad_norm": 0.16820944845676422, + "learning_rate": 0.001, + "loss": 2.5623, + "step": 13793 + }, + { + "epoch": 0.5835519079448346, + "grad_norm": 0.4705934226512909, + "learning_rate": 0.001, + "loss": 2.2158, + "step": 13794 + }, + { + "epoch": 0.583594212708351, + "grad_norm": 0.24807946383953094, + "learning_rate": 0.001, + "loss": 1.9532, + "step": 13795 + }, + { + "epoch": 0.5836365174718673, + "grad_norm": 1.3096846342086792, + "learning_rate": 0.001, + "loss": 1.9417, + "step": 13796 + }, + { + "epoch": 0.5836788222353837, + "grad_norm": 0.1883528232574463, + "learning_rate": 0.001, + "loss": 2.1403, + "step": 13797 + }, + { + "epoch": 0.5837211269989001, + "grad_norm": 0.20582067966461182, + "learning_rate": 0.001, + "loss": 2.3299, + "step": 13798 + }, + { + "epoch": 0.5837634317624164, + "grad_norm": 2.3853557109832764, + "learning_rate": 0.001, + "loss": 1.6175, + "step": 13799 + }, + { + "epoch": 0.5838057365259328, + "grad_norm": 0.21598750352859497, + "learning_rate": 0.001, + "loss": 2.3283, + "step": 13800 + }, + { + "epoch": 0.5838480412894492, + "grad_norm": 0.41916608810424805, + "learning_rate": 0.001, + "loss": 2.8298, + "step": 13801 + }, + { + "epoch": 0.5838903460529655, + "grad_norm": 0.18645887076854706, + "learning_rate": 0.001, + "loss": 1.4992, + "step": 13802 + }, + { + "epoch": 0.5839326508164819, + "grad_norm": 0.15576979517936707, + "learning_rate": 0.001, + "loss": 1.6169, + "step": 13803 + }, + { + "epoch": 0.5839749555799983, + "grad_norm": 0.1782311350107193, + "learning_rate": 0.001, + "loss": 1.7768, + "step": 13804 + }, + { + "epoch": 0.5840172603435146, + "grad_norm": 0.8121781349182129, + "learning_rate": 0.001, + "loss": 2.2347, + "step": 13805 + }, + { + "epoch": 0.584059565107031, + "grad_norm": 0.20430848002433777, + "learning_rate": 0.001, + "loss": 2.1171, + "step": 13806 + }, + { + "epoch": 0.5841018698705475, + "grad_norm": 0.5235466361045837, + "learning_rate": 0.001, + "loss": 1.7292, + "step": 13807 + }, + { + "epoch": 0.5841441746340638, + "grad_norm": 0.17465122044086456, + "learning_rate": 0.001, + "loss": 1.7302, + "step": 13808 + }, + { + "epoch": 0.5841864793975802, + "grad_norm": 0.18919281661510468, + "learning_rate": 0.001, + "loss": 1.8968, + "step": 13809 + }, + { + "epoch": 0.5842287841610966, + "grad_norm": 0.17904122173786163, + "learning_rate": 0.001, + "loss": 1.8885, + "step": 13810 + }, + { + "epoch": 0.5842710889246129, + "grad_norm": 3.1780359745025635, + "learning_rate": 0.001, + "loss": 2.5823, + "step": 13811 + }, + { + "epoch": 0.5843133936881293, + "grad_norm": 0.4842691123485565, + "learning_rate": 0.001, + "loss": 3.0183, + "step": 13812 + }, + { + "epoch": 0.5843556984516457, + "grad_norm": 0.21552209556102753, + "learning_rate": 0.001, + "loss": 2.5317, + "step": 13813 + }, + { + "epoch": 0.584398003215162, + "grad_norm": 2.484487533569336, + "learning_rate": 0.001, + "loss": 1.738, + "step": 13814 + }, + { + "epoch": 0.5844403079786784, + "grad_norm": 0.3187903165817261, + "learning_rate": 0.001, + "loss": 3.056, + "step": 13815 + }, + { + "epoch": 0.5844826127421948, + "grad_norm": 2.3998584747314453, + "learning_rate": 0.001, + "loss": 1.7838, + "step": 13816 + }, + { + "epoch": 0.5845249175057111, + "grad_norm": 0.17576216161251068, + "learning_rate": 0.001, + "loss": 2.3258, + "step": 13817 + }, + { + "epoch": 0.5845672222692275, + "grad_norm": 0.6371609568595886, + "learning_rate": 0.001, + "loss": 2.2197, + "step": 13818 + }, + { + "epoch": 0.5846095270327439, + "grad_norm": 0.20169895887374878, + "learning_rate": 0.001, + "loss": 2.5768, + "step": 13819 + }, + { + "epoch": 0.5846518317962602, + "grad_norm": 0.20052120089530945, + "learning_rate": 0.001, + "loss": 1.7194, + "step": 13820 + }, + { + "epoch": 0.5846941365597766, + "grad_norm": 0.1984112709760666, + "learning_rate": 0.001, + "loss": 2.0484, + "step": 13821 + }, + { + "epoch": 0.584736441323293, + "grad_norm": 0.16683201491832733, + "learning_rate": 0.001, + "loss": 1.9458, + "step": 13822 + }, + { + "epoch": 0.5847787460868094, + "grad_norm": 0.19755448400974274, + "learning_rate": 0.001, + "loss": 2.6561, + "step": 13823 + }, + { + "epoch": 0.5848210508503258, + "grad_norm": 0.18185539543628693, + "learning_rate": 0.001, + "loss": 2.2397, + "step": 13824 + }, + { + "epoch": 0.5848633556138422, + "grad_norm": 0.17746347188949585, + "learning_rate": 0.001, + "loss": 2.9947, + "step": 13825 + }, + { + "epoch": 0.5849056603773585, + "grad_norm": 0.22636628150939941, + "learning_rate": 0.001, + "loss": 2.5098, + "step": 13826 + }, + { + "epoch": 0.5849479651408749, + "grad_norm": 0.5232028365135193, + "learning_rate": 0.001, + "loss": 3.0739, + "step": 13827 + }, + { + "epoch": 0.5849902699043912, + "grad_norm": 0.20873093605041504, + "learning_rate": 0.001, + "loss": 2.4017, + "step": 13828 + }, + { + "epoch": 0.5850325746679076, + "grad_norm": 0.2317352294921875, + "learning_rate": 0.001, + "loss": 1.5189, + "step": 13829 + }, + { + "epoch": 0.585074879431424, + "grad_norm": 2.4872546195983887, + "learning_rate": 0.001, + "loss": 1.9005, + "step": 13830 + }, + { + "epoch": 0.5851171841949403, + "grad_norm": 0.20402368903160095, + "learning_rate": 0.001, + "loss": 1.6404, + "step": 13831 + }, + { + "epoch": 0.5851594889584567, + "grad_norm": 0.16960665583610535, + "learning_rate": 0.001, + "loss": 1.7397, + "step": 13832 + }, + { + "epoch": 0.5852017937219731, + "grad_norm": 0.16686907410621643, + "learning_rate": 0.001, + "loss": 2.6307, + "step": 13833 + }, + { + "epoch": 0.5852440984854894, + "grad_norm": 0.20668341219425201, + "learning_rate": 0.001, + "loss": 2.2207, + "step": 13834 + }, + { + "epoch": 0.5852864032490058, + "grad_norm": 0.17005237936973572, + "learning_rate": 0.001, + "loss": 2.3992, + "step": 13835 + }, + { + "epoch": 0.5853287080125222, + "grad_norm": 0.16195209324359894, + "learning_rate": 0.001, + "loss": 1.6082, + "step": 13836 + }, + { + "epoch": 0.5853710127760385, + "grad_norm": 0.3230375349521637, + "learning_rate": 0.001, + "loss": 2.2868, + "step": 13837 + }, + { + "epoch": 0.585413317539555, + "grad_norm": 0.16415375471115112, + "learning_rate": 0.001, + "loss": 2.2974, + "step": 13838 + }, + { + "epoch": 0.5854556223030714, + "grad_norm": 0.1570240557193756, + "learning_rate": 0.001, + "loss": 2.3329, + "step": 13839 + }, + { + "epoch": 0.5854979270665877, + "grad_norm": 0.17124520242214203, + "learning_rate": 0.001, + "loss": 1.725, + "step": 13840 + }, + { + "epoch": 0.5855402318301041, + "grad_norm": 0.2079612761735916, + "learning_rate": 0.001, + "loss": 2.2798, + "step": 13841 + }, + { + "epoch": 0.5855825365936205, + "grad_norm": 0.20434881746768951, + "learning_rate": 0.001, + "loss": 2.7584, + "step": 13842 + }, + { + "epoch": 0.5856248413571368, + "grad_norm": 0.17397215962409973, + "learning_rate": 0.001, + "loss": 2.5361, + "step": 13843 + }, + { + "epoch": 0.5856671461206532, + "grad_norm": 0.14827987551689148, + "learning_rate": 0.001, + "loss": 3.5882, + "step": 13844 + }, + { + "epoch": 0.5857094508841696, + "grad_norm": 0.16601242125034332, + "learning_rate": 0.001, + "loss": 1.7631, + "step": 13845 + }, + { + "epoch": 0.5857517556476859, + "grad_norm": 0.1840457022190094, + "learning_rate": 0.001, + "loss": 3.2754, + "step": 13846 + }, + { + "epoch": 0.5857940604112023, + "grad_norm": 0.19872917234897614, + "learning_rate": 0.001, + "loss": 2.24, + "step": 13847 + }, + { + "epoch": 0.5858363651747187, + "grad_norm": 0.18587267398834229, + "learning_rate": 0.001, + "loss": 2.8884, + "step": 13848 + }, + { + "epoch": 0.585878669938235, + "grad_norm": 0.5582289695739746, + "learning_rate": 0.001, + "loss": 2.3321, + "step": 13849 + }, + { + "epoch": 0.5859209747017514, + "grad_norm": 0.21963296830654144, + "learning_rate": 0.001, + "loss": 2.3329, + "step": 13850 + }, + { + "epoch": 0.5859632794652678, + "grad_norm": 0.1582070291042328, + "learning_rate": 0.001, + "loss": 2.5041, + "step": 13851 + }, + { + "epoch": 0.5860055842287841, + "grad_norm": 0.1862238198518753, + "learning_rate": 0.001, + "loss": 1.9722, + "step": 13852 + }, + { + "epoch": 0.5860478889923005, + "grad_norm": 0.24395392835140228, + "learning_rate": 0.001, + "loss": 2.1698, + "step": 13853 + }, + { + "epoch": 0.586090193755817, + "grad_norm": 0.19407311081886292, + "learning_rate": 0.001, + "loss": 2.0222, + "step": 13854 + }, + { + "epoch": 0.5861324985193332, + "grad_norm": 0.18697842955589294, + "learning_rate": 0.001, + "loss": 2.0685, + "step": 13855 + }, + { + "epoch": 0.5861748032828497, + "grad_norm": 0.15743017196655273, + "learning_rate": 0.001, + "loss": 2.9768, + "step": 13856 + }, + { + "epoch": 0.5862171080463661, + "grad_norm": 0.1737719476222992, + "learning_rate": 0.001, + "loss": 1.6942, + "step": 13857 + }, + { + "epoch": 0.5862594128098824, + "grad_norm": 0.17090411484241486, + "learning_rate": 0.001, + "loss": 1.5764, + "step": 13858 + }, + { + "epoch": 0.5863017175733988, + "grad_norm": 0.21270331740379333, + "learning_rate": 0.001, + "loss": 1.7628, + "step": 13859 + }, + { + "epoch": 0.5863440223369152, + "grad_norm": 0.16474924981594086, + "learning_rate": 0.001, + "loss": 2.702, + "step": 13860 + }, + { + "epoch": 0.5863863271004315, + "grad_norm": 0.18944130837917328, + "learning_rate": 0.001, + "loss": 2.5051, + "step": 13861 + }, + { + "epoch": 0.5864286318639479, + "grad_norm": 0.1647646725177765, + "learning_rate": 0.001, + "loss": 2.3672, + "step": 13862 + }, + { + "epoch": 0.5864709366274643, + "grad_norm": 0.2273533195257187, + "learning_rate": 0.001, + "loss": 2.4584, + "step": 13863 + }, + { + "epoch": 0.5865132413909806, + "grad_norm": 0.14887486398220062, + "learning_rate": 0.001, + "loss": 2.4992, + "step": 13864 + }, + { + "epoch": 0.586555546154497, + "grad_norm": 0.20371198654174805, + "learning_rate": 0.001, + "loss": 2.2056, + "step": 13865 + }, + { + "epoch": 0.5865978509180134, + "grad_norm": 0.9710887670516968, + "learning_rate": 0.001, + "loss": 1.9499, + "step": 13866 + }, + { + "epoch": 0.5866401556815297, + "grad_norm": 0.16241279244422913, + "learning_rate": 0.001, + "loss": 1.9251, + "step": 13867 + }, + { + "epoch": 0.5866824604450461, + "grad_norm": 0.401077002286911, + "learning_rate": 0.001, + "loss": 2.3491, + "step": 13868 + }, + { + "epoch": 0.5867247652085625, + "grad_norm": 0.16556090116500854, + "learning_rate": 0.001, + "loss": 1.7719, + "step": 13869 + }, + { + "epoch": 0.5867670699720788, + "grad_norm": 0.14938072860240936, + "learning_rate": 0.001, + "loss": 1.8205, + "step": 13870 + }, + { + "epoch": 0.5868093747355952, + "grad_norm": 0.1616201102733612, + "learning_rate": 0.001, + "loss": 2.3958, + "step": 13871 + }, + { + "epoch": 0.5868516794991115, + "grad_norm": 0.19175079464912415, + "learning_rate": 0.001, + "loss": 1.9595, + "step": 13872 + }, + { + "epoch": 0.586893984262628, + "grad_norm": 0.21373534202575684, + "learning_rate": 0.001, + "loss": 2.2095, + "step": 13873 + }, + { + "epoch": 0.5869362890261444, + "grad_norm": 0.20551612973213196, + "learning_rate": 0.001, + "loss": 2.4731, + "step": 13874 + }, + { + "epoch": 0.5869785937896607, + "grad_norm": 0.2638375759124756, + "learning_rate": 0.001, + "loss": 2.8643, + "step": 13875 + }, + { + "epoch": 0.5870208985531771, + "grad_norm": 0.1855308711528778, + "learning_rate": 0.001, + "loss": 1.8964, + "step": 13876 + }, + { + "epoch": 0.5870632033166935, + "grad_norm": 0.2503952383995056, + "learning_rate": 0.001, + "loss": 2.9562, + "step": 13877 + }, + { + "epoch": 0.5871055080802098, + "grad_norm": 0.15639376640319824, + "learning_rate": 0.001, + "loss": 2.6518, + "step": 13878 + }, + { + "epoch": 0.5871478128437262, + "grad_norm": 0.3068271279335022, + "learning_rate": 0.001, + "loss": 2.9139, + "step": 13879 + }, + { + "epoch": 0.5871901176072426, + "grad_norm": 2.3085885047912598, + "learning_rate": 0.001, + "loss": 1.9774, + "step": 13880 + }, + { + "epoch": 0.5872324223707589, + "grad_norm": 0.17612521350383759, + "learning_rate": 0.001, + "loss": 2.3076, + "step": 13881 + }, + { + "epoch": 0.5872747271342753, + "grad_norm": 0.21990317106246948, + "learning_rate": 0.001, + "loss": 2.7583, + "step": 13882 + }, + { + "epoch": 0.5873170318977917, + "grad_norm": 0.15076623857021332, + "learning_rate": 0.001, + "loss": 1.9021, + "step": 13883 + }, + { + "epoch": 0.587359336661308, + "grad_norm": 0.17355187237262726, + "learning_rate": 0.001, + "loss": 2.1959, + "step": 13884 + }, + { + "epoch": 0.5874016414248244, + "grad_norm": 0.16059938073158264, + "learning_rate": 0.001, + "loss": 2.5057, + "step": 13885 + }, + { + "epoch": 0.5874439461883408, + "grad_norm": 0.17203792929649353, + "learning_rate": 0.001, + "loss": 2.2947, + "step": 13886 + }, + { + "epoch": 0.5874862509518571, + "grad_norm": 0.15990357100963593, + "learning_rate": 0.001, + "loss": 1.7769, + "step": 13887 + }, + { + "epoch": 0.5875285557153735, + "grad_norm": 0.1762009710073471, + "learning_rate": 0.001, + "loss": 3.1737, + "step": 13888 + }, + { + "epoch": 0.58757086047889, + "grad_norm": 0.14753255248069763, + "learning_rate": 0.001, + "loss": 2.3686, + "step": 13889 + }, + { + "epoch": 0.5876131652424063, + "grad_norm": 0.20030535757541656, + "learning_rate": 0.001, + "loss": 2.094, + "step": 13890 + }, + { + "epoch": 0.5876554700059227, + "grad_norm": 0.1795814037322998, + "learning_rate": 0.001, + "loss": 2.438, + "step": 13891 + }, + { + "epoch": 0.5876977747694391, + "grad_norm": 0.2042204886674881, + "learning_rate": 0.001, + "loss": 1.9378, + "step": 13892 + }, + { + "epoch": 0.5877400795329554, + "grad_norm": 0.18718861043453217, + "learning_rate": 0.001, + "loss": 3.3314, + "step": 13893 + }, + { + "epoch": 0.5877823842964718, + "grad_norm": 0.18534649908542633, + "learning_rate": 0.001, + "loss": 1.9572, + "step": 13894 + }, + { + "epoch": 0.5878246890599882, + "grad_norm": 0.14680497348308563, + "learning_rate": 0.001, + "loss": 1.6453, + "step": 13895 + }, + { + "epoch": 0.5878669938235045, + "grad_norm": 0.22413283586502075, + "learning_rate": 0.001, + "loss": 2.7833, + "step": 13896 + }, + { + "epoch": 0.5879092985870209, + "grad_norm": 0.20671682059764862, + "learning_rate": 0.001, + "loss": 3.5421, + "step": 13897 + }, + { + "epoch": 0.5879516033505373, + "grad_norm": 0.3078271448612213, + "learning_rate": 0.001, + "loss": 3.6566, + "step": 13898 + }, + { + "epoch": 0.5879939081140536, + "grad_norm": 0.3358747661113739, + "learning_rate": 0.001, + "loss": 1.9084, + "step": 13899 + }, + { + "epoch": 0.58803621287757, + "grad_norm": 0.15469489991664886, + "learning_rate": 0.001, + "loss": 1.4976, + "step": 13900 + }, + { + "epoch": 0.5880785176410864, + "grad_norm": 0.19083096086978912, + "learning_rate": 0.001, + "loss": 2.5376, + "step": 13901 + }, + { + "epoch": 0.5881208224046027, + "grad_norm": 0.16537901759147644, + "learning_rate": 0.001, + "loss": 1.4644, + "step": 13902 + }, + { + "epoch": 0.5881631271681191, + "grad_norm": 0.1859930157661438, + "learning_rate": 0.001, + "loss": 2.2167, + "step": 13903 + }, + { + "epoch": 0.5882054319316355, + "grad_norm": 0.17841875553131104, + "learning_rate": 0.001, + "loss": 1.7491, + "step": 13904 + }, + { + "epoch": 0.5882477366951518, + "grad_norm": 0.17977175116539001, + "learning_rate": 0.001, + "loss": 2.1064, + "step": 13905 + }, + { + "epoch": 0.5882900414586683, + "grad_norm": 0.33689969778060913, + "learning_rate": 0.001, + "loss": 2.7081, + "step": 13906 + }, + { + "epoch": 0.5883323462221847, + "grad_norm": 0.19339759647846222, + "learning_rate": 0.001, + "loss": 2.108, + "step": 13907 + }, + { + "epoch": 0.588374650985701, + "grad_norm": 0.14537258446216583, + "learning_rate": 0.001, + "loss": 2.1058, + "step": 13908 + }, + { + "epoch": 0.5884169557492174, + "grad_norm": 0.19922007620334625, + "learning_rate": 0.001, + "loss": 1.4782, + "step": 13909 + }, + { + "epoch": 0.5884592605127338, + "grad_norm": 0.1968841850757599, + "learning_rate": 0.001, + "loss": 2.6562, + "step": 13910 + }, + { + "epoch": 0.5885015652762501, + "grad_norm": 0.15195704996585846, + "learning_rate": 0.001, + "loss": 2.0136, + "step": 13911 + }, + { + "epoch": 0.5885438700397665, + "grad_norm": 0.17894305288791656, + "learning_rate": 0.001, + "loss": 3.5737, + "step": 13912 + }, + { + "epoch": 0.5885861748032829, + "grad_norm": 0.2585073411464691, + "learning_rate": 0.001, + "loss": 2.9576, + "step": 13913 + }, + { + "epoch": 0.5886284795667992, + "grad_norm": 0.240971177816391, + "learning_rate": 0.001, + "loss": 2.4537, + "step": 13914 + }, + { + "epoch": 0.5886707843303156, + "grad_norm": 0.1822543740272522, + "learning_rate": 0.001, + "loss": 2.2145, + "step": 13915 + }, + { + "epoch": 0.588713089093832, + "grad_norm": 0.1564810425043106, + "learning_rate": 0.001, + "loss": 2.5292, + "step": 13916 + }, + { + "epoch": 0.5887553938573483, + "grad_norm": 0.16015750169754028, + "learning_rate": 0.001, + "loss": 2.2064, + "step": 13917 + }, + { + "epoch": 0.5887976986208647, + "grad_norm": 0.2196504920721054, + "learning_rate": 0.001, + "loss": 2.7055, + "step": 13918 + }, + { + "epoch": 0.588840003384381, + "grad_norm": 0.19382864236831665, + "learning_rate": 0.001, + "loss": 3.4921, + "step": 13919 + }, + { + "epoch": 0.5888823081478974, + "grad_norm": 0.1581316739320755, + "learning_rate": 0.001, + "loss": 2.4825, + "step": 13920 + }, + { + "epoch": 0.5889246129114138, + "grad_norm": 0.168742835521698, + "learning_rate": 0.001, + "loss": 1.8284, + "step": 13921 + }, + { + "epoch": 0.5889669176749301, + "grad_norm": 0.17295318841934204, + "learning_rate": 0.001, + "loss": 2.2552, + "step": 13922 + }, + { + "epoch": 0.5890092224384466, + "grad_norm": 0.14867152273654938, + "learning_rate": 0.001, + "loss": 1.4105, + "step": 13923 + }, + { + "epoch": 0.589051527201963, + "grad_norm": 0.19675691425800323, + "learning_rate": 0.001, + "loss": 1.9656, + "step": 13924 + }, + { + "epoch": 0.5890938319654793, + "grad_norm": 0.17681030929088593, + "learning_rate": 0.001, + "loss": 3.0477, + "step": 13925 + }, + { + "epoch": 0.5891361367289957, + "grad_norm": 0.5007891654968262, + "learning_rate": 0.001, + "loss": 2.2316, + "step": 13926 + }, + { + "epoch": 0.5891784414925121, + "grad_norm": 0.6708447337150574, + "learning_rate": 0.001, + "loss": 1.6344, + "step": 13927 + }, + { + "epoch": 0.5892207462560284, + "grad_norm": 0.19077329337596893, + "learning_rate": 0.001, + "loss": 2.792, + "step": 13928 + }, + { + "epoch": 0.5892630510195448, + "grad_norm": 0.28526973724365234, + "learning_rate": 0.001, + "loss": 2.514, + "step": 13929 + }, + { + "epoch": 0.5893053557830612, + "grad_norm": 0.34212541580200195, + "learning_rate": 0.001, + "loss": 1.6885, + "step": 13930 + }, + { + "epoch": 0.5893476605465775, + "grad_norm": 0.17507655918598175, + "learning_rate": 0.001, + "loss": 2.1797, + "step": 13931 + }, + { + "epoch": 0.5893899653100939, + "grad_norm": 1.8926448822021484, + "learning_rate": 0.001, + "loss": 1.746, + "step": 13932 + }, + { + "epoch": 0.5894322700736103, + "grad_norm": 0.1368320882320404, + "learning_rate": 0.001, + "loss": 1.5929, + "step": 13933 + }, + { + "epoch": 0.5894745748371266, + "grad_norm": 0.20840121805667877, + "learning_rate": 0.001, + "loss": 2.0687, + "step": 13934 + }, + { + "epoch": 0.589516879600643, + "grad_norm": 0.3458685576915741, + "learning_rate": 0.001, + "loss": 1.7513, + "step": 13935 + }, + { + "epoch": 0.5895591843641594, + "grad_norm": 0.17107515037059784, + "learning_rate": 0.001, + "loss": 1.8047, + "step": 13936 + }, + { + "epoch": 0.5896014891276757, + "grad_norm": 0.37925609946250916, + "learning_rate": 0.001, + "loss": 2.0818, + "step": 13937 + }, + { + "epoch": 0.5896437938911921, + "grad_norm": 0.31357237696647644, + "learning_rate": 0.001, + "loss": 2.9934, + "step": 13938 + }, + { + "epoch": 0.5896860986547086, + "grad_norm": 0.16676288843154907, + "learning_rate": 0.001, + "loss": 2.134, + "step": 13939 + }, + { + "epoch": 0.5897284034182249, + "grad_norm": 0.17158526182174683, + "learning_rate": 0.001, + "loss": 2.1719, + "step": 13940 + }, + { + "epoch": 0.5897707081817413, + "grad_norm": 0.2076321244239807, + "learning_rate": 0.001, + "loss": 2.6721, + "step": 13941 + }, + { + "epoch": 0.5898130129452577, + "grad_norm": 0.14922165870666504, + "learning_rate": 0.001, + "loss": 2.6749, + "step": 13942 + }, + { + "epoch": 0.589855317708774, + "grad_norm": 0.3216572403907776, + "learning_rate": 0.001, + "loss": 2.0285, + "step": 13943 + }, + { + "epoch": 0.5898976224722904, + "grad_norm": 0.2103135734796524, + "learning_rate": 0.001, + "loss": 1.9216, + "step": 13944 + }, + { + "epoch": 0.5899399272358068, + "grad_norm": 0.16740508377552032, + "learning_rate": 0.001, + "loss": 2.4028, + "step": 13945 + }, + { + "epoch": 0.5899822319993231, + "grad_norm": 0.2918654680252075, + "learning_rate": 0.001, + "loss": 2.7537, + "step": 13946 + }, + { + "epoch": 0.5900245367628395, + "grad_norm": 0.18662185966968536, + "learning_rate": 0.001, + "loss": 1.6938, + "step": 13947 + }, + { + "epoch": 0.5900668415263559, + "grad_norm": 0.19983160495758057, + "learning_rate": 0.001, + "loss": 2.2847, + "step": 13948 + }, + { + "epoch": 0.5901091462898722, + "grad_norm": 0.21256953477859497, + "learning_rate": 0.001, + "loss": 3.3585, + "step": 13949 + }, + { + "epoch": 0.5901514510533886, + "grad_norm": 0.1589345932006836, + "learning_rate": 0.001, + "loss": 3.151, + "step": 13950 + }, + { + "epoch": 0.590193755816905, + "grad_norm": 0.5611705183982849, + "learning_rate": 0.001, + "loss": 2.2215, + "step": 13951 + }, + { + "epoch": 0.5902360605804213, + "grad_norm": 0.19331638514995575, + "learning_rate": 0.001, + "loss": 3.0221, + "step": 13952 + }, + { + "epoch": 0.5902783653439377, + "grad_norm": 0.17588597536087036, + "learning_rate": 0.001, + "loss": 1.8878, + "step": 13953 + }, + { + "epoch": 0.5903206701074541, + "grad_norm": 0.14944405853748322, + "learning_rate": 0.001, + "loss": 2.3562, + "step": 13954 + }, + { + "epoch": 0.5903629748709704, + "grad_norm": 0.15163281559944153, + "learning_rate": 0.001, + "loss": 1.2766, + "step": 13955 + }, + { + "epoch": 0.5904052796344869, + "grad_norm": 0.1893131136894226, + "learning_rate": 0.001, + "loss": 2.3026, + "step": 13956 + }, + { + "epoch": 0.5904475843980033, + "grad_norm": 0.17503425478935242, + "learning_rate": 0.001, + "loss": 2.9459, + "step": 13957 + }, + { + "epoch": 0.5904898891615196, + "grad_norm": 0.2079678177833557, + "learning_rate": 0.001, + "loss": 1.8288, + "step": 13958 + }, + { + "epoch": 0.590532193925036, + "grad_norm": 0.4229949116706848, + "learning_rate": 0.001, + "loss": 2.3859, + "step": 13959 + }, + { + "epoch": 0.5905744986885524, + "grad_norm": 0.1723388284444809, + "learning_rate": 0.001, + "loss": 2.6868, + "step": 13960 + }, + { + "epoch": 0.5906168034520687, + "grad_norm": 0.16546562314033508, + "learning_rate": 0.001, + "loss": 1.7051, + "step": 13961 + }, + { + "epoch": 0.5906591082155851, + "grad_norm": 0.1712961196899414, + "learning_rate": 0.001, + "loss": 1.9134, + "step": 13962 + }, + { + "epoch": 0.5907014129791014, + "grad_norm": 0.18814264237880707, + "learning_rate": 0.001, + "loss": 2.4536, + "step": 13963 + }, + { + "epoch": 0.5907437177426178, + "grad_norm": 1.0132697820663452, + "learning_rate": 0.001, + "loss": 2.2008, + "step": 13964 + }, + { + "epoch": 0.5907860225061342, + "grad_norm": 0.16948258876800537, + "learning_rate": 0.001, + "loss": 1.9644, + "step": 13965 + }, + { + "epoch": 0.5908283272696505, + "grad_norm": 0.17865702509880066, + "learning_rate": 0.001, + "loss": 3.5171, + "step": 13966 + }, + { + "epoch": 0.5908706320331669, + "grad_norm": 0.1568206250667572, + "learning_rate": 0.001, + "loss": 2.2852, + "step": 13967 + }, + { + "epoch": 0.5909129367966833, + "grad_norm": 0.18641190230846405, + "learning_rate": 0.001, + "loss": 1.5695, + "step": 13968 + }, + { + "epoch": 0.5909552415601996, + "grad_norm": 0.484352707862854, + "learning_rate": 0.001, + "loss": 2.8019, + "step": 13969 + }, + { + "epoch": 0.590997546323716, + "grad_norm": 0.1573459804058075, + "learning_rate": 0.001, + "loss": 1.5844, + "step": 13970 + }, + { + "epoch": 0.5910398510872324, + "grad_norm": 0.236774280667305, + "learning_rate": 0.001, + "loss": 2.7873, + "step": 13971 + }, + { + "epoch": 0.5910821558507487, + "grad_norm": 1.0169886350631714, + "learning_rate": 0.001, + "loss": 1.9265, + "step": 13972 + }, + { + "epoch": 0.5911244606142652, + "grad_norm": 0.19387690722942352, + "learning_rate": 0.001, + "loss": 1.6925, + "step": 13973 + }, + { + "epoch": 0.5911667653777816, + "grad_norm": 0.17645257711410522, + "learning_rate": 0.001, + "loss": 2.2239, + "step": 13974 + }, + { + "epoch": 0.5912090701412979, + "grad_norm": 0.1641792207956314, + "learning_rate": 0.001, + "loss": 2.3921, + "step": 13975 + }, + { + "epoch": 0.5912513749048143, + "grad_norm": 0.20244446396827698, + "learning_rate": 0.001, + "loss": 2.133, + "step": 13976 + }, + { + "epoch": 0.5912936796683307, + "grad_norm": 0.2241334766149521, + "learning_rate": 0.001, + "loss": 2.7806, + "step": 13977 + }, + { + "epoch": 0.591335984431847, + "grad_norm": 0.16886314749717712, + "learning_rate": 0.001, + "loss": 2.3937, + "step": 13978 + }, + { + "epoch": 0.5913782891953634, + "grad_norm": 0.17606092989444733, + "learning_rate": 0.001, + "loss": 1.5095, + "step": 13979 + }, + { + "epoch": 0.5914205939588798, + "grad_norm": 0.16961275041103363, + "learning_rate": 0.001, + "loss": 2.3644, + "step": 13980 + }, + { + "epoch": 0.5914628987223961, + "grad_norm": 0.16681556403636932, + "learning_rate": 0.001, + "loss": 2.0659, + "step": 13981 + }, + { + "epoch": 0.5915052034859125, + "grad_norm": 0.2830229699611664, + "learning_rate": 0.001, + "loss": 1.8581, + "step": 13982 + }, + { + "epoch": 0.5915475082494289, + "grad_norm": 0.6050325036048889, + "learning_rate": 0.001, + "loss": 2.2993, + "step": 13983 + }, + { + "epoch": 0.5915898130129452, + "grad_norm": 0.17912669479846954, + "learning_rate": 0.001, + "loss": 2.4188, + "step": 13984 + }, + { + "epoch": 0.5916321177764616, + "grad_norm": 0.19286015629768372, + "learning_rate": 0.001, + "loss": 2.7553, + "step": 13985 + }, + { + "epoch": 0.591674422539978, + "grad_norm": 0.1747298538684845, + "learning_rate": 0.001, + "loss": 1.9951, + "step": 13986 + }, + { + "epoch": 0.5917167273034943, + "grad_norm": 0.24552187323570251, + "learning_rate": 0.001, + "loss": 2.9972, + "step": 13987 + }, + { + "epoch": 0.5917590320670107, + "grad_norm": 0.3392343521118164, + "learning_rate": 0.001, + "loss": 2.7581, + "step": 13988 + }, + { + "epoch": 0.5918013368305272, + "grad_norm": 0.22997502982616425, + "learning_rate": 0.001, + "loss": 2.4476, + "step": 13989 + }, + { + "epoch": 0.5918436415940435, + "grad_norm": 0.17258018255233765, + "learning_rate": 0.001, + "loss": 1.8377, + "step": 13990 + }, + { + "epoch": 0.5918859463575599, + "grad_norm": 0.6078843474388123, + "learning_rate": 0.001, + "loss": 2.6106, + "step": 13991 + }, + { + "epoch": 0.5919282511210763, + "grad_norm": 0.19402997195720673, + "learning_rate": 0.001, + "loss": 2.5247, + "step": 13992 + }, + { + "epoch": 0.5919705558845926, + "grad_norm": 0.1636909693479538, + "learning_rate": 0.001, + "loss": 2.0067, + "step": 13993 + }, + { + "epoch": 0.592012860648109, + "grad_norm": 0.18842186033725739, + "learning_rate": 0.001, + "loss": 1.6082, + "step": 13994 + }, + { + "epoch": 0.5920551654116254, + "grad_norm": 0.1705595999956131, + "learning_rate": 0.001, + "loss": 1.6876, + "step": 13995 + }, + { + "epoch": 0.5920974701751417, + "grad_norm": 0.23243148624897003, + "learning_rate": 0.001, + "loss": 1.9521, + "step": 13996 + }, + { + "epoch": 0.5921397749386581, + "grad_norm": 0.18063674867153168, + "learning_rate": 0.001, + "loss": 2.2572, + "step": 13997 + }, + { + "epoch": 0.5921820797021745, + "grad_norm": 0.21057967841625214, + "learning_rate": 0.001, + "loss": 1.7763, + "step": 13998 + }, + { + "epoch": 0.5922243844656908, + "grad_norm": 0.19632886350154877, + "learning_rate": 0.001, + "loss": 1.9372, + "step": 13999 + }, + { + "epoch": 0.5922666892292072, + "grad_norm": 0.16427557170391083, + "learning_rate": 0.001, + "loss": 1.974, + "step": 14000 + }, + { + "epoch": 0.5923089939927236, + "grad_norm": 0.15664273500442505, + "learning_rate": 0.001, + "loss": 1.7737, + "step": 14001 + }, + { + "epoch": 0.5923512987562399, + "grad_norm": 15.016242027282715, + "learning_rate": 0.001, + "loss": 2.0841, + "step": 14002 + }, + { + "epoch": 0.5923936035197563, + "grad_norm": 0.14031845331192017, + "learning_rate": 0.001, + "loss": 2.2924, + "step": 14003 + }, + { + "epoch": 0.5924359082832727, + "grad_norm": 0.17053398489952087, + "learning_rate": 0.001, + "loss": 2.4202, + "step": 14004 + }, + { + "epoch": 0.592478213046789, + "grad_norm": 0.1829700618982315, + "learning_rate": 0.001, + "loss": 2.0318, + "step": 14005 + }, + { + "epoch": 0.5925205178103055, + "grad_norm": 0.16647842526435852, + "learning_rate": 0.001, + "loss": 2.2545, + "step": 14006 + }, + { + "epoch": 0.5925628225738218, + "grad_norm": 0.1678646355867386, + "learning_rate": 0.001, + "loss": 1.8475, + "step": 14007 + }, + { + "epoch": 0.5926051273373382, + "grad_norm": 0.2268977016210556, + "learning_rate": 0.001, + "loss": 1.7835, + "step": 14008 + }, + { + "epoch": 0.5926474321008546, + "grad_norm": 0.16663521528244019, + "learning_rate": 0.001, + "loss": 1.7448, + "step": 14009 + }, + { + "epoch": 0.5926897368643709, + "grad_norm": 0.33862343430519104, + "learning_rate": 0.001, + "loss": 1.9905, + "step": 14010 + }, + { + "epoch": 0.5927320416278873, + "grad_norm": 0.5181235671043396, + "learning_rate": 0.001, + "loss": 2.3188, + "step": 14011 + }, + { + "epoch": 0.5927743463914037, + "grad_norm": 0.15368396043777466, + "learning_rate": 0.001, + "loss": 2.6473, + "step": 14012 + }, + { + "epoch": 0.59281665115492, + "grad_norm": 0.18206775188446045, + "learning_rate": 0.001, + "loss": 1.3312, + "step": 14013 + }, + { + "epoch": 0.5928589559184364, + "grad_norm": 0.21230578422546387, + "learning_rate": 0.001, + "loss": 1.8037, + "step": 14014 + }, + { + "epoch": 0.5929012606819528, + "grad_norm": 0.1920211762189865, + "learning_rate": 0.001, + "loss": 2.3031, + "step": 14015 + }, + { + "epoch": 0.5929435654454691, + "grad_norm": 0.1721087396144867, + "learning_rate": 0.001, + "loss": 2.0063, + "step": 14016 + }, + { + "epoch": 0.5929858702089855, + "grad_norm": 0.17139104008674622, + "learning_rate": 0.001, + "loss": 1.8707, + "step": 14017 + }, + { + "epoch": 0.5930281749725019, + "grad_norm": 0.2000657021999359, + "learning_rate": 0.001, + "loss": 1.9258, + "step": 14018 + }, + { + "epoch": 0.5930704797360182, + "grad_norm": 2.25368595123291, + "learning_rate": 0.001, + "loss": 2.2605, + "step": 14019 + }, + { + "epoch": 0.5931127844995346, + "grad_norm": 0.17565739154815674, + "learning_rate": 0.001, + "loss": 2.4521, + "step": 14020 + }, + { + "epoch": 0.593155089263051, + "grad_norm": 0.7381449341773987, + "learning_rate": 0.001, + "loss": 2.9677, + "step": 14021 + }, + { + "epoch": 0.5931973940265673, + "grad_norm": 0.17680025100708008, + "learning_rate": 0.001, + "loss": 2.6527, + "step": 14022 + }, + { + "epoch": 0.5932396987900838, + "grad_norm": 0.6374967694282532, + "learning_rate": 0.001, + "loss": 2.7785, + "step": 14023 + }, + { + "epoch": 0.5932820035536002, + "grad_norm": 0.18539991974830627, + "learning_rate": 0.001, + "loss": 2.1156, + "step": 14024 + }, + { + "epoch": 0.5933243083171165, + "grad_norm": 0.17498497664928436, + "learning_rate": 0.001, + "loss": 1.9663, + "step": 14025 + }, + { + "epoch": 0.5933666130806329, + "grad_norm": 0.16935625672340393, + "learning_rate": 0.001, + "loss": 1.7092, + "step": 14026 + }, + { + "epoch": 0.5934089178441493, + "grad_norm": 0.1708284169435501, + "learning_rate": 0.001, + "loss": 1.619, + "step": 14027 + }, + { + "epoch": 0.5934512226076656, + "grad_norm": 0.1765231490135193, + "learning_rate": 0.001, + "loss": 1.8157, + "step": 14028 + }, + { + "epoch": 0.593493527371182, + "grad_norm": 0.21221309900283813, + "learning_rate": 0.001, + "loss": 1.911, + "step": 14029 + }, + { + "epoch": 0.5935358321346984, + "grad_norm": 16.430587768554688, + "learning_rate": 0.001, + "loss": 2.7351, + "step": 14030 + }, + { + "epoch": 0.5935781368982147, + "grad_norm": 0.15817879140377045, + "learning_rate": 0.001, + "loss": 2.0115, + "step": 14031 + }, + { + "epoch": 0.5936204416617311, + "grad_norm": 0.2005200982093811, + "learning_rate": 0.001, + "loss": 1.8476, + "step": 14032 + }, + { + "epoch": 0.5936627464252475, + "grad_norm": 0.18343733251094818, + "learning_rate": 0.001, + "loss": 1.9199, + "step": 14033 + }, + { + "epoch": 0.5937050511887638, + "grad_norm": 13.979016304016113, + "learning_rate": 0.001, + "loss": 1.6957, + "step": 14034 + }, + { + "epoch": 0.5937473559522802, + "grad_norm": 1.4430853128433228, + "learning_rate": 0.001, + "loss": 3.6849, + "step": 14035 + }, + { + "epoch": 0.5937896607157966, + "grad_norm": 0.22609612345695496, + "learning_rate": 0.001, + "loss": 3.7324, + "step": 14036 + }, + { + "epoch": 0.5938319654793129, + "grad_norm": 0.3302842080593109, + "learning_rate": 0.001, + "loss": 2.7874, + "step": 14037 + }, + { + "epoch": 0.5938742702428293, + "grad_norm": 0.39011886715888977, + "learning_rate": 0.001, + "loss": 2.6145, + "step": 14038 + }, + { + "epoch": 0.5939165750063458, + "grad_norm": 0.21780306100845337, + "learning_rate": 0.001, + "loss": 1.8593, + "step": 14039 + }, + { + "epoch": 0.5939588797698621, + "grad_norm": 0.2653607130050659, + "learning_rate": 0.001, + "loss": 2.3802, + "step": 14040 + }, + { + "epoch": 0.5940011845333785, + "grad_norm": 0.18377161026000977, + "learning_rate": 0.001, + "loss": 2.3686, + "step": 14041 + }, + { + "epoch": 0.5940434892968949, + "grad_norm": 0.22068673372268677, + "learning_rate": 0.001, + "loss": 3.5105, + "step": 14042 + }, + { + "epoch": 0.5940857940604112, + "grad_norm": 0.2928449213504791, + "learning_rate": 0.001, + "loss": 2.2201, + "step": 14043 + }, + { + "epoch": 0.5941280988239276, + "grad_norm": 5.771327495574951, + "learning_rate": 0.001, + "loss": 3.2931, + "step": 14044 + }, + { + "epoch": 0.594170403587444, + "grad_norm": 0.24995769560337067, + "learning_rate": 0.001, + "loss": 3.6736, + "step": 14045 + }, + { + "epoch": 0.5942127083509603, + "grad_norm": 1.1308993101119995, + "learning_rate": 0.001, + "loss": 2.1198, + "step": 14046 + }, + { + "epoch": 0.5942550131144767, + "grad_norm": 0.16081666946411133, + "learning_rate": 0.001, + "loss": 1.9559, + "step": 14047 + }, + { + "epoch": 0.5942973178779931, + "grad_norm": 0.8098224997520447, + "learning_rate": 0.001, + "loss": 3.7266, + "step": 14048 + }, + { + "epoch": 0.5943396226415094, + "grad_norm": 6.794950008392334, + "learning_rate": 0.001, + "loss": 2.869, + "step": 14049 + }, + { + "epoch": 0.5943819274050258, + "grad_norm": 0.2231823056936264, + "learning_rate": 0.001, + "loss": 2.455, + "step": 14050 + }, + { + "epoch": 0.5944242321685422, + "grad_norm": 0.23370391130447388, + "learning_rate": 0.001, + "loss": 2.4393, + "step": 14051 + }, + { + "epoch": 0.5944665369320585, + "grad_norm": 0.300771564245224, + "learning_rate": 0.001, + "loss": 2.7126, + "step": 14052 + }, + { + "epoch": 0.5945088416955749, + "grad_norm": 0.9046683311462402, + "learning_rate": 0.001, + "loss": 3.1607, + "step": 14053 + }, + { + "epoch": 0.5945511464590912, + "grad_norm": 0.2239486575126648, + "learning_rate": 0.001, + "loss": 2.566, + "step": 14054 + }, + { + "epoch": 0.5945934512226076, + "grad_norm": 0.20858706533908844, + "learning_rate": 0.001, + "loss": 1.9689, + "step": 14055 + }, + { + "epoch": 0.5946357559861241, + "grad_norm": 0.24165865778923035, + "learning_rate": 0.001, + "loss": 2.2796, + "step": 14056 + }, + { + "epoch": 0.5946780607496404, + "grad_norm": 0.18632391095161438, + "learning_rate": 0.001, + "loss": 2.541, + "step": 14057 + }, + { + "epoch": 0.5947203655131568, + "grad_norm": 0.21690063178539276, + "learning_rate": 0.001, + "loss": 2.8571, + "step": 14058 + }, + { + "epoch": 0.5947626702766732, + "grad_norm": 0.2051665484905243, + "learning_rate": 0.001, + "loss": 3.3445, + "step": 14059 + }, + { + "epoch": 0.5948049750401895, + "grad_norm": 0.2277742475271225, + "learning_rate": 0.001, + "loss": 2.1127, + "step": 14060 + }, + { + "epoch": 0.5948472798037059, + "grad_norm": 0.16131742298603058, + "learning_rate": 0.001, + "loss": 2.1966, + "step": 14061 + }, + { + "epoch": 0.5948895845672223, + "grad_norm": 0.20708271861076355, + "learning_rate": 0.001, + "loss": 1.7722, + "step": 14062 + }, + { + "epoch": 0.5949318893307386, + "grad_norm": 0.17444756627082825, + "learning_rate": 0.001, + "loss": 2.1499, + "step": 14063 + }, + { + "epoch": 0.594974194094255, + "grad_norm": 0.8531075119972229, + "learning_rate": 0.001, + "loss": 2.6364, + "step": 14064 + }, + { + "epoch": 0.5950164988577714, + "grad_norm": 0.3456467092037201, + "learning_rate": 0.001, + "loss": 2.8462, + "step": 14065 + }, + { + "epoch": 0.5950588036212877, + "grad_norm": 0.2058996707201004, + "learning_rate": 0.001, + "loss": 1.8034, + "step": 14066 + }, + { + "epoch": 0.5951011083848041, + "grad_norm": 0.22642020881175995, + "learning_rate": 0.001, + "loss": 2.8038, + "step": 14067 + }, + { + "epoch": 0.5951434131483205, + "grad_norm": 4.5695719718933105, + "learning_rate": 0.001, + "loss": 2.0407, + "step": 14068 + }, + { + "epoch": 0.5951857179118368, + "grad_norm": 0.23832251131534576, + "learning_rate": 0.001, + "loss": 2.3427, + "step": 14069 + }, + { + "epoch": 0.5952280226753532, + "grad_norm": 0.19538003206253052, + "learning_rate": 0.001, + "loss": 3.9116, + "step": 14070 + }, + { + "epoch": 0.5952703274388697, + "grad_norm": 0.53928542137146, + "learning_rate": 0.001, + "loss": 1.8973, + "step": 14071 + }, + { + "epoch": 0.595312632202386, + "grad_norm": 0.26821160316467285, + "learning_rate": 0.001, + "loss": 2.5537, + "step": 14072 + }, + { + "epoch": 0.5953549369659024, + "grad_norm": 0.20947696268558502, + "learning_rate": 0.001, + "loss": 2.5992, + "step": 14073 + }, + { + "epoch": 0.5953972417294188, + "grad_norm": 0.2498164176940918, + "learning_rate": 0.001, + "loss": 2.0545, + "step": 14074 + }, + { + "epoch": 0.5954395464929351, + "grad_norm": 0.22430068254470825, + "learning_rate": 0.001, + "loss": 2.6415, + "step": 14075 + }, + { + "epoch": 0.5954818512564515, + "grad_norm": 0.39491817355155945, + "learning_rate": 0.001, + "loss": 2.3454, + "step": 14076 + }, + { + "epoch": 0.5955241560199679, + "grad_norm": 0.21011623740196228, + "learning_rate": 0.001, + "loss": 3.1634, + "step": 14077 + }, + { + "epoch": 0.5955664607834842, + "grad_norm": 0.20442980527877808, + "learning_rate": 0.001, + "loss": 2.2365, + "step": 14078 + }, + { + "epoch": 0.5956087655470006, + "grad_norm": 0.18820206820964813, + "learning_rate": 0.001, + "loss": 2.0868, + "step": 14079 + }, + { + "epoch": 0.595651070310517, + "grad_norm": 0.21796266734600067, + "learning_rate": 0.001, + "loss": 2.4075, + "step": 14080 + }, + { + "epoch": 0.5956933750740333, + "grad_norm": 0.16265445947647095, + "learning_rate": 0.001, + "loss": 2.2618, + "step": 14081 + }, + { + "epoch": 0.5957356798375497, + "grad_norm": 0.17943935096263885, + "learning_rate": 0.001, + "loss": 1.7516, + "step": 14082 + }, + { + "epoch": 0.5957779846010661, + "grad_norm": 0.3553915321826935, + "learning_rate": 0.001, + "loss": 1.9473, + "step": 14083 + }, + { + "epoch": 0.5958202893645824, + "grad_norm": 0.1593002825975418, + "learning_rate": 0.001, + "loss": 1.6759, + "step": 14084 + }, + { + "epoch": 0.5958625941280988, + "grad_norm": 0.18188300728797913, + "learning_rate": 0.001, + "loss": 2.2562, + "step": 14085 + }, + { + "epoch": 0.5959048988916152, + "grad_norm": 0.16794385015964508, + "learning_rate": 0.001, + "loss": 2.2645, + "step": 14086 + }, + { + "epoch": 0.5959472036551315, + "grad_norm": 0.16501161456108093, + "learning_rate": 0.001, + "loss": 2.7828, + "step": 14087 + }, + { + "epoch": 0.595989508418648, + "grad_norm": 0.22263823449611664, + "learning_rate": 0.001, + "loss": 2.0619, + "step": 14088 + }, + { + "epoch": 0.5960318131821644, + "grad_norm": 0.4712727963924408, + "learning_rate": 0.001, + "loss": 3.4105, + "step": 14089 + }, + { + "epoch": 0.5960741179456807, + "grad_norm": 0.19112852215766907, + "learning_rate": 0.001, + "loss": 2.1327, + "step": 14090 + }, + { + "epoch": 0.5961164227091971, + "grad_norm": 0.1422937512397766, + "learning_rate": 0.001, + "loss": 2.0298, + "step": 14091 + }, + { + "epoch": 0.5961587274727135, + "grad_norm": 0.14151273667812347, + "learning_rate": 0.001, + "loss": 1.967, + "step": 14092 + }, + { + "epoch": 0.5962010322362298, + "grad_norm": 0.1558687388896942, + "learning_rate": 0.001, + "loss": 2.4991, + "step": 14093 + }, + { + "epoch": 0.5962433369997462, + "grad_norm": 0.18624737858772278, + "learning_rate": 0.001, + "loss": 3.1451, + "step": 14094 + }, + { + "epoch": 0.5962856417632626, + "grad_norm": 0.15436244010925293, + "learning_rate": 0.001, + "loss": 2.2889, + "step": 14095 + }, + { + "epoch": 0.5963279465267789, + "grad_norm": 0.15433412790298462, + "learning_rate": 0.001, + "loss": 1.7584, + "step": 14096 + }, + { + "epoch": 0.5963702512902953, + "grad_norm": 0.24146470427513123, + "learning_rate": 0.001, + "loss": 3.3554, + "step": 14097 + }, + { + "epoch": 0.5964125560538116, + "grad_norm": 0.15554895997047424, + "learning_rate": 0.001, + "loss": 1.8556, + "step": 14098 + }, + { + "epoch": 0.596454860817328, + "grad_norm": 0.17149004340171814, + "learning_rate": 0.001, + "loss": 2.3022, + "step": 14099 + }, + { + "epoch": 0.5964971655808444, + "grad_norm": 0.2116393893957138, + "learning_rate": 0.001, + "loss": 2.0259, + "step": 14100 + }, + { + "epoch": 0.5965394703443607, + "grad_norm": 0.15278328955173492, + "learning_rate": 0.001, + "loss": 2.0596, + "step": 14101 + }, + { + "epoch": 0.5965817751078771, + "grad_norm": 0.14205598831176758, + "learning_rate": 0.001, + "loss": 2.5957, + "step": 14102 + }, + { + "epoch": 0.5966240798713935, + "grad_norm": 0.14854857325553894, + "learning_rate": 0.001, + "loss": 2.006, + "step": 14103 + }, + { + "epoch": 0.5966663846349098, + "grad_norm": 0.5435006022453308, + "learning_rate": 0.001, + "loss": 2.9184, + "step": 14104 + }, + { + "epoch": 0.5967086893984263, + "grad_norm": 0.15318304300308228, + "learning_rate": 0.001, + "loss": 2.3277, + "step": 14105 + }, + { + "epoch": 0.5967509941619427, + "grad_norm": 0.3546973168849945, + "learning_rate": 0.001, + "loss": 1.9591, + "step": 14106 + }, + { + "epoch": 0.596793298925459, + "grad_norm": 0.15295492112636566, + "learning_rate": 0.001, + "loss": 2.7593, + "step": 14107 + }, + { + "epoch": 0.5968356036889754, + "grad_norm": 0.13746538758277893, + "learning_rate": 0.001, + "loss": 2.6047, + "step": 14108 + }, + { + "epoch": 0.5968779084524918, + "grad_norm": 0.18742793798446655, + "learning_rate": 0.001, + "loss": 1.4895, + "step": 14109 + }, + { + "epoch": 0.5969202132160081, + "grad_norm": 0.28040406107902527, + "learning_rate": 0.001, + "loss": 3.4037, + "step": 14110 + }, + { + "epoch": 0.5969625179795245, + "grad_norm": 0.15165936946868896, + "learning_rate": 0.001, + "loss": 2.2845, + "step": 14111 + }, + { + "epoch": 0.5970048227430409, + "grad_norm": 0.1565185934305191, + "learning_rate": 0.001, + "loss": 2.1824, + "step": 14112 + }, + { + "epoch": 0.5970471275065572, + "grad_norm": 0.16770948469638824, + "learning_rate": 0.001, + "loss": 2.4891, + "step": 14113 + }, + { + "epoch": 0.5970894322700736, + "grad_norm": 0.14534291625022888, + "learning_rate": 0.001, + "loss": 2.0148, + "step": 14114 + }, + { + "epoch": 0.59713173703359, + "grad_norm": 0.157785564661026, + "learning_rate": 0.001, + "loss": 2.0671, + "step": 14115 + }, + { + "epoch": 0.5971740417971063, + "grad_norm": 0.15590552985668182, + "learning_rate": 0.001, + "loss": 3.3419, + "step": 14116 + }, + { + "epoch": 0.5972163465606227, + "grad_norm": 0.24124379456043243, + "learning_rate": 0.001, + "loss": 3.057, + "step": 14117 + }, + { + "epoch": 0.5972586513241391, + "grad_norm": 4.952909469604492, + "learning_rate": 0.001, + "loss": 2.278, + "step": 14118 + }, + { + "epoch": 0.5973009560876554, + "grad_norm": 0.3283090889453888, + "learning_rate": 0.001, + "loss": 2.4805, + "step": 14119 + }, + { + "epoch": 0.5973432608511718, + "grad_norm": 0.14943337440490723, + "learning_rate": 0.001, + "loss": 1.8459, + "step": 14120 + }, + { + "epoch": 0.5973855656146883, + "grad_norm": 0.1828925907611847, + "learning_rate": 0.001, + "loss": 2.2262, + "step": 14121 + }, + { + "epoch": 0.5974278703782046, + "grad_norm": 0.21698467433452606, + "learning_rate": 0.001, + "loss": 1.5742, + "step": 14122 + }, + { + "epoch": 0.597470175141721, + "grad_norm": 0.17111243307590485, + "learning_rate": 0.001, + "loss": 1.6804, + "step": 14123 + }, + { + "epoch": 0.5975124799052374, + "grad_norm": 0.1583365499973297, + "learning_rate": 0.001, + "loss": 1.6393, + "step": 14124 + }, + { + "epoch": 0.5975547846687537, + "grad_norm": 0.18235273659229279, + "learning_rate": 0.001, + "loss": 2.8478, + "step": 14125 + }, + { + "epoch": 0.5975970894322701, + "grad_norm": 0.16986776888370514, + "learning_rate": 0.001, + "loss": 1.6388, + "step": 14126 + }, + { + "epoch": 0.5976393941957865, + "grad_norm": 0.39957574009895325, + "learning_rate": 0.001, + "loss": 2.1505, + "step": 14127 + }, + { + "epoch": 0.5976816989593028, + "grad_norm": 0.15922756493091583, + "learning_rate": 0.001, + "loss": 1.7598, + "step": 14128 + }, + { + "epoch": 0.5977240037228192, + "grad_norm": 6.040116786956787, + "learning_rate": 0.001, + "loss": 3.9252, + "step": 14129 + }, + { + "epoch": 0.5977663084863356, + "grad_norm": 0.2572100758552551, + "learning_rate": 0.001, + "loss": 2.1899, + "step": 14130 + }, + { + "epoch": 0.5978086132498519, + "grad_norm": 7.241512298583984, + "learning_rate": 0.001, + "loss": 2.238, + "step": 14131 + }, + { + "epoch": 0.5978509180133683, + "grad_norm": 0.21074838936328888, + "learning_rate": 0.001, + "loss": 1.8177, + "step": 14132 + }, + { + "epoch": 0.5978932227768847, + "grad_norm": 0.20617999136447906, + "learning_rate": 0.001, + "loss": 2.7263, + "step": 14133 + }, + { + "epoch": 0.597935527540401, + "grad_norm": 8.84471321105957, + "learning_rate": 0.001, + "loss": 2.3485, + "step": 14134 + }, + { + "epoch": 0.5979778323039174, + "grad_norm": 0.21541795134544373, + "learning_rate": 0.001, + "loss": 2.4785, + "step": 14135 + }, + { + "epoch": 0.5980201370674338, + "grad_norm": 0.17976322770118713, + "learning_rate": 0.001, + "loss": 1.9674, + "step": 14136 + }, + { + "epoch": 0.5980624418309501, + "grad_norm": 0.18282859027385712, + "learning_rate": 0.001, + "loss": 2.8773, + "step": 14137 + }, + { + "epoch": 0.5981047465944666, + "grad_norm": 0.29796525835990906, + "learning_rate": 0.001, + "loss": 2.8154, + "step": 14138 + }, + { + "epoch": 0.598147051357983, + "grad_norm": 0.19228790700435638, + "learning_rate": 0.001, + "loss": 2.666, + "step": 14139 + }, + { + "epoch": 0.5981893561214993, + "grad_norm": 0.3605715334415436, + "learning_rate": 0.001, + "loss": 2.8384, + "step": 14140 + }, + { + "epoch": 0.5982316608850157, + "grad_norm": 0.28930771350860596, + "learning_rate": 0.001, + "loss": 1.6602, + "step": 14141 + }, + { + "epoch": 0.5982739656485321, + "grad_norm": 0.21113309264183044, + "learning_rate": 0.001, + "loss": 1.586, + "step": 14142 + }, + { + "epoch": 0.5983162704120484, + "grad_norm": 0.16728419065475464, + "learning_rate": 0.001, + "loss": 3.1558, + "step": 14143 + }, + { + "epoch": 0.5983585751755648, + "grad_norm": 0.19090363383293152, + "learning_rate": 0.001, + "loss": 2.0699, + "step": 14144 + }, + { + "epoch": 0.5984008799390811, + "grad_norm": 0.23593130707740784, + "learning_rate": 0.001, + "loss": 2.9676, + "step": 14145 + }, + { + "epoch": 0.5984431847025975, + "grad_norm": 0.1566046178340912, + "learning_rate": 0.001, + "loss": 1.873, + "step": 14146 + }, + { + "epoch": 0.5984854894661139, + "grad_norm": 0.14671561121940613, + "learning_rate": 0.001, + "loss": 1.6314, + "step": 14147 + }, + { + "epoch": 0.5985277942296302, + "grad_norm": 0.17109639942646027, + "learning_rate": 0.001, + "loss": 1.7268, + "step": 14148 + }, + { + "epoch": 0.5985700989931466, + "grad_norm": 0.13763944804668427, + "learning_rate": 0.001, + "loss": 2.2571, + "step": 14149 + }, + { + "epoch": 0.598612403756663, + "grad_norm": 0.14401958882808685, + "learning_rate": 0.001, + "loss": 1.5655, + "step": 14150 + }, + { + "epoch": 0.5986547085201793, + "grad_norm": 0.21010830998420715, + "learning_rate": 0.001, + "loss": 1.9039, + "step": 14151 + }, + { + "epoch": 0.5986970132836957, + "grad_norm": 1.928266167640686, + "learning_rate": 0.001, + "loss": 2.4406, + "step": 14152 + }, + { + "epoch": 0.5987393180472121, + "grad_norm": 0.1703515201807022, + "learning_rate": 0.001, + "loss": 2.6037, + "step": 14153 + }, + { + "epoch": 0.5987816228107284, + "grad_norm": 0.2860715687274933, + "learning_rate": 0.001, + "loss": 2.0231, + "step": 14154 + }, + { + "epoch": 0.5988239275742449, + "grad_norm": 0.19897747039794922, + "learning_rate": 0.001, + "loss": 2.8199, + "step": 14155 + }, + { + "epoch": 0.5988662323377613, + "grad_norm": 0.18251550197601318, + "learning_rate": 0.001, + "loss": 1.7923, + "step": 14156 + }, + { + "epoch": 0.5989085371012776, + "grad_norm": 1.5346297025680542, + "learning_rate": 0.001, + "loss": 1.7232, + "step": 14157 + }, + { + "epoch": 0.598950841864794, + "grad_norm": 0.16617514193058014, + "learning_rate": 0.001, + "loss": 1.4424, + "step": 14158 + }, + { + "epoch": 0.5989931466283104, + "grad_norm": 0.14995898306369781, + "learning_rate": 0.001, + "loss": 1.8629, + "step": 14159 + }, + { + "epoch": 0.5990354513918267, + "grad_norm": 0.1728651225566864, + "learning_rate": 0.001, + "loss": 2.0475, + "step": 14160 + }, + { + "epoch": 0.5990777561553431, + "grad_norm": 0.27386367321014404, + "learning_rate": 0.001, + "loss": 1.6863, + "step": 14161 + }, + { + "epoch": 0.5991200609188595, + "grad_norm": 0.21479547023773193, + "learning_rate": 0.001, + "loss": 1.8521, + "step": 14162 + }, + { + "epoch": 0.5991623656823758, + "grad_norm": 0.2000337839126587, + "learning_rate": 0.001, + "loss": 2.8076, + "step": 14163 + }, + { + "epoch": 0.5992046704458922, + "grad_norm": 0.16316631436347961, + "learning_rate": 0.001, + "loss": 3.3434, + "step": 14164 + }, + { + "epoch": 0.5992469752094086, + "grad_norm": 0.18021291494369507, + "learning_rate": 0.001, + "loss": 2.0387, + "step": 14165 + }, + { + "epoch": 0.5992892799729249, + "grad_norm": 0.24870742857456207, + "learning_rate": 0.001, + "loss": 2.6437, + "step": 14166 + }, + { + "epoch": 0.5993315847364413, + "grad_norm": 0.17932534217834473, + "learning_rate": 0.001, + "loss": 2.412, + "step": 14167 + }, + { + "epoch": 0.5993738894999577, + "grad_norm": 0.22996637225151062, + "learning_rate": 0.001, + "loss": 3.117, + "step": 14168 + }, + { + "epoch": 0.599416194263474, + "grad_norm": 13.875000953674316, + "learning_rate": 0.001, + "loss": 2.2331, + "step": 14169 + }, + { + "epoch": 0.5994584990269904, + "grad_norm": 0.8741011619567871, + "learning_rate": 0.001, + "loss": 2.2017, + "step": 14170 + }, + { + "epoch": 0.5995008037905069, + "grad_norm": 0.3972923159599304, + "learning_rate": 0.001, + "loss": 2.4348, + "step": 14171 + }, + { + "epoch": 0.5995431085540232, + "grad_norm": 0.18050576746463776, + "learning_rate": 0.001, + "loss": 2.2719, + "step": 14172 + }, + { + "epoch": 0.5995854133175396, + "grad_norm": 0.6231819987297058, + "learning_rate": 0.001, + "loss": 1.9272, + "step": 14173 + }, + { + "epoch": 0.599627718081056, + "grad_norm": 0.2706356942653656, + "learning_rate": 0.001, + "loss": 1.8771, + "step": 14174 + }, + { + "epoch": 0.5996700228445723, + "grad_norm": 0.2066756784915924, + "learning_rate": 0.001, + "loss": 1.9065, + "step": 14175 + }, + { + "epoch": 0.5997123276080887, + "grad_norm": 0.2722444236278534, + "learning_rate": 0.001, + "loss": 2.5702, + "step": 14176 + }, + { + "epoch": 0.5997546323716051, + "grad_norm": 0.18113920092582703, + "learning_rate": 0.001, + "loss": 1.8397, + "step": 14177 + }, + { + "epoch": 0.5997969371351214, + "grad_norm": 0.2111581563949585, + "learning_rate": 0.001, + "loss": 2.0402, + "step": 14178 + }, + { + "epoch": 0.5998392418986378, + "grad_norm": 0.1754538118839264, + "learning_rate": 0.001, + "loss": 1.4719, + "step": 14179 + }, + { + "epoch": 0.5998815466621542, + "grad_norm": 0.15258760750293732, + "learning_rate": 0.001, + "loss": 2.0344, + "step": 14180 + }, + { + "epoch": 0.5999238514256705, + "grad_norm": 0.18332518637180328, + "learning_rate": 0.001, + "loss": 2.2107, + "step": 14181 + }, + { + "epoch": 0.5999661561891869, + "grad_norm": 0.23446984589099884, + "learning_rate": 0.001, + "loss": 2.8876, + "step": 14182 + }, + { + "epoch": 0.6000084609527033, + "grad_norm": 0.20823650062084198, + "learning_rate": 0.001, + "loss": 2.5374, + "step": 14183 + }, + { + "epoch": 0.6000507657162196, + "grad_norm": 0.16053082048892975, + "learning_rate": 0.001, + "loss": 2.1142, + "step": 14184 + }, + { + "epoch": 0.600093070479736, + "grad_norm": 0.2412220984697342, + "learning_rate": 0.001, + "loss": 3.0947, + "step": 14185 + }, + { + "epoch": 0.6001353752432524, + "grad_norm": 0.17646367847919464, + "learning_rate": 0.001, + "loss": 2.4277, + "step": 14186 + }, + { + "epoch": 0.6001776800067687, + "grad_norm": 0.16096143424510956, + "learning_rate": 0.001, + "loss": 2.2449, + "step": 14187 + }, + { + "epoch": 0.6002199847702852, + "grad_norm": 0.2696020007133484, + "learning_rate": 0.001, + "loss": 1.9888, + "step": 14188 + }, + { + "epoch": 0.6002622895338015, + "grad_norm": 0.14469857513904572, + "learning_rate": 0.001, + "loss": 2.14, + "step": 14189 + }, + { + "epoch": 0.6003045942973179, + "grad_norm": 0.17882224917411804, + "learning_rate": 0.001, + "loss": 2.0161, + "step": 14190 + }, + { + "epoch": 0.6003468990608343, + "grad_norm": 0.1592990756034851, + "learning_rate": 0.001, + "loss": 2.067, + "step": 14191 + }, + { + "epoch": 0.6003892038243506, + "grad_norm": 0.8799763321876526, + "learning_rate": 0.001, + "loss": 2.3578, + "step": 14192 + }, + { + "epoch": 0.600431508587867, + "grad_norm": 0.1751909852027893, + "learning_rate": 0.001, + "loss": 1.8269, + "step": 14193 + }, + { + "epoch": 0.6004738133513834, + "grad_norm": 0.22344686090946198, + "learning_rate": 0.001, + "loss": 2.8203, + "step": 14194 + }, + { + "epoch": 0.6005161181148997, + "grad_norm": 0.19454553723335266, + "learning_rate": 0.001, + "loss": 2.5996, + "step": 14195 + }, + { + "epoch": 0.6005584228784161, + "grad_norm": 0.16790246963500977, + "learning_rate": 0.001, + "loss": 2.8472, + "step": 14196 + }, + { + "epoch": 0.6006007276419325, + "grad_norm": 0.16107873618602753, + "learning_rate": 0.001, + "loss": 1.7427, + "step": 14197 + }, + { + "epoch": 0.6006430324054488, + "grad_norm": 22.60145378112793, + "learning_rate": 0.001, + "loss": 1.6772, + "step": 14198 + }, + { + "epoch": 0.6006853371689652, + "grad_norm": 0.16331073641777039, + "learning_rate": 0.001, + "loss": 2.0363, + "step": 14199 + }, + { + "epoch": 0.6007276419324816, + "grad_norm": 1.7857904434204102, + "learning_rate": 0.001, + "loss": 1.7835, + "step": 14200 + }, + { + "epoch": 0.6007699466959979, + "grad_norm": 0.1780909299850464, + "learning_rate": 0.001, + "loss": 2.9103, + "step": 14201 + }, + { + "epoch": 0.6008122514595143, + "grad_norm": 0.3058990240097046, + "learning_rate": 0.001, + "loss": 2.3492, + "step": 14202 + }, + { + "epoch": 0.6008545562230307, + "grad_norm": 0.1861615926027298, + "learning_rate": 0.001, + "loss": 2.2742, + "step": 14203 + }, + { + "epoch": 0.600896860986547, + "grad_norm": 0.21202732622623444, + "learning_rate": 0.001, + "loss": 1.6728, + "step": 14204 + }, + { + "epoch": 0.6009391657500635, + "grad_norm": 0.17166270315647125, + "learning_rate": 0.001, + "loss": 2.293, + "step": 14205 + }, + { + "epoch": 0.6009814705135799, + "grad_norm": 0.1747630089521408, + "learning_rate": 0.001, + "loss": 3.0309, + "step": 14206 + }, + { + "epoch": 0.6010237752770962, + "grad_norm": 0.1900450885295868, + "learning_rate": 0.001, + "loss": 1.6363, + "step": 14207 + }, + { + "epoch": 0.6010660800406126, + "grad_norm": 6.871549129486084, + "learning_rate": 0.001, + "loss": 1.5597, + "step": 14208 + }, + { + "epoch": 0.601108384804129, + "grad_norm": 0.20221640169620514, + "learning_rate": 0.001, + "loss": 1.6391, + "step": 14209 + }, + { + "epoch": 0.6011506895676453, + "grad_norm": 0.5754346251487732, + "learning_rate": 0.001, + "loss": 4.2668, + "step": 14210 + }, + { + "epoch": 0.6011929943311617, + "grad_norm": 0.2078094184398651, + "learning_rate": 0.001, + "loss": 1.8344, + "step": 14211 + }, + { + "epoch": 0.6012352990946781, + "grad_norm": 0.19465067982673645, + "learning_rate": 0.001, + "loss": 2.707, + "step": 14212 + }, + { + "epoch": 0.6012776038581944, + "grad_norm": 0.1583552062511444, + "learning_rate": 0.001, + "loss": 1.791, + "step": 14213 + }, + { + "epoch": 0.6013199086217108, + "grad_norm": 0.6577909588813782, + "learning_rate": 0.001, + "loss": 1.8294, + "step": 14214 + }, + { + "epoch": 0.6013622133852272, + "grad_norm": 0.16228260099887848, + "learning_rate": 0.001, + "loss": 2.0963, + "step": 14215 + }, + { + "epoch": 0.6014045181487435, + "grad_norm": 0.21815434098243713, + "learning_rate": 0.001, + "loss": 1.9625, + "step": 14216 + }, + { + "epoch": 0.6014468229122599, + "grad_norm": 0.168915793299675, + "learning_rate": 0.001, + "loss": 1.7163, + "step": 14217 + }, + { + "epoch": 0.6014891276757763, + "grad_norm": 0.21465511620044708, + "learning_rate": 0.001, + "loss": 2.9348, + "step": 14218 + }, + { + "epoch": 0.6015314324392926, + "grad_norm": 1.094494104385376, + "learning_rate": 0.001, + "loss": 2.2373, + "step": 14219 + }, + { + "epoch": 0.601573737202809, + "grad_norm": 0.21323548257350922, + "learning_rate": 0.001, + "loss": 2.9424, + "step": 14220 + }, + { + "epoch": 0.6016160419663255, + "grad_norm": 4.479442596435547, + "learning_rate": 0.001, + "loss": 1.9656, + "step": 14221 + }, + { + "epoch": 0.6016583467298418, + "grad_norm": 0.13365942239761353, + "learning_rate": 0.001, + "loss": 1.264, + "step": 14222 + }, + { + "epoch": 0.6017006514933582, + "grad_norm": 1.7394766807556152, + "learning_rate": 0.001, + "loss": 2.1951, + "step": 14223 + }, + { + "epoch": 0.6017429562568746, + "grad_norm": 3.335594415664673, + "learning_rate": 0.001, + "loss": 2.4577, + "step": 14224 + }, + { + "epoch": 0.6017852610203909, + "grad_norm": 0.574901282787323, + "learning_rate": 0.001, + "loss": 2.5492, + "step": 14225 + }, + { + "epoch": 0.6018275657839073, + "grad_norm": 0.16462647914886475, + "learning_rate": 0.001, + "loss": 2.7077, + "step": 14226 + }, + { + "epoch": 0.6018698705474237, + "grad_norm": 0.36625564098358154, + "learning_rate": 0.001, + "loss": 2.6481, + "step": 14227 + }, + { + "epoch": 0.60191217531094, + "grad_norm": 0.33818289637565613, + "learning_rate": 0.001, + "loss": 2.2215, + "step": 14228 + }, + { + "epoch": 0.6019544800744564, + "grad_norm": 0.18577156960964203, + "learning_rate": 0.001, + "loss": 2.0114, + "step": 14229 + }, + { + "epoch": 0.6019967848379728, + "grad_norm": 0.14699004590511322, + "learning_rate": 0.001, + "loss": 1.6804, + "step": 14230 + }, + { + "epoch": 0.6020390896014891, + "grad_norm": 0.17045558989048004, + "learning_rate": 0.001, + "loss": 2.3435, + "step": 14231 + }, + { + "epoch": 0.6020813943650055, + "grad_norm": 0.1529216319322586, + "learning_rate": 0.001, + "loss": 2.1237, + "step": 14232 + }, + { + "epoch": 0.6021236991285218, + "grad_norm": 0.1529701054096222, + "learning_rate": 0.001, + "loss": 3.5144, + "step": 14233 + }, + { + "epoch": 0.6021660038920382, + "grad_norm": 0.18276235461235046, + "learning_rate": 0.001, + "loss": 2.1834, + "step": 14234 + }, + { + "epoch": 0.6022083086555546, + "grad_norm": 0.2533540725708008, + "learning_rate": 0.001, + "loss": 3.0911, + "step": 14235 + }, + { + "epoch": 0.6022506134190709, + "grad_norm": 0.1999509334564209, + "learning_rate": 0.001, + "loss": 1.7524, + "step": 14236 + }, + { + "epoch": 0.6022929181825873, + "grad_norm": 0.1922556310892105, + "learning_rate": 0.001, + "loss": 2.1238, + "step": 14237 + }, + { + "epoch": 0.6023352229461038, + "grad_norm": 1.1301320791244507, + "learning_rate": 0.001, + "loss": 1.7954, + "step": 14238 + }, + { + "epoch": 0.60237752770962, + "grad_norm": 0.14961791038513184, + "learning_rate": 0.001, + "loss": 1.6322, + "step": 14239 + }, + { + "epoch": 0.6024198324731365, + "grad_norm": 0.5575346350669861, + "learning_rate": 0.001, + "loss": 2.2569, + "step": 14240 + }, + { + "epoch": 0.6024621372366529, + "grad_norm": 0.19425736367702484, + "learning_rate": 0.001, + "loss": 1.8248, + "step": 14241 + }, + { + "epoch": 0.6025044420001692, + "grad_norm": 0.1504928469657898, + "learning_rate": 0.001, + "loss": 2.2506, + "step": 14242 + }, + { + "epoch": 0.6025467467636856, + "grad_norm": 1.183660626411438, + "learning_rate": 0.001, + "loss": 1.9673, + "step": 14243 + }, + { + "epoch": 0.602589051527202, + "grad_norm": 0.3906610608100891, + "learning_rate": 0.001, + "loss": 2.8927, + "step": 14244 + }, + { + "epoch": 0.6026313562907183, + "grad_norm": 0.20628994703292847, + "learning_rate": 0.001, + "loss": 2.1268, + "step": 14245 + }, + { + "epoch": 0.6026736610542347, + "grad_norm": 4.261792182922363, + "learning_rate": 0.001, + "loss": 4.2053, + "step": 14246 + }, + { + "epoch": 0.6027159658177511, + "grad_norm": 0.15113836526870728, + "learning_rate": 0.001, + "loss": 1.5448, + "step": 14247 + }, + { + "epoch": 0.6027582705812674, + "grad_norm": 0.20235134661197662, + "learning_rate": 0.001, + "loss": 2.3315, + "step": 14248 + }, + { + "epoch": 0.6028005753447838, + "grad_norm": 0.15247638523578644, + "learning_rate": 0.001, + "loss": 2.15, + "step": 14249 + }, + { + "epoch": 0.6028428801083002, + "grad_norm": 0.18794965744018555, + "learning_rate": 0.001, + "loss": 2.3342, + "step": 14250 + }, + { + "epoch": 0.6028851848718165, + "grad_norm": 4.161924839019775, + "learning_rate": 0.001, + "loss": 1.9379, + "step": 14251 + }, + { + "epoch": 0.6029274896353329, + "grad_norm": 0.1974867433309555, + "learning_rate": 0.001, + "loss": 2.1114, + "step": 14252 + }, + { + "epoch": 0.6029697943988493, + "grad_norm": 0.15145453810691833, + "learning_rate": 0.001, + "loss": 2.2004, + "step": 14253 + }, + { + "epoch": 0.6030120991623656, + "grad_norm": 0.1826041340827942, + "learning_rate": 0.001, + "loss": 1.8654, + "step": 14254 + }, + { + "epoch": 0.603054403925882, + "grad_norm": 0.33202704787254333, + "learning_rate": 0.001, + "loss": 2.8525, + "step": 14255 + }, + { + "epoch": 0.6030967086893985, + "grad_norm": 0.17979615926742554, + "learning_rate": 0.001, + "loss": 2.1171, + "step": 14256 + }, + { + "epoch": 0.6031390134529148, + "grad_norm": 0.33477500081062317, + "learning_rate": 0.001, + "loss": 2.4972, + "step": 14257 + }, + { + "epoch": 0.6031813182164312, + "grad_norm": 0.1628679484128952, + "learning_rate": 0.001, + "loss": 2.3023, + "step": 14258 + }, + { + "epoch": 0.6032236229799476, + "grad_norm": 0.15380176901817322, + "learning_rate": 0.001, + "loss": 1.6094, + "step": 14259 + }, + { + "epoch": 0.6032659277434639, + "grad_norm": 0.15169599652290344, + "learning_rate": 0.001, + "loss": 1.3153, + "step": 14260 + }, + { + "epoch": 0.6033082325069803, + "grad_norm": 1.1661324501037598, + "learning_rate": 0.001, + "loss": 1.8356, + "step": 14261 + }, + { + "epoch": 0.6033505372704967, + "grad_norm": 0.14348934590816498, + "learning_rate": 0.001, + "loss": 1.4043, + "step": 14262 + }, + { + "epoch": 0.603392842034013, + "grad_norm": 0.35176733136177063, + "learning_rate": 0.001, + "loss": 2.327, + "step": 14263 + }, + { + "epoch": 0.6034351467975294, + "grad_norm": 2.9565463066101074, + "learning_rate": 0.001, + "loss": 2.2605, + "step": 14264 + }, + { + "epoch": 0.6034774515610458, + "grad_norm": 0.20699867606163025, + "learning_rate": 0.001, + "loss": 2.5949, + "step": 14265 + }, + { + "epoch": 0.6035197563245621, + "grad_norm": 0.6101794242858887, + "learning_rate": 0.001, + "loss": 2.4005, + "step": 14266 + }, + { + "epoch": 0.6035620610880785, + "grad_norm": 0.2099284827709198, + "learning_rate": 0.001, + "loss": 1.7885, + "step": 14267 + }, + { + "epoch": 0.6036043658515949, + "grad_norm": 0.20473699271678925, + "learning_rate": 0.001, + "loss": 2.0055, + "step": 14268 + }, + { + "epoch": 0.6036466706151112, + "grad_norm": 0.2228740006685257, + "learning_rate": 0.001, + "loss": 2.4903, + "step": 14269 + }, + { + "epoch": 0.6036889753786276, + "grad_norm": 0.6952749490737915, + "learning_rate": 0.001, + "loss": 2.8113, + "step": 14270 + }, + { + "epoch": 0.6037312801421441, + "grad_norm": 0.16827528178691864, + "learning_rate": 0.001, + "loss": 1.7197, + "step": 14271 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 0.23891712725162506, + "learning_rate": 0.001, + "loss": 2.2606, + "step": 14272 + }, + { + "epoch": 0.6038158896691768, + "grad_norm": 0.220239520072937, + "learning_rate": 0.001, + "loss": 1.4048, + "step": 14273 + }, + { + "epoch": 0.6038581944326932, + "grad_norm": 0.16221195459365845, + "learning_rate": 0.001, + "loss": 1.7971, + "step": 14274 + }, + { + "epoch": 0.6039004991962095, + "grad_norm": 0.31431055068969727, + "learning_rate": 0.001, + "loss": 3.3171, + "step": 14275 + }, + { + "epoch": 0.6039428039597259, + "grad_norm": 1.185067057609558, + "learning_rate": 0.001, + "loss": 2.7166, + "step": 14276 + }, + { + "epoch": 0.6039851087232423, + "grad_norm": 0.15773341059684753, + "learning_rate": 0.001, + "loss": 2.0959, + "step": 14277 + }, + { + "epoch": 0.6040274134867586, + "grad_norm": 0.16446535289287567, + "learning_rate": 0.001, + "loss": 1.6047, + "step": 14278 + }, + { + "epoch": 0.604069718250275, + "grad_norm": 0.20500445365905762, + "learning_rate": 0.001, + "loss": 2.4833, + "step": 14279 + }, + { + "epoch": 0.6041120230137913, + "grad_norm": 0.1654626429080963, + "learning_rate": 0.001, + "loss": 1.7549, + "step": 14280 + }, + { + "epoch": 0.6041543277773077, + "grad_norm": 0.16922923922538757, + "learning_rate": 0.001, + "loss": 2.0515, + "step": 14281 + }, + { + "epoch": 0.6041966325408241, + "grad_norm": 0.17011629045009613, + "learning_rate": 0.001, + "loss": 2.5793, + "step": 14282 + }, + { + "epoch": 0.6042389373043404, + "grad_norm": 0.25761881470680237, + "learning_rate": 0.001, + "loss": 1.9069, + "step": 14283 + }, + { + "epoch": 0.6042812420678568, + "grad_norm": 0.18269698321819305, + "learning_rate": 0.001, + "loss": 3.0221, + "step": 14284 + }, + { + "epoch": 0.6043235468313732, + "grad_norm": 85.4771728515625, + "learning_rate": 0.001, + "loss": 2.3729, + "step": 14285 + }, + { + "epoch": 0.6043658515948895, + "grad_norm": 0.7515902519226074, + "learning_rate": 0.001, + "loss": 2.3315, + "step": 14286 + }, + { + "epoch": 0.604408156358406, + "grad_norm": 0.2449134737253189, + "learning_rate": 0.001, + "loss": 1.72, + "step": 14287 + }, + { + "epoch": 0.6044504611219224, + "grad_norm": 0.15557745099067688, + "learning_rate": 0.001, + "loss": 1.9485, + "step": 14288 + }, + { + "epoch": 0.6044927658854387, + "grad_norm": 0.18973484635353088, + "learning_rate": 0.001, + "loss": 2.1576, + "step": 14289 + }, + { + "epoch": 0.6045350706489551, + "grad_norm": 0.2871471643447876, + "learning_rate": 0.001, + "loss": 1.7434, + "step": 14290 + }, + { + "epoch": 0.6045773754124715, + "grad_norm": 0.4421120882034302, + "learning_rate": 0.001, + "loss": 3.4448, + "step": 14291 + }, + { + "epoch": 0.6046196801759878, + "grad_norm": 0.15245676040649414, + "learning_rate": 0.001, + "loss": 1.6375, + "step": 14292 + }, + { + "epoch": 0.6046619849395042, + "grad_norm": 0.22273124754428864, + "learning_rate": 0.001, + "loss": 3.4129, + "step": 14293 + }, + { + "epoch": 0.6047042897030206, + "grad_norm": 0.1912602186203003, + "learning_rate": 0.001, + "loss": 2.368, + "step": 14294 + }, + { + "epoch": 0.6047465944665369, + "grad_norm": 0.18390634655952454, + "learning_rate": 0.001, + "loss": 2.33, + "step": 14295 + }, + { + "epoch": 0.6047888992300533, + "grad_norm": 0.2985355854034424, + "learning_rate": 0.001, + "loss": 2.6485, + "step": 14296 + }, + { + "epoch": 0.6048312039935697, + "grad_norm": 0.1954064518213272, + "learning_rate": 0.001, + "loss": 2.0261, + "step": 14297 + }, + { + "epoch": 0.604873508757086, + "grad_norm": 0.16802330315113068, + "learning_rate": 0.001, + "loss": 2.1737, + "step": 14298 + }, + { + "epoch": 0.6049158135206024, + "grad_norm": 0.1490180343389511, + "learning_rate": 0.001, + "loss": 1.6823, + "step": 14299 + }, + { + "epoch": 0.6049581182841188, + "grad_norm": 0.1882055103778839, + "learning_rate": 0.001, + "loss": 2.7867, + "step": 14300 + }, + { + "epoch": 0.6050004230476351, + "grad_norm": 0.18217292428016663, + "learning_rate": 0.001, + "loss": 2.2182, + "step": 14301 + }, + { + "epoch": 0.6050427278111515, + "grad_norm": 0.16061586141586304, + "learning_rate": 0.001, + "loss": 1.2033, + "step": 14302 + }, + { + "epoch": 0.605085032574668, + "grad_norm": 0.17680132389068604, + "learning_rate": 0.001, + "loss": 2.3674, + "step": 14303 + }, + { + "epoch": 0.6051273373381842, + "grad_norm": 0.21177402138710022, + "learning_rate": 0.001, + "loss": 2.0609, + "step": 14304 + }, + { + "epoch": 0.6051696421017007, + "grad_norm": 0.8895161151885986, + "learning_rate": 0.001, + "loss": 2.1776, + "step": 14305 + }, + { + "epoch": 0.6052119468652171, + "grad_norm": 0.18167245388031006, + "learning_rate": 0.001, + "loss": 1.8625, + "step": 14306 + }, + { + "epoch": 0.6052542516287334, + "grad_norm": 7.46832799911499, + "learning_rate": 0.001, + "loss": 1.9056, + "step": 14307 + }, + { + "epoch": 0.6052965563922498, + "grad_norm": 0.17370212078094482, + "learning_rate": 0.001, + "loss": 2.3242, + "step": 14308 + }, + { + "epoch": 0.6053388611557662, + "grad_norm": 0.1907428652048111, + "learning_rate": 0.001, + "loss": 2.38, + "step": 14309 + }, + { + "epoch": 0.6053811659192825, + "grad_norm": 0.5028619766235352, + "learning_rate": 0.001, + "loss": 2.3635, + "step": 14310 + }, + { + "epoch": 0.6054234706827989, + "grad_norm": 0.15247510373592377, + "learning_rate": 0.001, + "loss": 1.7588, + "step": 14311 + }, + { + "epoch": 0.6054657754463153, + "grad_norm": 0.2253142148256302, + "learning_rate": 0.001, + "loss": 2.7246, + "step": 14312 + }, + { + "epoch": 0.6055080802098316, + "grad_norm": 0.2002718299627304, + "learning_rate": 0.001, + "loss": 2.9367, + "step": 14313 + }, + { + "epoch": 0.605550384973348, + "grad_norm": 1.097982406616211, + "learning_rate": 0.001, + "loss": 1.6434, + "step": 14314 + }, + { + "epoch": 0.6055926897368644, + "grad_norm": 0.20762449502944946, + "learning_rate": 0.001, + "loss": 2.281, + "step": 14315 + }, + { + "epoch": 0.6056349945003807, + "grad_norm": 0.8287076950073242, + "learning_rate": 0.001, + "loss": 2.1989, + "step": 14316 + }, + { + "epoch": 0.6056772992638971, + "grad_norm": 0.19646909832954407, + "learning_rate": 0.001, + "loss": 3.1902, + "step": 14317 + }, + { + "epoch": 0.6057196040274135, + "grad_norm": 0.21229322254657745, + "learning_rate": 0.001, + "loss": 2.3791, + "step": 14318 + }, + { + "epoch": 0.6057619087909298, + "grad_norm": 0.216338649392128, + "learning_rate": 0.001, + "loss": 2.6643, + "step": 14319 + }, + { + "epoch": 0.6058042135544462, + "grad_norm": 0.177630215883255, + "learning_rate": 0.001, + "loss": 3.0275, + "step": 14320 + }, + { + "epoch": 0.6058465183179627, + "grad_norm": 0.173061802983284, + "learning_rate": 0.001, + "loss": 1.5222, + "step": 14321 + }, + { + "epoch": 0.605888823081479, + "grad_norm": 0.2353350967168808, + "learning_rate": 0.001, + "loss": 2.1331, + "step": 14322 + }, + { + "epoch": 0.6059311278449954, + "grad_norm": 0.1542205810546875, + "learning_rate": 0.001, + "loss": 1.6273, + "step": 14323 + }, + { + "epoch": 0.6059734326085117, + "grad_norm": 0.14759258925914764, + "learning_rate": 0.001, + "loss": 2.9241, + "step": 14324 + }, + { + "epoch": 0.6060157373720281, + "grad_norm": 1.2428350448608398, + "learning_rate": 0.001, + "loss": 1.851, + "step": 14325 + }, + { + "epoch": 0.6060580421355445, + "grad_norm": 0.3622968792915344, + "learning_rate": 0.001, + "loss": 2.0143, + "step": 14326 + }, + { + "epoch": 0.6061003468990608, + "grad_norm": 0.1762288361787796, + "learning_rate": 0.001, + "loss": 2.4346, + "step": 14327 + }, + { + "epoch": 0.6061426516625772, + "grad_norm": 0.24503590166568756, + "learning_rate": 0.001, + "loss": 1.8936, + "step": 14328 + }, + { + "epoch": 0.6061849564260936, + "grad_norm": 0.16428449749946594, + "learning_rate": 0.001, + "loss": 1.829, + "step": 14329 + }, + { + "epoch": 0.6062272611896099, + "grad_norm": 0.16530753672122955, + "learning_rate": 0.001, + "loss": 2.2527, + "step": 14330 + }, + { + "epoch": 0.6062695659531263, + "grad_norm": 0.20797306299209595, + "learning_rate": 0.001, + "loss": 2.2176, + "step": 14331 + }, + { + "epoch": 0.6063118707166427, + "grad_norm": 0.27849939465522766, + "learning_rate": 0.001, + "loss": 2.2585, + "step": 14332 + }, + { + "epoch": 0.606354175480159, + "grad_norm": 1.6591880321502686, + "learning_rate": 0.001, + "loss": 3.7181, + "step": 14333 + }, + { + "epoch": 0.6063964802436754, + "grad_norm": 0.15750765800476074, + "learning_rate": 0.001, + "loss": 2.2277, + "step": 14334 + }, + { + "epoch": 0.6064387850071918, + "grad_norm": 0.5017919540405273, + "learning_rate": 0.001, + "loss": 2.991, + "step": 14335 + }, + { + "epoch": 0.6064810897707081, + "grad_norm": 0.37937721610069275, + "learning_rate": 0.001, + "loss": 2.2904, + "step": 14336 + }, + { + "epoch": 0.6065233945342245, + "grad_norm": 1.156523585319519, + "learning_rate": 0.001, + "loss": 2.0803, + "step": 14337 + }, + { + "epoch": 0.606565699297741, + "grad_norm": 0.26391497254371643, + "learning_rate": 0.001, + "loss": 2.2633, + "step": 14338 + }, + { + "epoch": 0.6066080040612573, + "grad_norm": 0.17540274560451508, + "learning_rate": 0.001, + "loss": 2.4428, + "step": 14339 + }, + { + "epoch": 0.6066503088247737, + "grad_norm": 0.1678505390882492, + "learning_rate": 0.001, + "loss": 2.1558, + "step": 14340 + }, + { + "epoch": 0.6066926135882901, + "grad_norm": 0.1621529459953308, + "learning_rate": 0.001, + "loss": 2.3497, + "step": 14341 + }, + { + "epoch": 0.6067349183518064, + "grad_norm": 0.2335866242647171, + "learning_rate": 0.001, + "loss": 2.0897, + "step": 14342 + }, + { + "epoch": 0.6067772231153228, + "grad_norm": 0.2002691626548767, + "learning_rate": 0.001, + "loss": 2.613, + "step": 14343 + }, + { + "epoch": 0.6068195278788392, + "grad_norm": 13.141894340515137, + "learning_rate": 0.001, + "loss": 1.9083, + "step": 14344 + }, + { + "epoch": 0.6068618326423555, + "grad_norm": 0.19172821938991547, + "learning_rate": 0.001, + "loss": 1.8666, + "step": 14345 + }, + { + "epoch": 0.6069041374058719, + "grad_norm": 0.2590673267841339, + "learning_rate": 0.001, + "loss": 1.9539, + "step": 14346 + }, + { + "epoch": 0.6069464421693883, + "grad_norm": 0.17630375921726227, + "learning_rate": 0.001, + "loss": 2.6305, + "step": 14347 + }, + { + "epoch": 0.6069887469329046, + "grad_norm": 0.14956524968147278, + "learning_rate": 0.001, + "loss": 2.8753, + "step": 14348 + }, + { + "epoch": 0.607031051696421, + "grad_norm": 0.1801750808954239, + "learning_rate": 0.001, + "loss": 1.9418, + "step": 14349 + }, + { + "epoch": 0.6070733564599374, + "grad_norm": 0.1958351731300354, + "learning_rate": 0.001, + "loss": 3.2227, + "step": 14350 + }, + { + "epoch": 0.6071156612234537, + "grad_norm": 0.24762704968452454, + "learning_rate": 0.001, + "loss": 2.6021, + "step": 14351 + }, + { + "epoch": 0.6071579659869701, + "grad_norm": 0.19234824180603027, + "learning_rate": 0.001, + "loss": 3.2872, + "step": 14352 + }, + { + "epoch": 0.6072002707504865, + "grad_norm": 0.1839028298854828, + "learning_rate": 0.001, + "loss": 2.8299, + "step": 14353 + }, + { + "epoch": 0.6072425755140028, + "grad_norm": 0.18792131543159485, + "learning_rate": 0.001, + "loss": 1.9494, + "step": 14354 + }, + { + "epoch": 0.6072848802775193, + "grad_norm": 0.14730499684810638, + "learning_rate": 0.001, + "loss": 1.4713, + "step": 14355 + }, + { + "epoch": 0.6073271850410357, + "grad_norm": 0.5098087787628174, + "learning_rate": 0.001, + "loss": 1.9097, + "step": 14356 + }, + { + "epoch": 0.607369489804552, + "grad_norm": 0.15701937675476074, + "learning_rate": 0.001, + "loss": 2.2055, + "step": 14357 + }, + { + "epoch": 0.6074117945680684, + "grad_norm": 0.16516199707984924, + "learning_rate": 0.001, + "loss": 2.4988, + "step": 14358 + }, + { + "epoch": 0.6074540993315848, + "grad_norm": 0.2014375776052475, + "learning_rate": 0.001, + "loss": 2.3594, + "step": 14359 + }, + { + "epoch": 0.6074964040951011, + "grad_norm": 0.17397524416446686, + "learning_rate": 0.001, + "loss": 2.3691, + "step": 14360 + }, + { + "epoch": 0.6075387088586175, + "grad_norm": 8.400336265563965, + "learning_rate": 0.001, + "loss": 3.4756, + "step": 14361 + }, + { + "epoch": 0.6075810136221339, + "grad_norm": 0.18245413899421692, + "learning_rate": 0.001, + "loss": 1.7794, + "step": 14362 + }, + { + "epoch": 0.6076233183856502, + "grad_norm": 0.19193901121616364, + "learning_rate": 0.001, + "loss": 1.7134, + "step": 14363 + }, + { + "epoch": 0.6076656231491666, + "grad_norm": 0.19857852160930634, + "learning_rate": 0.001, + "loss": 2.7933, + "step": 14364 + }, + { + "epoch": 0.607707927912683, + "grad_norm": 0.25610992312431335, + "learning_rate": 0.001, + "loss": 1.4993, + "step": 14365 + }, + { + "epoch": 0.6077502326761993, + "grad_norm": 0.268586128950119, + "learning_rate": 0.001, + "loss": 3.1649, + "step": 14366 + }, + { + "epoch": 0.6077925374397157, + "grad_norm": 0.30467700958251953, + "learning_rate": 0.001, + "loss": 2.0349, + "step": 14367 + }, + { + "epoch": 0.607834842203232, + "grad_norm": 0.38377171754837036, + "learning_rate": 0.001, + "loss": 2.3225, + "step": 14368 + }, + { + "epoch": 0.6078771469667484, + "grad_norm": 0.16598057746887207, + "learning_rate": 0.001, + "loss": 2.5853, + "step": 14369 + }, + { + "epoch": 0.6079194517302648, + "grad_norm": 0.2181134968996048, + "learning_rate": 0.001, + "loss": 3.0605, + "step": 14370 + }, + { + "epoch": 0.6079617564937811, + "grad_norm": 0.182630717754364, + "learning_rate": 0.001, + "loss": 1.6975, + "step": 14371 + }, + { + "epoch": 0.6080040612572976, + "grad_norm": 0.7057227492332458, + "learning_rate": 0.001, + "loss": 2.5886, + "step": 14372 + }, + { + "epoch": 0.608046366020814, + "grad_norm": 0.1978188157081604, + "learning_rate": 0.001, + "loss": 2.124, + "step": 14373 + }, + { + "epoch": 0.6080886707843303, + "grad_norm": 0.2088436782360077, + "learning_rate": 0.001, + "loss": 3.0534, + "step": 14374 + }, + { + "epoch": 0.6081309755478467, + "grad_norm": 0.15031234920024872, + "learning_rate": 0.001, + "loss": 2.396, + "step": 14375 + }, + { + "epoch": 0.6081732803113631, + "grad_norm": 0.2140101194381714, + "learning_rate": 0.001, + "loss": 2.1995, + "step": 14376 + }, + { + "epoch": 0.6082155850748794, + "grad_norm": 0.15244650840759277, + "learning_rate": 0.001, + "loss": 2.3434, + "step": 14377 + }, + { + "epoch": 0.6082578898383958, + "grad_norm": 0.19588300585746765, + "learning_rate": 0.001, + "loss": 1.9308, + "step": 14378 + }, + { + "epoch": 0.6083001946019122, + "grad_norm": 0.1862742006778717, + "learning_rate": 0.001, + "loss": 2.6186, + "step": 14379 + }, + { + "epoch": 0.6083424993654285, + "grad_norm": 0.16094960272312164, + "learning_rate": 0.001, + "loss": 1.7264, + "step": 14380 + }, + { + "epoch": 0.6083848041289449, + "grad_norm": 0.17328019440174103, + "learning_rate": 0.001, + "loss": 1.566, + "step": 14381 + }, + { + "epoch": 0.6084271088924613, + "grad_norm": 0.15607021749019623, + "learning_rate": 0.001, + "loss": 2.6352, + "step": 14382 + }, + { + "epoch": 0.6084694136559776, + "grad_norm": 0.15530943870544434, + "learning_rate": 0.001, + "loss": 2.0045, + "step": 14383 + }, + { + "epoch": 0.608511718419494, + "grad_norm": 0.16822437942028046, + "learning_rate": 0.001, + "loss": 3.1795, + "step": 14384 + }, + { + "epoch": 0.6085540231830104, + "grad_norm": 0.15865309536457062, + "learning_rate": 0.001, + "loss": 2.3704, + "step": 14385 + }, + { + "epoch": 0.6085963279465267, + "grad_norm": 0.30900880694389343, + "learning_rate": 0.001, + "loss": 1.8696, + "step": 14386 + }, + { + "epoch": 0.6086386327100431, + "grad_norm": 0.1799941211938858, + "learning_rate": 0.001, + "loss": 2.3696, + "step": 14387 + }, + { + "epoch": 0.6086809374735596, + "grad_norm": 0.20973291993141174, + "learning_rate": 0.001, + "loss": 2.8554, + "step": 14388 + }, + { + "epoch": 0.6087232422370759, + "grad_norm": 0.15292273461818695, + "learning_rate": 0.001, + "loss": 1.5335, + "step": 14389 + }, + { + "epoch": 0.6087655470005923, + "grad_norm": 0.1537332534790039, + "learning_rate": 0.001, + "loss": 2.7656, + "step": 14390 + }, + { + "epoch": 0.6088078517641087, + "grad_norm": 0.1706329882144928, + "learning_rate": 0.001, + "loss": 2.221, + "step": 14391 + }, + { + "epoch": 0.608850156527625, + "grad_norm": 0.15366674959659576, + "learning_rate": 0.001, + "loss": 1.4947, + "step": 14392 + }, + { + "epoch": 0.6088924612911414, + "grad_norm": 0.16146250069141388, + "learning_rate": 0.001, + "loss": 2.2607, + "step": 14393 + }, + { + "epoch": 0.6089347660546578, + "grad_norm": 0.1696520894765854, + "learning_rate": 0.001, + "loss": 2.8822, + "step": 14394 + }, + { + "epoch": 0.6089770708181741, + "grad_norm": 0.7845591306686401, + "learning_rate": 0.001, + "loss": 2.0901, + "step": 14395 + }, + { + "epoch": 0.6090193755816905, + "grad_norm": 0.20241577923297882, + "learning_rate": 0.001, + "loss": 3.0825, + "step": 14396 + }, + { + "epoch": 0.6090616803452069, + "grad_norm": 0.16126592457294464, + "learning_rate": 0.001, + "loss": 1.6057, + "step": 14397 + }, + { + "epoch": 0.6091039851087232, + "grad_norm": 0.18037404119968414, + "learning_rate": 0.001, + "loss": 2.864, + "step": 14398 + }, + { + "epoch": 0.6091462898722396, + "grad_norm": 0.1628550887107849, + "learning_rate": 0.001, + "loss": 3.1012, + "step": 14399 + }, + { + "epoch": 0.609188594635756, + "grad_norm": 0.20835568010807037, + "learning_rate": 0.001, + "loss": 2.0582, + "step": 14400 + }, + { + "epoch": 0.6092308993992723, + "grad_norm": 0.18450294435024261, + "learning_rate": 0.001, + "loss": 2.5508, + "step": 14401 + }, + { + "epoch": 0.6092732041627887, + "grad_norm": 0.33053863048553467, + "learning_rate": 0.001, + "loss": 1.758, + "step": 14402 + }, + { + "epoch": 0.6093155089263051, + "grad_norm": 1.6624572277069092, + "learning_rate": 0.001, + "loss": 2.8896, + "step": 14403 + }, + { + "epoch": 0.6093578136898214, + "grad_norm": 0.14562225341796875, + "learning_rate": 0.001, + "loss": 1.592, + "step": 14404 + }, + { + "epoch": 0.6094001184533379, + "grad_norm": 0.45143476128578186, + "learning_rate": 0.001, + "loss": 1.8523, + "step": 14405 + }, + { + "epoch": 0.6094424232168543, + "grad_norm": 0.47654953598976135, + "learning_rate": 0.001, + "loss": 2.7751, + "step": 14406 + }, + { + "epoch": 0.6094847279803706, + "grad_norm": 8.705418586730957, + "learning_rate": 0.001, + "loss": 1.8628, + "step": 14407 + }, + { + "epoch": 0.609527032743887, + "grad_norm": 0.17469455301761627, + "learning_rate": 0.001, + "loss": 2.4936, + "step": 14408 + }, + { + "epoch": 0.6095693375074034, + "grad_norm": 0.20742036402225494, + "learning_rate": 0.001, + "loss": 2.1476, + "step": 14409 + }, + { + "epoch": 0.6096116422709197, + "grad_norm": 0.19691161811351776, + "learning_rate": 0.001, + "loss": 1.8967, + "step": 14410 + }, + { + "epoch": 0.6096539470344361, + "grad_norm": 0.16114525496959686, + "learning_rate": 0.001, + "loss": 1.6115, + "step": 14411 + }, + { + "epoch": 0.6096962517979525, + "grad_norm": 1.256961703300476, + "learning_rate": 0.001, + "loss": 2.0185, + "step": 14412 + }, + { + "epoch": 0.6097385565614688, + "grad_norm": 0.16019225120544434, + "learning_rate": 0.001, + "loss": 1.7158, + "step": 14413 + }, + { + "epoch": 0.6097808613249852, + "grad_norm": 0.23191338777542114, + "learning_rate": 0.001, + "loss": 2.2885, + "step": 14414 + }, + { + "epoch": 0.6098231660885015, + "grad_norm": 0.17342033982276917, + "learning_rate": 0.001, + "loss": 1.9171, + "step": 14415 + }, + { + "epoch": 0.6098654708520179, + "grad_norm": 0.14773350954055786, + "learning_rate": 0.001, + "loss": 2.0407, + "step": 14416 + }, + { + "epoch": 0.6099077756155343, + "grad_norm": 0.23442687094211578, + "learning_rate": 0.001, + "loss": 2.2754, + "step": 14417 + }, + { + "epoch": 0.6099500803790506, + "grad_norm": 0.8946742415428162, + "learning_rate": 0.001, + "loss": 2.0776, + "step": 14418 + }, + { + "epoch": 0.609992385142567, + "grad_norm": 0.22962956130504608, + "learning_rate": 0.001, + "loss": 2.4009, + "step": 14419 + }, + { + "epoch": 0.6100346899060834, + "grad_norm": 0.15502995252609253, + "learning_rate": 0.001, + "loss": 1.824, + "step": 14420 + }, + { + "epoch": 0.6100769946695997, + "grad_norm": 0.15531884133815765, + "learning_rate": 0.001, + "loss": 1.7114, + "step": 14421 + }, + { + "epoch": 0.6101192994331162, + "grad_norm": 0.18697033822536469, + "learning_rate": 0.001, + "loss": 1.8181, + "step": 14422 + }, + { + "epoch": 0.6101616041966326, + "grad_norm": 0.14635103940963745, + "learning_rate": 0.001, + "loss": 2.2397, + "step": 14423 + }, + { + "epoch": 0.6102039089601489, + "grad_norm": 0.15549708902835846, + "learning_rate": 0.001, + "loss": 2.2291, + "step": 14424 + }, + { + "epoch": 0.6102462137236653, + "grad_norm": 0.15069544315338135, + "learning_rate": 0.001, + "loss": 2.1368, + "step": 14425 + }, + { + "epoch": 0.6102885184871817, + "grad_norm": 0.15826652944087982, + "learning_rate": 0.001, + "loss": 2.4466, + "step": 14426 + }, + { + "epoch": 0.610330823250698, + "grad_norm": 0.18682676553726196, + "learning_rate": 0.001, + "loss": 1.9024, + "step": 14427 + }, + { + "epoch": 0.6103731280142144, + "grad_norm": 0.40561962127685547, + "learning_rate": 0.001, + "loss": 1.8962, + "step": 14428 + }, + { + "epoch": 0.6104154327777308, + "grad_norm": 0.20110554993152618, + "learning_rate": 0.001, + "loss": 2.3748, + "step": 14429 + }, + { + "epoch": 0.6104577375412471, + "grad_norm": 0.15364859998226166, + "learning_rate": 0.001, + "loss": 2.1927, + "step": 14430 + }, + { + "epoch": 0.6105000423047635, + "grad_norm": 0.17697322368621826, + "learning_rate": 0.001, + "loss": 1.8882, + "step": 14431 + }, + { + "epoch": 0.6105423470682799, + "grad_norm": 0.7132118344306946, + "learning_rate": 0.001, + "loss": 3.1219, + "step": 14432 + }, + { + "epoch": 0.6105846518317962, + "grad_norm": 0.19553466141223907, + "learning_rate": 0.001, + "loss": 2.4858, + "step": 14433 + }, + { + "epoch": 0.6106269565953126, + "grad_norm": 0.15867097675800323, + "learning_rate": 0.001, + "loss": 2.2074, + "step": 14434 + }, + { + "epoch": 0.610669261358829, + "grad_norm": 0.8470575213432312, + "learning_rate": 0.001, + "loss": 3.0073, + "step": 14435 + }, + { + "epoch": 0.6107115661223453, + "grad_norm": 0.16488543152809143, + "learning_rate": 0.001, + "loss": 1.6394, + "step": 14436 + }, + { + "epoch": 0.6107538708858617, + "grad_norm": 0.2117576003074646, + "learning_rate": 0.001, + "loss": 2.5382, + "step": 14437 + }, + { + "epoch": 0.6107961756493782, + "grad_norm": 0.2346462607383728, + "learning_rate": 0.001, + "loss": 1.9488, + "step": 14438 + }, + { + "epoch": 0.6108384804128945, + "grad_norm": 1.9085814952850342, + "learning_rate": 0.001, + "loss": 2.3393, + "step": 14439 + }, + { + "epoch": 0.6108807851764109, + "grad_norm": 0.15879637002944946, + "learning_rate": 0.001, + "loss": 2.3031, + "step": 14440 + }, + { + "epoch": 0.6109230899399273, + "grad_norm": 0.13453783094882965, + "learning_rate": 0.001, + "loss": 1.7157, + "step": 14441 + }, + { + "epoch": 0.6109653947034436, + "grad_norm": 0.3421229422092438, + "learning_rate": 0.001, + "loss": 2.4225, + "step": 14442 + }, + { + "epoch": 0.61100769946696, + "grad_norm": 0.175466388463974, + "learning_rate": 0.001, + "loss": 2.3742, + "step": 14443 + }, + { + "epoch": 0.6110500042304764, + "grad_norm": 5.125629901885986, + "learning_rate": 0.001, + "loss": 2.022, + "step": 14444 + }, + { + "epoch": 0.6110923089939927, + "grad_norm": 0.15601593255996704, + "learning_rate": 0.001, + "loss": 3.9727, + "step": 14445 + }, + { + "epoch": 0.6111346137575091, + "grad_norm": 0.16740910708904266, + "learning_rate": 0.001, + "loss": 2.8391, + "step": 14446 + }, + { + "epoch": 0.6111769185210255, + "grad_norm": 0.18118330836296082, + "learning_rate": 0.001, + "loss": 2.154, + "step": 14447 + }, + { + "epoch": 0.6112192232845418, + "grad_norm": 0.17362359166145325, + "learning_rate": 0.001, + "loss": 2.4111, + "step": 14448 + }, + { + "epoch": 0.6112615280480582, + "grad_norm": 0.20023968815803528, + "learning_rate": 0.001, + "loss": 2.0913, + "step": 14449 + }, + { + "epoch": 0.6113038328115746, + "grad_norm": 0.954468846321106, + "learning_rate": 0.001, + "loss": 2.5481, + "step": 14450 + }, + { + "epoch": 0.6113461375750909, + "grad_norm": 0.2559969127178192, + "learning_rate": 0.001, + "loss": 2.5936, + "step": 14451 + }, + { + "epoch": 0.6113884423386073, + "grad_norm": 0.1946243792772293, + "learning_rate": 0.001, + "loss": 2.4807, + "step": 14452 + }, + { + "epoch": 0.6114307471021238, + "grad_norm": 0.30189049243927, + "learning_rate": 0.001, + "loss": 1.8112, + "step": 14453 + }, + { + "epoch": 0.61147305186564, + "grad_norm": 0.20965829491615295, + "learning_rate": 0.001, + "loss": 2.1163, + "step": 14454 + }, + { + "epoch": 0.6115153566291565, + "grad_norm": 0.2877630889415741, + "learning_rate": 0.001, + "loss": 1.9946, + "step": 14455 + }, + { + "epoch": 0.6115576613926729, + "grad_norm": 0.23159538209438324, + "learning_rate": 0.001, + "loss": 2.35, + "step": 14456 + }, + { + "epoch": 0.6115999661561892, + "grad_norm": 1.8251463174819946, + "learning_rate": 0.001, + "loss": 2.014, + "step": 14457 + }, + { + "epoch": 0.6116422709197056, + "grad_norm": 0.2677466869354248, + "learning_rate": 0.001, + "loss": 2.1856, + "step": 14458 + }, + { + "epoch": 0.6116845756832219, + "grad_norm": 0.291852742433548, + "learning_rate": 0.001, + "loss": 3.0096, + "step": 14459 + }, + { + "epoch": 0.6117268804467383, + "grad_norm": 0.22112785279750824, + "learning_rate": 0.001, + "loss": 1.8697, + "step": 14460 + }, + { + "epoch": 0.6117691852102547, + "grad_norm": 0.1902538239955902, + "learning_rate": 0.001, + "loss": 3.42, + "step": 14461 + }, + { + "epoch": 0.611811489973771, + "grad_norm": 0.17315338551998138, + "learning_rate": 0.001, + "loss": 2.4311, + "step": 14462 + }, + { + "epoch": 0.6118537947372874, + "grad_norm": 0.17455525696277618, + "learning_rate": 0.001, + "loss": 2.2537, + "step": 14463 + }, + { + "epoch": 0.6118960995008038, + "grad_norm": 0.22085832059383392, + "learning_rate": 0.001, + "loss": 3.21, + "step": 14464 + }, + { + "epoch": 0.6119384042643201, + "grad_norm": 0.16527900099754333, + "learning_rate": 0.001, + "loss": 1.7905, + "step": 14465 + }, + { + "epoch": 0.6119807090278365, + "grad_norm": 0.18405689299106598, + "learning_rate": 0.001, + "loss": 2.2348, + "step": 14466 + }, + { + "epoch": 0.6120230137913529, + "grad_norm": 0.19731445610523224, + "learning_rate": 0.001, + "loss": 2.0635, + "step": 14467 + }, + { + "epoch": 0.6120653185548692, + "grad_norm": 0.20120853185653687, + "learning_rate": 0.001, + "loss": 3.1794, + "step": 14468 + }, + { + "epoch": 0.6121076233183856, + "grad_norm": 0.18305185437202454, + "learning_rate": 0.001, + "loss": 1.4547, + "step": 14469 + }, + { + "epoch": 0.612149928081902, + "grad_norm": 0.18967366218566895, + "learning_rate": 0.001, + "loss": 1.6065, + "step": 14470 + }, + { + "epoch": 0.6121922328454183, + "grad_norm": 0.19936278462409973, + "learning_rate": 0.001, + "loss": 2.2255, + "step": 14471 + }, + { + "epoch": 0.6122345376089348, + "grad_norm": 0.17981190979480743, + "learning_rate": 0.001, + "loss": 1.6158, + "step": 14472 + }, + { + "epoch": 0.6122768423724512, + "grad_norm": 0.24982218444347382, + "learning_rate": 0.001, + "loss": 2.276, + "step": 14473 + }, + { + "epoch": 0.6123191471359675, + "grad_norm": 0.2008954882621765, + "learning_rate": 0.001, + "loss": 1.9837, + "step": 14474 + }, + { + "epoch": 0.6123614518994839, + "grad_norm": 0.1441406011581421, + "learning_rate": 0.001, + "loss": 2.0582, + "step": 14475 + }, + { + "epoch": 0.6124037566630003, + "grad_norm": 0.6654687523841858, + "learning_rate": 0.001, + "loss": 1.4793, + "step": 14476 + }, + { + "epoch": 0.6124460614265166, + "grad_norm": 0.17162881791591644, + "learning_rate": 0.001, + "loss": 2.0025, + "step": 14477 + }, + { + "epoch": 0.612488366190033, + "grad_norm": 0.1826455295085907, + "learning_rate": 0.001, + "loss": 1.9079, + "step": 14478 + }, + { + "epoch": 0.6125306709535494, + "grad_norm": 2.3016321659088135, + "learning_rate": 0.001, + "loss": 3.0657, + "step": 14479 + }, + { + "epoch": 0.6125729757170657, + "grad_norm": 0.24080024659633636, + "learning_rate": 0.001, + "loss": 2.9867, + "step": 14480 + }, + { + "epoch": 0.6126152804805821, + "grad_norm": 0.2750950753688812, + "learning_rate": 0.001, + "loss": 1.9209, + "step": 14481 + }, + { + "epoch": 0.6126575852440985, + "grad_norm": 0.22558815777301788, + "learning_rate": 0.001, + "loss": 2.6685, + "step": 14482 + }, + { + "epoch": 0.6126998900076148, + "grad_norm": 0.1902596652507782, + "learning_rate": 0.001, + "loss": 2.2843, + "step": 14483 + }, + { + "epoch": 0.6127421947711312, + "grad_norm": 0.18809862434864044, + "learning_rate": 0.001, + "loss": 2.212, + "step": 14484 + }, + { + "epoch": 0.6127844995346476, + "grad_norm": 0.1640164852142334, + "learning_rate": 0.001, + "loss": 2.0967, + "step": 14485 + }, + { + "epoch": 0.6128268042981639, + "grad_norm": 0.1704949587583542, + "learning_rate": 0.001, + "loss": 2.6479, + "step": 14486 + }, + { + "epoch": 0.6128691090616804, + "grad_norm": 2.7232117652893066, + "learning_rate": 0.001, + "loss": 1.8047, + "step": 14487 + }, + { + "epoch": 0.6129114138251968, + "grad_norm": 0.16258461773395538, + "learning_rate": 0.001, + "loss": 2.556, + "step": 14488 + }, + { + "epoch": 0.6129537185887131, + "grad_norm": 0.22606851160526276, + "learning_rate": 0.001, + "loss": 1.8643, + "step": 14489 + }, + { + "epoch": 0.6129960233522295, + "grad_norm": 0.39778846502304077, + "learning_rate": 0.001, + "loss": 2.8814, + "step": 14490 + }, + { + "epoch": 0.6130383281157459, + "grad_norm": 0.1924993395805359, + "learning_rate": 0.001, + "loss": 2.1945, + "step": 14491 + }, + { + "epoch": 0.6130806328792622, + "grad_norm": 0.1684507429599762, + "learning_rate": 0.001, + "loss": 2.0614, + "step": 14492 + }, + { + "epoch": 0.6131229376427786, + "grad_norm": 0.14290301501750946, + "learning_rate": 0.001, + "loss": 1.9804, + "step": 14493 + }, + { + "epoch": 0.613165242406295, + "grad_norm": 0.4323841333389282, + "learning_rate": 0.001, + "loss": 1.4766, + "step": 14494 + }, + { + "epoch": 0.6132075471698113, + "grad_norm": 0.15192975103855133, + "learning_rate": 0.001, + "loss": 1.5979, + "step": 14495 + }, + { + "epoch": 0.6132498519333277, + "grad_norm": 0.23368826508522034, + "learning_rate": 0.001, + "loss": 3.5542, + "step": 14496 + }, + { + "epoch": 0.6132921566968441, + "grad_norm": 0.2155388742685318, + "learning_rate": 0.001, + "loss": 2.2724, + "step": 14497 + }, + { + "epoch": 0.6133344614603604, + "grad_norm": 0.15829609334468842, + "learning_rate": 0.001, + "loss": 3.0361, + "step": 14498 + }, + { + "epoch": 0.6133767662238768, + "grad_norm": 0.16065742075443268, + "learning_rate": 0.001, + "loss": 1.3002, + "step": 14499 + }, + { + "epoch": 0.6134190709873932, + "grad_norm": 1.1077728271484375, + "learning_rate": 0.001, + "loss": 2.6874, + "step": 14500 + }, + { + "epoch": 0.6134613757509095, + "grad_norm": 0.27375224232673645, + "learning_rate": 0.001, + "loss": 2.4451, + "step": 14501 + }, + { + "epoch": 0.6135036805144259, + "grad_norm": 0.14725899696350098, + "learning_rate": 0.001, + "loss": 2.2295, + "step": 14502 + }, + { + "epoch": 0.6135459852779424, + "grad_norm": 0.15817123651504517, + "learning_rate": 0.001, + "loss": 2.0776, + "step": 14503 + }, + { + "epoch": 0.6135882900414587, + "grad_norm": 0.16688252985477448, + "learning_rate": 0.001, + "loss": 1.6458, + "step": 14504 + }, + { + "epoch": 0.6136305948049751, + "grad_norm": 0.1742338240146637, + "learning_rate": 0.001, + "loss": 2.1804, + "step": 14505 + }, + { + "epoch": 0.6136728995684914, + "grad_norm": 0.17663906514644623, + "learning_rate": 0.001, + "loss": 2.1758, + "step": 14506 + }, + { + "epoch": 0.6137152043320078, + "grad_norm": 0.19644849002361298, + "learning_rate": 0.001, + "loss": 2.0111, + "step": 14507 + }, + { + "epoch": 0.6137575090955242, + "grad_norm": 0.2182653397321701, + "learning_rate": 0.001, + "loss": 2.7332, + "step": 14508 + }, + { + "epoch": 0.6137998138590405, + "grad_norm": 1.093945026397705, + "learning_rate": 0.001, + "loss": 2.3192, + "step": 14509 + }, + { + "epoch": 0.6138421186225569, + "grad_norm": 0.20383012294769287, + "learning_rate": 0.001, + "loss": 2.4847, + "step": 14510 + }, + { + "epoch": 0.6138844233860733, + "grad_norm": 0.1353861391544342, + "learning_rate": 0.001, + "loss": 1.9635, + "step": 14511 + }, + { + "epoch": 0.6139267281495896, + "grad_norm": 0.2178875207901001, + "learning_rate": 0.001, + "loss": 1.7267, + "step": 14512 + }, + { + "epoch": 0.613969032913106, + "grad_norm": 0.4006967544555664, + "learning_rate": 0.001, + "loss": 1.8074, + "step": 14513 + }, + { + "epoch": 0.6140113376766224, + "grad_norm": 0.16235609352588654, + "learning_rate": 0.001, + "loss": 3.2147, + "step": 14514 + }, + { + "epoch": 0.6140536424401387, + "grad_norm": 0.21571922302246094, + "learning_rate": 0.001, + "loss": 2.0414, + "step": 14515 + }, + { + "epoch": 0.6140959472036551, + "grad_norm": 2.157762289047241, + "learning_rate": 0.001, + "loss": 2.4857, + "step": 14516 + }, + { + "epoch": 0.6141382519671715, + "grad_norm": 0.20253095030784607, + "learning_rate": 0.001, + "loss": 1.9104, + "step": 14517 + }, + { + "epoch": 0.6141805567306878, + "grad_norm": 0.14087221026420593, + "learning_rate": 0.001, + "loss": 1.518, + "step": 14518 + }, + { + "epoch": 0.6142228614942042, + "grad_norm": 0.2999531924724579, + "learning_rate": 0.001, + "loss": 2.6567, + "step": 14519 + }, + { + "epoch": 0.6142651662577207, + "grad_norm": 3.447448253631592, + "learning_rate": 0.001, + "loss": 1.865, + "step": 14520 + }, + { + "epoch": 0.614307471021237, + "grad_norm": 0.217742919921875, + "learning_rate": 0.001, + "loss": 2.1669, + "step": 14521 + }, + { + "epoch": 0.6143497757847534, + "grad_norm": 0.22795531153678894, + "learning_rate": 0.001, + "loss": 2.0143, + "step": 14522 + }, + { + "epoch": 0.6143920805482698, + "grad_norm": 0.15495827794075012, + "learning_rate": 0.001, + "loss": 1.879, + "step": 14523 + }, + { + "epoch": 0.6144343853117861, + "grad_norm": 0.1813746690750122, + "learning_rate": 0.001, + "loss": 3.0125, + "step": 14524 + }, + { + "epoch": 0.6144766900753025, + "grad_norm": 0.33635345101356506, + "learning_rate": 0.001, + "loss": 2.0765, + "step": 14525 + }, + { + "epoch": 0.6145189948388189, + "grad_norm": 0.2094181776046753, + "learning_rate": 0.001, + "loss": 2.7443, + "step": 14526 + }, + { + "epoch": 0.6145612996023352, + "grad_norm": 0.18319903314113617, + "learning_rate": 0.001, + "loss": 1.7433, + "step": 14527 + }, + { + "epoch": 0.6146036043658516, + "grad_norm": 0.25899675488471985, + "learning_rate": 0.001, + "loss": 1.8407, + "step": 14528 + }, + { + "epoch": 0.614645909129368, + "grad_norm": 0.17044375836849213, + "learning_rate": 0.001, + "loss": 3.0131, + "step": 14529 + }, + { + "epoch": 0.6146882138928843, + "grad_norm": 0.19204241037368774, + "learning_rate": 0.001, + "loss": 1.6593, + "step": 14530 + }, + { + "epoch": 0.6147305186564007, + "grad_norm": 0.15827490389347076, + "learning_rate": 0.001, + "loss": 1.7696, + "step": 14531 + }, + { + "epoch": 0.6147728234199171, + "grad_norm": 0.16154128313064575, + "learning_rate": 0.001, + "loss": 2.025, + "step": 14532 + }, + { + "epoch": 0.6148151281834334, + "grad_norm": 0.18466542661190033, + "learning_rate": 0.001, + "loss": 1.822, + "step": 14533 + }, + { + "epoch": 0.6148574329469498, + "grad_norm": 0.15772341191768646, + "learning_rate": 0.001, + "loss": 2.458, + "step": 14534 + }, + { + "epoch": 0.6148997377104662, + "grad_norm": 0.2001129388809204, + "learning_rate": 0.001, + "loss": 2.7717, + "step": 14535 + }, + { + "epoch": 0.6149420424739825, + "grad_norm": 0.49999427795410156, + "learning_rate": 0.001, + "loss": 1.7162, + "step": 14536 + }, + { + "epoch": 0.614984347237499, + "grad_norm": 0.18692375719547272, + "learning_rate": 0.001, + "loss": 2.1668, + "step": 14537 + }, + { + "epoch": 0.6150266520010154, + "grad_norm": 0.43777942657470703, + "learning_rate": 0.001, + "loss": 1.8468, + "step": 14538 + }, + { + "epoch": 0.6150689567645317, + "grad_norm": 0.18012431263923645, + "learning_rate": 0.001, + "loss": 2.0141, + "step": 14539 + }, + { + "epoch": 0.6151112615280481, + "grad_norm": 0.17725875973701477, + "learning_rate": 0.001, + "loss": 1.9882, + "step": 14540 + }, + { + "epoch": 0.6151535662915645, + "grad_norm": 0.19504068791866302, + "learning_rate": 0.001, + "loss": 2.7564, + "step": 14541 + }, + { + "epoch": 0.6151958710550808, + "grad_norm": 0.18181535601615906, + "learning_rate": 0.001, + "loss": 1.6593, + "step": 14542 + }, + { + "epoch": 0.6152381758185972, + "grad_norm": 0.1605805605649948, + "learning_rate": 0.001, + "loss": 2.4811, + "step": 14543 + }, + { + "epoch": 0.6152804805821136, + "grad_norm": 0.1837165355682373, + "learning_rate": 0.001, + "loss": 2.5541, + "step": 14544 + }, + { + "epoch": 0.6153227853456299, + "grad_norm": 0.1661083698272705, + "learning_rate": 0.001, + "loss": 2.745, + "step": 14545 + }, + { + "epoch": 0.6153650901091463, + "grad_norm": 0.1631242036819458, + "learning_rate": 0.001, + "loss": 3.1111, + "step": 14546 + }, + { + "epoch": 0.6154073948726627, + "grad_norm": 0.1594124138355255, + "learning_rate": 0.001, + "loss": 2.1363, + "step": 14547 + }, + { + "epoch": 0.615449699636179, + "grad_norm": 0.1624990850687027, + "learning_rate": 0.001, + "loss": 1.9491, + "step": 14548 + }, + { + "epoch": 0.6154920043996954, + "grad_norm": 0.15371422469615936, + "learning_rate": 0.001, + "loss": 1.8865, + "step": 14549 + }, + { + "epoch": 0.6155343091632117, + "grad_norm": 0.1542615443468094, + "learning_rate": 0.001, + "loss": 1.8565, + "step": 14550 + }, + { + "epoch": 0.6155766139267281, + "grad_norm": 0.18403269350528717, + "learning_rate": 0.001, + "loss": 1.5119, + "step": 14551 + }, + { + "epoch": 0.6156189186902445, + "grad_norm": 0.19757772982120514, + "learning_rate": 0.001, + "loss": 2.071, + "step": 14552 + }, + { + "epoch": 0.6156612234537608, + "grad_norm": 0.21193090081214905, + "learning_rate": 0.001, + "loss": 1.5183, + "step": 14553 + }, + { + "epoch": 0.6157035282172773, + "grad_norm": 0.15698926150798798, + "learning_rate": 0.001, + "loss": 1.9242, + "step": 14554 + }, + { + "epoch": 0.6157458329807937, + "grad_norm": 0.18835414946079254, + "learning_rate": 0.001, + "loss": 2.5247, + "step": 14555 + }, + { + "epoch": 0.61578813774431, + "grad_norm": 0.1583268940448761, + "learning_rate": 0.001, + "loss": 1.9729, + "step": 14556 + }, + { + "epoch": 0.6158304425078264, + "grad_norm": 0.16368068754673004, + "learning_rate": 0.001, + "loss": 2.4374, + "step": 14557 + }, + { + "epoch": 0.6158727472713428, + "grad_norm": 0.1612701714038849, + "learning_rate": 0.001, + "loss": 1.6765, + "step": 14558 + }, + { + "epoch": 0.6159150520348591, + "grad_norm": 0.14190715551376343, + "learning_rate": 0.001, + "loss": 1.9572, + "step": 14559 + }, + { + "epoch": 0.6159573567983755, + "grad_norm": 0.15973453223705292, + "learning_rate": 0.001, + "loss": 2.3115, + "step": 14560 + }, + { + "epoch": 0.6159996615618919, + "grad_norm": 0.15088126063346863, + "learning_rate": 0.001, + "loss": 2.3368, + "step": 14561 + }, + { + "epoch": 0.6160419663254082, + "grad_norm": 0.1798139065504074, + "learning_rate": 0.001, + "loss": 2.6461, + "step": 14562 + }, + { + "epoch": 0.6160842710889246, + "grad_norm": 2.4017934799194336, + "learning_rate": 0.001, + "loss": 2.928, + "step": 14563 + }, + { + "epoch": 0.616126575852441, + "grad_norm": 0.14553676545619965, + "learning_rate": 0.001, + "loss": 1.5844, + "step": 14564 + }, + { + "epoch": 0.6161688806159573, + "grad_norm": 0.18981827795505524, + "learning_rate": 0.001, + "loss": 1.7026, + "step": 14565 + }, + { + "epoch": 0.6162111853794737, + "grad_norm": 0.1583603173494339, + "learning_rate": 0.001, + "loss": 1.93, + "step": 14566 + }, + { + "epoch": 0.6162534901429901, + "grad_norm": 0.19451354444026947, + "learning_rate": 0.001, + "loss": 2.2567, + "step": 14567 + }, + { + "epoch": 0.6162957949065064, + "grad_norm": 0.18572446703910828, + "learning_rate": 0.001, + "loss": 3.772, + "step": 14568 + }, + { + "epoch": 0.6163380996700228, + "grad_norm": 0.16187942028045654, + "learning_rate": 0.001, + "loss": 2.2992, + "step": 14569 + }, + { + "epoch": 0.6163804044335393, + "grad_norm": 0.1688256412744522, + "learning_rate": 0.001, + "loss": 1.6916, + "step": 14570 + }, + { + "epoch": 0.6164227091970556, + "grad_norm": 0.16254852712154388, + "learning_rate": 0.001, + "loss": 1.4337, + "step": 14571 + }, + { + "epoch": 0.616465013960572, + "grad_norm": 0.14653462171554565, + "learning_rate": 0.001, + "loss": 2.2641, + "step": 14572 + }, + { + "epoch": 0.6165073187240884, + "grad_norm": 0.18988440930843353, + "learning_rate": 0.001, + "loss": 2.2461, + "step": 14573 + }, + { + "epoch": 0.6165496234876047, + "grad_norm": 0.19180841743946075, + "learning_rate": 0.001, + "loss": 2.1642, + "step": 14574 + }, + { + "epoch": 0.6165919282511211, + "grad_norm": 0.17922373116016388, + "learning_rate": 0.001, + "loss": 2.0952, + "step": 14575 + }, + { + "epoch": 0.6166342330146375, + "grad_norm": 0.17525149881839752, + "learning_rate": 0.001, + "loss": 2.2221, + "step": 14576 + }, + { + "epoch": 0.6166765377781538, + "grad_norm": 0.18129895627498627, + "learning_rate": 0.001, + "loss": 2.5061, + "step": 14577 + }, + { + "epoch": 0.6167188425416702, + "grad_norm": 0.3474824130535126, + "learning_rate": 0.001, + "loss": 2.2037, + "step": 14578 + }, + { + "epoch": 0.6167611473051866, + "grad_norm": 0.2225697934627533, + "learning_rate": 0.001, + "loss": 2.9076, + "step": 14579 + }, + { + "epoch": 0.6168034520687029, + "grad_norm": 0.15336523950099945, + "learning_rate": 0.001, + "loss": 2.0626, + "step": 14580 + }, + { + "epoch": 0.6168457568322193, + "grad_norm": 0.22657287120819092, + "learning_rate": 0.001, + "loss": 1.8477, + "step": 14581 + }, + { + "epoch": 0.6168880615957357, + "grad_norm": 0.19697429239749908, + "learning_rate": 0.001, + "loss": 2.6378, + "step": 14582 + }, + { + "epoch": 0.616930366359252, + "grad_norm": 0.783888041973114, + "learning_rate": 0.001, + "loss": 2.097, + "step": 14583 + }, + { + "epoch": 0.6169726711227684, + "grad_norm": 0.2075197994709015, + "learning_rate": 0.001, + "loss": 2.4211, + "step": 14584 + }, + { + "epoch": 0.6170149758862848, + "grad_norm": 0.1552828848361969, + "learning_rate": 0.001, + "loss": 2.6256, + "step": 14585 + }, + { + "epoch": 0.6170572806498011, + "grad_norm": 0.1605806201696396, + "learning_rate": 0.001, + "loss": 1.842, + "step": 14586 + }, + { + "epoch": 0.6170995854133176, + "grad_norm": 0.18926933407783508, + "learning_rate": 0.001, + "loss": 2.7009, + "step": 14587 + }, + { + "epoch": 0.617141890176834, + "grad_norm": 0.17639969289302826, + "learning_rate": 0.001, + "loss": 2.0845, + "step": 14588 + }, + { + "epoch": 0.6171841949403503, + "grad_norm": 0.16132144629955292, + "learning_rate": 0.001, + "loss": 2.5827, + "step": 14589 + }, + { + "epoch": 0.6172264997038667, + "grad_norm": 0.1768246591091156, + "learning_rate": 0.001, + "loss": 2.6096, + "step": 14590 + }, + { + "epoch": 0.6172688044673831, + "grad_norm": 0.1509474366903305, + "learning_rate": 0.001, + "loss": 3.6571, + "step": 14591 + }, + { + "epoch": 0.6173111092308994, + "grad_norm": 0.17288728058338165, + "learning_rate": 0.001, + "loss": 2.6354, + "step": 14592 + }, + { + "epoch": 0.6173534139944158, + "grad_norm": 0.1312096118927002, + "learning_rate": 0.001, + "loss": 2.0085, + "step": 14593 + }, + { + "epoch": 0.6173957187579321, + "grad_norm": 0.2396790236234665, + "learning_rate": 0.001, + "loss": 2.3952, + "step": 14594 + }, + { + "epoch": 0.6174380235214485, + "grad_norm": 0.1728651076555252, + "learning_rate": 0.001, + "loss": 1.4488, + "step": 14595 + }, + { + "epoch": 0.6174803282849649, + "grad_norm": 0.5452121496200562, + "learning_rate": 0.001, + "loss": 1.9965, + "step": 14596 + }, + { + "epoch": 0.6175226330484812, + "grad_norm": 0.8020671010017395, + "learning_rate": 0.001, + "loss": 1.8593, + "step": 14597 + }, + { + "epoch": 0.6175649378119976, + "grad_norm": 0.15915119647979736, + "learning_rate": 0.001, + "loss": 2.0414, + "step": 14598 + }, + { + "epoch": 0.617607242575514, + "grad_norm": 0.13803361356258392, + "learning_rate": 0.001, + "loss": 2.0599, + "step": 14599 + }, + { + "epoch": 0.6176495473390303, + "grad_norm": 0.22241367399692535, + "learning_rate": 0.001, + "loss": 2.8024, + "step": 14600 + }, + { + "epoch": 0.6176918521025467, + "grad_norm": 1.1794174909591675, + "learning_rate": 0.001, + "loss": 2.3638, + "step": 14601 + }, + { + "epoch": 0.6177341568660631, + "grad_norm": 0.20139962434768677, + "learning_rate": 0.001, + "loss": 2.7807, + "step": 14602 + }, + { + "epoch": 0.6177764616295794, + "grad_norm": 0.19287584722042084, + "learning_rate": 0.001, + "loss": 2.6512, + "step": 14603 + }, + { + "epoch": 0.6178187663930959, + "grad_norm": 0.18711356818675995, + "learning_rate": 0.001, + "loss": 2.3137, + "step": 14604 + }, + { + "epoch": 0.6178610711566123, + "grad_norm": 0.18138305842876434, + "learning_rate": 0.001, + "loss": 2.7517, + "step": 14605 + }, + { + "epoch": 0.6179033759201286, + "grad_norm": 0.1796899139881134, + "learning_rate": 0.001, + "loss": 2.9958, + "step": 14606 + }, + { + "epoch": 0.617945680683645, + "grad_norm": 0.20048393309116364, + "learning_rate": 0.001, + "loss": 3.1626, + "step": 14607 + }, + { + "epoch": 0.6179879854471614, + "grad_norm": 0.1451716423034668, + "learning_rate": 0.001, + "loss": 2.6609, + "step": 14608 + }, + { + "epoch": 0.6180302902106777, + "grad_norm": 0.4020373225212097, + "learning_rate": 0.001, + "loss": 2.3252, + "step": 14609 + }, + { + "epoch": 0.6180725949741941, + "grad_norm": 0.2272024154663086, + "learning_rate": 0.001, + "loss": 2.3946, + "step": 14610 + }, + { + "epoch": 0.6181148997377105, + "grad_norm": 0.24336324632167816, + "learning_rate": 0.001, + "loss": 2.0187, + "step": 14611 + }, + { + "epoch": 0.6181572045012268, + "grad_norm": 3.6941826343536377, + "learning_rate": 0.001, + "loss": 1.9722, + "step": 14612 + }, + { + "epoch": 0.6181995092647432, + "grad_norm": 0.1938718855381012, + "learning_rate": 0.001, + "loss": 2.0278, + "step": 14613 + }, + { + "epoch": 0.6182418140282596, + "grad_norm": 0.6641799211502075, + "learning_rate": 0.001, + "loss": 2.0153, + "step": 14614 + }, + { + "epoch": 0.6182841187917759, + "grad_norm": 0.4109531342983246, + "learning_rate": 0.001, + "loss": 1.9137, + "step": 14615 + }, + { + "epoch": 0.6183264235552923, + "grad_norm": 0.14402462542057037, + "learning_rate": 0.001, + "loss": 2.4357, + "step": 14616 + }, + { + "epoch": 0.6183687283188087, + "grad_norm": 0.14747104048728943, + "learning_rate": 0.001, + "loss": 2.7109, + "step": 14617 + }, + { + "epoch": 0.618411033082325, + "grad_norm": 0.14934441447257996, + "learning_rate": 0.001, + "loss": 1.6499, + "step": 14618 + }, + { + "epoch": 0.6184533378458414, + "grad_norm": 0.3803541362285614, + "learning_rate": 0.001, + "loss": 2.4184, + "step": 14619 + }, + { + "epoch": 0.6184956426093579, + "grad_norm": 0.18346597254276276, + "learning_rate": 0.001, + "loss": 2.0202, + "step": 14620 + }, + { + "epoch": 0.6185379473728742, + "grad_norm": 2.4871697425842285, + "learning_rate": 0.001, + "loss": 1.9789, + "step": 14621 + }, + { + "epoch": 0.6185802521363906, + "grad_norm": 0.15989036858081818, + "learning_rate": 0.001, + "loss": 1.9835, + "step": 14622 + }, + { + "epoch": 0.618622556899907, + "grad_norm": 0.17353324592113495, + "learning_rate": 0.001, + "loss": 1.9959, + "step": 14623 + }, + { + "epoch": 0.6186648616634233, + "grad_norm": 0.18269836902618408, + "learning_rate": 0.001, + "loss": 2.9858, + "step": 14624 + }, + { + "epoch": 0.6187071664269397, + "grad_norm": 0.19032901525497437, + "learning_rate": 0.001, + "loss": 3.0415, + "step": 14625 + }, + { + "epoch": 0.6187494711904561, + "grad_norm": 0.17497460544109344, + "learning_rate": 0.001, + "loss": 2.0095, + "step": 14626 + }, + { + "epoch": 0.6187917759539724, + "grad_norm": 0.14147375524044037, + "learning_rate": 0.001, + "loss": 1.6253, + "step": 14627 + }, + { + "epoch": 0.6188340807174888, + "grad_norm": 0.1689339280128479, + "learning_rate": 0.001, + "loss": 2.3544, + "step": 14628 + }, + { + "epoch": 0.6188763854810052, + "grad_norm": 0.14351142942905426, + "learning_rate": 0.001, + "loss": 2.6217, + "step": 14629 + }, + { + "epoch": 0.6189186902445215, + "grad_norm": 0.30054372549057007, + "learning_rate": 0.001, + "loss": 2.5242, + "step": 14630 + }, + { + "epoch": 0.6189609950080379, + "grad_norm": 0.1856541931629181, + "learning_rate": 0.001, + "loss": 2.0127, + "step": 14631 + }, + { + "epoch": 0.6190032997715543, + "grad_norm": 0.2573411762714386, + "learning_rate": 0.001, + "loss": 2.2372, + "step": 14632 + }, + { + "epoch": 0.6190456045350706, + "grad_norm": 0.1990809440612793, + "learning_rate": 0.001, + "loss": 2.2482, + "step": 14633 + }, + { + "epoch": 0.619087909298587, + "grad_norm": 0.1465701162815094, + "learning_rate": 0.001, + "loss": 1.8241, + "step": 14634 + }, + { + "epoch": 0.6191302140621034, + "grad_norm": 0.1555158942937851, + "learning_rate": 0.001, + "loss": 2.1623, + "step": 14635 + }, + { + "epoch": 0.6191725188256197, + "grad_norm": 0.14374671876430511, + "learning_rate": 0.001, + "loss": 2.4611, + "step": 14636 + }, + { + "epoch": 0.6192148235891362, + "grad_norm": 0.19940204918384552, + "learning_rate": 0.001, + "loss": 2.1044, + "step": 14637 + }, + { + "epoch": 0.6192571283526526, + "grad_norm": 0.14661625027656555, + "learning_rate": 0.001, + "loss": 1.6882, + "step": 14638 + }, + { + "epoch": 0.6192994331161689, + "grad_norm": 0.17937296628952026, + "learning_rate": 0.001, + "loss": 2.2717, + "step": 14639 + }, + { + "epoch": 0.6193417378796853, + "grad_norm": 0.15153121948242188, + "learning_rate": 0.001, + "loss": 1.7106, + "step": 14640 + }, + { + "epoch": 0.6193840426432016, + "grad_norm": 0.1418657898902893, + "learning_rate": 0.001, + "loss": 1.6178, + "step": 14641 + }, + { + "epoch": 0.619426347406718, + "grad_norm": 0.19973745942115784, + "learning_rate": 0.001, + "loss": 1.786, + "step": 14642 + }, + { + "epoch": 0.6194686521702344, + "grad_norm": 0.16791045665740967, + "learning_rate": 0.001, + "loss": 1.7425, + "step": 14643 + }, + { + "epoch": 0.6195109569337507, + "grad_norm": 1.9201226234436035, + "learning_rate": 0.001, + "loss": 3.1445, + "step": 14644 + }, + { + "epoch": 0.6195532616972671, + "grad_norm": 0.1937858760356903, + "learning_rate": 0.001, + "loss": 3.0628, + "step": 14645 + }, + { + "epoch": 0.6195955664607835, + "grad_norm": 0.8908295631408691, + "learning_rate": 0.001, + "loss": 1.4817, + "step": 14646 + }, + { + "epoch": 0.6196378712242998, + "grad_norm": 0.18158625066280365, + "learning_rate": 0.001, + "loss": 2.1148, + "step": 14647 + }, + { + "epoch": 0.6196801759878162, + "grad_norm": 0.18421104550361633, + "learning_rate": 0.001, + "loss": 1.7709, + "step": 14648 + }, + { + "epoch": 0.6197224807513326, + "grad_norm": 0.19580382108688354, + "learning_rate": 0.001, + "loss": 2.2813, + "step": 14649 + }, + { + "epoch": 0.6197647855148489, + "grad_norm": 0.23519310355186462, + "learning_rate": 0.001, + "loss": 1.6237, + "step": 14650 + }, + { + "epoch": 0.6198070902783653, + "grad_norm": 0.27454647421836853, + "learning_rate": 0.001, + "loss": 2.3794, + "step": 14651 + }, + { + "epoch": 0.6198493950418817, + "grad_norm": 0.23078219592571259, + "learning_rate": 0.001, + "loss": 2.2003, + "step": 14652 + }, + { + "epoch": 0.619891699805398, + "grad_norm": 2.1767003536224365, + "learning_rate": 0.001, + "loss": 2.1137, + "step": 14653 + }, + { + "epoch": 0.6199340045689145, + "grad_norm": 0.7693164944648743, + "learning_rate": 0.001, + "loss": 1.6826, + "step": 14654 + }, + { + "epoch": 0.6199763093324309, + "grad_norm": 0.18725281953811646, + "learning_rate": 0.001, + "loss": 1.6722, + "step": 14655 + }, + { + "epoch": 0.6200186140959472, + "grad_norm": 0.17474079132080078, + "learning_rate": 0.001, + "loss": 1.6512, + "step": 14656 + }, + { + "epoch": 0.6200609188594636, + "grad_norm": 0.2864467203617096, + "learning_rate": 0.001, + "loss": 1.657, + "step": 14657 + }, + { + "epoch": 0.62010322362298, + "grad_norm": 0.16446319222450256, + "learning_rate": 0.001, + "loss": 2.22, + "step": 14658 + }, + { + "epoch": 0.6201455283864963, + "grad_norm": 0.21445626020431519, + "learning_rate": 0.001, + "loss": 2.0906, + "step": 14659 + }, + { + "epoch": 0.6201878331500127, + "grad_norm": 0.18971355259418488, + "learning_rate": 0.001, + "loss": 1.6979, + "step": 14660 + }, + { + "epoch": 0.6202301379135291, + "grad_norm": 0.1920858919620514, + "learning_rate": 0.001, + "loss": 2.9302, + "step": 14661 + }, + { + "epoch": 0.6202724426770454, + "grad_norm": 2.1202192306518555, + "learning_rate": 0.001, + "loss": 2.9636, + "step": 14662 + }, + { + "epoch": 0.6203147474405618, + "grad_norm": 0.1833968460559845, + "learning_rate": 0.001, + "loss": 2.9163, + "step": 14663 + }, + { + "epoch": 0.6203570522040782, + "grad_norm": 1.7556793689727783, + "learning_rate": 0.001, + "loss": 2.1679, + "step": 14664 + }, + { + "epoch": 0.6203993569675945, + "grad_norm": 0.1881345808506012, + "learning_rate": 0.001, + "loss": 1.4928, + "step": 14665 + }, + { + "epoch": 0.6204416617311109, + "grad_norm": 0.23631784319877625, + "learning_rate": 0.001, + "loss": 2.231, + "step": 14666 + }, + { + "epoch": 0.6204839664946273, + "grad_norm": 0.23086075484752655, + "learning_rate": 0.001, + "loss": 2.1282, + "step": 14667 + }, + { + "epoch": 0.6205262712581436, + "grad_norm": 0.23201145231723785, + "learning_rate": 0.001, + "loss": 2.5473, + "step": 14668 + }, + { + "epoch": 0.62056857602166, + "grad_norm": 0.20039266347885132, + "learning_rate": 0.001, + "loss": 2.1767, + "step": 14669 + }, + { + "epoch": 0.6206108807851765, + "grad_norm": 0.2201448231935501, + "learning_rate": 0.001, + "loss": 2.3714, + "step": 14670 + }, + { + "epoch": 0.6206531855486928, + "grad_norm": 0.25646641850471497, + "learning_rate": 0.001, + "loss": 2.2821, + "step": 14671 + }, + { + "epoch": 0.6206954903122092, + "grad_norm": 0.2379496693611145, + "learning_rate": 0.001, + "loss": 2.428, + "step": 14672 + }, + { + "epoch": 0.6207377950757256, + "grad_norm": 0.6765031218528748, + "learning_rate": 0.001, + "loss": 2.5439, + "step": 14673 + }, + { + "epoch": 0.6207800998392419, + "grad_norm": 0.23694269359111786, + "learning_rate": 0.001, + "loss": 3.0224, + "step": 14674 + }, + { + "epoch": 0.6208224046027583, + "grad_norm": 0.22863154113292694, + "learning_rate": 0.001, + "loss": 2.0422, + "step": 14675 + }, + { + "epoch": 0.6208647093662747, + "grad_norm": 0.2052396535873413, + "learning_rate": 0.001, + "loss": 2.1184, + "step": 14676 + }, + { + "epoch": 0.620907014129791, + "grad_norm": 0.42548587918281555, + "learning_rate": 0.001, + "loss": 2.3153, + "step": 14677 + }, + { + "epoch": 0.6209493188933074, + "grad_norm": 0.20352311432361603, + "learning_rate": 0.001, + "loss": 2.7797, + "step": 14678 + }, + { + "epoch": 0.6209916236568238, + "grad_norm": 1.041845440864563, + "learning_rate": 0.001, + "loss": 1.669, + "step": 14679 + }, + { + "epoch": 0.6210339284203401, + "grad_norm": 0.19072070717811584, + "learning_rate": 0.001, + "loss": 2.6243, + "step": 14680 + }, + { + "epoch": 0.6210762331838565, + "grad_norm": 0.17468595504760742, + "learning_rate": 0.001, + "loss": 2.0063, + "step": 14681 + }, + { + "epoch": 0.6211185379473729, + "grad_norm": 0.18021397292613983, + "learning_rate": 0.001, + "loss": 1.8637, + "step": 14682 + }, + { + "epoch": 0.6211608427108892, + "grad_norm": 0.1784118413925171, + "learning_rate": 0.001, + "loss": 2.5515, + "step": 14683 + }, + { + "epoch": 0.6212031474744056, + "grad_norm": 0.17213745415210724, + "learning_rate": 0.001, + "loss": 1.9208, + "step": 14684 + }, + { + "epoch": 0.6212454522379219, + "grad_norm": 0.16187311708927155, + "learning_rate": 0.001, + "loss": 2.6264, + "step": 14685 + }, + { + "epoch": 0.6212877570014383, + "grad_norm": 0.1690584123134613, + "learning_rate": 0.001, + "loss": 2.0176, + "step": 14686 + }, + { + "epoch": 0.6213300617649548, + "grad_norm": 0.22212959825992584, + "learning_rate": 0.001, + "loss": 2.0443, + "step": 14687 + }, + { + "epoch": 0.621372366528471, + "grad_norm": 0.19889722764492035, + "learning_rate": 0.001, + "loss": 2.108, + "step": 14688 + }, + { + "epoch": 0.6214146712919875, + "grad_norm": 0.20588736236095428, + "learning_rate": 0.001, + "loss": 2.604, + "step": 14689 + }, + { + "epoch": 0.6214569760555039, + "grad_norm": 0.15710745751857758, + "learning_rate": 0.001, + "loss": 1.642, + "step": 14690 + }, + { + "epoch": 0.6214992808190202, + "grad_norm": 1.2365772724151611, + "learning_rate": 0.001, + "loss": 1.5983, + "step": 14691 + }, + { + "epoch": 0.6215415855825366, + "grad_norm": 0.1592063158750534, + "learning_rate": 0.001, + "loss": 1.7246, + "step": 14692 + }, + { + "epoch": 0.621583890346053, + "grad_norm": 0.2230633795261383, + "learning_rate": 0.001, + "loss": 2.5722, + "step": 14693 + }, + { + "epoch": 0.6216261951095693, + "grad_norm": 0.17436636984348297, + "learning_rate": 0.001, + "loss": 1.6434, + "step": 14694 + }, + { + "epoch": 0.6216684998730857, + "grad_norm": 0.1729450523853302, + "learning_rate": 0.001, + "loss": 2.0217, + "step": 14695 + }, + { + "epoch": 0.6217108046366021, + "grad_norm": 0.16045428812503815, + "learning_rate": 0.001, + "loss": 1.4028, + "step": 14696 + }, + { + "epoch": 0.6217531094001184, + "grad_norm": 0.1945531815290451, + "learning_rate": 0.001, + "loss": 2.3836, + "step": 14697 + }, + { + "epoch": 0.6217954141636348, + "grad_norm": 3.1854636669158936, + "learning_rate": 0.001, + "loss": 3.0657, + "step": 14698 + }, + { + "epoch": 0.6218377189271512, + "grad_norm": 0.15348710119724274, + "learning_rate": 0.001, + "loss": 2.0376, + "step": 14699 + }, + { + "epoch": 0.6218800236906675, + "grad_norm": 0.16753935813903809, + "learning_rate": 0.001, + "loss": 1.6844, + "step": 14700 + }, + { + "epoch": 0.6219223284541839, + "grad_norm": 0.1476292461156845, + "learning_rate": 0.001, + "loss": 1.878, + "step": 14701 + }, + { + "epoch": 0.6219646332177003, + "grad_norm": 0.21797721087932587, + "learning_rate": 0.001, + "loss": 1.692, + "step": 14702 + }, + { + "epoch": 0.6220069379812166, + "grad_norm": 0.176117941737175, + "learning_rate": 0.001, + "loss": 2.2716, + "step": 14703 + }, + { + "epoch": 0.622049242744733, + "grad_norm": 4.9479660987854, + "learning_rate": 0.001, + "loss": 1.8148, + "step": 14704 + }, + { + "epoch": 0.6220915475082495, + "grad_norm": 0.16278578341007233, + "learning_rate": 0.001, + "loss": 1.9686, + "step": 14705 + }, + { + "epoch": 0.6221338522717658, + "grad_norm": 0.19134145975112915, + "learning_rate": 0.001, + "loss": 2.3014, + "step": 14706 + }, + { + "epoch": 0.6221761570352822, + "grad_norm": 0.1470852494239807, + "learning_rate": 0.001, + "loss": 2.4502, + "step": 14707 + }, + { + "epoch": 0.6222184617987986, + "grad_norm": 0.3955981731414795, + "learning_rate": 0.001, + "loss": 2.0912, + "step": 14708 + }, + { + "epoch": 0.6222607665623149, + "grad_norm": 0.2723364233970642, + "learning_rate": 0.001, + "loss": 1.8479, + "step": 14709 + }, + { + "epoch": 0.6223030713258313, + "grad_norm": 0.15754322707653046, + "learning_rate": 0.001, + "loss": 2.4443, + "step": 14710 + }, + { + "epoch": 0.6223453760893477, + "grad_norm": 0.21072664856910706, + "learning_rate": 0.001, + "loss": 1.9223, + "step": 14711 + }, + { + "epoch": 0.622387680852864, + "grad_norm": 0.17302967607975006, + "learning_rate": 0.001, + "loss": 2.3223, + "step": 14712 + }, + { + "epoch": 0.6224299856163804, + "grad_norm": 0.17941580712795258, + "learning_rate": 0.001, + "loss": 1.7731, + "step": 14713 + }, + { + "epoch": 0.6224722903798968, + "grad_norm": 0.325706422328949, + "learning_rate": 0.001, + "loss": 2.6128, + "step": 14714 + }, + { + "epoch": 0.6225145951434131, + "grad_norm": 1.440018653869629, + "learning_rate": 0.001, + "loss": 1.9563, + "step": 14715 + }, + { + "epoch": 0.6225568999069295, + "grad_norm": 0.1713675558567047, + "learning_rate": 0.001, + "loss": 2.3617, + "step": 14716 + }, + { + "epoch": 0.6225992046704459, + "grad_norm": 0.19308139383792877, + "learning_rate": 0.001, + "loss": 1.6729, + "step": 14717 + }, + { + "epoch": 0.6226415094339622, + "grad_norm": 0.17297999560832977, + "learning_rate": 0.001, + "loss": 3.6168, + "step": 14718 + }, + { + "epoch": 0.6226838141974786, + "grad_norm": 0.27014705538749695, + "learning_rate": 0.001, + "loss": 3.2252, + "step": 14719 + }, + { + "epoch": 0.6227261189609951, + "grad_norm": 0.15865100920200348, + "learning_rate": 0.001, + "loss": 1.4966, + "step": 14720 + }, + { + "epoch": 0.6227684237245114, + "grad_norm": 0.14343230426311493, + "learning_rate": 0.001, + "loss": 1.711, + "step": 14721 + }, + { + "epoch": 0.6228107284880278, + "grad_norm": 0.1730983704328537, + "learning_rate": 0.001, + "loss": 2.159, + "step": 14722 + }, + { + "epoch": 0.6228530332515442, + "grad_norm": 0.20309610664844513, + "learning_rate": 0.001, + "loss": 2.1347, + "step": 14723 + }, + { + "epoch": 0.6228953380150605, + "grad_norm": 0.9017519950866699, + "learning_rate": 0.001, + "loss": 1.8289, + "step": 14724 + }, + { + "epoch": 0.6229376427785769, + "grad_norm": 0.17153169214725494, + "learning_rate": 0.001, + "loss": 2.527, + "step": 14725 + }, + { + "epoch": 0.6229799475420933, + "grad_norm": 0.1731494665145874, + "learning_rate": 0.001, + "loss": 1.3847, + "step": 14726 + }, + { + "epoch": 0.6230222523056096, + "grad_norm": 0.20264741778373718, + "learning_rate": 0.001, + "loss": 2.6843, + "step": 14727 + }, + { + "epoch": 0.623064557069126, + "grad_norm": 0.1755369007587433, + "learning_rate": 0.001, + "loss": 1.8534, + "step": 14728 + }, + { + "epoch": 0.6231068618326423, + "grad_norm": 0.1698158234357834, + "learning_rate": 0.001, + "loss": 2.4937, + "step": 14729 + }, + { + "epoch": 0.6231491665961587, + "grad_norm": 0.3321411907672882, + "learning_rate": 0.001, + "loss": 1.902, + "step": 14730 + }, + { + "epoch": 0.6231914713596751, + "grad_norm": 0.1981087177991867, + "learning_rate": 0.001, + "loss": 2.086, + "step": 14731 + }, + { + "epoch": 0.6232337761231914, + "grad_norm": 0.14606229960918427, + "learning_rate": 0.001, + "loss": 1.7585, + "step": 14732 + }, + { + "epoch": 0.6232760808867078, + "grad_norm": 0.15275610983371735, + "learning_rate": 0.001, + "loss": 2.0629, + "step": 14733 + }, + { + "epoch": 0.6233183856502242, + "grad_norm": 0.3921429514884949, + "learning_rate": 0.001, + "loss": 2.0281, + "step": 14734 + }, + { + "epoch": 0.6233606904137405, + "grad_norm": 0.15665315091609955, + "learning_rate": 0.001, + "loss": 1.9623, + "step": 14735 + }, + { + "epoch": 0.623402995177257, + "grad_norm": 0.1723744422197342, + "learning_rate": 0.001, + "loss": 2.2255, + "step": 14736 + }, + { + "epoch": 0.6234452999407734, + "grad_norm": 0.16472692787647247, + "learning_rate": 0.001, + "loss": 2.3605, + "step": 14737 + }, + { + "epoch": 0.6234876047042897, + "grad_norm": 0.22719836235046387, + "learning_rate": 0.001, + "loss": 2.9966, + "step": 14738 + }, + { + "epoch": 0.6235299094678061, + "grad_norm": 0.5726194381713867, + "learning_rate": 0.001, + "loss": 2.0207, + "step": 14739 + }, + { + "epoch": 0.6235722142313225, + "grad_norm": 0.16665321588516235, + "learning_rate": 0.001, + "loss": 2.1726, + "step": 14740 + }, + { + "epoch": 0.6236145189948388, + "grad_norm": 0.15230652689933777, + "learning_rate": 0.001, + "loss": 2.2088, + "step": 14741 + }, + { + "epoch": 0.6236568237583552, + "grad_norm": 0.17324359714984894, + "learning_rate": 0.001, + "loss": 2.0516, + "step": 14742 + }, + { + "epoch": 0.6236991285218716, + "grad_norm": 0.24778017401695251, + "learning_rate": 0.001, + "loss": 1.798, + "step": 14743 + }, + { + "epoch": 0.6237414332853879, + "grad_norm": 0.18029212951660156, + "learning_rate": 0.001, + "loss": 1.5368, + "step": 14744 + }, + { + "epoch": 0.6237837380489043, + "grad_norm": 0.17106720805168152, + "learning_rate": 0.001, + "loss": 2.5262, + "step": 14745 + }, + { + "epoch": 0.6238260428124207, + "grad_norm": 0.14900583028793335, + "learning_rate": 0.001, + "loss": 1.2427, + "step": 14746 + }, + { + "epoch": 0.623868347575937, + "grad_norm": 0.2502407133579254, + "learning_rate": 0.001, + "loss": 3.2134, + "step": 14747 + }, + { + "epoch": 0.6239106523394534, + "grad_norm": 0.21255750954151154, + "learning_rate": 0.001, + "loss": 2.0656, + "step": 14748 + }, + { + "epoch": 0.6239529571029698, + "grad_norm": 0.15253281593322754, + "learning_rate": 0.001, + "loss": 3.1515, + "step": 14749 + }, + { + "epoch": 0.6239952618664861, + "grad_norm": 0.15575817227363586, + "learning_rate": 0.001, + "loss": 2.1155, + "step": 14750 + }, + { + "epoch": 0.6240375666300025, + "grad_norm": 0.26133993268013, + "learning_rate": 0.001, + "loss": 2.7806, + "step": 14751 + }, + { + "epoch": 0.624079871393519, + "grad_norm": 0.15048684179782867, + "learning_rate": 0.001, + "loss": 1.8556, + "step": 14752 + }, + { + "epoch": 0.6241221761570352, + "grad_norm": 0.2583177387714386, + "learning_rate": 0.001, + "loss": 2.298, + "step": 14753 + }, + { + "epoch": 0.6241644809205517, + "grad_norm": 0.15635612607002258, + "learning_rate": 0.001, + "loss": 1.8436, + "step": 14754 + }, + { + "epoch": 0.6242067856840681, + "grad_norm": 0.3417166769504547, + "learning_rate": 0.001, + "loss": 2.3692, + "step": 14755 + }, + { + "epoch": 0.6242490904475844, + "grad_norm": 0.29285457730293274, + "learning_rate": 0.001, + "loss": 2.2755, + "step": 14756 + }, + { + "epoch": 0.6242913952111008, + "grad_norm": 0.2324390560388565, + "learning_rate": 0.001, + "loss": 2.2066, + "step": 14757 + }, + { + "epoch": 0.6243336999746172, + "grad_norm": 0.15262608230113983, + "learning_rate": 0.001, + "loss": 1.5537, + "step": 14758 + }, + { + "epoch": 0.6243760047381335, + "grad_norm": 0.1738661378622055, + "learning_rate": 0.001, + "loss": 2.2254, + "step": 14759 + }, + { + "epoch": 0.6244183095016499, + "grad_norm": 0.17284923791885376, + "learning_rate": 0.001, + "loss": 2.619, + "step": 14760 + }, + { + "epoch": 0.6244606142651663, + "grad_norm": 0.16344395279884338, + "learning_rate": 0.001, + "loss": 1.8052, + "step": 14761 + }, + { + "epoch": 0.6245029190286826, + "grad_norm": 1.0341507196426392, + "learning_rate": 0.001, + "loss": 2.4716, + "step": 14762 + }, + { + "epoch": 0.624545223792199, + "grad_norm": 0.17319472134113312, + "learning_rate": 0.001, + "loss": 1.7911, + "step": 14763 + }, + { + "epoch": 0.6245875285557154, + "grad_norm": 0.1723223626613617, + "learning_rate": 0.001, + "loss": 2.2801, + "step": 14764 + }, + { + "epoch": 0.6246298333192317, + "grad_norm": 0.16963432729244232, + "learning_rate": 0.001, + "loss": 1.8165, + "step": 14765 + }, + { + "epoch": 0.6246721380827481, + "grad_norm": 0.1635103076696396, + "learning_rate": 0.001, + "loss": 2.3229, + "step": 14766 + }, + { + "epoch": 0.6247144428462645, + "grad_norm": 0.14711347222328186, + "learning_rate": 0.001, + "loss": 2.0542, + "step": 14767 + }, + { + "epoch": 0.6247567476097808, + "grad_norm": 0.15598388016223907, + "learning_rate": 0.001, + "loss": 3.1843, + "step": 14768 + }, + { + "epoch": 0.6247990523732972, + "grad_norm": 0.15948522090911865, + "learning_rate": 0.001, + "loss": 3.8822, + "step": 14769 + }, + { + "epoch": 0.6248413571368137, + "grad_norm": 0.1707606315612793, + "learning_rate": 0.001, + "loss": 1.5173, + "step": 14770 + }, + { + "epoch": 0.62488366190033, + "grad_norm": 0.20455005764961243, + "learning_rate": 0.001, + "loss": 2.8538, + "step": 14771 + }, + { + "epoch": 0.6249259666638464, + "grad_norm": 0.18177711963653564, + "learning_rate": 0.001, + "loss": 1.576, + "step": 14772 + }, + { + "epoch": 0.6249682714273628, + "grad_norm": 0.6637758016586304, + "learning_rate": 0.001, + "loss": 3.9036, + "step": 14773 + }, + { + "epoch": 0.6250105761908791, + "grad_norm": 0.14505170285701752, + "learning_rate": 0.001, + "loss": 1.3907, + "step": 14774 + }, + { + "epoch": 0.6250528809543955, + "grad_norm": 0.1670999825000763, + "learning_rate": 0.001, + "loss": 2.2654, + "step": 14775 + }, + { + "epoch": 0.6250951857179118, + "grad_norm": 0.15915502607822418, + "learning_rate": 0.001, + "loss": 1.5619, + "step": 14776 + }, + { + "epoch": 0.6251374904814282, + "grad_norm": 0.24664057791233063, + "learning_rate": 0.001, + "loss": 2.0549, + "step": 14777 + }, + { + "epoch": 0.6251797952449446, + "grad_norm": 0.19590286910533905, + "learning_rate": 0.001, + "loss": 1.9566, + "step": 14778 + }, + { + "epoch": 0.6252221000084609, + "grad_norm": 0.1494143158197403, + "learning_rate": 0.001, + "loss": 2.6541, + "step": 14779 + }, + { + "epoch": 0.6252644047719773, + "grad_norm": 0.1772175133228302, + "learning_rate": 0.001, + "loss": 2.077, + "step": 14780 + }, + { + "epoch": 0.6253067095354937, + "grad_norm": 0.16731050610542297, + "learning_rate": 0.001, + "loss": 1.8391, + "step": 14781 + }, + { + "epoch": 0.62534901429901, + "grad_norm": 0.18583473563194275, + "learning_rate": 0.001, + "loss": 2.561, + "step": 14782 + }, + { + "epoch": 0.6253913190625264, + "grad_norm": 0.15559391677379608, + "learning_rate": 0.001, + "loss": 1.7544, + "step": 14783 + }, + { + "epoch": 0.6254336238260428, + "grad_norm": 0.16811911761760712, + "learning_rate": 0.001, + "loss": 2.3595, + "step": 14784 + }, + { + "epoch": 0.6254759285895591, + "grad_norm": 0.18134218454360962, + "learning_rate": 0.001, + "loss": 1.9538, + "step": 14785 + }, + { + "epoch": 0.6255182333530755, + "grad_norm": 0.8683392405509949, + "learning_rate": 0.001, + "loss": 2.0525, + "step": 14786 + }, + { + "epoch": 0.625560538116592, + "grad_norm": 0.17296668887138367, + "learning_rate": 0.001, + "loss": 2.1946, + "step": 14787 + }, + { + "epoch": 0.6256028428801083, + "grad_norm": 0.24003466963768005, + "learning_rate": 0.001, + "loss": 2.5681, + "step": 14788 + }, + { + "epoch": 0.6256451476436247, + "grad_norm": 0.1762605756521225, + "learning_rate": 0.001, + "loss": 2.2416, + "step": 14789 + }, + { + "epoch": 0.6256874524071411, + "grad_norm": 0.262921541929245, + "learning_rate": 0.001, + "loss": 3.2333, + "step": 14790 + }, + { + "epoch": 0.6257297571706574, + "grad_norm": 0.2003992646932602, + "learning_rate": 0.001, + "loss": 2.1344, + "step": 14791 + }, + { + "epoch": 0.6257720619341738, + "grad_norm": 0.1967916488647461, + "learning_rate": 0.001, + "loss": 1.5399, + "step": 14792 + }, + { + "epoch": 0.6258143666976902, + "grad_norm": 0.18930691480636597, + "learning_rate": 0.001, + "loss": 1.3929, + "step": 14793 + }, + { + "epoch": 0.6258566714612065, + "grad_norm": 0.6461501121520996, + "learning_rate": 0.001, + "loss": 1.8684, + "step": 14794 + }, + { + "epoch": 0.6258989762247229, + "grad_norm": 0.16277913749217987, + "learning_rate": 0.001, + "loss": 2.1545, + "step": 14795 + }, + { + "epoch": 0.6259412809882393, + "grad_norm": 0.13681641221046448, + "learning_rate": 0.001, + "loss": 1.4387, + "step": 14796 + }, + { + "epoch": 0.6259835857517556, + "grad_norm": 0.49204564094543457, + "learning_rate": 0.001, + "loss": 1.716, + "step": 14797 + }, + { + "epoch": 0.626025890515272, + "grad_norm": 0.2248888909816742, + "learning_rate": 0.001, + "loss": 2.0344, + "step": 14798 + }, + { + "epoch": 0.6260681952787884, + "grad_norm": 0.1486063003540039, + "learning_rate": 0.001, + "loss": 1.7528, + "step": 14799 + }, + { + "epoch": 0.6261105000423047, + "grad_norm": 0.16557568311691284, + "learning_rate": 0.001, + "loss": 1.7937, + "step": 14800 + }, + { + "epoch": 0.6261528048058211, + "grad_norm": 0.2470521777868271, + "learning_rate": 0.001, + "loss": 1.88, + "step": 14801 + }, + { + "epoch": 0.6261951095693375, + "grad_norm": 1.031522274017334, + "learning_rate": 0.001, + "loss": 2.0462, + "step": 14802 + }, + { + "epoch": 0.6262374143328538, + "grad_norm": 0.39598196744918823, + "learning_rate": 0.001, + "loss": 1.4901, + "step": 14803 + }, + { + "epoch": 0.6262797190963703, + "grad_norm": 0.1830819547176361, + "learning_rate": 0.001, + "loss": 1.7788, + "step": 14804 + }, + { + "epoch": 0.6263220238598867, + "grad_norm": 0.18388625979423523, + "learning_rate": 0.001, + "loss": 2.308, + "step": 14805 + }, + { + "epoch": 0.626364328623403, + "grad_norm": 1.8769688606262207, + "learning_rate": 0.001, + "loss": 2.2326, + "step": 14806 + }, + { + "epoch": 0.6264066333869194, + "grad_norm": 0.1682252585887909, + "learning_rate": 0.001, + "loss": 1.7469, + "step": 14807 + }, + { + "epoch": 0.6264489381504358, + "grad_norm": 0.1951892077922821, + "learning_rate": 0.001, + "loss": 3.3156, + "step": 14808 + }, + { + "epoch": 0.6264912429139521, + "grad_norm": 0.20327278971672058, + "learning_rate": 0.001, + "loss": 2.7878, + "step": 14809 + }, + { + "epoch": 0.6265335476774685, + "grad_norm": 7.379302024841309, + "learning_rate": 0.001, + "loss": 2.2043, + "step": 14810 + }, + { + "epoch": 0.6265758524409849, + "grad_norm": 0.19853001832962036, + "learning_rate": 0.001, + "loss": 2.6433, + "step": 14811 + }, + { + "epoch": 0.6266181572045012, + "grad_norm": 0.4619123637676239, + "learning_rate": 0.001, + "loss": 1.9174, + "step": 14812 + }, + { + "epoch": 0.6266604619680176, + "grad_norm": 0.167628213763237, + "learning_rate": 0.001, + "loss": 1.3258, + "step": 14813 + }, + { + "epoch": 0.626702766731534, + "grad_norm": 1.1932356357574463, + "learning_rate": 0.001, + "loss": 2.1188, + "step": 14814 + }, + { + "epoch": 0.6267450714950503, + "grad_norm": 0.21053379774093628, + "learning_rate": 0.001, + "loss": 3.2038, + "step": 14815 + }, + { + "epoch": 0.6267873762585667, + "grad_norm": 1.0147972106933594, + "learning_rate": 0.001, + "loss": 3.0992, + "step": 14816 + }, + { + "epoch": 0.6268296810220831, + "grad_norm": 0.2153492271900177, + "learning_rate": 0.001, + "loss": 2.7228, + "step": 14817 + }, + { + "epoch": 0.6268719857855994, + "grad_norm": 0.4768437147140503, + "learning_rate": 0.001, + "loss": 1.9515, + "step": 14818 + }, + { + "epoch": 0.6269142905491158, + "grad_norm": 0.6105790138244629, + "learning_rate": 0.001, + "loss": 2.5721, + "step": 14819 + }, + { + "epoch": 0.6269565953126321, + "grad_norm": 0.17983876168727875, + "learning_rate": 0.001, + "loss": 1.896, + "step": 14820 + }, + { + "epoch": 0.6269989000761486, + "grad_norm": 0.23579160869121552, + "learning_rate": 0.001, + "loss": 1.8257, + "step": 14821 + }, + { + "epoch": 0.627041204839665, + "grad_norm": 0.14642800390720367, + "learning_rate": 0.001, + "loss": 1.7842, + "step": 14822 + }, + { + "epoch": 0.6270835096031813, + "grad_norm": 0.23658250272274017, + "learning_rate": 0.001, + "loss": 2.5395, + "step": 14823 + }, + { + "epoch": 0.6271258143666977, + "grad_norm": 0.18231017887592316, + "learning_rate": 0.001, + "loss": 2.6851, + "step": 14824 + }, + { + "epoch": 0.6271681191302141, + "grad_norm": 0.22886481881141663, + "learning_rate": 0.001, + "loss": 2.5188, + "step": 14825 + }, + { + "epoch": 0.6272104238937304, + "grad_norm": 0.32207468152046204, + "learning_rate": 0.001, + "loss": 2.9632, + "step": 14826 + }, + { + "epoch": 0.6272527286572468, + "grad_norm": 0.2477169632911682, + "learning_rate": 0.001, + "loss": 2.8172, + "step": 14827 + }, + { + "epoch": 0.6272950334207632, + "grad_norm": 0.2553689479827881, + "learning_rate": 0.001, + "loss": 2.035, + "step": 14828 + }, + { + "epoch": 0.6273373381842795, + "grad_norm": 0.16400131583213806, + "learning_rate": 0.001, + "loss": 2.1881, + "step": 14829 + }, + { + "epoch": 0.6273796429477959, + "grad_norm": 0.47019270062446594, + "learning_rate": 0.001, + "loss": 2.8494, + "step": 14830 + }, + { + "epoch": 0.6274219477113123, + "grad_norm": 0.3059365153312683, + "learning_rate": 0.001, + "loss": 2.2702, + "step": 14831 + }, + { + "epoch": 0.6274642524748286, + "grad_norm": 0.21573302149772644, + "learning_rate": 0.001, + "loss": 2.0846, + "step": 14832 + }, + { + "epoch": 0.627506557238345, + "grad_norm": 0.168625608086586, + "learning_rate": 0.001, + "loss": 1.7721, + "step": 14833 + }, + { + "epoch": 0.6275488620018614, + "grad_norm": 0.15803949534893036, + "learning_rate": 0.001, + "loss": 1.42, + "step": 14834 + }, + { + "epoch": 0.6275911667653777, + "grad_norm": 0.22716844081878662, + "learning_rate": 0.001, + "loss": 1.655, + "step": 14835 + }, + { + "epoch": 0.6276334715288941, + "grad_norm": 0.2481343150138855, + "learning_rate": 0.001, + "loss": 2.1238, + "step": 14836 + }, + { + "epoch": 0.6276757762924106, + "grad_norm": 0.21885570883750916, + "learning_rate": 0.001, + "loss": 2.5259, + "step": 14837 + }, + { + "epoch": 0.6277180810559269, + "grad_norm": 0.22164210677146912, + "learning_rate": 0.001, + "loss": 2.5285, + "step": 14838 + }, + { + "epoch": 0.6277603858194433, + "grad_norm": 0.17463375627994537, + "learning_rate": 0.001, + "loss": 2.1484, + "step": 14839 + }, + { + "epoch": 0.6278026905829597, + "grad_norm": 0.38111189007759094, + "learning_rate": 0.001, + "loss": 3.1991, + "step": 14840 + }, + { + "epoch": 0.627844995346476, + "grad_norm": 0.2066563367843628, + "learning_rate": 0.001, + "loss": 1.804, + "step": 14841 + }, + { + "epoch": 0.6278873001099924, + "grad_norm": 0.23109540343284607, + "learning_rate": 0.001, + "loss": 1.7063, + "step": 14842 + }, + { + "epoch": 0.6279296048735088, + "grad_norm": 0.1922856718301773, + "learning_rate": 0.001, + "loss": 1.6542, + "step": 14843 + }, + { + "epoch": 0.6279719096370251, + "grad_norm": 0.1745460331439972, + "learning_rate": 0.001, + "loss": 2.2544, + "step": 14844 + }, + { + "epoch": 0.6280142144005415, + "grad_norm": 0.16005033254623413, + "learning_rate": 0.001, + "loss": 1.6839, + "step": 14845 + }, + { + "epoch": 0.6280565191640579, + "grad_norm": 0.14515602588653564, + "learning_rate": 0.001, + "loss": 2.7744, + "step": 14846 + }, + { + "epoch": 0.6280988239275742, + "grad_norm": 0.1645001471042633, + "learning_rate": 0.001, + "loss": 1.6844, + "step": 14847 + }, + { + "epoch": 0.6281411286910906, + "grad_norm": 0.19100764393806458, + "learning_rate": 0.001, + "loss": 2.6849, + "step": 14848 + }, + { + "epoch": 0.628183433454607, + "grad_norm": 0.1547725945711136, + "learning_rate": 0.001, + "loss": 1.6892, + "step": 14849 + }, + { + "epoch": 0.6282257382181233, + "grad_norm": 0.15877199172973633, + "learning_rate": 0.001, + "loss": 2.3236, + "step": 14850 + }, + { + "epoch": 0.6282680429816397, + "grad_norm": 0.858564555644989, + "learning_rate": 0.001, + "loss": 2.6084, + "step": 14851 + }, + { + "epoch": 0.6283103477451562, + "grad_norm": 0.15130288898944855, + "learning_rate": 0.001, + "loss": 2.6856, + "step": 14852 + }, + { + "epoch": 0.6283526525086724, + "grad_norm": 0.18292021751403809, + "learning_rate": 0.001, + "loss": 1.7743, + "step": 14853 + }, + { + "epoch": 0.6283949572721889, + "grad_norm": 0.17332005500793457, + "learning_rate": 0.001, + "loss": 2.0246, + "step": 14854 + }, + { + "epoch": 0.6284372620357053, + "grad_norm": 0.2719305455684662, + "learning_rate": 0.001, + "loss": 2.222, + "step": 14855 + }, + { + "epoch": 0.6284795667992216, + "grad_norm": 0.17727939784526825, + "learning_rate": 0.001, + "loss": 2.3099, + "step": 14856 + }, + { + "epoch": 0.628521871562738, + "grad_norm": 0.21192631125450134, + "learning_rate": 0.001, + "loss": 2.0069, + "step": 14857 + }, + { + "epoch": 0.6285641763262544, + "grad_norm": 0.16656994819641113, + "learning_rate": 0.001, + "loss": 2.1416, + "step": 14858 + }, + { + "epoch": 0.6286064810897707, + "grad_norm": 0.14432191848754883, + "learning_rate": 0.001, + "loss": 1.9373, + "step": 14859 + }, + { + "epoch": 0.6286487858532871, + "grad_norm": 0.16526567935943604, + "learning_rate": 0.001, + "loss": 2.9011, + "step": 14860 + }, + { + "epoch": 0.6286910906168035, + "grad_norm": 0.35414940118789673, + "learning_rate": 0.001, + "loss": 1.6422, + "step": 14861 + }, + { + "epoch": 0.6287333953803198, + "grad_norm": 1.0360305309295654, + "learning_rate": 0.001, + "loss": 1.7833, + "step": 14862 + }, + { + "epoch": 0.6287757001438362, + "grad_norm": 0.18930722773075104, + "learning_rate": 0.001, + "loss": 1.7108, + "step": 14863 + }, + { + "epoch": 0.6288180049073525, + "grad_norm": 0.23029504716396332, + "learning_rate": 0.001, + "loss": 2.3319, + "step": 14864 + }, + { + "epoch": 0.6288603096708689, + "grad_norm": 0.1663587987422943, + "learning_rate": 0.001, + "loss": 2.4982, + "step": 14865 + }, + { + "epoch": 0.6289026144343853, + "grad_norm": 0.14894288778305054, + "learning_rate": 0.001, + "loss": 2.4744, + "step": 14866 + }, + { + "epoch": 0.6289449191979016, + "grad_norm": 0.1686108112335205, + "learning_rate": 0.001, + "loss": 2.1875, + "step": 14867 + }, + { + "epoch": 0.628987223961418, + "grad_norm": 0.28346070647239685, + "learning_rate": 0.001, + "loss": 1.7888, + "step": 14868 + }, + { + "epoch": 0.6290295287249345, + "grad_norm": 0.2675085961818695, + "learning_rate": 0.001, + "loss": 2.2643, + "step": 14869 + }, + { + "epoch": 0.6290718334884507, + "grad_norm": 0.5835021734237671, + "learning_rate": 0.001, + "loss": 2.4189, + "step": 14870 + }, + { + "epoch": 0.6291141382519672, + "grad_norm": 0.1684013456106186, + "learning_rate": 0.001, + "loss": 2.6277, + "step": 14871 + }, + { + "epoch": 0.6291564430154836, + "grad_norm": 0.26715943217277527, + "learning_rate": 0.001, + "loss": 2.1086, + "step": 14872 + }, + { + "epoch": 0.6291987477789999, + "grad_norm": 0.1863265186548233, + "learning_rate": 0.001, + "loss": 2.702, + "step": 14873 + }, + { + "epoch": 0.6292410525425163, + "grad_norm": 0.17459061741828918, + "learning_rate": 0.001, + "loss": 1.8087, + "step": 14874 + }, + { + "epoch": 0.6292833573060327, + "grad_norm": 0.19673362374305725, + "learning_rate": 0.001, + "loss": 3.9467, + "step": 14875 + }, + { + "epoch": 0.629325662069549, + "grad_norm": 0.19001983106136322, + "learning_rate": 0.001, + "loss": 1.4823, + "step": 14876 + }, + { + "epoch": 0.6293679668330654, + "grad_norm": 0.24293777346611023, + "learning_rate": 0.001, + "loss": 3.0979, + "step": 14877 + }, + { + "epoch": 0.6294102715965818, + "grad_norm": 0.2093208283185959, + "learning_rate": 0.001, + "loss": 2.5133, + "step": 14878 + }, + { + "epoch": 0.6294525763600981, + "grad_norm": 0.1860685646533966, + "learning_rate": 0.001, + "loss": 1.69, + "step": 14879 + }, + { + "epoch": 0.6294948811236145, + "grad_norm": 2.451101303100586, + "learning_rate": 0.001, + "loss": 1.933, + "step": 14880 + }, + { + "epoch": 0.6295371858871309, + "grad_norm": 0.16000764071941376, + "learning_rate": 0.001, + "loss": 1.6052, + "step": 14881 + }, + { + "epoch": 0.6295794906506472, + "grad_norm": 0.17317312955856323, + "learning_rate": 0.001, + "loss": 1.7292, + "step": 14882 + }, + { + "epoch": 0.6296217954141636, + "grad_norm": 0.15874236822128296, + "learning_rate": 0.001, + "loss": 2.214, + "step": 14883 + }, + { + "epoch": 0.62966410017768, + "grad_norm": 0.16798686981201172, + "learning_rate": 0.001, + "loss": 1.7421, + "step": 14884 + }, + { + "epoch": 0.6297064049411963, + "grad_norm": 8.583555221557617, + "learning_rate": 0.001, + "loss": 2.7575, + "step": 14885 + }, + { + "epoch": 0.6297487097047128, + "grad_norm": 0.20271947979927063, + "learning_rate": 0.001, + "loss": 2.0891, + "step": 14886 + }, + { + "epoch": 0.6297910144682292, + "grad_norm": 0.19663158059120178, + "learning_rate": 0.001, + "loss": 2.0016, + "step": 14887 + }, + { + "epoch": 0.6298333192317455, + "grad_norm": 0.15687520802021027, + "learning_rate": 0.001, + "loss": 1.4679, + "step": 14888 + }, + { + "epoch": 0.6298756239952619, + "grad_norm": 0.6638527512550354, + "learning_rate": 0.001, + "loss": 2.6785, + "step": 14889 + }, + { + "epoch": 0.6299179287587783, + "grad_norm": 0.1655256301164627, + "learning_rate": 0.001, + "loss": 1.9314, + "step": 14890 + }, + { + "epoch": 0.6299602335222946, + "grad_norm": 0.21192145347595215, + "learning_rate": 0.001, + "loss": 2.146, + "step": 14891 + }, + { + "epoch": 0.630002538285811, + "grad_norm": 0.1955963522195816, + "learning_rate": 0.001, + "loss": 1.6481, + "step": 14892 + }, + { + "epoch": 0.6300448430493274, + "grad_norm": 30.6153621673584, + "learning_rate": 0.001, + "loss": 3.6619, + "step": 14893 + }, + { + "epoch": 0.6300871478128437, + "grad_norm": 0.2149381786584854, + "learning_rate": 0.001, + "loss": 3.3041, + "step": 14894 + }, + { + "epoch": 0.6301294525763601, + "grad_norm": 0.19510112702846527, + "learning_rate": 0.001, + "loss": 1.7846, + "step": 14895 + }, + { + "epoch": 0.6301717573398765, + "grad_norm": 1.3764687776565552, + "learning_rate": 0.001, + "loss": 2.012, + "step": 14896 + }, + { + "epoch": 0.6302140621033928, + "grad_norm": 0.15694308280944824, + "learning_rate": 0.001, + "loss": 1.8974, + "step": 14897 + }, + { + "epoch": 0.6302563668669092, + "grad_norm": 0.2188582718372345, + "learning_rate": 0.001, + "loss": 3.2371, + "step": 14898 + }, + { + "epoch": 0.6302986716304256, + "grad_norm": 0.1506149172782898, + "learning_rate": 0.001, + "loss": 1.5915, + "step": 14899 + }, + { + "epoch": 0.6303409763939419, + "grad_norm": 0.3987337648868561, + "learning_rate": 0.001, + "loss": 2.1704, + "step": 14900 + }, + { + "epoch": 0.6303832811574583, + "grad_norm": 0.19197428226470947, + "learning_rate": 0.001, + "loss": 2.7042, + "step": 14901 + }, + { + "epoch": 0.6304255859209748, + "grad_norm": 0.16541145741939545, + "learning_rate": 0.001, + "loss": 1.7064, + "step": 14902 + }, + { + "epoch": 0.630467890684491, + "grad_norm": 0.3559930622577667, + "learning_rate": 0.001, + "loss": 1.8449, + "step": 14903 + }, + { + "epoch": 0.6305101954480075, + "grad_norm": 0.1497167944908142, + "learning_rate": 0.001, + "loss": 2.1508, + "step": 14904 + }, + { + "epoch": 0.6305525002115239, + "grad_norm": 3.9234423637390137, + "learning_rate": 0.001, + "loss": 2.4774, + "step": 14905 + }, + { + "epoch": 0.6305948049750402, + "grad_norm": 1.5417382717132568, + "learning_rate": 0.001, + "loss": 2.181, + "step": 14906 + }, + { + "epoch": 0.6306371097385566, + "grad_norm": 0.2059686779975891, + "learning_rate": 0.001, + "loss": 1.5838, + "step": 14907 + }, + { + "epoch": 0.630679414502073, + "grad_norm": 0.17090870440006256, + "learning_rate": 0.001, + "loss": 2.7388, + "step": 14908 + }, + { + "epoch": 0.6307217192655893, + "grad_norm": 0.1849461793899536, + "learning_rate": 0.001, + "loss": 1.9216, + "step": 14909 + }, + { + "epoch": 0.6307640240291057, + "grad_norm": 0.22465984523296356, + "learning_rate": 0.001, + "loss": 2.0343, + "step": 14910 + }, + { + "epoch": 0.630806328792622, + "grad_norm": 0.6882874369621277, + "learning_rate": 0.001, + "loss": 2.4926, + "step": 14911 + }, + { + "epoch": 0.6308486335561384, + "grad_norm": 0.43035462498664856, + "learning_rate": 0.001, + "loss": 1.9783, + "step": 14912 + }, + { + "epoch": 0.6308909383196548, + "grad_norm": 0.35573610663414, + "learning_rate": 0.001, + "loss": 2.3259, + "step": 14913 + }, + { + "epoch": 0.6309332430831711, + "grad_norm": 0.18853552639484406, + "learning_rate": 0.001, + "loss": 2.3126, + "step": 14914 + }, + { + "epoch": 0.6309755478466875, + "grad_norm": 0.17522744834423065, + "learning_rate": 0.001, + "loss": 2.5944, + "step": 14915 + }, + { + "epoch": 0.6310178526102039, + "grad_norm": 0.5187592506408691, + "learning_rate": 0.001, + "loss": 2.4785, + "step": 14916 + }, + { + "epoch": 0.6310601573737202, + "grad_norm": 0.184511199593544, + "learning_rate": 0.001, + "loss": 1.9519, + "step": 14917 + }, + { + "epoch": 0.6311024621372366, + "grad_norm": 2.1163647174835205, + "learning_rate": 0.001, + "loss": 2.9251, + "step": 14918 + }, + { + "epoch": 0.631144766900753, + "grad_norm": 0.17871823906898499, + "learning_rate": 0.001, + "loss": 2.1804, + "step": 14919 + }, + { + "epoch": 0.6311870716642694, + "grad_norm": 0.49142828583717346, + "learning_rate": 0.001, + "loss": 2.1875, + "step": 14920 + }, + { + "epoch": 0.6312293764277858, + "grad_norm": 0.294746458530426, + "learning_rate": 0.001, + "loss": 2.2112, + "step": 14921 + }, + { + "epoch": 0.6312716811913022, + "grad_norm": 0.23569050431251526, + "learning_rate": 0.001, + "loss": 2.9051, + "step": 14922 + }, + { + "epoch": 0.6313139859548185, + "grad_norm": 0.20902414619922638, + "learning_rate": 0.001, + "loss": 1.7777, + "step": 14923 + }, + { + "epoch": 0.6313562907183349, + "grad_norm": 0.30001044273376465, + "learning_rate": 0.001, + "loss": 2.2292, + "step": 14924 + }, + { + "epoch": 0.6313985954818513, + "grad_norm": 0.20105008780956268, + "learning_rate": 0.001, + "loss": 2.2452, + "step": 14925 + }, + { + "epoch": 0.6314409002453676, + "grad_norm": 0.4256247282028198, + "learning_rate": 0.001, + "loss": 2.3232, + "step": 14926 + }, + { + "epoch": 0.631483205008884, + "grad_norm": 0.1731926053762436, + "learning_rate": 0.001, + "loss": 1.7504, + "step": 14927 + }, + { + "epoch": 0.6315255097724004, + "grad_norm": 0.16395363211631775, + "learning_rate": 0.001, + "loss": 1.5929, + "step": 14928 + }, + { + "epoch": 0.6315678145359167, + "grad_norm": 0.395588755607605, + "learning_rate": 0.001, + "loss": 2.7332, + "step": 14929 + }, + { + "epoch": 0.6316101192994331, + "grad_norm": 0.1726456880569458, + "learning_rate": 0.001, + "loss": 1.8693, + "step": 14930 + }, + { + "epoch": 0.6316524240629495, + "grad_norm": 0.1603616327047348, + "learning_rate": 0.001, + "loss": 2.5375, + "step": 14931 + }, + { + "epoch": 0.6316947288264658, + "grad_norm": 0.21942928433418274, + "learning_rate": 0.001, + "loss": 2.6368, + "step": 14932 + }, + { + "epoch": 0.6317370335899822, + "grad_norm": 0.20922864973545074, + "learning_rate": 0.001, + "loss": 2.3417, + "step": 14933 + }, + { + "epoch": 0.6317793383534986, + "grad_norm": 0.142195925116539, + "learning_rate": 0.001, + "loss": 2.0823, + "step": 14934 + }, + { + "epoch": 0.6318216431170149, + "grad_norm": 0.4264514744281769, + "learning_rate": 0.001, + "loss": 2.3226, + "step": 14935 + }, + { + "epoch": 0.6318639478805314, + "grad_norm": 0.17566858232021332, + "learning_rate": 0.001, + "loss": 1.4364, + "step": 14936 + }, + { + "epoch": 0.6319062526440478, + "grad_norm": 0.5592827200889587, + "learning_rate": 0.001, + "loss": 2.4408, + "step": 14937 + }, + { + "epoch": 0.6319485574075641, + "grad_norm": 0.4113415479660034, + "learning_rate": 0.001, + "loss": 2.6319, + "step": 14938 + }, + { + "epoch": 0.6319908621710805, + "grad_norm": 0.1990566849708557, + "learning_rate": 0.001, + "loss": 2.4111, + "step": 14939 + }, + { + "epoch": 0.6320331669345969, + "grad_norm": 0.7872762084007263, + "learning_rate": 0.001, + "loss": 2.0307, + "step": 14940 + }, + { + "epoch": 0.6320754716981132, + "grad_norm": 0.18138566613197327, + "learning_rate": 0.001, + "loss": 2.3526, + "step": 14941 + }, + { + "epoch": 0.6321177764616296, + "grad_norm": 0.1856798529624939, + "learning_rate": 0.001, + "loss": 1.9008, + "step": 14942 + }, + { + "epoch": 0.632160081225146, + "grad_norm": 0.1658208668231964, + "learning_rate": 0.001, + "loss": 1.8302, + "step": 14943 + }, + { + "epoch": 0.6322023859886623, + "grad_norm": 0.48950570821762085, + "learning_rate": 0.001, + "loss": 1.6563, + "step": 14944 + }, + { + "epoch": 0.6322446907521787, + "grad_norm": 1.1566777229309082, + "learning_rate": 0.001, + "loss": 2.7926, + "step": 14945 + }, + { + "epoch": 0.6322869955156951, + "grad_norm": 0.20745162665843964, + "learning_rate": 0.001, + "loss": 2.2428, + "step": 14946 + }, + { + "epoch": 0.6323293002792114, + "grad_norm": 0.2187420278787613, + "learning_rate": 0.001, + "loss": 2.3801, + "step": 14947 + }, + { + "epoch": 0.6323716050427278, + "grad_norm": 101.63371276855469, + "learning_rate": 0.001, + "loss": 2.0414, + "step": 14948 + }, + { + "epoch": 0.6324139098062442, + "grad_norm": 0.1789313405752182, + "learning_rate": 0.001, + "loss": 1.9108, + "step": 14949 + }, + { + "epoch": 0.6324562145697605, + "grad_norm": 0.2938269078731537, + "learning_rate": 0.001, + "loss": 1.9316, + "step": 14950 + }, + { + "epoch": 0.6324985193332769, + "grad_norm": 0.32403501868247986, + "learning_rate": 0.001, + "loss": 1.9873, + "step": 14951 + }, + { + "epoch": 0.6325408240967934, + "grad_norm": 0.1873176246881485, + "learning_rate": 0.001, + "loss": 1.8201, + "step": 14952 + }, + { + "epoch": 0.6325831288603097, + "grad_norm": 0.19360357522964478, + "learning_rate": 0.001, + "loss": 2.7862, + "step": 14953 + }, + { + "epoch": 0.6326254336238261, + "grad_norm": 0.17896825075149536, + "learning_rate": 0.001, + "loss": 2.0564, + "step": 14954 + }, + { + "epoch": 0.6326677383873424, + "grad_norm": 1.2869054079055786, + "learning_rate": 0.001, + "loss": 1.5205, + "step": 14955 + }, + { + "epoch": 0.6327100431508588, + "grad_norm": 0.18864065408706665, + "learning_rate": 0.001, + "loss": 3.0444, + "step": 14956 + }, + { + "epoch": 0.6327523479143752, + "grad_norm": 0.880074143409729, + "learning_rate": 0.001, + "loss": 1.9113, + "step": 14957 + }, + { + "epoch": 0.6327946526778915, + "grad_norm": 0.1550513505935669, + "learning_rate": 0.001, + "loss": 2.146, + "step": 14958 + }, + { + "epoch": 0.6328369574414079, + "grad_norm": 0.20279888808727264, + "learning_rate": 0.001, + "loss": 2.5939, + "step": 14959 + }, + { + "epoch": 0.6328792622049243, + "grad_norm": 0.4904308617115021, + "learning_rate": 0.001, + "loss": 2.1548, + "step": 14960 + }, + { + "epoch": 0.6329215669684406, + "grad_norm": 0.15668341517448425, + "learning_rate": 0.001, + "loss": 3.1503, + "step": 14961 + }, + { + "epoch": 0.632963871731957, + "grad_norm": 1.097795009613037, + "learning_rate": 0.001, + "loss": 1.9487, + "step": 14962 + }, + { + "epoch": 0.6330061764954734, + "grad_norm": 0.19955521821975708, + "learning_rate": 0.001, + "loss": 1.4885, + "step": 14963 + }, + { + "epoch": 0.6330484812589897, + "grad_norm": 0.15636754035949707, + "learning_rate": 0.001, + "loss": 2.508, + "step": 14964 + }, + { + "epoch": 0.6330907860225061, + "grad_norm": 2.758397340774536, + "learning_rate": 0.001, + "loss": 1.6241, + "step": 14965 + }, + { + "epoch": 0.6331330907860225, + "grad_norm": 2.932732582092285, + "learning_rate": 0.001, + "loss": 1.9931, + "step": 14966 + }, + { + "epoch": 0.6331753955495388, + "grad_norm": 0.21314847469329834, + "learning_rate": 0.001, + "loss": 2.2011, + "step": 14967 + }, + { + "epoch": 0.6332177003130552, + "grad_norm": 0.23298072814941406, + "learning_rate": 0.001, + "loss": 3.0595, + "step": 14968 + }, + { + "epoch": 0.6332600050765717, + "grad_norm": 0.246332049369812, + "learning_rate": 0.001, + "loss": 1.9456, + "step": 14969 + }, + { + "epoch": 0.633302309840088, + "grad_norm": 0.18252138793468475, + "learning_rate": 0.001, + "loss": 3.1042, + "step": 14970 + }, + { + "epoch": 0.6333446146036044, + "grad_norm": 0.20896947383880615, + "learning_rate": 0.001, + "loss": 2.6753, + "step": 14971 + }, + { + "epoch": 0.6333869193671208, + "grad_norm": 25.508777618408203, + "learning_rate": 0.001, + "loss": 3.5983, + "step": 14972 + }, + { + "epoch": 0.6334292241306371, + "grad_norm": 0.8212241530418396, + "learning_rate": 0.001, + "loss": 2.4581, + "step": 14973 + }, + { + "epoch": 0.6334715288941535, + "grad_norm": 0.36662164330482483, + "learning_rate": 0.001, + "loss": 1.8713, + "step": 14974 + }, + { + "epoch": 0.6335138336576699, + "grad_norm": 0.14960072934627533, + "learning_rate": 0.001, + "loss": 2.7025, + "step": 14975 + }, + { + "epoch": 0.6335561384211862, + "grad_norm": 0.17821753025054932, + "learning_rate": 0.001, + "loss": 2.1402, + "step": 14976 + }, + { + "epoch": 0.6335984431847026, + "grad_norm": 0.17542670667171478, + "learning_rate": 0.001, + "loss": 1.9822, + "step": 14977 + }, + { + "epoch": 0.633640747948219, + "grad_norm": 0.25928542017936707, + "learning_rate": 0.001, + "loss": 3.1297, + "step": 14978 + }, + { + "epoch": 0.6336830527117353, + "grad_norm": 0.1568516343832016, + "learning_rate": 0.001, + "loss": 1.6175, + "step": 14979 + }, + { + "epoch": 0.6337253574752517, + "grad_norm": 0.18374250829219818, + "learning_rate": 0.001, + "loss": 3.5063, + "step": 14980 + }, + { + "epoch": 0.6337676622387681, + "grad_norm": 0.23148831725120544, + "learning_rate": 0.001, + "loss": 2.3223, + "step": 14981 + }, + { + "epoch": 0.6338099670022844, + "grad_norm": 0.23832428455352783, + "learning_rate": 0.001, + "loss": 2.036, + "step": 14982 + }, + { + "epoch": 0.6338522717658008, + "grad_norm": 0.19005969166755676, + "learning_rate": 0.001, + "loss": 3.0384, + "step": 14983 + }, + { + "epoch": 0.6338945765293172, + "grad_norm": 2.5607242584228516, + "learning_rate": 0.001, + "loss": 2.0137, + "step": 14984 + }, + { + "epoch": 0.6339368812928335, + "grad_norm": 0.4837716817855835, + "learning_rate": 0.001, + "loss": 3.2616, + "step": 14985 + }, + { + "epoch": 0.63397918605635, + "grad_norm": 0.5023155212402344, + "learning_rate": 0.001, + "loss": 2.6116, + "step": 14986 + }, + { + "epoch": 0.6340214908198664, + "grad_norm": 0.21957671642303467, + "learning_rate": 0.001, + "loss": 2.8777, + "step": 14987 + }, + { + "epoch": 0.6340637955833827, + "grad_norm": 0.15454959869384766, + "learning_rate": 0.001, + "loss": 2.8105, + "step": 14988 + }, + { + "epoch": 0.6341061003468991, + "grad_norm": 7.386536121368408, + "learning_rate": 0.001, + "loss": 1.749, + "step": 14989 + }, + { + "epoch": 0.6341484051104155, + "grad_norm": 0.22474679350852966, + "learning_rate": 0.001, + "loss": 2.1073, + "step": 14990 + }, + { + "epoch": 0.6341907098739318, + "grad_norm": 0.22885270416736603, + "learning_rate": 0.001, + "loss": 2.722, + "step": 14991 + }, + { + "epoch": 0.6342330146374482, + "grad_norm": 0.1935967653989792, + "learning_rate": 0.001, + "loss": 1.5022, + "step": 14992 + }, + { + "epoch": 0.6342753194009646, + "grad_norm": 1.1925420761108398, + "learning_rate": 0.001, + "loss": 3.1638, + "step": 14993 + }, + { + "epoch": 0.6343176241644809, + "grad_norm": 0.16157661378383636, + "learning_rate": 0.001, + "loss": 1.8381, + "step": 14994 + }, + { + "epoch": 0.6343599289279973, + "grad_norm": 0.2140708863735199, + "learning_rate": 0.001, + "loss": 2.2911, + "step": 14995 + }, + { + "epoch": 0.6344022336915137, + "grad_norm": 0.15380524098873138, + "learning_rate": 0.001, + "loss": 2.2116, + "step": 14996 + }, + { + "epoch": 0.63444453845503, + "grad_norm": 0.19003191590309143, + "learning_rate": 0.001, + "loss": 2.0538, + "step": 14997 + }, + { + "epoch": 0.6344868432185464, + "grad_norm": 0.17649468779563904, + "learning_rate": 0.001, + "loss": 1.9866, + "step": 14998 + }, + { + "epoch": 0.6345291479820628, + "grad_norm": 1.8466432094573975, + "learning_rate": 0.001, + "loss": 2.4553, + "step": 14999 + }, + { + "epoch": 0.6345714527455791, + "grad_norm": 0.1778542697429657, + "learning_rate": 0.001, + "loss": 2.5347, + "step": 15000 + }, + { + "epoch": 0.6346137575090955, + "grad_norm": 0.19804713129997253, + "learning_rate": 0.001, + "loss": 1.5805, + "step": 15001 + }, + { + "epoch": 0.6346560622726118, + "grad_norm": 0.14694872498512268, + "learning_rate": 0.001, + "loss": 2.0717, + "step": 15002 + }, + { + "epoch": 0.6346983670361283, + "grad_norm": 1.054595708847046, + "learning_rate": 0.001, + "loss": 3.0874, + "step": 15003 + }, + { + "epoch": 0.6347406717996447, + "grad_norm": 0.1948278397321701, + "learning_rate": 0.001, + "loss": 1.9202, + "step": 15004 + }, + { + "epoch": 0.634782976563161, + "grad_norm": 0.23619548976421356, + "learning_rate": 0.001, + "loss": 2.0307, + "step": 15005 + }, + { + "epoch": 0.6348252813266774, + "grad_norm": 0.21441768109798431, + "learning_rate": 0.001, + "loss": 2.4843, + "step": 15006 + }, + { + "epoch": 0.6348675860901938, + "grad_norm": 0.19178879261016846, + "learning_rate": 0.001, + "loss": 2.0039, + "step": 15007 + }, + { + "epoch": 0.6349098908537101, + "grad_norm": 0.19128453731536865, + "learning_rate": 0.001, + "loss": 1.8224, + "step": 15008 + }, + { + "epoch": 0.6349521956172265, + "grad_norm": 0.30151814222335815, + "learning_rate": 0.001, + "loss": 2.6074, + "step": 15009 + }, + { + "epoch": 0.6349945003807429, + "grad_norm": 0.1635221242904663, + "learning_rate": 0.001, + "loss": 1.5166, + "step": 15010 + }, + { + "epoch": 0.6350368051442592, + "grad_norm": 0.15775498747825623, + "learning_rate": 0.001, + "loss": 1.501, + "step": 15011 + }, + { + "epoch": 0.6350791099077756, + "grad_norm": 0.19926244020462036, + "learning_rate": 0.001, + "loss": 2.0229, + "step": 15012 + }, + { + "epoch": 0.635121414671292, + "grad_norm": 0.2127702683210373, + "learning_rate": 0.001, + "loss": 1.9368, + "step": 15013 + }, + { + "epoch": 0.6351637194348083, + "grad_norm": 0.19976593554019928, + "learning_rate": 0.001, + "loss": 2.2498, + "step": 15014 + }, + { + "epoch": 0.6352060241983247, + "grad_norm": 0.15875552594661713, + "learning_rate": 0.001, + "loss": 2.2435, + "step": 15015 + }, + { + "epoch": 0.6352483289618411, + "grad_norm": 0.20924918353557587, + "learning_rate": 0.001, + "loss": 1.889, + "step": 15016 + }, + { + "epoch": 0.6352906337253574, + "grad_norm": 0.16609390079975128, + "learning_rate": 0.001, + "loss": 2.3809, + "step": 15017 + }, + { + "epoch": 0.6353329384888738, + "grad_norm": 0.17234043776988983, + "learning_rate": 0.001, + "loss": 1.9048, + "step": 15018 + }, + { + "epoch": 0.6353752432523903, + "grad_norm": 0.18904206156730652, + "learning_rate": 0.001, + "loss": 2.9954, + "step": 15019 + }, + { + "epoch": 0.6354175480159066, + "grad_norm": 0.2591887414455414, + "learning_rate": 0.001, + "loss": 1.4825, + "step": 15020 + }, + { + "epoch": 0.635459852779423, + "grad_norm": 0.21162384748458862, + "learning_rate": 0.001, + "loss": 2.1042, + "step": 15021 + }, + { + "epoch": 0.6355021575429394, + "grad_norm": 0.1585322916507721, + "learning_rate": 0.001, + "loss": 2.2755, + "step": 15022 + }, + { + "epoch": 0.6355444623064557, + "grad_norm": 0.3131169080734253, + "learning_rate": 0.001, + "loss": 2.6273, + "step": 15023 + }, + { + "epoch": 0.6355867670699721, + "grad_norm": 0.1742488294839859, + "learning_rate": 0.001, + "loss": 2.3599, + "step": 15024 + }, + { + "epoch": 0.6356290718334885, + "grad_norm": 0.37338918447494507, + "learning_rate": 0.001, + "loss": 2.752, + "step": 15025 + }, + { + "epoch": 0.6356713765970048, + "grad_norm": 0.20515532791614532, + "learning_rate": 0.001, + "loss": 2.4337, + "step": 15026 + }, + { + "epoch": 0.6357136813605212, + "grad_norm": 0.5772908926010132, + "learning_rate": 0.001, + "loss": 2.1753, + "step": 15027 + }, + { + "epoch": 0.6357559861240376, + "grad_norm": 1.2035185098648071, + "learning_rate": 0.001, + "loss": 2.298, + "step": 15028 + }, + { + "epoch": 0.6357982908875539, + "grad_norm": 0.3183322250843048, + "learning_rate": 0.001, + "loss": 2.7047, + "step": 15029 + }, + { + "epoch": 0.6358405956510703, + "grad_norm": 0.15647853910923004, + "learning_rate": 0.001, + "loss": 1.6478, + "step": 15030 + }, + { + "epoch": 0.6358829004145867, + "grad_norm": 0.16988950967788696, + "learning_rate": 0.001, + "loss": 1.7595, + "step": 15031 + }, + { + "epoch": 0.635925205178103, + "grad_norm": 1.6382477283477783, + "learning_rate": 0.001, + "loss": 1.7299, + "step": 15032 + }, + { + "epoch": 0.6359675099416194, + "grad_norm": 0.3181857466697693, + "learning_rate": 0.001, + "loss": 1.609, + "step": 15033 + }, + { + "epoch": 0.6360098147051358, + "grad_norm": 0.20994387567043304, + "learning_rate": 0.001, + "loss": 2.6348, + "step": 15034 + }, + { + "epoch": 0.6360521194686521, + "grad_norm": 0.17180518805980682, + "learning_rate": 0.001, + "loss": 1.4551, + "step": 15035 + }, + { + "epoch": 0.6360944242321686, + "grad_norm": 0.15223318338394165, + "learning_rate": 0.001, + "loss": 2.0057, + "step": 15036 + }, + { + "epoch": 0.636136728995685, + "grad_norm": 0.14394080638885498, + "learning_rate": 0.001, + "loss": 1.8883, + "step": 15037 + }, + { + "epoch": 0.6361790337592013, + "grad_norm": 0.8304606080055237, + "learning_rate": 0.001, + "loss": 2.4011, + "step": 15038 + }, + { + "epoch": 0.6362213385227177, + "grad_norm": 0.1558523178100586, + "learning_rate": 0.001, + "loss": 2.1005, + "step": 15039 + }, + { + "epoch": 0.6362636432862341, + "grad_norm": 0.19884271919727325, + "learning_rate": 0.001, + "loss": 3.0935, + "step": 15040 + }, + { + "epoch": 0.6363059480497504, + "grad_norm": 0.19426396489143372, + "learning_rate": 0.001, + "loss": 2.1119, + "step": 15041 + }, + { + "epoch": 0.6363482528132668, + "grad_norm": 0.1493634283542633, + "learning_rate": 0.001, + "loss": 2.2015, + "step": 15042 + }, + { + "epoch": 0.6363905575767832, + "grad_norm": 0.1937769651412964, + "learning_rate": 0.001, + "loss": 2.2157, + "step": 15043 + }, + { + "epoch": 0.6364328623402995, + "grad_norm": 0.1691095232963562, + "learning_rate": 0.001, + "loss": 1.7524, + "step": 15044 + }, + { + "epoch": 0.6364751671038159, + "grad_norm": 0.2379046529531479, + "learning_rate": 0.001, + "loss": 1.9738, + "step": 15045 + }, + { + "epoch": 0.6365174718673322, + "grad_norm": 0.4896133244037628, + "learning_rate": 0.001, + "loss": 1.2294, + "step": 15046 + }, + { + "epoch": 0.6365597766308486, + "grad_norm": 0.18665045499801636, + "learning_rate": 0.001, + "loss": 2.2706, + "step": 15047 + }, + { + "epoch": 0.636602081394365, + "grad_norm": 0.1756897121667862, + "learning_rate": 0.001, + "loss": 2.0685, + "step": 15048 + }, + { + "epoch": 0.6366443861578813, + "grad_norm": 0.46474409103393555, + "learning_rate": 0.001, + "loss": 1.7136, + "step": 15049 + }, + { + "epoch": 0.6366866909213977, + "grad_norm": 0.7416086196899414, + "learning_rate": 0.001, + "loss": 2.623, + "step": 15050 + }, + { + "epoch": 0.6367289956849141, + "grad_norm": 0.21350111067295074, + "learning_rate": 0.001, + "loss": 1.6577, + "step": 15051 + }, + { + "epoch": 0.6367713004484304, + "grad_norm": 0.6124548316001892, + "learning_rate": 0.001, + "loss": 3.5035, + "step": 15052 + }, + { + "epoch": 0.6368136052119469, + "grad_norm": 0.19088469445705414, + "learning_rate": 0.001, + "loss": 2.3181, + "step": 15053 + }, + { + "epoch": 0.6368559099754633, + "grad_norm": 0.14974898099899292, + "learning_rate": 0.001, + "loss": 2.0188, + "step": 15054 + }, + { + "epoch": 0.6368982147389796, + "grad_norm": 0.17937543988227844, + "learning_rate": 0.001, + "loss": 2.6969, + "step": 15055 + }, + { + "epoch": 0.636940519502496, + "grad_norm": 0.22011379897594452, + "learning_rate": 0.001, + "loss": 2.7894, + "step": 15056 + }, + { + "epoch": 0.6369828242660124, + "grad_norm": 0.18520912528038025, + "learning_rate": 0.001, + "loss": 1.7704, + "step": 15057 + }, + { + "epoch": 0.6370251290295287, + "grad_norm": 0.1443547159433365, + "learning_rate": 0.001, + "loss": 2.0866, + "step": 15058 + }, + { + "epoch": 0.6370674337930451, + "grad_norm": 0.1495741754770279, + "learning_rate": 0.001, + "loss": 1.5056, + "step": 15059 + }, + { + "epoch": 0.6371097385565615, + "grad_norm": 9.01001262664795, + "learning_rate": 0.001, + "loss": 1.8438, + "step": 15060 + }, + { + "epoch": 0.6371520433200778, + "grad_norm": 0.46700048446655273, + "learning_rate": 0.001, + "loss": 2.1663, + "step": 15061 + }, + { + "epoch": 0.6371943480835942, + "grad_norm": 0.20175836980342865, + "learning_rate": 0.001, + "loss": 2.9477, + "step": 15062 + }, + { + "epoch": 0.6372366528471106, + "grad_norm": 0.162751242518425, + "learning_rate": 0.001, + "loss": 1.6253, + "step": 15063 + }, + { + "epoch": 0.6372789576106269, + "grad_norm": 0.1852269321680069, + "learning_rate": 0.001, + "loss": 1.4472, + "step": 15064 + }, + { + "epoch": 0.6373212623741433, + "grad_norm": 0.17682723701000214, + "learning_rate": 0.001, + "loss": 1.6896, + "step": 15065 + }, + { + "epoch": 0.6373635671376597, + "grad_norm": 0.8060948848724365, + "learning_rate": 0.001, + "loss": 1.6884, + "step": 15066 + }, + { + "epoch": 0.637405871901176, + "grad_norm": 0.30199164152145386, + "learning_rate": 0.001, + "loss": 2.0973, + "step": 15067 + }, + { + "epoch": 0.6374481766646924, + "grad_norm": 0.20303098857402802, + "learning_rate": 0.001, + "loss": 2.1595, + "step": 15068 + }, + { + "epoch": 0.6374904814282089, + "grad_norm": 0.20386897027492523, + "learning_rate": 0.001, + "loss": 1.6897, + "step": 15069 + }, + { + "epoch": 0.6375327861917252, + "grad_norm": 0.233832448720932, + "learning_rate": 0.001, + "loss": 2.1659, + "step": 15070 + }, + { + "epoch": 0.6375750909552416, + "grad_norm": 2.2971856594085693, + "learning_rate": 0.001, + "loss": 4.1125, + "step": 15071 + }, + { + "epoch": 0.637617395718758, + "grad_norm": 0.19406281411647797, + "learning_rate": 0.001, + "loss": 2.3436, + "step": 15072 + }, + { + "epoch": 0.6376597004822743, + "grad_norm": 0.16026239097118378, + "learning_rate": 0.001, + "loss": 2.5325, + "step": 15073 + }, + { + "epoch": 0.6377020052457907, + "grad_norm": 0.21731014549732208, + "learning_rate": 0.001, + "loss": 2.6568, + "step": 15074 + }, + { + "epoch": 0.6377443100093071, + "grad_norm": 0.1755847930908203, + "learning_rate": 0.001, + "loss": 2.48, + "step": 15075 + }, + { + "epoch": 0.6377866147728234, + "grad_norm": 0.16319885849952698, + "learning_rate": 0.001, + "loss": 2.152, + "step": 15076 + }, + { + "epoch": 0.6378289195363398, + "grad_norm": 0.1832021176815033, + "learning_rate": 0.001, + "loss": 2.4498, + "step": 15077 + }, + { + "epoch": 0.6378712242998562, + "grad_norm": 0.17077642679214478, + "learning_rate": 0.001, + "loss": 2.2088, + "step": 15078 + }, + { + "epoch": 0.6379135290633725, + "grad_norm": 0.2068411260843277, + "learning_rate": 0.001, + "loss": 2.3452, + "step": 15079 + }, + { + "epoch": 0.6379558338268889, + "grad_norm": 0.1923973709344864, + "learning_rate": 0.001, + "loss": 2.124, + "step": 15080 + }, + { + "epoch": 0.6379981385904053, + "grad_norm": 7.424524784088135, + "learning_rate": 0.001, + "loss": 2.9359, + "step": 15081 + }, + { + "epoch": 0.6380404433539216, + "grad_norm": 0.2013271301984787, + "learning_rate": 0.001, + "loss": 1.4738, + "step": 15082 + }, + { + "epoch": 0.638082748117438, + "grad_norm": 0.7266159653663635, + "learning_rate": 0.001, + "loss": 2.4914, + "step": 15083 + }, + { + "epoch": 0.6381250528809544, + "grad_norm": 0.7086105346679688, + "learning_rate": 0.001, + "loss": 2.7849, + "step": 15084 + }, + { + "epoch": 0.6381673576444707, + "grad_norm": 1.527816891670227, + "learning_rate": 0.001, + "loss": 3.3425, + "step": 15085 + }, + { + "epoch": 0.6382096624079872, + "grad_norm": 0.30964910984039307, + "learning_rate": 0.001, + "loss": 2.8131, + "step": 15086 + }, + { + "epoch": 0.6382519671715036, + "grad_norm": 0.18733543157577515, + "learning_rate": 0.001, + "loss": 2.89, + "step": 15087 + }, + { + "epoch": 0.6382942719350199, + "grad_norm": 0.32117486000061035, + "learning_rate": 0.001, + "loss": 3.6968, + "step": 15088 + }, + { + "epoch": 0.6383365766985363, + "grad_norm": 0.1650446653366089, + "learning_rate": 0.001, + "loss": 1.6528, + "step": 15089 + }, + { + "epoch": 0.6383788814620526, + "grad_norm": 1.424420714378357, + "learning_rate": 0.001, + "loss": 2.9049, + "step": 15090 + }, + { + "epoch": 0.638421186225569, + "grad_norm": 0.18598754703998566, + "learning_rate": 0.001, + "loss": 2.5301, + "step": 15091 + }, + { + "epoch": 0.6384634909890854, + "grad_norm": 0.24407409131526947, + "learning_rate": 0.001, + "loss": 1.614, + "step": 15092 + }, + { + "epoch": 0.6385057957526017, + "grad_norm": 0.18757480382919312, + "learning_rate": 0.001, + "loss": 2.3472, + "step": 15093 + }, + { + "epoch": 0.6385481005161181, + "grad_norm": 0.17683260142803192, + "learning_rate": 0.001, + "loss": 2.2612, + "step": 15094 + }, + { + "epoch": 0.6385904052796345, + "grad_norm": 0.2939551770687103, + "learning_rate": 0.001, + "loss": 1.9736, + "step": 15095 + }, + { + "epoch": 0.6386327100431508, + "grad_norm": 0.1915324181318283, + "learning_rate": 0.001, + "loss": 1.6365, + "step": 15096 + }, + { + "epoch": 0.6386750148066672, + "grad_norm": 0.21610991656780243, + "learning_rate": 0.001, + "loss": 2.0241, + "step": 15097 + }, + { + "epoch": 0.6387173195701836, + "grad_norm": 0.17032171785831451, + "learning_rate": 0.001, + "loss": 2.7106, + "step": 15098 + }, + { + "epoch": 0.6387596243336999, + "grad_norm": 0.33380383253097534, + "learning_rate": 0.001, + "loss": 1.8032, + "step": 15099 + }, + { + "epoch": 0.6388019290972163, + "grad_norm": 0.15979735553264618, + "learning_rate": 0.001, + "loss": 2.0711, + "step": 15100 + }, + { + "epoch": 0.6388442338607327, + "grad_norm": 0.21214862167835236, + "learning_rate": 0.001, + "loss": 2.4475, + "step": 15101 + }, + { + "epoch": 0.638886538624249, + "grad_norm": 0.19921685755252838, + "learning_rate": 0.001, + "loss": 2.1466, + "step": 15102 + }, + { + "epoch": 0.6389288433877655, + "grad_norm": 0.18050815165042877, + "learning_rate": 0.001, + "loss": 2.3784, + "step": 15103 + }, + { + "epoch": 0.6389711481512819, + "grad_norm": 1.592774748802185, + "learning_rate": 0.001, + "loss": 2.568, + "step": 15104 + }, + { + "epoch": 0.6390134529147982, + "grad_norm": 0.18110394477844238, + "learning_rate": 0.001, + "loss": 2.3632, + "step": 15105 + }, + { + "epoch": 0.6390557576783146, + "grad_norm": 0.13899345695972443, + "learning_rate": 0.001, + "loss": 2.49, + "step": 15106 + }, + { + "epoch": 0.639098062441831, + "grad_norm": 0.16161954402923584, + "learning_rate": 0.001, + "loss": 1.9518, + "step": 15107 + }, + { + "epoch": 0.6391403672053473, + "grad_norm": 0.2258518636226654, + "learning_rate": 0.001, + "loss": 1.8021, + "step": 15108 + }, + { + "epoch": 0.6391826719688637, + "grad_norm": 0.19789251685142517, + "learning_rate": 0.001, + "loss": 2.7393, + "step": 15109 + }, + { + "epoch": 0.6392249767323801, + "grad_norm": 0.37085291743278503, + "learning_rate": 0.001, + "loss": 1.7739, + "step": 15110 + }, + { + "epoch": 0.6392672814958964, + "grad_norm": 1.3115873336791992, + "learning_rate": 0.001, + "loss": 2.4295, + "step": 15111 + }, + { + "epoch": 0.6393095862594128, + "grad_norm": 0.44819048047065735, + "learning_rate": 0.001, + "loss": 2.1524, + "step": 15112 + }, + { + "epoch": 0.6393518910229292, + "grad_norm": 0.1929224282503128, + "learning_rate": 0.001, + "loss": 1.8331, + "step": 15113 + }, + { + "epoch": 0.6393941957864455, + "grad_norm": 0.17352215945720673, + "learning_rate": 0.001, + "loss": 1.9057, + "step": 15114 + }, + { + "epoch": 0.6394365005499619, + "grad_norm": 0.32852211594581604, + "learning_rate": 0.001, + "loss": 1.5222, + "step": 15115 + }, + { + "epoch": 0.6394788053134783, + "grad_norm": 0.17034880816936493, + "learning_rate": 0.001, + "loss": 2.1697, + "step": 15116 + }, + { + "epoch": 0.6395211100769946, + "grad_norm": 0.17381560802459717, + "learning_rate": 0.001, + "loss": 1.616, + "step": 15117 + }, + { + "epoch": 0.639563414840511, + "grad_norm": 1.7698090076446533, + "learning_rate": 0.001, + "loss": 2.1559, + "step": 15118 + }, + { + "epoch": 0.6396057196040275, + "grad_norm": 0.2830827534198761, + "learning_rate": 0.001, + "loss": 2.1308, + "step": 15119 + }, + { + "epoch": 0.6396480243675438, + "grad_norm": 0.25885289907455444, + "learning_rate": 0.001, + "loss": 1.9705, + "step": 15120 + }, + { + "epoch": 0.6396903291310602, + "grad_norm": 0.2354385405778885, + "learning_rate": 0.001, + "loss": 2.0764, + "step": 15121 + }, + { + "epoch": 0.6397326338945766, + "grad_norm": 0.2808350920677185, + "learning_rate": 0.001, + "loss": 2.703, + "step": 15122 + }, + { + "epoch": 0.6397749386580929, + "grad_norm": 0.1770741045475006, + "learning_rate": 0.001, + "loss": 2.2367, + "step": 15123 + }, + { + "epoch": 0.6398172434216093, + "grad_norm": 0.1734161674976349, + "learning_rate": 0.001, + "loss": 1.4621, + "step": 15124 + }, + { + "epoch": 0.6398595481851257, + "grad_norm": 0.2023870348930359, + "learning_rate": 0.001, + "loss": 2.4376, + "step": 15125 + }, + { + "epoch": 0.639901852948642, + "grad_norm": 0.7190189361572266, + "learning_rate": 0.001, + "loss": 3.3754, + "step": 15126 + }, + { + "epoch": 0.6399441577121584, + "grad_norm": 0.18912023305892944, + "learning_rate": 0.001, + "loss": 3.4364, + "step": 15127 + }, + { + "epoch": 0.6399864624756748, + "grad_norm": 0.16115638613700867, + "learning_rate": 0.001, + "loss": 1.8208, + "step": 15128 + }, + { + "epoch": 0.6400287672391911, + "grad_norm": 0.17985692620277405, + "learning_rate": 0.001, + "loss": 1.921, + "step": 15129 + }, + { + "epoch": 0.6400710720027075, + "grad_norm": 1.403349757194519, + "learning_rate": 0.001, + "loss": 3.553, + "step": 15130 + }, + { + "epoch": 0.6401133767662239, + "grad_norm": 0.14458277821540833, + "learning_rate": 0.001, + "loss": 1.8948, + "step": 15131 + }, + { + "epoch": 0.6401556815297402, + "grad_norm": 0.21987377107143402, + "learning_rate": 0.001, + "loss": 2.8338, + "step": 15132 + }, + { + "epoch": 0.6401979862932566, + "grad_norm": 0.3733513355255127, + "learning_rate": 0.001, + "loss": 2.4485, + "step": 15133 + }, + { + "epoch": 0.640240291056773, + "grad_norm": 21.617033004760742, + "learning_rate": 0.001, + "loss": 2.7377, + "step": 15134 + }, + { + "epoch": 0.6402825958202893, + "grad_norm": 0.4703228771686554, + "learning_rate": 0.001, + "loss": 2.7947, + "step": 15135 + }, + { + "epoch": 0.6403249005838058, + "grad_norm": 1.1009413003921509, + "learning_rate": 0.001, + "loss": 2.0846, + "step": 15136 + }, + { + "epoch": 0.640367205347322, + "grad_norm": 0.16309663653373718, + "learning_rate": 0.001, + "loss": 2.8663, + "step": 15137 + }, + { + "epoch": 0.6404095101108385, + "grad_norm": 0.18451957404613495, + "learning_rate": 0.001, + "loss": 2.7764, + "step": 15138 + }, + { + "epoch": 0.6404518148743549, + "grad_norm": 0.20747211575508118, + "learning_rate": 0.001, + "loss": 1.8173, + "step": 15139 + }, + { + "epoch": 0.6404941196378712, + "grad_norm": 0.2003680169582367, + "learning_rate": 0.001, + "loss": 2.5292, + "step": 15140 + }, + { + "epoch": 0.6405364244013876, + "grad_norm": 0.14385853707790375, + "learning_rate": 0.001, + "loss": 3.4984, + "step": 15141 + }, + { + "epoch": 0.640578729164904, + "grad_norm": 0.20833978056907654, + "learning_rate": 0.001, + "loss": 2.2487, + "step": 15142 + }, + { + "epoch": 0.6406210339284203, + "grad_norm": 1.4632526636123657, + "learning_rate": 0.001, + "loss": 2.327, + "step": 15143 + }, + { + "epoch": 0.6406633386919367, + "grad_norm": 0.1615937054157257, + "learning_rate": 0.001, + "loss": 2.6719, + "step": 15144 + }, + { + "epoch": 0.6407056434554531, + "grad_norm": 0.3300820291042328, + "learning_rate": 0.001, + "loss": 1.7582, + "step": 15145 + }, + { + "epoch": 0.6407479482189694, + "grad_norm": 0.40735575556755066, + "learning_rate": 0.001, + "loss": 1.6912, + "step": 15146 + }, + { + "epoch": 0.6407902529824858, + "grad_norm": 0.16738514602184296, + "learning_rate": 0.001, + "loss": 1.6134, + "step": 15147 + }, + { + "epoch": 0.6408325577460022, + "grad_norm": 0.17721709609031677, + "learning_rate": 0.001, + "loss": 2.4597, + "step": 15148 + }, + { + "epoch": 0.6408748625095185, + "grad_norm": 0.2825102210044861, + "learning_rate": 0.001, + "loss": 1.8421, + "step": 15149 + }, + { + "epoch": 0.6409171672730349, + "grad_norm": 0.19013796746730804, + "learning_rate": 0.001, + "loss": 2.0838, + "step": 15150 + }, + { + "epoch": 0.6409594720365513, + "grad_norm": 0.22980915009975433, + "learning_rate": 0.001, + "loss": 3.4241, + "step": 15151 + }, + { + "epoch": 0.6410017768000676, + "grad_norm": 0.4537425935268402, + "learning_rate": 0.001, + "loss": 2.7787, + "step": 15152 + }, + { + "epoch": 0.6410440815635841, + "grad_norm": 0.44562122225761414, + "learning_rate": 0.001, + "loss": 1.986, + "step": 15153 + }, + { + "epoch": 0.6410863863271005, + "grad_norm": 0.1742187738418579, + "learning_rate": 0.001, + "loss": 1.8076, + "step": 15154 + }, + { + "epoch": 0.6411286910906168, + "grad_norm": 0.41731923818588257, + "learning_rate": 0.001, + "loss": 2.7682, + "step": 15155 + }, + { + "epoch": 0.6411709958541332, + "grad_norm": 0.17305545508861542, + "learning_rate": 0.001, + "loss": 2.2091, + "step": 15156 + }, + { + "epoch": 0.6412133006176496, + "grad_norm": 3.8810274600982666, + "learning_rate": 0.001, + "loss": 2.5479, + "step": 15157 + }, + { + "epoch": 0.6412556053811659, + "grad_norm": 0.22583703696727753, + "learning_rate": 0.001, + "loss": 1.9204, + "step": 15158 + }, + { + "epoch": 0.6412979101446823, + "grad_norm": 0.14476729929447174, + "learning_rate": 0.001, + "loss": 3.0101, + "step": 15159 + }, + { + "epoch": 0.6413402149081987, + "grad_norm": 0.19955182075500488, + "learning_rate": 0.001, + "loss": 2.3063, + "step": 15160 + }, + { + "epoch": 0.641382519671715, + "grad_norm": 0.15358324348926544, + "learning_rate": 0.001, + "loss": 1.4909, + "step": 15161 + }, + { + "epoch": 0.6414248244352314, + "grad_norm": 0.2164243459701538, + "learning_rate": 0.001, + "loss": 1.9155, + "step": 15162 + }, + { + "epoch": 0.6414671291987478, + "grad_norm": 0.15397514402866364, + "learning_rate": 0.001, + "loss": 1.6991, + "step": 15163 + }, + { + "epoch": 0.6415094339622641, + "grad_norm": 0.14911989867687225, + "learning_rate": 0.001, + "loss": 1.9703, + "step": 15164 + }, + { + "epoch": 0.6415517387257805, + "grad_norm": 0.1464262753725052, + "learning_rate": 0.001, + "loss": 1.8296, + "step": 15165 + }, + { + "epoch": 0.6415940434892969, + "grad_norm": 0.9781848192214966, + "learning_rate": 0.001, + "loss": 4.401, + "step": 15166 + }, + { + "epoch": 0.6416363482528132, + "grad_norm": 0.18489103019237518, + "learning_rate": 0.001, + "loss": 1.6242, + "step": 15167 + }, + { + "epoch": 0.6416786530163296, + "grad_norm": 0.16017593443393707, + "learning_rate": 0.001, + "loss": 1.9852, + "step": 15168 + }, + { + "epoch": 0.6417209577798461, + "grad_norm": 1.0685312747955322, + "learning_rate": 0.001, + "loss": 2.4017, + "step": 15169 + }, + { + "epoch": 0.6417632625433624, + "grad_norm": 0.17249107360839844, + "learning_rate": 0.001, + "loss": 1.9322, + "step": 15170 + }, + { + "epoch": 0.6418055673068788, + "grad_norm": 0.2893502712249756, + "learning_rate": 0.001, + "loss": 1.7509, + "step": 15171 + }, + { + "epoch": 0.6418478720703952, + "grad_norm": 1.8474113941192627, + "learning_rate": 0.001, + "loss": 1.5001, + "step": 15172 + }, + { + "epoch": 0.6418901768339115, + "grad_norm": 0.21686485409736633, + "learning_rate": 0.001, + "loss": 2.2628, + "step": 15173 + }, + { + "epoch": 0.6419324815974279, + "grad_norm": 0.1666136085987091, + "learning_rate": 0.001, + "loss": 3.1013, + "step": 15174 + }, + { + "epoch": 0.6419747863609443, + "grad_norm": 0.20355500280857086, + "learning_rate": 0.001, + "loss": 1.8449, + "step": 15175 + }, + { + "epoch": 0.6420170911244606, + "grad_norm": 0.16820862889289856, + "learning_rate": 0.001, + "loss": 2.2891, + "step": 15176 + }, + { + "epoch": 0.642059395887977, + "grad_norm": 0.16452017426490784, + "learning_rate": 0.001, + "loss": 1.8697, + "step": 15177 + }, + { + "epoch": 0.6421017006514934, + "grad_norm": 0.17088449001312256, + "learning_rate": 0.001, + "loss": 1.9137, + "step": 15178 + }, + { + "epoch": 0.6421440054150097, + "grad_norm": 0.4427139163017273, + "learning_rate": 0.001, + "loss": 1.8976, + "step": 15179 + }, + { + "epoch": 0.6421863101785261, + "grad_norm": 0.16302001476287842, + "learning_rate": 0.001, + "loss": 2.2036, + "step": 15180 + }, + { + "epoch": 0.6422286149420424, + "grad_norm": 0.22736626863479614, + "learning_rate": 0.001, + "loss": 1.8923, + "step": 15181 + }, + { + "epoch": 0.6422709197055588, + "grad_norm": 0.20898133516311646, + "learning_rate": 0.001, + "loss": 2.1798, + "step": 15182 + }, + { + "epoch": 0.6423132244690752, + "grad_norm": 1.8661376237869263, + "learning_rate": 0.001, + "loss": 2.4555, + "step": 15183 + }, + { + "epoch": 0.6423555292325915, + "grad_norm": 0.22298663854599, + "learning_rate": 0.001, + "loss": 2.2072, + "step": 15184 + }, + { + "epoch": 0.642397833996108, + "grad_norm": 3.3507096767425537, + "learning_rate": 0.001, + "loss": 2.7466, + "step": 15185 + }, + { + "epoch": 0.6424401387596244, + "grad_norm": 0.3017289638519287, + "learning_rate": 0.001, + "loss": 2.6671, + "step": 15186 + }, + { + "epoch": 0.6424824435231407, + "grad_norm": 0.24658021330833435, + "learning_rate": 0.001, + "loss": 1.7703, + "step": 15187 + }, + { + "epoch": 0.6425247482866571, + "grad_norm": 2.2778401374816895, + "learning_rate": 0.001, + "loss": 3.8874, + "step": 15188 + }, + { + "epoch": 0.6425670530501735, + "grad_norm": 0.1641712188720703, + "learning_rate": 0.001, + "loss": 2.0445, + "step": 15189 + }, + { + "epoch": 0.6426093578136898, + "grad_norm": 0.26722750067710876, + "learning_rate": 0.001, + "loss": 2.8895, + "step": 15190 + }, + { + "epoch": 0.6426516625772062, + "grad_norm": 0.1975189447402954, + "learning_rate": 0.001, + "loss": 2.0646, + "step": 15191 + }, + { + "epoch": 0.6426939673407226, + "grad_norm": 0.22183305025100708, + "learning_rate": 0.001, + "loss": 1.8173, + "step": 15192 + }, + { + "epoch": 0.6427362721042389, + "grad_norm": 0.17388564348220825, + "learning_rate": 0.001, + "loss": 1.6142, + "step": 15193 + }, + { + "epoch": 0.6427785768677553, + "grad_norm": 0.169900044798851, + "learning_rate": 0.001, + "loss": 2.7619, + "step": 15194 + }, + { + "epoch": 0.6428208816312717, + "grad_norm": 0.2132178544998169, + "learning_rate": 0.001, + "loss": 2.2104, + "step": 15195 + }, + { + "epoch": 0.642863186394788, + "grad_norm": 0.2020595222711563, + "learning_rate": 0.001, + "loss": 1.8507, + "step": 15196 + }, + { + "epoch": 0.6429054911583044, + "grad_norm": 0.18588212132453918, + "learning_rate": 0.001, + "loss": 2.8819, + "step": 15197 + }, + { + "epoch": 0.6429477959218208, + "grad_norm": 0.17926856875419617, + "learning_rate": 0.001, + "loss": 2.0165, + "step": 15198 + }, + { + "epoch": 0.6429901006853371, + "grad_norm": 0.1749180406332016, + "learning_rate": 0.001, + "loss": 2.548, + "step": 15199 + }, + { + "epoch": 0.6430324054488535, + "grad_norm": 0.1662074625492096, + "learning_rate": 0.001, + "loss": 2.1117, + "step": 15200 + }, + { + "epoch": 0.64307471021237, + "grad_norm": 0.5484973788261414, + "learning_rate": 0.001, + "loss": 1.8429, + "step": 15201 + }, + { + "epoch": 0.6431170149758862, + "grad_norm": 0.16047289967536926, + "learning_rate": 0.001, + "loss": 2.5943, + "step": 15202 + }, + { + "epoch": 0.6431593197394027, + "grad_norm": 0.17750583589076996, + "learning_rate": 0.001, + "loss": 1.8823, + "step": 15203 + }, + { + "epoch": 0.6432016245029191, + "grad_norm": 0.17922556400299072, + "learning_rate": 0.001, + "loss": 1.844, + "step": 15204 + }, + { + "epoch": 0.6432439292664354, + "grad_norm": 0.2863519489765167, + "learning_rate": 0.001, + "loss": 2.2631, + "step": 15205 + }, + { + "epoch": 0.6432862340299518, + "grad_norm": 0.19518601894378662, + "learning_rate": 0.001, + "loss": 1.5934, + "step": 15206 + }, + { + "epoch": 0.6433285387934682, + "grad_norm": 0.15284669399261475, + "learning_rate": 0.001, + "loss": 1.7176, + "step": 15207 + }, + { + "epoch": 0.6433708435569845, + "grad_norm": 0.16001272201538086, + "learning_rate": 0.001, + "loss": 2.4634, + "step": 15208 + }, + { + "epoch": 0.6434131483205009, + "grad_norm": 1.5020637512207031, + "learning_rate": 0.001, + "loss": 2.363, + "step": 15209 + }, + { + "epoch": 0.6434554530840173, + "grad_norm": 0.19093799591064453, + "learning_rate": 0.001, + "loss": 2.8734, + "step": 15210 + }, + { + "epoch": 0.6434977578475336, + "grad_norm": 0.1673070341348648, + "learning_rate": 0.001, + "loss": 2.2707, + "step": 15211 + }, + { + "epoch": 0.64354006261105, + "grad_norm": 0.1651526242494583, + "learning_rate": 0.001, + "loss": 2.457, + "step": 15212 + }, + { + "epoch": 0.6435823673745664, + "grad_norm": 0.34426966309547424, + "learning_rate": 0.001, + "loss": 2.6741, + "step": 15213 + }, + { + "epoch": 0.6436246721380827, + "grad_norm": 0.15150640904903412, + "learning_rate": 0.001, + "loss": 2.3646, + "step": 15214 + }, + { + "epoch": 0.6436669769015991, + "grad_norm": 0.1601693034172058, + "learning_rate": 0.001, + "loss": 1.4741, + "step": 15215 + }, + { + "epoch": 0.6437092816651155, + "grad_norm": 0.19948755204677582, + "learning_rate": 0.001, + "loss": 1.6806, + "step": 15216 + }, + { + "epoch": 0.6437515864286318, + "grad_norm": 0.17080558836460114, + "learning_rate": 0.001, + "loss": 2.2009, + "step": 15217 + }, + { + "epoch": 0.6437938911921482, + "grad_norm": 0.1389591544866562, + "learning_rate": 0.001, + "loss": 1.921, + "step": 15218 + }, + { + "epoch": 0.6438361959556647, + "grad_norm": 1.358182668685913, + "learning_rate": 0.001, + "loss": 2.0884, + "step": 15219 + }, + { + "epoch": 0.643878500719181, + "grad_norm": 0.3052426278591156, + "learning_rate": 0.001, + "loss": 3.247, + "step": 15220 + }, + { + "epoch": 0.6439208054826974, + "grad_norm": 0.16330936551094055, + "learning_rate": 0.001, + "loss": 1.6572, + "step": 15221 + }, + { + "epoch": 0.6439631102462138, + "grad_norm": 0.21469064056873322, + "learning_rate": 0.001, + "loss": 1.9606, + "step": 15222 + }, + { + "epoch": 0.6440054150097301, + "grad_norm": 0.17421077191829681, + "learning_rate": 0.001, + "loss": 2.3584, + "step": 15223 + }, + { + "epoch": 0.6440477197732465, + "grad_norm": 0.18720628321170807, + "learning_rate": 0.001, + "loss": 2.3146, + "step": 15224 + }, + { + "epoch": 0.6440900245367628, + "grad_norm": 1.2276394367218018, + "learning_rate": 0.001, + "loss": 2.6331, + "step": 15225 + }, + { + "epoch": 0.6441323293002792, + "grad_norm": 0.17419876158237457, + "learning_rate": 0.001, + "loss": 2.1331, + "step": 15226 + }, + { + "epoch": 0.6441746340637956, + "grad_norm": 0.1927102506160736, + "learning_rate": 0.001, + "loss": 2.2267, + "step": 15227 + }, + { + "epoch": 0.6442169388273119, + "grad_norm": 0.19751717150211334, + "learning_rate": 0.001, + "loss": 2.5559, + "step": 15228 + }, + { + "epoch": 0.6442592435908283, + "grad_norm": 0.14785194396972656, + "learning_rate": 0.001, + "loss": 3.0032, + "step": 15229 + }, + { + "epoch": 0.6443015483543447, + "grad_norm": 0.3033665716648102, + "learning_rate": 0.001, + "loss": 2.1101, + "step": 15230 + }, + { + "epoch": 0.644343853117861, + "grad_norm": 0.1637023538351059, + "learning_rate": 0.001, + "loss": 2.0958, + "step": 15231 + }, + { + "epoch": 0.6443861578813774, + "grad_norm": 0.24798746407032013, + "learning_rate": 0.001, + "loss": 2.0162, + "step": 15232 + }, + { + "epoch": 0.6444284626448938, + "grad_norm": 0.16221873462200165, + "learning_rate": 0.001, + "loss": 1.8261, + "step": 15233 + }, + { + "epoch": 0.6444707674084101, + "grad_norm": 0.1729075014591217, + "learning_rate": 0.001, + "loss": 2.3098, + "step": 15234 + }, + { + "epoch": 0.6445130721719265, + "grad_norm": 0.1991363763809204, + "learning_rate": 0.001, + "loss": 1.3549, + "step": 15235 + }, + { + "epoch": 0.644555376935443, + "grad_norm": 1.0189127922058105, + "learning_rate": 0.001, + "loss": 2.1717, + "step": 15236 + }, + { + "epoch": 0.6445976816989593, + "grad_norm": 0.16026759147644043, + "learning_rate": 0.001, + "loss": 2.4036, + "step": 15237 + }, + { + "epoch": 0.6446399864624757, + "grad_norm": 0.31504741311073303, + "learning_rate": 0.001, + "loss": 1.429, + "step": 15238 + }, + { + "epoch": 0.6446822912259921, + "grad_norm": 0.2236756980419159, + "learning_rate": 0.001, + "loss": 2.204, + "step": 15239 + }, + { + "epoch": 0.6447245959895084, + "grad_norm": 0.18025624752044678, + "learning_rate": 0.001, + "loss": 2.653, + "step": 15240 + }, + { + "epoch": 0.6447669007530248, + "grad_norm": 0.1674138605594635, + "learning_rate": 0.001, + "loss": 3.3606, + "step": 15241 + }, + { + "epoch": 0.6448092055165412, + "grad_norm": 0.1862388402223587, + "learning_rate": 0.001, + "loss": 3.4166, + "step": 15242 + }, + { + "epoch": 0.6448515102800575, + "grad_norm": 0.17802289128303528, + "learning_rate": 0.001, + "loss": 3.0027, + "step": 15243 + }, + { + "epoch": 0.6448938150435739, + "grad_norm": 0.1625264286994934, + "learning_rate": 0.001, + "loss": 2.5584, + "step": 15244 + }, + { + "epoch": 0.6449361198070903, + "grad_norm": 0.293082058429718, + "learning_rate": 0.001, + "loss": 2.7967, + "step": 15245 + }, + { + "epoch": 0.6449784245706066, + "grad_norm": 0.20152679085731506, + "learning_rate": 0.001, + "loss": 1.4473, + "step": 15246 + }, + { + "epoch": 0.645020729334123, + "grad_norm": 0.5746887922286987, + "learning_rate": 0.001, + "loss": 1.424, + "step": 15247 + }, + { + "epoch": 0.6450630340976394, + "grad_norm": 0.19594338536262512, + "learning_rate": 0.001, + "loss": 2.3114, + "step": 15248 + }, + { + "epoch": 0.6451053388611557, + "grad_norm": 0.218429297208786, + "learning_rate": 0.001, + "loss": 1.768, + "step": 15249 + }, + { + "epoch": 0.6451476436246721, + "grad_norm": 0.2682070732116699, + "learning_rate": 0.001, + "loss": 1.842, + "step": 15250 + }, + { + "epoch": 0.6451899483881885, + "grad_norm": 0.34150052070617676, + "learning_rate": 0.001, + "loss": 1.7695, + "step": 15251 + }, + { + "epoch": 0.6452322531517048, + "grad_norm": 0.19166529178619385, + "learning_rate": 0.001, + "loss": 2.596, + "step": 15252 + }, + { + "epoch": 0.6452745579152213, + "grad_norm": 1.010944128036499, + "learning_rate": 0.001, + "loss": 2.4779, + "step": 15253 + }, + { + "epoch": 0.6453168626787377, + "grad_norm": 0.20341774821281433, + "learning_rate": 0.001, + "loss": 2.7125, + "step": 15254 + }, + { + "epoch": 0.645359167442254, + "grad_norm": 0.28562796115875244, + "learning_rate": 0.001, + "loss": 2.493, + "step": 15255 + }, + { + "epoch": 0.6454014722057704, + "grad_norm": 0.19176359474658966, + "learning_rate": 0.001, + "loss": 2.4811, + "step": 15256 + }, + { + "epoch": 0.6454437769692868, + "grad_norm": 0.17995816469192505, + "learning_rate": 0.001, + "loss": 1.7175, + "step": 15257 + }, + { + "epoch": 0.6454860817328031, + "grad_norm": 0.2325253188610077, + "learning_rate": 0.001, + "loss": 2.5527, + "step": 15258 + }, + { + "epoch": 0.6455283864963195, + "grad_norm": 0.7789939641952515, + "learning_rate": 0.001, + "loss": 1.7572, + "step": 15259 + }, + { + "epoch": 0.6455706912598359, + "grad_norm": 0.1507723033428192, + "learning_rate": 0.001, + "loss": 2.1685, + "step": 15260 + }, + { + "epoch": 0.6456129960233522, + "grad_norm": 0.15807674825191498, + "learning_rate": 0.001, + "loss": 2.5854, + "step": 15261 + }, + { + "epoch": 0.6456553007868686, + "grad_norm": 0.1875530183315277, + "learning_rate": 0.001, + "loss": 2.4665, + "step": 15262 + }, + { + "epoch": 0.645697605550385, + "grad_norm": 0.20089443027973175, + "learning_rate": 0.001, + "loss": 1.8378, + "step": 15263 + }, + { + "epoch": 0.6457399103139013, + "grad_norm": 0.40566131472587585, + "learning_rate": 0.001, + "loss": 1.5999, + "step": 15264 + }, + { + "epoch": 0.6457822150774177, + "grad_norm": 2.285637378692627, + "learning_rate": 0.001, + "loss": 2.1813, + "step": 15265 + }, + { + "epoch": 0.6458245198409341, + "grad_norm": 0.1539418250322342, + "learning_rate": 0.001, + "loss": 1.8228, + "step": 15266 + }, + { + "epoch": 0.6458668246044504, + "grad_norm": 5.015846252441406, + "learning_rate": 0.001, + "loss": 2.1239, + "step": 15267 + }, + { + "epoch": 0.6459091293679668, + "grad_norm": 0.2014991044998169, + "learning_rate": 0.001, + "loss": 2.0453, + "step": 15268 + }, + { + "epoch": 0.6459514341314833, + "grad_norm": 0.4300079345703125, + "learning_rate": 0.001, + "loss": 2.0821, + "step": 15269 + }, + { + "epoch": 0.6459937388949996, + "grad_norm": 0.20501923561096191, + "learning_rate": 0.001, + "loss": 2.1233, + "step": 15270 + }, + { + "epoch": 0.646036043658516, + "grad_norm": 0.17998738586902618, + "learning_rate": 0.001, + "loss": 1.9638, + "step": 15271 + }, + { + "epoch": 0.6460783484220323, + "grad_norm": 0.17478275299072266, + "learning_rate": 0.001, + "loss": 1.6565, + "step": 15272 + }, + { + "epoch": 0.6461206531855487, + "grad_norm": 0.17657312750816345, + "learning_rate": 0.001, + "loss": 1.9822, + "step": 15273 + }, + { + "epoch": 0.6461629579490651, + "grad_norm": 2.826846122741699, + "learning_rate": 0.001, + "loss": 1.6034, + "step": 15274 + }, + { + "epoch": 0.6462052627125814, + "grad_norm": 0.9250707626342773, + "learning_rate": 0.001, + "loss": 3.1013, + "step": 15275 + }, + { + "epoch": 0.6462475674760978, + "grad_norm": 0.1742154359817505, + "learning_rate": 0.001, + "loss": 2.5725, + "step": 15276 + }, + { + "epoch": 0.6462898722396142, + "grad_norm": 1.7787930965423584, + "learning_rate": 0.001, + "loss": 2.0831, + "step": 15277 + }, + { + "epoch": 0.6463321770031305, + "grad_norm": 0.17975670099258423, + "learning_rate": 0.001, + "loss": 1.9179, + "step": 15278 + }, + { + "epoch": 0.6463744817666469, + "grad_norm": 0.1871751993894577, + "learning_rate": 0.001, + "loss": 1.7723, + "step": 15279 + }, + { + "epoch": 0.6464167865301633, + "grad_norm": 0.19217629730701447, + "learning_rate": 0.001, + "loss": 2.0674, + "step": 15280 + }, + { + "epoch": 0.6464590912936796, + "grad_norm": 0.2209371030330658, + "learning_rate": 0.001, + "loss": 1.9014, + "step": 15281 + }, + { + "epoch": 0.646501396057196, + "grad_norm": 0.1662231981754303, + "learning_rate": 0.001, + "loss": 2.1925, + "step": 15282 + }, + { + "epoch": 0.6465437008207124, + "grad_norm": 0.40641865134239197, + "learning_rate": 0.001, + "loss": 2.4045, + "step": 15283 + }, + { + "epoch": 0.6465860055842287, + "grad_norm": 0.14980608224868774, + "learning_rate": 0.001, + "loss": 1.4153, + "step": 15284 + }, + { + "epoch": 0.6466283103477451, + "grad_norm": 0.18884730339050293, + "learning_rate": 0.001, + "loss": 1.945, + "step": 15285 + }, + { + "epoch": 0.6466706151112616, + "grad_norm": 0.1964019387960434, + "learning_rate": 0.001, + "loss": 2.997, + "step": 15286 + }, + { + "epoch": 0.6467129198747779, + "grad_norm": 0.1846921443939209, + "learning_rate": 0.001, + "loss": 2.0583, + "step": 15287 + }, + { + "epoch": 0.6467552246382943, + "grad_norm": 0.3572556674480438, + "learning_rate": 0.001, + "loss": 3.6665, + "step": 15288 + }, + { + "epoch": 0.6467975294018107, + "grad_norm": 0.1696724146604538, + "learning_rate": 0.001, + "loss": 1.6555, + "step": 15289 + }, + { + "epoch": 0.646839834165327, + "grad_norm": 0.18197889626026154, + "learning_rate": 0.001, + "loss": 2.0833, + "step": 15290 + }, + { + "epoch": 0.6468821389288434, + "grad_norm": 0.7675547003746033, + "learning_rate": 0.001, + "loss": 1.9319, + "step": 15291 + }, + { + "epoch": 0.6469244436923598, + "grad_norm": 0.16503793001174927, + "learning_rate": 0.001, + "loss": 2.4541, + "step": 15292 + }, + { + "epoch": 0.6469667484558761, + "grad_norm": 0.5148576498031616, + "learning_rate": 0.001, + "loss": 1.8749, + "step": 15293 + }, + { + "epoch": 0.6470090532193925, + "grad_norm": 0.1498442143201828, + "learning_rate": 0.001, + "loss": 2.6986, + "step": 15294 + }, + { + "epoch": 0.6470513579829089, + "grad_norm": 0.22199515998363495, + "learning_rate": 0.001, + "loss": 2.5812, + "step": 15295 + }, + { + "epoch": 0.6470936627464252, + "grad_norm": 0.15802200138568878, + "learning_rate": 0.001, + "loss": 1.5685, + "step": 15296 + }, + { + "epoch": 0.6471359675099416, + "grad_norm": 0.15270139276981354, + "learning_rate": 0.001, + "loss": 2.1229, + "step": 15297 + }, + { + "epoch": 0.647178272273458, + "grad_norm": 0.15476052463054657, + "learning_rate": 0.001, + "loss": 2.2869, + "step": 15298 + }, + { + "epoch": 0.6472205770369743, + "grad_norm": 0.23796048760414124, + "learning_rate": 0.001, + "loss": 1.6435, + "step": 15299 + }, + { + "epoch": 0.6472628818004907, + "grad_norm": 0.2162591516971588, + "learning_rate": 0.001, + "loss": 2.5024, + "step": 15300 + }, + { + "epoch": 0.6473051865640072, + "grad_norm": 0.5299810171127319, + "learning_rate": 0.001, + "loss": 2.4246, + "step": 15301 + }, + { + "epoch": 0.6473474913275234, + "grad_norm": 1.423614740371704, + "learning_rate": 0.001, + "loss": 2.4182, + "step": 15302 + }, + { + "epoch": 0.6473897960910399, + "grad_norm": 0.15965206921100616, + "learning_rate": 0.001, + "loss": 1.6585, + "step": 15303 + }, + { + "epoch": 0.6474321008545563, + "grad_norm": 0.2295125126838684, + "learning_rate": 0.001, + "loss": 1.5463, + "step": 15304 + }, + { + "epoch": 0.6474744056180726, + "grad_norm": 0.16311728954315186, + "learning_rate": 0.001, + "loss": 2.6348, + "step": 15305 + }, + { + "epoch": 0.647516710381589, + "grad_norm": 0.16523398458957672, + "learning_rate": 0.001, + "loss": 2.221, + "step": 15306 + }, + { + "epoch": 0.6475590151451054, + "grad_norm": 0.16306725144386292, + "learning_rate": 0.001, + "loss": 1.871, + "step": 15307 + }, + { + "epoch": 0.6476013199086217, + "grad_norm": 0.24580958485603333, + "learning_rate": 0.001, + "loss": 3.2734, + "step": 15308 + }, + { + "epoch": 0.6476436246721381, + "grad_norm": 0.3764786124229431, + "learning_rate": 0.001, + "loss": 2.7352, + "step": 15309 + }, + { + "epoch": 0.6476859294356545, + "grad_norm": 0.41898685693740845, + "learning_rate": 0.001, + "loss": 2.2815, + "step": 15310 + }, + { + "epoch": 0.6477282341991708, + "grad_norm": 1.0306098461151123, + "learning_rate": 0.001, + "loss": 2.1651, + "step": 15311 + }, + { + "epoch": 0.6477705389626872, + "grad_norm": 0.20003588497638702, + "learning_rate": 0.001, + "loss": 1.8374, + "step": 15312 + }, + { + "epoch": 0.6478128437262036, + "grad_norm": 0.18757633864879608, + "learning_rate": 0.001, + "loss": 1.9034, + "step": 15313 + }, + { + "epoch": 0.6478551484897199, + "grad_norm": 0.2218223512172699, + "learning_rate": 0.001, + "loss": 1.9381, + "step": 15314 + }, + { + "epoch": 0.6478974532532363, + "grad_norm": 0.17475609481334686, + "learning_rate": 0.001, + "loss": 1.7699, + "step": 15315 + }, + { + "epoch": 0.6479397580167526, + "grad_norm": 0.23167279362678528, + "learning_rate": 0.001, + "loss": 1.9099, + "step": 15316 + }, + { + "epoch": 0.647982062780269, + "grad_norm": 0.1843143105506897, + "learning_rate": 0.001, + "loss": 1.8182, + "step": 15317 + }, + { + "epoch": 0.6480243675437855, + "grad_norm": 0.20558197796344757, + "learning_rate": 0.001, + "loss": 2.5398, + "step": 15318 + }, + { + "epoch": 0.6480666723073017, + "grad_norm": 0.17838044464588165, + "learning_rate": 0.001, + "loss": 1.7194, + "step": 15319 + }, + { + "epoch": 0.6481089770708182, + "grad_norm": 0.21093347668647766, + "learning_rate": 0.001, + "loss": 3.3587, + "step": 15320 + }, + { + "epoch": 0.6481512818343346, + "grad_norm": 0.24948959052562714, + "learning_rate": 0.001, + "loss": 2.1242, + "step": 15321 + }, + { + "epoch": 0.6481935865978509, + "grad_norm": 0.17409680783748627, + "learning_rate": 0.001, + "loss": 1.7667, + "step": 15322 + }, + { + "epoch": 0.6482358913613673, + "grad_norm": 0.1577576845884323, + "learning_rate": 0.001, + "loss": 1.9457, + "step": 15323 + }, + { + "epoch": 0.6482781961248837, + "grad_norm": 0.6335095167160034, + "learning_rate": 0.001, + "loss": 2.0039, + "step": 15324 + }, + { + "epoch": 0.6483205008884, + "grad_norm": 0.23187367618083954, + "learning_rate": 0.001, + "loss": 1.9439, + "step": 15325 + }, + { + "epoch": 0.6483628056519164, + "grad_norm": 0.17001605033874512, + "learning_rate": 0.001, + "loss": 2.1188, + "step": 15326 + }, + { + "epoch": 0.6484051104154328, + "grad_norm": 0.1839197725057602, + "learning_rate": 0.001, + "loss": 3.4747, + "step": 15327 + }, + { + "epoch": 0.6484474151789491, + "grad_norm": 0.18075038492679596, + "learning_rate": 0.001, + "loss": 1.8859, + "step": 15328 + }, + { + "epoch": 0.6484897199424655, + "grad_norm": 0.42471522092819214, + "learning_rate": 0.001, + "loss": 2.1715, + "step": 15329 + }, + { + "epoch": 0.6485320247059819, + "grad_norm": 0.3928261995315552, + "learning_rate": 0.001, + "loss": 3.119, + "step": 15330 + }, + { + "epoch": 0.6485743294694982, + "grad_norm": 0.15341652929782867, + "learning_rate": 0.001, + "loss": 2.5067, + "step": 15331 + }, + { + "epoch": 0.6486166342330146, + "grad_norm": 0.16816523671150208, + "learning_rate": 0.001, + "loss": 2.3909, + "step": 15332 + }, + { + "epoch": 0.648658938996531, + "grad_norm": 0.24296429753303528, + "learning_rate": 0.001, + "loss": 2.4028, + "step": 15333 + }, + { + "epoch": 0.6487012437600473, + "grad_norm": 0.3298254907131195, + "learning_rate": 0.001, + "loss": 2.0308, + "step": 15334 + }, + { + "epoch": 0.6487435485235638, + "grad_norm": 0.9870914816856384, + "learning_rate": 0.001, + "loss": 2.1879, + "step": 15335 + }, + { + "epoch": 0.6487858532870802, + "grad_norm": 0.1429598480463028, + "learning_rate": 0.001, + "loss": 1.3078, + "step": 15336 + }, + { + "epoch": 0.6488281580505965, + "grad_norm": 2.075826644897461, + "learning_rate": 0.001, + "loss": 1.9437, + "step": 15337 + }, + { + "epoch": 0.6488704628141129, + "grad_norm": 0.18827824294567108, + "learning_rate": 0.001, + "loss": 1.5201, + "step": 15338 + }, + { + "epoch": 0.6489127675776293, + "grad_norm": 0.16094815731048584, + "learning_rate": 0.001, + "loss": 2.2602, + "step": 15339 + }, + { + "epoch": 0.6489550723411456, + "grad_norm": 0.18486766517162323, + "learning_rate": 0.001, + "loss": 2.3534, + "step": 15340 + }, + { + "epoch": 0.648997377104662, + "grad_norm": 0.22534742951393127, + "learning_rate": 0.001, + "loss": 2.5483, + "step": 15341 + }, + { + "epoch": 0.6490396818681784, + "grad_norm": 0.17582590878009796, + "learning_rate": 0.001, + "loss": 1.4281, + "step": 15342 + }, + { + "epoch": 0.6490819866316947, + "grad_norm": 0.13245414197444916, + "learning_rate": 0.001, + "loss": 1.9149, + "step": 15343 + }, + { + "epoch": 0.6491242913952111, + "grad_norm": 0.4428796172142029, + "learning_rate": 0.001, + "loss": 2.2785, + "step": 15344 + }, + { + "epoch": 0.6491665961587275, + "grad_norm": 0.289799302816391, + "learning_rate": 0.001, + "loss": 1.9939, + "step": 15345 + }, + { + "epoch": 0.6492089009222438, + "grad_norm": 0.16207891702651978, + "learning_rate": 0.001, + "loss": 2.3712, + "step": 15346 + }, + { + "epoch": 0.6492512056857602, + "grad_norm": 0.20188239216804504, + "learning_rate": 0.001, + "loss": 2.4096, + "step": 15347 + }, + { + "epoch": 0.6492935104492766, + "grad_norm": 0.21010519564151764, + "learning_rate": 0.001, + "loss": 2.6139, + "step": 15348 + }, + { + "epoch": 0.6493358152127929, + "grad_norm": 0.3073784112930298, + "learning_rate": 0.001, + "loss": 2.1791, + "step": 15349 + }, + { + "epoch": 0.6493781199763093, + "grad_norm": 0.3525810241699219, + "learning_rate": 0.001, + "loss": 2.2354, + "step": 15350 + }, + { + "epoch": 0.6494204247398258, + "grad_norm": 0.18765120208263397, + "learning_rate": 0.001, + "loss": 2.6249, + "step": 15351 + }, + { + "epoch": 0.649462729503342, + "grad_norm": 0.21588948369026184, + "learning_rate": 0.001, + "loss": 1.4647, + "step": 15352 + }, + { + "epoch": 0.6495050342668585, + "grad_norm": 0.17089973390102386, + "learning_rate": 0.001, + "loss": 1.6018, + "step": 15353 + }, + { + "epoch": 0.6495473390303749, + "grad_norm": 1.505226492881775, + "learning_rate": 0.001, + "loss": 2.5792, + "step": 15354 + }, + { + "epoch": 0.6495896437938912, + "grad_norm": 0.18574289977550507, + "learning_rate": 0.001, + "loss": 2.5838, + "step": 15355 + }, + { + "epoch": 0.6496319485574076, + "grad_norm": 0.9261936545372009, + "learning_rate": 0.001, + "loss": 3.0768, + "step": 15356 + }, + { + "epoch": 0.649674253320924, + "grad_norm": 0.1780634969472885, + "learning_rate": 0.001, + "loss": 3.5627, + "step": 15357 + }, + { + "epoch": 0.6497165580844403, + "grad_norm": 0.5185316205024719, + "learning_rate": 0.001, + "loss": 2.1678, + "step": 15358 + }, + { + "epoch": 0.6497588628479567, + "grad_norm": 0.3285927176475525, + "learning_rate": 0.001, + "loss": 2.4195, + "step": 15359 + }, + { + "epoch": 0.6498011676114731, + "grad_norm": 0.2265651822090149, + "learning_rate": 0.001, + "loss": 2.5156, + "step": 15360 + }, + { + "epoch": 0.6498434723749894, + "grad_norm": 0.14902041852474213, + "learning_rate": 0.001, + "loss": 2.5237, + "step": 15361 + }, + { + "epoch": 0.6498857771385058, + "grad_norm": 0.20233316719532013, + "learning_rate": 0.001, + "loss": 2.4619, + "step": 15362 + }, + { + "epoch": 0.6499280819020221, + "grad_norm": 0.16044741868972778, + "learning_rate": 0.001, + "loss": 1.9778, + "step": 15363 + }, + { + "epoch": 0.6499703866655385, + "grad_norm": 0.22437353432178497, + "learning_rate": 0.001, + "loss": 2.704, + "step": 15364 + }, + { + "epoch": 0.6500126914290549, + "grad_norm": 0.1866353303194046, + "learning_rate": 0.001, + "loss": 1.7799, + "step": 15365 + }, + { + "epoch": 0.6500549961925712, + "grad_norm": 0.25852930545806885, + "learning_rate": 0.001, + "loss": 1.3778, + "step": 15366 + }, + { + "epoch": 0.6500973009560876, + "grad_norm": 0.23204196989536285, + "learning_rate": 0.001, + "loss": 2.7044, + "step": 15367 + }, + { + "epoch": 0.650139605719604, + "grad_norm": 0.19387875497341156, + "learning_rate": 0.001, + "loss": 3.1026, + "step": 15368 + }, + { + "epoch": 0.6501819104831204, + "grad_norm": 0.15012617409229279, + "learning_rate": 0.001, + "loss": 2.8182, + "step": 15369 + }, + { + "epoch": 0.6502242152466368, + "grad_norm": 0.1906086802482605, + "learning_rate": 0.001, + "loss": 2.2236, + "step": 15370 + }, + { + "epoch": 0.6502665200101532, + "grad_norm": 0.16298934817314148, + "learning_rate": 0.001, + "loss": 1.6815, + "step": 15371 + }, + { + "epoch": 0.6503088247736695, + "grad_norm": 6.248525142669678, + "learning_rate": 0.001, + "loss": 1.8015, + "step": 15372 + }, + { + "epoch": 0.6503511295371859, + "grad_norm": 0.29866117238998413, + "learning_rate": 0.001, + "loss": 2.1441, + "step": 15373 + }, + { + "epoch": 0.6503934343007023, + "grad_norm": 0.5905458331108093, + "learning_rate": 0.001, + "loss": 1.629, + "step": 15374 + }, + { + "epoch": 0.6504357390642186, + "grad_norm": 0.19776056706905365, + "learning_rate": 0.001, + "loss": 1.8649, + "step": 15375 + }, + { + "epoch": 0.650478043827735, + "grad_norm": 0.19329819083213806, + "learning_rate": 0.001, + "loss": 2.4082, + "step": 15376 + }, + { + "epoch": 0.6505203485912514, + "grad_norm": 0.2139400839805603, + "learning_rate": 0.001, + "loss": 3.2294, + "step": 15377 + }, + { + "epoch": 0.6505626533547677, + "grad_norm": 0.1699124574661255, + "learning_rate": 0.001, + "loss": 2.2416, + "step": 15378 + }, + { + "epoch": 0.6506049581182841, + "grad_norm": 0.17111091315746307, + "learning_rate": 0.001, + "loss": 1.564, + "step": 15379 + }, + { + "epoch": 0.6506472628818005, + "grad_norm": 0.1551768034696579, + "learning_rate": 0.001, + "loss": 2.4745, + "step": 15380 + }, + { + "epoch": 0.6506895676453168, + "grad_norm": 0.3416825830936432, + "learning_rate": 0.001, + "loss": 2.7327, + "step": 15381 + }, + { + "epoch": 0.6507318724088332, + "grad_norm": 0.1836758702993393, + "learning_rate": 0.001, + "loss": 2.3768, + "step": 15382 + }, + { + "epoch": 0.6507741771723496, + "grad_norm": 0.5745386481285095, + "learning_rate": 0.001, + "loss": 1.7781, + "step": 15383 + }, + { + "epoch": 0.6508164819358659, + "grad_norm": 0.21170569956302643, + "learning_rate": 0.001, + "loss": 2.6607, + "step": 15384 + }, + { + "epoch": 0.6508587866993824, + "grad_norm": 0.1727539747953415, + "learning_rate": 0.001, + "loss": 2.1116, + "step": 15385 + }, + { + "epoch": 0.6509010914628988, + "grad_norm": 1.874002456665039, + "learning_rate": 0.001, + "loss": 2.9078, + "step": 15386 + }, + { + "epoch": 0.6509433962264151, + "grad_norm": 0.21820873022079468, + "learning_rate": 0.001, + "loss": 1.8, + "step": 15387 + }, + { + "epoch": 0.6509857009899315, + "grad_norm": 6.197278022766113, + "learning_rate": 0.001, + "loss": 1.6494, + "step": 15388 + }, + { + "epoch": 0.6510280057534479, + "grad_norm": 0.16244280338287354, + "learning_rate": 0.001, + "loss": 1.9974, + "step": 15389 + }, + { + "epoch": 0.6510703105169642, + "grad_norm": 0.2022523730993271, + "learning_rate": 0.001, + "loss": 2.4231, + "step": 15390 + }, + { + "epoch": 0.6511126152804806, + "grad_norm": 0.19882678985595703, + "learning_rate": 0.001, + "loss": 2.0508, + "step": 15391 + }, + { + "epoch": 0.651154920043997, + "grad_norm": 0.5075106024742126, + "learning_rate": 0.001, + "loss": 1.6093, + "step": 15392 + }, + { + "epoch": 0.6511972248075133, + "grad_norm": 0.17667998373508453, + "learning_rate": 0.001, + "loss": 1.8422, + "step": 15393 + }, + { + "epoch": 0.6512395295710297, + "grad_norm": 0.28973835706710815, + "learning_rate": 0.001, + "loss": 3.3682, + "step": 15394 + }, + { + "epoch": 0.6512818343345461, + "grad_norm": 36.70282745361328, + "learning_rate": 0.001, + "loss": 2.7338, + "step": 15395 + }, + { + "epoch": 0.6513241390980624, + "grad_norm": 0.16966314613819122, + "learning_rate": 0.001, + "loss": 2.356, + "step": 15396 + }, + { + "epoch": 0.6513664438615788, + "grad_norm": 0.18758277595043182, + "learning_rate": 0.001, + "loss": 1.7575, + "step": 15397 + }, + { + "epoch": 0.6514087486250952, + "grad_norm": 0.16251905262470245, + "learning_rate": 0.001, + "loss": 1.4579, + "step": 15398 + }, + { + "epoch": 0.6514510533886115, + "grad_norm": 0.24462532997131348, + "learning_rate": 0.001, + "loss": 2.0842, + "step": 15399 + }, + { + "epoch": 0.6514933581521279, + "grad_norm": 0.29812654852867126, + "learning_rate": 0.001, + "loss": 2.6172, + "step": 15400 + }, + { + "epoch": 0.6515356629156444, + "grad_norm": 1.4498318433761597, + "learning_rate": 0.001, + "loss": 1.6447, + "step": 15401 + }, + { + "epoch": 0.6515779676791607, + "grad_norm": 0.19964168965816498, + "learning_rate": 0.001, + "loss": 1.4949, + "step": 15402 + }, + { + "epoch": 0.6516202724426771, + "grad_norm": 0.17301076650619507, + "learning_rate": 0.001, + "loss": 3.407, + "step": 15403 + }, + { + "epoch": 0.6516625772061935, + "grad_norm": 0.18589536845684052, + "learning_rate": 0.001, + "loss": 2.1407, + "step": 15404 + }, + { + "epoch": 0.6517048819697098, + "grad_norm": 0.18076921999454498, + "learning_rate": 0.001, + "loss": 2.0286, + "step": 15405 + }, + { + "epoch": 0.6517471867332262, + "grad_norm": 0.15782049298286438, + "learning_rate": 0.001, + "loss": 2.6584, + "step": 15406 + }, + { + "epoch": 0.6517894914967425, + "grad_norm": 1.653499960899353, + "learning_rate": 0.001, + "loss": 2.1578, + "step": 15407 + }, + { + "epoch": 0.6518317962602589, + "grad_norm": 0.20613858103752136, + "learning_rate": 0.001, + "loss": 1.7229, + "step": 15408 + }, + { + "epoch": 0.6518741010237753, + "grad_norm": 0.16704171895980835, + "learning_rate": 0.001, + "loss": 2.4252, + "step": 15409 + }, + { + "epoch": 0.6519164057872916, + "grad_norm": 0.3319103717803955, + "learning_rate": 0.001, + "loss": 2.2387, + "step": 15410 + }, + { + "epoch": 0.651958710550808, + "grad_norm": 0.1666289120912552, + "learning_rate": 0.001, + "loss": 1.8471, + "step": 15411 + }, + { + "epoch": 0.6520010153143244, + "grad_norm": 0.3717435896396637, + "learning_rate": 0.001, + "loss": 1.9818, + "step": 15412 + }, + { + "epoch": 0.6520433200778407, + "grad_norm": 0.21722985804080963, + "learning_rate": 0.001, + "loss": 1.9053, + "step": 15413 + }, + { + "epoch": 0.6520856248413571, + "grad_norm": 0.21333225071430206, + "learning_rate": 0.001, + "loss": 2.19, + "step": 15414 + }, + { + "epoch": 0.6521279296048735, + "grad_norm": 0.22371824085712433, + "learning_rate": 0.001, + "loss": 2.5068, + "step": 15415 + }, + { + "epoch": 0.6521702343683898, + "grad_norm": 0.1780281811952591, + "learning_rate": 0.001, + "loss": 1.9781, + "step": 15416 + }, + { + "epoch": 0.6522125391319062, + "grad_norm": 0.1755794882774353, + "learning_rate": 0.001, + "loss": 2.8328, + "step": 15417 + }, + { + "epoch": 0.6522548438954227, + "grad_norm": 1.0463812351226807, + "learning_rate": 0.001, + "loss": 2.8041, + "step": 15418 + }, + { + "epoch": 0.652297148658939, + "grad_norm": 0.16094234585762024, + "learning_rate": 0.001, + "loss": 1.7606, + "step": 15419 + }, + { + "epoch": 0.6523394534224554, + "grad_norm": 0.14978618919849396, + "learning_rate": 0.001, + "loss": 2.2326, + "step": 15420 + }, + { + "epoch": 0.6523817581859718, + "grad_norm": 0.19463518261909485, + "learning_rate": 0.001, + "loss": 2.1057, + "step": 15421 + }, + { + "epoch": 0.6524240629494881, + "grad_norm": 0.6183575391769409, + "learning_rate": 0.001, + "loss": 1.9251, + "step": 15422 + }, + { + "epoch": 0.6524663677130045, + "grad_norm": 0.15496648848056793, + "learning_rate": 0.001, + "loss": 1.7538, + "step": 15423 + }, + { + "epoch": 0.6525086724765209, + "grad_norm": 0.17894995212554932, + "learning_rate": 0.001, + "loss": 1.6924, + "step": 15424 + }, + { + "epoch": 0.6525509772400372, + "grad_norm": 0.16409684717655182, + "learning_rate": 0.001, + "loss": 1.995, + "step": 15425 + }, + { + "epoch": 0.6525932820035536, + "grad_norm": 0.15463796257972717, + "learning_rate": 0.001, + "loss": 1.8611, + "step": 15426 + }, + { + "epoch": 0.65263558676707, + "grad_norm": 0.14601118862628937, + "learning_rate": 0.001, + "loss": 2.8148, + "step": 15427 + }, + { + "epoch": 0.6526778915305863, + "grad_norm": 0.19336079061031342, + "learning_rate": 0.001, + "loss": 2.299, + "step": 15428 + }, + { + "epoch": 0.6527201962941027, + "grad_norm": 0.15365414321422577, + "learning_rate": 0.001, + "loss": 1.8484, + "step": 15429 + }, + { + "epoch": 0.6527625010576191, + "grad_norm": 0.17741386592388153, + "learning_rate": 0.001, + "loss": 1.9603, + "step": 15430 + }, + { + "epoch": 0.6528048058211354, + "grad_norm": 0.1832387000322342, + "learning_rate": 0.001, + "loss": 2.7062, + "step": 15431 + }, + { + "epoch": 0.6528471105846518, + "grad_norm": 0.16831880807876587, + "learning_rate": 0.001, + "loss": 1.9521, + "step": 15432 + }, + { + "epoch": 0.6528894153481682, + "grad_norm": 0.2662883698940277, + "learning_rate": 0.001, + "loss": 1.614, + "step": 15433 + }, + { + "epoch": 0.6529317201116845, + "grad_norm": 0.15198548138141632, + "learning_rate": 0.001, + "loss": 1.7864, + "step": 15434 + }, + { + "epoch": 0.652974024875201, + "grad_norm": 0.1819853037595749, + "learning_rate": 0.001, + "loss": 1.8496, + "step": 15435 + }, + { + "epoch": 0.6530163296387174, + "grad_norm": 0.17152895033359528, + "learning_rate": 0.001, + "loss": 2.5133, + "step": 15436 + }, + { + "epoch": 0.6530586344022337, + "grad_norm": 0.16041690111160278, + "learning_rate": 0.001, + "loss": 1.5841, + "step": 15437 + }, + { + "epoch": 0.6531009391657501, + "grad_norm": 0.1594046652317047, + "learning_rate": 0.001, + "loss": 1.9821, + "step": 15438 + }, + { + "epoch": 0.6531432439292665, + "grad_norm": 0.45842236280441284, + "learning_rate": 0.001, + "loss": 2.6672, + "step": 15439 + }, + { + "epoch": 0.6531855486927828, + "grad_norm": 0.6107159852981567, + "learning_rate": 0.001, + "loss": 2.3053, + "step": 15440 + }, + { + "epoch": 0.6532278534562992, + "grad_norm": 0.19147630035877228, + "learning_rate": 0.001, + "loss": 1.6185, + "step": 15441 + }, + { + "epoch": 0.6532701582198156, + "grad_norm": 0.1753721982240677, + "learning_rate": 0.001, + "loss": 2.134, + "step": 15442 + }, + { + "epoch": 0.6533124629833319, + "grad_norm": 0.18597379326820374, + "learning_rate": 0.001, + "loss": 2.6756, + "step": 15443 + }, + { + "epoch": 0.6533547677468483, + "grad_norm": 0.17725729942321777, + "learning_rate": 0.001, + "loss": 2.1048, + "step": 15444 + }, + { + "epoch": 0.6533970725103647, + "grad_norm": 0.13884805142879486, + "learning_rate": 0.001, + "loss": 1.2568, + "step": 15445 + }, + { + "epoch": 0.653439377273881, + "grad_norm": 0.9566793441772461, + "learning_rate": 0.001, + "loss": 1.6837, + "step": 15446 + }, + { + "epoch": 0.6534816820373974, + "grad_norm": 0.2636922597885132, + "learning_rate": 0.001, + "loss": 2.301, + "step": 15447 + }, + { + "epoch": 0.6535239868009138, + "grad_norm": 0.20910915732383728, + "learning_rate": 0.001, + "loss": 2.7653, + "step": 15448 + }, + { + "epoch": 0.6535662915644301, + "grad_norm": 0.38089674711227417, + "learning_rate": 0.001, + "loss": 1.8104, + "step": 15449 + }, + { + "epoch": 0.6536085963279465, + "grad_norm": 0.18689171969890594, + "learning_rate": 0.001, + "loss": 1.6432, + "step": 15450 + }, + { + "epoch": 0.6536509010914628, + "grad_norm": 0.19268843531608582, + "learning_rate": 0.001, + "loss": 1.7838, + "step": 15451 + }, + { + "epoch": 0.6536932058549793, + "grad_norm": 0.17988325655460358, + "learning_rate": 0.001, + "loss": 2.2731, + "step": 15452 + }, + { + "epoch": 0.6537355106184957, + "grad_norm": 0.17474591732025146, + "learning_rate": 0.001, + "loss": 1.865, + "step": 15453 + }, + { + "epoch": 0.653777815382012, + "grad_norm": 0.16267631947994232, + "learning_rate": 0.001, + "loss": 1.614, + "step": 15454 + }, + { + "epoch": 0.6538201201455284, + "grad_norm": 0.27630341053009033, + "learning_rate": 0.001, + "loss": 1.9749, + "step": 15455 + }, + { + "epoch": 0.6538624249090448, + "grad_norm": 0.16530779004096985, + "learning_rate": 0.001, + "loss": 1.5857, + "step": 15456 + }, + { + "epoch": 0.6539047296725611, + "grad_norm": 0.7646900415420532, + "learning_rate": 0.001, + "loss": 2.128, + "step": 15457 + }, + { + "epoch": 0.6539470344360775, + "grad_norm": 0.21980629861354828, + "learning_rate": 0.001, + "loss": 2.7452, + "step": 15458 + }, + { + "epoch": 0.6539893391995939, + "grad_norm": 0.18467316031455994, + "learning_rate": 0.001, + "loss": 3.4153, + "step": 15459 + }, + { + "epoch": 0.6540316439631102, + "grad_norm": 0.1603761464357376, + "learning_rate": 0.001, + "loss": 2.3351, + "step": 15460 + }, + { + "epoch": 0.6540739487266266, + "grad_norm": 0.19741065800189972, + "learning_rate": 0.001, + "loss": 1.9429, + "step": 15461 + }, + { + "epoch": 0.654116253490143, + "grad_norm": 0.2010018527507782, + "learning_rate": 0.001, + "loss": 2.3873, + "step": 15462 + }, + { + "epoch": 0.6541585582536593, + "grad_norm": 0.1678660362958908, + "learning_rate": 0.001, + "loss": 1.8596, + "step": 15463 + }, + { + "epoch": 0.6542008630171757, + "grad_norm": 0.15577636659145355, + "learning_rate": 0.001, + "loss": 2.438, + "step": 15464 + }, + { + "epoch": 0.6542431677806921, + "grad_norm": 0.16869132220745087, + "learning_rate": 0.001, + "loss": 2.0241, + "step": 15465 + }, + { + "epoch": 0.6542854725442084, + "grad_norm": 0.16793528199195862, + "learning_rate": 0.001, + "loss": 2.3212, + "step": 15466 + }, + { + "epoch": 0.6543277773077248, + "grad_norm": 0.14412140846252441, + "learning_rate": 0.001, + "loss": 2.1674, + "step": 15467 + }, + { + "epoch": 0.6543700820712413, + "grad_norm": 0.17492231726646423, + "learning_rate": 0.001, + "loss": 1.7155, + "step": 15468 + }, + { + "epoch": 0.6544123868347576, + "grad_norm": 0.16392190754413605, + "learning_rate": 0.001, + "loss": 1.5997, + "step": 15469 + }, + { + "epoch": 0.654454691598274, + "grad_norm": 0.15477705001831055, + "learning_rate": 0.001, + "loss": 2.3755, + "step": 15470 + }, + { + "epoch": 0.6544969963617904, + "grad_norm": 9.079736709594727, + "learning_rate": 0.001, + "loss": 2.5364, + "step": 15471 + }, + { + "epoch": 0.6545393011253067, + "grad_norm": 0.2828308641910553, + "learning_rate": 0.001, + "loss": 1.9828, + "step": 15472 + }, + { + "epoch": 0.6545816058888231, + "grad_norm": 0.2948327958583832, + "learning_rate": 0.001, + "loss": 2.5522, + "step": 15473 + }, + { + "epoch": 0.6546239106523395, + "grad_norm": 0.2276574969291687, + "learning_rate": 0.001, + "loss": 2.3777, + "step": 15474 + }, + { + "epoch": 0.6546662154158558, + "grad_norm": 0.23238156735897064, + "learning_rate": 0.001, + "loss": 1.8858, + "step": 15475 + }, + { + "epoch": 0.6547085201793722, + "grad_norm": 0.2526158094406128, + "learning_rate": 0.001, + "loss": 2.4122, + "step": 15476 + }, + { + "epoch": 0.6547508249428886, + "grad_norm": 0.1535889059305191, + "learning_rate": 0.001, + "loss": 1.8641, + "step": 15477 + }, + { + "epoch": 0.6547931297064049, + "grad_norm": 0.9799718856811523, + "learning_rate": 0.001, + "loss": 2.1668, + "step": 15478 + }, + { + "epoch": 0.6548354344699213, + "grad_norm": 0.18154305219650269, + "learning_rate": 0.001, + "loss": 2.3197, + "step": 15479 + }, + { + "epoch": 0.6548777392334377, + "grad_norm": 0.1667226254940033, + "learning_rate": 0.001, + "loss": 2.3554, + "step": 15480 + }, + { + "epoch": 0.654920043996954, + "grad_norm": 1.3010785579681396, + "learning_rate": 0.001, + "loss": 2.7386, + "step": 15481 + }, + { + "epoch": 0.6549623487604704, + "grad_norm": 0.19829553365707397, + "learning_rate": 0.001, + "loss": 1.6301, + "step": 15482 + }, + { + "epoch": 0.6550046535239868, + "grad_norm": 0.2689056098461151, + "learning_rate": 0.001, + "loss": 3.029, + "step": 15483 + }, + { + "epoch": 0.6550469582875031, + "grad_norm": 0.33899521827697754, + "learning_rate": 0.001, + "loss": 2.2413, + "step": 15484 + }, + { + "epoch": 0.6550892630510196, + "grad_norm": 0.26710861921310425, + "learning_rate": 0.001, + "loss": 2.9207, + "step": 15485 + }, + { + "epoch": 0.655131567814536, + "grad_norm": 0.22953356802463531, + "learning_rate": 0.001, + "loss": 2.1988, + "step": 15486 + }, + { + "epoch": 0.6551738725780523, + "grad_norm": 0.2150760293006897, + "learning_rate": 0.001, + "loss": 2.098, + "step": 15487 + }, + { + "epoch": 0.6552161773415687, + "grad_norm": 0.20908361673355103, + "learning_rate": 0.001, + "loss": 2.7129, + "step": 15488 + }, + { + "epoch": 0.6552584821050851, + "grad_norm": 0.20225393772125244, + "learning_rate": 0.001, + "loss": 2.7402, + "step": 15489 + }, + { + "epoch": 0.6553007868686014, + "grad_norm": 0.2164647877216339, + "learning_rate": 0.001, + "loss": 2.245, + "step": 15490 + }, + { + "epoch": 0.6553430916321178, + "grad_norm": 0.17477239668369293, + "learning_rate": 0.001, + "loss": 2.915, + "step": 15491 + }, + { + "epoch": 0.6553853963956342, + "grad_norm": 1.133888602256775, + "learning_rate": 0.001, + "loss": 1.5993, + "step": 15492 + }, + { + "epoch": 0.6554277011591505, + "grad_norm": 1.6383650302886963, + "learning_rate": 0.001, + "loss": 1.6001, + "step": 15493 + }, + { + "epoch": 0.6554700059226669, + "grad_norm": 0.419475257396698, + "learning_rate": 0.001, + "loss": 1.8772, + "step": 15494 + }, + { + "epoch": 0.6555123106861833, + "grad_norm": 0.18061023950576782, + "learning_rate": 0.001, + "loss": 2.0269, + "step": 15495 + }, + { + "epoch": 0.6555546154496996, + "grad_norm": 0.3554385304450989, + "learning_rate": 0.001, + "loss": 2.9187, + "step": 15496 + }, + { + "epoch": 0.655596920213216, + "grad_norm": 0.39084017276763916, + "learning_rate": 0.001, + "loss": 3.1199, + "step": 15497 + }, + { + "epoch": 0.6556392249767323, + "grad_norm": 0.168495312333107, + "learning_rate": 0.001, + "loss": 3.786, + "step": 15498 + }, + { + "epoch": 0.6556815297402487, + "grad_norm": 0.6152557134628296, + "learning_rate": 0.001, + "loss": 2.2492, + "step": 15499 + }, + { + "epoch": 0.6557238345037651, + "grad_norm": 0.19804157316684723, + "learning_rate": 0.001, + "loss": 2.449, + "step": 15500 + }, + { + "epoch": 0.6557661392672814, + "grad_norm": 0.1926928460597992, + "learning_rate": 0.001, + "loss": 2.3249, + "step": 15501 + }, + { + "epoch": 0.6558084440307979, + "grad_norm": 0.1641806960105896, + "learning_rate": 0.001, + "loss": 1.742, + "step": 15502 + }, + { + "epoch": 0.6558507487943143, + "grad_norm": 0.1997319757938385, + "learning_rate": 0.001, + "loss": 1.7896, + "step": 15503 + }, + { + "epoch": 0.6558930535578306, + "grad_norm": 0.2186889350414276, + "learning_rate": 0.001, + "loss": 2.4833, + "step": 15504 + }, + { + "epoch": 0.655935358321347, + "grad_norm": 0.16924524307250977, + "learning_rate": 0.001, + "loss": 2.279, + "step": 15505 + }, + { + "epoch": 0.6559776630848634, + "grad_norm": 0.17367133498191833, + "learning_rate": 0.001, + "loss": 2.3269, + "step": 15506 + }, + { + "epoch": 0.6560199678483797, + "grad_norm": 0.1684117615222931, + "learning_rate": 0.001, + "loss": 2.4659, + "step": 15507 + }, + { + "epoch": 0.6560622726118961, + "grad_norm": 8.400760650634766, + "learning_rate": 0.001, + "loss": 2.1449, + "step": 15508 + }, + { + "epoch": 0.6561045773754125, + "grad_norm": 0.17332401871681213, + "learning_rate": 0.001, + "loss": 1.8829, + "step": 15509 + }, + { + "epoch": 0.6561468821389288, + "grad_norm": 0.4135810136795044, + "learning_rate": 0.001, + "loss": 2.5169, + "step": 15510 + }, + { + "epoch": 0.6561891869024452, + "grad_norm": 0.15025272965431213, + "learning_rate": 0.001, + "loss": 1.9536, + "step": 15511 + }, + { + "epoch": 0.6562314916659616, + "grad_norm": 0.21531356871128082, + "learning_rate": 0.001, + "loss": 2.1685, + "step": 15512 + }, + { + "epoch": 0.6562737964294779, + "grad_norm": 0.20930558443069458, + "learning_rate": 0.001, + "loss": 2.3867, + "step": 15513 + }, + { + "epoch": 0.6563161011929943, + "grad_norm": 0.1821804642677307, + "learning_rate": 0.001, + "loss": 1.4883, + "step": 15514 + }, + { + "epoch": 0.6563584059565107, + "grad_norm": 0.2047373205423355, + "learning_rate": 0.001, + "loss": 3.2604, + "step": 15515 + }, + { + "epoch": 0.656400710720027, + "grad_norm": 0.1793907731771469, + "learning_rate": 0.001, + "loss": 2.5151, + "step": 15516 + }, + { + "epoch": 0.6564430154835434, + "grad_norm": 0.20762686431407928, + "learning_rate": 0.001, + "loss": 3.577, + "step": 15517 + }, + { + "epoch": 0.6564853202470599, + "grad_norm": 0.20441284775733948, + "learning_rate": 0.001, + "loss": 2.101, + "step": 15518 + }, + { + "epoch": 0.6565276250105762, + "grad_norm": 0.3753068447113037, + "learning_rate": 0.001, + "loss": 3.6612, + "step": 15519 + }, + { + "epoch": 0.6565699297740926, + "grad_norm": 0.1667603701353073, + "learning_rate": 0.001, + "loss": 1.8838, + "step": 15520 + }, + { + "epoch": 0.656612234537609, + "grad_norm": 16.620149612426758, + "learning_rate": 0.001, + "loss": 2.5339, + "step": 15521 + }, + { + "epoch": 0.6566545393011253, + "grad_norm": 0.17226529121398926, + "learning_rate": 0.001, + "loss": 2.9333, + "step": 15522 + }, + { + "epoch": 0.6566968440646417, + "grad_norm": 0.1851300597190857, + "learning_rate": 0.001, + "loss": 2.8224, + "step": 15523 + }, + { + "epoch": 0.6567391488281581, + "grad_norm": 0.1961529552936554, + "learning_rate": 0.001, + "loss": 2.389, + "step": 15524 + }, + { + "epoch": 0.6567814535916744, + "grad_norm": 0.2944627106189728, + "learning_rate": 0.001, + "loss": 2.1613, + "step": 15525 + }, + { + "epoch": 0.6568237583551908, + "grad_norm": 0.4502488374710083, + "learning_rate": 0.001, + "loss": 1.678, + "step": 15526 + }, + { + "epoch": 0.6568660631187072, + "grad_norm": 0.19314773380756378, + "learning_rate": 0.001, + "loss": 2.8654, + "step": 15527 + }, + { + "epoch": 0.6569083678822235, + "grad_norm": 0.19277732074260712, + "learning_rate": 0.001, + "loss": 3.3764, + "step": 15528 + }, + { + "epoch": 0.6569506726457399, + "grad_norm": 0.2084587961435318, + "learning_rate": 0.001, + "loss": 3.0331, + "step": 15529 + }, + { + "epoch": 0.6569929774092563, + "grad_norm": 0.24447475373744965, + "learning_rate": 0.001, + "loss": 2.4765, + "step": 15530 + }, + { + "epoch": 0.6570352821727726, + "grad_norm": 0.4201889932155609, + "learning_rate": 0.001, + "loss": 1.8756, + "step": 15531 + }, + { + "epoch": 0.657077586936289, + "grad_norm": 0.3729053735733032, + "learning_rate": 0.001, + "loss": 1.9558, + "step": 15532 + }, + { + "epoch": 0.6571198916998054, + "grad_norm": 0.21242676675319672, + "learning_rate": 0.001, + "loss": 2.0069, + "step": 15533 + }, + { + "epoch": 0.6571621964633217, + "grad_norm": 0.1712314486503601, + "learning_rate": 0.001, + "loss": 1.3812, + "step": 15534 + }, + { + "epoch": 0.6572045012268382, + "grad_norm": 0.1673876792192459, + "learning_rate": 0.001, + "loss": 1.1909, + "step": 15535 + }, + { + "epoch": 0.6572468059903546, + "grad_norm": 0.15203256905078888, + "learning_rate": 0.001, + "loss": 1.6762, + "step": 15536 + }, + { + "epoch": 0.6572891107538709, + "grad_norm": 0.15472784638404846, + "learning_rate": 0.001, + "loss": 1.5563, + "step": 15537 + }, + { + "epoch": 0.6573314155173873, + "grad_norm": 0.22867830097675323, + "learning_rate": 0.001, + "loss": 2.4964, + "step": 15538 + }, + { + "epoch": 0.6573737202809037, + "grad_norm": 1.3349720239639282, + "learning_rate": 0.001, + "loss": 1.7851, + "step": 15539 + }, + { + "epoch": 0.65741602504442, + "grad_norm": 3.2347517013549805, + "learning_rate": 0.001, + "loss": 1.8873, + "step": 15540 + }, + { + "epoch": 0.6574583298079364, + "grad_norm": 1.0401722192764282, + "learning_rate": 0.001, + "loss": 1.982, + "step": 15541 + }, + { + "epoch": 0.6575006345714527, + "grad_norm": 0.4150663912296295, + "learning_rate": 0.001, + "loss": 2.4127, + "step": 15542 + }, + { + "epoch": 0.6575429393349691, + "grad_norm": 0.28380629420280457, + "learning_rate": 0.001, + "loss": 1.7512, + "step": 15543 + }, + { + "epoch": 0.6575852440984855, + "grad_norm": 0.1765417903661728, + "learning_rate": 0.001, + "loss": 2.7307, + "step": 15544 + }, + { + "epoch": 0.6576275488620018, + "grad_norm": 0.22841008007526398, + "learning_rate": 0.001, + "loss": 2.1964, + "step": 15545 + }, + { + "epoch": 0.6576698536255182, + "grad_norm": 0.3355230391025543, + "learning_rate": 0.001, + "loss": 2.1679, + "step": 15546 + }, + { + "epoch": 0.6577121583890346, + "grad_norm": 0.16296561062335968, + "learning_rate": 0.001, + "loss": 1.8227, + "step": 15547 + }, + { + "epoch": 0.6577544631525509, + "grad_norm": 0.2643844485282898, + "learning_rate": 0.001, + "loss": 2.0391, + "step": 15548 + }, + { + "epoch": 0.6577967679160673, + "grad_norm": 0.19168947637081146, + "learning_rate": 0.001, + "loss": 1.9581, + "step": 15549 + }, + { + "epoch": 0.6578390726795837, + "grad_norm": 0.17555919289588928, + "learning_rate": 0.001, + "loss": 1.7128, + "step": 15550 + }, + { + "epoch": 0.6578813774431, + "grad_norm": 0.21022935211658478, + "learning_rate": 0.001, + "loss": 2.0966, + "step": 15551 + }, + { + "epoch": 0.6579236822066165, + "grad_norm": 0.17026807367801666, + "learning_rate": 0.001, + "loss": 1.7935, + "step": 15552 + }, + { + "epoch": 0.6579659869701329, + "grad_norm": 9.642647743225098, + "learning_rate": 0.001, + "loss": 1.9757, + "step": 15553 + }, + { + "epoch": 0.6580082917336492, + "grad_norm": 0.1723700612783432, + "learning_rate": 0.001, + "loss": 2.0512, + "step": 15554 + }, + { + "epoch": 0.6580505964971656, + "grad_norm": 0.16185259819030762, + "learning_rate": 0.001, + "loss": 2.1963, + "step": 15555 + }, + { + "epoch": 0.658092901260682, + "grad_norm": 0.17926618456840515, + "learning_rate": 0.001, + "loss": 2.3246, + "step": 15556 + }, + { + "epoch": 0.6581352060241983, + "grad_norm": 0.31012213230133057, + "learning_rate": 0.001, + "loss": 2.4251, + "step": 15557 + }, + { + "epoch": 0.6581775107877147, + "grad_norm": 0.2613414525985718, + "learning_rate": 0.001, + "loss": 3.0419, + "step": 15558 + }, + { + "epoch": 0.6582198155512311, + "grad_norm": 0.17412632703781128, + "learning_rate": 0.001, + "loss": 2.2753, + "step": 15559 + }, + { + "epoch": 0.6582621203147474, + "grad_norm": 0.40548646450042725, + "learning_rate": 0.001, + "loss": 1.9838, + "step": 15560 + }, + { + "epoch": 0.6583044250782638, + "grad_norm": 0.16524159908294678, + "learning_rate": 0.001, + "loss": 2.8922, + "step": 15561 + }, + { + "epoch": 0.6583467298417802, + "grad_norm": 0.17522113025188446, + "learning_rate": 0.001, + "loss": 1.704, + "step": 15562 + }, + { + "epoch": 0.6583890346052965, + "grad_norm": 0.1847570687532425, + "learning_rate": 0.001, + "loss": 2.1191, + "step": 15563 + }, + { + "epoch": 0.6584313393688129, + "grad_norm": 0.16982965171337128, + "learning_rate": 0.001, + "loss": 1.4231, + "step": 15564 + }, + { + "epoch": 0.6584736441323293, + "grad_norm": 0.16816236078739166, + "learning_rate": 0.001, + "loss": 2.9246, + "step": 15565 + }, + { + "epoch": 0.6585159488958456, + "grad_norm": 0.18339279294013977, + "learning_rate": 0.001, + "loss": 2.7366, + "step": 15566 + }, + { + "epoch": 0.658558253659362, + "grad_norm": 0.15726692974567413, + "learning_rate": 0.001, + "loss": 1.7157, + "step": 15567 + }, + { + "epoch": 0.6586005584228785, + "grad_norm": 0.1590546816587448, + "learning_rate": 0.001, + "loss": 1.957, + "step": 15568 + }, + { + "epoch": 0.6586428631863948, + "grad_norm": 0.28861668705940247, + "learning_rate": 0.001, + "loss": 2.2768, + "step": 15569 + }, + { + "epoch": 0.6586851679499112, + "grad_norm": 0.15033060312271118, + "learning_rate": 0.001, + "loss": 1.8739, + "step": 15570 + }, + { + "epoch": 0.6587274727134276, + "grad_norm": 0.17307975888252258, + "learning_rate": 0.001, + "loss": 1.7251, + "step": 15571 + }, + { + "epoch": 0.6587697774769439, + "grad_norm": 0.4346740245819092, + "learning_rate": 0.001, + "loss": 3.3548, + "step": 15572 + }, + { + "epoch": 0.6588120822404603, + "grad_norm": 0.24258974194526672, + "learning_rate": 0.001, + "loss": 2.4327, + "step": 15573 + }, + { + "epoch": 0.6588543870039767, + "grad_norm": 0.1866195797920227, + "learning_rate": 0.001, + "loss": 2.1185, + "step": 15574 + }, + { + "epoch": 0.658896691767493, + "grad_norm": 0.15456928312778473, + "learning_rate": 0.001, + "loss": 1.487, + "step": 15575 + }, + { + "epoch": 0.6589389965310094, + "grad_norm": 0.18687579035758972, + "learning_rate": 0.001, + "loss": 2.329, + "step": 15576 + }, + { + "epoch": 0.6589813012945258, + "grad_norm": 0.16042332351207733, + "learning_rate": 0.001, + "loss": 2.1489, + "step": 15577 + }, + { + "epoch": 0.6590236060580421, + "grad_norm": 0.15789800882339478, + "learning_rate": 0.001, + "loss": 2.7708, + "step": 15578 + }, + { + "epoch": 0.6590659108215585, + "grad_norm": 0.4976028501987457, + "learning_rate": 0.001, + "loss": 1.66, + "step": 15579 + }, + { + "epoch": 0.6591082155850749, + "grad_norm": 0.18985185027122498, + "learning_rate": 0.001, + "loss": 1.7931, + "step": 15580 + }, + { + "epoch": 0.6591505203485912, + "grad_norm": 0.18553532660007477, + "learning_rate": 0.001, + "loss": 2.2777, + "step": 15581 + }, + { + "epoch": 0.6591928251121076, + "grad_norm": 0.15861479938030243, + "learning_rate": 0.001, + "loss": 1.9693, + "step": 15582 + }, + { + "epoch": 0.659235129875624, + "grad_norm": 0.34126266837120056, + "learning_rate": 0.001, + "loss": 1.8181, + "step": 15583 + }, + { + "epoch": 0.6592774346391403, + "grad_norm": 3.384877920150757, + "learning_rate": 0.001, + "loss": 1.9545, + "step": 15584 + }, + { + "epoch": 0.6593197394026568, + "grad_norm": 0.18254320323467255, + "learning_rate": 0.001, + "loss": 2.2612, + "step": 15585 + }, + { + "epoch": 0.6593620441661731, + "grad_norm": 0.17780011892318726, + "learning_rate": 0.001, + "loss": 2.3693, + "step": 15586 + }, + { + "epoch": 0.6594043489296895, + "grad_norm": 0.2957765460014343, + "learning_rate": 0.001, + "loss": 2.8228, + "step": 15587 + }, + { + "epoch": 0.6594466536932059, + "grad_norm": 0.20004132390022278, + "learning_rate": 0.001, + "loss": 2.2395, + "step": 15588 + }, + { + "epoch": 0.6594889584567222, + "grad_norm": 0.16427592933177948, + "learning_rate": 0.001, + "loss": 1.9505, + "step": 15589 + }, + { + "epoch": 0.6595312632202386, + "grad_norm": 0.3645588755607605, + "learning_rate": 0.001, + "loss": 2.4585, + "step": 15590 + }, + { + "epoch": 0.659573567983755, + "grad_norm": 0.1937217265367508, + "learning_rate": 0.001, + "loss": 1.9487, + "step": 15591 + }, + { + "epoch": 0.6596158727472713, + "grad_norm": 0.5016273260116577, + "learning_rate": 0.001, + "loss": 3.591, + "step": 15592 + }, + { + "epoch": 0.6596581775107877, + "grad_norm": 0.15664246678352356, + "learning_rate": 0.001, + "loss": 1.5117, + "step": 15593 + }, + { + "epoch": 0.6597004822743041, + "grad_norm": 1.7184746265411377, + "learning_rate": 0.001, + "loss": 2.2226, + "step": 15594 + }, + { + "epoch": 0.6597427870378204, + "grad_norm": 1.1707103252410889, + "learning_rate": 0.001, + "loss": 2.213, + "step": 15595 + }, + { + "epoch": 0.6597850918013368, + "grad_norm": 0.1881413757801056, + "learning_rate": 0.001, + "loss": 2.1988, + "step": 15596 + }, + { + "epoch": 0.6598273965648532, + "grad_norm": 0.1818539947271347, + "learning_rate": 0.001, + "loss": 2.2715, + "step": 15597 + }, + { + "epoch": 0.6598697013283695, + "grad_norm": 0.5400058627128601, + "learning_rate": 0.001, + "loss": 1.6366, + "step": 15598 + }, + { + "epoch": 0.6599120060918859, + "grad_norm": 0.3063301742076874, + "learning_rate": 0.001, + "loss": 2.4906, + "step": 15599 + }, + { + "epoch": 0.6599543108554023, + "grad_norm": 0.2518875300884247, + "learning_rate": 0.001, + "loss": 2.2794, + "step": 15600 + }, + { + "epoch": 0.6599966156189186, + "grad_norm": 0.17568255960941315, + "learning_rate": 0.001, + "loss": 2.1365, + "step": 15601 + }, + { + "epoch": 0.6600389203824351, + "grad_norm": 0.17697425186634064, + "learning_rate": 0.001, + "loss": 2.8336, + "step": 15602 + }, + { + "epoch": 0.6600812251459515, + "grad_norm": 1.3957675695419312, + "learning_rate": 0.001, + "loss": 2.2867, + "step": 15603 + }, + { + "epoch": 0.6601235299094678, + "grad_norm": 0.15436089038848877, + "learning_rate": 0.001, + "loss": 1.9459, + "step": 15604 + }, + { + "epoch": 0.6601658346729842, + "grad_norm": 0.16911160945892334, + "learning_rate": 0.001, + "loss": 2.0371, + "step": 15605 + }, + { + "epoch": 0.6602081394365006, + "grad_norm": 0.2687240242958069, + "learning_rate": 0.001, + "loss": 2.4646, + "step": 15606 + }, + { + "epoch": 0.6602504442000169, + "grad_norm": 0.2176024168729782, + "learning_rate": 0.001, + "loss": 2.0944, + "step": 15607 + }, + { + "epoch": 0.6602927489635333, + "grad_norm": 0.4650316536426544, + "learning_rate": 0.001, + "loss": 1.8368, + "step": 15608 + }, + { + "epoch": 0.6603350537270497, + "grad_norm": 0.21563652157783508, + "learning_rate": 0.001, + "loss": 3.3112, + "step": 15609 + }, + { + "epoch": 0.660377358490566, + "grad_norm": 0.14292295277118683, + "learning_rate": 0.001, + "loss": 2.6444, + "step": 15610 + }, + { + "epoch": 0.6604196632540824, + "grad_norm": 0.9657493233680725, + "learning_rate": 0.001, + "loss": 2.7724, + "step": 15611 + }, + { + "epoch": 0.6604619680175988, + "grad_norm": 0.31469717621803284, + "learning_rate": 0.001, + "loss": 2.1013, + "step": 15612 + }, + { + "epoch": 0.6605042727811151, + "grad_norm": 0.16225768625736237, + "learning_rate": 0.001, + "loss": 1.816, + "step": 15613 + }, + { + "epoch": 0.6605465775446315, + "grad_norm": 0.19633138179779053, + "learning_rate": 0.001, + "loss": 1.9044, + "step": 15614 + }, + { + "epoch": 0.6605888823081479, + "grad_norm": 0.18959277868270874, + "learning_rate": 0.001, + "loss": 2.2034, + "step": 15615 + }, + { + "epoch": 0.6606311870716642, + "grad_norm": 0.25994089245796204, + "learning_rate": 0.001, + "loss": 2.6884, + "step": 15616 + }, + { + "epoch": 0.6606734918351806, + "grad_norm": 0.17219536006450653, + "learning_rate": 0.001, + "loss": 2.4676, + "step": 15617 + }, + { + "epoch": 0.6607157965986971, + "grad_norm": 0.15563371777534485, + "learning_rate": 0.001, + "loss": 2.8265, + "step": 15618 + }, + { + "epoch": 0.6607581013622134, + "grad_norm": 0.1527656465768814, + "learning_rate": 0.001, + "loss": 1.3507, + "step": 15619 + }, + { + "epoch": 0.6608004061257298, + "grad_norm": 0.2252880334854126, + "learning_rate": 0.001, + "loss": 2.4293, + "step": 15620 + }, + { + "epoch": 0.6608427108892462, + "grad_norm": 0.20225971937179565, + "learning_rate": 0.001, + "loss": 2.1757, + "step": 15621 + }, + { + "epoch": 0.6608850156527625, + "grad_norm": 0.4528272747993469, + "learning_rate": 0.001, + "loss": 1.8446, + "step": 15622 + }, + { + "epoch": 0.6609273204162789, + "grad_norm": 0.16811969876289368, + "learning_rate": 0.001, + "loss": 1.8456, + "step": 15623 + }, + { + "epoch": 0.6609696251797953, + "grad_norm": 0.16160327196121216, + "learning_rate": 0.001, + "loss": 2.14, + "step": 15624 + }, + { + "epoch": 0.6610119299433116, + "grad_norm": 0.17660216987133026, + "learning_rate": 0.001, + "loss": 3.2061, + "step": 15625 + }, + { + "epoch": 0.661054234706828, + "grad_norm": 0.20479421317577362, + "learning_rate": 0.001, + "loss": 4.2633, + "step": 15626 + }, + { + "epoch": 0.6610965394703444, + "grad_norm": 0.21080619096755981, + "learning_rate": 0.001, + "loss": 2.2565, + "step": 15627 + }, + { + "epoch": 0.6611388442338607, + "grad_norm": 0.180558443069458, + "learning_rate": 0.001, + "loss": 1.8742, + "step": 15628 + }, + { + "epoch": 0.6611811489973771, + "grad_norm": 0.21047769486904144, + "learning_rate": 0.001, + "loss": 2.6093, + "step": 15629 + }, + { + "epoch": 0.6612234537608935, + "grad_norm": 0.1717175841331482, + "learning_rate": 0.001, + "loss": 1.7606, + "step": 15630 + }, + { + "epoch": 0.6612657585244098, + "grad_norm": 0.19754162430763245, + "learning_rate": 0.001, + "loss": 2.4099, + "step": 15631 + }, + { + "epoch": 0.6613080632879262, + "grad_norm": 0.24597451090812683, + "learning_rate": 0.001, + "loss": 2.0901, + "step": 15632 + }, + { + "epoch": 0.6613503680514425, + "grad_norm": 0.1566450297832489, + "learning_rate": 0.001, + "loss": 1.5422, + "step": 15633 + }, + { + "epoch": 0.661392672814959, + "grad_norm": 0.16817563772201538, + "learning_rate": 0.001, + "loss": 1.6473, + "step": 15634 + }, + { + "epoch": 0.6614349775784754, + "grad_norm": 0.16162802278995514, + "learning_rate": 0.001, + "loss": 2.6791, + "step": 15635 + }, + { + "epoch": 0.6614772823419917, + "grad_norm": 0.15925811231136322, + "learning_rate": 0.001, + "loss": 2.2663, + "step": 15636 + }, + { + "epoch": 0.6615195871055081, + "grad_norm": 0.17941713333129883, + "learning_rate": 0.001, + "loss": 2.1478, + "step": 15637 + }, + { + "epoch": 0.6615618918690245, + "grad_norm": 0.14808796346187592, + "learning_rate": 0.001, + "loss": 2.3699, + "step": 15638 + }, + { + "epoch": 0.6616041966325408, + "grad_norm": 0.17554134130477905, + "learning_rate": 0.001, + "loss": 1.5387, + "step": 15639 + }, + { + "epoch": 0.6616465013960572, + "grad_norm": 0.17657719552516937, + "learning_rate": 0.001, + "loss": 1.8514, + "step": 15640 + }, + { + "epoch": 0.6616888061595736, + "grad_norm": 0.9961352348327637, + "learning_rate": 0.001, + "loss": 2.5427, + "step": 15641 + }, + { + "epoch": 0.6617311109230899, + "grad_norm": 0.16566766798496246, + "learning_rate": 0.001, + "loss": 3.038, + "step": 15642 + }, + { + "epoch": 0.6617734156866063, + "grad_norm": 0.3651495575904846, + "learning_rate": 0.001, + "loss": 2.575, + "step": 15643 + }, + { + "epoch": 0.6618157204501227, + "grad_norm": 27.26308250427246, + "learning_rate": 0.001, + "loss": 1.8942, + "step": 15644 + }, + { + "epoch": 0.661858025213639, + "grad_norm": 0.15556779503822327, + "learning_rate": 0.001, + "loss": 3.0992, + "step": 15645 + }, + { + "epoch": 0.6619003299771554, + "grad_norm": 0.2336292266845703, + "learning_rate": 0.001, + "loss": 2.0776, + "step": 15646 + }, + { + "epoch": 0.6619426347406718, + "grad_norm": 0.191901296377182, + "learning_rate": 0.001, + "loss": 2.344, + "step": 15647 + }, + { + "epoch": 0.6619849395041881, + "grad_norm": 0.1957559734582901, + "learning_rate": 0.001, + "loss": 2.3465, + "step": 15648 + }, + { + "epoch": 0.6620272442677045, + "grad_norm": 0.19413861632347107, + "learning_rate": 0.001, + "loss": 2.6151, + "step": 15649 + }, + { + "epoch": 0.662069549031221, + "grad_norm": 0.2169937938451767, + "learning_rate": 0.001, + "loss": 2.3186, + "step": 15650 + }, + { + "epoch": 0.6621118537947372, + "grad_norm": 0.16846176981925964, + "learning_rate": 0.001, + "loss": 1.4052, + "step": 15651 + }, + { + "epoch": 0.6621541585582537, + "grad_norm": 0.23452334105968475, + "learning_rate": 0.001, + "loss": 2.1938, + "step": 15652 + }, + { + "epoch": 0.6621964633217701, + "grad_norm": 0.19128349423408508, + "learning_rate": 0.001, + "loss": 2.3334, + "step": 15653 + }, + { + "epoch": 0.6622387680852864, + "grad_norm": 0.3340997099876404, + "learning_rate": 0.001, + "loss": 2.0692, + "step": 15654 + }, + { + "epoch": 0.6622810728488028, + "grad_norm": 0.2314499020576477, + "learning_rate": 0.001, + "loss": 1.8396, + "step": 15655 + }, + { + "epoch": 0.6623233776123192, + "grad_norm": 0.19297367334365845, + "learning_rate": 0.001, + "loss": 2.2581, + "step": 15656 + }, + { + "epoch": 0.6623656823758355, + "grad_norm": 0.15719622373580933, + "learning_rate": 0.001, + "loss": 1.741, + "step": 15657 + }, + { + "epoch": 0.6624079871393519, + "grad_norm": 0.1912499964237213, + "learning_rate": 0.001, + "loss": 2.2499, + "step": 15658 + }, + { + "epoch": 0.6624502919028683, + "grad_norm": 0.21334213018417358, + "learning_rate": 0.001, + "loss": 2.5132, + "step": 15659 + }, + { + "epoch": 0.6624925966663846, + "grad_norm": 0.215172678232193, + "learning_rate": 0.001, + "loss": 1.9443, + "step": 15660 + }, + { + "epoch": 0.662534901429901, + "grad_norm": 0.18598368763923645, + "learning_rate": 0.001, + "loss": 2.266, + "step": 15661 + }, + { + "epoch": 0.6625772061934174, + "grad_norm": 0.1715102642774582, + "learning_rate": 0.001, + "loss": 2.4652, + "step": 15662 + }, + { + "epoch": 0.6626195109569337, + "grad_norm": 0.19849710166454315, + "learning_rate": 0.001, + "loss": 1.9286, + "step": 15663 + }, + { + "epoch": 0.6626618157204501, + "grad_norm": 0.24974191188812256, + "learning_rate": 0.001, + "loss": 2.3705, + "step": 15664 + }, + { + "epoch": 0.6627041204839665, + "grad_norm": 0.1930912882089615, + "learning_rate": 0.001, + "loss": 1.6271, + "step": 15665 + }, + { + "epoch": 0.6627464252474828, + "grad_norm": 0.18645654618740082, + "learning_rate": 0.001, + "loss": 1.5376, + "step": 15666 + }, + { + "epoch": 0.6627887300109992, + "grad_norm": 0.14447803795337677, + "learning_rate": 0.001, + "loss": 1.5098, + "step": 15667 + }, + { + "epoch": 0.6628310347745157, + "grad_norm": 0.1653042733669281, + "learning_rate": 0.001, + "loss": 1.52, + "step": 15668 + }, + { + "epoch": 0.662873339538032, + "grad_norm": 0.16530010104179382, + "learning_rate": 0.001, + "loss": 1.9507, + "step": 15669 + }, + { + "epoch": 0.6629156443015484, + "grad_norm": 0.18780213594436646, + "learning_rate": 0.001, + "loss": 1.5614, + "step": 15670 + }, + { + "epoch": 0.6629579490650648, + "grad_norm": 3.833364725112915, + "learning_rate": 0.001, + "loss": 1.9021, + "step": 15671 + }, + { + "epoch": 0.6630002538285811, + "grad_norm": 0.16255183517932892, + "learning_rate": 0.001, + "loss": 2.8819, + "step": 15672 + }, + { + "epoch": 0.6630425585920975, + "grad_norm": 0.19622357189655304, + "learning_rate": 0.001, + "loss": 3.1144, + "step": 15673 + }, + { + "epoch": 0.6630848633556139, + "grad_norm": 0.22275784611701965, + "learning_rate": 0.001, + "loss": 2.0371, + "step": 15674 + }, + { + "epoch": 0.6631271681191302, + "grad_norm": 0.17636406421661377, + "learning_rate": 0.001, + "loss": 2.1601, + "step": 15675 + }, + { + "epoch": 0.6631694728826466, + "grad_norm": 0.23526060581207275, + "learning_rate": 0.001, + "loss": 2.5409, + "step": 15676 + }, + { + "epoch": 0.6632117776461629, + "grad_norm": 0.17581601440906525, + "learning_rate": 0.001, + "loss": 1.7807, + "step": 15677 + }, + { + "epoch": 0.6632540824096793, + "grad_norm": 0.14658299088478088, + "learning_rate": 0.001, + "loss": 2.1031, + "step": 15678 + }, + { + "epoch": 0.6632963871731957, + "grad_norm": 0.19805100560188293, + "learning_rate": 0.001, + "loss": 1.9881, + "step": 15679 + }, + { + "epoch": 0.663338691936712, + "grad_norm": 0.1707378327846527, + "learning_rate": 0.001, + "loss": 2.3714, + "step": 15680 + }, + { + "epoch": 0.6633809967002284, + "grad_norm": 0.3700905740261078, + "learning_rate": 0.001, + "loss": 1.8238, + "step": 15681 + }, + { + "epoch": 0.6634233014637448, + "grad_norm": 0.2583630084991455, + "learning_rate": 0.001, + "loss": 3.4336, + "step": 15682 + }, + { + "epoch": 0.6634656062272611, + "grad_norm": 0.1871895045042038, + "learning_rate": 0.001, + "loss": 2.8345, + "step": 15683 + }, + { + "epoch": 0.6635079109907775, + "grad_norm": 0.18821239471435547, + "learning_rate": 0.001, + "loss": 2.0015, + "step": 15684 + }, + { + "epoch": 0.663550215754294, + "grad_norm": 0.1794331818819046, + "learning_rate": 0.001, + "loss": 2.6855, + "step": 15685 + }, + { + "epoch": 0.6635925205178103, + "grad_norm": 0.13422973453998566, + "learning_rate": 0.001, + "loss": 2.8227, + "step": 15686 + }, + { + "epoch": 0.6636348252813267, + "grad_norm": 0.16726963222026825, + "learning_rate": 0.001, + "loss": 2.4534, + "step": 15687 + }, + { + "epoch": 0.6636771300448431, + "grad_norm": 0.19704453647136688, + "learning_rate": 0.001, + "loss": 3.0143, + "step": 15688 + }, + { + "epoch": 0.6637194348083594, + "grad_norm": 0.27630937099456787, + "learning_rate": 0.001, + "loss": 1.6488, + "step": 15689 + }, + { + "epoch": 0.6637617395718758, + "grad_norm": 0.16017358005046844, + "learning_rate": 0.001, + "loss": 2.288, + "step": 15690 + }, + { + "epoch": 0.6638040443353922, + "grad_norm": 0.4271370768547058, + "learning_rate": 0.001, + "loss": 2.3904, + "step": 15691 + }, + { + "epoch": 0.6638463490989085, + "grad_norm": 0.17746147513389587, + "learning_rate": 0.001, + "loss": 2.5629, + "step": 15692 + }, + { + "epoch": 0.6638886538624249, + "grad_norm": 0.22230751812458038, + "learning_rate": 0.001, + "loss": 3.1131, + "step": 15693 + }, + { + "epoch": 0.6639309586259413, + "grad_norm": 0.1868673861026764, + "learning_rate": 0.001, + "loss": 2.424, + "step": 15694 + }, + { + "epoch": 0.6639732633894576, + "grad_norm": 0.1976270079612732, + "learning_rate": 0.001, + "loss": 2.4694, + "step": 15695 + }, + { + "epoch": 0.664015568152974, + "grad_norm": 0.18282292783260345, + "learning_rate": 0.001, + "loss": 2.1648, + "step": 15696 + }, + { + "epoch": 0.6640578729164904, + "grad_norm": 0.1945333033800125, + "learning_rate": 0.001, + "loss": 2.1718, + "step": 15697 + }, + { + "epoch": 0.6641001776800067, + "grad_norm": 0.19967901706695557, + "learning_rate": 0.001, + "loss": 1.6653, + "step": 15698 + }, + { + "epoch": 0.6641424824435231, + "grad_norm": 0.35160374641418457, + "learning_rate": 0.001, + "loss": 2.9863, + "step": 15699 + }, + { + "epoch": 0.6641847872070396, + "grad_norm": 0.18338724970817566, + "learning_rate": 0.001, + "loss": 2.0818, + "step": 15700 + }, + { + "epoch": 0.6642270919705558, + "grad_norm": 0.26480796933174133, + "learning_rate": 0.001, + "loss": 1.8102, + "step": 15701 + }, + { + "epoch": 0.6642693967340723, + "grad_norm": 0.1641346663236618, + "learning_rate": 0.001, + "loss": 1.8631, + "step": 15702 + }, + { + "epoch": 0.6643117014975887, + "grad_norm": 0.19785989820957184, + "learning_rate": 0.001, + "loss": 2.0509, + "step": 15703 + }, + { + "epoch": 0.664354006261105, + "grad_norm": 0.1693398356437683, + "learning_rate": 0.001, + "loss": 2.2726, + "step": 15704 + }, + { + "epoch": 0.6643963110246214, + "grad_norm": 0.20726856589317322, + "learning_rate": 0.001, + "loss": 2.8105, + "step": 15705 + }, + { + "epoch": 0.6644386157881378, + "grad_norm": 0.251668781042099, + "learning_rate": 0.001, + "loss": 2.1022, + "step": 15706 + }, + { + "epoch": 0.6644809205516541, + "grad_norm": 0.1539115458726883, + "learning_rate": 0.001, + "loss": 2.8067, + "step": 15707 + }, + { + "epoch": 0.6645232253151705, + "grad_norm": 0.36175769567489624, + "learning_rate": 0.001, + "loss": 1.5274, + "step": 15708 + }, + { + "epoch": 0.6645655300786869, + "grad_norm": 0.17199809849262238, + "learning_rate": 0.001, + "loss": 2.5353, + "step": 15709 + }, + { + "epoch": 0.6646078348422032, + "grad_norm": 2.3134865760803223, + "learning_rate": 0.001, + "loss": 1.6276, + "step": 15710 + }, + { + "epoch": 0.6646501396057196, + "grad_norm": 0.8231282234191895, + "learning_rate": 0.001, + "loss": 2.0584, + "step": 15711 + }, + { + "epoch": 0.664692444369236, + "grad_norm": 0.5767163038253784, + "learning_rate": 0.001, + "loss": 1.7341, + "step": 15712 + }, + { + "epoch": 0.6647347491327523, + "grad_norm": 0.1625528633594513, + "learning_rate": 0.001, + "loss": 2.172, + "step": 15713 + }, + { + "epoch": 0.6647770538962687, + "grad_norm": 0.8674761056900024, + "learning_rate": 0.001, + "loss": 2.3215, + "step": 15714 + }, + { + "epoch": 0.6648193586597851, + "grad_norm": 0.9063986539840698, + "learning_rate": 0.001, + "loss": 1.7439, + "step": 15715 + }, + { + "epoch": 0.6648616634233014, + "grad_norm": 0.17824086546897888, + "learning_rate": 0.001, + "loss": 2.0572, + "step": 15716 + }, + { + "epoch": 0.6649039681868179, + "grad_norm": 0.2059624344110489, + "learning_rate": 0.001, + "loss": 2.6742, + "step": 15717 + }, + { + "epoch": 0.6649462729503343, + "grad_norm": 0.6646909117698669, + "learning_rate": 0.001, + "loss": 2.3202, + "step": 15718 + }, + { + "epoch": 0.6649885777138506, + "grad_norm": 0.1927110105752945, + "learning_rate": 0.001, + "loss": 1.8751, + "step": 15719 + }, + { + "epoch": 0.665030882477367, + "grad_norm": 0.20116344094276428, + "learning_rate": 0.001, + "loss": 1.9791, + "step": 15720 + }, + { + "epoch": 0.6650731872408834, + "grad_norm": 0.44798147678375244, + "learning_rate": 0.001, + "loss": 2.8342, + "step": 15721 + }, + { + "epoch": 0.6651154920043997, + "grad_norm": 0.239822655916214, + "learning_rate": 0.001, + "loss": 1.8323, + "step": 15722 + }, + { + "epoch": 0.6651577967679161, + "grad_norm": 0.20988096296787262, + "learning_rate": 0.001, + "loss": 2.2684, + "step": 15723 + }, + { + "epoch": 0.6652001015314324, + "grad_norm": 0.22264404594898224, + "learning_rate": 0.001, + "loss": 2.8356, + "step": 15724 + }, + { + "epoch": 0.6652424062949488, + "grad_norm": 0.4639243185520172, + "learning_rate": 0.001, + "loss": 1.9119, + "step": 15725 + }, + { + "epoch": 0.6652847110584652, + "grad_norm": 0.8543448448181152, + "learning_rate": 0.001, + "loss": 1.6931, + "step": 15726 + }, + { + "epoch": 0.6653270158219815, + "grad_norm": 1.1423962116241455, + "learning_rate": 0.001, + "loss": 1.5227, + "step": 15727 + }, + { + "epoch": 0.6653693205854979, + "grad_norm": 0.1901700645685196, + "learning_rate": 0.001, + "loss": 2.7098, + "step": 15728 + }, + { + "epoch": 0.6654116253490143, + "grad_norm": 0.8082202076911926, + "learning_rate": 0.001, + "loss": 2.3667, + "step": 15729 + }, + { + "epoch": 0.6654539301125306, + "grad_norm": 0.3281959891319275, + "learning_rate": 0.001, + "loss": 1.9087, + "step": 15730 + }, + { + "epoch": 0.665496234876047, + "grad_norm": 0.3187538683414459, + "learning_rate": 0.001, + "loss": 1.9291, + "step": 15731 + }, + { + "epoch": 0.6655385396395634, + "grad_norm": 2.2117702960968018, + "learning_rate": 0.001, + "loss": 2.724, + "step": 15732 + }, + { + "epoch": 0.6655808444030797, + "grad_norm": 35.29819107055664, + "learning_rate": 0.001, + "loss": 2.2991, + "step": 15733 + }, + { + "epoch": 0.6656231491665962, + "grad_norm": 0.18186047673225403, + "learning_rate": 0.001, + "loss": 1.3769, + "step": 15734 + }, + { + "epoch": 0.6656654539301126, + "grad_norm": 0.24423609673976898, + "learning_rate": 0.001, + "loss": 2.32, + "step": 15735 + }, + { + "epoch": 0.6657077586936289, + "grad_norm": 0.319638729095459, + "learning_rate": 0.001, + "loss": 2.4374, + "step": 15736 + }, + { + "epoch": 0.6657500634571453, + "grad_norm": 0.1802143007516861, + "learning_rate": 0.001, + "loss": 2.2174, + "step": 15737 + }, + { + "epoch": 0.6657923682206617, + "grad_norm": 0.17544938623905182, + "learning_rate": 0.001, + "loss": 1.7473, + "step": 15738 + }, + { + "epoch": 0.665834672984178, + "grad_norm": 0.9758937954902649, + "learning_rate": 0.001, + "loss": 2.2743, + "step": 15739 + }, + { + "epoch": 0.6658769777476944, + "grad_norm": 0.1803831309080124, + "learning_rate": 0.001, + "loss": 3.2636, + "step": 15740 + }, + { + "epoch": 0.6659192825112108, + "grad_norm": 0.2357131391763687, + "learning_rate": 0.001, + "loss": 2.4164, + "step": 15741 + }, + { + "epoch": 0.6659615872747271, + "grad_norm": 0.16148456931114197, + "learning_rate": 0.001, + "loss": 1.8385, + "step": 15742 + }, + { + "epoch": 0.6660038920382435, + "grad_norm": 0.17655906081199646, + "learning_rate": 0.001, + "loss": 1.7078, + "step": 15743 + }, + { + "epoch": 0.6660461968017599, + "grad_norm": 0.18633557856082916, + "learning_rate": 0.001, + "loss": 1.5433, + "step": 15744 + }, + { + "epoch": 0.6660885015652762, + "grad_norm": 0.16458381712436676, + "learning_rate": 0.001, + "loss": 1.9442, + "step": 15745 + }, + { + "epoch": 0.6661308063287926, + "grad_norm": 0.1529700607061386, + "learning_rate": 0.001, + "loss": 2.7367, + "step": 15746 + }, + { + "epoch": 0.666173111092309, + "grad_norm": 0.16840951144695282, + "learning_rate": 0.001, + "loss": 2.7347, + "step": 15747 + }, + { + "epoch": 0.6662154158558253, + "grad_norm": 0.15616460144519806, + "learning_rate": 0.001, + "loss": 1.6326, + "step": 15748 + }, + { + "epoch": 0.6662577206193417, + "grad_norm": 45.16459274291992, + "learning_rate": 0.001, + "loss": 2.0402, + "step": 15749 + }, + { + "epoch": 0.6663000253828582, + "grad_norm": 0.35248544812202454, + "learning_rate": 0.001, + "loss": 2.0827, + "step": 15750 + }, + { + "epoch": 0.6663423301463745, + "grad_norm": 0.14932596683502197, + "learning_rate": 0.001, + "loss": 2.1283, + "step": 15751 + }, + { + "epoch": 0.6663846349098909, + "grad_norm": 0.22044512629508972, + "learning_rate": 0.001, + "loss": 2.6434, + "step": 15752 + }, + { + "epoch": 0.6664269396734073, + "grad_norm": 0.21503077447414398, + "learning_rate": 0.001, + "loss": 2.6892, + "step": 15753 + }, + { + "epoch": 0.6664692444369236, + "grad_norm": 3.1654529571533203, + "learning_rate": 0.001, + "loss": 1.6119, + "step": 15754 + }, + { + "epoch": 0.66651154920044, + "grad_norm": 0.19109536707401276, + "learning_rate": 0.001, + "loss": 1.7015, + "step": 15755 + }, + { + "epoch": 0.6665538539639564, + "grad_norm": 0.217402383685112, + "learning_rate": 0.001, + "loss": 2.0906, + "step": 15756 + }, + { + "epoch": 0.6665961587274727, + "grad_norm": 0.20222723484039307, + "learning_rate": 0.001, + "loss": 1.8401, + "step": 15757 + }, + { + "epoch": 0.6666384634909891, + "grad_norm": 0.24966546893119812, + "learning_rate": 0.001, + "loss": 1.984, + "step": 15758 + }, + { + "epoch": 0.6666807682545055, + "grad_norm": 0.25226879119873047, + "learning_rate": 0.001, + "loss": 2.9811, + "step": 15759 + }, + { + "epoch": 0.6667230730180218, + "grad_norm": 0.37204667925834656, + "learning_rate": 0.001, + "loss": 2.5787, + "step": 15760 + }, + { + "epoch": 0.6667653777815382, + "grad_norm": 1.5811610221862793, + "learning_rate": 0.001, + "loss": 1.7835, + "step": 15761 + }, + { + "epoch": 0.6668076825450546, + "grad_norm": 0.41911280155181885, + "learning_rate": 0.001, + "loss": 2.1604, + "step": 15762 + }, + { + "epoch": 0.6668499873085709, + "grad_norm": 0.19730260968208313, + "learning_rate": 0.001, + "loss": 3.0043, + "step": 15763 + }, + { + "epoch": 0.6668922920720873, + "grad_norm": 0.23359958827495575, + "learning_rate": 0.001, + "loss": 3.387, + "step": 15764 + }, + { + "epoch": 0.6669345968356037, + "grad_norm": 4.667820930480957, + "learning_rate": 0.001, + "loss": 2.4877, + "step": 15765 + }, + { + "epoch": 0.66697690159912, + "grad_norm": 0.25286656618118286, + "learning_rate": 0.001, + "loss": 1.9937, + "step": 15766 + }, + { + "epoch": 0.6670192063626365, + "grad_norm": 0.23963415622711182, + "learning_rate": 0.001, + "loss": 1.69, + "step": 15767 + }, + { + "epoch": 0.6670615111261528, + "grad_norm": 0.26238617300987244, + "learning_rate": 0.001, + "loss": 2.352, + "step": 15768 + }, + { + "epoch": 0.6671038158896692, + "grad_norm": 83.53499603271484, + "learning_rate": 0.001, + "loss": 2.5639, + "step": 15769 + }, + { + "epoch": 0.6671461206531856, + "grad_norm": 0.325317919254303, + "learning_rate": 0.001, + "loss": 2.2086, + "step": 15770 + }, + { + "epoch": 0.6671884254167019, + "grad_norm": 0.20828962326049805, + "learning_rate": 0.001, + "loss": 2.473, + "step": 15771 + }, + { + "epoch": 0.6672307301802183, + "grad_norm": 0.21652193367481232, + "learning_rate": 0.001, + "loss": 2.2587, + "step": 15772 + }, + { + "epoch": 0.6672730349437347, + "grad_norm": 0.44318485260009766, + "learning_rate": 0.001, + "loss": 2.0383, + "step": 15773 + }, + { + "epoch": 0.667315339707251, + "grad_norm": 0.5616191625595093, + "learning_rate": 0.001, + "loss": 1.6704, + "step": 15774 + }, + { + "epoch": 0.6673576444707674, + "grad_norm": 0.2393038272857666, + "learning_rate": 0.001, + "loss": 2.2136, + "step": 15775 + }, + { + "epoch": 0.6673999492342838, + "grad_norm": 0.2519043982028961, + "learning_rate": 0.001, + "loss": 2.8899, + "step": 15776 + }, + { + "epoch": 0.6674422539978001, + "grad_norm": 0.2502747178077698, + "learning_rate": 0.001, + "loss": 2.4507, + "step": 15777 + }, + { + "epoch": 0.6674845587613165, + "grad_norm": 0.20891942083835602, + "learning_rate": 0.001, + "loss": 1.7339, + "step": 15778 + }, + { + "epoch": 0.6675268635248329, + "grad_norm": 1.0747361183166504, + "learning_rate": 0.001, + "loss": 2.0077, + "step": 15779 + }, + { + "epoch": 0.6675691682883492, + "grad_norm": 0.23666757345199585, + "learning_rate": 0.001, + "loss": 2.8961, + "step": 15780 + }, + { + "epoch": 0.6676114730518656, + "grad_norm": 0.3330824077129364, + "learning_rate": 0.001, + "loss": 2.9923, + "step": 15781 + }, + { + "epoch": 0.667653777815382, + "grad_norm": 0.33238059282302856, + "learning_rate": 0.001, + "loss": 1.7868, + "step": 15782 + }, + { + "epoch": 0.6676960825788983, + "grad_norm": 0.17682193219661713, + "learning_rate": 0.001, + "loss": 2.7748, + "step": 15783 + }, + { + "epoch": 0.6677383873424148, + "grad_norm": 0.2014869600534439, + "learning_rate": 0.001, + "loss": 1.9485, + "step": 15784 + }, + { + "epoch": 0.6677806921059312, + "grad_norm": 0.17806148529052734, + "learning_rate": 0.001, + "loss": 1.7066, + "step": 15785 + }, + { + "epoch": 0.6678229968694475, + "grad_norm": 6.159350395202637, + "learning_rate": 0.001, + "loss": 2.0137, + "step": 15786 + }, + { + "epoch": 0.6678653016329639, + "grad_norm": 0.381747841835022, + "learning_rate": 0.001, + "loss": 2.2376, + "step": 15787 + }, + { + "epoch": 0.6679076063964803, + "grad_norm": 0.2559763789176941, + "learning_rate": 0.001, + "loss": 2.3698, + "step": 15788 + }, + { + "epoch": 0.6679499111599966, + "grad_norm": 0.24722157418727875, + "learning_rate": 0.001, + "loss": 2.6856, + "step": 15789 + }, + { + "epoch": 0.667992215923513, + "grad_norm": 3.812957286834717, + "learning_rate": 0.001, + "loss": 1.976, + "step": 15790 + }, + { + "epoch": 0.6680345206870294, + "grad_norm": 0.3154589533805847, + "learning_rate": 0.001, + "loss": 4.0476, + "step": 15791 + }, + { + "epoch": 0.6680768254505457, + "grad_norm": 0.20563848316669464, + "learning_rate": 0.001, + "loss": 2.0702, + "step": 15792 + }, + { + "epoch": 0.6681191302140621, + "grad_norm": 0.3167200982570648, + "learning_rate": 0.001, + "loss": 2.8908, + "step": 15793 + }, + { + "epoch": 0.6681614349775785, + "grad_norm": 0.2859385907649994, + "learning_rate": 0.001, + "loss": 2.3652, + "step": 15794 + }, + { + "epoch": 0.6682037397410948, + "grad_norm": 4.2778730392456055, + "learning_rate": 0.001, + "loss": 2.6051, + "step": 15795 + }, + { + "epoch": 0.6682460445046112, + "grad_norm": 0.23722970485687256, + "learning_rate": 0.001, + "loss": 1.4931, + "step": 15796 + }, + { + "epoch": 0.6682883492681276, + "grad_norm": 0.19192059338092804, + "learning_rate": 0.001, + "loss": 2.4992, + "step": 15797 + }, + { + "epoch": 0.6683306540316439, + "grad_norm": 0.25284337997436523, + "learning_rate": 0.001, + "loss": 1.957, + "step": 15798 + }, + { + "epoch": 0.6683729587951603, + "grad_norm": 0.23747120797634125, + "learning_rate": 0.001, + "loss": 1.6552, + "step": 15799 + }, + { + "epoch": 0.6684152635586768, + "grad_norm": 0.21687759459018707, + "learning_rate": 0.001, + "loss": 1.8681, + "step": 15800 + }, + { + "epoch": 0.668457568322193, + "grad_norm": 0.2691959738731384, + "learning_rate": 0.001, + "loss": 2.2289, + "step": 15801 + }, + { + "epoch": 0.6684998730857095, + "grad_norm": 0.2524837553501129, + "learning_rate": 0.001, + "loss": 1.8516, + "step": 15802 + }, + { + "epoch": 0.6685421778492259, + "grad_norm": 0.2057633101940155, + "learning_rate": 0.001, + "loss": 2.509, + "step": 15803 + }, + { + "epoch": 0.6685844826127422, + "grad_norm": 1.199033498764038, + "learning_rate": 0.001, + "loss": 1.5052, + "step": 15804 + }, + { + "epoch": 0.6686267873762586, + "grad_norm": 0.3630319833755493, + "learning_rate": 0.001, + "loss": 2.5884, + "step": 15805 + }, + { + "epoch": 0.668669092139775, + "grad_norm": 0.4470106065273285, + "learning_rate": 0.001, + "loss": 2.7533, + "step": 15806 + }, + { + "epoch": 0.6687113969032913, + "grad_norm": 0.18470320105552673, + "learning_rate": 0.001, + "loss": 2.5, + "step": 15807 + }, + { + "epoch": 0.6687537016668077, + "grad_norm": 3.678845167160034, + "learning_rate": 0.001, + "loss": 2.6681, + "step": 15808 + }, + { + "epoch": 0.6687960064303241, + "grad_norm": 0.19354915618896484, + "learning_rate": 0.001, + "loss": 2.4777, + "step": 15809 + }, + { + "epoch": 0.6688383111938404, + "grad_norm": 0.32464343309402466, + "learning_rate": 0.001, + "loss": 2.7447, + "step": 15810 + }, + { + "epoch": 0.6688806159573568, + "grad_norm": 0.1859036386013031, + "learning_rate": 0.001, + "loss": 1.5693, + "step": 15811 + }, + { + "epoch": 0.6689229207208731, + "grad_norm": 0.19540037214756012, + "learning_rate": 0.001, + "loss": 2.8704, + "step": 15812 + }, + { + "epoch": 0.6689652254843895, + "grad_norm": 0.24034158885478973, + "learning_rate": 0.001, + "loss": 1.6442, + "step": 15813 + }, + { + "epoch": 0.6690075302479059, + "grad_norm": 0.20161794126033783, + "learning_rate": 0.001, + "loss": 1.7371, + "step": 15814 + }, + { + "epoch": 0.6690498350114222, + "grad_norm": 0.27223238348960876, + "learning_rate": 0.001, + "loss": 2.8628, + "step": 15815 + }, + { + "epoch": 0.6690921397749386, + "grad_norm": 0.18867754936218262, + "learning_rate": 0.001, + "loss": 1.8677, + "step": 15816 + }, + { + "epoch": 0.669134444538455, + "grad_norm": 0.18999359011650085, + "learning_rate": 0.001, + "loss": 2.0956, + "step": 15817 + }, + { + "epoch": 0.6691767493019714, + "grad_norm": 0.14994925260543823, + "learning_rate": 0.001, + "loss": 2.6274, + "step": 15818 + }, + { + "epoch": 0.6692190540654878, + "grad_norm": 0.16162274777889252, + "learning_rate": 0.001, + "loss": 2.5421, + "step": 15819 + }, + { + "epoch": 0.6692613588290042, + "grad_norm": 1.4299275875091553, + "learning_rate": 0.001, + "loss": 2.334, + "step": 15820 + }, + { + "epoch": 0.6693036635925205, + "grad_norm": 0.9547110199928284, + "learning_rate": 0.001, + "loss": 2.2495, + "step": 15821 + }, + { + "epoch": 0.6693459683560369, + "grad_norm": 0.19420866668224335, + "learning_rate": 0.001, + "loss": 2.792, + "step": 15822 + }, + { + "epoch": 0.6693882731195533, + "grad_norm": 0.15037833154201508, + "learning_rate": 0.001, + "loss": 2.696, + "step": 15823 + }, + { + "epoch": 0.6694305778830696, + "grad_norm": 0.1415679156780243, + "learning_rate": 0.001, + "loss": 3.4118, + "step": 15824 + }, + { + "epoch": 0.669472882646586, + "grad_norm": 0.15855517983436584, + "learning_rate": 0.001, + "loss": 1.7473, + "step": 15825 + }, + { + "epoch": 0.6695151874101024, + "grad_norm": 0.47525739669799805, + "learning_rate": 0.001, + "loss": 2.8112, + "step": 15826 + }, + { + "epoch": 0.6695574921736187, + "grad_norm": 0.35138139128685, + "learning_rate": 0.001, + "loss": 2.0132, + "step": 15827 + }, + { + "epoch": 0.6695997969371351, + "grad_norm": 0.18116721510887146, + "learning_rate": 0.001, + "loss": 2.488, + "step": 15828 + }, + { + "epoch": 0.6696421017006515, + "grad_norm": 0.18540117144584656, + "learning_rate": 0.001, + "loss": 3.8645, + "step": 15829 + }, + { + "epoch": 0.6696844064641678, + "grad_norm": 0.19489508867263794, + "learning_rate": 0.001, + "loss": 2.7296, + "step": 15830 + }, + { + "epoch": 0.6697267112276842, + "grad_norm": 0.19228166341781616, + "learning_rate": 0.001, + "loss": 1.9359, + "step": 15831 + }, + { + "epoch": 0.6697690159912006, + "grad_norm": 0.15116505324840546, + "learning_rate": 0.001, + "loss": 1.8258, + "step": 15832 + }, + { + "epoch": 0.6698113207547169, + "grad_norm": 0.6345039010047913, + "learning_rate": 0.001, + "loss": 2.1165, + "step": 15833 + }, + { + "epoch": 0.6698536255182334, + "grad_norm": 0.15162324905395508, + "learning_rate": 0.001, + "loss": 2.3055, + "step": 15834 + }, + { + "epoch": 0.6698959302817498, + "grad_norm": 1.6587047576904297, + "learning_rate": 0.001, + "loss": 1.78, + "step": 15835 + }, + { + "epoch": 0.6699382350452661, + "grad_norm": 0.1674169898033142, + "learning_rate": 0.001, + "loss": 1.9546, + "step": 15836 + }, + { + "epoch": 0.6699805398087825, + "grad_norm": 0.1696109175682068, + "learning_rate": 0.001, + "loss": 2.0511, + "step": 15837 + }, + { + "epoch": 0.6700228445722989, + "grad_norm": 0.16364048421382904, + "learning_rate": 0.001, + "loss": 1.7434, + "step": 15838 + }, + { + "epoch": 0.6700651493358152, + "grad_norm": 0.18208681046962738, + "learning_rate": 0.001, + "loss": 2.9249, + "step": 15839 + }, + { + "epoch": 0.6701074540993316, + "grad_norm": 0.870050847530365, + "learning_rate": 0.001, + "loss": 1.8984, + "step": 15840 + }, + { + "epoch": 0.670149758862848, + "grad_norm": 0.20092959702014923, + "learning_rate": 0.001, + "loss": 2.044, + "step": 15841 + }, + { + "epoch": 0.6701920636263643, + "grad_norm": 0.24720391631126404, + "learning_rate": 0.001, + "loss": 2.1473, + "step": 15842 + }, + { + "epoch": 0.6702343683898807, + "grad_norm": 0.18767613172531128, + "learning_rate": 0.001, + "loss": 2.8646, + "step": 15843 + }, + { + "epoch": 0.6702766731533971, + "grad_norm": 0.15457838773727417, + "learning_rate": 0.001, + "loss": 1.7142, + "step": 15844 + }, + { + "epoch": 0.6703189779169134, + "grad_norm": 0.1792488992214203, + "learning_rate": 0.001, + "loss": 2.8613, + "step": 15845 + }, + { + "epoch": 0.6703612826804298, + "grad_norm": 0.9255082607269287, + "learning_rate": 0.001, + "loss": 2.8254, + "step": 15846 + }, + { + "epoch": 0.6704035874439462, + "grad_norm": 3.0631420612335205, + "learning_rate": 0.001, + "loss": 2.5177, + "step": 15847 + }, + { + "epoch": 0.6704458922074625, + "grad_norm": 0.1708671599626541, + "learning_rate": 0.001, + "loss": 2.045, + "step": 15848 + }, + { + "epoch": 0.670488196970979, + "grad_norm": 0.1887563019990921, + "learning_rate": 0.001, + "loss": 1.8023, + "step": 15849 + }, + { + "epoch": 0.6705305017344954, + "grad_norm": 0.23792339861392975, + "learning_rate": 0.001, + "loss": 2.0385, + "step": 15850 + }, + { + "epoch": 0.6705728064980117, + "grad_norm": 0.1688668131828308, + "learning_rate": 0.001, + "loss": 2.0975, + "step": 15851 + }, + { + "epoch": 0.6706151112615281, + "grad_norm": 0.21332918107509613, + "learning_rate": 0.001, + "loss": 2.3229, + "step": 15852 + }, + { + "epoch": 0.6706574160250445, + "grad_norm": 0.1556411236524582, + "learning_rate": 0.001, + "loss": 2.6122, + "step": 15853 + }, + { + "epoch": 0.6706997207885608, + "grad_norm": 0.17764410376548767, + "learning_rate": 0.001, + "loss": 1.339, + "step": 15854 + }, + { + "epoch": 0.6707420255520772, + "grad_norm": 0.31873539090156555, + "learning_rate": 0.001, + "loss": 2.1945, + "step": 15855 + }, + { + "epoch": 0.6707843303155936, + "grad_norm": 1.299968957901001, + "learning_rate": 0.001, + "loss": 1.8488, + "step": 15856 + }, + { + "epoch": 0.6708266350791099, + "grad_norm": 0.19831599295139313, + "learning_rate": 0.001, + "loss": 2.9631, + "step": 15857 + }, + { + "epoch": 0.6708689398426263, + "grad_norm": 0.4041413962841034, + "learning_rate": 0.001, + "loss": 1.7992, + "step": 15858 + }, + { + "epoch": 0.6709112446061426, + "grad_norm": 0.19786041975021362, + "learning_rate": 0.001, + "loss": 2.2841, + "step": 15859 + }, + { + "epoch": 0.670953549369659, + "grad_norm": 0.17182038724422455, + "learning_rate": 0.001, + "loss": 2.1733, + "step": 15860 + }, + { + "epoch": 0.6709958541331754, + "grad_norm": 0.21848192811012268, + "learning_rate": 0.001, + "loss": 2.1951, + "step": 15861 + }, + { + "epoch": 0.6710381588966917, + "grad_norm": 0.2034897655248642, + "learning_rate": 0.001, + "loss": 2.1136, + "step": 15862 + }, + { + "epoch": 0.6710804636602081, + "grad_norm": 0.19234777987003326, + "learning_rate": 0.001, + "loss": 2.0125, + "step": 15863 + }, + { + "epoch": 0.6711227684237245, + "grad_norm": 0.2422971874475479, + "learning_rate": 0.001, + "loss": 3.2963, + "step": 15864 + }, + { + "epoch": 0.6711650731872408, + "grad_norm": 0.17505241930484772, + "learning_rate": 0.001, + "loss": 1.7463, + "step": 15865 + }, + { + "epoch": 0.6712073779507572, + "grad_norm": 0.1780707985162735, + "learning_rate": 0.001, + "loss": 1.6777, + "step": 15866 + }, + { + "epoch": 0.6712496827142737, + "grad_norm": 0.1790010929107666, + "learning_rate": 0.001, + "loss": 2.4067, + "step": 15867 + }, + { + "epoch": 0.67129198747779, + "grad_norm": 0.19889087975025177, + "learning_rate": 0.001, + "loss": 2.5161, + "step": 15868 + }, + { + "epoch": 0.6713342922413064, + "grad_norm": 0.9066613912582397, + "learning_rate": 0.001, + "loss": 2.8811, + "step": 15869 + }, + { + "epoch": 0.6713765970048228, + "grad_norm": 0.15774136781692505, + "learning_rate": 0.001, + "loss": 2.4626, + "step": 15870 + }, + { + "epoch": 0.6714189017683391, + "grad_norm": 0.14970077574253082, + "learning_rate": 0.001, + "loss": 1.6917, + "step": 15871 + }, + { + "epoch": 0.6714612065318555, + "grad_norm": 0.1569453924894333, + "learning_rate": 0.001, + "loss": 2.3071, + "step": 15872 + }, + { + "epoch": 0.6715035112953719, + "grad_norm": 0.4611540138721466, + "learning_rate": 0.001, + "loss": 2.905, + "step": 15873 + }, + { + "epoch": 0.6715458160588882, + "grad_norm": 0.15557625889778137, + "learning_rate": 0.001, + "loss": 2.3258, + "step": 15874 + }, + { + "epoch": 0.6715881208224046, + "grad_norm": 0.19848431646823883, + "learning_rate": 0.001, + "loss": 2.6509, + "step": 15875 + }, + { + "epoch": 0.671630425585921, + "grad_norm": 0.16773267090320587, + "learning_rate": 0.001, + "loss": 2.1152, + "step": 15876 + }, + { + "epoch": 0.6716727303494373, + "grad_norm": 0.23722898960113525, + "learning_rate": 0.001, + "loss": 2.3771, + "step": 15877 + }, + { + "epoch": 0.6717150351129537, + "grad_norm": 1.1143704652786255, + "learning_rate": 0.001, + "loss": 2.607, + "step": 15878 + }, + { + "epoch": 0.6717573398764701, + "grad_norm": 0.20059651136398315, + "learning_rate": 0.001, + "loss": 2.2388, + "step": 15879 + }, + { + "epoch": 0.6717996446399864, + "grad_norm": 0.19346950948238373, + "learning_rate": 0.001, + "loss": 1.8036, + "step": 15880 + }, + { + "epoch": 0.6718419494035028, + "grad_norm": 0.1549372375011444, + "learning_rate": 0.001, + "loss": 1.6958, + "step": 15881 + }, + { + "epoch": 0.6718842541670192, + "grad_norm": 0.23537324368953705, + "learning_rate": 0.001, + "loss": 1.8379, + "step": 15882 + }, + { + "epoch": 0.6719265589305355, + "grad_norm": 0.2048342525959015, + "learning_rate": 0.001, + "loss": 2.3291, + "step": 15883 + }, + { + "epoch": 0.671968863694052, + "grad_norm": 0.15993832051753998, + "learning_rate": 0.001, + "loss": 2.3463, + "step": 15884 + }, + { + "epoch": 0.6720111684575684, + "grad_norm": 0.1927027702331543, + "learning_rate": 0.001, + "loss": 2.0073, + "step": 15885 + }, + { + "epoch": 0.6720534732210847, + "grad_norm": 0.2728440463542938, + "learning_rate": 0.001, + "loss": 2.1145, + "step": 15886 + }, + { + "epoch": 0.6720957779846011, + "grad_norm": 0.21112090349197388, + "learning_rate": 0.001, + "loss": 2.4727, + "step": 15887 + }, + { + "epoch": 0.6721380827481175, + "grad_norm": 0.19208763539791107, + "learning_rate": 0.001, + "loss": 1.6904, + "step": 15888 + }, + { + "epoch": 0.6721803875116338, + "grad_norm": 0.18781235814094543, + "learning_rate": 0.001, + "loss": 2.5108, + "step": 15889 + }, + { + "epoch": 0.6722226922751502, + "grad_norm": 0.20023007690906525, + "learning_rate": 0.001, + "loss": 1.6354, + "step": 15890 + }, + { + "epoch": 0.6722649970386666, + "grad_norm": 0.18701715767383575, + "learning_rate": 0.001, + "loss": 2.3519, + "step": 15891 + }, + { + "epoch": 0.6723073018021829, + "grad_norm": 2.3707351684570312, + "learning_rate": 0.001, + "loss": 1.7433, + "step": 15892 + }, + { + "epoch": 0.6723496065656993, + "grad_norm": 1.582403540611267, + "learning_rate": 0.001, + "loss": 2.2697, + "step": 15893 + }, + { + "epoch": 0.6723919113292157, + "grad_norm": 0.17928339540958405, + "learning_rate": 0.001, + "loss": 2.7044, + "step": 15894 + }, + { + "epoch": 0.672434216092732, + "grad_norm": 0.1638590693473816, + "learning_rate": 0.001, + "loss": 2.5621, + "step": 15895 + }, + { + "epoch": 0.6724765208562484, + "grad_norm": 0.25966379046440125, + "learning_rate": 0.001, + "loss": 2.4395, + "step": 15896 + }, + { + "epoch": 0.6725188256197648, + "grad_norm": 0.17949466407299042, + "learning_rate": 0.001, + "loss": 1.7309, + "step": 15897 + }, + { + "epoch": 0.6725611303832811, + "grad_norm": 0.182010680437088, + "learning_rate": 0.001, + "loss": 1.9328, + "step": 15898 + }, + { + "epoch": 0.6726034351467975, + "grad_norm": 0.2181997448205948, + "learning_rate": 0.001, + "loss": 2.3349, + "step": 15899 + }, + { + "epoch": 0.672645739910314, + "grad_norm": 0.28189408779144287, + "learning_rate": 0.001, + "loss": 2.0677, + "step": 15900 + }, + { + "epoch": 0.6726880446738303, + "grad_norm": 0.22430168092250824, + "learning_rate": 0.001, + "loss": 1.6799, + "step": 15901 + }, + { + "epoch": 0.6727303494373467, + "grad_norm": 0.6292093396186829, + "learning_rate": 0.001, + "loss": 1.8766, + "step": 15902 + }, + { + "epoch": 0.672772654200863, + "grad_norm": 4.264910697937012, + "learning_rate": 0.001, + "loss": 2.1911, + "step": 15903 + }, + { + "epoch": 0.6728149589643794, + "grad_norm": 0.2221497744321823, + "learning_rate": 0.001, + "loss": 2.1785, + "step": 15904 + }, + { + "epoch": 0.6728572637278958, + "grad_norm": 0.19245116412639618, + "learning_rate": 0.001, + "loss": 2.712, + "step": 15905 + }, + { + "epoch": 0.6728995684914121, + "grad_norm": 0.25045454502105713, + "learning_rate": 0.001, + "loss": 3.8263, + "step": 15906 + }, + { + "epoch": 0.6729418732549285, + "grad_norm": 0.16638623178005219, + "learning_rate": 0.001, + "loss": 2.4182, + "step": 15907 + }, + { + "epoch": 0.6729841780184449, + "grad_norm": 0.17453812062740326, + "learning_rate": 0.001, + "loss": 1.8538, + "step": 15908 + }, + { + "epoch": 0.6730264827819612, + "grad_norm": 0.27706319093704224, + "learning_rate": 0.001, + "loss": 2.2582, + "step": 15909 + }, + { + "epoch": 0.6730687875454776, + "grad_norm": 0.1942570060491562, + "learning_rate": 0.001, + "loss": 2.1083, + "step": 15910 + }, + { + "epoch": 0.673111092308994, + "grad_norm": 0.18307733535766602, + "learning_rate": 0.001, + "loss": 2.1524, + "step": 15911 + }, + { + "epoch": 0.6731533970725103, + "grad_norm": 0.1911049634218216, + "learning_rate": 0.001, + "loss": 2.4024, + "step": 15912 + }, + { + "epoch": 0.6731957018360267, + "grad_norm": 0.15870538353919983, + "learning_rate": 0.001, + "loss": 2.1905, + "step": 15913 + }, + { + "epoch": 0.6732380065995431, + "grad_norm": 0.14895422756671906, + "learning_rate": 0.001, + "loss": 1.8064, + "step": 15914 + }, + { + "epoch": 0.6732803113630594, + "grad_norm": 0.16844263672828674, + "learning_rate": 0.001, + "loss": 2.3851, + "step": 15915 + }, + { + "epoch": 0.6733226161265758, + "grad_norm": 0.15884919464588165, + "learning_rate": 0.001, + "loss": 2.3164, + "step": 15916 + }, + { + "epoch": 0.6733649208900923, + "grad_norm": 0.20211821794509888, + "learning_rate": 0.001, + "loss": 2.3867, + "step": 15917 + }, + { + "epoch": 0.6734072256536086, + "grad_norm": 0.3958881199359894, + "learning_rate": 0.001, + "loss": 1.911, + "step": 15918 + }, + { + "epoch": 0.673449530417125, + "grad_norm": 0.4282142221927643, + "learning_rate": 0.001, + "loss": 2.8803, + "step": 15919 + }, + { + "epoch": 0.6734918351806414, + "grad_norm": 0.1638120859861374, + "learning_rate": 0.001, + "loss": 1.665, + "step": 15920 + }, + { + "epoch": 0.6735341399441577, + "grad_norm": 0.15340758860111237, + "learning_rate": 0.001, + "loss": 2.273, + "step": 15921 + }, + { + "epoch": 0.6735764447076741, + "grad_norm": 0.17197485268115997, + "learning_rate": 0.001, + "loss": 2.0298, + "step": 15922 + }, + { + "epoch": 0.6736187494711905, + "grad_norm": 0.1842762529850006, + "learning_rate": 0.001, + "loss": 2.1636, + "step": 15923 + }, + { + "epoch": 0.6736610542347068, + "grad_norm": 0.1678512841463089, + "learning_rate": 0.001, + "loss": 2.6527, + "step": 15924 + }, + { + "epoch": 0.6737033589982232, + "grad_norm": 0.1786356419324875, + "learning_rate": 0.001, + "loss": 2.7915, + "step": 15925 + }, + { + "epoch": 0.6737456637617396, + "grad_norm": 0.14677925407886505, + "learning_rate": 0.001, + "loss": 1.7595, + "step": 15926 + }, + { + "epoch": 0.6737879685252559, + "grad_norm": 0.18858976662158966, + "learning_rate": 0.001, + "loss": 1.9292, + "step": 15927 + }, + { + "epoch": 0.6738302732887723, + "grad_norm": 0.18520209193229675, + "learning_rate": 0.001, + "loss": 2.1573, + "step": 15928 + }, + { + "epoch": 0.6738725780522887, + "grad_norm": 0.9475305676460266, + "learning_rate": 0.001, + "loss": 2.5929, + "step": 15929 + }, + { + "epoch": 0.673914882815805, + "grad_norm": 0.17860578000545502, + "learning_rate": 0.001, + "loss": 1.4486, + "step": 15930 + }, + { + "epoch": 0.6739571875793214, + "grad_norm": 0.1508931964635849, + "learning_rate": 0.001, + "loss": 1.5058, + "step": 15931 + }, + { + "epoch": 0.6739994923428378, + "grad_norm": 0.17374980449676514, + "learning_rate": 0.001, + "loss": 1.819, + "step": 15932 + }, + { + "epoch": 0.6740417971063541, + "grad_norm": 0.1520918607711792, + "learning_rate": 0.001, + "loss": 1.8909, + "step": 15933 + }, + { + "epoch": 0.6740841018698706, + "grad_norm": 1.1293140649795532, + "learning_rate": 0.001, + "loss": 3.0682, + "step": 15934 + }, + { + "epoch": 0.674126406633387, + "grad_norm": 0.17505602538585663, + "learning_rate": 0.001, + "loss": 3.318, + "step": 15935 + }, + { + "epoch": 0.6741687113969033, + "grad_norm": 0.3417392075061798, + "learning_rate": 0.001, + "loss": 2.3171, + "step": 15936 + }, + { + "epoch": 0.6742110161604197, + "grad_norm": 0.39481237530708313, + "learning_rate": 0.001, + "loss": 1.7587, + "step": 15937 + }, + { + "epoch": 0.6742533209239361, + "grad_norm": 0.37343841791152954, + "learning_rate": 0.001, + "loss": 2.229, + "step": 15938 + }, + { + "epoch": 0.6742956256874524, + "grad_norm": 0.17360848188400269, + "learning_rate": 0.001, + "loss": 2.4422, + "step": 15939 + }, + { + "epoch": 0.6743379304509688, + "grad_norm": 0.2295389622449875, + "learning_rate": 0.001, + "loss": 1.9226, + "step": 15940 + }, + { + "epoch": 0.6743802352144852, + "grad_norm": 0.8693257570266724, + "learning_rate": 0.001, + "loss": 2.2007, + "step": 15941 + }, + { + "epoch": 0.6744225399780015, + "grad_norm": 0.19263996183872223, + "learning_rate": 0.001, + "loss": 3.2355, + "step": 15942 + }, + { + "epoch": 0.6744648447415179, + "grad_norm": 0.17120057344436646, + "learning_rate": 0.001, + "loss": 2.5869, + "step": 15943 + }, + { + "epoch": 0.6745071495050343, + "grad_norm": 0.292049378156662, + "learning_rate": 0.001, + "loss": 1.8235, + "step": 15944 + }, + { + "epoch": 0.6745494542685506, + "grad_norm": 0.17973144352436066, + "learning_rate": 0.001, + "loss": 2.0943, + "step": 15945 + }, + { + "epoch": 0.674591759032067, + "grad_norm": 0.30642953515052795, + "learning_rate": 0.001, + "loss": 3.0754, + "step": 15946 + }, + { + "epoch": 0.6746340637955833, + "grad_norm": 0.15908905863761902, + "learning_rate": 0.001, + "loss": 1.4202, + "step": 15947 + }, + { + "epoch": 0.6746763685590997, + "grad_norm": 0.1885390281677246, + "learning_rate": 0.001, + "loss": 1.8611, + "step": 15948 + }, + { + "epoch": 0.6747186733226161, + "grad_norm": 1.8227906227111816, + "learning_rate": 0.001, + "loss": 3.1409, + "step": 15949 + }, + { + "epoch": 0.6747609780861324, + "grad_norm": 0.3331076204776764, + "learning_rate": 0.001, + "loss": 2.6025, + "step": 15950 + }, + { + "epoch": 0.6748032828496489, + "grad_norm": 0.18833647668361664, + "learning_rate": 0.001, + "loss": 1.7091, + "step": 15951 + }, + { + "epoch": 0.6748455876131653, + "grad_norm": 0.8229162693023682, + "learning_rate": 0.001, + "loss": 1.9747, + "step": 15952 + }, + { + "epoch": 0.6748878923766816, + "grad_norm": 0.19028577208518982, + "learning_rate": 0.001, + "loss": 1.9955, + "step": 15953 + }, + { + "epoch": 0.674930197140198, + "grad_norm": 0.19043925404548645, + "learning_rate": 0.001, + "loss": 1.8831, + "step": 15954 + }, + { + "epoch": 0.6749725019037144, + "grad_norm": 0.1884336918592453, + "learning_rate": 0.001, + "loss": 2.4741, + "step": 15955 + }, + { + "epoch": 0.6750148066672307, + "grad_norm": 0.2047235518693924, + "learning_rate": 0.001, + "loss": 2.729, + "step": 15956 + }, + { + "epoch": 0.6750571114307471, + "grad_norm": 1.2655960321426392, + "learning_rate": 0.001, + "loss": 2.2603, + "step": 15957 + }, + { + "epoch": 0.6750994161942635, + "grad_norm": 0.2844761312007904, + "learning_rate": 0.001, + "loss": 1.8044, + "step": 15958 + }, + { + "epoch": 0.6751417209577798, + "grad_norm": 0.19485563039779663, + "learning_rate": 0.001, + "loss": 2.0, + "step": 15959 + }, + { + "epoch": 0.6751840257212962, + "grad_norm": 0.2195066213607788, + "learning_rate": 0.001, + "loss": 2.6798, + "step": 15960 + }, + { + "epoch": 0.6752263304848126, + "grad_norm": 0.18380756676197052, + "learning_rate": 0.001, + "loss": 1.7459, + "step": 15961 + }, + { + "epoch": 0.6752686352483289, + "grad_norm": 0.22318629920482635, + "learning_rate": 0.001, + "loss": 2.6885, + "step": 15962 + }, + { + "epoch": 0.6753109400118453, + "grad_norm": 0.940578818321228, + "learning_rate": 0.001, + "loss": 2.1456, + "step": 15963 + }, + { + "epoch": 0.6753532447753617, + "grad_norm": 0.19312544167041779, + "learning_rate": 0.001, + "loss": 2.0494, + "step": 15964 + }, + { + "epoch": 0.675395549538878, + "grad_norm": 0.24037931859493256, + "learning_rate": 0.001, + "loss": 2.2044, + "step": 15965 + }, + { + "epoch": 0.6754378543023944, + "grad_norm": 0.478103369474411, + "learning_rate": 0.001, + "loss": 3.5143, + "step": 15966 + }, + { + "epoch": 0.6754801590659109, + "grad_norm": 0.976945161819458, + "learning_rate": 0.001, + "loss": 2.7022, + "step": 15967 + }, + { + "epoch": 0.6755224638294272, + "grad_norm": 0.1700313240289688, + "learning_rate": 0.001, + "loss": 2.0294, + "step": 15968 + }, + { + "epoch": 0.6755647685929436, + "grad_norm": 0.36223527789115906, + "learning_rate": 0.001, + "loss": 2.3376, + "step": 15969 + }, + { + "epoch": 0.67560707335646, + "grad_norm": 1.9298176765441895, + "learning_rate": 0.001, + "loss": 2.5461, + "step": 15970 + }, + { + "epoch": 0.6756493781199763, + "grad_norm": 0.23120592534542084, + "learning_rate": 0.001, + "loss": 2.5119, + "step": 15971 + }, + { + "epoch": 0.6756916828834927, + "grad_norm": 0.18629468977451324, + "learning_rate": 0.001, + "loss": 2.1999, + "step": 15972 + }, + { + "epoch": 0.6757339876470091, + "grad_norm": 0.2043001651763916, + "learning_rate": 0.001, + "loss": 2.3867, + "step": 15973 + }, + { + "epoch": 0.6757762924105254, + "grad_norm": 0.1776389479637146, + "learning_rate": 0.001, + "loss": 1.6796, + "step": 15974 + }, + { + "epoch": 0.6758185971740418, + "grad_norm": 0.17264407873153687, + "learning_rate": 0.001, + "loss": 1.5977, + "step": 15975 + }, + { + "epoch": 0.6758609019375582, + "grad_norm": 0.3489270806312561, + "learning_rate": 0.001, + "loss": 2.4665, + "step": 15976 + }, + { + "epoch": 0.6759032067010745, + "grad_norm": 0.18797357380390167, + "learning_rate": 0.001, + "loss": 2.8661, + "step": 15977 + }, + { + "epoch": 0.6759455114645909, + "grad_norm": 0.1839050054550171, + "learning_rate": 0.001, + "loss": 1.798, + "step": 15978 + }, + { + "epoch": 0.6759878162281073, + "grad_norm": 0.2641671299934387, + "learning_rate": 0.001, + "loss": 2.0923, + "step": 15979 + }, + { + "epoch": 0.6760301209916236, + "grad_norm": 0.1782594621181488, + "learning_rate": 0.001, + "loss": 2.7264, + "step": 15980 + }, + { + "epoch": 0.67607242575514, + "grad_norm": 0.3223048150539398, + "learning_rate": 0.001, + "loss": 1.5769, + "step": 15981 + }, + { + "epoch": 0.6761147305186564, + "grad_norm": 0.17407678067684174, + "learning_rate": 0.001, + "loss": 2.2179, + "step": 15982 + }, + { + "epoch": 0.6761570352821727, + "grad_norm": 0.2168140560388565, + "learning_rate": 0.001, + "loss": 2.9042, + "step": 15983 + }, + { + "epoch": 0.6761993400456892, + "grad_norm": 0.24506430327892303, + "learning_rate": 0.001, + "loss": 2.6773, + "step": 15984 + }, + { + "epoch": 0.6762416448092056, + "grad_norm": 3.0023486614227295, + "learning_rate": 0.001, + "loss": 2.0182, + "step": 15985 + }, + { + "epoch": 0.6762839495727219, + "grad_norm": 0.1910613626241684, + "learning_rate": 0.001, + "loss": 2.047, + "step": 15986 + }, + { + "epoch": 0.6763262543362383, + "grad_norm": 0.17066790163516998, + "learning_rate": 0.001, + "loss": 2.0095, + "step": 15987 + }, + { + "epoch": 0.6763685590997547, + "grad_norm": 0.21411702036857605, + "learning_rate": 0.001, + "loss": 1.883, + "step": 15988 + }, + { + "epoch": 0.676410863863271, + "grad_norm": 0.16612938046455383, + "learning_rate": 0.001, + "loss": 1.7573, + "step": 15989 + }, + { + "epoch": 0.6764531686267874, + "grad_norm": 0.2450494021177292, + "learning_rate": 0.001, + "loss": 2.951, + "step": 15990 + }, + { + "epoch": 0.6764954733903038, + "grad_norm": 0.21236656606197357, + "learning_rate": 0.001, + "loss": 3.2924, + "step": 15991 + }, + { + "epoch": 0.6765377781538201, + "grad_norm": 1.4454017877578735, + "learning_rate": 0.001, + "loss": 2.2104, + "step": 15992 + }, + { + "epoch": 0.6765800829173365, + "grad_norm": 0.16844910383224487, + "learning_rate": 0.001, + "loss": 2.0065, + "step": 15993 + }, + { + "epoch": 0.6766223876808528, + "grad_norm": 0.19149447977542877, + "learning_rate": 0.001, + "loss": 2.4233, + "step": 15994 + }, + { + "epoch": 0.6766646924443692, + "grad_norm": 0.15004916489124298, + "learning_rate": 0.001, + "loss": 1.9877, + "step": 15995 + }, + { + "epoch": 0.6767069972078856, + "grad_norm": 0.1823279857635498, + "learning_rate": 0.001, + "loss": 1.6102, + "step": 15996 + }, + { + "epoch": 0.6767493019714019, + "grad_norm": 0.17334629595279694, + "learning_rate": 0.001, + "loss": 2.3336, + "step": 15997 + }, + { + "epoch": 0.6767916067349183, + "grad_norm": 0.19492603838443756, + "learning_rate": 0.001, + "loss": 2.8301, + "step": 15998 + }, + { + "epoch": 0.6768339114984347, + "grad_norm": 0.17607484757900238, + "learning_rate": 0.001, + "loss": 1.8637, + "step": 15999 + }, + { + "epoch": 0.676876216261951, + "grad_norm": 0.15553830564022064, + "learning_rate": 0.001, + "loss": 2.4753, + "step": 16000 + }, + { + "epoch": 0.6769185210254675, + "grad_norm": 0.30437377095222473, + "learning_rate": 0.001, + "loss": 2.1501, + "step": 16001 + }, + { + "epoch": 0.6769608257889839, + "grad_norm": 0.17832662165164948, + "learning_rate": 0.001, + "loss": 1.9037, + "step": 16002 + }, + { + "epoch": 0.6770031305525002, + "grad_norm": 0.25508981943130493, + "learning_rate": 0.001, + "loss": 1.3602, + "step": 16003 + }, + { + "epoch": 0.6770454353160166, + "grad_norm": 0.17589248716831207, + "learning_rate": 0.001, + "loss": 3.1867, + "step": 16004 + }, + { + "epoch": 0.677087740079533, + "grad_norm": 0.1560450941324234, + "learning_rate": 0.001, + "loss": 1.6277, + "step": 16005 + }, + { + "epoch": 0.6771300448430493, + "grad_norm": 25.86013412475586, + "learning_rate": 0.001, + "loss": 2.1036, + "step": 16006 + }, + { + "epoch": 0.6771723496065657, + "grad_norm": 0.153413325548172, + "learning_rate": 0.001, + "loss": 2.2372, + "step": 16007 + }, + { + "epoch": 0.6772146543700821, + "grad_norm": 0.17835178971290588, + "learning_rate": 0.001, + "loss": 1.8833, + "step": 16008 + }, + { + "epoch": 0.6772569591335984, + "grad_norm": 0.15974244475364685, + "learning_rate": 0.001, + "loss": 1.9219, + "step": 16009 + }, + { + "epoch": 0.6772992638971148, + "grad_norm": 0.1814177930355072, + "learning_rate": 0.001, + "loss": 1.6924, + "step": 16010 + }, + { + "epoch": 0.6773415686606312, + "grad_norm": 0.14862357079982758, + "learning_rate": 0.001, + "loss": 1.2958, + "step": 16011 + }, + { + "epoch": 0.6773838734241475, + "grad_norm": 0.2236209362745285, + "learning_rate": 0.001, + "loss": 2.7953, + "step": 16012 + }, + { + "epoch": 0.6774261781876639, + "grad_norm": 0.17278966307640076, + "learning_rate": 0.001, + "loss": 2.3449, + "step": 16013 + }, + { + "epoch": 0.6774684829511803, + "grad_norm": 0.4638851284980774, + "learning_rate": 0.001, + "loss": 1.7526, + "step": 16014 + }, + { + "epoch": 0.6775107877146966, + "grad_norm": 0.1768551915884018, + "learning_rate": 0.001, + "loss": 3.0475, + "step": 16015 + }, + { + "epoch": 0.677553092478213, + "grad_norm": 0.19388507306575775, + "learning_rate": 0.001, + "loss": 2.2825, + "step": 16016 + }, + { + "epoch": 0.6775953972417295, + "grad_norm": 0.20869360864162445, + "learning_rate": 0.001, + "loss": 2.2376, + "step": 16017 + }, + { + "epoch": 0.6776377020052458, + "grad_norm": 0.16567271947860718, + "learning_rate": 0.001, + "loss": 1.9322, + "step": 16018 + }, + { + "epoch": 0.6776800067687622, + "grad_norm": 0.15912829339504242, + "learning_rate": 0.001, + "loss": 1.5842, + "step": 16019 + }, + { + "epoch": 0.6777223115322786, + "grad_norm": 0.14932376146316528, + "learning_rate": 0.001, + "loss": 1.6747, + "step": 16020 + }, + { + "epoch": 0.6777646162957949, + "grad_norm": 1.8247519731521606, + "learning_rate": 0.001, + "loss": 2.3317, + "step": 16021 + }, + { + "epoch": 0.6778069210593113, + "grad_norm": 0.22858451306819916, + "learning_rate": 0.001, + "loss": 2.5957, + "step": 16022 + }, + { + "epoch": 0.6778492258228277, + "grad_norm": 0.1770337074995041, + "learning_rate": 0.001, + "loss": 1.436, + "step": 16023 + }, + { + "epoch": 0.677891530586344, + "grad_norm": 0.1400739699602127, + "learning_rate": 0.001, + "loss": 1.9365, + "step": 16024 + }, + { + "epoch": 0.6779338353498604, + "grad_norm": 0.1432129591703415, + "learning_rate": 0.001, + "loss": 1.5036, + "step": 16025 + }, + { + "epoch": 0.6779761401133768, + "grad_norm": 0.16388866305351257, + "learning_rate": 0.001, + "loss": 1.8471, + "step": 16026 + }, + { + "epoch": 0.6780184448768931, + "grad_norm": 2.311777353286743, + "learning_rate": 0.001, + "loss": 2.4238, + "step": 16027 + }, + { + "epoch": 0.6780607496404095, + "grad_norm": 0.19178903102874756, + "learning_rate": 0.001, + "loss": 1.9744, + "step": 16028 + }, + { + "epoch": 0.6781030544039259, + "grad_norm": 0.15423864126205444, + "learning_rate": 0.001, + "loss": 2.1231, + "step": 16029 + }, + { + "epoch": 0.6781453591674422, + "grad_norm": 0.16429832577705383, + "learning_rate": 0.001, + "loss": 2.1746, + "step": 16030 + }, + { + "epoch": 0.6781876639309586, + "grad_norm": 0.14937609434127808, + "learning_rate": 0.001, + "loss": 2.2265, + "step": 16031 + }, + { + "epoch": 0.678229968694475, + "grad_norm": 0.17871306836605072, + "learning_rate": 0.001, + "loss": 2.605, + "step": 16032 + }, + { + "epoch": 0.6782722734579913, + "grad_norm": 0.1368352472782135, + "learning_rate": 0.001, + "loss": 1.3931, + "step": 16033 + }, + { + "epoch": 0.6783145782215078, + "grad_norm": 0.15225689113140106, + "learning_rate": 0.001, + "loss": 1.8712, + "step": 16034 + }, + { + "epoch": 0.6783568829850242, + "grad_norm": 0.237832173705101, + "learning_rate": 0.001, + "loss": 2.7011, + "step": 16035 + }, + { + "epoch": 0.6783991877485405, + "grad_norm": 0.2749160826206207, + "learning_rate": 0.001, + "loss": 2.0368, + "step": 16036 + }, + { + "epoch": 0.6784414925120569, + "grad_norm": 0.1856054663658142, + "learning_rate": 0.001, + "loss": 1.5841, + "step": 16037 + }, + { + "epoch": 0.6784837972755732, + "grad_norm": 0.35927614569664, + "learning_rate": 0.001, + "loss": 1.5595, + "step": 16038 + }, + { + "epoch": 0.6785261020390896, + "grad_norm": 0.18782657384872437, + "learning_rate": 0.001, + "loss": 2.4929, + "step": 16039 + }, + { + "epoch": 0.678568406802606, + "grad_norm": 0.1664906144142151, + "learning_rate": 0.001, + "loss": 2.8713, + "step": 16040 + }, + { + "epoch": 0.6786107115661223, + "grad_norm": 0.17706331610679626, + "learning_rate": 0.001, + "loss": 2.2758, + "step": 16041 + }, + { + "epoch": 0.6786530163296387, + "grad_norm": 1.698190450668335, + "learning_rate": 0.001, + "loss": 2.0762, + "step": 16042 + }, + { + "epoch": 0.6786953210931551, + "grad_norm": 0.2534584105014801, + "learning_rate": 0.001, + "loss": 2.2728, + "step": 16043 + }, + { + "epoch": 0.6787376258566714, + "grad_norm": 0.14307022094726562, + "learning_rate": 0.001, + "loss": 2.1301, + "step": 16044 + }, + { + "epoch": 0.6787799306201878, + "grad_norm": 0.16313962638378143, + "learning_rate": 0.001, + "loss": 2.6391, + "step": 16045 + }, + { + "epoch": 0.6788222353837042, + "grad_norm": 0.2026868611574173, + "learning_rate": 0.001, + "loss": 3.0308, + "step": 16046 + }, + { + "epoch": 0.6788645401472205, + "grad_norm": 0.1914452612400055, + "learning_rate": 0.001, + "loss": 2.2605, + "step": 16047 + }, + { + "epoch": 0.6789068449107369, + "grad_norm": 0.17033523321151733, + "learning_rate": 0.001, + "loss": 2.7466, + "step": 16048 + }, + { + "epoch": 0.6789491496742533, + "grad_norm": 0.41400858759880066, + "learning_rate": 0.001, + "loss": 1.8912, + "step": 16049 + }, + { + "epoch": 0.6789914544377696, + "grad_norm": 0.14586646854877472, + "learning_rate": 0.001, + "loss": 2.2457, + "step": 16050 + }, + { + "epoch": 0.6790337592012861, + "grad_norm": 0.1477472484111786, + "learning_rate": 0.001, + "loss": 1.9528, + "step": 16051 + }, + { + "epoch": 0.6790760639648025, + "grad_norm": 0.18150284886360168, + "learning_rate": 0.001, + "loss": 1.8803, + "step": 16052 + }, + { + "epoch": 0.6791183687283188, + "grad_norm": 163.43626403808594, + "learning_rate": 0.001, + "loss": 2.7313, + "step": 16053 + }, + { + "epoch": 0.6791606734918352, + "grad_norm": 0.17980308830738068, + "learning_rate": 0.001, + "loss": 2.8632, + "step": 16054 + }, + { + "epoch": 0.6792029782553516, + "grad_norm": 1.3750078678131104, + "learning_rate": 0.001, + "loss": 2.8298, + "step": 16055 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 0.23904463648796082, + "learning_rate": 0.001, + "loss": 2.7106, + "step": 16056 + }, + { + "epoch": 0.6792875877823843, + "grad_norm": 0.18311509490013123, + "learning_rate": 0.001, + "loss": 1.9757, + "step": 16057 + }, + { + "epoch": 0.6793298925459007, + "grad_norm": 0.15157288312911987, + "learning_rate": 0.001, + "loss": 2.3797, + "step": 16058 + }, + { + "epoch": 0.679372197309417, + "grad_norm": 0.1471908688545227, + "learning_rate": 0.001, + "loss": 2.9482, + "step": 16059 + }, + { + "epoch": 0.6794145020729334, + "grad_norm": 0.16476254165172577, + "learning_rate": 0.001, + "loss": 3.0529, + "step": 16060 + }, + { + "epoch": 0.6794568068364498, + "grad_norm": 0.17069002985954285, + "learning_rate": 0.001, + "loss": 2.0707, + "step": 16061 + }, + { + "epoch": 0.6794991115999661, + "grad_norm": 0.17665056884288788, + "learning_rate": 0.001, + "loss": 2.2161, + "step": 16062 + }, + { + "epoch": 0.6795414163634825, + "grad_norm": 0.1902749389410019, + "learning_rate": 0.001, + "loss": 2.5135, + "step": 16063 + }, + { + "epoch": 0.6795837211269989, + "grad_norm": 0.16366536915302277, + "learning_rate": 0.001, + "loss": 2.1562, + "step": 16064 + }, + { + "epoch": 0.6796260258905152, + "grad_norm": 0.6264306306838989, + "learning_rate": 0.001, + "loss": 1.8162, + "step": 16065 + }, + { + "epoch": 0.6796683306540316, + "grad_norm": 0.2236694097518921, + "learning_rate": 0.001, + "loss": 2.4581, + "step": 16066 + }, + { + "epoch": 0.6797106354175481, + "grad_norm": 0.16119384765625, + "learning_rate": 0.001, + "loss": 2.1214, + "step": 16067 + }, + { + "epoch": 0.6797529401810644, + "grad_norm": 0.16101056337356567, + "learning_rate": 0.001, + "loss": 1.6174, + "step": 16068 + }, + { + "epoch": 0.6797952449445808, + "grad_norm": 0.28261250257492065, + "learning_rate": 0.001, + "loss": 2.6421, + "step": 16069 + }, + { + "epoch": 0.6798375497080972, + "grad_norm": 1.5717463493347168, + "learning_rate": 0.001, + "loss": 2.7132, + "step": 16070 + }, + { + "epoch": 0.6798798544716135, + "grad_norm": 0.15996554493904114, + "learning_rate": 0.001, + "loss": 2.364, + "step": 16071 + }, + { + "epoch": 0.6799221592351299, + "grad_norm": 0.20333285629749298, + "learning_rate": 0.001, + "loss": 2.6281, + "step": 16072 + }, + { + "epoch": 0.6799644639986463, + "grad_norm": 0.17524294555187225, + "learning_rate": 0.001, + "loss": 2.1776, + "step": 16073 + }, + { + "epoch": 0.6800067687621626, + "grad_norm": 0.2101033478975296, + "learning_rate": 0.001, + "loss": 2.2662, + "step": 16074 + }, + { + "epoch": 0.680049073525679, + "grad_norm": 0.1923307329416275, + "learning_rate": 0.001, + "loss": 1.9986, + "step": 16075 + }, + { + "epoch": 0.6800913782891954, + "grad_norm": 0.14267486333847046, + "learning_rate": 0.001, + "loss": 1.4753, + "step": 16076 + }, + { + "epoch": 0.6801336830527117, + "grad_norm": 0.14065372943878174, + "learning_rate": 0.001, + "loss": 1.4347, + "step": 16077 + }, + { + "epoch": 0.6801759878162281, + "grad_norm": 0.20316016674041748, + "learning_rate": 0.001, + "loss": 1.7941, + "step": 16078 + }, + { + "epoch": 0.6802182925797445, + "grad_norm": 0.24780157208442688, + "learning_rate": 0.001, + "loss": 3.0841, + "step": 16079 + }, + { + "epoch": 0.6802605973432608, + "grad_norm": 0.19607360661029816, + "learning_rate": 0.001, + "loss": 3.1225, + "step": 16080 + }, + { + "epoch": 0.6803029021067772, + "grad_norm": 0.16723619401454926, + "learning_rate": 0.001, + "loss": 1.9696, + "step": 16081 + }, + { + "epoch": 0.6803452068702935, + "grad_norm": 10.566794395446777, + "learning_rate": 0.001, + "loss": 2.347, + "step": 16082 + }, + { + "epoch": 0.68038751163381, + "grad_norm": 0.19241277873516083, + "learning_rate": 0.001, + "loss": 2.1758, + "step": 16083 + }, + { + "epoch": 0.6804298163973264, + "grad_norm": 0.3141223192214966, + "learning_rate": 0.001, + "loss": 2.1376, + "step": 16084 + }, + { + "epoch": 0.6804721211608427, + "grad_norm": 0.1563158631324768, + "learning_rate": 0.001, + "loss": 2.7554, + "step": 16085 + }, + { + "epoch": 0.6805144259243591, + "grad_norm": 0.967668354511261, + "learning_rate": 0.001, + "loss": 2.3854, + "step": 16086 + }, + { + "epoch": 0.6805567306878755, + "grad_norm": 0.1570819467306137, + "learning_rate": 0.001, + "loss": 1.7543, + "step": 16087 + }, + { + "epoch": 0.6805990354513918, + "grad_norm": 0.18060454726219177, + "learning_rate": 0.001, + "loss": 1.3768, + "step": 16088 + }, + { + "epoch": 0.6806413402149082, + "grad_norm": 0.1356450766324997, + "learning_rate": 0.001, + "loss": 1.2979, + "step": 16089 + }, + { + "epoch": 0.6806836449784246, + "grad_norm": 0.20549282431602478, + "learning_rate": 0.001, + "loss": 2.1857, + "step": 16090 + }, + { + "epoch": 0.6807259497419409, + "grad_norm": 0.17379418015480042, + "learning_rate": 0.001, + "loss": 1.7882, + "step": 16091 + }, + { + "epoch": 0.6807682545054573, + "grad_norm": 0.17570160329341888, + "learning_rate": 0.001, + "loss": 2.2534, + "step": 16092 + }, + { + "epoch": 0.6808105592689737, + "grad_norm": 0.17414408922195435, + "learning_rate": 0.001, + "loss": 2.5999, + "step": 16093 + }, + { + "epoch": 0.68085286403249, + "grad_norm": 0.9091881513595581, + "learning_rate": 0.001, + "loss": 2.7196, + "step": 16094 + }, + { + "epoch": 0.6808951687960064, + "grad_norm": 0.17393404245376587, + "learning_rate": 0.001, + "loss": 1.8711, + "step": 16095 + }, + { + "epoch": 0.6809374735595228, + "grad_norm": 0.15561740100383759, + "learning_rate": 0.001, + "loss": 1.4363, + "step": 16096 + }, + { + "epoch": 0.6809797783230391, + "grad_norm": 0.16957683861255646, + "learning_rate": 0.001, + "loss": 2.131, + "step": 16097 + }, + { + "epoch": 0.6810220830865555, + "grad_norm": 0.2562768757343292, + "learning_rate": 0.001, + "loss": 3.0767, + "step": 16098 + }, + { + "epoch": 0.681064387850072, + "grad_norm": 1.1886937618255615, + "learning_rate": 0.001, + "loss": 2.2489, + "step": 16099 + }, + { + "epoch": 0.6811066926135882, + "grad_norm": 0.5651175379753113, + "learning_rate": 0.001, + "loss": 3.2804, + "step": 16100 + }, + { + "epoch": 0.6811489973771047, + "grad_norm": 0.407637357711792, + "learning_rate": 0.001, + "loss": 2.6736, + "step": 16101 + }, + { + "epoch": 0.6811913021406211, + "grad_norm": 0.19160562753677368, + "learning_rate": 0.001, + "loss": 2.064, + "step": 16102 + }, + { + "epoch": 0.6812336069041374, + "grad_norm": 0.2181776911020279, + "learning_rate": 0.001, + "loss": 2.0987, + "step": 16103 + }, + { + "epoch": 0.6812759116676538, + "grad_norm": 0.22167150676250458, + "learning_rate": 0.001, + "loss": 1.9636, + "step": 16104 + }, + { + "epoch": 0.6813182164311702, + "grad_norm": 0.4942590892314911, + "learning_rate": 0.001, + "loss": 3.871, + "step": 16105 + }, + { + "epoch": 0.6813605211946865, + "grad_norm": 4.270835876464844, + "learning_rate": 0.001, + "loss": 2.1507, + "step": 16106 + }, + { + "epoch": 0.6814028259582029, + "grad_norm": 2.0456957817077637, + "learning_rate": 0.001, + "loss": 2.1891, + "step": 16107 + }, + { + "epoch": 0.6814451307217193, + "grad_norm": 0.3435433506965637, + "learning_rate": 0.001, + "loss": 3.4219, + "step": 16108 + }, + { + "epoch": 0.6814874354852356, + "grad_norm": 0.3711515963077545, + "learning_rate": 0.001, + "loss": 3.505, + "step": 16109 + }, + { + "epoch": 0.681529740248752, + "grad_norm": 0.40264999866485596, + "learning_rate": 0.001, + "loss": 4.1824, + "step": 16110 + }, + { + "epoch": 0.6815720450122684, + "grad_norm": 0.25708243250846863, + "learning_rate": 0.001, + "loss": 3.0473, + "step": 16111 + }, + { + "epoch": 0.6816143497757847, + "grad_norm": 0.9480053782463074, + "learning_rate": 0.001, + "loss": 2.4107, + "step": 16112 + }, + { + "epoch": 0.6816566545393011, + "grad_norm": 0.49835169315338135, + "learning_rate": 0.001, + "loss": 2.4002, + "step": 16113 + }, + { + "epoch": 0.6816989593028175, + "grad_norm": 0.33678311109542847, + "learning_rate": 0.001, + "loss": 2.2966, + "step": 16114 + }, + { + "epoch": 0.6817412640663338, + "grad_norm": 0.23809859156608582, + "learning_rate": 0.001, + "loss": 2.1127, + "step": 16115 + }, + { + "epoch": 0.6817835688298503, + "grad_norm": 0.26749664545059204, + "learning_rate": 0.001, + "loss": 3.4136, + "step": 16116 + }, + { + "epoch": 0.6818258735933667, + "grad_norm": 0.27437347173690796, + "learning_rate": 0.001, + "loss": 2.4324, + "step": 16117 + }, + { + "epoch": 0.681868178356883, + "grad_norm": 0.26453253626823425, + "learning_rate": 0.001, + "loss": 2.0741, + "step": 16118 + }, + { + "epoch": 0.6819104831203994, + "grad_norm": 0.18699724972248077, + "learning_rate": 0.001, + "loss": 2.6181, + "step": 16119 + }, + { + "epoch": 0.6819527878839158, + "grad_norm": 0.23921284079551697, + "learning_rate": 0.001, + "loss": 1.9422, + "step": 16120 + }, + { + "epoch": 0.6819950926474321, + "grad_norm": 0.27278849482536316, + "learning_rate": 0.001, + "loss": 2.635, + "step": 16121 + }, + { + "epoch": 0.6820373974109485, + "grad_norm": 22.81956672668457, + "learning_rate": 0.001, + "loss": 2.203, + "step": 16122 + }, + { + "epoch": 0.6820797021744649, + "grad_norm": 0.17843838036060333, + "learning_rate": 0.001, + "loss": 2.2639, + "step": 16123 + }, + { + "epoch": 0.6821220069379812, + "grad_norm": 0.17329254746437073, + "learning_rate": 0.001, + "loss": 2.0456, + "step": 16124 + }, + { + "epoch": 0.6821643117014976, + "grad_norm": 0.20157817006111145, + "learning_rate": 0.001, + "loss": 1.7671, + "step": 16125 + }, + { + "epoch": 0.682206616465014, + "grad_norm": 0.3038026988506317, + "learning_rate": 0.001, + "loss": 2.6014, + "step": 16126 + }, + { + "epoch": 0.6822489212285303, + "grad_norm": 0.25918006896972656, + "learning_rate": 0.001, + "loss": 3.0838, + "step": 16127 + }, + { + "epoch": 0.6822912259920467, + "grad_norm": 0.17397001385688782, + "learning_rate": 0.001, + "loss": 2.6565, + "step": 16128 + }, + { + "epoch": 0.682333530755563, + "grad_norm": 0.262084424495697, + "learning_rate": 0.001, + "loss": 2.5259, + "step": 16129 + }, + { + "epoch": 0.6823758355190794, + "grad_norm": 0.21808075904846191, + "learning_rate": 0.001, + "loss": 2.4083, + "step": 16130 + }, + { + "epoch": 0.6824181402825958, + "grad_norm": 0.17491497099399567, + "learning_rate": 0.001, + "loss": 2.25, + "step": 16131 + }, + { + "epoch": 0.6824604450461121, + "grad_norm": 0.1757962703704834, + "learning_rate": 0.001, + "loss": 2.1881, + "step": 16132 + }, + { + "epoch": 0.6825027498096286, + "grad_norm": 0.3888621926307678, + "learning_rate": 0.001, + "loss": 2.3828, + "step": 16133 + }, + { + "epoch": 0.682545054573145, + "grad_norm": 0.15985238552093506, + "learning_rate": 0.001, + "loss": 2.4909, + "step": 16134 + }, + { + "epoch": 0.6825873593366613, + "grad_norm": 0.21272926032543182, + "learning_rate": 0.001, + "loss": 2.7576, + "step": 16135 + }, + { + "epoch": 0.6826296641001777, + "grad_norm": 0.2001175731420517, + "learning_rate": 0.001, + "loss": 2.2182, + "step": 16136 + }, + { + "epoch": 0.6826719688636941, + "grad_norm": 0.18408158421516418, + "learning_rate": 0.001, + "loss": 1.8213, + "step": 16137 + }, + { + "epoch": 0.6827142736272104, + "grad_norm": 0.1814546287059784, + "learning_rate": 0.001, + "loss": 2.6382, + "step": 16138 + }, + { + "epoch": 0.6827565783907268, + "grad_norm": 0.17873550951480865, + "learning_rate": 0.001, + "loss": 3.3059, + "step": 16139 + }, + { + "epoch": 0.6827988831542432, + "grad_norm": 0.20914815366268158, + "learning_rate": 0.001, + "loss": 2.5146, + "step": 16140 + }, + { + "epoch": 0.6828411879177595, + "grad_norm": 9.856534004211426, + "learning_rate": 0.001, + "loss": 2.0903, + "step": 16141 + }, + { + "epoch": 0.6828834926812759, + "grad_norm": 0.19409655034542084, + "learning_rate": 0.001, + "loss": 2.5403, + "step": 16142 + }, + { + "epoch": 0.6829257974447923, + "grad_norm": 0.424080491065979, + "learning_rate": 0.001, + "loss": 3.0397, + "step": 16143 + }, + { + "epoch": 0.6829681022083086, + "grad_norm": 1.3478124141693115, + "learning_rate": 0.001, + "loss": 2.0678, + "step": 16144 + }, + { + "epoch": 0.683010406971825, + "grad_norm": 0.20509791374206543, + "learning_rate": 0.001, + "loss": 2.4558, + "step": 16145 + }, + { + "epoch": 0.6830527117353414, + "grad_norm": 0.19733795523643494, + "learning_rate": 0.001, + "loss": 2.6852, + "step": 16146 + }, + { + "epoch": 0.6830950164988577, + "grad_norm": 0.16524255275726318, + "learning_rate": 0.001, + "loss": 2.4211, + "step": 16147 + }, + { + "epoch": 0.6831373212623741, + "grad_norm": 0.17051273584365845, + "learning_rate": 0.001, + "loss": 2.6465, + "step": 16148 + }, + { + "epoch": 0.6831796260258906, + "grad_norm": 0.4586073160171509, + "learning_rate": 0.001, + "loss": 3.2835, + "step": 16149 + }, + { + "epoch": 0.6832219307894069, + "grad_norm": 0.38520026206970215, + "learning_rate": 0.001, + "loss": 1.8567, + "step": 16150 + }, + { + "epoch": 0.6832642355529233, + "grad_norm": 7.5121941566467285, + "learning_rate": 0.001, + "loss": 2.7541, + "step": 16151 + }, + { + "epoch": 0.6833065403164397, + "grad_norm": 0.19074460864067078, + "learning_rate": 0.001, + "loss": 2.2912, + "step": 16152 + }, + { + "epoch": 0.683348845079956, + "grad_norm": 0.28227201104164124, + "learning_rate": 0.001, + "loss": 1.4986, + "step": 16153 + }, + { + "epoch": 0.6833911498434724, + "grad_norm": 0.1890229731798172, + "learning_rate": 0.001, + "loss": 2.4198, + "step": 16154 + }, + { + "epoch": 0.6834334546069888, + "grad_norm": 0.3094002902507782, + "learning_rate": 0.001, + "loss": 1.7272, + "step": 16155 + }, + { + "epoch": 0.6834757593705051, + "grad_norm": 0.4410035014152527, + "learning_rate": 0.001, + "loss": 1.7713, + "step": 16156 + }, + { + "epoch": 0.6835180641340215, + "grad_norm": 0.20090627670288086, + "learning_rate": 0.001, + "loss": 2.3903, + "step": 16157 + }, + { + "epoch": 0.6835603688975379, + "grad_norm": 0.1748632788658142, + "learning_rate": 0.001, + "loss": 2.1165, + "step": 16158 + }, + { + "epoch": 0.6836026736610542, + "grad_norm": 0.18395616114139557, + "learning_rate": 0.001, + "loss": 1.9808, + "step": 16159 + }, + { + "epoch": 0.6836449784245706, + "grad_norm": 0.18044279515743256, + "learning_rate": 0.001, + "loss": 2.2385, + "step": 16160 + }, + { + "epoch": 0.683687283188087, + "grad_norm": 0.7966620922088623, + "learning_rate": 0.001, + "loss": 2.1261, + "step": 16161 + }, + { + "epoch": 0.6837295879516033, + "grad_norm": 0.5675308108329773, + "learning_rate": 0.001, + "loss": 1.9956, + "step": 16162 + }, + { + "epoch": 0.6837718927151197, + "grad_norm": 0.15210571885108948, + "learning_rate": 0.001, + "loss": 1.8715, + "step": 16163 + }, + { + "epoch": 0.6838141974786361, + "grad_norm": 0.19697412848472595, + "learning_rate": 0.001, + "loss": 2.2049, + "step": 16164 + }, + { + "epoch": 0.6838565022421524, + "grad_norm": 0.2632860243320465, + "learning_rate": 0.001, + "loss": 2.1967, + "step": 16165 + }, + { + "epoch": 0.6838988070056689, + "grad_norm": 0.14289480447769165, + "learning_rate": 0.001, + "loss": 2.3733, + "step": 16166 + }, + { + "epoch": 0.6839411117691853, + "grad_norm": 0.16946500539779663, + "learning_rate": 0.001, + "loss": 2.1708, + "step": 16167 + }, + { + "epoch": 0.6839834165327016, + "grad_norm": 0.1758553683757782, + "learning_rate": 0.001, + "loss": 1.8589, + "step": 16168 + }, + { + "epoch": 0.684025721296218, + "grad_norm": 0.1700044721364975, + "learning_rate": 0.001, + "loss": 1.6158, + "step": 16169 + }, + { + "epoch": 0.6840680260597344, + "grad_norm": 0.15607595443725586, + "learning_rate": 0.001, + "loss": 1.6295, + "step": 16170 + }, + { + "epoch": 0.6841103308232507, + "grad_norm": 0.173630490899086, + "learning_rate": 0.001, + "loss": 2.871, + "step": 16171 + }, + { + "epoch": 0.6841526355867671, + "grad_norm": 0.3207765817642212, + "learning_rate": 0.001, + "loss": 2.2475, + "step": 16172 + }, + { + "epoch": 0.6841949403502834, + "grad_norm": 0.1741725355386734, + "learning_rate": 0.001, + "loss": 2.2591, + "step": 16173 + }, + { + "epoch": 0.6842372451137998, + "grad_norm": 0.17393991351127625, + "learning_rate": 0.001, + "loss": 2.1785, + "step": 16174 + }, + { + "epoch": 0.6842795498773162, + "grad_norm": 0.15135467052459717, + "learning_rate": 0.001, + "loss": 2.0614, + "step": 16175 + }, + { + "epoch": 0.6843218546408325, + "grad_norm": 0.24743154644966125, + "learning_rate": 0.001, + "loss": 2.5485, + "step": 16176 + }, + { + "epoch": 0.6843641594043489, + "grad_norm": 0.28311067819595337, + "learning_rate": 0.001, + "loss": 2.6526, + "step": 16177 + }, + { + "epoch": 0.6844064641678653, + "grad_norm": 0.14594843983650208, + "learning_rate": 0.001, + "loss": 2.3248, + "step": 16178 + }, + { + "epoch": 0.6844487689313816, + "grad_norm": 2.94158935546875, + "learning_rate": 0.001, + "loss": 2.2385, + "step": 16179 + }, + { + "epoch": 0.684491073694898, + "grad_norm": 0.19169901311397552, + "learning_rate": 0.001, + "loss": 2.3112, + "step": 16180 + }, + { + "epoch": 0.6845333784584144, + "grad_norm": 0.6225913763046265, + "learning_rate": 0.001, + "loss": 2.5312, + "step": 16181 + }, + { + "epoch": 0.6845756832219307, + "grad_norm": 0.1315428614616394, + "learning_rate": 0.001, + "loss": 2.1162, + "step": 16182 + }, + { + "epoch": 0.6846179879854472, + "grad_norm": 0.1488700956106186, + "learning_rate": 0.001, + "loss": 1.6814, + "step": 16183 + }, + { + "epoch": 0.6846602927489636, + "grad_norm": 0.1776464730501175, + "learning_rate": 0.001, + "loss": 2.0227, + "step": 16184 + }, + { + "epoch": 0.6847025975124799, + "grad_norm": 0.17623931169509888, + "learning_rate": 0.001, + "loss": 2.0609, + "step": 16185 + }, + { + "epoch": 0.6847449022759963, + "grad_norm": 0.2685874402523041, + "learning_rate": 0.001, + "loss": 2.3874, + "step": 16186 + }, + { + "epoch": 0.6847872070395127, + "grad_norm": 0.20432405173778534, + "learning_rate": 0.001, + "loss": 1.6907, + "step": 16187 + }, + { + "epoch": 0.684829511803029, + "grad_norm": 0.17755457758903503, + "learning_rate": 0.001, + "loss": 1.7708, + "step": 16188 + }, + { + "epoch": 0.6848718165665454, + "grad_norm": 0.3379620611667633, + "learning_rate": 0.001, + "loss": 2.383, + "step": 16189 + }, + { + "epoch": 0.6849141213300618, + "grad_norm": 0.31830352544784546, + "learning_rate": 0.001, + "loss": 1.7277, + "step": 16190 + }, + { + "epoch": 0.6849564260935781, + "grad_norm": 0.18110892176628113, + "learning_rate": 0.001, + "loss": 3.5317, + "step": 16191 + }, + { + "epoch": 0.6849987308570945, + "grad_norm": 0.5665266513824463, + "learning_rate": 0.001, + "loss": 2.4427, + "step": 16192 + }, + { + "epoch": 0.6850410356206109, + "grad_norm": 0.28646814823150635, + "learning_rate": 0.001, + "loss": 3.2973, + "step": 16193 + }, + { + "epoch": 0.6850833403841272, + "grad_norm": 0.19316011667251587, + "learning_rate": 0.001, + "loss": 2.7649, + "step": 16194 + }, + { + "epoch": 0.6851256451476436, + "grad_norm": 0.21821334958076477, + "learning_rate": 0.001, + "loss": 2.3747, + "step": 16195 + }, + { + "epoch": 0.68516794991116, + "grad_norm": 0.18347589671611786, + "learning_rate": 0.001, + "loss": 2.0008, + "step": 16196 + }, + { + "epoch": 0.6852102546746763, + "grad_norm": 0.1854442059993744, + "learning_rate": 0.001, + "loss": 1.7193, + "step": 16197 + }, + { + "epoch": 0.6852525594381927, + "grad_norm": 0.17909593880176544, + "learning_rate": 0.001, + "loss": 2.0372, + "step": 16198 + }, + { + "epoch": 0.6852948642017092, + "grad_norm": 0.1535256803035736, + "learning_rate": 0.001, + "loss": 1.7345, + "step": 16199 + }, + { + "epoch": 0.6853371689652255, + "grad_norm": 0.20372457802295685, + "learning_rate": 0.001, + "loss": 2.1256, + "step": 16200 + }, + { + "epoch": 0.6853794737287419, + "grad_norm": 0.291826456785202, + "learning_rate": 0.001, + "loss": 2.5165, + "step": 16201 + }, + { + "epoch": 0.6854217784922583, + "grad_norm": 0.18575716018676758, + "learning_rate": 0.001, + "loss": 1.7661, + "step": 16202 + }, + { + "epoch": 0.6854640832557746, + "grad_norm": 0.15441353619098663, + "learning_rate": 0.001, + "loss": 1.672, + "step": 16203 + }, + { + "epoch": 0.685506388019291, + "grad_norm": 1.2076762914657593, + "learning_rate": 0.001, + "loss": 1.9718, + "step": 16204 + }, + { + "epoch": 0.6855486927828074, + "grad_norm": 0.17845335602760315, + "learning_rate": 0.001, + "loss": 2.2695, + "step": 16205 + }, + { + "epoch": 0.6855909975463237, + "grad_norm": 0.1648271232843399, + "learning_rate": 0.001, + "loss": 2.9431, + "step": 16206 + }, + { + "epoch": 0.6856333023098401, + "grad_norm": 0.264567494392395, + "learning_rate": 0.001, + "loss": 2.7659, + "step": 16207 + }, + { + "epoch": 0.6856756070733565, + "grad_norm": 0.15774111449718475, + "learning_rate": 0.001, + "loss": 1.7371, + "step": 16208 + }, + { + "epoch": 0.6857179118368728, + "grad_norm": 0.3049740195274353, + "learning_rate": 0.001, + "loss": 2.4582, + "step": 16209 + }, + { + "epoch": 0.6857602166003892, + "grad_norm": 0.371410071849823, + "learning_rate": 0.001, + "loss": 3.2076, + "step": 16210 + }, + { + "epoch": 0.6858025213639056, + "grad_norm": 0.17293484508991241, + "learning_rate": 0.001, + "loss": 1.4464, + "step": 16211 + }, + { + "epoch": 0.6858448261274219, + "grad_norm": 0.3040180802345276, + "learning_rate": 0.001, + "loss": 2.9824, + "step": 16212 + }, + { + "epoch": 0.6858871308909383, + "grad_norm": 0.22977972030639648, + "learning_rate": 0.001, + "loss": 3.5424, + "step": 16213 + }, + { + "epoch": 0.6859294356544547, + "grad_norm": 0.14453327655792236, + "learning_rate": 0.001, + "loss": 1.4584, + "step": 16214 + }, + { + "epoch": 0.685971740417971, + "grad_norm": 0.3432617485523224, + "learning_rate": 0.001, + "loss": 2.2774, + "step": 16215 + }, + { + "epoch": 0.6860140451814875, + "grad_norm": 0.614466667175293, + "learning_rate": 0.001, + "loss": 2.9487, + "step": 16216 + }, + { + "epoch": 0.6860563499450039, + "grad_norm": 0.5213249325752258, + "learning_rate": 0.001, + "loss": 2.6308, + "step": 16217 + }, + { + "epoch": 0.6860986547085202, + "grad_norm": 0.13962678611278534, + "learning_rate": 0.001, + "loss": 2.3401, + "step": 16218 + }, + { + "epoch": 0.6861409594720366, + "grad_norm": 0.13959261775016785, + "learning_rate": 0.001, + "loss": 2.066, + "step": 16219 + }, + { + "epoch": 0.6861832642355529, + "grad_norm": 0.12609069049358368, + "learning_rate": 0.001, + "loss": 2.7186, + "step": 16220 + }, + { + "epoch": 0.6862255689990693, + "grad_norm": 0.1317724585533142, + "learning_rate": 0.001, + "loss": 1.6887, + "step": 16221 + }, + { + "epoch": 0.6862678737625857, + "grad_norm": 0.22671738266944885, + "learning_rate": 0.001, + "loss": 1.519, + "step": 16222 + }, + { + "epoch": 0.686310178526102, + "grad_norm": 0.21472686529159546, + "learning_rate": 0.001, + "loss": 2.5224, + "step": 16223 + }, + { + "epoch": 0.6863524832896184, + "grad_norm": 0.5818791389465332, + "learning_rate": 0.001, + "loss": 1.7538, + "step": 16224 + }, + { + "epoch": 0.6863947880531348, + "grad_norm": 0.22420547902584076, + "learning_rate": 0.001, + "loss": 1.5812, + "step": 16225 + }, + { + "epoch": 0.6864370928166511, + "grad_norm": 0.16626037657260895, + "learning_rate": 0.001, + "loss": 1.8388, + "step": 16226 + }, + { + "epoch": 0.6864793975801675, + "grad_norm": 0.19007837772369385, + "learning_rate": 0.001, + "loss": 2.5345, + "step": 16227 + }, + { + "epoch": 0.6865217023436839, + "grad_norm": 0.3126547932624817, + "learning_rate": 0.001, + "loss": 2.5063, + "step": 16228 + }, + { + "epoch": 0.6865640071072002, + "grad_norm": 0.2687819004058838, + "learning_rate": 0.001, + "loss": 3.085, + "step": 16229 + }, + { + "epoch": 0.6866063118707166, + "grad_norm": 0.9603768587112427, + "learning_rate": 0.001, + "loss": 1.8578, + "step": 16230 + }, + { + "epoch": 0.686648616634233, + "grad_norm": 0.1472010314464569, + "learning_rate": 0.001, + "loss": 3.0854, + "step": 16231 + }, + { + "epoch": 0.6866909213977493, + "grad_norm": 0.20346207916736603, + "learning_rate": 0.001, + "loss": 1.8353, + "step": 16232 + }, + { + "epoch": 0.6867332261612658, + "grad_norm": 0.17270101606845856, + "learning_rate": 0.001, + "loss": 3.0351, + "step": 16233 + }, + { + "epoch": 0.6867755309247822, + "grad_norm": 0.14880718290805817, + "learning_rate": 0.001, + "loss": 2.1701, + "step": 16234 + }, + { + "epoch": 0.6868178356882985, + "grad_norm": 0.23077774047851562, + "learning_rate": 0.001, + "loss": 3.2428, + "step": 16235 + }, + { + "epoch": 0.6868601404518149, + "grad_norm": 0.14143741130828857, + "learning_rate": 0.001, + "loss": 1.7177, + "step": 16236 + }, + { + "epoch": 0.6869024452153313, + "grad_norm": 0.15532909333705902, + "learning_rate": 0.001, + "loss": 1.4819, + "step": 16237 + }, + { + "epoch": 0.6869447499788476, + "grad_norm": 1.5468194484710693, + "learning_rate": 0.001, + "loss": 1.6917, + "step": 16238 + }, + { + "epoch": 0.686987054742364, + "grad_norm": 0.16617412865161896, + "learning_rate": 0.001, + "loss": 1.4165, + "step": 16239 + }, + { + "epoch": 0.6870293595058804, + "grad_norm": 0.25191357731819153, + "learning_rate": 0.001, + "loss": 2.1047, + "step": 16240 + }, + { + "epoch": 0.6870716642693967, + "grad_norm": 0.15333382785320282, + "learning_rate": 0.001, + "loss": 1.978, + "step": 16241 + }, + { + "epoch": 0.6871139690329131, + "grad_norm": 0.16414469480514526, + "learning_rate": 0.001, + "loss": 2.0745, + "step": 16242 + }, + { + "epoch": 0.6871562737964295, + "grad_norm": 1.5129097700119019, + "learning_rate": 0.001, + "loss": 2.2844, + "step": 16243 + }, + { + "epoch": 0.6871985785599458, + "grad_norm": 0.18176978826522827, + "learning_rate": 0.001, + "loss": 2.1622, + "step": 16244 + }, + { + "epoch": 0.6872408833234622, + "grad_norm": 0.17760765552520752, + "learning_rate": 0.001, + "loss": 2.9898, + "step": 16245 + }, + { + "epoch": 0.6872831880869786, + "grad_norm": 0.2571372985839844, + "learning_rate": 0.001, + "loss": 2.6795, + "step": 16246 + }, + { + "epoch": 0.6873254928504949, + "grad_norm": 0.20494753122329712, + "learning_rate": 0.001, + "loss": 1.6626, + "step": 16247 + }, + { + "epoch": 0.6873677976140113, + "grad_norm": 0.9767950773239136, + "learning_rate": 0.001, + "loss": 1.6601, + "step": 16248 + }, + { + "epoch": 0.6874101023775278, + "grad_norm": 0.2790040075778961, + "learning_rate": 0.001, + "loss": 3.1638, + "step": 16249 + }, + { + "epoch": 0.687452407141044, + "grad_norm": 0.2396850734949112, + "learning_rate": 0.001, + "loss": 1.8693, + "step": 16250 + }, + { + "epoch": 0.6874947119045605, + "grad_norm": 1.0322060585021973, + "learning_rate": 0.001, + "loss": 2.7673, + "step": 16251 + }, + { + "epoch": 0.6875370166680769, + "grad_norm": 0.15175770223140717, + "learning_rate": 0.001, + "loss": 3.0304, + "step": 16252 + }, + { + "epoch": 0.6875793214315932, + "grad_norm": 0.18464677035808563, + "learning_rate": 0.001, + "loss": 2.1256, + "step": 16253 + }, + { + "epoch": 0.6876216261951096, + "grad_norm": 0.16536930203437805, + "learning_rate": 0.001, + "loss": 2.0773, + "step": 16254 + }, + { + "epoch": 0.687663930958626, + "grad_norm": 0.20809626579284668, + "learning_rate": 0.001, + "loss": 1.8316, + "step": 16255 + }, + { + "epoch": 0.6877062357221423, + "grad_norm": 0.163310706615448, + "learning_rate": 0.001, + "loss": 2.1962, + "step": 16256 + }, + { + "epoch": 0.6877485404856587, + "grad_norm": 0.39365145564079285, + "learning_rate": 0.001, + "loss": 2.7742, + "step": 16257 + }, + { + "epoch": 0.6877908452491751, + "grad_norm": 0.5424996018409729, + "learning_rate": 0.001, + "loss": 3.4448, + "step": 16258 + }, + { + "epoch": 0.6878331500126914, + "grad_norm": 0.1544448286294937, + "learning_rate": 0.001, + "loss": 1.9199, + "step": 16259 + }, + { + "epoch": 0.6878754547762078, + "grad_norm": 10.53158950805664, + "learning_rate": 0.001, + "loss": 2.1641, + "step": 16260 + }, + { + "epoch": 0.6879177595397242, + "grad_norm": 0.3506162166595459, + "learning_rate": 0.001, + "loss": 2.6466, + "step": 16261 + }, + { + "epoch": 0.6879600643032405, + "grad_norm": 0.16496671736240387, + "learning_rate": 0.001, + "loss": 2.5371, + "step": 16262 + }, + { + "epoch": 0.6880023690667569, + "grad_norm": 0.1795535832643509, + "learning_rate": 0.001, + "loss": 3.5854, + "step": 16263 + }, + { + "epoch": 0.6880446738302732, + "grad_norm": 0.1523294448852539, + "learning_rate": 0.001, + "loss": 1.3369, + "step": 16264 + }, + { + "epoch": 0.6880869785937896, + "grad_norm": 0.16334094107151031, + "learning_rate": 0.001, + "loss": 1.9804, + "step": 16265 + }, + { + "epoch": 0.688129283357306, + "grad_norm": 0.36673104763031006, + "learning_rate": 0.001, + "loss": 2.0428, + "step": 16266 + }, + { + "epoch": 0.6881715881208224, + "grad_norm": 0.23456034064292908, + "learning_rate": 0.001, + "loss": 1.778, + "step": 16267 + }, + { + "epoch": 0.6882138928843388, + "grad_norm": 0.1658594310283661, + "learning_rate": 0.001, + "loss": 1.8266, + "step": 16268 + }, + { + "epoch": 0.6882561976478552, + "grad_norm": 0.17385973036289215, + "learning_rate": 0.001, + "loss": 2.7395, + "step": 16269 + }, + { + "epoch": 0.6882985024113715, + "grad_norm": 0.20876826345920563, + "learning_rate": 0.001, + "loss": 2.2943, + "step": 16270 + }, + { + "epoch": 0.6883408071748879, + "grad_norm": 0.2948185205459595, + "learning_rate": 0.001, + "loss": 2.295, + "step": 16271 + }, + { + "epoch": 0.6883831119384043, + "grad_norm": 0.1359289586544037, + "learning_rate": 0.001, + "loss": 2.0827, + "step": 16272 + }, + { + "epoch": 0.6884254167019206, + "grad_norm": 0.1961379051208496, + "learning_rate": 0.001, + "loss": 2.3073, + "step": 16273 + }, + { + "epoch": 0.688467721465437, + "grad_norm": 0.2041141390800476, + "learning_rate": 0.001, + "loss": 1.6394, + "step": 16274 + }, + { + "epoch": 0.6885100262289534, + "grad_norm": 7.983102798461914, + "learning_rate": 0.001, + "loss": 1.5184, + "step": 16275 + }, + { + "epoch": 0.6885523309924697, + "grad_norm": 0.1904926300048828, + "learning_rate": 0.001, + "loss": 2.1257, + "step": 16276 + }, + { + "epoch": 0.6885946357559861, + "grad_norm": 0.16287410259246826, + "learning_rate": 0.001, + "loss": 2.0357, + "step": 16277 + }, + { + "epoch": 0.6886369405195025, + "grad_norm": 0.17052966356277466, + "learning_rate": 0.001, + "loss": 2.5068, + "step": 16278 + }, + { + "epoch": 0.6886792452830188, + "grad_norm": 0.35149747133255005, + "learning_rate": 0.001, + "loss": 2.5683, + "step": 16279 + }, + { + "epoch": 0.6887215500465352, + "grad_norm": 1.4754235744476318, + "learning_rate": 0.001, + "loss": 2.9501, + "step": 16280 + }, + { + "epoch": 0.6887638548100516, + "grad_norm": 0.1624482125043869, + "learning_rate": 0.001, + "loss": 1.9048, + "step": 16281 + }, + { + "epoch": 0.6888061595735679, + "grad_norm": 1.466571569442749, + "learning_rate": 0.001, + "loss": 2.5641, + "step": 16282 + }, + { + "epoch": 0.6888484643370844, + "grad_norm": 0.18369127810001373, + "learning_rate": 0.001, + "loss": 1.8939, + "step": 16283 + }, + { + "epoch": 0.6888907691006008, + "grad_norm": 0.31858545541763306, + "learning_rate": 0.001, + "loss": 2.515, + "step": 16284 + }, + { + "epoch": 0.6889330738641171, + "grad_norm": 1.7627861499786377, + "learning_rate": 0.001, + "loss": 2.8145, + "step": 16285 + }, + { + "epoch": 0.6889753786276335, + "grad_norm": 0.20102620124816895, + "learning_rate": 0.001, + "loss": 2.8067, + "step": 16286 + }, + { + "epoch": 0.6890176833911499, + "grad_norm": 0.20633406937122345, + "learning_rate": 0.001, + "loss": 2.5596, + "step": 16287 + }, + { + "epoch": 0.6890599881546662, + "grad_norm": 0.2921423017978668, + "learning_rate": 0.001, + "loss": 2.6385, + "step": 16288 + }, + { + "epoch": 0.6891022929181826, + "grad_norm": 1.0566281080245972, + "learning_rate": 0.001, + "loss": 3.4956, + "step": 16289 + }, + { + "epoch": 0.689144597681699, + "grad_norm": 0.23099292814731598, + "learning_rate": 0.001, + "loss": 2.0018, + "step": 16290 + }, + { + "epoch": 0.6891869024452153, + "grad_norm": 0.19271378219127655, + "learning_rate": 0.001, + "loss": 2.4489, + "step": 16291 + }, + { + "epoch": 0.6892292072087317, + "grad_norm": 0.20972347259521484, + "learning_rate": 0.001, + "loss": 2.3633, + "step": 16292 + }, + { + "epoch": 0.6892715119722481, + "grad_norm": 0.17728467285633087, + "learning_rate": 0.001, + "loss": 1.8417, + "step": 16293 + }, + { + "epoch": 0.6893138167357644, + "grad_norm": 1.631922960281372, + "learning_rate": 0.001, + "loss": 1.903, + "step": 16294 + }, + { + "epoch": 0.6893561214992808, + "grad_norm": 0.39803799986839294, + "learning_rate": 0.001, + "loss": 2.5417, + "step": 16295 + }, + { + "epoch": 0.6893984262627972, + "grad_norm": 0.206565260887146, + "learning_rate": 0.001, + "loss": 2.3135, + "step": 16296 + }, + { + "epoch": 0.6894407310263135, + "grad_norm": 0.18916979432106018, + "learning_rate": 0.001, + "loss": 1.8302, + "step": 16297 + }, + { + "epoch": 0.68948303578983, + "grad_norm": 0.21091687679290771, + "learning_rate": 0.001, + "loss": 2.1947, + "step": 16298 + }, + { + "epoch": 0.6895253405533464, + "grad_norm": 0.16629081964492798, + "learning_rate": 0.001, + "loss": 3.1085, + "step": 16299 + }, + { + "epoch": 0.6895676453168627, + "grad_norm": 0.16846491396427155, + "learning_rate": 0.001, + "loss": 1.5303, + "step": 16300 + }, + { + "epoch": 0.6896099500803791, + "grad_norm": 0.1889643669128418, + "learning_rate": 0.001, + "loss": 2.0213, + "step": 16301 + }, + { + "epoch": 0.6896522548438955, + "grad_norm": 0.17257481813430786, + "learning_rate": 0.001, + "loss": 2.3829, + "step": 16302 + }, + { + "epoch": 0.6896945596074118, + "grad_norm": 0.17580455541610718, + "learning_rate": 0.001, + "loss": 2.7781, + "step": 16303 + }, + { + "epoch": 0.6897368643709282, + "grad_norm": 0.1580284833908081, + "learning_rate": 0.001, + "loss": 2.3906, + "step": 16304 + }, + { + "epoch": 0.6897791691344446, + "grad_norm": 0.1577870398759842, + "learning_rate": 0.001, + "loss": 2.7536, + "step": 16305 + }, + { + "epoch": 0.6898214738979609, + "grad_norm": 0.17416058480739594, + "learning_rate": 0.001, + "loss": 2.1967, + "step": 16306 + }, + { + "epoch": 0.6898637786614773, + "grad_norm": 0.7185014486312866, + "learning_rate": 0.001, + "loss": 1.3881, + "step": 16307 + }, + { + "epoch": 0.6899060834249936, + "grad_norm": 0.23386551439762115, + "learning_rate": 0.001, + "loss": 3.7348, + "step": 16308 + }, + { + "epoch": 0.68994838818851, + "grad_norm": 0.1478804051876068, + "learning_rate": 0.001, + "loss": 1.8103, + "step": 16309 + }, + { + "epoch": 0.6899906929520264, + "grad_norm": 0.14378052949905396, + "learning_rate": 0.001, + "loss": 1.8187, + "step": 16310 + }, + { + "epoch": 0.6900329977155427, + "grad_norm": 0.19731678068637848, + "learning_rate": 0.001, + "loss": 2.5575, + "step": 16311 + }, + { + "epoch": 0.6900753024790591, + "grad_norm": 0.6392672657966614, + "learning_rate": 0.001, + "loss": 2.2473, + "step": 16312 + }, + { + "epoch": 0.6901176072425755, + "grad_norm": 0.5831864476203918, + "learning_rate": 0.001, + "loss": 1.6424, + "step": 16313 + }, + { + "epoch": 0.6901599120060918, + "grad_norm": 0.573341965675354, + "learning_rate": 0.001, + "loss": 1.8507, + "step": 16314 + }, + { + "epoch": 0.6902022167696082, + "grad_norm": 0.1696714460849762, + "learning_rate": 0.001, + "loss": 2.02, + "step": 16315 + }, + { + "epoch": 0.6902445215331247, + "grad_norm": 0.8533129692077637, + "learning_rate": 0.001, + "loss": 2.3024, + "step": 16316 + }, + { + "epoch": 0.690286826296641, + "grad_norm": 0.9942197203636169, + "learning_rate": 0.001, + "loss": 1.8328, + "step": 16317 + }, + { + "epoch": 0.6903291310601574, + "grad_norm": 0.587260901927948, + "learning_rate": 0.001, + "loss": 2.375, + "step": 16318 + }, + { + "epoch": 0.6903714358236738, + "grad_norm": 0.17391139268875122, + "learning_rate": 0.001, + "loss": 1.7228, + "step": 16319 + }, + { + "epoch": 0.6904137405871901, + "grad_norm": 0.437977135181427, + "learning_rate": 0.001, + "loss": 2.7662, + "step": 16320 + }, + { + "epoch": 0.6904560453507065, + "grad_norm": 0.2368604987859726, + "learning_rate": 0.001, + "loss": 2.2516, + "step": 16321 + }, + { + "epoch": 0.6904983501142229, + "grad_norm": 0.1805589199066162, + "learning_rate": 0.001, + "loss": 2.1679, + "step": 16322 + }, + { + "epoch": 0.6905406548777392, + "grad_norm": 0.6197409629821777, + "learning_rate": 0.001, + "loss": 1.9621, + "step": 16323 + }, + { + "epoch": 0.6905829596412556, + "grad_norm": 0.16126328706741333, + "learning_rate": 0.001, + "loss": 2.1508, + "step": 16324 + }, + { + "epoch": 0.690625264404772, + "grad_norm": 0.17025581002235413, + "learning_rate": 0.001, + "loss": 2.3355, + "step": 16325 + }, + { + "epoch": 0.6906675691682883, + "grad_norm": 0.2806219756603241, + "learning_rate": 0.001, + "loss": 2.0408, + "step": 16326 + }, + { + "epoch": 0.6907098739318047, + "grad_norm": 0.4944941997528076, + "learning_rate": 0.001, + "loss": 3.1678, + "step": 16327 + }, + { + "epoch": 0.6907521786953211, + "grad_norm": 0.3849775195121765, + "learning_rate": 0.001, + "loss": 2.0565, + "step": 16328 + }, + { + "epoch": 0.6907944834588374, + "grad_norm": 0.17728754878044128, + "learning_rate": 0.001, + "loss": 1.5861, + "step": 16329 + }, + { + "epoch": 0.6908367882223538, + "grad_norm": 3.389814615249634, + "learning_rate": 0.001, + "loss": 2.1157, + "step": 16330 + }, + { + "epoch": 0.6908790929858702, + "grad_norm": 0.16745001077651978, + "learning_rate": 0.001, + "loss": 1.7655, + "step": 16331 + }, + { + "epoch": 0.6909213977493865, + "grad_norm": 0.2383284568786621, + "learning_rate": 0.001, + "loss": 2.2718, + "step": 16332 + }, + { + "epoch": 0.690963702512903, + "grad_norm": 1.5297582149505615, + "learning_rate": 0.001, + "loss": 2.8112, + "step": 16333 + }, + { + "epoch": 0.6910060072764194, + "grad_norm": 0.17354048788547516, + "learning_rate": 0.001, + "loss": 2.7349, + "step": 16334 + }, + { + "epoch": 0.6910483120399357, + "grad_norm": 0.1723877489566803, + "learning_rate": 0.001, + "loss": 2.1994, + "step": 16335 + }, + { + "epoch": 0.6910906168034521, + "grad_norm": 0.160502627491951, + "learning_rate": 0.001, + "loss": 2.5597, + "step": 16336 + }, + { + "epoch": 0.6911329215669685, + "grad_norm": 0.3956283628940582, + "learning_rate": 0.001, + "loss": 1.5582, + "step": 16337 + }, + { + "epoch": 0.6911752263304848, + "grad_norm": 0.27428823709487915, + "learning_rate": 0.001, + "loss": 2.2426, + "step": 16338 + }, + { + "epoch": 0.6912175310940012, + "grad_norm": 0.6840664148330688, + "learning_rate": 0.001, + "loss": 2.731, + "step": 16339 + }, + { + "epoch": 0.6912598358575176, + "grad_norm": 0.1819261908531189, + "learning_rate": 0.001, + "loss": 2.0739, + "step": 16340 + }, + { + "epoch": 0.6913021406210339, + "grad_norm": 0.2000657320022583, + "learning_rate": 0.001, + "loss": 3.0519, + "step": 16341 + }, + { + "epoch": 0.6913444453845503, + "grad_norm": 0.28833070397377014, + "learning_rate": 0.001, + "loss": 2.1887, + "step": 16342 + }, + { + "epoch": 0.6913867501480667, + "grad_norm": 0.24844685196876526, + "learning_rate": 0.001, + "loss": 2.7631, + "step": 16343 + }, + { + "epoch": 0.691429054911583, + "grad_norm": 0.16090954840183258, + "learning_rate": 0.001, + "loss": 2.0467, + "step": 16344 + }, + { + "epoch": 0.6914713596750994, + "grad_norm": 0.19617071747779846, + "learning_rate": 0.001, + "loss": 1.4285, + "step": 16345 + }, + { + "epoch": 0.6915136644386158, + "grad_norm": 0.16506654024124146, + "learning_rate": 0.001, + "loss": 1.3764, + "step": 16346 + }, + { + "epoch": 0.6915559692021321, + "grad_norm": 0.175397127866745, + "learning_rate": 0.001, + "loss": 1.7242, + "step": 16347 + }, + { + "epoch": 0.6915982739656485, + "grad_norm": 0.20223967730998993, + "learning_rate": 0.001, + "loss": 2.0692, + "step": 16348 + }, + { + "epoch": 0.691640578729165, + "grad_norm": 0.17678618431091309, + "learning_rate": 0.001, + "loss": 2.9338, + "step": 16349 + }, + { + "epoch": 0.6916828834926813, + "grad_norm": 0.2138862907886505, + "learning_rate": 0.001, + "loss": 2.1967, + "step": 16350 + }, + { + "epoch": 0.6917251882561977, + "grad_norm": 0.15194635093212128, + "learning_rate": 0.001, + "loss": 2.0574, + "step": 16351 + }, + { + "epoch": 0.6917674930197141, + "grad_norm": 0.2285081446170807, + "learning_rate": 0.001, + "loss": 3.0883, + "step": 16352 + }, + { + "epoch": 0.6918097977832304, + "grad_norm": 0.17893058061599731, + "learning_rate": 0.001, + "loss": 1.9451, + "step": 16353 + }, + { + "epoch": 0.6918521025467468, + "grad_norm": 0.1954171061515808, + "learning_rate": 0.001, + "loss": 1.7675, + "step": 16354 + }, + { + "epoch": 0.6918944073102631, + "grad_norm": 0.1901036947965622, + "learning_rate": 0.001, + "loss": 2.7754, + "step": 16355 + }, + { + "epoch": 0.6919367120737795, + "grad_norm": 0.2298295795917511, + "learning_rate": 0.001, + "loss": 1.8922, + "step": 16356 + }, + { + "epoch": 0.6919790168372959, + "grad_norm": 0.166233092546463, + "learning_rate": 0.001, + "loss": 1.6469, + "step": 16357 + }, + { + "epoch": 0.6920213216008122, + "grad_norm": 0.17486384510993958, + "learning_rate": 0.001, + "loss": 1.7152, + "step": 16358 + }, + { + "epoch": 0.6920636263643286, + "grad_norm": 0.2248300462961197, + "learning_rate": 0.001, + "loss": 1.6036, + "step": 16359 + }, + { + "epoch": 0.692105931127845, + "grad_norm": 0.36651915311813354, + "learning_rate": 0.001, + "loss": 3.0015, + "step": 16360 + }, + { + "epoch": 0.6921482358913613, + "grad_norm": 0.2662813663482666, + "learning_rate": 0.001, + "loss": 2.7571, + "step": 16361 + }, + { + "epoch": 0.6921905406548777, + "grad_norm": 0.540983259677887, + "learning_rate": 0.001, + "loss": 1.9258, + "step": 16362 + }, + { + "epoch": 0.6922328454183941, + "grad_norm": 0.44526684284210205, + "learning_rate": 0.001, + "loss": 1.8432, + "step": 16363 + }, + { + "epoch": 0.6922751501819104, + "grad_norm": 4.076467990875244, + "learning_rate": 0.001, + "loss": 4.882, + "step": 16364 + }, + { + "epoch": 0.6923174549454268, + "grad_norm": 0.16594544053077698, + "learning_rate": 0.001, + "loss": 1.753, + "step": 16365 + }, + { + "epoch": 0.6923597597089433, + "grad_norm": 0.15641087293624878, + "learning_rate": 0.001, + "loss": 2.2164, + "step": 16366 + }, + { + "epoch": 0.6924020644724596, + "grad_norm": 0.14572037756443024, + "learning_rate": 0.001, + "loss": 3.1288, + "step": 16367 + }, + { + "epoch": 0.692444369235976, + "grad_norm": 0.18323473632335663, + "learning_rate": 0.001, + "loss": 1.6634, + "step": 16368 + }, + { + "epoch": 0.6924866739994924, + "grad_norm": 0.552975594997406, + "learning_rate": 0.001, + "loss": 2.5361, + "step": 16369 + }, + { + "epoch": 0.6925289787630087, + "grad_norm": 0.743918240070343, + "learning_rate": 0.001, + "loss": 3.3047, + "step": 16370 + }, + { + "epoch": 0.6925712835265251, + "grad_norm": 0.22077415883541107, + "learning_rate": 0.001, + "loss": 1.9185, + "step": 16371 + }, + { + "epoch": 0.6926135882900415, + "grad_norm": 4.2804365158081055, + "learning_rate": 0.001, + "loss": 1.6356, + "step": 16372 + }, + { + "epoch": 0.6926558930535578, + "grad_norm": 0.23495794832706451, + "learning_rate": 0.001, + "loss": 2.2379, + "step": 16373 + }, + { + "epoch": 0.6926981978170742, + "grad_norm": 0.14762520790100098, + "learning_rate": 0.001, + "loss": 1.7328, + "step": 16374 + }, + { + "epoch": 0.6927405025805906, + "grad_norm": 1.2095277309417725, + "learning_rate": 0.001, + "loss": 3.5081, + "step": 16375 + }, + { + "epoch": 0.6927828073441069, + "grad_norm": 0.16055262088775635, + "learning_rate": 0.001, + "loss": 2.0528, + "step": 16376 + }, + { + "epoch": 0.6928251121076233, + "grad_norm": 0.24955902993679047, + "learning_rate": 0.001, + "loss": 2.0347, + "step": 16377 + }, + { + "epoch": 0.6928674168711397, + "grad_norm": 0.8334779143333435, + "learning_rate": 0.001, + "loss": 2.8753, + "step": 16378 + }, + { + "epoch": 0.692909721634656, + "grad_norm": 0.18906332552433014, + "learning_rate": 0.001, + "loss": 1.6332, + "step": 16379 + }, + { + "epoch": 0.6929520263981724, + "grad_norm": 0.1773161143064499, + "learning_rate": 0.001, + "loss": 2.7939, + "step": 16380 + }, + { + "epoch": 0.6929943311616888, + "grad_norm": 0.18923841416835785, + "learning_rate": 0.001, + "loss": 2.376, + "step": 16381 + }, + { + "epoch": 0.6930366359252051, + "grad_norm": 0.1875029057264328, + "learning_rate": 0.001, + "loss": 2.5787, + "step": 16382 + }, + { + "epoch": 0.6930789406887216, + "grad_norm": 0.1562403440475464, + "learning_rate": 0.001, + "loss": 1.9452, + "step": 16383 + }, + { + "epoch": 0.693121245452238, + "grad_norm": 0.19524121284484863, + "learning_rate": 0.001, + "loss": 3.0866, + "step": 16384 + }, + { + "epoch": 0.6931635502157543, + "grad_norm": 0.1857270449399948, + "learning_rate": 0.001, + "loss": 1.9935, + "step": 16385 + }, + { + "epoch": 0.6932058549792707, + "grad_norm": 0.3167372941970825, + "learning_rate": 0.001, + "loss": 2.4217, + "step": 16386 + }, + { + "epoch": 0.6932481597427871, + "grad_norm": 0.24016937613487244, + "learning_rate": 0.001, + "loss": 2.8926, + "step": 16387 + }, + { + "epoch": 0.6932904645063034, + "grad_norm": 0.15906888246536255, + "learning_rate": 0.001, + "loss": 3.2683, + "step": 16388 + }, + { + "epoch": 0.6933327692698198, + "grad_norm": 0.2372499704360962, + "learning_rate": 0.001, + "loss": 2.9656, + "step": 16389 + }, + { + "epoch": 0.6933750740333362, + "grad_norm": 18.70325469970703, + "learning_rate": 0.001, + "loss": 1.8463, + "step": 16390 + }, + { + "epoch": 0.6934173787968525, + "grad_norm": 0.21969793736934662, + "learning_rate": 0.001, + "loss": 1.9638, + "step": 16391 + }, + { + "epoch": 0.6934596835603689, + "grad_norm": 1.0692784786224365, + "learning_rate": 0.001, + "loss": 2.1758, + "step": 16392 + }, + { + "epoch": 0.6935019883238853, + "grad_norm": 0.17494791746139526, + "learning_rate": 0.001, + "loss": 1.9434, + "step": 16393 + }, + { + "epoch": 0.6935442930874016, + "grad_norm": 10.130684852600098, + "learning_rate": 0.001, + "loss": 2.0469, + "step": 16394 + }, + { + "epoch": 0.693586597850918, + "grad_norm": 0.18543873727321625, + "learning_rate": 0.001, + "loss": 1.6253, + "step": 16395 + }, + { + "epoch": 0.6936289026144344, + "grad_norm": 0.176463320851326, + "learning_rate": 0.001, + "loss": 2.5161, + "step": 16396 + }, + { + "epoch": 0.6936712073779507, + "grad_norm": 0.21182700991630554, + "learning_rate": 0.001, + "loss": 1.9964, + "step": 16397 + }, + { + "epoch": 0.6937135121414671, + "grad_norm": 0.17434942722320557, + "learning_rate": 0.001, + "loss": 1.608, + "step": 16398 + }, + { + "epoch": 0.6937558169049834, + "grad_norm": 1.2661168575286865, + "learning_rate": 0.001, + "loss": 3.2148, + "step": 16399 + }, + { + "epoch": 0.6937981216684999, + "grad_norm": 16.555898666381836, + "learning_rate": 0.001, + "loss": 2.0895, + "step": 16400 + }, + { + "epoch": 0.6938404264320163, + "grad_norm": 0.1642748862504959, + "learning_rate": 0.001, + "loss": 1.9288, + "step": 16401 + }, + { + "epoch": 0.6938827311955326, + "grad_norm": 0.20912915468215942, + "learning_rate": 0.001, + "loss": 3.3075, + "step": 16402 + }, + { + "epoch": 0.693925035959049, + "grad_norm": 0.1803750842809677, + "learning_rate": 0.001, + "loss": 3.1573, + "step": 16403 + }, + { + "epoch": 0.6939673407225654, + "grad_norm": 0.21394596993923187, + "learning_rate": 0.001, + "loss": 2.6982, + "step": 16404 + }, + { + "epoch": 0.6940096454860817, + "grad_norm": 0.22637422382831573, + "learning_rate": 0.001, + "loss": 2.2632, + "step": 16405 + }, + { + "epoch": 0.6940519502495981, + "grad_norm": 0.3572201728820801, + "learning_rate": 0.001, + "loss": 2.7936, + "step": 16406 + }, + { + "epoch": 0.6940942550131145, + "grad_norm": 17.344480514526367, + "learning_rate": 0.001, + "loss": 1.5021, + "step": 16407 + }, + { + "epoch": 0.6941365597766308, + "grad_norm": 0.19448307156562805, + "learning_rate": 0.001, + "loss": 1.9725, + "step": 16408 + }, + { + "epoch": 0.6941788645401472, + "grad_norm": 0.18592120707035065, + "learning_rate": 0.001, + "loss": 2.0079, + "step": 16409 + }, + { + "epoch": 0.6942211693036636, + "grad_norm": 0.17572908103466034, + "learning_rate": 0.001, + "loss": 1.9132, + "step": 16410 + }, + { + "epoch": 0.6942634740671799, + "grad_norm": 0.31208327412605286, + "learning_rate": 0.001, + "loss": 1.6902, + "step": 16411 + }, + { + "epoch": 0.6943057788306963, + "grad_norm": 0.2312842458486557, + "learning_rate": 0.001, + "loss": 2.3156, + "step": 16412 + }, + { + "epoch": 0.6943480835942127, + "grad_norm": 0.25776514410972595, + "learning_rate": 0.001, + "loss": 1.3374, + "step": 16413 + }, + { + "epoch": 0.694390388357729, + "grad_norm": 0.7767313718795776, + "learning_rate": 0.001, + "loss": 2.6159, + "step": 16414 + }, + { + "epoch": 0.6944326931212454, + "grad_norm": 1.3222384452819824, + "learning_rate": 0.001, + "loss": 3.7811, + "step": 16415 + }, + { + "epoch": 0.6944749978847619, + "grad_norm": 0.22960853576660156, + "learning_rate": 0.001, + "loss": 1.8167, + "step": 16416 + }, + { + "epoch": 0.6945173026482782, + "grad_norm": 0.20826157927513123, + "learning_rate": 0.001, + "loss": 2.0062, + "step": 16417 + }, + { + "epoch": 0.6945596074117946, + "grad_norm": 2.312390089035034, + "learning_rate": 0.001, + "loss": 3.3735, + "step": 16418 + }, + { + "epoch": 0.694601912175311, + "grad_norm": 3.1580910682678223, + "learning_rate": 0.001, + "loss": 2.463, + "step": 16419 + }, + { + "epoch": 0.6946442169388273, + "grad_norm": 0.25120091438293457, + "learning_rate": 0.001, + "loss": 2.9196, + "step": 16420 + }, + { + "epoch": 0.6946865217023437, + "grad_norm": 0.15511690080165863, + "learning_rate": 0.001, + "loss": 1.6251, + "step": 16421 + }, + { + "epoch": 0.6947288264658601, + "grad_norm": 0.18598857522010803, + "learning_rate": 0.001, + "loss": 2.9597, + "step": 16422 + }, + { + "epoch": 0.6947711312293764, + "grad_norm": 0.16862650215625763, + "learning_rate": 0.001, + "loss": 2.2061, + "step": 16423 + }, + { + "epoch": 0.6948134359928928, + "grad_norm": 0.18114817142486572, + "learning_rate": 0.001, + "loss": 2.222, + "step": 16424 + }, + { + "epoch": 0.6948557407564092, + "grad_norm": 0.19267553091049194, + "learning_rate": 0.001, + "loss": 2.3873, + "step": 16425 + }, + { + "epoch": 0.6948980455199255, + "grad_norm": 0.20283320546150208, + "learning_rate": 0.001, + "loss": 1.3301, + "step": 16426 + }, + { + "epoch": 0.6949403502834419, + "grad_norm": 2.084455728530884, + "learning_rate": 0.001, + "loss": 1.6759, + "step": 16427 + }, + { + "epoch": 0.6949826550469583, + "grad_norm": 2.8412082195281982, + "learning_rate": 0.001, + "loss": 3.4139, + "step": 16428 + }, + { + "epoch": 0.6950249598104746, + "grad_norm": 2.059316396713257, + "learning_rate": 0.001, + "loss": 1.9375, + "step": 16429 + }, + { + "epoch": 0.695067264573991, + "grad_norm": 0.3789178431034088, + "learning_rate": 0.001, + "loss": 2.9269, + "step": 16430 + }, + { + "epoch": 0.6951095693375074, + "grad_norm": 0.31577736139297485, + "learning_rate": 0.001, + "loss": 2.9995, + "step": 16431 + }, + { + "epoch": 0.6951518741010237, + "grad_norm": 2.173356533050537, + "learning_rate": 0.001, + "loss": 2.4088, + "step": 16432 + }, + { + "epoch": 0.6951941788645402, + "grad_norm": 0.2261185199022293, + "learning_rate": 0.001, + "loss": 2.0083, + "step": 16433 + }, + { + "epoch": 0.6952364836280566, + "grad_norm": 0.23543325066566467, + "learning_rate": 0.001, + "loss": 1.9962, + "step": 16434 + }, + { + "epoch": 0.6952787883915729, + "grad_norm": 0.23661349713802338, + "learning_rate": 0.001, + "loss": 2.1091, + "step": 16435 + }, + { + "epoch": 0.6953210931550893, + "grad_norm": 0.20180752873420715, + "learning_rate": 0.001, + "loss": 2.3918, + "step": 16436 + }, + { + "epoch": 0.6953633979186057, + "grad_norm": 0.261025995016098, + "learning_rate": 0.001, + "loss": 2.3989, + "step": 16437 + }, + { + "epoch": 0.695405702682122, + "grad_norm": 0.24305380880832672, + "learning_rate": 0.001, + "loss": 2.3177, + "step": 16438 + }, + { + "epoch": 0.6954480074456384, + "grad_norm": 0.1875523179769516, + "learning_rate": 0.001, + "loss": 2.4939, + "step": 16439 + }, + { + "epoch": 0.6954903122091548, + "grad_norm": 0.20536570250988007, + "learning_rate": 0.001, + "loss": 2.3507, + "step": 16440 + }, + { + "epoch": 0.6955326169726711, + "grad_norm": 2.672262668609619, + "learning_rate": 0.001, + "loss": 2.8801, + "step": 16441 + }, + { + "epoch": 0.6955749217361875, + "grad_norm": 0.16370277106761932, + "learning_rate": 0.001, + "loss": 2.0448, + "step": 16442 + }, + { + "epoch": 0.6956172264997038, + "grad_norm": 0.3966878056526184, + "learning_rate": 0.001, + "loss": 1.4291, + "step": 16443 + }, + { + "epoch": 0.6956595312632202, + "grad_norm": 6.427881717681885, + "learning_rate": 0.001, + "loss": 2.0735, + "step": 16444 + }, + { + "epoch": 0.6957018360267366, + "grad_norm": 0.9864132404327393, + "learning_rate": 0.001, + "loss": 1.9811, + "step": 16445 + }, + { + "epoch": 0.6957441407902529, + "grad_norm": 0.19832006096839905, + "learning_rate": 0.001, + "loss": 2.0114, + "step": 16446 + }, + { + "epoch": 0.6957864455537693, + "grad_norm": 0.18039336800575256, + "learning_rate": 0.001, + "loss": 1.4557, + "step": 16447 + }, + { + "epoch": 0.6958287503172857, + "grad_norm": 0.18589988350868225, + "learning_rate": 0.001, + "loss": 1.7354, + "step": 16448 + }, + { + "epoch": 0.695871055080802, + "grad_norm": 0.21510185301303864, + "learning_rate": 0.001, + "loss": 2.3479, + "step": 16449 + }, + { + "epoch": 0.6959133598443185, + "grad_norm": 0.19630225002765656, + "learning_rate": 0.001, + "loss": 2.2254, + "step": 16450 + }, + { + "epoch": 0.6959556646078349, + "grad_norm": 0.18261785805225372, + "learning_rate": 0.001, + "loss": 2.2902, + "step": 16451 + }, + { + "epoch": 0.6959979693713512, + "grad_norm": 0.20006346702575684, + "learning_rate": 0.001, + "loss": 2.0152, + "step": 16452 + }, + { + "epoch": 0.6960402741348676, + "grad_norm": 0.18040160834789276, + "learning_rate": 0.001, + "loss": 1.8994, + "step": 16453 + }, + { + "epoch": 0.696082578898384, + "grad_norm": 1.6500322818756104, + "learning_rate": 0.001, + "loss": 1.7275, + "step": 16454 + }, + { + "epoch": 0.6961248836619003, + "grad_norm": 0.2118813842535019, + "learning_rate": 0.001, + "loss": 3.1393, + "step": 16455 + }, + { + "epoch": 0.6961671884254167, + "grad_norm": 0.18187031149864197, + "learning_rate": 0.001, + "loss": 2.1885, + "step": 16456 + }, + { + "epoch": 0.6962094931889331, + "grad_norm": 0.21792373061180115, + "learning_rate": 0.001, + "loss": 2.5011, + "step": 16457 + }, + { + "epoch": 0.6962517979524494, + "grad_norm": 0.2024918496608734, + "learning_rate": 0.001, + "loss": 2.1317, + "step": 16458 + }, + { + "epoch": 0.6962941027159658, + "grad_norm": 0.1720605045557022, + "learning_rate": 0.001, + "loss": 2.5661, + "step": 16459 + }, + { + "epoch": 0.6963364074794822, + "grad_norm": 0.20015959441661835, + "learning_rate": 0.001, + "loss": 2.6472, + "step": 16460 + }, + { + "epoch": 0.6963787122429985, + "grad_norm": 0.3034919500350952, + "learning_rate": 0.001, + "loss": 1.9821, + "step": 16461 + }, + { + "epoch": 0.6964210170065149, + "grad_norm": 0.1722230315208435, + "learning_rate": 0.001, + "loss": 1.7016, + "step": 16462 + }, + { + "epoch": 0.6964633217700313, + "grad_norm": 0.17962494492530823, + "learning_rate": 0.001, + "loss": 1.6498, + "step": 16463 + }, + { + "epoch": 0.6965056265335476, + "grad_norm": 0.6122218370437622, + "learning_rate": 0.001, + "loss": 1.7189, + "step": 16464 + }, + { + "epoch": 0.696547931297064, + "grad_norm": 0.17944207787513733, + "learning_rate": 0.001, + "loss": 1.5481, + "step": 16465 + }, + { + "epoch": 0.6965902360605805, + "grad_norm": 0.16219043731689453, + "learning_rate": 0.001, + "loss": 2.1806, + "step": 16466 + }, + { + "epoch": 0.6966325408240968, + "grad_norm": 0.2081691324710846, + "learning_rate": 0.001, + "loss": 3.2181, + "step": 16467 + }, + { + "epoch": 0.6966748455876132, + "grad_norm": 0.16732138395309448, + "learning_rate": 0.001, + "loss": 3.0854, + "step": 16468 + }, + { + "epoch": 0.6967171503511296, + "grad_norm": 0.1735231727361679, + "learning_rate": 0.001, + "loss": 2.2234, + "step": 16469 + }, + { + "epoch": 0.6967594551146459, + "grad_norm": 0.7906176447868347, + "learning_rate": 0.001, + "loss": 2.4975, + "step": 16470 + }, + { + "epoch": 0.6968017598781623, + "grad_norm": 0.20566585659980774, + "learning_rate": 0.001, + "loss": 3.4062, + "step": 16471 + }, + { + "epoch": 0.6968440646416787, + "grad_norm": 0.1833547055721283, + "learning_rate": 0.001, + "loss": 2.034, + "step": 16472 + }, + { + "epoch": 0.696886369405195, + "grad_norm": 0.18477390706539154, + "learning_rate": 0.001, + "loss": 2.5579, + "step": 16473 + }, + { + "epoch": 0.6969286741687114, + "grad_norm": 0.19416743516921997, + "learning_rate": 0.001, + "loss": 2.2212, + "step": 16474 + }, + { + "epoch": 0.6969709789322278, + "grad_norm": 0.2078043520450592, + "learning_rate": 0.001, + "loss": 2.0475, + "step": 16475 + }, + { + "epoch": 0.6970132836957441, + "grad_norm": 0.1452716588973999, + "learning_rate": 0.001, + "loss": 1.6866, + "step": 16476 + }, + { + "epoch": 0.6970555884592605, + "grad_norm": 0.1788160353899002, + "learning_rate": 0.001, + "loss": 2.1014, + "step": 16477 + }, + { + "epoch": 0.6970978932227769, + "grad_norm": 0.1941988617181778, + "learning_rate": 0.001, + "loss": 2.2228, + "step": 16478 + }, + { + "epoch": 0.6971401979862932, + "grad_norm": 0.5075099468231201, + "learning_rate": 0.001, + "loss": 2.046, + "step": 16479 + }, + { + "epoch": 0.6971825027498096, + "grad_norm": 0.16728682816028595, + "learning_rate": 0.001, + "loss": 2.5115, + "step": 16480 + }, + { + "epoch": 0.697224807513326, + "grad_norm": 0.17837141454219818, + "learning_rate": 0.001, + "loss": 1.8429, + "step": 16481 + }, + { + "epoch": 0.6972671122768423, + "grad_norm": 0.14722979068756104, + "learning_rate": 0.001, + "loss": 1.6878, + "step": 16482 + }, + { + "epoch": 0.6973094170403588, + "grad_norm": 0.17707088589668274, + "learning_rate": 0.001, + "loss": 2.0991, + "step": 16483 + }, + { + "epoch": 0.6973517218038752, + "grad_norm": 0.20449602603912354, + "learning_rate": 0.001, + "loss": 1.4112, + "step": 16484 + }, + { + "epoch": 0.6973940265673915, + "grad_norm": 0.18787558376789093, + "learning_rate": 0.001, + "loss": 1.9535, + "step": 16485 + }, + { + "epoch": 0.6974363313309079, + "grad_norm": 3.8804171085357666, + "learning_rate": 0.001, + "loss": 2.9737, + "step": 16486 + }, + { + "epoch": 0.6974786360944243, + "grad_norm": 0.18371431529521942, + "learning_rate": 0.001, + "loss": 2.5309, + "step": 16487 + }, + { + "epoch": 0.6975209408579406, + "grad_norm": 0.154661625623703, + "learning_rate": 0.001, + "loss": 1.7516, + "step": 16488 + }, + { + "epoch": 0.697563245621457, + "grad_norm": 0.15226295590400696, + "learning_rate": 0.001, + "loss": 2.5867, + "step": 16489 + }, + { + "epoch": 0.6976055503849733, + "grad_norm": 9.875614166259766, + "learning_rate": 0.001, + "loss": 2.3198, + "step": 16490 + }, + { + "epoch": 0.6976478551484897, + "grad_norm": 0.18575811386108398, + "learning_rate": 0.001, + "loss": 2.5731, + "step": 16491 + }, + { + "epoch": 0.6976901599120061, + "grad_norm": 0.18501834571361542, + "learning_rate": 0.001, + "loss": 1.9908, + "step": 16492 + }, + { + "epoch": 0.6977324646755224, + "grad_norm": 0.17492198944091797, + "learning_rate": 0.001, + "loss": 2.1035, + "step": 16493 + }, + { + "epoch": 0.6977747694390388, + "grad_norm": 0.1972741186618805, + "learning_rate": 0.001, + "loss": 2.0895, + "step": 16494 + }, + { + "epoch": 0.6978170742025552, + "grad_norm": 0.1590324491262436, + "learning_rate": 0.001, + "loss": 1.8644, + "step": 16495 + }, + { + "epoch": 0.6978593789660715, + "grad_norm": 0.15750128030776978, + "learning_rate": 0.001, + "loss": 2.3938, + "step": 16496 + }, + { + "epoch": 0.6979016837295879, + "grad_norm": 0.18353481590747833, + "learning_rate": 0.001, + "loss": 1.9701, + "step": 16497 + }, + { + "epoch": 0.6979439884931044, + "grad_norm": 0.4982626438140869, + "learning_rate": 0.001, + "loss": 2.8423, + "step": 16498 + }, + { + "epoch": 0.6979862932566206, + "grad_norm": 0.5572125911712646, + "learning_rate": 0.001, + "loss": 1.6008, + "step": 16499 + }, + { + "epoch": 0.6980285980201371, + "grad_norm": 0.18385641276836395, + "learning_rate": 0.001, + "loss": 1.7157, + "step": 16500 + }, + { + "epoch": 0.6980709027836535, + "grad_norm": 0.20316894352436066, + "learning_rate": 0.001, + "loss": 3.3949, + "step": 16501 + }, + { + "epoch": 0.6981132075471698, + "grad_norm": 0.21854710578918457, + "learning_rate": 0.001, + "loss": 2.4985, + "step": 16502 + }, + { + "epoch": 0.6981555123106862, + "grad_norm": 0.2654995620250702, + "learning_rate": 0.001, + "loss": 2.077, + "step": 16503 + }, + { + "epoch": 0.6981978170742026, + "grad_norm": 0.15483231842517853, + "learning_rate": 0.001, + "loss": 2.5364, + "step": 16504 + }, + { + "epoch": 0.6982401218377189, + "grad_norm": 0.1526549905538559, + "learning_rate": 0.001, + "loss": 2.8325, + "step": 16505 + }, + { + "epoch": 0.6982824266012353, + "grad_norm": 0.18169742822647095, + "learning_rate": 0.001, + "loss": 2.6684, + "step": 16506 + }, + { + "epoch": 0.6983247313647517, + "grad_norm": 0.19134920835494995, + "learning_rate": 0.001, + "loss": 2.8449, + "step": 16507 + }, + { + "epoch": 0.698367036128268, + "grad_norm": 3.5741662979125977, + "learning_rate": 0.001, + "loss": 2.137, + "step": 16508 + }, + { + "epoch": 0.6984093408917844, + "grad_norm": 0.42037051916122437, + "learning_rate": 0.001, + "loss": 1.8969, + "step": 16509 + }, + { + "epoch": 0.6984516456553008, + "grad_norm": 0.1691959798336029, + "learning_rate": 0.001, + "loss": 1.5428, + "step": 16510 + }, + { + "epoch": 0.6984939504188171, + "grad_norm": 0.19847933948040009, + "learning_rate": 0.001, + "loss": 1.9573, + "step": 16511 + }, + { + "epoch": 0.6985362551823335, + "grad_norm": 0.8859964609146118, + "learning_rate": 0.001, + "loss": 2.9135, + "step": 16512 + }, + { + "epoch": 0.6985785599458499, + "grad_norm": 0.17541298270225525, + "learning_rate": 0.001, + "loss": 2.6869, + "step": 16513 + }, + { + "epoch": 0.6986208647093662, + "grad_norm": 0.1629028022289276, + "learning_rate": 0.001, + "loss": 2.2677, + "step": 16514 + }, + { + "epoch": 0.6986631694728827, + "grad_norm": 0.39890217781066895, + "learning_rate": 0.001, + "loss": 1.998, + "step": 16515 + }, + { + "epoch": 0.6987054742363991, + "grad_norm": 0.16391772031784058, + "learning_rate": 0.001, + "loss": 2.2344, + "step": 16516 + }, + { + "epoch": 0.6987477789999154, + "grad_norm": 0.20144188404083252, + "learning_rate": 0.001, + "loss": 2.3193, + "step": 16517 + }, + { + "epoch": 0.6987900837634318, + "grad_norm": 0.3551698923110962, + "learning_rate": 0.001, + "loss": 1.9678, + "step": 16518 + }, + { + "epoch": 0.6988323885269482, + "grad_norm": 3.031266450881958, + "learning_rate": 0.001, + "loss": 2.1602, + "step": 16519 + }, + { + "epoch": 0.6988746932904645, + "grad_norm": 0.21469104290008545, + "learning_rate": 0.001, + "loss": 2.2958, + "step": 16520 + }, + { + "epoch": 0.6989169980539809, + "grad_norm": 0.9255651831626892, + "learning_rate": 0.001, + "loss": 1.9495, + "step": 16521 + }, + { + "epoch": 0.6989593028174973, + "grad_norm": 0.1491638422012329, + "learning_rate": 0.001, + "loss": 1.8283, + "step": 16522 + }, + { + "epoch": 0.6990016075810136, + "grad_norm": 0.22015687823295593, + "learning_rate": 0.001, + "loss": 2.5978, + "step": 16523 + }, + { + "epoch": 0.69904391234453, + "grad_norm": 0.17289428412914276, + "learning_rate": 0.001, + "loss": 1.9345, + "step": 16524 + }, + { + "epoch": 0.6990862171080464, + "grad_norm": 0.1912022829055786, + "learning_rate": 0.001, + "loss": 2.4924, + "step": 16525 + }, + { + "epoch": 0.6991285218715627, + "grad_norm": 0.17688611149787903, + "learning_rate": 0.001, + "loss": 1.7264, + "step": 16526 + }, + { + "epoch": 0.6991708266350791, + "grad_norm": 0.1654282510280609, + "learning_rate": 0.001, + "loss": 2.0543, + "step": 16527 + }, + { + "epoch": 0.6992131313985955, + "grad_norm": 3.0969583988189697, + "learning_rate": 0.001, + "loss": 2.2927, + "step": 16528 + }, + { + "epoch": 0.6992554361621118, + "grad_norm": 0.2961462438106537, + "learning_rate": 0.001, + "loss": 1.3273, + "step": 16529 + }, + { + "epoch": 0.6992977409256282, + "grad_norm": 0.1830281913280487, + "learning_rate": 0.001, + "loss": 2.2964, + "step": 16530 + }, + { + "epoch": 0.6993400456891447, + "grad_norm": 0.19656279683113098, + "learning_rate": 0.001, + "loss": 2.521, + "step": 16531 + }, + { + "epoch": 0.699382350452661, + "grad_norm": 0.3053706884384155, + "learning_rate": 0.001, + "loss": 2.1476, + "step": 16532 + }, + { + "epoch": 0.6994246552161774, + "grad_norm": 0.3848680257797241, + "learning_rate": 0.001, + "loss": 2.0692, + "step": 16533 + }, + { + "epoch": 0.6994669599796937, + "grad_norm": 0.20205725729465485, + "learning_rate": 0.001, + "loss": 2.2957, + "step": 16534 + }, + { + "epoch": 0.6995092647432101, + "grad_norm": 0.21945300698280334, + "learning_rate": 0.001, + "loss": 1.8217, + "step": 16535 + }, + { + "epoch": 0.6995515695067265, + "grad_norm": 0.3286508321762085, + "learning_rate": 0.001, + "loss": 2.1553, + "step": 16536 + }, + { + "epoch": 0.6995938742702428, + "grad_norm": 0.17390255630016327, + "learning_rate": 0.001, + "loss": 1.8902, + "step": 16537 + }, + { + "epoch": 0.6996361790337592, + "grad_norm": 0.2187155783176422, + "learning_rate": 0.001, + "loss": 2.5251, + "step": 16538 + }, + { + "epoch": 0.6996784837972756, + "grad_norm": 0.1886243224143982, + "learning_rate": 0.001, + "loss": 1.8132, + "step": 16539 + }, + { + "epoch": 0.6997207885607919, + "grad_norm": 0.14639221131801605, + "learning_rate": 0.001, + "loss": 2.2848, + "step": 16540 + }, + { + "epoch": 0.6997630933243083, + "grad_norm": 0.2016509771347046, + "learning_rate": 0.001, + "loss": 2.3703, + "step": 16541 + }, + { + "epoch": 0.6998053980878247, + "grad_norm": 0.2064632922410965, + "learning_rate": 0.001, + "loss": 1.8111, + "step": 16542 + }, + { + "epoch": 0.699847702851341, + "grad_norm": 0.304959237575531, + "learning_rate": 0.001, + "loss": 1.8825, + "step": 16543 + }, + { + "epoch": 0.6998900076148574, + "grad_norm": 0.23426057398319244, + "learning_rate": 0.001, + "loss": 1.7511, + "step": 16544 + }, + { + "epoch": 0.6999323123783738, + "grad_norm": 0.15036675333976746, + "learning_rate": 0.001, + "loss": 1.4636, + "step": 16545 + }, + { + "epoch": 0.6999746171418901, + "grad_norm": 0.18853148818016052, + "learning_rate": 0.001, + "loss": 3.3999, + "step": 16546 + }, + { + "epoch": 0.7000169219054065, + "grad_norm": 0.20281235873699188, + "learning_rate": 0.001, + "loss": 3.1192, + "step": 16547 + }, + { + "epoch": 0.700059226668923, + "grad_norm": 0.1816108375787735, + "learning_rate": 0.001, + "loss": 2.7407, + "step": 16548 + }, + { + "epoch": 0.7001015314324393, + "grad_norm": 0.28820428252220154, + "learning_rate": 0.001, + "loss": 2.1106, + "step": 16549 + }, + { + "epoch": 0.7001438361959557, + "grad_norm": 0.1509302407503128, + "learning_rate": 0.001, + "loss": 1.7362, + "step": 16550 + }, + { + "epoch": 0.7001861409594721, + "grad_norm": 0.14989890158176422, + "learning_rate": 0.001, + "loss": 1.9436, + "step": 16551 + }, + { + "epoch": 0.7002284457229884, + "grad_norm": 0.17388369143009186, + "learning_rate": 0.001, + "loss": 1.4622, + "step": 16552 + }, + { + "epoch": 0.7002707504865048, + "grad_norm": 0.28890854120254517, + "learning_rate": 0.001, + "loss": 2.0939, + "step": 16553 + }, + { + "epoch": 0.7003130552500212, + "grad_norm": 0.1795245260000229, + "learning_rate": 0.001, + "loss": 1.9428, + "step": 16554 + }, + { + "epoch": 0.7003553600135375, + "grad_norm": 0.178712397813797, + "learning_rate": 0.001, + "loss": 1.6748, + "step": 16555 + }, + { + "epoch": 0.7003976647770539, + "grad_norm": 0.16266508400440216, + "learning_rate": 0.001, + "loss": 1.7466, + "step": 16556 + }, + { + "epoch": 0.7004399695405703, + "grad_norm": 0.18018513917922974, + "learning_rate": 0.001, + "loss": 1.5014, + "step": 16557 + }, + { + "epoch": 0.7004822743040866, + "grad_norm": 0.20177042484283447, + "learning_rate": 0.001, + "loss": 1.8159, + "step": 16558 + }, + { + "epoch": 0.700524579067603, + "grad_norm": 0.4257431626319885, + "learning_rate": 0.001, + "loss": 2.8354, + "step": 16559 + }, + { + "epoch": 0.7005668838311194, + "grad_norm": 0.3945598006248474, + "learning_rate": 0.001, + "loss": 2.5141, + "step": 16560 + }, + { + "epoch": 0.7006091885946357, + "grad_norm": 0.1487802118062973, + "learning_rate": 0.001, + "loss": 1.9653, + "step": 16561 + }, + { + "epoch": 0.7006514933581521, + "grad_norm": 0.1524672657251358, + "learning_rate": 0.001, + "loss": 2.3534, + "step": 16562 + }, + { + "epoch": 0.7006937981216685, + "grad_norm": 0.1571507751941681, + "learning_rate": 0.001, + "loss": 3.0126, + "step": 16563 + }, + { + "epoch": 0.7007361028851848, + "grad_norm": 1.0442800521850586, + "learning_rate": 0.001, + "loss": 2.0008, + "step": 16564 + }, + { + "epoch": 0.7007784076487013, + "grad_norm": 0.13998109102249146, + "learning_rate": 0.001, + "loss": 2.0278, + "step": 16565 + }, + { + "epoch": 0.7008207124122177, + "grad_norm": 0.1480410099029541, + "learning_rate": 0.001, + "loss": 1.6822, + "step": 16566 + }, + { + "epoch": 0.700863017175734, + "grad_norm": 0.2611854374408722, + "learning_rate": 0.001, + "loss": 1.9182, + "step": 16567 + }, + { + "epoch": 0.7009053219392504, + "grad_norm": 1.3647289276123047, + "learning_rate": 0.001, + "loss": 1.8359, + "step": 16568 + }, + { + "epoch": 0.7009476267027668, + "grad_norm": 0.15801845490932465, + "learning_rate": 0.001, + "loss": 1.8454, + "step": 16569 + }, + { + "epoch": 0.7009899314662831, + "grad_norm": 0.18349795043468475, + "learning_rate": 0.001, + "loss": 1.8095, + "step": 16570 + }, + { + "epoch": 0.7010322362297995, + "grad_norm": 0.16039691865444183, + "learning_rate": 0.001, + "loss": 2.1951, + "step": 16571 + }, + { + "epoch": 0.7010745409933159, + "grad_norm": 0.21815955638885498, + "learning_rate": 0.001, + "loss": 1.767, + "step": 16572 + }, + { + "epoch": 0.7011168457568322, + "grad_norm": 0.22475208342075348, + "learning_rate": 0.001, + "loss": 3.201, + "step": 16573 + }, + { + "epoch": 0.7011591505203486, + "grad_norm": 0.1533900797367096, + "learning_rate": 0.001, + "loss": 1.5355, + "step": 16574 + }, + { + "epoch": 0.701201455283865, + "grad_norm": 0.18003559112548828, + "learning_rate": 0.001, + "loss": 2.124, + "step": 16575 + }, + { + "epoch": 0.7012437600473813, + "grad_norm": 2.3861000537872314, + "learning_rate": 0.001, + "loss": 2.4133, + "step": 16576 + }, + { + "epoch": 0.7012860648108977, + "grad_norm": 0.18636606633663177, + "learning_rate": 0.001, + "loss": 1.7297, + "step": 16577 + }, + { + "epoch": 0.7013283695744141, + "grad_norm": 0.1709393411874771, + "learning_rate": 0.001, + "loss": 2.7423, + "step": 16578 + }, + { + "epoch": 0.7013706743379304, + "grad_norm": 0.1964663714170456, + "learning_rate": 0.001, + "loss": 2.2989, + "step": 16579 + }, + { + "epoch": 0.7014129791014468, + "grad_norm": 0.13408979773521423, + "learning_rate": 0.001, + "loss": 1.2903, + "step": 16580 + }, + { + "epoch": 0.7014552838649631, + "grad_norm": 0.17530257999897003, + "learning_rate": 0.001, + "loss": 2.0109, + "step": 16581 + }, + { + "epoch": 0.7014975886284796, + "grad_norm": 0.44883960485458374, + "learning_rate": 0.001, + "loss": 3.1007, + "step": 16582 + }, + { + "epoch": 0.701539893391996, + "grad_norm": 4.928041458129883, + "learning_rate": 0.001, + "loss": 2.222, + "step": 16583 + }, + { + "epoch": 0.7015821981555123, + "grad_norm": 0.1666925698518753, + "learning_rate": 0.001, + "loss": 2.2696, + "step": 16584 + }, + { + "epoch": 0.7016245029190287, + "grad_norm": 0.44181501865386963, + "learning_rate": 0.001, + "loss": 1.9347, + "step": 16585 + }, + { + "epoch": 0.7016668076825451, + "grad_norm": 0.17727473378181458, + "learning_rate": 0.001, + "loss": 2.1058, + "step": 16586 + }, + { + "epoch": 0.7017091124460614, + "grad_norm": 0.1318918913602829, + "learning_rate": 0.001, + "loss": 2.1093, + "step": 16587 + }, + { + "epoch": 0.7017514172095778, + "grad_norm": 0.1690230816602707, + "learning_rate": 0.001, + "loss": 2.2003, + "step": 16588 + }, + { + "epoch": 0.7017937219730942, + "grad_norm": 0.29973065853118896, + "learning_rate": 0.001, + "loss": 2.0176, + "step": 16589 + }, + { + "epoch": 0.7018360267366105, + "grad_norm": 0.5178530216217041, + "learning_rate": 0.001, + "loss": 3.1915, + "step": 16590 + }, + { + "epoch": 0.7018783315001269, + "grad_norm": 0.1572248637676239, + "learning_rate": 0.001, + "loss": 1.943, + "step": 16591 + }, + { + "epoch": 0.7019206362636433, + "grad_norm": 0.187464639544487, + "learning_rate": 0.001, + "loss": 2.0762, + "step": 16592 + }, + { + "epoch": 0.7019629410271596, + "grad_norm": 0.21802839636802673, + "learning_rate": 0.001, + "loss": 1.9435, + "step": 16593 + }, + { + "epoch": 0.702005245790676, + "grad_norm": 0.20468945801258087, + "learning_rate": 0.001, + "loss": 3.3978, + "step": 16594 + }, + { + "epoch": 0.7020475505541924, + "grad_norm": 2.5467371940612793, + "learning_rate": 0.001, + "loss": 2.1057, + "step": 16595 + }, + { + "epoch": 0.7020898553177087, + "grad_norm": 0.198502317070961, + "learning_rate": 0.001, + "loss": 1.8921, + "step": 16596 + }, + { + "epoch": 0.7021321600812251, + "grad_norm": 0.18070580065250397, + "learning_rate": 0.001, + "loss": 2.1183, + "step": 16597 + }, + { + "epoch": 0.7021744648447416, + "grad_norm": 0.4479145109653473, + "learning_rate": 0.001, + "loss": 2.0895, + "step": 16598 + }, + { + "epoch": 0.7022167696082579, + "grad_norm": 0.16668649017810822, + "learning_rate": 0.001, + "loss": 1.7609, + "step": 16599 + }, + { + "epoch": 0.7022590743717743, + "grad_norm": 0.17309440672397614, + "learning_rate": 0.001, + "loss": 1.5591, + "step": 16600 + }, + { + "epoch": 0.7023013791352907, + "grad_norm": 1.5597689151763916, + "learning_rate": 0.001, + "loss": 2.1839, + "step": 16601 + }, + { + "epoch": 0.702343683898807, + "grad_norm": 0.2020406872034073, + "learning_rate": 0.001, + "loss": 2.4463, + "step": 16602 + }, + { + "epoch": 0.7023859886623234, + "grad_norm": 0.17011041939258575, + "learning_rate": 0.001, + "loss": 2.2795, + "step": 16603 + }, + { + "epoch": 0.7024282934258398, + "grad_norm": 0.1457245647907257, + "learning_rate": 0.001, + "loss": 2.8202, + "step": 16604 + }, + { + "epoch": 0.7024705981893561, + "grad_norm": 0.1977822333574295, + "learning_rate": 0.001, + "loss": 2.1196, + "step": 16605 + }, + { + "epoch": 0.7025129029528725, + "grad_norm": 0.14688114821910858, + "learning_rate": 0.001, + "loss": 1.9252, + "step": 16606 + }, + { + "epoch": 0.7025552077163889, + "grad_norm": 0.308075487613678, + "learning_rate": 0.001, + "loss": 2.4381, + "step": 16607 + }, + { + "epoch": 0.7025975124799052, + "grad_norm": 0.1714773029088974, + "learning_rate": 0.001, + "loss": 1.698, + "step": 16608 + }, + { + "epoch": 0.7026398172434216, + "grad_norm": 0.18509937822818756, + "learning_rate": 0.001, + "loss": 2.676, + "step": 16609 + }, + { + "epoch": 0.702682122006938, + "grad_norm": 0.37178945541381836, + "learning_rate": 0.001, + "loss": 3.245, + "step": 16610 + }, + { + "epoch": 0.7027244267704543, + "grad_norm": 0.15938927233219147, + "learning_rate": 0.001, + "loss": 1.8862, + "step": 16611 + }, + { + "epoch": 0.7027667315339707, + "grad_norm": 0.25779882073402405, + "learning_rate": 0.001, + "loss": 3.5429, + "step": 16612 + }, + { + "epoch": 0.7028090362974871, + "grad_norm": 0.1717577427625656, + "learning_rate": 0.001, + "loss": 1.811, + "step": 16613 + }, + { + "epoch": 0.7028513410610034, + "grad_norm": 0.25885605812072754, + "learning_rate": 0.001, + "loss": 2.4719, + "step": 16614 + }, + { + "epoch": 0.7028936458245199, + "grad_norm": 0.15932494401931763, + "learning_rate": 0.001, + "loss": 2.5687, + "step": 16615 + }, + { + "epoch": 0.7029359505880363, + "grad_norm": 0.15366816520690918, + "learning_rate": 0.001, + "loss": 2.3769, + "step": 16616 + }, + { + "epoch": 0.7029782553515526, + "grad_norm": 0.4392596483230591, + "learning_rate": 0.001, + "loss": 2.3932, + "step": 16617 + }, + { + "epoch": 0.703020560115069, + "grad_norm": 0.18615837395191193, + "learning_rate": 0.001, + "loss": 2.1175, + "step": 16618 + }, + { + "epoch": 0.7030628648785854, + "grad_norm": 0.20948411524295807, + "learning_rate": 0.001, + "loss": 2.4846, + "step": 16619 + }, + { + "epoch": 0.7031051696421017, + "grad_norm": 0.7131271362304688, + "learning_rate": 0.001, + "loss": 2.928, + "step": 16620 + }, + { + "epoch": 0.7031474744056181, + "grad_norm": 0.23972147703170776, + "learning_rate": 0.001, + "loss": 2.3654, + "step": 16621 + }, + { + "epoch": 0.7031897791691345, + "grad_norm": 0.23003694415092468, + "learning_rate": 0.001, + "loss": 2.846, + "step": 16622 + }, + { + "epoch": 0.7032320839326508, + "grad_norm": 0.14880388975143433, + "learning_rate": 0.001, + "loss": 1.8853, + "step": 16623 + }, + { + "epoch": 0.7032743886961672, + "grad_norm": 0.1396287977695465, + "learning_rate": 0.001, + "loss": 2.4475, + "step": 16624 + }, + { + "epoch": 0.7033166934596835, + "grad_norm": 0.146676704287529, + "learning_rate": 0.001, + "loss": 2.1553, + "step": 16625 + }, + { + "epoch": 0.7033589982231999, + "grad_norm": 0.17937001585960388, + "learning_rate": 0.001, + "loss": 2.787, + "step": 16626 + }, + { + "epoch": 0.7034013029867163, + "grad_norm": 0.18604117631912231, + "learning_rate": 0.001, + "loss": 2.3641, + "step": 16627 + }, + { + "epoch": 0.7034436077502326, + "grad_norm": 3.127412796020508, + "learning_rate": 0.001, + "loss": 1.5014, + "step": 16628 + }, + { + "epoch": 0.703485912513749, + "grad_norm": 0.6706067323684692, + "learning_rate": 0.001, + "loss": 2.0265, + "step": 16629 + }, + { + "epoch": 0.7035282172772654, + "grad_norm": 0.13607746362686157, + "learning_rate": 0.001, + "loss": 1.5039, + "step": 16630 + }, + { + "epoch": 0.7035705220407817, + "grad_norm": 0.2080031931400299, + "learning_rate": 0.001, + "loss": 2.7118, + "step": 16631 + }, + { + "epoch": 0.7036128268042982, + "grad_norm": 0.2915956377983093, + "learning_rate": 0.001, + "loss": 1.9805, + "step": 16632 + }, + { + "epoch": 0.7036551315678146, + "grad_norm": 1.015211582183838, + "learning_rate": 0.001, + "loss": 2.4291, + "step": 16633 + }, + { + "epoch": 0.7036974363313309, + "grad_norm": 0.15686677396297455, + "learning_rate": 0.001, + "loss": 1.4743, + "step": 16634 + }, + { + "epoch": 0.7037397410948473, + "grad_norm": 0.3258562386035919, + "learning_rate": 0.001, + "loss": 2.1383, + "step": 16635 + }, + { + "epoch": 0.7037820458583637, + "grad_norm": 0.21752215921878815, + "learning_rate": 0.001, + "loss": 2.0518, + "step": 16636 + }, + { + "epoch": 0.70382435062188, + "grad_norm": 3.988706111907959, + "learning_rate": 0.001, + "loss": 3.0142, + "step": 16637 + }, + { + "epoch": 0.7038666553853964, + "grad_norm": 0.15791606903076172, + "learning_rate": 0.001, + "loss": 2.3269, + "step": 16638 + }, + { + "epoch": 0.7039089601489128, + "grad_norm": 0.16375815868377686, + "learning_rate": 0.001, + "loss": 1.5517, + "step": 16639 + }, + { + "epoch": 0.7039512649124291, + "grad_norm": 2.8950631618499756, + "learning_rate": 0.001, + "loss": 1.831, + "step": 16640 + }, + { + "epoch": 0.7039935696759455, + "grad_norm": 0.5850285887718201, + "learning_rate": 0.001, + "loss": 2.3522, + "step": 16641 + }, + { + "epoch": 0.7040358744394619, + "grad_norm": 0.25037726759910583, + "learning_rate": 0.001, + "loss": 2.2778, + "step": 16642 + }, + { + "epoch": 0.7040781792029782, + "grad_norm": 0.17441709339618683, + "learning_rate": 0.001, + "loss": 2.4771, + "step": 16643 + }, + { + "epoch": 0.7041204839664946, + "grad_norm": 0.15096983313560486, + "learning_rate": 0.001, + "loss": 1.4872, + "step": 16644 + }, + { + "epoch": 0.704162788730011, + "grad_norm": 1.1566590070724487, + "learning_rate": 0.001, + "loss": 1.9335, + "step": 16645 + }, + { + "epoch": 0.7042050934935273, + "grad_norm": 0.21050888299942017, + "learning_rate": 0.001, + "loss": 1.6326, + "step": 16646 + }, + { + "epoch": 0.7042473982570437, + "grad_norm": 0.24145705997943878, + "learning_rate": 0.001, + "loss": 3.1265, + "step": 16647 + }, + { + "epoch": 0.7042897030205602, + "grad_norm": 0.20239047706127167, + "learning_rate": 0.001, + "loss": 1.7814, + "step": 16648 + }, + { + "epoch": 0.7043320077840765, + "grad_norm": 0.5811108946800232, + "learning_rate": 0.001, + "loss": 2.4936, + "step": 16649 + }, + { + "epoch": 0.7043743125475929, + "grad_norm": 0.42122969031333923, + "learning_rate": 0.001, + "loss": 2.2731, + "step": 16650 + }, + { + "epoch": 0.7044166173111093, + "grad_norm": 0.3062237501144409, + "learning_rate": 0.001, + "loss": 1.9344, + "step": 16651 + }, + { + "epoch": 0.7044589220746256, + "grad_norm": 1.1125757694244385, + "learning_rate": 0.001, + "loss": 2.0033, + "step": 16652 + }, + { + "epoch": 0.704501226838142, + "grad_norm": 0.20193973183631897, + "learning_rate": 0.001, + "loss": 2.0509, + "step": 16653 + }, + { + "epoch": 0.7045435316016584, + "grad_norm": 5.243770122528076, + "learning_rate": 0.001, + "loss": 1.7479, + "step": 16654 + }, + { + "epoch": 0.7045858363651747, + "grad_norm": 0.1896519958972931, + "learning_rate": 0.001, + "loss": 1.9381, + "step": 16655 + }, + { + "epoch": 0.7046281411286911, + "grad_norm": 0.21094900369644165, + "learning_rate": 0.001, + "loss": 1.6246, + "step": 16656 + }, + { + "epoch": 0.7046704458922075, + "grad_norm": 0.24396421015262604, + "learning_rate": 0.001, + "loss": 3.4794, + "step": 16657 + }, + { + "epoch": 0.7047127506557238, + "grad_norm": 0.30104437470436096, + "learning_rate": 0.001, + "loss": 3.0927, + "step": 16658 + }, + { + "epoch": 0.7047550554192402, + "grad_norm": 0.4394260346889496, + "learning_rate": 0.001, + "loss": 2.2789, + "step": 16659 + }, + { + "epoch": 0.7047973601827566, + "grad_norm": 0.23813438415527344, + "learning_rate": 0.001, + "loss": 1.8201, + "step": 16660 + }, + { + "epoch": 0.7048396649462729, + "grad_norm": 0.2260584682226181, + "learning_rate": 0.001, + "loss": 2.8637, + "step": 16661 + }, + { + "epoch": 0.7048819697097893, + "grad_norm": 0.17535214126110077, + "learning_rate": 0.001, + "loss": 1.7847, + "step": 16662 + }, + { + "epoch": 0.7049242744733057, + "grad_norm": 0.342427134513855, + "learning_rate": 0.001, + "loss": 2.5649, + "step": 16663 + }, + { + "epoch": 0.704966579236822, + "grad_norm": 0.2312489151954651, + "learning_rate": 0.001, + "loss": 2.3301, + "step": 16664 + }, + { + "epoch": 0.7050088840003385, + "grad_norm": 0.21483004093170166, + "learning_rate": 0.001, + "loss": 2.0729, + "step": 16665 + }, + { + "epoch": 0.7050511887638549, + "grad_norm": 2.1229746341705322, + "learning_rate": 0.001, + "loss": 2.058, + "step": 16666 + }, + { + "epoch": 0.7050934935273712, + "grad_norm": 0.20138874650001526, + "learning_rate": 0.001, + "loss": 2.5361, + "step": 16667 + }, + { + "epoch": 0.7051357982908876, + "grad_norm": 0.1767900288105011, + "learning_rate": 0.001, + "loss": 2.0497, + "step": 16668 + }, + { + "epoch": 0.7051781030544039, + "grad_norm": 0.23017890751361847, + "learning_rate": 0.001, + "loss": 1.6551, + "step": 16669 + }, + { + "epoch": 0.7052204078179203, + "grad_norm": 0.3998371362686157, + "learning_rate": 0.001, + "loss": 2.6177, + "step": 16670 + }, + { + "epoch": 0.7052627125814367, + "grad_norm": 0.1956208050251007, + "learning_rate": 0.001, + "loss": 2.4729, + "step": 16671 + }, + { + "epoch": 0.705305017344953, + "grad_norm": 0.13961687684059143, + "learning_rate": 0.001, + "loss": 2.5719, + "step": 16672 + }, + { + "epoch": 0.7053473221084694, + "grad_norm": 0.581203281879425, + "learning_rate": 0.001, + "loss": 3.0076, + "step": 16673 + }, + { + "epoch": 0.7053896268719858, + "grad_norm": 0.18303988873958588, + "learning_rate": 0.001, + "loss": 1.9536, + "step": 16674 + }, + { + "epoch": 0.7054319316355021, + "grad_norm": 0.18635371327400208, + "learning_rate": 0.001, + "loss": 1.5738, + "step": 16675 + }, + { + "epoch": 0.7054742363990185, + "grad_norm": 0.17525175213813782, + "learning_rate": 0.001, + "loss": 2.3192, + "step": 16676 + }, + { + "epoch": 0.7055165411625349, + "grad_norm": 1.5950613021850586, + "learning_rate": 0.001, + "loss": 2.6333, + "step": 16677 + }, + { + "epoch": 0.7055588459260512, + "grad_norm": 0.2035263627767563, + "learning_rate": 0.001, + "loss": 1.9577, + "step": 16678 + }, + { + "epoch": 0.7056011506895676, + "grad_norm": 0.16556823253631592, + "learning_rate": 0.001, + "loss": 2.3302, + "step": 16679 + }, + { + "epoch": 0.705643455453084, + "grad_norm": 6.488144397735596, + "learning_rate": 0.001, + "loss": 2.0294, + "step": 16680 + }, + { + "epoch": 0.7056857602166003, + "grad_norm": 0.21068356931209564, + "learning_rate": 0.001, + "loss": 3.265, + "step": 16681 + }, + { + "epoch": 0.7057280649801168, + "grad_norm": 0.1684761941432953, + "learning_rate": 0.001, + "loss": 2.9163, + "step": 16682 + }, + { + "epoch": 0.7057703697436332, + "grad_norm": 2.740279197692871, + "learning_rate": 0.001, + "loss": 1.8363, + "step": 16683 + }, + { + "epoch": 0.7058126745071495, + "grad_norm": 69.86604309082031, + "learning_rate": 0.001, + "loss": 2.5254, + "step": 16684 + }, + { + "epoch": 0.7058549792706659, + "grad_norm": 0.2736927568912506, + "learning_rate": 0.001, + "loss": 2.2184, + "step": 16685 + }, + { + "epoch": 0.7058972840341823, + "grad_norm": 0.21192248165607452, + "learning_rate": 0.001, + "loss": 2.7321, + "step": 16686 + }, + { + "epoch": 0.7059395887976986, + "grad_norm": 0.3148384988307953, + "learning_rate": 0.001, + "loss": 1.9477, + "step": 16687 + }, + { + "epoch": 0.705981893561215, + "grad_norm": 0.24907708168029785, + "learning_rate": 0.001, + "loss": 2.7906, + "step": 16688 + }, + { + "epoch": 0.7060241983247314, + "grad_norm": 0.3226729929447174, + "learning_rate": 0.001, + "loss": 2.8049, + "step": 16689 + }, + { + "epoch": 0.7060665030882477, + "grad_norm": 3.953677177429199, + "learning_rate": 0.001, + "loss": 2.0564, + "step": 16690 + }, + { + "epoch": 0.7061088078517641, + "grad_norm": 0.24923500418663025, + "learning_rate": 0.001, + "loss": 1.3582, + "step": 16691 + }, + { + "epoch": 0.7061511126152805, + "grad_norm": 0.9128309488296509, + "learning_rate": 0.001, + "loss": 3.0, + "step": 16692 + }, + { + "epoch": 0.7061934173787968, + "grad_norm": 0.30432406067848206, + "learning_rate": 0.001, + "loss": 2.2832, + "step": 16693 + }, + { + "epoch": 0.7062357221423132, + "grad_norm": 0.27826377749443054, + "learning_rate": 0.001, + "loss": 3.6967, + "step": 16694 + }, + { + "epoch": 0.7062780269058296, + "grad_norm": 1.1862016916275024, + "learning_rate": 0.001, + "loss": 2.8519, + "step": 16695 + }, + { + "epoch": 0.7063203316693459, + "grad_norm": 0.27995389699935913, + "learning_rate": 0.001, + "loss": 2.0341, + "step": 16696 + }, + { + "epoch": 0.7063626364328623, + "grad_norm": 0.20460890233516693, + "learning_rate": 0.001, + "loss": 2.0071, + "step": 16697 + }, + { + "epoch": 0.7064049411963788, + "grad_norm": 0.20291836559772491, + "learning_rate": 0.001, + "loss": 2.3107, + "step": 16698 + }, + { + "epoch": 0.706447245959895, + "grad_norm": 0.2692174017429352, + "learning_rate": 0.001, + "loss": 2.5738, + "step": 16699 + }, + { + "epoch": 0.7064895507234115, + "grad_norm": 0.1654096394777298, + "learning_rate": 0.001, + "loss": 2.286, + "step": 16700 + }, + { + "epoch": 0.7065318554869279, + "grad_norm": 0.23862436413764954, + "learning_rate": 0.001, + "loss": 2.321, + "step": 16701 + }, + { + "epoch": 0.7065741602504442, + "grad_norm": 0.1421448439359665, + "learning_rate": 0.001, + "loss": 2.7367, + "step": 16702 + }, + { + "epoch": 0.7066164650139606, + "grad_norm": 0.5097910165786743, + "learning_rate": 0.001, + "loss": 2.5246, + "step": 16703 + }, + { + "epoch": 0.706658769777477, + "grad_norm": 0.19460159540176392, + "learning_rate": 0.001, + "loss": 2.2217, + "step": 16704 + }, + { + "epoch": 0.7067010745409933, + "grad_norm": 0.17561720311641693, + "learning_rate": 0.001, + "loss": 1.7901, + "step": 16705 + }, + { + "epoch": 0.7067433793045097, + "grad_norm": 0.16379234194755554, + "learning_rate": 0.001, + "loss": 2.0123, + "step": 16706 + }, + { + "epoch": 0.7067856840680261, + "grad_norm": 0.29534009099006653, + "learning_rate": 0.001, + "loss": 1.7291, + "step": 16707 + }, + { + "epoch": 0.7068279888315424, + "grad_norm": 0.16030588746070862, + "learning_rate": 0.001, + "loss": 2.4982, + "step": 16708 + }, + { + "epoch": 0.7068702935950588, + "grad_norm": 0.19317007064819336, + "learning_rate": 0.001, + "loss": 1.7916, + "step": 16709 + }, + { + "epoch": 0.7069125983585752, + "grad_norm": 16.575075149536133, + "learning_rate": 0.001, + "loss": 1.7214, + "step": 16710 + }, + { + "epoch": 0.7069549031220915, + "grad_norm": 0.6508944630622864, + "learning_rate": 0.001, + "loss": 3.3014, + "step": 16711 + }, + { + "epoch": 0.7069972078856079, + "grad_norm": 0.17529930174350739, + "learning_rate": 0.001, + "loss": 2.833, + "step": 16712 + }, + { + "epoch": 0.7070395126491243, + "grad_norm": 0.18948234617710114, + "learning_rate": 0.001, + "loss": 2.3365, + "step": 16713 + }, + { + "epoch": 0.7070818174126406, + "grad_norm": 0.19644437730312347, + "learning_rate": 0.001, + "loss": 3.7717, + "step": 16714 + }, + { + "epoch": 0.707124122176157, + "grad_norm": 0.36067622900009155, + "learning_rate": 0.001, + "loss": 2.016, + "step": 16715 + }, + { + "epoch": 0.7071664269396734, + "grad_norm": 0.1735169142484665, + "learning_rate": 0.001, + "loss": 2.1556, + "step": 16716 + }, + { + "epoch": 0.7072087317031898, + "grad_norm": 0.7500831484794617, + "learning_rate": 0.001, + "loss": 2.5418, + "step": 16717 + }, + { + "epoch": 0.7072510364667062, + "grad_norm": 0.1380908042192459, + "learning_rate": 0.001, + "loss": 2.0679, + "step": 16718 + }, + { + "epoch": 0.7072933412302225, + "grad_norm": 1.2244987487792969, + "learning_rate": 0.001, + "loss": 2.6043, + "step": 16719 + }, + { + "epoch": 0.7073356459937389, + "grad_norm": 0.14987333118915558, + "learning_rate": 0.001, + "loss": 2.4967, + "step": 16720 + }, + { + "epoch": 0.7073779507572553, + "grad_norm": 0.17000074684619904, + "learning_rate": 0.001, + "loss": 2.2771, + "step": 16721 + }, + { + "epoch": 0.7074202555207716, + "grad_norm": 2.4218010902404785, + "learning_rate": 0.001, + "loss": 2.0412, + "step": 16722 + }, + { + "epoch": 0.707462560284288, + "grad_norm": 0.24326300621032715, + "learning_rate": 0.001, + "loss": 2.3567, + "step": 16723 + }, + { + "epoch": 0.7075048650478044, + "grad_norm": 0.5091056227684021, + "learning_rate": 0.001, + "loss": 2.88, + "step": 16724 + }, + { + "epoch": 0.7075471698113207, + "grad_norm": 0.16985633969306946, + "learning_rate": 0.001, + "loss": 1.6534, + "step": 16725 + }, + { + "epoch": 0.7075894745748371, + "grad_norm": 0.20579244196414948, + "learning_rate": 0.001, + "loss": 1.6615, + "step": 16726 + }, + { + "epoch": 0.7076317793383535, + "grad_norm": 0.19666582345962524, + "learning_rate": 0.001, + "loss": 2.5436, + "step": 16727 + }, + { + "epoch": 0.7076740841018698, + "grad_norm": 0.1708710938692093, + "learning_rate": 0.001, + "loss": 1.5373, + "step": 16728 + }, + { + "epoch": 0.7077163888653862, + "grad_norm": 0.17626847326755524, + "learning_rate": 0.001, + "loss": 1.9095, + "step": 16729 + }, + { + "epoch": 0.7077586936289026, + "grad_norm": 0.19842320680618286, + "learning_rate": 0.001, + "loss": 2.8567, + "step": 16730 + }, + { + "epoch": 0.707800998392419, + "grad_norm": 0.17139360308647156, + "learning_rate": 0.001, + "loss": 2.3247, + "step": 16731 + }, + { + "epoch": 0.7078433031559354, + "grad_norm": 0.14886251091957092, + "learning_rate": 0.001, + "loss": 1.6924, + "step": 16732 + }, + { + "epoch": 0.7078856079194518, + "grad_norm": 0.182938814163208, + "learning_rate": 0.001, + "loss": 1.2942, + "step": 16733 + }, + { + "epoch": 0.7079279126829681, + "grad_norm": 0.21765394508838654, + "learning_rate": 0.001, + "loss": 2.992, + "step": 16734 + }, + { + "epoch": 0.7079702174464845, + "grad_norm": 49.08909606933594, + "learning_rate": 0.001, + "loss": 2.9176, + "step": 16735 + }, + { + "epoch": 0.7080125222100009, + "grad_norm": 0.15202230215072632, + "learning_rate": 0.001, + "loss": 1.8571, + "step": 16736 + }, + { + "epoch": 0.7080548269735172, + "grad_norm": 0.2932276129722595, + "learning_rate": 0.001, + "loss": 2.6166, + "step": 16737 + }, + { + "epoch": 0.7080971317370336, + "grad_norm": 0.15039752423763275, + "learning_rate": 0.001, + "loss": 1.5913, + "step": 16738 + }, + { + "epoch": 0.70813943650055, + "grad_norm": 0.1468665897846222, + "learning_rate": 0.001, + "loss": 2.4612, + "step": 16739 + }, + { + "epoch": 0.7081817412640663, + "grad_norm": 0.17689596116542816, + "learning_rate": 0.001, + "loss": 2.5566, + "step": 16740 + }, + { + "epoch": 0.7082240460275827, + "grad_norm": 0.22001603245735168, + "learning_rate": 0.001, + "loss": 2.2724, + "step": 16741 + }, + { + "epoch": 0.7082663507910991, + "grad_norm": 0.24464015662670135, + "learning_rate": 0.001, + "loss": 2.3597, + "step": 16742 + }, + { + "epoch": 0.7083086555546154, + "grad_norm": 0.1501077562570572, + "learning_rate": 0.001, + "loss": 2.4862, + "step": 16743 + }, + { + "epoch": 0.7083509603181318, + "grad_norm": 0.22457842528820038, + "learning_rate": 0.001, + "loss": 1.9533, + "step": 16744 + }, + { + "epoch": 0.7083932650816482, + "grad_norm": 0.48851919174194336, + "learning_rate": 0.001, + "loss": 1.8844, + "step": 16745 + }, + { + "epoch": 0.7084355698451645, + "grad_norm": 0.16567832231521606, + "learning_rate": 0.001, + "loss": 2.3328, + "step": 16746 + }, + { + "epoch": 0.708477874608681, + "grad_norm": 0.16666191816329956, + "learning_rate": 0.001, + "loss": 3.0872, + "step": 16747 + }, + { + "epoch": 0.7085201793721974, + "grad_norm": 0.1672544926404953, + "learning_rate": 0.001, + "loss": 2.2649, + "step": 16748 + }, + { + "epoch": 0.7085624841357137, + "grad_norm": 0.1558084934949875, + "learning_rate": 0.001, + "loss": 1.8319, + "step": 16749 + }, + { + "epoch": 0.7086047888992301, + "grad_norm": 0.14037245512008667, + "learning_rate": 0.001, + "loss": 2.3805, + "step": 16750 + }, + { + "epoch": 0.7086470936627465, + "grad_norm": 0.16425585746765137, + "learning_rate": 0.001, + "loss": 1.7784, + "step": 16751 + }, + { + "epoch": 0.7086893984262628, + "grad_norm": 0.7935085892677307, + "learning_rate": 0.001, + "loss": 2.1095, + "step": 16752 + }, + { + "epoch": 0.7087317031897792, + "grad_norm": 0.16057053208351135, + "learning_rate": 0.001, + "loss": 1.8058, + "step": 16753 + }, + { + "epoch": 0.7087740079532956, + "grad_norm": 0.16115844249725342, + "learning_rate": 0.001, + "loss": 1.6998, + "step": 16754 + }, + { + "epoch": 0.7088163127168119, + "grad_norm": 1.6587260961532593, + "learning_rate": 0.001, + "loss": 2.3739, + "step": 16755 + }, + { + "epoch": 0.7088586174803283, + "grad_norm": 0.11960441619157791, + "learning_rate": 0.001, + "loss": 1.2877, + "step": 16756 + }, + { + "epoch": 0.7089009222438447, + "grad_norm": 0.28413790464401245, + "learning_rate": 0.001, + "loss": 2.0965, + "step": 16757 + }, + { + "epoch": 0.708943227007361, + "grad_norm": 0.18682576715946198, + "learning_rate": 0.001, + "loss": 2.3906, + "step": 16758 + }, + { + "epoch": 0.7089855317708774, + "grad_norm": 0.15329422056674957, + "learning_rate": 0.001, + "loss": 2.0463, + "step": 16759 + }, + { + "epoch": 0.7090278365343937, + "grad_norm": 0.18216441571712494, + "learning_rate": 0.001, + "loss": 2.1109, + "step": 16760 + }, + { + "epoch": 0.7090701412979101, + "grad_norm": 0.38540518283843994, + "learning_rate": 0.001, + "loss": 1.7257, + "step": 16761 + }, + { + "epoch": 0.7091124460614265, + "grad_norm": 0.2819367051124573, + "learning_rate": 0.001, + "loss": 1.708, + "step": 16762 + }, + { + "epoch": 0.7091547508249428, + "grad_norm": 0.16718243062496185, + "learning_rate": 0.001, + "loss": 1.9525, + "step": 16763 + }, + { + "epoch": 0.7091970555884592, + "grad_norm": 0.2087067812681198, + "learning_rate": 0.001, + "loss": 2.1724, + "step": 16764 + }, + { + "epoch": 0.7092393603519757, + "grad_norm": 0.46061161160469055, + "learning_rate": 0.001, + "loss": 3.8056, + "step": 16765 + }, + { + "epoch": 0.709281665115492, + "grad_norm": 0.20006658136844635, + "learning_rate": 0.001, + "loss": 3.0642, + "step": 16766 + }, + { + "epoch": 0.7093239698790084, + "grad_norm": 0.15130923688411713, + "learning_rate": 0.001, + "loss": 2.0505, + "step": 16767 + }, + { + "epoch": 0.7093662746425248, + "grad_norm": 0.20808841288089752, + "learning_rate": 0.001, + "loss": 2.407, + "step": 16768 + }, + { + "epoch": 0.7094085794060411, + "grad_norm": 0.17218559980392456, + "learning_rate": 0.001, + "loss": 2.0906, + "step": 16769 + }, + { + "epoch": 0.7094508841695575, + "grad_norm": 0.14409933984279633, + "learning_rate": 0.001, + "loss": 1.674, + "step": 16770 + }, + { + "epoch": 0.7094931889330739, + "grad_norm": 0.19326108694076538, + "learning_rate": 0.001, + "loss": 2.979, + "step": 16771 + }, + { + "epoch": 0.7095354936965902, + "grad_norm": 0.1743086278438568, + "learning_rate": 0.001, + "loss": 3.2637, + "step": 16772 + }, + { + "epoch": 0.7095777984601066, + "grad_norm": 0.17006252706050873, + "learning_rate": 0.001, + "loss": 1.9035, + "step": 16773 + }, + { + "epoch": 0.709620103223623, + "grad_norm": 0.1594936102628708, + "learning_rate": 0.001, + "loss": 1.8383, + "step": 16774 + }, + { + "epoch": 0.7096624079871393, + "grad_norm": 0.1599188596010208, + "learning_rate": 0.001, + "loss": 2.2926, + "step": 16775 + }, + { + "epoch": 0.7097047127506557, + "grad_norm": 0.22632001340389252, + "learning_rate": 0.001, + "loss": 2.0174, + "step": 16776 + }, + { + "epoch": 0.7097470175141721, + "grad_norm": 0.151228666305542, + "learning_rate": 0.001, + "loss": 3.3681, + "step": 16777 + }, + { + "epoch": 0.7097893222776884, + "grad_norm": 0.154108464717865, + "learning_rate": 0.001, + "loss": 2.6175, + "step": 16778 + }, + { + "epoch": 0.7098316270412048, + "grad_norm": 0.23178794980049133, + "learning_rate": 0.001, + "loss": 2.6288, + "step": 16779 + }, + { + "epoch": 0.7098739318047212, + "grad_norm": 0.18430373072624207, + "learning_rate": 0.001, + "loss": 1.8585, + "step": 16780 + }, + { + "epoch": 0.7099162365682375, + "grad_norm": 1.811462640762329, + "learning_rate": 0.001, + "loss": 2.4217, + "step": 16781 + }, + { + "epoch": 0.709958541331754, + "grad_norm": 0.14598025381565094, + "learning_rate": 0.001, + "loss": 1.4804, + "step": 16782 + }, + { + "epoch": 0.7100008460952704, + "grad_norm": 0.162042036652565, + "learning_rate": 0.001, + "loss": 2.7988, + "step": 16783 + }, + { + "epoch": 0.7100431508587867, + "grad_norm": 0.24106714129447937, + "learning_rate": 0.001, + "loss": 2.4054, + "step": 16784 + }, + { + "epoch": 0.7100854556223031, + "grad_norm": 0.2140771746635437, + "learning_rate": 0.001, + "loss": 3.0376, + "step": 16785 + }, + { + "epoch": 0.7101277603858195, + "grad_norm": 0.24649621546268463, + "learning_rate": 0.001, + "loss": 1.7422, + "step": 16786 + }, + { + "epoch": 0.7101700651493358, + "grad_norm": 0.5776690244674683, + "learning_rate": 0.001, + "loss": 2.1208, + "step": 16787 + }, + { + "epoch": 0.7102123699128522, + "grad_norm": 0.21545056998729706, + "learning_rate": 0.001, + "loss": 2.2494, + "step": 16788 + }, + { + "epoch": 0.7102546746763686, + "grad_norm": 0.16463403403759003, + "learning_rate": 0.001, + "loss": 2.7351, + "step": 16789 + }, + { + "epoch": 0.7102969794398849, + "grad_norm": 0.20469972491264343, + "learning_rate": 0.001, + "loss": 2.3172, + "step": 16790 + }, + { + "epoch": 0.7103392842034013, + "grad_norm": 0.1745065301656723, + "learning_rate": 0.001, + "loss": 2.9764, + "step": 16791 + }, + { + "epoch": 0.7103815889669177, + "grad_norm": 0.15587376058101654, + "learning_rate": 0.001, + "loss": 2.395, + "step": 16792 + }, + { + "epoch": 0.710423893730434, + "grad_norm": 0.21353621780872345, + "learning_rate": 0.001, + "loss": 2.1667, + "step": 16793 + }, + { + "epoch": 0.7104661984939504, + "grad_norm": 1.4838371276855469, + "learning_rate": 0.001, + "loss": 2.7724, + "step": 16794 + }, + { + "epoch": 0.7105085032574668, + "grad_norm": 0.18408633768558502, + "learning_rate": 0.001, + "loss": 1.7745, + "step": 16795 + }, + { + "epoch": 0.7105508080209831, + "grad_norm": 0.20254147052764893, + "learning_rate": 0.001, + "loss": 3.6371, + "step": 16796 + }, + { + "epoch": 0.7105931127844995, + "grad_norm": 0.1831352263689041, + "learning_rate": 0.001, + "loss": 1.6486, + "step": 16797 + }, + { + "epoch": 0.710635417548016, + "grad_norm": 0.18303878605365753, + "learning_rate": 0.001, + "loss": 1.9915, + "step": 16798 + }, + { + "epoch": 0.7106777223115323, + "grad_norm": 8.696745872497559, + "learning_rate": 0.001, + "loss": 2.9612, + "step": 16799 + }, + { + "epoch": 0.7107200270750487, + "grad_norm": 0.16578525304794312, + "learning_rate": 0.001, + "loss": 2.5333, + "step": 16800 + }, + { + "epoch": 0.7107623318385651, + "grad_norm": 0.19835564494132996, + "learning_rate": 0.001, + "loss": 2.5376, + "step": 16801 + }, + { + "epoch": 0.7108046366020814, + "grad_norm": 0.18402199447155, + "learning_rate": 0.001, + "loss": 2.1194, + "step": 16802 + }, + { + "epoch": 0.7108469413655978, + "grad_norm": 0.39828217029571533, + "learning_rate": 0.001, + "loss": 2.7564, + "step": 16803 + }, + { + "epoch": 0.7108892461291141, + "grad_norm": 0.17361795902252197, + "learning_rate": 0.001, + "loss": 2.9527, + "step": 16804 + }, + { + "epoch": 0.7109315508926305, + "grad_norm": 0.2756977379322052, + "learning_rate": 0.001, + "loss": 2.1633, + "step": 16805 + }, + { + "epoch": 0.7109738556561469, + "grad_norm": 0.22067061066627502, + "learning_rate": 0.001, + "loss": 2.6841, + "step": 16806 + }, + { + "epoch": 0.7110161604196632, + "grad_norm": 0.1891237497329712, + "learning_rate": 0.001, + "loss": 1.4142, + "step": 16807 + }, + { + "epoch": 0.7110584651831796, + "grad_norm": 1.683262586593628, + "learning_rate": 0.001, + "loss": 2.0285, + "step": 16808 + }, + { + "epoch": 0.711100769946696, + "grad_norm": 0.16583381593227386, + "learning_rate": 0.001, + "loss": 1.7941, + "step": 16809 + }, + { + "epoch": 0.7111430747102123, + "grad_norm": 2.930053472518921, + "learning_rate": 0.001, + "loss": 1.8499, + "step": 16810 + }, + { + "epoch": 0.7111853794737287, + "grad_norm": 0.1657000333070755, + "learning_rate": 0.001, + "loss": 1.7437, + "step": 16811 + }, + { + "epoch": 0.7112276842372451, + "grad_norm": 0.16928398609161377, + "learning_rate": 0.001, + "loss": 1.7506, + "step": 16812 + }, + { + "epoch": 0.7112699890007614, + "grad_norm": 0.1839533895254135, + "learning_rate": 0.001, + "loss": 1.7639, + "step": 16813 + }, + { + "epoch": 0.7113122937642778, + "grad_norm": 0.15033124387264252, + "learning_rate": 0.001, + "loss": 2.3976, + "step": 16814 + }, + { + "epoch": 0.7113545985277943, + "grad_norm": 0.21158377826213837, + "learning_rate": 0.001, + "loss": 2.7739, + "step": 16815 + }, + { + "epoch": 0.7113969032913106, + "grad_norm": 0.14168700575828552, + "learning_rate": 0.001, + "loss": 1.7022, + "step": 16816 + }, + { + "epoch": 0.711439208054827, + "grad_norm": 0.17619457840919495, + "learning_rate": 0.001, + "loss": 3.0258, + "step": 16817 + }, + { + "epoch": 0.7114815128183434, + "grad_norm": 2.186882972717285, + "learning_rate": 0.001, + "loss": 1.669, + "step": 16818 + }, + { + "epoch": 0.7115238175818597, + "grad_norm": 0.37411370873451233, + "learning_rate": 0.001, + "loss": 1.7232, + "step": 16819 + }, + { + "epoch": 0.7115661223453761, + "grad_norm": 0.16331219673156738, + "learning_rate": 0.001, + "loss": 1.4706, + "step": 16820 + }, + { + "epoch": 0.7116084271088925, + "grad_norm": 0.9669393301010132, + "learning_rate": 0.001, + "loss": 2.8231, + "step": 16821 + }, + { + "epoch": 0.7116507318724088, + "grad_norm": 1.7849273681640625, + "learning_rate": 0.001, + "loss": 2.0226, + "step": 16822 + }, + { + "epoch": 0.7116930366359252, + "grad_norm": 5.401309013366699, + "learning_rate": 0.001, + "loss": 2.1295, + "step": 16823 + }, + { + "epoch": 0.7117353413994416, + "grad_norm": 4.94477653503418, + "learning_rate": 0.001, + "loss": 3.141, + "step": 16824 + }, + { + "epoch": 0.7117776461629579, + "grad_norm": 0.1646738350391388, + "learning_rate": 0.001, + "loss": 2.3503, + "step": 16825 + }, + { + "epoch": 0.7118199509264743, + "grad_norm": 0.2677449584007263, + "learning_rate": 0.001, + "loss": 1.6298, + "step": 16826 + }, + { + "epoch": 0.7118622556899907, + "grad_norm": 0.19275929033756256, + "learning_rate": 0.001, + "loss": 1.8281, + "step": 16827 + }, + { + "epoch": 0.711904560453507, + "grad_norm": 0.9130849242210388, + "learning_rate": 0.001, + "loss": 2.2454, + "step": 16828 + }, + { + "epoch": 0.7119468652170234, + "grad_norm": 0.16374343633651733, + "learning_rate": 0.001, + "loss": 1.355, + "step": 16829 + }, + { + "epoch": 0.7119891699805398, + "grad_norm": 0.19432541728019714, + "learning_rate": 0.001, + "loss": 3.3869, + "step": 16830 + }, + { + "epoch": 0.7120314747440561, + "grad_norm": 0.8363915681838989, + "learning_rate": 0.001, + "loss": 2.2074, + "step": 16831 + }, + { + "epoch": 0.7120737795075726, + "grad_norm": 0.18882228434085846, + "learning_rate": 0.001, + "loss": 2.0904, + "step": 16832 + }, + { + "epoch": 0.712116084271089, + "grad_norm": 0.16533978283405304, + "learning_rate": 0.001, + "loss": 1.8895, + "step": 16833 + }, + { + "epoch": 0.7121583890346053, + "grad_norm": 2.9509565830230713, + "learning_rate": 0.001, + "loss": 2.142, + "step": 16834 + }, + { + "epoch": 0.7122006937981217, + "grad_norm": 0.2698115408420563, + "learning_rate": 0.001, + "loss": 3.958, + "step": 16835 + }, + { + "epoch": 0.7122429985616381, + "grad_norm": 0.1884506493806839, + "learning_rate": 0.001, + "loss": 2.1897, + "step": 16836 + }, + { + "epoch": 0.7122853033251544, + "grad_norm": 59.53257751464844, + "learning_rate": 0.001, + "loss": 2.1418, + "step": 16837 + }, + { + "epoch": 0.7123276080886708, + "grad_norm": 4.942716121673584, + "learning_rate": 0.001, + "loss": 2.5548, + "step": 16838 + }, + { + "epoch": 0.7123699128521872, + "grad_norm": 0.3117887079715729, + "learning_rate": 0.001, + "loss": 1.6967, + "step": 16839 + }, + { + "epoch": 0.7124122176157035, + "grad_norm": 0.2594894766807556, + "learning_rate": 0.001, + "loss": 1.6713, + "step": 16840 + }, + { + "epoch": 0.7124545223792199, + "grad_norm": 0.37897738814353943, + "learning_rate": 0.001, + "loss": 1.9001, + "step": 16841 + }, + { + "epoch": 0.7124968271427363, + "grad_norm": 0.3171361982822418, + "learning_rate": 0.001, + "loss": 2.6247, + "step": 16842 + }, + { + "epoch": 0.7125391319062526, + "grad_norm": 0.15616539120674133, + "learning_rate": 0.001, + "loss": 2.1841, + "step": 16843 + }, + { + "epoch": 0.712581436669769, + "grad_norm": 0.363093763589859, + "learning_rate": 0.001, + "loss": 1.9907, + "step": 16844 + }, + { + "epoch": 0.7126237414332854, + "grad_norm": 0.20196084678173065, + "learning_rate": 0.001, + "loss": 2.3817, + "step": 16845 + }, + { + "epoch": 0.7126660461968017, + "grad_norm": 0.19358810782432556, + "learning_rate": 0.001, + "loss": 1.6368, + "step": 16846 + }, + { + "epoch": 0.7127083509603181, + "grad_norm": 0.24825577437877655, + "learning_rate": 0.001, + "loss": 2.6596, + "step": 16847 + }, + { + "epoch": 0.7127506557238346, + "grad_norm": 0.20185577869415283, + "learning_rate": 0.001, + "loss": 2.1362, + "step": 16848 + }, + { + "epoch": 0.7127929604873509, + "grad_norm": 0.3401681184768677, + "learning_rate": 0.001, + "loss": 1.8744, + "step": 16849 + }, + { + "epoch": 0.7128352652508673, + "grad_norm": 0.2052794247865677, + "learning_rate": 0.001, + "loss": 1.8685, + "step": 16850 + }, + { + "epoch": 0.7128775700143836, + "grad_norm": 0.17829133570194244, + "learning_rate": 0.001, + "loss": 2.681, + "step": 16851 + }, + { + "epoch": 0.7129198747779, + "grad_norm": 0.2980566918849945, + "learning_rate": 0.001, + "loss": 2.1763, + "step": 16852 + }, + { + "epoch": 0.7129621795414164, + "grad_norm": 0.22543774545192719, + "learning_rate": 0.001, + "loss": 1.9288, + "step": 16853 + }, + { + "epoch": 0.7130044843049327, + "grad_norm": 1.7789720296859741, + "learning_rate": 0.001, + "loss": 2.1398, + "step": 16854 + }, + { + "epoch": 0.7130467890684491, + "grad_norm": 0.2461077868938446, + "learning_rate": 0.001, + "loss": 1.7258, + "step": 16855 + }, + { + "epoch": 0.7130890938319655, + "grad_norm": 0.1980772763490677, + "learning_rate": 0.001, + "loss": 2.8052, + "step": 16856 + }, + { + "epoch": 0.7131313985954818, + "grad_norm": 0.19356103241443634, + "learning_rate": 0.001, + "loss": 2.0402, + "step": 16857 + }, + { + "epoch": 0.7131737033589982, + "grad_norm": 0.21868008375167847, + "learning_rate": 0.001, + "loss": 2.9919, + "step": 16858 + }, + { + "epoch": 0.7132160081225146, + "grad_norm": 0.22265969216823578, + "learning_rate": 0.001, + "loss": 1.8722, + "step": 16859 + }, + { + "epoch": 0.7132583128860309, + "grad_norm": 0.9610694646835327, + "learning_rate": 0.001, + "loss": 1.7194, + "step": 16860 + }, + { + "epoch": 0.7133006176495473, + "grad_norm": 1.113793134689331, + "learning_rate": 0.001, + "loss": 2.0243, + "step": 16861 + }, + { + "epoch": 0.7133429224130637, + "grad_norm": 3.1779723167419434, + "learning_rate": 0.001, + "loss": 1.7827, + "step": 16862 + }, + { + "epoch": 0.71338522717658, + "grad_norm": 0.1677677184343338, + "learning_rate": 0.001, + "loss": 2.1601, + "step": 16863 + }, + { + "epoch": 0.7134275319400964, + "grad_norm": 0.7339451313018799, + "learning_rate": 0.001, + "loss": 1.9814, + "step": 16864 + }, + { + "epoch": 0.7134698367036129, + "grad_norm": 0.4018705487251282, + "learning_rate": 0.001, + "loss": 2.3781, + "step": 16865 + }, + { + "epoch": 0.7135121414671292, + "grad_norm": 9.572480201721191, + "learning_rate": 0.001, + "loss": 1.9837, + "step": 16866 + }, + { + "epoch": 0.7135544462306456, + "grad_norm": 0.26376014947891235, + "learning_rate": 0.001, + "loss": 1.7696, + "step": 16867 + }, + { + "epoch": 0.713596750994162, + "grad_norm": 1.0289099216461182, + "learning_rate": 0.001, + "loss": 2.5283, + "step": 16868 + }, + { + "epoch": 0.7136390557576783, + "grad_norm": 0.1844884753227234, + "learning_rate": 0.001, + "loss": 1.9274, + "step": 16869 + }, + { + "epoch": 0.7136813605211947, + "grad_norm": 0.19893215596675873, + "learning_rate": 0.001, + "loss": 2.1862, + "step": 16870 + }, + { + "epoch": 0.7137236652847111, + "grad_norm": 0.239218607544899, + "learning_rate": 0.001, + "loss": 1.6428, + "step": 16871 + }, + { + "epoch": 0.7137659700482274, + "grad_norm": 0.3456837832927704, + "learning_rate": 0.001, + "loss": 1.6866, + "step": 16872 + }, + { + "epoch": 0.7138082748117438, + "grad_norm": 0.20002485811710358, + "learning_rate": 0.001, + "loss": 2.0121, + "step": 16873 + }, + { + "epoch": 0.7138505795752602, + "grad_norm": 0.16409829258918762, + "learning_rate": 0.001, + "loss": 2.7127, + "step": 16874 + }, + { + "epoch": 0.7138928843387765, + "grad_norm": 0.2976275086402893, + "learning_rate": 0.001, + "loss": 2.683, + "step": 16875 + }, + { + "epoch": 0.7139351891022929, + "grad_norm": 0.1813412308692932, + "learning_rate": 0.001, + "loss": 1.4256, + "step": 16876 + }, + { + "epoch": 0.7139774938658093, + "grad_norm": 0.28270435333251953, + "learning_rate": 0.001, + "loss": 1.8264, + "step": 16877 + }, + { + "epoch": 0.7140197986293256, + "grad_norm": 0.5542454719543457, + "learning_rate": 0.001, + "loss": 2.6197, + "step": 16878 + }, + { + "epoch": 0.714062103392842, + "grad_norm": 0.17437031865119934, + "learning_rate": 0.001, + "loss": 2.1137, + "step": 16879 + }, + { + "epoch": 0.7141044081563585, + "grad_norm": 0.3185507357120514, + "learning_rate": 0.001, + "loss": 2.0136, + "step": 16880 + }, + { + "epoch": 0.7141467129198747, + "grad_norm": 0.8088506460189819, + "learning_rate": 0.001, + "loss": 3.2072, + "step": 16881 + }, + { + "epoch": 0.7141890176833912, + "grad_norm": 0.15809717774391174, + "learning_rate": 0.001, + "loss": 1.428, + "step": 16882 + }, + { + "epoch": 0.7142313224469076, + "grad_norm": 0.16368800401687622, + "learning_rate": 0.001, + "loss": 2.1529, + "step": 16883 + }, + { + "epoch": 0.7142736272104239, + "grad_norm": 0.2843414843082428, + "learning_rate": 0.001, + "loss": 1.4441, + "step": 16884 + }, + { + "epoch": 0.7143159319739403, + "grad_norm": 0.20295199751853943, + "learning_rate": 0.001, + "loss": 2.4748, + "step": 16885 + }, + { + "epoch": 0.7143582367374567, + "grad_norm": 0.15212145447731018, + "learning_rate": 0.001, + "loss": 2.8044, + "step": 16886 + }, + { + "epoch": 0.714400541500973, + "grad_norm": 0.1592477262020111, + "learning_rate": 0.001, + "loss": 2.3869, + "step": 16887 + }, + { + "epoch": 0.7144428462644894, + "grad_norm": 0.18488934636116028, + "learning_rate": 0.001, + "loss": 2.8091, + "step": 16888 + }, + { + "epoch": 0.7144851510280058, + "grad_norm": 0.2524031698703766, + "learning_rate": 0.001, + "loss": 3.1258, + "step": 16889 + }, + { + "epoch": 0.7145274557915221, + "grad_norm": 0.19174166023731232, + "learning_rate": 0.001, + "loss": 2.3683, + "step": 16890 + }, + { + "epoch": 0.7145697605550385, + "grad_norm": 0.7119539380073547, + "learning_rate": 0.001, + "loss": 1.8931, + "step": 16891 + }, + { + "epoch": 0.7146120653185549, + "grad_norm": 0.14731153845787048, + "learning_rate": 0.001, + "loss": 3.5619, + "step": 16892 + }, + { + "epoch": 0.7146543700820712, + "grad_norm": 0.25515803694725037, + "learning_rate": 0.001, + "loss": 1.8279, + "step": 16893 + }, + { + "epoch": 0.7146966748455876, + "grad_norm": 0.1940464973449707, + "learning_rate": 0.001, + "loss": 2.2902, + "step": 16894 + }, + { + "epoch": 0.7147389796091039, + "grad_norm": 0.3229266405105591, + "learning_rate": 0.001, + "loss": 1.7167, + "step": 16895 + }, + { + "epoch": 0.7147812843726203, + "grad_norm": 0.166666179895401, + "learning_rate": 0.001, + "loss": 2.0199, + "step": 16896 + }, + { + "epoch": 0.7148235891361367, + "grad_norm": 0.14919742941856384, + "learning_rate": 0.001, + "loss": 1.6522, + "step": 16897 + }, + { + "epoch": 0.714865893899653, + "grad_norm": 0.1620715856552124, + "learning_rate": 0.001, + "loss": 2.5642, + "step": 16898 + }, + { + "epoch": 0.7149081986631695, + "grad_norm": 1.447795033454895, + "learning_rate": 0.001, + "loss": 2.2744, + "step": 16899 + }, + { + "epoch": 0.7149505034266859, + "grad_norm": 0.4487624764442444, + "learning_rate": 0.001, + "loss": 2.0419, + "step": 16900 + }, + { + "epoch": 0.7149928081902022, + "grad_norm": 0.21174852550029755, + "learning_rate": 0.001, + "loss": 3.2943, + "step": 16901 + }, + { + "epoch": 0.7150351129537186, + "grad_norm": 0.16627584397792816, + "learning_rate": 0.001, + "loss": 2.8095, + "step": 16902 + }, + { + "epoch": 0.715077417717235, + "grad_norm": 0.6215983629226685, + "learning_rate": 0.001, + "loss": 1.8961, + "step": 16903 + }, + { + "epoch": 0.7151197224807513, + "grad_norm": 0.3979867696762085, + "learning_rate": 0.001, + "loss": 2.1163, + "step": 16904 + }, + { + "epoch": 0.7151620272442677, + "grad_norm": 0.17654618620872498, + "learning_rate": 0.001, + "loss": 2.0656, + "step": 16905 + }, + { + "epoch": 0.7152043320077841, + "grad_norm": 0.15997301042079926, + "learning_rate": 0.001, + "loss": 1.4513, + "step": 16906 + }, + { + "epoch": 0.7152466367713004, + "grad_norm": 0.21495701372623444, + "learning_rate": 0.001, + "loss": 2.0596, + "step": 16907 + }, + { + "epoch": 0.7152889415348168, + "grad_norm": 0.3844386339187622, + "learning_rate": 0.001, + "loss": 1.82, + "step": 16908 + }, + { + "epoch": 0.7153312462983332, + "grad_norm": 1.2073756456375122, + "learning_rate": 0.001, + "loss": 1.7744, + "step": 16909 + }, + { + "epoch": 0.7153735510618495, + "grad_norm": 0.272298663854599, + "learning_rate": 0.001, + "loss": 3.1949, + "step": 16910 + }, + { + "epoch": 0.7154158558253659, + "grad_norm": 0.18787261843681335, + "learning_rate": 0.001, + "loss": 2.01, + "step": 16911 + }, + { + "epoch": 0.7154581605888823, + "grad_norm": 0.16972093284130096, + "learning_rate": 0.001, + "loss": 2.3674, + "step": 16912 + }, + { + "epoch": 0.7155004653523986, + "grad_norm": 0.20305274426937103, + "learning_rate": 0.001, + "loss": 2.1895, + "step": 16913 + }, + { + "epoch": 0.715542770115915, + "grad_norm": 0.16325430572032928, + "learning_rate": 0.001, + "loss": 1.6928, + "step": 16914 + }, + { + "epoch": 0.7155850748794315, + "grad_norm": 0.17961114645004272, + "learning_rate": 0.001, + "loss": 2.2898, + "step": 16915 + }, + { + "epoch": 0.7156273796429478, + "grad_norm": 0.15911836922168732, + "learning_rate": 0.001, + "loss": 2.2762, + "step": 16916 + }, + { + "epoch": 0.7156696844064642, + "grad_norm": 0.1571052074432373, + "learning_rate": 0.001, + "loss": 2.0235, + "step": 16917 + }, + { + "epoch": 0.7157119891699806, + "grad_norm": 0.17579875886440277, + "learning_rate": 0.001, + "loss": 2.8934, + "step": 16918 + }, + { + "epoch": 0.7157542939334969, + "grad_norm": 0.24277016520500183, + "learning_rate": 0.001, + "loss": 3.4171, + "step": 16919 + }, + { + "epoch": 0.7157965986970133, + "grad_norm": 1.3790236711502075, + "learning_rate": 0.001, + "loss": 3.0564, + "step": 16920 + }, + { + "epoch": 0.7158389034605297, + "grad_norm": 0.1847652643918991, + "learning_rate": 0.001, + "loss": 2.5091, + "step": 16921 + }, + { + "epoch": 0.715881208224046, + "grad_norm": 0.18262675404548645, + "learning_rate": 0.001, + "loss": 2.8952, + "step": 16922 + }, + { + "epoch": 0.7159235129875624, + "grad_norm": 0.5282437205314636, + "learning_rate": 0.001, + "loss": 2.3116, + "step": 16923 + }, + { + "epoch": 0.7159658177510788, + "grad_norm": 0.17553091049194336, + "learning_rate": 0.001, + "loss": 1.662, + "step": 16924 + }, + { + "epoch": 0.7160081225145951, + "grad_norm": 0.15575726330280304, + "learning_rate": 0.001, + "loss": 1.5341, + "step": 16925 + }, + { + "epoch": 0.7160504272781115, + "grad_norm": 0.1408965289592743, + "learning_rate": 0.001, + "loss": 1.2918, + "step": 16926 + }, + { + "epoch": 0.7160927320416279, + "grad_norm": 0.18358954787254333, + "learning_rate": 0.001, + "loss": 2.8717, + "step": 16927 + }, + { + "epoch": 0.7161350368051442, + "grad_norm": 0.15843123197555542, + "learning_rate": 0.001, + "loss": 2.5845, + "step": 16928 + }, + { + "epoch": 0.7161773415686606, + "grad_norm": 0.1516776978969574, + "learning_rate": 0.001, + "loss": 2.7124, + "step": 16929 + }, + { + "epoch": 0.716219646332177, + "grad_norm": 0.2799467146396637, + "learning_rate": 0.001, + "loss": 3.124, + "step": 16930 + }, + { + "epoch": 0.7162619510956933, + "grad_norm": 0.16782619059085846, + "learning_rate": 0.001, + "loss": 2.856, + "step": 16931 + }, + { + "epoch": 0.7163042558592098, + "grad_norm": 0.17226386070251465, + "learning_rate": 0.001, + "loss": 2.282, + "step": 16932 + }, + { + "epoch": 0.7163465606227262, + "grad_norm": 0.1673172563314438, + "learning_rate": 0.001, + "loss": 2.0755, + "step": 16933 + }, + { + "epoch": 0.7163888653862425, + "grad_norm": 0.20048165321350098, + "learning_rate": 0.001, + "loss": 2.2061, + "step": 16934 + }, + { + "epoch": 0.7164311701497589, + "grad_norm": 0.3704041838645935, + "learning_rate": 0.001, + "loss": 1.8703, + "step": 16935 + }, + { + "epoch": 0.7164734749132753, + "grad_norm": 0.17238663136959076, + "learning_rate": 0.001, + "loss": 1.8328, + "step": 16936 + }, + { + "epoch": 0.7165157796767916, + "grad_norm": 0.5309063196182251, + "learning_rate": 0.001, + "loss": 2.7206, + "step": 16937 + }, + { + "epoch": 0.716558084440308, + "grad_norm": 0.28528085350990295, + "learning_rate": 0.001, + "loss": 1.8669, + "step": 16938 + }, + { + "epoch": 0.7166003892038244, + "grad_norm": 0.34017452597618103, + "learning_rate": 0.001, + "loss": 2.7997, + "step": 16939 + }, + { + "epoch": 0.7166426939673407, + "grad_norm": 0.19825142621994019, + "learning_rate": 0.001, + "loss": 3.2758, + "step": 16940 + }, + { + "epoch": 0.7166849987308571, + "grad_norm": 0.18345898389816284, + "learning_rate": 0.001, + "loss": 1.6755, + "step": 16941 + }, + { + "epoch": 0.7167273034943734, + "grad_norm": 0.15514928102493286, + "learning_rate": 0.001, + "loss": 1.7149, + "step": 16942 + }, + { + "epoch": 0.7167696082578898, + "grad_norm": 0.19425754249095917, + "learning_rate": 0.001, + "loss": 2.3292, + "step": 16943 + }, + { + "epoch": 0.7168119130214062, + "grad_norm": 0.21819639205932617, + "learning_rate": 0.001, + "loss": 2.918, + "step": 16944 + }, + { + "epoch": 0.7168542177849225, + "grad_norm": 0.16114571690559387, + "learning_rate": 0.001, + "loss": 1.7553, + "step": 16945 + }, + { + "epoch": 0.7168965225484389, + "grad_norm": 0.1726664900779724, + "learning_rate": 0.001, + "loss": 1.8447, + "step": 16946 + }, + { + "epoch": 0.7169388273119554, + "grad_norm": 0.17410875856876373, + "learning_rate": 0.001, + "loss": 1.649, + "step": 16947 + }, + { + "epoch": 0.7169811320754716, + "grad_norm": 0.17056071758270264, + "learning_rate": 0.001, + "loss": 2.7856, + "step": 16948 + }, + { + "epoch": 0.7170234368389881, + "grad_norm": 0.2786010801792145, + "learning_rate": 0.001, + "loss": 2.5549, + "step": 16949 + }, + { + "epoch": 0.7170657416025045, + "grad_norm": 3.383892774581909, + "learning_rate": 0.001, + "loss": 3.8689, + "step": 16950 + }, + { + "epoch": 0.7171080463660208, + "grad_norm": 0.8111663460731506, + "learning_rate": 0.001, + "loss": 1.7525, + "step": 16951 + }, + { + "epoch": 0.7171503511295372, + "grad_norm": 0.22883149981498718, + "learning_rate": 0.001, + "loss": 2.36, + "step": 16952 + }, + { + "epoch": 0.7171926558930536, + "grad_norm": 0.17972877621650696, + "learning_rate": 0.001, + "loss": 2.6088, + "step": 16953 + }, + { + "epoch": 0.7172349606565699, + "grad_norm": 0.2092818170785904, + "learning_rate": 0.001, + "loss": 1.8192, + "step": 16954 + }, + { + "epoch": 0.7172772654200863, + "grad_norm": 0.16795043647289276, + "learning_rate": 0.001, + "loss": 2.887, + "step": 16955 + }, + { + "epoch": 0.7173195701836027, + "grad_norm": 0.16794079542160034, + "learning_rate": 0.001, + "loss": 2.2538, + "step": 16956 + }, + { + "epoch": 0.717361874947119, + "grad_norm": 0.1680205762386322, + "learning_rate": 0.001, + "loss": 1.7101, + "step": 16957 + }, + { + "epoch": 0.7174041797106354, + "grad_norm": 0.17401909828186035, + "learning_rate": 0.001, + "loss": 1.853, + "step": 16958 + }, + { + "epoch": 0.7174464844741518, + "grad_norm": 0.49490857124328613, + "learning_rate": 0.001, + "loss": 2.6701, + "step": 16959 + }, + { + "epoch": 0.7174887892376681, + "grad_norm": 0.17487838864326477, + "learning_rate": 0.001, + "loss": 1.2779, + "step": 16960 + }, + { + "epoch": 0.7175310940011845, + "grad_norm": 0.23202207684516907, + "learning_rate": 0.001, + "loss": 2.4539, + "step": 16961 + }, + { + "epoch": 0.7175733987647009, + "grad_norm": 0.4263302981853485, + "learning_rate": 0.001, + "loss": 1.7071, + "step": 16962 + }, + { + "epoch": 0.7176157035282172, + "grad_norm": 0.22260010242462158, + "learning_rate": 0.001, + "loss": 2.4392, + "step": 16963 + }, + { + "epoch": 0.7176580082917337, + "grad_norm": 0.15869098901748657, + "learning_rate": 0.001, + "loss": 1.7775, + "step": 16964 + }, + { + "epoch": 0.7177003130552501, + "grad_norm": 0.19737808406352997, + "learning_rate": 0.001, + "loss": 2.0257, + "step": 16965 + }, + { + "epoch": 0.7177426178187664, + "grad_norm": 0.15930262207984924, + "learning_rate": 0.001, + "loss": 3.0723, + "step": 16966 + }, + { + "epoch": 0.7177849225822828, + "grad_norm": 0.1594778150320053, + "learning_rate": 0.001, + "loss": 2.2068, + "step": 16967 + }, + { + "epoch": 0.7178272273457992, + "grad_norm": 0.17711618542671204, + "learning_rate": 0.001, + "loss": 1.7199, + "step": 16968 + }, + { + "epoch": 0.7178695321093155, + "grad_norm": 0.7712798118591309, + "learning_rate": 0.001, + "loss": 2.6924, + "step": 16969 + }, + { + "epoch": 0.7179118368728319, + "grad_norm": 0.16748692095279694, + "learning_rate": 0.001, + "loss": 2.1455, + "step": 16970 + }, + { + "epoch": 0.7179541416363483, + "grad_norm": 0.2054317146539688, + "learning_rate": 0.001, + "loss": 2.3641, + "step": 16971 + }, + { + "epoch": 0.7179964463998646, + "grad_norm": 1.4810477495193481, + "learning_rate": 0.001, + "loss": 2.5664, + "step": 16972 + }, + { + "epoch": 0.718038751163381, + "grad_norm": 0.3210020363330841, + "learning_rate": 0.001, + "loss": 2.4745, + "step": 16973 + }, + { + "epoch": 0.7180810559268974, + "grad_norm": 0.14637017250061035, + "learning_rate": 0.001, + "loss": 2.0775, + "step": 16974 + }, + { + "epoch": 0.7181233606904137, + "grad_norm": 0.1749821901321411, + "learning_rate": 0.001, + "loss": 2.4121, + "step": 16975 + }, + { + "epoch": 0.7181656654539301, + "grad_norm": 0.1919098049402237, + "learning_rate": 0.001, + "loss": 1.879, + "step": 16976 + }, + { + "epoch": 0.7182079702174465, + "grad_norm": 0.14838182926177979, + "learning_rate": 0.001, + "loss": 2.2794, + "step": 16977 + }, + { + "epoch": 0.7182502749809628, + "grad_norm": 0.3306111991405487, + "learning_rate": 0.001, + "loss": 1.7571, + "step": 16978 + }, + { + "epoch": 0.7182925797444792, + "grad_norm": 0.1290099322795868, + "learning_rate": 0.001, + "loss": 2.0743, + "step": 16979 + }, + { + "epoch": 0.7183348845079957, + "grad_norm": 0.2530765235424042, + "learning_rate": 0.001, + "loss": 1.92, + "step": 16980 + }, + { + "epoch": 0.718377189271512, + "grad_norm": 0.15586386620998383, + "learning_rate": 0.001, + "loss": 2.5149, + "step": 16981 + }, + { + "epoch": 0.7184194940350284, + "grad_norm": 0.4855019748210907, + "learning_rate": 0.001, + "loss": 1.9419, + "step": 16982 + }, + { + "epoch": 0.7184617987985448, + "grad_norm": 6.393121719360352, + "learning_rate": 0.001, + "loss": 1.7564, + "step": 16983 + }, + { + "epoch": 0.7185041035620611, + "grad_norm": 0.1833300143480301, + "learning_rate": 0.001, + "loss": 2.1363, + "step": 16984 + }, + { + "epoch": 0.7185464083255775, + "grad_norm": 0.1298752874135971, + "learning_rate": 0.001, + "loss": 2.2444, + "step": 16985 + }, + { + "epoch": 0.7185887130890938, + "grad_norm": 0.16792719066143036, + "learning_rate": 0.001, + "loss": 1.7757, + "step": 16986 + }, + { + "epoch": 0.7186310178526102, + "grad_norm": 1.4320098161697388, + "learning_rate": 0.001, + "loss": 2.647, + "step": 16987 + }, + { + "epoch": 0.7186733226161266, + "grad_norm": 0.13900575041770935, + "learning_rate": 0.001, + "loss": 1.6843, + "step": 16988 + }, + { + "epoch": 0.7187156273796429, + "grad_norm": 0.1738477498292923, + "learning_rate": 0.001, + "loss": 1.4185, + "step": 16989 + }, + { + "epoch": 0.7187579321431593, + "grad_norm": 0.5591545701026917, + "learning_rate": 0.001, + "loss": 2.4475, + "step": 16990 + }, + { + "epoch": 0.7188002369066757, + "grad_norm": 0.20126506686210632, + "learning_rate": 0.001, + "loss": 2.3001, + "step": 16991 + }, + { + "epoch": 0.718842541670192, + "grad_norm": 0.3284417986869812, + "learning_rate": 0.001, + "loss": 2.2221, + "step": 16992 + }, + { + "epoch": 0.7188848464337084, + "grad_norm": 0.1933998167514801, + "learning_rate": 0.001, + "loss": 1.9981, + "step": 16993 + }, + { + "epoch": 0.7189271511972248, + "grad_norm": 1.0560952425003052, + "learning_rate": 0.001, + "loss": 2.3462, + "step": 16994 + }, + { + "epoch": 0.7189694559607411, + "grad_norm": 0.16365966200828552, + "learning_rate": 0.001, + "loss": 2.6972, + "step": 16995 + }, + { + "epoch": 0.7190117607242575, + "grad_norm": 0.21969819068908691, + "learning_rate": 0.001, + "loss": 2.3659, + "step": 16996 + }, + { + "epoch": 0.719054065487774, + "grad_norm": 9.586769104003906, + "learning_rate": 0.001, + "loss": 1.9068, + "step": 16997 + }, + { + "epoch": 0.7190963702512903, + "grad_norm": 0.15527045726776123, + "learning_rate": 0.001, + "loss": 1.8663, + "step": 16998 + }, + { + "epoch": 0.7191386750148067, + "grad_norm": 0.25316473841667175, + "learning_rate": 0.001, + "loss": 2.7527, + "step": 16999 + }, + { + "epoch": 0.7191809797783231, + "grad_norm": 0.20590928196907043, + "learning_rate": 0.001, + "loss": 2.1561, + "step": 17000 + }, + { + "epoch": 0.7192232845418394, + "grad_norm": 0.23334155976772308, + "learning_rate": 0.001, + "loss": 1.6148, + "step": 17001 + }, + { + "epoch": 0.7192655893053558, + "grad_norm": 0.1720026433467865, + "learning_rate": 0.001, + "loss": 2.4956, + "step": 17002 + }, + { + "epoch": 0.7193078940688722, + "grad_norm": 0.18993832170963287, + "learning_rate": 0.001, + "loss": 2.4176, + "step": 17003 + }, + { + "epoch": 0.7193501988323885, + "grad_norm": 1.2263435125350952, + "learning_rate": 0.001, + "loss": 1.695, + "step": 17004 + }, + { + "epoch": 0.7193925035959049, + "grad_norm": 0.20992887020111084, + "learning_rate": 0.001, + "loss": 3.1313, + "step": 17005 + }, + { + "epoch": 0.7194348083594213, + "grad_norm": 0.2623100280761719, + "learning_rate": 0.001, + "loss": 1.8112, + "step": 17006 + }, + { + "epoch": 0.7194771131229376, + "grad_norm": 0.2040976881980896, + "learning_rate": 0.001, + "loss": 2.8483, + "step": 17007 + }, + { + "epoch": 0.719519417886454, + "grad_norm": 0.1961153894662857, + "learning_rate": 0.001, + "loss": 3.994, + "step": 17008 + }, + { + "epoch": 0.7195617226499704, + "grad_norm": 0.1991744339466095, + "learning_rate": 0.001, + "loss": 1.9362, + "step": 17009 + }, + { + "epoch": 0.7196040274134867, + "grad_norm": 0.1773693561553955, + "learning_rate": 0.001, + "loss": 2.3754, + "step": 17010 + }, + { + "epoch": 0.7196463321770031, + "grad_norm": 0.322916716337204, + "learning_rate": 0.001, + "loss": 2.6735, + "step": 17011 + }, + { + "epoch": 0.7196886369405195, + "grad_norm": 0.1525036096572876, + "learning_rate": 0.001, + "loss": 2.265, + "step": 17012 + }, + { + "epoch": 0.7197309417040358, + "grad_norm": 0.15779347717761993, + "learning_rate": 0.001, + "loss": 1.6281, + "step": 17013 + }, + { + "epoch": 0.7197732464675523, + "grad_norm": 0.15959423780441284, + "learning_rate": 0.001, + "loss": 1.3413, + "step": 17014 + }, + { + "epoch": 0.7198155512310687, + "grad_norm": 0.3631104826927185, + "learning_rate": 0.001, + "loss": 1.6842, + "step": 17015 + }, + { + "epoch": 0.719857855994585, + "grad_norm": 0.474289208650589, + "learning_rate": 0.001, + "loss": 2.0346, + "step": 17016 + }, + { + "epoch": 0.7199001607581014, + "grad_norm": 0.19243498146533966, + "learning_rate": 0.001, + "loss": 2.3207, + "step": 17017 + }, + { + "epoch": 0.7199424655216178, + "grad_norm": 0.17244835197925568, + "learning_rate": 0.001, + "loss": 2.3529, + "step": 17018 + }, + { + "epoch": 0.7199847702851341, + "grad_norm": 0.16670437157154083, + "learning_rate": 0.001, + "loss": 2.4943, + "step": 17019 + }, + { + "epoch": 0.7200270750486505, + "grad_norm": 0.1707116663455963, + "learning_rate": 0.001, + "loss": 2.6172, + "step": 17020 + }, + { + "epoch": 0.7200693798121669, + "grad_norm": 0.17514576017856598, + "learning_rate": 0.001, + "loss": 1.9051, + "step": 17021 + }, + { + "epoch": 0.7201116845756832, + "grad_norm": 0.17995479702949524, + "learning_rate": 0.001, + "loss": 2.3423, + "step": 17022 + }, + { + "epoch": 0.7201539893391996, + "grad_norm": 0.1695273518562317, + "learning_rate": 0.001, + "loss": 1.9609, + "step": 17023 + }, + { + "epoch": 0.720196294102716, + "grad_norm": 0.18186168372631073, + "learning_rate": 0.001, + "loss": 1.8993, + "step": 17024 + }, + { + "epoch": 0.7202385988662323, + "grad_norm": 0.17733162641525269, + "learning_rate": 0.001, + "loss": 2.2507, + "step": 17025 + }, + { + "epoch": 0.7202809036297487, + "grad_norm": 0.17569288611412048, + "learning_rate": 0.001, + "loss": 2.4908, + "step": 17026 + }, + { + "epoch": 0.7203232083932651, + "grad_norm": 0.1461903601884842, + "learning_rate": 0.001, + "loss": 2.3501, + "step": 17027 + }, + { + "epoch": 0.7203655131567814, + "grad_norm": 0.13951033353805542, + "learning_rate": 0.001, + "loss": 1.5234, + "step": 17028 + }, + { + "epoch": 0.7204078179202978, + "grad_norm": 0.19020099937915802, + "learning_rate": 0.001, + "loss": 2.583, + "step": 17029 + }, + { + "epoch": 0.7204501226838141, + "grad_norm": 0.26185300946235657, + "learning_rate": 0.001, + "loss": 2.0379, + "step": 17030 + }, + { + "epoch": 0.7204924274473306, + "grad_norm": 0.15246087312698364, + "learning_rate": 0.001, + "loss": 1.6983, + "step": 17031 + }, + { + "epoch": 0.720534732210847, + "grad_norm": 0.21648335456848145, + "learning_rate": 0.001, + "loss": 2.4807, + "step": 17032 + }, + { + "epoch": 0.7205770369743633, + "grad_norm": 0.15605172514915466, + "learning_rate": 0.001, + "loss": 1.7671, + "step": 17033 + }, + { + "epoch": 0.7206193417378797, + "grad_norm": 0.1461627036333084, + "learning_rate": 0.001, + "loss": 1.6648, + "step": 17034 + }, + { + "epoch": 0.7206616465013961, + "grad_norm": 0.15967121720314026, + "learning_rate": 0.001, + "loss": 2.1308, + "step": 17035 + }, + { + "epoch": 0.7207039512649124, + "grad_norm": 0.17863650619983673, + "learning_rate": 0.001, + "loss": 2.646, + "step": 17036 + }, + { + "epoch": 0.7207462560284288, + "grad_norm": 0.15027737617492676, + "learning_rate": 0.001, + "loss": 2.302, + "step": 17037 + }, + { + "epoch": 0.7207885607919452, + "grad_norm": 0.30569708347320557, + "learning_rate": 0.001, + "loss": 3.3331, + "step": 17038 + }, + { + "epoch": 0.7208308655554615, + "grad_norm": 0.18736647069454193, + "learning_rate": 0.001, + "loss": 1.5433, + "step": 17039 + }, + { + "epoch": 0.7208731703189779, + "grad_norm": 0.16825266182422638, + "learning_rate": 0.001, + "loss": 1.511, + "step": 17040 + }, + { + "epoch": 0.7209154750824943, + "grad_norm": 0.1982458382844925, + "learning_rate": 0.001, + "loss": 2.066, + "step": 17041 + }, + { + "epoch": 0.7209577798460106, + "grad_norm": 0.1793079972267151, + "learning_rate": 0.001, + "loss": 1.6284, + "step": 17042 + }, + { + "epoch": 0.721000084609527, + "grad_norm": 0.1480327844619751, + "learning_rate": 0.001, + "loss": 1.4117, + "step": 17043 + }, + { + "epoch": 0.7210423893730434, + "grad_norm": 0.3210713267326355, + "learning_rate": 0.001, + "loss": 1.739, + "step": 17044 + }, + { + "epoch": 0.7210846941365597, + "grad_norm": 0.2352566421031952, + "learning_rate": 0.001, + "loss": 1.7573, + "step": 17045 + }, + { + "epoch": 0.7211269989000761, + "grad_norm": 0.1587694138288498, + "learning_rate": 0.001, + "loss": 2.3439, + "step": 17046 + }, + { + "epoch": 0.7211693036635926, + "grad_norm": 0.9405317306518555, + "learning_rate": 0.001, + "loss": 1.9967, + "step": 17047 + }, + { + "epoch": 0.7212116084271089, + "grad_norm": 0.1904739886522293, + "learning_rate": 0.001, + "loss": 1.9261, + "step": 17048 + }, + { + "epoch": 0.7212539131906253, + "grad_norm": 0.12879721820354462, + "learning_rate": 0.001, + "loss": 2.6484, + "step": 17049 + }, + { + "epoch": 0.7212962179541417, + "grad_norm": 0.17692124843597412, + "learning_rate": 0.001, + "loss": 2.0815, + "step": 17050 + }, + { + "epoch": 0.721338522717658, + "grad_norm": 0.16509714722633362, + "learning_rate": 0.001, + "loss": 2.9258, + "step": 17051 + }, + { + "epoch": 0.7213808274811744, + "grad_norm": 0.1631636917591095, + "learning_rate": 0.001, + "loss": 2.8743, + "step": 17052 + }, + { + "epoch": 0.7214231322446908, + "grad_norm": 0.174580916762352, + "learning_rate": 0.001, + "loss": 3.0192, + "step": 17053 + }, + { + "epoch": 0.7214654370082071, + "grad_norm": 0.8501453399658203, + "learning_rate": 0.001, + "loss": 1.8267, + "step": 17054 + }, + { + "epoch": 0.7215077417717235, + "grad_norm": 0.15166904032230377, + "learning_rate": 0.001, + "loss": 2.4201, + "step": 17055 + }, + { + "epoch": 0.7215500465352399, + "grad_norm": 0.16179314255714417, + "learning_rate": 0.001, + "loss": 1.9094, + "step": 17056 + }, + { + "epoch": 0.7215923512987562, + "grad_norm": 0.14122450351715088, + "learning_rate": 0.001, + "loss": 1.327, + "step": 17057 + }, + { + "epoch": 0.7216346560622726, + "grad_norm": 0.16468989849090576, + "learning_rate": 0.001, + "loss": 1.9848, + "step": 17058 + }, + { + "epoch": 0.721676960825789, + "grad_norm": 0.14296945929527283, + "learning_rate": 0.001, + "loss": 1.6847, + "step": 17059 + }, + { + "epoch": 0.7217192655893053, + "grad_norm": 0.1540268510580063, + "learning_rate": 0.001, + "loss": 1.4727, + "step": 17060 + }, + { + "epoch": 0.7217615703528217, + "grad_norm": 0.14178214967250824, + "learning_rate": 0.001, + "loss": 1.8472, + "step": 17061 + }, + { + "epoch": 0.7218038751163381, + "grad_norm": 0.1965574473142624, + "learning_rate": 0.001, + "loss": 2.9151, + "step": 17062 + }, + { + "epoch": 0.7218461798798544, + "grad_norm": 0.1626625508069992, + "learning_rate": 0.001, + "loss": 1.8456, + "step": 17063 + }, + { + "epoch": 0.7218884846433709, + "grad_norm": 1.8781062364578247, + "learning_rate": 0.001, + "loss": 2.1349, + "step": 17064 + }, + { + "epoch": 0.7219307894068873, + "grad_norm": 0.1811523288488388, + "learning_rate": 0.001, + "loss": 3.1633, + "step": 17065 + }, + { + "epoch": 0.7219730941704036, + "grad_norm": 0.1420523077249527, + "learning_rate": 0.001, + "loss": 2.1598, + "step": 17066 + }, + { + "epoch": 0.72201539893392, + "grad_norm": 0.23266121745109558, + "learning_rate": 0.001, + "loss": 2.5496, + "step": 17067 + }, + { + "epoch": 0.7220577036974364, + "grad_norm": 0.15366718173027039, + "learning_rate": 0.001, + "loss": 1.6903, + "step": 17068 + }, + { + "epoch": 0.7221000084609527, + "grad_norm": 0.1720408946275711, + "learning_rate": 0.001, + "loss": 2.101, + "step": 17069 + }, + { + "epoch": 0.7221423132244691, + "grad_norm": 0.13053718209266663, + "learning_rate": 0.001, + "loss": 2.9885, + "step": 17070 + }, + { + "epoch": 0.7221846179879855, + "grad_norm": 0.1682283878326416, + "learning_rate": 0.001, + "loss": 2.0249, + "step": 17071 + }, + { + "epoch": 0.7222269227515018, + "grad_norm": 0.1607704609632492, + "learning_rate": 0.001, + "loss": 2.3205, + "step": 17072 + }, + { + "epoch": 0.7222692275150182, + "grad_norm": 0.14773066341876984, + "learning_rate": 0.001, + "loss": 2.3242, + "step": 17073 + }, + { + "epoch": 0.7223115322785346, + "grad_norm": 2.609527349472046, + "learning_rate": 0.001, + "loss": 1.6714, + "step": 17074 + }, + { + "epoch": 0.7223538370420509, + "grad_norm": 0.22098509967327118, + "learning_rate": 0.001, + "loss": 2.6751, + "step": 17075 + }, + { + "epoch": 0.7223961418055673, + "grad_norm": 0.27444925904273987, + "learning_rate": 0.001, + "loss": 2.977, + "step": 17076 + }, + { + "epoch": 0.7224384465690836, + "grad_norm": 1.863974690437317, + "learning_rate": 0.001, + "loss": 1.914, + "step": 17077 + }, + { + "epoch": 0.7224807513326, + "grad_norm": 0.18942175805568695, + "learning_rate": 0.001, + "loss": 1.7231, + "step": 17078 + }, + { + "epoch": 0.7225230560961164, + "grad_norm": 20.305295944213867, + "learning_rate": 0.001, + "loss": 1.9652, + "step": 17079 + }, + { + "epoch": 0.7225653608596327, + "grad_norm": 1.116580843925476, + "learning_rate": 0.001, + "loss": 2.388, + "step": 17080 + }, + { + "epoch": 0.7226076656231492, + "grad_norm": 0.2017759084701538, + "learning_rate": 0.001, + "loss": 2.4368, + "step": 17081 + }, + { + "epoch": 0.7226499703866656, + "grad_norm": 0.20675040781497955, + "learning_rate": 0.001, + "loss": 1.8265, + "step": 17082 + }, + { + "epoch": 0.7226922751501819, + "grad_norm": 0.2491598129272461, + "learning_rate": 0.001, + "loss": 1.6281, + "step": 17083 + }, + { + "epoch": 0.7227345799136983, + "grad_norm": 0.7096131443977356, + "learning_rate": 0.001, + "loss": 2.6187, + "step": 17084 + }, + { + "epoch": 0.7227768846772147, + "grad_norm": 0.5838418006896973, + "learning_rate": 0.001, + "loss": 3.2109, + "step": 17085 + }, + { + "epoch": 0.722819189440731, + "grad_norm": 0.2073265165090561, + "learning_rate": 0.001, + "loss": 1.8221, + "step": 17086 + }, + { + "epoch": 0.7228614942042474, + "grad_norm": 0.13714540004730225, + "learning_rate": 0.001, + "loss": 2.1434, + "step": 17087 + }, + { + "epoch": 0.7229037989677638, + "grad_norm": 0.16703394055366516, + "learning_rate": 0.001, + "loss": 2.1473, + "step": 17088 + }, + { + "epoch": 0.7229461037312801, + "grad_norm": 1.2047275304794312, + "learning_rate": 0.001, + "loss": 2.9204, + "step": 17089 + }, + { + "epoch": 0.7229884084947965, + "grad_norm": 0.22018975019454956, + "learning_rate": 0.001, + "loss": 3.5448, + "step": 17090 + }, + { + "epoch": 0.7230307132583129, + "grad_norm": 0.4262462854385376, + "learning_rate": 0.001, + "loss": 2.1982, + "step": 17091 + }, + { + "epoch": 0.7230730180218292, + "grad_norm": 0.1653733253479004, + "learning_rate": 0.001, + "loss": 3.1182, + "step": 17092 + }, + { + "epoch": 0.7231153227853456, + "grad_norm": 0.20591098070144653, + "learning_rate": 0.001, + "loss": 3.3591, + "step": 17093 + }, + { + "epoch": 0.723157627548862, + "grad_norm": 0.177374005317688, + "learning_rate": 0.001, + "loss": 2.7171, + "step": 17094 + }, + { + "epoch": 0.7231999323123783, + "grad_norm": 0.17736870050430298, + "learning_rate": 0.001, + "loss": 1.6567, + "step": 17095 + }, + { + "epoch": 0.7232422370758947, + "grad_norm": 0.16085311770439148, + "learning_rate": 0.001, + "loss": 1.6649, + "step": 17096 + }, + { + "epoch": 0.7232845418394112, + "grad_norm": 0.1982753872871399, + "learning_rate": 0.001, + "loss": 1.6603, + "step": 17097 + }, + { + "epoch": 0.7233268466029275, + "grad_norm": 0.1861245036125183, + "learning_rate": 0.001, + "loss": 2.91, + "step": 17098 + }, + { + "epoch": 0.7233691513664439, + "grad_norm": 0.7297300100326538, + "learning_rate": 0.001, + "loss": 1.8701, + "step": 17099 + }, + { + "epoch": 0.7234114561299603, + "grad_norm": 0.16953283548355103, + "learning_rate": 0.001, + "loss": 3.1791, + "step": 17100 + }, + { + "epoch": 0.7234537608934766, + "grad_norm": 0.16749191284179688, + "learning_rate": 0.001, + "loss": 2.5591, + "step": 17101 + }, + { + "epoch": 0.723496065656993, + "grad_norm": 0.19384531676769257, + "learning_rate": 0.001, + "loss": 2.2332, + "step": 17102 + }, + { + "epoch": 0.7235383704205094, + "grad_norm": 2.1700196266174316, + "learning_rate": 0.001, + "loss": 3.0336, + "step": 17103 + }, + { + "epoch": 0.7235806751840257, + "grad_norm": 0.1668250858783722, + "learning_rate": 0.001, + "loss": 2.2885, + "step": 17104 + }, + { + "epoch": 0.7236229799475421, + "grad_norm": 0.17841757833957672, + "learning_rate": 0.001, + "loss": 2.3188, + "step": 17105 + }, + { + "epoch": 0.7236652847110585, + "grad_norm": 0.15527507662773132, + "learning_rate": 0.001, + "loss": 1.8841, + "step": 17106 + }, + { + "epoch": 0.7237075894745748, + "grad_norm": 0.5220174789428711, + "learning_rate": 0.001, + "loss": 1.6185, + "step": 17107 + }, + { + "epoch": 0.7237498942380912, + "grad_norm": 0.14695467054843903, + "learning_rate": 0.001, + "loss": 2.0267, + "step": 17108 + }, + { + "epoch": 0.7237921990016076, + "grad_norm": 0.39175304770469666, + "learning_rate": 0.001, + "loss": 1.7712, + "step": 17109 + }, + { + "epoch": 0.7238345037651239, + "grad_norm": 0.15665383636951447, + "learning_rate": 0.001, + "loss": 2.5027, + "step": 17110 + }, + { + "epoch": 0.7238768085286403, + "grad_norm": 0.17397943139076233, + "learning_rate": 0.001, + "loss": 1.9146, + "step": 17111 + }, + { + "epoch": 0.7239191132921567, + "grad_norm": 0.18213899433612823, + "learning_rate": 0.001, + "loss": 2.6108, + "step": 17112 + }, + { + "epoch": 0.723961418055673, + "grad_norm": 0.1579909473657608, + "learning_rate": 0.001, + "loss": 1.4951, + "step": 17113 + }, + { + "epoch": 0.7240037228191895, + "grad_norm": 0.15795019268989563, + "learning_rate": 0.001, + "loss": 1.5673, + "step": 17114 + }, + { + "epoch": 0.7240460275827059, + "grad_norm": 0.24189157783985138, + "learning_rate": 0.001, + "loss": 3.2042, + "step": 17115 + }, + { + "epoch": 0.7240883323462222, + "grad_norm": 0.1958611160516739, + "learning_rate": 0.001, + "loss": 2.8201, + "step": 17116 + }, + { + "epoch": 0.7241306371097386, + "grad_norm": 0.18128181993961334, + "learning_rate": 0.001, + "loss": 2.0228, + "step": 17117 + }, + { + "epoch": 0.724172941873255, + "grad_norm": 0.1512492448091507, + "learning_rate": 0.001, + "loss": 2.3753, + "step": 17118 + }, + { + "epoch": 0.7242152466367713, + "grad_norm": 0.15748295187950134, + "learning_rate": 0.001, + "loss": 1.9492, + "step": 17119 + }, + { + "epoch": 0.7242575514002877, + "grad_norm": 15.251922607421875, + "learning_rate": 0.001, + "loss": 3.1576, + "step": 17120 + }, + { + "epoch": 0.724299856163804, + "grad_norm": 0.17477959394454956, + "learning_rate": 0.001, + "loss": 1.827, + "step": 17121 + }, + { + "epoch": 0.7243421609273204, + "grad_norm": 0.16635996103286743, + "learning_rate": 0.001, + "loss": 2.5203, + "step": 17122 + }, + { + "epoch": 0.7243844656908368, + "grad_norm": 0.12794765830039978, + "learning_rate": 0.001, + "loss": 2.4801, + "step": 17123 + }, + { + "epoch": 0.7244267704543531, + "grad_norm": 0.1665220856666565, + "learning_rate": 0.001, + "loss": 1.6915, + "step": 17124 + }, + { + "epoch": 0.7244690752178695, + "grad_norm": 0.20942756533622742, + "learning_rate": 0.001, + "loss": 2.1367, + "step": 17125 + }, + { + "epoch": 0.7245113799813859, + "grad_norm": 0.1444472074508667, + "learning_rate": 0.001, + "loss": 1.4025, + "step": 17126 + }, + { + "epoch": 0.7245536847449022, + "grad_norm": 0.5599656105041504, + "learning_rate": 0.001, + "loss": 1.6415, + "step": 17127 + }, + { + "epoch": 0.7245959895084186, + "grad_norm": 0.17753274738788605, + "learning_rate": 0.001, + "loss": 2.3513, + "step": 17128 + }, + { + "epoch": 0.724638294271935, + "grad_norm": 0.18549706041812897, + "learning_rate": 0.001, + "loss": 2.4151, + "step": 17129 + }, + { + "epoch": 0.7246805990354513, + "grad_norm": 0.17280645668506622, + "learning_rate": 0.001, + "loss": 2.5004, + "step": 17130 + }, + { + "epoch": 0.7247229037989678, + "grad_norm": 0.16694018244743347, + "learning_rate": 0.001, + "loss": 2.1255, + "step": 17131 + }, + { + "epoch": 0.7247652085624842, + "grad_norm": 0.1705455332994461, + "learning_rate": 0.001, + "loss": 1.967, + "step": 17132 + }, + { + "epoch": 0.7248075133260005, + "grad_norm": 0.1580754816532135, + "learning_rate": 0.001, + "loss": 1.9559, + "step": 17133 + }, + { + "epoch": 0.7248498180895169, + "grad_norm": 0.9335688352584839, + "learning_rate": 0.001, + "loss": 2.4301, + "step": 17134 + }, + { + "epoch": 0.7248921228530333, + "grad_norm": 0.1776943802833557, + "learning_rate": 0.001, + "loss": 1.4292, + "step": 17135 + }, + { + "epoch": 0.7249344276165496, + "grad_norm": 0.15362563729286194, + "learning_rate": 0.001, + "loss": 2.3276, + "step": 17136 + }, + { + "epoch": 0.724976732380066, + "grad_norm": 0.14412017166614532, + "learning_rate": 0.001, + "loss": 1.8388, + "step": 17137 + }, + { + "epoch": 0.7250190371435824, + "grad_norm": 0.2188391387462616, + "learning_rate": 0.001, + "loss": 2.7813, + "step": 17138 + }, + { + "epoch": 0.7250613419070987, + "grad_norm": 0.18921682238578796, + "learning_rate": 0.001, + "loss": 2.1525, + "step": 17139 + }, + { + "epoch": 0.7251036466706151, + "grad_norm": 0.1472938060760498, + "learning_rate": 0.001, + "loss": 2.2099, + "step": 17140 + }, + { + "epoch": 0.7251459514341315, + "grad_norm": 0.1493636518716812, + "learning_rate": 0.001, + "loss": 2.0206, + "step": 17141 + }, + { + "epoch": 0.7251882561976478, + "grad_norm": 0.16733522713184357, + "learning_rate": 0.001, + "loss": 2.0764, + "step": 17142 + }, + { + "epoch": 0.7252305609611642, + "grad_norm": 0.18534940481185913, + "learning_rate": 0.001, + "loss": 2.0909, + "step": 17143 + }, + { + "epoch": 0.7252728657246806, + "grad_norm": 0.25004249811172485, + "learning_rate": 0.001, + "loss": 1.9715, + "step": 17144 + }, + { + "epoch": 0.7253151704881969, + "grad_norm": 0.1925039291381836, + "learning_rate": 0.001, + "loss": 1.8828, + "step": 17145 + }, + { + "epoch": 0.7253574752517133, + "grad_norm": 0.19968828558921814, + "learning_rate": 0.001, + "loss": 1.8314, + "step": 17146 + }, + { + "epoch": 0.7253997800152298, + "grad_norm": 0.19976693391799927, + "learning_rate": 0.001, + "loss": 2.0041, + "step": 17147 + }, + { + "epoch": 0.725442084778746, + "grad_norm": 0.16204243898391724, + "learning_rate": 0.001, + "loss": 1.5418, + "step": 17148 + }, + { + "epoch": 0.7254843895422625, + "grad_norm": 0.6488047242164612, + "learning_rate": 0.001, + "loss": 1.5626, + "step": 17149 + }, + { + "epoch": 0.7255266943057789, + "grad_norm": 0.16644714772701263, + "learning_rate": 0.001, + "loss": 2.0278, + "step": 17150 + }, + { + "epoch": 0.7255689990692952, + "grad_norm": 0.16991443932056427, + "learning_rate": 0.001, + "loss": 1.7933, + "step": 17151 + }, + { + "epoch": 0.7256113038328116, + "grad_norm": 0.23698163032531738, + "learning_rate": 0.001, + "loss": 2.3938, + "step": 17152 + }, + { + "epoch": 0.725653608596328, + "grad_norm": 1.1459918022155762, + "learning_rate": 0.001, + "loss": 2.0805, + "step": 17153 + }, + { + "epoch": 0.7256959133598443, + "grad_norm": 0.19081348180770874, + "learning_rate": 0.001, + "loss": 1.9892, + "step": 17154 + }, + { + "epoch": 0.7257382181233607, + "grad_norm": 0.20583724975585938, + "learning_rate": 0.001, + "loss": 1.9256, + "step": 17155 + }, + { + "epoch": 0.7257805228868771, + "grad_norm": 0.15794368088245392, + "learning_rate": 0.001, + "loss": 1.7678, + "step": 17156 + }, + { + "epoch": 0.7258228276503934, + "grad_norm": 0.22491112351417542, + "learning_rate": 0.001, + "loss": 2.1716, + "step": 17157 + }, + { + "epoch": 0.7258651324139098, + "grad_norm": 0.37008705735206604, + "learning_rate": 0.001, + "loss": 3.8031, + "step": 17158 + }, + { + "epoch": 0.7259074371774262, + "grad_norm": 0.2206166684627533, + "learning_rate": 0.001, + "loss": 2.6122, + "step": 17159 + }, + { + "epoch": 0.7259497419409425, + "grad_norm": 0.1652143895626068, + "learning_rate": 0.001, + "loss": 2.2458, + "step": 17160 + }, + { + "epoch": 0.7259920467044589, + "grad_norm": 0.16952086985111237, + "learning_rate": 0.001, + "loss": 1.978, + "step": 17161 + }, + { + "epoch": 0.7260343514679753, + "grad_norm": 0.15790002048015594, + "learning_rate": 0.001, + "loss": 1.9818, + "step": 17162 + }, + { + "epoch": 0.7260766562314916, + "grad_norm": 0.18033292889595032, + "learning_rate": 0.001, + "loss": 2.0203, + "step": 17163 + }, + { + "epoch": 0.7261189609950081, + "grad_norm": 0.19904761016368866, + "learning_rate": 0.001, + "loss": 3.5056, + "step": 17164 + }, + { + "epoch": 0.7261612657585244, + "grad_norm": 0.18638011813163757, + "learning_rate": 0.001, + "loss": 2.5683, + "step": 17165 + }, + { + "epoch": 0.7262035705220408, + "grad_norm": 0.1429300308227539, + "learning_rate": 0.001, + "loss": 1.5911, + "step": 17166 + }, + { + "epoch": 0.7262458752855572, + "grad_norm": 0.18004530668258667, + "learning_rate": 0.001, + "loss": 2.3988, + "step": 17167 + }, + { + "epoch": 0.7262881800490735, + "grad_norm": 0.16043299436569214, + "learning_rate": 0.001, + "loss": 1.6417, + "step": 17168 + }, + { + "epoch": 0.7263304848125899, + "grad_norm": 0.15215149521827698, + "learning_rate": 0.001, + "loss": 2.1354, + "step": 17169 + }, + { + "epoch": 0.7263727895761063, + "grad_norm": 0.2153785228729248, + "learning_rate": 0.001, + "loss": 3.3095, + "step": 17170 + }, + { + "epoch": 0.7264150943396226, + "grad_norm": 0.17379812896251678, + "learning_rate": 0.001, + "loss": 1.6931, + "step": 17171 + }, + { + "epoch": 0.726457399103139, + "grad_norm": 0.14409448206424713, + "learning_rate": 0.001, + "loss": 1.8867, + "step": 17172 + }, + { + "epoch": 0.7264997038666554, + "grad_norm": 0.16144266724586487, + "learning_rate": 0.001, + "loss": 1.6323, + "step": 17173 + }, + { + "epoch": 0.7265420086301717, + "grad_norm": 0.16007545590400696, + "learning_rate": 0.001, + "loss": 1.6542, + "step": 17174 + }, + { + "epoch": 0.7265843133936881, + "grad_norm": 0.12861347198486328, + "learning_rate": 0.001, + "loss": 1.838, + "step": 17175 + }, + { + "epoch": 0.7266266181572045, + "grad_norm": 0.1386764645576477, + "learning_rate": 0.001, + "loss": 1.7484, + "step": 17176 + }, + { + "epoch": 0.7266689229207208, + "grad_norm": 0.18406988680362701, + "learning_rate": 0.001, + "loss": 2.2947, + "step": 17177 + }, + { + "epoch": 0.7267112276842372, + "grad_norm": 0.1446983367204666, + "learning_rate": 0.001, + "loss": 2.2782, + "step": 17178 + }, + { + "epoch": 0.7267535324477536, + "grad_norm": 0.17378035187721252, + "learning_rate": 0.001, + "loss": 4.0799, + "step": 17179 + }, + { + "epoch": 0.72679583721127, + "grad_norm": 0.5141298174858093, + "learning_rate": 0.001, + "loss": 1.748, + "step": 17180 + }, + { + "epoch": 0.7268381419747864, + "grad_norm": 0.16872873902320862, + "learning_rate": 0.001, + "loss": 2.1087, + "step": 17181 + }, + { + "epoch": 0.7268804467383028, + "grad_norm": 0.1536315381526947, + "learning_rate": 0.001, + "loss": 2.7305, + "step": 17182 + }, + { + "epoch": 0.7269227515018191, + "grad_norm": 17.74666404724121, + "learning_rate": 0.001, + "loss": 1.3806, + "step": 17183 + }, + { + "epoch": 0.7269650562653355, + "grad_norm": 0.17030657827854156, + "learning_rate": 0.001, + "loss": 2.2526, + "step": 17184 + }, + { + "epoch": 0.7270073610288519, + "grad_norm": 0.1639617383480072, + "learning_rate": 0.001, + "loss": 1.9684, + "step": 17185 + }, + { + "epoch": 0.7270496657923682, + "grad_norm": 0.18141882121562958, + "learning_rate": 0.001, + "loss": 1.8486, + "step": 17186 + }, + { + "epoch": 0.7270919705558846, + "grad_norm": 0.18478603661060333, + "learning_rate": 0.001, + "loss": 2.4985, + "step": 17187 + }, + { + "epoch": 0.727134275319401, + "grad_norm": 0.1748218983411789, + "learning_rate": 0.001, + "loss": 1.778, + "step": 17188 + }, + { + "epoch": 0.7271765800829173, + "grad_norm": 2.296082019805908, + "learning_rate": 0.001, + "loss": 2.8597, + "step": 17189 + }, + { + "epoch": 0.7272188848464337, + "grad_norm": 0.18864890933036804, + "learning_rate": 0.001, + "loss": 2.4064, + "step": 17190 + }, + { + "epoch": 0.7272611896099501, + "grad_norm": 0.17584924399852753, + "learning_rate": 0.001, + "loss": 2.0013, + "step": 17191 + }, + { + "epoch": 0.7273034943734664, + "grad_norm": 0.1358068883419037, + "learning_rate": 0.001, + "loss": 1.7993, + "step": 17192 + }, + { + "epoch": 0.7273457991369828, + "grad_norm": 0.17698509991168976, + "learning_rate": 0.001, + "loss": 2.1616, + "step": 17193 + }, + { + "epoch": 0.7273881039004992, + "grad_norm": 0.4649740159511566, + "learning_rate": 0.001, + "loss": 2.0258, + "step": 17194 + }, + { + "epoch": 0.7274304086640155, + "grad_norm": 0.15231867134571075, + "learning_rate": 0.001, + "loss": 2.1438, + "step": 17195 + }, + { + "epoch": 0.727472713427532, + "grad_norm": 0.19217731058597565, + "learning_rate": 0.001, + "loss": 3.0598, + "step": 17196 + }, + { + "epoch": 0.7275150181910484, + "grad_norm": 0.18122124671936035, + "learning_rate": 0.001, + "loss": 1.54, + "step": 17197 + }, + { + "epoch": 0.7275573229545647, + "grad_norm": 0.13571122288703918, + "learning_rate": 0.001, + "loss": 1.2435, + "step": 17198 + }, + { + "epoch": 0.7275996277180811, + "grad_norm": 0.14728273451328278, + "learning_rate": 0.001, + "loss": 1.6244, + "step": 17199 + }, + { + "epoch": 0.7276419324815975, + "grad_norm": 0.17009060084819794, + "learning_rate": 0.001, + "loss": 2.5318, + "step": 17200 + }, + { + "epoch": 0.7276842372451138, + "grad_norm": 0.17915566265583038, + "learning_rate": 0.001, + "loss": 2.2632, + "step": 17201 + }, + { + "epoch": 0.7277265420086302, + "grad_norm": 0.19644324481487274, + "learning_rate": 0.001, + "loss": 1.6086, + "step": 17202 + }, + { + "epoch": 0.7277688467721466, + "grad_norm": 0.16370825469493866, + "learning_rate": 0.001, + "loss": 1.992, + "step": 17203 + }, + { + "epoch": 0.7278111515356629, + "grad_norm": 0.219704270362854, + "learning_rate": 0.001, + "loss": 3.0066, + "step": 17204 + }, + { + "epoch": 0.7278534562991793, + "grad_norm": 0.2036728709936142, + "learning_rate": 0.001, + "loss": 2.1629, + "step": 17205 + }, + { + "epoch": 0.7278957610626957, + "grad_norm": 0.5341696739196777, + "learning_rate": 0.001, + "loss": 1.8417, + "step": 17206 + }, + { + "epoch": 0.727938065826212, + "grad_norm": 0.23344549536705017, + "learning_rate": 0.001, + "loss": 2.0511, + "step": 17207 + }, + { + "epoch": 0.7279803705897284, + "grad_norm": 0.1574479639530182, + "learning_rate": 0.001, + "loss": 2.1478, + "step": 17208 + }, + { + "epoch": 0.7280226753532448, + "grad_norm": 0.14581382274627686, + "learning_rate": 0.001, + "loss": 3.0699, + "step": 17209 + }, + { + "epoch": 0.7280649801167611, + "grad_norm": 0.18210309743881226, + "learning_rate": 0.001, + "loss": 3.2913, + "step": 17210 + }, + { + "epoch": 0.7281072848802775, + "grad_norm": 0.2757953107357025, + "learning_rate": 0.001, + "loss": 2.9062, + "step": 17211 + }, + { + "epoch": 0.7281495896437938, + "grad_norm": 0.19155064225196838, + "learning_rate": 0.001, + "loss": 1.9776, + "step": 17212 + }, + { + "epoch": 0.7281918944073102, + "grad_norm": 0.17974048852920532, + "learning_rate": 0.001, + "loss": 2.187, + "step": 17213 + }, + { + "epoch": 0.7282341991708267, + "grad_norm": 0.16559724509716034, + "learning_rate": 0.001, + "loss": 1.7224, + "step": 17214 + }, + { + "epoch": 0.728276503934343, + "grad_norm": 0.1683637499809265, + "learning_rate": 0.001, + "loss": 3.5013, + "step": 17215 + }, + { + "epoch": 0.7283188086978594, + "grad_norm": 0.1559930294752121, + "learning_rate": 0.001, + "loss": 2.0094, + "step": 17216 + }, + { + "epoch": 0.7283611134613758, + "grad_norm": 0.17992942035198212, + "learning_rate": 0.001, + "loss": 2.7717, + "step": 17217 + }, + { + "epoch": 0.7284034182248921, + "grad_norm": 0.1760255992412567, + "learning_rate": 0.001, + "loss": 2.0994, + "step": 17218 + }, + { + "epoch": 0.7284457229884085, + "grad_norm": 0.15609480440616608, + "learning_rate": 0.001, + "loss": 2.3692, + "step": 17219 + }, + { + "epoch": 0.7284880277519249, + "grad_norm": 0.27221378684043884, + "learning_rate": 0.001, + "loss": 1.5019, + "step": 17220 + }, + { + "epoch": 0.7285303325154412, + "grad_norm": 0.1502402275800705, + "learning_rate": 0.001, + "loss": 2.3058, + "step": 17221 + }, + { + "epoch": 0.7285726372789576, + "grad_norm": 0.886763334274292, + "learning_rate": 0.001, + "loss": 2.2973, + "step": 17222 + }, + { + "epoch": 0.728614942042474, + "grad_norm": 0.1851658821105957, + "learning_rate": 0.001, + "loss": 1.7426, + "step": 17223 + }, + { + "epoch": 0.7286572468059903, + "grad_norm": 0.13750481605529785, + "learning_rate": 0.001, + "loss": 2.0439, + "step": 17224 + }, + { + "epoch": 0.7286995515695067, + "grad_norm": 0.3507780134677887, + "learning_rate": 0.001, + "loss": 2.1812, + "step": 17225 + }, + { + "epoch": 0.7287418563330231, + "grad_norm": 0.2409433126449585, + "learning_rate": 0.001, + "loss": 1.9943, + "step": 17226 + }, + { + "epoch": 0.7287841610965394, + "grad_norm": 1.0777748823165894, + "learning_rate": 0.001, + "loss": 1.5415, + "step": 17227 + }, + { + "epoch": 0.7288264658600558, + "grad_norm": 0.5831173062324524, + "learning_rate": 0.001, + "loss": 2.9824, + "step": 17228 + }, + { + "epoch": 0.7288687706235722, + "grad_norm": 0.16224071383476257, + "learning_rate": 0.001, + "loss": 1.6938, + "step": 17229 + }, + { + "epoch": 0.7289110753870885, + "grad_norm": 0.3064275085926056, + "learning_rate": 0.001, + "loss": 1.9604, + "step": 17230 + }, + { + "epoch": 0.728953380150605, + "grad_norm": 0.16514526307582855, + "learning_rate": 0.001, + "loss": 2.3135, + "step": 17231 + }, + { + "epoch": 0.7289956849141214, + "grad_norm": 0.23485006392002106, + "learning_rate": 0.001, + "loss": 2.1271, + "step": 17232 + }, + { + "epoch": 0.7290379896776377, + "grad_norm": 0.1845911592245102, + "learning_rate": 0.001, + "loss": 3.1486, + "step": 17233 + }, + { + "epoch": 0.7290802944411541, + "grad_norm": 0.23723876476287842, + "learning_rate": 0.001, + "loss": 3.0125, + "step": 17234 + }, + { + "epoch": 0.7291225992046705, + "grad_norm": 0.16877859830856323, + "learning_rate": 0.001, + "loss": 1.6634, + "step": 17235 + }, + { + "epoch": 0.7291649039681868, + "grad_norm": 0.1759745180606842, + "learning_rate": 0.001, + "loss": 1.8636, + "step": 17236 + }, + { + "epoch": 0.7292072087317032, + "grad_norm": 0.14648011326789856, + "learning_rate": 0.001, + "loss": 1.6105, + "step": 17237 + }, + { + "epoch": 0.7292495134952196, + "grad_norm": 0.18712373077869415, + "learning_rate": 0.001, + "loss": 2.1293, + "step": 17238 + }, + { + "epoch": 0.7292918182587359, + "grad_norm": 0.1619027853012085, + "learning_rate": 0.001, + "loss": 1.3905, + "step": 17239 + }, + { + "epoch": 0.7293341230222523, + "grad_norm": 0.16455508768558502, + "learning_rate": 0.001, + "loss": 2.3091, + "step": 17240 + }, + { + "epoch": 0.7293764277857687, + "grad_norm": 0.173019140958786, + "learning_rate": 0.001, + "loss": 1.8305, + "step": 17241 + }, + { + "epoch": 0.729418732549285, + "grad_norm": 0.14237745106220245, + "learning_rate": 0.001, + "loss": 2.0488, + "step": 17242 + }, + { + "epoch": 0.7294610373128014, + "grad_norm": 0.1654769331216812, + "learning_rate": 0.001, + "loss": 1.6978, + "step": 17243 + }, + { + "epoch": 0.7295033420763178, + "grad_norm": 0.13907228410243988, + "learning_rate": 0.001, + "loss": 1.6468, + "step": 17244 + }, + { + "epoch": 0.7295456468398341, + "grad_norm": 0.15198984742164612, + "learning_rate": 0.001, + "loss": 1.644, + "step": 17245 + }, + { + "epoch": 0.7295879516033505, + "grad_norm": 0.3324761688709259, + "learning_rate": 0.001, + "loss": 1.8737, + "step": 17246 + }, + { + "epoch": 0.729630256366867, + "grad_norm": 0.16085344552993774, + "learning_rate": 0.001, + "loss": 1.588, + "step": 17247 + }, + { + "epoch": 0.7296725611303833, + "grad_norm": 0.21202977001667023, + "learning_rate": 0.001, + "loss": 2.3259, + "step": 17248 + }, + { + "epoch": 0.7297148658938997, + "grad_norm": 0.15043148398399353, + "learning_rate": 0.001, + "loss": 1.9283, + "step": 17249 + }, + { + "epoch": 0.7297571706574161, + "grad_norm": 0.1499723643064499, + "learning_rate": 0.001, + "loss": 1.2876, + "step": 17250 + }, + { + "epoch": 0.7297994754209324, + "grad_norm": 0.18416666984558105, + "learning_rate": 0.001, + "loss": 1.9966, + "step": 17251 + }, + { + "epoch": 0.7298417801844488, + "grad_norm": 0.19342102110385895, + "learning_rate": 0.001, + "loss": 2.1563, + "step": 17252 + }, + { + "epoch": 0.7298840849479652, + "grad_norm": 0.16368161141872406, + "learning_rate": 0.001, + "loss": 2.5488, + "step": 17253 + }, + { + "epoch": 0.7299263897114815, + "grad_norm": 0.19238623976707458, + "learning_rate": 0.001, + "loss": 2.4449, + "step": 17254 + }, + { + "epoch": 0.7299686944749979, + "grad_norm": 0.1476447880268097, + "learning_rate": 0.001, + "loss": 2.1851, + "step": 17255 + }, + { + "epoch": 0.7300109992385142, + "grad_norm": 2.3451356887817383, + "learning_rate": 0.001, + "loss": 1.8425, + "step": 17256 + }, + { + "epoch": 0.7300533040020306, + "grad_norm": 0.172628253698349, + "learning_rate": 0.001, + "loss": 2.2727, + "step": 17257 + }, + { + "epoch": 0.730095608765547, + "grad_norm": 0.23679950833320618, + "learning_rate": 0.001, + "loss": 2.0103, + "step": 17258 + }, + { + "epoch": 0.7301379135290633, + "grad_norm": 0.16247250139713287, + "learning_rate": 0.001, + "loss": 1.9596, + "step": 17259 + }, + { + "epoch": 0.7301802182925797, + "grad_norm": 0.9013334512710571, + "learning_rate": 0.001, + "loss": 3.134, + "step": 17260 + }, + { + "epoch": 0.7302225230560961, + "grad_norm": 0.15641777217388153, + "learning_rate": 0.001, + "loss": 2.2646, + "step": 17261 + }, + { + "epoch": 0.7302648278196124, + "grad_norm": 0.15012650191783905, + "learning_rate": 0.001, + "loss": 1.8738, + "step": 17262 + }, + { + "epoch": 0.7303071325831288, + "grad_norm": 0.16362197697162628, + "learning_rate": 0.001, + "loss": 2.5672, + "step": 17263 + }, + { + "epoch": 0.7303494373466453, + "grad_norm": 4.925112724304199, + "learning_rate": 0.001, + "loss": 3.0045, + "step": 17264 + }, + { + "epoch": 0.7303917421101616, + "grad_norm": 0.1557965725660324, + "learning_rate": 0.001, + "loss": 1.6906, + "step": 17265 + }, + { + "epoch": 0.730434046873678, + "grad_norm": 0.1717749536037445, + "learning_rate": 0.001, + "loss": 1.9104, + "step": 17266 + }, + { + "epoch": 0.7304763516371944, + "grad_norm": 0.17836451530456543, + "learning_rate": 0.001, + "loss": 2.1333, + "step": 17267 + }, + { + "epoch": 0.7305186564007107, + "grad_norm": 0.15282116830348969, + "learning_rate": 0.001, + "loss": 1.9864, + "step": 17268 + }, + { + "epoch": 0.7305609611642271, + "grad_norm": 0.16762694716453552, + "learning_rate": 0.001, + "loss": 2.6978, + "step": 17269 + }, + { + "epoch": 0.7306032659277435, + "grad_norm": 0.3359089493751526, + "learning_rate": 0.001, + "loss": 2.244, + "step": 17270 + }, + { + "epoch": 0.7306455706912598, + "grad_norm": 0.6835538148880005, + "learning_rate": 0.001, + "loss": 3.3602, + "step": 17271 + }, + { + "epoch": 0.7306878754547762, + "grad_norm": 0.16285963356494904, + "learning_rate": 0.001, + "loss": 1.4811, + "step": 17272 + }, + { + "epoch": 0.7307301802182926, + "grad_norm": 1.3391896486282349, + "learning_rate": 0.001, + "loss": 2.6051, + "step": 17273 + }, + { + "epoch": 0.7307724849818089, + "grad_norm": 0.17385242879390717, + "learning_rate": 0.001, + "loss": 3.2361, + "step": 17274 + }, + { + "epoch": 0.7308147897453253, + "grad_norm": 5.107343673706055, + "learning_rate": 0.001, + "loss": 2.1832, + "step": 17275 + }, + { + "epoch": 0.7308570945088417, + "grad_norm": 0.14723435044288635, + "learning_rate": 0.001, + "loss": 1.8271, + "step": 17276 + }, + { + "epoch": 0.730899399272358, + "grad_norm": 0.20319636166095734, + "learning_rate": 0.001, + "loss": 2.0419, + "step": 17277 + }, + { + "epoch": 0.7309417040358744, + "grad_norm": 0.21243970096111298, + "learning_rate": 0.001, + "loss": 2.0175, + "step": 17278 + }, + { + "epoch": 0.7309840087993908, + "grad_norm": 0.16713847219944, + "learning_rate": 0.001, + "loss": 1.5741, + "step": 17279 + }, + { + "epoch": 0.7310263135629071, + "grad_norm": 0.16459450125694275, + "learning_rate": 0.001, + "loss": 2.6241, + "step": 17280 + }, + { + "epoch": 0.7310686183264236, + "grad_norm": 0.13629010319709778, + "learning_rate": 0.001, + "loss": 2.9519, + "step": 17281 + }, + { + "epoch": 0.73111092308994, + "grad_norm": 0.1602662056684494, + "learning_rate": 0.001, + "loss": 1.6162, + "step": 17282 + }, + { + "epoch": 0.7311532278534563, + "grad_norm": 0.36450186371803284, + "learning_rate": 0.001, + "loss": 1.7024, + "step": 17283 + }, + { + "epoch": 0.7311955326169727, + "grad_norm": 0.1558094173669815, + "learning_rate": 0.001, + "loss": 2.2161, + "step": 17284 + }, + { + "epoch": 0.7312378373804891, + "grad_norm": 0.161569282412529, + "learning_rate": 0.001, + "loss": 1.6652, + "step": 17285 + }, + { + "epoch": 0.7312801421440054, + "grad_norm": 0.394671767950058, + "learning_rate": 0.001, + "loss": 2.9097, + "step": 17286 + }, + { + "epoch": 0.7313224469075218, + "grad_norm": 0.1987708956003189, + "learning_rate": 0.001, + "loss": 2.3612, + "step": 17287 + }, + { + "epoch": 0.7313647516710382, + "grad_norm": 0.19308647513389587, + "learning_rate": 0.001, + "loss": 2.5338, + "step": 17288 + }, + { + "epoch": 0.7314070564345545, + "grad_norm": 0.15085269510746002, + "learning_rate": 0.001, + "loss": 2.0816, + "step": 17289 + }, + { + "epoch": 0.7314493611980709, + "grad_norm": 0.15625354647636414, + "learning_rate": 0.001, + "loss": 1.6518, + "step": 17290 + }, + { + "epoch": 0.7314916659615873, + "grad_norm": 6.045670032501221, + "learning_rate": 0.001, + "loss": 2.2589, + "step": 17291 + }, + { + "epoch": 0.7315339707251036, + "grad_norm": 1.6492812633514404, + "learning_rate": 0.001, + "loss": 2.7467, + "step": 17292 + }, + { + "epoch": 0.73157627548862, + "grad_norm": 0.18798352777957916, + "learning_rate": 0.001, + "loss": 1.8589, + "step": 17293 + }, + { + "epoch": 0.7316185802521364, + "grad_norm": 0.1604667603969574, + "learning_rate": 0.001, + "loss": 2.51, + "step": 17294 + }, + { + "epoch": 0.7316608850156527, + "grad_norm": 0.15313079953193665, + "learning_rate": 0.001, + "loss": 2.0193, + "step": 17295 + }, + { + "epoch": 0.7317031897791691, + "grad_norm": 0.28037911653518677, + "learning_rate": 0.001, + "loss": 1.8532, + "step": 17296 + }, + { + "epoch": 0.7317454945426856, + "grad_norm": 0.21967746317386627, + "learning_rate": 0.001, + "loss": 2.5508, + "step": 17297 + }, + { + "epoch": 0.7317877993062019, + "grad_norm": 0.16519500315189362, + "learning_rate": 0.001, + "loss": 1.861, + "step": 17298 + }, + { + "epoch": 0.7318301040697183, + "grad_norm": 0.22370240092277527, + "learning_rate": 0.001, + "loss": 3.3633, + "step": 17299 + }, + { + "epoch": 0.7318724088332346, + "grad_norm": 0.18574631214141846, + "learning_rate": 0.001, + "loss": 2.219, + "step": 17300 + }, + { + "epoch": 0.731914713596751, + "grad_norm": 6.258697986602783, + "learning_rate": 0.001, + "loss": 2.1701, + "step": 17301 + }, + { + "epoch": 0.7319570183602674, + "grad_norm": 0.16539275646209717, + "learning_rate": 0.001, + "loss": 1.8636, + "step": 17302 + }, + { + "epoch": 0.7319993231237837, + "grad_norm": 0.47777655720710754, + "learning_rate": 0.001, + "loss": 1.445, + "step": 17303 + }, + { + "epoch": 0.7320416278873001, + "grad_norm": 0.16214773058891296, + "learning_rate": 0.001, + "loss": 2.2202, + "step": 17304 + }, + { + "epoch": 0.7320839326508165, + "grad_norm": 0.16900905966758728, + "learning_rate": 0.001, + "loss": 2.3208, + "step": 17305 + }, + { + "epoch": 0.7321262374143328, + "grad_norm": 0.2838803231716156, + "learning_rate": 0.001, + "loss": 3.2839, + "step": 17306 + }, + { + "epoch": 0.7321685421778492, + "grad_norm": 0.19927377998828888, + "learning_rate": 0.001, + "loss": 1.9397, + "step": 17307 + }, + { + "epoch": 0.7322108469413656, + "grad_norm": 0.2335241436958313, + "learning_rate": 0.001, + "loss": 2.9701, + "step": 17308 + }, + { + "epoch": 0.7322531517048819, + "grad_norm": 0.1455473154783249, + "learning_rate": 0.001, + "loss": 2.0696, + "step": 17309 + }, + { + "epoch": 0.7322954564683983, + "grad_norm": 0.17584875226020813, + "learning_rate": 0.001, + "loss": 1.9355, + "step": 17310 + }, + { + "epoch": 0.7323377612319147, + "grad_norm": 0.1485612541437149, + "learning_rate": 0.001, + "loss": 1.7784, + "step": 17311 + }, + { + "epoch": 0.732380065995431, + "grad_norm": 0.17357254028320312, + "learning_rate": 0.001, + "loss": 2.162, + "step": 17312 + }, + { + "epoch": 0.7324223707589474, + "grad_norm": 0.322561651468277, + "learning_rate": 0.001, + "loss": 1.7913, + "step": 17313 + }, + { + "epoch": 0.7324646755224639, + "grad_norm": 0.17470252513885498, + "learning_rate": 0.001, + "loss": 1.835, + "step": 17314 + }, + { + "epoch": 0.7325069802859802, + "grad_norm": 0.1539514809846878, + "learning_rate": 0.001, + "loss": 1.8439, + "step": 17315 + }, + { + "epoch": 0.7325492850494966, + "grad_norm": 0.13950100541114807, + "learning_rate": 0.001, + "loss": 1.9242, + "step": 17316 + }, + { + "epoch": 0.732591589813013, + "grad_norm": 1.2174321413040161, + "learning_rate": 0.001, + "loss": 2.0929, + "step": 17317 + }, + { + "epoch": 0.7326338945765293, + "grad_norm": 0.6627561450004578, + "learning_rate": 0.001, + "loss": 2.2273, + "step": 17318 + }, + { + "epoch": 0.7326761993400457, + "grad_norm": 0.17448411881923676, + "learning_rate": 0.001, + "loss": 1.911, + "step": 17319 + }, + { + "epoch": 0.7327185041035621, + "grad_norm": 0.20414674282073975, + "learning_rate": 0.001, + "loss": 2.4904, + "step": 17320 + }, + { + "epoch": 0.7327608088670784, + "grad_norm": 3.205564022064209, + "learning_rate": 0.001, + "loss": 2.7201, + "step": 17321 + }, + { + "epoch": 0.7328031136305948, + "grad_norm": 0.17250360548496246, + "learning_rate": 0.001, + "loss": 2.3204, + "step": 17322 + }, + { + "epoch": 0.7328454183941112, + "grad_norm": 0.1782556027173996, + "learning_rate": 0.001, + "loss": 1.8324, + "step": 17323 + }, + { + "epoch": 0.7328877231576275, + "grad_norm": 0.15511076152324677, + "learning_rate": 0.001, + "loss": 1.9694, + "step": 17324 + }, + { + "epoch": 0.7329300279211439, + "grad_norm": 0.49949324131011963, + "learning_rate": 0.001, + "loss": 3.1143, + "step": 17325 + }, + { + "epoch": 0.7329723326846603, + "grad_norm": 0.1534297913312912, + "learning_rate": 0.001, + "loss": 1.7866, + "step": 17326 + }, + { + "epoch": 0.7330146374481766, + "grad_norm": 0.15869884192943573, + "learning_rate": 0.001, + "loss": 1.9116, + "step": 17327 + }, + { + "epoch": 0.733056942211693, + "grad_norm": 0.14846599102020264, + "learning_rate": 0.001, + "loss": 2.6805, + "step": 17328 + }, + { + "epoch": 0.7330992469752095, + "grad_norm": 0.15433016419410706, + "learning_rate": 0.001, + "loss": 2.7788, + "step": 17329 + }, + { + "epoch": 0.7331415517387257, + "grad_norm": 0.17470060288906097, + "learning_rate": 0.001, + "loss": 1.5982, + "step": 17330 + }, + { + "epoch": 0.7331838565022422, + "grad_norm": 0.1676022708415985, + "learning_rate": 0.001, + "loss": 3.3472, + "step": 17331 + }, + { + "epoch": 0.7332261612657586, + "grad_norm": 0.16653110086917877, + "learning_rate": 0.001, + "loss": 1.7236, + "step": 17332 + }, + { + "epoch": 0.7332684660292749, + "grad_norm": 0.1555767059326172, + "learning_rate": 0.001, + "loss": 2.201, + "step": 17333 + }, + { + "epoch": 0.7333107707927913, + "grad_norm": 3.2302515506744385, + "learning_rate": 0.001, + "loss": 2.3596, + "step": 17334 + }, + { + "epoch": 0.7333530755563077, + "grad_norm": 0.13147684931755066, + "learning_rate": 0.001, + "loss": 1.4347, + "step": 17335 + }, + { + "epoch": 0.733395380319824, + "grad_norm": 0.1869000345468521, + "learning_rate": 0.001, + "loss": 1.6992, + "step": 17336 + }, + { + "epoch": 0.7334376850833404, + "grad_norm": 1.2183090448379517, + "learning_rate": 0.001, + "loss": 2.0567, + "step": 17337 + }, + { + "epoch": 0.7334799898468568, + "grad_norm": 0.1941007673740387, + "learning_rate": 0.001, + "loss": 1.8982, + "step": 17338 + }, + { + "epoch": 0.7335222946103731, + "grad_norm": 0.16951008141040802, + "learning_rate": 0.001, + "loss": 2.2301, + "step": 17339 + }, + { + "epoch": 0.7335645993738895, + "grad_norm": 0.19473901391029358, + "learning_rate": 0.001, + "loss": 2.7647, + "step": 17340 + }, + { + "epoch": 0.7336069041374059, + "grad_norm": 0.22168588638305664, + "learning_rate": 0.001, + "loss": 1.8121, + "step": 17341 + }, + { + "epoch": 0.7336492089009222, + "grad_norm": 0.9966937303543091, + "learning_rate": 0.001, + "loss": 1.607, + "step": 17342 + }, + { + "epoch": 0.7336915136644386, + "grad_norm": 0.17989544570446014, + "learning_rate": 0.001, + "loss": 1.695, + "step": 17343 + }, + { + "epoch": 0.733733818427955, + "grad_norm": 0.2009655088186264, + "learning_rate": 0.001, + "loss": 2.5365, + "step": 17344 + }, + { + "epoch": 0.7337761231914713, + "grad_norm": 0.17318382859230042, + "learning_rate": 0.001, + "loss": 2.3375, + "step": 17345 + }, + { + "epoch": 0.7338184279549878, + "grad_norm": 0.5182844996452332, + "learning_rate": 0.001, + "loss": 1.7107, + "step": 17346 + }, + { + "epoch": 0.733860732718504, + "grad_norm": 0.15933315455913544, + "learning_rate": 0.001, + "loss": 2.7681, + "step": 17347 + }, + { + "epoch": 0.7339030374820205, + "grad_norm": 0.17201949656009674, + "learning_rate": 0.001, + "loss": 2.4166, + "step": 17348 + }, + { + "epoch": 0.7339453422455369, + "grad_norm": 0.15629622340202332, + "learning_rate": 0.001, + "loss": 1.4808, + "step": 17349 + }, + { + "epoch": 0.7339876470090532, + "grad_norm": 0.17875443398952484, + "learning_rate": 0.001, + "loss": 1.7621, + "step": 17350 + }, + { + "epoch": 0.7340299517725696, + "grad_norm": 0.18575279414653778, + "learning_rate": 0.001, + "loss": 2.5072, + "step": 17351 + }, + { + "epoch": 0.734072256536086, + "grad_norm": 0.17435061931610107, + "learning_rate": 0.001, + "loss": 1.8393, + "step": 17352 + }, + { + "epoch": 0.7341145612996023, + "grad_norm": 0.18890979886054993, + "learning_rate": 0.001, + "loss": 2.0328, + "step": 17353 + }, + { + "epoch": 0.7341568660631187, + "grad_norm": 0.41785550117492676, + "learning_rate": 0.001, + "loss": 2.1083, + "step": 17354 + }, + { + "epoch": 0.7341991708266351, + "grad_norm": 0.16583463549613953, + "learning_rate": 0.001, + "loss": 1.7583, + "step": 17355 + }, + { + "epoch": 0.7342414755901514, + "grad_norm": 0.19213053584098816, + "learning_rate": 0.001, + "loss": 1.8698, + "step": 17356 + }, + { + "epoch": 0.7342837803536678, + "grad_norm": 0.7036476731300354, + "learning_rate": 0.001, + "loss": 2.8578, + "step": 17357 + }, + { + "epoch": 0.7343260851171842, + "grad_norm": 0.48419860005378723, + "learning_rate": 0.001, + "loss": 2.4056, + "step": 17358 + }, + { + "epoch": 0.7343683898807005, + "grad_norm": 0.16891148686408997, + "learning_rate": 0.001, + "loss": 2.6222, + "step": 17359 + }, + { + "epoch": 0.7344106946442169, + "grad_norm": 0.19437286257743835, + "learning_rate": 0.001, + "loss": 1.9693, + "step": 17360 + }, + { + "epoch": 0.7344529994077333, + "grad_norm": 0.1752423644065857, + "learning_rate": 0.001, + "loss": 2.8185, + "step": 17361 + }, + { + "epoch": 0.7344953041712496, + "grad_norm": 0.19541260600090027, + "learning_rate": 0.001, + "loss": 2.6349, + "step": 17362 + }, + { + "epoch": 0.734537608934766, + "grad_norm": 0.19351324439048767, + "learning_rate": 0.001, + "loss": 2.8542, + "step": 17363 + }, + { + "epoch": 0.7345799136982825, + "grad_norm": 0.17461591958999634, + "learning_rate": 0.001, + "loss": 2.2905, + "step": 17364 + }, + { + "epoch": 0.7346222184617988, + "grad_norm": 0.16678562760353088, + "learning_rate": 0.001, + "loss": 1.2962, + "step": 17365 + }, + { + "epoch": 0.7346645232253152, + "grad_norm": 0.16038046777248383, + "learning_rate": 0.001, + "loss": 1.9527, + "step": 17366 + }, + { + "epoch": 0.7347068279888316, + "grad_norm": 0.1805296242237091, + "learning_rate": 0.001, + "loss": 3.3531, + "step": 17367 + }, + { + "epoch": 0.7347491327523479, + "grad_norm": 0.8957823514938354, + "learning_rate": 0.001, + "loss": 1.7126, + "step": 17368 + }, + { + "epoch": 0.7347914375158643, + "grad_norm": 0.1552397757768631, + "learning_rate": 0.001, + "loss": 1.6727, + "step": 17369 + }, + { + "epoch": 0.7348337422793807, + "grad_norm": 0.2678411900997162, + "learning_rate": 0.001, + "loss": 1.7442, + "step": 17370 + }, + { + "epoch": 0.734876047042897, + "grad_norm": 2.7068116664886475, + "learning_rate": 0.001, + "loss": 2.5148, + "step": 17371 + }, + { + "epoch": 0.7349183518064134, + "grad_norm": 0.18146149814128876, + "learning_rate": 0.001, + "loss": 2.6581, + "step": 17372 + }, + { + "epoch": 0.7349606565699298, + "grad_norm": 0.18075819313526154, + "learning_rate": 0.001, + "loss": 1.3142, + "step": 17373 + }, + { + "epoch": 0.7350029613334461, + "grad_norm": 0.2121601551771164, + "learning_rate": 0.001, + "loss": 1.9923, + "step": 17374 + }, + { + "epoch": 0.7350452660969625, + "grad_norm": 0.15848374366760254, + "learning_rate": 0.001, + "loss": 1.6758, + "step": 17375 + }, + { + "epoch": 0.7350875708604789, + "grad_norm": 0.17913897335529327, + "learning_rate": 0.001, + "loss": 2.6544, + "step": 17376 + }, + { + "epoch": 0.7351298756239952, + "grad_norm": 0.6060518026351929, + "learning_rate": 0.001, + "loss": 2.1991, + "step": 17377 + }, + { + "epoch": 0.7351721803875116, + "grad_norm": 0.16152165830135345, + "learning_rate": 0.001, + "loss": 1.9824, + "step": 17378 + }, + { + "epoch": 0.735214485151028, + "grad_norm": 0.21107393503189087, + "learning_rate": 0.001, + "loss": 2.0665, + "step": 17379 + }, + { + "epoch": 0.7352567899145444, + "grad_norm": 0.26977473497390747, + "learning_rate": 0.001, + "loss": 2.0408, + "step": 17380 + }, + { + "epoch": 0.7352990946780608, + "grad_norm": 0.18134251236915588, + "learning_rate": 0.001, + "loss": 2.6027, + "step": 17381 + }, + { + "epoch": 0.7353413994415772, + "grad_norm": 0.1814247965812683, + "learning_rate": 0.001, + "loss": 2.2769, + "step": 17382 + }, + { + "epoch": 0.7353837042050935, + "grad_norm": 0.1722179353237152, + "learning_rate": 0.001, + "loss": 2.6387, + "step": 17383 + }, + { + "epoch": 0.7354260089686099, + "grad_norm": 0.6016577482223511, + "learning_rate": 0.001, + "loss": 3.3811, + "step": 17384 + }, + { + "epoch": 0.7354683137321263, + "grad_norm": 0.16259095072746277, + "learning_rate": 0.001, + "loss": 2.2258, + "step": 17385 + }, + { + "epoch": 0.7355106184956426, + "grad_norm": 74.26570892333984, + "learning_rate": 0.001, + "loss": 1.5017, + "step": 17386 + }, + { + "epoch": 0.735552923259159, + "grad_norm": 0.2374870926141739, + "learning_rate": 0.001, + "loss": 2.5826, + "step": 17387 + }, + { + "epoch": 0.7355952280226754, + "grad_norm": 0.15083283185958862, + "learning_rate": 0.001, + "loss": 1.9046, + "step": 17388 + }, + { + "epoch": 0.7356375327861917, + "grad_norm": 0.2028711885213852, + "learning_rate": 0.001, + "loss": 2.3847, + "step": 17389 + }, + { + "epoch": 0.7356798375497081, + "grad_norm": 0.20516479015350342, + "learning_rate": 0.001, + "loss": 2.4515, + "step": 17390 + }, + { + "epoch": 0.7357221423132244, + "grad_norm": 0.2230457216501236, + "learning_rate": 0.001, + "loss": 2.0254, + "step": 17391 + }, + { + "epoch": 0.7357644470767408, + "grad_norm": 0.18468298017978668, + "learning_rate": 0.001, + "loss": 3.509, + "step": 17392 + }, + { + "epoch": 0.7358067518402572, + "grad_norm": 0.4097122848033905, + "learning_rate": 0.001, + "loss": 2.2616, + "step": 17393 + }, + { + "epoch": 0.7358490566037735, + "grad_norm": 0.17995667457580566, + "learning_rate": 0.001, + "loss": 2.5077, + "step": 17394 + }, + { + "epoch": 0.7358913613672899, + "grad_norm": 0.5382487177848816, + "learning_rate": 0.001, + "loss": 2.278, + "step": 17395 + }, + { + "epoch": 0.7359336661308064, + "grad_norm": 7.614555358886719, + "learning_rate": 0.001, + "loss": 2.148, + "step": 17396 + }, + { + "epoch": 0.7359759708943227, + "grad_norm": 0.22902828454971313, + "learning_rate": 0.001, + "loss": 3.9575, + "step": 17397 + }, + { + "epoch": 0.7360182756578391, + "grad_norm": 0.193025603890419, + "learning_rate": 0.001, + "loss": 2.3315, + "step": 17398 + }, + { + "epoch": 0.7360605804213555, + "grad_norm": 0.164814755320549, + "learning_rate": 0.001, + "loss": 3.175, + "step": 17399 + }, + { + "epoch": 0.7361028851848718, + "grad_norm": 0.16597630083560944, + "learning_rate": 0.001, + "loss": 2.1436, + "step": 17400 + }, + { + "epoch": 0.7361451899483882, + "grad_norm": 0.264950692653656, + "learning_rate": 0.001, + "loss": 2.5343, + "step": 17401 + }, + { + "epoch": 0.7361874947119046, + "grad_norm": 0.23448620736598969, + "learning_rate": 0.001, + "loss": 2.1109, + "step": 17402 + }, + { + "epoch": 0.7362297994754209, + "grad_norm": 1.8346327543258667, + "learning_rate": 0.001, + "loss": 2.1264, + "step": 17403 + }, + { + "epoch": 0.7362721042389373, + "grad_norm": 0.3137005865573883, + "learning_rate": 0.001, + "loss": 3.8748, + "step": 17404 + }, + { + "epoch": 0.7363144090024537, + "grad_norm": 0.15299704670906067, + "learning_rate": 0.001, + "loss": 1.8221, + "step": 17405 + }, + { + "epoch": 0.73635671376597, + "grad_norm": 0.3703068494796753, + "learning_rate": 0.001, + "loss": 2.0435, + "step": 17406 + }, + { + "epoch": 0.7363990185294864, + "grad_norm": 0.15267716348171234, + "learning_rate": 0.001, + "loss": 2.9499, + "step": 17407 + }, + { + "epoch": 0.7364413232930028, + "grad_norm": 0.2238219976425171, + "learning_rate": 0.001, + "loss": 1.8026, + "step": 17408 + }, + { + "epoch": 0.7364836280565191, + "grad_norm": 0.22296573221683502, + "learning_rate": 0.001, + "loss": 2.8872, + "step": 17409 + }, + { + "epoch": 0.7365259328200355, + "grad_norm": 0.3025241494178772, + "learning_rate": 0.001, + "loss": 3.631, + "step": 17410 + }, + { + "epoch": 0.7365682375835519, + "grad_norm": 0.19598890841007233, + "learning_rate": 0.001, + "loss": 3.2347, + "step": 17411 + }, + { + "epoch": 0.7366105423470682, + "grad_norm": 0.15584322810173035, + "learning_rate": 0.001, + "loss": 2.8041, + "step": 17412 + }, + { + "epoch": 0.7366528471105847, + "grad_norm": 10.525110244750977, + "learning_rate": 0.001, + "loss": 2.9718, + "step": 17413 + }, + { + "epoch": 0.7366951518741011, + "grad_norm": 0.19018413126468658, + "learning_rate": 0.001, + "loss": 1.9068, + "step": 17414 + }, + { + "epoch": 0.7367374566376174, + "grad_norm": 0.1911727637052536, + "learning_rate": 0.001, + "loss": 2.2898, + "step": 17415 + }, + { + "epoch": 0.7367797614011338, + "grad_norm": 0.2422974407672882, + "learning_rate": 0.001, + "loss": 1.8923, + "step": 17416 + }, + { + "epoch": 0.7368220661646502, + "grad_norm": 0.18192513287067413, + "learning_rate": 0.001, + "loss": 3.3953, + "step": 17417 + }, + { + "epoch": 0.7368643709281665, + "grad_norm": 1.0662686824798584, + "learning_rate": 0.001, + "loss": 3.1413, + "step": 17418 + }, + { + "epoch": 0.7369066756916829, + "grad_norm": 0.20433665812015533, + "learning_rate": 0.001, + "loss": 2.1175, + "step": 17419 + }, + { + "epoch": 0.7369489804551993, + "grad_norm": 0.2949641942977905, + "learning_rate": 0.001, + "loss": 1.5695, + "step": 17420 + }, + { + "epoch": 0.7369912852187156, + "grad_norm": 0.21626880764961243, + "learning_rate": 0.001, + "loss": 2.4829, + "step": 17421 + }, + { + "epoch": 0.737033589982232, + "grad_norm": 0.18955782055854797, + "learning_rate": 0.001, + "loss": 2.3706, + "step": 17422 + }, + { + "epoch": 0.7370758947457484, + "grad_norm": 0.19381113350391388, + "learning_rate": 0.001, + "loss": 1.5407, + "step": 17423 + }, + { + "epoch": 0.7371181995092647, + "grad_norm": 0.5135764479637146, + "learning_rate": 0.001, + "loss": 2.4144, + "step": 17424 + }, + { + "epoch": 0.7371605042727811, + "grad_norm": 0.15292295813560486, + "learning_rate": 0.001, + "loss": 2.3463, + "step": 17425 + }, + { + "epoch": 0.7372028090362975, + "grad_norm": 5.058320045471191, + "learning_rate": 0.001, + "loss": 1.8338, + "step": 17426 + }, + { + "epoch": 0.7372451137998138, + "grad_norm": 0.22380977869033813, + "learning_rate": 0.001, + "loss": 2.2025, + "step": 17427 + }, + { + "epoch": 0.7372874185633302, + "grad_norm": 0.17026227712631226, + "learning_rate": 0.001, + "loss": 2.0961, + "step": 17428 + }, + { + "epoch": 0.7373297233268467, + "grad_norm": 0.1634519249200821, + "learning_rate": 0.001, + "loss": 3.1614, + "step": 17429 + }, + { + "epoch": 0.737372028090363, + "grad_norm": 0.17133593559265137, + "learning_rate": 0.001, + "loss": 2.1654, + "step": 17430 + }, + { + "epoch": 0.7374143328538794, + "grad_norm": 2.215588092803955, + "learning_rate": 0.001, + "loss": 1.7608, + "step": 17431 + }, + { + "epoch": 0.7374566376173958, + "grad_norm": 3.688143730163574, + "learning_rate": 0.001, + "loss": 2.6195, + "step": 17432 + }, + { + "epoch": 0.7374989423809121, + "grad_norm": 0.18635474145412445, + "learning_rate": 0.001, + "loss": 2.0476, + "step": 17433 + }, + { + "epoch": 0.7375412471444285, + "grad_norm": 0.17662550508975983, + "learning_rate": 0.001, + "loss": 2.4137, + "step": 17434 + }, + { + "epoch": 0.7375835519079449, + "grad_norm": 0.20594938099384308, + "learning_rate": 0.001, + "loss": 2.4498, + "step": 17435 + }, + { + "epoch": 0.7376258566714612, + "grad_norm": 0.33524230122566223, + "learning_rate": 0.001, + "loss": 2.8723, + "step": 17436 + }, + { + "epoch": 0.7376681614349776, + "grad_norm": 0.2059050351381302, + "learning_rate": 0.001, + "loss": 2.2312, + "step": 17437 + }, + { + "epoch": 0.7377104661984939, + "grad_norm": 0.2377101182937622, + "learning_rate": 0.001, + "loss": 2.8656, + "step": 17438 + }, + { + "epoch": 0.7377527709620103, + "grad_norm": 0.18826983869075775, + "learning_rate": 0.001, + "loss": 1.5286, + "step": 17439 + }, + { + "epoch": 0.7377950757255267, + "grad_norm": 0.16893354058265686, + "learning_rate": 0.001, + "loss": 1.7591, + "step": 17440 + }, + { + "epoch": 0.737837380489043, + "grad_norm": 0.20331987738609314, + "learning_rate": 0.001, + "loss": 1.9848, + "step": 17441 + }, + { + "epoch": 0.7378796852525594, + "grad_norm": 0.291287362575531, + "learning_rate": 0.001, + "loss": 2.1354, + "step": 17442 + }, + { + "epoch": 0.7379219900160758, + "grad_norm": 0.17716658115386963, + "learning_rate": 0.001, + "loss": 2.696, + "step": 17443 + }, + { + "epoch": 0.7379642947795921, + "grad_norm": 0.18112552165985107, + "learning_rate": 0.001, + "loss": 2.3412, + "step": 17444 + }, + { + "epoch": 0.7380065995431085, + "grad_norm": 0.2917419672012329, + "learning_rate": 0.001, + "loss": 1.7166, + "step": 17445 + }, + { + "epoch": 0.738048904306625, + "grad_norm": 0.17572566866874695, + "learning_rate": 0.001, + "loss": 1.9647, + "step": 17446 + }, + { + "epoch": 0.7380912090701413, + "grad_norm": 0.27557796239852905, + "learning_rate": 0.001, + "loss": 2.2187, + "step": 17447 + }, + { + "epoch": 0.7381335138336577, + "grad_norm": 10.137809753417969, + "learning_rate": 0.001, + "loss": 1.7207, + "step": 17448 + }, + { + "epoch": 0.7381758185971741, + "grad_norm": 0.20233531296253204, + "learning_rate": 0.001, + "loss": 1.4421, + "step": 17449 + }, + { + "epoch": 0.7382181233606904, + "grad_norm": 0.2746541202068329, + "learning_rate": 0.001, + "loss": 1.8292, + "step": 17450 + }, + { + "epoch": 0.7382604281242068, + "grad_norm": 0.2038559764623642, + "learning_rate": 0.001, + "loss": 2.6724, + "step": 17451 + }, + { + "epoch": 0.7383027328877232, + "grad_norm": 0.22853845357894897, + "learning_rate": 0.001, + "loss": 2.7177, + "step": 17452 + }, + { + "epoch": 0.7383450376512395, + "grad_norm": 0.24598032236099243, + "learning_rate": 0.001, + "loss": 2.0161, + "step": 17453 + }, + { + "epoch": 0.7383873424147559, + "grad_norm": 0.24328875541687012, + "learning_rate": 0.001, + "loss": 3.1901, + "step": 17454 + }, + { + "epoch": 0.7384296471782723, + "grad_norm": 0.18322333693504333, + "learning_rate": 0.001, + "loss": 1.7166, + "step": 17455 + }, + { + "epoch": 0.7384719519417886, + "grad_norm": 0.2122056633234024, + "learning_rate": 0.001, + "loss": 2.1863, + "step": 17456 + }, + { + "epoch": 0.738514256705305, + "grad_norm": 0.22805139422416687, + "learning_rate": 0.001, + "loss": 2.4584, + "step": 17457 + }, + { + "epoch": 0.7385565614688214, + "grad_norm": 2.5409958362579346, + "learning_rate": 0.001, + "loss": 2.2642, + "step": 17458 + }, + { + "epoch": 0.7385988662323377, + "grad_norm": 0.2093057632446289, + "learning_rate": 0.001, + "loss": 1.5797, + "step": 17459 + }, + { + "epoch": 0.7386411709958541, + "grad_norm": 0.1915285587310791, + "learning_rate": 0.001, + "loss": 2.1698, + "step": 17460 + }, + { + "epoch": 0.7386834757593705, + "grad_norm": 1.6814018487930298, + "learning_rate": 0.001, + "loss": 1.5954, + "step": 17461 + }, + { + "epoch": 0.7387257805228868, + "grad_norm": 0.19684362411499023, + "learning_rate": 0.001, + "loss": 1.8076, + "step": 17462 + }, + { + "epoch": 0.7387680852864033, + "grad_norm": 0.18094508349895477, + "learning_rate": 0.001, + "loss": 1.9718, + "step": 17463 + }, + { + "epoch": 0.7388103900499197, + "grad_norm": 1.161989450454712, + "learning_rate": 0.001, + "loss": 2.7361, + "step": 17464 + }, + { + "epoch": 0.738852694813436, + "grad_norm": 0.21886514127254486, + "learning_rate": 0.001, + "loss": 2.6973, + "step": 17465 + }, + { + "epoch": 0.7388949995769524, + "grad_norm": 0.201472669839859, + "learning_rate": 0.001, + "loss": 2.3263, + "step": 17466 + }, + { + "epoch": 0.7389373043404688, + "grad_norm": 0.17126241326332092, + "learning_rate": 0.001, + "loss": 2.0101, + "step": 17467 + }, + { + "epoch": 0.7389796091039851, + "grad_norm": 0.18553845584392548, + "learning_rate": 0.001, + "loss": 1.9736, + "step": 17468 + }, + { + "epoch": 0.7390219138675015, + "grad_norm": 0.17792284488677979, + "learning_rate": 0.001, + "loss": 2.87, + "step": 17469 + }, + { + "epoch": 0.7390642186310179, + "grad_norm": 0.14445818960666656, + "learning_rate": 0.001, + "loss": 1.7331, + "step": 17470 + }, + { + "epoch": 0.7391065233945342, + "grad_norm": 0.15313506126403809, + "learning_rate": 0.001, + "loss": 1.9839, + "step": 17471 + }, + { + "epoch": 0.7391488281580506, + "grad_norm": 0.17701950669288635, + "learning_rate": 0.001, + "loss": 2.0847, + "step": 17472 + }, + { + "epoch": 0.739191132921567, + "grad_norm": 0.1592811495065689, + "learning_rate": 0.001, + "loss": 2.9718, + "step": 17473 + }, + { + "epoch": 0.7392334376850833, + "grad_norm": 0.1532157063484192, + "learning_rate": 0.001, + "loss": 2.1228, + "step": 17474 + }, + { + "epoch": 0.7392757424485997, + "grad_norm": 0.1617754101753235, + "learning_rate": 0.001, + "loss": 2.5941, + "step": 17475 + }, + { + "epoch": 0.7393180472121161, + "grad_norm": 0.1594044715166092, + "learning_rate": 0.001, + "loss": 2.0657, + "step": 17476 + }, + { + "epoch": 0.7393603519756324, + "grad_norm": 0.18134772777557373, + "learning_rate": 0.001, + "loss": 1.7252, + "step": 17477 + }, + { + "epoch": 0.7394026567391488, + "grad_norm": 0.14515188336372375, + "learning_rate": 0.001, + "loss": 1.3929, + "step": 17478 + }, + { + "epoch": 0.7394449615026653, + "grad_norm": 0.2504208981990814, + "learning_rate": 0.001, + "loss": 2.7328, + "step": 17479 + }, + { + "epoch": 0.7394872662661816, + "grad_norm": 0.24100682139396667, + "learning_rate": 0.001, + "loss": 1.7722, + "step": 17480 + }, + { + "epoch": 0.739529571029698, + "grad_norm": 0.19070883095264435, + "learning_rate": 0.001, + "loss": 1.6493, + "step": 17481 + }, + { + "epoch": 0.7395718757932143, + "grad_norm": 0.2634289264678955, + "learning_rate": 0.001, + "loss": 2.8336, + "step": 17482 + }, + { + "epoch": 0.7396141805567307, + "grad_norm": 0.1580793410539627, + "learning_rate": 0.001, + "loss": 2.444, + "step": 17483 + }, + { + "epoch": 0.7396564853202471, + "grad_norm": 0.17331601679325104, + "learning_rate": 0.001, + "loss": 3.8579, + "step": 17484 + }, + { + "epoch": 0.7396987900837634, + "grad_norm": 0.13026723265647888, + "learning_rate": 0.001, + "loss": 2.681, + "step": 17485 + }, + { + "epoch": 0.7397410948472798, + "grad_norm": 0.16975976526737213, + "learning_rate": 0.001, + "loss": 1.8355, + "step": 17486 + }, + { + "epoch": 0.7397833996107962, + "grad_norm": 0.1425335556268692, + "learning_rate": 0.001, + "loss": 2.1688, + "step": 17487 + }, + { + "epoch": 0.7398257043743125, + "grad_norm": 0.16109095513820648, + "learning_rate": 0.001, + "loss": 2.1205, + "step": 17488 + }, + { + "epoch": 0.7398680091378289, + "grad_norm": 0.14170965552330017, + "learning_rate": 0.001, + "loss": 1.7205, + "step": 17489 + }, + { + "epoch": 0.7399103139013453, + "grad_norm": 0.1703367382287979, + "learning_rate": 0.001, + "loss": 2.2542, + "step": 17490 + }, + { + "epoch": 0.7399526186648616, + "grad_norm": 0.18292082846164703, + "learning_rate": 0.001, + "loss": 2.0744, + "step": 17491 + }, + { + "epoch": 0.739994923428378, + "grad_norm": 0.15937143564224243, + "learning_rate": 0.001, + "loss": 2.5822, + "step": 17492 + }, + { + "epoch": 0.7400372281918944, + "grad_norm": 0.14507554471492767, + "learning_rate": 0.001, + "loss": 1.8052, + "step": 17493 + }, + { + "epoch": 0.7400795329554107, + "grad_norm": 0.4141727387905121, + "learning_rate": 0.001, + "loss": 2.2241, + "step": 17494 + }, + { + "epoch": 0.7401218377189271, + "grad_norm": 0.14581911265850067, + "learning_rate": 0.001, + "loss": 1.9695, + "step": 17495 + }, + { + "epoch": 0.7401641424824436, + "grad_norm": 0.1397586613893509, + "learning_rate": 0.001, + "loss": 2.4452, + "step": 17496 + }, + { + "epoch": 0.7402064472459599, + "grad_norm": 0.21478283405303955, + "learning_rate": 0.001, + "loss": 2.4327, + "step": 17497 + }, + { + "epoch": 0.7402487520094763, + "grad_norm": 0.16876371204853058, + "learning_rate": 0.001, + "loss": 2.7402, + "step": 17498 + }, + { + "epoch": 0.7402910567729927, + "grad_norm": 0.3392346203327179, + "learning_rate": 0.001, + "loss": 2.3049, + "step": 17499 + }, + { + "epoch": 0.740333361536509, + "grad_norm": 0.1360718309879303, + "learning_rate": 0.001, + "loss": 2.0013, + "step": 17500 + }, + { + "epoch": 0.7403756663000254, + "grad_norm": 0.14087393879890442, + "learning_rate": 0.001, + "loss": 1.7843, + "step": 17501 + }, + { + "epoch": 0.7404179710635418, + "grad_norm": 0.15217366814613342, + "learning_rate": 0.001, + "loss": 3.6722, + "step": 17502 + }, + { + "epoch": 0.7404602758270581, + "grad_norm": 0.17477324604988098, + "learning_rate": 0.001, + "loss": 2.2989, + "step": 17503 + }, + { + "epoch": 0.7405025805905745, + "grad_norm": 16.218931198120117, + "learning_rate": 0.001, + "loss": 1.676, + "step": 17504 + }, + { + "epoch": 0.7405448853540909, + "grad_norm": 0.1638619303703308, + "learning_rate": 0.001, + "loss": 2.3842, + "step": 17505 + }, + { + "epoch": 0.7405871901176072, + "grad_norm": 31.428306579589844, + "learning_rate": 0.001, + "loss": 3.2131, + "step": 17506 + }, + { + "epoch": 0.7406294948811236, + "grad_norm": 0.1608552634716034, + "learning_rate": 0.001, + "loss": 1.9849, + "step": 17507 + }, + { + "epoch": 0.74067179964464, + "grad_norm": 0.20988480746746063, + "learning_rate": 0.001, + "loss": 2.2174, + "step": 17508 + }, + { + "epoch": 0.7407141044081563, + "grad_norm": 3.444148302078247, + "learning_rate": 0.001, + "loss": 2.4545, + "step": 17509 + }, + { + "epoch": 0.7407564091716727, + "grad_norm": 0.17780986428260803, + "learning_rate": 0.001, + "loss": 2.6699, + "step": 17510 + }, + { + "epoch": 0.7407987139351891, + "grad_norm": 0.19687338173389435, + "learning_rate": 0.001, + "loss": 2.1274, + "step": 17511 + }, + { + "epoch": 0.7408410186987054, + "grad_norm": 0.3244110643863678, + "learning_rate": 0.001, + "loss": 1.981, + "step": 17512 + }, + { + "epoch": 0.7408833234622219, + "grad_norm": 0.19059693813323975, + "learning_rate": 0.001, + "loss": 2.4835, + "step": 17513 + }, + { + "epoch": 0.7409256282257383, + "grad_norm": 0.2618594169616699, + "learning_rate": 0.001, + "loss": 3.4605, + "step": 17514 + }, + { + "epoch": 0.7409679329892546, + "grad_norm": 0.4329042136669159, + "learning_rate": 0.001, + "loss": 2.3207, + "step": 17515 + }, + { + "epoch": 0.741010237752771, + "grad_norm": 0.45828118920326233, + "learning_rate": 0.001, + "loss": 2.6254, + "step": 17516 + }, + { + "epoch": 0.7410525425162874, + "grad_norm": 0.17340798676013947, + "learning_rate": 0.001, + "loss": 3.2398, + "step": 17517 + }, + { + "epoch": 0.7410948472798037, + "grad_norm": 0.1793540120124817, + "learning_rate": 0.001, + "loss": 2.3224, + "step": 17518 + }, + { + "epoch": 0.7411371520433201, + "grad_norm": 0.1443507820367813, + "learning_rate": 0.001, + "loss": 1.4805, + "step": 17519 + }, + { + "epoch": 0.7411794568068365, + "grad_norm": 0.1604100912809372, + "learning_rate": 0.001, + "loss": 1.6807, + "step": 17520 + }, + { + "epoch": 0.7412217615703528, + "grad_norm": 0.17397022247314453, + "learning_rate": 0.001, + "loss": 1.6518, + "step": 17521 + }, + { + "epoch": 0.7412640663338692, + "grad_norm": 0.49950161576271057, + "learning_rate": 0.001, + "loss": 2.6769, + "step": 17522 + }, + { + "epoch": 0.7413063710973856, + "grad_norm": 0.15705753862857819, + "learning_rate": 0.001, + "loss": 1.4069, + "step": 17523 + }, + { + "epoch": 0.7413486758609019, + "grad_norm": 0.14743894338607788, + "learning_rate": 0.001, + "loss": 2.8272, + "step": 17524 + }, + { + "epoch": 0.7413909806244183, + "grad_norm": 0.1543998420238495, + "learning_rate": 0.001, + "loss": 3.0686, + "step": 17525 + }, + { + "epoch": 0.7414332853879346, + "grad_norm": 0.16085979342460632, + "learning_rate": 0.001, + "loss": 1.5601, + "step": 17526 + }, + { + "epoch": 0.741475590151451, + "grad_norm": 0.22067369520664215, + "learning_rate": 0.001, + "loss": 3.7326, + "step": 17527 + }, + { + "epoch": 0.7415178949149674, + "grad_norm": 0.17484331130981445, + "learning_rate": 0.001, + "loss": 2.4677, + "step": 17528 + }, + { + "epoch": 0.7415601996784837, + "grad_norm": 0.3703678548336029, + "learning_rate": 0.001, + "loss": 2.2413, + "step": 17529 + }, + { + "epoch": 0.7416025044420002, + "grad_norm": 0.2962591350078583, + "learning_rate": 0.001, + "loss": 1.7069, + "step": 17530 + }, + { + "epoch": 0.7416448092055166, + "grad_norm": 0.656477689743042, + "learning_rate": 0.001, + "loss": 2.4666, + "step": 17531 + }, + { + "epoch": 0.7416871139690329, + "grad_norm": 0.36826154589653015, + "learning_rate": 0.001, + "loss": 1.8836, + "step": 17532 + }, + { + "epoch": 0.7417294187325493, + "grad_norm": 0.17669114470481873, + "learning_rate": 0.001, + "loss": 1.9258, + "step": 17533 + }, + { + "epoch": 0.7417717234960657, + "grad_norm": 0.14634403586387634, + "learning_rate": 0.001, + "loss": 1.8056, + "step": 17534 + }, + { + "epoch": 0.741814028259582, + "grad_norm": 0.15235856175422668, + "learning_rate": 0.001, + "loss": 3.3709, + "step": 17535 + }, + { + "epoch": 0.7418563330230984, + "grad_norm": 0.1723131686449051, + "learning_rate": 0.001, + "loss": 2.7196, + "step": 17536 + }, + { + "epoch": 0.7418986377866148, + "grad_norm": 0.1826736032962799, + "learning_rate": 0.001, + "loss": 2.494, + "step": 17537 + }, + { + "epoch": 0.7419409425501311, + "grad_norm": 0.16118574142456055, + "learning_rate": 0.001, + "loss": 2.9765, + "step": 17538 + }, + { + "epoch": 0.7419832473136475, + "grad_norm": 0.4343286454677582, + "learning_rate": 0.001, + "loss": 1.7836, + "step": 17539 + }, + { + "epoch": 0.7420255520771639, + "grad_norm": 1.067611575126648, + "learning_rate": 0.001, + "loss": 2.923, + "step": 17540 + }, + { + "epoch": 0.7420678568406802, + "grad_norm": 0.178229421377182, + "learning_rate": 0.001, + "loss": 2.2143, + "step": 17541 + }, + { + "epoch": 0.7421101616041966, + "grad_norm": 0.15373486280441284, + "learning_rate": 0.001, + "loss": 1.7178, + "step": 17542 + }, + { + "epoch": 0.742152466367713, + "grad_norm": 0.16188517212867737, + "learning_rate": 0.001, + "loss": 2.612, + "step": 17543 + }, + { + "epoch": 0.7421947711312293, + "grad_norm": 0.1258399486541748, + "learning_rate": 0.001, + "loss": 1.7355, + "step": 17544 + }, + { + "epoch": 0.7422370758947457, + "grad_norm": 0.14767125248908997, + "learning_rate": 0.001, + "loss": 1.7827, + "step": 17545 + }, + { + "epoch": 0.7422793806582622, + "grad_norm": 0.8984226584434509, + "learning_rate": 0.001, + "loss": 2.3823, + "step": 17546 + }, + { + "epoch": 0.7423216854217785, + "grad_norm": 0.12772268056869507, + "learning_rate": 0.001, + "loss": 2.3786, + "step": 17547 + }, + { + "epoch": 0.7423639901852949, + "grad_norm": 0.14280395209789276, + "learning_rate": 0.001, + "loss": 2.6982, + "step": 17548 + }, + { + "epoch": 0.7424062949488113, + "grad_norm": 0.15735042095184326, + "learning_rate": 0.001, + "loss": 2.907, + "step": 17549 + }, + { + "epoch": 0.7424485997123276, + "grad_norm": 0.22708289325237274, + "learning_rate": 0.001, + "loss": 2.9517, + "step": 17550 + }, + { + "epoch": 0.742490904475844, + "grad_norm": 0.5488585233688354, + "learning_rate": 0.001, + "loss": 3.046, + "step": 17551 + }, + { + "epoch": 0.7425332092393604, + "grad_norm": 0.15782161056995392, + "learning_rate": 0.001, + "loss": 2.2142, + "step": 17552 + }, + { + "epoch": 0.7425755140028767, + "grad_norm": 0.147671177983284, + "learning_rate": 0.001, + "loss": 2.2613, + "step": 17553 + }, + { + "epoch": 0.7426178187663931, + "grad_norm": 0.16425222158432007, + "learning_rate": 0.001, + "loss": 1.8327, + "step": 17554 + }, + { + "epoch": 0.7426601235299095, + "grad_norm": 0.15929697453975677, + "learning_rate": 0.001, + "loss": 2.6455, + "step": 17555 + }, + { + "epoch": 0.7427024282934258, + "grad_norm": 0.16624753177165985, + "learning_rate": 0.001, + "loss": 1.5187, + "step": 17556 + }, + { + "epoch": 0.7427447330569422, + "grad_norm": 0.36340129375457764, + "learning_rate": 0.001, + "loss": 2.103, + "step": 17557 + }, + { + "epoch": 0.7427870378204586, + "grad_norm": 0.20203012228012085, + "learning_rate": 0.001, + "loss": 3.6063, + "step": 17558 + }, + { + "epoch": 0.7428293425839749, + "grad_norm": 0.2196560502052307, + "learning_rate": 0.001, + "loss": 2.2121, + "step": 17559 + }, + { + "epoch": 0.7428716473474913, + "grad_norm": 0.16001419723033905, + "learning_rate": 0.001, + "loss": 2.8535, + "step": 17560 + }, + { + "epoch": 0.7429139521110077, + "grad_norm": 0.2432798594236374, + "learning_rate": 0.001, + "loss": 2.1841, + "step": 17561 + }, + { + "epoch": 0.742956256874524, + "grad_norm": 0.1755453497171402, + "learning_rate": 0.001, + "loss": 2.4195, + "step": 17562 + }, + { + "epoch": 0.7429985616380405, + "grad_norm": 0.16746820509433746, + "learning_rate": 0.001, + "loss": 1.645, + "step": 17563 + }, + { + "epoch": 0.7430408664015569, + "grad_norm": 0.1702737808227539, + "learning_rate": 0.001, + "loss": 2.0672, + "step": 17564 + }, + { + "epoch": 0.7430831711650732, + "grad_norm": 0.15115857124328613, + "learning_rate": 0.001, + "loss": 1.7634, + "step": 17565 + }, + { + "epoch": 0.7431254759285896, + "grad_norm": 0.192986860871315, + "learning_rate": 0.001, + "loss": 2.5294, + "step": 17566 + }, + { + "epoch": 0.743167780692106, + "grad_norm": 0.1629578322172165, + "learning_rate": 0.001, + "loss": 1.551, + "step": 17567 + }, + { + "epoch": 0.7432100854556223, + "grad_norm": 2.344882011413574, + "learning_rate": 0.001, + "loss": 3.6963, + "step": 17568 + }, + { + "epoch": 0.7432523902191387, + "grad_norm": 0.19861623644828796, + "learning_rate": 0.001, + "loss": 2.5566, + "step": 17569 + }, + { + "epoch": 0.7432946949826551, + "grad_norm": 0.1944287270307541, + "learning_rate": 0.001, + "loss": 1.4486, + "step": 17570 + }, + { + "epoch": 0.7433369997461714, + "grad_norm": 0.2017359435558319, + "learning_rate": 0.001, + "loss": 2.2659, + "step": 17571 + }, + { + "epoch": 0.7433793045096878, + "grad_norm": 0.19808664917945862, + "learning_rate": 0.001, + "loss": 2.0332, + "step": 17572 + }, + { + "epoch": 0.7434216092732041, + "grad_norm": 0.1653938591480255, + "learning_rate": 0.001, + "loss": 2.2198, + "step": 17573 + }, + { + "epoch": 0.7434639140367205, + "grad_norm": 0.20208589732646942, + "learning_rate": 0.001, + "loss": 2.2754, + "step": 17574 + }, + { + "epoch": 0.7435062188002369, + "grad_norm": 0.1551780104637146, + "learning_rate": 0.001, + "loss": 1.8926, + "step": 17575 + }, + { + "epoch": 0.7435485235637532, + "grad_norm": 0.20212167501449585, + "learning_rate": 0.001, + "loss": 2.2995, + "step": 17576 + }, + { + "epoch": 0.7435908283272696, + "grad_norm": 0.2041611671447754, + "learning_rate": 0.001, + "loss": 2.7081, + "step": 17577 + }, + { + "epoch": 0.743633133090786, + "grad_norm": 0.18079808354377747, + "learning_rate": 0.001, + "loss": 3.0791, + "step": 17578 + }, + { + "epoch": 0.7436754378543023, + "grad_norm": 0.15058568120002747, + "learning_rate": 0.001, + "loss": 2.3326, + "step": 17579 + }, + { + "epoch": 0.7437177426178188, + "grad_norm": 0.18063566088676453, + "learning_rate": 0.001, + "loss": 2.0293, + "step": 17580 + }, + { + "epoch": 0.7437600473813352, + "grad_norm": 0.16308987140655518, + "learning_rate": 0.001, + "loss": 2.3896, + "step": 17581 + }, + { + "epoch": 0.7438023521448515, + "grad_norm": 0.14598752558231354, + "learning_rate": 0.001, + "loss": 1.7756, + "step": 17582 + }, + { + "epoch": 0.7438446569083679, + "grad_norm": 0.17939159274101257, + "learning_rate": 0.001, + "loss": 2.0577, + "step": 17583 + }, + { + "epoch": 0.7438869616718843, + "grad_norm": 0.2707332670688629, + "learning_rate": 0.001, + "loss": 2.4434, + "step": 17584 + }, + { + "epoch": 0.7439292664354006, + "grad_norm": 0.15140265226364136, + "learning_rate": 0.001, + "loss": 1.9017, + "step": 17585 + }, + { + "epoch": 0.743971571198917, + "grad_norm": 0.18124867975711823, + "learning_rate": 0.001, + "loss": 2.7051, + "step": 17586 + }, + { + "epoch": 0.7440138759624334, + "grad_norm": 0.20102323591709137, + "learning_rate": 0.001, + "loss": 2.3188, + "step": 17587 + }, + { + "epoch": 0.7440561807259497, + "grad_norm": 1.4687132835388184, + "learning_rate": 0.001, + "loss": 2.4702, + "step": 17588 + }, + { + "epoch": 0.7440984854894661, + "grad_norm": 0.4242343604564667, + "learning_rate": 0.001, + "loss": 1.6438, + "step": 17589 + }, + { + "epoch": 0.7441407902529825, + "grad_norm": 0.1698838770389557, + "learning_rate": 0.001, + "loss": 1.8073, + "step": 17590 + }, + { + "epoch": 0.7441830950164988, + "grad_norm": 0.3089980483055115, + "learning_rate": 0.001, + "loss": 2.1405, + "step": 17591 + }, + { + "epoch": 0.7442253997800152, + "grad_norm": 0.8378577828407288, + "learning_rate": 0.001, + "loss": 1.7228, + "step": 17592 + }, + { + "epoch": 0.7442677045435316, + "grad_norm": 0.14282885193824768, + "learning_rate": 0.001, + "loss": 2.7357, + "step": 17593 + }, + { + "epoch": 0.7443100093070479, + "grad_norm": 0.6160475015640259, + "learning_rate": 0.001, + "loss": 1.9501, + "step": 17594 + }, + { + "epoch": 0.7443523140705643, + "grad_norm": 0.17935483157634735, + "learning_rate": 0.001, + "loss": 2.9295, + "step": 17595 + }, + { + "epoch": 0.7443946188340808, + "grad_norm": 0.15372052788734436, + "learning_rate": 0.001, + "loss": 1.741, + "step": 17596 + }, + { + "epoch": 0.7444369235975971, + "grad_norm": 0.13836802542209625, + "learning_rate": 0.001, + "loss": 2.3167, + "step": 17597 + }, + { + "epoch": 0.7444792283611135, + "grad_norm": 0.4473693370819092, + "learning_rate": 0.001, + "loss": 1.2818, + "step": 17598 + }, + { + "epoch": 0.7445215331246299, + "grad_norm": 0.1744404435157776, + "learning_rate": 0.001, + "loss": 1.7035, + "step": 17599 + }, + { + "epoch": 0.7445638378881462, + "grad_norm": 0.1443067491054535, + "learning_rate": 0.001, + "loss": 2.1968, + "step": 17600 + }, + { + "epoch": 0.7446061426516626, + "grad_norm": 0.14389725029468536, + "learning_rate": 0.001, + "loss": 1.8681, + "step": 17601 + }, + { + "epoch": 0.744648447415179, + "grad_norm": 0.1405625343322754, + "learning_rate": 0.001, + "loss": 2.3588, + "step": 17602 + }, + { + "epoch": 0.7446907521786953, + "grad_norm": 0.16592542827129364, + "learning_rate": 0.001, + "loss": 1.8942, + "step": 17603 + }, + { + "epoch": 0.7447330569422117, + "grad_norm": 0.23292699456214905, + "learning_rate": 0.001, + "loss": 3.1274, + "step": 17604 + }, + { + "epoch": 0.7447753617057281, + "grad_norm": 0.23173336684703827, + "learning_rate": 0.001, + "loss": 2.626, + "step": 17605 + }, + { + "epoch": 0.7448176664692444, + "grad_norm": 0.1321907639503479, + "learning_rate": 0.001, + "loss": 1.5545, + "step": 17606 + }, + { + "epoch": 0.7448599712327608, + "grad_norm": 0.15251246094703674, + "learning_rate": 0.001, + "loss": 1.5877, + "step": 17607 + }, + { + "epoch": 0.7449022759962772, + "grad_norm": 0.16497798264026642, + "learning_rate": 0.001, + "loss": 2.6468, + "step": 17608 + }, + { + "epoch": 0.7449445807597935, + "grad_norm": 0.15258750319480896, + "learning_rate": 0.001, + "loss": 2.7736, + "step": 17609 + }, + { + "epoch": 0.7449868855233099, + "grad_norm": 0.1986195147037506, + "learning_rate": 0.001, + "loss": 2.0389, + "step": 17610 + }, + { + "epoch": 0.7450291902868263, + "grad_norm": 0.13651925325393677, + "learning_rate": 0.001, + "loss": 1.9696, + "step": 17611 + }, + { + "epoch": 0.7450714950503426, + "grad_norm": 0.17349354922771454, + "learning_rate": 0.001, + "loss": 1.5452, + "step": 17612 + }, + { + "epoch": 0.7451137998138591, + "grad_norm": 0.18067964911460876, + "learning_rate": 0.001, + "loss": 1.5918, + "step": 17613 + }, + { + "epoch": 0.7451561045773755, + "grad_norm": 91.02615356445312, + "learning_rate": 0.001, + "loss": 1.9013, + "step": 17614 + }, + { + "epoch": 0.7451984093408918, + "grad_norm": 0.17014148831367493, + "learning_rate": 0.001, + "loss": 2.085, + "step": 17615 + }, + { + "epoch": 0.7452407141044082, + "grad_norm": 0.6066452860832214, + "learning_rate": 0.001, + "loss": 2.2974, + "step": 17616 + }, + { + "epoch": 0.7452830188679245, + "grad_norm": 0.1638554334640503, + "learning_rate": 0.001, + "loss": 1.9358, + "step": 17617 + }, + { + "epoch": 0.7453253236314409, + "grad_norm": 0.17447148263454437, + "learning_rate": 0.001, + "loss": 2.4924, + "step": 17618 + }, + { + "epoch": 0.7453676283949573, + "grad_norm": 0.21377401053905487, + "learning_rate": 0.001, + "loss": 1.7172, + "step": 17619 + }, + { + "epoch": 0.7454099331584736, + "grad_norm": 0.21663278341293335, + "learning_rate": 0.001, + "loss": 1.6387, + "step": 17620 + }, + { + "epoch": 0.74545223792199, + "grad_norm": 0.1757373809814453, + "learning_rate": 0.001, + "loss": 2.1301, + "step": 17621 + }, + { + "epoch": 0.7454945426855064, + "grad_norm": 0.2108677327632904, + "learning_rate": 0.001, + "loss": 1.9894, + "step": 17622 + }, + { + "epoch": 0.7455368474490227, + "grad_norm": 0.23199456930160522, + "learning_rate": 0.001, + "loss": 1.837, + "step": 17623 + }, + { + "epoch": 0.7455791522125391, + "grad_norm": 0.2354292869567871, + "learning_rate": 0.001, + "loss": 1.8365, + "step": 17624 + }, + { + "epoch": 0.7456214569760555, + "grad_norm": 0.2139493077993393, + "learning_rate": 0.001, + "loss": 2.3974, + "step": 17625 + }, + { + "epoch": 0.7456637617395718, + "grad_norm": 0.21422770619392395, + "learning_rate": 0.001, + "loss": 3.0719, + "step": 17626 + }, + { + "epoch": 0.7457060665030882, + "grad_norm": 0.28964513540267944, + "learning_rate": 0.001, + "loss": 1.8213, + "step": 17627 + }, + { + "epoch": 0.7457483712666046, + "grad_norm": 0.148183211684227, + "learning_rate": 0.001, + "loss": 1.9446, + "step": 17628 + }, + { + "epoch": 0.745790676030121, + "grad_norm": 0.18475840985774994, + "learning_rate": 0.001, + "loss": 2.2334, + "step": 17629 + }, + { + "epoch": 0.7458329807936374, + "grad_norm": 0.15556128323078156, + "learning_rate": 0.001, + "loss": 2.723, + "step": 17630 + }, + { + "epoch": 0.7458752855571538, + "grad_norm": 0.23832905292510986, + "learning_rate": 0.001, + "loss": 2.1825, + "step": 17631 + }, + { + "epoch": 0.7459175903206701, + "grad_norm": 0.4453236162662506, + "learning_rate": 0.001, + "loss": 2.9713, + "step": 17632 + }, + { + "epoch": 0.7459598950841865, + "grad_norm": 0.17969189584255219, + "learning_rate": 0.001, + "loss": 2.0573, + "step": 17633 + }, + { + "epoch": 0.7460021998477029, + "grad_norm": 0.32808932662010193, + "learning_rate": 0.001, + "loss": 1.9398, + "step": 17634 + }, + { + "epoch": 0.7460445046112192, + "grad_norm": 0.1659063994884491, + "learning_rate": 0.001, + "loss": 1.8874, + "step": 17635 + }, + { + "epoch": 0.7460868093747356, + "grad_norm": 0.14386507868766785, + "learning_rate": 0.001, + "loss": 2.0049, + "step": 17636 + }, + { + "epoch": 0.746129114138252, + "grad_norm": 0.174132838845253, + "learning_rate": 0.001, + "loss": 2.1084, + "step": 17637 + }, + { + "epoch": 0.7461714189017683, + "grad_norm": 15.538298606872559, + "learning_rate": 0.001, + "loss": 2.0439, + "step": 17638 + }, + { + "epoch": 0.7462137236652847, + "grad_norm": 2.191509962081909, + "learning_rate": 0.001, + "loss": 2.0302, + "step": 17639 + }, + { + "epoch": 0.7462560284288011, + "grad_norm": 0.2102295458316803, + "learning_rate": 0.001, + "loss": 2.0219, + "step": 17640 + }, + { + "epoch": 0.7462983331923174, + "grad_norm": 0.4853608310222626, + "learning_rate": 0.001, + "loss": 1.6501, + "step": 17641 + }, + { + "epoch": 0.7463406379558338, + "grad_norm": 0.36355969309806824, + "learning_rate": 0.001, + "loss": 1.8691, + "step": 17642 + }, + { + "epoch": 0.7463829427193502, + "grad_norm": 0.1756628006696701, + "learning_rate": 0.001, + "loss": 2.27, + "step": 17643 + }, + { + "epoch": 0.7464252474828665, + "grad_norm": 0.1729697287082672, + "learning_rate": 0.001, + "loss": 3.3898, + "step": 17644 + }, + { + "epoch": 0.746467552246383, + "grad_norm": 0.17120568454265594, + "learning_rate": 0.001, + "loss": 2.0535, + "step": 17645 + }, + { + "epoch": 0.7465098570098994, + "grad_norm": 0.18185213208198547, + "learning_rate": 0.001, + "loss": 1.8239, + "step": 17646 + }, + { + "epoch": 0.7465521617734157, + "grad_norm": 0.13639383018016815, + "learning_rate": 0.001, + "loss": 1.7961, + "step": 17647 + }, + { + "epoch": 0.7465944665369321, + "grad_norm": 2.5754916667938232, + "learning_rate": 0.001, + "loss": 1.9791, + "step": 17648 + }, + { + "epoch": 0.7466367713004485, + "grad_norm": 0.13714253902435303, + "learning_rate": 0.001, + "loss": 1.497, + "step": 17649 + }, + { + "epoch": 0.7466790760639648, + "grad_norm": 1.1888046264648438, + "learning_rate": 0.001, + "loss": 1.7414, + "step": 17650 + }, + { + "epoch": 0.7467213808274812, + "grad_norm": 0.1794203370809555, + "learning_rate": 0.001, + "loss": 1.3829, + "step": 17651 + }, + { + "epoch": 0.7467636855909976, + "grad_norm": 2.2134206295013428, + "learning_rate": 0.001, + "loss": 1.8633, + "step": 17652 + }, + { + "epoch": 0.7468059903545139, + "grad_norm": 0.1946529895067215, + "learning_rate": 0.001, + "loss": 2.1527, + "step": 17653 + }, + { + "epoch": 0.7468482951180303, + "grad_norm": 0.18843619525432587, + "learning_rate": 0.001, + "loss": 1.8838, + "step": 17654 + }, + { + "epoch": 0.7468905998815467, + "grad_norm": 0.30551382899284363, + "learning_rate": 0.001, + "loss": 2.2091, + "step": 17655 + }, + { + "epoch": 0.746932904645063, + "grad_norm": 0.18397283554077148, + "learning_rate": 0.001, + "loss": 1.8819, + "step": 17656 + }, + { + "epoch": 0.7469752094085794, + "grad_norm": 0.4012526273727417, + "learning_rate": 0.001, + "loss": 2.8932, + "step": 17657 + }, + { + "epoch": 0.7470175141720958, + "grad_norm": 0.1578911393880844, + "learning_rate": 0.001, + "loss": 2.5474, + "step": 17658 + }, + { + "epoch": 0.7470598189356121, + "grad_norm": 0.16640107333660126, + "learning_rate": 0.001, + "loss": 1.9356, + "step": 17659 + }, + { + "epoch": 0.7471021236991285, + "grad_norm": 9.117905616760254, + "learning_rate": 0.001, + "loss": 2.1185, + "step": 17660 + }, + { + "epoch": 0.7471444284626448, + "grad_norm": 0.7601824998855591, + "learning_rate": 0.001, + "loss": 2.4204, + "step": 17661 + }, + { + "epoch": 0.7471867332261612, + "grad_norm": 0.2028346061706543, + "learning_rate": 0.001, + "loss": 2.3328, + "step": 17662 + }, + { + "epoch": 0.7472290379896777, + "grad_norm": 0.4436508119106293, + "learning_rate": 0.001, + "loss": 2.5131, + "step": 17663 + }, + { + "epoch": 0.747271342753194, + "grad_norm": 0.19675542414188385, + "learning_rate": 0.001, + "loss": 2.5793, + "step": 17664 + }, + { + "epoch": 0.7473136475167104, + "grad_norm": 0.22706976532936096, + "learning_rate": 0.001, + "loss": 2.364, + "step": 17665 + }, + { + "epoch": 0.7473559522802268, + "grad_norm": 0.19858993589878082, + "learning_rate": 0.001, + "loss": 2.0174, + "step": 17666 + }, + { + "epoch": 0.7473982570437431, + "grad_norm": 0.20804426074028015, + "learning_rate": 0.001, + "loss": 1.8808, + "step": 17667 + }, + { + "epoch": 0.7474405618072595, + "grad_norm": 0.21756567060947418, + "learning_rate": 0.001, + "loss": 1.7364, + "step": 17668 + }, + { + "epoch": 0.7474828665707759, + "grad_norm": 17.11835479736328, + "learning_rate": 0.001, + "loss": 2.8228, + "step": 17669 + }, + { + "epoch": 0.7475251713342922, + "grad_norm": 0.1679975390434265, + "learning_rate": 0.001, + "loss": 2.387, + "step": 17670 + }, + { + "epoch": 0.7475674760978086, + "grad_norm": 0.16715805232524872, + "learning_rate": 0.001, + "loss": 2.7919, + "step": 17671 + }, + { + "epoch": 0.747609780861325, + "grad_norm": 0.17563515901565552, + "learning_rate": 0.001, + "loss": 2.2313, + "step": 17672 + }, + { + "epoch": 0.7476520856248413, + "grad_norm": 0.16387246549129486, + "learning_rate": 0.001, + "loss": 1.6362, + "step": 17673 + }, + { + "epoch": 0.7476943903883577, + "grad_norm": 0.1781141310930252, + "learning_rate": 0.001, + "loss": 4.1604, + "step": 17674 + }, + { + "epoch": 0.7477366951518741, + "grad_norm": 0.17480358481407166, + "learning_rate": 0.001, + "loss": 1.872, + "step": 17675 + }, + { + "epoch": 0.7477789999153904, + "grad_norm": 0.192658469080925, + "learning_rate": 0.001, + "loss": 1.8116, + "step": 17676 + }, + { + "epoch": 0.7478213046789068, + "grad_norm": 0.2537101209163666, + "learning_rate": 0.001, + "loss": 3.9484, + "step": 17677 + }, + { + "epoch": 0.7478636094424232, + "grad_norm": 1.3975545167922974, + "learning_rate": 0.001, + "loss": 2.4941, + "step": 17678 + }, + { + "epoch": 0.7479059142059395, + "grad_norm": 0.21007265150547028, + "learning_rate": 0.001, + "loss": 2.1936, + "step": 17679 + }, + { + "epoch": 0.747948218969456, + "grad_norm": 0.20976990461349487, + "learning_rate": 0.001, + "loss": 2.3444, + "step": 17680 + }, + { + "epoch": 0.7479905237329724, + "grad_norm": 0.28432410955429077, + "learning_rate": 0.001, + "loss": 2.4682, + "step": 17681 + }, + { + "epoch": 0.7480328284964887, + "grad_norm": 0.17756503820419312, + "learning_rate": 0.001, + "loss": 3.1183, + "step": 17682 + }, + { + "epoch": 0.7480751332600051, + "grad_norm": 0.148085817694664, + "learning_rate": 0.001, + "loss": 1.5892, + "step": 17683 + }, + { + "epoch": 0.7481174380235215, + "grad_norm": 0.15588833391666412, + "learning_rate": 0.001, + "loss": 2.1042, + "step": 17684 + }, + { + "epoch": 0.7481597427870378, + "grad_norm": 0.16696330904960632, + "learning_rate": 0.001, + "loss": 1.4378, + "step": 17685 + }, + { + "epoch": 0.7482020475505542, + "grad_norm": 0.14883705973625183, + "learning_rate": 0.001, + "loss": 1.5751, + "step": 17686 + }, + { + "epoch": 0.7482443523140706, + "grad_norm": 0.8387157320976257, + "learning_rate": 0.001, + "loss": 4.0032, + "step": 17687 + }, + { + "epoch": 0.7482866570775869, + "grad_norm": 0.16816678643226624, + "learning_rate": 0.001, + "loss": 2.0232, + "step": 17688 + }, + { + "epoch": 0.7483289618411033, + "grad_norm": 0.17581240832805634, + "learning_rate": 0.001, + "loss": 2.221, + "step": 17689 + }, + { + "epoch": 0.7483712666046197, + "grad_norm": 0.20603400468826294, + "learning_rate": 0.001, + "loss": 1.6247, + "step": 17690 + }, + { + "epoch": 0.748413571368136, + "grad_norm": 1.0062761306762695, + "learning_rate": 0.001, + "loss": 2.0145, + "step": 17691 + }, + { + "epoch": 0.7484558761316524, + "grad_norm": 0.1723836064338684, + "learning_rate": 0.001, + "loss": 2.6105, + "step": 17692 + }, + { + "epoch": 0.7484981808951688, + "grad_norm": 0.1439078003168106, + "learning_rate": 0.001, + "loss": 2.2332, + "step": 17693 + }, + { + "epoch": 0.7485404856586851, + "grad_norm": 0.19162747263908386, + "learning_rate": 0.001, + "loss": 2.2636, + "step": 17694 + }, + { + "epoch": 0.7485827904222015, + "grad_norm": 0.2729785740375519, + "learning_rate": 0.001, + "loss": 2.4435, + "step": 17695 + }, + { + "epoch": 0.748625095185718, + "grad_norm": 0.9558886885643005, + "learning_rate": 0.001, + "loss": 2.0409, + "step": 17696 + }, + { + "epoch": 0.7486673999492343, + "grad_norm": 0.1588534116744995, + "learning_rate": 0.001, + "loss": 2.9943, + "step": 17697 + }, + { + "epoch": 0.7487097047127507, + "grad_norm": 0.21727414429187775, + "learning_rate": 0.001, + "loss": 1.9491, + "step": 17698 + }, + { + "epoch": 0.7487520094762671, + "grad_norm": 0.18474645912647247, + "learning_rate": 0.001, + "loss": 2.6354, + "step": 17699 + }, + { + "epoch": 0.7487943142397834, + "grad_norm": 0.23161320388317108, + "learning_rate": 0.001, + "loss": 2.2559, + "step": 17700 + }, + { + "epoch": 0.7488366190032998, + "grad_norm": 0.16369740664958954, + "learning_rate": 0.001, + "loss": 1.8356, + "step": 17701 + }, + { + "epoch": 0.7488789237668162, + "grad_norm": 0.17817635834217072, + "learning_rate": 0.001, + "loss": 1.8854, + "step": 17702 + }, + { + "epoch": 0.7489212285303325, + "grad_norm": 0.18831977248191833, + "learning_rate": 0.001, + "loss": 1.8879, + "step": 17703 + }, + { + "epoch": 0.7489635332938489, + "grad_norm": 0.4837913513183594, + "learning_rate": 0.001, + "loss": 1.9171, + "step": 17704 + }, + { + "epoch": 0.7490058380573653, + "grad_norm": 0.5666935443878174, + "learning_rate": 0.001, + "loss": 3.624, + "step": 17705 + }, + { + "epoch": 0.7490481428208816, + "grad_norm": 0.16191402077674866, + "learning_rate": 0.001, + "loss": 3.664, + "step": 17706 + }, + { + "epoch": 0.749090447584398, + "grad_norm": 0.16512976586818695, + "learning_rate": 0.001, + "loss": 1.9419, + "step": 17707 + }, + { + "epoch": 0.7491327523479143, + "grad_norm": 0.15622422099113464, + "learning_rate": 0.001, + "loss": 2.814, + "step": 17708 + }, + { + "epoch": 0.7491750571114307, + "grad_norm": 0.17258132994174957, + "learning_rate": 0.001, + "loss": 2.0253, + "step": 17709 + }, + { + "epoch": 0.7492173618749471, + "grad_norm": 0.19456447660923004, + "learning_rate": 0.001, + "loss": 2.9622, + "step": 17710 + }, + { + "epoch": 0.7492596666384634, + "grad_norm": 2.774994134902954, + "learning_rate": 0.001, + "loss": 2.2187, + "step": 17711 + }, + { + "epoch": 0.7493019714019798, + "grad_norm": 0.17640100419521332, + "learning_rate": 0.001, + "loss": 1.976, + "step": 17712 + }, + { + "epoch": 0.7493442761654963, + "grad_norm": 0.17278911173343658, + "learning_rate": 0.001, + "loss": 2.401, + "step": 17713 + }, + { + "epoch": 0.7493865809290126, + "grad_norm": 0.20537172257900238, + "learning_rate": 0.001, + "loss": 3.0728, + "step": 17714 + }, + { + "epoch": 0.749428885692529, + "grad_norm": 0.35584142804145813, + "learning_rate": 0.001, + "loss": 3.3195, + "step": 17715 + }, + { + "epoch": 0.7494711904560454, + "grad_norm": 0.20729830861091614, + "learning_rate": 0.001, + "loss": 2.5783, + "step": 17716 + }, + { + "epoch": 0.7495134952195617, + "grad_norm": 0.1861492097377777, + "learning_rate": 0.001, + "loss": 1.9174, + "step": 17717 + }, + { + "epoch": 0.7495557999830781, + "grad_norm": 0.18184538185596466, + "learning_rate": 0.001, + "loss": 2.2683, + "step": 17718 + }, + { + "epoch": 0.7495981047465945, + "grad_norm": 0.15846717357635498, + "learning_rate": 0.001, + "loss": 1.5975, + "step": 17719 + }, + { + "epoch": 0.7496404095101108, + "grad_norm": 0.15116605162620544, + "learning_rate": 0.001, + "loss": 1.2862, + "step": 17720 + }, + { + "epoch": 0.7496827142736272, + "grad_norm": 0.15482908487319946, + "learning_rate": 0.001, + "loss": 2.0605, + "step": 17721 + }, + { + "epoch": 0.7497250190371436, + "grad_norm": 0.1819373369216919, + "learning_rate": 0.001, + "loss": 2.8881, + "step": 17722 + }, + { + "epoch": 0.7497673238006599, + "grad_norm": 0.16253606975078583, + "learning_rate": 0.001, + "loss": 1.448, + "step": 17723 + }, + { + "epoch": 0.7498096285641763, + "grad_norm": 0.15486584603786469, + "learning_rate": 0.001, + "loss": 2.7328, + "step": 17724 + }, + { + "epoch": 0.7498519333276927, + "grad_norm": 0.14722788333892822, + "learning_rate": 0.001, + "loss": 2.3268, + "step": 17725 + }, + { + "epoch": 0.749894238091209, + "grad_norm": 0.928293764591217, + "learning_rate": 0.001, + "loss": 2.1948, + "step": 17726 + }, + { + "epoch": 0.7499365428547254, + "grad_norm": 0.14269466698169708, + "learning_rate": 0.001, + "loss": 1.9547, + "step": 17727 + }, + { + "epoch": 0.7499788476182419, + "grad_norm": 0.7089377641677856, + "learning_rate": 0.001, + "loss": 2.996, + "step": 17728 + }, + { + "epoch": 0.7500211523817581, + "grad_norm": 2.6846911907196045, + "learning_rate": 0.001, + "loss": 1.8741, + "step": 17729 + }, + { + "epoch": 0.7500634571452746, + "grad_norm": 0.13764113187789917, + "learning_rate": 0.001, + "loss": 2.2695, + "step": 17730 + }, + { + "epoch": 0.750105761908791, + "grad_norm": 0.16455744206905365, + "learning_rate": 0.001, + "loss": 1.9604, + "step": 17731 + }, + { + "epoch": 0.7501480666723073, + "grad_norm": 1.5582802295684814, + "learning_rate": 0.001, + "loss": 2.6966, + "step": 17732 + }, + { + "epoch": 0.7501903714358237, + "grad_norm": 0.15412558615207672, + "learning_rate": 0.001, + "loss": 1.8025, + "step": 17733 + }, + { + "epoch": 0.7502326761993401, + "grad_norm": 0.17679987847805023, + "learning_rate": 0.001, + "loss": 3.0607, + "step": 17734 + }, + { + "epoch": 0.7502749809628564, + "grad_norm": 0.2634994685649872, + "learning_rate": 0.001, + "loss": 2.326, + "step": 17735 + }, + { + "epoch": 0.7503172857263728, + "grad_norm": 0.18038220703601837, + "learning_rate": 0.001, + "loss": 1.9938, + "step": 17736 + }, + { + "epoch": 0.7503595904898892, + "grad_norm": 0.1706598848104477, + "learning_rate": 0.001, + "loss": 2.8744, + "step": 17737 + }, + { + "epoch": 0.7504018952534055, + "grad_norm": 0.16422058641910553, + "learning_rate": 0.001, + "loss": 2.5134, + "step": 17738 + }, + { + "epoch": 0.7504442000169219, + "grad_norm": 0.1558268815279007, + "learning_rate": 0.001, + "loss": 2.8615, + "step": 17739 + }, + { + "epoch": 0.7504865047804383, + "grad_norm": 0.15481281280517578, + "learning_rate": 0.001, + "loss": 2.537, + "step": 17740 + }, + { + "epoch": 0.7505288095439546, + "grad_norm": 0.1353859305381775, + "learning_rate": 0.001, + "loss": 1.9273, + "step": 17741 + }, + { + "epoch": 0.750571114307471, + "grad_norm": 0.1670721471309662, + "learning_rate": 0.001, + "loss": 2.0898, + "step": 17742 + }, + { + "epoch": 0.7506134190709874, + "grad_norm": 0.17640924453735352, + "learning_rate": 0.001, + "loss": 1.8176, + "step": 17743 + }, + { + "epoch": 0.7506557238345037, + "grad_norm": 0.18629953265190125, + "learning_rate": 0.001, + "loss": 2.1797, + "step": 17744 + }, + { + "epoch": 0.7506980285980202, + "grad_norm": 0.2514755129814148, + "learning_rate": 0.001, + "loss": 3.1144, + "step": 17745 + }, + { + "epoch": 0.7507403333615366, + "grad_norm": 2.408249855041504, + "learning_rate": 0.001, + "loss": 1.7701, + "step": 17746 + }, + { + "epoch": 0.7507826381250529, + "grad_norm": 1.3865883350372314, + "learning_rate": 0.001, + "loss": 2.3281, + "step": 17747 + }, + { + "epoch": 0.7508249428885693, + "grad_norm": 0.16517174243927002, + "learning_rate": 0.001, + "loss": 2.4022, + "step": 17748 + }, + { + "epoch": 0.7508672476520857, + "grad_norm": 7.395416736602783, + "learning_rate": 0.001, + "loss": 2.9588, + "step": 17749 + }, + { + "epoch": 0.750909552415602, + "grad_norm": 0.24712392687797546, + "learning_rate": 0.001, + "loss": 2.1324, + "step": 17750 + }, + { + "epoch": 0.7509518571791184, + "grad_norm": 0.14635969698429108, + "learning_rate": 0.001, + "loss": 1.5613, + "step": 17751 + }, + { + "epoch": 0.7509941619426347, + "grad_norm": 0.16035181283950806, + "learning_rate": 0.001, + "loss": 2.7492, + "step": 17752 + }, + { + "epoch": 0.7510364667061511, + "grad_norm": 0.17972306907176971, + "learning_rate": 0.001, + "loss": 2.2771, + "step": 17753 + }, + { + "epoch": 0.7510787714696675, + "grad_norm": 0.2305939793586731, + "learning_rate": 0.001, + "loss": 1.5971, + "step": 17754 + }, + { + "epoch": 0.7511210762331838, + "grad_norm": 0.21389088034629822, + "learning_rate": 0.001, + "loss": 2.2783, + "step": 17755 + }, + { + "epoch": 0.7511633809967002, + "grad_norm": 0.17007265985012054, + "learning_rate": 0.001, + "loss": 2.0106, + "step": 17756 + }, + { + "epoch": 0.7512056857602166, + "grad_norm": 0.18640732765197754, + "learning_rate": 0.001, + "loss": 1.8577, + "step": 17757 + }, + { + "epoch": 0.7512479905237329, + "grad_norm": 15.663119316101074, + "learning_rate": 0.001, + "loss": 2.9581, + "step": 17758 + }, + { + "epoch": 0.7512902952872493, + "grad_norm": 4.945131778717041, + "learning_rate": 0.001, + "loss": 2.8615, + "step": 17759 + }, + { + "epoch": 0.7513326000507657, + "grad_norm": 0.16905252635478973, + "learning_rate": 0.001, + "loss": 2.2082, + "step": 17760 + }, + { + "epoch": 0.751374904814282, + "grad_norm": 0.18479637801647186, + "learning_rate": 0.001, + "loss": 1.8339, + "step": 17761 + }, + { + "epoch": 0.7514172095777985, + "grad_norm": 0.16893810033798218, + "learning_rate": 0.001, + "loss": 3.0538, + "step": 17762 + }, + { + "epoch": 0.7514595143413149, + "grad_norm": 0.4891713559627533, + "learning_rate": 0.001, + "loss": 2.1883, + "step": 17763 + }, + { + "epoch": 0.7515018191048312, + "grad_norm": 2.8719229698181152, + "learning_rate": 0.001, + "loss": 2.5028, + "step": 17764 + }, + { + "epoch": 0.7515441238683476, + "grad_norm": 0.335845947265625, + "learning_rate": 0.001, + "loss": 1.8828, + "step": 17765 + }, + { + "epoch": 0.751586428631864, + "grad_norm": 0.17647729814052582, + "learning_rate": 0.001, + "loss": 1.4358, + "step": 17766 + }, + { + "epoch": 0.7516287333953803, + "grad_norm": 38.300811767578125, + "learning_rate": 0.001, + "loss": 2.461, + "step": 17767 + }, + { + "epoch": 0.7516710381588967, + "grad_norm": 0.25352421402931213, + "learning_rate": 0.001, + "loss": 2.5402, + "step": 17768 + }, + { + "epoch": 0.7517133429224131, + "grad_norm": 0.1729395091533661, + "learning_rate": 0.001, + "loss": 1.8336, + "step": 17769 + }, + { + "epoch": 0.7517556476859294, + "grad_norm": 0.2317625731229782, + "learning_rate": 0.001, + "loss": 2.022, + "step": 17770 + }, + { + "epoch": 0.7517979524494458, + "grad_norm": 23.282867431640625, + "learning_rate": 0.001, + "loss": 3.9744, + "step": 17771 + }, + { + "epoch": 0.7518402572129622, + "grad_norm": 0.25208935141563416, + "learning_rate": 0.001, + "loss": 2.7925, + "step": 17772 + }, + { + "epoch": 0.7518825619764785, + "grad_norm": 0.16493536531925201, + "learning_rate": 0.001, + "loss": 1.7565, + "step": 17773 + }, + { + "epoch": 0.7519248667399949, + "grad_norm": 0.8458042144775391, + "learning_rate": 0.001, + "loss": 2.9931, + "step": 17774 + }, + { + "epoch": 0.7519671715035113, + "grad_norm": 0.22725796699523926, + "learning_rate": 0.001, + "loss": 2.1438, + "step": 17775 + }, + { + "epoch": 0.7520094762670276, + "grad_norm": 0.15165364742279053, + "learning_rate": 0.001, + "loss": 1.9869, + "step": 17776 + }, + { + "epoch": 0.752051781030544, + "grad_norm": 0.1665864735841751, + "learning_rate": 0.001, + "loss": 1.4288, + "step": 17777 + }, + { + "epoch": 0.7520940857940605, + "grad_norm": 0.20887833833694458, + "learning_rate": 0.001, + "loss": 2.0971, + "step": 17778 + }, + { + "epoch": 0.7521363905575768, + "grad_norm": 0.6601106524467468, + "learning_rate": 0.001, + "loss": 1.7179, + "step": 17779 + }, + { + "epoch": 0.7521786953210932, + "grad_norm": 1.797553300857544, + "learning_rate": 0.001, + "loss": 1.86, + "step": 17780 + }, + { + "epoch": 0.7522210000846096, + "grad_norm": 0.6821169853210449, + "learning_rate": 0.001, + "loss": 1.9908, + "step": 17781 + }, + { + "epoch": 0.7522633048481259, + "grad_norm": 0.24313047528266907, + "learning_rate": 0.001, + "loss": 1.7132, + "step": 17782 + }, + { + "epoch": 0.7523056096116423, + "grad_norm": 0.16890186071395874, + "learning_rate": 0.001, + "loss": 2.7889, + "step": 17783 + }, + { + "epoch": 0.7523479143751587, + "grad_norm": 0.3041240870952606, + "learning_rate": 0.001, + "loss": 1.5615, + "step": 17784 + }, + { + "epoch": 0.752390219138675, + "grad_norm": 0.17701970040798187, + "learning_rate": 0.001, + "loss": 1.9374, + "step": 17785 + }, + { + "epoch": 0.7524325239021914, + "grad_norm": 0.16033196449279785, + "learning_rate": 0.001, + "loss": 1.8036, + "step": 17786 + }, + { + "epoch": 0.7524748286657078, + "grad_norm": 0.18691828846931458, + "learning_rate": 0.001, + "loss": 2.0472, + "step": 17787 + }, + { + "epoch": 0.7525171334292241, + "grad_norm": 0.20954419672489166, + "learning_rate": 0.001, + "loss": 1.2313, + "step": 17788 + }, + { + "epoch": 0.7525594381927405, + "grad_norm": 0.17444777488708496, + "learning_rate": 0.001, + "loss": 2.1319, + "step": 17789 + }, + { + "epoch": 0.7526017429562569, + "grad_norm": 0.1553867757320404, + "learning_rate": 0.001, + "loss": 2.1579, + "step": 17790 + }, + { + "epoch": 0.7526440477197732, + "grad_norm": 0.16180811822414398, + "learning_rate": 0.001, + "loss": 1.8497, + "step": 17791 + }, + { + "epoch": 0.7526863524832896, + "grad_norm": 0.16590484976768494, + "learning_rate": 0.001, + "loss": 1.7005, + "step": 17792 + }, + { + "epoch": 0.752728657246806, + "grad_norm": 0.161569282412529, + "learning_rate": 0.001, + "loss": 1.9812, + "step": 17793 + }, + { + "epoch": 0.7527709620103223, + "grad_norm": 0.1726117730140686, + "learning_rate": 0.001, + "loss": 1.6706, + "step": 17794 + }, + { + "epoch": 0.7528132667738388, + "grad_norm": 1.3740065097808838, + "learning_rate": 0.001, + "loss": 3.4836, + "step": 17795 + }, + { + "epoch": 0.7528555715373552, + "grad_norm": 0.20311777293682098, + "learning_rate": 0.001, + "loss": 3.3955, + "step": 17796 + }, + { + "epoch": 0.7528978763008715, + "grad_norm": 0.1724340319633484, + "learning_rate": 0.001, + "loss": 2.1475, + "step": 17797 + }, + { + "epoch": 0.7529401810643879, + "grad_norm": 0.15523028373718262, + "learning_rate": 0.001, + "loss": 2.4205, + "step": 17798 + }, + { + "epoch": 0.7529824858279042, + "grad_norm": 0.16621935367584229, + "learning_rate": 0.001, + "loss": 2.1048, + "step": 17799 + }, + { + "epoch": 0.7530247905914206, + "grad_norm": 0.1458512395620346, + "learning_rate": 0.001, + "loss": 2.0226, + "step": 17800 + }, + { + "epoch": 0.753067095354937, + "grad_norm": 1.5626475811004639, + "learning_rate": 0.001, + "loss": 1.4764, + "step": 17801 + }, + { + "epoch": 0.7531094001184533, + "grad_norm": 0.17712949216365814, + "learning_rate": 0.001, + "loss": 2.0474, + "step": 17802 + }, + { + "epoch": 0.7531517048819697, + "grad_norm": 0.184153750538826, + "learning_rate": 0.001, + "loss": 1.932, + "step": 17803 + }, + { + "epoch": 0.7531940096454861, + "grad_norm": 0.17124585807323456, + "learning_rate": 0.001, + "loss": 1.8677, + "step": 17804 + }, + { + "epoch": 0.7532363144090024, + "grad_norm": 0.18082095682621002, + "learning_rate": 0.001, + "loss": 2.0149, + "step": 17805 + }, + { + "epoch": 0.7532786191725188, + "grad_norm": 0.4178064465522766, + "learning_rate": 0.001, + "loss": 2.1771, + "step": 17806 + }, + { + "epoch": 0.7533209239360352, + "grad_norm": 0.16732126474380493, + "learning_rate": 0.001, + "loss": 2.033, + "step": 17807 + }, + { + "epoch": 0.7533632286995515, + "grad_norm": 0.6002020239830017, + "learning_rate": 0.001, + "loss": 2.7392, + "step": 17808 + }, + { + "epoch": 0.7534055334630679, + "grad_norm": 0.1773737519979477, + "learning_rate": 0.001, + "loss": 1.8149, + "step": 17809 + }, + { + "epoch": 0.7534478382265843, + "grad_norm": 1.7740546464920044, + "learning_rate": 0.001, + "loss": 2.1816, + "step": 17810 + }, + { + "epoch": 0.7534901429901006, + "grad_norm": 0.6229245066642761, + "learning_rate": 0.001, + "loss": 1.4621, + "step": 17811 + }, + { + "epoch": 0.753532447753617, + "grad_norm": 1.3649619817733765, + "learning_rate": 0.001, + "loss": 1.8576, + "step": 17812 + }, + { + "epoch": 0.7535747525171335, + "grad_norm": 0.14693179726600647, + "learning_rate": 0.001, + "loss": 1.8112, + "step": 17813 + }, + { + "epoch": 0.7536170572806498, + "grad_norm": 0.5923940539360046, + "learning_rate": 0.001, + "loss": 2.9757, + "step": 17814 + }, + { + "epoch": 0.7536593620441662, + "grad_norm": 0.2543085217475891, + "learning_rate": 0.001, + "loss": 3.2552, + "step": 17815 + }, + { + "epoch": 0.7537016668076826, + "grad_norm": 0.1711706966161728, + "learning_rate": 0.001, + "loss": 1.7678, + "step": 17816 + }, + { + "epoch": 0.7537439715711989, + "grad_norm": 0.1597229242324829, + "learning_rate": 0.001, + "loss": 2.3738, + "step": 17817 + }, + { + "epoch": 0.7537862763347153, + "grad_norm": 1.7806466817855835, + "learning_rate": 0.001, + "loss": 3.6486, + "step": 17818 + }, + { + "epoch": 0.7538285810982317, + "grad_norm": 0.1596599519252777, + "learning_rate": 0.001, + "loss": 3.2813, + "step": 17819 + }, + { + "epoch": 0.753870885861748, + "grad_norm": 0.4388027787208557, + "learning_rate": 0.001, + "loss": 2.2502, + "step": 17820 + }, + { + "epoch": 0.7539131906252644, + "grad_norm": 0.15271490812301636, + "learning_rate": 0.001, + "loss": 3.0926, + "step": 17821 + }, + { + "epoch": 0.7539554953887808, + "grad_norm": 0.19142945110797882, + "learning_rate": 0.001, + "loss": 1.8214, + "step": 17822 + }, + { + "epoch": 0.7539978001522971, + "grad_norm": 0.18499502539634705, + "learning_rate": 0.001, + "loss": 1.8132, + "step": 17823 + }, + { + "epoch": 0.7540401049158135, + "grad_norm": 0.21583931148052216, + "learning_rate": 0.001, + "loss": 2.8415, + "step": 17824 + }, + { + "epoch": 0.7540824096793299, + "grad_norm": 0.12202106416225433, + "learning_rate": 0.001, + "loss": 1.5671, + "step": 17825 + }, + { + "epoch": 0.7541247144428462, + "grad_norm": 0.17914700508117676, + "learning_rate": 0.001, + "loss": 2.0909, + "step": 17826 + }, + { + "epoch": 0.7541670192063626, + "grad_norm": 0.17888757586479187, + "learning_rate": 0.001, + "loss": 1.953, + "step": 17827 + }, + { + "epoch": 0.754209323969879, + "grad_norm": 0.16106991469860077, + "learning_rate": 0.001, + "loss": 1.9447, + "step": 17828 + }, + { + "epoch": 0.7542516287333954, + "grad_norm": 0.13767673075199127, + "learning_rate": 0.001, + "loss": 2.0734, + "step": 17829 + }, + { + "epoch": 0.7542939334969118, + "grad_norm": 0.20107002556324005, + "learning_rate": 0.001, + "loss": 2.2336, + "step": 17830 + }, + { + "epoch": 0.7543362382604282, + "grad_norm": 0.1628139615058899, + "learning_rate": 0.001, + "loss": 2.8548, + "step": 17831 + }, + { + "epoch": 0.7543785430239445, + "grad_norm": 0.20057493448257446, + "learning_rate": 0.001, + "loss": 1.914, + "step": 17832 + }, + { + "epoch": 0.7544208477874609, + "grad_norm": 0.1650390923023224, + "learning_rate": 0.001, + "loss": 2.3508, + "step": 17833 + }, + { + "epoch": 0.7544631525509773, + "grad_norm": 0.14668241143226624, + "learning_rate": 0.001, + "loss": 2.1428, + "step": 17834 + }, + { + "epoch": 0.7545054573144936, + "grad_norm": 0.24032439291477203, + "learning_rate": 0.001, + "loss": 2.4814, + "step": 17835 + }, + { + "epoch": 0.75454776207801, + "grad_norm": 0.1620781570672989, + "learning_rate": 0.001, + "loss": 2.1885, + "step": 17836 + }, + { + "epoch": 0.7545900668415264, + "grad_norm": 0.5074538588523865, + "learning_rate": 0.001, + "loss": 2.2945, + "step": 17837 + }, + { + "epoch": 0.7546323716050427, + "grad_norm": 0.1588655561208725, + "learning_rate": 0.001, + "loss": 1.9829, + "step": 17838 + }, + { + "epoch": 0.7546746763685591, + "grad_norm": 0.13773521780967712, + "learning_rate": 0.001, + "loss": 2.0773, + "step": 17839 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.16354626417160034, + "learning_rate": 0.001, + "loss": 1.9614, + "step": 17840 + }, + { + "epoch": 0.7547592858955918, + "grad_norm": 0.15736053884029388, + "learning_rate": 0.001, + "loss": 1.4906, + "step": 17841 + }, + { + "epoch": 0.7548015906591082, + "grad_norm": 0.13895869255065918, + "learning_rate": 0.001, + "loss": 1.3762, + "step": 17842 + }, + { + "epoch": 0.7548438954226245, + "grad_norm": 0.16241589188575745, + "learning_rate": 0.001, + "loss": 2.8513, + "step": 17843 + }, + { + "epoch": 0.7548862001861409, + "grad_norm": 0.1497146338224411, + "learning_rate": 0.001, + "loss": 1.982, + "step": 17844 + }, + { + "epoch": 0.7549285049496574, + "grad_norm": 0.6093889474868774, + "learning_rate": 0.001, + "loss": 2.6218, + "step": 17845 + }, + { + "epoch": 0.7549708097131737, + "grad_norm": 0.2828693091869354, + "learning_rate": 0.001, + "loss": 2.0783, + "step": 17846 + }, + { + "epoch": 0.7550131144766901, + "grad_norm": 0.17641054093837738, + "learning_rate": 0.001, + "loss": 2.1691, + "step": 17847 + }, + { + "epoch": 0.7550554192402065, + "grad_norm": 0.4624340534210205, + "learning_rate": 0.001, + "loss": 1.5838, + "step": 17848 + }, + { + "epoch": 0.7550977240037228, + "grad_norm": 0.18794561922550201, + "learning_rate": 0.001, + "loss": 2.8615, + "step": 17849 + }, + { + "epoch": 0.7551400287672392, + "grad_norm": 0.1454625278711319, + "learning_rate": 0.001, + "loss": 2.3494, + "step": 17850 + }, + { + "epoch": 0.7551823335307556, + "grad_norm": 0.16596491634845734, + "learning_rate": 0.001, + "loss": 2.1871, + "step": 17851 + }, + { + "epoch": 0.7552246382942719, + "grad_norm": 0.1360815316438675, + "learning_rate": 0.001, + "loss": 1.9442, + "step": 17852 + }, + { + "epoch": 0.7552669430577883, + "grad_norm": 0.12390490621328354, + "learning_rate": 0.001, + "loss": 1.8553, + "step": 17853 + }, + { + "epoch": 0.7553092478213047, + "grad_norm": 0.14053763449192047, + "learning_rate": 0.001, + "loss": 1.8414, + "step": 17854 + }, + { + "epoch": 0.755351552584821, + "grad_norm": 1.1963753700256348, + "learning_rate": 0.001, + "loss": 2.295, + "step": 17855 + }, + { + "epoch": 0.7553938573483374, + "grad_norm": 0.17305585741996765, + "learning_rate": 0.001, + "loss": 2.9799, + "step": 17856 + }, + { + "epoch": 0.7554361621118538, + "grad_norm": 0.9570670127868652, + "learning_rate": 0.001, + "loss": 3.0922, + "step": 17857 + }, + { + "epoch": 0.7554784668753701, + "grad_norm": 0.1583283543586731, + "learning_rate": 0.001, + "loss": 2.4881, + "step": 17858 + }, + { + "epoch": 0.7555207716388865, + "grad_norm": 0.18101347982883453, + "learning_rate": 0.001, + "loss": 2.2682, + "step": 17859 + }, + { + "epoch": 0.7555630764024029, + "grad_norm": 0.17083586752414703, + "learning_rate": 0.001, + "loss": 2.5757, + "step": 17860 + }, + { + "epoch": 0.7556053811659192, + "grad_norm": 0.2182934582233429, + "learning_rate": 0.001, + "loss": 2.3445, + "step": 17861 + }, + { + "epoch": 0.7556476859294357, + "grad_norm": 0.13147765398025513, + "learning_rate": 0.001, + "loss": 1.6128, + "step": 17862 + }, + { + "epoch": 0.7556899906929521, + "grad_norm": 0.21211394667625427, + "learning_rate": 0.001, + "loss": 2.4639, + "step": 17863 + }, + { + "epoch": 0.7557322954564684, + "grad_norm": 0.29777416586875916, + "learning_rate": 0.001, + "loss": 2.3622, + "step": 17864 + }, + { + "epoch": 0.7557746002199848, + "grad_norm": 0.20672816038131714, + "learning_rate": 0.001, + "loss": 2.1961, + "step": 17865 + }, + { + "epoch": 0.7558169049835012, + "grad_norm": 2.0547661781311035, + "learning_rate": 0.001, + "loss": 4.3223, + "step": 17866 + }, + { + "epoch": 0.7558592097470175, + "grad_norm": 0.40489253401756287, + "learning_rate": 0.001, + "loss": 2.754, + "step": 17867 + }, + { + "epoch": 0.7559015145105339, + "grad_norm": 0.18186049163341522, + "learning_rate": 0.001, + "loss": 1.9801, + "step": 17868 + }, + { + "epoch": 0.7559438192740503, + "grad_norm": 0.1969180703163147, + "learning_rate": 0.001, + "loss": 1.7136, + "step": 17869 + }, + { + "epoch": 0.7559861240375666, + "grad_norm": 0.16497890651226044, + "learning_rate": 0.001, + "loss": 1.6463, + "step": 17870 + }, + { + "epoch": 0.756028428801083, + "grad_norm": 0.16345803439617157, + "learning_rate": 0.001, + "loss": 2.0088, + "step": 17871 + }, + { + "epoch": 0.7560707335645994, + "grad_norm": 0.2046748548746109, + "learning_rate": 0.001, + "loss": 2.7136, + "step": 17872 + }, + { + "epoch": 0.7561130383281157, + "grad_norm": 0.3788432776927948, + "learning_rate": 0.001, + "loss": 3.172, + "step": 17873 + }, + { + "epoch": 0.7561553430916321, + "grad_norm": 5.223359107971191, + "learning_rate": 0.001, + "loss": 1.9627, + "step": 17874 + }, + { + "epoch": 0.7561976478551485, + "grad_norm": 0.1771371066570282, + "learning_rate": 0.001, + "loss": 2.7354, + "step": 17875 + }, + { + "epoch": 0.7562399526186648, + "grad_norm": 0.19962891936302185, + "learning_rate": 0.001, + "loss": 2.2917, + "step": 17876 + }, + { + "epoch": 0.7562822573821812, + "grad_norm": 0.1908496767282486, + "learning_rate": 0.001, + "loss": 3.2174, + "step": 17877 + }, + { + "epoch": 0.7563245621456977, + "grad_norm": 0.15644274652004242, + "learning_rate": 0.001, + "loss": 2.0214, + "step": 17878 + }, + { + "epoch": 0.756366866909214, + "grad_norm": 0.3016762435436249, + "learning_rate": 0.001, + "loss": 1.9582, + "step": 17879 + }, + { + "epoch": 0.7564091716727304, + "grad_norm": 0.14754663407802582, + "learning_rate": 0.001, + "loss": 2.2023, + "step": 17880 + }, + { + "epoch": 0.7564514764362468, + "grad_norm": 0.14365266263484955, + "learning_rate": 0.001, + "loss": 2.4457, + "step": 17881 + }, + { + "epoch": 0.7564937811997631, + "grad_norm": 0.15732894837856293, + "learning_rate": 0.001, + "loss": 2.6365, + "step": 17882 + }, + { + "epoch": 0.7565360859632795, + "grad_norm": 0.17526346445083618, + "learning_rate": 0.001, + "loss": 2.0218, + "step": 17883 + }, + { + "epoch": 0.7565783907267959, + "grad_norm": 0.15322791039943695, + "learning_rate": 0.001, + "loss": 1.6749, + "step": 17884 + }, + { + "epoch": 0.7566206954903122, + "grad_norm": 0.24172990024089813, + "learning_rate": 0.001, + "loss": 2.5218, + "step": 17885 + }, + { + "epoch": 0.7566630002538286, + "grad_norm": 0.15460051596164703, + "learning_rate": 0.001, + "loss": 2.4132, + "step": 17886 + }, + { + "epoch": 0.7567053050173449, + "grad_norm": 0.13354411721229553, + "learning_rate": 0.001, + "loss": 2.9439, + "step": 17887 + }, + { + "epoch": 0.7567476097808613, + "grad_norm": 0.13748109340667725, + "learning_rate": 0.001, + "loss": 1.4254, + "step": 17888 + }, + { + "epoch": 0.7567899145443777, + "grad_norm": 0.24838584661483765, + "learning_rate": 0.001, + "loss": 2.0226, + "step": 17889 + }, + { + "epoch": 0.756832219307894, + "grad_norm": 0.15621478855609894, + "learning_rate": 0.001, + "loss": 3.084, + "step": 17890 + }, + { + "epoch": 0.7568745240714104, + "grad_norm": 0.543929934501648, + "learning_rate": 0.001, + "loss": 1.88, + "step": 17891 + }, + { + "epoch": 0.7569168288349268, + "grad_norm": 0.4589284062385559, + "learning_rate": 0.001, + "loss": 2.0766, + "step": 17892 + }, + { + "epoch": 0.7569591335984431, + "grad_norm": 0.4541616141796112, + "learning_rate": 0.001, + "loss": 2.6497, + "step": 17893 + }, + { + "epoch": 0.7570014383619595, + "grad_norm": 0.1465660035610199, + "learning_rate": 0.001, + "loss": 2.4708, + "step": 17894 + }, + { + "epoch": 0.757043743125476, + "grad_norm": 0.2618861198425293, + "learning_rate": 0.001, + "loss": 2.753, + "step": 17895 + }, + { + "epoch": 0.7570860478889923, + "grad_norm": 0.15626366436481476, + "learning_rate": 0.001, + "loss": 2.0046, + "step": 17896 + }, + { + "epoch": 0.7571283526525087, + "grad_norm": 0.15386773645877838, + "learning_rate": 0.001, + "loss": 2.168, + "step": 17897 + }, + { + "epoch": 0.7571706574160251, + "grad_norm": 0.15330027043819427, + "learning_rate": 0.001, + "loss": 2.3295, + "step": 17898 + }, + { + "epoch": 0.7572129621795414, + "grad_norm": 0.15845079720020294, + "learning_rate": 0.001, + "loss": 2.0764, + "step": 17899 + }, + { + "epoch": 0.7572552669430578, + "grad_norm": 0.5372272729873657, + "learning_rate": 0.001, + "loss": 2.3282, + "step": 17900 + }, + { + "epoch": 0.7572975717065742, + "grad_norm": 0.1655784249305725, + "learning_rate": 0.001, + "loss": 1.5967, + "step": 17901 + }, + { + "epoch": 0.7573398764700905, + "grad_norm": 0.17060010135173798, + "learning_rate": 0.001, + "loss": 3.1258, + "step": 17902 + }, + { + "epoch": 0.7573821812336069, + "grad_norm": 0.16431990265846252, + "learning_rate": 0.001, + "loss": 1.9269, + "step": 17903 + }, + { + "epoch": 0.7574244859971233, + "grad_norm": 0.17108014225959778, + "learning_rate": 0.001, + "loss": 2.9287, + "step": 17904 + }, + { + "epoch": 0.7574667907606396, + "grad_norm": 0.14799624681472778, + "learning_rate": 0.001, + "loss": 1.8449, + "step": 17905 + }, + { + "epoch": 0.757509095524156, + "grad_norm": 0.3792171776294708, + "learning_rate": 0.001, + "loss": 2.2688, + "step": 17906 + }, + { + "epoch": 0.7575514002876724, + "grad_norm": 0.2001536786556244, + "learning_rate": 0.001, + "loss": 2.1087, + "step": 17907 + }, + { + "epoch": 0.7575937050511887, + "grad_norm": 0.1881311982870102, + "learning_rate": 0.001, + "loss": 2.8072, + "step": 17908 + }, + { + "epoch": 0.7576360098147051, + "grad_norm": 0.16043172776699066, + "learning_rate": 0.001, + "loss": 2.4464, + "step": 17909 + }, + { + "epoch": 0.7576783145782215, + "grad_norm": 0.16389212012290955, + "learning_rate": 0.001, + "loss": 1.8501, + "step": 17910 + }, + { + "epoch": 0.7577206193417378, + "grad_norm": 0.14742010831832886, + "learning_rate": 0.001, + "loss": 1.5688, + "step": 17911 + }, + { + "epoch": 0.7577629241052543, + "grad_norm": 0.3455764949321747, + "learning_rate": 0.001, + "loss": 1.6951, + "step": 17912 + }, + { + "epoch": 0.7578052288687707, + "grad_norm": 0.15572752058506012, + "learning_rate": 0.001, + "loss": 2.8535, + "step": 17913 + }, + { + "epoch": 0.757847533632287, + "grad_norm": 0.14037249982357025, + "learning_rate": 0.001, + "loss": 2.1352, + "step": 17914 + }, + { + "epoch": 0.7578898383958034, + "grad_norm": 0.14692841470241547, + "learning_rate": 0.001, + "loss": 2.1657, + "step": 17915 + }, + { + "epoch": 0.7579321431593198, + "grad_norm": 0.15857268869876862, + "learning_rate": 0.001, + "loss": 2.1919, + "step": 17916 + }, + { + "epoch": 0.7579744479228361, + "grad_norm": 0.20824623107910156, + "learning_rate": 0.001, + "loss": 2.8121, + "step": 17917 + }, + { + "epoch": 0.7580167526863525, + "grad_norm": 0.1485569328069687, + "learning_rate": 0.001, + "loss": 3.027, + "step": 17918 + }, + { + "epoch": 0.7580590574498689, + "grad_norm": 0.3441106081008911, + "learning_rate": 0.001, + "loss": 1.4367, + "step": 17919 + }, + { + "epoch": 0.7581013622133852, + "grad_norm": 0.9153411984443665, + "learning_rate": 0.001, + "loss": 1.6852, + "step": 17920 + }, + { + "epoch": 0.7581436669769016, + "grad_norm": 0.15342696011066437, + "learning_rate": 0.001, + "loss": 2.0896, + "step": 17921 + }, + { + "epoch": 0.758185971740418, + "grad_norm": 0.1278454065322876, + "learning_rate": 0.001, + "loss": 1.6947, + "step": 17922 + }, + { + "epoch": 0.7582282765039343, + "grad_norm": 0.22508245706558228, + "learning_rate": 0.001, + "loss": 2.1324, + "step": 17923 + }, + { + "epoch": 0.7582705812674507, + "grad_norm": 0.1679566353559494, + "learning_rate": 0.001, + "loss": 2.0825, + "step": 17924 + }, + { + "epoch": 0.7583128860309671, + "grad_norm": 0.18976467847824097, + "learning_rate": 0.001, + "loss": 2.0653, + "step": 17925 + }, + { + "epoch": 0.7583551907944834, + "grad_norm": 0.13648661971092224, + "learning_rate": 0.001, + "loss": 1.651, + "step": 17926 + }, + { + "epoch": 0.7583974955579998, + "grad_norm": 0.14440205693244934, + "learning_rate": 0.001, + "loss": 1.8385, + "step": 17927 + }, + { + "epoch": 0.7584398003215163, + "grad_norm": 0.14819002151489258, + "learning_rate": 0.001, + "loss": 1.4996, + "step": 17928 + }, + { + "epoch": 0.7584821050850326, + "grad_norm": 0.17857134342193604, + "learning_rate": 0.001, + "loss": 1.8688, + "step": 17929 + }, + { + "epoch": 0.758524409848549, + "grad_norm": 0.17260822653770447, + "learning_rate": 0.001, + "loss": 1.8572, + "step": 17930 + }, + { + "epoch": 0.7585667146120654, + "grad_norm": 0.15363721549510956, + "learning_rate": 0.001, + "loss": 2.2281, + "step": 17931 + }, + { + "epoch": 0.7586090193755817, + "grad_norm": 0.2327968031167984, + "learning_rate": 0.001, + "loss": 2.4439, + "step": 17932 + }, + { + "epoch": 0.7586513241390981, + "grad_norm": 0.1449955552816391, + "learning_rate": 0.001, + "loss": 1.5978, + "step": 17933 + }, + { + "epoch": 0.7586936289026144, + "grad_norm": 0.14919523894786835, + "learning_rate": 0.001, + "loss": 1.8171, + "step": 17934 + }, + { + "epoch": 0.7587359336661308, + "grad_norm": 0.12412866950035095, + "learning_rate": 0.001, + "loss": 2.6799, + "step": 17935 + }, + { + "epoch": 0.7587782384296472, + "grad_norm": 0.16862471401691437, + "learning_rate": 0.001, + "loss": 2.93, + "step": 17936 + }, + { + "epoch": 0.7588205431931635, + "grad_norm": 0.16229136288166046, + "learning_rate": 0.001, + "loss": 1.2241, + "step": 17937 + }, + { + "epoch": 0.7588628479566799, + "grad_norm": 0.12667950987815857, + "learning_rate": 0.001, + "loss": 1.3335, + "step": 17938 + }, + { + "epoch": 0.7589051527201963, + "grad_norm": 0.1758270263671875, + "learning_rate": 0.001, + "loss": 2.1729, + "step": 17939 + }, + { + "epoch": 0.7589474574837126, + "grad_norm": 0.17729204893112183, + "learning_rate": 0.001, + "loss": 3.5026, + "step": 17940 + }, + { + "epoch": 0.758989762247229, + "grad_norm": 0.16940808296203613, + "learning_rate": 0.001, + "loss": 2.3168, + "step": 17941 + }, + { + "epoch": 0.7590320670107454, + "grad_norm": 0.14315323531627655, + "learning_rate": 0.001, + "loss": 1.6935, + "step": 17942 + }, + { + "epoch": 0.7590743717742617, + "grad_norm": 0.14111851155757904, + "learning_rate": 0.001, + "loss": 1.6618, + "step": 17943 + }, + { + "epoch": 0.7591166765377781, + "grad_norm": 0.28591257333755493, + "learning_rate": 0.001, + "loss": 3.0757, + "step": 17944 + }, + { + "epoch": 0.7591589813012946, + "grad_norm": 0.15771640837192535, + "learning_rate": 0.001, + "loss": 2.0072, + "step": 17945 + }, + { + "epoch": 0.7592012860648109, + "grad_norm": 0.15648822486400604, + "learning_rate": 0.001, + "loss": 2.8095, + "step": 17946 + }, + { + "epoch": 0.7592435908283273, + "grad_norm": 0.20096895098686218, + "learning_rate": 0.001, + "loss": 2.3453, + "step": 17947 + }, + { + "epoch": 0.7592858955918437, + "grad_norm": 0.17997436225414276, + "learning_rate": 0.001, + "loss": 1.9256, + "step": 17948 + }, + { + "epoch": 0.75932820035536, + "grad_norm": 0.1645924299955368, + "learning_rate": 0.001, + "loss": 1.9429, + "step": 17949 + }, + { + "epoch": 0.7593705051188764, + "grad_norm": 0.14926403760910034, + "learning_rate": 0.001, + "loss": 2.39, + "step": 17950 + }, + { + "epoch": 0.7594128098823928, + "grad_norm": 0.17297221720218658, + "learning_rate": 0.001, + "loss": 1.5502, + "step": 17951 + }, + { + "epoch": 0.7594551146459091, + "grad_norm": 0.1334647685289383, + "learning_rate": 0.001, + "loss": 1.7092, + "step": 17952 + }, + { + "epoch": 0.7594974194094255, + "grad_norm": 0.7537721395492554, + "learning_rate": 0.001, + "loss": 1.3184, + "step": 17953 + }, + { + "epoch": 0.7595397241729419, + "grad_norm": 0.2796461880207062, + "learning_rate": 0.001, + "loss": 2.0563, + "step": 17954 + }, + { + "epoch": 0.7595820289364582, + "grad_norm": 0.2787031829357147, + "learning_rate": 0.001, + "loss": 2.0524, + "step": 17955 + }, + { + "epoch": 0.7596243336999746, + "grad_norm": 0.172209694981575, + "learning_rate": 0.001, + "loss": 1.8967, + "step": 17956 + }, + { + "epoch": 0.759666638463491, + "grad_norm": 0.15456169843673706, + "learning_rate": 0.001, + "loss": 2.1134, + "step": 17957 + }, + { + "epoch": 0.7597089432270073, + "grad_norm": 0.16440053284168243, + "learning_rate": 0.001, + "loss": 1.9277, + "step": 17958 + }, + { + "epoch": 0.7597512479905237, + "grad_norm": 0.2558080554008484, + "learning_rate": 0.001, + "loss": 2.6084, + "step": 17959 + }, + { + "epoch": 0.7597935527540401, + "grad_norm": 0.1783752590417862, + "learning_rate": 0.001, + "loss": 2.1082, + "step": 17960 + }, + { + "epoch": 0.7598358575175564, + "grad_norm": 0.14168411493301392, + "learning_rate": 0.001, + "loss": 1.8182, + "step": 17961 + }, + { + "epoch": 0.7598781622810729, + "grad_norm": 0.1725158542394638, + "learning_rate": 0.001, + "loss": 2.5628, + "step": 17962 + }, + { + "epoch": 0.7599204670445893, + "grad_norm": 0.13498811423778534, + "learning_rate": 0.001, + "loss": 2.1323, + "step": 17963 + }, + { + "epoch": 0.7599627718081056, + "grad_norm": 0.15643754601478577, + "learning_rate": 0.001, + "loss": 2.3276, + "step": 17964 + }, + { + "epoch": 0.760005076571622, + "grad_norm": 0.16019053757190704, + "learning_rate": 0.001, + "loss": 2.2398, + "step": 17965 + }, + { + "epoch": 0.7600473813351384, + "grad_norm": 0.1496978998184204, + "learning_rate": 0.001, + "loss": 3.8534, + "step": 17966 + }, + { + "epoch": 0.7600896860986547, + "grad_norm": 0.18137343227863312, + "learning_rate": 0.001, + "loss": 2.4502, + "step": 17967 + }, + { + "epoch": 0.7601319908621711, + "grad_norm": 0.14350903034210205, + "learning_rate": 0.001, + "loss": 2.6869, + "step": 17968 + }, + { + "epoch": 0.7601742956256875, + "grad_norm": 0.17714965343475342, + "learning_rate": 0.001, + "loss": 1.9266, + "step": 17969 + }, + { + "epoch": 0.7602166003892038, + "grad_norm": 0.13317923247814178, + "learning_rate": 0.001, + "loss": 1.5833, + "step": 17970 + }, + { + "epoch": 0.7602589051527202, + "grad_norm": 0.13361962139606476, + "learning_rate": 0.001, + "loss": 1.6225, + "step": 17971 + }, + { + "epoch": 0.7603012099162366, + "grad_norm": 0.1693313717842102, + "learning_rate": 0.001, + "loss": 1.9779, + "step": 17972 + }, + { + "epoch": 0.7603435146797529, + "grad_norm": 1.043839454650879, + "learning_rate": 0.001, + "loss": 1.9982, + "step": 17973 + }, + { + "epoch": 0.7603858194432693, + "grad_norm": 0.19669762253761292, + "learning_rate": 0.001, + "loss": 2.6891, + "step": 17974 + }, + { + "epoch": 0.7604281242067857, + "grad_norm": 0.1441902369260788, + "learning_rate": 0.001, + "loss": 2.0312, + "step": 17975 + }, + { + "epoch": 0.760470428970302, + "grad_norm": 0.13222302496433258, + "learning_rate": 0.001, + "loss": 2.0124, + "step": 17976 + }, + { + "epoch": 0.7605127337338184, + "grad_norm": 0.16443274915218353, + "learning_rate": 0.001, + "loss": 3.1681, + "step": 17977 + }, + { + "epoch": 0.7605550384973347, + "grad_norm": 0.1863120049238205, + "learning_rate": 0.001, + "loss": 2.4867, + "step": 17978 + }, + { + "epoch": 0.7605973432608512, + "grad_norm": 0.15816029906272888, + "learning_rate": 0.001, + "loss": 3.2455, + "step": 17979 + }, + { + "epoch": 0.7606396480243676, + "grad_norm": 0.20944538712501526, + "learning_rate": 0.001, + "loss": 2.5672, + "step": 17980 + }, + { + "epoch": 0.7606819527878839, + "grad_norm": 0.13495473563671112, + "learning_rate": 0.001, + "loss": 1.9507, + "step": 17981 + }, + { + "epoch": 0.7607242575514003, + "grad_norm": 0.160276859998703, + "learning_rate": 0.001, + "loss": 2.5909, + "step": 17982 + }, + { + "epoch": 0.7607665623149167, + "grad_norm": 0.14563147723674774, + "learning_rate": 0.001, + "loss": 1.6701, + "step": 17983 + }, + { + "epoch": 0.760808867078433, + "grad_norm": 0.16634686291217804, + "learning_rate": 0.001, + "loss": 2.6882, + "step": 17984 + }, + { + "epoch": 0.7608511718419494, + "grad_norm": 0.17471922934055328, + "learning_rate": 0.001, + "loss": 3.1525, + "step": 17985 + }, + { + "epoch": 0.7608934766054658, + "grad_norm": 0.18728522956371307, + "learning_rate": 0.001, + "loss": 3.5754, + "step": 17986 + }, + { + "epoch": 0.7609357813689821, + "grad_norm": 0.13889671862125397, + "learning_rate": 0.001, + "loss": 2.6335, + "step": 17987 + }, + { + "epoch": 0.7609780861324985, + "grad_norm": 0.1883813589811325, + "learning_rate": 0.001, + "loss": 2.5539, + "step": 17988 + }, + { + "epoch": 0.7610203908960149, + "grad_norm": 0.16442306339740753, + "learning_rate": 0.001, + "loss": 2.6422, + "step": 17989 + }, + { + "epoch": 0.7610626956595312, + "grad_norm": 0.154715433716774, + "learning_rate": 0.001, + "loss": 2.8743, + "step": 17990 + }, + { + "epoch": 0.7611050004230476, + "grad_norm": 1.595334529876709, + "learning_rate": 0.001, + "loss": 2.4011, + "step": 17991 + }, + { + "epoch": 0.761147305186564, + "grad_norm": 0.14046059548854828, + "learning_rate": 0.001, + "loss": 2.2203, + "step": 17992 + }, + { + "epoch": 0.7611896099500803, + "grad_norm": 0.16517886519432068, + "learning_rate": 0.001, + "loss": 1.8345, + "step": 17993 + }, + { + "epoch": 0.7612319147135967, + "grad_norm": 0.15693698823451996, + "learning_rate": 0.001, + "loss": 2.506, + "step": 17994 + }, + { + "epoch": 0.7612742194771132, + "grad_norm": 0.2535119950771332, + "learning_rate": 0.001, + "loss": 2.2293, + "step": 17995 + }, + { + "epoch": 0.7613165242406295, + "grad_norm": 0.30919238924980164, + "learning_rate": 0.001, + "loss": 2.0888, + "step": 17996 + }, + { + "epoch": 0.7613588290041459, + "grad_norm": 0.21086013317108154, + "learning_rate": 0.001, + "loss": 1.4487, + "step": 17997 + }, + { + "epoch": 0.7614011337676623, + "grad_norm": 0.14983509480953217, + "learning_rate": 0.001, + "loss": 1.9109, + "step": 17998 + }, + { + "epoch": 0.7614434385311786, + "grad_norm": 0.16807793080806732, + "learning_rate": 0.001, + "loss": 4.2247, + "step": 17999 + }, + { + "epoch": 0.761485743294695, + "grad_norm": 0.15390969812870026, + "learning_rate": 0.001, + "loss": 1.5664, + "step": 18000 + }, + { + "epoch": 0.7615280480582114, + "grad_norm": 0.16306841373443604, + "learning_rate": 0.001, + "loss": 1.6577, + "step": 18001 + }, + { + "epoch": 0.7615703528217277, + "grad_norm": 0.16728578507900238, + "learning_rate": 0.001, + "loss": 1.9575, + "step": 18002 + }, + { + "epoch": 0.7616126575852441, + "grad_norm": 0.16129907965660095, + "learning_rate": 0.001, + "loss": 1.9566, + "step": 18003 + }, + { + "epoch": 0.7616549623487605, + "grad_norm": 0.1789942979812622, + "learning_rate": 0.001, + "loss": 1.806, + "step": 18004 + }, + { + "epoch": 0.7616972671122768, + "grad_norm": 0.15799731016159058, + "learning_rate": 0.001, + "loss": 2.8574, + "step": 18005 + }, + { + "epoch": 0.7617395718757932, + "grad_norm": 0.13212554156780243, + "learning_rate": 0.001, + "loss": 1.6819, + "step": 18006 + }, + { + "epoch": 0.7617818766393096, + "grad_norm": 0.13850046694278717, + "learning_rate": 0.001, + "loss": 2.0633, + "step": 18007 + }, + { + "epoch": 0.7618241814028259, + "grad_norm": 0.22076542675495148, + "learning_rate": 0.001, + "loss": 2.0944, + "step": 18008 + }, + { + "epoch": 0.7618664861663423, + "grad_norm": 0.18561388552188873, + "learning_rate": 0.001, + "loss": 1.7448, + "step": 18009 + }, + { + "epoch": 0.7619087909298587, + "grad_norm": 0.15947283804416656, + "learning_rate": 0.001, + "loss": 1.9042, + "step": 18010 + }, + { + "epoch": 0.761951095693375, + "grad_norm": 0.49783751368522644, + "learning_rate": 0.001, + "loss": 2.8313, + "step": 18011 + }, + { + "epoch": 0.7619934004568915, + "grad_norm": 0.16959191858768463, + "learning_rate": 0.001, + "loss": 2.4751, + "step": 18012 + }, + { + "epoch": 0.7620357052204079, + "grad_norm": 0.1717720329761505, + "learning_rate": 0.001, + "loss": 3.4214, + "step": 18013 + }, + { + "epoch": 0.7620780099839242, + "grad_norm": 0.1780117154121399, + "learning_rate": 0.001, + "loss": 2.4375, + "step": 18014 + }, + { + "epoch": 0.7621203147474406, + "grad_norm": 0.1493871957063675, + "learning_rate": 0.001, + "loss": 1.8541, + "step": 18015 + }, + { + "epoch": 0.762162619510957, + "grad_norm": 0.1525392383337021, + "learning_rate": 0.001, + "loss": 1.7101, + "step": 18016 + }, + { + "epoch": 0.7622049242744733, + "grad_norm": 0.14318202435970306, + "learning_rate": 0.001, + "loss": 3.5063, + "step": 18017 + }, + { + "epoch": 0.7622472290379897, + "grad_norm": 0.5359983444213867, + "learning_rate": 0.001, + "loss": 1.6701, + "step": 18018 + }, + { + "epoch": 0.7622895338015061, + "grad_norm": 0.19831795990467072, + "learning_rate": 0.001, + "loss": 2.242, + "step": 18019 + }, + { + "epoch": 0.7623318385650224, + "grad_norm": 0.4935970604419708, + "learning_rate": 0.001, + "loss": 2.697, + "step": 18020 + }, + { + "epoch": 0.7623741433285388, + "grad_norm": 0.19180017709732056, + "learning_rate": 0.001, + "loss": 2.217, + "step": 18021 + }, + { + "epoch": 0.7624164480920551, + "grad_norm": 0.25960850715637207, + "learning_rate": 0.001, + "loss": 1.561, + "step": 18022 + }, + { + "epoch": 0.7624587528555715, + "grad_norm": 0.25631117820739746, + "learning_rate": 0.001, + "loss": 2.6779, + "step": 18023 + }, + { + "epoch": 0.7625010576190879, + "grad_norm": 2.3049187660217285, + "learning_rate": 0.001, + "loss": 2.1273, + "step": 18024 + }, + { + "epoch": 0.7625433623826042, + "grad_norm": 0.1636704057455063, + "learning_rate": 0.001, + "loss": 1.6586, + "step": 18025 + }, + { + "epoch": 0.7625856671461206, + "grad_norm": 0.17722514271736145, + "learning_rate": 0.001, + "loss": 1.6602, + "step": 18026 + }, + { + "epoch": 0.762627971909637, + "grad_norm": 0.22410543262958527, + "learning_rate": 0.001, + "loss": 2.5346, + "step": 18027 + }, + { + "epoch": 0.7626702766731533, + "grad_norm": 0.15916574001312256, + "learning_rate": 0.001, + "loss": 2.2563, + "step": 18028 + }, + { + "epoch": 0.7627125814366698, + "grad_norm": 0.17527827620506287, + "learning_rate": 0.001, + "loss": 2.4785, + "step": 18029 + }, + { + "epoch": 0.7627548862001862, + "grad_norm": 0.2694207727909088, + "learning_rate": 0.001, + "loss": 2.3781, + "step": 18030 + }, + { + "epoch": 0.7627971909637025, + "grad_norm": 0.2880823314189911, + "learning_rate": 0.001, + "loss": 2.6001, + "step": 18031 + }, + { + "epoch": 0.7628394957272189, + "grad_norm": 0.165915384888649, + "learning_rate": 0.001, + "loss": 2.18, + "step": 18032 + }, + { + "epoch": 0.7628818004907353, + "grad_norm": 0.19809134304523468, + "learning_rate": 0.001, + "loss": 2.8755, + "step": 18033 + }, + { + "epoch": 0.7629241052542516, + "grad_norm": 0.18514703214168549, + "learning_rate": 0.001, + "loss": 1.9967, + "step": 18034 + }, + { + "epoch": 0.762966410017768, + "grad_norm": 0.17875197529792786, + "learning_rate": 0.001, + "loss": 2.2859, + "step": 18035 + }, + { + "epoch": 0.7630087147812844, + "grad_norm": 0.1806037873029709, + "learning_rate": 0.001, + "loss": 2.6236, + "step": 18036 + }, + { + "epoch": 0.7630510195448007, + "grad_norm": 0.15905499458312988, + "learning_rate": 0.001, + "loss": 2.1554, + "step": 18037 + }, + { + "epoch": 0.7630933243083171, + "grad_norm": 0.1544172763824463, + "learning_rate": 0.001, + "loss": 1.8756, + "step": 18038 + }, + { + "epoch": 0.7631356290718335, + "grad_norm": 0.20902614295482635, + "learning_rate": 0.001, + "loss": 2.6106, + "step": 18039 + }, + { + "epoch": 0.7631779338353498, + "grad_norm": 0.15740123391151428, + "learning_rate": 0.001, + "loss": 2.3193, + "step": 18040 + }, + { + "epoch": 0.7632202385988662, + "grad_norm": 0.15449416637420654, + "learning_rate": 0.001, + "loss": 1.6584, + "step": 18041 + }, + { + "epoch": 0.7632625433623826, + "grad_norm": 0.17526280879974365, + "learning_rate": 0.001, + "loss": 1.9062, + "step": 18042 + }, + { + "epoch": 0.7633048481258989, + "grad_norm": 0.19311073422431946, + "learning_rate": 0.001, + "loss": 2.3969, + "step": 18043 + }, + { + "epoch": 0.7633471528894153, + "grad_norm": 0.17864413559436798, + "learning_rate": 0.001, + "loss": 1.7516, + "step": 18044 + }, + { + "epoch": 0.7633894576529318, + "grad_norm": 0.16350750625133514, + "learning_rate": 0.001, + "loss": 1.8934, + "step": 18045 + }, + { + "epoch": 0.7634317624164481, + "grad_norm": 0.29366812109947205, + "learning_rate": 0.001, + "loss": 3.6984, + "step": 18046 + }, + { + "epoch": 0.7634740671799645, + "grad_norm": 0.8536985516548157, + "learning_rate": 0.001, + "loss": 1.4539, + "step": 18047 + }, + { + "epoch": 0.7635163719434809, + "grad_norm": 0.1530151069164276, + "learning_rate": 0.001, + "loss": 1.9095, + "step": 18048 + }, + { + "epoch": 0.7635586767069972, + "grad_norm": 0.1519416868686676, + "learning_rate": 0.001, + "loss": 1.5347, + "step": 18049 + }, + { + "epoch": 0.7636009814705136, + "grad_norm": 0.31431666016578674, + "learning_rate": 0.001, + "loss": 2.056, + "step": 18050 + }, + { + "epoch": 0.76364328623403, + "grad_norm": 0.9129075407981873, + "learning_rate": 0.001, + "loss": 1.9324, + "step": 18051 + }, + { + "epoch": 0.7636855909975463, + "grad_norm": 0.1397624909877777, + "learning_rate": 0.001, + "loss": 1.7459, + "step": 18052 + }, + { + "epoch": 0.7637278957610627, + "grad_norm": 0.1634640395641327, + "learning_rate": 0.001, + "loss": 2.0907, + "step": 18053 + }, + { + "epoch": 0.7637702005245791, + "grad_norm": 0.18420040607452393, + "learning_rate": 0.001, + "loss": 2.1298, + "step": 18054 + }, + { + "epoch": 0.7638125052880954, + "grad_norm": 1.812959909439087, + "learning_rate": 0.001, + "loss": 3.1896, + "step": 18055 + }, + { + "epoch": 0.7638548100516118, + "grad_norm": 0.40835338830947876, + "learning_rate": 0.001, + "loss": 2.3623, + "step": 18056 + }, + { + "epoch": 0.7638971148151282, + "grad_norm": 5.703685283660889, + "learning_rate": 0.001, + "loss": 2.2481, + "step": 18057 + }, + { + "epoch": 0.7639394195786445, + "grad_norm": 0.18355365097522736, + "learning_rate": 0.001, + "loss": 2.5109, + "step": 18058 + }, + { + "epoch": 0.7639817243421609, + "grad_norm": 38.65842056274414, + "learning_rate": 0.001, + "loss": 2.5208, + "step": 18059 + }, + { + "epoch": 0.7640240291056773, + "grad_norm": 0.1402842104434967, + "learning_rate": 0.001, + "loss": 2.0796, + "step": 18060 + }, + { + "epoch": 0.7640663338691936, + "grad_norm": 0.28925231099128723, + "learning_rate": 0.001, + "loss": 2.1025, + "step": 18061 + }, + { + "epoch": 0.7641086386327101, + "grad_norm": 0.1598457396030426, + "learning_rate": 0.001, + "loss": 1.9011, + "step": 18062 + }, + { + "epoch": 0.7641509433962265, + "grad_norm": 0.21429701149463654, + "learning_rate": 0.001, + "loss": 1.6832, + "step": 18063 + }, + { + "epoch": 0.7641932481597428, + "grad_norm": 0.1933804154396057, + "learning_rate": 0.001, + "loss": 2.435, + "step": 18064 + }, + { + "epoch": 0.7642355529232592, + "grad_norm": 0.21492686867713928, + "learning_rate": 0.001, + "loss": 1.7783, + "step": 18065 + }, + { + "epoch": 0.7642778576867756, + "grad_norm": 0.19295653700828552, + "learning_rate": 0.001, + "loss": 1.6563, + "step": 18066 + }, + { + "epoch": 0.7643201624502919, + "grad_norm": 0.22416500747203827, + "learning_rate": 0.001, + "loss": 1.7468, + "step": 18067 + }, + { + "epoch": 0.7643624672138083, + "grad_norm": 0.1974020004272461, + "learning_rate": 0.001, + "loss": 2.6946, + "step": 18068 + }, + { + "epoch": 0.7644047719773246, + "grad_norm": 0.1901572346687317, + "learning_rate": 0.001, + "loss": 2.1431, + "step": 18069 + }, + { + "epoch": 0.764447076740841, + "grad_norm": 0.1712511032819748, + "learning_rate": 0.001, + "loss": 2.1096, + "step": 18070 + }, + { + "epoch": 0.7644893815043574, + "grad_norm": 0.1516590565443039, + "learning_rate": 0.001, + "loss": 1.7473, + "step": 18071 + }, + { + "epoch": 0.7645316862678737, + "grad_norm": 0.7932908535003662, + "learning_rate": 0.001, + "loss": 1.8818, + "step": 18072 + }, + { + "epoch": 0.7645739910313901, + "grad_norm": 0.12099297344684601, + "learning_rate": 0.001, + "loss": 1.6472, + "step": 18073 + }, + { + "epoch": 0.7646162957949065, + "grad_norm": 0.1378883421421051, + "learning_rate": 0.001, + "loss": 1.8047, + "step": 18074 + }, + { + "epoch": 0.7646586005584228, + "grad_norm": 0.1538868397474289, + "learning_rate": 0.001, + "loss": 2.1053, + "step": 18075 + }, + { + "epoch": 0.7647009053219392, + "grad_norm": 2.756589889526367, + "learning_rate": 0.001, + "loss": 2.4575, + "step": 18076 + }, + { + "epoch": 0.7647432100854556, + "grad_norm": 0.16562233865261078, + "learning_rate": 0.001, + "loss": 1.8964, + "step": 18077 + }, + { + "epoch": 0.764785514848972, + "grad_norm": 0.17701025307178497, + "learning_rate": 0.001, + "loss": 2.101, + "step": 18078 + }, + { + "epoch": 0.7648278196124884, + "grad_norm": 0.2576713562011719, + "learning_rate": 0.001, + "loss": 1.7816, + "step": 18079 + }, + { + "epoch": 0.7648701243760048, + "grad_norm": 0.20448629558086395, + "learning_rate": 0.001, + "loss": 2.3767, + "step": 18080 + }, + { + "epoch": 0.7649124291395211, + "grad_norm": 0.18108433485031128, + "learning_rate": 0.001, + "loss": 1.4452, + "step": 18081 + }, + { + "epoch": 0.7649547339030375, + "grad_norm": 0.17302542924880981, + "learning_rate": 0.001, + "loss": 2.3699, + "step": 18082 + }, + { + "epoch": 0.7649970386665539, + "grad_norm": 0.16477636992931366, + "learning_rate": 0.001, + "loss": 2.436, + "step": 18083 + }, + { + "epoch": 0.7650393434300702, + "grad_norm": 0.1961863785982132, + "learning_rate": 0.001, + "loss": 2.1719, + "step": 18084 + }, + { + "epoch": 0.7650816481935866, + "grad_norm": 0.14665809273719788, + "learning_rate": 0.001, + "loss": 1.5338, + "step": 18085 + }, + { + "epoch": 0.765123952957103, + "grad_norm": 0.16945761442184448, + "learning_rate": 0.001, + "loss": 3.4643, + "step": 18086 + }, + { + "epoch": 0.7651662577206193, + "grad_norm": 0.16046762466430664, + "learning_rate": 0.001, + "loss": 1.7755, + "step": 18087 + }, + { + "epoch": 0.7652085624841357, + "grad_norm": 0.15023651719093323, + "learning_rate": 0.001, + "loss": 2.3674, + "step": 18088 + }, + { + "epoch": 0.7652508672476521, + "grad_norm": 0.18328256905078888, + "learning_rate": 0.001, + "loss": 3.5899, + "step": 18089 + }, + { + "epoch": 0.7652931720111684, + "grad_norm": 0.164375901222229, + "learning_rate": 0.001, + "loss": 2.2567, + "step": 18090 + }, + { + "epoch": 0.7653354767746848, + "grad_norm": 0.15443533658981323, + "learning_rate": 0.001, + "loss": 1.6702, + "step": 18091 + }, + { + "epoch": 0.7653777815382012, + "grad_norm": 0.1532910019159317, + "learning_rate": 0.001, + "loss": 1.7926, + "step": 18092 + }, + { + "epoch": 0.7654200863017175, + "grad_norm": 0.17233017086982727, + "learning_rate": 0.001, + "loss": 1.9303, + "step": 18093 + }, + { + "epoch": 0.765462391065234, + "grad_norm": 0.3771326243877411, + "learning_rate": 0.001, + "loss": 2.6317, + "step": 18094 + }, + { + "epoch": 0.7655046958287504, + "grad_norm": 0.15392722189426422, + "learning_rate": 0.001, + "loss": 2.2798, + "step": 18095 + }, + { + "epoch": 0.7655470005922667, + "grad_norm": 0.15488563477993011, + "learning_rate": 0.001, + "loss": 2.1111, + "step": 18096 + }, + { + "epoch": 0.7655893053557831, + "grad_norm": 0.17814667522907257, + "learning_rate": 0.001, + "loss": 3.3561, + "step": 18097 + }, + { + "epoch": 0.7656316101192995, + "grad_norm": 0.26778027415275574, + "learning_rate": 0.001, + "loss": 1.7169, + "step": 18098 + }, + { + "epoch": 0.7656739148828158, + "grad_norm": 0.18762215971946716, + "learning_rate": 0.001, + "loss": 2.769, + "step": 18099 + }, + { + "epoch": 0.7657162196463322, + "grad_norm": 20.262495040893555, + "learning_rate": 0.001, + "loss": 2.2304, + "step": 18100 + }, + { + "epoch": 0.7657585244098486, + "grad_norm": 0.15660881996154785, + "learning_rate": 0.001, + "loss": 2.1479, + "step": 18101 + }, + { + "epoch": 0.7658008291733649, + "grad_norm": 0.1680443435907364, + "learning_rate": 0.001, + "loss": 1.9905, + "step": 18102 + }, + { + "epoch": 0.7658431339368813, + "grad_norm": 0.16119442880153656, + "learning_rate": 0.001, + "loss": 2.5656, + "step": 18103 + }, + { + "epoch": 0.7658854387003977, + "grad_norm": 0.1531805545091629, + "learning_rate": 0.001, + "loss": 1.4154, + "step": 18104 + }, + { + "epoch": 0.765927743463914, + "grad_norm": 1.081006407737732, + "learning_rate": 0.001, + "loss": 2.4958, + "step": 18105 + }, + { + "epoch": 0.7659700482274304, + "grad_norm": 0.24466001987457275, + "learning_rate": 0.001, + "loss": 2.9361, + "step": 18106 + }, + { + "epoch": 0.7660123529909468, + "grad_norm": 0.17916032671928406, + "learning_rate": 0.001, + "loss": 1.9019, + "step": 18107 + }, + { + "epoch": 0.7660546577544631, + "grad_norm": 0.19994664192199707, + "learning_rate": 0.001, + "loss": 1.9385, + "step": 18108 + }, + { + "epoch": 0.7660969625179795, + "grad_norm": 0.2043987214565277, + "learning_rate": 0.001, + "loss": 2.122, + "step": 18109 + }, + { + "epoch": 0.766139267281496, + "grad_norm": 0.18687529861927032, + "learning_rate": 0.001, + "loss": 1.7168, + "step": 18110 + }, + { + "epoch": 0.7661815720450122, + "grad_norm": 0.32144445180892944, + "learning_rate": 0.001, + "loss": 2.2964, + "step": 18111 + }, + { + "epoch": 0.7662238768085287, + "grad_norm": 0.1715632677078247, + "learning_rate": 0.001, + "loss": 1.7487, + "step": 18112 + }, + { + "epoch": 0.766266181572045, + "grad_norm": 0.22985003888607025, + "learning_rate": 0.001, + "loss": 2.0248, + "step": 18113 + }, + { + "epoch": 0.7663084863355614, + "grad_norm": 1.1345404386520386, + "learning_rate": 0.001, + "loss": 2.2758, + "step": 18114 + }, + { + "epoch": 0.7663507910990778, + "grad_norm": 0.18999895453453064, + "learning_rate": 0.001, + "loss": 1.8452, + "step": 18115 + }, + { + "epoch": 0.7663930958625941, + "grad_norm": 0.8503357768058777, + "learning_rate": 0.001, + "loss": 2.1866, + "step": 18116 + }, + { + "epoch": 0.7664354006261105, + "grad_norm": 0.19124074280261993, + "learning_rate": 0.001, + "loss": 2.4042, + "step": 18117 + }, + { + "epoch": 0.7664777053896269, + "grad_norm": 0.20431570708751678, + "learning_rate": 0.001, + "loss": 1.7436, + "step": 18118 + }, + { + "epoch": 0.7665200101531432, + "grad_norm": 0.2346670776605606, + "learning_rate": 0.001, + "loss": 2.3448, + "step": 18119 + }, + { + "epoch": 0.7665623149166596, + "grad_norm": 0.19955824315547943, + "learning_rate": 0.001, + "loss": 3.1449, + "step": 18120 + }, + { + "epoch": 0.766604619680176, + "grad_norm": 0.1841670423746109, + "learning_rate": 0.001, + "loss": 1.9185, + "step": 18121 + }, + { + "epoch": 0.7666469244436923, + "grad_norm": 0.44874322414398193, + "learning_rate": 0.001, + "loss": 2.127, + "step": 18122 + }, + { + "epoch": 0.7666892292072087, + "grad_norm": 0.1544332355260849, + "learning_rate": 0.001, + "loss": 2.2697, + "step": 18123 + }, + { + "epoch": 0.7667315339707251, + "grad_norm": 0.1674651801586151, + "learning_rate": 0.001, + "loss": 1.9383, + "step": 18124 + }, + { + "epoch": 0.7667738387342414, + "grad_norm": 3.113783359527588, + "learning_rate": 0.001, + "loss": 1.4092, + "step": 18125 + }, + { + "epoch": 0.7668161434977578, + "grad_norm": 0.1927240639925003, + "learning_rate": 0.001, + "loss": 3.3805, + "step": 18126 + }, + { + "epoch": 0.7668584482612743, + "grad_norm": 0.21248434484004974, + "learning_rate": 0.001, + "loss": 2.3386, + "step": 18127 + }, + { + "epoch": 0.7669007530247905, + "grad_norm": 0.2704300582408905, + "learning_rate": 0.001, + "loss": 2.011, + "step": 18128 + }, + { + "epoch": 0.766943057788307, + "grad_norm": 0.33151501417160034, + "learning_rate": 0.001, + "loss": 1.9698, + "step": 18129 + }, + { + "epoch": 0.7669853625518234, + "grad_norm": 0.2982079088687897, + "learning_rate": 0.001, + "loss": 2.9232, + "step": 18130 + }, + { + "epoch": 0.7670276673153397, + "grad_norm": 0.185440793633461, + "learning_rate": 0.001, + "loss": 2.1382, + "step": 18131 + }, + { + "epoch": 0.7670699720788561, + "grad_norm": 0.1537931114435196, + "learning_rate": 0.001, + "loss": 3.2961, + "step": 18132 + }, + { + "epoch": 0.7671122768423725, + "grad_norm": 0.19790029525756836, + "learning_rate": 0.001, + "loss": 1.857, + "step": 18133 + }, + { + "epoch": 0.7671545816058888, + "grad_norm": 0.16664250195026398, + "learning_rate": 0.001, + "loss": 2.3612, + "step": 18134 + }, + { + "epoch": 0.7671968863694052, + "grad_norm": 0.19958829879760742, + "learning_rate": 0.001, + "loss": 1.444, + "step": 18135 + }, + { + "epoch": 0.7672391911329216, + "grad_norm": 0.25182950496673584, + "learning_rate": 0.001, + "loss": 2.2047, + "step": 18136 + }, + { + "epoch": 0.7672814958964379, + "grad_norm": 0.19357016682624817, + "learning_rate": 0.001, + "loss": 2.5632, + "step": 18137 + }, + { + "epoch": 0.7673238006599543, + "grad_norm": 0.20123781263828278, + "learning_rate": 0.001, + "loss": 3.6875, + "step": 18138 + }, + { + "epoch": 0.7673661054234707, + "grad_norm": 0.21408236026763916, + "learning_rate": 0.001, + "loss": 2.7204, + "step": 18139 + }, + { + "epoch": 0.767408410186987, + "grad_norm": 0.1696946918964386, + "learning_rate": 0.001, + "loss": 2.0886, + "step": 18140 + }, + { + "epoch": 0.7674507149505034, + "grad_norm": 0.8207197189331055, + "learning_rate": 0.001, + "loss": 2.7853, + "step": 18141 + }, + { + "epoch": 0.7674930197140198, + "grad_norm": 2.7755489349365234, + "learning_rate": 0.001, + "loss": 4.5532, + "step": 18142 + }, + { + "epoch": 0.7675353244775361, + "grad_norm": 0.20465411245822906, + "learning_rate": 0.001, + "loss": 1.803, + "step": 18143 + }, + { + "epoch": 0.7675776292410526, + "grad_norm": 0.16455033421516418, + "learning_rate": 0.001, + "loss": 2.0663, + "step": 18144 + }, + { + "epoch": 0.767619934004569, + "grad_norm": 3.680520534515381, + "learning_rate": 0.001, + "loss": 1.9112, + "step": 18145 + }, + { + "epoch": 0.7676622387680853, + "grad_norm": 0.20432768762111664, + "learning_rate": 0.001, + "loss": 1.8184, + "step": 18146 + }, + { + "epoch": 0.7677045435316017, + "grad_norm": 0.193999245762825, + "learning_rate": 0.001, + "loss": 2.0712, + "step": 18147 + }, + { + "epoch": 0.7677468482951181, + "grad_norm": 0.18594130873680115, + "learning_rate": 0.001, + "loss": 1.8831, + "step": 18148 + }, + { + "epoch": 0.7677891530586344, + "grad_norm": 5.5671491622924805, + "learning_rate": 0.001, + "loss": 2.2832, + "step": 18149 + }, + { + "epoch": 0.7678314578221508, + "grad_norm": 0.22034242749214172, + "learning_rate": 0.001, + "loss": 3.3777, + "step": 18150 + }, + { + "epoch": 0.7678737625856672, + "grad_norm": 0.16389696300029755, + "learning_rate": 0.001, + "loss": 2.6919, + "step": 18151 + }, + { + "epoch": 0.7679160673491835, + "grad_norm": 4.396114349365234, + "learning_rate": 0.001, + "loss": 2.7745, + "step": 18152 + }, + { + "epoch": 0.7679583721126999, + "grad_norm": 0.18078528344631195, + "learning_rate": 0.001, + "loss": 2.5654, + "step": 18153 + }, + { + "epoch": 0.7680006768762163, + "grad_norm": 0.2645280361175537, + "learning_rate": 0.001, + "loss": 1.831, + "step": 18154 + }, + { + "epoch": 0.7680429816397326, + "grad_norm": 0.2649464011192322, + "learning_rate": 0.001, + "loss": 2.6965, + "step": 18155 + }, + { + "epoch": 0.768085286403249, + "grad_norm": 0.22064730525016785, + "learning_rate": 0.001, + "loss": 2.3275, + "step": 18156 + }, + { + "epoch": 0.7681275911667654, + "grad_norm": 0.22132393717765808, + "learning_rate": 0.001, + "loss": 1.9728, + "step": 18157 + }, + { + "epoch": 0.7681698959302817, + "grad_norm": 0.20887607336044312, + "learning_rate": 0.001, + "loss": 2.3195, + "step": 18158 + }, + { + "epoch": 0.7682122006937981, + "grad_norm": 1.3075305223464966, + "learning_rate": 0.001, + "loss": 2.5001, + "step": 18159 + }, + { + "epoch": 0.7682545054573144, + "grad_norm": 0.395717978477478, + "learning_rate": 0.001, + "loss": 2.147, + "step": 18160 + }, + { + "epoch": 0.7682968102208309, + "grad_norm": 0.18617072701454163, + "learning_rate": 0.001, + "loss": 2.0584, + "step": 18161 + }, + { + "epoch": 0.7683391149843473, + "grad_norm": 2.8809750080108643, + "learning_rate": 0.001, + "loss": 3.424, + "step": 18162 + }, + { + "epoch": 0.7683814197478636, + "grad_norm": 0.1813565492630005, + "learning_rate": 0.001, + "loss": 2.0456, + "step": 18163 + }, + { + "epoch": 0.76842372451138, + "grad_norm": 0.15641596913337708, + "learning_rate": 0.001, + "loss": 2.2862, + "step": 18164 + }, + { + "epoch": 0.7684660292748964, + "grad_norm": 0.1734810769557953, + "learning_rate": 0.001, + "loss": 3.9354, + "step": 18165 + }, + { + "epoch": 0.7685083340384127, + "grad_norm": 0.17491766810417175, + "learning_rate": 0.001, + "loss": 2.1404, + "step": 18166 + }, + { + "epoch": 0.7685506388019291, + "grad_norm": 0.1915895938873291, + "learning_rate": 0.001, + "loss": 2.9, + "step": 18167 + }, + { + "epoch": 0.7685929435654455, + "grad_norm": 0.1676090508699417, + "learning_rate": 0.001, + "loss": 2.2399, + "step": 18168 + }, + { + "epoch": 0.7686352483289618, + "grad_norm": 0.19483682513237, + "learning_rate": 0.001, + "loss": 2.1038, + "step": 18169 + }, + { + "epoch": 0.7686775530924782, + "grad_norm": 0.19385400414466858, + "learning_rate": 0.001, + "loss": 1.5603, + "step": 18170 + }, + { + "epoch": 0.7687198578559946, + "grad_norm": 0.15440934896469116, + "learning_rate": 0.001, + "loss": 1.5675, + "step": 18171 + }, + { + "epoch": 0.7687621626195109, + "grad_norm": 0.1513241082429886, + "learning_rate": 0.001, + "loss": 1.92, + "step": 18172 + }, + { + "epoch": 0.7688044673830273, + "grad_norm": 0.1512671858072281, + "learning_rate": 0.001, + "loss": 2.1714, + "step": 18173 + }, + { + "epoch": 0.7688467721465437, + "grad_norm": 0.31254085898399353, + "learning_rate": 0.001, + "loss": 4.2346, + "step": 18174 + }, + { + "epoch": 0.76888907691006, + "grad_norm": 0.14157380163669586, + "learning_rate": 0.001, + "loss": 1.5382, + "step": 18175 + }, + { + "epoch": 0.7689313816735764, + "grad_norm": 0.1927814781665802, + "learning_rate": 0.001, + "loss": 2.2692, + "step": 18176 + }, + { + "epoch": 0.7689736864370929, + "grad_norm": 0.6695175766944885, + "learning_rate": 0.001, + "loss": 2.4544, + "step": 18177 + }, + { + "epoch": 0.7690159912006092, + "grad_norm": 0.15355220437049866, + "learning_rate": 0.001, + "loss": 1.2551, + "step": 18178 + }, + { + "epoch": 0.7690582959641256, + "grad_norm": 1.621734380722046, + "learning_rate": 0.001, + "loss": 2.1838, + "step": 18179 + }, + { + "epoch": 0.769100600727642, + "grad_norm": 0.13713295757770538, + "learning_rate": 0.001, + "loss": 1.9986, + "step": 18180 + }, + { + "epoch": 0.7691429054911583, + "grad_norm": 0.3079672157764435, + "learning_rate": 0.001, + "loss": 2.9259, + "step": 18181 + }, + { + "epoch": 0.7691852102546747, + "grad_norm": 0.7089863419532776, + "learning_rate": 0.001, + "loss": 2.9016, + "step": 18182 + }, + { + "epoch": 0.7692275150181911, + "grad_norm": 1.244018316268921, + "learning_rate": 0.001, + "loss": 3.3858, + "step": 18183 + }, + { + "epoch": 0.7692698197817074, + "grad_norm": 0.19387851655483246, + "learning_rate": 0.001, + "loss": 3.4741, + "step": 18184 + }, + { + "epoch": 0.7693121245452238, + "grad_norm": 0.20450355112552643, + "learning_rate": 0.001, + "loss": 1.7754, + "step": 18185 + }, + { + "epoch": 0.7693544293087402, + "grad_norm": 0.18945342302322388, + "learning_rate": 0.001, + "loss": 1.9047, + "step": 18186 + }, + { + "epoch": 0.7693967340722565, + "grad_norm": 0.191238135099411, + "learning_rate": 0.001, + "loss": 2.1089, + "step": 18187 + }, + { + "epoch": 0.7694390388357729, + "grad_norm": 0.2367229014635086, + "learning_rate": 0.001, + "loss": 2.7953, + "step": 18188 + }, + { + "epoch": 0.7694813435992893, + "grad_norm": 2.5353989601135254, + "learning_rate": 0.001, + "loss": 1.9474, + "step": 18189 + }, + { + "epoch": 0.7695236483628056, + "grad_norm": 2.5594029426574707, + "learning_rate": 0.001, + "loss": 1.7098, + "step": 18190 + }, + { + "epoch": 0.769565953126322, + "grad_norm": 0.21357464790344238, + "learning_rate": 0.001, + "loss": 3.2049, + "step": 18191 + }, + { + "epoch": 0.7696082578898384, + "grad_norm": 0.1735057681798935, + "learning_rate": 0.001, + "loss": 3.6089, + "step": 18192 + }, + { + "epoch": 0.7696505626533547, + "grad_norm": 0.18362459540367126, + "learning_rate": 0.001, + "loss": 1.9546, + "step": 18193 + }, + { + "epoch": 0.7696928674168712, + "grad_norm": 2.1002511978149414, + "learning_rate": 0.001, + "loss": 2.7012, + "step": 18194 + }, + { + "epoch": 0.7697351721803876, + "grad_norm": 0.1482006460428238, + "learning_rate": 0.001, + "loss": 1.4662, + "step": 18195 + }, + { + "epoch": 0.7697774769439039, + "grad_norm": 0.6873133778572083, + "learning_rate": 0.001, + "loss": 4.3751, + "step": 18196 + }, + { + "epoch": 0.7698197817074203, + "grad_norm": 0.19415785372257233, + "learning_rate": 0.001, + "loss": 2.9213, + "step": 18197 + }, + { + "epoch": 0.7698620864709367, + "grad_norm": 0.18056610226631165, + "learning_rate": 0.001, + "loss": 2.1502, + "step": 18198 + }, + { + "epoch": 0.769904391234453, + "grad_norm": 0.15959571301937103, + "learning_rate": 0.001, + "loss": 2.5785, + "step": 18199 + }, + { + "epoch": 0.7699466959979694, + "grad_norm": 1.567895770072937, + "learning_rate": 0.001, + "loss": 1.4776, + "step": 18200 + }, + { + "epoch": 0.7699890007614858, + "grad_norm": 0.17681176960468292, + "learning_rate": 0.001, + "loss": 1.8845, + "step": 18201 + }, + { + "epoch": 0.7700313055250021, + "grad_norm": 0.16770051419734955, + "learning_rate": 0.001, + "loss": 1.8349, + "step": 18202 + }, + { + "epoch": 0.7700736102885185, + "grad_norm": 2.6900062561035156, + "learning_rate": 0.001, + "loss": 2.6632, + "step": 18203 + }, + { + "epoch": 0.7701159150520348, + "grad_norm": 0.18311475217342377, + "learning_rate": 0.001, + "loss": 1.4378, + "step": 18204 + }, + { + "epoch": 0.7701582198155512, + "grad_norm": 0.19490399956703186, + "learning_rate": 0.001, + "loss": 2.3233, + "step": 18205 + }, + { + "epoch": 0.7702005245790676, + "grad_norm": 0.17303673923015594, + "learning_rate": 0.001, + "loss": 1.9792, + "step": 18206 + }, + { + "epoch": 0.7702428293425839, + "grad_norm": 0.18099454045295715, + "learning_rate": 0.001, + "loss": 2.1376, + "step": 18207 + }, + { + "epoch": 0.7702851341061003, + "grad_norm": 0.1846335083246231, + "learning_rate": 0.001, + "loss": 2.6739, + "step": 18208 + }, + { + "epoch": 0.7703274388696167, + "grad_norm": 0.9617498517036438, + "learning_rate": 0.001, + "loss": 2.0516, + "step": 18209 + }, + { + "epoch": 0.770369743633133, + "grad_norm": 0.1959528774023056, + "learning_rate": 0.001, + "loss": 1.6241, + "step": 18210 + }, + { + "epoch": 0.7704120483966495, + "grad_norm": 0.18654143810272217, + "learning_rate": 0.001, + "loss": 2.8616, + "step": 18211 + }, + { + "epoch": 0.7704543531601659, + "grad_norm": 0.19656072556972504, + "learning_rate": 0.001, + "loss": 1.9619, + "step": 18212 + }, + { + "epoch": 0.7704966579236822, + "grad_norm": 0.42158690094947815, + "learning_rate": 0.001, + "loss": 2.3077, + "step": 18213 + }, + { + "epoch": 0.7705389626871986, + "grad_norm": 0.2899084687232971, + "learning_rate": 0.001, + "loss": 1.9099, + "step": 18214 + }, + { + "epoch": 0.770581267450715, + "grad_norm": 0.19894026219844818, + "learning_rate": 0.001, + "loss": 2.0858, + "step": 18215 + }, + { + "epoch": 0.7706235722142313, + "grad_norm": 0.16096381843090057, + "learning_rate": 0.001, + "loss": 1.724, + "step": 18216 + }, + { + "epoch": 0.7706658769777477, + "grad_norm": 0.19218985736370087, + "learning_rate": 0.001, + "loss": 2.7003, + "step": 18217 + }, + { + "epoch": 0.7707081817412641, + "grad_norm": 0.24575604498386383, + "learning_rate": 0.001, + "loss": 2.7877, + "step": 18218 + }, + { + "epoch": 0.7707504865047804, + "grad_norm": 0.17563903331756592, + "learning_rate": 0.001, + "loss": 2.192, + "step": 18219 + }, + { + "epoch": 0.7707927912682968, + "grad_norm": 0.18912796676158905, + "learning_rate": 0.001, + "loss": 2.3836, + "step": 18220 + }, + { + "epoch": 0.7708350960318132, + "grad_norm": 0.16776569187641144, + "learning_rate": 0.001, + "loss": 1.9411, + "step": 18221 + }, + { + "epoch": 0.7708774007953295, + "grad_norm": 0.18997344374656677, + "learning_rate": 0.001, + "loss": 2.5683, + "step": 18222 + }, + { + "epoch": 0.7709197055588459, + "grad_norm": 3.814743757247925, + "learning_rate": 0.001, + "loss": 2.4773, + "step": 18223 + }, + { + "epoch": 0.7709620103223623, + "grad_norm": 0.17335738241672516, + "learning_rate": 0.001, + "loss": 1.8626, + "step": 18224 + }, + { + "epoch": 0.7710043150858786, + "grad_norm": 1.7937777042388916, + "learning_rate": 0.001, + "loss": 2.8774, + "step": 18225 + }, + { + "epoch": 0.771046619849395, + "grad_norm": 0.16799603402614594, + "learning_rate": 0.001, + "loss": 1.7608, + "step": 18226 + }, + { + "epoch": 0.7710889246129115, + "grad_norm": 0.17312444746494293, + "learning_rate": 0.001, + "loss": 2.4437, + "step": 18227 + }, + { + "epoch": 0.7711312293764278, + "grad_norm": 1.968964695930481, + "learning_rate": 0.001, + "loss": 1.916, + "step": 18228 + }, + { + "epoch": 0.7711735341399442, + "grad_norm": 0.1496216207742691, + "learning_rate": 0.001, + "loss": 1.789, + "step": 18229 + }, + { + "epoch": 0.7712158389034606, + "grad_norm": 0.15248148143291473, + "learning_rate": 0.001, + "loss": 2.2559, + "step": 18230 + }, + { + "epoch": 0.7712581436669769, + "grad_norm": 0.2088804543018341, + "learning_rate": 0.001, + "loss": 2.0968, + "step": 18231 + }, + { + "epoch": 0.7713004484304933, + "grad_norm": 0.20815947651863098, + "learning_rate": 0.001, + "loss": 2.6797, + "step": 18232 + }, + { + "epoch": 0.7713427531940097, + "grad_norm": 0.5374844670295715, + "learning_rate": 0.001, + "loss": 3.3856, + "step": 18233 + }, + { + "epoch": 0.771385057957526, + "grad_norm": 0.26020729541778564, + "learning_rate": 0.001, + "loss": 1.9107, + "step": 18234 + }, + { + "epoch": 0.7714273627210424, + "grad_norm": 0.27381160855293274, + "learning_rate": 0.001, + "loss": 2.0782, + "step": 18235 + }, + { + "epoch": 0.7714696674845588, + "grad_norm": 0.23972102999687195, + "learning_rate": 0.001, + "loss": 2.4307, + "step": 18236 + }, + { + "epoch": 0.7715119722480751, + "grad_norm": 0.1825583428144455, + "learning_rate": 0.001, + "loss": 1.7901, + "step": 18237 + }, + { + "epoch": 0.7715542770115915, + "grad_norm": 0.28334465622901917, + "learning_rate": 0.001, + "loss": 1.9967, + "step": 18238 + }, + { + "epoch": 0.7715965817751079, + "grad_norm": 1.4033657312393188, + "learning_rate": 0.001, + "loss": 3.312, + "step": 18239 + }, + { + "epoch": 0.7716388865386242, + "grad_norm": 0.20028841495513916, + "learning_rate": 0.001, + "loss": 1.7072, + "step": 18240 + }, + { + "epoch": 0.7716811913021406, + "grad_norm": 0.1848575472831726, + "learning_rate": 0.001, + "loss": 1.6997, + "step": 18241 + }, + { + "epoch": 0.771723496065657, + "grad_norm": 0.1979037970304489, + "learning_rate": 0.001, + "loss": 2.4092, + "step": 18242 + }, + { + "epoch": 0.7717658008291733, + "grad_norm": 0.2577193081378937, + "learning_rate": 0.001, + "loss": 2.3364, + "step": 18243 + }, + { + "epoch": 0.7718081055926898, + "grad_norm": 0.21102125942707062, + "learning_rate": 0.001, + "loss": 2.8763, + "step": 18244 + }, + { + "epoch": 0.7718504103562062, + "grad_norm": 0.17282043397426605, + "learning_rate": 0.001, + "loss": 1.6663, + "step": 18245 + }, + { + "epoch": 0.7718927151197225, + "grad_norm": 0.2045888453722, + "learning_rate": 0.001, + "loss": 2.0012, + "step": 18246 + }, + { + "epoch": 0.7719350198832389, + "grad_norm": 0.22302895784378052, + "learning_rate": 0.001, + "loss": 2.485, + "step": 18247 + }, + { + "epoch": 0.7719773246467552, + "grad_norm": 0.19768020510673523, + "learning_rate": 0.001, + "loss": 2.3736, + "step": 18248 + }, + { + "epoch": 0.7720196294102716, + "grad_norm": 0.15795622766017914, + "learning_rate": 0.001, + "loss": 1.9866, + "step": 18249 + }, + { + "epoch": 0.772061934173788, + "grad_norm": 0.1597718894481659, + "learning_rate": 0.001, + "loss": 2.3061, + "step": 18250 + }, + { + "epoch": 0.7721042389373043, + "grad_norm": 0.14455056190490723, + "learning_rate": 0.001, + "loss": 2.7711, + "step": 18251 + }, + { + "epoch": 0.7721465437008207, + "grad_norm": 0.16666211187839508, + "learning_rate": 0.001, + "loss": 2.4853, + "step": 18252 + }, + { + "epoch": 0.7721888484643371, + "grad_norm": 0.15804295241832733, + "learning_rate": 0.001, + "loss": 2.7146, + "step": 18253 + }, + { + "epoch": 0.7722311532278534, + "grad_norm": 0.14087384939193726, + "learning_rate": 0.001, + "loss": 1.6901, + "step": 18254 + }, + { + "epoch": 0.7722734579913698, + "grad_norm": 0.24747240543365479, + "learning_rate": 0.001, + "loss": 2.3874, + "step": 18255 + }, + { + "epoch": 0.7723157627548862, + "grad_norm": 0.20662999153137207, + "learning_rate": 0.001, + "loss": 1.7758, + "step": 18256 + }, + { + "epoch": 0.7723580675184025, + "grad_norm": 0.15056248009204865, + "learning_rate": 0.001, + "loss": 2.264, + "step": 18257 + }, + { + "epoch": 0.7724003722819189, + "grad_norm": 0.1590527892112732, + "learning_rate": 0.001, + "loss": 2.5744, + "step": 18258 + }, + { + "epoch": 0.7724426770454353, + "grad_norm": 0.18478628993034363, + "learning_rate": 0.001, + "loss": 2.4574, + "step": 18259 + }, + { + "epoch": 0.7724849818089516, + "grad_norm": 0.47995075583457947, + "learning_rate": 0.001, + "loss": 2.2169, + "step": 18260 + }, + { + "epoch": 0.772527286572468, + "grad_norm": 0.18007495999336243, + "learning_rate": 0.001, + "loss": 2.0898, + "step": 18261 + }, + { + "epoch": 0.7725695913359845, + "grad_norm": 0.15627458691596985, + "learning_rate": 0.001, + "loss": 2.0773, + "step": 18262 + }, + { + "epoch": 0.7726118960995008, + "grad_norm": 0.18800586462020874, + "learning_rate": 0.001, + "loss": 3.2193, + "step": 18263 + }, + { + "epoch": 0.7726542008630172, + "grad_norm": 0.3073885440826416, + "learning_rate": 0.001, + "loss": 2.0712, + "step": 18264 + }, + { + "epoch": 0.7726965056265336, + "grad_norm": 0.1837342083454132, + "learning_rate": 0.001, + "loss": 3.1809, + "step": 18265 + }, + { + "epoch": 0.7727388103900499, + "grad_norm": 0.24296334385871887, + "learning_rate": 0.001, + "loss": 2.2377, + "step": 18266 + }, + { + "epoch": 0.7727811151535663, + "grad_norm": 0.17990991473197937, + "learning_rate": 0.001, + "loss": 2.3957, + "step": 18267 + }, + { + "epoch": 0.7728234199170827, + "grad_norm": 0.1581333428621292, + "learning_rate": 0.001, + "loss": 2.1276, + "step": 18268 + }, + { + "epoch": 0.772865724680599, + "grad_norm": 0.1533811092376709, + "learning_rate": 0.001, + "loss": 1.9765, + "step": 18269 + }, + { + "epoch": 0.7729080294441154, + "grad_norm": 0.3548398017883301, + "learning_rate": 0.001, + "loss": 3.269, + "step": 18270 + }, + { + "epoch": 0.7729503342076318, + "grad_norm": 0.16134926676750183, + "learning_rate": 0.001, + "loss": 2.0527, + "step": 18271 + }, + { + "epoch": 0.7729926389711481, + "grad_norm": 0.713190495967865, + "learning_rate": 0.001, + "loss": 1.8366, + "step": 18272 + }, + { + "epoch": 0.7730349437346645, + "grad_norm": 0.8022021055221558, + "learning_rate": 0.001, + "loss": 1.7423, + "step": 18273 + }, + { + "epoch": 0.7730772484981809, + "grad_norm": 0.38094237446784973, + "learning_rate": 0.001, + "loss": 2.4639, + "step": 18274 + }, + { + "epoch": 0.7731195532616972, + "grad_norm": 0.1615244299173355, + "learning_rate": 0.001, + "loss": 2.2037, + "step": 18275 + }, + { + "epoch": 0.7731618580252136, + "grad_norm": 0.23506537079811096, + "learning_rate": 0.001, + "loss": 2.9041, + "step": 18276 + }, + { + "epoch": 0.77320416278873, + "grad_norm": 0.26342645287513733, + "learning_rate": 0.001, + "loss": 2.0227, + "step": 18277 + }, + { + "epoch": 0.7732464675522464, + "grad_norm": 0.16665269434452057, + "learning_rate": 0.001, + "loss": 1.5072, + "step": 18278 + }, + { + "epoch": 0.7732887723157628, + "grad_norm": 0.15748518705368042, + "learning_rate": 0.001, + "loss": 1.9506, + "step": 18279 + }, + { + "epoch": 0.7733310770792792, + "grad_norm": 0.20437052845954895, + "learning_rate": 0.001, + "loss": 1.7317, + "step": 18280 + }, + { + "epoch": 0.7733733818427955, + "grad_norm": 0.1785205602645874, + "learning_rate": 0.001, + "loss": 2.6008, + "step": 18281 + }, + { + "epoch": 0.7734156866063119, + "grad_norm": 0.15185962617397308, + "learning_rate": 0.001, + "loss": 2.136, + "step": 18282 + }, + { + "epoch": 0.7734579913698283, + "grad_norm": 0.1808384209871292, + "learning_rate": 0.001, + "loss": 1.8633, + "step": 18283 + }, + { + "epoch": 0.7735002961333446, + "grad_norm": 0.2651410698890686, + "learning_rate": 0.001, + "loss": 2.2161, + "step": 18284 + }, + { + "epoch": 0.773542600896861, + "grad_norm": 0.15414567291736603, + "learning_rate": 0.001, + "loss": 2.2774, + "step": 18285 + }, + { + "epoch": 0.7735849056603774, + "grad_norm": 0.16589045524597168, + "learning_rate": 0.001, + "loss": 1.5533, + "step": 18286 + }, + { + "epoch": 0.7736272104238937, + "grad_norm": 0.15032519400119781, + "learning_rate": 0.001, + "loss": 1.6888, + "step": 18287 + }, + { + "epoch": 0.7736695151874101, + "grad_norm": 0.7096246480941772, + "learning_rate": 0.001, + "loss": 1.9838, + "step": 18288 + }, + { + "epoch": 0.7737118199509265, + "grad_norm": 0.16361872851848602, + "learning_rate": 0.001, + "loss": 1.6044, + "step": 18289 + }, + { + "epoch": 0.7737541247144428, + "grad_norm": 0.3765888810157776, + "learning_rate": 0.001, + "loss": 1.7739, + "step": 18290 + }, + { + "epoch": 0.7737964294779592, + "grad_norm": 0.3084273934364319, + "learning_rate": 0.001, + "loss": 1.2695, + "step": 18291 + }, + { + "epoch": 0.7738387342414756, + "grad_norm": 0.17378173768520355, + "learning_rate": 0.001, + "loss": 1.9922, + "step": 18292 + }, + { + "epoch": 0.7738810390049919, + "grad_norm": 0.671561598777771, + "learning_rate": 0.001, + "loss": 2.7077, + "step": 18293 + }, + { + "epoch": 0.7739233437685084, + "grad_norm": 0.17134688794612885, + "learning_rate": 0.001, + "loss": 2.1455, + "step": 18294 + }, + { + "epoch": 0.7739656485320247, + "grad_norm": 0.15048515796661377, + "learning_rate": 0.001, + "loss": 2.2337, + "step": 18295 + }, + { + "epoch": 0.7740079532955411, + "grad_norm": 0.14688226580619812, + "learning_rate": 0.001, + "loss": 2.2607, + "step": 18296 + }, + { + "epoch": 0.7740502580590575, + "grad_norm": 0.15102264285087585, + "learning_rate": 0.001, + "loss": 2.3796, + "step": 18297 + }, + { + "epoch": 0.7740925628225738, + "grad_norm": 0.15952768921852112, + "learning_rate": 0.001, + "loss": 2.2732, + "step": 18298 + }, + { + "epoch": 0.7741348675860902, + "grad_norm": 0.15502522885799408, + "learning_rate": 0.001, + "loss": 2.3354, + "step": 18299 + }, + { + "epoch": 0.7741771723496066, + "grad_norm": 0.17360159754753113, + "learning_rate": 0.001, + "loss": 1.4465, + "step": 18300 + }, + { + "epoch": 0.7742194771131229, + "grad_norm": 0.14462324976921082, + "learning_rate": 0.001, + "loss": 1.5657, + "step": 18301 + }, + { + "epoch": 0.7742617818766393, + "grad_norm": 0.22618313133716583, + "learning_rate": 0.001, + "loss": 1.497, + "step": 18302 + }, + { + "epoch": 0.7743040866401557, + "grad_norm": 0.14733265340328217, + "learning_rate": 0.001, + "loss": 2.7168, + "step": 18303 + }, + { + "epoch": 0.774346391403672, + "grad_norm": 0.16763785481452942, + "learning_rate": 0.001, + "loss": 2.0218, + "step": 18304 + }, + { + "epoch": 0.7743886961671884, + "grad_norm": 0.14719419181346893, + "learning_rate": 0.001, + "loss": 1.8908, + "step": 18305 + }, + { + "epoch": 0.7744310009307048, + "grad_norm": 0.14694784581661224, + "learning_rate": 0.001, + "loss": 2.3633, + "step": 18306 + }, + { + "epoch": 0.7744733056942211, + "grad_norm": 0.1397048681974411, + "learning_rate": 0.001, + "loss": 1.574, + "step": 18307 + }, + { + "epoch": 0.7745156104577375, + "grad_norm": 0.138363778591156, + "learning_rate": 0.001, + "loss": 2.1553, + "step": 18308 + }, + { + "epoch": 0.774557915221254, + "grad_norm": 0.17978732287883759, + "learning_rate": 0.001, + "loss": 2.1659, + "step": 18309 + }, + { + "epoch": 0.7746002199847702, + "grad_norm": 2.1564154624938965, + "learning_rate": 0.001, + "loss": 2.5153, + "step": 18310 + }, + { + "epoch": 0.7746425247482867, + "grad_norm": 0.1512441188097, + "learning_rate": 0.001, + "loss": 1.7129, + "step": 18311 + }, + { + "epoch": 0.7746848295118031, + "grad_norm": 0.21785925328731537, + "learning_rate": 0.001, + "loss": 2.0366, + "step": 18312 + }, + { + "epoch": 0.7747271342753194, + "grad_norm": 0.5639128684997559, + "learning_rate": 0.001, + "loss": 1.4766, + "step": 18313 + }, + { + "epoch": 0.7747694390388358, + "grad_norm": 0.2070891112089157, + "learning_rate": 0.001, + "loss": 2.2829, + "step": 18314 + }, + { + "epoch": 0.7748117438023522, + "grad_norm": 0.16625727713108063, + "learning_rate": 0.001, + "loss": 1.5067, + "step": 18315 + }, + { + "epoch": 0.7748540485658685, + "grad_norm": 0.5115146040916443, + "learning_rate": 0.001, + "loss": 2.7549, + "step": 18316 + }, + { + "epoch": 0.7748963533293849, + "grad_norm": 0.17250525951385498, + "learning_rate": 0.001, + "loss": 1.7243, + "step": 18317 + }, + { + "epoch": 0.7749386580929013, + "grad_norm": 1.3713905811309814, + "learning_rate": 0.001, + "loss": 2.225, + "step": 18318 + }, + { + "epoch": 0.7749809628564176, + "grad_norm": 0.21677060425281525, + "learning_rate": 0.001, + "loss": 3.3116, + "step": 18319 + }, + { + "epoch": 0.775023267619934, + "grad_norm": 0.21051226556301117, + "learning_rate": 0.001, + "loss": 2.2168, + "step": 18320 + }, + { + "epoch": 0.7750655723834504, + "grad_norm": 0.15511994063854218, + "learning_rate": 0.001, + "loss": 1.6332, + "step": 18321 + }, + { + "epoch": 0.7751078771469667, + "grad_norm": 0.19078783690929413, + "learning_rate": 0.001, + "loss": 2.149, + "step": 18322 + }, + { + "epoch": 0.7751501819104831, + "grad_norm": 0.186100572347641, + "learning_rate": 0.001, + "loss": 2.3936, + "step": 18323 + }, + { + "epoch": 0.7751924866739995, + "grad_norm": 0.19699545204639435, + "learning_rate": 0.001, + "loss": 1.6859, + "step": 18324 + }, + { + "epoch": 0.7752347914375158, + "grad_norm": 0.1534021496772766, + "learning_rate": 0.001, + "loss": 1.7206, + "step": 18325 + }, + { + "epoch": 0.7752770962010322, + "grad_norm": 0.1593978852033615, + "learning_rate": 0.001, + "loss": 2.3795, + "step": 18326 + }, + { + "epoch": 0.7753194009645487, + "grad_norm": 0.9152129292488098, + "learning_rate": 0.001, + "loss": 2.5192, + "step": 18327 + }, + { + "epoch": 0.775361705728065, + "grad_norm": 3.8987066745758057, + "learning_rate": 0.001, + "loss": 1.7634, + "step": 18328 + }, + { + "epoch": 0.7754040104915814, + "grad_norm": 0.16911078989505768, + "learning_rate": 0.001, + "loss": 1.7858, + "step": 18329 + }, + { + "epoch": 0.7754463152550978, + "grad_norm": 0.1814729869365692, + "learning_rate": 0.001, + "loss": 2.2558, + "step": 18330 + }, + { + "epoch": 0.7754886200186141, + "grad_norm": 0.15336821973323822, + "learning_rate": 0.001, + "loss": 2.127, + "step": 18331 + }, + { + "epoch": 0.7755309247821305, + "grad_norm": 0.7835121750831604, + "learning_rate": 0.001, + "loss": 2.1583, + "step": 18332 + }, + { + "epoch": 0.7755732295456469, + "grad_norm": 0.19294503331184387, + "learning_rate": 0.001, + "loss": 2.2585, + "step": 18333 + }, + { + "epoch": 0.7756155343091632, + "grad_norm": 6.45863151550293, + "learning_rate": 0.001, + "loss": 2.1715, + "step": 18334 + }, + { + "epoch": 0.7756578390726796, + "grad_norm": 0.17118413746356964, + "learning_rate": 0.001, + "loss": 1.3514, + "step": 18335 + }, + { + "epoch": 0.775700143836196, + "grad_norm": 0.24696171283721924, + "learning_rate": 0.001, + "loss": 2.2035, + "step": 18336 + }, + { + "epoch": 0.7757424485997123, + "grad_norm": 0.33256638050079346, + "learning_rate": 0.001, + "loss": 2.5719, + "step": 18337 + }, + { + "epoch": 0.7757847533632287, + "grad_norm": 0.19757241010665894, + "learning_rate": 0.001, + "loss": 2.4873, + "step": 18338 + }, + { + "epoch": 0.775827058126745, + "grad_norm": 0.17525336146354675, + "learning_rate": 0.001, + "loss": 2.2333, + "step": 18339 + }, + { + "epoch": 0.7758693628902614, + "grad_norm": 0.15870629251003265, + "learning_rate": 0.001, + "loss": 1.8791, + "step": 18340 + }, + { + "epoch": 0.7759116676537778, + "grad_norm": 0.13413628935813904, + "learning_rate": 0.001, + "loss": 2.8583, + "step": 18341 + }, + { + "epoch": 0.7759539724172941, + "grad_norm": 0.15136483311653137, + "learning_rate": 0.001, + "loss": 2.368, + "step": 18342 + }, + { + "epoch": 0.7759962771808105, + "grad_norm": 0.15002518892288208, + "learning_rate": 0.001, + "loss": 1.6174, + "step": 18343 + }, + { + "epoch": 0.776038581944327, + "grad_norm": 0.15149597823619843, + "learning_rate": 0.001, + "loss": 2.7551, + "step": 18344 + }, + { + "epoch": 0.7760808867078433, + "grad_norm": 0.548881471157074, + "learning_rate": 0.001, + "loss": 1.8676, + "step": 18345 + }, + { + "epoch": 0.7761231914713597, + "grad_norm": 0.1294621080160141, + "learning_rate": 0.001, + "loss": 1.3881, + "step": 18346 + }, + { + "epoch": 0.7761654962348761, + "grad_norm": 0.1591734141111374, + "learning_rate": 0.001, + "loss": 2.6516, + "step": 18347 + }, + { + "epoch": 0.7762078009983924, + "grad_norm": 0.7913060784339905, + "learning_rate": 0.001, + "loss": 2.4505, + "step": 18348 + }, + { + "epoch": 0.7762501057619088, + "grad_norm": 8.328814506530762, + "learning_rate": 0.001, + "loss": 2.4966, + "step": 18349 + }, + { + "epoch": 0.7762924105254252, + "grad_norm": 0.15105003118515015, + "learning_rate": 0.001, + "loss": 2.0033, + "step": 18350 + }, + { + "epoch": 0.7763347152889415, + "grad_norm": 0.1463308483362198, + "learning_rate": 0.001, + "loss": 2.31, + "step": 18351 + }, + { + "epoch": 0.7763770200524579, + "grad_norm": 1.781333327293396, + "learning_rate": 0.001, + "loss": 3.0765, + "step": 18352 + }, + { + "epoch": 0.7764193248159743, + "grad_norm": 0.17749984562397003, + "learning_rate": 0.001, + "loss": 2.4567, + "step": 18353 + }, + { + "epoch": 0.7764616295794906, + "grad_norm": 0.16004282236099243, + "learning_rate": 0.001, + "loss": 2.1417, + "step": 18354 + }, + { + "epoch": 0.776503934343007, + "grad_norm": 0.18965484201908112, + "learning_rate": 0.001, + "loss": 2.5025, + "step": 18355 + }, + { + "epoch": 0.7765462391065234, + "grad_norm": 0.4342174530029297, + "learning_rate": 0.001, + "loss": 1.8881, + "step": 18356 + }, + { + "epoch": 0.7765885438700397, + "grad_norm": 1.2400356531143188, + "learning_rate": 0.001, + "loss": 3.1552, + "step": 18357 + }, + { + "epoch": 0.7766308486335561, + "grad_norm": 0.33312109112739563, + "learning_rate": 0.001, + "loss": 2.3914, + "step": 18358 + }, + { + "epoch": 0.7766731533970725, + "grad_norm": 0.1646675318479538, + "learning_rate": 0.001, + "loss": 2.3323, + "step": 18359 + }, + { + "epoch": 0.7767154581605888, + "grad_norm": 0.16873008012771606, + "learning_rate": 0.001, + "loss": 2.777, + "step": 18360 + }, + { + "epoch": 0.7767577629241053, + "grad_norm": 0.2983357906341553, + "learning_rate": 0.001, + "loss": 2.6237, + "step": 18361 + }, + { + "epoch": 0.7768000676876217, + "grad_norm": 10.579153060913086, + "learning_rate": 0.001, + "loss": 1.5843, + "step": 18362 + }, + { + "epoch": 0.776842372451138, + "grad_norm": 0.1804652363061905, + "learning_rate": 0.001, + "loss": 2.4111, + "step": 18363 + }, + { + "epoch": 0.7768846772146544, + "grad_norm": 0.24087312817573547, + "learning_rate": 0.001, + "loss": 2.2012, + "step": 18364 + }, + { + "epoch": 0.7769269819781708, + "grad_norm": 0.19113610684871674, + "learning_rate": 0.001, + "loss": 2.2822, + "step": 18365 + }, + { + "epoch": 0.7769692867416871, + "grad_norm": 0.20484258234500885, + "learning_rate": 0.001, + "loss": 2.0493, + "step": 18366 + }, + { + "epoch": 0.7770115915052035, + "grad_norm": 0.19894827902317047, + "learning_rate": 0.001, + "loss": 3.3502, + "step": 18367 + }, + { + "epoch": 0.7770538962687199, + "grad_norm": 0.20970451831817627, + "learning_rate": 0.001, + "loss": 1.6449, + "step": 18368 + }, + { + "epoch": 0.7770962010322362, + "grad_norm": 0.1429346650838852, + "learning_rate": 0.001, + "loss": 1.8039, + "step": 18369 + }, + { + "epoch": 0.7771385057957526, + "grad_norm": 0.16938546299934387, + "learning_rate": 0.001, + "loss": 1.9748, + "step": 18370 + }, + { + "epoch": 0.777180810559269, + "grad_norm": 0.9999366998672485, + "learning_rate": 0.001, + "loss": 1.5648, + "step": 18371 + }, + { + "epoch": 0.7772231153227853, + "grad_norm": 0.16697566211223602, + "learning_rate": 0.001, + "loss": 1.9137, + "step": 18372 + }, + { + "epoch": 0.7772654200863017, + "grad_norm": 0.1452540159225464, + "learning_rate": 0.001, + "loss": 2.1988, + "step": 18373 + }, + { + "epoch": 0.7773077248498181, + "grad_norm": 0.19160769879817963, + "learning_rate": 0.001, + "loss": 3.139, + "step": 18374 + }, + { + "epoch": 0.7773500296133344, + "grad_norm": 0.1582278609275818, + "learning_rate": 0.001, + "loss": 2.0894, + "step": 18375 + }, + { + "epoch": 0.7773923343768508, + "grad_norm": 0.27617523074150085, + "learning_rate": 0.001, + "loss": 2.507, + "step": 18376 + }, + { + "epoch": 0.7774346391403673, + "grad_norm": 0.15912631154060364, + "learning_rate": 0.001, + "loss": 1.2789, + "step": 18377 + }, + { + "epoch": 0.7774769439038836, + "grad_norm": 0.13841427862644196, + "learning_rate": 0.001, + "loss": 1.6491, + "step": 18378 + }, + { + "epoch": 0.7775192486674, + "grad_norm": 0.1486634612083435, + "learning_rate": 0.001, + "loss": 1.6654, + "step": 18379 + }, + { + "epoch": 0.7775615534309164, + "grad_norm": 0.16134053468704224, + "learning_rate": 0.001, + "loss": 1.6808, + "step": 18380 + }, + { + "epoch": 0.7776038581944327, + "grad_norm": 0.5859285593032837, + "learning_rate": 0.001, + "loss": 1.9378, + "step": 18381 + }, + { + "epoch": 0.7776461629579491, + "grad_norm": 0.1989150196313858, + "learning_rate": 0.001, + "loss": 2.8664, + "step": 18382 + }, + { + "epoch": 0.7776884677214654, + "grad_norm": 0.21168352663516998, + "learning_rate": 0.001, + "loss": 2.7791, + "step": 18383 + }, + { + "epoch": 0.7777307724849818, + "grad_norm": 0.16079917550086975, + "learning_rate": 0.001, + "loss": 1.5344, + "step": 18384 + }, + { + "epoch": 0.7777730772484982, + "grad_norm": 0.17647407948970795, + "learning_rate": 0.001, + "loss": 2.9867, + "step": 18385 + }, + { + "epoch": 0.7778153820120145, + "grad_norm": 0.15070800483226776, + "learning_rate": 0.001, + "loss": 2.0682, + "step": 18386 + }, + { + "epoch": 0.7778576867755309, + "grad_norm": 0.1633099466562271, + "learning_rate": 0.001, + "loss": 1.5573, + "step": 18387 + }, + { + "epoch": 0.7778999915390473, + "grad_norm": 0.1457083374261856, + "learning_rate": 0.001, + "loss": 1.8068, + "step": 18388 + }, + { + "epoch": 0.7779422963025636, + "grad_norm": 0.1393345296382904, + "learning_rate": 0.001, + "loss": 2.3662, + "step": 18389 + }, + { + "epoch": 0.77798460106608, + "grad_norm": 0.17682814598083496, + "learning_rate": 0.001, + "loss": 2.6571, + "step": 18390 + }, + { + "epoch": 0.7780269058295964, + "grad_norm": 0.1801498383283615, + "learning_rate": 0.001, + "loss": 2.6531, + "step": 18391 + }, + { + "epoch": 0.7780692105931127, + "grad_norm": 0.14461541175842285, + "learning_rate": 0.001, + "loss": 1.6549, + "step": 18392 + }, + { + "epoch": 0.7781115153566291, + "grad_norm": 0.16726653277873993, + "learning_rate": 0.001, + "loss": 2.2859, + "step": 18393 + }, + { + "epoch": 0.7781538201201456, + "grad_norm": 0.1532205492258072, + "learning_rate": 0.001, + "loss": 1.4662, + "step": 18394 + }, + { + "epoch": 0.7781961248836619, + "grad_norm": 0.13347402215003967, + "learning_rate": 0.001, + "loss": 2.1848, + "step": 18395 + }, + { + "epoch": 0.7782384296471783, + "grad_norm": 0.20856840908527374, + "learning_rate": 0.001, + "loss": 2.0527, + "step": 18396 + }, + { + "epoch": 0.7782807344106947, + "grad_norm": 0.15012764930725098, + "learning_rate": 0.001, + "loss": 2.0695, + "step": 18397 + }, + { + "epoch": 0.778323039174211, + "grad_norm": 2.177178144454956, + "learning_rate": 0.001, + "loss": 1.4426, + "step": 18398 + }, + { + "epoch": 0.7783653439377274, + "grad_norm": 0.22611618041992188, + "learning_rate": 0.001, + "loss": 2.052, + "step": 18399 + }, + { + "epoch": 0.7784076487012438, + "grad_norm": 0.1328599900007248, + "learning_rate": 0.001, + "loss": 1.7561, + "step": 18400 + }, + { + "epoch": 0.7784499534647601, + "grad_norm": 0.14337456226348877, + "learning_rate": 0.001, + "loss": 1.1991, + "step": 18401 + }, + { + "epoch": 0.7784922582282765, + "grad_norm": 0.15648649632930756, + "learning_rate": 0.001, + "loss": 2.0966, + "step": 18402 + }, + { + "epoch": 0.7785345629917929, + "grad_norm": 0.15187643468379974, + "learning_rate": 0.001, + "loss": 1.8629, + "step": 18403 + }, + { + "epoch": 0.7785768677553092, + "grad_norm": 0.12914754450321198, + "learning_rate": 0.001, + "loss": 1.6909, + "step": 18404 + }, + { + "epoch": 0.7786191725188256, + "grad_norm": 0.14733096957206726, + "learning_rate": 0.001, + "loss": 1.9314, + "step": 18405 + }, + { + "epoch": 0.778661477282342, + "grad_norm": 0.19098341464996338, + "learning_rate": 0.001, + "loss": 2.2461, + "step": 18406 + }, + { + "epoch": 0.7787037820458583, + "grad_norm": 0.14427435398101807, + "learning_rate": 0.001, + "loss": 1.5976, + "step": 18407 + }, + { + "epoch": 0.7787460868093747, + "grad_norm": 0.14376568794250488, + "learning_rate": 0.001, + "loss": 1.7142, + "step": 18408 + }, + { + "epoch": 0.7787883915728911, + "grad_norm": 0.1582573801279068, + "learning_rate": 0.001, + "loss": 3.5712, + "step": 18409 + }, + { + "epoch": 0.7788306963364074, + "grad_norm": 0.16873016953468323, + "learning_rate": 0.001, + "loss": 2.0883, + "step": 18410 + }, + { + "epoch": 0.7788730010999239, + "grad_norm": 0.14867006242275238, + "learning_rate": 0.001, + "loss": 1.5226, + "step": 18411 + }, + { + "epoch": 0.7789153058634403, + "grad_norm": 0.1830413043498993, + "learning_rate": 0.001, + "loss": 1.5773, + "step": 18412 + }, + { + "epoch": 0.7789576106269566, + "grad_norm": 0.21609345078468323, + "learning_rate": 0.001, + "loss": 2.2652, + "step": 18413 + }, + { + "epoch": 0.778999915390473, + "grad_norm": 0.36571288108825684, + "learning_rate": 0.001, + "loss": 1.8297, + "step": 18414 + }, + { + "epoch": 0.7790422201539894, + "grad_norm": 0.15142600238323212, + "learning_rate": 0.001, + "loss": 1.5149, + "step": 18415 + }, + { + "epoch": 0.7790845249175057, + "grad_norm": 0.7442256212234497, + "learning_rate": 0.001, + "loss": 4.0185, + "step": 18416 + }, + { + "epoch": 0.7791268296810221, + "grad_norm": 0.28748589754104614, + "learning_rate": 0.001, + "loss": 3.1263, + "step": 18417 + }, + { + "epoch": 0.7791691344445385, + "grad_norm": 0.16540317237377167, + "learning_rate": 0.001, + "loss": 3.5016, + "step": 18418 + }, + { + "epoch": 0.7792114392080548, + "grad_norm": 0.15089643001556396, + "learning_rate": 0.001, + "loss": 2.1887, + "step": 18419 + }, + { + "epoch": 0.7792537439715712, + "grad_norm": 0.18112583458423615, + "learning_rate": 0.001, + "loss": 1.6532, + "step": 18420 + }, + { + "epoch": 0.7792960487350876, + "grad_norm": 0.14614035189151764, + "learning_rate": 0.001, + "loss": 1.8557, + "step": 18421 + }, + { + "epoch": 0.7793383534986039, + "grad_norm": 0.1359279751777649, + "learning_rate": 0.001, + "loss": 1.6901, + "step": 18422 + }, + { + "epoch": 0.7793806582621203, + "grad_norm": 1.8162649869918823, + "learning_rate": 0.001, + "loss": 2.0045, + "step": 18423 + }, + { + "epoch": 0.7794229630256367, + "grad_norm": 0.33419543504714966, + "learning_rate": 0.001, + "loss": 2.7161, + "step": 18424 + }, + { + "epoch": 0.779465267789153, + "grad_norm": 0.14467467367649078, + "learning_rate": 0.001, + "loss": 1.7674, + "step": 18425 + }, + { + "epoch": 0.7795075725526694, + "grad_norm": 0.17725689709186554, + "learning_rate": 0.001, + "loss": 1.6678, + "step": 18426 + }, + { + "epoch": 0.7795498773161859, + "grad_norm": 0.1722497195005417, + "learning_rate": 0.001, + "loss": 1.9924, + "step": 18427 + }, + { + "epoch": 0.7795921820797022, + "grad_norm": 0.17150290310382843, + "learning_rate": 0.001, + "loss": 1.5635, + "step": 18428 + }, + { + "epoch": 0.7796344868432186, + "grad_norm": 0.14820575714111328, + "learning_rate": 0.001, + "loss": 2.9991, + "step": 18429 + }, + { + "epoch": 0.7796767916067349, + "grad_norm": 0.18718744814395905, + "learning_rate": 0.001, + "loss": 2.1338, + "step": 18430 + }, + { + "epoch": 0.7797190963702513, + "grad_norm": 0.17354276776313782, + "learning_rate": 0.001, + "loss": 2.0303, + "step": 18431 + }, + { + "epoch": 0.7797614011337677, + "grad_norm": 0.17366386950016022, + "learning_rate": 0.001, + "loss": 1.5947, + "step": 18432 + }, + { + "epoch": 0.779803705897284, + "grad_norm": 0.1608889102935791, + "learning_rate": 0.001, + "loss": 3.5028, + "step": 18433 + }, + { + "epoch": 0.7798460106608004, + "grad_norm": 0.16302143037319183, + "learning_rate": 0.001, + "loss": 2.1618, + "step": 18434 + }, + { + "epoch": 0.7798883154243168, + "grad_norm": 0.19998212158679962, + "learning_rate": 0.001, + "loss": 1.3854, + "step": 18435 + }, + { + "epoch": 0.7799306201878331, + "grad_norm": 0.20762750506401062, + "learning_rate": 0.001, + "loss": 3.0683, + "step": 18436 + }, + { + "epoch": 0.7799729249513495, + "grad_norm": 0.1632770299911499, + "learning_rate": 0.001, + "loss": 1.3391, + "step": 18437 + }, + { + "epoch": 0.7800152297148659, + "grad_norm": 0.4506990909576416, + "learning_rate": 0.001, + "loss": 3.0315, + "step": 18438 + }, + { + "epoch": 0.7800575344783822, + "grad_norm": 0.1689714938402176, + "learning_rate": 0.001, + "loss": 2.7837, + "step": 18439 + }, + { + "epoch": 0.7800998392418986, + "grad_norm": 0.16053323447704315, + "learning_rate": 0.001, + "loss": 1.4578, + "step": 18440 + }, + { + "epoch": 0.780142144005415, + "grad_norm": 0.14367254078388214, + "learning_rate": 0.001, + "loss": 2.418, + "step": 18441 + }, + { + "epoch": 0.7801844487689313, + "grad_norm": 0.6091342568397522, + "learning_rate": 0.001, + "loss": 2.745, + "step": 18442 + }, + { + "epoch": 0.7802267535324477, + "grad_norm": 0.2298995554447174, + "learning_rate": 0.001, + "loss": 3.9425, + "step": 18443 + }, + { + "epoch": 0.7802690582959642, + "grad_norm": 0.15140703320503235, + "learning_rate": 0.001, + "loss": 2.7755, + "step": 18444 + }, + { + "epoch": 0.7803113630594805, + "grad_norm": 0.29984918236732483, + "learning_rate": 0.001, + "loss": 2.1757, + "step": 18445 + }, + { + "epoch": 0.7803536678229969, + "grad_norm": 0.18075025081634521, + "learning_rate": 0.001, + "loss": 1.7825, + "step": 18446 + }, + { + "epoch": 0.7803959725865133, + "grad_norm": 0.23606182634830475, + "learning_rate": 0.001, + "loss": 1.9542, + "step": 18447 + }, + { + "epoch": 0.7804382773500296, + "grad_norm": 0.17639142274856567, + "learning_rate": 0.001, + "loss": 2.5699, + "step": 18448 + }, + { + "epoch": 0.780480582113546, + "grad_norm": 0.3127742111682892, + "learning_rate": 0.001, + "loss": 1.3408, + "step": 18449 + }, + { + "epoch": 0.7805228868770624, + "grad_norm": 0.2586624026298523, + "learning_rate": 0.001, + "loss": 1.4728, + "step": 18450 + }, + { + "epoch": 0.7805651916405787, + "grad_norm": 0.14627474546432495, + "learning_rate": 0.001, + "loss": 1.5882, + "step": 18451 + }, + { + "epoch": 0.7806074964040951, + "grad_norm": 0.22566846013069153, + "learning_rate": 0.001, + "loss": 1.6143, + "step": 18452 + }, + { + "epoch": 0.7806498011676115, + "grad_norm": 0.5345284342765808, + "learning_rate": 0.001, + "loss": 2.4135, + "step": 18453 + }, + { + "epoch": 0.7806921059311278, + "grad_norm": 0.16989070177078247, + "learning_rate": 0.001, + "loss": 2.8811, + "step": 18454 + }, + { + "epoch": 0.7807344106946442, + "grad_norm": 0.7486119866371155, + "learning_rate": 0.001, + "loss": 2.1102, + "step": 18455 + }, + { + "epoch": 0.7807767154581606, + "grad_norm": 0.1473877727985382, + "learning_rate": 0.001, + "loss": 2.0649, + "step": 18456 + }, + { + "epoch": 0.7808190202216769, + "grad_norm": 0.1684478372335434, + "learning_rate": 0.001, + "loss": 1.9911, + "step": 18457 + }, + { + "epoch": 0.7808613249851933, + "grad_norm": 2.9781100749969482, + "learning_rate": 0.001, + "loss": 1.858, + "step": 18458 + }, + { + "epoch": 0.7809036297487097, + "grad_norm": 0.13891150057315826, + "learning_rate": 0.001, + "loss": 1.716, + "step": 18459 + }, + { + "epoch": 0.780945934512226, + "grad_norm": 0.16608332097530365, + "learning_rate": 0.001, + "loss": 1.6527, + "step": 18460 + }, + { + "epoch": 0.7809882392757425, + "grad_norm": 0.161564901471138, + "learning_rate": 0.001, + "loss": 2.0179, + "step": 18461 + }, + { + "epoch": 0.7810305440392589, + "grad_norm": 0.17590850591659546, + "learning_rate": 0.001, + "loss": 1.8993, + "step": 18462 + }, + { + "epoch": 0.7810728488027752, + "grad_norm": 0.15447676181793213, + "learning_rate": 0.001, + "loss": 2.4161, + "step": 18463 + }, + { + "epoch": 0.7811151535662916, + "grad_norm": 0.13972973823547363, + "learning_rate": 0.001, + "loss": 2.4988, + "step": 18464 + }, + { + "epoch": 0.781157458329808, + "grad_norm": 0.1585020124912262, + "learning_rate": 0.001, + "loss": 2.4953, + "step": 18465 + }, + { + "epoch": 0.7811997630933243, + "grad_norm": 0.7317923307418823, + "learning_rate": 0.001, + "loss": 2.9961, + "step": 18466 + }, + { + "epoch": 0.7812420678568407, + "grad_norm": 0.1768050640821457, + "learning_rate": 0.001, + "loss": 3.3468, + "step": 18467 + }, + { + "epoch": 0.7812843726203571, + "grad_norm": 0.22169825434684753, + "learning_rate": 0.001, + "loss": 2.0874, + "step": 18468 + }, + { + "epoch": 0.7813266773838734, + "grad_norm": 0.16282053291797638, + "learning_rate": 0.001, + "loss": 2.3298, + "step": 18469 + }, + { + "epoch": 0.7813689821473898, + "grad_norm": 0.13636824488639832, + "learning_rate": 0.001, + "loss": 1.5925, + "step": 18470 + }, + { + "epoch": 0.7814112869109062, + "grad_norm": 0.1489405781030655, + "learning_rate": 0.001, + "loss": 2.6353, + "step": 18471 + }, + { + "epoch": 0.7814535916744225, + "grad_norm": 0.19491471350193024, + "learning_rate": 0.001, + "loss": 3.0984, + "step": 18472 + }, + { + "epoch": 0.7814958964379389, + "grad_norm": 0.22249197959899902, + "learning_rate": 0.001, + "loss": 2.7739, + "step": 18473 + }, + { + "epoch": 0.7815382012014552, + "grad_norm": 0.21340025961399078, + "learning_rate": 0.001, + "loss": 2.2505, + "step": 18474 + }, + { + "epoch": 0.7815805059649716, + "grad_norm": 0.16923391819000244, + "learning_rate": 0.001, + "loss": 2.5597, + "step": 18475 + }, + { + "epoch": 0.781622810728488, + "grad_norm": 0.1564716249704361, + "learning_rate": 0.001, + "loss": 2.6755, + "step": 18476 + }, + { + "epoch": 0.7816651154920043, + "grad_norm": 0.18122835457324982, + "learning_rate": 0.001, + "loss": 1.8659, + "step": 18477 + }, + { + "epoch": 0.7817074202555208, + "grad_norm": 0.5826538801193237, + "learning_rate": 0.001, + "loss": 2.1115, + "step": 18478 + }, + { + "epoch": 0.7817497250190372, + "grad_norm": 0.15686936676502228, + "learning_rate": 0.001, + "loss": 2.1577, + "step": 18479 + }, + { + "epoch": 0.7817920297825535, + "grad_norm": 0.1563858985900879, + "learning_rate": 0.001, + "loss": 1.9294, + "step": 18480 + }, + { + "epoch": 0.7818343345460699, + "grad_norm": 0.6455838680267334, + "learning_rate": 0.001, + "loss": 2.0729, + "step": 18481 + }, + { + "epoch": 0.7818766393095863, + "grad_norm": 0.16011108458042145, + "learning_rate": 0.001, + "loss": 2.1436, + "step": 18482 + }, + { + "epoch": 0.7819189440731026, + "grad_norm": 0.1725652515888214, + "learning_rate": 0.001, + "loss": 2.3672, + "step": 18483 + }, + { + "epoch": 0.781961248836619, + "grad_norm": 0.17138071358203888, + "learning_rate": 0.001, + "loss": 2.05, + "step": 18484 + }, + { + "epoch": 0.7820035536001354, + "grad_norm": 0.16126024723052979, + "learning_rate": 0.001, + "loss": 1.8446, + "step": 18485 + }, + { + "epoch": 0.7820458583636517, + "grad_norm": 0.20708617568016052, + "learning_rate": 0.001, + "loss": 2.3597, + "step": 18486 + }, + { + "epoch": 0.7820881631271681, + "grad_norm": 0.3041653037071228, + "learning_rate": 0.001, + "loss": 2.9242, + "step": 18487 + }, + { + "epoch": 0.7821304678906845, + "grad_norm": 0.2294047325849533, + "learning_rate": 0.001, + "loss": 2.7331, + "step": 18488 + }, + { + "epoch": 0.7821727726542008, + "grad_norm": 0.16051490604877472, + "learning_rate": 0.001, + "loss": 1.3815, + "step": 18489 + }, + { + "epoch": 0.7822150774177172, + "grad_norm": 0.1533360332250595, + "learning_rate": 0.001, + "loss": 1.7686, + "step": 18490 + }, + { + "epoch": 0.7822573821812336, + "grad_norm": 0.3606812655925751, + "learning_rate": 0.001, + "loss": 2.5197, + "step": 18491 + }, + { + "epoch": 0.7822996869447499, + "grad_norm": 0.15387903153896332, + "learning_rate": 0.001, + "loss": 1.8732, + "step": 18492 + }, + { + "epoch": 0.7823419917082663, + "grad_norm": 0.1506211757659912, + "learning_rate": 0.001, + "loss": 1.9354, + "step": 18493 + }, + { + "epoch": 0.7823842964717828, + "grad_norm": 0.1458059847354889, + "learning_rate": 0.001, + "loss": 1.6647, + "step": 18494 + }, + { + "epoch": 0.7824266012352991, + "grad_norm": 0.1346750259399414, + "learning_rate": 0.001, + "loss": 1.8682, + "step": 18495 + }, + { + "epoch": 0.7824689059988155, + "grad_norm": 0.1503671556711197, + "learning_rate": 0.001, + "loss": 1.8601, + "step": 18496 + }, + { + "epoch": 0.7825112107623319, + "grad_norm": 0.15590086579322815, + "learning_rate": 0.001, + "loss": 3.2885, + "step": 18497 + }, + { + "epoch": 0.7825535155258482, + "grad_norm": 0.16297362744808197, + "learning_rate": 0.001, + "loss": 2.2588, + "step": 18498 + }, + { + "epoch": 0.7825958202893646, + "grad_norm": 0.15889789164066315, + "learning_rate": 0.001, + "loss": 1.5667, + "step": 18499 + }, + { + "epoch": 0.782638125052881, + "grad_norm": 0.16023693978786469, + "learning_rate": 0.001, + "loss": 2.3448, + "step": 18500 + }, + { + "epoch": 0.7826804298163973, + "grad_norm": 0.19175249338150024, + "learning_rate": 0.001, + "loss": 2.4261, + "step": 18501 + }, + { + "epoch": 0.7827227345799137, + "grad_norm": 0.14406760036945343, + "learning_rate": 0.001, + "loss": 1.5323, + "step": 18502 + }, + { + "epoch": 0.7827650393434301, + "grad_norm": 0.14279572665691376, + "learning_rate": 0.001, + "loss": 1.4731, + "step": 18503 + }, + { + "epoch": 0.7828073441069464, + "grad_norm": 7.251346111297607, + "learning_rate": 0.001, + "loss": 2.6161, + "step": 18504 + }, + { + "epoch": 0.7828496488704628, + "grad_norm": 0.580826997756958, + "learning_rate": 0.001, + "loss": 2.6049, + "step": 18505 + }, + { + "epoch": 0.7828919536339792, + "grad_norm": 2.745222330093384, + "learning_rate": 0.001, + "loss": 2.3253, + "step": 18506 + }, + { + "epoch": 0.7829342583974955, + "grad_norm": 0.14414088428020477, + "learning_rate": 0.001, + "loss": 1.7045, + "step": 18507 + }, + { + "epoch": 0.7829765631610119, + "grad_norm": 0.1275712102651596, + "learning_rate": 0.001, + "loss": 2.0115, + "step": 18508 + }, + { + "epoch": 0.7830188679245284, + "grad_norm": 0.1394435614347458, + "learning_rate": 0.001, + "loss": 1.4622, + "step": 18509 + }, + { + "epoch": 0.7830611726880446, + "grad_norm": 0.1480412781238556, + "learning_rate": 0.001, + "loss": 1.8277, + "step": 18510 + }, + { + "epoch": 0.7831034774515611, + "grad_norm": 0.14259982109069824, + "learning_rate": 0.001, + "loss": 2.2549, + "step": 18511 + }, + { + "epoch": 0.7831457822150775, + "grad_norm": 2.498133897781372, + "learning_rate": 0.001, + "loss": 3.1715, + "step": 18512 + }, + { + "epoch": 0.7831880869785938, + "grad_norm": 0.19657468795776367, + "learning_rate": 0.001, + "loss": 2.4656, + "step": 18513 + }, + { + "epoch": 0.7832303917421102, + "grad_norm": 0.1902858018875122, + "learning_rate": 0.001, + "loss": 2.3779, + "step": 18514 + }, + { + "epoch": 0.7832726965056266, + "grad_norm": 2.73234486579895, + "learning_rate": 0.001, + "loss": 2.9337, + "step": 18515 + }, + { + "epoch": 0.7833150012691429, + "grad_norm": 0.16122861206531525, + "learning_rate": 0.001, + "loss": 3.7783, + "step": 18516 + }, + { + "epoch": 0.7833573060326593, + "grad_norm": 0.146601602435112, + "learning_rate": 0.001, + "loss": 1.7877, + "step": 18517 + }, + { + "epoch": 0.7833996107961756, + "grad_norm": 0.1656588464975357, + "learning_rate": 0.001, + "loss": 2.4555, + "step": 18518 + }, + { + "epoch": 0.783441915559692, + "grad_norm": 0.1633993536233902, + "learning_rate": 0.001, + "loss": 2.131, + "step": 18519 + }, + { + "epoch": 0.7834842203232084, + "grad_norm": 0.13799798488616943, + "learning_rate": 0.001, + "loss": 1.6201, + "step": 18520 + }, + { + "epoch": 0.7835265250867247, + "grad_norm": 0.1774521917104721, + "learning_rate": 0.001, + "loss": 2.4657, + "step": 18521 + }, + { + "epoch": 0.7835688298502411, + "grad_norm": 0.17615440487861633, + "learning_rate": 0.001, + "loss": 2.1959, + "step": 18522 + }, + { + "epoch": 0.7836111346137575, + "grad_norm": 0.18356232345104218, + "learning_rate": 0.001, + "loss": 2.6112, + "step": 18523 + }, + { + "epoch": 0.7836534393772738, + "grad_norm": 0.1374877244234085, + "learning_rate": 0.001, + "loss": 1.8857, + "step": 18524 + }, + { + "epoch": 0.7836957441407902, + "grad_norm": 0.2870613634586334, + "learning_rate": 0.001, + "loss": 1.7536, + "step": 18525 + }, + { + "epoch": 0.7837380489043067, + "grad_norm": 0.16457685828208923, + "learning_rate": 0.001, + "loss": 1.7759, + "step": 18526 + }, + { + "epoch": 0.783780353667823, + "grad_norm": 0.12657257914543152, + "learning_rate": 0.001, + "loss": 2.1712, + "step": 18527 + }, + { + "epoch": 0.7838226584313394, + "grad_norm": 1.0247093439102173, + "learning_rate": 0.001, + "loss": 1.3212, + "step": 18528 + }, + { + "epoch": 0.7838649631948558, + "grad_norm": 2.865370273590088, + "learning_rate": 0.001, + "loss": 2.1017, + "step": 18529 + }, + { + "epoch": 0.7839072679583721, + "grad_norm": 0.2102050930261612, + "learning_rate": 0.001, + "loss": 1.917, + "step": 18530 + }, + { + "epoch": 0.7839495727218885, + "grad_norm": 0.23753419518470764, + "learning_rate": 0.001, + "loss": 2.7787, + "step": 18531 + }, + { + "epoch": 0.7839918774854049, + "grad_norm": 0.15850041806697845, + "learning_rate": 0.001, + "loss": 1.5583, + "step": 18532 + }, + { + "epoch": 0.7840341822489212, + "grad_norm": 0.1486329436302185, + "learning_rate": 0.001, + "loss": 1.9008, + "step": 18533 + }, + { + "epoch": 0.7840764870124376, + "grad_norm": 0.13608387112617493, + "learning_rate": 0.001, + "loss": 2.6181, + "step": 18534 + }, + { + "epoch": 0.784118791775954, + "grad_norm": 0.1750655174255371, + "learning_rate": 0.001, + "loss": 3.3077, + "step": 18535 + }, + { + "epoch": 0.7841610965394703, + "grad_norm": 0.15577921271324158, + "learning_rate": 0.001, + "loss": 2.3589, + "step": 18536 + }, + { + "epoch": 0.7842034013029867, + "grad_norm": 0.16472339630126953, + "learning_rate": 0.001, + "loss": 2.7532, + "step": 18537 + }, + { + "epoch": 0.7842457060665031, + "grad_norm": 0.8173422813415527, + "learning_rate": 0.001, + "loss": 2.1228, + "step": 18538 + }, + { + "epoch": 0.7842880108300194, + "grad_norm": 0.16363029181957245, + "learning_rate": 0.001, + "loss": 3.242, + "step": 18539 + }, + { + "epoch": 0.7843303155935358, + "grad_norm": 1.0863935947418213, + "learning_rate": 0.001, + "loss": 2.4159, + "step": 18540 + }, + { + "epoch": 0.7843726203570522, + "grad_norm": 0.17449085414409637, + "learning_rate": 0.001, + "loss": 3.3047, + "step": 18541 + }, + { + "epoch": 0.7844149251205685, + "grad_norm": 0.2527814507484436, + "learning_rate": 0.001, + "loss": 2.5967, + "step": 18542 + }, + { + "epoch": 0.784457229884085, + "grad_norm": 0.13562943041324615, + "learning_rate": 0.001, + "loss": 2.5034, + "step": 18543 + }, + { + "epoch": 0.7844995346476014, + "grad_norm": 0.22819682955741882, + "learning_rate": 0.001, + "loss": 1.9655, + "step": 18544 + }, + { + "epoch": 0.7845418394111177, + "grad_norm": 0.7350271940231323, + "learning_rate": 0.001, + "loss": 2.8429, + "step": 18545 + }, + { + "epoch": 0.7845841441746341, + "grad_norm": 0.16228371858596802, + "learning_rate": 0.001, + "loss": 1.4537, + "step": 18546 + }, + { + "epoch": 0.7846264489381505, + "grad_norm": 0.5120057463645935, + "learning_rate": 0.001, + "loss": 1.5295, + "step": 18547 + }, + { + "epoch": 0.7846687537016668, + "grad_norm": 0.17189784348011017, + "learning_rate": 0.001, + "loss": 2.5171, + "step": 18548 + }, + { + "epoch": 0.7847110584651832, + "grad_norm": 0.2788395285606384, + "learning_rate": 0.001, + "loss": 1.7338, + "step": 18549 + }, + { + "epoch": 0.7847533632286996, + "grad_norm": 0.17548993229866028, + "learning_rate": 0.001, + "loss": 2.0646, + "step": 18550 + }, + { + "epoch": 0.7847956679922159, + "grad_norm": 0.15027286112308502, + "learning_rate": 0.001, + "loss": 1.6634, + "step": 18551 + }, + { + "epoch": 0.7848379727557323, + "grad_norm": 0.14293362200260162, + "learning_rate": 0.001, + "loss": 1.7618, + "step": 18552 + }, + { + "epoch": 0.7848802775192487, + "grad_norm": 0.17201675474643707, + "learning_rate": 0.001, + "loss": 2.2748, + "step": 18553 + }, + { + "epoch": 0.784922582282765, + "grad_norm": 0.22527389228343964, + "learning_rate": 0.001, + "loss": 1.3851, + "step": 18554 + }, + { + "epoch": 0.7849648870462814, + "grad_norm": 0.16594509780406952, + "learning_rate": 0.001, + "loss": 2.1222, + "step": 18555 + }, + { + "epoch": 0.7850071918097978, + "grad_norm": 0.1816147267818451, + "learning_rate": 0.001, + "loss": 2.4502, + "step": 18556 + }, + { + "epoch": 0.7850494965733141, + "grad_norm": 0.14666712284088135, + "learning_rate": 0.001, + "loss": 1.4992, + "step": 18557 + }, + { + "epoch": 0.7850918013368305, + "grad_norm": 0.14625667035579681, + "learning_rate": 0.001, + "loss": 2.2672, + "step": 18558 + }, + { + "epoch": 0.785134106100347, + "grad_norm": 0.5537320375442505, + "learning_rate": 0.001, + "loss": 1.6027, + "step": 18559 + }, + { + "epoch": 0.7851764108638633, + "grad_norm": 0.17199261486530304, + "learning_rate": 0.001, + "loss": 2.5807, + "step": 18560 + }, + { + "epoch": 0.7852187156273797, + "grad_norm": 0.14282803237438202, + "learning_rate": 0.001, + "loss": 2.0102, + "step": 18561 + }, + { + "epoch": 0.7852610203908961, + "grad_norm": 0.7231976985931396, + "learning_rate": 0.001, + "loss": 2.3132, + "step": 18562 + }, + { + "epoch": 0.7853033251544124, + "grad_norm": 0.15639185905456543, + "learning_rate": 0.001, + "loss": 1.7835, + "step": 18563 + }, + { + "epoch": 0.7853456299179288, + "grad_norm": 0.1664103865623474, + "learning_rate": 0.001, + "loss": 2.3088, + "step": 18564 + }, + { + "epoch": 0.7853879346814451, + "grad_norm": 0.18371713161468506, + "learning_rate": 0.001, + "loss": 2.4793, + "step": 18565 + }, + { + "epoch": 0.7854302394449615, + "grad_norm": 1.8584163188934326, + "learning_rate": 0.001, + "loss": 1.5245, + "step": 18566 + }, + { + "epoch": 0.7854725442084779, + "grad_norm": 0.44644030928611755, + "learning_rate": 0.001, + "loss": 2.4425, + "step": 18567 + }, + { + "epoch": 0.7855148489719942, + "grad_norm": 0.15669460594654083, + "learning_rate": 0.001, + "loss": 1.7045, + "step": 18568 + }, + { + "epoch": 0.7855571537355106, + "grad_norm": 0.24448037147521973, + "learning_rate": 0.001, + "loss": 1.8752, + "step": 18569 + }, + { + "epoch": 0.785599458499027, + "grad_norm": 0.1537441611289978, + "learning_rate": 0.001, + "loss": 2.3786, + "step": 18570 + }, + { + "epoch": 0.7856417632625433, + "grad_norm": 0.14563751220703125, + "learning_rate": 0.001, + "loss": 2.2926, + "step": 18571 + }, + { + "epoch": 0.7856840680260597, + "grad_norm": 0.17651155591011047, + "learning_rate": 0.001, + "loss": 2.319, + "step": 18572 + }, + { + "epoch": 0.7857263727895761, + "grad_norm": 0.17922982573509216, + "learning_rate": 0.001, + "loss": 2.112, + "step": 18573 + }, + { + "epoch": 0.7857686775530924, + "grad_norm": 0.1861950010061264, + "learning_rate": 0.001, + "loss": 2.7733, + "step": 18574 + }, + { + "epoch": 0.7858109823166088, + "grad_norm": 0.156846284866333, + "learning_rate": 0.001, + "loss": 2.2694, + "step": 18575 + }, + { + "epoch": 0.7858532870801253, + "grad_norm": 0.4748559892177582, + "learning_rate": 0.001, + "loss": 2.1577, + "step": 18576 + }, + { + "epoch": 0.7858955918436415, + "grad_norm": 0.14767950773239136, + "learning_rate": 0.001, + "loss": 2.1209, + "step": 18577 + }, + { + "epoch": 0.785937896607158, + "grad_norm": 0.5291089415550232, + "learning_rate": 0.001, + "loss": 2.5013, + "step": 18578 + }, + { + "epoch": 0.7859802013706744, + "grad_norm": 0.1678914725780487, + "learning_rate": 0.001, + "loss": 1.7286, + "step": 18579 + }, + { + "epoch": 0.7860225061341907, + "grad_norm": 0.1400989443063736, + "learning_rate": 0.001, + "loss": 1.9519, + "step": 18580 + }, + { + "epoch": 0.7860648108977071, + "grad_norm": 0.20776335895061493, + "learning_rate": 0.001, + "loss": 2.9314, + "step": 18581 + }, + { + "epoch": 0.7861071156612235, + "grad_norm": 0.16925200819969177, + "learning_rate": 0.001, + "loss": 3.7282, + "step": 18582 + }, + { + "epoch": 0.7861494204247398, + "grad_norm": 0.14598102867603302, + "learning_rate": 0.001, + "loss": 2.3061, + "step": 18583 + }, + { + "epoch": 0.7861917251882562, + "grad_norm": 0.6664958000183105, + "learning_rate": 0.001, + "loss": 2.0033, + "step": 18584 + }, + { + "epoch": 0.7862340299517726, + "grad_norm": 0.14569751918315887, + "learning_rate": 0.001, + "loss": 2.4761, + "step": 18585 + }, + { + "epoch": 0.7862763347152889, + "grad_norm": 0.2870123088359833, + "learning_rate": 0.001, + "loss": 2.782, + "step": 18586 + }, + { + "epoch": 0.7863186394788053, + "grad_norm": 0.16477970778942108, + "learning_rate": 0.001, + "loss": 1.6753, + "step": 18587 + }, + { + "epoch": 0.7863609442423217, + "grad_norm": 0.18875035643577576, + "learning_rate": 0.001, + "loss": 1.7905, + "step": 18588 + }, + { + "epoch": 0.786403249005838, + "grad_norm": 0.1353473663330078, + "learning_rate": 0.001, + "loss": 3.2058, + "step": 18589 + }, + { + "epoch": 0.7864455537693544, + "grad_norm": 0.2390325963497162, + "learning_rate": 0.001, + "loss": 2.155, + "step": 18590 + }, + { + "epoch": 0.7864878585328708, + "grad_norm": 0.1434008628129959, + "learning_rate": 0.001, + "loss": 2.0458, + "step": 18591 + }, + { + "epoch": 0.7865301632963871, + "grad_norm": 0.1311628371477127, + "learning_rate": 0.001, + "loss": 1.5251, + "step": 18592 + }, + { + "epoch": 0.7865724680599036, + "grad_norm": 0.20478565990924835, + "learning_rate": 0.001, + "loss": 2.3965, + "step": 18593 + }, + { + "epoch": 0.78661477282342, + "grad_norm": 0.157118022441864, + "learning_rate": 0.001, + "loss": 2.0589, + "step": 18594 + }, + { + "epoch": 0.7866570775869363, + "grad_norm": 0.17026092112064362, + "learning_rate": 0.001, + "loss": 2.3213, + "step": 18595 + }, + { + "epoch": 0.7866993823504527, + "grad_norm": 0.12616808712482452, + "learning_rate": 0.001, + "loss": 1.5168, + "step": 18596 + }, + { + "epoch": 0.7867416871139691, + "grad_norm": 0.12917254865169525, + "learning_rate": 0.001, + "loss": 1.4579, + "step": 18597 + }, + { + "epoch": 0.7867839918774854, + "grad_norm": 0.18053403496742249, + "learning_rate": 0.001, + "loss": 2.1042, + "step": 18598 + }, + { + "epoch": 0.7868262966410018, + "grad_norm": 0.17954039573669434, + "learning_rate": 0.001, + "loss": 2.2459, + "step": 18599 + }, + { + "epoch": 0.7868686014045182, + "grad_norm": 0.23645149171352386, + "learning_rate": 0.001, + "loss": 2.0196, + "step": 18600 + }, + { + "epoch": 0.7869109061680345, + "grad_norm": 0.1373950093984604, + "learning_rate": 0.001, + "loss": 1.6001, + "step": 18601 + }, + { + "epoch": 0.7869532109315509, + "grad_norm": 0.17674638330936432, + "learning_rate": 0.001, + "loss": 3.6847, + "step": 18602 + }, + { + "epoch": 0.7869955156950673, + "grad_norm": 0.1795940101146698, + "learning_rate": 0.001, + "loss": 2.7799, + "step": 18603 + }, + { + "epoch": 0.7870378204585836, + "grad_norm": 0.16243141889572144, + "learning_rate": 0.001, + "loss": 1.7403, + "step": 18604 + }, + { + "epoch": 0.7870801252221, + "grad_norm": 13.920958518981934, + "learning_rate": 0.001, + "loss": 1.9834, + "step": 18605 + }, + { + "epoch": 0.7871224299856164, + "grad_norm": 0.16762857139110565, + "learning_rate": 0.001, + "loss": 1.9104, + "step": 18606 + }, + { + "epoch": 0.7871647347491327, + "grad_norm": 0.15232449769973755, + "learning_rate": 0.001, + "loss": 1.9612, + "step": 18607 + }, + { + "epoch": 0.7872070395126491, + "grad_norm": 0.1300850510597229, + "learning_rate": 0.001, + "loss": 1.663, + "step": 18608 + }, + { + "epoch": 0.7872493442761654, + "grad_norm": 0.12953154742717743, + "learning_rate": 0.001, + "loss": 2.6655, + "step": 18609 + }, + { + "epoch": 0.7872916490396819, + "grad_norm": 0.1451409012079239, + "learning_rate": 0.001, + "loss": 2.2023, + "step": 18610 + }, + { + "epoch": 0.7873339538031983, + "grad_norm": 0.370164692401886, + "learning_rate": 0.001, + "loss": 2.1399, + "step": 18611 + }, + { + "epoch": 0.7873762585667146, + "grad_norm": 0.3032169044017792, + "learning_rate": 0.001, + "loss": 1.439, + "step": 18612 + }, + { + "epoch": 0.787418563330231, + "grad_norm": 0.15478359162807465, + "learning_rate": 0.001, + "loss": 1.7436, + "step": 18613 + }, + { + "epoch": 0.7874608680937474, + "grad_norm": 0.1523745208978653, + "learning_rate": 0.001, + "loss": 1.7031, + "step": 18614 + }, + { + "epoch": 0.7875031728572637, + "grad_norm": 0.1674836277961731, + "learning_rate": 0.001, + "loss": 1.8924, + "step": 18615 + }, + { + "epoch": 0.7875454776207801, + "grad_norm": 0.16084153950214386, + "learning_rate": 0.001, + "loss": 3.0371, + "step": 18616 + }, + { + "epoch": 0.7875877823842965, + "grad_norm": 0.15043555200099945, + "learning_rate": 0.001, + "loss": 1.669, + "step": 18617 + }, + { + "epoch": 0.7876300871478128, + "grad_norm": 0.15042801201343536, + "learning_rate": 0.001, + "loss": 1.4761, + "step": 18618 + }, + { + "epoch": 0.7876723919113292, + "grad_norm": 0.20407025516033173, + "learning_rate": 0.001, + "loss": 2.5795, + "step": 18619 + }, + { + "epoch": 0.7877146966748456, + "grad_norm": 0.37123432755470276, + "learning_rate": 0.001, + "loss": 1.9516, + "step": 18620 + }, + { + "epoch": 0.7877570014383619, + "grad_norm": 0.14106714725494385, + "learning_rate": 0.001, + "loss": 3.1985, + "step": 18621 + }, + { + "epoch": 0.7877993062018783, + "grad_norm": 0.16978901624679565, + "learning_rate": 0.001, + "loss": 2.6184, + "step": 18622 + }, + { + "epoch": 0.7878416109653947, + "grad_norm": 0.32933205366134644, + "learning_rate": 0.001, + "loss": 3.5388, + "step": 18623 + }, + { + "epoch": 0.787883915728911, + "grad_norm": 0.16782082617282867, + "learning_rate": 0.001, + "loss": 2.385, + "step": 18624 + }, + { + "epoch": 0.7879262204924274, + "grad_norm": 0.16868159174919128, + "learning_rate": 0.001, + "loss": 2.1474, + "step": 18625 + }, + { + "epoch": 0.7879685252559439, + "grad_norm": 0.20239630341529846, + "learning_rate": 0.001, + "loss": 2.5787, + "step": 18626 + }, + { + "epoch": 0.7880108300194602, + "grad_norm": 0.49413931369781494, + "learning_rate": 0.001, + "loss": 2.7789, + "step": 18627 + }, + { + "epoch": 0.7880531347829766, + "grad_norm": 0.19046078622341156, + "learning_rate": 0.001, + "loss": 2.8731, + "step": 18628 + }, + { + "epoch": 0.788095439546493, + "grad_norm": 0.6242032647132874, + "learning_rate": 0.001, + "loss": 3.4162, + "step": 18629 + }, + { + "epoch": 0.7881377443100093, + "grad_norm": 0.29068413376808167, + "learning_rate": 0.001, + "loss": 2.5984, + "step": 18630 + }, + { + "epoch": 0.7881800490735257, + "grad_norm": 0.159669429063797, + "learning_rate": 0.001, + "loss": 2.885, + "step": 18631 + }, + { + "epoch": 0.7882223538370421, + "grad_norm": 0.14053291082382202, + "learning_rate": 0.001, + "loss": 1.7608, + "step": 18632 + }, + { + "epoch": 0.7882646586005584, + "grad_norm": 0.34683123230934143, + "learning_rate": 0.001, + "loss": 1.8457, + "step": 18633 + }, + { + "epoch": 0.7883069633640748, + "grad_norm": 0.1669548749923706, + "learning_rate": 0.001, + "loss": 1.6075, + "step": 18634 + }, + { + "epoch": 0.7883492681275912, + "grad_norm": 0.19117628037929535, + "learning_rate": 0.001, + "loss": 2.7102, + "step": 18635 + }, + { + "epoch": 0.7883915728911075, + "grad_norm": 0.13999809324741364, + "learning_rate": 0.001, + "loss": 1.5316, + "step": 18636 + }, + { + "epoch": 0.7884338776546239, + "grad_norm": 0.3912406265735626, + "learning_rate": 0.001, + "loss": 2.5317, + "step": 18637 + }, + { + "epoch": 0.7884761824181403, + "grad_norm": 0.20486436784267426, + "learning_rate": 0.001, + "loss": 3.0638, + "step": 18638 + }, + { + "epoch": 0.7885184871816566, + "grad_norm": 0.18025173246860504, + "learning_rate": 0.001, + "loss": 1.9768, + "step": 18639 + }, + { + "epoch": 0.788560791945173, + "grad_norm": 0.1740838885307312, + "learning_rate": 0.001, + "loss": 3.0294, + "step": 18640 + }, + { + "epoch": 0.7886030967086894, + "grad_norm": 0.18480761349201202, + "learning_rate": 0.001, + "loss": 2.3447, + "step": 18641 + }, + { + "epoch": 0.7886454014722057, + "grad_norm": 14.460678100585938, + "learning_rate": 0.001, + "loss": 2.7443, + "step": 18642 + }, + { + "epoch": 0.7886877062357222, + "grad_norm": 0.19740672409534454, + "learning_rate": 0.001, + "loss": 1.882, + "step": 18643 + }, + { + "epoch": 0.7887300109992386, + "grad_norm": 0.13065767288208008, + "learning_rate": 0.001, + "loss": 2.2647, + "step": 18644 + }, + { + "epoch": 0.7887723157627549, + "grad_norm": 0.49033114314079285, + "learning_rate": 0.001, + "loss": 2.3273, + "step": 18645 + }, + { + "epoch": 0.7888146205262713, + "grad_norm": 0.16719037294387817, + "learning_rate": 0.001, + "loss": 2.3332, + "step": 18646 + }, + { + "epoch": 0.7888569252897877, + "grad_norm": 0.16855984926223755, + "learning_rate": 0.001, + "loss": 3.0683, + "step": 18647 + }, + { + "epoch": 0.788899230053304, + "grad_norm": 0.1336914449930191, + "learning_rate": 0.001, + "loss": 1.381, + "step": 18648 + }, + { + "epoch": 0.7889415348168204, + "grad_norm": 0.16387201845645905, + "learning_rate": 0.001, + "loss": 2.3596, + "step": 18649 + }, + { + "epoch": 0.7889838395803368, + "grad_norm": 2.3988749980926514, + "learning_rate": 0.001, + "loss": 2.2531, + "step": 18650 + }, + { + "epoch": 0.7890261443438531, + "grad_norm": 0.18819689750671387, + "learning_rate": 0.001, + "loss": 2.171, + "step": 18651 + }, + { + "epoch": 0.7890684491073695, + "grad_norm": 0.16322028636932373, + "learning_rate": 0.001, + "loss": 2.2966, + "step": 18652 + }, + { + "epoch": 0.7891107538708859, + "grad_norm": 0.15296301245689392, + "learning_rate": 0.001, + "loss": 1.9929, + "step": 18653 + }, + { + "epoch": 0.7891530586344022, + "grad_norm": 0.14490869641304016, + "learning_rate": 0.001, + "loss": 1.8859, + "step": 18654 + }, + { + "epoch": 0.7891953633979186, + "grad_norm": 0.16145136952400208, + "learning_rate": 0.001, + "loss": 1.9047, + "step": 18655 + }, + { + "epoch": 0.7892376681614349, + "grad_norm": 0.15146324038505554, + "learning_rate": 0.001, + "loss": 1.7612, + "step": 18656 + }, + { + "epoch": 0.7892799729249513, + "grad_norm": 0.15252503752708435, + "learning_rate": 0.001, + "loss": 3.3884, + "step": 18657 + }, + { + "epoch": 0.7893222776884677, + "grad_norm": 1.6242767572402954, + "learning_rate": 0.001, + "loss": 1.7785, + "step": 18658 + }, + { + "epoch": 0.789364582451984, + "grad_norm": 0.19868817925453186, + "learning_rate": 0.001, + "loss": 3.0005, + "step": 18659 + }, + { + "epoch": 0.7894068872155005, + "grad_norm": 0.24456727504730225, + "learning_rate": 0.001, + "loss": 1.73, + "step": 18660 + }, + { + "epoch": 0.7894491919790169, + "grad_norm": 0.23000819981098175, + "learning_rate": 0.001, + "loss": 2.2173, + "step": 18661 + }, + { + "epoch": 0.7894914967425332, + "grad_norm": 0.220015287399292, + "learning_rate": 0.001, + "loss": 2.1518, + "step": 18662 + }, + { + "epoch": 0.7895338015060496, + "grad_norm": 0.20717963576316833, + "learning_rate": 0.001, + "loss": 3.1284, + "step": 18663 + }, + { + "epoch": 0.789576106269566, + "grad_norm": 0.2126576453447342, + "learning_rate": 0.001, + "loss": 2.8512, + "step": 18664 + }, + { + "epoch": 0.7896184110330823, + "grad_norm": 0.2596065402030945, + "learning_rate": 0.001, + "loss": 2.0507, + "step": 18665 + }, + { + "epoch": 0.7896607157965987, + "grad_norm": 0.22077849507331848, + "learning_rate": 0.001, + "loss": 1.6807, + "step": 18666 + }, + { + "epoch": 0.7897030205601151, + "grad_norm": 0.1828576773405075, + "learning_rate": 0.001, + "loss": 2.0024, + "step": 18667 + }, + { + "epoch": 0.7897453253236314, + "grad_norm": 0.5003877282142639, + "learning_rate": 0.001, + "loss": 3.519, + "step": 18668 + }, + { + "epoch": 0.7897876300871478, + "grad_norm": 0.17294269800186157, + "learning_rate": 0.001, + "loss": 2.7761, + "step": 18669 + }, + { + "epoch": 0.7898299348506642, + "grad_norm": 0.19300001859664917, + "learning_rate": 0.001, + "loss": 1.9447, + "step": 18670 + }, + { + "epoch": 0.7898722396141805, + "grad_norm": 1.2025370597839355, + "learning_rate": 0.001, + "loss": 3.5368, + "step": 18671 + }, + { + "epoch": 0.7899145443776969, + "grad_norm": 0.14662951231002808, + "learning_rate": 0.001, + "loss": 2.5346, + "step": 18672 + }, + { + "epoch": 0.7899568491412133, + "grad_norm": 2.872141122817993, + "learning_rate": 0.001, + "loss": 1.9286, + "step": 18673 + }, + { + "epoch": 0.7899991539047296, + "grad_norm": 0.16066762804985046, + "learning_rate": 0.001, + "loss": 2.2664, + "step": 18674 + }, + { + "epoch": 0.790041458668246, + "grad_norm": 0.16592992842197418, + "learning_rate": 0.001, + "loss": 1.8831, + "step": 18675 + }, + { + "epoch": 0.7900837634317625, + "grad_norm": 0.16850398480892181, + "learning_rate": 0.001, + "loss": 1.8177, + "step": 18676 + }, + { + "epoch": 0.7901260681952788, + "grad_norm": 0.16996052861213684, + "learning_rate": 0.001, + "loss": 1.9456, + "step": 18677 + }, + { + "epoch": 0.7901683729587952, + "grad_norm": 0.39866968989372253, + "learning_rate": 0.001, + "loss": 2.8809, + "step": 18678 + }, + { + "epoch": 0.7902106777223116, + "grad_norm": 0.16642092168331146, + "learning_rate": 0.001, + "loss": 2.6154, + "step": 18679 + }, + { + "epoch": 0.7902529824858279, + "grad_norm": 0.17593850195407867, + "learning_rate": 0.001, + "loss": 1.7041, + "step": 18680 + }, + { + "epoch": 0.7902952872493443, + "grad_norm": 0.17078939080238342, + "learning_rate": 0.001, + "loss": 2.9029, + "step": 18681 + }, + { + "epoch": 0.7903375920128607, + "grad_norm": 2.226186990737915, + "learning_rate": 0.001, + "loss": 1.6863, + "step": 18682 + }, + { + "epoch": 0.790379896776377, + "grad_norm": 1.4724563360214233, + "learning_rate": 0.001, + "loss": 2.1862, + "step": 18683 + }, + { + "epoch": 0.7904222015398934, + "grad_norm": 0.1770937144756317, + "learning_rate": 0.001, + "loss": 2.435, + "step": 18684 + }, + { + "epoch": 0.7904645063034098, + "grad_norm": 0.16543549299240112, + "learning_rate": 0.001, + "loss": 1.5855, + "step": 18685 + }, + { + "epoch": 0.7905068110669261, + "grad_norm": 0.19836363196372986, + "learning_rate": 0.001, + "loss": 1.9606, + "step": 18686 + }, + { + "epoch": 0.7905491158304425, + "grad_norm": 0.1877475082874298, + "learning_rate": 0.001, + "loss": 1.8057, + "step": 18687 + }, + { + "epoch": 0.7905914205939589, + "grad_norm": 0.14974345266819, + "learning_rate": 0.001, + "loss": 2.067, + "step": 18688 + }, + { + "epoch": 0.7906337253574752, + "grad_norm": 0.21924450993537903, + "learning_rate": 0.001, + "loss": 3.1355, + "step": 18689 + }, + { + "epoch": 0.7906760301209916, + "grad_norm": 0.17212629318237305, + "learning_rate": 0.001, + "loss": 1.5735, + "step": 18690 + }, + { + "epoch": 0.790718334884508, + "grad_norm": 0.4152930676937103, + "learning_rate": 0.001, + "loss": 2.7388, + "step": 18691 + }, + { + "epoch": 0.7907606396480243, + "grad_norm": 0.1306760460138321, + "learning_rate": 0.001, + "loss": 1.4797, + "step": 18692 + }, + { + "epoch": 0.7908029444115408, + "grad_norm": 0.166093111038208, + "learning_rate": 0.001, + "loss": 2.0058, + "step": 18693 + }, + { + "epoch": 0.7908452491750572, + "grad_norm": 1.3968002796173096, + "learning_rate": 0.001, + "loss": 1.5695, + "step": 18694 + }, + { + "epoch": 0.7908875539385735, + "grad_norm": 0.18365883827209473, + "learning_rate": 0.001, + "loss": 2.3175, + "step": 18695 + }, + { + "epoch": 0.7909298587020899, + "grad_norm": 0.14197491109371185, + "learning_rate": 0.001, + "loss": 1.7524, + "step": 18696 + }, + { + "epoch": 0.7909721634656063, + "grad_norm": 0.19356654584407806, + "learning_rate": 0.001, + "loss": 3.7002, + "step": 18697 + }, + { + "epoch": 0.7910144682291226, + "grad_norm": 0.1772981733083725, + "learning_rate": 0.001, + "loss": 2.2998, + "step": 18698 + }, + { + "epoch": 0.791056772992639, + "grad_norm": 0.2028152197599411, + "learning_rate": 0.001, + "loss": 1.9209, + "step": 18699 + }, + { + "epoch": 0.7910990777561553, + "grad_norm": 0.4528128504753113, + "learning_rate": 0.001, + "loss": 1.9742, + "step": 18700 + }, + { + "epoch": 0.7911413825196717, + "grad_norm": 0.19012227654457092, + "learning_rate": 0.001, + "loss": 2.475, + "step": 18701 + }, + { + "epoch": 0.7911836872831881, + "grad_norm": 0.17222073674201965, + "learning_rate": 0.001, + "loss": 1.9615, + "step": 18702 + }, + { + "epoch": 0.7912259920467044, + "grad_norm": 0.19183796644210815, + "learning_rate": 0.001, + "loss": 2.1465, + "step": 18703 + }, + { + "epoch": 0.7912682968102208, + "grad_norm": 0.1738269031047821, + "learning_rate": 0.001, + "loss": 2.6244, + "step": 18704 + }, + { + "epoch": 0.7913106015737372, + "grad_norm": 0.18650266528129578, + "learning_rate": 0.001, + "loss": 2.0074, + "step": 18705 + }, + { + "epoch": 0.7913529063372535, + "grad_norm": 0.1911695897579193, + "learning_rate": 0.001, + "loss": 2.1624, + "step": 18706 + }, + { + "epoch": 0.7913952111007699, + "grad_norm": 0.2884248197078705, + "learning_rate": 0.001, + "loss": 2.7871, + "step": 18707 + }, + { + "epoch": 0.7914375158642863, + "grad_norm": 0.15906710922718048, + "learning_rate": 0.001, + "loss": 2.9539, + "step": 18708 + }, + { + "epoch": 0.7914798206278026, + "grad_norm": 0.19278644025325775, + "learning_rate": 0.001, + "loss": 1.6402, + "step": 18709 + }, + { + "epoch": 0.791522125391319, + "grad_norm": 0.15218260884284973, + "learning_rate": 0.001, + "loss": 1.8649, + "step": 18710 + }, + { + "epoch": 0.7915644301548355, + "grad_norm": 0.1600116640329361, + "learning_rate": 0.001, + "loss": 1.6598, + "step": 18711 + }, + { + "epoch": 0.7916067349183518, + "grad_norm": 0.15089303255081177, + "learning_rate": 0.001, + "loss": 1.9441, + "step": 18712 + }, + { + "epoch": 0.7916490396818682, + "grad_norm": 0.15738871693611145, + "learning_rate": 0.001, + "loss": 1.888, + "step": 18713 + }, + { + "epoch": 0.7916913444453846, + "grad_norm": 0.15308193862438202, + "learning_rate": 0.001, + "loss": 1.8442, + "step": 18714 + }, + { + "epoch": 0.7917336492089009, + "grad_norm": 0.1537911295890808, + "learning_rate": 0.001, + "loss": 1.8999, + "step": 18715 + }, + { + "epoch": 0.7917759539724173, + "grad_norm": 0.16866432130336761, + "learning_rate": 0.001, + "loss": 2.5294, + "step": 18716 + }, + { + "epoch": 0.7918182587359337, + "grad_norm": 1.7140220403671265, + "learning_rate": 0.001, + "loss": 1.9505, + "step": 18717 + }, + { + "epoch": 0.79186056349945, + "grad_norm": 0.15509742498397827, + "learning_rate": 0.001, + "loss": 1.7273, + "step": 18718 + }, + { + "epoch": 0.7919028682629664, + "grad_norm": 0.16215214133262634, + "learning_rate": 0.001, + "loss": 3.0471, + "step": 18719 + }, + { + "epoch": 0.7919451730264828, + "grad_norm": 0.1872335970401764, + "learning_rate": 0.001, + "loss": 3.027, + "step": 18720 + }, + { + "epoch": 0.7919874777899991, + "grad_norm": 0.14136208593845367, + "learning_rate": 0.001, + "loss": 2.0332, + "step": 18721 + }, + { + "epoch": 0.7920297825535155, + "grad_norm": 0.2219260334968567, + "learning_rate": 0.001, + "loss": 2.2754, + "step": 18722 + }, + { + "epoch": 0.7920720873170319, + "grad_norm": 0.21660354733467102, + "learning_rate": 0.001, + "loss": 1.7934, + "step": 18723 + }, + { + "epoch": 0.7921143920805482, + "grad_norm": 0.16659744083881378, + "learning_rate": 0.001, + "loss": 2.2417, + "step": 18724 + }, + { + "epoch": 0.7921566968440646, + "grad_norm": 0.1875210851430893, + "learning_rate": 0.001, + "loss": 2.3443, + "step": 18725 + }, + { + "epoch": 0.792199001607581, + "grad_norm": 0.1662781685590744, + "learning_rate": 0.001, + "loss": 1.914, + "step": 18726 + }, + { + "epoch": 0.7922413063710974, + "grad_norm": 0.24961335957050323, + "learning_rate": 0.001, + "loss": 1.9302, + "step": 18727 + }, + { + "epoch": 0.7922836111346138, + "grad_norm": 0.17793142795562744, + "learning_rate": 0.001, + "loss": 2.1055, + "step": 18728 + }, + { + "epoch": 0.7923259158981302, + "grad_norm": 0.7194470763206482, + "learning_rate": 0.001, + "loss": 2.9523, + "step": 18729 + }, + { + "epoch": 0.7923682206616465, + "grad_norm": 0.19897310435771942, + "learning_rate": 0.001, + "loss": 1.8951, + "step": 18730 + }, + { + "epoch": 0.7924105254251629, + "grad_norm": 0.16758960485458374, + "learning_rate": 0.001, + "loss": 3.3784, + "step": 18731 + }, + { + "epoch": 0.7924528301886793, + "grad_norm": 0.18163642287254333, + "learning_rate": 0.001, + "loss": 2.5377, + "step": 18732 + }, + { + "epoch": 0.7924951349521956, + "grad_norm": 0.1600625365972519, + "learning_rate": 0.001, + "loss": 2.8553, + "step": 18733 + }, + { + "epoch": 0.792537439715712, + "grad_norm": 0.17819511890411377, + "learning_rate": 0.001, + "loss": 3.1244, + "step": 18734 + }, + { + "epoch": 0.7925797444792284, + "grad_norm": 0.16769671440124512, + "learning_rate": 0.001, + "loss": 1.4566, + "step": 18735 + }, + { + "epoch": 0.7926220492427447, + "grad_norm": 0.18893221020698547, + "learning_rate": 0.001, + "loss": 2.7459, + "step": 18736 + }, + { + "epoch": 0.7926643540062611, + "grad_norm": 0.16248752176761627, + "learning_rate": 0.001, + "loss": 1.829, + "step": 18737 + }, + { + "epoch": 0.7927066587697775, + "grad_norm": 0.1498391479253769, + "learning_rate": 0.001, + "loss": 2.218, + "step": 18738 + }, + { + "epoch": 0.7927489635332938, + "grad_norm": 0.2448561042547226, + "learning_rate": 0.001, + "loss": 1.7601, + "step": 18739 + }, + { + "epoch": 0.7927912682968102, + "grad_norm": 0.17693863809108734, + "learning_rate": 0.001, + "loss": 1.7627, + "step": 18740 + }, + { + "epoch": 0.7928335730603266, + "grad_norm": 0.13363198935985565, + "learning_rate": 0.001, + "loss": 1.4754, + "step": 18741 + }, + { + "epoch": 0.792875877823843, + "grad_norm": 0.1259775161743164, + "learning_rate": 0.001, + "loss": 1.5831, + "step": 18742 + }, + { + "epoch": 0.7929181825873594, + "grad_norm": 0.18019893765449524, + "learning_rate": 0.001, + "loss": 2.3857, + "step": 18743 + }, + { + "epoch": 0.7929604873508757, + "grad_norm": 0.2442205846309662, + "learning_rate": 0.001, + "loss": 2.0327, + "step": 18744 + }, + { + "epoch": 0.7930027921143921, + "grad_norm": 1.5287914276123047, + "learning_rate": 0.001, + "loss": 2.4658, + "step": 18745 + }, + { + "epoch": 0.7930450968779085, + "grad_norm": 0.15980815887451172, + "learning_rate": 0.001, + "loss": 1.9778, + "step": 18746 + }, + { + "epoch": 0.7930874016414248, + "grad_norm": 0.1930980682373047, + "learning_rate": 0.001, + "loss": 2.2091, + "step": 18747 + }, + { + "epoch": 0.7931297064049412, + "grad_norm": 0.17652654647827148, + "learning_rate": 0.001, + "loss": 2.2903, + "step": 18748 + }, + { + "epoch": 0.7931720111684576, + "grad_norm": 0.22159257531166077, + "learning_rate": 0.001, + "loss": 2.6479, + "step": 18749 + }, + { + "epoch": 0.7932143159319739, + "grad_norm": 4.054377555847168, + "learning_rate": 0.001, + "loss": 1.5173, + "step": 18750 + }, + { + "epoch": 0.7932566206954903, + "grad_norm": 0.17010121047496796, + "learning_rate": 0.001, + "loss": 2.6045, + "step": 18751 + }, + { + "epoch": 0.7932989254590067, + "grad_norm": 0.15084238350391388, + "learning_rate": 0.001, + "loss": 2.4313, + "step": 18752 + }, + { + "epoch": 0.793341230222523, + "grad_norm": 0.18187488615512848, + "learning_rate": 0.001, + "loss": 2.0466, + "step": 18753 + }, + { + "epoch": 0.7933835349860394, + "grad_norm": 2.5695083141326904, + "learning_rate": 0.001, + "loss": 1.9052, + "step": 18754 + }, + { + "epoch": 0.7934258397495558, + "grad_norm": 0.1252131462097168, + "learning_rate": 0.001, + "loss": 2.476, + "step": 18755 + }, + { + "epoch": 0.7934681445130721, + "grad_norm": 0.194590225815773, + "learning_rate": 0.001, + "loss": 1.5803, + "step": 18756 + }, + { + "epoch": 0.7935104492765885, + "grad_norm": 0.2729746103286743, + "learning_rate": 0.001, + "loss": 2.2466, + "step": 18757 + }, + { + "epoch": 0.793552754040105, + "grad_norm": 0.2805977165699005, + "learning_rate": 0.001, + "loss": 2.5365, + "step": 18758 + }, + { + "epoch": 0.7935950588036212, + "grad_norm": 0.18586377799510956, + "learning_rate": 0.001, + "loss": 1.7274, + "step": 18759 + }, + { + "epoch": 0.7936373635671377, + "grad_norm": 0.16096092760562897, + "learning_rate": 0.001, + "loss": 3.2489, + "step": 18760 + }, + { + "epoch": 0.7936796683306541, + "grad_norm": 0.14794796705245972, + "learning_rate": 0.001, + "loss": 1.7881, + "step": 18761 + }, + { + "epoch": 0.7937219730941704, + "grad_norm": 0.17098478972911835, + "learning_rate": 0.001, + "loss": 3.5723, + "step": 18762 + }, + { + "epoch": 0.7937642778576868, + "grad_norm": 0.1851399689912796, + "learning_rate": 0.001, + "loss": 2.101, + "step": 18763 + }, + { + "epoch": 0.7938065826212032, + "grad_norm": 0.16206832230091095, + "learning_rate": 0.001, + "loss": 2.5929, + "step": 18764 + }, + { + "epoch": 0.7938488873847195, + "grad_norm": 0.16010499000549316, + "learning_rate": 0.001, + "loss": 1.6522, + "step": 18765 + }, + { + "epoch": 0.7938911921482359, + "grad_norm": 0.1725982278585434, + "learning_rate": 0.001, + "loss": 2.5854, + "step": 18766 + }, + { + "epoch": 0.7939334969117523, + "grad_norm": 0.13452595472335815, + "learning_rate": 0.001, + "loss": 1.9167, + "step": 18767 + }, + { + "epoch": 0.7939758016752686, + "grad_norm": 0.12271041423082352, + "learning_rate": 0.001, + "loss": 2.1516, + "step": 18768 + }, + { + "epoch": 0.794018106438785, + "grad_norm": 0.15723437070846558, + "learning_rate": 0.001, + "loss": 2.172, + "step": 18769 + }, + { + "epoch": 0.7940604112023014, + "grad_norm": 0.1402629315853119, + "learning_rate": 0.001, + "loss": 1.7417, + "step": 18770 + }, + { + "epoch": 0.7941027159658177, + "grad_norm": 0.11944486945867538, + "learning_rate": 0.001, + "loss": 1.5726, + "step": 18771 + }, + { + "epoch": 0.7941450207293341, + "grad_norm": 0.12228669226169586, + "learning_rate": 0.001, + "loss": 2.0272, + "step": 18772 + }, + { + "epoch": 0.7941873254928505, + "grad_norm": 0.18956278264522552, + "learning_rate": 0.001, + "loss": 3.0162, + "step": 18773 + }, + { + "epoch": 0.7942296302563668, + "grad_norm": 0.6293951272964478, + "learning_rate": 0.001, + "loss": 2.1262, + "step": 18774 + }, + { + "epoch": 0.7942719350198832, + "grad_norm": 0.14916300773620605, + "learning_rate": 0.001, + "loss": 1.9637, + "step": 18775 + }, + { + "epoch": 0.7943142397833997, + "grad_norm": 0.14974258840084076, + "learning_rate": 0.001, + "loss": 2.385, + "step": 18776 + }, + { + "epoch": 0.794356544546916, + "grad_norm": 0.151955246925354, + "learning_rate": 0.001, + "loss": 2.1005, + "step": 18777 + }, + { + "epoch": 0.7943988493104324, + "grad_norm": 0.13923077285289764, + "learning_rate": 0.001, + "loss": 2.3116, + "step": 18778 + }, + { + "epoch": 0.7944411540739488, + "grad_norm": 0.14543022215366364, + "learning_rate": 0.001, + "loss": 2.7402, + "step": 18779 + }, + { + "epoch": 0.7944834588374651, + "grad_norm": 0.15743476152420044, + "learning_rate": 0.001, + "loss": 2.3271, + "step": 18780 + }, + { + "epoch": 0.7945257636009815, + "grad_norm": 0.1735035479068756, + "learning_rate": 0.001, + "loss": 1.9675, + "step": 18781 + }, + { + "epoch": 0.7945680683644979, + "grad_norm": 0.1507929265499115, + "learning_rate": 0.001, + "loss": 1.7867, + "step": 18782 + }, + { + "epoch": 0.7946103731280142, + "grad_norm": 0.47875481843948364, + "learning_rate": 0.001, + "loss": 1.931, + "step": 18783 + }, + { + "epoch": 0.7946526778915306, + "grad_norm": 1.3483095169067383, + "learning_rate": 0.001, + "loss": 1.6993, + "step": 18784 + }, + { + "epoch": 0.794694982655047, + "grad_norm": 0.13868524134159088, + "learning_rate": 0.001, + "loss": 1.9223, + "step": 18785 + }, + { + "epoch": 0.7947372874185633, + "grad_norm": 0.2855166494846344, + "learning_rate": 0.001, + "loss": 1.5803, + "step": 18786 + }, + { + "epoch": 0.7947795921820797, + "grad_norm": 0.16588838398456573, + "learning_rate": 0.001, + "loss": 2.3712, + "step": 18787 + }, + { + "epoch": 0.7948218969455961, + "grad_norm": 0.1705247461795807, + "learning_rate": 0.001, + "loss": 2.6876, + "step": 18788 + }, + { + "epoch": 0.7948642017091124, + "grad_norm": 0.19775214791297913, + "learning_rate": 0.001, + "loss": 2.2817, + "step": 18789 + }, + { + "epoch": 0.7949065064726288, + "grad_norm": 0.691185474395752, + "learning_rate": 0.001, + "loss": 1.8467, + "step": 18790 + }, + { + "epoch": 0.7949488112361451, + "grad_norm": 0.1467396765947342, + "learning_rate": 0.001, + "loss": 3.6287, + "step": 18791 + }, + { + "epoch": 0.7949911159996615, + "grad_norm": 0.1282058209180832, + "learning_rate": 0.001, + "loss": 2.5868, + "step": 18792 + }, + { + "epoch": 0.795033420763178, + "grad_norm": 0.16036106646060944, + "learning_rate": 0.001, + "loss": 1.739, + "step": 18793 + }, + { + "epoch": 0.7950757255266943, + "grad_norm": 0.17441564798355103, + "learning_rate": 0.001, + "loss": 1.7735, + "step": 18794 + }, + { + "epoch": 0.7951180302902107, + "grad_norm": 0.15197812020778656, + "learning_rate": 0.001, + "loss": 1.4933, + "step": 18795 + }, + { + "epoch": 0.7951603350537271, + "grad_norm": 0.16789105534553528, + "learning_rate": 0.001, + "loss": 1.5755, + "step": 18796 + }, + { + "epoch": 0.7952026398172434, + "grad_norm": 0.34609055519104004, + "learning_rate": 0.001, + "loss": 2.2578, + "step": 18797 + }, + { + "epoch": 0.7952449445807598, + "grad_norm": 0.1598827838897705, + "learning_rate": 0.001, + "loss": 2.316, + "step": 18798 + }, + { + "epoch": 0.7952872493442762, + "grad_norm": 0.15156984329223633, + "learning_rate": 0.001, + "loss": 2.456, + "step": 18799 + }, + { + "epoch": 0.7953295541077925, + "grad_norm": 0.151535302400589, + "learning_rate": 0.001, + "loss": 1.4494, + "step": 18800 + }, + { + "epoch": 0.7953718588713089, + "grad_norm": 0.19072596728801727, + "learning_rate": 0.001, + "loss": 2.3097, + "step": 18801 + }, + { + "epoch": 0.7954141636348253, + "grad_norm": 0.14297893643379211, + "learning_rate": 0.001, + "loss": 1.9724, + "step": 18802 + }, + { + "epoch": 0.7954564683983416, + "grad_norm": 25.156417846679688, + "learning_rate": 0.001, + "loss": 2.0334, + "step": 18803 + }, + { + "epoch": 0.795498773161858, + "grad_norm": 2.4311437606811523, + "learning_rate": 0.001, + "loss": 2.0976, + "step": 18804 + }, + { + "epoch": 0.7955410779253744, + "grad_norm": 0.24896959960460663, + "learning_rate": 0.001, + "loss": 2.2145, + "step": 18805 + }, + { + "epoch": 0.7955833826888907, + "grad_norm": 0.17482326924800873, + "learning_rate": 0.001, + "loss": 2.1033, + "step": 18806 + }, + { + "epoch": 0.7956256874524071, + "grad_norm": 0.13971750438213348, + "learning_rate": 0.001, + "loss": 1.864, + "step": 18807 + }, + { + "epoch": 0.7956679922159235, + "grad_norm": 0.227436363697052, + "learning_rate": 0.001, + "loss": 2.6199, + "step": 18808 + }, + { + "epoch": 0.7957102969794398, + "grad_norm": 0.14270329475402832, + "learning_rate": 0.001, + "loss": 2.267, + "step": 18809 + }, + { + "epoch": 0.7957526017429563, + "grad_norm": 0.17308275401592255, + "learning_rate": 0.001, + "loss": 2.0791, + "step": 18810 + }, + { + "epoch": 0.7957949065064727, + "grad_norm": 0.16473335027694702, + "learning_rate": 0.001, + "loss": 1.7909, + "step": 18811 + }, + { + "epoch": 0.795837211269989, + "grad_norm": 0.17767003178596497, + "learning_rate": 0.001, + "loss": 3.3289, + "step": 18812 + }, + { + "epoch": 0.7958795160335054, + "grad_norm": 0.182156041264534, + "learning_rate": 0.001, + "loss": 2.5886, + "step": 18813 + }, + { + "epoch": 0.7959218207970218, + "grad_norm": 0.6405094861984253, + "learning_rate": 0.001, + "loss": 2.5443, + "step": 18814 + }, + { + "epoch": 0.7959641255605381, + "grad_norm": 0.20531894266605377, + "learning_rate": 0.001, + "loss": 2.3202, + "step": 18815 + }, + { + "epoch": 0.7960064303240545, + "grad_norm": 0.1372930109500885, + "learning_rate": 0.001, + "loss": 2.0397, + "step": 18816 + }, + { + "epoch": 0.7960487350875709, + "grad_norm": 0.15792180597782135, + "learning_rate": 0.001, + "loss": 2.4564, + "step": 18817 + }, + { + "epoch": 0.7960910398510872, + "grad_norm": 1.705318808555603, + "learning_rate": 0.001, + "loss": 1.7707, + "step": 18818 + }, + { + "epoch": 0.7961333446146036, + "grad_norm": 0.1559934914112091, + "learning_rate": 0.001, + "loss": 1.7459, + "step": 18819 + }, + { + "epoch": 0.79617564937812, + "grad_norm": 0.1450294852256775, + "learning_rate": 0.001, + "loss": 2.4879, + "step": 18820 + }, + { + "epoch": 0.7962179541416363, + "grad_norm": 0.1646071970462799, + "learning_rate": 0.001, + "loss": 2.2083, + "step": 18821 + }, + { + "epoch": 0.7962602589051527, + "grad_norm": 6.388550758361816, + "learning_rate": 0.001, + "loss": 3.5271, + "step": 18822 + }, + { + "epoch": 0.7963025636686691, + "grad_norm": 0.13386476039886475, + "learning_rate": 0.001, + "loss": 2.1901, + "step": 18823 + }, + { + "epoch": 0.7963448684321854, + "grad_norm": 0.15483008325099945, + "learning_rate": 0.001, + "loss": 2.7349, + "step": 18824 + }, + { + "epoch": 0.7963871731957018, + "grad_norm": 0.16752897202968597, + "learning_rate": 0.001, + "loss": 2.2906, + "step": 18825 + }, + { + "epoch": 0.7964294779592183, + "grad_norm": 0.17875881493091583, + "learning_rate": 0.001, + "loss": 2.8077, + "step": 18826 + }, + { + "epoch": 0.7964717827227346, + "grad_norm": 0.16410550475120544, + "learning_rate": 0.001, + "loss": 2.6807, + "step": 18827 + }, + { + "epoch": 0.796514087486251, + "grad_norm": 0.15869401395320892, + "learning_rate": 0.001, + "loss": 2.016, + "step": 18828 + }, + { + "epoch": 0.7965563922497674, + "grad_norm": 0.24351370334625244, + "learning_rate": 0.001, + "loss": 2.5587, + "step": 18829 + }, + { + "epoch": 0.7965986970132837, + "grad_norm": 0.16663382947444916, + "learning_rate": 0.001, + "loss": 1.3997, + "step": 18830 + }, + { + "epoch": 0.7966410017768001, + "grad_norm": 0.15760239958763123, + "learning_rate": 0.001, + "loss": 1.8689, + "step": 18831 + }, + { + "epoch": 0.7966833065403165, + "grad_norm": 0.19611531496047974, + "learning_rate": 0.001, + "loss": 1.8662, + "step": 18832 + }, + { + "epoch": 0.7967256113038328, + "grad_norm": 0.1665828973054886, + "learning_rate": 0.001, + "loss": 1.6031, + "step": 18833 + }, + { + "epoch": 0.7967679160673492, + "grad_norm": 1.8677862882614136, + "learning_rate": 0.001, + "loss": 1.6596, + "step": 18834 + }, + { + "epoch": 0.7968102208308655, + "grad_norm": 16.954547882080078, + "learning_rate": 0.001, + "loss": 1.8566, + "step": 18835 + }, + { + "epoch": 0.7968525255943819, + "grad_norm": 0.18517085909843445, + "learning_rate": 0.001, + "loss": 2.7776, + "step": 18836 + }, + { + "epoch": 0.7968948303578983, + "grad_norm": 0.15459203720092773, + "learning_rate": 0.001, + "loss": 1.4028, + "step": 18837 + }, + { + "epoch": 0.7969371351214146, + "grad_norm": 1.6629470586776733, + "learning_rate": 0.001, + "loss": 2.8192, + "step": 18838 + }, + { + "epoch": 0.796979439884931, + "grad_norm": 0.20710648596286774, + "learning_rate": 0.001, + "loss": 2.1015, + "step": 18839 + }, + { + "epoch": 0.7970217446484474, + "grad_norm": 0.2469254583120346, + "learning_rate": 0.001, + "loss": 2.866, + "step": 18840 + }, + { + "epoch": 0.7970640494119637, + "grad_norm": 0.8012136816978455, + "learning_rate": 0.001, + "loss": 3.8139, + "step": 18841 + }, + { + "epoch": 0.7971063541754801, + "grad_norm": 0.4675610661506653, + "learning_rate": 0.001, + "loss": 2.2759, + "step": 18842 + }, + { + "epoch": 0.7971486589389966, + "grad_norm": 0.2675013244152069, + "learning_rate": 0.001, + "loss": 2.0761, + "step": 18843 + }, + { + "epoch": 0.7971909637025129, + "grad_norm": 0.18237780034542084, + "learning_rate": 0.001, + "loss": 2.3745, + "step": 18844 + }, + { + "epoch": 0.7972332684660293, + "grad_norm": 3.2315402030944824, + "learning_rate": 0.001, + "loss": 2.6036, + "step": 18845 + }, + { + "epoch": 0.7972755732295457, + "grad_norm": 0.17838868498802185, + "learning_rate": 0.001, + "loss": 2.091, + "step": 18846 + }, + { + "epoch": 0.797317877993062, + "grad_norm": 1.5021365880966187, + "learning_rate": 0.001, + "loss": 2.3548, + "step": 18847 + }, + { + "epoch": 0.7973601827565784, + "grad_norm": 0.1974629908800125, + "learning_rate": 0.001, + "loss": 1.9397, + "step": 18848 + }, + { + "epoch": 0.7974024875200948, + "grad_norm": 0.8569322228431702, + "learning_rate": 0.001, + "loss": 2.618, + "step": 18849 + }, + { + "epoch": 0.7974447922836111, + "grad_norm": 0.20803521573543549, + "learning_rate": 0.001, + "loss": 2.1371, + "step": 18850 + }, + { + "epoch": 0.7974870970471275, + "grad_norm": 0.24137185513973236, + "learning_rate": 0.001, + "loss": 2.1892, + "step": 18851 + }, + { + "epoch": 0.7975294018106439, + "grad_norm": 0.7290237545967102, + "learning_rate": 0.001, + "loss": 1.9481, + "step": 18852 + }, + { + "epoch": 0.7975717065741602, + "grad_norm": 0.1846807599067688, + "learning_rate": 0.001, + "loss": 1.9675, + "step": 18853 + }, + { + "epoch": 0.7976140113376766, + "grad_norm": 0.19551526010036469, + "learning_rate": 0.001, + "loss": 2.5217, + "step": 18854 + }, + { + "epoch": 0.797656316101193, + "grad_norm": 0.2642076909542084, + "learning_rate": 0.001, + "loss": 2.4063, + "step": 18855 + }, + { + "epoch": 0.7976986208647093, + "grad_norm": 0.15774700045585632, + "learning_rate": 0.001, + "loss": 2.3304, + "step": 18856 + }, + { + "epoch": 0.7977409256282257, + "grad_norm": 0.23456871509552002, + "learning_rate": 0.001, + "loss": 2.4911, + "step": 18857 + }, + { + "epoch": 0.7977832303917421, + "grad_norm": 0.15300516784191132, + "learning_rate": 0.001, + "loss": 1.6669, + "step": 18858 + }, + { + "epoch": 0.7978255351552584, + "grad_norm": 0.171369269490242, + "learning_rate": 0.001, + "loss": 2.9436, + "step": 18859 + }, + { + "epoch": 0.7978678399187749, + "grad_norm": 0.17169873416423798, + "learning_rate": 0.001, + "loss": 2.2797, + "step": 18860 + }, + { + "epoch": 0.7979101446822913, + "grad_norm": 0.17338159680366516, + "learning_rate": 0.001, + "loss": 1.5475, + "step": 18861 + }, + { + "epoch": 0.7979524494458076, + "grad_norm": 0.19149628281593323, + "learning_rate": 0.001, + "loss": 2.2884, + "step": 18862 + }, + { + "epoch": 0.797994754209324, + "grad_norm": 0.3046054244041443, + "learning_rate": 0.001, + "loss": 2.6577, + "step": 18863 + }, + { + "epoch": 0.7980370589728404, + "grad_norm": 0.19914156198501587, + "learning_rate": 0.001, + "loss": 3.0521, + "step": 18864 + }, + { + "epoch": 0.7980793637363567, + "grad_norm": 0.14342699944972992, + "learning_rate": 0.001, + "loss": 2.2552, + "step": 18865 + }, + { + "epoch": 0.7981216684998731, + "grad_norm": 0.16391606628894806, + "learning_rate": 0.001, + "loss": 1.8426, + "step": 18866 + }, + { + "epoch": 0.7981639732633895, + "grad_norm": 0.17975471913814545, + "learning_rate": 0.001, + "loss": 2.6264, + "step": 18867 + }, + { + "epoch": 0.7982062780269058, + "grad_norm": 0.2970002293586731, + "learning_rate": 0.001, + "loss": 2.2272, + "step": 18868 + }, + { + "epoch": 0.7982485827904222, + "grad_norm": 4.976659774780273, + "learning_rate": 0.001, + "loss": 2.746, + "step": 18869 + }, + { + "epoch": 0.7982908875539386, + "grad_norm": 0.34519144892692566, + "learning_rate": 0.001, + "loss": 2.1262, + "step": 18870 + }, + { + "epoch": 0.7983331923174549, + "grad_norm": 0.14414265751838684, + "learning_rate": 0.001, + "loss": 2.1009, + "step": 18871 + }, + { + "epoch": 0.7983754970809713, + "grad_norm": 0.15004147589206696, + "learning_rate": 0.001, + "loss": 1.9651, + "step": 18872 + }, + { + "epoch": 0.7984178018444877, + "grad_norm": 0.1422182321548462, + "learning_rate": 0.001, + "loss": 1.8298, + "step": 18873 + }, + { + "epoch": 0.798460106608004, + "grad_norm": 0.1388990730047226, + "learning_rate": 0.001, + "loss": 1.8034, + "step": 18874 + }, + { + "epoch": 0.7985024113715204, + "grad_norm": 0.14638282358646393, + "learning_rate": 0.001, + "loss": 2.0963, + "step": 18875 + }, + { + "epoch": 0.7985447161350369, + "grad_norm": 0.17456696927547455, + "learning_rate": 0.001, + "loss": 1.9739, + "step": 18876 + }, + { + "epoch": 0.7985870208985532, + "grad_norm": 0.17205384373664856, + "learning_rate": 0.001, + "loss": 3.2143, + "step": 18877 + }, + { + "epoch": 0.7986293256620696, + "grad_norm": 0.16180726885795593, + "learning_rate": 0.001, + "loss": 2.422, + "step": 18878 + }, + { + "epoch": 0.7986716304255859, + "grad_norm": 8.663548469543457, + "learning_rate": 0.001, + "loss": 2.9214, + "step": 18879 + }, + { + "epoch": 0.7987139351891023, + "grad_norm": 0.1585705280303955, + "learning_rate": 0.001, + "loss": 2.3293, + "step": 18880 + }, + { + "epoch": 0.7987562399526187, + "grad_norm": 0.19919650256633759, + "learning_rate": 0.001, + "loss": 1.7659, + "step": 18881 + }, + { + "epoch": 0.798798544716135, + "grad_norm": 0.18491598963737488, + "learning_rate": 0.001, + "loss": 1.8007, + "step": 18882 + }, + { + "epoch": 0.7988408494796514, + "grad_norm": 4.440988540649414, + "learning_rate": 0.001, + "loss": 2.3786, + "step": 18883 + }, + { + "epoch": 0.7988831542431678, + "grad_norm": 0.346278578042984, + "learning_rate": 0.001, + "loss": 2.312, + "step": 18884 + }, + { + "epoch": 0.7989254590066841, + "grad_norm": 0.16744297742843628, + "learning_rate": 0.001, + "loss": 3.3667, + "step": 18885 + }, + { + "epoch": 0.7989677637702005, + "grad_norm": 0.2730819880962372, + "learning_rate": 0.001, + "loss": 1.373, + "step": 18886 + }, + { + "epoch": 0.7990100685337169, + "grad_norm": 0.1686711460351944, + "learning_rate": 0.001, + "loss": 2.4004, + "step": 18887 + }, + { + "epoch": 0.7990523732972332, + "grad_norm": 0.32986345887184143, + "learning_rate": 0.001, + "loss": 2.1665, + "step": 18888 + }, + { + "epoch": 0.7990946780607496, + "grad_norm": 0.23133182525634766, + "learning_rate": 0.001, + "loss": 2.0615, + "step": 18889 + }, + { + "epoch": 0.799136982824266, + "grad_norm": 0.18338367342948914, + "learning_rate": 0.001, + "loss": 2.3496, + "step": 18890 + }, + { + "epoch": 0.7991792875877823, + "grad_norm": 0.15058721601963043, + "learning_rate": 0.001, + "loss": 1.7269, + "step": 18891 + }, + { + "epoch": 0.7992215923512987, + "grad_norm": 3.878596544265747, + "learning_rate": 0.001, + "loss": 2.3469, + "step": 18892 + }, + { + "epoch": 0.7992638971148152, + "grad_norm": 0.15892788767814636, + "learning_rate": 0.001, + "loss": 2.3426, + "step": 18893 + }, + { + "epoch": 0.7993062018783315, + "grad_norm": 0.18293994665145874, + "learning_rate": 0.001, + "loss": 1.5293, + "step": 18894 + }, + { + "epoch": 0.7993485066418479, + "grad_norm": 0.1805136799812317, + "learning_rate": 0.001, + "loss": 2.0846, + "step": 18895 + }, + { + "epoch": 0.7993908114053643, + "grad_norm": 0.16091568768024445, + "learning_rate": 0.001, + "loss": 1.4057, + "step": 18896 + }, + { + "epoch": 0.7994331161688806, + "grad_norm": 0.20409560203552246, + "learning_rate": 0.001, + "loss": 3.3809, + "step": 18897 + }, + { + "epoch": 0.799475420932397, + "grad_norm": 0.16749215126037598, + "learning_rate": 0.001, + "loss": 2.3456, + "step": 18898 + }, + { + "epoch": 0.7995177256959134, + "grad_norm": 0.165800079703331, + "learning_rate": 0.001, + "loss": 2.5899, + "step": 18899 + }, + { + "epoch": 0.7995600304594297, + "grad_norm": 0.17839626967906952, + "learning_rate": 0.001, + "loss": 1.6863, + "step": 18900 + }, + { + "epoch": 0.7996023352229461, + "grad_norm": 0.15647844970226288, + "learning_rate": 0.001, + "loss": 2.4963, + "step": 18901 + }, + { + "epoch": 0.7996446399864625, + "grad_norm": 0.22941941022872925, + "learning_rate": 0.001, + "loss": 2.7686, + "step": 18902 + }, + { + "epoch": 0.7996869447499788, + "grad_norm": 0.17130149900913239, + "learning_rate": 0.001, + "loss": 2.7059, + "step": 18903 + }, + { + "epoch": 0.7997292495134952, + "grad_norm": 0.1701461225748062, + "learning_rate": 0.001, + "loss": 1.87, + "step": 18904 + }, + { + "epoch": 0.7997715542770116, + "grad_norm": 0.514470100402832, + "learning_rate": 0.001, + "loss": 2.4732, + "step": 18905 + }, + { + "epoch": 0.7998138590405279, + "grad_norm": 0.13958191871643066, + "learning_rate": 0.001, + "loss": 2.6836, + "step": 18906 + }, + { + "epoch": 0.7998561638040443, + "grad_norm": 0.15818491578102112, + "learning_rate": 0.001, + "loss": 2.7445, + "step": 18907 + }, + { + "epoch": 0.7998984685675607, + "grad_norm": 0.18377156555652618, + "learning_rate": 0.001, + "loss": 1.9231, + "step": 18908 + }, + { + "epoch": 0.799940773331077, + "grad_norm": 0.35791802406311035, + "learning_rate": 0.001, + "loss": 2.0333, + "step": 18909 + }, + { + "epoch": 0.7999830780945935, + "grad_norm": 0.3409176170825958, + "learning_rate": 0.001, + "loss": 2.113, + "step": 18910 + }, + { + "epoch": 0.8000253828581099, + "grad_norm": 0.1757473349571228, + "learning_rate": 0.001, + "loss": 1.5102, + "step": 18911 + }, + { + "epoch": 0.8000676876216262, + "grad_norm": 1.6405963897705078, + "learning_rate": 0.001, + "loss": 2.0664, + "step": 18912 + }, + { + "epoch": 0.8001099923851426, + "grad_norm": 0.1534256935119629, + "learning_rate": 0.001, + "loss": 2.2177, + "step": 18913 + }, + { + "epoch": 0.800152297148659, + "grad_norm": 0.21698273718357086, + "learning_rate": 0.001, + "loss": 2.74, + "step": 18914 + }, + { + "epoch": 0.8001946019121753, + "grad_norm": 0.1605890989303589, + "learning_rate": 0.001, + "loss": 1.9252, + "step": 18915 + }, + { + "epoch": 0.8002369066756917, + "grad_norm": 0.16456551849842072, + "learning_rate": 0.001, + "loss": 1.993, + "step": 18916 + }, + { + "epoch": 0.8002792114392081, + "grad_norm": 0.1534430980682373, + "learning_rate": 0.001, + "loss": 1.7507, + "step": 18917 + }, + { + "epoch": 0.8003215162027244, + "grad_norm": 0.17245927453041077, + "learning_rate": 0.001, + "loss": 3.3313, + "step": 18918 + }, + { + "epoch": 0.8003638209662408, + "grad_norm": 0.2450478971004486, + "learning_rate": 0.001, + "loss": 2.6292, + "step": 18919 + }, + { + "epoch": 0.8004061257297572, + "grad_norm": 0.2959800958633423, + "learning_rate": 0.001, + "loss": 2.498, + "step": 18920 + }, + { + "epoch": 0.8004484304932735, + "grad_norm": 0.15317165851593018, + "learning_rate": 0.001, + "loss": 2.6568, + "step": 18921 + }, + { + "epoch": 0.8004907352567899, + "grad_norm": 0.1782752275466919, + "learning_rate": 0.001, + "loss": 1.7211, + "step": 18922 + }, + { + "epoch": 0.8005330400203063, + "grad_norm": 0.1783091425895691, + "learning_rate": 0.001, + "loss": 2.6381, + "step": 18923 + }, + { + "epoch": 0.8005753447838226, + "grad_norm": 0.17199842631816864, + "learning_rate": 0.001, + "loss": 2.553, + "step": 18924 + }, + { + "epoch": 0.800617649547339, + "grad_norm": 1.3231291770935059, + "learning_rate": 0.001, + "loss": 2.695, + "step": 18925 + }, + { + "epoch": 0.8006599543108553, + "grad_norm": 2.2281360626220703, + "learning_rate": 0.001, + "loss": 2.2209, + "step": 18926 + }, + { + "epoch": 0.8007022590743718, + "grad_norm": 0.20170575380325317, + "learning_rate": 0.001, + "loss": 2.7078, + "step": 18927 + }, + { + "epoch": 0.8007445638378882, + "grad_norm": 0.17073240876197815, + "learning_rate": 0.001, + "loss": 2.4441, + "step": 18928 + }, + { + "epoch": 0.8007868686014045, + "grad_norm": 0.15086358785629272, + "learning_rate": 0.001, + "loss": 2.0499, + "step": 18929 + }, + { + "epoch": 0.8008291733649209, + "grad_norm": 0.8255175352096558, + "learning_rate": 0.001, + "loss": 2.4192, + "step": 18930 + }, + { + "epoch": 0.8008714781284373, + "grad_norm": 0.1681467890739441, + "learning_rate": 0.001, + "loss": 2.2557, + "step": 18931 + }, + { + "epoch": 0.8009137828919536, + "grad_norm": 0.14386318624019623, + "learning_rate": 0.001, + "loss": 1.9512, + "step": 18932 + }, + { + "epoch": 0.80095608765547, + "grad_norm": 0.36584439873695374, + "learning_rate": 0.001, + "loss": 1.7842, + "step": 18933 + }, + { + "epoch": 0.8009983924189864, + "grad_norm": 0.14948461949825287, + "learning_rate": 0.001, + "loss": 1.7132, + "step": 18934 + }, + { + "epoch": 0.8010406971825027, + "grad_norm": 0.1546405851840973, + "learning_rate": 0.001, + "loss": 1.8271, + "step": 18935 + }, + { + "epoch": 0.8010830019460191, + "grad_norm": 0.15600258111953735, + "learning_rate": 0.001, + "loss": 2.0882, + "step": 18936 + }, + { + "epoch": 0.8011253067095355, + "grad_norm": 0.20449069142341614, + "learning_rate": 0.001, + "loss": 1.752, + "step": 18937 + }, + { + "epoch": 0.8011676114730518, + "grad_norm": 0.143666610121727, + "learning_rate": 0.001, + "loss": 2.4691, + "step": 18938 + }, + { + "epoch": 0.8012099162365682, + "grad_norm": 0.17219458520412445, + "learning_rate": 0.001, + "loss": 1.9348, + "step": 18939 + }, + { + "epoch": 0.8012522210000846, + "grad_norm": 0.14504066109657288, + "learning_rate": 0.001, + "loss": 2.5255, + "step": 18940 + }, + { + "epoch": 0.8012945257636009, + "grad_norm": 0.1552887260913849, + "learning_rate": 0.001, + "loss": 1.8034, + "step": 18941 + }, + { + "epoch": 0.8013368305271173, + "grad_norm": 0.3989625871181488, + "learning_rate": 0.001, + "loss": 1.6939, + "step": 18942 + }, + { + "epoch": 0.8013791352906338, + "grad_norm": 0.15734554827213287, + "learning_rate": 0.001, + "loss": 1.8861, + "step": 18943 + }, + { + "epoch": 0.8014214400541501, + "grad_norm": 0.159386545419693, + "learning_rate": 0.001, + "loss": 2.3796, + "step": 18944 + }, + { + "epoch": 0.8014637448176665, + "grad_norm": 0.1559273898601532, + "learning_rate": 0.001, + "loss": 3.0239, + "step": 18945 + }, + { + "epoch": 0.8015060495811829, + "grad_norm": 0.15622538328170776, + "learning_rate": 0.001, + "loss": 2.6106, + "step": 18946 + }, + { + "epoch": 0.8015483543446992, + "grad_norm": 0.17876319587230682, + "learning_rate": 0.001, + "loss": 1.8505, + "step": 18947 + }, + { + "epoch": 0.8015906591082156, + "grad_norm": 0.1366472691297531, + "learning_rate": 0.001, + "loss": 1.9985, + "step": 18948 + }, + { + "epoch": 0.801632963871732, + "grad_norm": 0.17091487348079681, + "learning_rate": 0.001, + "loss": 2.836, + "step": 18949 + }, + { + "epoch": 0.8016752686352483, + "grad_norm": 0.1562683880329132, + "learning_rate": 0.001, + "loss": 1.4412, + "step": 18950 + }, + { + "epoch": 0.8017175733987647, + "grad_norm": 0.1484891027212143, + "learning_rate": 0.001, + "loss": 1.8152, + "step": 18951 + }, + { + "epoch": 0.8017598781622811, + "grad_norm": 1.3076022863388062, + "learning_rate": 0.001, + "loss": 2.7306, + "step": 18952 + }, + { + "epoch": 0.8018021829257974, + "grad_norm": 0.16072845458984375, + "learning_rate": 0.001, + "loss": 2.0295, + "step": 18953 + }, + { + "epoch": 0.8018444876893138, + "grad_norm": 0.14599545300006866, + "learning_rate": 0.001, + "loss": 2.7667, + "step": 18954 + }, + { + "epoch": 0.8018867924528302, + "grad_norm": 0.15150395035743713, + "learning_rate": 0.001, + "loss": 3.1838, + "step": 18955 + }, + { + "epoch": 0.8019290972163465, + "grad_norm": 0.18270723521709442, + "learning_rate": 0.001, + "loss": 1.6511, + "step": 18956 + }, + { + "epoch": 0.8019714019798629, + "grad_norm": 0.1671307235956192, + "learning_rate": 0.001, + "loss": 2.9056, + "step": 18957 + }, + { + "epoch": 0.8020137067433794, + "grad_norm": 0.14215819537639618, + "learning_rate": 0.001, + "loss": 2.8229, + "step": 18958 + }, + { + "epoch": 0.8020560115068956, + "grad_norm": 0.14553506672382355, + "learning_rate": 0.001, + "loss": 1.3122, + "step": 18959 + }, + { + "epoch": 0.8020983162704121, + "grad_norm": 0.1251428723335266, + "learning_rate": 0.001, + "loss": 1.9551, + "step": 18960 + }, + { + "epoch": 0.8021406210339285, + "grad_norm": 0.7137900590896606, + "learning_rate": 0.001, + "loss": 2.8314, + "step": 18961 + }, + { + "epoch": 0.8021829257974448, + "grad_norm": 0.1495407074689865, + "learning_rate": 0.001, + "loss": 1.7774, + "step": 18962 + }, + { + "epoch": 0.8022252305609612, + "grad_norm": 0.14792723953723907, + "learning_rate": 0.001, + "loss": 2.1685, + "step": 18963 + }, + { + "epoch": 0.8022675353244776, + "grad_norm": 0.7943689823150635, + "learning_rate": 0.001, + "loss": 2.1938, + "step": 18964 + }, + { + "epoch": 0.8023098400879939, + "grad_norm": 19.49368667602539, + "learning_rate": 0.001, + "loss": 3.6481, + "step": 18965 + }, + { + "epoch": 0.8023521448515103, + "grad_norm": 2.110476493835449, + "learning_rate": 0.001, + "loss": 1.4227, + "step": 18966 + }, + { + "epoch": 0.8023944496150267, + "grad_norm": 0.13607414066791534, + "learning_rate": 0.001, + "loss": 1.6882, + "step": 18967 + }, + { + "epoch": 0.802436754378543, + "grad_norm": 1.1978744268417358, + "learning_rate": 0.001, + "loss": 1.8216, + "step": 18968 + }, + { + "epoch": 0.8024790591420594, + "grad_norm": 0.19706468284130096, + "learning_rate": 0.001, + "loss": 2.8467, + "step": 18969 + }, + { + "epoch": 0.8025213639055757, + "grad_norm": 0.14383366703987122, + "learning_rate": 0.001, + "loss": 1.8751, + "step": 18970 + }, + { + "epoch": 0.8025636686690921, + "grad_norm": 0.20489506423473358, + "learning_rate": 0.001, + "loss": 2.8281, + "step": 18971 + }, + { + "epoch": 0.8026059734326085, + "grad_norm": 1.1792211532592773, + "learning_rate": 0.001, + "loss": 2.9794, + "step": 18972 + }, + { + "epoch": 0.8026482781961248, + "grad_norm": 0.5277860164642334, + "learning_rate": 0.001, + "loss": 1.6409, + "step": 18973 + }, + { + "epoch": 0.8026905829596412, + "grad_norm": 0.13644924759864807, + "learning_rate": 0.001, + "loss": 1.4607, + "step": 18974 + }, + { + "epoch": 0.8027328877231577, + "grad_norm": 0.17572204768657684, + "learning_rate": 0.001, + "loss": 2.3758, + "step": 18975 + }, + { + "epoch": 0.802775192486674, + "grad_norm": 0.356366366147995, + "learning_rate": 0.001, + "loss": 1.5677, + "step": 18976 + }, + { + "epoch": 0.8028174972501904, + "grad_norm": 0.1653011590242386, + "learning_rate": 0.001, + "loss": 2.6266, + "step": 18977 + }, + { + "epoch": 0.8028598020137068, + "grad_norm": 0.1638076901435852, + "learning_rate": 0.001, + "loss": 1.9443, + "step": 18978 + }, + { + "epoch": 0.8029021067772231, + "grad_norm": 0.26030609011650085, + "learning_rate": 0.001, + "loss": 1.6647, + "step": 18979 + }, + { + "epoch": 0.8029444115407395, + "grad_norm": 0.16686412692070007, + "learning_rate": 0.001, + "loss": 2.2475, + "step": 18980 + }, + { + "epoch": 0.8029867163042559, + "grad_norm": 0.2093593031167984, + "learning_rate": 0.001, + "loss": 2.0305, + "step": 18981 + }, + { + "epoch": 0.8030290210677722, + "grad_norm": 0.17999699711799622, + "learning_rate": 0.001, + "loss": 2.2384, + "step": 18982 + }, + { + "epoch": 0.8030713258312886, + "grad_norm": 0.9324764013290405, + "learning_rate": 0.001, + "loss": 2.6971, + "step": 18983 + }, + { + "epoch": 0.803113630594805, + "grad_norm": 0.17794398963451385, + "learning_rate": 0.001, + "loss": 2.0274, + "step": 18984 + }, + { + "epoch": 0.8031559353583213, + "grad_norm": 0.18477022647857666, + "learning_rate": 0.001, + "loss": 2.2527, + "step": 18985 + }, + { + "epoch": 0.8031982401218377, + "grad_norm": 0.1455012857913971, + "learning_rate": 0.001, + "loss": 2.1315, + "step": 18986 + }, + { + "epoch": 0.8032405448853541, + "grad_norm": 1.0563513040542603, + "learning_rate": 0.001, + "loss": 2.3651, + "step": 18987 + }, + { + "epoch": 0.8032828496488704, + "grad_norm": 0.28026530146598816, + "learning_rate": 0.001, + "loss": 3.0058, + "step": 18988 + }, + { + "epoch": 0.8033251544123868, + "grad_norm": 58.185855865478516, + "learning_rate": 0.001, + "loss": 1.8485, + "step": 18989 + }, + { + "epoch": 0.8033674591759032, + "grad_norm": 0.35726267099380493, + "learning_rate": 0.001, + "loss": 1.8895, + "step": 18990 + }, + { + "epoch": 0.8034097639394195, + "grad_norm": 0.2682846486568451, + "learning_rate": 0.001, + "loss": 1.7967, + "step": 18991 + }, + { + "epoch": 0.803452068702936, + "grad_norm": 0.16665484011173248, + "learning_rate": 0.001, + "loss": 2.3617, + "step": 18992 + }, + { + "epoch": 0.8034943734664524, + "grad_norm": 0.16260682046413422, + "learning_rate": 0.001, + "loss": 2.4323, + "step": 18993 + }, + { + "epoch": 0.8035366782299687, + "grad_norm": 0.16122184693813324, + "learning_rate": 0.001, + "loss": 2.6508, + "step": 18994 + }, + { + "epoch": 0.8035789829934851, + "grad_norm": 0.16327929496765137, + "learning_rate": 0.001, + "loss": 1.5013, + "step": 18995 + }, + { + "epoch": 0.8036212877570015, + "grad_norm": 0.19706536829471588, + "learning_rate": 0.001, + "loss": 2.5097, + "step": 18996 + }, + { + "epoch": 0.8036635925205178, + "grad_norm": 0.3086377680301666, + "learning_rate": 0.001, + "loss": 1.9712, + "step": 18997 + }, + { + "epoch": 0.8037058972840342, + "grad_norm": 0.1745164543390274, + "learning_rate": 0.001, + "loss": 1.9054, + "step": 18998 + }, + { + "epoch": 0.8037482020475506, + "grad_norm": 0.20080901682376862, + "learning_rate": 0.001, + "loss": 2.6199, + "step": 18999 + }, + { + "epoch": 0.8037905068110669, + "grad_norm": 0.8673862814903259, + "learning_rate": 0.001, + "loss": 1.8435, + "step": 19000 + }, + { + "epoch": 0.8038328115745833, + "grad_norm": 0.17495474219322205, + "learning_rate": 0.001, + "loss": 2.3335, + "step": 19001 + }, + { + "epoch": 0.8038751163380997, + "grad_norm": 0.2492198944091797, + "learning_rate": 0.001, + "loss": 3.421, + "step": 19002 + }, + { + "epoch": 0.803917421101616, + "grad_norm": 0.2866990864276886, + "learning_rate": 0.001, + "loss": 2.3914, + "step": 19003 + }, + { + "epoch": 0.8039597258651324, + "grad_norm": 0.1283341497182846, + "learning_rate": 0.001, + "loss": 1.6294, + "step": 19004 + }, + { + "epoch": 0.8040020306286488, + "grad_norm": 0.18748348951339722, + "learning_rate": 0.001, + "loss": 3.779, + "step": 19005 + }, + { + "epoch": 0.8040443353921651, + "grad_norm": 0.2228836864233017, + "learning_rate": 0.001, + "loss": 2.1684, + "step": 19006 + }, + { + "epoch": 0.8040866401556815, + "grad_norm": 0.21044659614562988, + "learning_rate": 0.001, + "loss": 2.5293, + "step": 19007 + }, + { + "epoch": 0.804128944919198, + "grad_norm": 0.13109223544597626, + "learning_rate": 0.001, + "loss": 1.7461, + "step": 19008 + }, + { + "epoch": 0.8041712496827143, + "grad_norm": 0.16634885966777802, + "learning_rate": 0.001, + "loss": 1.6819, + "step": 19009 + }, + { + "epoch": 0.8042135544462307, + "grad_norm": 26.008522033691406, + "learning_rate": 0.001, + "loss": 1.8201, + "step": 19010 + }, + { + "epoch": 0.8042558592097471, + "grad_norm": 0.14524191617965698, + "learning_rate": 0.001, + "loss": 1.7735, + "step": 19011 + }, + { + "epoch": 0.8042981639732634, + "grad_norm": 0.14269325137138367, + "learning_rate": 0.001, + "loss": 2.4637, + "step": 19012 + }, + { + "epoch": 0.8043404687367798, + "grad_norm": 0.31678712368011475, + "learning_rate": 0.001, + "loss": 2.9312, + "step": 19013 + }, + { + "epoch": 0.8043827735002962, + "grad_norm": 0.16404123604297638, + "learning_rate": 0.001, + "loss": 1.8627, + "step": 19014 + }, + { + "epoch": 0.8044250782638125, + "grad_norm": 0.16034923493862152, + "learning_rate": 0.001, + "loss": 2.3652, + "step": 19015 + }, + { + "epoch": 0.8044673830273289, + "grad_norm": 0.1982741504907608, + "learning_rate": 0.001, + "loss": 2.198, + "step": 19016 + }, + { + "epoch": 0.8045096877908452, + "grad_norm": 0.1432885080575943, + "learning_rate": 0.001, + "loss": 1.7994, + "step": 19017 + }, + { + "epoch": 0.8045519925543616, + "grad_norm": 0.2005062848329544, + "learning_rate": 0.001, + "loss": 2.2732, + "step": 19018 + }, + { + "epoch": 0.804594297317878, + "grad_norm": 0.18627607822418213, + "learning_rate": 0.001, + "loss": 2.2855, + "step": 19019 + }, + { + "epoch": 0.8046366020813943, + "grad_norm": 0.17629285156726837, + "learning_rate": 0.001, + "loss": 2.3181, + "step": 19020 + }, + { + "epoch": 0.8046789068449107, + "grad_norm": 0.15572401881217957, + "learning_rate": 0.001, + "loss": 2.2486, + "step": 19021 + }, + { + "epoch": 0.8047212116084271, + "grad_norm": 0.13063278794288635, + "learning_rate": 0.001, + "loss": 2.0039, + "step": 19022 + }, + { + "epoch": 0.8047635163719434, + "grad_norm": 0.17360426485538483, + "learning_rate": 0.001, + "loss": 1.9815, + "step": 19023 + }, + { + "epoch": 0.8048058211354598, + "grad_norm": 0.14736105501651764, + "learning_rate": 0.001, + "loss": 1.7787, + "step": 19024 + }, + { + "epoch": 0.8048481258989763, + "grad_norm": 0.40037959814071655, + "learning_rate": 0.001, + "loss": 1.4608, + "step": 19025 + }, + { + "epoch": 0.8048904306624926, + "grad_norm": 0.13307571411132812, + "learning_rate": 0.001, + "loss": 2.472, + "step": 19026 + }, + { + "epoch": 0.804932735426009, + "grad_norm": 0.2580379545688629, + "learning_rate": 0.001, + "loss": 2.6533, + "step": 19027 + }, + { + "epoch": 0.8049750401895254, + "grad_norm": 1.3895004987716675, + "learning_rate": 0.001, + "loss": 2.1698, + "step": 19028 + }, + { + "epoch": 0.8050173449530417, + "grad_norm": 0.1485358327627182, + "learning_rate": 0.001, + "loss": 1.8783, + "step": 19029 + }, + { + "epoch": 0.8050596497165581, + "grad_norm": 33.825557708740234, + "learning_rate": 0.001, + "loss": 2.6339, + "step": 19030 + }, + { + "epoch": 0.8051019544800745, + "grad_norm": 0.15291500091552734, + "learning_rate": 0.001, + "loss": 1.5062, + "step": 19031 + }, + { + "epoch": 0.8051442592435908, + "grad_norm": 0.19544963538646698, + "learning_rate": 0.001, + "loss": 2.7773, + "step": 19032 + }, + { + "epoch": 0.8051865640071072, + "grad_norm": 0.7990001440048218, + "learning_rate": 0.001, + "loss": 2.6352, + "step": 19033 + }, + { + "epoch": 0.8052288687706236, + "grad_norm": 0.20977818965911865, + "learning_rate": 0.001, + "loss": 2.6468, + "step": 19034 + }, + { + "epoch": 0.8052711735341399, + "grad_norm": 0.3944449722766876, + "learning_rate": 0.001, + "loss": 2.3777, + "step": 19035 + }, + { + "epoch": 0.8053134782976563, + "grad_norm": 0.1769852191209793, + "learning_rate": 0.001, + "loss": 1.7299, + "step": 19036 + }, + { + "epoch": 0.8053557830611727, + "grad_norm": 0.2292184680700302, + "learning_rate": 0.001, + "loss": 2.9147, + "step": 19037 + }, + { + "epoch": 0.805398087824689, + "grad_norm": 0.20135535299777985, + "learning_rate": 0.001, + "loss": 1.8794, + "step": 19038 + }, + { + "epoch": 0.8054403925882054, + "grad_norm": 9.884795188903809, + "learning_rate": 0.001, + "loss": 2.4281, + "step": 19039 + }, + { + "epoch": 0.8054826973517218, + "grad_norm": 0.22030168771743774, + "learning_rate": 0.001, + "loss": 2.7934, + "step": 19040 + }, + { + "epoch": 0.8055250021152381, + "grad_norm": 0.3213954269886017, + "learning_rate": 0.001, + "loss": 1.9529, + "step": 19041 + }, + { + "epoch": 0.8055673068787546, + "grad_norm": 0.18723493814468384, + "learning_rate": 0.001, + "loss": 2.0792, + "step": 19042 + }, + { + "epoch": 0.805609611642271, + "grad_norm": 0.19658009707927704, + "learning_rate": 0.001, + "loss": 2.5037, + "step": 19043 + }, + { + "epoch": 0.8056519164057873, + "grad_norm": 0.18413256108760834, + "learning_rate": 0.001, + "loss": 2.0197, + "step": 19044 + }, + { + "epoch": 0.8056942211693037, + "grad_norm": 0.1551973521709442, + "learning_rate": 0.001, + "loss": 2.6713, + "step": 19045 + }, + { + "epoch": 0.8057365259328201, + "grad_norm": 0.18275663256645203, + "learning_rate": 0.001, + "loss": 2.6608, + "step": 19046 + }, + { + "epoch": 0.8057788306963364, + "grad_norm": 0.1611073613166809, + "learning_rate": 0.001, + "loss": 2.5312, + "step": 19047 + }, + { + "epoch": 0.8058211354598528, + "grad_norm": 0.160042405128479, + "learning_rate": 0.001, + "loss": 1.8463, + "step": 19048 + }, + { + "epoch": 0.8058634402233692, + "grad_norm": 0.19275878369808197, + "learning_rate": 0.001, + "loss": 1.818, + "step": 19049 + }, + { + "epoch": 0.8059057449868855, + "grad_norm": 1.6935782432556152, + "learning_rate": 0.001, + "loss": 4.1077, + "step": 19050 + }, + { + "epoch": 0.8059480497504019, + "grad_norm": 0.16775570809841156, + "learning_rate": 0.001, + "loss": 2.496, + "step": 19051 + }, + { + "epoch": 0.8059903545139183, + "grad_norm": 0.1673063337802887, + "learning_rate": 0.001, + "loss": 2.0237, + "step": 19052 + }, + { + "epoch": 0.8060326592774346, + "grad_norm": 0.17094041407108307, + "learning_rate": 0.001, + "loss": 1.7422, + "step": 19053 + }, + { + "epoch": 0.806074964040951, + "grad_norm": 0.18810485303401947, + "learning_rate": 0.001, + "loss": 2.8559, + "step": 19054 + }, + { + "epoch": 0.8061172688044674, + "grad_norm": 0.17345726490020752, + "learning_rate": 0.001, + "loss": 2.183, + "step": 19055 + }, + { + "epoch": 0.8061595735679837, + "grad_norm": 0.27425530552864075, + "learning_rate": 0.001, + "loss": 2.9688, + "step": 19056 + }, + { + "epoch": 0.8062018783315001, + "grad_norm": 16.195755004882812, + "learning_rate": 0.001, + "loss": 1.9708, + "step": 19057 + }, + { + "epoch": 0.8062441830950166, + "grad_norm": 0.15767496824264526, + "learning_rate": 0.001, + "loss": 1.8641, + "step": 19058 + }, + { + "epoch": 0.8062864878585329, + "grad_norm": 0.16588301956653595, + "learning_rate": 0.001, + "loss": 2.869, + "step": 19059 + }, + { + "epoch": 0.8063287926220493, + "grad_norm": 0.15558847784996033, + "learning_rate": 0.001, + "loss": 1.9133, + "step": 19060 + }, + { + "epoch": 0.8063710973855656, + "grad_norm": 0.16034120321273804, + "learning_rate": 0.001, + "loss": 3.0425, + "step": 19061 + }, + { + "epoch": 0.806413402149082, + "grad_norm": 0.1412864327430725, + "learning_rate": 0.001, + "loss": 2.0012, + "step": 19062 + }, + { + "epoch": 0.8064557069125984, + "grad_norm": 0.2045813649892807, + "learning_rate": 0.001, + "loss": 1.9367, + "step": 19063 + }, + { + "epoch": 0.8064980116761147, + "grad_norm": 0.20166616141796112, + "learning_rate": 0.001, + "loss": 1.8073, + "step": 19064 + }, + { + "epoch": 0.8065403164396311, + "grad_norm": 0.19779857993125916, + "learning_rate": 0.001, + "loss": 2.8934, + "step": 19065 + }, + { + "epoch": 0.8065826212031475, + "grad_norm": 0.15480828285217285, + "learning_rate": 0.001, + "loss": 2.1785, + "step": 19066 + }, + { + "epoch": 0.8066249259666638, + "grad_norm": 0.5121575593948364, + "learning_rate": 0.001, + "loss": 2.1465, + "step": 19067 + }, + { + "epoch": 0.8066672307301802, + "grad_norm": 0.25678855180740356, + "learning_rate": 0.001, + "loss": 2.5373, + "step": 19068 + }, + { + "epoch": 0.8067095354936966, + "grad_norm": 0.1567380726337433, + "learning_rate": 0.001, + "loss": 1.5213, + "step": 19069 + }, + { + "epoch": 0.8067518402572129, + "grad_norm": 0.15887407958507538, + "learning_rate": 0.001, + "loss": 2.0361, + "step": 19070 + }, + { + "epoch": 0.8067941450207293, + "grad_norm": 0.2849603593349457, + "learning_rate": 0.001, + "loss": 2.3657, + "step": 19071 + }, + { + "epoch": 0.8068364497842457, + "grad_norm": 0.27255427837371826, + "learning_rate": 0.001, + "loss": 2.005, + "step": 19072 + }, + { + "epoch": 0.806878754547762, + "grad_norm": 1.081209659576416, + "learning_rate": 0.001, + "loss": 2.7503, + "step": 19073 + }, + { + "epoch": 0.8069210593112784, + "grad_norm": 0.20307116210460663, + "learning_rate": 0.001, + "loss": 2.4047, + "step": 19074 + }, + { + "epoch": 0.8069633640747949, + "grad_norm": 0.3475703299045563, + "learning_rate": 0.001, + "loss": 2.1349, + "step": 19075 + }, + { + "epoch": 0.8070056688383112, + "grad_norm": 0.18069538474082947, + "learning_rate": 0.001, + "loss": 2.8262, + "step": 19076 + }, + { + "epoch": 0.8070479736018276, + "grad_norm": 0.16302266716957092, + "learning_rate": 0.001, + "loss": 2.1926, + "step": 19077 + }, + { + "epoch": 0.807090278365344, + "grad_norm": 0.13929545879364014, + "learning_rate": 0.001, + "loss": 1.7021, + "step": 19078 + }, + { + "epoch": 0.8071325831288603, + "grad_norm": 2.2254645824432373, + "learning_rate": 0.001, + "loss": 2.0774, + "step": 19079 + }, + { + "epoch": 0.8071748878923767, + "grad_norm": 0.12101297825574875, + "learning_rate": 0.001, + "loss": 2.1299, + "step": 19080 + }, + { + "epoch": 0.8072171926558931, + "grad_norm": 0.15485353767871857, + "learning_rate": 0.001, + "loss": 1.941, + "step": 19081 + }, + { + "epoch": 0.8072594974194094, + "grad_norm": 0.16078881919384003, + "learning_rate": 0.001, + "loss": 2.2291, + "step": 19082 + }, + { + "epoch": 0.8073018021829258, + "grad_norm": 1.5918997526168823, + "learning_rate": 0.001, + "loss": 3.3536, + "step": 19083 + }, + { + "epoch": 0.8073441069464422, + "grad_norm": 0.16375018656253815, + "learning_rate": 0.001, + "loss": 2.5549, + "step": 19084 + }, + { + "epoch": 0.8073864117099585, + "grad_norm": 1.0432242155075073, + "learning_rate": 0.001, + "loss": 2.6411, + "step": 19085 + }, + { + "epoch": 0.8074287164734749, + "grad_norm": 0.1408742517232895, + "learning_rate": 0.001, + "loss": 1.4977, + "step": 19086 + }, + { + "epoch": 0.8074710212369913, + "grad_norm": 0.13570459187030792, + "learning_rate": 0.001, + "loss": 3.3056, + "step": 19087 + }, + { + "epoch": 0.8075133260005076, + "grad_norm": 0.22087857127189636, + "learning_rate": 0.001, + "loss": 2.2098, + "step": 19088 + }, + { + "epoch": 0.807555630764024, + "grad_norm": 1.198239803314209, + "learning_rate": 0.001, + "loss": 3.1157, + "step": 19089 + }, + { + "epoch": 0.8075979355275404, + "grad_norm": 0.1383177936077118, + "learning_rate": 0.001, + "loss": 2.3563, + "step": 19090 + }, + { + "epoch": 0.8076402402910567, + "grad_norm": 0.14231859147548676, + "learning_rate": 0.001, + "loss": 1.863, + "step": 19091 + }, + { + "epoch": 0.8076825450545732, + "grad_norm": 0.17325039207935333, + "learning_rate": 0.001, + "loss": 3.1426, + "step": 19092 + }, + { + "epoch": 0.8077248498180896, + "grad_norm": 0.14337441325187683, + "learning_rate": 0.001, + "loss": 1.6843, + "step": 19093 + }, + { + "epoch": 0.8077671545816059, + "grad_norm": 0.15518362820148468, + "learning_rate": 0.001, + "loss": 2.3449, + "step": 19094 + }, + { + "epoch": 0.8078094593451223, + "grad_norm": 0.13496747612953186, + "learning_rate": 0.001, + "loss": 2.6607, + "step": 19095 + }, + { + "epoch": 0.8078517641086387, + "grad_norm": 0.17812597751617432, + "learning_rate": 0.001, + "loss": 2.6483, + "step": 19096 + }, + { + "epoch": 0.807894068872155, + "grad_norm": 0.24298779666423798, + "learning_rate": 0.001, + "loss": 1.6009, + "step": 19097 + }, + { + "epoch": 0.8079363736356714, + "grad_norm": 0.13833343982696533, + "learning_rate": 0.001, + "loss": 1.7103, + "step": 19098 + }, + { + "epoch": 0.8079786783991878, + "grad_norm": 0.17393560707569122, + "learning_rate": 0.001, + "loss": 3.2679, + "step": 19099 + }, + { + "epoch": 0.8080209831627041, + "grad_norm": 0.1347343772649765, + "learning_rate": 0.001, + "loss": 1.3848, + "step": 19100 + }, + { + "epoch": 0.8080632879262205, + "grad_norm": 0.3251754641532898, + "learning_rate": 0.001, + "loss": 2.6169, + "step": 19101 + }, + { + "epoch": 0.8081055926897369, + "grad_norm": 0.13301436603069305, + "learning_rate": 0.001, + "loss": 1.8152, + "step": 19102 + }, + { + "epoch": 0.8081478974532532, + "grad_norm": 0.35224834084510803, + "learning_rate": 0.001, + "loss": 1.9909, + "step": 19103 + }, + { + "epoch": 0.8081902022167696, + "grad_norm": 0.2184954732656479, + "learning_rate": 0.001, + "loss": 2.3797, + "step": 19104 + }, + { + "epoch": 0.8082325069802859, + "grad_norm": 0.14901390671730042, + "learning_rate": 0.001, + "loss": 2.0435, + "step": 19105 + }, + { + "epoch": 0.8082748117438023, + "grad_norm": 0.1670936793088913, + "learning_rate": 0.001, + "loss": 1.9192, + "step": 19106 + }, + { + "epoch": 0.8083171165073187, + "grad_norm": 0.1560143530368805, + "learning_rate": 0.001, + "loss": 1.6413, + "step": 19107 + }, + { + "epoch": 0.808359421270835, + "grad_norm": 0.14152587950229645, + "learning_rate": 0.001, + "loss": 1.7786, + "step": 19108 + }, + { + "epoch": 0.8084017260343515, + "grad_norm": 0.2246204912662506, + "learning_rate": 0.001, + "loss": 2.0439, + "step": 19109 + }, + { + "epoch": 0.8084440307978679, + "grad_norm": 0.17294353246688843, + "learning_rate": 0.001, + "loss": 2.3991, + "step": 19110 + }, + { + "epoch": 0.8084863355613842, + "grad_norm": 0.14539849758148193, + "learning_rate": 0.001, + "loss": 1.9719, + "step": 19111 + }, + { + "epoch": 0.8085286403249006, + "grad_norm": 0.16276989877223969, + "learning_rate": 0.001, + "loss": 1.7146, + "step": 19112 + }, + { + "epoch": 0.808570945088417, + "grad_norm": 9.473787307739258, + "learning_rate": 0.001, + "loss": 2.1896, + "step": 19113 + }, + { + "epoch": 0.8086132498519333, + "grad_norm": 0.1611110121011734, + "learning_rate": 0.001, + "loss": 1.8925, + "step": 19114 + }, + { + "epoch": 0.8086555546154497, + "grad_norm": 0.14507554471492767, + "learning_rate": 0.001, + "loss": 1.5477, + "step": 19115 + }, + { + "epoch": 0.8086978593789661, + "grad_norm": 0.17105506360530853, + "learning_rate": 0.001, + "loss": 2.5952, + "step": 19116 + }, + { + "epoch": 0.8087401641424824, + "grad_norm": 0.563596248626709, + "learning_rate": 0.001, + "loss": 2.5732, + "step": 19117 + }, + { + "epoch": 0.8087824689059988, + "grad_norm": 0.2284759283065796, + "learning_rate": 0.001, + "loss": 3.6089, + "step": 19118 + }, + { + "epoch": 0.8088247736695152, + "grad_norm": 0.2556639611721039, + "learning_rate": 0.001, + "loss": 1.9265, + "step": 19119 + }, + { + "epoch": 0.8088670784330315, + "grad_norm": 0.1958867907524109, + "learning_rate": 0.001, + "loss": 2.395, + "step": 19120 + }, + { + "epoch": 0.8089093831965479, + "grad_norm": 0.22015972435474396, + "learning_rate": 0.001, + "loss": 2.0847, + "step": 19121 + }, + { + "epoch": 0.8089516879600643, + "grad_norm": 0.17518767714500427, + "learning_rate": 0.001, + "loss": 2.8718, + "step": 19122 + }, + { + "epoch": 0.8089939927235806, + "grad_norm": 6.914678573608398, + "learning_rate": 0.001, + "loss": 1.9315, + "step": 19123 + }, + { + "epoch": 0.809036297487097, + "grad_norm": 1.030540108680725, + "learning_rate": 0.001, + "loss": 2.9412, + "step": 19124 + }, + { + "epoch": 0.8090786022506135, + "grad_norm": 28.061410903930664, + "learning_rate": 0.001, + "loss": 2.1573, + "step": 19125 + }, + { + "epoch": 0.8091209070141298, + "grad_norm": 0.18755550682544708, + "learning_rate": 0.001, + "loss": 2.0356, + "step": 19126 + }, + { + "epoch": 0.8091632117776462, + "grad_norm": 0.15464842319488525, + "learning_rate": 0.001, + "loss": 1.7849, + "step": 19127 + }, + { + "epoch": 0.8092055165411626, + "grad_norm": 0.20664550364017487, + "learning_rate": 0.001, + "loss": 3.0477, + "step": 19128 + }, + { + "epoch": 0.8092478213046789, + "grad_norm": 0.19141417741775513, + "learning_rate": 0.001, + "loss": 2.0378, + "step": 19129 + }, + { + "epoch": 0.8092901260681953, + "grad_norm": 0.7262970805168152, + "learning_rate": 0.001, + "loss": 2.168, + "step": 19130 + }, + { + "epoch": 0.8093324308317117, + "grad_norm": 0.18341998755931854, + "learning_rate": 0.001, + "loss": 2.529, + "step": 19131 + }, + { + "epoch": 0.809374735595228, + "grad_norm": 0.16619554162025452, + "learning_rate": 0.001, + "loss": 1.9075, + "step": 19132 + }, + { + "epoch": 0.8094170403587444, + "grad_norm": 0.16377653181552887, + "learning_rate": 0.001, + "loss": 2.7905, + "step": 19133 + }, + { + "epoch": 0.8094593451222608, + "grad_norm": 0.15902559459209442, + "learning_rate": 0.001, + "loss": 2.0185, + "step": 19134 + }, + { + "epoch": 0.8095016498857771, + "grad_norm": 0.1559763252735138, + "learning_rate": 0.001, + "loss": 2.0704, + "step": 19135 + }, + { + "epoch": 0.8095439546492935, + "grad_norm": 0.1679486483335495, + "learning_rate": 0.001, + "loss": 2.4996, + "step": 19136 + }, + { + "epoch": 0.8095862594128099, + "grad_norm": 0.15577644109725952, + "learning_rate": 0.001, + "loss": 2.4594, + "step": 19137 + }, + { + "epoch": 0.8096285641763262, + "grad_norm": 0.14891107380390167, + "learning_rate": 0.001, + "loss": 1.9317, + "step": 19138 + }, + { + "epoch": 0.8096708689398426, + "grad_norm": 0.20740364491939545, + "learning_rate": 0.001, + "loss": 1.7758, + "step": 19139 + }, + { + "epoch": 0.809713173703359, + "grad_norm": 0.17380544543266296, + "learning_rate": 0.001, + "loss": 1.6315, + "step": 19140 + }, + { + "epoch": 0.8097554784668753, + "grad_norm": 0.15323974192142487, + "learning_rate": 0.001, + "loss": 2.2311, + "step": 19141 + }, + { + "epoch": 0.8097977832303918, + "grad_norm": 1.1222220659255981, + "learning_rate": 0.001, + "loss": 1.8476, + "step": 19142 + }, + { + "epoch": 0.8098400879939082, + "grad_norm": 0.17634250223636627, + "learning_rate": 0.001, + "loss": 2.2164, + "step": 19143 + }, + { + "epoch": 0.8098823927574245, + "grad_norm": 0.16410218179225922, + "learning_rate": 0.001, + "loss": 1.9059, + "step": 19144 + }, + { + "epoch": 0.8099246975209409, + "grad_norm": 0.16236481070518494, + "learning_rate": 0.001, + "loss": 1.8111, + "step": 19145 + }, + { + "epoch": 0.8099670022844573, + "grad_norm": 5.717217445373535, + "learning_rate": 0.001, + "loss": 2.1306, + "step": 19146 + }, + { + "epoch": 0.8100093070479736, + "grad_norm": 0.1764141023159027, + "learning_rate": 0.001, + "loss": 1.4532, + "step": 19147 + }, + { + "epoch": 0.81005161181149, + "grad_norm": 0.18373993039131165, + "learning_rate": 0.001, + "loss": 1.8148, + "step": 19148 + }, + { + "epoch": 0.8100939165750064, + "grad_norm": 0.1714664250612259, + "learning_rate": 0.001, + "loss": 2.0202, + "step": 19149 + }, + { + "epoch": 0.8101362213385227, + "grad_norm": 0.22828002274036407, + "learning_rate": 0.001, + "loss": 1.8922, + "step": 19150 + }, + { + "epoch": 0.8101785261020391, + "grad_norm": 0.37784022092819214, + "learning_rate": 0.001, + "loss": 3.023, + "step": 19151 + }, + { + "epoch": 0.8102208308655554, + "grad_norm": 0.17360255122184753, + "learning_rate": 0.001, + "loss": 2.1345, + "step": 19152 + }, + { + "epoch": 0.8102631356290718, + "grad_norm": 0.16689980030059814, + "learning_rate": 0.001, + "loss": 2.1847, + "step": 19153 + }, + { + "epoch": 0.8103054403925882, + "grad_norm": 0.23073500394821167, + "learning_rate": 0.001, + "loss": 2.0032, + "step": 19154 + }, + { + "epoch": 0.8103477451561045, + "grad_norm": 0.1540898084640503, + "learning_rate": 0.001, + "loss": 1.4885, + "step": 19155 + }, + { + "epoch": 0.8103900499196209, + "grad_norm": 2.4491209983825684, + "learning_rate": 0.001, + "loss": 2.0554, + "step": 19156 + }, + { + "epoch": 0.8104323546831373, + "grad_norm": 0.191018208861351, + "learning_rate": 0.001, + "loss": 2.0957, + "step": 19157 + }, + { + "epoch": 0.8104746594466536, + "grad_norm": 4.3804240226745605, + "learning_rate": 0.001, + "loss": 2.0839, + "step": 19158 + }, + { + "epoch": 0.81051696421017, + "grad_norm": 0.41173967719078064, + "learning_rate": 0.001, + "loss": 2.0613, + "step": 19159 + }, + { + "epoch": 0.8105592689736865, + "grad_norm": 0.13415801525115967, + "learning_rate": 0.001, + "loss": 1.7669, + "step": 19160 + }, + { + "epoch": 0.8106015737372028, + "grad_norm": 0.15621332824230194, + "learning_rate": 0.001, + "loss": 2.5883, + "step": 19161 + }, + { + "epoch": 0.8106438785007192, + "grad_norm": 2.067545175552368, + "learning_rate": 0.001, + "loss": 2.3052, + "step": 19162 + }, + { + "epoch": 0.8106861832642356, + "grad_norm": 0.15664492547512054, + "learning_rate": 0.001, + "loss": 2.0756, + "step": 19163 + }, + { + "epoch": 0.8107284880277519, + "grad_norm": 0.32015523314476013, + "learning_rate": 0.001, + "loss": 1.449, + "step": 19164 + }, + { + "epoch": 0.8107707927912683, + "grad_norm": 0.18561597168445587, + "learning_rate": 0.001, + "loss": 3.2036, + "step": 19165 + }, + { + "epoch": 0.8108130975547847, + "grad_norm": 0.1604049801826477, + "learning_rate": 0.001, + "loss": 2.1215, + "step": 19166 + }, + { + "epoch": 0.810855402318301, + "grad_norm": 0.13937008380889893, + "learning_rate": 0.001, + "loss": 2.1111, + "step": 19167 + }, + { + "epoch": 0.8108977070818174, + "grad_norm": 0.1511482447385788, + "learning_rate": 0.001, + "loss": 2.4686, + "step": 19168 + }, + { + "epoch": 0.8109400118453338, + "grad_norm": 0.4182822108268738, + "learning_rate": 0.001, + "loss": 2.0505, + "step": 19169 + }, + { + "epoch": 0.8109823166088501, + "grad_norm": 0.17562200129032135, + "learning_rate": 0.001, + "loss": 3.0039, + "step": 19170 + }, + { + "epoch": 0.8110246213723665, + "grad_norm": 0.14364513754844666, + "learning_rate": 0.001, + "loss": 1.9338, + "step": 19171 + }, + { + "epoch": 0.8110669261358829, + "grad_norm": 1.0508071184158325, + "learning_rate": 0.001, + "loss": 2.679, + "step": 19172 + }, + { + "epoch": 0.8111092308993992, + "grad_norm": 0.14670781791210175, + "learning_rate": 0.001, + "loss": 1.7561, + "step": 19173 + }, + { + "epoch": 0.8111515356629156, + "grad_norm": 0.12752504646778107, + "learning_rate": 0.001, + "loss": 1.1323, + "step": 19174 + }, + { + "epoch": 0.8111938404264321, + "grad_norm": 0.13774631917476654, + "learning_rate": 0.001, + "loss": 2.117, + "step": 19175 + }, + { + "epoch": 0.8112361451899484, + "grad_norm": 0.31936532258987427, + "learning_rate": 0.001, + "loss": 1.7696, + "step": 19176 + }, + { + "epoch": 0.8112784499534648, + "grad_norm": 0.3710169196128845, + "learning_rate": 0.001, + "loss": 3.5819, + "step": 19177 + }, + { + "epoch": 0.8113207547169812, + "grad_norm": 0.1603461056947708, + "learning_rate": 0.001, + "loss": 2.0246, + "step": 19178 + }, + { + "epoch": 0.8113630594804975, + "grad_norm": 0.18489456176757812, + "learning_rate": 0.001, + "loss": 2.6565, + "step": 19179 + }, + { + "epoch": 0.8114053642440139, + "grad_norm": 0.16004648804664612, + "learning_rate": 0.001, + "loss": 1.6997, + "step": 19180 + }, + { + "epoch": 0.8114476690075303, + "grad_norm": 0.1679944545030594, + "learning_rate": 0.001, + "loss": 3.4693, + "step": 19181 + }, + { + "epoch": 0.8114899737710466, + "grad_norm": 0.1983286440372467, + "learning_rate": 0.001, + "loss": 1.9687, + "step": 19182 + }, + { + "epoch": 0.811532278534563, + "grad_norm": 0.5275279879570007, + "learning_rate": 0.001, + "loss": 1.5419, + "step": 19183 + }, + { + "epoch": 0.8115745832980794, + "grad_norm": 0.15500518679618835, + "learning_rate": 0.001, + "loss": 2.7367, + "step": 19184 + }, + { + "epoch": 0.8116168880615957, + "grad_norm": 0.16536766290664673, + "learning_rate": 0.001, + "loss": 2.1103, + "step": 19185 + }, + { + "epoch": 0.8116591928251121, + "grad_norm": 0.17966187000274658, + "learning_rate": 0.001, + "loss": 2.0974, + "step": 19186 + }, + { + "epoch": 0.8117014975886285, + "grad_norm": 4.907825946807861, + "learning_rate": 0.001, + "loss": 3.4044, + "step": 19187 + }, + { + "epoch": 0.8117438023521448, + "grad_norm": 0.18678595125675201, + "learning_rate": 0.001, + "loss": 2.5199, + "step": 19188 + }, + { + "epoch": 0.8117861071156612, + "grad_norm": 0.751160740852356, + "learning_rate": 0.001, + "loss": 2.3407, + "step": 19189 + }, + { + "epoch": 0.8118284118791776, + "grad_norm": 0.1894712597131729, + "learning_rate": 0.001, + "loss": 2.066, + "step": 19190 + }, + { + "epoch": 0.811870716642694, + "grad_norm": 0.174768328666687, + "learning_rate": 0.001, + "loss": 2.0651, + "step": 19191 + }, + { + "epoch": 0.8119130214062104, + "grad_norm": 0.17409004271030426, + "learning_rate": 0.001, + "loss": 2.4093, + "step": 19192 + }, + { + "epoch": 0.8119553261697268, + "grad_norm": 0.154340922832489, + "learning_rate": 0.001, + "loss": 2.1752, + "step": 19193 + }, + { + "epoch": 0.8119976309332431, + "grad_norm": 0.18005342781543732, + "learning_rate": 0.001, + "loss": 2.8527, + "step": 19194 + }, + { + "epoch": 0.8120399356967595, + "grad_norm": 0.1886720359325409, + "learning_rate": 0.001, + "loss": 2.052, + "step": 19195 + }, + { + "epoch": 0.8120822404602758, + "grad_norm": 0.15437401831150055, + "learning_rate": 0.001, + "loss": 2.0393, + "step": 19196 + }, + { + "epoch": 0.8121245452237922, + "grad_norm": 0.8311869502067566, + "learning_rate": 0.001, + "loss": 2.9646, + "step": 19197 + }, + { + "epoch": 0.8121668499873086, + "grad_norm": 0.26318734884262085, + "learning_rate": 0.001, + "loss": 3.0618, + "step": 19198 + }, + { + "epoch": 0.8122091547508249, + "grad_norm": 0.15267373621463776, + "learning_rate": 0.001, + "loss": 1.8216, + "step": 19199 + }, + { + "epoch": 0.8122514595143413, + "grad_norm": 0.20256157219409943, + "learning_rate": 0.001, + "loss": 2.2829, + "step": 19200 + }, + { + "epoch": 0.8122937642778577, + "grad_norm": 0.9764918088912964, + "learning_rate": 0.001, + "loss": 2.902, + "step": 19201 + }, + { + "epoch": 0.812336069041374, + "grad_norm": 0.5934033393859863, + "learning_rate": 0.001, + "loss": 2.9391, + "step": 19202 + }, + { + "epoch": 0.8123783738048904, + "grad_norm": 0.41964349150657654, + "learning_rate": 0.001, + "loss": 1.6686, + "step": 19203 + }, + { + "epoch": 0.8124206785684068, + "grad_norm": 0.15301333367824554, + "learning_rate": 0.001, + "loss": 2.829, + "step": 19204 + }, + { + "epoch": 0.8124629833319231, + "grad_norm": 0.15774710476398468, + "learning_rate": 0.001, + "loss": 2.467, + "step": 19205 + }, + { + "epoch": 0.8125052880954395, + "grad_norm": 0.3927057683467865, + "learning_rate": 0.001, + "loss": 2.3201, + "step": 19206 + }, + { + "epoch": 0.812547592858956, + "grad_norm": 4.193830490112305, + "learning_rate": 0.001, + "loss": 1.9153, + "step": 19207 + }, + { + "epoch": 0.8125898976224722, + "grad_norm": 0.16809159517288208, + "learning_rate": 0.001, + "loss": 2.6222, + "step": 19208 + }, + { + "epoch": 0.8126322023859887, + "grad_norm": 0.36612462997436523, + "learning_rate": 0.001, + "loss": 1.8079, + "step": 19209 + }, + { + "epoch": 0.8126745071495051, + "grad_norm": 0.17396114766597748, + "learning_rate": 0.001, + "loss": 1.7655, + "step": 19210 + }, + { + "epoch": 0.8127168119130214, + "grad_norm": 0.14558352530002594, + "learning_rate": 0.001, + "loss": 2.35, + "step": 19211 + }, + { + "epoch": 0.8127591166765378, + "grad_norm": 0.1535443514585495, + "learning_rate": 0.001, + "loss": 1.8861, + "step": 19212 + }, + { + "epoch": 0.8128014214400542, + "grad_norm": 0.14222368597984314, + "learning_rate": 0.001, + "loss": 1.6644, + "step": 19213 + }, + { + "epoch": 0.8128437262035705, + "grad_norm": 0.16522639989852905, + "learning_rate": 0.001, + "loss": 2.5941, + "step": 19214 + }, + { + "epoch": 0.8128860309670869, + "grad_norm": 1.800579309463501, + "learning_rate": 0.001, + "loss": 2.2554, + "step": 19215 + }, + { + "epoch": 0.8129283357306033, + "grad_norm": 4.028626918792725, + "learning_rate": 0.001, + "loss": 1.5658, + "step": 19216 + }, + { + "epoch": 0.8129706404941196, + "grad_norm": 0.15518246591091156, + "learning_rate": 0.001, + "loss": 1.8562, + "step": 19217 + }, + { + "epoch": 0.813012945257636, + "grad_norm": 1.1874980926513672, + "learning_rate": 0.001, + "loss": 2.0366, + "step": 19218 + }, + { + "epoch": 0.8130552500211524, + "grad_norm": 0.16775746643543243, + "learning_rate": 0.001, + "loss": 1.7891, + "step": 19219 + }, + { + "epoch": 0.8130975547846687, + "grad_norm": 0.30443984270095825, + "learning_rate": 0.001, + "loss": 3.444, + "step": 19220 + }, + { + "epoch": 0.8131398595481851, + "grad_norm": 0.1578860729932785, + "learning_rate": 0.001, + "loss": 1.8633, + "step": 19221 + }, + { + "epoch": 0.8131821643117015, + "grad_norm": 17.17110252380371, + "learning_rate": 0.001, + "loss": 2.2716, + "step": 19222 + }, + { + "epoch": 0.8132244690752178, + "grad_norm": 0.16925646364688873, + "learning_rate": 0.001, + "loss": 2.1797, + "step": 19223 + }, + { + "epoch": 0.8132667738387342, + "grad_norm": 0.46783486008644104, + "learning_rate": 0.001, + "loss": 1.9686, + "step": 19224 + }, + { + "epoch": 0.8133090786022507, + "grad_norm": 0.21370871365070343, + "learning_rate": 0.001, + "loss": 2.3163, + "step": 19225 + }, + { + "epoch": 0.813351383365767, + "grad_norm": 2.3018229007720947, + "learning_rate": 0.001, + "loss": 3.1378, + "step": 19226 + }, + { + "epoch": 0.8133936881292834, + "grad_norm": 0.26849448680877686, + "learning_rate": 0.001, + "loss": 2.8301, + "step": 19227 + }, + { + "epoch": 0.8134359928927998, + "grad_norm": 0.36854100227355957, + "learning_rate": 0.001, + "loss": 2.2498, + "step": 19228 + }, + { + "epoch": 0.8134782976563161, + "grad_norm": 0.44008371233940125, + "learning_rate": 0.001, + "loss": 2.2804, + "step": 19229 + }, + { + "epoch": 0.8135206024198325, + "grad_norm": 0.2091284990310669, + "learning_rate": 0.001, + "loss": 2.2224, + "step": 19230 + }, + { + "epoch": 0.8135629071833489, + "grad_norm": 0.2483440637588501, + "learning_rate": 0.001, + "loss": 2.2146, + "step": 19231 + }, + { + "epoch": 0.8136052119468652, + "grad_norm": 0.19964353740215302, + "learning_rate": 0.001, + "loss": 1.861, + "step": 19232 + }, + { + "epoch": 0.8136475167103816, + "grad_norm": 0.22766467928886414, + "learning_rate": 0.001, + "loss": 2.3613, + "step": 19233 + }, + { + "epoch": 0.813689821473898, + "grad_norm": 0.20995555818080902, + "learning_rate": 0.001, + "loss": 2.4426, + "step": 19234 + }, + { + "epoch": 0.8137321262374143, + "grad_norm": 0.16860271990299225, + "learning_rate": 0.001, + "loss": 2.3357, + "step": 19235 + }, + { + "epoch": 0.8137744310009307, + "grad_norm": 0.18488185107707977, + "learning_rate": 0.001, + "loss": 2.7789, + "step": 19236 + }, + { + "epoch": 0.8138167357644471, + "grad_norm": 2.0512919425964355, + "learning_rate": 0.001, + "loss": 2.3825, + "step": 19237 + }, + { + "epoch": 0.8138590405279634, + "grad_norm": 0.39424288272857666, + "learning_rate": 0.001, + "loss": 2.3707, + "step": 19238 + }, + { + "epoch": 0.8139013452914798, + "grad_norm": 0.18912164866924286, + "learning_rate": 0.001, + "loss": 2.5157, + "step": 19239 + }, + { + "epoch": 0.8139436500549961, + "grad_norm": 0.17487472295761108, + "learning_rate": 0.001, + "loss": 2.0539, + "step": 19240 + }, + { + "epoch": 0.8139859548185125, + "grad_norm": 0.1670231968164444, + "learning_rate": 0.001, + "loss": 2.7521, + "step": 19241 + }, + { + "epoch": 0.814028259582029, + "grad_norm": 0.16060633957386017, + "learning_rate": 0.001, + "loss": 1.8548, + "step": 19242 + }, + { + "epoch": 0.8140705643455453, + "grad_norm": 0.1655425727367401, + "learning_rate": 0.001, + "loss": 2.6851, + "step": 19243 + }, + { + "epoch": 0.8141128691090617, + "grad_norm": 0.7323225736618042, + "learning_rate": 0.001, + "loss": 3.2264, + "step": 19244 + }, + { + "epoch": 0.8141551738725781, + "grad_norm": 0.16756169497966766, + "learning_rate": 0.001, + "loss": 2.0877, + "step": 19245 + }, + { + "epoch": 0.8141974786360944, + "grad_norm": 4.373884201049805, + "learning_rate": 0.001, + "loss": 3.0818, + "step": 19246 + }, + { + "epoch": 0.8142397833996108, + "grad_norm": 0.15877367556095123, + "learning_rate": 0.001, + "loss": 2.477, + "step": 19247 + }, + { + "epoch": 0.8142820881631272, + "grad_norm": 14.3695707321167, + "learning_rate": 0.001, + "loss": 3.6114, + "step": 19248 + }, + { + "epoch": 0.8143243929266435, + "grad_norm": 0.35859042406082153, + "learning_rate": 0.001, + "loss": 2.4061, + "step": 19249 + }, + { + "epoch": 0.8143666976901599, + "grad_norm": 0.17189420759677887, + "learning_rate": 0.001, + "loss": 2.6325, + "step": 19250 + }, + { + "epoch": 0.8144090024536763, + "grad_norm": 0.1583157330751419, + "learning_rate": 0.001, + "loss": 2.0351, + "step": 19251 + }, + { + "epoch": 0.8144513072171926, + "grad_norm": 0.1763736605644226, + "learning_rate": 0.001, + "loss": 1.9754, + "step": 19252 + }, + { + "epoch": 0.814493611980709, + "grad_norm": 0.16979268193244934, + "learning_rate": 0.001, + "loss": 2.0023, + "step": 19253 + }, + { + "epoch": 0.8145359167442254, + "grad_norm": 0.1730976700782776, + "learning_rate": 0.001, + "loss": 2.8671, + "step": 19254 + }, + { + "epoch": 0.8145782215077417, + "grad_norm": 0.17058543860912323, + "learning_rate": 0.001, + "loss": 1.4882, + "step": 19255 + }, + { + "epoch": 0.8146205262712581, + "grad_norm": 0.29904067516326904, + "learning_rate": 0.001, + "loss": 2.019, + "step": 19256 + }, + { + "epoch": 0.8146628310347745, + "grad_norm": 0.6264715194702148, + "learning_rate": 0.001, + "loss": 2.8114, + "step": 19257 + }, + { + "epoch": 0.8147051357982908, + "grad_norm": 0.14027170836925507, + "learning_rate": 0.001, + "loss": 3.2401, + "step": 19258 + }, + { + "epoch": 0.8147474405618073, + "grad_norm": 0.16307754814624786, + "learning_rate": 0.001, + "loss": 2.6808, + "step": 19259 + }, + { + "epoch": 0.8147897453253237, + "grad_norm": 0.9267063736915588, + "learning_rate": 0.001, + "loss": 2.3898, + "step": 19260 + }, + { + "epoch": 0.81483205008884, + "grad_norm": 0.1487603634595871, + "learning_rate": 0.001, + "loss": 2.9254, + "step": 19261 + }, + { + "epoch": 0.8148743548523564, + "grad_norm": 0.1394791156053543, + "learning_rate": 0.001, + "loss": 1.7672, + "step": 19262 + }, + { + "epoch": 0.8149166596158728, + "grad_norm": 45.422454833984375, + "learning_rate": 0.001, + "loss": 2.5624, + "step": 19263 + }, + { + "epoch": 0.8149589643793891, + "grad_norm": 0.4591314196586609, + "learning_rate": 0.001, + "loss": 2.3038, + "step": 19264 + }, + { + "epoch": 0.8150012691429055, + "grad_norm": 0.13458524644374847, + "learning_rate": 0.001, + "loss": 2.2434, + "step": 19265 + }, + { + "epoch": 0.8150435739064219, + "grad_norm": 0.14545901119709015, + "learning_rate": 0.001, + "loss": 2.3079, + "step": 19266 + }, + { + "epoch": 0.8150858786699382, + "grad_norm": 0.16260690987110138, + "learning_rate": 0.001, + "loss": 2.7739, + "step": 19267 + }, + { + "epoch": 0.8151281834334546, + "grad_norm": 0.15987445414066315, + "learning_rate": 0.001, + "loss": 1.6845, + "step": 19268 + }, + { + "epoch": 0.815170488196971, + "grad_norm": 0.16004207730293274, + "learning_rate": 0.001, + "loss": 2.0792, + "step": 19269 + }, + { + "epoch": 0.8152127929604873, + "grad_norm": 0.14027880132198334, + "learning_rate": 0.001, + "loss": 2.14, + "step": 19270 + }, + { + "epoch": 0.8152550977240037, + "grad_norm": 0.1478835940361023, + "learning_rate": 0.001, + "loss": 1.4469, + "step": 19271 + }, + { + "epoch": 0.8152974024875201, + "grad_norm": 0.6844944357872009, + "learning_rate": 0.001, + "loss": 2.0617, + "step": 19272 + }, + { + "epoch": 0.8153397072510364, + "grad_norm": 0.24570874869823456, + "learning_rate": 0.001, + "loss": 2.0208, + "step": 19273 + }, + { + "epoch": 0.8153820120145528, + "grad_norm": 0.15631866455078125, + "learning_rate": 0.001, + "loss": 1.826, + "step": 19274 + }, + { + "epoch": 0.8154243167780693, + "grad_norm": 0.18372394144535065, + "learning_rate": 0.001, + "loss": 2.138, + "step": 19275 + }, + { + "epoch": 0.8154666215415856, + "grad_norm": 0.6491008400917053, + "learning_rate": 0.001, + "loss": 3.212, + "step": 19276 + }, + { + "epoch": 0.815508926305102, + "grad_norm": 0.13584677875041962, + "learning_rate": 0.001, + "loss": 2.0065, + "step": 19277 + }, + { + "epoch": 0.8155512310686184, + "grad_norm": 0.16441628336906433, + "learning_rate": 0.001, + "loss": 1.7349, + "step": 19278 + }, + { + "epoch": 0.8155935358321347, + "grad_norm": 0.16459910571575165, + "learning_rate": 0.001, + "loss": 1.9561, + "step": 19279 + }, + { + "epoch": 0.8156358405956511, + "grad_norm": 0.8286261558532715, + "learning_rate": 0.001, + "loss": 1.744, + "step": 19280 + }, + { + "epoch": 0.8156781453591675, + "grad_norm": 0.18971623480319977, + "learning_rate": 0.001, + "loss": 1.968, + "step": 19281 + }, + { + "epoch": 0.8157204501226838, + "grad_norm": 0.15174627304077148, + "learning_rate": 0.001, + "loss": 1.5037, + "step": 19282 + }, + { + "epoch": 0.8157627548862002, + "grad_norm": 0.1971951127052307, + "learning_rate": 0.001, + "loss": 1.9095, + "step": 19283 + }, + { + "epoch": 0.8158050596497166, + "grad_norm": 0.6228017807006836, + "learning_rate": 0.001, + "loss": 3.6166, + "step": 19284 + }, + { + "epoch": 0.8158473644132329, + "grad_norm": 0.20216991007328033, + "learning_rate": 0.001, + "loss": 2.2737, + "step": 19285 + }, + { + "epoch": 0.8158896691767493, + "grad_norm": 0.15689660608768463, + "learning_rate": 0.001, + "loss": 2.5109, + "step": 19286 + }, + { + "epoch": 0.8159319739402656, + "grad_norm": 0.16045741736888885, + "learning_rate": 0.001, + "loss": 2.3619, + "step": 19287 + }, + { + "epoch": 0.815974278703782, + "grad_norm": 0.14327400922775269, + "learning_rate": 0.001, + "loss": 1.7969, + "step": 19288 + }, + { + "epoch": 0.8160165834672984, + "grad_norm": 0.16195663809776306, + "learning_rate": 0.001, + "loss": 1.8645, + "step": 19289 + }, + { + "epoch": 0.8160588882308147, + "grad_norm": 0.17734597623348236, + "learning_rate": 0.001, + "loss": 1.8547, + "step": 19290 + }, + { + "epoch": 0.8161011929943311, + "grad_norm": 0.1504170149564743, + "learning_rate": 0.001, + "loss": 3.0344, + "step": 19291 + }, + { + "epoch": 0.8161434977578476, + "grad_norm": 0.3913165032863617, + "learning_rate": 0.001, + "loss": 2.165, + "step": 19292 + }, + { + "epoch": 0.8161858025213639, + "grad_norm": 0.14916856586933136, + "learning_rate": 0.001, + "loss": 2.5323, + "step": 19293 + }, + { + "epoch": 0.8162281072848803, + "grad_norm": 0.18995776772499084, + "learning_rate": 0.001, + "loss": 3.1768, + "step": 19294 + }, + { + "epoch": 0.8162704120483967, + "grad_norm": 0.16553127765655518, + "learning_rate": 0.001, + "loss": 2.8746, + "step": 19295 + }, + { + "epoch": 0.816312716811913, + "grad_norm": 0.16702581942081451, + "learning_rate": 0.001, + "loss": 1.8839, + "step": 19296 + }, + { + "epoch": 0.8163550215754294, + "grad_norm": 0.1927644908428192, + "learning_rate": 0.001, + "loss": 2.6619, + "step": 19297 + }, + { + "epoch": 0.8163973263389458, + "grad_norm": 1.5043902397155762, + "learning_rate": 0.001, + "loss": 1.8784, + "step": 19298 + }, + { + "epoch": 0.8164396311024621, + "grad_norm": 0.15827623009681702, + "learning_rate": 0.001, + "loss": 1.8496, + "step": 19299 + }, + { + "epoch": 0.8164819358659785, + "grad_norm": 0.15658104419708252, + "learning_rate": 0.001, + "loss": 2.642, + "step": 19300 + }, + { + "epoch": 0.8165242406294949, + "grad_norm": 0.2382393330335617, + "learning_rate": 0.001, + "loss": 2.8102, + "step": 19301 + }, + { + "epoch": 0.8165665453930112, + "grad_norm": 0.7768169045448303, + "learning_rate": 0.001, + "loss": 2.6393, + "step": 19302 + }, + { + "epoch": 0.8166088501565276, + "grad_norm": 0.9647676944732666, + "learning_rate": 0.001, + "loss": 2.4991, + "step": 19303 + }, + { + "epoch": 0.816651154920044, + "grad_norm": 0.17201387882232666, + "learning_rate": 0.001, + "loss": 2.1453, + "step": 19304 + }, + { + "epoch": 0.8166934596835603, + "grad_norm": 0.18046537041664124, + "learning_rate": 0.001, + "loss": 1.9127, + "step": 19305 + }, + { + "epoch": 0.8167357644470767, + "grad_norm": 0.1487366259098053, + "learning_rate": 0.001, + "loss": 2.5597, + "step": 19306 + }, + { + "epoch": 0.8167780692105931, + "grad_norm": 0.36740607023239136, + "learning_rate": 0.001, + "loss": 2.5074, + "step": 19307 + }, + { + "epoch": 0.8168203739741094, + "grad_norm": 0.2924293875694275, + "learning_rate": 0.001, + "loss": 3.529, + "step": 19308 + }, + { + "epoch": 0.8168626787376259, + "grad_norm": 0.14903929829597473, + "learning_rate": 0.001, + "loss": 3.0469, + "step": 19309 + }, + { + "epoch": 0.8169049835011423, + "grad_norm": 0.1474577784538269, + "learning_rate": 0.001, + "loss": 2.6798, + "step": 19310 + }, + { + "epoch": 0.8169472882646586, + "grad_norm": 0.14111989736557007, + "learning_rate": 0.001, + "loss": 2.0767, + "step": 19311 + }, + { + "epoch": 0.816989593028175, + "grad_norm": 0.16078026592731476, + "learning_rate": 0.001, + "loss": 2.0343, + "step": 19312 + }, + { + "epoch": 0.8170318977916914, + "grad_norm": 0.14191174507141113, + "learning_rate": 0.001, + "loss": 1.7657, + "step": 19313 + }, + { + "epoch": 0.8170742025552077, + "grad_norm": 0.23834367096424103, + "learning_rate": 0.001, + "loss": 1.9448, + "step": 19314 + }, + { + "epoch": 0.8171165073187241, + "grad_norm": 0.15897387266159058, + "learning_rate": 0.001, + "loss": 2.4028, + "step": 19315 + }, + { + "epoch": 0.8171588120822405, + "grad_norm": 0.14142805337905884, + "learning_rate": 0.001, + "loss": 2.5824, + "step": 19316 + }, + { + "epoch": 0.8172011168457568, + "grad_norm": 0.13879472017288208, + "learning_rate": 0.001, + "loss": 1.7327, + "step": 19317 + }, + { + "epoch": 0.8172434216092732, + "grad_norm": 0.13679929077625275, + "learning_rate": 0.001, + "loss": 1.8926, + "step": 19318 + }, + { + "epoch": 0.8172857263727896, + "grad_norm": 0.13544319570064545, + "learning_rate": 0.001, + "loss": 3.3204, + "step": 19319 + }, + { + "epoch": 0.8173280311363059, + "grad_norm": 0.23128291964530945, + "learning_rate": 0.001, + "loss": 3.2543, + "step": 19320 + }, + { + "epoch": 0.8173703358998223, + "grad_norm": 0.16951590776443481, + "learning_rate": 0.001, + "loss": 2.3761, + "step": 19321 + }, + { + "epoch": 0.8174126406633387, + "grad_norm": 0.13225175440311432, + "learning_rate": 0.001, + "loss": 3.1764, + "step": 19322 + }, + { + "epoch": 0.817454945426855, + "grad_norm": 0.22651726007461548, + "learning_rate": 0.001, + "loss": 2.3727, + "step": 19323 + }, + { + "epoch": 0.8174972501903714, + "grad_norm": 0.14250196516513824, + "learning_rate": 0.001, + "loss": 2.555, + "step": 19324 + }, + { + "epoch": 0.8175395549538879, + "grad_norm": 0.13862445950508118, + "learning_rate": 0.001, + "loss": 2.1272, + "step": 19325 + }, + { + "epoch": 0.8175818597174042, + "grad_norm": 0.6835310459136963, + "learning_rate": 0.001, + "loss": 2.1929, + "step": 19326 + }, + { + "epoch": 0.8176241644809206, + "grad_norm": 0.49331387877464294, + "learning_rate": 0.001, + "loss": 1.605, + "step": 19327 + }, + { + "epoch": 0.817666469244437, + "grad_norm": 0.8337584733963013, + "learning_rate": 0.001, + "loss": 1.8696, + "step": 19328 + }, + { + "epoch": 0.8177087740079533, + "grad_norm": 0.13888110220432281, + "learning_rate": 0.001, + "loss": 1.7783, + "step": 19329 + }, + { + "epoch": 0.8177510787714697, + "grad_norm": 0.15153059363365173, + "learning_rate": 0.001, + "loss": 3.0953, + "step": 19330 + }, + { + "epoch": 0.817793383534986, + "grad_norm": 0.16064806282520294, + "learning_rate": 0.001, + "loss": 2.2135, + "step": 19331 + }, + { + "epoch": 0.8178356882985024, + "grad_norm": 0.18726807832717896, + "learning_rate": 0.001, + "loss": 1.5846, + "step": 19332 + }, + { + "epoch": 0.8178779930620188, + "grad_norm": 1.7091985940933228, + "learning_rate": 0.001, + "loss": 1.8989, + "step": 19333 + }, + { + "epoch": 0.8179202978255351, + "grad_norm": 10.048054695129395, + "learning_rate": 0.001, + "loss": 1.9446, + "step": 19334 + }, + { + "epoch": 0.8179626025890515, + "grad_norm": 0.1563667356967926, + "learning_rate": 0.001, + "loss": 1.8827, + "step": 19335 + }, + { + "epoch": 0.8180049073525679, + "grad_norm": 0.14625267684459686, + "learning_rate": 0.001, + "loss": 2.9246, + "step": 19336 + }, + { + "epoch": 0.8180472121160842, + "grad_norm": 0.14250653982162476, + "learning_rate": 0.001, + "loss": 2.0407, + "step": 19337 + }, + { + "epoch": 0.8180895168796006, + "grad_norm": 0.1519334465265274, + "learning_rate": 0.001, + "loss": 1.6979, + "step": 19338 + }, + { + "epoch": 0.818131821643117, + "grad_norm": 0.18076607584953308, + "learning_rate": 0.001, + "loss": 1.4037, + "step": 19339 + }, + { + "epoch": 0.8181741264066333, + "grad_norm": 0.20953752100467682, + "learning_rate": 0.001, + "loss": 2.7733, + "step": 19340 + }, + { + "epoch": 0.8182164311701497, + "grad_norm": 0.17903457581996918, + "learning_rate": 0.001, + "loss": 1.9328, + "step": 19341 + }, + { + "epoch": 0.8182587359336662, + "grad_norm": 0.1693604737520218, + "learning_rate": 0.001, + "loss": 1.6344, + "step": 19342 + }, + { + "epoch": 0.8183010406971825, + "grad_norm": 0.18389926850795746, + "learning_rate": 0.001, + "loss": 1.6565, + "step": 19343 + }, + { + "epoch": 0.8183433454606989, + "grad_norm": 0.15521776676177979, + "learning_rate": 0.001, + "loss": 2.1015, + "step": 19344 + }, + { + "epoch": 0.8183856502242153, + "grad_norm": 0.1567741185426712, + "learning_rate": 0.001, + "loss": 2.3034, + "step": 19345 + }, + { + "epoch": 0.8184279549877316, + "grad_norm": 0.1486789584159851, + "learning_rate": 0.001, + "loss": 1.6981, + "step": 19346 + }, + { + "epoch": 0.818470259751248, + "grad_norm": 0.14541268348693848, + "learning_rate": 0.001, + "loss": 2.1189, + "step": 19347 + }, + { + "epoch": 0.8185125645147644, + "grad_norm": 0.1486058086156845, + "learning_rate": 0.001, + "loss": 1.6406, + "step": 19348 + }, + { + "epoch": 0.8185548692782807, + "grad_norm": 0.1677182912826538, + "learning_rate": 0.001, + "loss": 1.6275, + "step": 19349 + }, + { + "epoch": 0.8185971740417971, + "grad_norm": 0.1867937445640564, + "learning_rate": 0.001, + "loss": 2.5879, + "step": 19350 + }, + { + "epoch": 0.8186394788053135, + "grad_norm": 0.18571646511554718, + "learning_rate": 0.001, + "loss": 2.914, + "step": 19351 + }, + { + "epoch": 0.8186817835688298, + "grad_norm": 0.17345742881298065, + "learning_rate": 0.001, + "loss": 2.0374, + "step": 19352 + }, + { + "epoch": 0.8187240883323462, + "grad_norm": 0.164658322930336, + "learning_rate": 0.001, + "loss": 1.6735, + "step": 19353 + }, + { + "epoch": 0.8187663930958626, + "grad_norm": 15.121390342712402, + "learning_rate": 0.001, + "loss": 2.3059, + "step": 19354 + }, + { + "epoch": 0.8188086978593789, + "grad_norm": 0.16249455511569977, + "learning_rate": 0.001, + "loss": 2.387, + "step": 19355 + }, + { + "epoch": 0.8188510026228953, + "grad_norm": 0.21848152577877045, + "learning_rate": 0.001, + "loss": 2.0407, + "step": 19356 + }, + { + "epoch": 0.8188933073864118, + "grad_norm": 0.18375152349472046, + "learning_rate": 0.001, + "loss": 1.7933, + "step": 19357 + }, + { + "epoch": 0.818935612149928, + "grad_norm": 0.1748533397912979, + "learning_rate": 0.001, + "loss": 3.2785, + "step": 19358 + }, + { + "epoch": 0.8189779169134445, + "grad_norm": 0.18232855200767517, + "learning_rate": 0.001, + "loss": 2.2611, + "step": 19359 + }, + { + "epoch": 0.8190202216769609, + "grad_norm": 0.811168909072876, + "learning_rate": 0.001, + "loss": 1.9481, + "step": 19360 + }, + { + "epoch": 0.8190625264404772, + "grad_norm": 40.37857437133789, + "learning_rate": 0.001, + "loss": 1.5613, + "step": 19361 + }, + { + "epoch": 0.8191048312039936, + "grad_norm": 1.3089685440063477, + "learning_rate": 0.001, + "loss": 3.6364, + "step": 19362 + }, + { + "epoch": 0.81914713596751, + "grad_norm": 0.1557542085647583, + "learning_rate": 0.001, + "loss": 2.8861, + "step": 19363 + }, + { + "epoch": 0.8191894407310263, + "grad_norm": 0.1572728157043457, + "learning_rate": 0.001, + "loss": 1.987, + "step": 19364 + }, + { + "epoch": 0.8192317454945427, + "grad_norm": 0.13320891559123993, + "learning_rate": 0.001, + "loss": 2.0874, + "step": 19365 + }, + { + "epoch": 0.8192740502580591, + "grad_norm": 0.18693287670612335, + "learning_rate": 0.001, + "loss": 1.5792, + "step": 19366 + }, + { + "epoch": 0.8193163550215754, + "grad_norm": 0.17227143049240112, + "learning_rate": 0.001, + "loss": 2.8165, + "step": 19367 + }, + { + "epoch": 0.8193586597850918, + "grad_norm": 0.19532877206802368, + "learning_rate": 0.001, + "loss": 2.0586, + "step": 19368 + }, + { + "epoch": 0.8194009645486082, + "grad_norm": 0.18054965138435364, + "learning_rate": 0.001, + "loss": 2.1502, + "step": 19369 + }, + { + "epoch": 0.8194432693121245, + "grad_norm": 0.3110874593257904, + "learning_rate": 0.001, + "loss": 3.9523, + "step": 19370 + }, + { + "epoch": 0.8194855740756409, + "grad_norm": 0.2044413536787033, + "learning_rate": 0.001, + "loss": 2.5399, + "step": 19371 + }, + { + "epoch": 0.8195278788391573, + "grad_norm": 0.21336640417575836, + "learning_rate": 0.001, + "loss": 2.1525, + "step": 19372 + }, + { + "epoch": 0.8195701836026736, + "grad_norm": 0.22624215483665466, + "learning_rate": 0.001, + "loss": 2.2209, + "step": 19373 + }, + { + "epoch": 0.81961248836619, + "grad_norm": 0.23245839774608612, + "learning_rate": 0.001, + "loss": 2.8199, + "step": 19374 + }, + { + "epoch": 0.8196547931297065, + "grad_norm": 0.19043493270874023, + "learning_rate": 0.001, + "loss": 1.8749, + "step": 19375 + }, + { + "epoch": 0.8196970978932228, + "grad_norm": 0.16715998947620392, + "learning_rate": 0.001, + "loss": 3.1058, + "step": 19376 + }, + { + "epoch": 0.8197394026567392, + "grad_norm": 0.15719690918922424, + "learning_rate": 0.001, + "loss": 2.2472, + "step": 19377 + }, + { + "epoch": 0.8197817074202555, + "grad_norm": 0.15933258831501007, + "learning_rate": 0.001, + "loss": 2.5364, + "step": 19378 + }, + { + "epoch": 0.8198240121837719, + "grad_norm": 0.29013222455978394, + "learning_rate": 0.001, + "loss": 3.0306, + "step": 19379 + }, + { + "epoch": 0.8198663169472883, + "grad_norm": 0.17771287262439728, + "learning_rate": 0.001, + "loss": 2.3702, + "step": 19380 + }, + { + "epoch": 0.8199086217108046, + "grad_norm": 0.14083783328533173, + "learning_rate": 0.001, + "loss": 1.8384, + "step": 19381 + }, + { + "epoch": 0.819950926474321, + "grad_norm": 0.17068462073802948, + "learning_rate": 0.001, + "loss": 2.4943, + "step": 19382 + }, + { + "epoch": 0.8199932312378374, + "grad_norm": 0.1593535989522934, + "learning_rate": 0.001, + "loss": 2.0397, + "step": 19383 + }, + { + "epoch": 0.8200355360013537, + "grad_norm": 0.3052305579185486, + "learning_rate": 0.001, + "loss": 2.3592, + "step": 19384 + }, + { + "epoch": 0.8200778407648701, + "grad_norm": 0.1488254964351654, + "learning_rate": 0.001, + "loss": 1.8685, + "step": 19385 + }, + { + "epoch": 0.8201201455283865, + "grad_norm": 0.18521372973918915, + "learning_rate": 0.001, + "loss": 2.578, + "step": 19386 + }, + { + "epoch": 0.8201624502919028, + "grad_norm": 0.1931733936071396, + "learning_rate": 0.001, + "loss": 1.9068, + "step": 19387 + }, + { + "epoch": 0.8202047550554192, + "grad_norm": 0.16335810720920563, + "learning_rate": 0.001, + "loss": 2.0929, + "step": 19388 + }, + { + "epoch": 0.8202470598189356, + "grad_norm": 0.14352665841579437, + "learning_rate": 0.001, + "loss": 1.5255, + "step": 19389 + }, + { + "epoch": 0.8202893645824519, + "grad_norm": 0.22038647532463074, + "learning_rate": 0.001, + "loss": 3.3152, + "step": 19390 + }, + { + "epoch": 0.8203316693459684, + "grad_norm": 0.17173390090465546, + "learning_rate": 0.001, + "loss": 1.8423, + "step": 19391 + }, + { + "epoch": 0.8203739741094848, + "grad_norm": 0.1474774330854416, + "learning_rate": 0.001, + "loss": 1.6557, + "step": 19392 + }, + { + "epoch": 0.8204162788730011, + "grad_norm": 0.1488204449415207, + "learning_rate": 0.001, + "loss": 1.867, + "step": 19393 + }, + { + "epoch": 0.8204585836365175, + "grad_norm": 0.26279768347740173, + "learning_rate": 0.001, + "loss": 2.287, + "step": 19394 + }, + { + "epoch": 0.8205008884000339, + "grad_norm": 0.3085935413837433, + "learning_rate": 0.001, + "loss": 2.1505, + "step": 19395 + }, + { + "epoch": 0.8205431931635502, + "grad_norm": 0.14529921114444733, + "learning_rate": 0.001, + "loss": 1.7498, + "step": 19396 + }, + { + "epoch": 0.8205854979270666, + "grad_norm": 0.1698216050863266, + "learning_rate": 0.001, + "loss": 2.3125, + "step": 19397 + }, + { + "epoch": 0.820627802690583, + "grad_norm": 0.1525857001543045, + "learning_rate": 0.001, + "loss": 2.6507, + "step": 19398 + }, + { + "epoch": 0.8206701074540993, + "grad_norm": 0.48174238204956055, + "learning_rate": 0.001, + "loss": 1.9215, + "step": 19399 + }, + { + "epoch": 0.8207124122176157, + "grad_norm": 0.1638476401567459, + "learning_rate": 0.001, + "loss": 2.4539, + "step": 19400 + }, + { + "epoch": 0.8207547169811321, + "grad_norm": 0.1637507677078247, + "learning_rate": 0.001, + "loss": 1.5719, + "step": 19401 + }, + { + "epoch": 0.8207970217446484, + "grad_norm": 0.1512572169303894, + "learning_rate": 0.001, + "loss": 2.4486, + "step": 19402 + }, + { + "epoch": 0.8208393265081648, + "grad_norm": 0.12860354781150818, + "learning_rate": 0.001, + "loss": 2.5915, + "step": 19403 + }, + { + "epoch": 0.8208816312716812, + "grad_norm": 0.15637515485286713, + "learning_rate": 0.001, + "loss": 2.1001, + "step": 19404 + }, + { + "epoch": 0.8209239360351975, + "grad_norm": 0.13939064741134644, + "learning_rate": 0.001, + "loss": 2.1013, + "step": 19405 + }, + { + "epoch": 0.8209662407987139, + "grad_norm": 0.1056860163807869, + "learning_rate": 0.001, + "loss": 1.4263, + "step": 19406 + }, + { + "epoch": 0.8210085455622304, + "grad_norm": 0.14277103543281555, + "learning_rate": 0.001, + "loss": 2.5677, + "step": 19407 + }, + { + "epoch": 0.8210508503257467, + "grad_norm": 0.12135271728038788, + "learning_rate": 0.001, + "loss": 2.5086, + "step": 19408 + }, + { + "epoch": 0.8210931550892631, + "grad_norm": 0.6470211148262024, + "learning_rate": 0.001, + "loss": 2.8058, + "step": 19409 + }, + { + "epoch": 0.8211354598527795, + "grad_norm": 0.14677073061466217, + "learning_rate": 0.001, + "loss": 2.0381, + "step": 19410 + }, + { + "epoch": 0.8211777646162958, + "grad_norm": 0.14049549400806427, + "learning_rate": 0.001, + "loss": 2.1267, + "step": 19411 + }, + { + "epoch": 0.8212200693798122, + "grad_norm": 0.1972610056400299, + "learning_rate": 0.001, + "loss": 1.4748, + "step": 19412 + }, + { + "epoch": 0.8212623741433286, + "grad_norm": 0.16732430458068848, + "learning_rate": 0.001, + "loss": 2.0388, + "step": 19413 + }, + { + "epoch": 0.8213046789068449, + "grad_norm": 0.17487087845802307, + "learning_rate": 0.001, + "loss": 1.8259, + "step": 19414 + }, + { + "epoch": 0.8213469836703613, + "grad_norm": 0.2771112322807312, + "learning_rate": 0.001, + "loss": 2.8102, + "step": 19415 + }, + { + "epoch": 0.8213892884338777, + "grad_norm": 0.15633077919483185, + "learning_rate": 0.001, + "loss": 2.9745, + "step": 19416 + }, + { + "epoch": 0.821431593197394, + "grad_norm": 0.8696543574333191, + "learning_rate": 0.001, + "loss": 2.1251, + "step": 19417 + }, + { + "epoch": 0.8214738979609104, + "grad_norm": 0.7702298164367676, + "learning_rate": 0.001, + "loss": 2.4299, + "step": 19418 + }, + { + "epoch": 0.8215162027244268, + "grad_norm": 1.8494880199432373, + "learning_rate": 0.001, + "loss": 2.1975, + "step": 19419 + }, + { + "epoch": 0.8215585074879431, + "grad_norm": 0.14353471994400024, + "learning_rate": 0.001, + "loss": 1.602, + "step": 19420 + }, + { + "epoch": 0.8216008122514595, + "grad_norm": 0.152914896607399, + "learning_rate": 0.001, + "loss": 2.5665, + "step": 19421 + }, + { + "epoch": 0.8216431170149758, + "grad_norm": 0.1569298803806305, + "learning_rate": 0.001, + "loss": 2.2651, + "step": 19422 + }, + { + "epoch": 0.8216854217784922, + "grad_norm": 0.14512638747692108, + "learning_rate": 0.001, + "loss": 1.6607, + "step": 19423 + }, + { + "epoch": 0.8217277265420087, + "grad_norm": 0.16484789550304413, + "learning_rate": 0.001, + "loss": 2.1772, + "step": 19424 + }, + { + "epoch": 0.821770031305525, + "grad_norm": 0.1395651400089264, + "learning_rate": 0.001, + "loss": 1.821, + "step": 19425 + }, + { + "epoch": 0.8218123360690414, + "grad_norm": 0.675366997718811, + "learning_rate": 0.001, + "loss": 2.4906, + "step": 19426 + }, + { + "epoch": 0.8218546408325578, + "grad_norm": 0.14823488891124725, + "learning_rate": 0.001, + "loss": 2.0327, + "step": 19427 + }, + { + "epoch": 0.8218969455960741, + "grad_norm": 0.14115604758262634, + "learning_rate": 0.001, + "loss": 1.8341, + "step": 19428 + }, + { + "epoch": 0.8219392503595905, + "grad_norm": 0.14076372981071472, + "learning_rate": 0.001, + "loss": 2.0194, + "step": 19429 + }, + { + "epoch": 0.8219815551231069, + "grad_norm": 0.1530357003211975, + "learning_rate": 0.001, + "loss": 1.8015, + "step": 19430 + }, + { + "epoch": 0.8220238598866232, + "grad_norm": 0.1735893338918686, + "learning_rate": 0.001, + "loss": 2.0837, + "step": 19431 + }, + { + "epoch": 0.8220661646501396, + "grad_norm": 264.96417236328125, + "learning_rate": 0.001, + "loss": 2.6431, + "step": 19432 + }, + { + "epoch": 0.822108469413656, + "grad_norm": 0.16807694733142853, + "learning_rate": 0.001, + "loss": 2.2922, + "step": 19433 + }, + { + "epoch": 0.8221507741771723, + "grad_norm": 0.1564405858516693, + "learning_rate": 0.001, + "loss": 1.6603, + "step": 19434 + }, + { + "epoch": 0.8221930789406887, + "grad_norm": 0.1768580824136734, + "learning_rate": 0.001, + "loss": 1.8339, + "step": 19435 + }, + { + "epoch": 0.8222353837042051, + "grad_norm": 0.16592250764369965, + "learning_rate": 0.001, + "loss": 2.0608, + "step": 19436 + }, + { + "epoch": 0.8222776884677214, + "grad_norm": 0.8113108277320862, + "learning_rate": 0.001, + "loss": 2.8101, + "step": 19437 + }, + { + "epoch": 0.8223199932312378, + "grad_norm": 3.1174583435058594, + "learning_rate": 0.001, + "loss": 2.0837, + "step": 19438 + }, + { + "epoch": 0.8223622979947542, + "grad_norm": 0.1804606169462204, + "learning_rate": 0.001, + "loss": 1.8602, + "step": 19439 + }, + { + "epoch": 0.8224046027582705, + "grad_norm": 0.2099166363477707, + "learning_rate": 0.001, + "loss": 2.2021, + "step": 19440 + }, + { + "epoch": 0.822446907521787, + "grad_norm": 0.2024306058883667, + "learning_rate": 0.001, + "loss": 2.5865, + "step": 19441 + }, + { + "epoch": 0.8224892122853034, + "grad_norm": 1.9643110036849976, + "learning_rate": 0.001, + "loss": 2.6401, + "step": 19442 + }, + { + "epoch": 0.8225315170488197, + "grad_norm": 0.2750485837459564, + "learning_rate": 0.001, + "loss": 1.6251, + "step": 19443 + }, + { + "epoch": 0.8225738218123361, + "grad_norm": 0.26389703154563904, + "learning_rate": 0.001, + "loss": 1.9309, + "step": 19444 + }, + { + "epoch": 0.8226161265758525, + "grad_norm": 0.1789424866437912, + "learning_rate": 0.001, + "loss": 1.6066, + "step": 19445 + }, + { + "epoch": 0.8226584313393688, + "grad_norm": 0.2300262451171875, + "learning_rate": 0.001, + "loss": 2.7709, + "step": 19446 + }, + { + "epoch": 0.8227007361028852, + "grad_norm": 0.25446465611457825, + "learning_rate": 0.001, + "loss": 2.4887, + "step": 19447 + }, + { + "epoch": 0.8227430408664016, + "grad_norm": 0.2057051956653595, + "learning_rate": 0.001, + "loss": 1.8832, + "step": 19448 + }, + { + "epoch": 0.8227853456299179, + "grad_norm": 0.17963270843029022, + "learning_rate": 0.001, + "loss": 3.0981, + "step": 19449 + }, + { + "epoch": 0.8228276503934343, + "grad_norm": 0.17611320316791534, + "learning_rate": 0.001, + "loss": 2.6242, + "step": 19450 + }, + { + "epoch": 0.8228699551569507, + "grad_norm": 0.19677342474460602, + "learning_rate": 0.001, + "loss": 1.7286, + "step": 19451 + }, + { + "epoch": 0.822912259920467, + "grad_norm": 0.19172005355358124, + "learning_rate": 0.001, + "loss": 1.6635, + "step": 19452 + }, + { + "epoch": 0.8229545646839834, + "grad_norm": 0.17036627233028412, + "learning_rate": 0.001, + "loss": 1.5419, + "step": 19453 + }, + { + "epoch": 0.8229968694474998, + "grad_norm": 0.1823369711637497, + "learning_rate": 0.001, + "loss": 2.7447, + "step": 19454 + }, + { + "epoch": 0.8230391742110161, + "grad_norm": 0.30361148715019226, + "learning_rate": 0.001, + "loss": 2.7019, + "step": 19455 + }, + { + "epoch": 0.8230814789745325, + "grad_norm": 0.14085571467876434, + "learning_rate": 0.001, + "loss": 2.5237, + "step": 19456 + }, + { + "epoch": 0.823123783738049, + "grad_norm": 0.15281033515930176, + "learning_rate": 0.001, + "loss": 2.1091, + "step": 19457 + }, + { + "epoch": 0.8231660885015653, + "grad_norm": 0.14232198894023895, + "learning_rate": 0.001, + "loss": 2.4139, + "step": 19458 + }, + { + "epoch": 0.8232083932650817, + "grad_norm": 0.142531618475914, + "learning_rate": 0.001, + "loss": 2.5104, + "step": 19459 + }, + { + "epoch": 0.8232506980285981, + "grad_norm": 0.14800485968589783, + "learning_rate": 0.001, + "loss": 1.6023, + "step": 19460 + }, + { + "epoch": 0.8232930027921144, + "grad_norm": 0.15559349954128265, + "learning_rate": 0.001, + "loss": 2.9792, + "step": 19461 + }, + { + "epoch": 0.8233353075556308, + "grad_norm": 0.1521608531475067, + "learning_rate": 0.001, + "loss": 2.7687, + "step": 19462 + }, + { + "epoch": 0.8233776123191472, + "grad_norm": 0.14281997084617615, + "learning_rate": 0.001, + "loss": 1.9515, + "step": 19463 + }, + { + "epoch": 0.8234199170826635, + "grad_norm": 0.9176727533340454, + "learning_rate": 0.001, + "loss": 1.5359, + "step": 19464 + }, + { + "epoch": 0.8234622218461799, + "grad_norm": 0.14004619419574738, + "learning_rate": 0.001, + "loss": 2.0011, + "step": 19465 + }, + { + "epoch": 0.8235045266096962, + "grad_norm": 0.14893396198749542, + "learning_rate": 0.001, + "loss": 2.1055, + "step": 19466 + }, + { + "epoch": 0.8235468313732126, + "grad_norm": 0.14513948559761047, + "learning_rate": 0.001, + "loss": 2.2117, + "step": 19467 + }, + { + "epoch": 0.823589136136729, + "grad_norm": 0.12800917029380798, + "learning_rate": 0.001, + "loss": 2.3386, + "step": 19468 + }, + { + "epoch": 0.8236314409002453, + "grad_norm": 0.15164655447006226, + "learning_rate": 0.001, + "loss": 1.9978, + "step": 19469 + }, + { + "epoch": 0.8236737456637617, + "grad_norm": 0.18457886576652527, + "learning_rate": 0.001, + "loss": 1.6669, + "step": 19470 + }, + { + "epoch": 0.8237160504272781, + "grad_norm": 0.3100104331970215, + "learning_rate": 0.001, + "loss": 1.9054, + "step": 19471 + }, + { + "epoch": 0.8237583551907944, + "grad_norm": 0.16601592302322388, + "learning_rate": 0.001, + "loss": 2.0019, + "step": 19472 + }, + { + "epoch": 0.8238006599543108, + "grad_norm": 0.25218597054481506, + "learning_rate": 0.001, + "loss": 2.5409, + "step": 19473 + }, + { + "epoch": 0.8238429647178273, + "grad_norm": 0.15414927899837494, + "learning_rate": 0.001, + "loss": 2.0852, + "step": 19474 + }, + { + "epoch": 0.8238852694813436, + "grad_norm": 0.1316404789686203, + "learning_rate": 0.001, + "loss": 1.8446, + "step": 19475 + }, + { + "epoch": 0.82392757424486, + "grad_norm": 0.19027061760425568, + "learning_rate": 0.001, + "loss": 2.2132, + "step": 19476 + }, + { + "epoch": 0.8239698790083764, + "grad_norm": 1.3380327224731445, + "learning_rate": 0.001, + "loss": 1.9915, + "step": 19477 + }, + { + "epoch": 0.8240121837718927, + "grad_norm": 0.14829513430595398, + "learning_rate": 0.001, + "loss": 2.1132, + "step": 19478 + }, + { + "epoch": 0.8240544885354091, + "grad_norm": 0.18493659794330597, + "learning_rate": 0.001, + "loss": 2.2199, + "step": 19479 + }, + { + "epoch": 0.8240967932989255, + "grad_norm": 0.13995139300823212, + "learning_rate": 0.001, + "loss": 1.894, + "step": 19480 + }, + { + "epoch": 0.8241390980624418, + "grad_norm": 0.6962743401527405, + "learning_rate": 0.001, + "loss": 2.9984, + "step": 19481 + }, + { + "epoch": 0.8241814028259582, + "grad_norm": 0.17796729505062103, + "learning_rate": 0.001, + "loss": 2.2849, + "step": 19482 + }, + { + "epoch": 0.8242237075894746, + "grad_norm": 0.5168524980545044, + "learning_rate": 0.001, + "loss": 3.2221, + "step": 19483 + }, + { + "epoch": 0.8242660123529909, + "grad_norm": 0.13838616013526917, + "learning_rate": 0.001, + "loss": 1.861, + "step": 19484 + }, + { + "epoch": 0.8243083171165073, + "grad_norm": 1.7203575372695923, + "learning_rate": 0.001, + "loss": 1.9319, + "step": 19485 + }, + { + "epoch": 0.8243506218800237, + "grad_norm": 0.13858763873577118, + "learning_rate": 0.001, + "loss": 2.8774, + "step": 19486 + }, + { + "epoch": 0.82439292664354, + "grad_norm": 0.15123365819454193, + "learning_rate": 0.001, + "loss": 2.2039, + "step": 19487 + }, + { + "epoch": 0.8244352314070564, + "grad_norm": 0.46859481930732727, + "learning_rate": 0.001, + "loss": 2.0706, + "step": 19488 + }, + { + "epoch": 0.8244775361705728, + "grad_norm": 0.16294020414352417, + "learning_rate": 0.001, + "loss": 1.4793, + "step": 19489 + }, + { + "epoch": 0.8245198409340891, + "grad_norm": 0.13173837959766388, + "learning_rate": 0.001, + "loss": 1.8825, + "step": 19490 + }, + { + "epoch": 0.8245621456976056, + "grad_norm": 0.1741243302822113, + "learning_rate": 0.001, + "loss": 2.136, + "step": 19491 + }, + { + "epoch": 0.824604450461122, + "grad_norm": 0.16755490005016327, + "learning_rate": 0.001, + "loss": 1.7519, + "step": 19492 + }, + { + "epoch": 0.8246467552246383, + "grad_norm": 0.17870347201824188, + "learning_rate": 0.001, + "loss": 2.4803, + "step": 19493 + }, + { + "epoch": 0.8246890599881547, + "grad_norm": 0.15022897720336914, + "learning_rate": 0.001, + "loss": 1.5368, + "step": 19494 + }, + { + "epoch": 0.8247313647516711, + "grad_norm": 2.018383026123047, + "learning_rate": 0.001, + "loss": 1.3298, + "step": 19495 + }, + { + "epoch": 0.8247736695151874, + "grad_norm": 0.15447530150413513, + "learning_rate": 0.001, + "loss": 1.6869, + "step": 19496 + }, + { + "epoch": 0.8248159742787038, + "grad_norm": 6.437837600708008, + "learning_rate": 0.001, + "loss": 2.0111, + "step": 19497 + }, + { + "epoch": 0.8248582790422202, + "grad_norm": 0.14872145652770996, + "learning_rate": 0.001, + "loss": 2.4922, + "step": 19498 + }, + { + "epoch": 0.8249005838057365, + "grad_norm": 0.15467225015163422, + "learning_rate": 0.001, + "loss": 2.5399, + "step": 19499 + }, + { + "epoch": 0.8249428885692529, + "grad_norm": 0.16929113864898682, + "learning_rate": 0.001, + "loss": 1.8642, + "step": 19500 + }, + { + "epoch": 0.8249851933327693, + "grad_norm": 0.14469696581363678, + "learning_rate": 0.001, + "loss": 2.8897, + "step": 19501 + }, + { + "epoch": 0.8250274980962856, + "grad_norm": 0.1643838882446289, + "learning_rate": 0.001, + "loss": 2.0871, + "step": 19502 + }, + { + "epoch": 0.825069802859802, + "grad_norm": 0.14725084602832794, + "learning_rate": 0.001, + "loss": 1.7524, + "step": 19503 + }, + { + "epoch": 0.8251121076233184, + "grad_norm": 0.14372789859771729, + "learning_rate": 0.001, + "loss": 3.4166, + "step": 19504 + }, + { + "epoch": 0.8251544123868347, + "grad_norm": 0.155779629945755, + "learning_rate": 0.001, + "loss": 1.9682, + "step": 19505 + }, + { + "epoch": 0.8251967171503511, + "grad_norm": 1.8167226314544678, + "learning_rate": 0.001, + "loss": 2.1802, + "step": 19506 + }, + { + "epoch": 0.8252390219138676, + "grad_norm": 0.4720889925956726, + "learning_rate": 0.001, + "loss": 1.7518, + "step": 19507 + }, + { + "epoch": 0.8252813266773839, + "grad_norm": 1.381333351135254, + "learning_rate": 0.001, + "loss": 2.3623, + "step": 19508 + }, + { + "epoch": 0.8253236314409003, + "grad_norm": 0.3292122781276703, + "learning_rate": 0.001, + "loss": 1.4812, + "step": 19509 + }, + { + "epoch": 0.8253659362044167, + "grad_norm": 0.16913935542106628, + "learning_rate": 0.001, + "loss": 1.9194, + "step": 19510 + }, + { + "epoch": 0.825408240967933, + "grad_norm": 0.20442567765712738, + "learning_rate": 0.001, + "loss": 2.7706, + "step": 19511 + }, + { + "epoch": 0.8254505457314494, + "grad_norm": 0.12844796478748322, + "learning_rate": 0.001, + "loss": 1.7295, + "step": 19512 + }, + { + "epoch": 0.8254928504949657, + "grad_norm": 0.6568970084190369, + "learning_rate": 0.001, + "loss": 2.2554, + "step": 19513 + }, + { + "epoch": 0.8255351552584821, + "grad_norm": 0.3706272542476654, + "learning_rate": 0.001, + "loss": 1.5584, + "step": 19514 + }, + { + "epoch": 0.8255774600219985, + "grad_norm": 18.31789207458496, + "learning_rate": 0.001, + "loss": 2.2803, + "step": 19515 + }, + { + "epoch": 0.8256197647855148, + "grad_norm": 0.15570005774497986, + "learning_rate": 0.001, + "loss": 2.3987, + "step": 19516 + }, + { + "epoch": 0.8256620695490312, + "grad_norm": 0.1890856921672821, + "learning_rate": 0.001, + "loss": 2.9565, + "step": 19517 + }, + { + "epoch": 0.8257043743125476, + "grad_norm": 0.17735014855861664, + "learning_rate": 0.001, + "loss": 1.7442, + "step": 19518 + }, + { + "epoch": 0.8257466790760639, + "grad_norm": 0.5632053017616272, + "learning_rate": 0.001, + "loss": 3.1095, + "step": 19519 + }, + { + "epoch": 0.8257889838395803, + "grad_norm": 0.1494271159172058, + "learning_rate": 0.001, + "loss": 1.583, + "step": 19520 + }, + { + "epoch": 0.8258312886030967, + "grad_norm": 3.3606419563293457, + "learning_rate": 0.001, + "loss": 2.7092, + "step": 19521 + }, + { + "epoch": 0.825873593366613, + "grad_norm": 0.162941113114357, + "learning_rate": 0.001, + "loss": 2.5138, + "step": 19522 + }, + { + "epoch": 0.8259158981301294, + "grad_norm": 0.18008768558502197, + "learning_rate": 0.001, + "loss": 1.5397, + "step": 19523 + }, + { + "epoch": 0.8259582028936459, + "grad_norm": 0.2005215436220169, + "learning_rate": 0.001, + "loss": 2.3149, + "step": 19524 + }, + { + "epoch": 0.8260005076571622, + "grad_norm": 21.058948516845703, + "learning_rate": 0.001, + "loss": 1.64, + "step": 19525 + }, + { + "epoch": 0.8260428124206786, + "grad_norm": 0.18747086822986603, + "learning_rate": 0.001, + "loss": 1.8543, + "step": 19526 + }, + { + "epoch": 0.826085117184195, + "grad_norm": 0.16638407111167908, + "learning_rate": 0.001, + "loss": 3.0271, + "step": 19527 + }, + { + "epoch": 0.8261274219477113, + "grad_norm": 0.26135191321372986, + "learning_rate": 0.001, + "loss": 1.9732, + "step": 19528 + }, + { + "epoch": 0.8261697267112277, + "grad_norm": 0.906550943851471, + "learning_rate": 0.001, + "loss": 1.9329, + "step": 19529 + }, + { + "epoch": 0.8262120314747441, + "grad_norm": 0.19989506900310516, + "learning_rate": 0.001, + "loss": 2.3301, + "step": 19530 + }, + { + "epoch": 0.8262543362382604, + "grad_norm": 0.1638246327638626, + "learning_rate": 0.001, + "loss": 2.0817, + "step": 19531 + }, + { + "epoch": 0.8262966410017768, + "grad_norm": 0.17549832165241241, + "learning_rate": 0.001, + "loss": 3.5144, + "step": 19532 + }, + { + "epoch": 0.8263389457652932, + "grad_norm": 0.18301671743392944, + "learning_rate": 0.001, + "loss": 2.1419, + "step": 19533 + }, + { + "epoch": 0.8263812505288095, + "grad_norm": 0.17851994931697845, + "learning_rate": 0.001, + "loss": 2.0738, + "step": 19534 + }, + { + "epoch": 0.8264235552923259, + "grad_norm": 0.18530461192131042, + "learning_rate": 0.001, + "loss": 1.9832, + "step": 19535 + }, + { + "epoch": 0.8264658600558423, + "grad_norm": 1.8142375946044922, + "learning_rate": 0.001, + "loss": 2.8591, + "step": 19536 + }, + { + "epoch": 0.8265081648193586, + "grad_norm": 0.15847790241241455, + "learning_rate": 0.001, + "loss": 1.9201, + "step": 19537 + }, + { + "epoch": 0.826550469582875, + "grad_norm": 0.8735809922218323, + "learning_rate": 0.001, + "loss": 3.1536, + "step": 19538 + }, + { + "epoch": 0.8265927743463914, + "grad_norm": 0.18533921241760254, + "learning_rate": 0.001, + "loss": 2.4057, + "step": 19539 + }, + { + "epoch": 0.8266350791099077, + "grad_norm": 2.2356624603271484, + "learning_rate": 0.001, + "loss": 2.8036, + "step": 19540 + }, + { + "epoch": 0.8266773838734242, + "grad_norm": 0.1672806292772293, + "learning_rate": 0.001, + "loss": 2.093, + "step": 19541 + }, + { + "epoch": 0.8267196886369406, + "grad_norm": 0.26861119270324707, + "learning_rate": 0.001, + "loss": 2.5095, + "step": 19542 + }, + { + "epoch": 0.8267619934004569, + "grad_norm": 0.1656004935503006, + "learning_rate": 0.001, + "loss": 2.269, + "step": 19543 + }, + { + "epoch": 0.8268042981639733, + "grad_norm": 0.1717766374349594, + "learning_rate": 0.001, + "loss": 1.9956, + "step": 19544 + }, + { + "epoch": 0.8268466029274897, + "grad_norm": 0.21453844010829926, + "learning_rate": 0.001, + "loss": 3.3933, + "step": 19545 + }, + { + "epoch": 0.826888907691006, + "grad_norm": 0.15061205625534058, + "learning_rate": 0.001, + "loss": 2.3823, + "step": 19546 + }, + { + "epoch": 0.8269312124545224, + "grad_norm": 0.20553554594516754, + "learning_rate": 0.001, + "loss": 2.4435, + "step": 19547 + }, + { + "epoch": 0.8269735172180388, + "grad_norm": 0.41001060605049133, + "learning_rate": 0.001, + "loss": 2.516, + "step": 19548 + }, + { + "epoch": 0.8270158219815551, + "grad_norm": 0.16718702018260956, + "learning_rate": 0.001, + "loss": 2.3812, + "step": 19549 + }, + { + "epoch": 0.8270581267450715, + "grad_norm": 0.13911297917366028, + "learning_rate": 0.001, + "loss": 1.7068, + "step": 19550 + }, + { + "epoch": 0.8271004315085879, + "grad_norm": 4.676766872406006, + "learning_rate": 0.001, + "loss": 3.0646, + "step": 19551 + }, + { + "epoch": 0.8271427362721042, + "grad_norm": 0.1728333830833435, + "learning_rate": 0.001, + "loss": 1.8783, + "step": 19552 + }, + { + "epoch": 0.8271850410356206, + "grad_norm": 0.1679517775774002, + "learning_rate": 0.001, + "loss": 1.7856, + "step": 19553 + }, + { + "epoch": 0.827227345799137, + "grad_norm": 0.1749308705329895, + "learning_rate": 0.001, + "loss": 1.9916, + "step": 19554 + }, + { + "epoch": 0.8272696505626533, + "grad_norm": 0.6144496202468872, + "learning_rate": 0.001, + "loss": 2.504, + "step": 19555 + }, + { + "epoch": 0.8273119553261697, + "grad_norm": 0.398067444562912, + "learning_rate": 0.001, + "loss": 2.041, + "step": 19556 + }, + { + "epoch": 0.827354260089686, + "grad_norm": 0.15144911408424377, + "learning_rate": 0.001, + "loss": 2.1569, + "step": 19557 + }, + { + "epoch": 0.8273965648532025, + "grad_norm": 0.1478147804737091, + "learning_rate": 0.001, + "loss": 2.3869, + "step": 19558 + }, + { + "epoch": 0.8274388696167189, + "grad_norm": 0.17387191951274872, + "learning_rate": 0.001, + "loss": 3.1704, + "step": 19559 + }, + { + "epoch": 0.8274811743802352, + "grad_norm": 0.7300922870635986, + "learning_rate": 0.001, + "loss": 2.1563, + "step": 19560 + }, + { + "epoch": 0.8275234791437516, + "grad_norm": 0.17235367000102997, + "learning_rate": 0.001, + "loss": 1.8931, + "step": 19561 + }, + { + "epoch": 0.827565783907268, + "grad_norm": 0.21332013607025146, + "learning_rate": 0.001, + "loss": 2.7753, + "step": 19562 + }, + { + "epoch": 0.8276080886707843, + "grad_norm": 0.18801867961883545, + "learning_rate": 0.001, + "loss": 1.9607, + "step": 19563 + }, + { + "epoch": 0.8276503934343007, + "grad_norm": 0.15900994837284088, + "learning_rate": 0.001, + "loss": 2.6224, + "step": 19564 + }, + { + "epoch": 0.8276926981978171, + "grad_norm": 0.1822887808084488, + "learning_rate": 0.001, + "loss": 2.2022, + "step": 19565 + }, + { + "epoch": 0.8277350029613334, + "grad_norm": 0.1728331744670868, + "learning_rate": 0.001, + "loss": 1.7164, + "step": 19566 + }, + { + "epoch": 0.8277773077248498, + "grad_norm": 0.18861548602581024, + "learning_rate": 0.001, + "loss": 2.6136, + "step": 19567 + }, + { + "epoch": 0.8278196124883662, + "grad_norm": 4.526989459991455, + "learning_rate": 0.001, + "loss": 2.0578, + "step": 19568 + }, + { + "epoch": 0.8278619172518825, + "grad_norm": 1.0483940839767456, + "learning_rate": 0.001, + "loss": 2.2614, + "step": 19569 + }, + { + "epoch": 0.8279042220153989, + "grad_norm": 6.750152587890625, + "learning_rate": 0.001, + "loss": 1.6585, + "step": 19570 + }, + { + "epoch": 0.8279465267789153, + "grad_norm": 0.15433065593242645, + "learning_rate": 0.001, + "loss": 1.8853, + "step": 19571 + }, + { + "epoch": 0.8279888315424316, + "grad_norm": 7.699106693267822, + "learning_rate": 0.001, + "loss": 1.9598, + "step": 19572 + }, + { + "epoch": 0.828031136305948, + "grad_norm": 0.1841927170753479, + "learning_rate": 0.001, + "loss": 1.8607, + "step": 19573 + }, + { + "epoch": 0.8280734410694645, + "grad_norm": 0.16091950237751007, + "learning_rate": 0.001, + "loss": 2.6873, + "step": 19574 + }, + { + "epoch": 0.8281157458329808, + "grad_norm": 0.188730388879776, + "learning_rate": 0.001, + "loss": 2.4136, + "step": 19575 + }, + { + "epoch": 0.8281580505964972, + "grad_norm": 0.18793466687202454, + "learning_rate": 0.001, + "loss": 2.9059, + "step": 19576 + }, + { + "epoch": 0.8282003553600136, + "grad_norm": 10.438246726989746, + "learning_rate": 0.001, + "loss": 2.4116, + "step": 19577 + }, + { + "epoch": 0.8282426601235299, + "grad_norm": 1.2651994228363037, + "learning_rate": 0.001, + "loss": 2.6492, + "step": 19578 + }, + { + "epoch": 0.8282849648870463, + "grad_norm": 0.23723463714122772, + "learning_rate": 0.001, + "loss": 1.6084, + "step": 19579 + }, + { + "epoch": 0.8283272696505627, + "grad_norm": 0.18241967260837555, + "learning_rate": 0.001, + "loss": 2.2461, + "step": 19580 + }, + { + "epoch": 0.828369574414079, + "grad_norm": 0.1818530410528183, + "learning_rate": 0.001, + "loss": 2.7537, + "step": 19581 + }, + { + "epoch": 0.8284118791775954, + "grad_norm": 0.20046694576740265, + "learning_rate": 0.001, + "loss": 1.8172, + "step": 19582 + }, + { + "epoch": 0.8284541839411118, + "grad_norm": 0.20764821767807007, + "learning_rate": 0.001, + "loss": 1.5514, + "step": 19583 + }, + { + "epoch": 0.8284964887046281, + "grad_norm": 0.19038403034210205, + "learning_rate": 0.001, + "loss": 2.4937, + "step": 19584 + }, + { + "epoch": 0.8285387934681445, + "grad_norm": 13.164509773254395, + "learning_rate": 0.001, + "loss": 2.2863, + "step": 19585 + }, + { + "epoch": 0.8285810982316609, + "grad_norm": 0.18057294189929962, + "learning_rate": 0.001, + "loss": 2.6837, + "step": 19586 + }, + { + "epoch": 0.8286234029951772, + "grad_norm": 0.20905058085918427, + "learning_rate": 0.001, + "loss": 2.1163, + "step": 19587 + }, + { + "epoch": 0.8286657077586936, + "grad_norm": 0.23860158026218414, + "learning_rate": 0.001, + "loss": 2.5112, + "step": 19588 + }, + { + "epoch": 0.82870801252221, + "grad_norm": 0.1781463325023651, + "learning_rate": 0.001, + "loss": 3.0306, + "step": 19589 + }, + { + "epoch": 0.8287503172857263, + "grad_norm": 0.7263326644897461, + "learning_rate": 0.001, + "loss": 1.8875, + "step": 19590 + }, + { + "epoch": 0.8287926220492428, + "grad_norm": 0.8033844232559204, + "learning_rate": 0.001, + "loss": 2.5529, + "step": 19591 + }, + { + "epoch": 0.8288349268127592, + "grad_norm": 0.18857400119304657, + "learning_rate": 0.001, + "loss": 2.7359, + "step": 19592 + }, + { + "epoch": 0.8288772315762755, + "grad_norm": 0.16333748400211334, + "learning_rate": 0.001, + "loss": 2.0609, + "step": 19593 + }, + { + "epoch": 0.8289195363397919, + "grad_norm": 0.19050641357898712, + "learning_rate": 0.001, + "loss": 1.7926, + "step": 19594 + }, + { + "epoch": 0.8289618411033083, + "grad_norm": 0.1565246731042862, + "learning_rate": 0.001, + "loss": 2.0922, + "step": 19595 + }, + { + "epoch": 0.8290041458668246, + "grad_norm": 0.13431625068187714, + "learning_rate": 0.001, + "loss": 1.9739, + "step": 19596 + }, + { + "epoch": 0.829046450630341, + "grad_norm": 4.049526214599609, + "learning_rate": 0.001, + "loss": 3.013, + "step": 19597 + }, + { + "epoch": 0.8290887553938574, + "grad_norm": 0.15808945894241333, + "learning_rate": 0.001, + "loss": 2.2172, + "step": 19598 + }, + { + "epoch": 0.8291310601573737, + "grad_norm": 0.15318995714187622, + "learning_rate": 0.001, + "loss": 2.4165, + "step": 19599 + }, + { + "epoch": 0.8291733649208901, + "grad_norm": 0.16532470285892487, + "learning_rate": 0.001, + "loss": 1.7708, + "step": 19600 + }, + { + "epoch": 0.8292156696844064, + "grad_norm": 0.21278615295886993, + "learning_rate": 0.001, + "loss": 1.9634, + "step": 19601 + }, + { + "epoch": 0.8292579744479228, + "grad_norm": 0.16973280906677246, + "learning_rate": 0.001, + "loss": 2.0971, + "step": 19602 + }, + { + "epoch": 0.8293002792114392, + "grad_norm": 0.191205233335495, + "learning_rate": 0.001, + "loss": 2.2638, + "step": 19603 + }, + { + "epoch": 0.8293425839749555, + "grad_norm": 0.18113046884536743, + "learning_rate": 0.001, + "loss": 1.898, + "step": 19604 + }, + { + "epoch": 0.8293848887384719, + "grad_norm": 7.267598628997803, + "learning_rate": 0.001, + "loss": 1.8977, + "step": 19605 + }, + { + "epoch": 0.8294271935019883, + "grad_norm": 0.16045303642749786, + "learning_rate": 0.001, + "loss": 1.5686, + "step": 19606 + }, + { + "epoch": 0.8294694982655046, + "grad_norm": 0.15106256306171417, + "learning_rate": 0.001, + "loss": 2.2489, + "step": 19607 + }, + { + "epoch": 0.829511803029021, + "grad_norm": 0.16756461560726166, + "learning_rate": 0.001, + "loss": 1.7174, + "step": 19608 + }, + { + "epoch": 0.8295541077925375, + "grad_norm": 0.6214344501495361, + "learning_rate": 0.001, + "loss": 2.2515, + "step": 19609 + }, + { + "epoch": 0.8295964125560538, + "grad_norm": 0.1721925288438797, + "learning_rate": 0.001, + "loss": 1.7405, + "step": 19610 + }, + { + "epoch": 0.8296387173195702, + "grad_norm": 0.1743050515651703, + "learning_rate": 0.001, + "loss": 1.6597, + "step": 19611 + }, + { + "epoch": 0.8296810220830866, + "grad_norm": 0.5375502109527588, + "learning_rate": 0.001, + "loss": 2.6013, + "step": 19612 + }, + { + "epoch": 0.8297233268466029, + "grad_norm": 2.799589157104492, + "learning_rate": 0.001, + "loss": 1.9554, + "step": 19613 + }, + { + "epoch": 0.8297656316101193, + "grad_norm": 0.17461428046226501, + "learning_rate": 0.001, + "loss": 1.6489, + "step": 19614 + }, + { + "epoch": 0.8298079363736357, + "grad_norm": 0.16564109921455383, + "learning_rate": 0.001, + "loss": 1.7323, + "step": 19615 + }, + { + "epoch": 0.829850241137152, + "grad_norm": 0.2872896194458008, + "learning_rate": 0.001, + "loss": 2.0232, + "step": 19616 + }, + { + "epoch": 0.8298925459006684, + "grad_norm": 0.13868144154548645, + "learning_rate": 0.001, + "loss": 1.6235, + "step": 19617 + }, + { + "epoch": 0.8299348506641848, + "grad_norm": 0.17989665269851685, + "learning_rate": 0.001, + "loss": 1.8112, + "step": 19618 + }, + { + "epoch": 0.8299771554277011, + "grad_norm": 0.42621657252311707, + "learning_rate": 0.001, + "loss": 2.8575, + "step": 19619 + }, + { + "epoch": 0.8300194601912175, + "grad_norm": 0.24013011157512665, + "learning_rate": 0.001, + "loss": 2.0607, + "step": 19620 + }, + { + "epoch": 0.8300617649547339, + "grad_norm": 0.22159551084041595, + "learning_rate": 0.001, + "loss": 2.8096, + "step": 19621 + }, + { + "epoch": 0.8301040697182502, + "grad_norm": 0.5154104232788086, + "learning_rate": 0.001, + "loss": 2.2271, + "step": 19622 + }, + { + "epoch": 0.8301463744817666, + "grad_norm": 0.9761685729026794, + "learning_rate": 0.001, + "loss": 2.3908, + "step": 19623 + }, + { + "epoch": 0.8301886792452831, + "grad_norm": 0.36297187209129333, + "learning_rate": 0.001, + "loss": 3.5581, + "step": 19624 + }, + { + "epoch": 0.8302309840087994, + "grad_norm": 0.15795831382274628, + "learning_rate": 0.001, + "loss": 1.8869, + "step": 19625 + }, + { + "epoch": 0.8302732887723158, + "grad_norm": 0.19538545608520508, + "learning_rate": 0.001, + "loss": 2.2421, + "step": 19626 + }, + { + "epoch": 0.8303155935358322, + "grad_norm": 0.1804639995098114, + "learning_rate": 0.001, + "loss": 2.4326, + "step": 19627 + }, + { + "epoch": 0.8303578982993485, + "grad_norm": 0.14908581972122192, + "learning_rate": 0.001, + "loss": 2.9382, + "step": 19628 + }, + { + "epoch": 0.8304002030628649, + "grad_norm": 0.21959789097309113, + "learning_rate": 0.001, + "loss": 2.003, + "step": 19629 + }, + { + "epoch": 0.8304425078263813, + "grad_norm": 0.5916885733604431, + "learning_rate": 0.001, + "loss": 3.7824, + "step": 19630 + }, + { + "epoch": 0.8304848125898976, + "grad_norm": 0.1725091189146042, + "learning_rate": 0.001, + "loss": 1.8455, + "step": 19631 + }, + { + "epoch": 0.830527117353414, + "grad_norm": 1.093875765800476, + "learning_rate": 0.001, + "loss": 1.9038, + "step": 19632 + }, + { + "epoch": 0.8305694221169304, + "grad_norm": 0.1586592197418213, + "learning_rate": 0.001, + "loss": 1.7105, + "step": 19633 + }, + { + "epoch": 0.8306117268804467, + "grad_norm": 0.1464533805847168, + "learning_rate": 0.001, + "loss": 1.5658, + "step": 19634 + }, + { + "epoch": 0.8306540316439631, + "grad_norm": 0.2901553213596344, + "learning_rate": 0.001, + "loss": 1.9834, + "step": 19635 + }, + { + "epoch": 0.8306963364074795, + "grad_norm": 0.2217240035533905, + "learning_rate": 0.001, + "loss": 2.5072, + "step": 19636 + }, + { + "epoch": 0.8307386411709958, + "grad_norm": 0.24927441775798798, + "learning_rate": 0.001, + "loss": 1.8134, + "step": 19637 + }, + { + "epoch": 0.8307809459345122, + "grad_norm": 0.1451728343963623, + "learning_rate": 0.001, + "loss": 2.3247, + "step": 19638 + }, + { + "epoch": 0.8308232506980286, + "grad_norm": 0.1595974713563919, + "learning_rate": 0.001, + "loss": 1.5477, + "step": 19639 + }, + { + "epoch": 0.830865555461545, + "grad_norm": 0.4610111117362976, + "learning_rate": 0.001, + "loss": 1.5437, + "step": 19640 + }, + { + "epoch": 0.8309078602250614, + "grad_norm": 0.1525607854127884, + "learning_rate": 0.001, + "loss": 2.1338, + "step": 19641 + }, + { + "epoch": 0.8309501649885778, + "grad_norm": 0.16522721946239471, + "learning_rate": 0.001, + "loss": 3.4636, + "step": 19642 + }, + { + "epoch": 0.8309924697520941, + "grad_norm": 0.4826166033744812, + "learning_rate": 0.001, + "loss": 1.8839, + "step": 19643 + }, + { + "epoch": 0.8310347745156105, + "grad_norm": 0.15211009979248047, + "learning_rate": 0.001, + "loss": 1.7515, + "step": 19644 + }, + { + "epoch": 0.8310770792791269, + "grad_norm": 0.13759112358093262, + "learning_rate": 0.001, + "loss": 1.6183, + "step": 19645 + }, + { + "epoch": 0.8311193840426432, + "grad_norm": 0.1878431886434555, + "learning_rate": 0.001, + "loss": 1.7729, + "step": 19646 + }, + { + "epoch": 0.8311616888061596, + "grad_norm": 0.5903341174125671, + "learning_rate": 0.001, + "loss": 2.8315, + "step": 19647 + }, + { + "epoch": 0.8312039935696759, + "grad_norm": 0.16062727570533752, + "learning_rate": 0.001, + "loss": 2.2591, + "step": 19648 + }, + { + "epoch": 0.8312462983331923, + "grad_norm": 0.1513572633266449, + "learning_rate": 0.001, + "loss": 1.9523, + "step": 19649 + }, + { + "epoch": 0.8312886030967087, + "grad_norm": 3.5382885932922363, + "learning_rate": 0.001, + "loss": 1.8145, + "step": 19650 + }, + { + "epoch": 0.831330907860225, + "grad_norm": 0.6378043293952942, + "learning_rate": 0.001, + "loss": 2.7972, + "step": 19651 + }, + { + "epoch": 0.8313732126237414, + "grad_norm": 0.5603189468383789, + "learning_rate": 0.001, + "loss": 1.9918, + "step": 19652 + }, + { + "epoch": 0.8314155173872578, + "grad_norm": 0.7151978611946106, + "learning_rate": 0.001, + "loss": 1.8135, + "step": 19653 + }, + { + "epoch": 0.8314578221507741, + "grad_norm": 0.14867694675922394, + "learning_rate": 0.001, + "loss": 1.9327, + "step": 19654 + }, + { + "epoch": 0.8315001269142905, + "grad_norm": 0.15285193920135498, + "learning_rate": 0.001, + "loss": 1.8189, + "step": 19655 + }, + { + "epoch": 0.831542431677807, + "grad_norm": 0.26532498002052307, + "learning_rate": 0.001, + "loss": 2.4956, + "step": 19656 + }, + { + "epoch": 0.8315847364413232, + "grad_norm": 0.18189120292663574, + "learning_rate": 0.001, + "loss": 2.8454, + "step": 19657 + }, + { + "epoch": 0.8316270412048397, + "grad_norm": 0.335979700088501, + "learning_rate": 0.001, + "loss": 1.7169, + "step": 19658 + }, + { + "epoch": 0.8316693459683561, + "grad_norm": 1.2381075620651245, + "learning_rate": 0.001, + "loss": 1.5355, + "step": 19659 + }, + { + "epoch": 0.8317116507318724, + "grad_norm": 0.18235164880752563, + "learning_rate": 0.001, + "loss": 2.2521, + "step": 19660 + }, + { + "epoch": 0.8317539554953888, + "grad_norm": 0.5001907348632812, + "learning_rate": 0.001, + "loss": 2.8203, + "step": 19661 + }, + { + "epoch": 0.8317962602589052, + "grad_norm": 0.13656170666217804, + "learning_rate": 0.001, + "loss": 2.2657, + "step": 19662 + }, + { + "epoch": 0.8318385650224215, + "grad_norm": 0.1643732637166977, + "learning_rate": 0.001, + "loss": 1.7424, + "step": 19663 + }, + { + "epoch": 0.8318808697859379, + "grad_norm": 0.15448181331157684, + "learning_rate": 0.001, + "loss": 1.4355, + "step": 19664 + }, + { + "epoch": 0.8319231745494543, + "grad_norm": 2.5105960369110107, + "learning_rate": 0.001, + "loss": 1.913, + "step": 19665 + }, + { + "epoch": 0.8319654793129706, + "grad_norm": 0.16451245546340942, + "learning_rate": 0.001, + "loss": 2.2253, + "step": 19666 + }, + { + "epoch": 0.832007784076487, + "grad_norm": 0.15725699067115784, + "learning_rate": 0.001, + "loss": 1.9037, + "step": 19667 + }, + { + "epoch": 0.8320500888400034, + "grad_norm": 0.32337695360183716, + "learning_rate": 0.001, + "loss": 2.0358, + "step": 19668 + }, + { + "epoch": 0.8320923936035197, + "grad_norm": 0.21804969012737274, + "learning_rate": 0.001, + "loss": 1.8434, + "step": 19669 + }, + { + "epoch": 0.8321346983670361, + "grad_norm": 5.778602123260498, + "learning_rate": 0.001, + "loss": 1.8731, + "step": 19670 + }, + { + "epoch": 0.8321770031305525, + "grad_norm": 0.21234005689620972, + "learning_rate": 0.001, + "loss": 1.9701, + "step": 19671 + }, + { + "epoch": 0.8322193078940688, + "grad_norm": 2.040515661239624, + "learning_rate": 0.001, + "loss": 3.1024, + "step": 19672 + }, + { + "epoch": 0.8322616126575852, + "grad_norm": 0.14439794421195984, + "learning_rate": 0.001, + "loss": 1.4982, + "step": 19673 + }, + { + "epoch": 0.8323039174211017, + "grad_norm": 0.14553527534008026, + "learning_rate": 0.001, + "loss": 2.5106, + "step": 19674 + }, + { + "epoch": 0.832346222184618, + "grad_norm": 0.1754305213689804, + "learning_rate": 0.001, + "loss": 2.5787, + "step": 19675 + }, + { + "epoch": 0.8323885269481344, + "grad_norm": 0.13530834019184113, + "learning_rate": 0.001, + "loss": 1.6652, + "step": 19676 + }, + { + "epoch": 0.8324308317116508, + "grad_norm": 16.148717880249023, + "learning_rate": 0.001, + "loss": 1.9761, + "step": 19677 + }, + { + "epoch": 0.8324731364751671, + "grad_norm": 0.16768808662891388, + "learning_rate": 0.001, + "loss": 2.5138, + "step": 19678 + }, + { + "epoch": 0.8325154412386835, + "grad_norm": 0.16565167903900146, + "learning_rate": 0.001, + "loss": 1.8295, + "step": 19679 + }, + { + "epoch": 0.8325577460021999, + "grad_norm": 0.24499014019966125, + "learning_rate": 0.001, + "loss": 2.4988, + "step": 19680 + }, + { + "epoch": 0.8326000507657162, + "grad_norm": 0.1661049723625183, + "learning_rate": 0.001, + "loss": 2.2289, + "step": 19681 + }, + { + "epoch": 0.8326423555292326, + "grad_norm": 0.16528615355491638, + "learning_rate": 0.001, + "loss": 1.5157, + "step": 19682 + }, + { + "epoch": 0.832684660292749, + "grad_norm": 0.1665087193250656, + "learning_rate": 0.001, + "loss": 2.2805, + "step": 19683 + }, + { + "epoch": 0.8327269650562653, + "grad_norm": 0.1810268610715866, + "learning_rate": 0.001, + "loss": 2.3202, + "step": 19684 + }, + { + "epoch": 0.8327692698197817, + "grad_norm": 0.15981633961200714, + "learning_rate": 0.001, + "loss": 1.511, + "step": 19685 + }, + { + "epoch": 0.8328115745832981, + "grad_norm": 0.43922242522239685, + "learning_rate": 0.001, + "loss": 3.167, + "step": 19686 + }, + { + "epoch": 0.8328538793468144, + "grad_norm": 0.16174915432929993, + "learning_rate": 0.001, + "loss": 1.8022, + "step": 19687 + }, + { + "epoch": 0.8328961841103308, + "grad_norm": 0.12966187298297882, + "learning_rate": 0.001, + "loss": 2.7875, + "step": 19688 + }, + { + "epoch": 0.8329384888738472, + "grad_norm": 0.17948752641677856, + "learning_rate": 0.001, + "loss": 1.7173, + "step": 19689 + }, + { + "epoch": 0.8329807936373635, + "grad_norm": 0.17646683752536774, + "learning_rate": 0.001, + "loss": 1.9492, + "step": 19690 + }, + { + "epoch": 0.83302309840088, + "grad_norm": 0.17098137736320496, + "learning_rate": 0.001, + "loss": 1.5657, + "step": 19691 + }, + { + "epoch": 0.8330654031643963, + "grad_norm": 7.42794942855835, + "learning_rate": 0.001, + "loss": 1.7042, + "step": 19692 + }, + { + "epoch": 0.8331077079279127, + "grad_norm": 0.15731237828731537, + "learning_rate": 0.001, + "loss": 2.4516, + "step": 19693 + }, + { + "epoch": 0.8331500126914291, + "grad_norm": 0.16267119348049164, + "learning_rate": 0.001, + "loss": 1.8393, + "step": 19694 + }, + { + "epoch": 0.8331923174549454, + "grad_norm": 0.1380946934223175, + "learning_rate": 0.001, + "loss": 2.3719, + "step": 19695 + }, + { + "epoch": 0.8332346222184618, + "grad_norm": 0.1593260020017624, + "learning_rate": 0.001, + "loss": 1.9913, + "step": 19696 + }, + { + "epoch": 0.8332769269819782, + "grad_norm": 0.19638261198997498, + "learning_rate": 0.001, + "loss": 2.1259, + "step": 19697 + }, + { + "epoch": 0.8333192317454945, + "grad_norm": 0.1739358901977539, + "learning_rate": 0.001, + "loss": 2.602, + "step": 19698 + }, + { + "epoch": 0.8333615365090109, + "grad_norm": 0.28092655539512634, + "learning_rate": 0.001, + "loss": 1.8167, + "step": 19699 + }, + { + "epoch": 0.8334038412725273, + "grad_norm": 0.15453416109085083, + "learning_rate": 0.001, + "loss": 1.8884, + "step": 19700 + }, + { + "epoch": 0.8334461460360436, + "grad_norm": 0.2648554742336273, + "learning_rate": 0.001, + "loss": 3.4063, + "step": 19701 + }, + { + "epoch": 0.83348845079956, + "grad_norm": 0.15070085227489471, + "learning_rate": 0.001, + "loss": 1.9261, + "step": 19702 + }, + { + "epoch": 0.8335307555630764, + "grad_norm": 0.15136472880840302, + "learning_rate": 0.001, + "loss": 1.8056, + "step": 19703 + }, + { + "epoch": 0.8335730603265927, + "grad_norm": 0.17444024980068207, + "learning_rate": 0.001, + "loss": 2.3862, + "step": 19704 + }, + { + "epoch": 0.8336153650901091, + "grad_norm": 0.20091457664966583, + "learning_rate": 0.001, + "loss": 2.6777, + "step": 19705 + }, + { + "epoch": 0.8336576698536255, + "grad_norm": 0.17483146488666534, + "learning_rate": 0.001, + "loss": 3.9762, + "step": 19706 + }, + { + "epoch": 0.8336999746171418, + "grad_norm": 0.14179664850234985, + "learning_rate": 0.001, + "loss": 1.7762, + "step": 19707 + }, + { + "epoch": 0.8337422793806583, + "grad_norm": 0.14632867276668549, + "learning_rate": 0.001, + "loss": 2.4196, + "step": 19708 + }, + { + "epoch": 0.8337845841441747, + "grad_norm": 0.1618465930223465, + "learning_rate": 0.001, + "loss": 2.0228, + "step": 19709 + }, + { + "epoch": 0.833826888907691, + "grad_norm": 0.17605137825012207, + "learning_rate": 0.001, + "loss": 2.255, + "step": 19710 + }, + { + "epoch": 0.8338691936712074, + "grad_norm": 0.1627848893404007, + "learning_rate": 0.001, + "loss": 2.5271, + "step": 19711 + }, + { + "epoch": 0.8339114984347238, + "grad_norm": 0.15712319314479828, + "learning_rate": 0.001, + "loss": 1.6956, + "step": 19712 + }, + { + "epoch": 0.8339538031982401, + "grad_norm": 0.2377215325832367, + "learning_rate": 0.001, + "loss": 2.1427, + "step": 19713 + }, + { + "epoch": 0.8339961079617565, + "grad_norm": 0.14248542487621307, + "learning_rate": 0.001, + "loss": 2.9125, + "step": 19714 + }, + { + "epoch": 0.8340384127252729, + "grad_norm": 0.1505517214536667, + "learning_rate": 0.001, + "loss": 1.4674, + "step": 19715 + }, + { + "epoch": 0.8340807174887892, + "grad_norm": 0.11878804862499237, + "learning_rate": 0.001, + "loss": 1.7173, + "step": 19716 + }, + { + "epoch": 0.8341230222523056, + "grad_norm": 0.1470642238855362, + "learning_rate": 0.001, + "loss": 1.3979, + "step": 19717 + }, + { + "epoch": 0.834165327015822, + "grad_norm": 0.14857931435108185, + "learning_rate": 0.001, + "loss": 2.3536, + "step": 19718 + }, + { + "epoch": 0.8342076317793383, + "grad_norm": 0.14904291927814484, + "learning_rate": 0.001, + "loss": 1.7355, + "step": 19719 + }, + { + "epoch": 0.8342499365428547, + "grad_norm": 4.184340476989746, + "learning_rate": 0.001, + "loss": 2.0502, + "step": 19720 + }, + { + "epoch": 0.8342922413063711, + "grad_norm": 0.16382192075252533, + "learning_rate": 0.001, + "loss": 3.3113, + "step": 19721 + }, + { + "epoch": 0.8343345460698874, + "grad_norm": 2.076079845428467, + "learning_rate": 0.001, + "loss": 2.152, + "step": 19722 + }, + { + "epoch": 0.8343768508334038, + "grad_norm": 0.13855180144309998, + "learning_rate": 0.001, + "loss": 1.7025, + "step": 19723 + }, + { + "epoch": 0.8344191555969203, + "grad_norm": 0.1491064429283142, + "learning_rate": 0.001, + "loss": 1.7431, + "step": 19724 + }, + { + "epoch": 0.8344614603604366, + "grad_norm": 0.3308009207248688, + "learning_rate": 0.001, + "loss": 1.3676, + "step": 19725 + }, + { + "epoch": 0.834503765123953, + "grad_norm": 1.1953833103179932, + "learning_rate": 0.001, + "loss": 2.9253, + "step": 19726 + }, + { + "epoch": 0.8345460698874694, + "grad_norm": 0.11848998814821243, + "learning_rate": 0.001, + "loss": 1.6943, + "step": 19727 + }, + { + "epoch": 0.8345883746509857, + "grad_norm": 1.0493426322937012, + "learning_rate": 0.001, + "loss": 1.9968, + "step": 19728 + }, + { + "epoch": 0.8346306794145021, + "grad_norm": 0.13267287611961365, + "learning_rate": 0.001, + "loss": 1.927, + "step": 19729 + }, + { + "epoch": 0.8346729841780185, + "grad_norm": 0.1327977031469345, + "learning_rate": 0.001, + "loss": 2.9165, + "step": 19730 + }, + { + "epoch": 0.8347152889415348, + "grad_norm": 0.16689974069595337, + "learning_rate": 0.001, + "loss": 2.0789, + "step": 19731 + }, + { + "epoch": 0.8347575937050512, + "grad_norm": 0.17344988882541656, + "learning_rate": 0.001, + "loss": 2.5676, + "step": 19732 + }, + { + "epoch": 0.8347998984685676, + "grad_norm": 0.14860732853412628, + "learning_rate": 0.001, + "loss": 1.6259, + "step": 19733 + }, + { + "epoch": 0.8348422032320839, + "grad_norm": 0.15001067519187927, + "learning_rate": 0.001, + "loss": 1.5225, + "step": 19734 + }, + { + "epoch": 0.8348845079956003, + "grad_norm": 0.16379481554031372, + "learning_rate": 0.001, + "loss": 3.1107, + "step": 19735 + }, + { + "epoch": 0.8349268127591166, + "grad_norm": 0.16210788488388062, + "learning_rate": 0.001, + "loss": 1.9486, + "step": 19736 + }, + { + "epoch": 0.834969117522633, + "grad_norm": 0.17542105913162231, + "learning_rate": 0.001, + "loss": 1.36, + "step": 19737 + }, + { + "epoch": 0.8350114222861494, + "grad_norm": 0.168275848031044, + "learning_rate": 0.001, + "loss": 2.1156, + "step": 19738 + }, + { + "epoch": 0.8350537270496657, + "grad_norm": 0.153310164809227, + "learning_rate": 0.001, + "loss": 2.1324, + "step": 19739 + }, + { + "epoch": 0.8350960318131821, + "grad_norm": 1.9082057476043701, + "learning_rate": 0.001, + "loss": 2.4097, + "step": 19740 + }, + { + "epoch": 0.8351383365766986, + "grad_norm": 0.1443844437599182, + "learning_rate": 0.001, + "loss": 2.4301, + "step": 19741 + }, + { + "epoch": 0.8351806413402149, + "grad_norm": 0.14315207302570343, + "learning_rate": 0.001, + "loss": 1.3867, + "step": 19742 + }, + { + "epoch": 0.8352229461037313, + "grad_norm": 0.15107427537441254, + "learning_rate": 0.001, + "loss": 3.8243, + "step": 19743 + }, + { + "epoch": 0.8352652508672477, + "grad_norm": 0.1486080139875412, + "learning_rate": 0.001, + "loss": 1.3748, + "step": 19744 + }, + { + "epoch": 0.835307555630764, + "grad_norm": 0.13850051164627075, + "learning_rate": 0.001, + "loss": 2.7205, + "step": 19745 + }, + { + "epoch": 0.8353498603942804, + "grad_norm": 0.1894737184047699, + "learning_rate": 0.001, + "loss": 2.0814, + "step": 19746 + }, + { + "epoch": 0.8353921651577968, + "grad_norm": 3.34781551361084, + "learning_rate": 0.001, + "loss": 1.7776, + "step": 19747 + }, + { + "epoch": 0.8354344699213131, + "grad_norm": 0.14483730494976044, + "learning_rate": 0.001, + "loss": 2.1921, + "step": 19748 + }, + { + "epoch": 0.8354767746848295, + "grad_norm": 0.15554209053516388, + "learning_rate": 0.001, + "loss": 2.1626, + "step": 19749 + }, + { + "epoch": 0.8355190794483459, + "grad_norm": 0.2667386531829834, + "learning_rate": 0.001, + "loss": 2.3544, + "step": 19750 + }, + { + "epoch": 0.8355613842118622, + "grad_norm": 0.17081429064273834, + "learning_rate": 0.001, + "loss": 3.0333, + "step": 19751 + }, + { + "epoch": 0.8356036889753786, + "grad_norm": 0.18492451310157776, + "learning_rate": 0.001, + "loss": 2.3743, + "step": 19752 + }, + { + "epoch": 0.835645993738895, + "grad_norm": 1.0934829711914062, + "learning_rate": 0.001, + "loss": 2.1734, + "step": 19753 + }, + { + "epoch": 0.8356882985024113, + "grad_norm": 0.2422739565372467, + "learning_rate": 0.001, + "loss": 1.8901, + "step": 19754 + }, + { + "epoch": 0.8357306032659277, + "grad_norm": 0.39902499318122864, + "learning_rate": 0.001, + "loss": 2.4519, + "step": 19755 + }, + { + "epoch": 0.8357729080294442, + "grad_norm": 18.87704849243164, + "learning_rate": 0.001, + "loss": 4.1573, + "step": 19756 + }, + { + "epoch": 0.8358152127929604, + "grad_norm": 0.1568903774023056, + "learning_rate": 0.001, + "loss": 1.8906, + "step": 19757 + }, + { + "epoch": 0.8358575175564769, + "grad_norm": 0.2181655913591385, + "learning_rate": 0.001, + "loss": 2.7009, + "step": 19758 + }, + { + "epoch": 0.8358998223199933, + "grad_norm": 0.14595481753349304, + "learning_rate": 0.001, + "loss": 1.4677, + "step": 19759 + }, + { + "epoch": 0.8359421270835096, + "grad_norm": 0.18573489785194397, + "learning_rate": 0.001, + "loss": 1.9539, + "step": 19760 + }, + { + "epoch": 0.835984431847026, + "grad_norm": 0.19582517445087433, + "learning_rate": 0.001, + "loss": 1.9018, + "step": 19761 + }, + { + "epoch": 0.8360267366105424, + "grad_norm": 0.18559232354164124, + "learning_rate": 0.001, + "loss": 2.0573, + "step": 19762 + }, + { + "epoch": 0.8360690413740587, + "grad_norm": 0.2434457689523697, + "learning_rate": 0.001, + "loss": 2.4941, + "step": 19763 + }, + { + "epoch": 0.8361113461375751, + "grad_norm": 0.24671140313148499, + "learning_rate": 0.001, + "loss": 2.2932, + "step": 19764 + }, + { + "epoch": 0.8361536509010915, + "grad_norm": 55.95083999633789, + "learning_rate": 0.001, + "loss": 1.4447, + "step": 19765 + }, + { + "epoch": 0.8361959556646078, + "grad_norm": 0.18943527340888977, + "learning_rate": 0.001, + "loss": 1.8799, + "step": 19766 + }, + { + "epoch": 0.8362382604281242, + "grad_norm": 0.248708114027977, + "learning_rate": 0.001, + "loss": 2.1987, + "step": 19767 + }, + { + "epoch": 0.8362805651916406, + "grad_norm": 0.2755674719810486, + "learning_rate": 0.001, + "loss": 2.5924, + "step": 19768 + }, + { + "epoch": 0.8363228699551569, + "grad_norm": 0.8433470726013184, + "learning_rate": 0.001, + "loss": 2.656, + "step": 19769 + }, + { + "epoch": 0.8363651747186733, + "grad_norm": 0.25172102451324463, + "learning_rate": 0.001, + "loss": 2.0833, + "step": 19770 + }, + { + "epoch": 0.8364074794821897, + "grad_norm": 0.20284956693649292, + "learning_rate": 0.001, + "loss": 1.7729, + "step": 19771 + }, + { + "epoch": 0.836449784245706, + "grad_norm": 0.16717220842838287, + "learning_rate": 0.001, + "loss": 1.6458, + "step": 19772 + }, + { + "epoch": 0.8364920890092225, + "grad_norm": 0.18188956379890442, + "learning_rate": 0.001, + "loss": 2.6662, + "step": 19773 + }, + { + "epoch": 0.8365343937727389, + "grad_norm": 0.19936378300189972, + "learning_rate": 0.001, + "loss": 2.1813, + "step": 19774 + }, + { + "epoch": 0.8365766985362552, + "grad_norm": 0.24121548235416412, + "learning_rate": 0.001, + "loss": 2.8356, + "step": 19775 + }, + { + "epoch": 0.8366190032997716, + "grad_norm": 2.9213600158691406, + "learning_rate": 0.001, + "loss": 2.4832, + "step": 19776 + }, + { + "epoch": 0.836661308063288, + "grad_norm": 0.16870221495628357, + "learning_rate": 0.001, + "loss": 2.8586, + "step": 19777 + }, + { + "epoch": 0.8367036128268043, + "grad_norm": 0.1627093255519867, + "learning_rate": 0.001, + "loss": 1.8701, + "step": 19778 + }, + { + "epoch": 0.8367459175903207, + "grad_norm": 0.18144871294498444, + "learning_rate": 0.001, + "loss": 2.6356, + "step": 19779 + }, + { + "epoch": 0.8367882223538371, + "grad_norm": 0.20875905454158783, + "learning_rate": 0.001, + "loss": 2.1233, + "step": 19780 + }, + { + "epoch": 0.8368305271173534, + "grad_norm": 0.17586883902549744, + "learning_rate": 0.001, + "loss": 2.2282, + "step": 19781 + }, + { + "epoch": 0.8368728318808698, + "grad_norm": 0.17279373109340668, + "learning_rate": 0.001, + "loss": 1.6294, + "step": 19782 + }, + { + "epoch": 0.8369151366443861, + "grad_norm": 0.19814880192279816, + "learning_rate": 0.001, + "loss": 1.8659, + "step": 19783 + }, + { + "epoch": 0.8369574414079025, + "grad_norm": 0.3138713538646698, + "learning_rate": 0.001, + "loss": 2.3541, + "step": 19784 + }, + { + "epoch": 0.8369997461714189, + "grad_norm": 0.15233896672725677, + "learning_rate": 0.001, + "loss": 2.0767, + "step": 19785 + }, + { + "epoch": 0.8370420509349352, + "grad_norm": 9.646958351135254, + "learning_rate": 0.001, + "loss": 2.3433, + "step": 19786 + }, + { + "epoch": 0.8370843556984516, + "grad_norm": 0.3921230733394623, + "learning_rate": 0.001, + "loss": 1.7355, + "step": 19787 + }, + { + "epoch": 0.837126660461968, + "grad_norm": 0.17926689982414246, + "learning_rate": 0.001, + "loss": 2.008, + "step": 19788 + }, + { + "epoch": 0.8371689652254843, + "grad_norm": 0.17443296313285828, + "learning_rate": 0.001, + "loss": 2.0443, + "step": 19789 + }, + { + "epoch": 0.8372112699890008, + "grad_norm": 0.1694057285785675, + "learning_rate": 0.001, + "loss": 1.7924, + "step": 19790 + }, + { + "epoch": 0.8372535747525172, + "grad_norm": 0.16491025686264038, + "learning_rate": 0.001, + "loss": 2.1166, + "step": 19791 + }, + { + "epoch": 0.8372958795160335, + "grad_norm": 0.14807593822479248, + "learning_rate": 0.001, + "loss": 1.6911, + "step": 19792 + }, + { + "epoch": 0.8373381842795499, + "grad_norm": 0.19752711057662964, + "learning_rate": 0.001, + "loss": 2.2279, + "step": 19793 + }, + { + "epoch": 0.8373804890430663, + "grad_norm": 0.1440856158733368, + "learning_rate": 0.001, + "loss": 2.3304, + "step": 19794 + }, + { + "epoch": 0.8374227938065826, + "grad_norm": 0.2171926349401474, + "learning_rate": 0.001, + "loss": 2.6035, + "step": 19795 + }, + { + "epoch": 0.837465098570099, + "grad_norm": 0.1901613473892212, + "learning_rate": 0.001, + "loss": 2.6214, + "step": 19796 + }, + { + "epoch": 0.8375074033336154, + "grad_norm": 0.15385551750659943, + "learning_rate": 0.001, + "loss": 1.7298, + "step": 19797 + }, + { + "epoch": 0.8375497080971317, + "grad_norm": 1.251185655593872, + "learning_rate": 0.001, + "loss": 2.613, + "step": 19798 + }, + { + "epoch": 0.8375920128606481, + "grad_norm": 0.14876984059810638, + "learning_rate": 0.001, + "loss": 2.2236, + "step": 19799 + }, + { + "epoch": 0.8376343176241645, + "grad_norm": 0.19195450842380524, + "learning_rate": 0.001, + "loss": 2.2847, + "step": 19800 + }, + { + "epoch": 0.8376766223876808, + "grad_norm": 0.14749985933303833, + "learning_rate": 0.001, + "loss": 1.8272, + "step": 19801 + }, + { + "epoch": 0.8377189271511972, + "grad_norm": 0.18800973892211914, + "learning_rate": 0.001, + "loss": 1.738, + "step": 19802 + }, + { + "epoch": 0.8377612319147136, + "grad_norm": 27.659873962402344, + "learning_rate": 0.001, + "loss": 2.2451, + "step": 19803 + }, + { + "epoch": 0.8378035366782299, + "grad_norm": 0.16692310571670532, + "learning_rate": 0.001, + "loss": 2.4913, + "step": 19804 + }, + { + "epoch": 0.8378458414417463, + "grad_norm": 0.2250969558954239, + "learning_rate": 0.001, + "loss": 2.685, + "step": 19805 + }, + { + "epoch": 0.8378881462052628, + "grad_norm": 0.13324400782585144, + "learning_rate": 0.001, + "loss": 2.6045, + "step": 19806 + }, + { + "epoch": 0.837930450968779, + "grad_norm": 2.1039130687713623, + "learning_rate": 0.001, + "loss": 2.5165, + "step": 19807 + }, + { + "epoch": 0.8379727557322955, + "grad_norm": 0.14113382995128632, + "learning_rate": 0.001, + "loss": 1.6369, + "step": 19808 + }, + { + "epoch": 0.8380150604958119, + "grad_norm": 0.15121187269687653, + "learning_rate": 0.001, + "loss": 1.6652, + "step": 19809 + }, + { + "epoch": 0.8380573652593282, + "grad_norm": 0.15335388481616974, + "learning_rate": 0.001, + "loss": 2.2542, + "step": 19810 + }, + { + "epoch": 0.8380996700228446, + "grad_norm": 3.9381353855133057, + "learning_rate": 0.001, + "loss": 2.8601, + "step": 19811 + }, + { + "epoch": 0.838141974786361, + "grad_norm": 0.7249288558959961, + "learning_rate": 0.001, + "loss": 2.4747, + "step": 19812 + }, + { + "epoch": 0.8381842795498773, + "grad_norm": 6.57703971862793, + "learning_rate": 0.001, + "loss": 1.7587, + "step": 19813 + }, + { + "epoch": 0.8382265843133937, + "grad_norm": 0.7364828586578369, + "learning_rate": 0.001, + "loss": 3.9219, + "step": 19814 + }, + { + "epoch": 0.8382688890769101, + "grad_norm": 0.18288707733154297, + "learning_rate": 0.001, + "loss": 2.7903, + "step": 19815 + }, + { + "epoch": 0.8383111938404264, + "grad_norm": 0.1727524846792221, + "learning_rate": 0.001, + "loss": 1.9508, + "step": 19816 + }, + { + "epoch": 0.8383534986039428, + "grad_norm": 0.1617797166109085, + "learning_rate": 0.001, + "loss": 3.1475, + "step": 19817 + }, + { + "epoch": 0.8383958033674592, + "grad_norm": 0.16539444029331207, + "learning_rate": 0.001, + "loss": 2.1155, + "step": 19818 + }, + { + "epoch": 0.8384381081309755, + "grad_norm": 4.960668563842773, + "learning_rate": 0.001, + "loss": 2.3467, + "step": 19819 + }, + { + "epoch": 0.8384804128944919, + "grad_norm": 0.20643018186092377, + "learning_rate": 0.001, + "loss": 2.0452, + "step": 19820 + }, + { + "epoch": 0.8385227176580083, + "grad_norm": 2.9467222690582275, + "learning_rate": 0.001, + "loss": 2.9591, + "step": 19821 + }, + { + "epoch": 0.8385650224215246, + "grad_norm": 0.13099977374076843, + "learning_rate": 0.001, + "loss": 1.5804, + "step": 19822 + }, + { + "epoch": 0.838607327185041, + "grad_norm": 0.15258918702602386, + "learning_rate": 0.001, + "loss": 2.4391, + "step": 19823 + }, + { + "epoch": 0.8386496319485575, + "grad_norm": 0.18933935463428497, + "learning_rate": 0.001, + "loss": 2.2122, + "step": 19824 + }, + { + "epoch": 0.8386919367120738, + "grad_norm": 0.14285174012184143, + "learning_rate": 0.001, + "loss": 2.3817, + "step": 19825 + }, + { + "epoch": 0.8387342414755902, + "grad_norm": 1.277612328529358, + "learning_rate": 0.001, + "loss": 2.2287, + "step": 19826 + }, + { + "epoch": 0.8387765462391065, + "grad_norm": 0.12771695852279663, + "learning_rate": 0.001, + "loss": 1.9689, + "step": 19827 + }, + { + "epoch": 0.8388188510026229, + "grad_norm": 0.16007982194423676, + "learning_rate": 0.001, + "loss": 2.2604, + "step": 19828 + }, + { + "epoch": 0.8388611557661393, + "grad_norm": 0.625321090221405, + "learning_rate": 0.001, + "loss": 1.6921, + "step": 19829 + }, + { + "epoch": 0.8389034605296556, + "grad_norm": 0.44362181425094604, + "learning_rate": 0.001, + "loss": 2.2605, + "step": 19830 + }, + { + "epoch": 0.838945765293172, + "grad_norm": 0.16147875785827637, + "learning_rate": 0.001, + "loss": 1.9878, + "step": 19831 + }, + { + "epoch": 0.8389880700566884, + "grad_norm": 0.37815141677856445, + "learning_rate": 0.001, + "loss": 2.8435, + "step": 19832 + }, + { + "epoch": 0.8390303748202047, + "grad_norm": 0.15180058777332306, + "learning_rate": 0.001, + "loss": 1.7832, + "step": 19833 + }, + { + "epoch": 0.8390726795837211, + "grad_norm": 0.16070739924907684, + "learning_rate": 0.001, + "loss": 2.4271, + "step": 19834 + }, + { + "epoch": 0.8391149843472375, + "grad_norm": 2.378019332885742, + "learning_rate": 0.001, + "loss": 2.1971, + "step": 19835 + }, + { + "epoch": 0.8391572891107538, + "grad_norm": 0.13618861138820648, + "learning_rate": 0.001, + "loss": 2.482, + "step": 19836 + }, + { + "epoch": 0.8391995938742702, + "grad_norm": 0.1882878690958023, + "learning_rate": 0.001, + "loss": 2.0153, + "step": 19837 + }, + { + "epoch": 0.8392418986377866, + "grad_norm": 0.24582351744174957, + "learning_rate": 0.001, + "loss": 2.3494, + "step": 19838 + }, + { + "epoch": 0.8392842034013029, + "grad_norm": 0.22917570173740387, + "learning_rate": 0.001, + "loss": 2.0407, + "step": 19839 + }, + { + "epoch": 0.8393265081648194, + "grad_norm": 0.15284200012683868, + "learning_rate": 0.001, + "loss": 2.142, + "step": 19840 + }, + { + "epoch": 0.8393688129283358, + "grad_norm": 0.15016065537929535, + "learning_rate": 0.001, + "loss": 1.9708, + "step": 19841 + }, + { + "epoch": 0.8394111176918521, + "grad_norm": 0.17199917137622833, + "learning_rate": 0.001, + "loss": 2.0482, + "step": 19842 + }, + { + "epoch": 0.8394534224553685, + "grad_norm": 2.0709526538848877, + "learning_rate": 0.001, + "loss": 2.1923, + "step": 19843 + }, + { + "epoch": 0.8394957272188849, + "grad_norm": 0.14755693078041077, + "learning_rate": 0.001, + "loss": 2.07, + "step": 19844 + }, + { + "epoch": 0.8395380319824012, + "grad_norm": 0.45920902490615845, + "learning_rate": 0.001, + "loss": 3.1264, + "step": 19845 + }, + { + "epoch": 0.8395803367459176, + "grad_norm": 0.1509585976600647, + "learning_rate": 0.001, + "loss": 1.4513, + "step": 19846 + }, + { + "epoch": 0.839622641509434, + "grad_norm": 0.15342389047145844, + "learning_rate": 0.001, + "loss": 2.4979, + "step": 19847 + }, + { + "epoch": 0.8396649462729503, + "grad_norm": 0.16700981557369232, + "learning_rate": 0.001, + "loss": 1.6099, + "step": 19848 + }, + { + "epoch": 0.8397072510364667, + "grad_norm": 0.200181245803833, + "learning_rate": 0.001, + "loss": 1.9247, + "step": 19849 + }, + { + "epoch": 0.8397495557999831, + "grad_norm": 0.13869711756706238, + "learning_rate": 0.001, + "loss": 1.6663, + "step": 19850 + }, + { + "epoch": 0.8397918605634994, + "grad_norm": 0.19390735030174255, + "learning_rate": 0.001, + "loss": 2.6041, + "step": 19851 + }, + { + "epoch": 0.8398341653270158, + "grad_norm": 0.15957386791706085, + "learning_rate": 0.001, + "loss": 1.6804, + "step": 19852 + }, + { + "epoch": 0.8398764700905322, + "grad_norm": 0.1435195654630661, + "learning_rate": 0.001, + "loss": 2.222, + "step": 19853 + }, + { + "epoch": 0.8399187748540485, + "grad_norm": 0.4569486379623413, + "learning_rate": 0.001, + "loss": 1.7374, + "step": 19854 + }, + { + "epoch": 0.8399610796175649, + "grad_norm": 0.6527416706085205, + "learning_rate": 0.001, + "loss": 1.8554, + "step": 19855 + }, + { + "epoch": 0.8400033843810814, + "grad_norm": 0.36150479316711426, + "learning_rate": 0.001, + "loss": 2.1568, + "step": 19856 + }, + { + "epoch": 0.8400456891445977, + "grad_norm": 0.16183467209339142, + "learning_rate": 0.001, + "loss": 2.5408, + "step": 19857 + }, + { + "epoch": 0.8400879939081141, + "grad_norm": 0.14693614840507507, + "learning_rate": 0.001, + "loss": 1.7957, + "step": 19858 + }, + { + "epoch": 0.8401302986716305, + "grad_norm": 0.16061703860759735, + "learning_rate": 0.001, + "loss": 1.9237, + "step": 19859 + }, + { + "epoch": 0.8401726034351468, + "grad_norm": 0.1884782463312149, + "learning_rate": 0.001, + "loss": 1.936, + "step": 19860 + }, + { + "epoch": 0.8402149081986632, + "grad_norm": 0.15317420661449432, + "learning_rate": 0.001, + "loss": 1.857, + "step": 19861 + }, + { + "epoch": 0.8402572129621796, + "grad_norm": 0.17697061598300934, + "learning_rate": 0.001, + "loss": 2.4018, + "step": 19862 + }, + { + "epoch": 0.8402995177256959, + "grad_norm": 0.14646954834461212, + "learning_rate": 0.001, + "loss": 1.6583, + "step": 19863 + }, + { + "epoch": 0.8403418224892123, + "grad_norm": 0.1281052678823471, + "learning_rate": 0.001, + "loss": 3.3, + "step": 19864 + }, + { + "epoch": 0.8403841272527287, + "grad_norm": 0.12583167850971222, + "learning_rate": 0.001, + "loss": 2.1985, + "step": 19865 + }, + { + "epoch": 0.840426432016245, + "grad_norm": 0.16362233459949493, + "learning_rate": 0.001, + "loss": 1.9579, + "step": 19866 + }, + { + "epoch": 0.8404687367797614, + "grad_norm": 14.114072799682617, + "learning_rate": 0.001, + "loss": 2.0031, + "step": 19867 + }, + { + "epoch": 0.8405110415432778, + "grad_norm": 1.4896159172058105, + "learning_rate": 0.001, + "loss": 2.8004, + "step": 19868 + }, + { + "epoch": 0.8405533463067941, + "grad_norm": 0.15967680513858795, + "learning_rate": 0.001, + "loss": 2.0756, + "step": 19869 + }, + { + "epoch": 0.8405956510703105, + "grad_norm": 0.1979799121618271, + "learning_rate": 0.001, + "loss": 2.3233, + "step": 19870 + }, + { + "epoch": 0.8406379558338269, + "grad_norm": 0.3304239809513092, + "learning_rate": 0.001, + "loss": 2.0972, + "step": 19871 + }, + { + "epoch": 0.8406802605973432, + "grad_norm": 0.13525773584842682, + "learning_rate": 0.001, + "loss": 1.4994, + "step": 19872 + }, + { + "epoch": 0.8407225653608597, + "grad_norm": 1.1187829971313477, + "learning_rate": 0.001, + "loss": 2.6048, + "step": 19873 + }, + { + "epoch": 0.840764870124376, + "grad_norm": 1.9175190925598145, + "learning_rate": 0.001, + "loss": 2.1303, + "step": 19874 + }, + { + "epoch": 0.8408071748878924, + "grad_norm": 0.17562279105186462, + "learning_rate": 0.001, + "loss": 2.3723, + "step": 19875 + }, + { + "epoch": 0.8408494796514088, + "grad_norm": 0.14081628620624542, + "learning_rate": 0.001, + "loss": 2.3064, + "step": 19876 + }, + { + "epoch": 0.8408917844149251, + "grad_norm": 0.4894697368144989, + "learning_rate": 0.001, + "loss": 2.8948, + "step": 19877 + }, + { + "epoch": 0.8409340891784415, + "grad_norm": 0.19866247475147247, + "learning_rate": 0.001, + "loss": 2.9252, + "step": 19878 + }, + { + "epoch": 0.8409763939419579, + "grad_norm": 0.4068894684314728, + "learning_rate": 0.001, + "loss": 2.2374, + "step": 19879 + }, + { + "epoch": 0.8410186987054742, + "grad_norm": 0.1639883816242218, + "learning_rate": 0.001, + "loss": 2.0532, + "step": 19880 + }, + { + "epoch": 0.8410610034689906, + "grad_norm": 0.3562491536140442, + "learning_rate": 0.001, + "loss": 4.0336, + "step": 19881 + }, + { + "epoch": 0.841103308232507, + "grad_norm": 0.13724088668823242, + "learning_rate": 0.001, + "loss": 1.8362, + "step": 19882 + }, + { + "epoch": 0.8411456129960233, + "grad_norm": 0.13581812381744385, + "learning_rate": 0.001, + "loss": 1.3958, + "step": 19883 + }, + { + "epoch": 0.8411879177595397, + "grad_norm": 0.17569448053836823, + "learning_rate": 0.001, + "loss": 2.8263, + "step": 19884 + }, + { + "epoch": 0.8412302225230561, + "grad_norm": 0.14061707258224487, + "learning_rate": 0.001, + "loss": 2.2112, + "step": 19885 + }, + { + "epoch": 0.8412725272865724, + "grad_norm": 0.9066682457923889, + "learning_rate": 0.001, + "loss": 2.0064, + "step": 19886 + }, + { + "epoch": 0.8413148320500888, + "grad_norm": 0.13263536989688873, + "learning_rate": 0.001, + "loss": 2.3227, + "step": 19887 + }, + { + "epoch": 0.8413571368136052, + "grad_norm": 0.1647445261478424, + "learning_rate": 0.001, + "loss": 3.9107, + "step": 19888 + }, + { + "epoch": 0.8413994415771215, + "grad_norm": 0.3302064538002014, + "learning_rate": 0.001, + "loss": 2.5003, + "step": 19889 + }, + { + "epoch": 0.841441746340638, + "grad_norm": 0.14488548040390015, + "learning_rate": 0.001, + "loss": 2.0212, + "step": 19890 + }, + { + "epoch": 0.8414840511041544, + "grad_norm": 0.1621946543455124, + "learning_rate": 0.001, + "loss": 2.5805, + "step": 19891 + }, + { + "epoch": 0.8415263558676707, + "grad_norm": 0.13542355597019196, + "learning_rate": 0.001, + "loss": 1.8591, + "step": 19892 + }, + { + "epoch": 0.8415686606311871, + "grad_norm": 0.13882328569889069, + "learning_rate": 0.001, + "loss": 1.6565, + "step": 19893 + }, + { + "epoch": 0.8416109653947035, + "grad_norm": 0.18400336802005768, + "learning_rate": 0.001, + "loss": 2.3863, + "step": 19894 + }, + { + "epoch": 0.8416532701582198, + "grad_norm": 0.14745768904685974, + "learning_rate": 0.001, + "loss": 1.6539, + "step": 19895 + }, + { + "epoch": 0.8416955749217362, + "grad_norm": 0.5481185913085938, + "learning_rate": 0.001, + "loss": 2.4827, + "step": 19896 + }, + { + "epoch": 0.8417378796852526, + "grad_norm": 0.18567869067192078, + "learning_rate": 0.001, + "loss": 1.7485, + "step": 19897 + }, + { + "epoch": 0.8417801844487689, + "grad_norm": 40.28692626953125, + "learning_rate": 0.001, + "loss": 2.2203, + "step": 19898 + }, + { + "epoch": 0.8418224892122853, + "grad_norm": 0.17718671262264252, + "learning_rate": 0.001, + "loss": 2.2116, + "step": 19899 + }, + { + "epoch": 0.8418647939758017, + "grad_norm": 0.20770712196826935, + "learning_rate": 0.001, + "loss": 1.6705, + "step": 19900 + }, + { + "epoch": 0.841907098739318, + "grad_norm": 0.33446425199508667, + "learning_rate": 0.001, + "loss": 2.2231, + "step": 19901 + }, + { + "epoch": 0.8419494035028344, + "grad_norm": 0.15271897614002228, + "learning_rate": 0.001, + "loss": 2.0796, + "step": 19902 + }, + { + "epoch": 0.8419917082663508, + "grad_norm": 0.3776164650917053, + "learning_rate": 0.001, + "loss": 2.7789, + "step": 19903 + }, + { + "epoch": 0.8420340130298671, + "grad_norm": 0.18110236525535583, + "learning_rate": 0.001, + "loss": 3.1009, + "step": 19904 + }, + { + "epoch": 0.8420763177933835, + "grad_norm": 0.14142876863479614, + "learning_rate": 0.001, + "loss": 1.5932, + "step": 19905 + }, + { + "epoch": 0.8421186225569, + "grad_norm": 0.1594587117433548, + "learning_rate": 0.001, + "loss": 2.9, + "step": 19906 + }, + { + "epoch": 0.8421609273204163, + "grad_norm": 0.17610591650009155, + "learning_rate": 0.001, + "loss": 2.0221, + "step": 19907 + }, + { + "epoch": 0.8422032320839327, + "grad_norm": 0.1743817925453186, + "learning_rate": 0.001, + "loss": 2.0761, + "step": 19908 + }, + { + "epoch": 0.8422455368474491, + "grad_norm": 0.3120170831680298, + "learning_rate": 0.001, + "loss": 3.1248, + "step": 19909 + }, + { + "epoch": 0.8422878416109654, + "grad_norm": 0.14950038492679596, + "learning_rate": 0.001, + "loss": 3.4022, + "step": 19910 + }, + { + "epoch": 0.8423301463744818, + "grad_norm": 0.17454718053340912, + "learning_rate": 0.001, + "loss": 1.8271, + "step": 19911 + }, + { + "epoch": 0.8423724511379982, + "grad_norm": 0.18811197578907013, + "learning_rate": 0.001, + "loss": 2.814, + "step": 19912 + }, + { + "epoch": 0.8424147559015145, + "grad_norm": 0.13598325848579407, + "learning_rate": 0.001, + "loss": 2.043, + "step": 19913 + }, + { + "epoch": 0.8424570606650309, + "grad_norm": 0.14503377676010132, + "learning_rate": 0.001, + "loss": 1.8979, + "step": 19914 + }, + { + "epoch": 0.8424993654285473, + "grad_norm": 0.16746678948402405, + "learning_rate": 0.001, + "loss": 1.9977, + "step": 19915 + }, + { + "epoch": 0.8425416701920636, + "grad_norm": 8.220918655395508, + "learning_rate": 0.001, + "loss": 2.3796, + "step": 19916 + }, + { + "epoch": 0.84258397495558, + "grad_norm": 0.20891021192073822, + "learning_rate": 0.001, + "loss": 2.4267, + "step": 19917 + }, + { + "epoch": 0.8426262797190963, + "grad_norm": 0.15309026837348938, + "learning_rate": 0.001, + "loss": 2.0111, + "step": 19918 + }, + { + "epoch": 0.8426685844826127, + "grad_norm": 1.157206654548645, + "learning_rate": 0.001, + "loss": 3.1127, + "step": 19919 + }, + { + "epoch": 0.8427108892461291, + "grad_norm": 0.1353883147239685, + "learning_rate": 0.001, + "loss": 1.5835, + "step": 19920 + }, + { + "epoch": 0.8427531940096454, + "grad_norm": 0.16961653530597687, + "learning_rate": 0.001, + "loss": 1.4123, + "step": 19921 + }, + { + "epoch": 0.8427954987731618, + "grad_norm": 0.15629060566425323, + "learning_rate": 0.001, + "loss": 1.8995, + "step": 19922 + }, + { + "epoch": 0.8428378035366783, + "grad_norm": 14.496636390686035, + "learning_rate": 0.001, + "loss": 1.9417, + "step": 19923 + }, + { + "epoch": 0.8428801083001946, + "grad_norm": 0.16082890331745148, + "learning_rate": 0.001, + "loss": 1.7762, + "step": 19924 + }, + { + "epoch": 0.842922413063711, + "grad_norm": 0.4194250702857971, + "learning_rate": 0.001, + "loss": 1.7252, + "step": 19925 + }, + { + "epoch": 0.8429647178272274, + "grad_norm": 0.25590166449546814, + "learning_rate": 0.001, + "loss": 2.1403, + "step": 19926 + }, + { + "epoch": 0.8430070225907437, + "grad_norm": 0.6089497804641724, + "learning_rate": 0.001, + "loss": 2.328, + "step": 19927 + }, + { + "epoch": 0.8430493273542601, + "grad_norm": 0.17226269841194153, + "learning_rate": 0.001, + "loss": 1.7442, + "step": 19928 + }, + { + "epoch": 0.8430916321177765, + "grad_norm": 12.525866508483887, + "learning_rate": 0.001, + "loss": 2.9423, + "step": 19929 + }, + { + "epoch": 0.8431339368812928, + "grad_norm": 0.15503224730491638, + "learning_rate": 0.001, + "loss": 2.119, + "step": 19930 + }, + { + "epoch": 0.8431762416448092, + "grad_norm": 0.5280601978302002, + "learning_rate": 0.001, + "loss": 2.2592, + "step": 19931 + }, + { + "epoch": 0.8432185464083256, + "grad_norm": 0.19108590483665466, + "learning_rate": 0.001, + "loss": 1.7601, + "step": 19932 + }, + { + "epoch": 0.8432608511718419, + "grad_norm": 0.2428058534860611, + "learning_rate": 0.001, + "loss": 3.0192, + "step": 19933 + }, + { + "epoch": 0.8433031559353583, + "grad_norm": 0.16223500669002533, + "learning_rate": 0.001, + "loss": 2.3037, + "step": 19934 + }, + { + "epoch": 0.8433454606988747, + "grad_norm": 5.290875434875488, + "learning_rate": 0.001, + "loss": 2.3735, + "step": 19935 + }, + { + "epoch": 0.843387765462391, + "grad_norm": 0.139143168926239, + "learning_rate": 0.001, + "loss": 1.7609, + "step": 19936 + }, + { + "epoch": 0.8434300702259074, + "grad_norm": 0.2556076645851135, + "learning_rate": 0.001, + "loss": 2.5694, + "step": 19937 + }, + { + "epoch": 0.8434723749894238, + "grad_norm": 0.17417319118976593, + "learning_rate": 0.001, + "loss": 2.2479, + "step": 19938 + }, + { + "epoch": 0.8435146797529401, + "grad_norm": 0.15048249065876007, + "learning_rate": 0.001, + "loss": 2.2647, + "step": 19939 + }, + { + "epoch": 0.8435569845164566, + "grad_norm": 0.32785776257514954, + "learning_rate": 0.001, + "loss": 2.3088, + "step": 19940 + }, + { + "epoch": 0.843599289279973, + "grad_norm": 0.16994161903858185, + "learning_rate": 0.001, + "loss": 2.9571, + "step": 19941 + }, + { + "epoch": 0.8436415940434893, + "grad_norm": 0.16292929649353027, + "learning_rate": 0.001, + "loss": 2.268, + "step": 19942 + }, + { + "epoch": 0.8436838988070057, + "grad_norm": 0.1685224175453186, + "learning_rate": 0.001, + "loss": 3.1586, + "step": 19943 + }, + { + "epoch": 0.8437262035705221, + "grad_norm": 0.2571621239185333, + "learning_rate": 0.001, + "loss": 2.2422, + "step": 19944 + }, + { + "epoch": 0.8437685083340384, + "grad_norm": 0.1383311152458191, + "learning_rate": 0.001, + "loss": 1.6768, + "step": 19945 + }, + { + "epoch": 0.8438108130975548, + "grad_norm": 0.1316850483417511, + "learning_rate": 0.001, + "loss": 1.3501, + "step": 19946 + }, + { + "epoch": 0.8438531178610712, + "grad_norm": 0.1473502814769745, + "learning_rate": 0.001, + "loss": 2.4567, + "step": 19947 + }, + { + "epoch": 0.8438954226245875, + "grad_norm": 0.2394011914730072, + "learning_rate": 0.001, + "loss": 1.5157, + "step": 19948 + }, + { + "epoch": 0.8439377273881039, + "grad_norm": 0.21005521714687347, + "learning_rate": 0.001, + "loss": 3.4788, + "step": 19949 + }, + { + "epoch": 0.8439800321516203, + "grad_norm": 0.22676526010036469, + "learning_rate": 0.001, + "loss": 1.5542, + "step": 19950 + }, + { + "epoch": 0.8440223369151366, + "grad_norm": 0.19313664734363556, + "learning_rate": 0.001, + "loss": 2.9371, + "step": 19951 + }, + { + "epoch": 0.844064641678653, + "grad_norm": 0.18418525159358978, + "learning_rate": 0.001, + "loss": 3.3624, + "step": 19952 + }, + { + "epoch": 0.8441069464421694, + "grad_norm": 0.15955480933189392, + "learning_rate": 0.001, + "loss": 2.4359, + "step": 19953 + }, + { + "epoch": 0.8441492512056857, + "grad_norm": 0.15745678544044495, + "learning_rate": 0.001, + "loss": 2.4083, + "step": 19954 + }, + { + "epoch": 0.8441915559692021, + "grad_norm": 0.1493915468454361, + "learning_rate": 0.001, + "loss": 2.0375, + "step": 19955 + }, + { + "epoch": 0.8442338607327186, + "grad_norm": 0.13950100541114807, + "learning_rate": 0.001, + "loss": 2.7912, + "step": 19956 + }, + { + "epoch": 0.8442761654962349, + "grad_norm": 0.47593954205513, + "learning_rate": 0.001, + "loss": 1.4144, + "step": 19957 + }, + { + "epoch": 0.8443184702597513, + "grad_norm": 0.17434100806713104, + "learning_rate": 0.001, + "loss": 2.265, + "step": 19958 + }, + { + "epoch": 0.8443607750232677, + "grad_norm": 0.12682586908340454, + "learning_rate": 0.001, + "loss": 2.5613, + "step": 19959 + }, + { + "epoch": 0.844403079786784, + "grad_norm": 1.8817963600158691, + "learning_rate": 0.001, + "loss": 1.676, + "step": 19960 + }, + { + "epoch": 0.8444453845503004, + "grad_norm": 0.19121527671813965, + "learning_rate": 0.001, + "loss": 1.512, + "step": 19961 + }, + { + "epoch": 0.8444876893138167, + "grad_norm": 0.1444321870803833, + "learning_rate": 0.001, + "loss": 1.92, + "step": 19962 + }, + { + "epoch": 0.8445299940773331, + "grad_norm": 0.16925615072250366, + "learning_rate": 0.001, + "loss": 1.6687, + "step": 19963 + }, + { + "epoch": 0.8445722988408495, + "grad_norm": 4.560879707336426, + "learning_rate": 0.001, + "loss": 2.2629, + "step": 19964 + }, + { + "epoch": 0.8446146036043658, + "grad_norm": 0.13227243721485138, + "learning_rate": 0.001, + "loss": 2.5459, + "step": 19965 + }, + { + "epoch": 0.8446569083678822, + "grad_norm": 0.15780355036258698, + "learning_rate": 0.001, + "loss": 2.7421, + "step": 19966 + }, + { + "epoch": 0.8446992131313986, + "grad_norm": 0.1389780193567276, + "learning_rate": 0.001, + "loss": 2.5234, + "step": 19967 + }, + { + "epoch": 0.8447415178949149, + "grad_norm": 0.15538786351680756, + "learning_rate": 0.001, + "loss": 2.0989, + "step": 19968 + }, + { + "epoch": 0.8447838226584313, + "grad_norm": 9.396347999572754, + "learning_rate": 0.001, + "loss": 1.7852, + "step": 19969 + }, + { + "epoch": 0.8448261274219477, + "grad_norm": 0.2267782837152481, + "learning_rate": 0.001, + "loss": 1.8979, + "step": 19970 + }, + { + "epoch": 0.844868432185464, + "grad_norm": 0.1871020346879959, + "learning_rate": 0.001, + "loss": 3.3206, + "step": 19971 + }, + { + "epoch": 0.8449107369489804, + "grad_norm": 0.17954017221927643, + "learning_rate": 0.001, + "loss": 1.9435, + "step": 19972 + }, + { + "epoch": 0.8449530417124969, + "grad_norm": 0.146229088306427, + "learning_rate": 0.001, + "loss": 1.5802, + "step": 19973 + }, + { + "epoch": 0.8449953464760132, + "grad_norm": 0.1864139437675476, + "learning_rate": 0.001, + "loss": 2.0258, + "step": 19974 + }, + { + "epoch": 0.8450376512395296, + "grad_norm": 0.1569264978170395, + "learning_rate": 0.001, + "loss": 1.9869, + "step": 19975 + }, + { + "epoch": 0.845079956003046, + "grad_norm": 0.15957815945148468, + "learning_rate": 0.001, + "loss": 2.4842, + "step": 19976 + }, + { + "epoch": 0.8451222607665623, + "grad_norm": 0.16529959440231323, + "learning_rate": 0.001, + "loss": 1.9408, + "step": 19977 + }, + { + "epoch": 0.8451645655300787, + "grad_norm": 0.1697777956724167, + "learning_rate": 0.001, + "loss": 2.1509, + "step": 19978 + }, + { + "epoch": 0.8452068702935951, + "grad_norm": 1.7855709791183472, + "learning_rate": 0.001, + "loss": 3.2382, + "step": 19979 + }, + { + "epoch": 0.8452491750571114, + "grad_norm": 0.17001429200172424, + "learning_rate": 0.001, + "loss": 2.8689, + "step": 19980 + }, + { + "epoch": 0.8452914798206278, + "grad_norm": 0.17882667481899261, + "learning_rate": 0.001, + "loss": 1.466, + "step": 19981 + }, + { + "epoch": 0.8453337845841442, + "grad_norm": 3.9639627933502197, + "learning_rate": 0.001, + "loss": 1.8731, + "step": 19982 + }, + { + "epoch": 0.8453760893476605, + "grad_norm": 0.14277885854244232, + "learning_rate": 0.001, + "loss": 1.8745, + "step": 19983 + }, + { + "epoch": 0.8454183941111769, + "grad_norm": 0.17287899553775787, + "learning_rate": 0.001, + "loss": 2.4062, + "step": 19984 + }, + { + "epoch": 0.8454606988746933, + "grad_norm": 0.1661319136619568, + "learning_rate": 0.001, + "loss": 1.7102, + "step": 19985 + }, + { + "epoch": 0.8455030036382096, + "grad_norm": 0.1925491839647293, + "learning_rate": 0.001, + "loss": 2.2246, + "step": 19986 + }, + { + "epoch": 0.845545308401726, + "grad_norm": 0.15277276933193207, + "learning_rate": 0.001, + "loss": 2.5972, + "step": 19987 + }, + { + "epoch": 0.8455876131652424, + "grad_norm": 0.1781436949968338, + "learning_rate": 0.001, + "loss": 3.143, + "step": 19988 + }, + { + "epoch": 0.8456299179287587, + "grad_norm": 0.15217041969299316, + "learning_rate": 0.001, + "loss": 2.5483, + "step": 19989 + }, + { + "epoch": 0.8456722226922752, + "grad_norm": 0.15171130001544952, + "learning_rate": 0.001, + "loss": 1.8737, + "step": 19990 + }, + { + "epoch": 0.8457145274557916, + "grad_norm": 0.15747137367725372, + "learning_rate": 0.001, + "loss": 1.7848, + "step": 19991 + }, + { + "epoch": 0.8457568322193079, + "grad_norm": 0.22473227977752686, + "learning_rate": 0.001, + "loss": 2.192, + "step": 19992 + }, + { + "epoch": 0.8457991369828243, + "grad_norm": 0.19137416779994965, + "learning_rate": 0.001, + "loss": 4.3493, + "step": 19993 + }, + { + "epoch": 0.8458414417463407, + "grad_norm": 0.17920900881290436, + "learning_rate": 0.001, + "loss": 2.6782, + "step": 19994 + }, + { + "epoch": 0.845883746509857, + "grad_norm": 0.16247224807739258, + "learning_rate": 0.001, + "loss": 2.0712, + "step": 19995 + }, + { + "epoch": 0.8459260512733734, + "grad_norm": 2.3473522663116455, + "learning_rate": 0.001, + "loss": 2.1859, + "step": 19996 + }, + { + "epoch": 0.8459683560368898, + "grad_norm": 0.1629837155342102, + "learning_rate": 0.001, + "loss": 2.4058, + "step": 19997 + }, + { + "epoch": 0.8460106608004061, + "grad_norm": 0.1541110724210739, + "learning_rate": 0.001, + "loss": 2.6553, + "step": 19998 + }, + { + "epoch": 0.8460529655639225, + "grad_norm": 0.12468776851892471, + "learning_rate": 0.001, + "loss": 2.0798, + "step": 19999 + }, + { + "epoch": 0.8460952703274389, + "grad_norm": 0.13102620840072632, + "learning_rate": 0.001, + "loss": 3.1871, + "step": 20000 + }, + { + "epoch": 0.8461375750909552, + "grad_norm": 0.16048361361026764, + "learning_rate": 0.001, + "loss": 1.6277, + "step": 20001 + }, + { + "epoch": 0.8461798798544716, + "grad_norm": 0.12173058837652206, + "learning_rate": 0.001, + "loss": 1.9958, + "step": 20002 + }, + { + "epoch": 0.846222184617988, + "grad_norm": 0.1663282960653305, + "learning_rate": 0.001, + "loss": 1.9365, + "step": 20003 + }, + { + "epoch": 0.8462644893815043, + "grad_norm": 0.2031364142894745, + "learning_rate": 0.001, + "loss": 2.5175, + "step": 20004 + }, + { + "epoch": 0.8463067941450207, + "grad_norm": 0.17239537835121155, + "learning_rate": 0.001, + "loss": 1.9938, + "step": 20005 + }, + { + "epoch": 0.8463490989085372, + "grad_norm": 0.14862409234046936, + "learning_rate": 0.001, + "loss": 1.9449, + "step": 20006 + }, + { + "epoch": 0.8463914036720535, + "grad_norm": 0.18297027051448822, + "learning_rate": 0.001, + "loss": 2.2749, + "step": 20007 + }, + { + "epoch": 0.8464337084355699, + "grad_norm": 0.16779938340187073, + "learning_rate": 0.001, + "loss": 1.6555, + "step": 20008 + }, + { + "epoch": 0.8464760131990862, + "grad_norm": 0.21444658935070038, + "learning_rate": 0.001, + "loss": 2.8099, + "step": 20009 + }, + { + "epoch": 0.8465183179626026, + "grad_norm": 0.13690084218978882, + "learning_rate": 0.001, + "loss": 3.1176, + "step": 20010 + }, + { + "epoch": 0.846560622726119, + "grad_norm": 0.235006645321846, + "learning_rate": 0.001, + "loss": 2.1184, + "step": 20011 + }, + { + "epoch": 0.8466029274896353, + "grad_norm": 0.1450556516647339, + "learning_rate": 0.001, + "loss": 1.8434, + "step": 20012 + }, + { + "epoch": 0.8466452322531517, + "grad_norm": 0.14639632403850555, + "learning_rate": 0.001, + "loss": 1.8493, + "step": 20013 + }, + { + "epoch": 0.8466875370166681, + "grad_norm": 0.12522344291210175, + "learning_rate": 0.001, + "loss": 1.5215, + "step": 20014 + }, + { + "epoch": 0.8467298417801844, + "grad_norm": 0.19824448227882385, + "learning_rate": 0.001, + "loss": 3.5594, + "step": 20015 + }, + { + "epoch": 0.8467721465437008, + "grad_norm": 0.13851766288280487, + "learning_rate": 0.001, + "loss": 2.2443, + "step": 20016 + }, + { + "epoch": 0.8468144513072172, + "grad_norm": 0.13753733038902283, + "learning_rate": 0.001, + "loss": 2.3559, + "step": 20017 + }, + { + "epoch": 0.8468567560707335, + "grad_norm": 0.15897375345230103, + "learning_rate": 0.001, + "loss": 1.8587, + "step": 20018 + }, + { + "epoch": 0.8468990608342499, + "grad_norm": 0.13712716102600098, + "learning_rate": 0.001, + "loss": 2.0603, + "step": 20019 + }, + { + "epoch": 0.8469413655977663, + "grad_norm": 0.1391967236995697, + "learning_rate": 0.001, + "loss": 2.0169, + "step": 20020 + }, + { + "epoch": 0.8469836703612826, + "grad_norm": 0.1359555423259735, + "learning_rate": 0.001, + "loss": 2.2155, + "step": 20021 + }, + { + "epoch": 0.847025975124799, + "grad_norm": 0.7110766768455505, + "learning_rate": 0.001, + "loss": 3.0314, + "step": 20022 + }, + { + "epoch": 0.8470682798883155, + "grad_norm": 0.1405889242887497, + "learning_rate": 0.001, + "loss": 2.1806, + "step": 20023 + }, + { + "epoch": 0.8471105846518318, + "grad_norm": 0.14044038951396942, + "learning_rate": 0.001, + "loss": 1.9486, + "step": 20024 + }, + { + "epoch": 0.8471528894153482, + "grad_norm": 0.122348852455616, + "learning_rate": 0.001, + "loss": 1.5374, + "step": 20025 + }, + { + "epoch": 0.8471951941788646, + "grad_norm": 0.14094999432563782, + "learning_rate": 0.001, + "loss": 1.7621, + "step": 20026 + }, + { + "epoch": 0.8472374989423809, + "grad_norm": 0.6236584782600403, + "learning_rate": 0.001, + "loss": 1.964, + "step": 20027 + }, + { + "epoch": 0.8472798037058973, + "grad_norm": 0.16254521906375885, + "learning_rate": 0.001, + "loss": 2.0317, + "step": 20028 + }, + { + "epoch": 0.8473221084694137, + "grad_norm": 0.16386757791042328, + "learning_rate": 0.001, + "loss": 2.4891, + "step": 20029 + }, + { + "epoch": 0.84736441323293, + "grad_norm": 0.26580187678337097, + "learning_rate": 0.001, + "loss": 1.803, + "step": 20030 + }, + { + "epoch": 0.8474067179964464, + "grad_norm": 0.13852697610855103, + "learning_rate": 0.001, + "loss": 1.9764, + "step": 20031 + }, + { + "epoch": 0.8474490227599628, + "grad_norm": 0.13823279738426208, + "learning_rate": 0.001, + "loss": 1.6498, + "step": 20032 + }, + { + "epoch": 0.8474913275234791, + "grad_norm": 0.15287473797798157, + "learning_rate": 0.001, + "loss": 2.5265, + "step": 20033 + }, + { + "epoch": 0.8475336322869955, + "grad_norm": 0.661291241645813, + "learning_rate": 0.001, + "loss": 2.3638, + "step": 20034 + }, + { + "epoch": 0.8475759370505119, + "grad_norm": 0.16113339364528656, + "learning_rate": 0.001, + "loss": 2.0843, + "step": 20035 + }, + { + "epoch": 0.8476182418140282, + "grad_norm": 0.1465645283460617, + "learning_rate": 0.001, + "loss": 2.0648, + "step": 20036 + }, + { + "epoch": 0.8476605465775446, + "grad_norm": 1.3468478918075562, + "learning_rate": 0.001, + "loss": 2.1217, + "step": 20037 + }, + { + "epoch": 0.847702851341061, + "grad_norm": 1.280560851097107, + "learning_rate": 0.001, + "loss": 2.0666, + "step": 20038 + }, + { + "epoch": 0.8477451561045773, + "grad_norm": 0.17766667902469635, + "learning_rate": 0.001, + "loss": 2.116, + "step": 20039 + }, + { + "epoch": 0.8477874608680938, + "grad_norm": 0.20155932009220123, + "learning_rate": 0.001, + "loss": 1.9426, + "step": 20040 + }, + { + "epoch": 0.8478297656316102, + "grad_norm": 0.2062946856021881, + "learning_rate": 0.001, + "loss": 2.7091, + "step": 20041 + }, + { + "epoch": 0.8478720703951265, + "grad_norm": 0.1285688877105713, + "learning_rate": 0.001, + "loss": 2.7954, + "step": 20042 + }, + { + "epoch": 0.8479143751586429, + "grad_norm": 2.014345407485962, + "learning_rate": 0.001, + "loss": 2.2796, + "step": 20043 + }, + { + "epoch": 0.8479566799221593, + "grad_norm": 6.951234817504883, + "learning_rate": 0.001, + "loss": 3.4023, + "step": 20044 + }, + { + "epoch": 0.8479989846856756, + "grad_norm": 0.16423997282981873, + "learning_rate": 0.001, + "loss": 2.1239, + "step": 20045 + }, + { + "epoch": 0.848041289449192, + "grad_norm": 0.14751717448234558, + "learning_rate": 0.001, + "loss": 2.3457, + "step": 20046 + }, + { + "epoch": 0.8480835942127084, + "grad_norm": 0.1324821263551712, + "learning_rate": 0.001, + "loss": 1.9863, + "step": 20047 + }, + { + "epoch": 0.8481258989762247, + "grad_norm": 0.2402341365814209, + "learning_rate": 0.001, + "loss": 1.8448, + "step": 20048 + }, + { + "epoch": 0.8481682037397411, + "grad_norm": 7.083425045013428, + "learning_rate": 0.001, + "loss": 2.5456, + "step": 20049 + }, + { + "epoch": 0.8482105085032575, + "grad_norm": 0.1683296114206314, + "learning_rate": 0.001, + "loss": 2.432, + "step": 20050 + }, + { + "epoch": 0.8482528132667738, + "grad_norm": 0.18871085345745087, + "learning_rate": 0.001, + "loss": 1.9026, + "step": 20051 + }, + { + "epoch": 0.8482951180302902, + "grad_norm": 0.13885819911956787, + "learning_rate": 0.001, + "loss": 1.7946, + "step": 20052 + }, + { + "epoch": 0.8483374227938065, + "grad_norm": 0.13285697996616364, + "learning_rate": 0.001, + "loss": 1.395, + "step": 20053 + }, + { + "epoch": 0.8483797275573229, + "grad_norm": 0.1700686663389206, + "learning_rate": 0.001, + "loss": 2.1973, + "step": 20054 + }, + { + "epoch": 0.8484220323208393, + "grad_norm": 0.20374974608421326, + "learning_rate": 0.001, + "loss": 2.6799, + "step": 20055 + }, + { + "epoch": 0.8484643370843556, + "grad_norm": 0.2053978443145752, + "learning_rate": 0.001, + "loss": 1.7101, + "step": 20056 + }, + { + "epoch": 0.8485066418478721, + "grad_norm": 0.16068226099014282, + "learning_rate": 0.001, + "loss": 2.1583, + "step": 20057 + }, + { + "epoch": 0.8485489466113885, + "grad_norm": 0.1553301364183426, + "learning_rate": 0.001, + "loss": 2.122, + "step": 20058 + }, + { + "epoch": 0.8485912513749048, + "grad_norm": 0.16317340731620789, + "learning_rate": 0.001, + "loss": 1.9723, + "step": 20059 + }, + { + "epoch": 0.8486335561384212, + "grad_norm": 0.6233834028244019, + "learning_rate": 0.001, + "loss": 1.8776, + "step": 20060 + }, + { + "epoch": 0.8486758609019376, + "grad_norm": 0.1380307525396347, + "learning_rate": 0.001, + "loss": 2.1375, + "step": 20061 + }, + { + "epoch": 0.8487181656654539, + "grad_norm": 0.17783686518669128, + "learning_rate": 0.001, + "loss": 2.2367, + "step": 20062 + }, + { + "epoch": 0.8487604704289703, + "grad_norm": 0.14636710286140442, + "learning_rate": 0.001, + "loss": 1.6968, + "step": 20063 + }, + { + "epoch": 0.8488027751924867, + "grad_norm": 0.2252545803785324, + "learning_rate": 0.001, + "loss": 2.1908, + "step": 20064 + }, + { + "epoch": 0.848845079956003, + "grad_norm": 0.129207044839859, + "learning_rate": 0.001, + "loss": 2.0196, + "step": 20065 + }, + { + "epoch": 0.8488873847195194, + "grad_norm": 471.605712890625, + "learning_rate": 0.001, + "loss": 2.1384, + "step": 20066 + }, + { + "epoch": 0.8489296894830358, + "grad_norm": 0.18888062238693237, + "learning_rate": 0.001, + "loss": 2.9906, + "step": 20067 + }, + { + "epoch": 0.8489719942465521, + "grad_norm": 0.16716241836547852, + "learning_rate": 0.001, + "loss": 2.5418, + "step": 20068 + }, + { + "epoch": 0.8490142990100685, + "grad_norm": 0.23846466839313507, + "learning_rate": 0.001, + "loss": 1.9297, + "step": 20069 + }, + { + "epoch": 0.8490566037735849, + "grad_norm": 0.2950774133205414, + "learning_rate": 0.001, + "loss": 2.0279, + "step": 20070 + }, + { + "epoch": 0.8490989085371012, + "grad_norm": 0.18450148403644562, + "learning_rate": 0.001, + "loss": 1.901, + "step": 20071 + }, + { + "epoch": 0.8491412133006176, + "grad_norm": 0.15849687159061432, + "learning_rate": 0.001, + "loss": 1.8239, + "step": 20072 + }, + { + "epoch": 0.8491835180641341, + "grad_norm": 0.17029528319835663, + "learning_rate": 0.001, + "loss": 1.5521, + "step": 20073 + }, + { + "epoch": 0.8492258228276504, + "grad_norm": 0.1589442491531372, + "learning_rate": 0.001, + "loss": 1.79, + "step": 20074 + }, + { + "epoch": 0.8492681275911668, + "grad_norm": 0.19395272433757782, + "learning_rate": 0.001, + "loss": 2.1332, + "step": 20075 + }, + { + "epoch": 0.8493104323546832, + "grad_norm": 0.1506047248840332, + "learning_rate": 0.001, + "loss": 2.3834, + "step": 20076 + }, + { + "epoch": 0.8493527371181995, + "grad_norm": 0.1427820324897766, + "learning_rate": 0.001, + "loss": 1.5925, + "step": 20077 + }, + { + "epoch": 0.8493950418817159, + "grad_norm": 0.14985975623130798, + "learning_rate": 0.001, + "loss": 2.5144, + "step": 20078 + }, + { + "epoch": 0.8494373466452323, + "grad_norm": 10.775849342346191, + "learning_rate": 0.001, + "loss": 1.8041, + "step": 20079 + }, + { + "epoch": 0.8494796514087486, + "grad_norm": 0.16326069831848145, + "learning_rate": 0.001, + "loss": 1.98, + "step": 20080 + }, + { + "epoch": 0.849521956172265, + "grad_norm": 0.6944580674171448, + "learning_rate": 0.001, + "loss": 3.5848, + "step": 20081 + }, + { + "epoch": 0.8495642609357814, + "grad_norm": 0.17368917167186737, + "learning_rate": 0.001, + "loss": 2.6593, + "step": 20082 + }, + { + "epoch": 0.8496065656992977, + "grad_norm": 0.17858272790908813, + "learning_rate": 0.001, + "loss": 2.2117, + "step": 20083 + }, + { + "epoch": 0.8496488704628141, + "grad_norm": 0.2750988304615021, + "learning_rate": 0.001, + "loss": 2.6538, + "step": 20084 + }, + { + "epoch": 0.8496911752263305, + "grad_norm": 0.4688890874385834, + "learning_rate": 0.001, + "loss": 2.3822, + "step": 20085 + }, + { + "epoch": 0.8497334799898468, + "grad_norm": 0.43606501817703247, + "learning_rate": 0.001, + "loss": 3.1274, + "step": 20086 + }, + { + "epoch": 0.8497757847533632, + "grad_norm": 0.1552165150642395, + "learning_rate": 0.001, + "loss": 1.8759, + "step": 20087 + }, + { + "epoch": 0.8498180895168796, + "grad_norm": 0.14785443246364594, + "learning_rate": 0.001, + "loss": 2.4866, + "step": 20088 + }, + { + "epoch": 0.849860394280396, + "grad_norm": 0.14098820090293884, + "learning_rate": 0.001, + "loss": 1.97, + "step": 20089 + }, + { + "epoch": 0.8499026990439124, + "grad_norm": 0.14183367788791656, + "learning_rate": 0.001, + "loss": 1.7328, + "step": 20090 + }, + { + "epoch": 0.8499450038074288, + "grad_norm": 0.2905732989311218, + "learning_rate": 0.001, + "loss": 2.5444, + "step": 20091 + }, + { + "epoch": 0.8499873085709451, + "grad_norm": 6.777652263641357, + "learning_rate": 0.001, + "loss": 1.8801, + "step": 20092 + }, + { + "epoch": 0.8500296133344615, + "grad_norm": 0.1453995257616043, + "learning_rate": 0.001, + "loss": 1.6809, + "step": 20093 + }, + { + "epoch": 0.8500719180979779, + "grad_norm": 0.18101413547992706, + "learning_rate": 0.001, + "loss": 2.2516, + "step": 20094 + }, + { + "epoch": 0.8501142228614942, + "grad_norm": 7.015632152557373, + "learning_rate": 0.001, + "loss": 1.9355, + "step": 20095 + }, + { + "epoch": 0.8501565276250106, + "grad_norm": 0.21599209308624268, + "learning_rate": 0.001, + "loss": 1.6428, + "step": 20096 + }, + { + "epoch": 0.8501988323885269, + "grad_norm": 0.19514302909374237, + "learning_rate": 0.001, + "loss": 1.8462, + "step": 20097 + }, + { + "epoch": 0.8502411371520433, + "grad_norm": 0.3018536865711212, + "learning_rate": 0.001, + "loss": 2.9103, + "step": 20098 + }, + { + "epoch": 0.8502834419155597, + "grad_norm": 0.220729798078537, + "learning_rate": 0.001, + "loss": 2.7488, + "step": 20099 + }, + { + "epoch": 0.850325746679076, + "grad_norm": 0.9878958463668823, + "learning_rate": 0.001, + "loss": 1.8072, + "step": 20100 + }, + { + "epoch": 0.8503680514425924, + "grad_norm": 0.19620686769485474, + "learning_rate": 0.001, + "loss": 1.6413, + "step": 20101 + }, + { + "epoch": 0.8504103562061088, + "grad_norm": 0.4310097098350525, + "learning_rate": 0.001, + "loss": 1.7159, + "step": 20102 + }, + { + "epoch": 0.8504526609696251, + "grad_norm": 0.352130264043808, + "learning_rate": 0.001, + "loss": 2.5859, + "step": 20103 + }, + { + "epoch": 0.8504949657331415, + "grad_norm": 0.3044179081916809, + "learning_rate": 0.001, + "loss": 1.5859, + "step": 20104 + }, + { + "epoch": 0.850537270496658, + "grad_norm": 0.2029111385345459, + "learning_rate": 0.001, + "loss": 2.3172, + "step": 20105 + }, + { + "epoch": 0.8505795752601742, + "grad_norm": 1.908150315284729, + "learning_rate": 0.001, + "loss": 2.2062, + "step": 20106 + }, + { + "epoch": 0.8506218800236907, + "grad_norm": 0.14281626045703888, + "learning_rate": 0.001, + "loss": 1.5915, + "step": 20107 + }, + { + "epoch": 0.8506641847872071, + "grad_norm": 0.16866661608219147, + "learning_rate": 0.001, + "loss": 2.6013, + "step": 20108 + }, + { + "epoch": 0.8507064895507234, + "grad_norm": 0.20509420335292816, + "learning_rate": 0.001, + "loss": 2.5722, + "step": 20109 + }, + { + "epoch": 0.8507487943142398, + "grad_norm": 0.17332904040813446, + "learning_rate": 0.001, + "loss": 1.7543, + "step": 20110 + }, + { + "epoch": 0.8507910990777562, + "grad_norm": 0.1855224370956421, + "learning_rate": 0.001, + "loss": 2.1487, + "step": 20111 + }, + { + "epoch": 0.8508334038412725, + "grad_norm": 0.17093543708324432, + "learning_rate": 0.001, + "loss": 2.3077, + "step": 20112 + }, + { + "epoch": 0.8508757086047889, + "grad_norm": 10.589107513427734, + "learning_rate": 0.001, + "loss": 2.2467, + "step": 20113 + }, + { + "epoch": 0.8509180133683053, + "grad_norm": 0.18264487385749817, + "learning_rate": 0.001, + "loss": 2.2559, + "step": 20114 + }, + { + "epoch": 0.8509603181318216, + "grad_norm": 0.19633848965168, + "learning_rate": 0.001, + "loss": 2.1265, + "step": 20115 + }, + { + "epoch": 0.851002622895338, + "grad_norm": 0.1672111302614212, + "learning_rate": 0.001, + "loss": 2.9774, + "step": 20116 + }, + { + "epoch": 0.8510449276588544, + "grad_norm": 0.25148797035217285, + "learning_rate": 0.001, + "loss": 2.0833, + "step": 20117 + }, + { + "epoch": 0.8510872324223707, + "grad_norm": 0.22718261182308197, + "learning_rate": 0.001, + "loss": 3.6184, + "step": 20118 + }, + { + "epoch": 0.8511295371858871, + "grad_norm": 3.537106990814209, + "learning_rate": 0.001, + "loss": 2.5408, + "step": 20119 + }, + { + "epoch": 0.8511718419494035, + "grad_norm": 0.16952507197856903, + "learning_rate": 0.001, + "loss": 2.31, + "step": 20120 + }, + { + "epoch": 0.8512141467129198, + "grad_norm": 0.23125673830509186, + "learning_rate": 0.001, + "loss": 2.2545, + "step": 20121 + }, + { + "epoch": 0.8512564514764362, + "grad_norm": 0.19513604044914246, + "learning_rate": 0.001, + "loss": 3.295, + "step": 20122 + }, + { + "epoch": 0.8512987562399527, + "grad_norm": 0.20079001784324646, + "learning_rate": 0.001, + "loss": 1.5811, + "step": 20123 + }, + { + "epoch": 0.851341061003469, + "grad_norm": 0.22401869297027588, + "learning_rate": 0.001, + "loss": 1.9916, + "step": 20124 + }, + { + "epoch": 0.8513833657669854, + "grad_norm": 0.19173742830753326, + "learning_rate": 0.001, + "loss": 2.5162, + "step": 20125 + }, + { + "epoch": 0.8514256705305018, + "grad_norm": 5.623570919036865, + "learning_rate": 0.001, + "loss": 1.7293, + "step": 20126 + }, + { + "epoch": 0.8514679752940181, + "grad_norm": 0.2150885909795761, + "learning_rate": 0.001, + "loss": 2.5409, + "step": 20127 + }, + { + "epoch": 0.8515102800575345, + "grad_norm": 0.2799954116344452, + "learning_rate": 0.001, + "loss": 3.2083, + "step": 20128 + }, + { + "epoch": 0.8515525848210509, + "grad_norm": 0.19551977515220642, + "learning_rate": 0.001, + "loss": 2.7637, + "step": 20129 + }, + { + "epoch": 0.8515948895845672, + "grad_norm": 0.24339886009693146, + "learning_rate": 0.001, + "loss": 3.3285, + "step": 20130 + }, + { + "epoch": 0.8516371943480836, + "grad_norm": 0.25382792949676514, + "learning_rate": 0.001, + "loss": 1.9141, + "step": 20131 + }, + { + "epoch": 0.8516794991116, + "grad_norm": 0.17884451150894165, + "learning_rate": 0.001, + "loss": 2.4345, + "step": 20132 + }, + { + "epoch": 0.8517218038751163, + "grad_norm": 0.9406694769859314, + "learning_rate": 0.001, + "loss": 2.2004, + "step": 20133 + }, + { + "epoch": 0.8517641086386327, + "grad_norm": 0.874098002910614, + "learning_rate": 0.001, + "loss": 2.1619, + "step": 20134 + }, + { + "epoch": 0.8518064134021491, + "grad_norm": 0.17671692371368408, + "learning_rate": 0.001, + "loss": 2.7296, + "step": 20135 + }, + { + "epoch": 0.8518487181656654, + "grad_norm": 0.16355298459529877, + "learning_rate": 0.001, + "loss": 3.0569, + "step": 20136 + }, + { + "epoch": 0.8518910229291818, + "grad_norm": 0.24405351281166077, + "learning_rate": 0.001, + "loss": 1.9078, + "step": 20137 + }, + { + "epoch": 0.8519333276926983, + "grad_norm": 0.14516574144363403, + "learning_rate": 0.001, + "loss": 1.629, + "step": 20138 + }, + { + "epoch": 0.8519756324562145, + "grad_norm": 0.1458577811717987, + "learning_rate": 0.001, + "loss": 1.9627, + "step": 20139 + }, + { + "epoch": 0.852017937219731, + "grad_norm": 5.451334476470947, + "learning_rate": 0.001, + "loss": 2.0621, + "step": 20140 + }, + { + "epoch": 0.8520602419832474, + "grad_norm": 0.15075600147247314, + "learning_rate": 0.001, + "loss": 1.9024, + "step": 20141 + }, + { + "epoch": 0.8521025467467637, + "grad_norm": 0.16598421335220337, + "learning_rate": 0.001, + "loss": 1.7182, + "step": 20142 + }, + { + "epoch": 0.8521448515102801, + "grad_norm": 0.14806927740573883, + "learning_rate": 0.001, + "loss": 1.511, + "step": 20143 + }, + { + "epoch": 0.8521871562737964, + "grad_norm": 0.8803728818893433, + "learning_rate": 0.001, + "loss": 2.5187, + "step": 20144 + }, + { + "epoch": 0.8522294610373128, + "grad_norm": 0.14872333407402039, + "learning_rate": 0.001, + "loss": 2.2161, + "step": 20145 + }, + { + "epoch": 0.8522717658008292, + "grad_norm": 0.14645099639892578, + "learning_rate": 0.001, + "loss": 1.8971, + "step": 20146 + }, + { + "epoch": 0.8523140705643455, + "grad_norm": 0.16836774349212646, + "learning_rate": 0.001, + "loss": 2.1939, + "step": 20147 + }, + { + "epoch": 0.8523563753278619, + "grad_norm": 0.24831850826740265, + "learning_rate": 0.001, + "loss": 2.191, + "step": 20148 + }, + { + "epoch": 0.8523986800913783, + "grad_norm": 0.1501324474811554, + "learning_rate": 0.001, + "loss": 2.3453, + "step": 20149 + }, + { + "epoch": 0.8524409848548946, + "grad_norm": 0.15773944556713104, + "learning_rate": 0.001, + "loss": 2.0649, + "step": 20150 + }, + { + "epoch": 0.852483289618411, + "grad_norm": 0.13998405635356903, + "learning_rate": 0.001, + "loss": 2.1773, + "step": 20151 + }, + { + "epoch": 0.8525255943819274, + "grad_norm": 0.13668106496334076, + "learning_rate": 0.001, + "loss": 3.5096, + "step": 20152 + }, + { + "epoch": 0.8525678991454437, + "grad_norm": 2.312943935394287, + "learning_rate": 0.001, + "loss": 2.0723, + "step": 20153 + }, + { + "epoch": 0.8526102039089601, + "grad_norm": 0.12327218800783157, + "learning_rate": 0.001, + "loss": 1.5741, + "step": 20154 + }, + { + "epoch": 0.8526525086724766, + "grad_norm": 0.13412897288799286, + "learning_rate": 0.001, + "loss": 1.9401, + "step": 20155 + }, + { + "epoch": 0.8526948134359928, + "grad_norm": 0.8391098976135254, + "learning_rate": 0.001, + "loss": 2.408, + "step": 20156 + }, + { + "epoch": 0.8527371181995093, + "grad_norm": 0.14044369757175446, + "learning_rate": 0.001, + "loss": 2.7366, + "step": 20157 + }, + { + "epoch": 0.8527794229630257, + "grad_norm": 0.15021243691444397, + "learning_rate": 0.001, + "loss": 1.9738, + "step": 20158 + }, + { + "epoch": 0.852821727726542, + "grad_norm": 0.14377515017986298, + "learning_rate": 0.001, + "loss": 2.2906, + "step": 20159 + }, + { + "epoch": 0.8528640324900584, + "grad_norm": 0.17275890707969666, + "learning_rate": 0.001, + "loss": 1.9053, + "step": 20160 + }, + { + "epoch": 0.8529063372535748, + "grad_norm": 0.6215050220489502, + "learning_rate": 0.001, + "loss": 2.247, + "step": 20161 + }, + { + "epoch": 0.8529486420170911, + "grad_norm": 0.16696125268936157, + "learning_rate": 0.001, + "loss": 2.4084, + "step": 20162 + }, + { + "epoch": 0.8529909467806075, + "grad_norm": 1.0884296894073486, + "learning_rate": 0.001, + "loss": 2.9437, + "step": 20163 + }, + { + "epoch": 0.8530332515441239, + "grad_norm": 0.1563209444284439, + "learning_rate": 0.001, + "loss": 1.662, + "step": 20164 + }, + { + "epoch": 0.8530755563076402, + "grad_norm": 0.1429872363805771, + "learning_rate": 0.001, + "loss": 2.4649, + "step": 20165 + }, + { + "epoch": 0.8531178610711566, + "grad_norm": 2.498642921447754, + "learning_rate": 0.001, + "loss": 3.1658, + "step": 20166 + }, + { + "epoch": 0.853160165834673, + "grad_norm": 0.20113472640514374, + "learning_rate": 0.001, + "loss": 2.6068, + "step": 20167 + }, + { + "epoch": 0.8532024705981893, + "grad_norm": 0.14465253055095673, + "learning_rate": 0.001, + "loss": 2.3038, + "step": 20168 + }, + { + "epoch": 0.8532447753617057, + "grad_norm": 0.16391976177692413, + "learning_rate": 0.001, + "loss": 2.1344, + "step": 20169 + }, + { + "epoch": 0.8532870801252221, + "grad_norm": 0.16122137010097504, + "learning_rate": 0.001, + "loss": 3.441, + "step": 20170 + }, + { + "epoch": 0.8533293848887384, + "grad_norm": 0.7779273390769958, + "learning_rate": 0.001, + "loss": 1.4662, + "step": 20171 + }, + { + "epoch": 0.8533716896522549, + "grad_norm": 24.723020553588867, + "learning_rate": 0.001, + "loss": 1.8251, + "step": 20172 + }, + { + "epoch": 0.8534139944157713, + "grad_norm": 0.2329087257385254, + "learning_rate": 0.001, + "loss": 2.7639, + "step": 20173 + }, + { + "epoch": 0.8534562991792876, + "grad_norm": 0.14637401700019836, + "learning_rate": 0.001, + "loss": 1.5176, + "step": 20174 + }, + { + "epoch": 0.853498603942804, + "grad_norm": 0.16772529482841492, + "learning_rate": 0.001, + "loss": 2.0491, + "step": 20175 + }, + { + "epoch": 0.8535409087063204, + "grad_norm": 0.22313785552978516, + "learning_rate": 0.001, + "loss": 2.9042, + "step": 20176 + }, + { + "epoch": 0.8535832134698367, + "grad_norm": 0.16870731115341187, + "learning_rate": 0.001, + "loss": 2.4619, + "step": 20177 + }, + { + "epoch": 0.8536255182333531, + "grad_norm": 0.1313748061656952, + "learning_rate": 0.001, + "loss": 1.8931, + "step": 20178 + }, + { + "epoch": 0.8536678229968695, + "grad_norm": 0.1411811113357544, + "learning_rate": 0.001, + "loss": 1.7761, + "step": 20179 + }, + { + "epoch": 0.8537101277603858, + "grad_norm": 0.17591390013694763, + "learning_rate": 0.001, + "loss": 2.2475, + "step": 20180 + }, + { + "epoch": 0.8537524325239022, + "grad_norm": 0.12935340404510498, + "learning_rate": 0.001, + "loss": 2.8795, + "step": 20181 + }, + { + "epoch": 0.8537947372874186, + "grad_norm": 0.16716565191745758, + "learning_rate": 0.001, + "loss": 1.745, + "step": 20182 + }, + { + "epoch": 0.8538370420509349, + "grad_norm": 0.4727073907852173, + "learning_rate": 0.001, + "loss": 1.8856, + "step": 20183 + }, + { + "epoch": 0.8538793468144513, + "grad_norm": 0.13519716262817383, + "learning_rate": 0.001, + "loss": 1.5485, + "step": 20184 + }, + { + "epoch": 0.8539216515779677, + "grad_norm": 0.14166143536567688, + "learning_rate": 0.001, + "loss": 1.9562, + "step": 20185 + }, + { + "epoch": 0.853963956341484, + "grad_norm": 0.6362976431846619, + "learning_rate": 0.001, + "loss": 1.7597, + "step": 20186 + }, + { + "epoch": 0.8540062611050004, + "grad_norm": 0.1704770028591156, + "learning_rate": 0.001, + "loss": 2.1233, + "step": 20187 + }, + { + "epoch": 0.8540485658685167, + "grad_norm": 0.1493573784828186, + "learning_rate": 0.001, + "loss": 1.9775, + "step": 20188 + }, + { + "epoch": 0.8540908706320332, + "grad_norm": 0.14581403136253357, + "learning_rate": 0.001, + "loss": 1.6185, + "step": 20189 + }, + { + "epoch": 0.8541331753955496, + "grad_norm": 4.783538341522217, + "learning_rate": 0.001, + "loss": 2.7544, + "step": 20190 + }, + { + "epoch": 0.8541754801590659, + "grad_norm": 1.13120436668396, + "learning_rate": 0.001, + "loss": 2.2622, + "step": 20191 + }, + { + "epoch": 0.8542177849225823, + "grad_norm": 0.18143315613269806, + "learning_rate": 0.001, + "loss": 2.1182, + "step": 20192 + }, + { + "epoch": 0.8542600896860987, + "grad_norm": 0.37129276990890503, + "learning_rate": 0.001, + "loss": 2.6587, + "step": 20193 + }, + { + "epoch": 0.854302394449615, + "grad_norm": 2.5433599948883057, + "learning_rate": 0.001, + "loss": 2.0521, + "step": 20194 + }, + { + "epoch": 0.8543446992131314, + "grad_norm": 0.16446053981781006, + "learning_rate": 0.001, + "loss": 1.7504, + "step": 20195 + }, + { + "epoch": 0.8543870039766478, + "grad_norm": 0.16676220297813416, + "learning_rate": 0.001, + "loss": 1.6936, + "step": 20196 + }, + { + "epoch": 0.8544293087401641, + "grad_norm": 0.15930965542793274, + "learning_rate": 0.001, + "loss": 1.8836, + "step": 20197 + }, + { + "epoch": 0.8544716135036805, + "grad_norm": 0.16597270965576172, + "learning_rate": 0.001, + "loss": 1.7821, + "step": 20198 + }, + { + "epoch": 0.8545139182671969, + "grad_norm": 0.18009643256664276, + "learning_rate": 0.001, + "loss": 2.971, + "step": 20199 + }, + { + "epoch": 0.8545562230307132, + "grad_norm": 0.6921260952949524, + "learning_rate": 0.001, + "loss": 2.1276, + "step": 20200 + }, + { + "epoch": 0.8545985277942296, + "grad_norm": 0.1574309915304184, + "learning_rate": 0.001, + "loss": 2.1452, + "step": 20201 + }, + { + "epoch": 0.854640832557746, + "grad_norm": 0.16404499113559723, + "learning_rate": 0.001, + "loss": 2.189, + "step": 20202 + }, + { + "epoch": 0.8546831373212623, + "grad_norm": 0.13803629577159882, + "learning_rate": 0.001, + "loss": 1.8653, + "step": 20203 + }, + { + "epoch": 0.8547254420847787, + "grad_norm": 0.21450106799602509, + "learning_rate": 0.001, + "loss": 2.7563, + "step": 20204 + }, + { + "epoch": 0.8547677468482952, + "grad_norm": 0.1293824017047882, + "learning_rate": 0.001, + "loss": 1.6771, + "step": 20205 + }, + { + "epoch": 0.8548100516118115, + "grad_norm": 0.14951960742473602, + "learning_rate": 0.001, + "loss": 2.6877, + "step": 20206 + }, + { + "epoch": 0.8548523563753279, + "grad_norm": 1.9558665752410889, + "learning_rate": 0.001, + "loss": 1.7355, + "step": 20207 + }, + { + "epoch": 0.8548946611388443, + "grad_norm": 0.16537365317344666, + "learning_rate": 0.001, + "loss": 1.9014, + "step": 20208 + }, + { + "epoch": 0.8549369659023606, + "grad_norm": 0.15604209899902344, + "learning_rate": 0.001, + "loss": 2.6344, + "step": 20209 + }, + { + "epoch": 0.854979270665877, + "grad_norm": 0.18542635440826416, + "learning_rate": 0.001, + "loss": 2.0202, + "step": 20210 + }, + { + "epoch": 0.8550215754293934, + "grad_norm": 0.4348926246166229, + "learning_rate": 0.001, + "loss": 2.0383, + "step": 20211 + }, + { + "epoch": 0.8550638801929097, + "grad_norm": 0.24259275197982788, + "learning_rate": 0.001, + "loss": 2.2676, + "step": 20212 + }, + { + "epoch": 0.8551061849564261, + "grad_norm": 0.1625998318195343, + "learning_rate": 0.001, + "loss": 1.8147, + "step": 20213 + }, + { + "epoch": 0.8551484897199425, + "grad_norm": 0.1525462120771408, + "learning_rate": 0.001, + "loss": 2.0379, + "step": 20214 + }, + { + "epoch": 0.8551907944834588, + "grad_norm": 0.1710134744644165, + "learning_rate": 0.001, + "loss": 2.2464, + "step": 20215 + }, + { + "epoch": 0.8552330992469752, + "grad_norm": 0.16427335143089294, + "learning_rate": 0.001, + "loss": 1.8928, + "step": 20216 + }, + { + "epoch": 0.8552754040104916, + "grad_norm": 0.1514868587255478, + "learning_rate": 0.001, + "loss": 2.0247, + "step": 20217 + }, + { + "epoch": 0.8553177087740079, + "grad_norm": 0.16171973943710327, + "learning_rate": 0.001, + "loss": 1.8759, + "step": 20218 + }, + { + "epoch": 0.8553600135375243, + "grad_norm": 1.2077202796936035, + "learning_rate": 0.001, + "loss": 2.7401, + "step": 20219 + }, + { + "epoch": 0.8554023183010407, + "grad_norm": 0.2370886206626892, + "learning_rate": 0.001, + "loss": 3.1154, + "step": 20220 + }, + { + "epoch": 0.855444623064557, + "grad_norm": 0.43571701645851135, + "learning_rate": 0.001, + "loss": 1.787, + "step": 20221 + }, + { + "epoch": 0.8554869278280735, + "grad_norm": 0.358680784702301, + "learning_rate": 0.001, + "loss": 2.0282, + "step": 20222 + }, + { + "epoch": 0.8555292325915899, + "grad_norm": 0.18976160883903503, + "learning_rate": 0.001, + "loss": 2.3883, + "step": 20223 + }, + { + "epoch": 0.8555715373551062, + "grad_norm": 0.1478756070137024, + "learning_rate": 0.001, + "loss": 1.8335, + "step": 20224 + }, + { + "epoch": 0.8556138421186226, + "grad_norm": 0.31914398074150085, + "learning_rate": 0.001, + "loss": 2.1218, + "step": 20225 + }, + { + "epoch": 0.855656146882139, + "grad_norm": 0.16509748995304108, + "learning_rate": 0.001, + "loss": 2.9447, + "step": 20226 + }, + { + "epoch": 0.8556984516456553, + "grad_norm": 0.20428349077701569, + "learning_rate": 0.001, + "loss": 1.7563, + "step": 20227 + }, + { + "epoch": 0.8557407564091717, + "grad_norm": 0.15375010669231415, + "learning_rate": 0.001, + "loss": 2.957, + "step": 20228 + }, + { + "epoch": 0.8557830611726881, + "grad_norm": 0.1550106555223465, + "learning_rate": 0.001, + "loss": 2.0625, + "step": 20229 + }, + { + "epoch": 0.8558253659362044, + "grad_norm": 0.7794061303138733, + "learning_rate": 0.001, + "loss": 2.9718, + "step": 20230 + }, + { + "epoch": 0.8558676706997208, + "grad_norm": 0.1645902544260025, + "learning_rate": 0.001, + "loss": 1.8263, + "step": 20231 + }, + { + "epoch": 0.8559099754632372, + "grad_norm": 0.14967839419841766, + "learning_rate": 0.001, + "loss": 1.828, + "step": 20232 + }, + { + "epoch": 0.8559522802267535, + "grad_norm": 0.3559803366661072, + "learning_rate": 0.001, + "loss": 2.069, + "step": 20233 + }, + { + "epoch": 0.8559945849902699, + "grad_norm": 0.1499088853597641, + "learning_rate": 0.001, + "loss": 1.9467, + "step": 20234 + }, + { + "epoch": 0.8560368897537862, + "grad_norm": 0.15247589349746704, + "learning_rate": 0.001, + "loss": 2.4844, + "step": 20235 + }, + { + "epoch": 0.8560791945173026, + "grad_norm": 0.16640542447566986, + "learning_rate": 0.001, + "loss": 2.222, + "step": 20236 + }, + { + "epoch": 0.856121499280819, + "grad_norm": 0.18216025829315186, + "learning_rate": 0.001, + "loss": 2.7879, + "step": 20237 + }, + { + "epoch": 0.8561638040443353, + "grad_norm": 0.2965571880340576, + "learning_rate": 0.001, + "loss": 2.3667, + "step": 20238 + }, + { + "epoch": 0.8562061088078518, + "grad_norm": 193.41860961914062, + "learning_rate": 0.001, + "loss": 1.4623, + "step": 20239 + }, + { + "epoch": 0.8562484135713682, + "grad_norm": 0.18015336990356445, + "learning_rate": 0.001, + "loss": 1.3319, + "step": 20240 + }, + { + "epoch": 0.8562907183348845, + "grad_norm": 0.1855952888727188, + "learning_rate": 0.001, + "loss": 2.0279, + "step": 20241 + }, + { + "epoch": 0.8563330230984009, + "grad_norm": 0.14489391446113586, + "learning_rate": 0.001, + "loss": 1.9138, + "step": 20242 + }, + { + "epoch": 0.8563753278619173, + "grad_norm": 0.19585612416267395, + "learning_rate": 0.001, + "loss": 1.921, + "step": 20243 + }, + { + "epoch": 0.8564176326254336, + "grad_norm": 0.1694945991039276, + "learning_rate": 0.001, + "loss": 1.8727, + "step": 20244 + }, + { + "epoch": 0.85645993738895, + "grad_norm": 0.2455274611711502, + "learning_rate": 0.001, + "loss": 2.0229, + "step": 20245 + }, + { + "epoch": 0.8565022421524664, + "grad_norm": 0.5265862941741943, + "learning_rate": 0.001, + "loss": 2.5353, + "step": 20246 + }, + { + "epoch": 0.8565445469159827, + "grad_norm": 0.7586323022842407, + "learning_rate": 0.001, + "loss": 2.2412, + "step": 20247 + }, + { + "epoch": 0.8565868516794991, + "grad_norm": 0.1906781643629074, + "learning_rate": 0.001, + "loss": 2.3801, + "step": 20248 + }, + { + "epoch": 0.8566291564430155, + "grad_norm": 0.18281270563602448, + "learning_rate": 0.001, + "loss": 1.8586, + "step": 20249 + }, + { + "epoch": 0.8566714612065318, + "grad_norm": 0.2404775768518448, + "learning_rate": 0.001, + "loss": 3.2444, + "step": 20250 + }, + { + "epoch": 0.8567137659700482, + "grad_norm": 0.1609775424003601, + "learning_rate": 0.001, + "loss": 2.8216, + "step": 20251 + }, + { + "epoch": 0.8567560707335646, + "grad_norm": 2.4240500926971436, + "learning_rate": 0.001, + "loss": 1.9752, + "step": 20252 + }, + { + "epoch": 0.8567983754970809, + "grad_norm": 0.2600942552089691, + "learning_rate": 0.001, + "loss": 2.9503, + "step": 20253 + }, + { + "epoch": 0.8568406802605973, + "grad_norm": 0.19915421307086945, + "learning_rate": 0.001, + "loss": 2.0628, + "step": 20254 + }, + { + "epoch": 0.8568829850241138, + "grad_norm": 0.16194456815719604, + "learning_rate": 0.001, + "loss": 2.0255, + "step": 20255 + }, + { + "epoch": 0.85692528978763, + "grad_norm": 0.15854515135288239, + "learning_rate": 0.001, + "loss": 2.5253, + "step": 20256 + }, + { + "epoch": 0.8569675945511465, + "grad_norm": 0.13806360960006714, + "learning_rate": 0.001, + "loss": 1.7646, + "step": 20257 + }, + { + "epoch": 0.8570098993146629, + "grad_norm": 0.47766992449760437, + "learning_rate": 0.001, + "loss": 2.2224, + "step": 20258 + }, + { + "epoch": 0.8570522040781792, + "grad_norm": 0.168570414185524, + "learning_rate": 0.001, + "loss": 3.5105, + "step": 20259 + }, + { + "epoch": 0.8570945088416956, + "grad_norm": 0.1976046860218048, + "learning_rate": 0.001, + "loss": 2.3813, + "step": 20260 + }, + { + "epoch": 0.857136813605212, + "grad_norm": 0.21734581887722015, + "learning_rate": 0.001, + "loss": 2.301, + "step": 20261 + }, + { + "epoch": 0.8571791183687283, + "grad_norm": 0.1577748954296112, + "learning_rate": 0.001, + "loss": 1.7362, + "step": 20262 + }, + { + "epoch": 0.8572214231322447, + "grad_norm": 0.1552111953496933, + "learning_rate": 0.001, + "loss": 2.7548, + "step": 20263 + }, + { + "epoch": 0.8572637278957611, + "grad_norm": 0.15418484807014465, + "learning_rate": 0.001, + "loss": 2.4548, + "step": 20264 + }, + { + "epoch": 0.8573060326592774, + "grad_norm": 0.2172391414642334, + "learning_rate": 0.001, + "loss": 2.6717, + "step": 20265 + }, + { + "epoch": 0.8573483374227938, + "grad_norm": 0.15989260375499725, + "learning_rate": 0.001, + "loss": 1.8661, + "step": 20266 + }, + { + "epoch": 0.8573906421863102, + "grad_norm": 2.102097988128662, + "learning_rate": 0.001, + "loss": 1.6953, + "step": 20267 + }, + { + "epoch": 0.8574329469498265, + "grad_norm": 0.13255327939987183, + "learning_rate": 0.001, + "loss": 2.1407, + "step": 20268 + }, + { + "epoch": 0.8574752517133429, + "grad_norm": 0.1770564168691635, + "learning_rate": 0.001, + "loss": 2.2512, + "step": 20269 + }, + { + "epoch": 0.8575175564768593, + "grad_norm": 0.14766398072242737, + "learning_rate": 0.001, + "loss": 1.7971, + "step": 20270 + }, + { + "epoch": 0.8575598612403756, + "grad_norm": 0.22131910920143127, + "learning_rate": 0.001, + "loss": 2.2181, + "step": 20271 + }, + { + "epoch": 0.857602166003892, + "grad_norm": 0.3644495904445648, + "learning_rate": 0.001, + "loss": 2.25, + "step": 20272 + }, + { + "epoch": 0.8576444707674085, + "grad_norm": 0.14953762292861938, + "learning_rate": 0.001, + "loss": 2.2675, + "step": 20273 + }, + { + "epoch": 0.8576867755309248, + "grad_norm": 0.16283674538135529, + "learning_rate": 0.001, + "loss": 1.8331, + "step": 20274 + }, + { + "epoch": 0.8577290802944412, + "grad_norm": 0.1207035556435585, + "learning_rate": 0.001, + "loss": 2.1584, + "step": 20275 + }, + { + "epoch": 0.8577713850579576, + "grad_norm": 0.12849994003772736, + "learning_rate": 0.001, + "loss": 2.2343, + "step": 20276 + }, + { + "epoch": 0.8578136898214739, + "grad_norm": 2.081145763397217, + "learning_rate": 0.001, + "loss": 1.8637, + "step": 20277 + }, + { + "epoch": 0.8578559945849903, + "grad_norm": 0.1410299390554428, + "learning_rate": 0.001, + "loss": 2.8566, + "step": 20278 + }, + { + "epoch": 0.8578982993485066, + "grad_norm": 0.13738563656806946, + "learning_rate": 0.001, + "loss": 2.2659, + "step": 20279 + }, + { + "epoch": 0.857940604112023, + "grad_norm": 0.13959841430187225, + "learning_rate": 0.001, + "loss": 2.9711, + "step": 20280 + }, + { + "epoch": 0.8579829088755394, + "grad_norm": 0.1706565022468567, + "learning_rate": 0.001, + "loss": 2.1167, + "step": 20281 + }, + { + "epoch": 0.8580252136390557, + "grad_norm": 0.14428362250328064, + "learning_rate": 0.001, + "loss": 2.3054, + "step": 20282 + }, + { + "epoch": 0.8580675184025721, + "grad_norm": 0.1867976188659668, + "learning_rate": 0.001, + "loss": 1.8198, + "step": 20283 + }, + { + "epoch": 0.8581098231660885, + "grad_norm": 0.1552973836660385, + "learning_rate": 0.001, + "loss": 2.5998, + "step": 20284 + }, + { + "epoch": 0.8581521279296048, + "grad_norm": 0.1912197321653366, + "learning_rate": 0.001, + "loss": 1.9071, + "step": 20285 + }, + { + "epoch": 0.8581944326931212, + "grad_norm": 0.18686088919639587, + "learning_rate": 0.001, + "loss": 1.5572, + "step": 20286 + }, + { + "epoch": 0.8582367374566376, + "grad_norm": 0.21159033477306366, + "learning_rate": 0.001, + "loss": 2.1987, + "step": 20287 + }, + { + "epoch": 0.8582790422201539, + "grad_norm": 0.16880102455615997, + "learning_rate": 0.001, + "loss": 2.0401, + "step": 20288 + }, + { + "epoch": 0.8583213469836704, + "grad_norm": 0.16665174067020416, + "learning_rate": 0.001, + "loss": 1.763, + "step": 20289 + }, + { + "epoch": 0.8583636517471868, + "grad_norm": 0.5530775189399719, + "learning_rate": 0.001, + "loss": 2.763, + "step": 20290 + }, + { + "epoch": 0.8584059565107031, + "grad_norm": 0.1329568773508072, + "learning_rate": 0.001, + "loss": 1.8301, + "step": 20291 + }, + { + "epoch": 0.8584482612742195, + "grad_norm": 22.826807022094727, + "learning_rate": 0.001, + "loss": 2.5194, + "step": 20292 + }, + { + "epoch": 0.8584905660377359, + "grad_norm": 1.763818621635437, + "learning_rate": 0.001, + "loss": 3.4182, + "step": 20293 + }, + { + "epoch": 0.8585328708012522, + "grad_norm": 0.1884799599647522, + "learning_rate": 0.001, + "loss": 1.8589, + "step": 20294 + }, + { + "epoch": 0.8585751755647686, + "grad_norm": 0.1700194627046585, + "learning_rate": 0.001, + "loss": 2.0548, + "step": 20295 + }, + { + "epoch": 0.858617480328285, + "grad_norm": 0.17256033420562744, + "learning_rate": 0.001, + "loss": 2.0633, + "step": 20296 + }, + { + "epoch": 0.8586597850918013, + "grad_norm": 0.1553211808204651, + "learning_rate": 0.001, + "loss": 2.8999, + "step": 20297 + }, + { + "epoch": 0.8587020898553177, + "grad_norm": 0.17085745930671692, + "learning_rate": 0.001, + "loss": 2.191, + "step": 20298 + }, + { + "epoch": 0.8587443946188341, + "grad_norm": 0.18587633967399597, + "learning_rate": 0.001, + "loss": 2.1725, + "step": 20299 + }, + { + "epoch": 0.8587866993823504, + "grad_norm": 0.1755942404270172, + "learning_rate": 0.001, + "loss": 2.9481, + "step": 20300 + }, + { + "epoch": 0.8588290041458668, + "grad_norm": 0.6229716539382935, + "learning_rate": 0.001, + "loss": 4.1505, + "step": 20301 + }, + { + "epoch": 0.8588713089093832, + "grad_norm": 0.14304950833320618, + "learning_rate": 0.001, + "loss": 1.9846, + "step": 20302 + }, + { + "epoch": 0.8589136136728995, + "grad_norm": 0.1368221938610077, + "learning_rate": 0.001, + "loss": 2.0029, + "step": 20303 + }, + { + "epoch": 0.8589559184364159, + "grad_norm": 0.1675972193479538, + "learning_rate": 0.001, + "loss": 1.9319, + "step": 20304 + }, + { + "epoch": 0.8589982231999324, + "grad_norm": 0.1587626338005066, + "learning_rate": 0.001, + "loss": 1.855, + "step": 20305 + }, + { + "epoch": 0.8590405279634487, + "grad_norm": 0.3222064673900604, + "learning_rate": 0.001, + "loss": 3.0261, + "step": 20306 + }, + { + "epoch": 0.8590828327269651, + "grad_norm": 0.13862450420856476, + "learning_rate": 0.001, + "loss": 2.2649, + "step": 20307 + }, + { + "epoch": 0.8591251374904815, + "grad_norm": 0.20226208865642548, + "learning_rate": 0.001, + "loss": 2.0139, + "step": 20308 + }, + { + "epoch": 0.8591674422539978, + "grad_norm": 0.16037043929100037, + "learning_rate": 0.001, + "loss": 2.5581, + "step": 20309 + }, + { + "epoch": 0.8592097470175142, + "grad_norm": 0.17345164716243744, + "learning_rate": 0.001, + "loss": 1.9058, + "step": 20310 + }, + { + "epoch": 0.8592520517810306, + "grad_norm": 0.4555630385875702, + "learning_rate": 0.001, + "loss": 3.2089, + "step": 20311 + }, + { + "epoch": 0.8592943565445469, + "grad_norm": 0.6636331677436829, + "learning_rate": 0.001, + "loss": 1.7678, + "step": 20312 + }, + { + "epoch": 0.8593366613080633, + "grad_norm": 1.500398874282837, + "learning_rate": 0.001, + "loss": 1.8767, + "step": 20313 + }, + { + "epoch": 0.8593789660715797, + "grad_norm": 0.11064822226762772, + "learning_rate": 0.001, + "loss": 2.8582, + "step": 20314 + }, + { + "epoch": 0.859421270835096, + "grad_norm": 0.2151585817337036, + "learning_rate": 0.001, + "loss": 1.8232, + "step": 20315 + }, + { + "epoch": 0.8594635755986124, + "grad_norm": 0.14713852107524872, + "learning_rate": 0.001, + "loss": 2.7334, + "step": 20316 + }, + { + "epoch": 0.8595058803621288, + "grad_norm": 0.13766881823539734, + "learning_rate": 0.001, + "loss": 1.6997, + "step": 20317 + }, + { + "epoch": 0.8595481851256451, + "grad_norm": 0.18348439037799835, + "learning_rate": 0.001, + "loss": 2.0182, + "step": 20318 + }, + { + "epoch": 0.8595904898891615, + "grad_norm": 0.12168294191360474, + "learning_rate": 0.001, + "loss": 3.2898, + "step": 20319 + }, + { + "epoch": 0.859632794652678, + "grad_norm": 0.12945544719696045, + "learning_rate": 0.001, + "loss": 1.9306, + "step": 20320 + }, + { + "epoch": 0.8596750994161942, + "grad_norm": 0.13580989837646484, + "learning_rate": 0.001, + "loss": 1.7863, + "step": 20321 + }, + { + "epoch": 0.8597174041797107, + "grad_norm": 0.1241777166724205, + "learning_rate": 0.001, + "loss": 2.0355, + "step": 20322 + }, + { + "epoch": 0.859759708943227, + "grad_norm": 0.15077315270900726, + "learning_rate": 0.001, + "loss": 2.9516, + "step": 20323 + }, + { + "epoch": 0.8598020137067434, + "grad_norm": 0.1805119663476944, + "learning_rate": 0.001, + "loss": 2.0972, + "step": 20324 + }, + { + "epoch": 0.8598443184702598, + "grad_norm": 0.24641673266887665, + "learning_rate": 0.001, + "loss": 2.3205, + "step": 20325 + }, + { + "epoch": 0.8598866232337761, + "grad_norm": 0.15835225582122803, + "learning_rate": 0.001, + "loss": 2.644, + "step": 20326 + }, + { + "epoch": 0.8599289279972925, + "grad_norm": 0.1202114075422287, + "learning_rate": 0.001, + "loss": 1.4397, + "step": 20327 + }, + { + "epoch": 0.8599712327608089, + "grad_norm": 0.13486960530281067, + "learning_rate": 0.001, + "loss": 1.5115, + "step": 20328 + }, + { + "epoch": 0.8600135375243252, + "grad_norm": 0.14606483280658722, + "learning_rate": 0.001, + "loss": 1.9028, + "step": 20329 + }, + { + "epoch": 0.8600558422878416, + "grad_norm": 0.14410161972045898, + "learning_rate": 0.001, + "loss": 2.147, + "step": 20330 + }, + { + "epoch": 0.860098147051358, + "grad_norm": 0.14581367373466492, + "learning_rate": 0.001, + "loss": 2.5193, + "step": 20331 + }, + { + "epoch": 0.8601404518148743, + "grad_norm": 1.6199474334716797, + "learning_rate": 0.001, + "loss": 1.8664, + "step": 20332 + }, + { + "epoch": 0.8601827565783907, + "grad_norm": 0.14424443244934082, + "learning_rate": 0.001, + "loss": 2.7133, + "step": 20333 + }, + { + "epoch": 0.8602250613419071, + "grad_norm": 2.242705821990967, + "learning_rate": 0.001, + "loss": 1.5209, + "step": 20334 + }, + { + "epoch": 0.8602673661054234, + "grad_norm": 0.2191903442144394, + "learning_rate": 0.001, + "loss": 1.4768, + "step": 20335 + }, + { + "epoch": 0.8603096708689398, + "grad_norm": 0.1648568958044052, + "learning_rate": 0.001, + "loss": 1.5056, + "step": 20336 + }, + { + "epoch": 0.8603519756324562, + "grad_norm": 0.16343936324119568, + "learning_rate": 0.001, + "loss": 1.8221, + "step": 20337 + }, + { + "epoch": 0.8603942803959725, + "grad_norm": 0.20765286684036255, + "learning_rate": 0.001, + "loss": 1.8847, + "step": 20338 + }, + { + "epoch": 0.860436585159489, + "grad_norm": 0.16252531111240387, + "learning_rate": 0.001, + "loss": 1.9047, + "step": 20339 + }, + { + "epoch": 0.8604788899230054, + "grad_norm": 0.4384823739528656, + "learning_rate": 0.001, + "loss": 2.4298, + "step": 20340 + }, + { + "epoch": 0.8605211946865217, + "grad_norm": 0.1340947151184082, + "learning_rate": 0.001, + "loss": 2.4211, + "step": 20341 + }, + { + "epoch": 0.8605634994500381, + "grad_norm": 4.556620121002197, + "learning_rate": 0.001, + "loss": 1.8013, + "step": 20342 + }, + { + "epoch": 0.8606058042135545, + "grad_norm": 0.2241327315568924, + "learning_rate": 0.001, + "loss": 2.6166, + "step": 20343 + }, + { + "epoch": 0.8606481089770708, + "grad_norm": 0.18115611374378204, + "learning_rate": 0.001, + "loss": 2.5444, + "step": 20344 + }, + { + "epoch": 0.8606904137405872, + "grad_norm": 0.15894751250743866, + "learning_rate": 0.001, + "loss": 2.2565, + "step": 20345 + }, + { + "epoch": 0.8607327185041036, + "grad_norm": 0.9839511513710022, + "learning_rate": 0.001, + "loss": 1.7355, + "step": 20346 + }, + { + "epoch": 0.8607750232676199, + "grad_norm": 0.14979402720928192, + "learning_rate": 0.001, + "loss": 2.9431, + "step": 20347 + }, + { + "epoch": 0.8608173280311363, + "grad_norm": 0.22843679785728455, + "learning_rate": 0.001, + "loss": 2.2349, + "step": 20348 + }, + { + "epoch": 0.8608596327946527, + "grad_norm": 0.26081418991088867, + "learning_rate": 0.001, + "loss": 2.255, + "step": 20349 + }, + { + "epoch": 0.860901937558169, + "grad_norm": 0.20167173445224762, + "learning_rate": 0.001, + "loss": 3.1905, + "step": 20350 + }, + { + "epoch": 0.8609442423216854, + "grad_norm": 0.22882631421089172, + "learning_rate": 0.001, + "loss": 2.6694, + "step": 20351 + }, + { + "epoch": 0.8609865470852018, + "grad_norm": 0.23098324239253998, + "learning_rate": 0.001, + "loss": 1.9321, + "step": 20352 + }, + { + "epoch": 0.8610288518487181, + "grad_norm": 0.18798838555812836, + "learning_rate": 0.001, + "loss": 2.3873, + "step": 20353 + }, + { + "epoch": 0.8610711566122345, + "grad_norm": 0.20491915941238403, + "learning_rate": 0.001, + "loss": 1.764, + "step": 20354 + }, + { + "epoch": 0.861113461375751, + "grad_norm": 0.2087901085615158, + "learning_rate": 0.001, + "loss": 2.1135, + "step": 20355 + }, + { + "epoch": 0.8611557661392673, + "grad_norm": 0.2142881155014038, + "learning_rate": 0.001, + "loss": 2.0897, + "step": 20356 + }, + { + "epoch": 0.8611980709027837, + "grad_norm": 2.1609714031219482, + "learning_rate": 0.001, + "loss": 4.1002, + "step": 20357 + }, + { + "epoch": 0.8612403756663001, + "grad_norm": 0.21619194746017456, + "learning_rate": 0.001, + "loss": 2.0694, + "step": 20358 + }, + { + "epoch": 0.8612826804298164, + "grad_norm": 738.5827026367188, + "learning_rate": 0.001, + "loss": 3.4232, + "step": 20359 + }, + { + "epoch": 0.8613249851933328, + "grad_norm": 0.20825865864753723, + "learning_rate": 0.001, + "loss": 2.4266, + "step": 20360 + }, + { + "epoch": 0.8613672899568492, + "grad_norm": 0.41342079639434814, + "learning_rate": 0.001, + "loss": 3.1381, + "step": 20361 + }, + { + "epoch": 0.8614095947203655, + "grad_norm": 0.2036539614200592, + "learning_rate": 0.001, + "loss": 2.0657, + "step": 20362 + }, + { + "epoch": 0.8614518994838819, + "grad_norm": 0.9940122365951538, + "learning_rate": 0.001, + "loss": 2.2586, + "step": 20363 + }, + { + "epoch": 0.8614942042473983, + "grad_norm": 0.19654519855976105, + "learning_rate": 0.001, + "loss": 1.582, + "step": 20364 + }, + { + "epoch": 0.8615365090109146, + "grad_norm": 0.1715071052312851, + "learning_rate": 0.001, + "loss": 1.2585, + "step": 20365 + }, + { + "epoch": 0.861578813774431, + "grad_norm": 0.14859987795352936, + "learning_rate": 0.001, + "loss": 2.478, + "step": 20366 + }, + { + "epoch": 0.8616211185379474, + "grad_norm": 0.18359853327274323, + "learning_rate": 0.001, + "loss": 1.8721, + "step": 20367 + }, + { + "epoch": 0.8616634233014637, + "grad_norm": 0.24993368983268738, + "learning_rate": 0.001, + "loss": 2.4343, + "step": 20368 + }, + { + "epoch": 0.8617057280649801, + "grad_norm": 0.2071733921766281, + "learning_rate": 0.001, + "loss": 2.7097, + "step": 20369 + }, + { + "epoch": 0.8617480328284964, + "grad_norm": 2.571146249771118, + "learning_rate": 0.001, + "loss": 2.0202, + "step": 20370 + }, + { + "epoch": 0.8617903375920128, + "grad_norm": 0.5075177550315857, + "learning_rate": 0.001, + "loss": 2.1632, + "step": 20371 + }, + { + "epoch": 0.8618326423555293, + "grad_norm": 0.18654043972492218, + "learning_rate": 0.001, + "loss": 2.4338, + "step": 20372 + }, + { + "epoch": 0.8618749471190456, + "grad_norm": 0.2202577441930771, + "learning_rate": 0.001, + "loss": 3.0875, + "step": 20373 + }, + { + "epoch": 0.861917251882562, + "grad_norm": 3.0814690589904785, + "learning_rate": 0.001, + "loss": 2.5025, + "step": 20374 + }, + { + "epoch": 0.8619595566460784, + "grad_norm": 1.5214474201202393, + "learning_rate": 0.001, + "loss": 3.0009, + "step": 20375 + }, + { + "epoch": 0.8620018614095947, + "grad_norm": 0.14105013012886047, + "learning_rate": 0.001, + "loss": 1.2828, + "step": 20376 + }, + { + "epoch": 0.8620441661731111, + "grad_norm": 0.16624034941196442, + "learning_rate": 0.001, + "loss": 2.2575, + "step": 20377 + }, + { + "epoch": 0.8620864709366275, + "grad_norm": 0.14870575070381165, + "learning_rate": 0.001, + "loss": 2.1555, + "step": 20378 + }, + { + "epoch": 0.8621287757001438, + "grad_norm": 0.17573565244674683, + "learning_rate": 0.001, + "loss": 1.6783, + "step": 20379 + }, + { + "epoch": 0.8621710804636602, + "grad_norm": 0.3848056197166443, + "learning_rate": 0.001, + "loss": 2.9054, + "step": 20380 + }, + { + "epoch": 0.8622133852271766, + "grad_norm": 4.15639591217041, + "learning_rate": 0.001, + "loss": 3.0232, + "step": 20381 + }, + { + "epoch": 0.8622556899906929, + "grad_norm": 0.9076377749443054, + "learning_rate": 0.001, + "loss": 2.6062, + "step": 20382 + }, + { + "epoch": 0.8622979947542093, + "grad_norm": 0.19111178815364838, + "learning_rate": 0.001, + "loss": 2.8008, + "step": 20383 + }, + { + "epoch": 0.8623402995177257, + "grad_norm": 0.15568110346794128, + "learning_rate": 0.001, + "loss": 1.6041, + "step": 20384 + }, + { + "epoch": 0.862382604281242, + "grad_norm": 0.15372473001480103, + "learning_rate": 0.001, + "loss": 2.1381, + "step": 20385 + }, + { + "epoch": 0.8624249090447584, + "grad_norm": 6.081607341766357, + "learning_rate": 0.001, + "loss": 2.3537, + "step": 20386 + }, + { + "epoch": 0.8624672138082748, + "grad_norm": 0.15494304895401, + "learning_rate": 0.001, + "loss": 2.4234, + "step": 20387 + }, + { + "epoch": 0.8625095185717911, + "grad_norm": 0.2465604841709137, + "learning_rate": 0.001, + "loss": 2.8873, + "step": 20388 + }, + { + "epoch": 0.8625518233353076, + "grad_norm": 0.16855676472187042, + "learning_rate": 0.001, + "loss": 2.7802, + "step": 20389 + }, + { + "epoch": 0.862594128098824, + "grad_norm": 0.1656714677810669, + "learning_rate": 0.001, + "loss": 2.3807, + "step": 20390 + }, + { + "epoch": 0.8626364328623403, + "grad_norm": 0.38702452182769775, + "learning_rate": 0.001, + "loss": 2.1454, + "step": 20391 + }, + { + "epoch": 0.8626787376258567, + "grad_norm": 0.17900040745735168, + "learning_rate": 0.001, + "loss": 3.0575, + "step": 20392 + }, + { + "epoch": 0.8627210423893731, + "grad_norm": 0.17749975621700287, + "learning_rate": 0.001, + "loss": 1.8404, + "step": 20393 + }, + { + "epoch": 0.8627633471528894, + "grad_norm": 0.17908912897109985, + "learning_rate": 0.001, + "loss": 1.9364, + "step": 20394 + }, + { + "epoch": 0.8628056519164058, + "grad_norm": 0.7642483115196228, + "learning_rate": 0.001, + "loss": 3.3047, + "step": 20395 + }, + { + "epoch": 0.8628479566799222, + "grad_norm": 0.2033914178609848, + "learning_rate": 0.001, + "loss": 2.6474, + "step": 20396 + }, + { + "epoch": 0.8628902614434385, + "grad_norm": 0.7764859199523926, + "learning_rate": 0.001, + "loss": 2.9559, + "step": 20397 + }, + { + "epoch": 0.8629325662069549, + "grad_norm": 0.1633528769016266, + "learning_rate": 0.001, + "loss": 2.0213, + "step": 20398 + }, + { + "epoch": 0.8629748709704713, + "grad_norm": 0.16931723058223724, + "learning_rate": 0.001, + "loss": 1.7263, + "step": 20399 + }, + { + "epoch": 0.8630171757339876, + "grad_norm": 0.1585397720336914, + "learning_rate": 0.001, + "loss": 2.8618, + "step": 20400 + }, + { + "epoch": 0.863059480497504, + "grad_norm": 0.1430475264787674, + "learning_rate": 0.001, + "loss": 1.8755, + "step": 20401 + }, + { + "epoch": 0.8631017852610204, + "grad_norm": 0.12753239274024963, + "learning_rate": 0.001, + "loss": 2.3516, + "step": 20402 + }, + { + "epoch": 0.8631440900245367, + "grad_norm": 0.1464456021785736, + "learning_rate": 0.001, + "loss": 2.6126, + "step": 20403 + }, + { + "epoch": 0.8631863947880531, + "grad_norm": 0.20379814505577087, + "learning_rate": 0.001, + "loss": 1.7753, + "step": 20404 + }, + { + "epoch": 0.8632286995515696, + "grad_norm": 0.15957339107990265, + "learning_rate": 0.001, + "loss": 1.6083, + "step": 20405 + }, + { + "epoch": 0.8632710043150859, + "grad_norm": 0.2491006702184677, + "learning_rate": 0.001, + "loss": 2.6279, + "step": 20406 + }, + { + "epoch": 0.8633133090786023, + "grad_norm": 0.1599300354719162, + "learning_rate": 0.001, + "loss": 2.2997, + "step": 20407 + }, + { + "epoch": 0.8633556138421187, + "grad_norm": 0.14536763727664948, + "learning_rate": 0.001, + "loss": 2.2268, + "step": 20408 + }, + { + "epoch": 0.863397918605635, + "grad_norm": 0.12874636054039001, + "learning_rate": 0.001, + "loss": 2.0432, + "step": 20409 + }, + { + "epoch": 0.8634402233691514, + "grad_norm": 0.1630754917860031, + "learning_rate": 0.001, + "loss": 1.519, + "step": 20410 + }, + { + "epoch": 0.8634825281326678, + "grad_norm": 0.18456068634986877, + "learning_rate": 0.001, + "loss": 2.3714, + "step": 20411 + }, + { + "epoch": 0.8635248328961841, + "grad_norm": 0.3489452004432678, + "learning_rate": 0.001, + "loss": 2.1257, + "step": 20412 + }, + { + "epoch": 0.8635671376597005, + "grad_norm": 0.13253667950630188, + "learning_rate": 0.001, + "loss": 1.619, + "step": 20413 + }, + { + "epoch": 0.8636094424232168, + "grad_norm": 0.14252302050590515, + "learning_rate": 0.001, + "loss": 1.8472, + "step": 20414 + }, + { + "epoch": 0.8636517471867332, + "grad_norm": 0.3455297350883484, + "learning_rate": 0.001, + "loss": 1.9293, + "step": 20415 + }, + { + "epoch": 0.8636940519502496, + "grad_norm": 0.14407595992088318, + "learning_rate": 0.001, + "loss": 2.3692, + "step": 20416 + }, + { + "epoch": 0.8637363567137659, + "grad_norm": 0.1891782134771347, + "learning_rate": 0.001, + "loss": 2.3615, + "step": 20417 + }, + { + "epoch": 0.8637786614772823, + "grad_norm": 0.17591020464897156, + "learning_rate": 0.001, + "loss": 2.9479, + "step": 20418 + }, + { + "epoch": 0.8638209662407987, + "grad_norm": 2.7036447525024414, + "learning_rate": 0.001, + "loss": 1.6538, + "step": 20419 + }, + { + "epoch": 0.863863271004315, + "grad_norm": 0.14303186535835266, + "learning_rate": 0.001, + "loss": 2.2801, + "step": 20420 + }, + { + "epoch": 0.8639055757678314, + "grad_norm": 0.15167652070522308, + "learning_rate": 0.001, + "loss": 2.1755, + "step": 20421 + }, + { + "epoch": 0.8639478805313479, + "grad_norm": 0.1477757692337036, + "learning_rate": 0.001, + "loss": 1.8386, + "step": 20422 + }, + { + "epoch": 0.8639901852948642, + "grad_norm": 0.1253604143857956, + "learning_rate": 0.001, + "loss": 1.2549, + "step": 20423 + }, + { + "epoch": 0.8640324900583806, + "grad_norm": 0.13061819970607758, + "learning_rate": 0.001, + "loss": 1.4896, + "step": 20424 + }, + { + "epoch": 0.864074794821897, + "grad_norm": 0.21047991514205933, + "learning_rate": 0.001, + "loss": 2.2363, + "step": 20425 + }, + { + "epoch": 0.8641170995854133, + "grad_norm": 0.1477625072002411, + "learning_rate": 0.001, + "loss": 1.5357, + "step": 20426 + }, + { + "epoch": 0.8641594043489297, + "grad_norm": 0.19324536621570587, + "learning_rate": 0.001, + "loss": 1.9138, + "step": 20427 + }, + { + "epoch": 0.8642017091124461, + "grad_norm": 0.508693516254425, + "learning_rate": 0.001, + "loss": 2.217, + "step": 20428 + }, + { + "epoch": 0.8642440138759624, + "grad_norm": 0.15548713505268097, + "learning_rate": 0.001, + "loss": 1.7976, + "step": 20429 + }, + { + "epoch": 0.8642863186394788, + "grad_norm": 0.18489326536655426, + "learning_rate": 0.001, + "loss": 3.005, + "step": 20430 + }, + { + "epoch": 0.8643286234029952, + "grad_norm": 0.1529025137424469, + "learning_rate": 0.001, + "loss": 2.1427, + "step": 20431 + }, + { + "epoch": 0.8643709281665115, + "grad_norm": 0.1769225001335144, + "learning_rate": 0.001, + "loss": 1.596, + "step": 20432 + }, + { + "epoch": 0.8644132329300279, + "grad_norm": 0.12776906788349152, + "learning_rate": 0.001, + "loss": 1.8726, + "step": 20433 + }, + { + "epoch": 0.8644555376935443, + "grad_norm": 0.1688249409198761, + "learning_rate": 0.001, + "loss": 1.9496, + "step": 20434 + }, + { + "epoch": 0.8644978424570606, + "grad_norm": 0.6416119337081909, + "learning_rate": 0.001, + "loss": 2.3831, + "step": 20435 + }, + { + "epoch": 0.864540147220577, + "grad_norm": 0.14832055568695068, + "learning_rate": 0.001, + "loss": 1.8445, + "step": 20436 + }, + { + "epoch": 0.8645824519840934, + "grad_norm": 0.25075727701187134, + "learning_rate": 0.001, + "loss": 2.4437, + "step": 20437 + }, + { + "epoch": 0.8646247567476097, + "grad_norm": 0.15989474952220917, + "learning_rate": 0.001, + "loss": 1.8512, + "step": 20438 + }, + { + "epoch": 0.8646670615111262, + "grad_norm": 0.1592397540807724, + "learning_rate": 0.001, + "loss": 2.8129, + "step": 20439 + }, + { + "epoch": 0.8647093662746426, + "grad_norm": 0.14796599745750427, + "learning_rate": 0.001, + "loss": 1.5769, + "step": 20440 + }, + { + "epoch": 0.8647516710381589, + "grad_norm": 0.1716962605714798, + "learning_rate": 0.001, + "loss": 2.3114, + "step": 20441 + }, + { + "epoch": 0.8647939758016753, + "grad_norm": 0.4446561336517334, + "learning_rate": 0.001, + "loss": 1.3337, + "step": 20442 + }, + { + "epoch": 0.8648362805651917, + "grad_norm": 0.15819323062896729, + "learning_rate": 0.001, + "loss": 2.7212, + "step": 20443 + }, + { + "epoch": 0.864878585328708, + "grad_norm": 1.2492092847824097, + "learning_rate": 0.001, + "loss": 2.112, + "step": 20444 + }, + { + "epoch": 0.8649208900922244, + "grad_norm": 0.1683206856250763, + "learning_rate": 0.001, + "loss": 2.6859, + "step": 20445 + }, + { + "epoch": 0.8649631948557408, + "grad_norm": 0.18615072965621948, + "learning_rate": 0.001, + "loss": 2.0135, + "step": 20446 + }, + { + "epoch": 0.8650054996192571, + "grad_norm": 0.16364848613739014, + "learning_rate": 0.001, + "loss": 2.1575, + "step": 20447 + }, + { + "epoch": 0.8650478043827735, + "grad_norm": 0.1338125467300415, + "learning_rate": 0.001, + "loss": 1.5287, + "step": 20448 + }, + { + "epoch": 0.8650901091462899, + "grad_norm": 0.14159221947193146, + "learning_rate": 0.001, + "loss": 2.4444, + "step": 20449 + }, + { + "epoch": 0.8651324139098062, + "grad_norm": 0.15110106766223907, + "learning_rate": 0.001, + "loss": 2.3197, + "step": 20450 + }, + { + "epoch": 0.8651747186733226, + "grad_norm": 0.13722842931747437, + "learning_rate": 0.001, + "loss": 1.7881, + "step": 20451 + }, + { + "epoch": 0.865217023436839, + "grad_norm": 0.13986808061599731, + "learning_rate": 0.001, + "loss": 1.699, + "step": 20452 + }, + { + "epoch": 0.8652593282003553, + "grad_norm": 0.19141452014446259, + "learning_rate": 0.001, + "loss": 1.6697, + "step": 20453 + }, + { + "epoch": 0.8653016329638717, + "grad_norm": 0.15517202019691467, + "learning_rate": 0.001, + "loss": 3.0925, + "step": 20454 + }, + { + "epoch": 0.8653439377273882, + "grad_norm": 0.13340319693088531, + "learning_rate": 0.001, + "loss": 1.3833, + "step": 20455 + }, + { + "epoch": 0.8653862424909045, + "grad_norm": 0.14518952369689941, + "learning_rate": 0.001, + "loss": 1.6842, + "step": 20456 + }, + { + "epoch": 0.8654285472544209, + "grad_norm": 0.15267087519168854, + "learning_rate": 0.001, + "loss": 1.7472, + "step": 20457 + }, + { + "epoch": 0.8654708520179372, + "grad_norm": 0.16295918822288513, + "learning_rate": 0.001, + "loss": 1.6337, + "step": 20458 + }, + { + "epoch": 0.8655131567814536, + "grad_norm": 0.3994349241256714, + "learning_rate": 0.001, + "loss": 2.1025, + "step": 20459 + }, + { + "epoch": 0.86555546154497, + "grad_norm": 0.14686226844787598, + "learning_rate": 0.001, + "loss": 2.2015, + "step": 20460 + }, + { + "epoch": 0.8655977663084863, + "grad_norm": 2.716871500015259, + "learning_rate": 0.001, + "loss": 3.038, + "step": 20461 + }, + { + "epoch": 0.8656400710720027, + "grad_norm": 0.152231365442276, + "learning_rate": 0.001, + "loss": 2.2926, + "step": 20462 + }, + { + "epoch": 0.8656823758355191, + "grad_norm": 0.16370278596878052, + "learning_rate": 0.001, + "loss": 2.3014, + "step": 20463 + }, + { + "epoch": 0.8657246805990354, + "grad_norm": 0.18813279271125793, + "learning_rate": 0.001, + "loss": 3.1802, + "step": 20464 + }, + { + "epoch": 0.8657669853625518, + "grad_norm": 2.1478288173675537, + "learning_rate": 0.001, + "loss": 2.8626, + "step": 20465 + }, + { + "epoch": 0.8658092901260682, + "grad_norm": 0.6545858979225159, + "learning_rate": 0.001, + "loss": 1.976, + "step": 20466 + }, + { + "epoch": 0.8658515948895845, + "grad_norm": 0.163694828748703, + "learning_rate": 0.001, + "loss": 1.7941, + "step": 20467 + }, + { + "epoch": 0.8658938996531009, + "grad_norm": 0.15311507880687714, + "learning_rate": 0.001, + "loss": 1.8408, + "step": 20468 + }, + { + "epoch": 0.8659362044166173, + "grad_norm": 0.13591715693473816, + "learning_rate": 0.001, + "loss": 1.8756, + "step": 20469 + }, + { + "epoch": 0.8659785091801336, + "grad_norm": 0.12307523190975189, + "learning_rate": 0.001, + "loss": 2.4576, + "step": 20470 + }, + { + "epoch": 0.86602081394365, + "grad_norm": 0.23300588130950928, + "learning_rate": 0.001, + "loss": 1.5551, + "step": 20471 + }, + { + "epoch": 0.8660631187071665, + "grad_norm": 0.30247703194618225, + "learning_rate": 0.001, + "loss": 1.9095, + "step": 20472 + }, + { + "epoch": 0.8661054234706828, + "grad_norm": 0.12208034843206406, + "learning_rate": 0.001, + "loss": 2.1441, + "step": 20473 + }, + { + "epoch": 0.8661477282341992, + "grad_norm": 0.427654504776001, + "learning_rate": 0.001, + "loss": 3.1424, + "step": 20474 + }, + { + "epoch": 0.8661900329977156, + "grad_norm": 0.15135295689105988, + "learning_rate": 0.001, + "loss": 1.4906, + "step": 20475 + }, + { + "epoch": 0.8662323377612319, + "grad_norm": 0.15214575827121735, + "learning_rate": 0.001, + "loss": 2.8835, + "step": 20476 + }, + { + "epoch": 0.8662746425247483, + "grad_norm": 0.3086780309677124, + "learning_rate": 0.001, + "loss": 3.6189, + "step": 20477 + }, + { + "epoch": 0.8663169472882647, + "grad_norm": 0.2229657918214798, + "learning_rate": 0.001, + "loss": 2.7436, + "step": 20478 + }, + { + "epoch": 0.866359252051781, + "grad_norm": 0.21993772685527802, + "learning_rate": 0.001, + "loss": 2.4626, + "step": 20479 + }, + { + "epoch": 0.8664015568152974, + "grad_norm": 7.104289531707764, + "learning_rate": 0.001, + "loss": 2.2925, + "step": 20480 + }, + { + "epoch": 0.8664438615788138, + "grad_norm": 0.18431110680103302, + "learning_rate": 0.001, + "loss": 2.1564, + "step": 20481 + }, + { + "epoch": 0.8664861663423301, + "grad_norm": 0.17499950528144836, + "learning_rate": 0.001, + "loss": 2.6352, + "step": 20482 + }, + { + "epoch": 0.8665284711058465, + "grad_norm": 0.17781849205493927, + "learning_rate": 0.001, + "loss": 2.3366, + "step": 20483 + }, + { + "epoch": 0.8665707758693629, + "grad_norm": 0.1824718415737152, + "learning_rate": 0.001, + "loss": 1.9301, + "step": 20484 + }, + { + "epoch": 0.8666130806328792, + "grad_norm": 0.14647133648395538, + "learning_rate": 0.001, + "loss": 2.0635, + "step": 20485 + }, + { + "epoch": 0.8666553853963956, + "grad_norm": 0.17452256381511688, + "learning_rate": 0.001, + "loss": 2.4909, + "step": 20486 + }, + { + "epoch": 0.866697690159912, + "grad_norm": 0.15065862238407135, + "learning_rate": 0.001, + "loss": 2.3384, + "step": 20487 + }, + { + "epoch": 0.8667399949234283, + "grad_norm": 0.19424915313720703, + "learning_rate": 0.001, + "loss": 2.3333, + "step": 20488 + }, + { + "epoch": 0.8667822996869448, + "grad_norm": 0.15558700263500214, + "learning_rate": 0.001, + "loss": 2.7007, + "step": 20489 + }, + { + "epoch": 0.8668246044504612, + "grad_norm": 0.14827044308185577, + "learning_rate": 0.001, + "loss": 1.6913, + "step": 20490 + }, + { + "epoch": 0.8668669092139775, + "grad_norm": 0.48052114248275757, + "learning_rate": 0.001, + "loss": 2.0873, + "step": 20491 + }, + { + "epoch": 0.8669092139774939, + "grad_norm": 0.1724882572889328, + "learning_rate": 0.001, + "loss": 2.2251, + "step": 20492 + }, + { + "epoch": 0.8669515187410103, + "grad_norm": 0.2082466185092926, + "learning_rate": 0.001, + "loss": 2.881, + "step": 20493 + }, + { + "epoch": 0.8669938235045266, + "grad_norm": 0.14403583109378815, + "learning_rate": 0.001, + "loss": 1.8366, + "step": 20494 + }, + { + "epoch": 0.867036128268043, + "grad_norm": 0.1699734479188919, + "learning_rate": 0.001, + "loss": 3.2508, + "step": 20495 + }, + { + "epoch": 0.8670784330315594, + "grad_norm": 0.1627493053674698, + "learning_rate": 0.001, + "loss": 1.6992, + "step": 20496 + }, + { + "epoch": 0.8671207377950757, + "grad_norm": 0.1512979418039322, + "learning_rate": 0.001, + "loss": 1.8145, + "step": 20497 + }, + { + "epoch": 0.8671630425585921, + "grad_norm": 0.1480405628681183, + "learning_rate": 0.001, + "loss": 2.0845, + "step": 20498 + }, + { + "epoch": 0.8672053473221085, + "grad_norm": 0.12783098220825195, + "learning_rate": 0.001, + "loss": 3.377, + "step": 20499 + }, + { + "epoch": 0.8672476520856248, + "grad_norm": 0.13607032597064972, + "learning_rate": 0.001, + "loss": 1.555, + "step": 20500 + }, + { + "epoch": 0.8672899568491412, + "grad_norm": 0.1391061395406723, + "learning_rate": 0.001, + "loss": 1.4297, + "step": 20501 + }, + { + "epoch": 0.8673322616126576, + "grad_norm": 0.12696748971939087, + "learning_rate": 0.001, + "loss": 1.5243, + "step": 20502 + }, + { + "epoch": 0.8673745663761739, + "grad_norm": 0.15007275342941284, + "learning_rate": 0.001, + "loss": 2.754, + "step": 20503 + }, + { + "epoch": 0.8674168711396903, + "grad_norm": 0.12282463908195496, + "learning_rate": 0.001, + "loss": 2.6723, + "step": 20504 + }, + { + "epoch": 0.8674591759032066, + "grad_norm": 0.14588630199432373, + "learning_rate": 0.001, + "loss": 2.4033, + "step": 20505 + }, + { + "epoch": 0.8675014806667231, + "grad_norm": 0.518017053604126, + "learning_rate": 0.001, + "loss": 2.2406, + "step": 20506 + }, + { + "epoch": 0.8675437854302395, + "grad_norm": 0.14224489033222198, + "learning_rate": 0.001, + "loss": 1.4533, + "step": 20507 + }, + { + "epoch": 0.8675860901937558, + "grad_norm": 0.13003142178058624, + "learning_rate": 0.001, + "loss": 2.5315, + "step": 20508 + }, + { + "epoch": 0.8676283949572722, + "grad_norm": 0.14314842224121094, + "learning_rate": 0.001, + "loss": 3.1466, + "step": 20509 + }, + { + "epoch": 0.8676706997207886, + "grad_norm": 0.18065118789672852, + "learning_rate": 0.001, + "loss": 2.1668, + "step": 20510 + }, + { + "epoch": 0.8677130044843049, + "grad_norm": 0.16202561557292938, + "learning_rate": 0.001, + "loss": 1.7793, + "step": 20511 + }, + { + "epoch": 0.8677553092478213, + "grad_norm": 0.13975529372692108, + "learning_rate": 0.001, + "loss": 2.85, + "step": 20512 + }, + { + "epoch": 0.8677976140113377, + "grad_norm": 0.15059174597263336, + "learning_rate": 0.001, + "loss": 1.3228, + "step": 20513 + }, + { + "epoch": 0.867839918774854, + "grad_norm": 0.15195417404174805, + "learning_rate": 0.001, + "loss": 1.7705, + "step": 20514 + }, + { + "epoch": 0.8678822235383704, + "grad_norm": 0.159585103392601, + "learning_rate": 0.001, + "loss": 2.5388, + "step": 20515 + }, + { + "epoch": 0.8679245283018868, + "grad_norm": 0.14191685616970062, + "learning_rate": 0.001, + "loss": 2.1223, + "step": 20516 + }, + { + "epoch": 0.8679668330654031, + "grad_norm": 0.18546472489833832, + "learning_rate": 0.001, + "loss": 2.4321, + "step": 20517 + }, + { + "epoch": 0.8680091378289195, + "grad_norm": 0.13588781654834747, + "learning_rate": 0.001, + "loss": 2.0784, + "step": 20518 + }, + { + "epoch": 0.8680514425924359, + "grad_norm": 15.1656494140625, + "learning_rate": 0.001, + "loss": 1.8606, + "step": 20519 + }, + { + "epoch": 0.8680937473559522, + "grad_norm": 0.1273522973060608, + "learning_rate": 0.001, + "loss": 1.9861, + "step": 20520 + }, + { + "epoch": 0.8681360521194686, + "grad_norm": 0.3258301317691803, + "learning_rate": 0.001, + "loss": 2.4265, + "step": 20521 + }, + { + "epoch": 0.8681783568829851, + "grad_norm": 0.1691277176141739, + "learning_rate": 0.001, + "loss": 1.8709, + "step": 20522 + }, + { + "epoch": 0.8682206616465014, + "grad_norm": 0.1776563823223114, + "learning_rate": 0.001, + "loss": 1.9714, + "step": 20523 + }, + { + "epoch": 0.8682629664100178, + "grad_norm": 0.19368451833724976, + "learning_rate": 0.001, + "loss": 3.038, + "step": 20524 + }, + { + "epoch": 0.8683052711735342, + "grad_norm": 0.15642546117305756, + "learning_rate": 0.001, + "loss": 2.7047, + "step": 20525 + }, + { + "epoch": 0.8683475759370505, + "grad_norm": 0.23724526166915894, + "learning_rate": 0.001, + "loss": 2.3636, + "step": 20526 + }, + { + "epoch": 0.8683898807005669, + "grad_norm": 0.14943532645702362, + "learning_rate": 0.001, + "loss": 1.8734, + "step": 20527 + }, + { + "epoch": 0.8684321854640833, + "grad_norm": 0.15623928606510162, + "learning_rate": 0.001, + "loss": 2.4747, + "step": 20528 + }, + { + "epoch": 0.8684744902275996, + "grad_norm": 0.24184876680374146, + "learning_rate": 0.001, + "loss": 1.9066, + "step": 20529 + }, + { + "epoch": 0.868516794991116, + "grad_norm": 0.523157000541687, + "learning_rate": 0.001, + "loss": 2.8953, + "step": 20530 + }, + { + "epoch": 0.8685590997546324, + "grad_norm": 0.1399133801460266, + "learning_rate": 0.001, + "loss": 2.914, + "step": 20531 + }, + { + "epoch": 0.8686014045181487, + "grad_norm": 0.3917766809463501, + "learning_rate": 0.001, + "loss": 1.447, + "step": 20532 + }, + { + "epoch": 0.8686437092816651, + "grad_norm": 0.15621374547481537, + "learning_rate": 0.001, + "loss": 2.3168, + "step": 20533 + }, + { + "epoch": 0.8686860140451815, + "grad_norm": 0.12716899812221527, + "learning_rate": 0.001, + "loss": 1.4502, + "step": 20534 + }, + { + "epoch": 0.8687283188086978, + "grad_norm": 0.14455370604991913, + "learning_rate": 0.001, + "loss": 2.2383, + "step": 20535 + }, + { + "epoch": 0.8687706235722142, + "grad_norm": 0.14087846875190735, + "learning_rate": 0.001, + "loss": 2.0278, + "step": 20536 + }, + { + "epoch": 0.8688129283357306, + "grad_norm": 0.17462292313575745, + "learning_rate": 0.001, + "loss": 1.7217, + "step": 20537 + }, + { + "epoch": 0.868855233099247, + "grad_norm": 0.16990859806537628, + "learning_rate": 0.001, + "loss": 3.8208, + "step": 20538 + }, + { + "epoch": 0.8688975378627634, + "grad_norm": 0.131483256816864, + "learning_rate": 0.001, + "loss": 1.4108, + "step": 20539 + }, + { + "epoch": 0.8689398426262798, + "grad_norm": 0.13738611340522766, + "learning_rate": 0.001, + "loss": 2.644, + "step": 20540 + }, + { + "epoch": 0.8689821473897961, + "grad_norm": 0.2787913978099823, + "learning_rate": 0.001, + "loss": 1.9129, + "step": 20541 + }, + { + "epoch": 0.8690244521533125, + "grad_norm": 0.8234505653381348, + "learning_rate": 0.001, + "loss": 1.6344, + "step": 20542 + }, + { + "epoch": 0.8690667569168289, + "grad_norm": 0.15260133147239685, + "learning_rate": 0.001, + "loss": 1.8331, + "step": 20543 + }, + { + "epoch": 0.8691090616803452, + "grad_norm": 0.12822067737579346, + "learning_rate": 0.001, + "loss": 2.4685, + "step": 20544 + }, + { + "epoch": 0.8691513664438616, + "grad_norm": 0.14594188332557678, + "learning_rate": 0.001, + "loss": 1.5119, + "step": 20545 + }, + { + "epoch": 0.869193671207378, + "grad_norm": 0.14818280935287476, + "learning_rate": 0.001, + "loss": 2.4656, + "step": 20546 + }, + { + "epoch": 0.8692359759708943, + "grad_norm": 0.26366791129112244, + "learning_rate": 0.001, + "loss": 2.9585, + "step": 20547 + }, + { + "epoch": 0.8692782807344107, + "grad_norm": 0.17102564871311188, + "learning_rate": 0.001, + "loss": 2.0877, + "step": 20548 + }, + { + "epoch": 0.869320585497927, + "grad_norm": 0.1857893019914627, + "learning_rate": 0.001, + "loss": 2.5178, + "step": 20549 + }, + { + "epoch": 0.8693628902614434, + "grad_norm": 0.26888343691825867, + "learning_rate": 0.001, + "loss": 1.9708, + "step": 20550 + }, + { + "epoch": 0.8694051950249598, + "grad_norm": 0.4817790687084198, + "learning_rate": 0.001, + "loss": 1.3568, + "step": 20551 + }, + { + "epoch": 0.8694474997884761, + "grad_norm": 0.6389137506484985, + "learning_rate": 0.001, + "loss": 1.9148, + "step": 20552 + }, + { + "epoch": 0.8694898045519925, + "grad_norm": 0.17424023151397705, + "learning_rate": 0.001, + "loss": 1.8895, + "step": 20553 + }, + { + "epoch": 0.869532109315509, + "grad_norm": 0.15143568813800812, + "learning_rate": 0.001, + "loss": 1.5483, + "step": 20554 + }, + { + "epoch": 0.8695744140790252, + "grad_norm": 0.13719114661216736, + "learning_rate": 0.001, + "loss": 1.8091, + "step": 20555 + }, + { + "epoch": 0.8696167188425417, + "grad_norm": 0.1803659200668335, + "learning_rate": 0.001, + "loss": 2.2232, + "step": 20556 + }, + { + "epoch": 0.8696590236060581, + "grad_norm": 1.9404886960983276, + "learning_rate": 0.001, + "loss": 1.9271, + "step": 20557 + }, + { + "epoch": 0.8697013283695744, + "grad_norm": 0.4086678624153137, + "learning_rate": 0.001, + "loss": 2.0375, + "step": 20558 + }, + { + "epoch": 0.8697436331330908, + "grad_norm": 0.1896570473909378, + "learning_rate": 0.001, + "loss": 2.4307, + "step": 20559 + }, + { + "epoch": 0.8697859378966072, + "grad_norm": 0.1653144508600235, + "learning_rate": 0.001, + "loss": 2.1597, + "step": 20560 + }, + { + "epoch": 0.8698282426601235, + "grad_norm": 0.38588979840278625, + "learning_rate": 0.001, + "loss": 1.9474, + "step": 20561 + }, + { + "epoch": 0.8698705474236399, + "grad_norm": 0.20403960347175598, + "learning_rate": 0.001, + "loss": 2.1097, + "step": 20562 + }, + { + "epoch": 0.8699128521871563, + "grad_norm": 69.25297546386719, + "learning_rate": 0.001, + "loss": 2.0653, + "step": 20563 + }, + { + "epoch": 0.8699551569506726, + "grad_norm": 7.444055557250977, + "learning_rate": 0.001, + "loss": 1.743, + "step": 20564 + }, + { + "epoch": 0.869997461714189, + "grad_norm": 0.1590234786272049, + "learning_rate": 0.001, + "loss": 1.8023, + "step": 20565 + }, + { + "epoch": 0.8700397664777054, + "grad_norm": 0.3205440640449524, + "learning_rate": 0.001, + "loss": 1.9231, + "step": 20566 + }, + { + "epoch": 0.8700820712412217, + "grad_norm": 0.19606617093086243, + "learning_rate": 0.001, + "loss": 2.6873, + "step": 20567 + }, + { + "epoch": 0.8701243760047381, + "grad_norm": 0.44805920124053955, + "learning_rate": 0.001, + "loss": 2.4899, + "step": 20568 + }, + { + "epoch": 0.8701666807682545, + "grad_norm": 0.22577597200870514, + "learning_rate": 0.001, + "loss": 2.2703, + "step": 20569 + }, + { + "epoch": 0.8702089855317708, + "grad_norm": 0.168071910738945, + "learning_rate": 0.001, + "loss": 1.7944, + "step": 20570 + }, + { + "epoch": 0.8702512902952872, + "grad_norm": 0.17601118981838226, + "learning_rate": 0.001, + "loss": 1.8357, + "step": 20571 + }, + { + "epoch": 0.8702935950588037, + "grad_norm": 0.1971052885055542, + "learning_rate": 0.001, + "loss": 1.9707, + "step": 20572 + }, + { + "epoch": 0.87033589982232, + "grad_norm": 0.16236917674541473, + "learning_rate": 0.001, + "loss": 1.9057, + "step": 20573 + }, + { + "epoch": 0.8703782045858364, + "grad_norm": 0.22024036943912506, + "learning_rate": 0.001, + "loss": 1.9232, + "step": 20574 + }, + { + "epoch": 0.8704205093493528, + "grad_norm": 0.18604035675525665, + "learning_rate": 0.001, + "loss": 2.3201, + "step": 20575 + }, + { + "epoch": 0.8704628141128691, + "grad_norm": 0.19944395124912262, + "learning_rate": 0.001, + "loss": 2.2175, + "step": 20576 + }, + { + "epoch": 0.8705051188763855, + "grad_norm": 6.948441982269287, + "learning_rate": 0.001, + "loss": 2.322, + "step": 20577 + }, + { + "epoch": 0.8705474236399019, + "grad_norm": 0.19903208315372467, + "learning_rate": 0.001, + "loss": 2.2643, + "step": 20578 + }, + { + "epoch": 0.8705897284034182, + "grad_norm": 0.15716981887817383, + "learning_rate": 0.001, + "loss": 1.4093, + "step": 20579 + }, + { + "epoch": 0.8706320331669346, + "grad_norm": 0.13574804365634918, + "learning_rate": 0.001, + "loss": 2.0119, + "step": 20580 + }, + { + "epoch": 0.870674337930451, + "grad_norm": 0.15148890018463135, + "learning_rate": 0.001, + "loss": 2.0046, + "step": 20581 + }, + { + "epoch": 0.8707166426939673, + "grad_norm": 0.15020988881587982, + "learning_rate": 0.001, + "loss": 2.022, + "step": 20582 + }, + { + "epoch": 0.8707589474574837, + "grad_norm": 0.15000440180301666, + "learning_rate": 0.001, + "loss": 2.3382, + "step": 20583 + }, + { + "epoch": 0.8708012522210001, + "grad_norm": 0.13530394434928894, + "learning_rate": 0.001, + "loss": 1.8668, + "step": 20584 + }, + { + "epoch": 0.8708435569845164, + "grad_norm": 0.17123256623744965, + "learning_rate": 0.001, + "loss": 2.1002, + "step": 20585 + }, + { + "epoch": 0.8708858617480328, + "grad_norm": 0.15746386349201202, + "learning_rate": 0.001, + "loss": 2.6803, + "step": 20586 + }, + { + "epoch": 0.8709281665115493, + "grad_norm": 3.3827056884765625, + "learning_rate": 0.001, + "loss": 2.1643, + "step": 20587 + }, + { + "epoch": 0.8709704712750655, + "grad_norm": 0.16982562839984894, + "learning_rate": 0.001, + "loss": 2.244, + "step": 20588 + }, + { + "epoch": 0.871012776038582, + "grad_norm": 2.7574925422668457, + "learning_rate": 0.001, + "loss": 2.6158, + "step": 20589 + }, + { + "epoch": 0.8710550808020984, + "grad_norm": 0.18137569725513458, + "learning_rate": 0.001, + "loss": 1.7065, + "step": 20590 + }, + { + "epoch": 0.8710973855656147, + "grad_norm": 0.18702997267246246, + "learning_rate": 0.001, + "loss": 1.7625, + "step": 20591 + }, + { + "epoch": 0.8711396903291311, + "grad_norm": 0.21095755696296692, + "learning_rate": 0.001, + "loss": 2.8752, + "step": 20592 + }, + { + "epoch": 0.8711819950926475, + "grad_norm": 0.410576730966568, + "learning_rate": 0.001, + "loss": 1.894, + "step": 20593 + }, + { + "epoch": 0.8712242998561638, + "grad_norm": 0.19124916195869446, + "learning_rate": 0.001, + "loss": 1.6841, + "step": 20594 + }, + { + "epoch": 0.8712666046196802, + "grad_norm": 0.1525120586156845, + "learning_rate": 0.001, + "loss": 2.486, + "step": 20595 + }, + { + "epoch": 0.8713089093831965, + "grad_norm": 0.2803274393081665, + "learning_rate": 0.001, + "loss": 2.2124, + "step": 20596 + }, + { + "epoch": 0.8713512141467129, + "grad_norm": 6.939259052276611, + "learning_rate": 0.001, + "loss": 1.5995, + "step": 20597 + }, + { + "epoch": 0.8713935189102293, + "grad_norm": 0.14391854405403137, + "learning_rate": 0.001, + "loss": 2.7667, + "step": 20598 + }, + { + "epoch": 0.8714358236737456, + "grad_norm": 0.15434549748897552, + "learning_rate": 0.001, + "loss": 1.9091, + "step": 20599 + }, + { + "epoch": 0.871478128437262, + "grad_norm": 0.17144039273262024, + "learning_rate": 0.001, + "loss": 1.7429, + "step": 20600 + }, + { + "epoch": 0.8715204332007784, + "grad_norm": 0.15034642815589905, + "learning_rate": 0.001, + "loss": 1.6829, + "step": 20601 + }, + { + "epoch": 0.8715627379642947, + "grad_norm": 0.17373819649219513, + "learning_rate": 0.001, + "loss": 1.9474, + "step": 20602 + }, + { + "epoch": 0.8716050427278111, + "grad_norm": 0.7324623465538025, + "learning_rate": 0.001, + "loss": 2.0428, + "step": 20603 + }, + { + "epoch": 0.8716473474913276, + "grad_norm": 1.5604692697525024, + "learning_rate": 0.001, + "loss": 3.489, + "step": 20604 + }, + { + "epoch": 0.8716896522548438, + "grad_norm": 0.2737126350402832, + "learning_rate": 0.001, + "loss": 2.5385, + "step": 20605 + }, + { + "epoch": 0.8717319570183603, + "grad_norm": 0.20344385504722595, + "learning_rate": 0.001, + "loss": 2.0625, + "step": 20606 + }, + { + "epoch": 0.8717742617818767, + "grad_norm": 0.16309478878974915, + "learning_rate": 0.001, + "loss": 2.4926, + "step": 20607 + }, + { + "epoch": 0.871816566545393, + "grad_norm": 0.17930370569229126, + "learning_rate": 0.001, + "loss": 1.9174, + "step": 20608 + }, + { + "epoch": 0.8718588713089094, + "grad_norm": 0.25973576307296753, + "learning_rate": 0.001, + "loss": 2.8351, + "step": 20609 + }, + { + "epoch": 0.8719011760724258, + "grad_norm": 2.8675005435943604, + "learning_rate": 0.001, + "loss": 3.2205, + "step": 20610 + }, + { + "epoch": 0.8719434808359421, + "grad_norm": 0.17134712636470795, + "learning_rate": 0.001, + "loss": 2.1032, + "step": 20611 + }, + { + "epoch": 0.8719857855994585, + "grad_norm": 0.14888092875480652, + "learning_rate": 0.001, + "loss": 2.7499, + "step": 20612 + }, + { + "epoch": 0.8720280903629749, + "grad_norm": 0.13400807976722717, + "learning_rate": 0.001, + "loss": 1.7189, + "step": 20613 + }, + { + "epoch": 0.8720703951264912, + "grad_norm": 0.12388193607330322, + "learning_rate": 0.001, + "loss": 2.486, + "step": 20614 + }, + { + "epoch": 0.8721126998900076, + "grad_norm": 0.1444062441587448, + "learning_rate": 0.001, + "loss": 1.9426, + "step": 20615 + }, + { + "epoch": 0.872155004653524, + "grad_norm": 0.13056041300296783, + "learning_rate": 0.001, + "loss": 1.5044, + "step": 20616 + }, + { + "epoch": 0.8721973094170403, + "grad_norm": 0.1537250131368637, + "learning_rate": 0.001, + "loss": 2.2731, + "step": 20617 + }, + { + "epoch": 0.8722396141805567, + "grad_norm": 0.21203458309173584, + "learning_rate": 0.001, + "loss": 2.0906, + "step": 20618 + }, + { + "epoch": 0.8722819189440731, + "grad_norm": 0.19286859035491943, + "learning_rate": 0.001, + "loss": 1.6875, + "step": 20619 + }, + { + "epoch": 0.8723242237075894, + "grad_norm": 2.2411351203918457, + "learning_rate": 0.001, + "loss": 2.294, + "step": 20620 + }, + { + "epoch": 0.8723665284711059, + "grad_norm": 0.12386021763086319, + "learning_rate": 0.001, + "loss": 1.9246, + "step": 20621 + }, + { + "epoch": 0.8724088332346223, + "grad_norm": 0.1696896106004715, + "learning_rate": 0.001, + "loss": 2.8406, + "step": 20622 + }, + { + "epoch": 0.8724511379981386, + "grad_norm": 0.14747262001037598, + "learning_rate": 0.001, + "loss": 2.1123, + "step": 20623 + }, + { + "epoch": 0.872493442761655, + "grad_norm": 0.6882927417755127, + "learning_rate": 0.001, + "loss": 2.9728, + "step": 20624 + }, + { + "epoch": 0.8725357475251714, + "grad_norm": 0.13660912215709686, + "learning_rate": 0.001, + "loss": 1.3383, + "step": 20625 + }, + { + "epoch": 0.8725780522886877, + "grad_norm": 0.15439388155937195, + "learning_rate": 0.001, + "loss": 2.3997, + "step": 20626 + }, + { + "epoch": 0.8726203570522041, + "grad_norm": 0.16791464388370514, + "learning_rate": 0.001, + "loss": 2.0219, + "step": 20627 + }, + { + "epoch": 0.8726626618157205, + "grad_norm": 0.17568817734718323, + "learning_rate": 0.001, + "loss": 2.2818, + "step": 20628 + }, + { + "epoch": 0.8727049665792368, + "grad_norm": 0.1622392237186432, + "learning_rate": 0.001, + "loss": 2.0132, + "step": 20629 + }, + { + "epoch": 0.8727472713427532, + "grad_norm": 0.15402914583683014, + "learning_rate": 0.001, + "loss": 1.813, + "step": 20630 + }, + { + "epoch": 0.8727895761062696, + "grad_norm": 0.13786296546459198, + "learning_rate": 0.001, + "loss": 1.9361, + "step": 20631 + }, + { + "epoch": 0.8728318808697859, + "grad_norm": 0.19948144257068634, + "learning_rate": 0.001, + "loss": 2.8847, + "step": 20632 + }, + { + "epoch": 0.8728741856333023, + "grad_norm": 0.1480330228805542, + "learning_rate": 0.001, + "loss": 1.8211, + "step": 20633 + }, + { + "epoch": 0.8729164903968187, + "grad_norm": 0.16000570356845856, + "learning_rate": 0.001, + "loss": 1.5714, + "step": 20634 + }, + { + "epoch": 0.872958795160335, + "grad_norm": 1.0622891187667847, + "learning_rate": 0.001, + "loss": 2.1747, + "step": 20635 + }, + { + "epoch": 0.8730010999238514, + "grad_norm": 0.192660391330719, + "learning_rate": 0.001, + "loss": 2.0402, + "step": 20636 + }, + { + "epoch": 0.8730434046873679, + "grad_norm": 0.17580977082252502, + "learning_rate": 0.001, + "loss": 1.9806, + "step": 20637 + }, + { + "epoch": 0.8730857094508842, + "grad_norm": 0.14421142637729645, + "learning_rate": 0.001, + "loss": 1.8746, + "step": 20638 + }, + { + "epoch": 0.8731280142144006, + "grad_norm": 0.14612741768360138, + "learning_rate": 0.001, + "loss": 1.8898, + "step": 20639 + }, + { + "epoch": 0.8731703189779169, + "grad_norm": 0.4607209861278534, + "learning_rate": 0.001, + "loss": 1.58, + "step": 20640 + }, + { + "epoch": 0.8732126237414333, + "grad_norm": 0.13212980329990387, + "learning_rate": 0.001, + "loss": 2.2389, + "step": 20641 + }, + { + "epoch": 0.8732549285049497, + "grad_norm": 0.14031372964382172, + "learning_rate": 0.001, + "loss": 2.0144, + "step": 20642 + }, + { + "epoch": 0.873297233268466, + "grad_norm": 0.1437501460313797, + "learning_rate": 0.001, + "loss": 1.7929, + "step": 20643 + }, + { + "epoch": 0.8733395380319824, + "grad_norm": 0.13045533001422882, + "learning_rate": 0.001, + "loss": 2.6569, + "step": 20644 + }, + { + "epoch": 0.8733818427954988, + "grad_norm": 0.16348956525325775, + "learning_rate": 0.001, + "loss": 2.2738, + "step": 20645 + }, + { + "epoch": 0.8734241475590151, + "grad_norm": 0.14668866991996765, + "learning_rate": 0.001, + "loss": 2.4409, + "step": 20646 + }, + { + "epoch": 0.8734664523225315, + "grad_norm": 0.12461234629154205, + "learning_rate": 0.001, + "loss": 2.0134, + "step": 20647 + }, + { + "epoch": 0.8735087570860479, + "grad_norm": 0.16764739155769348, + "learning_rate": 0.001, + "loss": 1.9933, + "step": 20648 + }, + { + "epoch": 0.8735510618495642, + "grad_norm": 0.15134289860725403, + "learning_rate": 0.001, + "loss": 2.7479, + "step": 20649 + }, + { + "epoch": 0.8735933666130806, + "grad_norm": 0.16396795213222504, + "learning_rate": 0.001, + "loss": 2.1933, + "step": 20650 + }, + { + "epoch": 0.873635671376597, + "grad_norm": 0.15628643333911896, + "learning_rate": 0.001, + "loss": 1.9693, + "step": 20651 + }, + { + "epoch": 0.8736779761401133, + "grad_norm": 0.15194100141525269, + "learning_rate": 0.001, + "loss": 2.0662, + "step": 20652 + }, + { + "epoch": 0.8737202809036297, + "grad_norm": 0.14548543095588684, + "learning_rate": 0.001, + "loss": 1.4378, + "step": 20653 + }, + { + "epoch": 0.8737625856671462, + "grad_norm": 0.4701218903064728, + "learning_rate": 0.001, + "loss": 2.1941, + "step": 20654 + }, + { + "epoch": 0.8738048904306625, + "grad_norm": 0.2454076111316681, + "learning_rate": 0.001, + "loss": 1.673, + "step": 20655 + }, + { + "epoch": 0.8738471951941789, + "grad_norm": 0.1675247997045517, + "learning_rate": 0.001, + "loss": 1.6599, + "step": 20656 + }, + { + "epoch": 0.8738894999576953, + "grad_norm": 0.1386534571647644, + "learning_rate": 0.001, + "loss": 2.4196, + "step": 20657 + }, + { + "epoch": 0.8739318047212116, + "grad_norm": 0.1279524713754654, + "learning_rate": 0.001, + "loss": 1.4271, + "step": 20658 + }, + { + "epoch": 0.873974109484728, + "grad_norm": 1.2931326627731323, + "learning_rate": 0.001, + "loss": 1.8761, + "step": 20659 + }, + { + "epoch": 0.8740164142482444, + "grad_norm": 0.18578656017780304, + "learning_rate": 0.001, + "loss": 1.5095, + "step": 20660 + }, + { + "epoch": 0.8740587190117607, + "grad_norm": 0.17007368803024292, + "learning_rate": 0.001, + "loss": 1.7172, + "step": 20661 + }, + { + "epoch": 0.8741010237752771, + "grad_norm": 0.1408626288175583, + "learning_rate": 0.001, + "loss": 2.1397, + "step": 20662 + }, + { + "epoch": 0.8741433285387935, + "grad_norm": 0.1541232317686081, + "learning_rate": 0.001, + "loss": 2.3023, + "step": 20663 + }, + { + "epoch": 0.8741856333023098, + "grad_norm": 1.1258147954940796, + "learning_rate": 0.001, + "loss": 1.9507, + "step": 20664 + }, + { + "epoch": 0.8742279380658262, + "grad_norm": 0.13171681761741638, + "learning_rate": 0.001, + "loss": 2.7839, + "step": 20665 + }, + { + "epoch": 0.8742702428293426, + "grad_norm": 0.13253676891326904, + "learning_rate": 0.001, + "loss": 2.3643, + "step": 20666 + }, + { + "epoch": 0.8743125475928589, + "grad_norm": 0.1618097722530365, + "learning_rate": 0.001, + "loss": 2.0003, + "step": 20667 + }, + { + "epoch": 0.8743548523563753, + "grad_norm": 0.1612173318862915, + "learning_rate": 0.001, + "loss": 1.7852, + "step": 20668 + }, + { + "epoch": 0.8743971571198917, + "grad_norm": 0.20455177128314972, + "learning_rate": 0.001, + "loss": 1.7746, + "step": 20669 + }, + { + "epoch": 0.874439461883408, + "grad_norm": 0.2330559939146042, + "learning_rate": 0.001, + "loss": 2.5461, + "step": 20670 + }, + { + "epoch": 0.8744817666469245, + "grad_norm": 4.225989818572998, + "learning_rate": 0.001, + "loss": 2.0473, + "step": 20671 + }, + { + "epoch": 0.8745240714104409, + "grad_norm": 4.388603210449219, + "learning_rate": 0.001, + "loss": 2.6031, + "step": 20672 + }, + { + "epoch": 0.8745663761739572, + "grad_norm": 0.6356297135353088, + "learning_rate": 0.001, + "loss": 1.7837, + "step": 20673 + }, + { + "epoch": 0.8746086809374736, + "grad_norm": 0.5951091647148132, + "learning_rate": 0.001, + "loss": 2.1406, + "step": 20674 + }, + { + "epoch": 0.87465098570099, + "grad_norm": 0.1605328619480133, + "learning_rate": 0.001, + "loss": 1.8724, + "step": 20675 + }, + { + "epoch": 0.8746932904645063, + "grad_norm": 0.22600284218788147, + "learning_rate": 0.001, + "loss": 1.5535, + "step": 20676 + }, + { + "epoch": 0.8747355952280227, + "grad_norm": 0.15503069758415222, + "learning_rate": 0.001, + "loss": 1.7348, + "step": 20677 + }, + { + "epoch": 0.8747778999915391, + "grad_norm": 0.5110114216804504, + "learning_rate": 0.001, + "loss": 2.3343, + "step": 20678 + }, + { + "epoch": 0.8748202047550554, + "grad_norm": 0.16068077087402344, + "learning_rate": 0.001, + "loss": 2.4127, + "step": 20679 + }, + { + "epoch": 0.8748625095185718, + "grad_norm": 4.883456230163574, + "learning_rate": 0.001, + "loss": 2.3072, + "step": 20680 + }, + { + "epoch": 0.8749048142820882, + "grad_norm": 0.1795644909143448, + "learning_rate": 0.001, + "loss": 2.4872, + "step": 20681 + }, + { + "epoch": 0.8749471190456045, + "grad_norm": 0.14965955913066864, + "learning_rate": 0.001, + "loss": 2.6468, + "step": 20682 + }, + { + "epoch": 0.8749894238091209, + "grad_norm": 0.17840637266635895, + "learning_rate": 0.001, + "loss": 1.908, + "step": 20683 + }, + { + "epoch": 0.8750317285726372, + "grad_norm": 0.18810167908668518, + "learning_rate": 0.001, + "loss": 1.8151, + "step": 20684 + }, + { + "epoch": 0.8750740333361536, + "grad_norm": 0.20336748659610748, + "learning_rate": 0.001, + "loss": 2.9232, + "step": 20685 + }, + { + "epoch": 0.87511633809967, + "grad_norm": 0.2506280243396759, + "learning_rate": 0.001, + "loss": 2.7538, + "step": 20686 + }, + { + "epoch": 0.8751586428631863, + "grad_norm": 0.24045564234256744, + "learning_rate": 0.001, + "loss": 2.1233, + "step": 20687 + }, + { + "epoch": 0.8752009476267028, + "grad_norm": 0.1784004122018814, + "learning_rate": 0.001, + "loss": 2.0315, + "step": 20688 + }, + { + "epoch": 0.8752432523902192, + "grad_norm": 0.19089365005493164, + "learning_rate": 0.001, + "loss": 3.481, + "step": 20689 + }, + { + "epoch": 0.8752855571537355, + "grad_norm": 0.15477164089679718, + "learning_rate": 0.001, + "loss": 2.0263, + "step": 20690 + }, + { + "epoch": 0.8753278619172519, + "grad_norm": 0.22737683355808258, + "learning_rate": 0.001, + "loss": 3.2862, + "step": 20691 + }, + { + "epoch": 0.8753701666807683, + "grad_norm": 0.142948180437088, + "learning_rate": 0.001, + "loss": 2.0765, + "step": 20692 + }, + { + "epoch": 0.8754124714442846, + "grad_norm": 0.2728869616985321, + "learning_rate": 0.001, + "loss": 1.7859, + "step": 20693 + }, + { + "epoch": 0.875454776207801, + "grad_norm": 2.2189605236053467, + "learning_rate": 0.001, + "loss": 2.2061, + "step": 20694 + }, + { + "epoch": 0.8754970809713174, + "grad_norm": 0.28525206446647644, + "learning_rate": 0.001, + "loss": 2.3435, + "step": 20695 + }, + { + "epoch": 0.8755393857348337, + "grad_norm": 2.395468235015869, + "learning_rate": 0.001, + "loss": 3.1819, + "step": 20696 + }, + { + "epoch": 0.8755816904983501, + "grad_norm": 0.16036246716976166, + "learning_rate": 0.001, + "loss": 2.1362, + "step": 20697 + }, + { + "epoch": 0.8756239952618665, + "grad_norm": 0.15271493792533875, + "learning_rate": 0.001, + "loss": 2.0569, + "step": 20698 + }, + { + "epoch": 0.8756663000253828, + "grad_norm": 0.2659335732460022, + "learning_rate": 0.001, + "loss": 2.4376, + "step": 20699 + }, + { + "epoch": 0.8757086047888992, + "grad_norm": 0.6031797528266907, + "learning_rate": 0.001, + "loss": 2.0957, + "step": 20700 + }, + { + "epoch": 0.8757509095524156, + "grad_norm": 0.16728293895721436, + "learning_rate": 0.001, + "loss": 2.6738, + "step": 20701 + }, + { + "epoch": 0.8757932143159319, + "grad_norm": 0.17319273948669434, + "learning_rate": 0.001, + "loss": 2.0715, + "step": 20702 + }, + { + "epoch": 0.8758355190794483, + "grad_norm": 0.1556321531534195, + "learning_rate": 0.001, + "loss": 2.0015, + "step": 20703 + }, + { + "epoch": 0.8758778238429648, + "grad_norm": 0.15744656324386597, + "learning_rate": 0.001, + "loss": 1.8593, + "step": 20704 + }, + { + "epoch": 0.875920128606481, + "grad_norm": 0.13687211275100708, + "learning_rate": 0.001, + "loss": 1.7606, + "step": 20705 + }, + { + "epoch": 0.8759624333699975, + "grad_norm": 0.20388025045394897, + "learning_rate": 0.001, + "loss": 2.3, + "step": 20706 + }, + { + "epoch": 0.8760047381335139, + "grad_norm": 0.1699550747871399, + "learning_rate": 0.001, + "loss": 2.1128, + "step": 20707 + }, + { + "epoch": 0.8760470428970302, + "grad_norm": 0.17213287949562073, + "learning_rate": 0.001, + "loss": 3.0813, + "step": 20708 + }, + { + "epoch": 0.8760893476605466, + "grad_norm": 0.14642708003520966, + "learning_rate": 0.001, + "loss": 1.7773, + "step": 20709 + }, + { + "epoch": 0.876131652424063, + "grad_norm": 0.24943861365318298, + "learning_rate": 0.001, + "loss": 1.8321, + "step": 20710 + }, + { + "epoch": 0.8761739571875793, + "grad_norm": 0.26976755261421204, + "learning_rate": 0.001, + "loss": 1.9682, + "step": 20711 + }, + { + "epoch": 0.8762162619510957, + "grad_norm": 0.2460852712392807, + "learning_rate": 0.001, + "loss": 1.6967, + "step": 20712 + }, + { + "epoch": 0.8762585667146121, + "grad_norm": 0.13001763820648193, + "learning_rate": 0.001, + "loss": 1.8121, + "step": 20713 + }, + { + "epoch": 0.8763008714781284, + "grad_norm": 0.17002592980861664, + "learning_rate": 0.001, + "loss": 2.4601, + "step": 20714 + }, + { + "epoch": 0.8763431762416448, + "grad_norm": 0.15308377146720886, + "learning_rate": 0.001, + "loss": 2.1304, + "step": 20715 + }, + { + "epoch": 0.8763854810051612, + "grad_norm": 0.19930648803710938, + "learning_rate": 0.001, + "loss": 2.1169, + "step": 20716 + }, + { + "epoch": 0.8764277857686775, + "grad_norm": 0.15591351687908173, + "learning_rate": 0.001, + "loss": 3.023, + "step": 20717 + }, + { + "epoch": 0.8764700905321939, + "grad_norm": 0.38943207263946533, + "learning_rate": 0.001, + "loss": 3.6015, + "step": 20718 + }, + { + "epoch": 0.8765123952957103, + "grad_norm": 0.16425389051437378, + "learning_rate": 0.001, + "loss": 1.75, + "step": 20719 + }, + { + "epoch": 0.8765547000592266, + "grad_norm": 0.1728183478116989, + "learning_rate": 0.001, + "loss": 2.144, + "step": 20720 + }, + { + "epoch": 0.876597004822743, + "grad_norm": 1.0912002325057983, + "learning_rate": 0.001, + "loss": 2.133, + "step": 20721 + }, + { + "epoch": 0.8766393095862595, + "grad_norm": 0.6945961117744446, + "learning_rate": 0.001, + "loss": 3.1325, + "step": 20722 + }, + { + "epoch": 0.8766816143497758, + "grad_norm": 0.1368747353553772, + "learning_rate": 0.001, + "loss": 1.4968, + "step": 20723 + }, + { + "epoch": 0.8767239191132922, + "grad_norm": 0.1539839655160904, + "learning_rate": 0.001, + "loss": 3.2301, + "step": 20724 + }, + { + "epoch": 0.8767662238768086, + "grad_norm": 0.1409660428762436, + "learning_rate": 0.001, + "loss": 1.2512, + "step": 20725 + }, + { + "epoch": 0.8768085286403249, + "grad_norm": 0.5920034050941467, + "learning_rate": 0.001, + "loss": 1.7211, + "step": 20726 + }, + { + "epoch": 0.8768508334038413, + "grad_norm": 0.7836800217628479, + "learning_rate": 0.001, + "loss": 2.5221, + "step": 20727 + }, + { + "epoch": 0.8768931381673577, + "grad_norm": 0.1688200980424881, + "learning_rate": 0.001, + "loss": 1.8585, + "step": 20728 + }, + { + "epoch": 0.876935442930874, + "grad_norm": 2.877446174621582, + "learning_rate": 0.001, + "loss": 1.2657, + "step": 20729 + }, + { + "epoch": 0.8769777476943904, + "grad_norm": 0.1525486409664154, + "learning_rate": 0.001, + "loss": 2.808, + "step": 20730 + }, + { + "epoch": 0.8770200524579067, + "grad_norm": 0.17019006609916687, + "learning_rate": 0.001, + "loss": 2.4638, + "step": 20731 + }, + { + "epoch": 0.8770623572214231, + "grad_norm": 5.263099670410156, + "learning_rate": 0.001, + "loss": 2.6641, + "step": 20732 + }, + { + "epoch": 0.8771046619849395, + "grad_norm": 0.18145911395549774, + "learning_rate": 0.001, + "loss": 2.248, + "step": 20733 + }, + { + "epoch": 0.8771469667484558, + "grad_norm": 0.1718670129776001, + "learning_rate": 0.001, + "loss": 2.284, + "step": 20734 + }, + { + "epoch": 0.8771892715119722, + "grad_norm": 0.16717854142189026, + "learning_rate": 0.001, + "loss": 2.2931, + "step": 20735 + }, + { + "epoch": 0.8772315762754886, + "grad_norm": 0.17688849568367004, + "learning_rate": 0.001, + "loss": 2.4102, + "step": 20736 + }, + { + "epoch": 0.8772738810390049, + "grad_norm": 0.19576434791088104, + "learning_rate": 0.001, + "loss": 2.9707, + "step": 20737 + }, + { + "epoch": 0.8773161858025214, + "grad_norm": 0.16361911594867706, + "learning_rate": 0.001, + "loss": 1.8927, + "step": 20738 + }, + { + "epoch": 0.8773584905660378, + "grad_norm": 0.20847147703170776, + "learning_rate": 0.001, + "loss": 2.5862, + "step": 20739 + }, + { + "epoch": 0.8774007953295541, + "grad_norm": 0.1973280906677246, + "learning_rate": 0.001, + "loss": 1.5039, + "step": 20740 + }, + { + "epoch": 0.8774431000930705, + "grad_norm": 0.18212057650089264, + "learning_rate": 0.001, + "loss": 2.205, + "step": 20741 + }, + { + "epoch": 0.8774854048565869, + "grad_norm": 0.7270743250846863, + "learning_rate": 0.001, + "loss": 1.9571, + "step": 20742 + }, + { + "epoch": 0.8775277096201032, + "grad_norm": 0.18873396515846252, + "learning_rate": 0.001, + "loss": 1.9786, + "step": 20743 + }, + { + "epoch": 0.8775700143836196, + "grad_norm": 0.18425945937633514, + "learning_rate": 0.001, + "loss": 2.1827, + "step": 20744 + }, + { + "epoch": 0.877612319147136, + "grad_norm": 0.23821745812892914, + "learning_rate": 0.001, + "loss": 2.3047, + "step": 20745 + }, + { + "epoch": 0.8776546239106523, + "grad_norm": 0.16767555475234985, + "learning_rate": 0.001, + "loss": 3.2195, + "step": 20746 + }, + { + "epoch": 0.8776969286741687, + "grad_norm": 0.2009771168231964, + "learning_rate": 0.001, + "loss": 2.3288, + "step": 20747 + }, + { + "epoch": 0.8777392334376851, + "grad_norm": 0.19222243130207062, + "learning_rate": 0.001, + "loss": 2.1529, + "step": 20748 + }, + { + "epoch": 0.8777815382012014, + "grad_norm": 0.28680652379989624, + "learning_rate": 0.001, + "loss": 1.7814, + "step": 20749 + }, + { + "epoch": 0.8778238429647178, + "grad_norm": 0.18338562548160553, + "learning_rate": 0.001, + "loss": 3.939, + "step": 20750 + }, + { + "epoch": 0.8778661477282342, + "grad_norm": 0.9251672029495239, + "learning_rate": 0.001, + "loss": 1.9422, + "step": 20751 + }, + { + "epoch": 0.8779084524917505, + "grad_norm": 0.5910675525665283, + "learning_rate": 0.001, + "loss": 1.9307, + "step": 20752 + }, + { + "epoch": 0.877950757255267, + "grad_norm": 0.2055887132883072, + "learning_rate": 0.001, + "loss": 2.2307, + "step": 20753 + }, + { + "epoch": 0.8779930620187834, + "grad_norm": 0.27294355630874634, + "learning_rate": 0.001, + "loss": 2.7468, + "step": 20754 + }, + { + "epoch": 0.8780353667822997, + "grad_norm": 0.14302602410316467, + "learning_rate": 0.001, + "loss": 1.4932, + "step": 20755 + }, + { + "epoch": 0.8780776715458161, + "grad_norm": 0.15693305432796478, + "learning_rate": 0.001, + "loss": 2.1727, + "step": 20756 + }, + { + "epoch": 0.8781199763093325, + "grad_norm": 2.7029690742492676, + "learning_rate": 0.001, + "loss": 2.2942, + "step": 20757 + }, + { + "epoch": 0.8781622810728488, + "grad_norm": 0.16679178178310394, + "learning_rate": 0.001, + "loss": 2.4154, + "step": 20758 + }, + { + "epoch": 0.8782045858363652, + "grad_norm": 0.29728999733924866, + "learning_rate": 0.001, + "loss": 1.5379, + "step": 20759 + }, + { + "epoch": 0.8782468905998816, + "grad_norm": 0.7710385918617249, + "learning_rate": 0.001, + "loss": 2.386, + "step": 20760 + }, + { + "epoch": 0.8782891953633979, + "grad_norm": 0.2641301453113556, + "learning_rate": 0.001, + "loss": 1.7395, + "step": 20761 + }, + { + "epoch": 0.8783315001269143, + "grad_norm": 0.15455834567546844, + "learning_rate": 0.001, + "loss": 2.7225, + "step": 20762 + }, + { + "epoch": 0.8783738048904307, + "grad_norm": 1.485619068145752, + "learning_rate": 0.001, + "loss": 2.3048, + "step": 20763 + }, + { + "epoch": 0.878416109653947, + "grad_norm": 0.675614595413208, + "learning_rate": 0.001, + "loss": 1.5047, + "step": 20764 + }, + { + "epoch": 0.8784584144174634, + "grad_norm": 0.18644104897975922, + "learning_rate": 0.001, + "loss": 1.9652, + "step": 20765 + }, + { + "epoch": 0.8785007191809798, + "grad_norm": 0.1697007417678833, + "learning_rate": 0.001, + "loss": 1.9008, + "step": 20766 + }, + { + "epoch": 0.8785430239444961, + "grad_norm": 0.1929423213005066, + "learning_rate": 0.001, + "loss": 2.574, + "step": 20767 + }, + { + "epoch": 0.8785853287080125, + "grad_norm": 0.3777572810649872, + "learning_rate": 0.001, + "loss": 2.3181, + "step": 20768 + }, + { + "epoch": 0.878627633471529, + "grad_norm": 0.17042919993400574, + "learning_rate": 0.001, + "loss": 2.2234, + "step": 20769 + }, + { + "epoch": 0.8786699382350452, + "grad_norm": 0.19270406663417816, + "learning_rate": 0.001, + "loss": 1.4735, + "step": 20770 + }, + { + "epoch": 0.8787122429985617, + "grad_norm": 0.2564689517021179, + "learning_rate": 0.001, + "loss": 1.9759, + "step": 20771 + }, + { + "epoch": 0.8787545477620781, + "grad_norm": 4.633014678955078, + "learning_rate": 0.001, + "loss": 1.7299, + "step": 20772 + }, + { + "epoch": 0.8787968525255944, + "grad_norm": 0.14340277016162872, + "learning_rate": 0.001, + "loss": 1.7605, + "step": 20773 + }, + { + "epoch": 0.8788391572891108, + "grad_norm": 0.1704220324754715, + "learning_rate": 0.001, + "loss": 2.0632, + "step": 20774 + }, + { + "epoch": 0.8788814620526271, + "grad_norm": 0.17304351925849915, + "learning_rate": 0.001, + "loss": 2.2619, + "step": 20775 + }, + { + "epoch": 0.8789237668161435, + "grad_norm": 0.1750032901763916, + "learning_rate": 0.001, + "loss": 1.918, + "step": 20776 + }, + { + "epoch": 0.8789660715796599, + "grad_norm": 4.89594030380249, + "learning_rate": 0.001, + "loss": 1.3372, + "step": 20777 + }, + { + "epoch": 0.8790083763431762, + "grad_norm": 0.172307088971138, + "learning_rate": 0.001, + "loss": 2.11, + "step": 20778 + }, + { + "epoch": 0.8790506811066926, + "grad_norm": 0.1498982012271881, + "learning_rate": 0.001, + "loss": 3.3868, + "step": 20779 + }, + { + "epoch": 0.879092985870209, + "grad_norm": 0.1848546415567398, + "learning_rate": 0.001, + "loss": 1.8795, + "step": 20780 + }, + { + "epoch": 0.8791352906337253, + "grad_norm": 2.189716100692749, + "learning_rate": 0.001, + "loss": 3.0549, + "step": 20781 + }, + { + "epoch": 0.8791775953972417, + "grad_norm": 0.7227365970611572, + "learning_rate": 0.001, + "loss": 2.0596, + "step": 20782 + }, + { + "epoch": 0.8792199001607581, + "grad_norm": 0.1363682597875595, + "learning_rate": 0.001, + "loss": 1.9833, + "step": 20783 + }, + { + "epoch": 0.8792622049242744, + "grad_norm": 0.4821024239063263, + "learning_rate": 0.001, + "loss": 2.1675, + "step": 20784 + }, + { + "epoch": 0.8793045096877908, + "grad_norm": 0.16702604293823242, + "learning_rate": 0.001, + "loss": 2.4858, + "step": 20785 + }, + { + "epoch": 0.8793468144513072, + "grad_norm": 9.462590217590332, + "learning_rate": 0.001, + "loss": 1.7947, + "step": 20786 + }, + { + "epoch": 0.8793891192148235, + "grad_norm": 0.14354799687862396, + "learning_rate": 0.001, + "loss": 2.4982, + "step": 20787 + }, + { + "epoch": 0.87943142397834, + "grad_norm": 0.17250943183898926, + "learning_rate": 0.001, + "loss": 2.817, + "step": 20788 + }, + { + "epoch": 0.8794737287418564, + "grad_norm": 0.13783836364746094, + "learning_rate": 0.001, + "loss": 2.5655, + "step": 20789 + }, + { + "epoch": 0.8795160335053727, + "grad_norm": 0.18853969871997833, + "learning_rate": 0.001, + "loss": 2.2248, + "step": 20790 + }, + { + "epoch": 0.8795583382688891, + "grad_norm": 0.1598181277513504, + "learning_rate": 0.001, + "loss": 2.6343, + "step": 20791 + }, + { + "epoch": 0.8796006430324055, + "grad_norm": 0.16585583984851837, + "learning_rate": 0.001, + "loss": 1.8722, + "step": 20792 + }, + { + "epoch": 0.8796429477959218, + "grad_norm": 0.20554226636886597, + "learning_rate": 0.001, + "loss": 2.6009, + "step": 20793 + }, + { + "epoch": 0.8796852525594382, + "grad_norm": 2.5491397380828857, + "learning_rate": 0.001, + "loss": 1.7133, + "step": 20794 + }, + { + "epoch": 0.8797275573229546, + "grad_norm": 0.1531909704208374, + "learning_rate": 0.001, + "loss": 2.1605, + "step": 20795 + }, + { + "epoch": 0.8797698620864709, + "grad_norm": 0.16960613429546356, + "learning_rate": 0.001, + "loss": 2.6522, + "step": 20796 + }, + { + "epoch": 0.8798121668499873, + "grad_norm": 0.20041488111019135, + "learning_rate": 0.001, + "loss": 1.9546, + "step": 20797 + }, + { + "epoch": 0.8798544716135037, + "grad_norm": 0.20710138976573944, + "learning_rate": 0.001, + "loss": 2.7599, + "step": 20798 + }, + { + "epoch": 0.87989677637702, + "grad_norm": 0.153824120759964, + "learning_rate": 0.001, + "loss": 2.2814, + "step": 20799 + }, + { + "epoch": 0.8799390811405364, + "grad_norm": 0.1634339690208435, + "learning_rate": 0.001, + "loss": 2.3355, + "step": 20800 + }, + { + "epoch": 0.8799813859040528, + "grad_norm": 0.19159682095050812, + "learning_rate": 0.001, + "loss": 1.9803, + "step": 20801 + }, + { + "epoch": 0.8800236906675691, + "grad_norm": 0.40518084168434143, + "learning_rate": 0.001, + "loss": 2.6101, + "step": 20802 + }, + { + "epoch": 0.8800659954310855, + "grad_norm": 0.19787724316120148, + "learning_rate": 0.001, + "loss": 2.0322, + "step": 20803 + }, + { + "epoch": 0.880108300194602, + "grad_norm": 0.18997430801391602, + "learning_rate": 0.001, + "loss": 1.7237, + "step": 20804 + }, + { + "epoch": 0.8801506049581183, + "grad_norm": 0.4623604118824005, + "learning_rate": 0.001, + "loss": 2.3957, + "step": 20805 + }, + { + "epoch": 0.8801929097216347, + "grad_norm": 0.14696352183818817, + "learning_rate": 0.001, + "loss": 2.1396, + "step": 20806 + }, + { + "epoch": 0.8802352144851511, + "grad_norm": 0.2328002154827118, + "learning_rate": 0.001, + "loss": 2.0078, + "step": 20807 + }, + { + "epoch": 0.8802775192486674, + "grad_norm": 0.16920128464698792, + "learning_rate": 0.001, + "loss": 1.444, + "step": 20808 + }, + { + "epoch": 0.8803198240121838, + "grad_norm": 0.30305415391921997, + "learning_rate": 0.001, + "loss": 3.162, + "step": 20809 + }, + { + "epoch": 0.8803621287757002, + "grad_norm": 0.2750433087348938, + "learning_rate": 0.001, + "loss": 1.8929, + "step": 20810 + }, + { + "epoch": 0.8804044335392165, + "grad_norm": 0.38231226801872253, + "learning_rate": 0.001, + "loss": 2.9644, + "step": 20811 + }, + { + "epoch": 0.8804467383027329, + "grad_norm": 0.14932866394519806, + "learning_rate": 0.001, + "loss": 2.08, + "step": 20812 + }, + { + "epoch": 0.8804890430662493, + "grad_norm": 0.17217305302619934, + "learning_rate": 0.001, + "loss": 2.4867, + "step": 20813 + }, + { + "epoch": 0.8805313478297656, + "grad_norm": 0.1518954336643219, + "learning_rate": 0.001, + "loss": 2.6287, + "step": 20814 + }, + { + "epoch": 0.880573652593282, + "grad_norm": 0.1420203298330307, + "learning_rate": 0.001, + "loss": 2.0718, + "step": 20815 + }, + { + "epoch": 0.8806159573567984, + "grad_norm": 0.15746936202049255, + "learning_rate": 0.001, + "loss": 1.3726, + "step": 20816 + }, + { + "epoch": 0.8806582621203147, + "grad_norm": 0.20516842603683472, + "learning_rate": 0.001, + "loss": 2.5636, + "step": 20817 + }, + { + "epoch": 0.8807005668838311, + "grad_norm": 0.18061251938343048, + "learning_rate": 0.001, + "loss": 1.9442, + "step": 20818 + }, + { + "epoch": 0.8807428716473474, + "grad_norm": 0.19963960349559784, + "learning_rate": 0.001, + "loss": 1.8834, + "step": 20819 + }, + { + "epoch": 0.8807851764108638, + "grad_norm": 0.2850932776927948, + "learning_rate": 0.001, + "loss": 2.3297, + "step": 20820 + }, + { + "epoch": 0.8808274811743803, + "grad_norm": 0.16823464632034302, + "learning_rate": 0.001, + "loss": 2.6427, + "step": 20821 + }, + { + "epoch": 0.8808697859378966, + "grad_norm": 0.16549649834632874, + "learning_rate": 0.001, + "loss": 2.9588, + "step": 20822 + }, + { + "epoch": 0.880912090701413, + "grad_norm": 0.18909013271331787, + "learning_rate": 0.001, + "loss": 2.0608, + "step": 20823 + }, + { + "epoch": 0.8809543954649294, + "grad_norm": 0.16195768117904663, + "learning_rate": 0.001, + "loss": 2.2387, + "step": 20824 + }, + { + "epoch": 0.8809967002284457, + "grad_norm": 0.14382191002368927, + "learning_rate": 0.001, + "loss": 2.3996, + "step": 20825 + }, + { + "epoch": 0.8810390049919621, + "grad_norm": 0.1621260792016983, + "learning_rate": 0.001, + "loss": 2.7321, + "step": 20826 + }, + { + "epoch": 0.8810813097554785, + "grad_norm": 0.13346368074417114, + "learning_rate": 0.001, + "loss": 3.0278, + "step": 20827 + }, + { + "epoch": 0.8811236145189948, + "grad_norm": 0.14685030281543732, + "learning_rate": 0.001, + "loss": 1.7467, + "step": 20828 + }, + { + "epoch": 0.8811659192825112, + "grad_norm": 0.15578360855579376, + "learning_rate": 0.001, + "loss": 3.2155, + "step": 20829 + }, + { + "epoch": 0.8812082240460276, + "grad_norm": 1.1892778873443604, + "learning_rate": 0.001, + "loss": 2.3981, + "step": 20830 + }, + { + "epoch": 0.8812505288095439, + "grad_norm": 0.13254669308662415, + "learning_rate": 0.001, + "loss": 2.8435, + "step": 20831 + }, + { + "epoch": 0.8812928335730603, + "grad_norm": 0.1270717978477478, + "learning_rate": 0.001, + "loss": 2.3439, + "step": 20832 + }, + { + "epoch": 0.8813351383365767, + "grad_norm": 0.1563170850276947, + "learning_rate": 0.001, + "loss": 1.8762, + "step": 20833 + }, + { + "epoch": 0.881377443100093, + "grad_norm": 0.8175224661827087, + "learning_rate": 0.001, + "loss": 2.7527, + "step": 20834 + }, + { + "epoch": 0.8814197478636094, + "grad_norm": 0.13206033408641815, + "learning_rate": 0.001, + "loss": 2.0897, + "step": 20835 + }, + { + "epoch": 0.8814620526271258, + "grad_norm": 0.2126411646604538, + "learning_rate": 0.001, + "loss": 3.5215, + "step": 20836 + }, + { + "epoch": 0.8815043573906421, + "grad_norm": 0.12830336391925812, + "learning_rate": 0.001, + "loss": 1.3334, + "step": 20837 + }, + { + "epoch": 0.8815466621541586, + "grad_norm": 0.601836621761322, + "learning_rate": 0.001, + "loss": 1.9787, + "step": 20838 + }, + { + "epoch": 0.881588966917675, + "grad_norm": 0.13936357200145721, + "learning_rate": 0.001, + "loss": 2.0659, + "step": 20839 + }, + { + "epoch": 0.8816312716811913, + "grad_norm": 0.1285770684480667, + "learning_rate": 0.001, + "loss": 2.1898, + "step": 20840 + }, + { + "epoch": 0.8816735764447077, + "grad_norm": 0.12381594628095627, + "learning_rate": 0.001, + "loss": 1.7729, + "step": 20841 + }, + { + "epoch": 0.8817158812082241, + "grad_norm": 0.12983018159866333, + "learning_rate": 0.001, + "loss": 2.5275, + "step": 20842 + }, + { + "epoch": 0.8817581859717404, + "grad_norm": 0.1269802451133728, + "learning_rate": 0.001, + "loss": 2.2177, + "step": 20843 + }, + { + "epoch": 0.8818004907352568, + "grad_norm": 0.12933611869812012, + "learning_rate": 0.001, + "loss": 1.416, + "step": 20844 + }, + { + "epoch": 0.8818427954987732, + "grad_norm": 0.5921683311462402, + "learning_rate": 0.001, + "loss": 2.5021, + "step": 20845 + }, + { + "epoch": 0.8818851002622895, + "grad_norm": 0.15616628527641296, + "learning_rate": 0.001, + "loss": 1.7597, + "step": 20846 + }, + { + "epoch": 0.8819274050258059, + "grad_norm": 0.16037492454051971, + "learning_rate": 0.001, + "loss": 3.31, + "step": 20847 + }, + { + "epoch": 0.8819697097893223, + "grad_norm": 0.19823972880840302, + "learning_rate": 0.001, + "loss": 2.1503, + "step": 20848 + }, + { + "epoch": 0.8820120145528386, + "grad_norm": 0.17855338752269745, + "learning_rate": 0.001, + "loss": 2.6852, + "step": 20849 + }, + { + "epoch": 0.882054319316355, + "grad_norm": 0.15427084267139435, + "learning_rate": 0.001, + "loss": 2.0922, + "step": 20850 + }, + { + "epoch": 0.8820966240798714, + "grad_norm": 0.14406783878803253, + "learning_rate": 0.001, + "loss": 2.3578, + "step": 20851 + }, + { + "epoch": 0.8821389288433877, + "grad_norm": 0.16214242577552795, + "learning_rate": 0.001, + "loss": 1.3277, + "step": 20852 + }, + { + "epoch": 0.8821812336069041, + "grad_norm": 0.1684330701828003, + "learning_rate": 0.001, + "loss": 2.0406, + "step": 20853 + }, + { + "epoch": 0.8822235383704206, + "grad_norm": 8.746175765991211, + "learning_rate": 0.001, + "loss": 2.3627, + "step": 20854 + }, + { + "epoch": 0.8822658431339369, + "grad_norm": 0.17695072293281555, + "learning_rate": 0.001, + "loss": 1.8508, + "step": 20855 + }, + { + "epoch": 0.8823081478974533, + "grad_norm": 0.147260844707489, + "learning_rate": 0.001, + "loss": 1.9841, + "step": 20856 + }, + { + "epoch": 0.8823504526609697, + "grad_norm": 0.13849042356014252, + "learning_rate": 0.001, + "loss": 1.6593, + "step": 20857 + }, + { + "epoch": 0.882392757424486, + "grad_norm": 1.9527419805526733, + "learning_rate": 0.001, + "loss": 2.0942, + "step": 20858 + }, + { + "epoch": 0.8824350621880024, + "grad_norm": 0.18281447887420654, + "learning_rate": 0.001, + "loss": 3.9133, + "step": 20859 + }, + { + "epoch": 0.8824773669515188, + "grad_norm": 0.8285083174705505, + "learning_rate": 0.001, + "loss": 2.7313, + "step": 20860 + }, + { + "epoch": 0.8825196717150351, + "grad_norm": 0.163629949092865, + "learning_rate": 0.001, + "loss": 1.8258, + "step": 20861 + }, + { + "epoch": 0.8825619764785515, + "grad_norm": 0.6261329054832458, + "learning_rate": 0.001, + "loss": 1.9942, + "step": 20862 + }, + { + "epoch": 0.8826042812420679, + "grad_norm": 0.11821243911981583, + "learning_rate": 0.001, + "loss": 2.0009, + "step": 20863 + }, + { + "epoch": 0.8826465860055842, + "grad_norm": 0.17022605240345, + "learning_rate": 0.001, + "loss": 2.0427, + "step": 20864 + }, + { + "epoch": 0.8826888907691006, + "grad_norm": 0.28422749042510986, + "learning_rate": 0.001, + "loss": 2.141, + "step": 20865 + }, + { + "epoch": 0.8827311955326169, + "grad_norm": 0.1458466500043869, + "learning_rate": 0.001, + "loss": 1.9478, + "step": 20866 + }, + { + "epoch": 0.8827735002961333, + "grad_norm": 0.1322123408317566, + "learning_rate": 0.001, + "loss": 1.4996, + "step": 20867 + }, + { + "epoch": 0.8828158050596497, + "grad_norm": 0.1259404420852661, + "learning_rate": 0.001, + "loss": 1.9031, + "step": 20868 + }, + { + "epoch": 0.882858109823166, + "grad_norm": 0.23596730828285217, + "learning_rate": 0.001, + "loss": 1.8853, + "step": 20869 + }, + { + "epoch": 0.8829004145866824, + "grad_norm": 0.15984271466732025, + "learning_rate": 0.001, + "loss": 3.1109, + "step": 20870 + }, + { + "epoch": 0.8829427193501989, + "grad_norm": 0.1410568356513977, + "learning_rate": 0.001, + "loss": 2.5422, + "step": 20871 + }, + { + "epoch": 0.8829850241137152, + "grad_norm": 0.15058191120624542, + "learning_rate": 0.001, + "loss": 2.1368, + "step": 20872 + }, + { + "epoch": 0.8830273288772316, + "grad_norm": 0.1391940861940384, + "learning_rate": 0.001, + "loss": 2.1887, + "step": 20873 + }, + { + "epoch": 0.883069633640748, + "grad_norm": 0.1491466760635376, + "learning_rate": 0.001, + "loss": 1.7719, + "step": 20874 + }, + { + "epoch": 0.8831119384042643, + "grad_norm": 0.13309025764465332, + "learning_rate": 0.001, + "loss": 1.9236, + "step": 20875 + }, + { + "epoch": 0.8831542431677807, + "grad_norm": 0.1632324457168579, + "learning_rate": 0.001, + "loss": 2.0839, + "step": 20876 + }, + { + "epoch": 0.8831965479312971, + "grad_norm": 0.26213958859443665, + "learning_rate": 0.001, + "loss": 2.0513, + "step": 20877 + }, + { + "epoch": 0.8832388526948134, + "grad_norm": 0.14815469086170197, + "learning_rate": 0.001, + "loss": 2.0672, + "step": 20878 + }, + { + "epoch": 0.8832811574583298, + "grad_norm": 0.243429496884346, + "learning_rate": 0.001, + "loss": 3.2333, + "step": 20879 + }, + { + "epoch": 0.8833234622218462, + "grad_norm": 0.5001935958862305, + "learning_rate": 0.001, + "loss": 2.6958, + "step": 20880 + }, + { + "epoch": 0.8833657669853625, + "grad_norm": 0.14644937217235565, + "learning_rate": 0.001, + "loss": 1.6308, + "step": 20881 + }, + { + "epoch": 0.8834080717488789, + "grad_norm": 0.1332141011953354, + "learning_rate": 0.001, + "loss": 1.7561, + "step": 20882 + }, + { + "epoch": 0.8834503765123953, + "grad_norm": 0.5288890600204468, + "learning_rate": 0.001, + "loss": 2.2926, + "step": 20883 + }, + { + "epoch": 0.8834926812759116, + "grad_norm": 0.14572833478450775, + "learning_rate": 0.001, + "loss": 2.7297, + "step": 20884 + }, + { + "epoch": 0.883534986039428, + "grad_norm": 0.16958162188529968, + "learning_rate": 0.001, + "loss": 2.3938, + "step": 20885 + }, + { + "epoch": 0.8835772908029444, + "grad_norm": 0.16472996771335602, + "learning_rate": 0.001, + "loss": 2.0189, + "step": 20886 + }, + { + "epoch": 0.8836195955664607, + "grad_norm": 0.14302243292331696, + "learning_rate": 0.001, + "loss": 1.9094, + "step": 20887 + }, + { + "epoch": 0.8836619003299772, + "grad_norm": 0.14594019949436188, + "learning_rate": 0.001, + "loss": 1.9324, + "step": 20888 + }, + { + "epoch": 0.8837042050934936, + "grad_norm": 0.1699601113796234, + "learning_rate": 0.001, + "loss": 2.5119, + "step": 20889 + }, + { + "epoch": 0.8837465098570099, + "grad_norm": 0.16255362331867218, + "learning_rate": 0.001, + "loss": 3.1678, + "step": 20890 + }, + { + "epoch": 0.8837888146205263, + "grad_norm": 0.2458498328924179, + "learning_rate": 0.001, + "loss": 1.8441, + "step": 20891 + }, + { + "epoch": 0.8838311193840427, + "grad_norm": 0.14497283101081848, + "learning_rate": 0.001, + "loss": 2.5701, + "step": 20892 + }, + { + "epoch": 0.883873424147559, + "grad_norm": 2.2221720218658447, + "learning_rate": 0.001, + "loss": 2.7294, + "step": 20893 + }, + { + "epoch": 0.8839157289110754, + "grad_norm": 0.17696496844291687, + "learning_rate": 0.001, + "loss": 2.1181, + "step": 20894 + }, + { + "epoch": 0.8839580336745918, + "grad_norm": 0.1702883541584015, + "learning_rate": 0.001, + "loss": 2.9156, + "step": 20895 + }, + { + "epoch": 0.8840003384381081, + "grad_norm": 0.1452932506799698, + "learning_rate": 0.001, + "loss": 1.5805, + "step": 20896 + }, + { + "epoch": 0.8840426432016245, + "grad_norm": 0.14172394573688507, + "learning_rate": 0.001, + "loss": 1.5672, + "step": 20897 + }, + { + "epoch": 0.8840849479651409, + "grad_norm": 0.1812184453010559, + "learning_rate": 0.001, + "loss": 2.7763, + "step": 20898 + }, + { + "epoch": 0.8841272527286572, + "grad_norm": 0.1486140936613083, + "learning_rate": 0.001, + "loss": 2.2094, + "step": 20899 + }, + { + "epoch": 0.8841695574921736, + "grad_norm": 0.17032857239246368, + "learning_rate": 0.001, + "loss": 1.8822, + "step": 20900 + }, + { + "epoch": 0.88421186225569, + "grad_norm": 0.21682803332805634, + "learning_rate": 0.001, + "loss": 2.0348, + "step": 20901 + }, + { + "epoch": 0.8842541670192063, + "grad_norm": 0.1323508471250534, + "learning_rate": 0.001, + "loss": 1.682, + "step": 20902 + }, + { + "epoch": 0.8842964717827227, + "grad_norm": 0.3486897647380829, + "learning_rate": 0.001, + "loss": 2.2666, + "step": 20903 + }, + { + "epoch": 0.8843387765462392, + "grad_norm": 3.172335386276245, + "learning_rate": 0.001, + "loss": 2.2502, + "step": 20904 + }, + { + "epoch": 0.8843810813097555, + "grad_norm": 0.14836838841438293, + "learning_rate": 0.001, + "loss": 1.4472, + "step": 20905 + }, + { + "epoch": 0.8844233860732719, + "grad_norm": 0.195405974984169, + "learning_rate": 0.001, + "loss": 1.8995, + "step": 20906 + }, + { + "epoch": 0.8844656908367883, + "grad_norm": 0.41760605573654175, + "learning_rate": 0.001, + "loss": 2.139, + "step": 20907 + }, + { + "epoch": 0.8845079956003046, + "grad_norm": 0.16416040062904358, + "learning_rate": 0.001, + "loss": 2.0343, + "step": 20908 + }, + { + "epoch": 0.884550300363821, + "grad_norm": 0.16729791462421417, + "learning_rate": 0.001, + "loss": 2.0194, + "step": 20909 + }, + { + "epoch": 0.8845926051273373, + "grad_norm": 0.15572674572467804, + "learning_rate": 0.001, + "loss": 1.7202, + "step": 20910 + }, + { + "epoch": 0.8846349098908537, + "grad_norm": 36.179466247558594, + "learning_rate": 0.001, + "loss": 1.8561, + "step": 20911 + }, + { + "epoch": 0.8846772146543701, + "grad_norm": 0.15077073872089386, + "learning_rate": 0.001, + "loss": 1.6993, + "step": 20912 + }, + { + "epoch": 0.8847195194178864, + "grad_norm": 0.1646421253681183, + "learning_rate": 0.001, + "loss": 2.058, + "step": 20913 + }, + { + "epoch": 0.8847618241814028, + "grad_norm": 0.2249441295862198, + "learning_rate": 0.001, + "loss": 2.8581, + "step": 20914 + }, + { + "epoch": 0.8848041289449192, + "grad_norm": 0.19129936397075653, + "learning_rate": 0.001, + "loss": 2.1256, + "step": 20915 + }, + { + "epoch": 0.8848464337084355, + "grad_norm": 0.16032202541828156, + "learning_rate": 0.001, + "loss": 1.7309, + "step": 20916 + }, + { + "epoch": 0.8848887384719519, + "grad_norm": 0.21041136980056763, + "learning_rate": 0.001, + "loss": 3.1444, + "step": 20917 + }, + { + "epoch": 0.8849310432354683, + "grad_norm": 0.1504105031490326, + "learning_rate": 0.001, + "loss": 1.4867, + "step": 20918 + }, + { + "epoch": 0.8849733479989846, + "grad_norm": 0.16670987010002136, + "learning_rate": 0.001, + "loss": 1.6321, + "step": 20919 + }, + { + "epoch": 0.885015652762501, + "grad_norm": 0.22837378084659576, + "learning_rate": 0.001, + "loss": 2.2452, + "step": 20920 + }, + { + "epoch": 0.8850579575260175, + "grad_norm": 0.15080545842647552, + "learning_rate": 0.001, + "loss": 2.2631, + "step": 20921 + }, + { + "epoch": 0.8851002622895338, + "grad_norm": 0.16845811903476715, + "learning_rate": 0.001, + "loss": 1.9818, + "step": 20922 + }, + { + "epoch": 0.8851425670530502, + "grad_norm": 0.6838306784629822, + "learning_rate": 0.001, + "loss": 1.6603, + "step": 20923 + }, + { + "epoch": 0.8851848718165666, + "grad_norm": 0.12884469330310822, + "learning_rate": 0.001, + "loss": 1.7758, + "step": 20924 + }, + { + "epoch": 0.8852271765800829, + "grad_norm": 0.7844867706298828, + "learning_rate": 0.001, + "loss": 2.031, + "step": 20925 + }, + { + "epoch": 0.8852694813435993, + "grad_norm": 0.1945054531097412, + "learning_rate": 0.001, + "loss": 3.3753, + "step": 20926 + }, + { + "epoch": 0.8853117861071157, + "grad_norm": 30.440494537353516, + "learning_rate": 0.001, + "loss": 2.9578, + "step": 20927 + }, + { + "epoch": 0.885354090870632, + "grad_norm": 0.13637934625148773, + "learning_rate": 0.001, + "loss": 1.677, + "step": 20928 + }, + { + "epoch": 0.8853963956341484, + "grad_norm": 0.7692943811416626, + "learning_rate": 0.001, + "loss": 2.1675, + "step": 20929 + }, + { + "epoch": 0.8854387003976648, + "grad_norm": 0.9168503880500793, + "learning_rate": 0.001, + "loss": 2.1578, + "step": 20930 + }, + { + "epoch": 0.8854810051611811, + "grad_norm": 0.1996372491121292, + "learning_rate": 0.001, + "loss": 2.6837, + "step": 20931 + }, + { + "epoch": 0.8855233099246975, + "grad_norm": 0.15608003735542297, + "learning_rate": 0.001, + "loss": 3.4255, + "step": 20932 + }, + { + "epoch": 0.8855656146882139, + "grad_norm": 0.7648287415504456, + "learning_rate": 0.001, + "loss": 1.6965, + "step": 20933 + }, + { + "epoch": 0.8856079194517302, + "grad_norm": 8.121953964233398, + "learning_rate": 0.001, + "loss": 2.2537, + "step": 20934 + }, + { + "epoch": 0.8856502242152466, + "grad_norm": 2.677286148071289, + "learning_rate": 0.001, + "loss": 3.2583, + "step": 20935 + }, + { + "epoch": 0.885692528978763, + "grad_norm": 0.17523032426834106, + "learning_rate": 0.001, + "loss": 2.1655, + "step": 20936 + }, + { + "epoch": 0.8857348337422793, + "grad_norm": 0.16115929186344147, + "learning_rate": 0.001, + "loss": 1.5949, + "step": 20937 + }, + { + "epoch": 0.8857771385057958, + "grad_norm": 0.18050625920295715, + "learning_rate": 0.001, + "loss": 2.2334, + "step": 20938 + }, + { + "epoch": 0.8858194432693122, + "grad_norm": 0.19290222227573395, + "learning_rate": 0.001, + "loss": 2.4616, + "step": 20939 + }, + { + "epoch": 0.8858617480328285, + "grad_norm": 0.15666300058364868, + "learning_rate": 0.001, + "loss": 2.5885, + "step": 20940 + }, + { + "epoch": 0.8859040527963449, + "grad_norm": 0.21814775466918945, + "learning_rate": 0.001, + "loss": 1.8548, + "step": 20941 + }, + { + "epoch": 0.8859463575598613, + "grad_norm": 0.18262353539466858, + "learning_rate": 0.001, + "loss": 1.8053, + "step": 20942 + }, + { + "epoch": 0.8859886623233776, + "grad_norm": 0.4183075726032257, + "learning_rate": 0.001, + "loss": 2.103, + "step": 20943 + }, + { + "epoch": 0.886030967086894, + "grad_norm": 0.18610745668411255, + "learning_rate": 0.001, + "loss": 2.7605, + "step": 20944 + }, + { + "epoch": 0.8860732718504104, + "grad_norm": 10.397679328918457, + "learning_rate": 0.001, + "loss": 2.124, + "step": 20945 + }, + { + "epoch": 0.8861155766139267, + "grad_norm": 0.14724650979042053, + "learning_rate": 0.001, + "loss": 2.377, + "step": 20946 + }, + { + "epoch": 0.8861578813774431, + "grad_norm": 0.4719243049621582, + "learning_rate": 0.001, + "loss": 1.979, + "step": 20947 + }, + { + "epoch": 0.8862001861409595, + "grad_norm": 0.21508583426475525, + "learning_rate": 0.001, + "loss": 1.8572, + "step": 20948 + }, + { + "epoch": 0.8862424909044758, + "grad_norm": 0.16444562375545502, + "learning_rate": 0.001, + "loss": 1.8839, + "step": 20949 + }, + { + "epoch": 0.8862847956679922, + "grad_norm": 0.22835667431354523, + "learning_rate": 0.001, + "loss": 2.5681, + "step": 20950 + }, + { + "epoch": 0.8863271004315086, + "grad_norm": 0.17170853912830353, + "learning_rate": 0.001, + "loss": 2.0785, + "step": 20951 + }, + { + "epoch": 0.8863694051950249, + "grad_norm": 0.135117307305336, + "learning_rate": 0.001, + "loss": 2.4973, + "step": 20952 + }, + { + "epoch": 0.8864117099585413, + "grad_norm": 1.2881138324737549, + "learning_rate": 0.001, + "loss": 1.8363, + "step": 20953 + }, + { + "epoch": 0.8864540147220576, + "grad_norm": 0.1489884853363037, + "learning_rate": 0.001, + "loss": 1.9252, + "step": 20954 + }, + { + "epoch": 0.8864963194855741, + "grad_norm": 0.14874590933322906, + "learning_rate": 0.001, + "loss": 1.4085, + "step": 20955 + }, + { + "epoch": 0.8865386242490905, + "grad_norm": 0.13370899856090546, + "learning_rate": 0.001, + "loss": 2.2094, + "step": 20956 + }, + { + "epoch": 0.8865809290126068, + "grad_norm": 4.428647041320801, + "learning_rate": 0.001, + "loss": 2.2555, + "step": 20957 + }, + { + "epoch": 0.8866232337761232, + "grad_norm": 0.12054305523633957, + "learning_rate": 0.001, + "loss": 2.2724, + "step": 20958 + }, + { + "epoch": 0.8866655385396396, + "grad_norm": 0.19234895706176758, + "learning_rate": 0.001, + "loss": 2.7066, + "step": 20959 + }, + { + "epoch": 0.8867078433031559, + "grad_norm": 0.18497143685817719, + "learning_rate": 0.001, + "loss": 2.3736, + "step": 20960 + }, + { + "epoch": 0.8867501480666723, + "grad_norm": 0.2834151089191437, + "learning_rate": 0.001, + "loss": 2.2192, + "step": 20961 + }, + { + "epoch": 0.8867924528301887, + "grad_norm": 0.18010100722312927, + "learning_rate": 0.001, + "loss": 2.4076, + "step": 20962 + }, + { + "epoch": 0.886834757593705, + "grad_norm": 0.3335997462272644, + "learning_rate": 0.001, + "loss": 3.2665, + "step": 20963 + }, + { + "epoch": 0.8868770623572214, + "grad_norm": 0.15331315994262695, + "learning_rate": 0.001, + "loss": 2.5892, + "step": 20964 + }, + { + "epoch": 0.8869193671207378, + "grad_norm": 0.16281548142433167, + "learning_rate": 0.001, + "loss": 2.1734, + "step": 20965 + }, + { + "epoch": 0.8869616718842541, + "grad_norm": 0.17195676267147064, + "learning_rate": 0.001, + "loss": 2.339, + "step": 20966 + }, + { + "epoch": 0.8870039766477705, + "grad_norm": 0.13720469176769257, + "learning_rate": 0.001, + "loss": 1.9478, + "step": 20967 + }, + { + "epoch": 0.8870462814112869, + "grad_norm": 0.17597544193267822, + "learning_rate": 0.001, + "loss": 1.9665, + "step": 20968 + }, + { + "epoch": 0.8870885861748032, + "grad_norm": 0.1732187271118164, + "learning_rate": 0.001, + "loss": 1.9538, + "step": 20969 + }, + { + "epoch": 0.8871308909383196, + "grad_norm": 0.14298303425312042, + "learning_rate": 0.001, + "loss": 2.8327, + "step": 20970 + }, + { + "epoch": 0.8871731957018361, + "grad_norm": 0.15061107277870178, + "learning_rate": 0.001, + "loss": 2.009, + "step": 20971 + }, + { + "epoch": 0.8872155004653524, + "grad_norm": 0.14526939392089844, + "learning_rate": 0.001, + "loss": 1.8338, + "step": 20972 + }, + { + "epoch": 0.8872578052288688, + "grad_norm": 0.1140129417181015, + "learning_rate": 0.001, + "loss": 1.2921, + "step": 20973 + }, + { + "epoch": 0.8873001099923852, + "grad_norm": 9.253741264343262, + "learning_rate": 0.001, + "loss": 2.1857, + "step": 20974 + }, + { + "epoch": 0.8873424147559015, + "grad_norm": 0.12813687324523926, + "learning_rate": 0.001, + "loss": 1.8503, + "step": 20975 + }, + { + "epoch": 0.8873847195194179, + "grad_norm": 0.1430562436580658, + "learning_rate": 0.001, + "loss": 1.9686, + "step": 20976 + }, + { + "epoch": 0.8874270242829343, + "grad_norm": 0.1441165953874588, + "learning_rate": 0.001, + "loss": 2.0366, + "step": 20977 + }, + { + "epoch": 0.8874693290464506, + "grad_norm": 0.6266450881958008, + "learning_rate": 0.001, + "loss": 1.4746, + "step": 20978 + }, + { + "epoch": 0.887511633809967, + "grad_norm": 0.13469688594341278, + "learning_rate": 0.001, + "loss": 2.3698, + "step": 20979 + }, + { + "epoch": 0.8875539385734834, + "grad_norm": 0.13266874849796295, + "learning_rate": 0.001, + "loss": 1.6635, + "step": 20980 + }, + { + "epoch": 0.8875962433369997, + "grad_norm": 25.529739379882812, + "learning_rate": 0.001, + "loss": 2.0676, + "step": 20981 + }, + { + "epoch": 0.8876385481005161, + "grad_norm": 0.2457568198442459, + "learning_rate": 0.001, + "loss": 3.1092, + "step": 20982 + }, + { + "epoch": 0.8876808528640325, + "grad_norm": 0.15447290241718292, + "learning_rate": 0.001, + "loss": 1.8904, + "step": 20983 + }, + { + "epoch": 0.8877231576275488, + "grad_norm": 0.280048131942749, + "learning_rate": 0.001, + "loss": 2.263, + "step": 20984 + }, + { + "epoch": 0.8877654623910652, + "grad_norm": 0.11680661141872406, + "learning_rate": 0.001, + "loss": 1.9202, + "step": 20985 + }, + { + "epoch": 0.8878077671545817, + "grad_norm": 16.68817138671875, + "learning_rate": 0.001, + "loss": 1.7731, + "step": 20986 + }, + { + "epoch": 0.887850071918098, + "grad_norm": 0.17112363874912262, + "learning_rate": 0.001, + "loss": 3.0417, + "step": 20987 + }, + { + "epoch": 0.8878923766816144, + "grad_norm": 0.19305647909641266, + "learning_rate": 0.001, + "loss": 1.4348, + "step": 20988 + }, + { + "epoch": 0.8879346814451308, + "grad_norm": 0.162348210811615, + "learning_rate": 0.001, + "loss": 1.9347, + "step": 20989 + }, + { + "epoch": 0.8879769862086471, + "grad_norm": 0.14823856949806213, + "learning_rate": 0.001, + "loss": 1.9564, + "step": 20990 + }, + { + "epoch": 0.8880192909721635, + "grad_norm": 29.342689514160156, + "learning_rate": 0.001, + "loss": 2.334, + "step": 20991 + }, + { + "epoch": 0.8880615957356799, + "grad_norm": 0.7693207859992981, + "learning_rate": 0.001, + "loss": 1.766, + "step": 20992 + }, + { + "epoch": 0.8881039004991962, + "grad_norm": 1.0036860704421997, + "learning_rate": 0.001, + "loss": 2.2152, + "step": 20993 + }, + { + "epoch": 0.8881462052627126, + "grad_norm": 3.9714372158050537, + "learning_rate": 0.001, + "loss": 2.1756, + "step": 20994 + }, + { + "epoch": 0.888188510026229, + "grad_norm": 0.21561984717845917, + "learning_rate": 0.001, + "loss": 1.7397, + "step": 20995 + }, + { + "epoch": 0.8882308147897453, + "grad_norm": 0.1669800579547882, + "learning_rate": 0.001, + "loss": 2.7163, + "step": 20996 + }, + { + "epoch": 0.8882731195532617, + "grad_norm": 0.19429895281791687, + "learning_rate": 0.001, + "loss": 2.321, + "step": 20997 + }, + { + "epoch": 0.8883154243167781, + "grad_norm": 0.21047398447990417, + "learning_rate": 0.001, + "loss": 1.6531, + "step": 20998 + }, + { + "epoch": 0.8883577290802944, + "grad_norm": 0.20937636494636536, + "learning_rate": 0.001, + "loss": 2.1078, + "step": 20999 + }, + { + "epoch": 0.8884000338438108, + "grad_norm": 0.4300979673862457, + "learning_rate": 0.001, + "loss": 2.4772, + "step": 21000 + }, + { + "epoch": 0.8884423386073271, + "grad_norm": 0.18528619408607483, + "learning_rate": 0.001, + "loss": 1.5542, + "step": 21001 + }, + { + "epoch": 0.8884846433708435, + "grad_norm": 0.989427387714386, + "learning_rate": 0.001, + "loss": 1.9901, + "step": 21002 + }, + { + "epoch": 0.88852694813436, + "grad_norm": 0.19814398884773254, + "learning_rate": 0.001, + "loss": 2.3798, + "step": 21003 + }, + { + "epoch": 0.8885692528978762, + "grad_norm": 0.16753339767456055, + "learning_rate": 0.001, + "loss": 2.8676, + "step": 21004 + }, + { + "epoch": 0.8886115576613927, + "grad_norm": 0.1605309545993805, + "learning_rate": 0.001, + "loss": 1.5876, + "step": 21005 + }, + { + "epoch": 0.8886538624249091, + "grad_norm": 1.0436333417892456, + "learning_rate": 0.001, + "loss": 3.0352, + "step": 21006 + }, + { + "epoch": 0.8886961671884254, + "grad_norm": 0.5114812850952148, + "learning_rate": 0.001, + "loss": 1.9538, + "step": 21007 + }, + { + "epoch": 0.8887384719519418, + "grad_norm": 0.16753728687763214, + "learning_rate": 0.001, + "loss": 1.6974, + "step": 21008 + }, + { + "epoch": 0.8887807767154582, + "grad_norm": 0.17720456421375275, + "learning_rate": 0.001, + "loss": 2.0584, + "step": 21009 + }, + { + "epoch": 0.8888230814789745, + "grad_norm": 0.23170483112335205, + "learning_rate": 0.001, + "loss": 3.2882, + "step": 21010 + }, + { + "epoch": 0.8888653862424909, + "grad_norm": 0.41039520502090454, + "learning_rate": 0.001, + "loss": 2.4214, + "step": 21011 + }, + { + "epoch": 0.8889076910060073, + "grad_norm": 0.1671202927827835, + "learning_rate": 0.001, + "loss": 1.606, + "step": 21012 + }, + { + "epoch": 0.8889499957695236, + "grad_norm": 0.2298729419708252, + "learning_rate": 0.001, + "loss": 2.035, + "step": 21013 + }, + { + "epoch": 0.88899230053304, + "grad_norm": 0.1773902326822281, + "learning_rate": 0.001, + "loss": 2.9743, + "step": 21014 + }, + { + "epoch": 0.8890346052965564, + "grad_norm": 0.15954598784446716, + "learning_rate": 0.001, + "loss": 1.7877, + "step": 21015 + }, + { + "epoch": 0.8890769100600727, + "grad_norm": 0.1797361969947815, + "learning_rate": 0.001, + "loss": 2.4128, + "step": 21016 + }, + { + "epoch": 0.8891192148235891, + "grad_norm": 0.1873590052127838, + "learning_rate": 0.001, + "loss": 2.0114, + "step": 21017 + }, + { + "epoch": 0.8891615195871055, + "grad_norm": 0.20286858081817627, + "learning_rate": 0.001, + "loss": 1.7583, + "step": 21018 + }, + { + "epoch": 0.8892038243506218, + "grad_norm": 0.17168866097927094, + "learning_rate": 0.001, + "loss": 1.9782, + "step": 21019 + }, + { + "epoch": 0.8892461291141383, + "grad_norm": 0.1769993156194687, + "learning_rate": 0.001, + "loss": 1.922, + "step": 21020 + }, + { + "epoch": 0.8892884338776547, + "grad_norm": 0.1596214920282364, + "learning_rate": 0.001, + "loss": 2.1562, + "step": 21021 + }, + { + "epoch": 0.889330738641171, + "grad_norm": 1.326114535331726, + "learning_rate": 0.001, + "loss": 1.683, + "step": 21022 + }, + { + "epoch": 0.8893730434046874, + "grad_norm": 0.19646044075489044, + "learning_rate": 0.001, + "loss": 2.3388, + "step": 21023 + }, + { + "epoch": 0.8894153481682038, + "grad_norm": 0.173504039645195, + "learning_rate": 0.001, + "loss": 1.6195, + "step": 21024 + }, + { + "epoch": 0.8894576529317201, + "grad_norm": 0.145315483212471, + "learning_rate": 0.001, + "loss": 2.3937, + "step": 21025 + }, + { + "epoch": 0.8894999576952365, + "grad_norm": 0.3077605366706848, + "learning_rate": 0.001, + "loss": 3.0489, + "step": 21026 + }, + { + "epoch": 0.8895422624587529, + "grad_norm": 0.15277047455310822, + "learning_rate": 0.001, + "loss": 1.9625, + "step": 21027 + }, + { + "epoch": 0.8895845672222692, + "grad_norm": 7.509362697601318, + "learning_rate": 0.001, + "loss": 3.2375, + "step": 21028 + }, + { + "epoch": 0.8896268719857856, + "grad_norm": 0.23799103498458862, + "learning_rate": 0.001, + "loss": 1.8929, + "step": 21029 + }, + { + "epoch": 0.889669176749302, + "grad_norm": 0.16391988098621368, + "learning_rate": 0.001, + "loss": 2.3133, + "step": 21030 + }, + { + "epoch": 0.8897114815128183, + "grad_norm": 0.13483858108520508, + "learning_rate": 0.001, + "loss": 1.8435, + "step": 21031 + }, + { + "epoch": 0.8897537862763347, + "grad_norm": 0.1675466150045395, + "learning_rate": 0.001, + "loss": 2.2034, + "step": 21032 + }, + { + "epoch": 0.8897960910398511, + "grad_norm": 0.5026352405548096, + "learning_rate": 0.001, + "loss": 3.6788, + "step": 21033 + }, + { + "epoch": 0.8898383958033674, + "grad_norm": 0.1782296597957611, + "learning_rate": 0.001, + "loss": 1.971, + "step": 21034 + }, + { + "epoch": 0.8898807005668838, + "grad_norm": 0.1635374128818512, + "learning_rate": 0.001, + "loss": 1.5724, + "step": 21035 + }, + { + "epoch": 0.8899230053304003, + "grad_norm": 0.17833854258060455, + "learning_rate": 0.001, + "loss": 2.2641, + "step": 21036 + }, + { + "epoch": 0.8899653100939166, + "grad_norm": 0.15048396587371826, + "learning_rate": 0.001, + "loss": 1.8958, + "step": 21037 + }, + { + "epoch": 0.890007614857433, + "grad_norm": 0.4295186996459961, + "learning_rate": 0.001, + "loss": 3.0759, + "step": 21038 + }, + { + "epoch": 0.8900499196209494, + "grad_norm": 0.26509931683540344, + "learning_rate": 0.001, + "loss": 3.175, + "step": 21039 + }, + { + "epoch": 0.8900922243844657, + "grad_norm": 0.21517863869667053, + "learning_rate": 0.001, + "loss": 2.5894, + "step": 21040 + }, + { + "epoch": 0.8901345291479821, + "grad_norm": 0.15803544223308563, + "learning_rate": 0.001, + "loss": 1.7448, + "step": 21041 + }, + { + "epoch": 0.8901768339114985, + "grad_norm": 0.6315363645553589, + "learning_rate": 0.001, + "loss": 2.1491, + "step": 21042 + }, + { + "epoch": 0.8902191386750148, + "grad_norm": 0.1378752887248993, + "learning_rate": 0.001, + "loss": 1.8168, + "step": 21043 + }, + { + "epoch": 0.8902614434385312, + "grad_norm": 0.16324613988399506, + "learning_rate": 0.001, + "loss": 3.0616, + "step": 21044 + }, + { + "epoch": 0.8903037482020475, + "grad_norm": 0.16827210783958435, + "learning_rate": 0.001, + "loss": 1.9812, + "step": 21045 + }, + { + "epoch": 0.8903460529655639, + "grad_norm": 0.16511641442775726, + "learning_rate": 0.001, + "loss": 2.405, + "step": 21046 + }, + { + "epoch": 0.8903883577290803, + "grad_norm": 0.14627037942409515, + "learning_rate": 0.001, + "loss": 1.4163, + "step": 21047 + }, + { + "epoch": 0.8904306624925966, + "grad_norm": 8.328338623046875, + "learning_rate": 0.001, + "loss": 2.2162, + "step": 21048 + }, + { + "epoch": 0.890472967256113, + "grad_norm": 0.2969225347042084, + "learning_rate": 0.001, + "loss": 1.7355, + "step": 21049 + }, + { + "epoch": 0.8905152720196294, + "grad_norm": 0.17002110183238983, + "learning_rate": 0.001, + "loss": 1.5175, + "step": 21050 + }, + { + "epoch": 0.8905575767831457, + "grad_norm": 4.17087459564209, + "learning_rate": 0.001, + "loss": 1.9892, + "step": 21051 + }, + { + "epoch": 0.8905998815466621, + "grad_norm": 0.1925853043794632, + "learning_rate": 0.001, + "loss": 1.745, + "step": 21052 + }, + { + "epoch": 0.8906421863101786, + "grad_norm": 0.19640986621379852, + "learning_rate": 0.001, + "loss": 2.2911, + "step": 21053 + }, + { + "epoch": 0.8906844910736949, + "grad_norm": 0.3781888484954834, + "learning_rate": 0.001, + "loss": 1.5467, + "step": 21054 + }, + { + "epoch": 0.8907267958372113, + "grad_norm": 0.20206791162490845, + "learning_rate": 0.001, + "loss": 1.8796, + "step": 21055 + }, + { + "epoch": 0.8907691006007277, + "grad_norm": 0.13937760889530182, + "learning_rate": 0.001, + "loss": 2.0243, + "step": 21056 + }, + { + "epoch": 0.890811405364244, + "grad_norm": 0.2623637318611145, + "learning_rate": 0.001, + "loss": 1.7412, + "step": 21057 + }, + { + "epoch": 0.8908537101277604, + "grad_norm": 0.21680723130702972, + "learning_rate": 0.001, + "loss": 2.2334, + "step": 21058 + }, + { + "epoch": 0.8908960148912768, + "grad_norm": 0.18895690143108368, + "learning_rate": 0.001, + "loss": 1.9865, + "step": 21059 + }, + { + "epoch": 0.8909383196547931, + "grad_norm": 0.1650429368019104, + "learning_rate": 0.001, + "loss": 2.9103, + "step": 21060 + }, + { + "epoch": 0.8909806244183095, + "grad_norm": 0.32935771346092224, + "learning_rate": 0.001, + "loss": 2.0757, + "step": 21061 + }, + { + "epoch": 0.8910229291818259, + "grad_norm": 0.14048244059085846, + "learning_rate": 0.001, + "loss": 1.6505, + "step": 21062 + }, + { + "epoch": 0.8910652339453422, + "grad_norm": 0.15878313779830933, + "learning_rate": 0.001, + "loss": 2.1735, + "step": 21063 + }, + { + "epoch": 0.8911075387088586, + "grad_norm": 0.17732056975364685, + "learning_rate": 0.001, + "loss": 1.7493, + "step": 21064 + }, + { + "epoch": 0.891149843472375, + "grad_norm": 0.15534935891628265, + "learning_rate": 0.001, + "loss": 2.3862, + "step": 21065 + }, + { + "epoch": 0.8911921482358913, + "grad_norm": 0.5262129902839661, + "learning_rate": 0.001, + "loss": 1.3208, + "step": 21066 + }, + { + "epoch": 0.8912344529994077, + "grad_norm": 0.15670378506183624, + "learning_rate": 0.001, + "loss": 2.1793, + "step": 21067 + }, + { + "epoch": 0.8912767577629241, + "grad_norm": 1.0509247779846191, + "learning_rate": 0.001, + "loss": 1.7555, + "step": 21068 + }, + { + "epoch": 0.8913190625264404, + "grad_norm": 0.16009117662906647, + "learning_rate": 0.001, + "loss": 1.5403, + "step": 21069 + }, + { + "epoch": 0.8913613672899569, + "grad_norm": 0.19776473939418793, + "learning_rate": 0.001, + "loss": 1.93, + "step": 21070 + }, + { + "epoch": 0.8914036720534733, + "grad_norm": 0.12899529933929443, + "learning_rate": 0.001, + "loss": 1.7009, + "step": 21071 + }, + { + "epoch": 0.8914459768169896, + "grad_norm": 0.49440255761146545, + "learning_rate": 0.001, + "loss": 2.0824, + "step": 21072 + }, + { + "epoch": 0.891488281580506, + "grad_norm": 0.1522773802280426, + "learning_rate": 0.001, + "loss": 2.4829, + "step": 21073 + }, + { + "epoch": 0.8915305863440224, + "grad_norm": 0.16316981613636017, + "learning_rate": 0.001, + "loss": 1.7877, + "step": 21074 + }, + { + "epoch": 0.8915728911075387, + "grad_norm": 0.18345019221305847, + "learning_rate": 0.001, + "loss": 1.7857, + "step": 21075 + }, + { + "epoch": 0.8916151958710551, + "grad_norm": 0.19031278789043427, + "learning_rate": 0.001, + "loss": 2.7861, + "step": 21076 + }, + { + "epoch": 0.8916575006345715, + "grad_norm": 0.14421750605106354, + "learning_rate": 0.001, + "loss": 2.4813, + "step": 21077 + }, + { + "epoch": 0.8916998053980878, + "grad_norm": 0.13803726434707642, + "learning_rate": 0.001, + "loss": 1.7713, + "step": 21078 + }, + { + "epoch": 0.8917421101616042, + "grad_norm": 0.16265204548835754, + "learning_rate": 0.001, + "loss": 1.8057, + "step": 21079 + }, + { + "epoch": 0.8917844149251206, + "grad_norm": 0.17232954502105713, + "learning_rate": 0.001, + "loss": 1.8699, + "step": 21080 + }, + { + "epoch": 0.8918267196886369, + "grad_norm": 0.14032989740371704, + "learning_rate": 0.001, + "loss": 1.632, + "step": 21081 + }, + { + "epoch": 0.8918690244521533, + "grad_norm": 0.16456128656864166, + "learning_rate": 0.001, + "loss": 2.1099, + "step": 21082 + }, + { + "epoch": 0.8919113292156697, + "grad_norm": 0.6810503005981445, + "learning_rate": 0.001, + "loss": 2.6714, + "step": 21083 + }, + { + "epoch": 0.891953633979186, + "grad_norm": 0.12685897946357727, + "learning_rate": 0.001, + "loss": 1.3731, + "step": 21084 + }, + { + "epoch": 0.8919959387427024, + "grad_norm": 0.17308743298053741, + "learning_rate": 0.001, + "loss": 2.6058, + "step": 21085 + }, + { + "epoch": 0.8920382435062189, + "grad_norm": 0.17059041559696198, + "learning_rate": 0.001, + "loss": 2.7914, + "step": 21086 + }, + { + "epoch": 0.8920805482697352, + "grad_norm": 0.14783084392547607, + "learning_rate": 0.001, + "loss": 1.8769, + "step": 21087 + }, + { + "epoch": 0.8921228530332516, + "grad_norm": 0.1687219738960266, + "learning_rate": 0.001, + "loss": 2.3727, + "step": 21088 + }, + { + "epoch": 0.892165157796768, + "grad_norm": 0.1436719000339508, + "learning_rate": 0.001, + "loss": 1.8052, + "step": 21089 + }, + { + "epoch": 0.8922074625602843, + "grad_norm": 0.18090786039829254, + "learning_rate": 0.001, + "loss": 1.6288, + "step": 21090 + }, + { + "epoch": 0.8922497673238007, + "grad_norm": 0.2179936170578003, + "learning_rate": 0.001, + "loss": 1.6717, + "step": 21091 + }, + { + "epoch": 0.892292072087317, + "grad_norm": 0.36641108989715576, + "learning_rate": 0.001, + "loss": 2.6457, + "step": 21092 + }, + { + "epoch": 0.8923343768508334, + "grad_norm": 0.13992157578468323, + "learning_rate": 0.001, + "loss": 3.2825, + "step": 21093 + }, + { + "epoch": 0.8923766816143498, + "grad_norm": 2.9790940284729004, + "learning_rate": 0.001, + "loss": 2.9316, + "step": 21094 + }, + { + "epoch": 0.8924189863778661, + "grad_norm": 0.1301809698343277, + "learning_rate": 0.001, + "loss": 1.6709, + "step": 21095 + }, + { + "epoch": 0.8924612911413825, + "grad_norm": 0.13229301571846008, + "learning_rate": 0.001, + "loss": 1.9206, + "step": 21096 + }, + { + "epoch": 0.8925035959048989, + "grad_norm": 3.1202783584594727, + "learning_rate": 0.001, + "loss": 1.9438, + "step": 21097 + }, + { + "epoch": 0.8925459006684152, + "grad_norm": 0.17747105658054352, + "learning_rate": 0.001, + "loss": 2.134, + "step": 21098 + }, + { + "epoch": 0.8925882054319316, + "grad_norm": 0.16291233897209167, + "learning_rate": 0.001, + "loss": 2.8814, + "step": 21099 + }, + { + "epoch": 0.892630510195448, + "grad_norm": 0.13646738231182098, + "learning_rate": 0.001, + "loss": 2.265, + "step": 21100 + }, + { + "epoch": 0.8926728149589643, + "grad_norm": 0.19404670596122742, + "learning_rate": 0.001, + "loss": 2.3592, + "step": 21101 + }, + { + "epoch": 0.8927151197224807, + "grad_norm": 0.5409544110298157, + "learning_rate": 0.001, + "loss": 2.8194, + "step": 21102 + }, + { + "epoch": 0.8927574244859972, + "grad_norm": 36.899574279785156, + "learning_rate": 0.001, + "loss": 1.8905, + "step": 21103 + }, + { + "epoch": 0.8927997292495135, + "grad_norm": 0.705781877040863, + "learning_rate": 0.001, + "loss": 2.5204, + "step": 21104 + }, + { + "epoch": 0.8928420340130299, + "grad_norm": 0.14836551249027252, + "learning_rate": 0.001, + "loss": 1.9536, + "step": 21105 + }, + { + "epoch": 0.8928843387765463, + "grad_norm": 0.14388398826122284, + "learning_rate": 0.001, + "loss": 2.0019, + "step": 21106 + }, + { + "epoch": 0.8929266435400626, + "grad_norm": 0.15267707407474518, + "learning_rate": 0.001, + "loss": 1.9647, + "step": 21107 + }, + { + "epoch": 0.892968948303579, + "grad_norm": 0.8427422046661377, + "learning_rate": 0.001, + "loss": 2.3751, + "step": 21108 + }, + { + "epoch": 0.8930112530670954, + "grad_norm": 0.13757242262363434, + "learning_rate": 0.001, + "loss": 1.9751, + "step": 21109 + }, + { + "epoch": 0.8930535578306117, + "grad_norm": 4.548515319824219, + "learning_rate": 0.001, + "loss": 1.8397, + "step": 21110 + }, + { + "epoch": 0.8930958625941281, + "grad_norm": 0.14274819195270538, + "learning_rate": 0.001, + "loss": 1.7187, + "step": 21111 + }, + { + "epoch": 0.8931381673576445, + "grad_norm": 0.15041141211986542, + "learning_rate": 0.001, + "loss": 2.6026, + "step": 21112 + }, + { + "epoch": 0.8931804721211608, + "grad_norm": 0.1963375061750412, + "learning_rate": 0.001, + "loss": 2.2654, + "step": 21113 + }, + { + "epoch": 0.8932227768846772, + "grad_norm": 0.14106130599975586, + "learning_rate": 0.001, + "loss": 2.6673, + "step": 21114 + }, + { + "epoch": 0.8932650816481936, + "grad_norm": 0.15438930690288544, + "learning_rate": 0.001, + "loss": 1.556, + "step": 21115 + }, + { + "epoch": 0.8933073864117099, + "grad_norm": 0.18182194232940674, + "learning_rate": 0.001, + "loss": 2.803, + "step": 21116 + }, + { + "epoch": 0.8933496911752263, + "grad_norm": 2.966031074523926, + "learning_rate": 0.001, + "loss": 3.5384, + "step": 21117 + }, + { + "epoch": 0.8933919959387427, + "grad_norm": 0.16081485152244568, + "learning_rate": 0.001, + "loss": 2.8794, + "step": 21118 + }, + { + "epoch": 0.893434300702259, + "grad_norm": 0.15742535889148712, + "learning_rate": 0.001, + "loss": 1.7857, + "step": 21119 + }, + { + "epoch": 0.8934766054657755, + "grad_norm": 0.16597464680671692, + "learning_rate": 0.001, + "loss": 2.4486, + "step": 21120 + }, + { + "epoch": 0.8935189102292919, + "grad_norm": 0.1744956225156784, + "learning_rate": 0.001, + "loss": 2.2636, + "step": 21121 + }, + { + "epoch": 0.8935612149928082, + "grad_norm": 0.1758044958114624, + "learning_rate": 0.001, + "loss": 2.0722, + "step": 21122 + }, + { + "epoch": 0.8936035197563246, + "grad_norm": 0.1646287441253662, + "learning_rate": 0.001, + "loss": 2.4013, + "step": 21123 + }, + { + "epoch": 0.893645824519841, + "grad_norm": 1.3558330535888672, + "learning_rate": 0.001, + "loss": 1.6679, + "step": 21124 + }, + { + "epoch": 0.8936881292833573, + "grad_norm": 0.1626473218202591, + "learning_rate": 0.001, + "loss": 1.708, + "step": 21125 + }, + { + "epoch": 0.8937304340468737, + "grad_norm": 0.505549967288971, + "learning_rate": 0.001, + "loss": 1.9515, + "step": 21126 + }, + { + "epoch": 0.8937727388103901, + "grad_norm": 0.20900815725326538, + "learning_rate": 0.001, + "loss": 2.2761, + "step": 21127 + }, + { + "epoch": 0.8938150435739064, + "grad_norm": 0.1580853909254074, + "learning_rate": 0.001, + "loss": 2.0052, + "step": 21128 + }, + { + "epoch": 0.8938573483374228, + "grad_norm": 0.14948229491710663, + "learning_rate": 0.001, + "loss": 1.5362, + "step": 21129 + }, + { + "epoch": 0.8938996531009392, + "grad_norm": 0.17575781047344208, + "learning_rate": 0.001, + "loss": 1.8339, + "step": 21130 + }, + { + "epoch": 0.8939419578644555, + "grad_norm": 0.16846154630184174, + "learning_rate": 0.001, + "loss": 3.2954, + "step": 21131 + }, + { + "epoch": 0.8939842626279719, + "grad_norm": 0.18402104079723358, + "learning_rate": 0.001, + "loss": 2.1852, + "step": 21132 + }, + { + "epoch": 0.8940265673914883, + "grad_norm": 0.1554890275001526, + "learning_rate": 0.001, + "loss": 2.3382, + "step": 21133 + }, + { + "epoch": 0.8940688721550046, + "grad_norm": 0.13957303762435913, + "learning_rate": 0.001, + "loss": 2.1363, + "step": 21134 + }, + { + "epoch": 0.894111176918521, + "grad_norm": 0.16399943828582764, + "learning_rate": 0.001, + "loss": 1.6929, + "step": 21135 + }, + { + "epoch": 0.8941534816820373, + "grad_norm": 0.8293852210044861, + "learning_rate": 0.001, + "loss": 1.8677, + "step": 21136 + }, + { + "epoch": 0.8941957864455538, + "grad_norm": 0.15984512865543365, + "learning_rate": 0.001, + "loss": 1.7682, + "step": 21137 + }, + { + "epoch": 0.8942380912090702, + "grad_norm": 0.3316708505153656, + "learning_rate": 0.001, + "loss": 2.9118, + "step": 21138 + }, + { + "epoch": 0.8942803959725865, + "grad_norm": 0.1807178258895874, + "learning_rate": 0.001, + "loss": 2.949, + "step": 21139 + }, + { + "epoch": 0.8943227007361029, + "grad_norm": 0.13455355167388916, + "learning_rate": 0.001, + "loss": 1.867, + "step": 21140 + }, + { + "epoch": 0.8943650054996193, + "grad_norm": 0.1604076772928238, + "learning_rate": 0.001, + "loss": 3.1111, + "step": 21141 + }, + { + "epoch": 0.8944073102631356, + "grad_norm": 0.2634793221950531, + "learning_rate": 0.001, + "loss": 1.3338, + "step": 21142 + }, + { + "epoch": 0.894449615026652, + "grad_norm": 0.15633098781108856, + "learning_rate": 0.001, + "loss": 2.0984, + "step": 21143 + }, + { + "epoch": 0.8944919197901684, + "grad_norm": 0.16510465741157532, + "learning_rate": 0.001, + "loss": 1.7809, + "step": 21144 + }, + { + "epoch": 0.8945342245536847, + "grad_norm": 0.13871534168720245, + "learning_rate": 0.001, + "loss": 1.702, + "step": 21145 + }, + { + "epoch": 0.8945765293172011, + "grad_norm": 3.331571340560913, + "learning_rate": 0.001, + "loss": 3.3754, + "step": 21146 + }, + { + "epoch": 0.8946188340807175, + "grad_norm": 0.14018763601779938, + "learning_rate": 0.001, + "loss": 2.0596, + "step": 21147 + }, + { + "epoch": 0.8946611388442338, + "grad_norm": 0.14845266938209534, + "learning_rate": 0.001, + "loss": 1.8956, + "step": 21148 + }, + { + "epoch": 0.8947034436077502, + "grad_norm": 0.18321087956428528, + "learning_rate": 0.001, + "loss": 1.821, + "step": 21149 + }, + { + "epoch": 0.8947457483712666, + "grad_norm": 39.97751998901367, + "learning_rate": 0.001, + "loss": 2.6723, + "step": 21150 + }, + { + "epoch": 0.8947880531347829, + "grad_norm": 0.7367990612983704, + "learning_rate": 0.001, + "loss": 2.3226, + "step": 21151 + }, + { + "epoch": 0.8948303578982993, + "grad_norm": 0.15124103426933289, + "learning_rate": 0.001, + "loss": 1.99, + "step": 21152 + }, + { + "epoch": 0.8948726626618158, + "grad_norm": 0.14537779986858368, + "learning_rate": 0.001, + "loss": 2.4792, + "step": 21153 + }, + { + "epoch": 0.894914967425332, + "grad_norm": 0.16853304207324982, + "learning_rate": 0.001, + "loss": 2.233, + "step": 21154 + }, + { + "epoch": 0.8949572721888485, + "grad_norm": 0.17724493145942688, + "learning_rate": 0.001, + "loss": 2.6854, + "step": 21155 + }, + { + "epoch": 0.8949995769523649, + "grad_norm": 0.1677272468805313, + "learning_rate": 0.001, + "loss": 1.6028, + "step": 21156 + }, + { + "epoch": 0.8950418817158812, + "grad_norm": 0.17956793308258057, + "learning_rate": 0.001, + "loss": 1.9077, + "step": 21157 + }, + { + "epoch": 0.8950841864793976, + "grad_norm": 0.18584755063056946, + "learning_rate": 0.001, + "loss": 1.6491, + "step": 21158 + }, + { + "epoch": 0.895126491242914, + "grad_norm": 0.15615415573120117, + "learning_rate": 0.001, + "loss": 2.2369, + "step": 21159 + }, + { + "epoch": 0.8951687960064303, + "grad_norm": 0.16031022369861603, + "learning_rate": 0.001, + "loss": 1.9678, + "step": 21160 + }, + { + "epoch": 0.8952111007699467, + "grad_norm": 10.240532875061035, + "learning_rate": 0.001, + "loss": 3.2672, + "step": 21161 + }, + { + "epoch": 0.8952534055334631, + "grad_norm": 0.1851503998041153, + "learning_rate": 0.001, + "loss": 2.2191, + "step": 21162 + }, + { + "epoch": 0.8952957102969794, + "grad_norm": 0.17084771394729614, + "learning_rate": 0.001, + "loss": 1.9097, + "step": 21163 + }, + { + "epoch": 0.8953380150604958, + "grad_norm": 0.6595641374588013, + "learning_rate": 0.001, + "loss": 3.2942, + "step": 21164 + }, + { + "epoch": 0.8953803198240122, + "grad_norm": 0.12742707133293152, + "learning_rate": 0.001, + "loss": 2.523, + "step": 21165 + }, + { + "epoch": 0.8954226245875285, + "grad_norm": 3.2080767154693604, + "learning_rate": 0.001, + "loss": 2.5101, + "step": 21166 + }, + { + "epoch": 0.8954649293510449, + "grad_norm": 0.1244649738073349, + "learning_rate": 0.001, + "loss": 3.2143, + "step": 21167 + }, + { + "epoch": 0.8955072341145613, + "grad_norm": 0.14987188577651978, + "learning_rate": 0.001, + "loss": 2.8929, + "step": 21168 + }, + { + "epoch": 0.8955495388780776, + "grad_norm": 0.20873646438121796, + "learning_rate": 0.001, + "loss": 1.5905, + "step": 21169 + }, + { + "epoch": 0.895591843641594, + "grad_norm": 1.1376652717590332, + "learning_rate": 0.001, + "loss": 1.4977, + "step": 21170 + }, + { + "epoch": 0.8956341484051105, + "grad_norm": 0.16955767571926117, + "learning_rate": 0.001, + "loss": 2.2745, + "step": 21171 + }, + { + "epoch": 0.8956764531686268, + "grad_norm": 0.1822686642408371, + "learning_rate": 0.001, + "loss": 2.4192, + "step": 21172 + }, + { + "epoch": 0.8957187579321432, + "grad_norm": 0.1916595995426178, + "learning_rate": 0.001, + "loss": 1.7201, + "step": 21173 + }, + { + "epoch": 0.8957610626956596, + "grad_norm": 0.14500486850738525, + "learning_rate": 0.001, + "loss": 1.6737, + "step": 21174 + }, + { + "epoch": 0.8958033674591759, + "grad_norm": 0.15613558888435364, + "learning_rate": 0.001, + "loss": 1.9017, + "step": 21175 + }, + { + "epoch": 0.8958456722226923, + "grad_norm": 0.149849534034729, + "learning_rate": 0.001, + "loss": 2.7908, + "step": 21176 + }, + { + "epoch": 0.8958879769862087, + "grad_norm": 0.1522967666387558, + "learning_rate": 0.001, + "loss": 3.3811, + "step": 21177 + }, + { + "epoch": 0.895930281749725, + "grad_norm": 0.20839162170886993, + "learning_rate": 0.001, + "loss": 1.6453, + "step": 21178 + }, + { + "epoch": 0.8959725865132414, + "grad_norm": 0.18423724174499512, + "learning_rate": 0.001, + "loss": 2.3114, + "step": 21179 + }, + { + "epoch": 0.8960148912767577, + "grad_norm": 0.20367826521396637, + "learning_rate": 0.001, + "loss": 2.0914, + "step": 21180 + }, + { + "epoch": 0.8960571960402741, + "grad_norm": 2.9200098514556885, + "learning_rate": 0.001, + "loss": 1.8609, + "step": 21181 + }, + { + "epoch": 0.8960995008037905, + "grad_norm": 0.2010602205991745, + "learning_rate": 0.001, + "loss": 2.1336, + "step": 21182 + }, + { + "epoch": 0.8961418055673068, + "grad_norm": 0.15395566821098328, + "learning_rate": 0.001, + "loss": 3.506, + "step": 21183 + }, + { + "epoch": 0.8961841103308232, + "grad_norm": 0.17142242193222046, + "learning_rate": 0.001, + "loss": 3.1603, + "step": 21184 + }, + { + "epoch": 0.8962264150943396, + "grad_norm": 0.36323732137680054, + "learning_rate": 0.001, + "loss": 2.0423, + "step": 21185 + }, + { + "epoch": 0.8962687198578559, + "grad_norm": 1.7019706964492798, + "learning_rate": 0.001, + "loss": 3.0269, + "step": 21186 + }, + { + "epoch": 0.8963110246213724, + "grad_norm": 2.892049551010132, + "learning_rate": 0.001, + "loss": 2.2748, + "step": 21187 + }, + { + "epoch": 0.8963533293848888, + "grad_norm": 0.4944346845149994, + "learning_rate": 0.001, + "loss": 2.5055, + "step": 21188 + }, + { + "epoch": 0.8963956341484051, + "grad_norm": 0.19812704622745514, + "learning_rate": 0.001, + "loss": 1.639, + "step": 21189 + }, + { + "epoch": 0.8964379389119215, + "grad_norm": 0.32802027463912964, + "learning_rate": 0.001, + "loss": 2.4047, + "step": 21190 + }, + { + "epoch": 0.8964802436754379, + "grad_norm": 0.15056705474853516, + "learning_rate": 0.001, + "loss": 1.724, + "step": 21191 + }, + { + "epoch": 0.8965225484389542, + "grad_norm": 0.15505051612854004, + "learning_rate": 0.001, + "loss": 2.1101, + "step": 21192 + }, + { + "epoch": 0.8965648532024706, + "grad_norm": 0.16855968534946442, + "learning_rate": 0.001, + "loss": 1.7835, + "step": 21193 + }, + { + "epoch": 0.896607157965987, + "grad_norm": 0.15163734555244446, + "learning_rate": 0.001, + "loss": 1.4858, + "step": 21194 + }, + { + "epoch": 0.8966494627295033, + "grad_norm": 0.21959041059017181, + "learning_rate": 0.001, + "loss": 2.2257, + "step": 21195 + }, + { + "epoch": 0.8966917674930197, + "grad_norm": 0.18358977138996124, + "learning_rate": 0.001, + "loss": 1.8863, + "step": 21196 + }, + { + "epoch": 0.8967340722565361, + "grad_norm": 0.19511397182941437, + "learning_rate": 0.001, + "loss": 1.9636, + "step": 21197 + }, + { + "epoch": 0.8967763770200524, + "grad_norm": 0.19627627730369568, + "learning_rate": 0.001, + "loss": 2.0717, + "step": 21198 + }, + { + "epoch": 0.8968186817835688, + "grad_norm": 0.22323928773403168, + "learning_rate": 0.001, + "loss": 1.7666, + "step": 21199 + }, + { + "epoch": 0.8968609865470852, + "grad_norm": 0.14983603358268738, + "learning_rate": 0.001, + "loss": 2.8837, + "step": 21200 + }, + { + "epoch": 0.8969032913106015, + "grad_norm": 0.13686136901378632, + "learning_rate": 0.001, + "loss": 2.385, + "step": 21201 + }, + { + "epoch": 0.896945596074118, + "grad_norm": 0.15880164504051208, + "learning_rate": 0.001, + "loss": 2.1539, + "step": 21202 + }, + { + "epoch": 0.8969879008376344, + "grad_norm": 0.13948020339012146, + "learning_rate": 0.001, + "loss": 1.6086, + "step": 21203 + }, + { + "epoch": 0.8970302056011507, + "grad_norm": 0.12735982239246368, + "learning_rate": 0.001, + "loss": 1.8913, + "step": 21204 + }, + { + "epoch": 0.8970725103646671, + "grad_norm": 0.1532490998506546, + "learning_rate": 0.001, + "loss": 1.7689, + "step": 21205 + }, + { + "epoch": 0.8971148151281835, + "grad_norm": 0.20498578250408173, + "learning_rate": 0.001, + "loss": 1.7279, + "step": 21206 + }, + { + "epoch": 0.8971571198916998, + "grad_norm": 0.13713718950748444, + "learning_rate": 0.001, + "loss": 1.7066, + "step": 21207 + }, + { + "epoch": 0.8971994246552162, + "grad_norm": 0.15375199913978577, + "learning_rate": 0.001, + "loss": 1.5455, + "step": 21208 + }, + { + "epoch": 0.8972417294187326, + "grad_norm": 0.12200403213500977, + "learning_rate": 0.001, + "loss": 1.8769, + "step": 21209 + }, + { + "epoch": 0.8972840341822489, + "grad_norm": 0.15674884617328644, + "learning_rate": 0.001, + "loss": 2.4422, + "step": 21210 + }, + { + "epoch": 0.8973263389457653, + "grad_norm": 0.7319082617759705, + "learning_rate": 0.001, + "loss": 2.3794, + "step": 21211 + }, + { + "epoch": 0.8973686437092817, + "grad_norm": 0.14720921218395233, + "learning_rate": 0.001, + "loss": 1.9387, + "step": 21212 + }, + { + "epoch": 0.897410948472798, + "grad_norm": 0.2884654402732849, + "learning_rate": 0.001, + "loss": 3.2008, + "step": 21213 + }, + { + "epoch": 0.8974532532363144, + "grad_norm": 1.7841389179229736, + "learning_rate": 0.001, + "loss": 3.6275, + "step": 21214 + }, + { + "epoch": 0.8974955579998308, + "grad_norm": 0.7872580885887146, + "learning_rate": 0.001, + "loss": 1.6937, + "step": 21215 + }, + { + "epoch": 0.8975378627633471, + "grad_norm": 0.16804750263690948, + "learning_rate": 0.001, + "loss": 2.4865, + "step": 21216 + }, + { + "epoch": 0.8975801675268635, + "grad_norm": 0.18174222111701965, + "learning_rate": 0.001, + "loss": 2.2794, + "step": 21217 + }, + { + "epoch": 0.89762247229038, + "grad_norm": 0.21821044385433197, + "learning_rate": 0.001, + "loss": 3.6223, + "step": 21218 + }, + { + "epoch": 0.8976647770538962, + "grad_norm": 0.2736088037490845, + "learning_rate": 0.001, + "loss": 2.0796, + "step": 21219 + }, + { + "epoch": 0.8977070818174127, + "grad_norm": 0.15616385638713837, + "learning_rate": 0.001, + "loss": 2.9314, + "step": 21220 + }, + { + "epoch": 0.8977493865809291, + "grad_norm": 0.13997872173786163, + "learning_rate": 0.001, + "loss": 1.6217, + "step": 21221 + }, + { + "epoch": 0.8977916913444454, + "grad_norm": 0.17286120355129242, + "learning_rate": 0.001, + "loss": 2.5522, + "step": 21222 + }, + { + "epoch": 0.8978339961079618, + "grad_norm": 0.16818682849407196, + "learning_rate": 0.001, + "loss": 2.1953, + "step": 21223 + }, + { + "epoch": 0.8978763008714782, + "grad_norm": 0.8613235354423523, + "learning_rate": 0.001, + "loss": 2.311, + "step": 21224 + }, + { + "epoch": 0.8979186056349945, + "grad_norm": 1.0731678009033203, + "learning_rate": 0.001, + "loss": 3.8418, + "step": 21225 + }, + { + "epoch": 0.8979609103985109, + "grad_norm": 0.1638665348291397, + "learning_rate": 0.001, + "loss": 2.2482, + "step": 21226 + }, + { + "epoch": 0.8980032151620272, + "grad_norm": 0.3347175419330597, + "learning_rate": 0.001, + "loss": 1.9502, + "step": 21227 + }, + { + "epoch": 0.8980455199255436, + "grad_norm": 1.1207183599472046, + "learning_rate": 0.001, + "loss": 1.5674, + "step": 21228 + }, + { + "epoch": 0.89808782468906, + "grad_norm": 0.13141308724880219, + "learning_rate": 0.001, + "loss": 2.1263, + "step": 21229 + }, + { + "epoch": 0.8981301294525763, + "grad_norm": 0.14101554453372955, + "learning_rate": 0.001, + "loss": 2.591, + "step": 21230 + }, + { + "epoch": 0.8981724342160927, + "grad_norm": 0.17062920331954956, + "learning_rate": 0.001, + "loss": 1.6321, + "step": 21231 + }, + { + "epoch": 0.8982147389796091, + "grad_norm": 0.18323302268981934, + "learning_rate": 0.001, + "loss": 2.9428, + "step": 21232 + }, + { + "epoch": 0.8982570437431254, + "grad_norm": 0.15976440906524658, + "learning_rate": 0.001, + "loss": 1.8825, + "step": 21233 + }, + { + "epoch": 0.8982993485066418, + "grad_norm": 0.15031205117702484, + "learning_rate": 0.001, + "loss": 2.5819, + "step": 21234 + }, + { + "epoch": 0.8983416532701582, + "grad_norm": 4.328239917755127, + "learning_rate": 0.001, + "loss": 1.6748, + "step": 21235 + }, + { + "epoch": 0.8983839580336745, + "grad_norm": 0.17019319534301758, + "learning_rate": 0.001, + "loss": 3.3208, + "step": 21236 + }, + { + "epoch": 0.898426262797191, + "grad_norm": 0.21404282748699188, + "learning_rate": 0.001, + "loss": 1.6199, + "step": 21237 + }, + { + "epoch": 0.8984685675607074, + "grad_norm": 0.15579356253147125, + "learning_rate": 0.001, + "loss": 1.7415, + "step": 21238 + }, + { + "epoch": 0.8985108723242237, + "grad_norm": 0.3757559359073639, + "learning_rate": 0.001, + "loss": 1.8607, + "step": 21239 + }, + { + "epoch": 0.8985531770877401, + "grad_norm": 0.16331882774829865, + "learning_rate": 0.001, + "loss": 2.3862, + "step": 21240 + }, + { + "epoch": 0.8985954818512565, + "grad_norm": 0.1645757555961609, + "learning_rate": 0.001, + "loss": 1.8158, + "step": 21241 + }, + { + "epoch": 0.8986377866147728, + "grad_norm": 0.159241184592247, + "learning_rate": 0.001, + "loss": 1.7989, + "step": 21242 + }, + { + "epoch": 0.8986800913782892, + "grad_norm": 0.1543315201997757, + "learning_rate": 0.001, + "loss": 3.289, + "step": 21243 + }, + { + "epoch": 0.8987223961418056, + "grad_norm": 0.14359703660011292, + "learning_rate": 0.001, + "loss": 2.5562, + "step": 21244 + }, + { + "epoch": 0.8987647009053219, + "grad_norm": 2.587345600128174, + "learning_rate": 0.001, + "loss": 2.6168, + "step": 21245 + }, + { + "epoch": 0.8988070056688383, + "grad_norm": 0.17357079684734344, + "learning_rate": 0.001, + "loss": 1.9072, + "step": 21246 + }, + { + "epoch": 0.8988493104323547, + "grad_norm": 0.1991180181503296, + "learning_rate": 0.001, + "loss": 1.8736, + "step": 21247 + }, + { + "epoch": 0.898891615195871, + "grad_norm": 0.14238642156124115, + "learning_rate": 0.001, + "loss": 1.1997, + "step": 21248 + }, + { + "epoch": 0.8989339199593874, + "grad_norm": 0.20077811181545258, + "learning_rate": 0.001, + "loss": 2.0872, + "step": 21249 + }, + { + "epoch": 0.8989762247229038, + "grad_norm": 0.7532477378845215, + "learning_rate": 0.001, + "loss": 2.3801, + "step": 21250 + }, + { + "epoch": 0.8990185294864201, + "grad_norm": 0.1633615493774414, + "learning_rate": 0.001, + "loss": 2.2373, + "step": 21251 + }, + { + "epoch": 0.8990608342499365, + "grad_norm": 0.13562917709350586, + "learning_rate": 0.001, + "loss": 1.5633, + "step": 21252 + }, + { + "epoch": 0.899103139013453, + "grad_norm": 0.14501778781414032, + "learning_rate": 0.001, + "loss": 1.7519, + "step": 21253 + }, + { + "epoch": 0.8991454437769693, + "grad_norm": 0.14665882289409637, + "learning_rate": 0.001, + "loss": 1.2838, + "step": 21254 + }, + { + "epoch": 0.8991877485404857, + "grad_norm": 0.14085936546325684, + "learning_rate": 0.001, + "loss": 1.6832, + "step": 21255 + }, + { + "epoch": 0.8992300533040021, + "grad_norm": 0.5537809133529663, + "learning_rate": 0.001, + "loss": 1.9436, + "step": 21256 + }, + { + "epoch": 0.8992723580675184, + "grad_norm": 0.14022964239120483, + "learning_rate": 0.001, + "loss": 2.0391, + "step": 21257 + }, + { + "epoch": 0.8993146628310348, + "grad_norm": 0.13678425550460815, + "learning_rate": 0.001, + "loss": 1.2781, + "step": 21258 + }, + { + "epoch": 0.8993569675945512, + "grad_norm": 0.1590747982263565, + "learning_rate": 0.001, + "loss": 2.4737, + "step": 21259 + }, + { + "epoch": 0.8993992723580675, + "grad_norm": 0.31500861048698425, + "learning_rate": 0.001, + "loss": 2.0949, + "step": 21260 + }, + { + "epoch": 0.8994415771215839, + "grad_norm": 0.7429576516151428, + "learning_rate": 0.001, + "loss": 2.6399, + "step": 21261 + }, + { + "epoch": 0.8994838818851003, + "grad_norm": 0.1560697704553604, + "learning_rate": 0.001, + "loss": 2.4382, + "step": 21262 + }, + { + "epoch": 0.8995261866486166, + "grad_norm": 0.1646103411912918, + "learning_rate": 0.001, + "loss": 1.8626, + "step": 21263 + }, + { + "epoch": 0.899568491412133, + "grad_norm": 0.4191820025444031, + "learning_rate": 0.001, + "loss": 2.2337, + "step": 21264 + }, + { + "epoch": 0.8996107961756494, + "grad_norm": 0.16874629259109497, + "learning_rate": 0.001, + "loss": 3.4052, + "step": 21265 + }, + { + "epoch": 0.8996531009391657, + "grad_norm": 0.12579210102558136, + "learning_rate": 0.001, + "loss": 2.4531, + "step": 21266 + }, + { + "epoch": 0.8996954057026821, + "grad_norm": 0.14168886840343475, + "learning_rate": 0.001, + "loss": 2.4614, + "step": 21267 + }, + { + "epoch": 0.8997377104661985, + "grad_norm": 0.16749995946884155, + "learning_rate": 0.001, + "loss": 1.3653, + "step": 21268 + }, + { + "epoch": 0.8997800152297148, + "grad_norm": 0.6988632678985596, + "learning_rate": 0.001, + "loss": 1.7769, + "step": 21269 + }, + { + "epoch": 0.8998223199932313, + "grad_norm": 0.22250531613826752, + "learning_rate": 0.001, + "loss": 1.8593, + "step": 21270 + }, + { + "epoch": 0.8998646247567476, + "grad_norm": 0.11966440081596375, + "learning_rate": 0.001, + "loss": 1.4307, + "step": 21271 + }, + { + "epoch": 0.899906929520264, + "grad_norm": 0.3543547987937927, + "learning_rate": 0.001, + "loss": 1.6872, + "step": 21272 + }, + { + "epoch": 0.8999492342837804, + "grad_norm": 0.15212112665176392, + "learning_rate": 0.001, + "loss": 2.4326, + "step": 21273 + }, + { + "epoch": 0.8999915390472967, + "grad_norm": 0.14390593767166138, + "learning_rate": 0.001, + "loss": 1.6194, + "step": 21274 + }, + { + "epoch": 0.9000338438108131, + "grad_norm": 0.1561872363090515, + "learning_rate": 0.001, + "loss": 1.9529, + "step": 21275 + }, + { + "epoch": 0.9000761485743295, + "grad_norm": 0.14127811789512634, + "learning_rate": 0.001, + "loss": 1.3719, + "step": 21276 + }, + { + "epoch": 0.9001184533378458, + "grad_norm": 0.13319812715053558, + "learning_rate": 0.001, + "loss": 1.8501, + "step": 21277 + }, + { + "epoch": 0.9001607581013622, + "grad_norm": 0.13405123353004456, + "learning_rate": 0.001, + "loss": 2.0724, + "step": 21278 + }, + { + "epoch": 0.9002030628648786, + "grad_norm": 0.18458141386508942, + "learning_rate": 0.001, + "loss": 2.1271, + "step": 21279 + }, + { + "epoch": 0.9002453676283949, + "grad_norm": 0.1432388573884964, + "learning_rate": 0.001, + "loss": 1.6285, + "step": 21280 + }, + { + "epoch": 0.9002876723919113, + "grad_norm": 0.12944474816322327, + "learning_rate": 0.001, + "loss": 2.7784, + "step": 21281 + }, + { + "epoch": 0.9003299771554277, + "grad_norm": 0.305156946182251, + "learning_rate": 0.001, + "loss": 2.5529, + "step": 21282 + }, + { + "epoch": 0.900372281918944, + "grad_norm": 104.79377746582031, + "learning_rate": 0.001, + "loss": 2.4094, + "step": 21283 + }, + { + "epoch": 0.9004145866824604, + "grad_norm": 0.16589900851249695, + "learning_rate": 0.001, + "loss": 2.0991, + "step": 21284 + }, + { + "epoch": 0.9004568914459768, + "grad_norm": 0.14582109451293945, + "learning_rate": 0.001, + "loss": 2.7329, + "step": 21285 + }, + { + "epoch": 0.9004991962094931, + "grad_norm": 0.17398712038993835, + "learning_rate": 0.001, + "loss": 1.9659, + "step": 21286 + }, + { + "epoch": 0.9005415009730096, + "grad_norm": 0.16176342964172363, + "learning_rate": 0.001, + "loss": 1.6683, + "step": 21287 + }, + { + "epoch": 0.900583805736526, + "grad_norm": 0.2542744278907776, + "learning_rate": 0.001, + "loss": 2.9904, + "step": 21288 + }, + { + "epoch": 0.9006261105000423, + "grad_norm": 0.1428564190864563, + "learning_rate": 0.001, + "loss": 2.8341, + "step": 21289 + }, + { + "epoch": 0.9006684152635587, + "grad_norm": 0.16790971159934998, + "learning_rate": 0.001, + "loss": 1.8499, + "step": 21290 + }, + { + "epoch": 0.9007107200270751, + "grad_norm": 0.32263800501823425, + "learning_rate": 0.001, + "loss": 2.4642, + "step": 21291 + }, + { + "epoch": 0.9007530247905914, + "grad_norm": 0.19915816187858582, + "learning_rate": 0.001, + "loss": 1.686, + "step": 21292 + }, + { + "epoch": 0.9007953295541078, + "grad_norm": 0.17057372629642487, + "learning_rate": 0.001, + "loss": 1.6805, + "step": 21293 + }, + { + "epoch": 0.9008376343176242, + "grad_norm": 0.22040168941020966, + "learning_rate": 0.001, + "loss": 2.4221, + "step": 21294 + }, + { + "epoch": 0.9008799390811405, + "grad_norm": 0.41536402702331543, + "learning_rate": 0.001, + "loss": 1.8372, + "step": 21295 + }, + { + "epoch": 0.9009222438446569, + "grad_norm": 2.5897037982940674, + "learning_rate": 0.001, + "loss": 2.0451, + "step": 21296 + }, + { + "epoch": 0.9009645486081733, + "grad_norm": 0.16217264533042908, + "learning_rate": 0.001, + "loss": 2.2002, + "step": 21297 + }, + { + "epoch": 0.9010068533716896, + "grad_norm": 0.16462668776512146, + "learning_rate": 0.001, + "loss": 2.3787, + "step": 21298 + }, + { + "epoch": 0.901049158135206, + "grad_norm": 0.13496233522891998, + "learning_rate": 0.001, + "loss": 1.757, + "step": 21299 + }, + { + "epoch": 0.9010914628987224, + "grad_norm": 0.15986299514770508, + "learning_rate": 0.001, + "loss": 2.5716, + "step": 21300 + }, + { + "epoch": 0.9011337676622387, + "grad_norm": 2.0072743892669678, + "learning_rate": 0.001, + "loss": 1.5631, + "step": 21301 + }, + { + "epoch": 0.9011760724257551, + "grad_norm": 0.15145206451416016, + "learning_rate": 0.001, + "loss": 1.5825, + "step": 21302 + }, + { + "epoch": 0.9012183771892716, + "grad_norm": 0.1307021826505661, + "learning_rate": 0.001, + "loss": 2.7586, + "step": 21303 + }, + { + "epoch": 0.9012606819527879, + "grad_norm": 1.082275152206421, + "learning_rate": 0.001, + "loss": 1.5857, + "step": 21304 + }, + { + "epoch": 0.9013029867163043, + "grad_norm": 0.13324807584285736, + "learning_rate": 0.001, + "loss": 1.9865, + "step": 21305 + }, + { + "epoch": 0.9013452914798207, + "grad_norm": 0.18132184445858002, + "learning_rate": 0.001, + "loss": 2.2404, + "step": 21306 + }, + { + "epoch": 0.901387596243337, + "grad_norm": 0.15229110419750214, + "learning_rate": 0.001, + "loss": 1.9643, + "step": 21307 + }, + { + "epoch": 0.9014299010068534, + "grad_norm": 0.15678609907627106, + "learning_rate": 0.001, + "loss": 1.4846, + "step": 21308 + }, + { + "epoch": 0.9014722057703698, + "grad_norm": 0.2569018006324768, + "learning_rate": 0.001, + "loss": 1.4851, + "step": 21309 + }, + { + "epoch": 0.9015145105338861, + "grad_norm": 0.2765081822872162, + "learning_rate": 0.001, + "loss": 2.6487, + "step": 21310 + }, + { + "epoch": 0.9015568152974025, + "grad_norm": 0.4249289631843567, + "learning_rate": 0.001, + "loss": 2.114, + "step": 21311 + }, + { + "epoch": 0.9015991200609189, + "grad_norm": 0.14523530006408691, + "learning_rate": 0.001, + "loss": 1.4716, + "step": 21312 + }, + { + "epoch": 0.9016414248244352, + "grad_norm": 0.14755897223949432, + "learning_rate": 0.001, + "loss": 1.7763, + "step": 21313 + }, + { + "epoch": 0.9016837295879516, + "grad_norm": 6.201207637786865, + "learning_rate": 0.001, + "loss": 1.6094, + "step": 21314 + }, + { + "epoch": 0.9017260343514679, + "grad_norm": 0.15470610558986664, + "learning_rate": 0.001, + "loss": 2.9894, + "step": 21315 + }, + { + "epoch": 0.9017683391149843, + "grad_norm": 0.2583865821361542, + "learning_rate": 0.001, + "loss": 2.2243, + "step": 21316 + }, + { + "epoch": 0.9018106438785007, + "grad_norm": 0.1813354194164276, + "learning_rate": 0.001, + "loss": 1.8183, + "step": 21317 + }, + { + "epoch": 0.901852948642017, + "grad_norm": 0.1495773047208786, + "learning_rate": 0.001, + "loss": 2.1804, + "step": 21318 + }, + { + "epoch": 0.9018952534055334, + "grad_norm": 0.15756896138191223, + "learning_rate": 0.001, + "loss": 1.8869, + "step": 21319 + }, + { + "epoch": 0.9019375581690499, + "grad_norm": 0.44224387407302856, + "learning_rate": 0.001, + "loss": 2.3278, + "step": 21320 + }, + { + "epoch": 0.9019798629325662, + "grad_norm": 0.23249466717243195, + "learning_rate": 0.001, + "loss": 1.8345, + "step": 21321 + }, + { + "epoch": 0.9020221676960826, + "grad_norm": 0.17175745964050293, + "learning_rate": 0.001, + "loss": 1.687, + "step": 21322 + }, + { + "epoch": 0.902064472459599, + "grad_norm": 0.15828604996204376, + "learning_rate": 0.001, + "loss": 2.0996, + "step": 21323 + }, + { + "epoch": 0.9021067772231153, + "grad_norm": 0.1508798450231552, + "learning_rate": 0.001, + "loss": 2.1381, + "step": 21324 + }, + { + "epoch": 0.9021490819866317, + "grad_norm": 0.1373659372329712, + "learning_rate": 0.001, + "loss": 2.6336, + "step": 21325 + }, + { + "epoch": 0.9021913867501481, + "grad_norm": 0.3546662926673889, + "learning_rate": 0.001, + "loss": 2.1206, + "step": 21326 + }, + { + "epoch": 0.9022336915136644, + "grad_norm": 0.17910721898078918, + "learning_rate": 0.001, + "loss": 3.109, + "step": 21327 + }, + { + "epoch": 0.9022759962771808, + "grad_norm": 2.0273354053497314, + "learning_rate": 0.001, + "loss": 1.6428, + "step": 21328 + }, + { + "epoch": 0.9023183010406972, + "grad_norm": 0.17860253155231476, + "learning_rate": 0.001, + "loss": 2.9683, + "step": 21329 + }, + { + "epoch": 0.9023606058042135, + "grad_norm": 0.16391193866729736, + "learning_rate": 0.001, + "loss": 1.66, + "step": 21330 + }, + { + "epoch": 0.9024029105677299, + "grad_norm": 0.41829827427864075, + "learning_rate": 0.001, + "loss": 2.3392, + "step": 21331 + }, + { + "epoch": 0.9024452153312463, + "grad_norm": 0.15510700643062592, + "learning_rate": 0.001, + "loss": 2.1811, + "step": 21332 + }, + { + "epoch": 0.9024875200947626, + "grad_norm": 0.16545583307743073, + "learning_rate": 0.001, + "loss": 2.1031, + "step": 21333 + }, + { + "epoch": 0.902529824858279, + "grad_norm": 1.293203592300415, + "learning_rate": 0.001, + "loss": 1.5144, + "step": 21334 + }, + { + "epoch": 0.9025721296217954, + "grad_norm": 0.14911191165447235, + "learning_rate": 0.001, + "loss": 2.7838, + "step": 21335 + }, + { + "epoch": 0.9026144343853117, + "grad_norm": 0.2668335437774658, + "learning_rate": 0.001, + "loss": 3.2185, + "step": 21336 + }, + { + "epoch": 0.9026567391488282, + "grad_norm": 0.2777593433856964, + "learning_rate": 0.001, + "loss": 2.5853, + "step": 21337 + }, + { + "epoch": 0.9026990439123446, + "grad_norm": 0.6844047904014587, + "learning_rate": 0.001, + "loss": 1.6514, + "step": 21338 + }, + { + "epoch": 0.9027413486758609, + "grad_norm": 0.1647360622882843, + "learning_rate": 0.001, + "loss": 2.2108, + "step": 21339 + }, + { + "epoch": 0.9027836534393773, + "grad_norm": 0.15791663527488708, + "learning_rate": 0.001, + "loss": 1.6112, + "step": 21340 + }, + { + "epoch": 0.9028259582028937, + "grad_norm": 0.1473037302494049, + "learning_rate": 0.001, + "loss": 2.266, + "step": 21341 + }, + { + "epoch": 0.90286826296641, + "grad_norm": 0.15689495205879211, + "learning_rate": 0.001, + "loss": 1.765, + "step": 21342 + }, + { + "epoch": 0.9029105677299264, + "grad_norm": 0.18425515294075012, + "learning_rate": 0.001, + "loss": 1.8901, + "step": 21343 + }, + { + "epoch": 0.9029528724934428, + "grad_norm": 0.2768228054046631, + "learning_rate": 0.001, + "loss": 1.9768, + "step": 21344 + }, + { + "epoch": 0.9029951772569591, + "grad_norm": 0.15262466669082642, + "learning_rate": 0.001, + "loss": 2.0877, + "step": 21345 + }, + { + "epoch": 0.9030374820204755, + "grad_norm": 2.121443748474121, + "learning_rate": 0.001, + "loss": 2.5901, + "step": 21346 + }, + { + "epoch": 0.9030797867839919, + "grad_norm": 0.14500996470451355, + "learning_rate": 0.001, + "loss": 2.0742, + "step": 21347 + }, + { + "epoch": 0.9031220915475082, + "grad_norm": 0.19610632956027985, + "learning_rate": 0.001, + "loss": 2.2057, + "step": 21348 + }, + { + "epoch": 0.9031643963110246, + "grad_norm": 0.17215634882450104, + "learning_rate": 0.001, + "loss": 3.0487, + "step": 21349 + }, + { + "epoch": 0.903206701074541, + "grad_norm": 0.20982448756694794, + "learning_rate": 0.001, + "loss": 2.9082, + "step": 21350 + }, + { + "epoch": 0.9032490058380573, + "grad_norm": 4.3952484130859375, + "learning_rate": 0.001, + "loss": 1.7265, + "step": 21351 + }, + { + "epoch": 0.9032913106015737, + "grad_norm": 0.21057921648025513, + "learning_rate": 0.001, + "loss": 1.9388, + "step": 21352 + }, + { + "epoch": 0.9033336153650902, + "grad_norm": 0.17475013434886932, + "learning_rate": 0.001, + "loss": 2.6006, + "step": 21353 + }, + { + "epoch": 0.9033759201286065, + "grad_norm": 4.874578475952148, + "learning_rate": 0.001, + "loss": 2.2432, + "step": 21354 + }, + { + "epoch": 0.9034182248921229, + "grad_norm": 0.16315825283527374, + "learning_rate": 0.001, + "loss": 1.6954, + "step": 21355 + }, + { + "epoch": 0.9034605296556393, + "grad_norm": 0.1910039633512497, + "learning_rate": 0.001, + "loss": 2.0015, + "step": 21356 + }, + { + "epoch": 0.9035028344191556, + "grad_norm": 0.14265620708465576, + "learning_rate": 0.001, + "loss": 2.2112, + "step": 21357 + }, + { + "epoch": 0.903545139182672, + "grad_norm": 0.4426799714565277, + "learning_rate": 0.001, + "loss": 2.4595, + "step": 21358 + }, + { + "epoch": 0.9035874439461884, + "grad_norm": 0.13497935235500336, + "learning_rate": 0.001, + "loss": 2.0469, + "step": 21359 + }, + { + "epoch": 0.9036297487097047, + "grad_norm": 0.2634698152542114, + "learning_rate": 0.001, + "loss": 1.2785, + "step": 21360 + }, + { + "epoch": 0.9036720534732211, + "grad_norm": 7.950343608856201, + "learning_rate": 0.001, + "loss": 3.839, + "step": 21361 + }, + { + "epoch": 0.9037143582367374, + "grad_norm": 0.30899831652641296, + "learning_rate": 0.001, + "loss": 2.0495, + "step": 21362 + }, + { + "epoch": 0.9037566630002538, + "grad_norm": 0.5557863116264343, + "learning_rate": 0.001, + "loss": 2.8, + "step": 21363 + }, + { + "epoch": 0.9037989677637702, + "grad_norm": 0.14372897148132324, + "learning_rate": 0.001, + "loss": 1.3838, + "step": 21364 + }, + { + "epoch": 0.9038412725272865, + "grad_norm": 0.17638888955116272, + "learning_rate": 0.001, + "loss": 1.7204, + "step": 21365 + }, + { + "epoch": 0.9038835772908029, + "grad_norm": 0.5824902653694153, + "learning_rate": 0.001, + "loss": 2.1462, + "step": 21366 + }, + { + "epoch": 0.9039258820543193, + "grad_norm": 0.18031518161296844, + "learning_rate": 0.001, + "loss": 1.7274, + "step": 21367 + }, + { + "epoch": 0.9039681868178356, + "grad_norm": 0.1800137311220169, + "learning_rate": 0.001, + "loss": 2.7204, + "step": 21368 + }, + { + "epoch": 0.904010491581352, + "grad_norm": 1.138705849647522, + "learning_rate": 0.001, + "loss": 1.8414, + "step": 21369 + }, + { + "epoch": 0.9040527963448685, + "grad_norm": 0.164835125207901, + "learning_rate": 0.001, + "loss": 2.7179, + "step": 21370 + }, + { + "epoch": 0.9040951011083848, + "grad_norm": 0.15999598801136017, + "learning_rate": 0.001, + "loss": 2.2956, + "step": 21371 + }, + { + "epoch": 0.9041374058719012, + "grad_norm": 0.17977473139762878, + "learning_rate": 0.001, + "loss": 3.2465, + "step": 21372 + }, + { + "epoch": 0.9041797106354176, + "grad_norm": 0.15177109837532043, + "learning_rate": 0.001, + "loss": 1.9558, + "step": 21373 + }, + { + "epoch": 0.9042220153989339, + "grad_norm": 0.9045629501342773, + "learning_rate": 0.001, + "loss": 2.9101, + "step": 21374 + }, + { + "epoch": 0.9042643201624503, + "grad_norm": 0.15818485617637634, + "learning_rate": 0.001, + "loss": 2.2249, + "step": 21375 + }, + { + "epoch": 0.9043066249259667, + "grad_norm": 0.16541606187820435, + "learning_rate": 0.001, + "loss": 2.3738, + "step": 21376 + }, + { + "epoch": 0.904348929689483, + "grad_norm": 0.9918728470802307, + "learning_rate": 0.001, + "loss": 2.5462, + "step": 21377 + }, + { + "epoch": 0.9043912344529994, + "grad_norm": 0.12892168760299683, + "learning_rate": 0.001, + "loss": 3.0338, + "step": 21378 + }, + { + "epoch": 0.9044335392165158, + "grad_norm": 0.1821994036436081, + "learning_rate": 0.001, + "loss": 2.3973, + "step": 21379 + }, + { + "epoch": 0.9044758439800321, + "grad_norm": 0.3445868492126465, + "learning_rate": 0.001, + "loss": 2.9188, + "step": 21380 + }, + { + "epoch": 0.9045181487435485, + "grad_norm": 0.14604288339614868, + "learning_rate": 0.001, + "loss": 2.1574, + "step": 21381 + }, + { + "epoch": 0.9045604535070649, + "grad_norm": 0.14212435483932495, + "learning_rate": 0.001, + "loss": 2.1932, + "step": 21382 + }, + { + "epoch": 0.9046027582705812, + "grad_norm": 0.12347246706485748, + "learning_rate": 0.001, + "loss": 1.7564, + "step": 21383 + }, + { + "epoch": 0.9046450630340976, + "grad_norm": 0.3492525517940521, + "learning_rate": 0.001, + "loss": 1.7874, + "step": 21384 + }, + { + "epoch": 0.904687367797614, + "grad_norm": 0.1499442309141159, + "learning_rate": 0.001, + "loss": 2.3026, + "step": 21385 + }, + { + "epoch": 0.9047296725611303, + "grad_norm": 0.15123094618320465, + "learning_rate": 0.001, + "loss": 2.6157, + "step": 21386 + }, + { + "epoch": 0.9047719773246468, + "grad_norm": 0.18827593326568604, + "learning_rate": 0.001, + "loss": 2.5425, + "step": 21387 + }, + { + "epoch": 0.9048142820881632, + "grad_norm": 0.16215500235557556, + "learning_rate": 0.001, + "loss": 2.1401, + "step": 21388 + }, + { + "epoch": 0.9048565868516795, + "grad_norm": 0.15039432048797607, + "learning_rate": 0.001, + "loss": 2.3611, + "step": 21389 + }, + { + "epoch": 0.9048988916151959, + "grad_norm": 0.183966264128685, + "learning_rate": 0.001, + "loss": 2.4818, + "step": 21390 + }, + { + "epoch": 0.9049411963787123, + "grad_norm": 0.1891811490058899, + "learning_rate": 0.001, + "loss": 3.7626, + "step": 21391 + }, + { + "epoch": 0.9049835011422286, + "grad_norm": 0.18254221975803375, + "learning_rate": 0.001, + "loss": 1.7773, + "step": 21392 + }, + { + "epoch": 0.905025805905745, + "grad_norm": 0.14208267629146576, + "learning_rate": 0.001, + "loss": 1.8959, + "step": 21393 + }, + { + "epoch": 0.9050681106692614, + "grad_norm": 0.3248007297515869, + "learning_rate": 0.001, + "loss": 2.2422, + "step": 21394 + }, + { + "epoch": 0.9051104154327777, + "grad_norm": 0.15163081884384155, + "learning_rate": 0.001, + "loss": 2.1537, + "step": 21395 + }, + { + "epoch": 0.9051527201962941, + "grad_norm": 0.16903750598430634, + "learning_rate": 0.001, + "loss": 1.838, + "step": 21396 + }, + { + "epoch": 0.9051950249598105, + "grad_norm": 0.13609051704406738, + "learning_rate": 0.001, + "loss": 1.3032, + "step": 21397 + }, + { + "epoch": 0.9052373297233268, + "grad_norm": 0.21412265300750732, + "learning_rate": 0.001, + "loss": 2.4583, + "step": 21398 + }, + { + "epoch": 0.9052796344868432, + "grad_norm": 0.6192092895507812, + "learning_rate": 0.001, + "loss": 3.1105, + "step": 21399 + }, + { + "epoch": 0.9053219392503596, + "grad_norm": 1.6677172183990479, + "learning_rate": 0.001, + "loss": 1.8853, + "step": 21400 + }, + { + "epoch": 0.9053642440138759, + "grad_norm": 0.5495797395706177, + "learning_rate": 0.001, + "loss": 2.3331, + "step": 21401 + }, + { + "epoch": 0.9054065487773924, + "grad_norm": 0.5030525922775269, + "learning_rate": 0.001, + "loss": 1.9881, + "step": 21402 + }, + { + "epoch": 0.9054488535409088, + "grad_norm": 0.14512759447097778, + "learning_rate": 0.001, + "loss": 2.2818, + "step": 21403 + }, + { + "epoch": 0.9054911583044251, + "grad_norm": 0.12264164537191391, + "learning_rate": 0.001, + "loss": 1.9594, + "step": 21404 + }, + { + "epoch": 0.9055334630679415, + "grad_norm": 0.14045067131519318, + "learning_rate": 0.001, + "loss": 1.8512, + "step": 21405 + }, + { + "epoch": 0.9055757678314578, + "grad_norm": 0.4061392843723297, + "learning_rate": 0.001, + "loss": 2.3092, + "step": 21406 + }, + { + "epoch": 0.9056180725949742, + "grad_norm": 1.0429646968841553, + "learning_rate": 0.001, + "loss": 1.7809, + "step": 21407 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 0.14219164848327637, + "learning_rate": 0.001, + "loss": 1.641, + "step": 21408 + }, + { + "epoch": 0.9057026821220069, + "grad_norm": 0.14022192358970642, + "learning_rate": 0.001, + "loss": 2.4478, + "step": 21409 + }, + { + "epoch": 0.9057449868855233, + "grad_norm": 0.20621517300605774, + "learning_rate": 0.001, + "loss": 2.3145, + "step": 21410 + }, + { + "epoch": 0.9057872916490397, + "grad_norm": 0.1814170777797699, + "learning_rate": 0.001, + "loss": 1.846, + "step": 21411 + }, + { + "epoch": 0.905829596412556, + "grad_norm": 0.17007854580879211, + "learning_rate": 0.001, + "loss": 2.2049, + "step": 21412 + }, + { + "epoch": 0.9058719011760724, + "grad_norm": 0.16254855692386627, + "learning_rate": 0.001, + "loss": 2.3874, + "step": 21413 + }, + { + "epoch": 0.9059142059395888, + "grad_norm": 0.1488046646118164, + "learning_rate": 0.001, + "loss": 2.5976, + "step": 21414 + }, + { + "epoch": 0.9059565107031051, + "grad_norm": 0.18739381432533264, + "learning_rate": 0.001, + "loss": 2.8319, + "step": 21415 + }, + { + "epoch": 0.9059988154666215, + "grad_norm": 0.3353780210018158, + "learning_rate": 0.001, + "loss": 2.6556, + "step": 21416 + }, + { + "epoch": 0.9060411202301379, + "grad_norm": 0.9559656381607056, + "learning_rate": 0.001, + "loss": 2.9665, + "step": 21417 + }, + { + "epoch": 0.9060834249936542, + "grad_norm": 0.15689292550086975, + "learning_rate": 0.001, + "loss": 2.0337, + "step": 21418 + }, + { + "epoch": 0.9061257297571707, + "grad_norm": 0.13871730864048004, + "learning_rate": 0.001, + "loss": 1.9963, + "step": 21419 + }, + { + "epoch": 0.9061680345206871, + "grad_norm": 0.16745619475841522, + "learning_rate": 0.001, + "loss": 1.7658, + "step": 21420 + }, + { + "epoch": 0.9062103392842034, + "grad_norm": 0.14893504977226257, + "learning_rate": 0.001, + "loss": 1.7183, + "step": 21421 + }, + { + "epoch": 0.9062526440477198, + "grad_norm": 0.18129420280456543, + "learning_rate": 0.001, + "loss": 1.9251, + "step": 21422 + }, + { + "epoch": 0.9062949488112362, + "grad_norm": 0.22704291343688965, + "learning_rate": 0.001, + "loss": 1.8624, + "step": 21423 + }, + { + "epoch": 0.9063372535747525, + "grad_norm": 0.16960926353931427, + "learning_rate": 0.001, + "loss": 1.7591, + "step": 21424 + }, + { + "epoch": 0.9063795583382689, + "grad_norm": 0.1893257051706314, + "learning_rate": 0.001, + "loss": 1.8019, + "step": 21425 + }, + { + "epoch": 0.9064218631017853, + "grad_norm": 8.908495903015137, + "learning_rate": 0.001, + "loss": 1.3735, + "step": 21426 + }, + { + "epoch": 0.9064641678653016, + "grad_norm": 0.16186918318271637, + "learning_rate": 0.001, + "loss": 1.9056, + "step": 21427 + }, + { + "epoch": 0.906506472628818, + "grad_norm": 0.16607151925563812, + "learning_rate": 0.001, + "loss": 2.8435, + "step": 21428 + }, + { + "epoch": 0.9065487773923344, + "grad_norm": 0.34751367568969727, + "learning_rate": 0.001, + "loss": 1.9356, + "step": 21429 + }, + { + "epoch": 0.9065910821558507, + "grad_norm": 0.3750505745410919, + "learning_rate": 0.001, + "loss": 2.2931, + "step": 21430 + }, + { + "epoch": 0.9066333869193671, + "grad_norm": 0.1678369790315628, + "learning_rate": 0.001, + "loss": 1.6514, + "step": 21431 + }, + { + "epoch": 0.9066756916828835, + "grad_norm": 0.5519915223121643, + "learning_rate": 0.001, + "loss": 2.7002, + "step": 21432 + }, + { + "epoch": 0.9067179964463998, + "grad_norm": 0.1600515991449356, + "learning_rate": 0.001, + "loss": 1.8045, + "step": 21433 + }, + { + "epoch": 0.9067603012099162, + "grad_norm": 0.15844613313674927, + "learning_rate": 0.001, + "loss": 1.8964, + "step": 21434 + }, + { + "epoch": 0.9068026059734327, + "grad_norm": 0.1344721019268036, + "learning_rate": 0.001, + "loss": 1.9177, + "step": 21435 + }, + { + "epoch": 0.906844910736949, + "grad_norm": 0.16520515084266663, + "learning_rate": 0.001, + "loss": 2.6828, + "step": 21436 + }, + { + "epoch": 0.9068872155004654, + "grad_norm": 0.15085281431674957, + "learning_rate": 0.001, + "loss": 1.7172, + "step": 21437 + }, + { + "epoch": 0.9069295202639818, + "grad_norm": 0.23470507562160492, + "learning_rate": 0.001, + "loss": 2.8068, + "step": 21438 + }, + { + "epoch": 0.9069718250274981, + "grad_norm": 0.15734681487083435, + "learning_rate": 0.001, + "loss": 2.1509, + "step": 21439 + }, + { + "epoch": 0.9070141297910145, + "grad_norm": 0.19993236660957336, + "learning_rate": 0.001, + "loss": 1.9934, + "step": 21440 + }, + { + "epoch": 0.9070564345545309, + "grad_norm": 0.1488742232322693, + "learning_rate": 0.001, + "loss": 2.1611, + "step": 21441 + }, + { + "epoch": 0.9070987393180472, + "grad_norm": 0.15275177359580994, + "learning_rate": 0.001, + "loss": 1.9241, + "step": 21442 + }, + { + "epoch": 0.9071410440815636, + "grad_norm": 0.1733865737915039, + "learning_rate": 0.001, + "loss": 1.6173, + "step": 21443 + }, + { + "epoch": 0.90718334884508, + "grad_norm": 43.84885025024414, + "learning_rate": 0.001, + "loss": 3.4629, + "step": 21444 + }, + { + "epoch": 0.9072256536085963, + "grad_norm": 0.14967910945415497, + "learning_rate": 0.001, + "loss": 1.882, + "step": 21445 + }, + { + "epoch": 0.9072679583721127, + "grad_norm": 0.1916898787021637, + "learning_rate": 0.001, + "loss": 1.8736, + "step": 21446 + }, + { + "epoch": 0.9073102631356291, + "grad_norm": 3.939188003540039, + "learning_rate": 0.001, + "loss": 3.2875, + "step": 21447 + }, + { + "epoch": 0.9073525678991454, + "grad_norm": 0.16006742417812347, + "learning_rate": 0.001, + "loss": 1.9604, + "step": 21448 + }, + { + "epoch": 0.9073948726626618, + "grad_norm": 0.20729239284992218, + "learning_rate": 0.001, + "loss": 2.4355, + "step": 21449 + }, + { + "epoch": 0.9074371774261782, + "grad_norm": 0.15743091702461243, + "learning_rate": 0.001, + "loss": 2.3285, + "step": 21450 + }, + { + "epoch": 0.9074794821896945, + "grad_norm": 0.15352989733219147, + "learning_rate": 0.001, + "loss": 2.2617, + "step": 21451 + }, + { + "epoch": 0.907521786953211, + "grad_norm": 1.4606817960739136, + "learning_rate": 0.001, + "loss": 2.2048, + "step": 21452 + }, + { + "epoch": 0.9075640917167273, + "grad_norm": 0.14188838005065918, + "learning_rate": 0.001, + "loss": 1.9296, + "step": 21453 + }, + { + "epoch": 0.9076063964802437, + "grad_norm": 0.1336817890405655, + "learning_rate": 0.001, + "loss": 1.9424, + "step": 21454 + }, + { + "epoch": 0.9076487012437601, + "grad_norm": 1.9917988777160645, + "learning_rate": 0.001, + "loss": 1.8911, + "step": 21455 + }, + { + "epoch": 0.9076910060072764, + "grad_norm": 0.7746376395225525, + "learning_rate": 0.001, + "loss": 3.08, + "step": 21456 + }, + { + "epoch": 0.9077333107707928, + "grad_norm": 0.18907096982002258, + "learning_rate": 0.001, + "loss": 1.9718, + "step": 21457 + }, + { + "epoch": 0.9077756155343092, + "grad_norm": 0.1984204649925232, + "learning_rate": 0.001, + "loss": 1.9067, + "step": 21458 + }, + { + "epoch": 0.9078179202978255, + "grad_norm": 0.13996772468090057, + "learning_rate": 0.001, + "loss": 4.0638, + "step": 21459 + }, + { + "epoch": 0.9078602250613419, + "grad_norm": 0.16142001748085022, + "learning_rate": 0.001, + "loss": 2.3478, + "step": 21460 + }, + { + "epoch": 0.9079025298248583, + "grad_norm": 0.17676426470279694, + "learning_rate": 0.001, + "loss": 4.1074, + "step": 21461 + }, + { + "epoch": 0.9079448345883746, + "grad_norm": 0.1353548765182495, + "learning_rate": 0.001, + "loss": 2.4199, + "step": 21462 + }, + { + "epoch": 0.907987139351891, + "grad_norm": 0.18205760419368744, + "learning_rate": 0.001, + "loss": 2.2659, + "step": 21463 + }, + { + "epoch": 0.9080294441154074, + "grad_norm": 0.16253677010536194, + "learning_rate": 0.001, + "loss": 2.387, + "step": 21464 + }, + { + "epoch": 0.9080717488789237, + "grad_norm": 0.18944591283798218, + "learning_rate": 0.001, + "loss": 2.3769, + "step": 21465 + }, + { + "epoch": 0.9081140536424401, + "grad_norm": 0.1672883927822113, + "learning_rate": 0.001, + "loss": 2.4218, + "step": 21466 + }, + { + "epoch": 0.9081563584059565, + "grad_norm": 0.1499961018562317, + "learning_rate": 0.001, + "loss": 2.0867, + "step": 21467 + }, + { + "epoch": 0.9081986631694728, + "grad_norm": 0.2781917452812195, + "learning_rate": 0.001, + "loss": 2.1938, + "step": 21468 + }, + { + "epoch": 0.9082409679329893, + "grad_norm": 0.2037796825170517, + "learning_rate": 0.001, + "loss": 3.4381, + "step": 21469 + }, + { + "epoch": 0.9082832726965057, + "grad_norm": 0.2747834622859955, + "learning_rate": 0.001, + "loss": 2.2192, + "step": 21470 + }, + { + "epoch": 0.908325577460022, + "grad_norm": 0.128767728805542, + "learning_rate": 0.001, + "loss": 1.6764, + "step": 21471 + }, + { + "epoch": 0.9083678822235384, + "grad_norm": 0.13520826399326324, + "learning_rate": 0.001, + "loss": 1.7189, + "step": 21472 + }, + { + "epoch": 0.9084101869870548, + "grad_norm": 0.16434721648693085, + "learning_rate": 0.001, + "loss": 1.1854, + "step": 21473 + }, + { + "epoch": 0.9084524917505711, + "grad_norm": 0.7008240818977356, + "learning_rate": 0.001, + "loss": 1.8192, + "step": 21474 + }, + { + "epoch": 0.9084947965140875, + "grad_norm": 0.1205538809299469, + "learning_rate": 0.001, + "loss": 2.7022, + "step": 21475 + }, + { + "epoch": 0.9085371012776039, + "grad_norm": 0.17018336057662964, + "learning_rate": 0.001, + "loss": 2.3906, + "step": 21476 + }, + { + "epoch": 0.9085794060411202, + "grad_norm": 0.2808079421520233, + "learning_rate": 0.001, + "loss": 3.2353, + "step": 21477 + }, + { + "epoch": 0.9086217108046366, + "grad_norm": 0.18852543830871582, + "learning_rate": 0.001, + "loss": 2.035, + "step": 21478 + }, + { + "epoch": 0.908664015568153, + "grad_norm": 0.12925562262535095, + "learning_rate": 0.001, + "loss": 1.7236, + "step": 21479 + }, + { + "epoch": 0.9087063203316693, + "grad_norm": 4.139763832092285, + "learning_rate": 0.001, + "loss": 1.6331, + "step": 21480 + }, + { + "epoch": 0.9087486250951857, + "grad_norm": 0.15899895131587982, + "learning_rate": 0.001, + "loss": 1.7789, + "step": 21481 + }, + { + "epoch": 0.9087909298587021, + "grad_norm": 0.1730024665594101, + "learning_rate": 0.001, + "loss": 3.3566, + "step": 21482 + }, + { + "epoch": 0.9088332346222184, + "grad_norm": 0.1567884385585785, + "learning_rate": 0.001, + "loss": 1.9054, + "step": 21483 + }, + { + "epoch": 0.9088755393857348, + "grad_norm": 2.438006639480591, + "learning_rate": 0.001, + "loss": 2.6868, + "step": 21484 + }, + { + "epoch": 0.9089178441492513, + "grad_norm": 0.5900127291679382, + "learning_rate": 0.001, + "loss": 2.5526, + "step": 21485 + }, + { + "epoch": 0.9089601489127676, + "grad_norm": 0.7061132788658142, + "learning_rate": 0.001, + "loss": 2.5784, + "step": 21486 + }, + { + "epoch": 0.909002453676284, + "grad_norm": 0.7392309308052063, + "learning_rate": 0.001, + "loss": 2.1671, + "step": 21487 + }, + { + "epoch": 0.9090447584398004, + "grad_norm": 0.18610547482967377, + "learning_rate": 0.001, + "loss": 2.4229, + "step": 21488 + }, + { + "epoch": 0.9090870632033167, + "grad_norm": 0.15791387856006622, + "learning_rate": 0.001, + "loss": 2.6113, + "step": 21489 + }, + { + "epoch": 0.9091293679668331, + "grad_norm": 13.427140235900879, + "learning_rate": 0.001, + "loss": 1.7686, + "step": 21490 + }, + { + "epoch": 0.9091716727303495, + "grad_norm": 0.1820589154958725, + "learning_rate": 0.001, + "loss": 2.2658, + "step": 21491 + }, + { + "epoch": 0.9092139774938658, + "grad_norm": 0.5405210256576538, + "learning_rate": 0.001, + "loss": 2.6741, + "step": 21492 + }, + { + "epoch": 0.9092562822573822, + "grad_norm": 0.9230998754501343, + "learning_rate": 0.001, + "loss": 2.0778, + "step": 21493 + }, + { + "epoch": 0.9092985870208986, + "grad_norm": 0.1825629621744156, + "learning_rate": 0.001, + "loss": 2.0666, + "step": 21494 + }, + { + "epoch": 0.9093408917844149, + "grad_norm": 0.21903732419013977, + "learning_rate": 0.001, + "loss": 1.5503, + "step": 21495 + }, + { + "epoch": 0.9093831965479313, + "grad_norm": 0.30625858902931213, + "learning_rate": 0.001, + "loss": 1.7506, + "step": 21496 + }, + { + "epoch": 0.9094255013114476, + "grad_norm": 0.23304063081741333, + "learning_rate": 0.001, + "loss": 2.6598, + "step": 21497 + }, + { + "epoch": 0.909467806074964, + "grad_norm": 0.18999595940113068, + "learning_rate": 0.001, + "loss": 1.8506, + "step": 21498 + }, + { + "epoch": 0.9095101108384804, + "grad_norm": 3.7173240184783936, + "learning_rate": 0.001, + "loss": 2.9739, + "step": 21499 + }, + { + "epoch": 0.9095524156019967, + "grad_norm": 0.33216527104377747, + "learning_rate": 0.001, + "loss": 1.2953, + "step": 21500 + }, + { + "epoch": 0.9095947203655131, + "grad_norm": 0.1976582258939743, + "learning_rate": 0.001, + "loss": 2.2576, + "step": 21501 + }, + { + "epoch": 0.9096370251290296, + "grad_norm": 5.519730091094971, + "learning_rate": 0.001, + "loss": 1.8796, + "step": 21502 + }, + { + "epoch": 0.9096793298925459, + "grad_norm": 0.31326529383659363, + "learning_rate": 0.001, + "loss": 2.1128, + "step": 21503 + }, + { + "epoch": 0.9097216346560623, + "grad_norm": 0.15805281698703766, + "learning_rate": 0.001, + "loss": 1.3282, + "step": 21504 + }, + { + "epoch": 0.9097639394195787, + "grad_norm": 1.21946120262146, + "learning_rate": 0.001, + "loss": 2.203, + "step": 21505 + }, + { + "epoch": 0.909806244183095, + "grad_norm": 0.13598865270614624, + "learning_rate": 0.001, + "loss": 1.8011, + "step": 21506 + }, + { + "epoch": 0.9098485489466114, + "grad_norm": 0.30996546149253845, + "learning_rate": 0.001, + "loss": 1.5201, + "step": 21507 + }, + { + "epoch": 0.9098908537101278, + "grad_norm": 0.21078462898731232, + "learning_rate": 0.001, + "loss": 2.2036, + "step": 21508 + }, + { + "epoch": 0.9099331584736441, + "grad_norm": 0.1793532818555832, + "learning_rate": 0.001, + "loss": 2.754, + "step": 21509 + }, + { + "epoch": 0.9099754632371605, + "grad_norm": 0.16735830903053284, + "learning_rate": 0.001, + "loss": 3.22, + "step": 21510 + }, + { + "epoch": 0.9100177680006769, + "grad_norm": 0.15920020639896393, + "learning_rate": 0.001, + "loss": 2.8717, + "step": 21511 + }, + { + "epoch": 0.9100600727641932, + "grad_norm": 0.17090164124965668, + "learning_rate": 0.001, + "loss": 1.8264, + "step": 21512 + }, + { + "epoch": 0.9101023775277096, + "grad_norm": 0.1805136799812317, + "learning_rate": 0.001, + "loss": 1.7801, + "step": 21513 + }, + { + "epoch": 0.910144682291226, + "grad_norm": 0.20294389128684998, + "learning_rate": 0.001, + "loss": 2.2071, + "step": 21514 + }, + { + "epoch": 0.9101869870547423, + "grad_norm": 0.18520846962928772, + "learning_rate": 0.001, + "loss": 1.3472, + "step": 21515 + }, + { + "epoch": 0.9102292918182587, + "grad_norm": 0.8986330032348633, + "learning_rate": 0.001, + "loss": 2.8275, + "step": 21516 + }, + { + "epoch": 0.9102715965817751, + "grad_norm": 0.16281363368034363, + "learning_rate": 0.001, + "loss": 3.0088, + "step": 21517 + }, + { + "epoch": 0.9103139013452914, + "grad_norm": 0.13993607461452484, + "learning_rate": 0.001, + "loss": 2.1769, + "step": 21518 + }, + { + "epoch": 0.9103562061088079, + "grad_norm": 0.13132378458976746, + "learning_rate": 0.001, + "loss": 1.4656, + "step": 21519 + }, + { + "epoch": 0.9103985108723243, + "grad_norm": 0.1520097702741623, + "learning_rate": 0.001, + "loss": 1.8638, + "step": 21520 + }, + { + "epoch": 0.9104408156358406, + "grad_norm": 1.1580663919448853, + "learning_rate": 0.001, + "loss": 2.0435, + "step": 21521 + }, + { + "epoch": 0.910483120399357, + "grad_norm": 0.1821383833885193, + "learning_rate": 0.001, + "loss": 2.5283, + "step": 21522 + }, + { + "epoch": 0.9105254251628734, + "grad_norm": 0.1639668494462967, + "learning_rate": 0.001, + "loss": 2.0908, + "step": 21523 + }, + { + "epoch": 0.9105677299263897, + "grad_norm": 0.18007436394691467, + "learning_rate": 0.001, + "loss": 1.8416, + "step": 21524 + }, + { + "epoch": 0.9106100346899061, + "grad_norm": 0.13596655428409576, + "learning_rate": 0.001, + "loss": 1.352, + "step": 21525 + }, + { + "epoch": 0.9106523394534225, + "grad_norm": 0.19696587324142456, + "learning_rate": 0.001, + "loss": 1.9196, + "step": 21526 + }, + { + "epoch": 0.9106946442169388, + "grad_norm": 0.15187956392765045, + "learning_rate": 0.001, + "loss": 2.5806, + "step": 21527 + }, + { + "epoch": 0.9107369489804552, + "grad_norm": 0.15384723246097565, + "learning_rate": 0.001, + "loss": 3.9416, + "step": 21528 + }, + { + "epoch": 0.9107792537439716, + "grad_norm": 0.21854868531227112, + "learning_rate": 0.001, + "loss": 2.478, + "step": 21529 + }, + { + "epoch": 0.9108215585074879, + "grad_norm": 0.1801101565361023, + "learning_rate": 0.001, + "loss": 2.2489, + "step": 21530 + }, + { + "epoch": 0.9108638632710043, + "grad_norm": 0.19910669326782227, + "learning_rate": 0.001, + "loss": 1.7515, + "step": 21531 + }, + { + "epoch": 0.9109061680345207, + "grad_norm": 0.18891854584217072, + "learning_rate": 0.001, + "loss": 1.8261, + "step": 21532 + }, + { + "epoch": 0.910948472798037, + "grad_norm": 0.1557842195034027, + "learning_rate": 0.001, + "loss": 2.175, + "step": 21533 + }, + { + "epoch": 0.9109907775615534, + "grad_norm": 0.15806511044502258, + "learning_rate": 0.001, + "loss": 1.7377, + "step": 21534 + }, + { + "epoch": 0.9110330823250699, + "grad_norm": 5.196726322174072, + "learning_rate": 0.001, + "loss": 2.0587, + "step": 21535 + }, + { + "epoch": 0.9110753870885862, + "grad_norm": 0.15925204753875732, + "learning_rate": 0.001, + "loss": 3.1414, + "step": 21536 + }, + { + "epoch": 0.9111176918521026, + "grad_norm": 0.16708320379257202, + "learning_rate": 0.001, + "loss": 2.3862, + "step": 21537 + }, + { + "epoch": 0.911159996615619, + "grad_norm": 0.14083898067474365, + "learning_rate": 0.001, + "loss": 1.5379, + "step": 21538 + }, + { + "epoch": 0.9112023013791353, + "grad_norm": 2.581650733947754, + "learning_rate": 0.001, + "loss": 2.6506, + "step": 21539 + }, + { + "epoch": 0.9112446061426517, + "grad_norm": 0.15130378305912018, + "learning_rate": 0.001, + "loss": 2.7111, + "step": 21540 + }, + { + "epoch": 0.911286910906168, + "grad_norm": 0.172215536236763, + "learning_rate": 0.001, + "loss": 1.4651, + "step": 21541 + }, + { + "epoch": 0.9113292156696844, + "grad_norm": 0.2063896358013153, + "learning_rate": 0.001, + "loss": 1.5084, + "step": 21542 + }, + { + "epoch": 0.9113715204332008, + "grad_norm": 0.48414063453674316, + "learning_rate": 0.001, + "loss": 3.9561, + "step": 21543 + }, + { + "epoch": 0.9114138251967171, + "grad_norm": 0.31004536151885986, + "learning_rate": 0.001, + "loss": 2.4255, + "step": 21544 + }, + { + "epoch": 0.9114561299602335, + "grad_norm": 0.17649905383586884, + "learning_rate": 0.001, + "loss": 1.8264, + "step": 21545 + }, + { + "epoch": 0.9114984347237499, + "grad_norm": 0.4742259085178375, + "learning_rate": 0.001, + "loss": 3.0758, + "step": 21546 + }, + { + "epoch": 0.9115407394872662, + "grad_norm": 0.21102559566497803, + "learning_rate": 0.001, + "loss": 2.8004, + "step": 21547 + }, + { + "epoch": 0.9115830442507826, + "grad_norm": 0.47817152738571167, + "learning_rate": 0.001, + "loss": 2.0812, + "step": 21548 + }, + { + "epoch": 0.911625349014299, + "grad_norm": 0.156982883810997, + "learning_rate": 0.001, + "loss": 2.3945, + "step": 21549 + }, + { + "epoch": 0.9116676537778153, + "grad_norm": 0.14067421853542328, + "learning_rate": 0.001, + "loss": 2.0697, + "step": 21550 + }, + { + "epoch": 0.9117099585413317, + "grad_norm": 0.17669621109962463, + "learning_rate": 0.001, + "loss": 2.3237, + "step": 21551 + }, + { + "epoch": 0.9117522633048482, + "grad_norm": 0.16611798107624054, + "learning_rate": 0.001, + "loss": 1.5483, + "step": 21552 + }, + { + "epoch": 0.9117945680683645, + "grad_norm": 0.2230168581008911, + "learning_rate": 0.001, + "loss": 2.6282, + "step": 21553 + }, + { + "epoch": 0.9118368728318809, + "grad_norm": 4.087730884552002, + "learning_rate": 0.001, + "loss": 1.8573, + "step": 21554 + }, + { + "epoch": 0.9118791775953973, + "grad_norm": 0.1587943285703659, + "learning_rate": 0.001, + "loss": 3.0096, + "step": 21555 + }, + { + "epoch": 0.9119214823589136, + "grad_norm": 0.13796477019786835, + "learning_rate": 0.001, + "loss": 2.5867, + "step": 21556 + }, + { + "epoch": 0.91196378712243, + "grad_norm": 0.17402009665966034, + "learning_rate": 0.001, + "loss": 2.2494, + "step": 21557 + }, + { + "epoch": 0.9120060918859464, + "grad_norm": 0.20480291545391083, + "learning_rate": 0.001, + "loss": 2.7589, + "step": 21558 + }, + { + "epoch": 0.9120483966494627, + "grad_norm": 0.45675426721572876, + "learning_rate": 0.001, + "loss": 2.9796, + "step": 21559 + }, + { + "epoch": 0.9120907014129791, + "grad_norm": 0.26858261227607727, + "learning_rate": 0.001, + "loss": 2.7131, + "step": 21560 + }, + { + "epoch": 0.9121330061764955, + "grad_norm": 0.1663147658109665, + "learning_rate": 0.001, + "loss": 1.9666, + "step": 21561 + }, + { + "epoch": 0.9121753109400118, + "grad_norm": 0.16249950230121613, + "learning_rate": 0.001, + "loss": 2.6662, + "step": 21562 + }, + { + "epoch": 0.9122176157035282, + "grad_norm": 0.22289776802062988, + "learning_rate": 0.001, + "loss": 2.3533, + "step": 21563 + }, + { + "epoch": 0.9122599204670446, + "grad_norm": 0.20994803309440613, + "learning_rate": 0.001, + "loss": 1.978, + "step": 21564 + }, + { + "epoch": 0.9123022252305609, + "grad_norm": 0.4022495448589325, + "learning_rate": 0.001, + "loss": 2.1195, + "step": 21565 + }, + { + "epoch": 0.9123445299940773, + "grad_norm": 0.17992646992206573, + "learning_rate": 0.001, + "loss": 1.934, + "step": 21566 + }, + { + "epoch": 0.9123868347575937, + "grad_norm": 9.34231948852539, + "learning_rate": 0.001, + "loss": 2.3568, + "step": 21567 + }, + { + "epoch": 0.91242913952111, + "grad_norm": 0.19124965369701385, + "learning_rate": 0.001, + "loss": 1.8253, + "step": 21568 + }, + { + "epoch": 0.9124714442846265, + "grad_norm": 0.1695529967546463, + "learning_rate": 0.001, + "loss": 2.892, + "step": 21569 + }, + { + "epoch": 0.9125137490481429, + "grad_norm": 0.1486518681049347, + "learning_rate": 0.001, + "loss": 1.868, + "step": 21570 + }, + { + "epoch": 0.9125560538116592, + "grad_norm": 20.817100524902344, + "learning_rate": 0.001, + "loss": 1.8457, + "step": 21571 + }, + { + "epoch": 0.9125983585751756, + "grad_norm": 12.602542877197266, + "learning_rate": 0.001, + "loss": 2.1331, + "step": 21572 + }, + { + "epoch": 0.912640663338692, + "grad_norm": 0.16852043569087982, + "learning_rate": 0.001, + "loss": 2.6505, + "step": 21573 + }, + { + "epoch": 0.9126829681022083, + "grad_norm": 0.17694880068302155, + "learning_rate": 0.001, + "loss": 1.5316, + "step": 21574 + }, + { + "epoch": 0.9127252728657247, + "grad_norm": 0.19821469485759735, + "learning_rate": 0.001, + "loss": 2.0534, + "step": 21575 + }, + { + "epoch": 0.9127675776292411, + "grad_norm": 0.21617907285690308, + "learning_rate": 0.001, + "loss": 2.5649, + "step": 21576 + }, + { + "epoch": 0.9128098823927574, + "grad_norm": 0.17548352479934692, + "learning_rate": 0.001, + "loss": 1.9663, + "step": 21577 + }, + { + "epoch": 0.9128521871562738, + "grad_norm": 0.22286243736743927, + "learning_rate": 0.001, + "loss": 1.97, + "step": 21578 + }, + { + "epoch": 0.9128944919197902, + "grad_norm": 0.19249877333641052, + "learning_rate": 0.001, + "loss": 2.8943, + "step": 21579 + }, + { + "epoch": 0.9129367966833065, + "grad_norm": 0.14703737199306488, + "learning_rate": 0.001, + "loss": 2.2766, + "step": 21580 + }, + { + "epoch": 0.9129791014468229, + "grad_norm": 0.17037394642829895, + "learning_rate": 0.001, + "loss": 1.5034, + "step": 21581 + }, + { + "epoch": 0.9130214062103393, + "grad_norm": 0.18207064270973206, + "learning_rate": 0.001, + "loss": 2.1486, + "step": 21582 + }, + { + "epoch": 0.9130637109738556, + "grad_norm": 0.1920434832572937, + "learning_rate": 0.001, + "loss": 1.6474, + "step": 21583 + }, + { + "epoch": 0.913106015737372, + "grad_norm": 1.184415578842163, + "learning_rate": 0.001, + "loss": 3.1735, + "step": 21584 + }, + { + "epoch": 0.9131483205008885, + "grad_norm": 0.15448032319545746, + "learning_rate": 0.001, + "loss": 1.736, + "step": 21585 + }, + { + "epoch": 0.9131906252644048, + "grad_norm": 0.18459028005599976, + "learning_rate": 0.001, + "loss": 2.0065, + "step": 21586 + }, + { + "epoch": 0.9132329300279212, + "grad_norm": 0.14690928161144257, + "learning_rate": 0.001, + "loss": 2.3362, + "step": 21587 + }, + { + "epoch": 0.9132752347914375, + "grad_norm": 0.15525342524051666, + "learning_rate": 0.001, + "loss": 1.691, + "step": 21588 + }, + { + "epoch": 0.9133175395549539, + "grad_norm": 0.15081803500652313, + "learning_rate": 0.001, + "loss": 1.3852, + "step": 21589 + }, + { + "epoch": 0.9133598443184703, + "grad_norm": 0.14801929891109467, + "learning_rate": 0.001, + "loss": 2.3841, + "step": 21590 + }, + { + "epoch": 0.9134021490819866, + "grad_norm": 0.1725027710199356, + "learning_rate": 0.001, + "loss": 2.52, + "step": 21591 + }, + { + "epoch": 0.913444453845503, + "grad_norm": 0.11509247869253159, + "learning_rate": 0.001, + "loss": 1.6571, + "step": 21592 + }, + { + "epoch": 0.9134867586090194, + "grad_norm": 0.14290519058704376, + "learning_rate": 0.001, + "loss": 1.8487, + "step": 21593 + }, + { + "epoch": 0.9135290633725357, + "grad_norm": 0.21351279318332672, + "learning_rate": 0.001, + "loss": 1.9015, + "step": 21594 + }, + { + "epoch": 0.9135713681360521, + "grad_norm": 0.391020268201828, + "learning_rate": 0.001, + "loss": 2.0749, + "step": 21595 + }, + { + "epoch": 0.9136136728995685, + "grad_norm": 0.14413462579250336, + "learning_rate": 0.001, + "loss": 1.8242, + "step": 21596 + }, + { + "epoch": 0.9136559776630848, + "grad_norm": 0.18715713918209076, + "learning_rate": 0.001, + "loss": 3.9564, + "step": 21597 + }, + { + "epoch": 0.9136982824266012, + "grad_norm": 0.17989511787891388, + "learning_rate": 0.001, + "loss": 2.6794, + "step": 21598 + }, + { + "epoch": 0.9137405871901176, + "grad_norm": 0.1721455156803131, + "learning_rate": 0.001, + "loss": 1.6095, + "step": 21599 + }, + { + "epoch": 0.9137828919536339, + "grad_norm": 1.7878568172454834, + "learning_rate": 0.001, + "loss": 2.6891, + "step": 21600 + }, + { + "epoch": 0.9138251967171503, + "grad_norm": 0.14471499621868134, + "learning_rate": 0.001, + "loss": 1.3701, + "step": 21601 + }, + { + "epoch": 0.9138675014806668, + "grad_norm": 2.2740018367767334, + "learning_rate": 0.001, + "loss": 2.1687, + "step": 21602 + }, + { + "epoch": 0.913909806244183, + "grad_norm": 0.13715597987174988, + "learning_rate": 0.001, + "loss": 1.995, + "step": 21603 + }, + { + "epoch": 0.9139521110076995, + "grad_norm": 0.13513702154159546, + "learning_rate": 0.001, + "loss": 1.735, + "step": 21604 + }, + { + "epoch": 0.9139944157712159, + "grad_norm": 0.1583508551120758, + "learning_rate": 0.001, + "loss": 1.746, + "step": 21605 + }, + { + "epoch": 0.9140367205347322, + "grad_norm": 0.1774708777666092, + "learning_rate": 0.001, + "loss": 2.3999, + "step": 21606 + }, + { + "epoch": 0.9140790252982486, + "grad_norm": 0.15882831811904907, + "learning_rate": 0.001, + "loss": 1.6509, + "step": 21607 + }, + { + "epoch": 0.914121330061765, + "grad_norm": 0.1779770851135254, + "learning_rate": 0.001, + "loss": 2.1135, + "step": 21608 + }, + { + "epoch": 0.9141636348252813, + "grad_norm": 0.33798548579216003, + "learning_rate": 0.001, + "loss": 2.7735, + "step": 21609 + }, + { + "epoch": 0.9142059395887977, + "grad_norm": 0.31764113903045654, + "learning_rate": 0.001, + "loss": 2.0155, + "step": 21610 + }, + { + "epoch": 0.9142482443523141, + "grad_norm": 0.17167538404464722, + "learning_rate": 0.001, + "loss": 1.7689, + "step": 21611 + }, + { + "epoch": 0.9142905491158304, + "grad_norm": 0.16581086814403534, + "learning_rate": 0.001, + "loss": 1.7205, + "step": 21612 + }, + { + "epoch": 0.9143328538793468, + "grad_norm": 0.16308166086673737, + "learning_rate": 0.001, + "loss": 2.3171, + "step": 21613 + }, + { + "epoch": 0.9143751586428632, + "grad_norm": 0.1509043574333191, + "learning_rate": 0.001, + "loss": 1.4007, + "step": 21614 + }, + { + "epoch": 0.9144174634063795, + "grad_norm": 0.18043819069862366, + "learning_rate": 0.001, + "loss": 3.3869, + "step": 21615 + }, + { + "epoch": 0.9144597681698959, + "grad_norm": 0.19052763283252716, + "learning_rate": 0.001, + "loss": 2.2411, + "step": 21616 + }, + { + "epoch": 0.9145020729334123, + "grad_norm": 0.13170795142650604, + "learning_rate": 0.001, + "loss": 2.1545, + "step": 21617 + }, + { + "epoch": 0.9145443776969286, + "grad_norm": 0.16418077051639557, + "learning_rate": 0.001, + "loss": 2.2063, + "step": 21618 + }, + { + "epoch": 0.914586682460445, + "grad_norm": 0.5688857436180115, + "learning_rate": 0.001, + "loss": 2.1972, + "step": 21619 + }, + { + "epoch": 0.9146289872239615, + "grad_norm": 0.1831405758857727, + "learning_rate": 0.001, + "loss": 1.6587, + "step": 21620 + }, + { + "epoch": 0.9146712919874778, + "grad_norm": 0.4554640054702759, + "learning_rate": 0.001, + "loss": 1.7407, + "step": 21621 + }, + { + "epoch": 0.9147135967509942, + "grad_norm": 0.17047862708568573, + "learning_rate": 0.001, + "loss": 1.6915, + "step": 21622 + }, + { + "epoch": 0.9147559015145106, + "grad_norm": 0.7939593195915222, + "learning_rate": 0.001, + "loss": 1.8853, + "step": 21623 + }, + { + "epoch": 0.9147982062780269, + "grad_norm": 0.17119646072387695, + "learning_rate": 0.001, + "loss": 3.3695, + "step": 21624 + }, + { + "epoch": 0.9148405110415433, + "grad_norm": 0.17291773855686188, + "learning_rate": 0.001, + "loss": 1.8255, + "step": 21625 + }, + { + "epoch": 0.9148828158050597, + "grad_norm": 0.49092814326286316, + "learning_rate": 0.001, + "loss": 2.3737, + "step": 21626 + }, + { + "epoch": 0.914925120568576, + "grad_norm": 0.13538378477096558, + "learning_rate": 0.001, + "loss": 1.5565, + "step": 21627 + }, + { + "epoch": 0.9149674253320924, + "grad_norm": 0.14585568010807037, + "learning_rate": 0.001, + "loss": 2.0289, + "step": 21628 + }, + { + "epoch": 0.9150097300956088, + "grad_norm": 1.05545175075531, + "learning_rate": 0.001, + "loss": 2.3307, + "step": 21629 + }, + { + "epoch": 0.9150520348591251, + "grad_norm": 0.13337115943431854, + "learning_rate": 0.001, + "loss": 2.3943, + "step": 21630 + }, + { + "epoch": 0.9150943396226415, + "grad_norm": 0.13960030674934387, + "learning_rate": 0.001, + "loss": 2.0699, + "step": 21631 + }, + { + "epoch": 0.9151366443861578, + "grad_norm": 0.1596209853887558, + "learning_rate": 0.001, + "loss": 2.9616, + "step": 21632 + }, + { + "epoch": 0.9151789491496742, + "grad_norm": 1.2385811805725098, + "learning_rate": 0.001, + "loss": 2.1809, + "step": 21633 + }, + { + "epoch": 0.9152212539131906, + "grad_norm": 0.15184301137924194, + "learning_rate": 0.001, + "loss": 1.4528, + "step": 21634 + }, + { + "epoch": 0.915263558676707, + "grad_norm": 3.3898375034332275, + "learning_rate": 0.001, + "loss": 1.8417, + "step": 21635 + }, + { + "epoch": 0.9153058634402234, + "grad_norm": 0.1527806520462036, + "learning_rate": 0.001, + "loss": 2.1561, + "step": 21636 + }, + { + "epoch": 0.9153481682037398, + "grad_norm": 0.22297519445419312, + "learning_rate": 0.001, + "loss": 2.2794, + "step": 21637 + }, + { + "epoch": 0.9153904729672561, + "grad_norm": 0.21347364783287048, + "learning_rate": 0.001, + "loss": 3.186, + "step": 21638 + }, + { + "epoch": 0.9154327777307725, + "grad_norm": 0.15860065817832947, + "learning_rate": 0.001, + "loss": 1.7825, + "step": 21639 + }, + { + "epoch": 0.9154750824942889, + "grad_norm": 0.4748792052268982, + "learning_rate": 0.001, + "loss": 2.4484, + "step": 21640 + }, + { + "epoch": 0.9155173872578052, + "grad_norm": 0.15251575410366058, + "learning_rate": 0.001, + "loss": 1.5772, + "step": 21641 + }, + { + "epoch": 0.9155596920213216, + "grad_norm": 0.21372830867767334, + "learning_rate": 0.001, + "loss": 2.8094, + "step": 21642 + }, + { + "epoch": 0.915601996784838, + "grad_norm": 0.19164182245731354, + "learning_rate": 0.001, + "loss": 3.3052, + "step": 21643 + }, + { + "epoch": 0.9156443015483543, + "grad_norm": 0.15785560011863708, + "learning_rate": 0.001, + "loss": 2.3681, + "step": 21644 + }, + { + "epoch": 0.9156866063118707, + "grad_norm": 1.266538143157959, + "learning_rate": 0.001, + "loss": 2.0366, + "step": 21645 + }, + { + "epoch": 0.9157289110753871, + "grad_norm": 0.16436876356601715, + "learning_rate": 0.001, + "loss": 2.3029, + "step": 21646 + }, + { + "epoch": 0.9157712158389034, + "grad_norm": 0.1448490023612976, + "learning_rate": 0.001, + "loss": 2.3137, + "step": 21647 + }, + { + "epoch": 0.9158135206024198, + "grad_norm": 0.1606566458940506, + "learning_rate": 0.001, + "loss": 1.5938, + "step": 21648 + }, + { + "epoch": 0.9158558253659362, + "grad_norm": 0.6070122122764587, + "learning_rate": 0.001, + "loss": 3.4232, + "step": 21649 + }, + { + "epoch": 0.9158981301294525, + "grad_norm": 0.15284164249897003, + "learning_rate": 0.001, + "loss": 2.2872, + "step": 21650 + }, + { + "epoch": 0.915940434892969, + "grad_norm": 0.18675555288791656, + "learning_rate": 0.001, + "loss": 3.7336, + "step": 21651 + }, + { + "epoch": 0.9159827396564854, + "grad_norm": 0.20005974173545837, + "learning_rate": 0.001, + "loss": 2.4288, + "step": 21652 + }, + { + "epoch": 0.9160250444200017, + "grad_norm": 0.17080628871917725, + "learning_rate": 0.001, + "loss": 2.0765, + "step": 21653 + }, + { + "epoch": 0.9160673491835181, + "grad_norm": 0.17919304966926575, + "learning_rate": 0.001, + "loss": 1.6258, + "step": 21654 + }, + { + "epoch": 0.9161096539470345, + "grad_norm": 4.160139560699463, + "learning_rate": 0.001, + "loss": 3.0095, + "step": 21655 + }, + { + "epoch": 0.9161519587105508, + "grad_norm": 0.12385336309671402, + "learning_rate": 0.001, + "loss": 1.9264, + "step": 21656 + }, + { + "epoch": 0.9161942634740672, + "grad_norm": 0.19847314059734344, + "learning_rate": 0.001, + "loss": 2.0386, + "step": 21657 + }, + { + "epoch": 0.9162365682375836, + "grad_norm": 0.14693698287010193, + "learning_rate": 0.001, + "loss": 1.6431, + "step": 21658 + }, + { + "epoch": 0.9162788730010999, + "grad_norm": 0.1676744669675827, + "learning_rate": 0.001, + "loss": 1.7444, + "step": 21659 + }, + { + "epoch": 0.9163211777646163, + "grad_norm": 0.17183883488178253, + "learning_rate": 0.001, + "loss": 2.122, + "step": 21660 + }, + { + "epoch": 0.9163634825281327, + "grad_norm": 0.16480883955955505, + "learning_rate": 0.001, + "loss": 2.9668, + "step": 21661 + }, + { + "epoch": 0.916405787291649, + "grad_norm": 0.16469353437423706, + "learning_rate": 0.001, + "loss": 2.4785, + "step": 21662 + }, + { + "epoch": 0.9164480920551654, + "grad_norm": 0.1656375229358673, + "learning_rate": 0.001, + "loss": 1.7279, + "step": 21663 + }, + { + "epoch": 0.9164903968186818, + "grad_norm": 0.15687058866024017, + "learning_rate": 0.001, + "loss": 2.5898, + "step": 21664 + }, + { + "epoch": 0.9165327015821981, + "grad_norm": 0.585669994354248, + "learning_rate": 0.001, + "loss": 2.0657, + "step": 21665 + }, + { + "epoch": 0.9165750063457145, + "grad_norm": 0.5769844651222229, + "learning_rate": 0.001, + "loss": 2.0928, + "step": 21666 + }, + { + "epoch": 0.916617311109231, + "grad_norm": 0.18087390065193176, + "learning_rate": 0.001, + "loss": 1.8316, + "step": 21667 + }, + { + "epoch": 0.9166596158727472, + "grad_norm": 0.15963704884052277, + "learning_rate": 0.001, + "loss": 1.6256, + "step": 21668 + }, + { + "epoch": 0.9167019206362637, + "grad_norm": 0.15890270471572876, + "learning_rate": 0.001, + "loss": 1.5167, + "step": 21669 + }, + { + "epoch": 0.9167442253997801, + "grad_norm": 0.15813934803009033, + "learning_rate": 0.001, + "loss": 2.0896, + "step": 21670 + }, + { + "epoch": 0.9167865301632964, + "grad_norm": 0.17641639709472656, + "learning_rate": 0.001, + "loss": 1.7598, + "step": 21671 + }, + { + "epoch": 0.9168288349268128, + "grad_norm": 0.14289072155952454, + "learning_rate": 0.001, + "loss": 2.1995, + "step": 21672 + }, + { + "epoch": 0.9168711396903292, + "grad_norm": 0.14103195071220398, + "learning_rate": 0.001, + "loss": 1.6827, + "step": 21673 + }, + { + "epoch": 0.9169134444538455, + "grad_norm": 1.1286795139312744, + "learning_rate": 0.001, + "loss": 1.7365, + "step": 21674 + }, + { + "epoch": 0.9169557492173619, + "grad_norm": 0.1436692327260971, + "learning_rate": 0.001, + "loss": 1.5473, + "step": 21675 + }, + { + "epoch": 0.9169980539808782, + "grad_norm": 0.17740869522094727, + "learning_rate": 0.001, + "loss": 2.0802, + "step": 21676 + }, + { + "epoch": 0.9170403587443946, + "grad_norm": 0.21841758489608765, + "learning_rate": 0.001, + "loss": 2.4257, + "step": 21677 + }, + { + "epoch": 0.917082663507911, + "grad_norm": 0.1688189059495926, + "learning_rate": 0.001, + "loss": 1.6848, + "step": 21678 + }, + { + "epoch": 0.9171249682714273, + "grad_norm": 7.481250762939453, + "learning_rate": 0.001, + "loss": 1.8009, + "step": 21679 + }, + { + "epoch": 0.9171672730349437, + "grad_norm": 0.16650600731372833, + "learning_rate": 0.001, + "loss": 1.9332, + "step": 21680 + }, + { + "epoch": 0.9172095777984601, + "grad_norm": 0.16894063353538513, + "learning_rate": 0.001, + "loss": 1.8266, + "step": 21681 + }, + { + "epoch": 0.9172518825619764, + "grad_norm": 1.4320636987686157, + "learning_rate": 0.001, + "loss": 1.7303, + "step": 21682 + }, + { + "epoch": 0.9172941873254928, + "grad_norm": 0.15776890516281128, + "learning_rate": 0.001, + "loss": 1.483, + "step": 21683 + }, + { + "epoch": 0.9173364920890092, + "grad_norm": 0.5583562254905701, + "learning_rate": 0.001, + "loss": 2.6289, + "step": 21684 + }, + { + "epoch": 0.9173787968525255, + "grad_norm": 2.3220925331115723, + "learning_rate": 0.001, + "loss": 2.3595, + "step": 21685 + }, + { + "epoch": 0.917421101616042, + "grad_norm": 0.19249920547008514, + "learning_rate": 0.001, + "loss": 2.1336, + "step": 21686 + }, + { + "epoch": 0.9174634063795584, + "grad_norm": 0.24140247702598572, + "learning_rate": 0.001, + "loss": 2.4654, + "step": 21687 + }, + { + "epoch": 0.9175057111430747, + "grad_norm": 0.17343805730342865, + "learning_rate": 0.001, + "loss": 2.3035, + "step": 21688 + }, + { + "epoch": 0.9175480159065911, + "grad_norm": 0.19953952729701996, + "learning_rate": 0.001, + "loss": 1.9234, + "step": 21689 + }, + { + "epoch": 0.9175903206701075, + "grad_norm": 1.9272069931030273, + "learning_rate": 0.001, + "loss": 3.4397, + "step": 21690 + }, + { + "epoch": 0.9176326254336238, + "grad_norm": 0.1924961805343628, + "learning_rate": 0.001, + "loss": 2.4553, + "step": 21691 + }, + { + "epoch": 0.9176749301971402, + "grad_norm": 0.20997212827205658, + "learning_rate": 0.001, + "loss": 1.7757, + "step": 21692 + }, + { + "epoch": 0.9177172349606566, + "grad_norm": 0.18580669164657593, + "learning_rate": 0.001, + "loss": 2.1987, + "step": 21693 + }, + { + "epoch": 0.9177595397241729, + "grad_norm": 0.2456483542919159, + "learning_rate": 0.001, + "loss": 2.1412, + "step": 21694 + }, + { + "epoch": 0.9178018444876893, + "grad_norm": 0.2759229242801666, + "learning_rate": 0.001, + "loss": 2.3739, + "step": 21695 + }, + { + "epoch": 0.9178441492512057, + "grad_norm": 0.19626349210739136, + "learning_rate": 0.001, + "loss": 2.4, + "step": 21696 + }, + { + "epoch": 0.917886454014722, + "grad_norm": 0.19402183592319489, + "learning_rate": 0.001, + "loss": 3.3583, + "step": 21697 + }, + { + "epoch": 0.9179287587782384, + "grad_norm": 0.21589398384094238, + "learning_rate": 0.001, + "loss": 1.6545, + "step": 21698 + }, + { + "epoch": 0.9179710635417548, + "grad_norm": 0.19767926633358002, + "learning_rate": 0.001, + "loss": 2.963, + "step": 21699 + }, + { + "epoch": 0.9180133683052711, + "grad_norm": 0.6116325855255127, + "learning_rate": 0.001, + "loss": 2.4778, + "step": 21700 + }, + { + "epoch": 0.9180556730687875, + "grad_norm": 0.18952438235282898, + "learning_rate": 0.001, + "loss": 2.9357, + "step": 21701 + }, + { + "epoch": 0.918097977832304, + "grad_norm": 0.20334750413894653, + "learning_rate": 0.001, + "loss": 1.9754, + "step": 21702 + }, + { + "epoch": 0.9181402825958203, + "grad_norm": 0.17652149498462677, + "learning_rate": 0.001, + "loss": 3.0083, + "step": 21703 + }, + { + "epoch": 0.9181825873593367, + "grad_norm": 0.1732001155614853, + "learning_rate": 0.001, + "loss": 2.601, + "step": 21704 + }, + { + "epoch": 0.9182248921228531, + "grad_norm": 0.16499556601047516, + "learning_rate": 0.001, + "loss": 2.3997, + "step": 21705 + }, + { + "epoch": 0.9182671968863694, + "grad_norm": 0.1443309634923935, + "learning_rate": 0.001, + "loss": 3.3834, + "step": 21706 + }, + { + "epoch": 0.9183095016498858, + "grad_norm": 0.14484232664108276, + "learning_rate": 0.001, + "loss": 1.6655, + "step": 21707 + }, + { + "epoch": 0.9183518064134022, + "grad_norm": 0.13555298745632172, + "learning_rate": 0.001, + "loss": 1.6735, + "step": 21708 + }, + { + "epoch": 0.9183941111769185, + "grad_norm": 0.2232842892408371, + "learning_rate": 0.001, + "loss": 2.4252, + "step": 21709 + }, + { + "epoch": 0.9184364159404349, + "grad_norm": 0.175230473279953, + "learning_rate": 0.001, + "loss": 2.0342, + "step": 21710 + }, + { + "epoch": 0.9184787207039513, + "grad_norm": 0.1450617015361786, + "learning_rate": 0.001, + "loss": 1.7858, + "step": 21711 + }, + { + "epoch": 0.9185210254674676, + "grad_norm": 0.37246161699295044, + "learning_rate": 0.001, + "loss": 3.1194, + "step": 21712 + }, + { + "epoch": 0.918563330230984, + "grad_norm": 0.1520744115114212, + "learning_rate": 0.001, + "loss": 2.9826, + "step": 21713 + }, + { + "epoch": 0.9186056349945004, + "grad_norm": 0.17187443375587463, + "learning_rate": 0.001, + "loss": 2.6739, + "step": 21714 + }, + { + "epoch": 0.9186479397580167, + "grad_norm": 0.17397736012935638, + "learning_rate": 0.001, + "loss": 2.6349, + "step": 21715 + }, + { + "epoch": 0.9186902445215331, + "grad_norm": 2.4483251571655273, + "learning_rate": 0.001, + "loss": 2.8474, + "step": 21716 + }, + { + "epoch": 0.9187325492850495, + "grad_norm": 0.9830498695373535, + "learning_rate": 0.001, + "loss": 1.3181, + "step": 21717 + }, + { + "epoch": 0.9187748540485658, + "grad_norm": 0.1355532854795456, + "learning_rate": 0.001, + "loss": 2.4041, + "step": 21718 + }, + { + "epoch": 0.9188171588120823, + "grad_norm": 0.161952406167984, + "learning_rate": 0.001, + "loss": 2.9089, + "step": 21719 + }, + { + "epoch": 0.9188594635755987, + "grad_norm": 0.1725846230983734, + "learning_rate": 0.001, + "loss": 2.2824, + "step": 21720 + }, + { + "epoch": 0.918901768339115, + "grad_norm": 0.21761982142925262, + "learning_rate": 0.001, + "loss": 1.7065, + "step": 21721 + }, + { + "epoch": 0.9189440731026314, + "grad_norm": 0.16416600346565247, + "learning_rate": 0.001, + "loss": 1.8811, + "step": 21722 + }, + { + "epoch": 0.9189863778661477, + "grad_norm": 0.1734263002872467, + "learning_rate": 0.001, + "loss": 1.956, + "step": 21723 + }, + { + "epoch": 0.9190286826296641, + "grad_norm": 0.3407423198223114, + "learning_rate": 0.001, + "loss": 1.9373, + "step": 21724 + }, + { + "epoch": 0.9190709873931805, + "grad_norm": 0.1345268189907074, + "learning_rate": 0.001, + "loss": 1.7216, + "step": 21725 + }, + { + "epoch": 0.9191132921566968, + "grad_norm": 0.1486416757106781, + "learning_rate": 0.001, + "loss": 1.7895, + "step": 21726 + }, + { + "epoch": 0.9191555969202132, + "grad_norm": 0.14210502803325653, + "learning_rate": 0.001, + "loss": 2.0866, + "step": 21727 + }, + { + "epoch": 0.9191979016837296, + "grad_norm": 0.13867826759815216, + "learning_rate": 0.001, + "loss": 2.6647, + "step": 21728 + }, + { + "epoch": 0.9192402064472459, + "grad_norm": 0.4167260527610779, + "learning_rate": 0.001, + "loss": 2.3402, + "step": 21729 + }, + { + "epoch": 0.9192825112107623, + "grad_norm": 0.16781596839427948, + "learning_rate": 0.001, + "loss": 1.7369, + "step": 21730 + }, + { + "epoch": 0.9193248159742787, + "grad_norm": 0.16833274066448212, + "learning_rate": 0.001, + "loss": 2.136, + "step": 21731 + }, + { + "epoch": 0.919367120737795, + "grad_norm": 15.674321174621582, + "learning_rate": 0.001, + "loss": 2.771, + "step": 21732 + }, + { + "epoch": 0.9194094255013114, + "grad_norm": 1.287104606628418, + "learning_rate": 0.001, + "loss": 1.7155, + "step": 21733 + }, + { + "epoch": 0.9194517302648278, + "grad_norm": 0.14855779707431793, + "learning_rate": 0.001, + "loss": 2.1788, + "step": 21734 + }, + { + "epoch": 0.9194940350283441, + "grad_norm": 0.18779130280017853, + "learning_rate": 0.001, + "loss": 1.9621, + "step": 21735 + }, + { + "epoch": 0.9195363397918606, + "grad_norm": 0.18050996959209442, + "learning_rate": 0.001, + "loss": 1.7367, + "step": 21736 + }, + { + "epoch": 0.919578644555377, + "grad_norm": 0.2634378671646118, + "learning_rate": 0.001, + "loss": 2.6046, + "step": 21737 + }, + { + "epoch": 0.9196209493188933, + "grad_norm": 0.2787913978099823, + "learning_rate": 0.001, + "loss": 3.1114, + "step": 21738 + }, + { + "epoch": 0.9196632540824097, + "grad_norm": 0.15546195209026337, + "learning_rate": 0.001, + "loss": 2.7819, + "step": 21739 + }, + { + "epoch": 0.9197055588459261, + "grad_norm": 0.21307052671909332, + "learning_rate": 0.001, + "loss": 2.5438, + "step": 21740 + }, + { + "epoch": 0.9197478636094424, + "grad_norm": 0.18763384222984314, + "learning_rate": 0.001, + "loss": 1.7654, + "step": 21741 + }, + { + "epoch": 0.9197901683729588, + "grad_norm": 0.12560519576072693, + "learning_rate": 0.001, + "loss": 2.6024, + "step": 21742 + }, + { + "epoch": 0.9198324731364752, + "grad_norm": 0.18310007452964783, + "learning_rate": 0.001, + "loss": 2.0988, + "step": 21743 + }, + { + "epoch": 0.9198747778999915, + "grad_norm": 0.2050357460975647, + "learning_rate": 0.001, + "loss": 2.5167, + "step": 21744 + }, + { + "epoch": 0.9199170826635079, + "grad_norm": 0.21944154798984528, + "learning_rate": 0.001, + "loss": 2.9099, + "step": 21745 + }, + { + "epoch": 0.9199593874270243, + "grad_norm": 3.412935495376587, + "learning_rate": 0.001, + "loss": 2.1277, + "step": 21746 + }, + { + "epoch": 0.9200016921905406, + "grad_norm": 0.21068432927131653, + "learning_rate": 0.001, + "loss": 1.6681, + "step": 21747 + }, + { + "epoch": 0.920043996954057, + "grad_norm": 0.18886512517929077, + "learning_rate": 0.001, + "loss": 2.294, + "step": 21748 + }, + { + "epoch": 0.9200863017175734, + "grad_norm": 1.677681803703308, + "learning_rate": 0.001, + "loss": 2.0359, + "step": 21749 + }, + { + "epoch": 0.9201286064810897, + "grad_norm": 0.194906085729599, + "learning_rate": 0.001, + "loss": 2.1362, + "step": 21750 + }, + { + "epoch": 0.9201709112446061, + "grad_norm": 0.15261149406433105, + "learning_rate": 0.001, + "loss": 2.8701, + "step": 21751 + }, + { + "epoch": 0.9202132160081226, + "grad_norm": 0.16249825060367584, + "learning_rate": 0.001, + "loss": 2.4157, + "step": 21752 + }, + { + "epoch": 0.9202555207716389, + "grad_norm": 0.20041526854038239, + "learning_rate": 0.001, + "loss": 2.5838, + "step": 21753 + }, + { + "epoch": 0.9202978255351553, + "grad_norm": 0.2039480060338974, + "learning_rate": 0.001, + "loss": 1.1561, + "step": 21754 + }, + { + "epoch": 0.9203401302986717, + "grad_norm": 0.1664251685142517, + "learning_rate": 0.001, + "loss": 2.5506, + "step": 21755 + }, + { + "epoch": 0.920382435062188, + "grad_norm": 0.19452784955501556, + "learning_rate": 0.001, + "loss": 2.1064, + "step": 21756 + }, + { + "epoch": 0.9204247398257044, + "grad_norm": 0.19989818334579468, + "learning_rate": 0.001, + "loss": 1.9846, + "step": 21757 + }, + { + "epoch": 0.9204670445892208, + "grad_norm": 0.18792906403541565, + "learning_rate": 0.001, + "loss": 1.8072, + "step": 21758 + }, + { + "epoch": 0.9205093493527371, + "grad_norm": 0.4300324618816376, + "learning_rate": 0.001, + "loss": 3.106, + "step": 21759 + }, + { + "epoch": 0.9205516541162535, + "grad_norm": 0.20472699403762817, + "learning_rate": 0.001, + "loss": 2.8519, + "step": 21760 + }, + { + "epoch": 0.9205939588797699, + "grad_norm": 0.16424056887626648, + "learning_rate": 0.001, + "loss": 2.8775, + "step": 21761 + }, + { + "epoch": 0.9206362636432862, + "grad_norm": 0.2106097787618637, + "learning_rate": 0.001, + "loss": 2.7263, + "step": 21762 + }, + { + "epoch": 0.9206785684068026, + "grad_norm": 0.18007276952266693, + "learning_rate": 0.001, + "loss": 2.1168, + "step": 21763 + }, + { + "epoch": 0.920720873170319, + "grad_norm": 0.8814851641654968, + "learning_rate": 0.001, + "loss": 2.6329, + "step": 21764 + }, + { + "epoch": 0.9207631779338353, + "grad_norm": 0.16528195142745972, + "learning_rate": 0.001, + "loss": 2.3614, + "step": 21765 + }, + { + "epoch": 0.9208054826973517, + "grad_norm": 0.14733877778053284, + "learning_rate": 0.001, + "loss": 1.9466, + "step": 21766 + }, + { + "epoch": 0.920847787460868, + "grad_norm": 0.15155373513698578, + "learning_rate": 0.001, + "loss": 2.0437, + "step": 21767 + }, + { + "epoch": 0.9208900922243844, + "grad_norm": 0.7689833045005798, + "learning_rate": 0.001, + "loss": 1.5882, + "step": 21768 + }, + { + "epoch": 0.9209323969879009, + "grad_norm": 0.35222962498664856, + "learning_rate": 0.001, + "loss": 2.6627, + "step": 21769 + }, + { + "epoch": 0.9209747017514172, + "grad_norm": 0.22019173204898834, + "learning_rate": 0.001, + "loss": 2.2336, + "step": 21770 + }, + { + "epoch": 0.9210170065149336, + "grad_norm": 1.362524151802063, + "learning_rate": 0.001, + "loss": 1.5802, + "step": 21771 + }, + { + "epoch": 0.92105931127845, + "grad_norm": 0.18947310745716095, + "learning_rate": 0.001, + "loss": 2.1529, + "step": 21772 + }, + { + "epoch": 0.9211016160419663, + "grad_norm": 0.5228827595710754, + "learning_rate": 0.001, + "loss": 2.2288, + "step": 21773 + }, + { + "epoch": 0.9211439208054827, + "grad_norm": 0.19477491080760956, + "learning_rate": 0.001, + "loss": 1.7683, + "step": 21774 + }, + { + "epoch": 0.9211862255689991, + "grad_norm": 0.1812824010848999, + "learning_rate": 0.001, + "loss": 1.9203, + "step": 21775 + }, + { + "epoch": 0.9212285303325154, + "grad_norm": 0.5380221009254456, + "learning_rate": 0.001, + "loss": 1.5879, + "step": 21776 + }, + { + "epoch": 0.9212708350960318, + "grad_norm": 2.430056095123291, + "learning_rate": 0.001, + "loss": 1.8626, + "step": 21777 + }, + { + "epoch": 0.9213131398595482, + "grad_norm": 0.14518165588378906, + "learning_rate": 0.001, + "loss": 1.956, + "step": 21778 + }, + { + "epoch": 0.9213554446230645, + "grad_norm": 0.16930201649665833, + "learning_rate": 0.001, + "loss": 2.0088, + "step": 21779 + }, + { + "epoch": 0.9213977493865809, + "grad_norm": 0.16156643629074097, + "learning_rate": 0.001, + "loss": 2.6447, + "step": 21780 + }, + { + "epoch": 0.9214400541500973, + "grad_norm": 0.2412625253200531, + "learning_rate": 0.001, + "loss": 1.5386, + "step": 21781 + }, + { + "epoch": 0.9214823589136136, + "grad_norm": 0.16392214596271515, + "learning_rate": 0.001, + "loss": 1.8491, + "step": 21782 + }, + { + "epoch": 0.92152466367713, + "grad_norm": 0.3166408836841583, + "learning_rate": 0.001, + "loss": 1.8965, + "step": 21783 + }, + { + "epoch": 0.9215669684406465, + "grad_norm": 0.6187849044799805, + "learning_rate": 0.001, + "loss": 3.2798, + "step": 21784 + }, + { + "epoch": 0.9216092732041627, + "grad_norm": 0.16876527667045593, + "learning_rate": 0.001, + "loss": 3.2222, + "step": 21785 + }, + { + "epoch": 0.9216515779676792, + "grad_norm": 0.21848821640014648, + "learning_rate": 0.001, + "loss": 2.4569, + "step": 21786 + }, + { + "epoch": 0.9216938827311956, + "grad_norm": 0.2898956537246704, + "learning_rate": 0.001, + "loss": 1.9074, + "step": 21787 + }, + { + "epoch": 0.9217361874947119, + "grad_norm": 0.9921806454658508, + "learning_rate": 0.001, + "loss": 2.6345, + "step": 21788 + }, + { + "epoch": 0.9217784922582283, + "grad_norm": 0.2962776720523834, + "learning_rate": 0.001, + "loss": 1.9957, + "step": 21789 + }, + { + "epoch": 0.9218207970217447, + "grad_norm": 4.324014663696289, + "learning_rate": 0.001, + "loss": 2.4474, + "step": 21790 + }, + { + "epoch": 0.921863101785261, + "grad_norm": 2.0869667530059814, + "learning_rate": 0.001, + "loss": 2.0607, + "step": 21791 + }, + { + "epoch": 0.9219054065487774, + "grad_norm": 0.1384233683347702, + "learning_rate": 0.001, + "loss": 2.6952, + "step": 21792 + }, + { + "epoch": 0.9219477113122938, + "grad_norm": 0.2545308470726013, + "learning_rate": 0.001, + "loss": 1.5397, + "step": 21793 + }, + { + "epoch": 0.9219900160758101, + "grad_norm": 0.1824929267168045, + "learning_rate": 0.001, + "loss": 2.8564, + "step": 21794 + }, + { + "epoch": 0.9220323208393265, + "grad_norm": 0.6214986443519592, + "learning_rate": 0.001, + "loss": 1.8838, + "step": 21795 + }, + { + "epoch": 0.9220746256028429, + "grad_norm": 0.1345427930355072, + "learning_rate": 0.001, + "loss": 1.5997, + "step": 21796 + }, + { + "epoch": 0.9221169303663592, + "grad_norm": 0.3405480980873108, + "learning_rate": 0.001, + "loss": 3.5338, + "step": 21797 + }, + { + "epoch": 0.9221592351298756, + "grad_norm": 1.3706905841827393, + "learning_rate": 0.001, + "loss": 1.5665, + "step": 21798 + }, + { + "epoch": 0.922201539893392, + "grad_norm": 3.3803353309631348, + "learning_rate": 0.001, + "loss": 2.0037, + "step": 21799 + }, + { + "epoch": 0.9222438446569083, + "grad_norm": 2.6381101608276367, + "learning_rate": 0.001, + "loss": 1.8247, + "step": 21800 + }, + { + "epoch": 0.9222861494204248, + "grad_norm": 0.15839798748493195, + "learning_rate": 0.001, + "loss": 2.7898, + "step": 21801 + }, + { + "epoch": 0.9223284541839412, + "grad_norm": 0.3356753885746002, + "learning_rate": 0.001, + "loss": 2.5539, + "step": 21802 + }, + { + "epoch": 0.9223707589474575, + "grad_norm": 0.1914248913526535, + "learning_rate": 0.001, + "loss": 1.6862, + "step": 21803 + }, + { + "epoch": 0.9224130637109739, + "grad_norm": 10.729948997497559, + "learning_rate": 0.001, + "loss": 1.7948, + "step": 21804 + }, + { + "epoch": 0.9224553684744903, + "grad_norm": 0.5791533589363098, + "learning_rate": 0.001, + "loss": 2.1725, + "step": 21805 + }, + { + "epoch": 0.9224976732380066, + "grad_norm": 0.20189444720745087, + "learning_rate": 0.001, + "loss": 1.7816, + "step": 21806 + }, + { + "epoch": 0.922539978001523, + "grad_norm": 5.010307788848877, + "learning_rate": 0.001, + "loss": 3.1041, + "step": 21807 + }, + { + "epoch": 0.9225822827650394, + "grad_norm": 0.21783508360385895, + "learning_rate": 0.001, + "loss": 2.4775, + "step": 21808 + }, + { + "epoch": 0.9226245875285557, + "grad_norm": 0.20718340575695038, + "learning_rate": 0.001, + "loss": 2.3186, + "step": 21809 + }, + { + "epoch": 0.9226668922920721, + "grad_norm": 0.18991310894489288, + "learning_rate": 0.001, + "loss": 1.8765, + "step": 21810 + }, + { + "epoch": 0.9227091970555885, + "grad_norm": 0.1602325290441513, + "learning_rate": 0.001, + "loss": 1.8977, + "step": 21811 + }, + { + "epoch": 0.9227515018191048, + "grad_norm": 0.40473005175590515, + "learning_rate": 0.001, + "loss": 2.3219, + "step": 21812 + }, + { + "epoch": 0.9227938065826212, + "grad_norm": 0.46014153957366943, + "learning_rate": 0.001, + "loss": 3.8687, + "step": 21813 + }, + { + "epoch": 0.9228361113461375, + "grad_norm": 53.86265182495117, + "learning_rate": 0.001, + "loss": 3.7695, + "step": 21814 + }, + { + "epoch": 0.9228784161096539, + "grad_norm": 0.18623895943164825, + "learning_rate": 0.001, + "loss": 2.0448, + "step": 21815 + }, + { + "epoch": 0.9229207208731703, + "grad_norm": 0.33342859148979187, + "learning_rate": 0.001, + "loss": 2.0159, + "step": 21816 + }, + { + "epoch": 0.9229630256366866, + "grad_norm": 17.7737979888916, + "learning_rate": 0.001, + "loss": 3.0125, + "step": 21817 + }, + { + "epoch": 0.923005330400203, + "grad_norm": 0.12280112504959106, + "learning_rate": 0.001, + "loss": 1.5704, + "step": 21818 + }, + { + "epoch": 0.9230476351637195, + "grad_norm": 0.21494708955287933, + "learning_rate": 0.001, + "loss": 1.6715, + "step": 21819 + }, + { + "epoch": 0.9230899399272358, + "grad_norm": 0.2017885446548462, + "learning_rate": 0.001, + "loss": 2.2747, + "step": 21820 + }, + { + "epoch": 0.9231322446907522, + "grad_norm": 0.2331605851650238, + "learning_rate": 0.001, + "loss": 1.9135, + "step": 21821 + }, + { + "epoch": 0.9231745494542686, + "grad_norm": 0.16537924110889435, + "learning_rate": 0.001, + "loss": 2.3864, + "step": 21822 + }, + { + "epoch": 0.9232168542177849, + "grad_norm": 0.1348440796136856, + "learning_rate": 0.001, + "loss": 1.8148, + "step": 21823 + }, + { + "epoch": 0.9232591589813013, + "grad_norm": 0.24906490743160248, + "learning_rate": 0.001, + "loss": 2.2961, + "step": 21824 + }, + { + "epoch": 0.9233014637448177, + "grad_norm": 0.16530346870422363, + "learning_rate": 0.001, + "loss": 2.0339, + "step": 21825 + }, + { + "epoch": 0.923343768508334, + "grad_norm": 0.13655491173267365, + "learning_rate": 0.001, + "loss": 2.1709, + "step": 21826 + }, + { + "epoch": 0.9233860732718504, + "grad_norm": 0.13853634893894196, + "learning_rate": 0.001, + "loss": 1.7936, + "step": 21827 + }, + { + "epoch": 0.9234283780353668, + "grad_norm": 0.14643678069114685, + "learning_rate": 0.001, + "loss": 2.1668, + "step": 21828 + }, + { + "epoch": 0.9234706827988831, + "grad_norm": 0.2975621521472931, + "learning_rate": 0.001, + "loss": 1.9856, + "step": 21829 + }, + { + "epoch": 0.9235129875623995, + "grad_norm": 0.1982540637254715, + "learning_rate": 0.001, + "loss": 2.4848, + "step": 21830 + }, + { + "epoch": 0.9235552923259159, + "grad_norm": 0.22689512372016907, + "learning_rate": 0.001, + "loss": 2.57, + "step": 21831 + }, + { + "epoch": 0.9235975970894322, + "grad_norm": 0.1401916742324829, + "learning_rate": 0.001, + "loss": 1.6943, + "step": 21832 + }, + { + "epoch": 0.9236399018529486, + "grad_norm": 2.501429796218872, + "learning_rate": 0.001, + "loss": 1.5686, + "step": 21833 + }, + { + "epoch": 0.923682206616465, + "grad_norm": 0.15667013823986053, + "learning_rate": 0.001, + "loss": 1.5747, + "step": 21834 + }, + { + "epoch": 0.9237245113799814, + "grad_norm": 0.19675031304359436, + "learning_rate": 0.001, + "loss": 2.1214, + "step": 21835 + }, + { + "epoch": 0.9237668161434978, + "grad_norm": 0.28486713767051697, + "learning_rate": 0.001, + "loss": 2.2525, + "step": 21836 + }, + { + "epoch": 0.9238091209070142, + "grad_norm": 0.28747811913490295, + "learning_rate": 0.001, + "loss": 2.6731, + "step": 21837 + }, + { + "epoch": 0.9238514256705305, + "grad_norm": 0.16760526597499847, + "learning_rate": 0.001, + "loss": 2.2244, + "step": 21838 + }, + { + "epoch": 0.9238937304340469, + "grad_norm": 0.23037217557430267, + "learning_rate": 0.001, + "loss": 2.328, + "step": 21839 + }, + { + "epoch": 0.9239360351975633, + "grad_norm": 0.13840559124946594, + "learning_rate": 0.001, + "loss": 1.8101, + "step": 21840 + }, + { + "epoch": 0.9239783399610796, + "grad_norm": 0.25135916471481323, + "learning_rate": 0.001, + "loss": 2.3498, + "step": 21841 + }, + { + "epoch": 0.924020644724596, + "grad_norm": 55.877220153808594, + "learning_rate": 0.001, + "loss": 2.1315, + "step": 21842 + }, + { + "epoch": 0.9240629494881124, + "grad_norm": 0.12954889237880707, + "learning_rate": 0.001, + "loss": 2.0259, + "step": 21843 + }, + { + "epoch": 0.9241052542516287, + "grad_norm": 3.64554762840271, + "learning_rate": 0.001, + "loss": 2.1561, + "step": 21844 + }, + { + "epoch": 0.9241475590151451, + "grad_norm": 0.13647957146167755, + "learning_rate": 0.001, + "loss": 2.0474, + "step": 21845 + }, + { + "epoch": 0.9241898637786615, + "grad_norm": 0.1622430980205536, + "learning_rate": 0.001, + "loss": 2.051, + "step": 21846 + }, + { + "epoch": 0.9242321685421778, + "grad_norm": 0.17634640634059906, + "learning_rate": 0.001, + "loss": 2.8697, + "step": 21847 + }, + { + "epoch": 0.9242744733056942, + "grad_norm": 0.14419732987880707, + "learning_rate": 0.001, + "loss": 1.8025, + "step": 21848 + }, + { + "epoch": 0.9243167780692106, + "grad_norm": 37.351749420166016, + "learning_rate": 0.001, + "loss": 1.999, + "step": 21849 + }, + { + "epoch": 0.9243590828327269, + "grad_norm": 0.20993314683437347, + "learning_rate": 0.001, + "loss": 2.2057, + "step": 21850 + }, + { + "epoch": 0.9244013875962434, + "grad_norm": 0.29707229137420654, + "learning_rate": 0.001, + "loss": 3.4083, + "step": 21851 + }, + { + "epoch": 0.9244436923597598, + "grad_norm": 0.2973187565803528, + "learning_rate": 0.001, + "loss": 2.5179, + "step": 21852 + }, + { + "epoch": 0.9244859971232761, + "grad_norm": 0.26736098527908325, + "learning_rate": 0.001, + "loss": 1.8084, + "step": 21853 + }, + { + "epoch": 0.9245283018867925, + "grad_norm": 0.22918181121349335, + "learning_rate": 0.001, + "loss": 2.0938, + "step": 21854 + }, + { + "epoch": 0.9245706066503089, + "grad_norm": 0.2083725780248642, + "learning_rate": 0.001, + "loss": 3.048, + "step": 21855 + }, + { + "epoch": 0.9246129114138252, + "grad_norm": 1.0710909366607666, + "learning_rate": 0.001, + "loss": 1.9642, + "step": 21856 + }, + { + "epoch": 0.9246552161773416, + "grad_norm": 0.6909781694412231, + "learning_rate": 0.001, + "loss": 3.8566, + "step": 21857 + }, + { + "epoch": 0.9246975209408579, + "grad_norm": 0.310916543006897, + "learning_rate": 0.001, + "loss": 2.163, + "step": 21858 + }, + { + "epoch": 0.9247398257043743, + "grad_norm": 0.15251141786575317, + "learning_rate": 0.001, + "loss": 2.3132, + "step": 21859 + }, + { + "epoch": 0.9247821304678907, + "grad_norm": 0.19609764218330383, + "learning_rate": 0.001, + "loss": 1.6755, + "step": 21860 + }, + { + "epoch": 0.924824435231407, + "grad_norm": 0.7422865033149719, + "learning_rate": 0.001, + "loss": 2.823, + "step": 21861 + }, + { + "epoch": 0.9248667399949234, + "grad_norm": 0.14667384326457977, + "learning_rate": 0.001, + "loss": 2.1124, + "step": 21862 + }, + { + "epoch": 0.9249090447584398, + "grad_norm": 0.16594403982162476, + "learning_rate": 0.001, + "loss": 2.0591, + "step": 21863 + }, + { + "epoch": 0.9249513495219561, + "grad_norm": 0.18323829770088196, + "learning_rate": 0.001, + "loss": 1.6222, + "step": 21864 + }, + { + "epoch": 0.9249936542854725, + "grad_norm": 0.22297947108745575, + "learning_rate": 0.001, + "loss": 2.2291, + "step": 21865 + }, + { + "epoch": 0.9250359590489889, + "grad_norm": 0.16403332352638245, + "learning_rate": 0.001, + "loss": 2.3493, + "step": 21866 + }, + { + "epoch": 0.9250782638125052, + "grad_norm": 0.23182706534862518, + "learning_rate": 0.001, + "loss": 2.0805, + "step": 21867 + }, + { + "epoch": 0.9251205685760217, + "grad_norm": 0.22759734094142914, + "learning_rate": 0.001, + "loss": 2.4908, + "step": 21868 + }, + { + "epoch": 0.9251628733395381, + "grad_norm": 0.6803989410400391, + "learning_rate": 0.001, + "loss": 2.7571, + "step": 21869 + }, + { + "epoch": 0.9252051781030544, + "grad_norm": 1.2668521404266357, + "learning_rate": 0.001, + "loss": 1.9701, + "step": 21870 + }, + { + "epoch": 0.9252474828665708, + "grad_norm": 0.1656140238046646, + "learning_rate": 0.001, + "loss": 1.9307, + "step": 21871 + }, + { + "epoch": 0.9252897876300872, + "grad_norm": 0.14242412149906158, + "learning_rate": 0.001, + "loss": 2.5352, + "step": 21872 + }, + { + "epoch": 0.9253320923936035, + "grad_norm": 0.1434088498353958, + "learning_rate": 0.001, + "loss": 2.284, + "step": 21873 + }, + { + "epoch": 0.9253743971571199, + "grad_norm": 0.21865014731884003, + "learning_rate": 0.001, + "loss": 2.3557, + "step": 21874 + }, + { + "epoch": 0.9254167019206363, + "grad_norm": 0.1559746265411377, + "learning_rate": 0.001, + "loss": 1.9937, + "step": 21875 + }, + { + "epoch": 0.9254590066841526, + "grad_norm": 2.3426716327667236, + "learning_rate": 0.001, + "loss": 2.8022, + "step": 21876 + }, + { + "epoch": 0.925501311447669, + "grad_norm": 0.18600401282310486, + "learning_rate": 0.001, + "loss": 2.062, + "step": 21877 + }, + { + "epoch": 0.9255436162111854, + "grad_norm": 0.37630805373191833, + "learning_rate": 0.001, + "loss": 2.308, + "step": 21878 + }, + { + "epoch": 0.9255859209747017, + "grad_norm": 0.26190125942230225, + "learning_rate": 0.001, + "loss": 1.5722, + "step": 21879 + }, + { + "epoch": 0.9256282257382181, + "grad_norm": 0.17115432024002075, + "learning_rate": 0.001, + "loss": 2.0632, + "step": 21880 + }, + { + "epoch": 0.9256705305017345, + "grad_norm": 0.3577999472618103, + "learning_rate": 0.001, + "loss": 2.8232, + "step": 21881 + }, + { + "epoch": 0.9257128352652508, + "grad_norm": 0.15650738775730133, + "learning_rate": 0.001, + "loss": 2.1344, + "step": 21882 + }, + { + "epoch": 0.9257551400287672, + "grad_norm": 1.1321587562561035, + "learning_rate": 0.001, + "loss": 3.678, + "step": 21883 + }, + { + "epoch": 0.9257974447922837, + "grad_norm": 0.2203882336616516, + "learning_rate": 0.001, + "loss": 2.5869, + "step": 21884 + }, + { + "epoch": 0.9258397495558, + "grad_norm": 0.20148371160030365, + "learning_rate": 0.001, + "loss": 1.5426, + "step": 21885 + }, + { + "epoch": 0.9258820543193164, + "grad_norm": 0.18005190789699554, + "learning_rate": 0.001, + "loss": 1.9097, + "step": 21886 + }, + { + "epoch": 0.9259243590828328, + "grad_norm": 0.9154829382896423, + "learning_rate": 0.001, + "loss": 2.5761, + "step": 21887 + }, + { + "epoch": 0.9259666638463491, + "grad_norm": 0.16887728869915009, + "learning_rate": 0.001, + "loss": 2.6111, + "step": 21888 + }, + { + "epoch": 0.9260089686098655, + "grad_norm": 0.16773365437984467, + "learning_rate": 0.001, + "loss": 1.4804, + "step": 21889 + }, + { + "epoch": 0.9260512733733819, + "grad_norm": 0.672725260257721, + "learning_rate": 0.001, + "loss": 2.6119, + "step": 21890 + }, + { + "epoch": 0.9260935781368982, + "grad_norm": 2.3344016075134277, + "learning_rate": 0.001, + "loss": 1.5694, + "step": 21891 + }, + { + "epoch": 0.9261358829004146, + "grad_norm": 0.21035782992839813, + "learning_rate": 0.001, + "loss": 3.0887, + "step": 21892 + }, + { + "epoch": 0.926178187663931, + "grad_norm": 0.1845501810312271, + "learning_rate": 0.001, + "loss": 2.0434, + "step": 21893 + }, + { + "epoch": 0.9262204924274473, + "grad_norm": 0.20723649859428406, + "learning_rate": 0.001, + "loss": 1.8821, + "step": 21894 + }, + { + "epoch": 0.9262627971909637, + "grad_norm": 0.4533455967903137, + "learning_rate": 0.001, + "loss": 2.3809, + "step": 21895 + }, + { + "epoch": 0.9263051019544801, + "grad_norm": 1.0757017135620117, + "learning_rate": 0.001, + "loss": 2.2222, + "step": 21896 + }, + { + "epoch": 0.9263474067179964, + "grad_norm": 0.2541113495826721, + "learning_rate": 0.001, + "loss": 2.3574, + "step": 21897 + }, + { + "epoch": 0.9263897114815128, + "grad_norm": 1.4928815364837646, + "learning_rate": 0.001, + "loss": 2.738, + "step": 21898 + }, + { + "epoch": 0.9264320162450292, + "grad_norm": 0.17061060667037964, + "learning_rate": 0.001, + "loss": 1.8219, + "step": 21899 + }, + { + "epoch": 0.9264743210085455, + "grad_norm": 2.1404104232788086, + "learning_rate": 0.001, + "loss": 3.1614, + "step": 21900 + }, + { + "epoch": 0.926516625772062, + "grad_norm": 0.4184578061103821, + "learning_rate": 0.001, + "loss": 2.5873, + "step": 21901 + }, + { + "epoch": 0.9265589305355783, + "grad_norm": 0.758017361164093, + "learning_rate": 0.001, + "loss": 2.5198, + "step": 21902 + }, + { + "epoch": 0.9266012352990947, + "grad_norm": 0.2236809879541397, + "learning_rate": 0.001, + "loss": 2.0093, + "step": 21903 + }, + { + "epoch": 0.9266435400626111, + "grad_norm": 0.1709446907043457, + "learning_rate": 0.001, + "loss": 2.2584, + "step": 21904 + }, + { + "epoch": 0.9266858448261274, + "grad_norm": 0.18377362191677094, + "learning_rate": 0.001, + "loss": 1.7896, + "step": 21905 + }, + { + "epoch": 0.9267281495896438, + "grad_norm": 0.17706288397312164, + "learning_rate": 0.001, + "loss": 2.1536, + "step": 21906 + }, + { + "epoch": 0.9267704543531602, + "grad_norm": 3.2292115688323975, + "learning_rate": 0.001, + "loss": 2.3149, + "step": 21907 + }, + { + "epoch": 0.9268127591166765, + "grad_norm": 0.1633622944355011, + "learning_rate": 0.001, + "loss": 2.2869, + "step": 21908 + }, + { + "epoch": 0.9268550638801929, + "grad_norm": 0.18171913921833038, + "learning_rate": 0.001, + "loss": 1.4094, + "step": 21909 + }, + { + "epoch": 0.9268973686437093, + "grad_norm": 0.20554277300834656, + "learning_rate": 0.001, + "loss": 1.8559, + "step": 21910 + }, + { + "epoch": 0.9269396734072256, + "grad_norm": 0.4027025103569031, + "learning_rate": 0.001, + "loss": 3.185, + "step": 21911 + }, + { + "epoch": 0.926981978170742, + "grad_norm": 0.23326435685157776, + "learning_rate": 0.001, + "loss": 2.6125, + "step": 21912 + }, + { + "epoch": 0.9270242829342584, + "grad_norm": 0.13189657032489777, + "learning_rate": 0.001, + "loss": 1.8056, + "step": 21913 + }, + { + "epoch": 0.9270665876977747, + "grad_norm": 0.15088336169719696, + "learning_rate": 0.001, + "loss": 2.4187, + "step": 21914 + }, + { + "epoch": 0.9271088924612911, + "grad_norm": 0.275808185338974, + "learning_rate": 0.001, + "loss": 2.0664, + "step": 21915 + }, + { + "epoch": 0.9271511972248075, + "grad_norm": 0.16052496433258057, + "learning_rate": 0.001, + "loss": 1.911, + "step": 21916 + }, + { + "epoch": 0.9271935019883238, + "grad_norm": 0.14898687601089478, + "learning_rate": 0.001, + "loss": 1.9662, + "step": 21917 + }, + { + "epoch": 0.9272358067518403, + "grad_norm": 0.13894212245941162, + "learning_rate": 0.001, + "loss": 2.3613, + "step": 21918 + }, + { + "epoch": 0.9272781115153567, + "grad_norm": 0.12985843420028687, + "learning_rate": 0.001, + "loss": 1.5966, + "step": 21919 + }, + { + "epoch": 0.927320416278873, + "grad_norm": 13.45024299621582, + "learning_rate": 0.001, + "loss": 2.3785, + "step": 21920 + }, + { + "epoch": 0.9273627210423894, + "grad_norm": 0.14908571541309357, + "learning_rate": 0.001, + "loss": 2.8162, + "step": 21921 + }, + { + "epoch": 0.9274050258059058, + "grad_norm": 0.1413915455341339, + "learning_rate": 0.001, + "loss": 2.1948, + "step": 21922 + }, + { + "epoch": 0.9274473305694221, + "grad_norm": 0.13808265328407288, + "learning_rate": 0.001, + "loss": 1.619, + "step": 21923 + }, + { + "epoch": 0.9274896353329385, + "grad_norm": 1.1322904825210571, + "learning_rate": 0.001, + "loss": 3.322, + "step": 21924 + }, + { + "epoch": 0.9275319400964549, + "grad_norm": 0.19078315794467926, + "learning_rate": 0.001, + "loss": 2.4069, + "step": 21925 + }, + { + "epoch": 0.9275742448599712, + "grad_norm": 0.341876357793808, + "learning_rate": 0.001, + "loss": 1.9122, + "step": 21926 + }, + { + "epoch": 0.9276165496234876, + "grad_norm": 0.1864626556634903, + "learning_rate": 0.001, + "loss": 1.8488, + "step": 21927 + }, + { + "epoch": 0.927658854387004, + "grad_norm": 0.19447799026966095, + "learning_rate": 0.001, + "loss": 2.0424, + "step": 21928 + }, + { + "epoch": 0.9277011591505203, + "grad_norm": 0.534332275390625, + "learning_rate": 0.001, + "loss": 2.9907, + "step": 21929 + }, + { + "epoch": 0.9277434639140367, + "grad_norm": 0.16288526356220245, + "learning_rate": 0.001, + "loss": 2.4716, + "step": 21930 + }, + { + "epoch": 0.9277857686775531, + "grad_norm": 0.1490895301103592, + "learning_rate": 0.001, + "loss": 1.6003, + "step": 21931 + }, + { + "epoch": 0.9278280734410694, + "grad_norm": 0.16813409328460693, + "learning_rate": 0.001, + "loss": 2.4994, + "step": 21932 + }, + { + "epoch": 0.9278703782045858, + "grad_norm": 0.14073902368545532, + "learning_rate": 0.001, + "loss": 2.4238, + "step": 21933 + }, + { + "epoch": 0.9279126829681023, + "grad_norm": 0.1578540802001953, + "learning_rate": 0.001, + "loss": 1.8206, + "step": 21934 + }, + { + "epoch": 0.9279549877316186, + "grad_norm": 0.3728206753730774, + "learning_rate": 0.001, + "loss": 2.1717, + "step": 21935 + }, + { + "epoch": 0.927997292495135, + "grad_norm": 0.16407276690006256, + "learning_rate": 0.001, + "loss": 1.3542, + "step": 21936 + }, + { + "epoch": 0.9280395972586514, + "grad_norm": 0.5730623006820679, + "learning_rate": 0.001, + "loss": 2.5158, + "step": 21937 + }, + { + "epoch": 0.9280819020221677, + "grad_norm": 0.4225897789001465, + "learning_rate": 0.001, + "loss": 2.63, + "step": 21938 + }, + { + "epoch": 0.9281242067856841, + "grad_norm": 0.19734539091587067, + "learning_rate": 0.001, + "loss": 1.9227, + "step": 21939 + }, + { + "epoch": 0.9281665115492005, + "grad_norm": 0.22450488805770874, + "learning_rate": 0.001, + "loss": 2.4276, + "step": 21940 + }, + { + "epoch": 0.9282088163127168, + "grad_norm": 0.174544557929039, + "learning_rate": 0.001, + "loss": 2.449, + "step": 21941 + }, + { + "epoch": 0.9282511210762332, + "grad_norm": 0.16052083671092987, + "learning_rate": 0.001, + "loss": 2.0025, + "step": 21942 + }, + { + "epoch": 0.9282934258397496, + "grad_norm": 0.14678356051445007, + "learning_rate": 0.001, + "loss": 2.2724, + "step": 21943 + }, + { + "epoch": 0.9283357306032659, + "grad_norm": 0.1925598680973053, + "learning_rate": 0.001, + "loss": 2.3425, + "step": 21944 + }, + { + "epoch": 0.9283780353667823, + "grad_norm": 0.1436510980129242, + "learning_rate": 0.001, + "loss": 1.4963, + "step": 21945 + }, + { + "epoch": 0.9284203401302987, + "grad_norm": 0.1730327010154724, + "learning_rate": 0.001, + "loss": 2.7398, + "step": 21946 + }, + { + "epoch": 0.928462644893815, + "grad_norm": 0.19589565694332123, + "learning_rate": 0.001, + "loss": 1.7817, + "step": 21947 + }, + { + "epoch": 0.9285049496573314, + "grad_norm": 0.4311773180961609, + "learning_rate": 0.001, + "loss": 2.5202, + "step": 21948 + }, + { + "epoch": 0.9285472544208477, + "grad_norm": 0.16326993703842163, + "learning_rate": 0.001, + "loss": 1.5574, + "step": 21949 + }, + { + "epoch": 0.9285895591843641, + "grad_norm": 0.17932333052158356, + "learning_rate": 0.001, + "loss": 1.8842, + "step": 21950 + }, + { + "epoch": 0.9286318639478806, + "grad_norm": 0.18280398845672607, + "learning_rate": 0.001, + "loss": 1.8407, + "step": 21951 + }, + { + "epoch": 0.9286741687113969, + "grad_norm": 0.2984718978404999, + "learning_rate": 0.001, + "loss": 2.01, + "step": 21952 + }, + { + "epoch": 0.9287164734749133, + "grad_norm": 0.1796807199716568, + "learning_rate": 0.001, + "loss": 2.3846, + "step": 21953 + }, + { + "epoch": 0.9287587782384297, + "grad_norm": 0.21217189729213715, + "learning_rate": 0.001, + "loss": 1.991, + "step": 21954 + }, + { + "epoch": 0.928801083001946, + "grad_norm": 0.19677573442459106, + "learning_rate": 0.001, + "loss": 1.8435, + "step": 21955 + }, + { + "epoch": 0.9288433877654624, + "grad_norm": 0.15436428785324097, + "learning_rate": 0.001, + "loss": 2.0693, + "step": 21956 + }, + { + "epoch": 0.9288856925289788, + "grad_norm": 0.15341611206531525, + "learning_rate": 0.001, + "loss": 1.5144, + "step": 21957 + }, + { + "epoch": 0.9289279972924951, + "grad_norm": 0.16485430300235748, + "learning_rate": 0.001, + "loss": 2.2537, + "step": 21958 + }, + { + "epoch": 0.9289703020560115, + "grad_norm": 0.20801179111003876, + "learning_rate": 0.001, + "loss": 1.6632, + "step": 21959 + }, + { + "epoch": 0.9290126068195279, + "grad_norm": 0.1642443984746933, + "learning_rate": 0.001, + "loss": 2.366, + "step": 21960 + }, + { + "epoch": 0.9290549115830442, + "grad_norm": 0.22680698335170746, + "learning_rate": 0.001, + "loss": 1.711, + "step": 21961 + }, + { + "epoch": 0.9290972163465606, + "grad_norm": 0.16985486447811127, + "learning_rate": 0.001, + "loss": 2.383, + "step": 21962 + }, + { + "epoch": 0.929139521110077, + "grad_norm": 0.1616482138633728, + "learning_rate": 0.001, + "loss": 1.9125, + "step": 21963 + }, + { + "epoch": 0.9291818258735933, + "grad_norm": 0.15879781544208527, + "learning_rate": 0.001, + "loss": 2.075, + "step": 21964 + }, + { + "epoch": 0.9292241306371097, + "grad_norm": 0.12933063507080078, + "learning_rate": 0.001, + "loss": 2.4285, + "step": 21965 + }, + { + "epoch": 0.9292664354006261, + "grad_norm": 0.1703050583600998, + "learning_rate": 0.001, + "loss": 1.8814, + "step": 21966 + }, + { + "epoch": 0.9293087401641424, + "grad_norm": 0.15759029984474182, + "learning_rate": 0.001, + "loss": 1.975, + "step": 21967 + }, + { + "epoch": 0.9293510449276589, + "grad_norm": 0.16814181208610535, + "learning_rate": 0.001, + "loss": 3.0872, + "step": 21968 + }, + { + "epoch": 0.9293933496911753, + "grad_norm": 1.8021339178085327, + "learning_rate": 0.001, + "loss": 2.3614, + "step": 21969 + }, + { + "epoch": 0.9294356544546916, + "grad_norm": 0.16891023516654968, + "learning_rate": 0.001, + "loss": 1.7206, + "step": 21970 + }, + { + "epoch": 0.929477959218208, + "grad_norm": 0.297505259513855, + "learning_rate": 0.001, + "loss": 2.7611, + "step": 21971 + }, + { + "epoch": 0.9295202639817244, + "grad_norm": 0.1714632362127304, + "learning_rate": 0.001, + "loss": 2.1129, + "step": 21972 + }, + { + "epoch": 0.9295625687452407, + "grad_norm": 0.19479063153266907, + "learning_rate": 0.001, + "loss": 2.0595, + "step": 21973 + }, + { + "epoch": 0.9296048735087571, + "grad_norm": 0.180049866437912, + "learning_rate": 0.001, + "loss": 2.3096, + "step": 21974 + }, + { + "epoch": 0.9296471782722735, + "grad_norm": 0.6898881196975708, + "learning_rate": 0.001, + "loss": 2.1587, + "step": 21975 + }, + { + "epoch": 0.9296894830357898, + "grad_norm": 1.291130781173706, + "learning_rate": 0.001, + "loss": 2.1467, + "step": 21976 + }, + { + "epoch": 0.9297317877993062, + "grad_norm": 0.16307774186134338, + "learning_rate": 0.001, + "loss": 2.5267, + "step": 21977 + }, + { + "epoch": 0.9297740925628226, + "grad_norm": 0.2098264843225479, + "learning_rate": 0.001, + "loss": 2.5711, + "step": 21978 + }, + { + "epoch": 0.9298163973263389, + "grad_norm": 2.183814287185669, + "learning_rate": 0.001, + "loss": 1.6738, + "step": 21979 + }, + { + "epoch": 0.9298587020898553, + "grad_norm": 0.1923678070306778, + "learning_rate": 0.001, + "loss": 3.2464, + "step": 21980 + }, + { + "epoch": 0.9299010068533717, + "grad_norm": 0.13686540722846985, + "learning_rate": 0.001, + "loss": 2.1387, + "step": 21981 + }, + { + "epoch": 0.929943311616888, + "grad_norm": 4.34953498840332, + "learning_rate": 0.001, + "loss": 1.6731, + "step": 21982 + }, + { + "epoch": 0.9299856163804044, + "grad_norm": 0.16118478775024414, + "learning_rate": 0.001, + "loss": 2.368, + "step": 21983 + }, + { + "epoch": 0.9300279211439209, + "grad_norm": 0.589535117149353, + "learning_rate": 0.001, + "loss": 2.4586, + "step": 21984 + }, + { + "epoch": 0.9300702259074372, + "grad_norm": 0.535638689994812, + "learning_rate": 0.001, + "loss": 2.6041, + "step": 21985 + }, + { + "epoch": 0.9301125306709536, + "grad_norm": 0.1409117430448532, + "learning_rate": 0.001, + "loss": 2.2921, + "step": 21986 + }, + { + "epoch": 0.93015483543447, + "grad_norm": 0.27417516708374023, + "learning_rate": 0.001, + "loss": 1.468, + "step": 21987 + }, + { + "epoch": 0.9301971401979863, + "grad_norm": 0.4808787405490875, + "learning_rate": 0.001, + "loss": 1.8091, + "step": 21988 + }, + { + "epoch": 0.9302394449615027, + "grad_norm": 3.5292954444885254, + "learning_rate": 0.001, + "loss": 2.1259, + "step": 21989 + }, + { + "epoch": 0.9302817497250191, + "grad_norm": 0.1566908210515976, + "learning_rate": 0.001, + "loss": 2.3119, + "step": 21990 + }, + { + "epoch": 0.9303240544885354, + "grad_norm": 0.14310206472873688, + "learning_rate": 0.001, + "loss": 3.0195, + "step": 21991 + }, + { + "epoch": 0.9303663592520518, + "grad_norm": 0.16697970032691956, + "learning_rate": 0.001, + "loss": 2.2591, + "step": 21992 + }, + { + "epoch": 0.9304086640155681, + "grad_norm": 0.1776588410139084, + "learning_rate": 0.001, + "loss": 2.4498, + "step": 21993 + }, + { + "epoch": 0.9304509687790845, + "grad_norm": 0.6791718602180481, + "learning_rate": 0.001, + "loss": 1.7671, + "step": 21994 + }, + { + "epoch": 0.9304932735426009, + "grad_norm": 11.523677825927734, + "learning_rate": 0.001, + "loss": 2.4538, + "step": 21995 + }, + { + "epoch": 0.9305355783061172, + "grad_norm": 0.16806228458881378, + "learning_rate": 0.001, + "loss": 2.1725, + "step": 21996 + }, + { + "epoch": 0.9305778830696336, + "grad_norm": 0.24460969865322113, + "learning_rate": 0.001, + "loss": 2.6757, + "step": 21997 + }, + { + "epoch": 0.93062018783315, + "grad_norm": 0.19408555328845978, + "learning_rate": 0.001, + "loss": 2.5526, + "step": 21998 + }, + { + "epoch": 0.9306624925966663, + "grad_norm": 0.18568742275238037, + "learning_rate": 0.001, + "loss": 2.6363, + "step": 21999 + }, + { + "epoch": 0.9307047973601827, + "grad_norm": 0.16618452966213226, + "learning_rate": 0.001, + "loss": 2.1546, + "step": 22000 + }, + { + "epoch": 0.9307471021236992, + "grad_norm": 0.16462920606136322, + "learning_rate": 0.001, + "loss": 1.5723, + "step": 22001 + }, + { + "epoch": 0.9307894068872155, + "grad_norm": 0.12047246843576431, + "learning_rate": 0.001, + "loss": 1.9203, + "step": 22002 + }, + { + "epoch": 0.9308317116507319, + "grad_norm": 0.22285322844982147, + "learning_rate": 0.001, + "loss": 3.2207, + "step": 22003 + }, + { + "epoch": 0.9308740164142483, + "grad_norm": 0.14793570339679718, + "learning_rate": 0.001, + "loss": 2.233, + "step": 22004 + }, + { + "epoch": 0.9309163211777646, + "grad_norm": 0.16332119703292847, + "learning_rate": 0.001, + "loss": 1.9054, + "step": 22005 + }, + { + "epoch": 0.930958625941281, + "grad_norm": 0.1610136777162552, + "learning_rate": 0.001, + "loss": 2.1695, + "step": 22006 + }, + { + "epoch": 0.9310009307047974, + "grad_norm": 0.8120068907737732, + "learning_rate": 0.001, + "loss": 2.0812, + "step": 22007 + }, + { + "epoch": 0.9310432354683137, + "grad_norm": 0.1367693543434143, + "learning_rate": 0.001, + "loss": 1.6623, + "step": 22008 + }, + { + "epoch": 0.9310855402318301, + "grad_norm": 0.2394467443227768, + "learning_rate": 0.001, + "loss": 2.7049, + "step": 22009 + }, + { + "epoch": 0.9311278449953465, + "grad_norm": 0.4347785413265228, + "learning_rate": 0.001, + "loss": 2.373, + "step": 22010 + }, + { + "epoch": 0.9311701497588628, + "grad_norm": 0.1576579064130783, + "learning_rate": 0.001, + "loss": 2.3929, + "step": 22011 + }, + { + "epoch": 0.9312124545223792, + "grad_norm": 0.41782185435295105, + "learning_rate": 0.001, + "loss": 1.6985, + "step": 22012 + }, + { + "epoch": 0.9312547592858956, + "grad_norm": 0.16780440509319305, + "learning_rate": 0.001, + "loss": 1.8258, + "step": 22013 + }, + { + "epoch": 0.9312970640494119, + "grad_norm": 0.15203818678855896, + "learning_rate": 0.001, + "loss": 2.5598, + "step": 22014 + }, + { + "epoch": 0.9313393688129283, + "grad_norm": 0.1656513512134552, + "learning_rate": 0.001, + "loss": 2.0156, + "step": 22015 + }, + { + "epoch": 0.9313816735764447, + "grad_norm": 0.2978683412075043, + "learning_rate": 0.001, + "loss": 1.8497, + "step": 22016 + }, + { + "epoch": 0.931423978339961, + "grad_norm": 2.85372257232666, + "learning_rate": 0.001, + "loss": 2.6914, + "step": 22017 + }, + { + "epoch": 0.9314662831034775, + "grad_norm": 0.16330118477344513, + "learning_rate": 0.001, + "loss": 2.3187, + "step": 22018 + }, + { + "epoch": 0.9315085878669939, + "grad_norm": 0.6117385029792786, + "learning_rate": 0.001, + "loss": 2.3308, + "step": 22019 + }, + { + "epoch": 0.9315508926305102, + "grad_norm": 0.15620072185993195, + "learning_rate": 0.001, + "loss": 2.5478, + "step": 22020 + }, + { + "epoch": 0.9315931973940266, + "grad_norm": 0.167611226439476, + "learning_rate": 0.001, + "loss": 2.4346, + "step": 22021 + }, + { + "epoch": 0.931635502157543, + "grad_norm": 0.36283206939697266, + "learning_rate": 0.001, + "loss": 2.1529, + "step": 22022 + }, + { + "epoch": 0.9316778069210593, + "grad_norm": 0.15195079147815704, + "learning_rate": 0.001, + "loss": 1.6573, + "step": 22023 + }, + { + "epoch": 0.9317201116845757, + "grad_norm": 0.16134585440158844, + "learning_rate": 0.001, + "loss": 2.0162, + "step": 22024 + }, + { + "epoch": 0.9317624164480921, + "grad_norm": 0.20685450732707977, + "learning_rate": 0.001, + "loss": 1.9848, + "step": 22025 + }, + { + "epoch": 0.9318047212116084, + "grad_norm": 0.24313196539878845, + "learning_rate": 0.001, + "loss": 2.4929, + "step": 22026 + }, + { + "epoch": 0.9318470259751248, + "grad_norm": 0.20717033743858337, + "learning_rate": 0.001, + "loss": 1.6051, + "step": 22027 + }, + { + "epoch": 0.9318893307386412, + "grad_norm": 0.1792394369840622, + "learning_rate": 0.001, + "loss": 2.2256, + "step": 22028 + }, + { + "epoch": 0.9319316355021575, + "grad_norm": 0.17706766724586487, + "learning_rate": 0.001, + "loss": 3.126, + "step": 22029 + }, + { + "epoch": 0.9319739402656739, + "grad_norm": 0.21002162992954254, + "learning_rate": 0.001, + "loss": 2.634, + "step": 22030 + }, + { + "epoch": 0.9320162450291903, + "grad_norm": 0.16654951870441437, + "learning_rate": 0.001, + "loss": 2.363, + "step": 22031 + }, + { + "epoch": 0.9320585497927066, + "grad_norm": 0.19493767619132996, + "learning_rate": 0.001, + "loss": 2.8382, + "step": 22032 + }, + { + "epoch": 0.932100854556223, + "grad_norm": 0.24346645176410675, + "learning_rate": 0.001, + "loss": 1.4973, + "step": 22033 + }, + { + "epoch": 0.9321431593197395, + "grad_norm": 0.19552533328533173, + "learning_rate": 0.001, + "loss": 2.1005, + "step": 22034 + }, + { + "epoch": 0.9321854640832558, + "grad_norm": 0.4952528774738312, + "learning_rate": 0.001, + "loss": 2.4358, + "step": 22035 + }, + { + "epoch": 0.9322277688467722, + "grad_norm": 0.16397695243358612, + "learning_rate": 0.001, + "loss": 1.7882, + "step": 22036 + }, + { + "epoch": 0.9322700736102885, + "grad_norm": 0.16197176277637482, + "learning_rate": 0.001, + "loss": 2.0223, + "step": 22037 + }, + { + "epoch": 0.9323123783738049, + "grad_norm": 0.5470212697982788, + "learning_rate": 0.001, + "loss": 3.6555, + "step": 22038 + }, + { + "epoch": 0.9323546831373213, + "grad_norm": 0.1571442037820816, + "learning_rate": 0.001, + "loss": 2.9497, + "step": 22039 + }, + { + "epoch": 0.9323969879008376, + "grad_norm": 0.16548150777816772, + "learning_rate": 0.001, + "loss": 2.1686, + "step": 22040 + }, + { + "epoch": 0.932439292664354, + "grad_norm": 22.103620529174805, + "learning_rate": 0.001, + "loss": 2.1238, + "step": 22041 + }, + { + "epoch": 0.9324815974278704, + "grad_norm": 0.16696321964263916, + "learning_rate": 0.001, + "loss": 2.387, + "step": 22042 + }, + { + "epoch": 0.9325239021913867, + "grad_norm": 31.06536102294922, + "learning_rate": 0.001, + "loss": 2.0811, + "step": 22043 + }, + { + "epoch": 0.9325662069549031, + "grad_norm": 0.3314652144908905, + "learning_rate": 0.001, + "loss": 2.2734, + "step": 22044 + }, + { + "epoch": 0.9326085117184195, + "grad_norm": 0.17265798151493073, + "learning_rate": 0.001, + "loss": 2.7165, + "step": 22045 + }, + { + "epoch": 0.9326508164819358, + "grad_norm": 0.1684129536151886, + "learning_rate": 0.001, + "loss": 1.8214, + "step": 22046 + }, + { + "epoch": 0.9326931212454522, + "grad_norm": 3.3001720905303955, + "learning_rate": 0.001, + "loss": 2.131, + "step": 22047 + }, + { + "epoch": 0.9327354260089686, + "grad_norm": 0.13358072936534882, + "learning_rate": 0.001, + "loss": 2.129, + "step": 22048 + }, + { + "epoch": 0.9327777307724849, + "grad_norm": 0.15245433151721954, + "learning_rate": 0.001, + "loss": 2.2186, + "step": 22049 + }, + { + "epoch": 0.9328200355360013, + "grad_norm": 0.16228505969047546, + "learning_rate": 0.001, + "loss": 1.5467, + "step": 22050 + }, + { + "epoch": 0.9328623402995178, + "grad_norm": 0.18052756786346436, + "learning_rate": 0.001, + "loss": 2.555, + "step": 22051 + }, + { + "epoch": 0.932904645063034, + "grad_norm": 0.20280319452285767, + "learning_rate": 0.001, + "loss": 1.93, + "step": 22052 + }, + { + "epoch": 0.9329469498265505, + "grad_norm": 0.15624532103538513, + "learning_rate": 0.001, + "loss": 2.3892, + "step": 22053 + }, + { + "epoch": 0.9329892545900669, + "grad_norm": 0.15655791759490967, + "learning_rate": 0.001, + "loss": 2.7288, + "step": 22054 + }, + { + "epoch": 0.9330315593535832, + "grad_norm": 0.40425026416778564, + "learning_rate": 0.001, + "loss": 3.1181, + "step": 22055 + }, + { + "epoch": 0.9330738641170996, + "grad_norm": 0.41149187088012695, + "learning_rate": 0.001, + "loss": 4.535, + "step": 22056 + }, + { + "epoch": 0.933116168880616, + "grad_norm": 6.105749130249023, + "learning_rate": 0.001, + "loss": 2.8493, + "step": 22057 + }, + { + "epoch": 0.9331584736441323, + "grad_norm": 0.14242605865001678, + "learning_rate": 0.001, + "loss": 2.2115, + "step": 22058 + }, + { + "epoch": 0.9332007784076487, + "grad_norm": 0.14453691244125366, + "learning_rate": 0.001, + "loss": 2.0789, + "step": 22059 + }, + { + "epoch": 0.9332430831711651, + "grad_norm": 0.1681307554244995, + "learning_rate": 0.001, + "loss": 2.631, + "step": 22060 + }, + { + "epoch": 0.9332853879346814, + "grad_norm": 0.14247244596481323, + "learning_rate": 0.001, + "loss": 2.0911, + "step": 22061 + }, + { + "epoch": 0.9333276926981978, + "grad_norm": 0.16670750081539154, + "learning_rate": 0.001, + "loss": 1.9668, + "step": 22062 + }, + { + "epoch": 0.9333699974617142, + "grad_norm": 0.1602124273777008, + "learning_rate": 0.001, + "loss": 2.805, + "step": 22063 + }, + { + "epoch": 0.9334123022252305, + "grad_norm": 0.4584788978099823, + "learning_rate": 0.001, + "loss": 2.2362, + "step": 22064 + }, + { + "epoch": 0.9334546069887469, + "grad_norm": 0.16626909375190735, + "learning_rate": 0.001, + "loss": 2.8254, + "step": 22065 + }, + { + "epoch": 0.9334969117522633, + "grad_norm": 0.1582920104265213, + "learning_rate": 0.001, + "loss": 2.3247, + "step": 22066 + }, + { + "epoch": 0.9335392165157796, + "grad_norm": 0.15107718110084534, + "learning_rate": 0.001, + "loss": 1.2551, + "step": 22067 + }, + { + "epoch": 0.9335815212792961, + "grad_norm": 0.1743275374174118, + "learning_rate": 0.001, + "loss": 1.9243, + "step": 22068 + }, + { + "epoch": 0.9336238260428125, + "grad_norm": 0.148182213306427, + "learning_rate": 0.001, + "loss": 1.3991, + "step": 22069 + }, + { + "epoch": 0.9336661308063288, + "grad_norm": 0.1482088416814804, + "learning_rate": 0.001, + "loss": 1.905, + "step": 22070 + }, + { + "epoch": 0.9337084355698452, + "grad_norm": 0.17089058458805084, + "learning_rate": 0.001, + "loss": 2.596, + "step": 22071 + }, + { + "epoch": 0.9337507403333616, + "grad_norm": 0.15498429536819458, + "learning_rate": 0.001, + "loss": 1.7609, + "step": 22072 + }, + { + "epoch": 0.9337930450968779, + "grad_norm": 0.1602306365966797, + "learning_rate": 0.001, + "loss": 1.7998, + "step": 22073 + }, + { + "epoch": 0.9338353498603943, + "grad_norm": 0.816593587398529, + "learning_rate": 0.001, + "loss": 1.4125, + "step": 22074 + }, + { + "epoch": 0.9338776546239107, + "grad_norm": 3.837941884994507, + "learning_rate": 0.001, + "loss": 2.5317, + "step": 22075 + }, + { + "epoch": 0.933919959387427, + "grad_norm": 0.1522829383611679, + "learning_rate": 0.001, + "loss": 1.5934, + "step": 22076 + }, + { + "epoch": 0.9339622641509434, + "grad_norm": 0.1627858579158783, + "learning_rate": 0.001, + "loss": 2.3209, + "step": 22077 + }, + { + "epoch": 0.9340045689144598, + "grad_norm": 0.22749550640583038, + "learning_rate": 0.001, + "loss": 1.7942, + "step": 22078 + }, + { + "epoch": 0.9340468736779761, + "grad_norm": 0.2853844463825226, + "learning_rate": 0.001, + "loss": 2.4773, + "step": 22079 + }, + { + "epoch": 0.9340891784414925, + "grad_norm": 0.16091381013393402, + "learning_rate": 0.001, + "loss": 2.1776, + "step": 22080 + }, + { + "epoch": 0.9341314832050089, + "grad_norm": 0.15295559167861938, + "learning_rate": 0.001, + "loss": 2.8482, + "step": 22081 + }, + { + "epoch": 0.9341737879685252, + "grad_norm": 0.1788291186094284, + "learning_rate": 0.001, + "loss": 1.6635, + "step": 22082 + }, + { + "epoch": 0.9342160927320416, + "grad_norm": 0.1991390734910965, + "learning_rate": 0.001, + "loss": 2.5794, + "step": 22083 + }, + { + "epoch": 0.934258397495558, + "grad_norm": 0.32976192235946655, + "learning_rate": 0.001, + "loss": 2.057, + "step": 22084 + }, + { + "epoch": 0.9343007022590744, + "grad_norm": 0.1380927860736847, + "learning_rate": 0.001, + "loss": 1.8436, + "step": 22085 + }, + { + "epoch": 0.9343430070225908, + "grad_norm": 0.13859638571739197, + "learning_rate": 0.001, + "loss": 1.8601, + "step": 22086 + }, + { + "epoch": 0.9343853117861071, + "grad_norm": 3.625734567642212, + "learning_rate": 0.001, + "loss": 2.2505, + "step": 22087 + }, + { + "epoch": 0.9344276165496235, + "grad_norm": 0.1253889799118042, + "learning_rate": 0.001, + "loss": 2.8063, + "step": 22088 + }, + { + "epoch": 0.9344699213131399, + "grad_norm": 0.1292024403810501, + "learning_rate": 0.001, + "loss": 1.7487, + "step": 22089 + }, + { + "epoch": 0.9345122260766562, + "grad_norm": 2.1888620853424072, + "learning_rate": 0.001, + "loss": 1.8234, + "step": 22090 + }, + { + "epoch": 0.9345545308401726, + "grad_norm": 0.8209211230278015, + "learning_rate": 0.001, + "loss": 2.2189, + "step": 22091 + }, + { + "epoch": 0.934596835603689, + "grad_norm": 0.14426971971988678, + "learning_rate": 0.001, + "loss": 1.8472, + "step": 22092 + }, + { + "epoch": 0.9346391403672053, + "grad_norm": 0.1676510125398636, + "learning_rate": 0.001, + "loss": 2.6767, + "step": 22093 + }, + { + "epoch": 0.9346814451307217, + "grad_norm": 0.1549409031867981, + "learning_rate": 0.001, + "loss": 2.5355, + "step": 22094 + }, + { + "epoch": 0.9347237498942381, + "grad_norm": 0.15506504476070404, + "learning_rate": 0.001, + "loss": 2.0499, + "step": 22095 + }, + { + "epoch": 0.9347660546577544, + "grad_norm": 0.16985729336738586, + "learning_rate": 0.001, + "loss": 2.1923, + "step": 22096 + }, + { + "epoch": 0.9348083594212708, + "grad_norm": 0.15692901611328125, + "learning_rate": 0.001, + "loss": 1.603, + "step": 22097 + }, + { + "epoch": 0.9348506641847872, + "grad_norm": 0.17224203050136566, + "learning_rate": 0.001, + "loss": 2.0071, + "step": 22098 + }, + { + "epoch": 0.9348929689483035, + "grad_norm": 1.8327698707580566, + "learning_rate": 0.001, + "loss": 1.5114, + "step": 22099 + }, + { + "epoch": 0.93493527371182, + "grad_norm": 0.16607293486595154, + "learning_rate": 0.001, + "loss": 1.9993, + "step": 22100 + }, + { + "epoch": 0.9349775784753364, + "grad_norm": 1.2528990507125854, + "learning_rate": 0.001, + "loss": 2.7158, + "step": 22101 + }, + { + "epoch": 0.9350198832388527, + "grad_norm": 1.1872773170471191, + "learning_rate": 0.001, + "loss": 2.4694, + "step": 22102 + }, + { + "epoch": 0.9350621880023691, + "grad_norm": 0.6562841534614563, + "learning_rate": 0.001, + "loss": 2.5087, + "step": 22103 + }, + { + "epoch": 0.9351044927658855, + "grad_norm": 0.16749240458011627, + "learning_rate": 0.001, + "loss": 2.0348, + "step": 22104 + }, + { + "epoch": 0.9351467975294018, + "grad_norm": 0.7875110507011414, + "learning_rate": 0.001, + "loss": 2.6865, + "step": 22105 + }, + { + "epoch": 0.9351891022929182, + "grad_norm": 0.15838965773582458, + "learning_rate": 0.001, + "loss": 2.4356, + "step": 22106 + }, + { + "epoch": 0.9352314070564346, + "grad_norm": 0.23913998901844025, + "learning_rate": 0.001, + "loss": 2.4341, + "step": 22107 + }, + { + "epoch": 0.9352737118199509, + "grad_norm": 0.19448761641979218, + "learning_rate": 0.001, + "loss": 2.0626, + "step": 22108 + }, + { + "epoch": 0.9353160165834673, + "grad_norm": 0.16222184896469116, + "learning_rate": 0.001, + "loss": 1.2972, + "step": 22109 + }, + { + "epoch": 0.9353583213469837, + "grad_norm": 0.17358072102069855, + "learning_rate": 0.001, + "loss": 2.4516, + "step": 22110 + }, + { + "epoch": 0.9354006261105, + "grad_norm": 0.22711333632469177, + "learning_rate": 0.001, + "loss": 2.397, + "step": 22111 + }, + { + "epoch": 0.9354429308740164, + "grad_norm": 0.151783287525177, + "learning_rate": 0.001, + "loss": 2.1857, + "step": 22112 + }, + { + "epoch": 0.9354852356375328, + "grad_norm": 0.7333983182907104, + "learning_rate": 0.001, + "loss": 1.6396, + "step": 22113 + }, + { + "epoch": 0.9355275404010491, + "grad_norm": 0.5164382457733154, + "learning_rate": 0.001, + "loss": 1.9616, + "step": 22114 + }, + { + "epoch": 0.9355698451645655, + "grad_norm": 0.21135050058364868, + "learning_rate": 0.001, + "loss": 1.9535, + "step": 22115 + }, + { + "epoch": 0.935612149928082, + "grad_norm": 0.19637274742126465, + "learning_rate": 0.001, + "loss": 2.3697, + "step": 22116 + }, + { + "epoch": 0.9356544546915982, + "grad_norm": 0.2197602093219757, + "learning_rate": 0.001, + "loss": 2.5342, + "step": 22117 + }, + { + "epoch": 0.9356967594551147, + "grad_norm": 11.109819412231445, + "learning_rate": 0.001, + "loss": 1.8034, + "step": 22118 + }, + { + "epoch": 0.9357390642186311, + "grad_norm": 0.5363801121711731, + "learning_rate": 0.001, + "loss": 1.6609, + "step": 22119 + }, + { + "epoch": 0.9357813689821474, + "grad_norm": 0.7339990735054016, + "learning_rate": 0.001, + "loss": 1.8741, + "step": 22120 + }, + { + "epoch": 0.9358236737456638, + "grad_norm": 0.1876104772090912, + "learning_rate": 0.001, + "loss": 1.6976, + "step": 22121 + }, + { + "epoch": 0.9358659785091802, + "grad_norm": 6.681327819824219, + "learning_rate": 0.001, + "loss": 2.7797, + "step": 22122 + }, + { + "epoch": 0.9359082832726965, + "grad_norm": 0.15576091408729553, + "learning_rate": 0.001, + "loss": 1.8506, + "step": 22123 + }, + { + "epoch": 0.9359505880362129, + "grad_norm": 0.15668316185474396, + "learning_rate": 0.001, + "loss": 1.701, + "step": 22124 + }, + { + "epoch": 0.9359928927997293, + "grad_norm": 0.19054444134235382, + "learning_rate": 0.001, + "loss": 1.8375, + "step": 22125 + }, + { + "epoch": 0.9360351975632456, + "grad_norm": 0.14556585252285004, + "learning_rate": 0.001, + "loss": 1.7359, + "step": 22126 + }, + { + "epoch": 0.936077502326762, + "grad_norm": 1.7815239429473877, + "learning_rate": 0.001, + "loss": 2.0106, + "step": 22127 + }, + { + "epoch": 0.9361198070902783, + "grad_norm": 0.13777472078800201, + "learning_rate": 0.001, + "loss": 2.0197, + "step": 22128 + }, + { + "epoch": 0.9361621118537947, + "grad_norm": 0.1779537945985794, + "learning_rate": 0.001, + "loss": 2.0832, + "step": 22129 + }, + { + "epoch": 0.9362044166173111, + "grad_norm": 0.1431722640991211, + "learning_rate": 0.001, + "loss": 2.2073, + "step": 22130 + }, + { + "epoch": 0.9362467213808274, + "grad_norm": 0.5513030290603638, + "learning_rate": 0.001, + "loss": 1.8146, + "step": 22131 + }, + { + "epoch": 0.9362890261443438, + "grad_norm": 0.14709900319576263, + "learning_rate": 0.001, + "loss": 2.2721, + "step": 22132 + }, + { + "epoch": 0.9363313309078602, + "grad_norm": 0.13920997083187103, + "learning_rate": 0.001, + "loss": 1.7674, + "step": 22133 + }, + { + "epoch": 0.9363736356713765, + "grad_norm": 0.13396479189395905, + "learning_rate": 0.001, + "loss": 1.5546, + "step": 22134 + }, + { + "epoch": 0.936415940434893, + "grad_norm": 0.14524602890014648, + "learning_rate": 0.001, + "loss": 1.9367, + "step": 22135 + }, + { + "epoch": 0.9364582451984094, + "grad_norm": 0.14268428087234497, + "learning_rate": 0.001, + "loss": 1.7628, + "step": 22136 + }, + { + "epoch": 0.9365005499619257, + "grad_norm": 0.16720207035541534, + "learning_rate": 0.001, + "loss": 2.0936, + "step": 22137 + }, + { + "epoch": 0.9365428547254421, + "grad_norm": 0.1389460563659668, + "learning_rate": 0.001, + "loss": 1.5024, + "step": 22138 + }, + { + "epoch": 0.9365851594889585, + "grad_norm": 0.1430455595254898, + "learning_rate": 0.001, + "loss": 3.3252, + "step": 22139 + }, + { + "epoch": 0.9366274642524748, + "grad_norm": 0.2216232270002365, + "learning_rate": 0.001, + "loss": 1.4568, + "step": 22140 + }, + { + "epoch": 0.9366697690159912, + "grad_norm": 0.1489740014076233, + "learning_rate": 0.001, + "loss": 2.1509, + "step": 22141 + }, + { + "epoch": 0.9367120737795076, + "grad_norm": 0.16746602952480316, + "learning_rate": 0.001, + "loss": 1.7385, + "step": 22142 + }, + { + "epoch": 0.9367543785430239, + "grad_norm": 1.196807861328125, + "learning_rate": 0.001, + "loss": 2.4886, + "step": 22143 + }, + { + "epoch": 0.9367966833065403, + "grad_norm": 0.15129493176937103, + "learning_rate": 0.001, + "loss": 2.6034, + "step": 22144 + }, + { + "epoch": 0.9368389880700567, + "grad_norm": 0.17672112584114075, + "learning_rate": 0.001, + "loss": 3.1263, + "step": 22145 + }, + { + "epoch": 0.936881292833573, + "grad_norm": 0.16527965664863586, + "learning_rate": 0.001, + "loss": 1.997, + "step": 22146 + }, + { + "epoch": 0.9369235975970894, + "grad_norm": 0.8040674328804016, + "learning_rate": 0.001, + "loss": 2.0254, + "step": 22147 + }, + { + "epoch": 0.9369659023606058, + "grad_norm": 0.1395406723022461, + "learning_rate": 0.001, + "loss": 2.0166, + "step": 22148 + }, + { + "epoch": 0.9370082071241221, + "grad_norm": 0.14435064792633057, + "learning_rate": 0.001, + "loss": 1.8352, + "step": 22149 + }, + { + "epoch": 0.9370505118876385, + "grad_norm": 0.14937026798725128, + "learning_rate": 0.001, + "loss": 4.2509, + "step": 22150 + }, + { + "epoch": 0.937092816651155, + "grad_norm": 0.22345872223377228, + "learning_rate": 0.001, + "loss": 2.0077, + "step": 22151 + }, + { + "epoch": 0.9371351214146713, + "grad_norm": 0.14060348272323608, + "learning_rate": 0.001, + "loss": 1.376, + "step": 22152 + }, + { + "epoch": 0.9371774261781877, + "grad_norm": 0.3378833532333374, + "learning_rate": 0.001, + "loss": 2.4393, + "step": 22153 + }, + { + "epoch": 0.9372197309417041, + "grad_norm": 0.16241557896137238, + "learning_rate": 0.001, + "loss": 2.1142, + "step": 22154 + }, + { + "epoch": 0.9372620357052204, + "grad_norm": 0.15988023579120636, + "learning_rate": 0.001, + "loss": 1.9314, + "step": 22155 + }, + { + "epoch": 0.9373043404687368, + "grad_norm": 0.15488198399543762, + "learning_rate": 0.001, + "loss": 1.4996, + "step": 22156 + }, + { + "epoch": 0.9373466452322532, + "grad_norm": 0.157693013548851, + "learning_rate": 0.001, + "loss": 2.2934, + "step": 22157 + }, + { + "epoch": 0.9373889499957695, + "grad_norm": 0.16368556022644043, + "learning_rate": 0.001, + "loss": 1.5275, + "step": 22158 + }, + { + "epoch": 0.9374312547592859, + "grad_norm": 0.20545239746570587, + "learning_rate": 0.001, + "loss": 1.5569, + "step": 22159 + }, + { + "epoch": 0.9374735595228023, + "grad_norm": 0.2001711130142212, + "learning_rate": 0.001, + "loss": 2.5836, + "step": 22160 + }, + { + "epoch": 0.9375158642863186, + "grad_norm": 0.3313537538051605, + "learning_rate": 0.001, + "loss": 3.5361, + "step": 22161 + }, + { + "epoch": 0.937558169049835, + "grad_norm": 0.1719702184200287, + "learning_rate": 0.001, + "loss": 1.6831, + "step": 22162 + }, + { + "epoch": 0.9376004738133514, + "grad_norm": 0.1592639684677124, + "learning_rate": 0.001, + "loss": 1.5466, + "step": 22163 + }, + { + "epoch": 0.9376427785768677, + "grad_norm": 0.13352732360363007, + "learning_rate": 0.001, + "loss": 1.5322, + "step": 22164 + }, + { + "epoch": 0.9376850833403841, + "grad_norm": 0.18848414719104767, + "learning_rate": 0.001, + "loss": 2.1188, + "step": 22165 + }, + { + "epoch": 0.9377273881039005, + "grad_norm": 0.14446255564689636, + "learning_rate": 0.001, + "loss": 2.4954, + "step": 22166 + }, + { + "epoch": 0.9377696928674168, + "grad_norm": 0.1566217541694641, + "learning_rate": 0.001, + "loss": 1.6525, + "step": 22167 + }, + { + "epoch": 0.9378119976309333, + "grad_norm": 0.15236781537532806, + "learning_rate": 0.001, + "loss": 1.9077, + "step": 22168 + }, + { + "epoch": 0.9378543023944497, + "grad_norm": 0.4224134385585785, + "learning_rate": 0.001, + "loss": 2.2906, + "step": 22169 + }, + { + "epoch": 0.937896607157966, + "grad_norm": 0.6217629313468933, + "learning_rate": 0.001, + "loss": 2.5966, + "step": 22170 + }, + { + "epoch": 0.9379389119214824, + "grad_norm": 0.17094938457012177, + "learning_rate": 0.001, + "loss": 2.0306, + "step": 22171 + }, + { + "epoch": 0.9379812166849987, + "grad_norm": 0.18009664118289948, + "learning_rate": 0.001, + "loss": 1.8595, + "step": 22172 + }, + { + "epoch": 0.9380235214485151, + "grad_norm": 0.1493372768163681, + "learning_rate": 0.001, + "loss": 1.5855, + "step": 22173 + }, + { + "epoch": 0.9380658262120315, + "grad_norm": 0.13836872577667236, + "learning_rate": 0.001, + "loss": 2.7204, + "step": 22174 + }, + { + "epoch": 0.9381081309755478, + "grad_norm": 0.15783464908599854, + "learning_rate": 0.001, + "loss": 2.2411, + "step": 22175 + }, + { + "epoch": 0.9381504357390642, + "grad_norm": 0.2531960606575012, + "learning_rate": 0.001, + "loss": 2.9065, + "step": 22176 + }, + { + "epoch": 0.9381927405025806, + "grad_norm": 0.18750788271427155, + "learning_rate": 0.001, + "loss": 2.6953, + "step": 22177 + }, + { + "epoch": 0.9382350452660969, + "grad_norm": 0.1514870524406433, + "learning_rate": 0.001, + "loss": 2.5924, + "step": 22178 + }, + { + "epoch": 0.9382773500296133, + "grad_norm": 0.1663045436143875, + "learning_rate": 0.001, + "loss": 3.2424, + "step": 22179 + }, + { + "epoch": 0.9383196547931297, + "grad_norm": 0.12670683860778809, + "learning_rate": 0.001, + "loss": 1.8389, + "step": 22180 + }, + { + "epoch": 0.938361959556646, + "grad_norm": 0.16413888335227966, + "learning_rate": 0.001, + "loss": 1.6121, + "step": 22181 + }, + { + "epoch": 0.9384042643201624, + "grad_norm": 0.1846090853214264, + "learning_rate": 0.001, + "loss": 2.6834, + "step": 22182 + }, + { + "epoch": 0.9384465690836788, + "grad_norm": 0.17222410440444946, + "learning_rate": 0.001, + "loss": 1.5327, + "step": 22183 + }, + { + "epoch": 0.9384888738471951, + "grad_norm": 0.18798105418682098, + "learning_rate": 0.001, + "loss": 2.3528, + "step": 22184 + }, + { + "epoch": 0.9385311786107116, + "grad_norm": 0.12880779802799225, + "learning_rate": 0.001, + "loss": 3.7122, + "step": 22185 + }, + { + "epoch": 0.938573483374228, + "grad_norm": 0.15221832692623138, + "learning_rate": 0.001, + "loss": 2.1376, + "step": 22186 + }, + { + "epoch": 0.9386157881377443, + "grad_norm": 0.2237653285264969, + "learning_rate": 0.001, + "loss": 3.0669, + "step": 22187 + }, + { + "epoch": 0.9386580929012607, + "grad_norm": 0.17113400995731354, + "learning_rate": 0.001, + "loss": 1.6939, + "step": 22188 + }, + { + "epoch": 0.9387003976647771, + "grad_norm": 0.1556374728679657, + "learning_rate": 0.001, + "loss": 2.1034, + "step": 22189 + }, + { + "epoch": 0.9387427024282934, + "grad_norm": 0.14844928681850433, + "learning_rate": 0.001, + "loss": 1.7177, + "step": 22190 + }, + { + "epoch": 0.9387850071918098, + "grad_norm": 0.7886921763420105, + "learning_rate": 0.001, + "loss": 2.4152, + "step": 22191 + }, + { + "epoch": 0.9388273119553262, + "grad_norm": 0.17194926738739014, + "learning_rate": 0.001, + "loss": 2.1804, + "step": 22192 + }, + { + "epoch": 0.9388696167188425, + "grad_norm": 0.17443875968456268, + "learning_rate": 0.001, + "loss": 1.7619, + "step": 22193 + }, + { + "epoch": 0.9389119214823589, + "grad_norm": 0.17937207221984863, + "learning_rate": 0.001, + "loss": 1.8103, + "step": 22194 + }, + { + "epoch": 0.9389542262458753, + "grad_norm": 0.16381466388702393, + "learning_rate": 0.001, + "loss": 2.595, + "step": 22195 + }, + { + "epoch": 0.9389965310093916, + "grad_norm": 0.15668261051177979, + "learning_rate": 0.001, + "loss": 2.379, + "step": 22196 + }, + { + "epoch": 0.939038835772908, + "grad_norm": 0.1664295494556427, + "learning_rate": 0.001, + "loss": 1.8999, + "step": 22197 + }, + { + "epoch": 0.9390811405364244, + "grad_norm": 0.20704372227191925, + "learning_rate": 0.001, + "loss": 2.2593, + "step": 22198 + }, + { + "epoch": 0.9391234452999407, + "grad_norm": 0.17788812518119812, + "learning_rate": 0.001, + "loss": 2.0697, + "step": 22199 + }, + { + "epoch": 0.9391657500634571, + "grad_norm": 0.20091095566749573, + "learning_rate": 0.001, + "loss": 2.408, + "step": 22200 + }, + { + "epoch": 0.9392080548269736, + "grad_norm": 0.16119319200515747, + "learning_rate": 0.001, + "loss": 2.2376, + "step": 22201 + }, + { + "epoch": 0.9392503595904899, + "grad_norm": 0.16883371770381927, + "learning_rate": 0.001, + "loss": 1.7975, + "step": 22202 + }, + { + "epoch": 0.9392926643540063, + "grad_norm": 0.15238231420516968, + "learning_rate": 0.001, + "loss": 2.2641, + "step": 22203 + }, + { + "epoch": 0.9393349691175227, + "grad_norm": 0.14293359220027924, + "learning_rate": 0.001, + "loss": 1.9112, + "step": 22204 + }, + { + "epoch": 0.939377273881039, + "grad_norm": 0.41308876872062683, + "learning_rate": 0.001, + "loss": 2.4264, + "step": 22205 + }, + { + "epoch": 0.9394195786445554, + "grad_norm": 0.1429714858531952, + "learning_rate": 0.001, + "loss": 2.8803, + "step": 22206 + }, + { + "epoch": 0.9394618834080718, + "grad_norm": 0.16283197700977325, + "learning_rate": 0.001, + "loss": 1.6608, + "step": 22207 + }, + { + "epoch": 0.9395041881715881, + "grad_norm": 0.18357500433921814, + "learning_rate": 0.001, + "loss": 2.3365, + "step": 22208 + }, + { + "epoch": 0.9395464929351045, + "grad_norm": 0.1459013670682907, + "learning_rate": 0.001, + "loss": 1.9035, + "step": 22209 + }, + { + "epoch": 0.9395887976986209, + "grad_norm": 0.13705354928970337, + "learning_rate": 0.001, + "loss": 2.3804, + "step": 22210 + }, + { + "epoch": 0.9396311024621372, + "grad_norm": 0.15534013509750366, + "learning_rate": 0.001, + "loss": 2.1509, + "step": 22211 + }, + { + "epoch": 0.9396734072256536, + "grad_norm": 0.19198080897331238, + "learning_rate": 0.001, + "loss": 1.5588, + "step": 22212 + }, + { + "epoch": 0.93971571198917, + "grad_norm": 0.14550134539604187, + "learning_rate": 0.001, + "loss": 1.6061, + "step": 22213 + }, + { + "epoch": 0.9397580167526863, + "grad_norm": 0.1513293832540512, + "learning_rate": 0.001, + "loss": 2.114, + "step": 22214 + }, + { + "epoch": 0.9398003215162027, + "grad_norm": 6.008566379547119, + "learning_rate": 0.001, + "loss": 1.6516, + "step": 22215 + }, + { + "epoch": 0.9398426262797192, + "grad_norm": 0.14722181856632233, + "learning_rate": 0.001, + "loss": 1.2652, + "step": 22216 + }, + { + "epoch": 0.9398849310432354, + "grad_norm": 0.16536396741867065, + "learning_rate": 0.001, + "loss": 1.9738, + "step": 22217 + }, + { + "epoch": 0.9399272358067519, + "grad_norm": 0.1685878336429596, + "learning_rate": 0.001, + "loss": 1.5726, + "step": 22218 + }, + { + "epoch": 0.9399695405702682, + "grad_norm": 0.19034594297409058, + "learning_rate": 0.001, + "loss": 2.2402, + "step": 22219 + }, + { + "epoch": 0.9400118453337846, + "grad_norm": 0.16235460340976715, + "learning_rate": 0.001, + "loss": 2.2383, + "step": 22220 + }, + { + "epoch": 0.940054150097301, + "grad_norm": 0.1350528448820114, + "learning_rate": 0.001, + "loss": 3.5003, + "step": 22221 + }, + { + "epoch": 0.9400964548608173, + "grad_norm": 0.5780348181724548, + "learning_rate": 0.001, + "loss": 3.6418, + "step": 22222 + }, + { + "epoch": 0.9401387596243337, + "grad_norm": 0.9433380365371704, + "learning_rate": 0.001, + "loss": 2.0663, + "step": 22223 + }, + { + "epoch": 0.9401810643878501, + "grad_norm": 1.4721174240112305, + "learning_rate": 0.001, + "loss": 2.5166, + "step": 22224 + }, + { + "epoch": 0.9402233691513664, + "grad_norm": 0.18319587409496307, + "learning_rate": 0.001, + "loss": 2.239, + "step": 22225 + }, + { + "epoch": 0.9402656739148828, + "grad_norm": 0.14535488188266754, + "learning_rate": 0.001, + "loss": 2.5118, + "step": 22226 + }, + { + "epoch": 0.9403079786783992, + "grad_norm": 0.1889243721961975, + "learning_rate": 0.001, + "loss": 2.082, + "step": 22227 + }, + { + "epoch": 0.9403502834419155, + "grad_norm": 0.15350744128227234, + "learning_rate": 0.001, + "loss": 2.3159, + "step": 22228 + }, + { + "epoch": 0.9403925882054319, + "grad_norm": 0.9224516749382019, + "learning_rate": 0.001, + "loss": 1.3976, + "step": 22229 + }, + { + "epoch": 0.9404348929689483, + "grad_norm": 0.132807195186615, + "learning_rate": 0.001, + "loss": 2.8195, + "step": 22230 + }, + { + "epoch": 0.9404771977324646, + "grad_norm": 0.17904257774353027, + "learning_rate": 0.001, + "loss": 2.3766, + "step": 22231 + }, + { + "epoch": 0.940519502495981, + "grad_norm": 0.15651574730873108, + "learning_rate": 0.001, + "loss": 1.5067, + "step": 22232 + }, + { + "epoch": 0.9405618072594975, + "grad_norm": 0.1492500901222229, + "learning_rate": 0.001, + "loss": 1.3076, + "step": 22233 + }, + { + "epoch": 0.9406041120230137, + "grad_norm": 0.177638441324234, + "learning_rate": 0.001, + "loss": 2.5056, + "step": 22234 + }, + { + "epoch": 0.9406464167865302, + "grad_norm": 0.1903133988380432, + "learning_rate": 0.001, + "loss": 2.0664, + "step": 22235 + }, + { + "epoch": 0.9406887215500466, + "grad_norm": 0.2579449415206909, + "learning_rate": 0.001, + "loss": 1.6951, + "step": 22236 + }, + { + "epoch": 0.9407310263135629, + "grad_norm": 0.19664129614830017, + "learning_rate": 0.001, + "loss": 2.2619, + "step": 22237 + }, + { + "epoch": 0.9407733310770793, + "grad_norm": 0.1550266593694687, + "learning_rate": 0.001, + "loss": 1.8561, + "step": 22238 + }, + { + "epoch": 0.9408156358405957, + "grad_norm": 0.3928470015525818, + "learning_rate": 0.001, + "loss": 2.0354, + "step": 22239 + }, + { + "epoch": 0.940857940604112, + "grad_norm": 0.16893982887268066, + "learning_rate": 0.001, + "loss": 2.6994, + "step": 22240 + }, + { + "epoch": 0.9409002453676284, + "grad_norm": 0.16361390054225922, + "learning_rate": 0.001, + "loss": 2.2244, + "step": 22241 + }, + { + "epoch": 0.9409425501311448, + "grad_norm": 0.14021305739879608, + "learning_rate": 0.001, + "loss": 2.2345, + "step": 22242 + }, + { + "epoch": 0.9409848548946611, + "grad_norm": 0.1266973614692688, + "learning_rate": 0.001, + "loss": 1.6334, + "step": 22243 + }, + { + "epoch": 0.9410271596581775, + "grad_norm": 0.16998490691184998, + "learning_rate": 0.001, + "loss": 1.9222, + "step": 22244 + }, + { + "epoch": 0.9410694644216939, + "grad_norm": 0.17645969986915588, + "learning_rate": 0.001, + "loss": 2.8088, + "step": 22245 + }, + { + "epoch": 0.9411117691852102, + "grad_norm": 0.14849461615085602, + "learning_rate": 0.001, + "loss": 1.4422, + "step": 22246 + }, + { + "epoch": 0.9411540739487266, + "grad_norm": 0.12691397964954376, + "learning_rate": 0.001, + "loss": 1.4996, + "step": 22247 + }, + { + "epoch": 0.941196378712243, + "grad_norm": 0.13772864639759064, + "learning_rate": 0.001, + "loss": 2.2293, + "step": 22248 + }, + { + "epoch": 0.9412386834757593, + "grad_norm": 0.15115776658058167, + "learning_rate": 0.001, + "loss": 1.4696, + "step": 22249 + }, + { + "epoch": 0.9412809882392758, + "grad_norm": 0.14529401063919067, + "learning_rate": 0.001, + "loss": 2.4046, + "step": 22250 + }, + { + "epoch": 0.9413232930027922, + "grad_norm": 0.14119921624660492, + "learning_rate": 0.001, + "loss": 1.5726, + "step": 22251 + }, + { + "epoch": 0.9413655977663085, + "grad_norm": 0.31882578134536743, + "learning_rate": 0.001, + "loss": 2.2414, + "step": 22252 + }, + { + "epoch": 0.9414079025298249, + "grad_norm": 0.20864500105381012, + "learning_rate": 0.001, + "loss": 2.1722, + "step": 22253 + }, + { + "epoch": 0.9414502072933413, + "grad_norm": 0.15085260570049286, + "learning_rate": 0.001, + "loss": 1.5922, + "step": 22254 + }, + { + "epoch": 0.9414925120568576, + "grad_norm": 0.15553170442581177, + "learning_rate": 0.001, + "loss": 1.7225, + "step": 22255 + }, + { + "epoch": 0.941534816820374, + "grad_norm": 41.11113739013672, + "learning_rate": 0.001, + "loss": 1.9813, + "step": 22256 + }, + { + "epoch": 0.9415771215838904, + "grad_norm": 0.1472524106502533, + "learning_rate": 0.001, + "loss": 1.5666, + "step": 22257 + }, + { + "epoch": 0.9416194263474067, + "grad_norm": 0.15531371533870697, + "learning_rate": 0.001, + "loss": 2.4356, + "step": 22258 + }, + { + "epoch": 0.9416617311109231, + "grad_norm": 0.158295139670372, + "learning_rate": 0.001, + "loss": 1.8226, + "step": 22259 + }, + { + "epoch": 0.9417040358744395, + "grad_norm": 0.1623242199420929, + "learning_rate": 0.001, + "loss": 2.2772, + "step": 22260 + }, + { + "epoch": 0.9417463406379558, + "grad_norm": 0.14404623210430145, + "learning_rate": 0.001, + "loss": 2.1139, + "step": 22261 + }, + { + "epoch": 0.9417886454014722, + "grad_norm": 0.15782445669174194, + "learning_rate": 0.001, + "loss": 1.8102, + "step": 22262 + }, + { + "epoch": 0.9418309501649885, + "grad_norm": 0.15719851851463318, + "learning_rate": 0.001, + "loss": 1.3974, + "step": 22263 + }, + { + "epoch": 0.9418732549285049, + "grad_norm": 0.17760580778121948, + "learning_rate": 0.001, + "loss": 1.7701, + "step": 22264 + }, + { + "epoch": 0.9419155596920213, + "grad_norm": 0.1664871722459793, + "learning_rate": 0.001, + "loss": 2.3184, + "step": 22265 + }, + { + "epoch": 0.9419578644555376, + "grad_norm": 0.47373244166374207, + "learning_rate": 0.001, + "loss": 2.0407, + "step": 22266 + }, + { + "epoch": 0.942000169219054, + "grad_norm": 0.5430980324745178, + "learning_rate": 0.001, + "loss": 1.881, + "step": 22267 + }, + { + "epoch": 0.9420424739825705, + "grad_norm": 0.17633113265037537, + "learning_rate": 0.001, + "loss": 1.8635, + "step": 22268 + }, + { + "epoch": 0.9420847787460868, + "grad_norm": 0.1332206279039383, + "learning_rate": 0.001, + "loss": 2.9, + "step": 22269 + }, + { + "epoch": 0.9421270835096032, + "grad_norm": 0.17408061027526855, + "learning_rate": 0.001, + "loss": 2.2226, + "step": 22270 + }, + { + "epoch": 0.9421693882731196, + "grad_norm": 0.19134634733200073, + "learning_rate": 0.001, + "loss": 3.0214, + "step": 22271 + }, + { + "epoch": 0.9422116930366359, + "grad_norm": 0.16139277815818787, + "learning_rate": 0.001, + "loss": 3.3642, + "step": 22272 + }, + { + "epoch": 0.9422539978001523, + "grad_norm": 0.14892078936100006, + "learning_rate": 0.001, + "loss": 2.0493, + "step": 22273 + }, + { + "epoch": 0.9422963025636687, + "grad_norm": 0.18064714968204498, + "learning_rate": 0.001, + "loss": 1.9058, + "step": 22274 + }, + { + "epoch": 0.942338607327185, + "grad_norm": 4.666594505310059, + "learning_rate": 0.001, + "loss": 1.9391, + "step": 22275 + }, + { + "epoch": 0.9423809120907014, + "grad_norm": 0.16249027848243713, + "learning_rate": 0.001, + "loss": 2.5574, + "step": 22276 + }, + { + "epoch": 0.9424232168542178, + "grad_norm": 0.14459872245788574, + "learning_rate": 0.001, + "loss": 1.8483, + "step": 22277 + }, + { + "epoch": 0.9424655216177341, + "grad_norm": 0.1476597934961319, + "learning_rate": 0.001, + "loss": 1.9867, + "step": 22278 + }, + { + "epoch": 0.9425078263812505, + "grad_norm": 0.4886634945869446, + "learning_rate": 0.001, + "loss": 2.0321, + "step": 22279 + }, + { + "epoch": 0.9425501311447669, + "grad_norm": 0.23067691922187805, + "learning_rate": 0.001, + "loss": 2.1118, + "step": 22280 + }, + { + "epoch": 0.9425924359082832, + "grad_norm": 0.14237605035305023, + "learning_rate": 0.001, + "loss": 1.9353, + "step": 22281 + }, + { + "epoch": 0.9426347406717996, + "grad_norm": 0.1730206310749054, + "learning_rate": 0.001, + "loss": 2.1826, + "step": 22282 + }, + { + "epoch": 0.942677045435316, + "grad_norm": 0.14405101537704468, + "learning_rate": 0.001, + "loss": 3.3043, + "step": 22283 + }, + { + "epoch": 0.9427193501988324, + "grad_norm": 0.2477824091911316, + "learning_rate": 0.001, + "loss": 3.2702, + "step": 22284 + }, + { + "epoch": 0.9427616549623488, + "grad_norm": 0.12695826590061188, + "learning_rate": 0.001, + "loss": 2.3018, + "step": 22285 + }, + { + "epoch": 0.9428039597258652, + "grad_norm": 2.1101129055023193, + "learning_rate": 0.001, + "loss": 2.0449, + "step": 22286 + }, + { + "epoch": 0.9428462644893815, + "grad_norm": 0.13695190846920013, + "learning_rate": 0.001, + "loss": 2.5038, + "step": 22287 + }, + { + "epoch": 0.9428885692528979, + "grad_norm": 0.1824672371149063, + "learning_rate": 0.001, + "loss": 2.4115, + "step": 22288 + }, + { + "epoch": 0.9429308740164143, + "grad_norm": 0.1646248996257782, + "learning_rate": 0.001, + "loss": 4.0476, + "step": 22289 + }, + { + "epoch": 0.9429731787799306, + "grad_norm": 0.13337703049182892, + "learning_rate": 0.001, + "loss": 2.0791, + "step": 22290 + }, + { + "epoch": 0.943015483543447, + "grad_norm": 0.13796573877334595, + "learning_rate": 0.001, + "loss": 1.7621, + "step": 22291 + }, + { + "epoch": 0.9430577883069634, + "grad_norm": 0.16016069054603577, + "learning_rate": 0.001, + "loss": 1.8191, + "step": 22292 + }, + { + "epoch": 0.9431000930704797, + "grad_norm": 0.15696018934249878, + "learning_rate": 0.001, + "loss": 1.7768, + "step": 22293 + }, + { + "epoch": 0.9431423978339961, + "grad_norm": 0.2677614092826843, + "learning_rate": 0.001, + "loss": 1.8438, + "step": 22294 + }, + { + "epoch": 0.9431847025975125, + "grad_norm": 0.17819730937480927, + "learning_rate": 0.001, + "loss": 1.4621, + "step": 22295 + }, + { + "epoch": 0.9432270073610288, + "grad_norm": 0.9130420088768005, + "learning_rate": 0.001, + "loss": 3.1412, + "step": 22296 + }, + { + "epoch": 0.9432693121245452, + "grad_norm": 0.1402437835931778, + "learning_rate": 0.001, + "loss": 2.0391, + "step": 22297 + }, + { + "epoch": 0.9433116168880616, + "grad_norm": 1.1969804763793945, + "learning_rate": 0.001, + "loss": 2.7918, + "step": 22298 + }, + { + "epoch": 0.9433539216515779, + "grad_norm": 0.7922569513320923, + "learning_rate": 0.001, + "loss": 1.9959, + "step": 22299 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 0.17550984025001526, + "learning_rate": 0.001, + "loss": 1.9096, + "step": 22300 + }, + { + "epoch": 0.9434385311786108, + "grad_norm": 0.14983303844928741, + "learning_rate": 0.001, + "loss": 1.9521, + "step": 22301 + }, + { + "epoch": 0.9434808359421271, + "grad_norm": 0.14537028968334198, + "learning_rate": 0.001, + "loss": 2.2255, + "step": 22302 + }, + { + "epoch": 0.9435231407056435, + "grad_norm": 0.18867170810699463, + "learning_rate": 0.001, + "loss": 1.7019, + "step": 22303 + }, + { + "epoch": 0.9435654454691599, + "grad_norm": 0.183610200881958, + "learning_rate": 0.001, + "loss": 1.4077, + "step": 22304 + }, + { + "epoch": 0.9436077502326762, + "grad_norm": 0.15308107435703278, + "learning_rate": 0.001, + "loss": 1.8193, + "step": 22305 + }, + { + "epoch": 0.9436500549961926, + "grad_norm": 0.15715880692005157, + "learning_rate": 0.001, + "loss": 1.7452, + "step": 22306 + }, + { + "epoch": 0.943692359759709, + "grad_norm": 0.15837711095809937, + "learning_rate": 0.001, + "loss": 1.8037, + "step": 22307 + }, + { + "epoch": 0.9437346645232253, + "grad_norm": 0.1577230989933014, + "learning_rate": 0.001, + "loss": 1.6906, + "step": 22308 + }, + { + "epoch": 0.9437769692867417, + "grad_norm": 0.1631125658750534, + "learning_rate": 0.001, + "loss": 1.8791, + "step": 22309 + }, + { + "epoch": 0.943819274050258, + "grad_norm": 0.15031826496124268, + "learning_rate": 0.001, + "loss": 3.3296, + "step": 22310 + }, + { + "epoch": 0.9438615788137744, + "grad_norm": 0.15790674090385437, + "learning_rate": 0.001, + "loss": 1.7894, + "step": 22311 + }, + { + "epoch": 0.9439038835772908, + "grad_norm": 0.16160251200199127, + "learning_rate": 0.001, + "loss": 2.2725, + "step": 22312 + }, + { + "epoch": 0.9439461883408071, + "grad_norm": 0.14644403755664825, + "learning_rate": 0.001, + "loss": 2.6085, + "step": 22313 + }, + { + "epoch": 0.9439884931043235, + "grad_norm": 0.14695337414741516, + "learning_rate": 0.001, + "loss": 1.806, + "step": 22314 + }, + { + "epoch": 0.9440307978678399, + "grad_norm": 0.22415880858898163, + "learning_rate": 0.001, + "loss": 1.6828, + "step": 22315 + }, + { + "epoch": 0.9440731026313562, + "grad_norm": 0.1559433937072754, + "learning_rate": 0.001, + "loss": 2.0603, + "step": 22316 + }, + { + "epoch": 0.9441154073948727, + "grad_norm": 0.14911769330501556, + "learning_rate": 0.001, + "loss": 3.1866, + "step": 22317 + }, + { + "epoch": 0.9441577121583891, + "grad_norm": 0.17388781905174255, + "learning_rate": 0.001, + "loss": 2.1254, + "step": 22318 + }, + { + "epoch": 0.9442000169219054, + "grad_norm": 1.0141180753707886, + "learning_rate": 0.001, + "loss": 2.1553, + "step": 22319 + }, + { + "epoch": 0.9442423216854218, + "grad_norm": 0.13785704970359802, + "learning_rate": 0.001, + "loss": 1.6176, + "step": 22320 + }, + { + "epoch": 0.9442846264489382, + "grad_norm": 1.1813472509384155, + "learning_rate": 0.001, + "loss": 1.9315, + "step": 22321 + }, + { + "epoch": 0.9443269312124545, + "grad_norm": 0.15482167899608612, + "learning_rate": 0.001, + "loss": 2.5054, + "step": 22322 + }, + { + "epoch": 0.9443692359759709, + "grad_norm": 0.164631187915802, + "learning_rate": 0.001, + "loss": 2.0827, + "step": 22323 + }, + { + "epoch": 0.9444115407394873, + "grad_norm": 0.18228505551815033, + "learning_rate": 0.001, + "loss": 2.0032, + "step": 22324 + }, + { + "epoch": 0.9444538455030036, + "grad_norm": 0.2625649571418762, + "learning_rate": 0.001, + "loss": 2.6495, + "step": 22325 + }, + { + "epoch": 0.94449615026652, + "grad_norm": 0.2495313137769699, + "learning_rate": 0.001, + "loss": 1.887, + "step": 22326 + }, + { + "epoch": 0.9445384550300364, + "grad_norm": 0.16760574281215668, + "learning_rate": 0.001, + "loss": 1.9866, + "step": 22327 + }, + { + "epoch": 0.9445807597935527, + "grad_norm": 0.16984060406684875, + "learning_rate": 0.001, + "loss": 2.2137, + "step": 22328 + }, + { + "epoch": 0.9446230645570691, + "grad_norm": 0.20226795971393585, + "learning_rate": 0.001, + "loss": 2.0092, + "step": 22329 + }, + { + "epoch": 0.9446653693205855, + "grad_norm": 0.24898548424243927, + "learning_rate": 0.001, + "loss": 2.6886, + "step": 22330 + }, + { + "epoch": 0.9447076740841018, + "grad_norm": 0.40224114060401917, + "learning_rate": 0.001, + "loss": 3.2176, + "step": 22331 + }, + { + "epoch": 0.9447499788476182, + "grad_norm": 0.18034416437149048, + "learning_rate": 0.001, + "loss": 1.7549, + "step": 22332 + }, + { + "epoch": 0.9447922836111347, + "grad_norm": 0.14370644092559814, + "learning_rate": 0.001, + "loss": 1.5195, + "step": 22333 + }, + { + "epoch": 0.944834588374651, + "grad_norm": 0.7416594624519348, + "learning_rate": 0.001, + "loss": 1.7912, + "step": 22334 + }, + { + "epoch": 0.9448768931381674, + "grad_norm": 0.15781673789024353, + "learning_rate": 0.001, + "loss": 2.0478, + "step": 22335 + }, + { + "epoch": 0.9449191979016838, + "grad_norm": 0.17141476273536682, + "learning_rate": 0.001, + "loss": 2.7985, + "step": 22336 + }, + { + "epoch": 0.9449615026652001, + "grad_norm": 0.15994961559772491, + "learning_rate": 0.001, + "loss": 2.1938, + "step": 22337 + }, + { + "epoch": 0.9450038074287165, + "grad_norm": 0.3029901385307312, + "learning_rate": 0.001, + "loss": 1.5779, + "step": 22338 + }, + { + "epoch": 0.9450461121922329, + "grad_norm": 0.20497363805770874, + "learning_rate": 0.001, + "loss": 2.5866, + "step": 22339 + }, + { + "epoch": 0.9450884169557492, + "grad_norm": 0.1555112898349762, + "learning_rate": 0.001, + "loss": 3.3234, + "step": 22340 + }, + { + "epoch": 0.9451307217192656, + "grad_norm": 0.18291279673576355, + "learning_rate": 0.001, + "loss": 2.5366, + "step": 22341 + }, + { + "epoch": 0.945173026482782, + "grad_norm": 0.16539302468299866, + "learning_rate": 0.001, + "loss": 3.267, + "step": 22342 + }, + { + "epoch": 0.9452153312462983, + "grad_norm": 0.15976227819919586, + "learning_rate": 0.001, + "loss": 2.2329, + "step": 22343 + }, + { + "epoch": 0.9452576360098147, + "grad_norm": 0.262100487947464, + "learning_rate": 0.001, + "loss": 1.7827, + "step": 22344 + }, + { + "epoch": 0.9452999407733311, + "grad_norm": 1.0867878198623657, + "learning_rate": 0.001, + "loss": 2.03, + "step": 22345 + }, + { + "epoch": 0.9453422455368474, + "grad_norm": 0.16407005488872528, + "learning_rate": 0.001, + "loss": 2.5212, + "step": 22346 + }, + { + "epoch": 0.9453845503003638, + "grad_norm": 0.1369040310382843, + "learning_rate": 0.001, + "loss": 2.0789, + "step": 22347 + }, + { + "epoch": 0.9454268550638802, + "grad_norm": 0.13893716037273407, + "learning_rate": 0.001, + "loss": 1.5555, + "step": 22348 + }, + { + "epoch": 0.9454691598273965, + "grad_norm": 0.5329959392547607, + "learning_rate": 0.001, + "loss": 2.0893, + "step": 22349 + }, + { + "epoch": 0.945511464590913, + "grad_norm": 0.17351019382476807, + "learning_rate": 0.001, + "loss": 1.7663, + "step": 22350 + }, + { + "epoch": 0.9455537693544294, + "grad_norm": 0.1668737381696701, + "learning_rate": 0.001, + "loss": 2.5798, + "step": 22351 + }, + { + "epoch": 0.9455960741179457, + "grad_norm": 1.1766389608383179, + "learning_rate": 0.001, + "loss": 1.6858, + "step": 22352 + }, + { + "epoch": 0.9456383788814621, + "grad_norm": 0.19534622132778168, + "learning_rate": 0.001, + "loss": 2.2868, + "step": 22353 + }, + { + "epoch": 0.9456806836449784, + "grad_norm": 0.1639569103717804, + "learning_rate": 0.001, + "loss": 1.7299, + "step": 22354 + }, + { + "epoch": 0.9457229884084948, + "grad_norm": 0.14215198159217834, + "learning_rate": 0.001, + "loss": 2.8559, + "step": 22355 + }, + { + "epoch": 0.9457652931720112, + "grad_norm": 0.1718878597021103, + "learning_rate": 0.001, + "loss": 2.5348, + "step": 22356 + }, + { + "epoch": 0.9458075979355275, + "grad_norm": 1.5104109048843384, + "learning_rate": 0.001, + "loss": 3.4204, + "step": 22357 + }, + { + "epoch": 0.9458499026990439, + "grad_norm": 0.18013618886470795, + "learning_rate": 0.001, + "loss": 3.1525, + "step": 22358 + }, + { + "epoch": 0.9458922074625603, + "grad_norm": 0.5852957963943481, + "learning_rate": 0.001, + "loss": 1.9951, + "step": 22359 + }, + { + "epoch": 0.9459345122260766, + "grad_norm": 0.3737950921058655, + "learning_rate": 0.001, + "loss": 2.5527, + "step": 22360 + }, + { + "epoch": 0.945976816989593, + "grad_norm": 0.20486804842948914, + "learning_rate": 0.001, + "loss": 2.8936, + "step": 22361 + }, + { + "epoch": 0.9460191217531094, + "grad_norm": 0.18885180354118347, + "learning_rate": 0.001, + "loss": 1.9923, + "step": 22362 + }, + { + "epoch": 0.9460614265166257, + "grad_norm": 0.1904320865869522, + "learning_rate": 0.001, + "loss": 1.5084, + "step": 22363 + }, + { + "epoch": 0.9461037312801421, + "grad_norm": 0.1865696907043457, + "learning_rate": 0.001, + "loss": 2.2982, + "step": 22364 + }, + { + "epoch": 0.9461460360436585, + "grad_norm": 0.14152313768863678, + "learning_rate": 0.001, + "loss": 1.7908, + "step": 22365 + }, + { + "epoch": 0.9461883408071748, + "grad_norm": 0.16738182306289673, + "learning_rate": 0.001, + "loss": 2.1825, + "step": 22366 + }, + { + "epoch": 0.9462306455706913, + "grad_norm": 0.4649767577648163, + "learning_rate": 0.001, + "loss": 2.4458, + "step": 22367 + }, + { + "epoch": 0.9462729503342077, + "grad_norm": 0.1782921850681305, + "learning_rate": 0.001, + "loss": 2.2509, + "step": 22368 + }, + { + "epoch": 0.946315255097724, + "grad_norm": 0.16340084373950958, + "learning_rate": 0.001, + "loss": 2.4144, + "step": 22369 + }, + { + "epoch": 0.9463575598612404, + "grad_norm": 0.19446922838687897, + "learning_rate": 0.001, + "loss": 2.3684, + "step": 22370 + }, + { + "epoch": 0.9463998646247568, + "grad_norm": 0.20399482548236847, + "learning_rate": 0.001, + "loss": 1.9716, + "step": 22371 + }, + { + "epoch": 0.9464421693882731, + "grad_norm": 0.14847496151924133, + "learning_rate": 0.001, + "loss": 2.6068, + "step": 22372 + }, + { + "epoch": 0.9464844741517895, + "grad_norm": 52.29764175415039, + "learning_rate": 0.001, + "loss": 2.1968, + "step": 22373 + }, + { + "epoch": 0.9465267789153059, + "grad_norm": 0.18324236571788788, + "learning_rate": 0.001, + "loss": 2.1071, + "step": 22374 + }, + { + "epoch": 0.9465690836788222, + "grad_norm": 0.1397266834974289, + "learning_rate": 0.001, + "loss": 1.4608, + "step": 22375 + }, + { + "epoch": 0.9466113884423386, + "grad_norm": 0.19050876796245575, + "learning_rate": 0.001, + "loss": 1.6181, + "step": 22376 + }, + { + "epoch": 0.946653693205855, + "grad_norm": 0.17032888531684875, + "learning_rate": 0.001, + "loss": 1.9212, + "step": 22377 + }, + { + "epoch": 0.9466959979693713, + "grad_norm": 0.1688220053911209, + "learning_rate": 0.001, + "loss": 1.6201, + "step": 22378 + }, + { + "epoch": 0.9467383027328877, + "grad_norm": 0.1576201170682907, + "learning_rate": 0.001, + "loss": 2.0552, + "step": 22379 + }, + { + "epoch": 0.9467806074964041, + "grad_norm": 0.17971685528755188, + "learning_rate": 0.001, + "loss": 1.8798, + "step": 22380 + }, + { + "epoch": 0.9468229122599204, + "grad_norm": 0.21118150651454926, + "learning_rate": 0.001, + "loss": 2.0006, + "step": 22381 + }, + { + "epoch": 0.9468652170234368, + "grad_norm": 0.17267964780330658, + "learning_rate": 0.001, + "loss": 1.6616, + "step": 22382 + }, + { + "epoch": 0.9469075217869533, + "grad_norm": 0.15508294105529785, + "learning_rate": 0.001, + "loss": 1.9034, + "step": 22383 + }, + { + "epoch": 0.9469498265504696, + "grad_norm": 0.2111867219209671, + "learning_rate": 0.001, + "loss": 1.9499, + "step": 22384 + }, + { + "epoch": 0.946992131313986, + "grad_norm": 0.1746092438697815, + "learning_rate": 0.001, + "loss": 1.4817, + "step": 22385 + }, + { + "epoch": 0.9470344360775024, + "grad_norm": 0.33903783559799194, + "learning_rate": 0.001, + "loss": 1.9695, + "step": 22386 + }, + { + "epoch": 0.9470767408410187, + "grad_norm": 0.362958699464798, + "learning_rate": 0.001, + "loss": 2.04, + "step": 22387 + }, + { + "epoch": 0.9471190456045351, + "grad_norm": 0.15616007149219513, + "learning_rate": 0.001, + "loss": 1.8717, + "step": 22388 + }, + { + "epoch": 0.9471613503680515, + "grad_norm": 0.19031497836112976, + "learning_rate": 0.001, + "loss": 2.02, + "step": 22389 + }, + { + "epoch": 0.9472036551315678, + "grad_norm": 0.1764826625585556, + "learning_rate": 0.001, + "loss": 1.3835, + "step": 22390 + }, + { + "epoch": 0.9472459598950842, + "grad_norm": 0.1563934087753296, + "learning_rate": 0.001, + "loss": 2.3536, + "step": 22391 + }, + { + "epoch": 0.9472882646586006, + "grad_norm": 0.15653590857982635, + "learning_rate": 0.001, + "loss": 2.4971, + "step": 22392 + }, + { + "epoch": 0.9473305694221169, + "grad_norm": 0.1690565049648285, + "learning_rate": 0.001, + "loss": 2.161, + "step": 22393 + }, + { + "epoch": 0.9473728741856333, + "grad_norm": 0.1902676671743393, + "learning_rate": 0.001, + "loss": 1.7483, + "step": 22394 + }, + { + "epoch": 0.9474151789491497, + "grad_norm": 0.14070867002010345, + "learning_rate": 0.001, + "loss": 1.9859, + "step": 22395 + }, + { + "epoch": 0.947457483712666, + "grad_norm": 0.1544153243303299, + "learning_rate": 0.001, + "loss": 1.7372, + "step": 22396 + }, + { + "epoch": 0.9474997884761824, + "grad_norm": 0.12571722269058228, + "learning_rate": 0.001, + "loss": 2.009, + "step": 22397 + }, + { + "epoch": 0.9475420932396987, + "grad_norm": 0.18831411004066467, + "learning_rate": 0.001, + "loss": 2.0311, + "step": 22398 + }, + { + "epoch": 0.9475843980032151, + "grad_norm": 0.19460955262184143, + "learning_rate": 0.001, + "loss": 1.6814, + "step": 22399 + }, + { + "epoch": 0.9476267027667316, + "grad_norm": 0.14858324825763702, + "learning_rate": 0.001, + "loss": 1.8082, + "step": 22400 + }, + { + "epoch": 0.9476690075302479, + "grad_norm": 0.18766085803508759, + "learning_rate": 0.001, + "loss": 2.5125, + "step": 22401 + }, + { + "epoch": 0.9477113122937643, + "grad_norm": 0.15538080036640167, + "learning_rate": 0.001, + "loss": 1.6178, + "step": 22402 + }, + { + "epoch": 0.9477536170572807, + "grad_norm": 0.21831777691841125, + "learning_rate": 0.001, + "loss": 3.1002, + "step": 22403 + }, + { + "epoch": 0.947795921820797, + "grad_norm": 0.1795506328344345, + "learning_rate": 0.001, + "loss": 1.9556, + "step": 22404 + }, + { + "epoch": 0.9478382265843134, + "grad_norm": 0.12825722992420197, + "learning_rate": 0.001, + "loss": 2.2971, + "step": 22405 + }, + { + "epoch": 0.9478805313478298, + "grad_norm": 0.16179698705673218, + "learning_rate": 0.001, + "loss": 2.6207, + "step": 22406 + }, + { + "epoch": 0.9479228361113461, + "grad_norm": 0.17438729107379913, + "learning_rate": 0.001, + "loss": 1.7021, + "step": 22407 + }, + { + "epoch": 0.9479651408748625, + "grad_norm": 0.14387691020965576, + "learning_rate": 0.001, + "loss": 2.1247, + "step": 22408 + }, + { + "epoch": 0.9480074456383789, + "grad_norm": 0.14491653442382812, + "learning_rate": 0.001, + "loss": 1.5185, + "step": 22409 + }, + { + "epoch": 0.9480497504018952, + "grad_norm": 0.15343250334262848, + "learning_rate": 0.001, + "loss": 1.7086, + "step": 22410 + }, + { + "epoch": 0.9480920551654116, + "grad_norm": 0.2698756456375122, + "learning_rate": 0.001, + "loss": 2.0098, + "step": 22411 + }, + { + "epoch": 0.948134359928928, + "grad_norm": 0.12875612080097198, + "learning_rate": 0.001, + "loss": 1.5075, + "step": 22412 + }, + { + "epoch": 0.9481766646924443, + "grad_norm": 0.2064325362443924, + "learning_rate": 0.001, + "loss": 1.9919, + "step": 22413 + }, + { + "epoch": 0.9482189694559607, + "grad_norm": 0.1798713207244873, + "learning_rate": 0.001, + "loss": 2.8559, + "step": 22414 + }, + { + "epoch": 0.9482612742194771, + "grad_norm": 4.14750337600708, + "learning_rate": 0.001, + "loss": 2.7561, + "step": 22415 + }, + { + "epoch": 0.9483035789829934, + "grad_norm": 0.15084944665431976, + "learning_rate": 0.001, + "loss": 2.2551, + "step": 22416 + }, + { + "epoch": 0.9483458837465099, + "grad_norm": 0.1535675972700119, + "learning_rate": 0.001, + "loss": 3.2881, + "step": 22417 + }, + { + "epoch": 0.9483881885100263, + "grad_norm": 0.1589122712612152, + "learning_rate": 0.001, + "loss": 1.8991, + "step": 22418 + }, + { + "epoch": 0.9484304932735426, + "grad_norm": 0.15410476922988892, + "learning_rate": 0.001, + "loss": 2.1234, + "step": 22419 + }, + { + "epoch": 0.948472798037059, + "grad_norm": 0.15095588564872742, + "learning_rate": 0.001, + "loss": 1.6618, + "step": 22420 + }, + { + "epoch": 0.9485151028005754, + "grad_norm": 0.14121654629707336, + "learning_rate": 0.001, + "loss": 1.6598, + "step": 22421 + }, + { + "epoch": 0.9485574075640917, + "grad_norm": 0.1701010763645172, + "learning_rate": 0.001, + "loss": 1.8561, + "step": 22422 + }, + { + "epoch": 0.9485997123276081, + "grad_norm": 0.15959163010120392, + "learning_rate": 0.001, + "loss": 1.7565, + "step": 22423 + }, + { + "epoch": 0.9486420170911245, + "grad_norm": 0.1474931389093399, + "learning_rate": 0.001, + "loss": 2.6955, + "step": 22424 + }, + { + "epoch": 0.9486843218546408, + "grad_norm": 0.17197000980377197, + "learning_rate": 0.001, + "loss": 3.4178, + "step": 22425 + }, + { + "epoch": 0.9487266266181572, + "grad_norm": 0.27430710196495056, + "learning_rate": 0.001, + "loss": 2.2628, + "step": 22426 + }, + { + "epoch": 0.9487689313816736, + "grad_norm": 0.15919172763824463, + "learning_rate": 0.001, + "loss": 1.9106, + "step": 22427 + }, + { + "epoch": 0.9488112361451899, + "grad_norm": 11.105170249938965, + "learning_rate": 0.001, + "loss": 2.4885, + "step": 22428 + }, + { + "epoch": 0.9488535409087063, + "grad_norm": 0.18378356099128723, + "learning_rate": 0.001, + "loss": 1.9228, + "step": 22429 + }, + { + "epoch": 0.9488958456722227, + "grad_norm": 0.16684526205062866, + "learning_rate": 0.001, + "loss": 1.6823, + "step": 22430 + }, + { + "epoch": 0.948938150435739, + "grad_norm": 0.12937648594379425, + "learning_rate": 0.001, + "loss": 2.4837, + "step": 22431 + }, + { + "epoch": 0.9489804551992554, + "grad_norm": 6.135654449462891, + "learning_rate": 0.001, + "loss": 1.8389, + "step": 22432 + }, + { + "epoch": 0.9490227599627719, + "grad_norm": 0.15108750760555267, + "learning_rate": 0.001, + "loss": 1.4086, + "step": 22433 + }, + { + "epoch": 0.9490650647262882, + "grad_norm": 0.1532086580991745, + "learning_rate": 0.001, + "loss": 2.467, + "step": 22434 + }, + { + "epoch": 0.9491073694898046, + "grad_norm": 0.21752135455608368, + "learning_rate": 0.001, + "loss": 3.2817, + "step": 22435 + }, + { + "epoch": 0.949149674253321, + "grad_norm": 0.45808929204940796, + "learning_rate": 0.001, + "loss": 2.6698, + "step": 22436 + }, + { + "epoch": 0.9491919790168373, + "grad_norm": 0.1843804270029068, + "learning_rate": 0.001, + "loss": 1.7856, + "step": 22437 + }, + { + "epoch": 0.9492342837803537, + "grad_norm": 0.21448786556720734, + "learning_rate": 0.001, + "loss": 2.3707, + "step": 22438 + }, + { + "epoch": 0.9492765885438701, + "grad_norm": 0.13399538397789001, + "learning_rate": 0.001, + "loss": 1.7928, + "step": 22439 + }, + { + "epoch": 0.9493188933073864, + "grad_norm": 0.13753435015678406, + "learning_rate": 0.001, + "loss": 2.8811, + "step": 22440 + }, + { + "epoch": 0.9493611980709028, + "grad_norm": 0.14787648618221283, + "learning_rate": 0.001, + "loss": 1.6632, + "step": 22441 + }, + { + "epoch": 0.9494035028344192, + "grad_norm": 0.150163933634758, + "learning_rate": 0.001, + "loss": 1.6527, + "step": 22442 + }, + { + "epoch": 0.9494458075979355, + "grad_norm": 0.14982707798480988, + "learning_rate": 0.001, + "loss": 1.4879, + "step": 22443 + }, + { + "epoch": 0.9494881123614519, + "grad_norm": 0.9315142035484314, + "learning_rate": 0.001, + "loss": 1.8539, + "step": 22444 + }, + { + "epoch": 0.9495304171249682, + "grad_norm": 0.1498139351606369, + "learning_rate": 0.001, + "loss": 2.1401, + "step": 22445 + }, + { + "epoch": 0.9495727218884846, + "grad_norm": 0.1625237613916397, + "learning_rate": 0.001, + "loss": 2.2502, + "step": 22446 + }, + { + "epoch": 0.949615026652001, + "grad_norm": 1.6854742765426636, + "learning_rate": 0.001, + "loss": 1.7948, + "step": 22447 + }, + { + "epoch": 0.9496573314155173, + "grad_norm": 0.24923864006996155, + "learning_rate": 0.001, + "loss": 1.9564, + "step": 22448 + }, + { + "epoch": 0.9496996361790337, + "grad_norm": 0.16029444336891174, + "learning_rate": 0.001, + "loss": 1.8311, + "step": 22449 + }, + { + "epoch": 0.9497419409425502, + "grad_norm": 0.18315951526165009, + "learning_rate": 0.001, + "loss": 2.9322, + "step": 22450 + }, + { + "epoch": 0.9497842457060665, + "grad_norm": 0.14090849459171295, + "learning_rate": 0.001, + "loss": 1.3484, + "step": 22451 + }, + { + "epoch": 0.9498265504695829, + "grad_norm": 0.19691091775894165, + "learning_rate": 0.001, + "loss": 1.3982, + "step": 22452 + }, + { + "epoch": 0.9498688552330993, + "grad_norm": 0.19947366416454315, + "learning_rate": 0.001, + "loss": 2.8261, + "step": 22453 + }, + { + "epoch": 0.9499111599966156, + "grad_norm": 0.1901845932006836, + "learning_rate": 0.001, + "loss": 2.3973, + "step": 22454 + }, + { + "epoch": 0.949953464760132, + "grad_norm": 12.674116134643555, + "learning_rate": 0.001, + "loss": 2.4079, + "step": 22455 + }, + { + "epoch": 0.9499957695236484, + "grad_norm": 0.20128510892391205, + "learning_rate": 0.001, + "loss": 2.0499, + "step": 22456 + }, + { + "epoch": 0.9500380742871647, + "grad_norm": 1.169372320175171, + "learning_rate": 0.001, + "loss": 2.0313, + "step": 22457 + }, + { + "epoch": 0.9500803790506811, + "grad_norm": 0.7405765652656555, + "learning_rate": 0.001, + "loss": 2.3182, + "step": 22458 + }, + { + "epoch": 0.9501226838141975, + "grad_norm": 0.16553084552288055, + "learning_rate": 0.001, + "loss": 1.6338, + "step": 22459 + }, + { + "epoch": 0.9501649885777138, + "grad_norm": 0.34061887860298157, + "learning_rate": 0.001, + "loss": 1.7407, + "step": 22460 + }, + { + "epoch": 0.9502072933412302, + "grad_norm": 0.13990429043769836, + "learning_rate": 0.001, + "loss": 1.731, + "step": 22461 + }, + { + "epoch": 0.9502495981047466, + "grad_norm": 0.1741171032190323, + "learning_rate": 0.001, + "loss": 1.9323, + "step": 22462 + }, + { + "epoch": 0.9502919028682629, + "grad_norm": 0.1664520800113678, + "learning_rate": 0.001, + "loss": 2.2529, + "step": 22463 + }, + { + "epoch": 0.9503342076317793, + "grad_norm": 0.17836974561214447, + "learning_rate": 0.001, + "loss": 2.3956, + "step": 22464 + }, + { + "epoch": 0.9503765123952957, + "grad_norm": 0.16051559150218964, + "learning_rate": 0.001, + "loss": 2.4441, + "step": 22465 + }, + { + "epoch": 0.950418817158812, + "grad_norm": 0.14825597405433655, + "learning_rate": 0.001, + "loss": 2.1567, + "step": 22466 + }, + { + "epoch": 0.9504611219223285, + "grad_norm": 0.1824505478143692, + "learning_rate": 0.001, + "loss": 2.5797, + "step": 22467 + }, + { + "epoch": 0.9505034266858449, + "grad_norm": 0.21144576370716095, + "learning_rate": 0.001, + "loss": 2.8215, + "step": 22468 + }, + { + "epoch": 0.9505457314493612, + "grad_norm": 0.16636143624782562, + "learning_rate": 0.001, + "loss": 2.8168, + "step": 22469 + }, + { + "epoch": 0.9505880362128776, + "grad_norm": 0.3710878789424896, + "learning_rate": 0.001, + "loss": 3.2767, + "step": 22470 + }, + { + "epoch": 0.950630340976394, + "grad_norm": 0.1762601137161255, + "learning_rate": 0.001, + "loss": 2.8635, + "step": 22471 + }, + { + "epoch": 0.9506726457399103, + "grad_norm": 2.875742197036743, + "learning_rate": 0.001, + "loss": 1.802, + "step": 22472 + }, + { + "epoch": 0.9507149505034267, + "grad_norm": 0.16046804189682007, + "learning_rate": 0.001, + "loss": 1.5675, + "step": 22473 + }, + { + "epoch": 0.9507572552669431, + "grad_norm": 0.18604080379009247, + "learning_rate": 0.001, + "loss": 2.3506, + "step": 22474 + }, + { + "epoch": 0.9507995600304594, + "grad_norm": 0.1995047777891159, + "learning_rate": 0.001, + "loss": 1.4797, + "step": 22475 + }, + { + "epoch": 0.9508418647939758, + "grad_norm": 41.969749450683594, + "learning_rate": 0.001, + "loss": 1.6885, + "step": 22476 + }, + { + "epoch": 0.9508841695574922, + "grad_norm": 0.17194336652755737, + "learning_rate": 0.001, + "loss": 1.8758, + "step": 22477 + }, + { + "epoch": 0.9509264743210085, + "grad_norm": 0.1948256492614746, + "learning_rate": 0.001, + "loss": 2.4022, + "step": 22478 + }, + { + "epoch": 0.9509687790845249, + "grad_norm": 0.21217121183872223, + "learning_rate": 0.001, + "loss": 2.4836, + "step": 22479 + }, + { + "epoch": 0.9510110838480413, + "grad_norm": 0.1763080209493637, + "learning_rate": 0.001, + "loss": 2.3558, + "step": 22480 + }, + { + "epoch": 0.9510533886115576, + "grad_norm": 0.22325830161571503, + "learning_rate": 0.001, + "loss": 2.0567, + "step": 22481 + }, + { + "epoch": 0.951095693375074, + "grad_norm": 0.16838374733924866, + "learning_rate": 0.001, + "loss": 2.779, + "step": 22482 + }, + { + "epoch": 0.9511379981385905, + "grad_norm": 0.17531989514827728, + "learning_rate": 0.001, + "loss": 2.0818, + "step": 22483 + }, + { + "epoch": 0.9511803029021068, + "grad_norm": 0.17867770791053772, + "learning_rate": 0.001, + "loss": 2.5842, + "step": 22484 + }, + { + "epoch": 0.9512226076656232, + "grad_norm": 0.1932533085346222, + "learning_rate": 0.001, + "loss": 1.9442, + "step": 22485 + }, + { + "epoch": 0.9512649124291396, + "grad_norm": 0.1727113425731659, + "learning_rate": 0.001, + "loss": 1.5683, + "step": 22486 + }, + { + "epoch": 0.9513072171926559, + "grad_norm": 0.1291782408952713, + "learning_rate": 0.001, + "loss": 2.3555, + "step": 22487 + }, + { + "epoch": 0.9513495219561723, + "grad_norm": 0.7629174590110779, + "learning_rate": 0.001, + "loss": 2.651, + "step": 22488 + }, + { + "epoch": 0.9513918267196886, + "grad_norm": 0.12951643764972687, + "learning_rate": 0.001, + "loss": 1.9717, + "step": 22489 + }, + { + "epoch": 0.951434131483205, + "grad_norm": 0.6491950750350952, + "learning_rate": 0.001, + "loss": 1.9874, + "step": 22490 + }, + { + "epoch": 0.9514764362467214, + "grad_norm": 0.1988588273525238, + "learning_rate": 0.001, + "loss": 2.0231, + "step": 22491 + }, + { + "epoch": 0.9515187410102377, + "grad_norm": 0.18948745727539062, + "learning_rate": 0.001, + "loss": 2.0585, + "step": 22492 + }, + { + "epoch": 0.9515610457737541, + "grad_norm": 0.3615002930164337, + "learning_rate": 0.001, + "loss": 3.7874, + "step": 22493 + }, + { + "epoch": 0.9516033505372705, + "grad_norm": 0.13920491933822632, + "learning_rate": 0.001, + "loss": 1.6694, + "step": 22494 + }, + { + "epoch": 0.9516456553007868, + "grad_norm": 0.16164518892765045, + "learning_rate": 0.001, + "loss": 2.0692, + "step": 22495 + }, + { + "epoch": 0.9516879600643032, + "grad_norm": 0.47578608989715576, + "learning_rate": 0.001, + "loss": 2.1681, + "step": 22496 + }, + { + "epoch": 0.9517302648278196, + "grad_norm": 0.20885364711284637, + "learning_rate": 0.001, + "loss": 3.6518, + "step": 22497 + }, + { + "epoch": 0.9517725695913359, + "grad_norm": 0.16062334179878235, + "learning_rate": 0.001, + "loss": 1.9614, + "step": 22498 + }, + { + "epoch": 0.9518148743548523, + "grad_norm": 0.22703850269317627, + "learning_rate": 0.001, + "loss": 2.476, + "step": 22499 + }, + { + "epoch": 0.9518571791183688, + "grad_norm": 0.2395503968000412, + "learning_rate": 0.001, + "loss": 1.9883, + "step": 22500 + }, + { + "epoch": 0.9518994838818851, + "grad_norm": 0.15861296653747559, + "learning_rate": 0.001, + "loss": 1.8161, + "step": 22501 + }, + { + "epoch": 0.9519417886454015, + "grad_norm": 0.15886007249355316, + "learning_rate": 0.001, + "loss": 1.8054, + "step": 22502 + }, + { + "epoch": 0.9519840934089179, + "grad_norm": 0.18226757645606995, + "learning_rate": 0.001, + "loss": 1.7382, + "step": 22503 + }, + { + "epoch": 0.9520263981724342, + "grad_norm": 0.17415927350521088, + "learning_rate": 0.001, + "loss": 1.6456, + "step": 22504 + }, + { + "epoch": 0.9520687029359506, + "grad_norm": 0.22157450020313263, + "learning_rate": 0.001, + "loss": 2.0265, + "step": 22505 + }, + { + "epoch": 0.952111007699467, + "grad_norm": 0.18525850772857666, + "learning_rate": 0.001, + "loss": 3.7023, + "step": 22506 + }, + { + "epoch": 0.9521533124629833, + "grad_norm": 0.2394370287656784, + "learning_rate": 0.001, + "loss": 3.2888, + "step": 22507 + }, + { + "epoch": 0.9521956172264997, + "grad_norm": 0.15804435312747955, + "learning_rate": 0.001, + "loss": 1.9221, + "step": 22508 + }, + { + "epoch": 0.9522379219900161, + "grad_norm": 0.18028190732002258, + "learning_rate": 0.001, + "loss": 2.1621, + "step": 22509 + }, + { + "epoch": 0.9522802267535324, + "grad_norm": 0.16329197585582733, + "learning_rate": 0.001, + "loss": 1.6036, + "step": 22510 + }, + { + "epoch": 0.9523225315170488, + "grad_norm": 0.15661050379276276, + "learning_rate": 0.001, + "loss": 1.8212, + "step": 22511 + }, + { + "epoch": 0.9523648362805652, + "grad_norm": 0.23924638330936432, + "learning_rate": 0.001, + "loss": 1.4197, + "step": 22512 + }, + { + "epoch": 0.9524071410440815, + "grad_norm": 0.17719987034797668, + "learning_rate": 0.001, + "loss": 1.427, + "step": 22513 + }, + { + "epoch": 0.9524494458075979, + "grad_norm": 0.1736730933189392, + "learning_rate": 0.001, + "loss": 1.5887, + "step": 22514 + }, + { + "epoch": 0.9524917505711143, + "grad_norm": 0.1556921899318695, + "learning_rate": 0.001, + "loss": 2.8394, + "step": 22515 + }, + { + "epoch": 0.9525340553346306, + "grad_norm": 0.15462371706962585, + "learning_rate": 0.001, + "loss": 1.9483, + "step": 22516 + }, + { + "epoch": 0.9525763600981471, + "grad_norm": 0.16534818708896637, + "learning_rate": 0.001, + "loss": 2.2795, + "step": 22517 + }, + { + "epoch": 0.9526186648616635, + "grad_norm": 0.276944100856781, + "learning_rate": 0.001, + "loss": 2.1088, + "step": 22518 + }, + { + "epoch": 0.9526609696251798, + "grad_norm": 0.13408029079437256, + "learning_rate": 0.001, + "loss": 1.9941, + "step": 22519 + }, + { + "epoch": 0.9527032743886962, + "grad_norm": 8.694868087768555, + "learning_rate": 0.001, + "loss": 1.924, + "step": 22520 + }, + { + "epoch": 0.9527455791522126, + "grad_norm": 0.24468626081943512, + "learning_rate": 0.001, + "loss": 2.0428, + "step": 22521 + }, + { + "epoch": 0.9527878839157289, + "grad_norm": 0.1722177416086197, + "learning_rate": 0.001, + "loss": 2.467, + "step": 22522 + }, + { + "epoch": 0.9528301886792453, + "grad_norm": 0.16793347895145416, + "learning_rate": 0.001, + "loss": 1.8234, + "step": 22523 + }, + { + "epoch": 0.9528724934427617, + "grad_norm": 0.6583682894706726, + "learning_rate": 0.001, + "loss": 1.7482, + "step": 22524 + }, + { + "epoch": 0.952914798206278, + "grad_norm": 0.17453870177268982, + "learning_rate": 0.001, + "loss": 1.8821, + "step": 22525 + }, + { + "epoch": 0.9529571029697944, + "grad_norm": 0.17266742885112762, + "learning_rate": 0.001, + "loss": 1.8525, + "step": 22526 + }, + { + "epoch": 0.9529994077333108, + "grad_norm": 0.1756075918674469, + "learning_rate": 0.001, + "loss": 2.13, + "step": 22527 + }, + { + "epoch": 0.9530417124968271, + "grad_norm": 0.18365883827209473, + "learning_rate": 0.001, + "loss": 1.8475, + "step": 22528 + }, + { + "epoch": 0.9530840172603435, + "grad_norm": 1.1204431056976318, + "learning_rate": 0.001, + "loss": 1.7542, + "step": 22529 + }, + { + "epoch": 0.9531263220238599, + "grad_norm": 0.16155286133289337, + "learning_rate": 0.001, + "loss": 2.4133, + "step": 22530 + }, + { + "epoch": 0.9531686267873762, + "grad_norm": 1.2708326578140259, + "learning_rate": 0.001, + "loss": 1.942, + "step": 22531 + }, + { + "epoch": 0.9532109315508926, + "grad_norm": 0.3815459609031677, + "learning_rate": 0.001, + "loss": 1.5244, + "step": 22532 + }, + { + "epoch": 0.953253236314409, + "grad_norm": 0.1591525673866272, + "learning_rate": 0.001, + "loss": 1.6586, + "step": 22533 + }, + { + "epoch": 0.9532955410779254, + "grad_norm": 0.16831563413143158, + "learning_rate": 0.001, + "loss": 1.9452, + "step": 22534 + }, + { + "epoch": 0.9533378458414418, + "grad_norm": 0.1781228631734848, + "learning_rate": 0.001, + "loss": 1.9083, + "step": 22535 + }, + { + "epoch": 0.9533801506049581, + "grad_norm": 0.196019247174263, + "learning_rate": 0.001, + "loss": 1.8808, + "step": 22536 + }, + { + "epoch": 0.9534224553684745, + "grad_norm": 0.16879303753376007, + "learning_rate": 0.001, + "loss": 2.5292, + "step": 22537 + }, + { + "epoch": 0.9534647601319909, + "grad_norm": 0.14344555139541626, + "learning_rate": 0.001, + "loss": 3.0776, + "step": 22538 + }, + { + "epoch": 0.9535070648955072, + "grad_norm": 0.13954542577266693, + "learning_rate": 0.001, + "loss": 1.8676, + "step": 22539 + }, + { + "epoch": 0.9535493696590236, + "grad_norm": 0.16517263650894165, + "learning_rate": 0.001, + "loss": 2.1363, + "step": 22540 + }, + { + "epoch": 0.95359167442254, + "grad_norm": 0.4212926924228668, + "learning_rate": 0.001, + "loss": 2.4361, + "step": 22541 + }, + { + "epoch": 0.9536339791860563, + "grad_norm": 0.19001418352127075, + "learning_rate": 0.001, + "loss": 2.4905, + "step": 22542 + }, + { + "epoch": 0.9536762839495727, + "grad_norm": 0.1455460637807846, + "learning_rate": 0.001, + "loss": 1.8486, + "step": 22543 + }, + { + "epoch": 0.9537185887130891, + "grad_norm": 0.13077104091644287, + "learning_rate": 0.001, + "loss": 1.5365, + "step": 22544 + }, + { + "epoch": 0.9537608934766054, + "grad_norm": 0.139632910490036, + "learning_rate": 0.001, + "loss": 1.5715, + "step": 22545 + }, + { + "epoch": 0.9538031982401218, + "grad_norm": 0.14451947808265686, + "learning_rate": 0.001, + "loss": 2.0676, + "step": 22546 + }, + { + "epoch": 0.9538455030036382, + "grad_norm": 0.2569085657596588, + "learning_rate": 0.001, + "loss": 2.4596, + "step": 22547 + }, + { + "epoch": 0.9538878077671545, + "grad_norm": 9.396105766296387, + "learning_rate": 0.001, + "loss": 1.7845, + "step": 22548 + }, + { + "epoch": 0.953930112530671, + "grad_norm": 0.17778228223323822, + "learning_rate": 0.001, + "loss": 2.6535, + "step": 22549 + }, + { + "epoch": 0.9539724172941874, + "grad_norm": 1.270423173904419, + "learning_rate": 0.001, + "loss": 1.7749, + "step": 22550 + }, + { + "epoch": 0.9540147220577037, + "grad_norm": 0.20643652975559235, + "learning_rate": 0.001, + "loss": 2.5151, + "step": 22551 + }, + { + "epoch": 0.9540570268212201, + "grad_norm": 0.5847330093383789, + "learning_rate": 0.001, + "loss": 2.6464, + "step": 22552 + }, + { + "epoch": 0.9540993315847365, + "grad_norm": 0.15910720825195312, + "learning_rate": 0.001, + "loss": 2.5165, + "step": 22553 + }, + { + "epoch": 0.9541416363482528, + "grad_norm": 0.2874158024787903, + "learning_rate": 0.001, + "loss": 2.2493, + "step": 22554 + }, + { + "epoch": 0.9541839411117692, + "grad_norm": 0.14962883293628693, + "learning_rate": 0.001, + "loss": 2.0772, + "step": 22555 + }, + { + "epoch": 0.9542262458752856, + "grad_norm": 0.16051600873470306, + "learning_rate": 0.001, + "loss": 1.4298, + "step": 22556 + }, + { + "epoch": 0.9542685506388019, + "grad_norm": 0.16125862300395966, + "learning_rate": 0.001, + "loss": 1.383, + "step": 22557 + }, + { + "epoch": 0.9543108554023183, + "grad_norm": 0.15643396973609924, + "learning_rate": 0.001, + "loss": 1.9494, + "step": 22558 + }, + { + "epoch": 0.9543531601658347, + "grad_norm": 0.2045285403728485, + "learning_rate": 0.001, + "loss": 2.7422, + "step": 22559 + }, + { + "epoch": 0.954395464929351, + "grad_norm": 0.2184513658285141, + "learning_rate": 0.001, + "loss": 2.085, + "step": 22560 + }, + { + "epoch": 0.9544377696928674, + "grad_norm": 0.16785749793052673, + "learning_rate": 0.001, + "loss": 2.3423, + "step": 22561 + }, + { + "epoch": 0.9544800744563838, + "grad_norm": 0.16938123106956482, + "learning_rate": 0.001, + "loss": 1.6398, + "step": 22562 + }, + { + "epoch": 0.9545223792199001, + "grad_norm": 0.15510663390159607, + "learning_rate": 0.001, + "loss": 1.739, + "step": 22563 + }, + { + "epoch": 0.9545646839834165, + "grad_norm": 0.16871055960655212, + "learning_rate": 0.001, + "loss": 1.6647, + "step": 22564 + }, + { + "epoch": 0.954606988746933, + "grad_norm": 0.2923224866390228, + "learning_rate": 0.001, + "loss": 3.0942, + "step": 22565 + }, + { + "epoch": 0.9546492935104492, + "grad_norm": 0.37078985571861267, + "learning_rate": 0.001, + "loss": 1.9319, + "step": 22566 + }, + { + "epoch": 0.9546915982739657, + "grad_norm": 0.16258271038532257, + "learning_rate": 0.001, + "loss": 3.1049, + "step": 22567 + }, + { + "epoch": 0.9547339030374821, + "grad_norm": 0.18606746196746826, + "learning_rate": 0.001, + "loss": 3.302, + "step": 22568 + }, + { + "epoch": 0.9547762078009984, + "grad_norm": 0.7223076224327087, + "learning_rate": 0.001, + "loss": 1.7057, + "step": 22569 + }, + { + "epoch": 0.9548185125645148, + "grad_norm": 0.1600056141614914, + "learning_rate": 0.001, + "loss": 2.7663, + "step": 22570 + }, + { + "epoch": 0.9548608173280312, + "grad_norm": 0.16260926425457, + "learning_rate": 0.001, + "loss": 3.0786, + "step": 22571 + }, + { + "epoch": 0.9549031220915475, + "grad_norm": 0.16478076577186584, + "learning_rate": 0.001, + "loss": 1.8152, + "step": 22572 + }, + { + "epoch": 0.9549454268550639, + "grad_norm": 0.14797167479991913, + "learning_rate": 0.001, + "loss": 1.6904, + "step": 22573 + }, + { + "epoch": 0.9549877316185803, + "grad_norm": 0.1376873105764389, + "learning_rate": 0.001, + "loss": 1.672, + "step": 22574 + }, + { + "epoch": 0.9550300363820966, + "grad_norm": 0.15510663390159607, + "learning_rate": 0.001, + "loss": 2.1794, + "step": 22575 + }, + { + "epoch": 0.955072341145613, + "grad_norm": 0.15933188796043396, + "learning_rate": 0.001, + "loss": 2.7918, + "step": 22576 + }, + { + "epoch": 0.9551146459091294, + "grad_norm": 0.15145090222358704, + "learning_rate": 0.001, + "loss": 1.8279, + "step": 22577 + }, + { + "epoch": 0.9551569506726457, + "grad_norm": 0.17468474805355072, + "learning_rate": 0.001, + "loss": 3.198, + "step": 22578 + }, + { + "epoch": 0.9551992554361621, + "grad_norm": 0.17872756719589233, + "learning_rate": 0.001, + "loss": 1.7174, + "step": 22579 + }, + { + "epoch": 0.9552415601996784, + "grad_norm": 0.17756347358226776, + "learning_rate": 0.001, + "loss": 2.8859, + "step": 22580 + }, + { + "epoch": 0.9552838649631948, + "grad_norm": 0.17524990439414978, + "learning_rate": 0.001, + "loss": 2.7646, + "step": 22581 + }, + { + "epoch": 0.9553261697267112, + "grad_norm": 0.1490987241268158, + "learning_rate": 0.001, + "loss": 2.0239, + "step": 22582 + }, + { + "epoch": 0.9553684744902275, + "grad_norm": 0.17126286029815674, + "learning_rate": 0.001, + "loss": 2.1528, + "step": 22583 + }, + { + "epoch": 0.955410779253744, + "grad_norm": 0.14318470656871796, + "learning_rate": 0.001, + "loss": 2.0535, + "step": 22584 + }, + { + "epoch": 0.9554530840172604, + "grad_norm": 0.1496138721704483, + "learning_rate": 0.001, + "loss": 2.2055, + "step": 22585 + }, + { + "epoch": 0.9554953887807767, + "grad_norm": 0.1591835618019104, + "learning_rate": 0.001, + "loss": 1.8175, + "step": 22586 + }, + { + "epoch": 0.9555376935442931, + "grad_norm": 0.18127931654453278, + "learning_rate": 0.001, + "loss": 3.1087, + "step": 22587 + }, + { + "epoch": 0.9555799983078095, + "grad_norm": 0.12975792586803436, + "learning_rate": 0.001, + "loss": 2.2168, + "step": 22588 + }, + { + "epoch": 0.9556223030713258, + "grad_norm": 0.1456218957901001, + "learning_rate": 0.001, + "loss": 1.4985, + "step": 22589 + }, + { + "epoch": 0.9556646078348422, + "grad_norm": 0.1666790097951889, + "learning_rate": 0.001, + "loss": 1.8393, + "step": 22590 + }, + { + "epoch": 0.9557069125983586, + "grad_norm": 0.1480528861284256, + "learning_rate": 0.001, + "loss": 1.7311, + "step": 22591 + }, + { + "epoch": 0.9557492173618749, + "grad_norm": 0.1545533984899521, + "learning_rate": 0.001, + "loss": 2.0204, + "step": 22592 + }, + { + "epoch": 0.9557915221253913, + "grad_norm": 0.17958112061023712, + "learning_rate": 0.001, + "loss": 2.5396, + "step": 22593 + }, + { + "epoch": 0.9558338268889077, + "grad_norm": 0.20622247457504272, + "learning_rate": 0.001, + "loss": 2.4856, + "step": 22594 + }, + { + "epoch": 0.955876131652424, + "grad_norm": 0.1637284755706787, + "learning_rate": 0.001, + "loss": 1.561, + "step": 22595 + }, + { + "epoch": 0.9559184364159404, + "grad_norm": 0.13965626060962677, + "learning_rate": 0.001, + "loss": 1.7077, + "step": 22596 + }, + { + "epoch": 0.9559607411794568, + "grad_norm": 0.15451842546463013, + "learning_rate": 0.001, + "loss": 2.9797, + "step": 22597 + }, + { + "epoch": 0.9560030459429731, + "grad_norm": 0.1734355241060257, + "learning_rate": 0.001, + "loss": 1.9119, + "step": 22598 + }, + { + "epoch": 0.9560453507064895, + "grad_norm": 0.16818620264530182, + "learning_rate": 0.001, + "loss": 2.4106, + "step": 22599 + }, + { + "epoch": 0.956087655470006, + "grad_norm": 0.4067355990409851, + "learning_rate": 0.001, + "loss": 2.4487, + "step": 22600 + }, + { + "epoch": 0.9561299602335223, + "grad_norm": 0.16298459470272064, + "learning_rate": 0.001, + "loss": 2.2041, + "step": 22601 + }, + { + "epoch": 0.9561722649970387, + "grad_norm": 0.7233208417892456, + "learning_rate": 0.001, + "loss": 1.9019, + "step": 22602 + }, + { + "epoch": 0.9562145697605551, + "grad_norm": 0.15039215981960297, + "learning_rate": 0.001, + "loss": 2.3582, + "step": 22603 + }, + { + "epoch": 0.9562568745240714, + "grad_norm": 0.25849875807762146, + "learning_rate": 0.001, + "loss": 2.4879, + "step": 22604 + }, + { + "epoch": 0.9562991792875878, + "grad_norm": 0.14870387315750122, + "learning_rate": 0.001, + "loss": 2.3015, + "step": 22605 + }, + { + "epoch": 0.9563414840511042, + "grad_norm": 0.16163372993469238, + "learning_rate": 0.001, + "loss": 2.9453, + "step": 22606 + }, + { + "epoch": 0.9563837888146205, + "grad_norm": 0.17681002616882324, + "learning_rate": 0.001, + "loss": 2.3321, + "step": 22607 + }, + { + "epoch": 0.9564260935781369, + "grad_norm": 0.16098996996879578, + "learning_rate": 0.001, + "loss": 1.9478, + "step": 22608 + }, + { + "epoch": 0.9564683983416533, + "grad_norm": 0.14798353612422943, + "learning_rate": 0.001, + "loss": 1.9331, + "step": 22609 + }, + { + "epoch": 0.9565107031051696, + "grad_norm": 0.31577110290527344, + "learning_rate": 0.001, + "loss": 3.3183, + "step": 22610 + }, + { + "epoch": 0.956553007868686, + "grad_norm": 0.1621190905570984, + "learning_rate": 0.001, + "loss": 1.9568, + "step": 22611 + }, + { + "epoch": 0.9565953126322024, + "grad_norm": 0.17691479623317719, + "learning_rate": 0.001, + "loss": 2.06, + "step": 22612 + }, + { + "epoch": 0.9566376173957187, + "grad_norm": 0.15779487788677216, + "learning_rate": 0.001, + "loss": 1.6084, + "step": 22613 + }, + { + "epoch": 0.9566799221592351, + "grad_norm": 0.15047040581703186, + "learning_rate": 0.001, + "loss": 2.1398, + "step": 22614 + }, + { + "epoch": 0.9567222269227516, + "grad_norm": 0.17002348601818085, + "learning_rate": 0.001, + "loss": 1.7664, + "step": 22615 + }, + { + "epoch": 0.9567645316862678, + "grad_norm": 0.14370211958885193, + "learning_rate": 0.001, + "loss": 1.598, + "step": 22616 + }, + { + "epoch": 0.9568068364497843, + "grad_norm": 1.530818223953247, + "learning_rate": 0.001, + "loss": 3.326, + "step": 22617 + }, + { + "epoch": 0.9568491412133007, + "grad_norm": 0.1687633991241455, + "learning_rate": 0.001, + "loss": 1.355, + "step": 22618 + }, + { + "epoch": 0.956891445976817, + "grad_norm": 0.13722075521945953, + "learning_rate": 0.001, + "loss": 3.3707, + "step": 22619 + }, + { + "epoch": 0.9569337507403334, + "grad_norm": 0.16178375482559204, + "learning_rate": 0.001, + "loss": 2.5891, + "step": 22620 + }, + { + "epoch": 0.9569760555038498, + "grad_norm": 0.7043463587760925, + "learning_rate": 0.001, + "loss": 2.3528, + "step": 22621 + }, + { + "epoch": 0.9570183602673661, + "grad_norm": 0.14952924847602844, + "learning_rate": 0.001, + "loss": 1.4348, + "step": 22622 + }, + { + "epoch": 0.9570606650308825, + "grad_norm": 0.14588811993598938, + "learning_rate": 0.001, + "loss": 2.168, + "step": 22623 + }, + { + "epoch": 0.9571029697943988, + "grad_norm": 0.136274516582489, + "learning_rate": 0.001, + "loss": 3.4291, + "step": 22624 + }, + { + "epoch": 0.9571452745579152, + "grad_norm": 0.15219195187091827, + "learning_rate": 0.001, + "loss": 2.353, + "step": 22625 + }, + { + "epoch": 0.9571875793214316, + "grad_norm": 0.192640021443367, + "learning_rate": 0.001, + "loss": 2.5704, + "step": 22626 + }, + { + "epoch": 0.9572298840849479, + "grad_norm": 2.2798144817352295, + "learning_rate": 0.001, + "loss": 2.4483, + "step": 22627 + }, + { + "epoch": 0.9572721888484643, + "grad_norm": 0.12433091551065445, + "learning_rate": 0.001, + "loss": 2.9738, + "step": 22628 + }, + { + "epoch": 0.9573144936119807, + "grad_norm": 0.14844204485416412, + "learning_rate": 0.001, + "loss": 2.3593, + "step": 22629 + }, + { + "epoch": 0.957356798375497, + "grad_norm": 0.18237383663654327, + "learning_rate": 0.001, + "loss": 2.4718, + "step": 22630 + }, + { + "epoch": 0.9573991031390134, + "grad_norm": 0.15633726119995117, + "learning_rate": 0.001, + "loss": 2.9718, + "step": 22631 + }, + { + "epoch": 0.9574414079025299, + "grad_norm": 0.1357184201478958, + "learning_rate": 0.001, + "loss": 1.8955, + "step": 22632 + }, + { + "epoch": 0.9574837126660461, + "grad_norm": 0.15892942249774933, + "learning_rate": 0.001, + "loss": 1.6971, + "step": 22633 + }, + { + "epoch": 0.9575260174295626, + "grad_norm": 0.1862511783838272, + "learning_rate": 0.001, + "loss": 2.416, + "step": 22634 + }, + { + "epoch": 0.957568322193079, + "grad_norm": 0.18506558239459991, + "learning_rate": 0.001, + "loss": 2.481, + "step": 22635 + }, + { + "epoch": 0.9576106269565953, + "grad_norm": 2.206505298614502, + "learning_rate": 0.001, + "loss": 2.0285, + "step": 22636 + }, + { + "epoch": 0.9576529317201117, + "grad_norm": 0.19020433723926544, + "learning_rate": 0.001, + "loss": 1.833, + "step": 22637 + }, + { + "epoch": 0.9576952364836281, + "grad_norm": 0.22142185270786285, + "learning_rate": 0.001, + "loss": 2.196, + "step": 22638 + }, + { + "epoch": 0.9577375412471444, + "grad_norm": 0.1741228997707367, + "learning_rate": 0.001, + "loss": 2.101, + "step": 22639 + }, + { + "epoch": 0.9577798460106608, + "grad_norm": 0.1707838922739029, + "learning_rate": 0.001, + "loss": 2.0027, + "step": 22640 + }, + { + "epoch": 0.9578221507741772, + "grad_norm": 0.16270555555820465, + "learning_rate": 0.001, + "loss": 2.9908, + "step": 22641 + }, + { + "epoch": 0.9578644555376935, + "grad_norm": 0.1549161970615387, + "learning_rate": 0.001, + "loss": 1.7517, + "step": 22642 + }, + { + "epoch": 0.9579067603012099, + "grad_norm": 0.16210713982582092, + "learning_rate": 0.001, + "loss": 2.5452, + "step": 22643 + }, + { + "epoch": 0.9579490650647263, + "grad_norm": 0.14848294854164124, + "learning_rate": 0.001, + "loss": 2.117, + "step": 22644 + }, + { + "epoch": 0.9579913698282426, + "grad_norm": 0.7370397448539734, + "learning_rate": 0.001, + "loss": 1.9066, + "step": 22645 + }, + { + "epoch": 0.958033674591759, + "grad_norm": 0.15546540915966034, + "learning_rate": 0.001, + "loss": 1.5374, + "step": 22646 + }, + { + "epoch": 0.9580759793552754, + "grad_norm": 0.1666499376296997, + "learning_rate": 0.001, + "loss": 1.7665, + "step": 22647 + }, + { + "epoch": 0.9581182841187917, + "grad_norm": 0.16058315336704254, + "learning_rate": 0.001, + "loss": 1.7836, + "step": 22648 + }, + { + "epoch": 0.9581605888823082, + "grad_norm": 0.1704726219177246, + "learning_rate": 0.001, + "loss": 2.0733, + "step": 22649 + }, + { + "epoch": 0.9582028936458246, + "grad_norm": 1.3915730714797974, + "learning_rate": 0.001, + "loss": 2.1041, + "step": 22650 + }, + { + "epoch": 0.9582451984093409, + "grad_norm": 0.14765958487987518, + "learning_rate": 0.001, + "loss": 2.1703, + "step": 22651 + }, + { + "epoch": 0.9582875031728573, + "grad_norm": 0.136846661567688, + "learning_rate": 0.001, + "loss": 2.0503, + "step": 22652 + }, + { + "epoch": 0.9583298079363737, + "grad_norm": 2.0541558265686035, + "learning_rate": 0.001, + "loss": 1.9055, + "step": 22653 + }, + { + "epoch": 0.95837211269989, + "grad_norm": 0.168845996260643, + "learning_rate": 0.001, + "loss": 2.0872, + "step": 22654 + }, + { + "epoch": 0.9584144174634064, + "grad_norm": 0.18082398176193237, + "learning_rate": 0.001, + "loss": 2.6359, + "step": 22655 + }, + { + "epoch": 0.9584567222269228, + "grad_norm": 0.2050907164812088, + "learning_rate": 0.001, + "loss": 2.1573, + "step": 22656 + }, + { + "epoch": 0.9584990269904391, + "grad_norm": 0.23154161870479584, + "learning_rate": 0.001, + "loss": 2.4487, + "step": 22657 + }, + { + "epoch": 0.9585413317539555, + "grad_norm": 0.228712260723114, + "learning_rate": 0.001, + "loss": 2.5394, + "step": 22658 + }, + { + "epoch": 0.9585836365174719, + "grad_norm": 0.18508756160736084, + "learning_rate": 0.001, + "loss": 1.982, + "step": 22659 + }, + { + "epoch": 0.9586259412809882, + "grad_norm": 0.2040599286556244, + "learning_rate": 0.001, + "loss": 1.9579, + "step": 22660 + }, + { + "epoch": 0.9586682460445046, + "grad_norm": 0.2641620337963104, + "learning_rate": 0.001, + "loss": 2.3784, + "step": 22661 + }, + { + "epoch": 0.958710550808021, + "grad_norm": 0.17128469049930573, + "learning_rate": 0.001, + "loss": 2.367, + "step": 22662 + }, + { + "epoch": 0.9587528555715373, + "grad_norm": 0.5706077814102173, + "learning_rate": 0.001, + "loss": 2.7794, + "step": 22663 + }, + { + "epoch": 0.9587951603350537, + "grad_norm": 0.15378667414188385, + "learning_rate": 0.001, + "loss": 1.2554, + "step": 22664 + }, + { + "epoch": 0.9588374650985702, + "grad_norm": 0.14689679443836212, + "learning_rate": 0.001, + "loss": 2.1687, + "step": 22665 + }, + { + "epoch": 0.9588797698620865, + "grad_norm": 0.19567295908927917, + "learning_rate": 0.001, + "loss": 2.2162, + "step": 22666 + }, + { + "epoch": 0.9589220746256029, + "grad_norm": 0.3699814975261688, + "learning_rate": 0.001, + "loss": 2.1986, + "step": 22667 + }, + { + "epoch": 0.9589643793891193, + "grad_norm": 0.19808320701122284, + "learning_rate": 0.001, + "loss": 2.4602, + "step": 22668 + }, + { + "epoch": 0.9590066841526356, + "grad_norm": 0.17543701827526093, + "learning_rate": 0.001, + "loss": 2.4826, + "step": 22669 + }, + { + "epoch": 0.959048988916152, + "grad_norm": 0.16463440656661987, + "learning_rate": 0.001, + "loss": 2.3312, + "step": 22670 + }, + { + "epoch": 0.9590912936796683, + "grad_norm": 15.79118537902832, + "learning_rate": 0.001, + "loss": 2.4984, + "step": 22671 + }, + { + "epoch": 0.9591335984431847, + "grad_norm": 0.457317054271698, + "learning_rate": 0.001, + "loss": 1.6768, + "step": 22672 + }, + { + "epoch": 0.9591759032067011, + "grad_norm": 0.3178519904613495, + "learning_rate": 0.001, + "loss": 1.9121, + "step": 22673 + }, + { + "epoch": 0.9592182079702174, + "grad_norm": 0.1642330437898636, + "learning_rate": 0.001, + "loss": 1.1973, + "step": 22674 + }, + { + "epoch": 0.9592605127337338, + "grad_norm": 0.9215223789215088, + "learning_rate": 0.001, + "loss": 1.7921, + "step": 22675 + }, + { + "epoch": 0.9593028174972502, + "grad_norm": 0.15992654860019684, + "learning_rate": 0.001, + "loss": 1.6263, + "step": 22676 + }, + { + "epoch": 0.9593451222607665, + "grad_norm": 0.18335092067718506, + "learning_rate": 0.001, + "loss": 1.7375, + "step": 22677 + }, + { + "epoch": 0.9593874270242829, + "grad_norm": 0.19411593675613403, + "learning_rate": 0.001, + "loss": 1.8051, + "step": 22678 + }, + { + "epoch": 0.9594297317877993, + "grad_norm": 0.16195566952228546, + "learning_rate": 0.001, + "loss": 3.0332, + "step": 22679 + }, + { + "epoch": 0.9594720365513156, + "grad_norm": 0.5709326863288879, + "learning_rate": 0.001, + "loss": 2.7656, + "step": 22680 + }, + { + "epoch": 0.959514341314832, + "grad_norm": 0.145700141787529, + "learning_rate": 0.001, + "loss": 1.9661, + "step": 22681 + }, + { + "epoch": 0.9595566460783485, + "grad_norm": 0.16970385611057281, + "learning_rate": 0.001, + "loss": 1.9235, + "step": 22682 + }, + { + "epoch": 0.9595989508418648, + "grad_norm": 0.17255772650241852, + "learning_rate": 0.001, + "loss": 2.1868, + "step": 22683 + }, + { + "epoch": 0.9596412556053812, + "grad_norm": 0.16571679711341858, + "learning_rate": 0.001, + "loss": 1.8325, + "step": 22684 + }, + { + "epoch": 0.9596835603688976, + "grad_norm": 0.12924878299236298, + "learning_rate": 0.001, + "loss": 1.8835, + "step": 22685 + }, + { + "epoch": 0.9597258651324139, + "grad_norm": 0.20774781703948975, + "learning_rate": 0.001, + "loss": 2.1721, + "step": 22686 + }, + { + "epoch": 0.9597681698959303, + "grad_norm": 0.35853028297424316, + "learning_rate": 0.001, + "loss": 2.2272, + "step": 22687 + }, + { + "epoch": 0.9598104746594467, + "grad_norm": 0.5301867127418518, + "learning_rate": 0.001, + "loss": 1.568, + "step": 22688 + }, + { + "epoch": 0.959852779422963, + "grad_norm": 0.19120998680591583, + "learning_rate": 0.001, + "loss": 2.1802, + "step": 22689 + }, + { + "epoch": 0.9598950841864794, + "grad_norm": 0.16802310943603516, + "learning_rate": 0.001, + "loss": 1.6033, + "step": 22690 + }, + { + "epoch": 0.9599373889499958, + "grad_norm": 0.17627355456352234, + "learning_rate": 0.001, + "loss": 2.273, + "step": 22691 + }, + { + "epoch": 0.9599796937135121, + "grad_norm": 0.22242601215839386, + "learning_rate": 0.001, + "loss": 1.9304, + "step": 22692 + }, + { + "epoch": 0.9600219984770285, + "grad_norm": 0.1294320523738861, + "learning_rate": 0.001, + "loss": 2.0693, + "step": 22693 + }, + { + "epoch": 0.9600643032405449, + "grad_norm": 0.17644540965557098, + "learning_rate": 0.001, + "loss": 2.0387, + "step": 22694 + }, + { + "epoch": 0.9601066080040612, + "grad_norm": 1.0202915668487549, + "learning_rate": 0.001, + "loss": 1.9817, + "step": 22695 + }, + { + "epoch": 0.9601489127675776, + "grad_norm": 0.15253128111362457, + "learning_rate": 0.001, + "loss": 2.055, + "step": 22696 + }, + { + "epoch": 0.960191217531094, + "grad_norm": 0.15679995715618134, + "learning_rate": 0.001, + "loss": 2.1456, + "step": 22697 + }, + { + "epoch": 0.9602335222946103, + "grad_norm": 0.15941359102725983, + "learning_rate": 0.001, + "loss": 2.0955, + "step": 22698 + }, + { + "epoch": 0.9602758270581268, + "grad_norm": 0.17292888462543488, + "learning_rate": 0.001, + "loss": 1.624, + "step": 22699 + }, + { + "epoch": 0.9603181318216432, + "grad_norm": 0.1328788846731186, + "learning_rate": 0.001, + "loss": 1.4056, + "step": 22700 + }, + { + "epoch": 0.9603604365851595, + "grad_norm": 0.18797191977500916, + "learning_rate": 0.001, + "loss": 2.5867, + "step": 22701 + }, + { + "epoch": 0.9604027413486759, + "grad_norm": 0.7679688930511475, + "learning_rate": 0.001, + "loss": 3.1701, + "step": 22702 + }, + { + "epoch": 0.9604450461121923, + "grad_norm": 0.6459025740623474, + "learning_rate": 0.001, + "loss": 4.0775, + "step": 22703 + }, + { + "epoch": 0.9604873508757086, + "grad_norm": 0.13661043345928192, + "learning_rate": 0.001, + "loss": 2.6037, + "step": 22704 + }, + { + "epoch": 0.960529655639225, + "grad_norm": 0.15741907060146332, + "learning_rate": 0.001, + "loss": 2.0212, + "step": 22705 + }, + { + "epoch": 0.9605719604027414, + "grad_norm": 0.9307416677474976, + "learning_rate": 0.001, + "loss": 2.2273, + "step": 22706 + }, + { + "epoch": 0.9606142651662577, + "grad_norm": 0.19062964618206024, + "learning_rate": 0.001, + "loss": 3.1016, + "step": 22707 + }, + { + "epoch": 0.9606565699297741, + "grad_norm": 0.17339986562728882, + "learning_rate": 0.001, + "loss": 1.9746, + "step": 22708 + }, + { + "epoch": 0.9606988746932905, + "grad_norm": 0.17814798653125763, + "learning_rate": 0.001, + "loss": 2.9384, + "step": 22709 + }, + { + "epoch": 0.9607411794568068, + "grad_norm": 0.19734229147434235, + "learning_rate": 0.001, + "loss": 2.3626, + "step": 22710 + }, + { + "epoch": 0.9607834842203232, + "grad_norm": 0.2042291760444641, + "learning_rate": 0.001, + "loss": 1.851, + "step": 22711 + }, + { + "epoch": 0.9608257889838396, + "grad_norm": 1.1738054752349854, + "learning_rate": 0.001, + "loss": 3.5114, + "step": 22712 + }, + { + "epoch": 0.9608680937473559, + "grad_norm": 3.5392398834228516, + "learning_rate": 0.001, + "loss": 2.1209, + "step": 22713 + }, + { + "epoch": 0.9609103985108723, + "grad_norm": 0.2352452278137207, + "learning_rate": 0.001, + "loss": 2.2434, + "step": 22714 + }, + { + "epoch": 0.9609527032743886, + "grad_norm": 0.696122407913208, + "learning_rate": 0.001, + "loss": 2.0429, + "step": 22715 + }, + { + "epoch": 0.960995008037905, + "grad_norm": 0.24021172523498535, + "learning_rate": 0.001, + "loss": 1.927, + "step": 22716 + }, + { + "epoch": 0.9610373128014215, + "grad_norm": 0.1911345273256302, + "learning_rate": 0.001, + "loss": 2.3258, + "step": 22717 + }, + { + "epoch": 0.9610796175649378, + "grad_norm": 0.1680738478899002, + "learning_rate": 0.001, + "loss": 1.7179, + "step": 22718 + }, + { + "epoch": 0.9611219223284542, + "grad_norm": 0.20426170527935028, + "learning_rate": 0.001, + "loss": 1.8262, + "step": 22719 + }, + { + "epoch": 0.9611642270919706, + "grad_norm": 0.9206085205078125, + "learning_rate": 0.001, + "loss": 1.7566, + "step": 22720 + }, + { + "epoch": 0.9612065318554869, + "grad_norm": 0.17064528167247772, + "learning_rate": 0.001, + "loss": 3.1635, + "step": 22721 + }, + { + "epoch": 0.9612488366190033, + "grad_norm": 0.1871117353439331, + "learning_rate": 0.001, + "loss": 3.1601, + "step": 22722 + }, + { + "epoch": 0.9612911413825197, + "grad_norm": 0.14905090630054474, + "learning_rate": 0.001, + "loss": 2.5749, + "step": 22723 + }, + { + "epoch": 0.961333446146036, + "grad_norm": 0.3839198350906372, + "learning_rate": 0.001, + "loss": 3.2174, + "step": 22724 + }, + { + "epoch": 0.9613757509095524, + "grad_norm": 0.15603891015052795, + "learning_rate": 0.001, + "loss": 1.784, + "step": 22725 + }, + { + "epoch": 0.9614180556730688, + "grad_norm": 0.16384871304035187, + "learning_rate": 0.001, + "loss": 2.1131, + "step": 22726 + }, + { + "epoch": 0.9614603604365851, + "grad_norm": 0.1653030961751938, + "learning_rate": 0.001, + "loss": 2.8662, + "step": 22727 + }, + { + "epoch": 0.9615026652001015, + "grad_norm": 0.15746630728244781, + "learning_rate": 0.001, + "loss": 2.3647, + "step": 22728 + }, + { + "epoch": 0.9615449699636179, + "grad_norm": 0.1539701670408249, + "learning_rate": 0.001, + "loss": 1.9527, + "step": 22729 + }, + { + "epoch": 0.9615872747271342, + "grad_norm": 0.15698012709617615, + "learning_rate": 0.001, + "loss": 1.9104, + "step": 22730 + }, + { + "epoch": 0.9616295794906506, + "grad_norm": 0.17898909747600555, + "learning_rate": 0.001, + "loss": 2.0614, + "step": 22731 + }, + { + "epoch": 0.961671884254167, + "grad_norm": 0.1490541249513626, + "learning_rate": 0.001, + "loss": 1.9078, + "step": 22732 + }, + { + "epoch": 0.9617141890176834, + "grad_norm": 0.17301572859287262, + "learning_rate": 0.001, + "loss": 1.5119, + "step": 22733 + }, + { + "epoch": 0.9617564937811998, + "grad_norm": 1.670694351196289, + "learning_rate": 0.001, + "loss": 1.6093, + "step": 22734 + }, + { + "epoch": 0.9617987985447162, + "grad_norm": 0.16678212583065033, + "learning_rate": 0.001, + "loss": 1.5122, + "step": 22735 + }, + { + "epoch": 0.9618411033082325, + "grad_norm": 0.15618404746055603, + "learning_rate": 0.001, + "loss": 1.4957, + "step": 22736 + }, + { + "epoch": 0.9618834080717489, + "grad_norm": 0.15901844203472137, + "learning_rate": 0.001, + "loss": 2.9647, + "step": 22737 + }, + { + "epoch": 0.9619257128352653, + "grad_norm": 0.162552148103714, + "learning_rate": 0.001, + "loss": 2.2336, + "step": 22738 + }, + { + "epoch": 0.9619680175987816, + "grad_norm": 0.6180990934371948, + "learning_rate": 0.001, + "loss": 2.6319, + "step": 22739 + }, + { + "epoch": 0.962010322362298, + "grad_norm": 0.16461017727851868, + "learning_rate": 0.001, + "loss": 2.8503, + "step": 22740 + }, + { + "epoch": 0.9620526271258144, + "grad_norm": 0.15487508475780487, + "learning_rate": 0.001, + "loss": 1.7637, + "step": 22741 + }, + { + "epoch": 0.9620949318893307, + "grad_norm": 0.16154402494430542, + "learning_rate": 0.001, + "loss": 1.7477, + "step": 22742 + }, + { + "epoch": 0.9621372366528471, + "grad_norm": 0.162453293800354, + "learning_rate": 0.001, + "loss": 1.1528, + "step": 22743 + }, + { + "epoch": 0.9621795414163635, + "grad_norm": 0.15126554667949677, + "learning_rate": 0.001, + "loss": 2.702, + "step": 22744 + }, + { + "epoch": 0.9622218461798798, + "grad_norm": 0.13284894824028015, + "learning_rate": 0.001, + "loss": 3.2294, + "step": 22745 + }, + { + "epoch": 0.9622641509433962, + "grad_norm": 1.1502666473388672, + "learning_rate": 0.001, + "loss": 2.4184, + "step": 22746 + }, + { + "epoch": 0.9623064557069126, + "grad_norm": 0.1922639161348343, + "learning_rate": 0.001, + "loss": 2.0937, + "step": 22747 + }, + { + "epoch": 0.9623487604704289, + "grad_norm": 0.16614429652690887, + "learning_rate": 0.001, + "loss": 1.959, + "step": 22748 + }, + { + "epoch": 0.9623910652339454, + "grad_norm": 0.1562388688325882, + "learning_rate": 0.001, + "loss": 2.9115, + "step": 22749 + }, + { + "epoch": 0.9624333699974618, + "grad_norm": 0.15959501266479492, + "learning_rate": 0.001, + "loss": 1.7341, + "step": 22750 + }, + { + "epoch": 0.9624756747609781, + "grad_norm": 0.5524002909660339, + "learning_rate": 0.001, + "loss": 1.8583, + "step": 22751 + }, + { + "epoch": 0.9625179795244945, + "grad_norm": 0.16397516429424286, + "learning_rate": 0.001, + "loss": 1.9359, + "step": 22752 + }, + { + "epoch": 0.9625602842880109, + "grad_norm": 0.19087645411491394, + "learning_rate": 0.001, + "loss": 2.3213, + "step": 22753 + }, + { + "epoch": 0.9626025890515272, + "grad_norm": 0.18343958258628845, + "learning_rate": 0.001, + "loss": 2.2451, + "step": 22754 + }, + { + "epoch": 0.9626448938150436, + "grad_norm": 0.17732329666614532, + "learning_rate": 0.001, + "loss": 2.5962, + "step": 22755 + }, + { + "epoch": 0.96268719857856, + "grad_norm": 0.1700247973203659, + "learning_rate": 0.001, + "loss": 1.6104, + "step": 22756 + }, + { + "epoch": 0.9627295033420763, + "grad_norm": 18.20061492919922, + "learning_rate": 0.001, + "loss": 2.0838, + "step": 22757 + }, + { + "epoch": 0.9627718081055927, + "grad_norm": 0.8646219372749329, + "learning_rate": 0.001, + "loss": 3.463, + "step": 22758 + }, + { + "epoch": 0.962814112869109, + "grad_norm": 0.6529208421707153, + "learning_rate": 0.001, + "loss": 3.052, + "step": 22759 + }, + { + "epoch": 0.9628564176326254, + "grad_norm": 0.16437289118766785, + "learning_rate": 0.001, + "loss": 1.9609, + "step": 22760 + }, + { + "epoch": 0.9628987223961418, + "grad_norm": 0.2545870840549469, + "learning_rate": 0.001, + "loss": 2.0325, + "step": 22761 + }, + { + "epoch": 0.9629410271596581, + "grad_norm": 0.4441566467285156, + "learning_rate": 0.001, + "loss": 2.3939, + "step": 22762 + }, + { + "epoch": 0.9629833319231745, + "grad_norm": 0.17705874145030975, + "learning_rate": 0.001, + "loss": 2.0372, + "step": 22763 + }, + { + "epoch": 0.963025636686691, + "grad_norm": 0.14965033531188965, + "learning_rate": 0.001, + "loss": 1.515, + "step": 22764 + }, + { + "epoch": 0.9630679414502072, + "grad_norm": 0.1866179257631302, + "learning_rate": 0.001, + "loss": 2.3098, + "step": 22765 + }, + { + "epoch": 0.9631102462137237, + "grad_norm": 2.7773499488830566, + "learning_rate": 0.001, + "loss": 2.5506, + "step": 22766 + }, + { + "epoch": 0.9631525509772401, + "grad_norm": 0.18301425874233246, + "learning_rate": 0.001, + "loss": 1.7283, + "step": 22767 + }, + { + "epoch": 0.9631948557407564, + "grad_norm": 0.15599261224269867, + "learning_rate": 0.001, + "loss": 2.4437, + "step": 22768 + }, + { + "epoch": 0.9632371605042728, + "grad_norm": 0.35363784432411194, + "learning_rate": 0.001, + "loss": 2.3903, + "step": 22769 + }, + { + "epoch": 0.9632794652677892, + "grad_norm": 5.986473560333252, + "learning_rate": 0.001, + "loss": 1.5687, + "step": 22770 + }, + { + "epoch": 0.9633217700313055, + "grad_norm": 0.20316632091999054, + "learning_rate": 0.001, + "loss": 2.2692, + "step": 22771 + }, + { + "epoch": 0.9633640747948219, + "grad_norm": 0.16965316236019135, + "learning_rate": 0.001, + "loss": 2.4032, + "step": 22772 + }, + { + "epoch": 0.9634063795583383, + "grad_norm": 0.18180473148822784, + "learning_rate": 0.001, + "loss": 2.0577, + "step": 22773 + }, + { + "epoch": 0.9634486843218546, + "grad_norm": 0.16141119599342346, + "learning_rate": 0.001, + "loss": 1.8242, + "step": 22774 + }, + { + "epoch": 0.963490989085371, + "grad_norm": 0.18191201984882355, + "learning_rate": 0.001, + "loss": 2.1905, + "step": 22775 + }, + { + "epoch": 0.9635332938488874, + "grad_norm": 0.783811092376709, + "learning_rate": 0.001, + "loss": 3.1841, + "step": 22776 + }, + { + "epoch": 0.9635755986124037, + "grad_norm": 0.3125717043876648, + "learning_rate": 0.001, + "loss": 1.7756, + "step": 22777 + }, + { + "epoch": 0.9636179033759201, + "grad_norm": 0.25806742906570435, + "learning_rate": 0.001, + "loss": 1.7628, + "step": 22778 + }, + { + "epoch": 0.9636602081394365, + "grad_norm": 0.16139912605285645, + "learning_rate": 0.001, + "loss": 1.7258, + "step": 22779 + }, + { + "epoch": 0.9637025129029528, + "grad_norm": 0.15739895403385162, + "learning_rate": 0.001, + "loss": 1.9454, + "step": 22780 + }, + { + "epoch": 0.9637448176664692, + "grad_norm": 0.15625838935375214, + "learning_rate": 0.001, + "loss": 2.3343, + "step": 22781 + }, + { + "epoch": 0.9637871224299857, + "grad_norm": 0.15970557928085327, + "learning_rate": 0.001, + "loss": 2.6073, + "step": 22782 + }, + { + "epoch": 0.963829427193502, + "grad_norm": 2.783663272857666, + "learning_rate": 0.001, + "loss": 2.5539, + "step": 22783 + }, + { + "epoch": 0.9638717319570184, + "grad_norm": 0.15198123455047607, + "learning_rate": 0.001, + "loss": 1.7133, + "step": 22784 + }, + { + "epoch": 0.9639140367205348, + "grad_norm": 0.25221678614616394, + "learning_rate": 0.001, + "loss": 3.6873, + "step": 22785 + }, + { + "epoch": 0.9639563414840511, + "grad_norm": 0.1301819235086441, + "learning_rate": 0.001, + "loss": 2.8356, + "step": 22786 + }, + { + "epoch": 0.9639986462475675, + "grad_norm": 0.2297348827123642, + "learning_rate": 0.001, + "loss": 2.3075, + "step": 22787 + }, + { + "epoch": 0.9640409510110839, + "grad_norm": 0.15260782837867737, + "learning_rate": 0.001, + "loss": 1.8023, + "step": 22788 + }, + { + "epoch": 0.9640832557746002, + "grad_norm": 0.18781907856464386, + "learning_rate": 0.001, + "loss": 2.7565, + "step": 22789 + }, + { + "epoch": 0.9641255605381166, + "grad_norm": 0.38502037525177, + "learning_rate": 0.001, + "loss": 2.6757, + "step": 22790 + }, + { + "epoch": 0.964167865301633, + "grad_norm": 0.1971847265958786, + "learning_rate": 0.001, + "loss": 1.6928, + "step": 22791 + }, + { + "epoch": 0.9642101700651493, + "grad_norm": 0.1461469531059265, + "learning_rate": 0.001, + "loss": 2.707, + "step": 22792 + }, + { + "epoch": 0.9642524748286657, + "grad_norm": 0.15411047637462616, + "learning_rate": 0.001, + "loss": 1.8741, + "step": 22793 + }, + { + "epoch": 0.9642947795921821, + "grad_norm": 0.15573959052562714, + "learning_rate": 0.001, + "loss": 2.3374, + "step": 22794 + }, + { + "epoch": 0.9643370843556984, + "grad_norm": 0.1828451305627823, + "learning_rate": 0.001, + "loss": 2.5809, + "step": 22795 + }, + { + "epoch": 0.9643793891192148, + "grad_norm": 2.2866430282592773, + "learning_rate": 0.001, + "loss": 2.0316, + "step": 22796 + }, + { + "epoch": 0.9644216938827312, + "grad_norm": 5.461979866027832, + "learning_rate": 0.001, + "loss": 1.6355, + "step": 22797 + }, + { + "epoch": 0.9644639986462475, + "grad_norm": 0.24063386023044586, + "learning_rate": 0.001, + "loss": 1.7713, + "step": 22798 + }, + { + "epoch": 0.964506303409764, + "grad_norm": 0.15354004502296448, + "learning_rate": 0.001, + "loss": 1.7979, + "step": 22799 + }, + { + "epoch": 0.9645486081732804, + "grad_norm": 0.20435552299022675, + "learning_rate": 0.001, + "loss": 1.8636, + "step": 22800 + }, + { + "epoch": 0.9645909129367967, + "grad_norm": 0.17988991737365723, + "learning_rate": 0.001, + "loss": 3.3384, + "step": 22801 + }, + { + "epoch": 0.9646332177003131, + "grad_norm": 0.19008944928646088, + "learning_rate": 0.001, + "loss": 1.8226, + "step": 22802 + }, + { + "epoch": 0.9646755224638295, + "grad_norm": 0.20340676605701447, + "learning_rate": 0.001, + "loss": 3.0798, + "step": 22803 + }, + { + "epoch": 0.9647178272273458, + "grad_norm": 0.18513543903827667, + "learning_rate": 0.001, + "loss": 1.7459, + "step": 22804 + }, + { + "epoch": 0.9647601319908622, + "grad_norm": 0.18096236884593964, + "learning_rate": 0.001, + "loss": 2.0214, + "step": 22805 + }, + { + "epoch": 0.9648024367543785, + "grad_norm": 0.20372340083122253, + "learning_rate": 0.001, + "loss": 2.1574, + "step": 22806 + }, + { + "epoch": 0.9648447415178949, + "grad_norm": 0.1942087858915329, + "learning_rate": 0.001, + "loss": 1.8102, + "step": 22807 + }, + { + "epoch": 0.9648870462814113, + "grad_norm": 0.16786138713359833, + "learning_rate": 0.001, + "loss": 1.7745, + "step": 22808 + }, + { + "epoch": 0.9649293510449276, + "grad_norm": 21.640262603759766, + "learning_rate": 0.001, + "loss": 2.6787, + "step": 22809 + }, + { + "epoch": 0.964971655808444, + "grad_norm": 0.17724758386611938, + "learning_rate": 0.001, + "loss": 2.0269, + "step": 22810 + }, + { + "epoch": 0.9650139605719604, + "grad_norm": 0.17593099176883698, + "learning_rate": 0.001, + "loss": 2.6293, + "step": 22811 + }, + { + "epoch": 0.9650562653354767, + "grad_norm": 0.1671978384256363, + "learning_rate": 0.001, + "loss": 2.3804, + "step": 22812 + }, + { + "epoch": 0.9650985700989931, + "grad_norm": 0.19632820785045624, + "learning_rate": 0.001, + "loss": 1.808, + "step": 22813 + }, + { + "epoch": 0.9651408748625095, + "grad_norm": 0.35916808247566223, + "learning_rate": 0.001, + "loss": 1.8704, + "step": 22814 + }, + { + "epoch": 0.9651831796260258, + "grad_norm": 0.16983725130558014, + "learning_rate": 0.001, + "loss": 3.2959, + "step": 22815 + }, + { + "epoch": 0.9652254843895423, + "grad_norm": 0.6574493050575256, + "learning_rate": 0.001, + "loss": 3.0752, + "step": 22816 + }, + { + "epoch": 0.9652677891530587, + "grad_norm": 0.14061124622821808, + "learning_rate": 0.001, + "loss": 2.3643, + "step": 22817 + }, + { + "epoch": 0.965310093916575, + "grad_norm": 0.12727145850658417, + "learning_rate": 0.001, + "loss": 2.3003, + "step": 22818 + }, + { + "epoch": 0.9653523986800914, + "grad_norm": 0.15103477239608765, + "learning_rate": 0.001, + "loss": 2.3464, + "step": 22819 + }, + { + "epoch": 0.9653947034436078, + "grad_norm": 0.15163427591323853, + "learning_rate": 0.001, + "loss": 1.8442, + "step": 22820 + }, + { + "epoch": 0.9654370082071241, + "grad_norm": 0.16377829015254974, + "learning_rate": 0.001, + "loss": 2.1828, + "step": 22821 + }, + { + "epoch": 0.9654793129706405, + "grad_norm": 0.44327592849731445, + "learning_rate": 0.001, + "loss": 2.5437, + "step": 22822 + }, + { + "epoch": 0.9655216177341569, + "grad_norm": 0.2196311205625534, + "learning_rate": 0.001, + "loss": 1.9828, + "step": 22823 + }, + { + "epoch": 0.9655639224976732, + "grad_norm": 0.13134104013442993, + "learning_rate": 0.001, + "loss": 1.5855, + "step": 22824 + }, + { + "epoch": 0.9656062272611896, + "grad_norm": 0.4789327085018158, + "learning_rate": 0.001, + "loss": 2.3847, + "step": 22825 + }, + { + "epoch": 0.965648532024706, + "grad_norm": 0.14557301998138428, + "learning_rate": 0.001, + "loss": 2.6556, + "step": 22826 + }, + { + "epoch": 0.9656908367882223, + "grad_norm": 0.1414880007505417, + "learning_rate": 0.001, + "loss": 1.5357, + "step": 22827 + }, + { + "epoch": 0.9657331415517387, + "grad_norm": 0.14299647510051727, + "learning_rate": 0.001, + "loss": 2.0178, + "step": 22828 + }, + { + "epoch": 0.9657754463152551, + "grad_norm": 1.3442524671554565, + "learning_rate": 0.001, + "loss": 1.8132, + "step": 22829 + }, + { + "epoch": 0.9658177510787714, + "grad_norm": 0.14745590090751648, + "learning_rate": 0.001, + "loss": 2.4792, + "step": 22830 + }, + { + "epoch": 0.9658600558422878, + "grad_norm": 0.14321614801883698, + "learning_rate": 0.001, + "loss": 2.3954, + "step": 22831 + }, + { + "epoch": 0.9659023606058043, + "grad_norm": 0.15746840834617615, + "learning_rate": 0.001, + "loss": 1.694, + "step": 22832 + }, + { + "epoch": 0.9659446653693206, + "grad_norm": 0.15602366626262665, + "learning_rate": 0.001, + "loss": 2.3334, + "step": 22833 + }, + { + "epoch": 0.965986970132837, + "grad_norm": 0.1474500298500061, + "learning_rate": 0.001, + "loss": 2.1187, + "step": 22834 + }, + { + "epoch": 0.9660292748963534, + "grad_norm": 1.2670032978057861, + "learning_rate": 0.001, + "loss": 1.8418, + "step": 22835 + }, + { + "epoch": 0.9660715796598697, + "grad_norm": 0.20300118625164032, + "learning_rate": 0.001, + "loss": 1.9915, + "step": 22836 + }, + { + "epoch": 0.9661138844233861, + "grad_norm": 0.20837882161140442, + "learning_rate": 0.001, + "loss": 2.6805, + "step": 22837 + }, + { + "epoch": 0.9661561891869025, + "grad_norm": 0.1467253714799881, + "learning_rate": 0.001, + "loss": 1.7238, + "step": 22838 + }, + { + "epoch": 0.9661984939504188, + "grad_norm": 8.646101951599121, + "learning_rate": 0.001, + "loss": 2.1153, + "step": 22839 + }, + { + "epoch": 0.9662407987139352, + "grad_norm": 0.12623724341392517, + "learning_rate": 0.001, + "loss": 1.9916, + "step": 22840 + }, + { + "epoch": 0.9662831034774516, + "grad_norm": 0.1795465052127838, + "learning_rate": 0.001, + "loss": 2.2702, + "step": 22841 + }, + { + "epoch": 0.9663254082409679, + "grad_norm": 0.14602890610694885, + "learning_rate": 0.001, + "loss": 2.0795, + "step": 22842 + }, + { + "epoch": 0.9663677130044843, + "grad_norm": 0.18272976577281952, + "learning_rate": 0.001, + "loss": 2.061, + "step": 22843 + }, + { + "epoch": 0.9664100177680007, + "grad_norm": 0.19199426472187042, + "learning_rate": 0.001, + "loss": 2.0847, + "step": 22844 + }, + { + "epoch": 0.966452322531517, + "grad_norm": 0.19457626342773438, + "learning_rate": 0.001, + "loss": 1.9797, + "step": 22845 + }, + { + "epoch": 0.9664946272950334, + "grad_norm": 0.29736241698265076, + "learning_rate": 0.001, + "loss": 3.3281, + "step": 22846 + }, + { + "epoch": 0.9665369320585498, + "grad_norm": 0.29135966300964355, + "learning_rate": 0.001, + "loss": 5.947, + "step": 22847 + }, + { + "epoch": 0.9665792368220661, + "grad_norm": 3.7206881046295166, + "learning_rate": 0.001, + "loss": 1.9796, + "step": 22848 + }, + { + "epoch": 0.9666215415855826, + "grad_norm": 0.14259220659732819, + "learning_rate": 0.001, + "loss": 3.0848, + "step": 22849 + }, + { + "epoch": 0.9666638463490989, + "grad_norm": 2.7031147480010986, + "learning_rate": 0.001, + "loss": 2.8408, + "step": 22850 + }, + { + "epoch": 0.9667061511126153, + "grad_norm": 0.1781085729598999, + "learning_rate": 0.001, + "loss": 2.7997, + "step": 22851 + }, + { + "epoch": 0.9667484558761317, + "grad_norm": 0.1732090413570404, + "learning_rate": 0.001, + "loss": 1.9524, + "step": 22852 + }, + { + "epoch": 0.966790760639648, + "grad_norm": 0.13723929226398468, + "learning_rate": 0.001, + "loss": 1.7664, + "step": 22853 + }, + { + "epoch": 0.9668330654031644, + "grad_norm": 0.1692230999469757, + "learning_rate": 0.001, + "loss": 2.081, + "step": 22854 + }, + { + "epoch": 0.9668753701666808, + "grad_norm": 0.1593841016292572, + "learning_rate": 0.001, + "loss": 1.6948, + "step": 22855 + }, + { + "epoch": 0.9669176749301971, + "grad_norm": 0.13628552854061127, + "learning_rate": 0.001, + "loss": 2.2481, + "step": 22856 + }, + { + "epoch": 0.9669599796937135, + "grad_norm": 0.1631813496351242, + "learning_rate": 0.001, + "loss": 1.9093, + "step": 22857 + }, + { + "epoch": 0.9670022844572299, + "grad_norm": 0.16931277513504028, + "learning_rate": 0.001, + "loss": 3.1397, + "step": 22858 + }, + { + "epoch": 0.9670445892207462, + "grad_norm": 0.14777718484401703, + "learning_rate": 0.001, + "loss": 1.9937, + "step": 22859 + }, + { + "epoch": 0.9670868939842626, + "grad_norm": 0.23261679708957672, + "learning_rate": 0.001, + "loss": 1.5095, + "step": 22860 + }, + { + "epoch": 0.967129198747779, + "grad_norm": 0.23100370168685913, + "learning_rate": 0.001, + "loss": 3.1784, + "step": 22861 + }, + { + "epoch": 0.9671715035112953, + "grad_norm": 0.17123155295848846, + "learning_rate": 0.001, + "loss": 2.0223, + "step": 22862 + }, + { + "epoch": 0.9672138082748117, + "grad_norm": 0.16940762102603912, + "learning_rate": 0.001, + "loss": 2.0325, + "step": 22863 + }, + { + "epoch": 0.9672561130383281, + "grad_norm": 0.15157361328601837, + "learning_rate": 0.001, + "loss": 2.8099, + "step": 22864 + }, + { + "epoch": 0.9672984178018444, + "grad_norm": 0.1540924608707428, + "learning_rate": 0.001, + "loss": 1.6055, + "step": 22865 + }, + { + "epoch": 0.9673407225653609, + "grad_norm": 0.17207752168178558, + "learning_rate": 0.001, + "loss": 1.7778, + "step": 22866 + }, + { + "epoch": 0.9673830273288773, + "grad_norm": 0.2308436930179596, + "learning_rate": 0.001, + "loss": 2.6066, + "step": 22867 + }, + { + "epoch": 0.9674253320923936, + "grad_norm": 0.15052172541618347, + "learning_rate": 0.001, + "loss": 1.6072, + "step": 22868 + }, + { + "epoch": 0.96746763685591, + "grad_norm": 0.13643066585063934, + "learning_rate": 0.001, + "loss": 1.8093, + "step": 22869 + }, + { + "epoch": 0.9675099416194264, + "grad_norm": 0.16846615076065063, + "learning_rate": 0.001, + "loss": 1.918, + "step": 22870 + }, + { + "epoch": 0.9675522463829427, + "grad_norm": 1.7029542922973633, + "learning_rate": 0.001, + "loss": 1.7533, + "step": 22871 + }, + { + "epoch": 0.9675945511464591, + "grad_norm": 0.16174376010894775, + "learning_rate": 0.001, + "loss": 2.0584, + "step": 22872 + }, + { + "epoch": 0.9676368559099755, + "grad_norm": 0.17282286286354065, + "learning_rate": 0.001, + "loss": 1.9735, + "step": 22873 + }, + { + "epoch": 0.9676791606734918, + "grad_norm": 3.2396557331085205, + "learning_rate": 0.001, + "loss": 2.9205, + "step": 22874 + }, + { + "epoch": 0.9677214654370082, + "grad_norm": 0.18239837884902954, + "learning_rate": 0.001, + "loss": 1.9157, + "step": 22875 + }, + { + "epoch": 0.9677637702005246, + "grad_norm": 0.16040238738059998, + "learning_rate": 0.001, + "loss": 2.5035, + "step": 22876 + }, + { + "epoch": 0.9678060749640409, + "grad_norm": 0.31164249777793884, + "learning_rate": 0.001, + "loss": 2.2374, + "step": 22877 + }, + { + "epoch": 0.9678483797275573, + "grad_norm": 0.1581432968378067, + "learning_rate": 0.001, + "loss": 1.8085, + "step": 22878 + }, + { + "epoch": 0.9678906844910737, + "grad_norm": 0.17101150751113892, + "learning_rate": 0.001, + "loss": 1.9065, + "step": 22879 + }, + { + "epoch": 0.96793298925459, + "grad_norm": 0.1634765863418579, + "learning_rate": 0.001, + "loss": 1.3569, + "step": 22880 + }, + { + "epoch": 0.9679752940181064, + "grad_norm": 0.1971825212240219, + "learning_rate": 0.001, + "loss": 1.9538, + "step": 22881 + }, + { + "epoch": 0.9680175987816229, + "grad_norm": 0.17837342619895935, + "learning_rate": 0.001, + "loss": 1.7106, + "step": 22882 + }, + { + "epoch": 0.9680599035451392, + "grad_norm": 0.18054227530956268, + "learning_rate": 0.001, + "loss": 2.0404, + "step": 22883 + }, + { + "epoch": 0.9681022083086556, + "grad_norm": 0.16137461364269257, + "learning_rate": 0.001, + "loss": 1.4786, + "step": 22884 + }, + { + "epoch": 0.968144513072172, + "grad_norm": 0.1961548924446106, + "learning_rate": 0.001, + "loss": 2.127, + "step": 22885 + }, + { + "epoch": 0.9681868178356883, + "grad_norm": 0.1829284280538559, + "learning_rate": 0.001, + "loss": 1.826, + "step": 22886 + }, + { + "epoch": 0.9682291225992047, + "grad_norm": 0.16938935220241547, + "learning_rate": 0.001, + "loss": 2.0336, + "step": 22887 + }, + { + "epoch": 0.9682714273627211, + "grad_norm": 0.1795877367258072, + "learning_rate": 0.001, + "loss": 2.4894, + "step": 22888 + }, + { + "epoch": 0.9683137321262374, + "grad_norm": 0.16748693585395813, + "learning_rate": 0.001, + "loss": 2.4214, + "step": 22889 + }, + { + "epoch": 0.9683560368897538, + "grad_norm": 0.15759611129760742, + "learning_rate": 0.001, + "loss": 1.9249, + "step": 22890 + }, + { + "epoch": 0.9683983416532702, + "grad_norm": 0.16091367602348328, + "learning_rate": 0.001, + "loss": 1.8189, + "step": 22891 + }, + { + "epoch": 0.9684406464167865, + "grad_norm": 0.1906583309173584, + "learning_rate": 0.001, + "loss": 2.3985, + "step": 22892 + }, + { + "epoch": 0.9684829511803029, + "grad_norm": 0.15400663018226624, + "learning_rate": 0.001, + "loss": 2.1882, + "step": 22893 + }, + { + "epoch": 0.9685252559438192, + "grad_norm": 0.1885334551334381, + "learning_rate": 0.001, + "loss": 1.759, + "step": 22894 + }, + { + "epoch": 0.9685675607073356, + "grad_norm": 0.2451082170009613, + "learning_rate": 0.001, + "loss": 2.4656, + "step": 22895 + }, + { + "epoch": 0.968609865470852, + "grad_norm": 0.25269943475723267, + "learning_rate": 0.001, + "loss": 2.1214, + "step": 22896 + }, + { + "epoch": 0.9686521702343683, + "grad_norm": 0.21075433492660522, + "learning_rate": 0.001, + "loss": 1.8323, + "step": 22897 + }, + { + "epoch": 0.9686944749978847, + "grad_norm": 0.168150395154953, + "learning_rate": 0.001, + "loss": 1.8325, + "step": 22898 + }, + { + "epoch": 0.9687367797614012, + "grad_norm": 0.1418176144361496, + "learning_rate": 0.001, + "loss": 2.0136, + "step": 22899 + }, + { + "epoch": 0.9687790845249175, + "grad_norm": 0.21564440429210663, + "learning_rate": 0.001, + "loss": 2.308, + "step": 22900 + }, + { + "epoch": 0.9688213892884339, + "grad_norm": 0.18524868786334991, + "learning_rate": 0.001, + "loss": 1.8404, + "step": 22901 + }, + { + "epoch": 0.9688636940519503, + "grad_norm": 0.13573579490184784, + "learning_rate": 0.001, + "loss": 1.5914, + "step": 22902 + }, + { + "epoch": 0.9689059988154666, + "grad_norm": 0.7693604826927185, + "learning_rate": 0.001, + "loss": 1.7203, + "step": 22903 + }, + { + "epoch": 0.968948303578983, + "grad_norm": 0.2601660490036011, + "learning_rate": 0.001, + "loss": 2.7439, + "step": 22904 + }, + { + "epoch": 0.9689906083424994, + "grad_norm": 0.16246432065963745, + "learning_rate": 0.001, + "loss": 2.6348, + "step": 22905 + }, + { + "epoch": 0.9690329131060157, + "grad_norm": 0.2047150582075119, + "learning_rate": 0.001, + "loss": 2.1811, + "step": 22906 + }, + { + "epoch": 0.9690752178695321, + "grad_norm": 0.19627639651298523, + "learning_rate": 0.001, + "loss": 1.8717, + "step": 22907 + }, + { + "epoch": 0.9691175226330485, + "grad_norm": 0.2079043984413147, + "learning_rate": 0.001, + "loss": 3.3459, + "step": 22908 + }, + { + "epoch": 0.9691598273965648, + "grad_norm": 0.16984863579273224, + "learning_rate": 0.001, + "loss": 1.721, + "step": 22909 + }, + { + "epoch": 0.9692021321600812, + "grad_norm": 0.23695328831672668, + "learning_rate": 0.001, + "loss": 1.92, + "step": 22910 + }, + { + "epoch": 0.9692444369235976, + "grad_norm": 0.13472682237625122, + "learning_rate": 0.001, + "loss": 1.5671, + "step": 22911 + }, + { + "epoch": 0.9692867416871139, + "grad_norm": 0.1398351788520813, + "learning_rate": 0.001, + "loss": 2.6105, + "step": 22912 + }, + { + "epoch": 0.9693290464506303, + "grad_norm": 0.16584454476833344, + "learning_rate": 0.001, + "loss": 2.4794, + "step": 22913 + }, + { + "epoch": 0.9693713512141467, + "grad_norm": 0.20908065140247345, + "learning_rate": 0.001, + "loss": 4.0282, + "step": 22914 + }, + { + "epoch": 0.969413655977663, + "grad_norm": 0.14626561105251312, + "learning_rate": 0.001, + "loss": 3.0325, + "step": 22915 + }, + { + "epoch": 0.9694559607411795, + "grad_norm": 0.190009206533432, + "learning_rate": 0.001, + "loss": 1.9914, + "step": 22916 + }, + { + "epoch": 0.9694982655046959, + "grad_norm": 0.30894187092781067, + "learning_rate": 0.001, + "loss": 1.9338, + "step": 22917 + }, + { + "epoch": 0.9695405702682122, + "grad_norm": 0.1590581089258194, + "learning_rate": 0.001, + "loss": 2.0382, + "step": 22918 + }, + { + "epoch": 0.9695828750317286, + "grad_norm": 0.1639058142900467, + "learning_rate": 0.001, + "loss": 1.5866, + "step": 22919 + }, + { + "epoch": 0.969625179795245, + "grad_norm": 0.1441287249326706, + "learning_rate": 0.001, + "loss": 1.7956, + "step": 22920 + }, + { + "epoch": 0.9696674845587613, + "grad_norm": 0.14907407760620117, + "learning_rate": 0.001, + "loss": 2.4144, + "step": 22921 + }, + { + "epoch": 0.9697097893222777, + "grad_norm": 0.17159616947174072, + "learning_rate": 0.001, + "loss": 1.4419, + "step": 22922 + }, + { + "epoch": 0.9697520940857941, + "grad_norm": 0.1924789398908615, + "learning_rate": 0.001, + "loss": 2.0441, + "step": 22923 + }, + { + "epoch": 0.9697943988493104, + "grad_norm": 0.8495581150054932, + "learning_rate": 0.001, + "loss": 2.2902, + "step": 22924 + }, + { + "epoch": 0.9698367036128268, + "grad_norm": 0.19208434224128723, + "learning_rate": 0.001, + "loss": 3.4082, + "step": 22925 + }, + { + "epoch": 0.9698790083763432, + "grad_norm": 0.16336600482463837, + "learning_rate": 0.001, + "loss": 1.8806, + "step": 22926 + }, + { + "epoch": 0.9699213131398595, + "grad_norm": 0.3081340491771698, + "learning_rate": 0.001, + "loss": 2.5719, + "step": 22927 + }, + { + "epoch": 0.9699636179033759, + "grad_norm": 0.743816614151001, + "learning_rate": 0.001, + "loss": 1.715, + "step": 22928 + }, + { + "epoch": 0.9700059226668923, + "grad_norm": 0.14184142649173737, + "learning_rate": 0.001, + "loss": 2.3673, + "step": 22929 + }, + { + "epoch": 0.9700482274304086, + "grad_norm": 0.14244288206100464, + "learning_rate": 0.001, + "loss": 2.542, + "step": 22930 + }, + { + "epoch": 0.970090532193925, + "grad_norm": 0.15329469740390778, + "learning_rate": 0.001, + "loss": 2.1073, + "step": 22931 + }, + { + "epoch": 0.9701328369574415, + "grad_norm": 7.721384048461914, + "learning_rate": 0.001, + "loss": 2.1494, + "step": 22932 + }, + { + "epoch": 0.9701751417209578, + "grad_norm": 0.22561503946781158, + "learning_rate": 0.001, + "loss": 1.9826, + "step": 22933 + }, + { + "epoch": 0.9702174464844742, + "grad_norm": 0.3391435444355011, + "learning_rate": 0.001, + "loss": 1.8618, + "step": 22934 + }, + { + "epoch": 0.9702597512479906, + "grad_norm": 0.17945805191993713, + "learning_rate": 0.001, + "loss": 1.4256, + "step": 22935 + }, + { + "epoch": 0.9703020560115069, + "grad_norm": 0.1751527637243271, + "learning_rate": 0.001, + "loss": 2.4109, + "step": 22936 + }, + { + "epoch": 0.9703443607750233, + "grad_norm": 0.15709732472896576, + "learning_rate": 0.001, + "loss": 1.6146, + "step": 22937 + }, + { + "epoch": 0.9703866655385397, + "grad_norm": 0.16432954370975494, + "learning_rate": 0.001, + "loss": 2.0583, + "step": 22938 + }, + { + "epoch": 0.970428970302056, + "grad_norm": 0.44643503427505493, + "learning_rate": 0.001, + "loss": 2.3525, + "step": 22939 + }, + { + "epoch": 0.9704712750655724, + "grad_norm": 0.1962294727563858, + "learning_rate": 0.001, + "loss": 1.874, + "step": 22940 + }, + { + "epoch": 0.9705135798290887, + "grad_norm": 0.1391528844833374, + "learning_rate": 0.001, + "loss": 2.7016, + "step": 22941 + }, + { + "epoch": 0.9705558845926051, + "grad_norm": 0.15284281969070435, + "learning_rate": 0.001, + "loss": 2.5978, + "step": 22942 + }, + { + "epoch": 0.9705981893561215, + "grad_norm": 0.19085395336151123, + "learning_rate": 0.001, + "loss": 1.2241, + "step": 22943 + }, + { + "epoch": 0.9706404941196378, + "grad_norm": 0.17279383540153503, + "learning_rate": 0.001, + "loss": 2.0888, + "step": 22944 + }, + { + "epoch": 0.9706827988831542, + "grad_norm": 0.15863092243671417, + "learning_rate": 0.001, + "loss": 1.6636, + "step": 22945 + }, + { + "epoch": 0.9707251036466706, + "grad_norm": 0.1498878002166748, + "learning_rate": 0.001, + "loss": 1.7367, + "step": 22946 + }, + { + "epoch": 0.9707674084101869, + "grad_norm": 0.14784745872020721, + "learning_rate": 0.001, + "loss": 1.7366, + "step": 22947 + }, + { + "epoch": 0.9708097131737033, + "grad_norm": 0.14569970965385437, + "learning_rate": 0.001, + "loss": 1.6962, + "step": 22948 + }, + { + "epoch": 0.9708520179372198, + "grad_norm": 0.13814716041088104, + "learning_rate": 0.001, + "loss": 1.8074, + "step": 22949 + }, + { + "epoch": 0.9708943227007361, + "grad_norm": 0.18621012568473816, + "learning_rate": 0.001, + "loss": 2.2638, + "step": 22950 + }, + { + "epoch": 0.9709366274642525, + "grad_norm": 0.1415053755044937, + "learning_rate": 0.001, + "loss": 1.6322, + "step": 22951 + }, + { + "epoch": 0.9709789322277689, + "grad_norm": 0.1956765204668045, + "learning_rate": 0.001, + "loss": 2.0337, + "step": 22952 + }, + { + "epoch": 0.9710212369912852, + "grad_norm": 0.16318294405937195, + "learning_rate": 0.001, + "loss": 2.2628, + "step": 22953 + }, + { + "epoch": 0.9710635417548016, + "grad_norm": 6.47411584854126, + "learning_rate": 0.001, + "loss": 2.0023, + "step": 22954 + }, + { + "epoch": 0.971105846518318, + "grad_norm": 0.14443252980709076, + "learning_rate": 0.001, + "loss": 2.8807, + "step": 22955 + }, + { + "epoch": 0.9711481512818343, + "grad_norm": 0.12970362603664398, + "learning_rate": 0.001, + "loss": 3.2348, + "step": 22956 + }, + { + "epoch": 0.9711904560453507, + "grad_norm": 0.21394479274749756, + "learning_rate": 0.001, + "loss": 1.8296, + "step": 22957 + }, + { + "epoch": 0.9712327608088671, + "grad_norm": 0.15435615181922913, + "learning_rate": 0.001, + "loss": 2.0672, + "step": 22958 + }, + { + "epoch": 0.9712750655723834, + "grad_norm": 0.21590697765350342, + "learning_rate": 0.001, + "loss": 2.2599, + "step": 22959 + }, + { + "epoch": 0.9713173703358998, + "grad_norm": 0.14091353118419647, + "learning_rate": 0.001, + "loss": 2.0747, + "step": 22960 + }, + { + "epoch": 0.9713596750994162, + "grad_norm": 0.1489262878894806, + "learning_rate": 0.001, + "loss": 1.9124, + "step": 22961 + }, + { + "epoch": 0.9714019798629325, + "grad_norm": 0.14029927551746368, + "learning_rate": 0.001, + "loss": 2.5653, + "step": 22962 + }, + { + "epoch": 0.9714442846264489, + "grad_norm": 0.1342879831790924, + "learning_rate": 0.001, + "loss": 1.4259, + "step": 22963 + }, + { + "epoch": 0.9714865893899653, + "grad_norm": 0.13123171031475067, + "learning_rate": 0.001, + "loss": 2.2616, + "step": 22964 + }, + { + "epoch": 0.9715288941534816, + "grad_norm": 0.13907508552074432, + "learning_rate": 0.001, + "loss": 1.5811, + "step": 22965 + }, + { + "epoch": 0.9715711989169981, + "grad_norm": 0.14411531388759613, + "learning_rate": 0.001, + "loss": 2.1203, + "step": 22966 + }, + { + "epoch": 0.9716135036805145, + "grad_norm": 0.15099336206912994, + "learning_rate": 0.001, + "loss": 2.7498, + "step": 22967 + }, + { + "epoch": 0.9716558084440308, + "grad_norm": 0.14100094139575958, + "learning_rate": 0.001, + "loss": 1.9042, + "step": 22968 + }, + { + "epoch": 0.9716981132075472, + "grad_norm": 0.16299395263195038, + "learning_rate": 0.001, + "loss": 3.2449, + "step": 22969 + }, + { + "epoch": 0.9717404179710636, + "grad_norm": 0.1855599284172058, + "learning_rate": 0.001, + "loss": 1.2989, + "step": 22970 + }, + { + "epoch": 0.9717827227345799, + "grad_norm": 0.13893848657608032, + "learning_rate": 0.001, + "loss": 1.8379, + "step": 22971 + }, + { + "epoch": 0.9718250274980963, + "grad_norm": 0.22873970866203308, + "learning_rate": 0.001, + "loss": 2.7587, + "step": 22972 + }, + { + "epoch": 0.9718673322616127, + "grad_norm": 0.21288566291332245, + "learning_rate": 0.001, + "loss": 1.7145, + "step": 22973 + }, + { + "epoch": 0.971909637025129, + "grad_norm": 0.16666394472122192, + "learning_rate": 0.001, + "loss": 2.0038, + "step": 22974 + }, + { + "epoch": 0.9719519417886454, + "grad_norm": 0.1581132858991623, + "learning_rate": 0.001, + "loss": 1.5963, + "step": 22975 + }, + { + "epoch": 0.9719942465521618, + "grad_norm": 0.1787661761045456, + "learning_rate": 0.001, + "loss": 2.5239, + "step": 22976 + }, + { + "epoch": 0.9720365513156781, + "grad_norm": 0.1651124209165573, + "learning_rate": 0.001, + "loss": 2.6818, + "step": 22977 + }, + { + "epoch": 0.9720788560791945, + "grad_norm": 0.14961262047290802, + "learning_rate": 0.001, + "loss": 2.9672, + "step": 22978 + }, + { + "epoch": 0.9721211608427109, + "grad_norm": 0.15636776387691498, + "learning_rate": 0.001, + "loss": 2.4673, + "step": 22979 + }, + { + "epoch": 0.9721634656062272, + "grad_norm": 1.0559109449386597, + "learning_rate": 0.001, + "loss": 2.7908, + "step": 22980 + }, + { + "epoch": 0.9722057703697436, + "grad_norm": 0.15720365941524506, + "learning_rate": 0.001, + "loss": 2.1491, + "step": 22981 + }, + { + "epoch": 0.9722480751332601, + "grad_norm": 0.14667530357837677, + "learning_rate": 0.001, + "loss": 1.7503, + "step": 22982 + }, + { + "epoch": 0.9722903798967764, + "grad_norm": 0.15242712199687958, + "learning_rate": 0.001, + "loss": 2.048, + "step": 22983 + }, + { + "epoch": 0.9723326846602928, + "grad_norm": 7.654297828674316, + "learning_rate": 0.001, + "loss": 1.64, + "step": 22984 + }, + { + "epoch": 0.9723749894238091, + "grad_norm": 3.1997509002685547, + "learning_rate": 0.001, + "loss": 1.8606, + "step": 22985 + }, + { + "epoch": 0.9724172941873255, + "grad_norm": 0.3333907723426819, + "learning_rate": 0.001, + "loss": 1.8566, + "step": 22986 + }, + { + "epoch": 0.9724595989508419, + "grad_norm": 0.2523484528064728, + "learning_rate": 0.001, + "loss": 2.3548, + "step": 22987 + }, + { + "epoch": 0.9725019037143582, + "grad_norm": 0.1559314876794815, + "learning_rate": 0.001, + "loss": 1.3173, + "step": 22988 + }, + { + "epoch": 0.9725442084778746, + "grad_norm": 0.13869056105613708, + "learning_rate": 0.001, + "loss": 2.4087, + "step": 22989 + }, + { + "epoch": 0.972586513241391, + "grad_norm": 0.13740174472332, + "learning_rate": 0.001, + "loss": 1.849, + "step": 22990 + }, + { + "epoch": 0.9726288180049073, + "grad_norm": 0.1633240431547165, + "learning_rate": 0.001, + "loss": 2.104, + "step": 22991 + }, + { + "epoch": 0.9726711227684237, + "grad_norm": 0.14043399691581726, + "learning_rate": 0.001, + "loss": 2.5682, + "step": 22992 + }, + { + "epoch": 0.9727134275319401, + "grad_norm": 0.16599078476428986, + "learning_rate": 0.001, + "loss": 3.1016, + "step": 22993 + }, + { + "epoch": 0.9727557322954564, + "grad_norm": 0.6337260007858276, + "learning_rate": 0.001, + "loss": 2.6611, + "step": 22994 + }, + { + "epoch": 0.9727980370589728, + "grad_norm": 0.21094225347042084, + "learning_rate": 0.001, + "loss": 3.6387, + "step": 22995 + }, + { + "epoch": 0.9728403418224892, + "grad_norm": 0.3254936635494232, + "learning_rate": 0.001, + "loss": 3.3134, + "step": 22996 + }, + { + "epoch": 0.9728826465860055, + "grad_norm": 0.16493062674999237, + "learning_rate": 0.001, + "loss": 2.4989, + "step": 22997 + }, + { + "epoch": 0.972924951349522, + "grad_norm": 0.18628916144371033, + "learning_rate": 0.001, + "loss": 2.0522, + "step": 22998 + }, + { + "epoch": 0.9729672561130384, + "grad_norm": 0.20497991144657135, + "learning_rate": 0.001, + "loss": 2.1398, + "step": 22999 + }, + { + "epoch": 0.9730095608765547, + "grad_norm": 0.13439905643463135, + "learning_rate": 0.001, + "loss": 1.7492, + "step": 23000 + }, + { + "epoch": 0.9730518656400711, + "grad_norm": 0.20762348175048828, + "learning_rate": 0.001, + "loss": 2.6112, + "step": 23001 + }, + { + "epoch": 0.9730941704035875, + "grad_norm": 0.17342343926429749, + "learning_rate": 0.001, + "loss": 1.9358, + "step": 23002 + }, + { + "epoch": 0.9731364751671038, + "grad_norm": 0.15936243534088135, + "learning_rate": 0.001, + "loss": 2.0159, + "step": 23003 + }, + { + "epoch": 0.9731787799306202, + "grad_norm": 0.15786008536815643, + "learning_rate": 0.001, + "loss": 1.3607, + "step": 23004 + }, + { + "epoch": 0.9732210846941366, + "grad_norm": 0.18366417288780212, + "learning_rate": 0.001, + "loss": 1.746, + "step": 23005 + }, + { + "epoch": 0.9732633894576529, + "grad_norm": 0.2870394289493561, + "learning_rate": 0.001, + "loss": 1.9022, + "step": 23006 + }, + { + "epoch": 0.9733056942211693, + "grad_norm": 0.18606680631637573, + "learning_rate": 0.001, + "loss": 2.4152, + "step": 23007 + }, + { + "epoch": 0.9733479989846857, + "grad_norm": 0.15611374378204346, + "learning_rate": 0.001, + "loss": 2.4503, + "step": 23008 + }, + { + "epoch": 0.973390303748202, + "grad_norm": 0.1782141774892807, + "learning_rate": 0.001, + "loss": 2.1087, + "step": 23009 + }, + { + "epoch": 0.9734326085117184, + "grad_norm": 0.817843496799469, + "learning_rate": 0.001, + "loss": 2.758, + "step": 23010 + }, + { + "epoch": 0.9734749132752348, + "grad_norm": 0.13205979764461517, + "learning_rate": 0.001, + "loss": 1.4987, + "step": 23011 + }, + { + "epoch": 0.9735172180387511, + "grad_norm": 3.9600741863250732, + "learning_rate": 0.001, + "loss": 1.5233, + "step": 23012 + }, + { + "epoch": 0.9735595228022675, + "grad_norm": 0.17472542822360992, + "learning_rate": 0.001, + "loss": 1.5872, + "step": 23013 + }, + { + "epoch": 0.973601827565784, + "grad_norm": 0.17888934910297394, + "learning_rate": 0.001, + "loss": 1.989, + "step": 23014 + }, + { + "epoch": 0.9736441323293002, + "grad_norm": 0.162007674574852, + "learning_rate": 0.001, + "loss": 2.0518, + "step": 23015 + }, + { + "epoch": 0.9736864370928167, + "grad_norm": 0.17636260390281677, + "learning_rate": 0.001, + "loss": 1.9944, + "step": 23016 + }, + { + "epoch": 0.9737287418563331, + "grad_norm": 0.19297371804714203, + "learning_rate": 0.001, + "loss": 1.6305, + "step": 23017 + }, + { + "epoch": 0.9737710466198494, + "grad_norm": 0.16038502752780914, + "learning_rate": 0.001, + "loss": 2.8023, + "step": 23018 + }, + { + "epoch": 0.9738133513833658, + "grad_norm": 0.24749846756458282, + "learning_rate": 0.001, + "loss": 1.8999, + "step": 23019 + }, + { + "epoch": 0.9738556561468822, + "grad_norm": 0.16544784605503082, + "learning_rate": 0.001, + "loss": 1.9664, + "step": 23020 + }, + { + "epoch": 0.9738979609103985, + "grad_norm": 0.16566026210784912, + "learning_rate": 0.001, + "loss": 1.6773, + "step": 23021 + }, + { + "epoch": 0.9739402656739149, + "grad_norm": 0.16837327182292938, + "learning_rate": 0.001, + "loss": 2.1888, + "step": 23022 + }, + { + "epoch": 0.9739825704374313, + "grad_norm": 0.16060344874858856, + "learning_rate": 0.001, + "loss": 1.9338, + "step": 23023 + }, + { + "epoch": 0.9740248752009476, + "grad_norm": 0.21505653858184814, + "learning_rate": 0.001, + "loss": 2.1727, + "step": 23024 + }, + { + "epoch": 0.974067179964464, + "grad_norm": 0.17655415832996368, + "learning_rate": 0.001, + "loss": 1.7649, + "step": 23025 + }, + { + "epoch": 0.9741094847279804, + "grad_norm": 0.1724175214767456, + "learning_rate": 0.001, + "loss": 1.7799, + "step": 23026 + }, + { + "epoch": 0.9741517894914967, + "grad_norm": 0.21680817008018494, + "learning_rate": 0.001, + "loss": 1.9723, + "step": 23027 + }, + { + "epoch": 0.9741940942550131, + "grad_norm": 5.8497748374938965, + "learning_rate": 0.001, + "loss": 2.4025, + "step": 23028 + }, + { + "epoch": 0.9742363990185295, + "grad_norm": 0.18116092681884766, + "learning_rate": 0.001, + "loss": 2.083, + "step": 23029 + }, + { + "epoch": 0.9742787037820458, + "grad_norm": 0.1881796270608902, + "learning_rate": 0.001, + "loss": 1.8736, + "step": 23030 + }, + { + "epoch": 0.9743210085455623, + "grad_norm": 0.2181680053472519, + "learning_rate": 0.001, + "loss": 1.9318, + "step": 23031 + }, + { + "epoch": 0.9743633133090785, + "grad_norm": 0.2097787708044052, + "learning_rate": 0.001, + "loss": 1.819, + "step": 23032 + }, + { + "epoch": 0.974405618072595, + "grad_norm": 0.16672278940677643, + "learning_rate": 0.001, + "loss": 1.8597, + "step": 23033 + }, + { + "epoch": 0.9744479228361114, + "grad_norm": 0.1386634260416031, + "learning_rate": 0.001, + "loss": 1.5876, + "step": 23034 + }, + { + "epoch": 0.9744902275996277, + "grad_norm": 0.1755525916814804, + "learning_rate": 0.001, + "loss": 1.8865, + "step": 23035 + }, + { + "epoch": 0.9745325323631441, + "grad_norm": 0.17198018729686737, + "learning_rate": 0.001, + "loss": 2.1409, + "step": 23036 + }, + { + "epoch": 0.9745748371266605, + "grad_norm": 0.20147661864757538, + "learning_rate": 0.001, + "loss": 1.6044, + "step": 23037 + }, + { + "epoch": 0.9746171418901768, + "grad_norm": 0.292715460062027, + "learning_rate": 0.001, + "loss": 1.906, + "step": 23038 + }, + { + "epoch": 0.9746594466536932, + "grad_norm": 3.269289493560791, + "learning_rate": 0.001, + "loss": 2.3023, + "step": 23039 + }, + { + "epoch": 0.9747017514172096, + "grad_norm": 0.30846601724624634, + "learning_rate": 0.001, + "loss": 2.5841, + "step": 23040 + }, + { + "epoch": 0.9747440561807259, + "grad_norm": 0.2580687701702118, + "learning_rate": 0.001, + "loss": 2.0838, + "step": 23041 + }, + { + "epoch": 0.9747863609442423, + "grad_norm": 0.4336620271205902, + "learning_rate": 0.001, + "loss": 2.2656, + "step": 23042 + }, + { + "epoch": 0.9748286657077587, + "grad_norm": 0.9304342865943909, + "learning_rate": 0.001, + "loss": 2.4475, + "step": 23043 + }, + { + "epoch": 0.974870970471275, + "grad_norm": 5.938025951385498, + "learning_rate": 0.001, + "loss": 2.5208, + "step": 23044 + }, + { + "epoch": 0.9749132752347914, + "grad_norm": 24.187389373779297, + "learning_rate": 0.001, + "loss": 1.5931, + "step": 23045 + }, + { + "epoch": 0.9749555799983078, + "grad_norm": 0.4992624819278717, + "learning_rate": 0.001, + "loss": 2.4145, + "step": 23046 + }, + { + "epoch": 0.9749978847618241, + "grad_norm": 2.016716718673706, + "learning_rate": 0.001, + "loss": 2.8388, + "step": 23047 + }, + { + "epoch": 0.9750401895253406, + "grad_norm": 1.3812201023101807, + "learning_rate": 0.001, + "loss": 2.0137, + "step": 23048 + }, + { + "epoch": 0.975082494288857, + "grad_norm": 0.2711379826068878, + "learning_rate": 0.001, + "loss": 1.798, + "step": 23049 + }, + { + "epoch": 0.9751247990523733, + "grad_norm": 0.3764013350009918, + "learning_rate": 0.001, + "loss": 2.0769, + "step": 23050 + }, + { + "epoch": 0.9751671038158897, + "grad_norm": 2.7994561195373535, + "learning_rate": 0.001, + "loss": 1.6256, + "step": 23051 + }, + { + "epoch": 0.9752094085794061, + "grad_norm": 0.26443973183631897, + "learning_rate": 0.001, + "loss": 2.8268, + "step": 23052 + }, + { + "epoch": 0.9752517133429224, + "grad_norm": 0.17250576615333557, + "learning_rate": 0.001, + "loss": 2.3258, + "step": 23053 + }, + { + "epoch": 0.9752940181064388, + "grad_norm": 0.23773407936096191, + "learning_rate": 0.001, + "loss": 2.3701, + "step": 23054 + }, + { + "epoch": 0.9753363228699552, + "grad_norm": 0.17003774642944336, + "learning_rate": 0.001, + "loss": 1.8557, + "step": 23055 + }, + { + "epoch": 0.9753786276334715, + "grad_norm": 0.6521497368812561, + "learning_rate": 0.001, + "loss": 2.9078, + "step": 23056 + }, + { + "epoch": 0.9754209323969879, + "grad_norm": 0.1700069010257721, + "learning_rate": 0.001, + "loss": 2.3583, + "step": 23057 + }, + { + "epoch": 0.9754632371605043, + "grad_norm": 1.8804014921188354, + "learning_rate": 0.001, + "loss": 1.5116, + "step": 23058 + }, + { + "epoch": 0.9755055419240206, + "grad_norm": 0.16962087154388428, + "learning_rate": 0.001, + "loss": 2.3743, + "step": 23059 + }, + { + "epoch": 0.975547846687537, + "grad_norm": 0.1760241985321045, + "learning_rate": 0.001, + "loss": 1.7771, + "step": 23060 + }, + { + "epoch": 0.9755901514510534, + "grad_norm": 0.3765549063682556, + "learning_rate": 0.001, + "loss": 2.3506, + "step": 23061 + }, + { + "epoch": 0.9756324562145697, + "grad_norm": 0.26375001668930054, + "learning_rate": 0.001, + "loss": 2.108, + "step": 23062 + }, + { + "epoch": 0.9756747609780861, + "grad_norm": 0.17104345560073853, + "learning_rate": 0.001, + "loss": 1.2225, + "step": 23063 + }, + { + "epoch": 0.9757170657416026, + "grad_norm": 0.15106819570064545, + "learning_rate": 0.001, + "loss": 2.0871, + "step": 23064 + }, + { + "epoch": 0.9757593705051189, + "grad_norm": 0.16280482709407806, + "learning_rate": 0.001, + "loss": 1.6354, + "step": 23065 + }, + { + "epoch": 0.9758016752686353, + "grad_norm": 0.19860334694385529, + "learning_rate": 0.001, + "loss": 1.9093, + "step": 23066 + }, + { + "epoch": 0.9758439800321517, + "grad_norm": 0.14197562634944916, + "learning_rate": 0.001, + "loss": 1.5894, + "step": 23067 + }, + { + "epoch": 0.975886284795668, + "grad_norm": 0.16649185121059418, + "learning_rate": 0.001, + "loss": 2.0655, + "step": 23068 + }, + { + "epoch": 0.9759285895591844, + "grad_norm": 0.15236613154411316, + "learning_rate": 0.001, + "loss": 1.4908, + "step": 23069 + }, + { + "epoch": 0.9759708943227008, + "grad_norm": 0.7669524550437927, + "learning_rate": 0.001, + "loss": 1.9742, + "step": 23070 + }, + { + "epoch": 0.9760131990862171, + "grad_norm": 0.2197335958480835, + "learning_rate": 0.001, + "loss": 1.8815, + "step": 23071 + }, + { + "epoch": 0.9760555038497335, + "grad_norm": 0.18375352025032043, + "learning_rate": 0.001, + "loss": 2.4837, + "step": 23072 + }, + { + "epoch": 0.9760978086132499, + "grad_norm": 0.31479236483573914, + "learning_rate": 0.001, + "loss": 1.4933, + "step": 23073 + }, + { + "epoch": 0.9761401133767662, + "grad_norm": 6.906200408935547, + "learning_rate": 0.001, + "loss": 2.0665, + "step": 23074 + }, + { + "epoch": 0.9761824181402826, + "grad_norm": 0.17486466467380524, + "learning_rate": 0.001, + "loss": 3.0518, + "step": 23075 + }, + { + "epoch": 0.9762247229037989, + "grad_norm": 0.16075573861598969, + "learning_rate": 0.001, + "loss": 2.0813, + "step": 23076 + }, + { + "epoch": 0.9762670276673153, + "grad_norm": 0.16066281497478485, + "learning_rate": 0.001, + "loss": 3.0841, + "step": 23077 + }, + { + "epoch": 0.9763093324308317, + "grad_norm": 0.20229189097881317, + "learning_rate": 0.001, + "loss": 2.0031, + "step": 23078 + }, + { + "epoch": 0.976351637194348, + "grad_norm": 0.1783391684293747, + "learning_rate": 0.001, + "loss": 2.0519, + "step": 23079 + }, + { + "epoch": 0.9763939419578644, + "grad_norm": 0.16642500460147858, + "learning_rate": 0.001, + "loss": 1.9304, + "step": 23080 + }, + { + "epoch": 0.9764362467213809, + "grad_norm": 1.8268184661865234, + "learning_rate": 0.001, + "loss": 1.4835, + "step": 23081 + }, + { + "epoch": 0.9764785514848972, + "grad_norm": 0.14119704067707062, + "learning_rate": 0.001, + "loss": 2.4155, + "step": 23082 + }, + { + "epoch": 0.9765208562484136, + "grad_norm": 0.3065393567085266, + "learning_rate": 0.001, + "loss": 3.0736, + "step": 23083 + }, + { + "epoch": 0.97656316101193, + "grad_norm": 1.7388895750045776, + "learning_rate": 0.001, + "loss": 1.6748, + "step": 23084 + }, + { + "epoch": 0.9766054657754463, + "grad_norm": 0.18974030017852783, + "learning_rate": 0.001, + "loss": 2.2323, + "step": 23085 + }, + { + "epoch": 0.9766477705389627, + "grad_norm": 0.20437590777873993, + "learning_rate": 0.001, + "loss": 2.0711, + "step": 23086 + }, + { + "epoch": 0.9766900753024791, + "grad_norm": 0.144393190741539, + "learning_rate": 0.001, + "loss": 1.9763, + "step": 23087 + }, + { + "epoch": 0.9767323800659954, + "grad_norm": 0.1816982924938202, + "learning_rate": 0.001, + "loss": 2.3921, + "step": 23088 + }, + { + "epoch": 0.9767746848295118, + "grad_norm": 0.24456720054149628, + "learning_rate": 0.001, + "loss": 2.6854, + "step": 23089 + }, + { + "epoch": 0.9768169895930282, + "grad_norm": 0.17447160184383392, + "learning_rate": 0.001, + "loss": 1.757, + "step": 23090 + }, + { + "epoch": 0.9768592943565445, + "grad_norm": 25.030933380126953, + "learning_rate": 0.001, + "loss": 2.1183, + "step": 23091 + }, + { + "epoch": 0.9769015991200609, + "grad_norm": 0.20335282385349274, + "learning_rate": 0.001, + "loss": 2.738, + "step": 23092 + }, + { + "epoch": 0.9769439038835773, + "grad_norm": 0.4485716223716736, + "learning_rate": 0.001, + "loss": 1.7982, + "step": 23093 + }, + { + "epoch": 0.9769862086470936, + "grad_norm": 0.2497912049293518, + "learning_rate": 0.001, + "loss": 2.0586, + "step": 23094 + }, + { + "epoch": 0.97702851341061, + "grad_norm": 0.19284145534038544, + "learning_rate": 0.001, + "loss": 1.5553, + "step": 23095 + }, + { + "epoch": 0.9770708181741264, + "grad_norm": 0.14969174563884735, + "learning_rate": 0.001, + "loss": 1.8045, + "step": 23096 + }, + { + "epoch": 0.9771131229376427, + "grad_norm": 0.18470831215381622, + "learning_rate": 0.001, + "loss": 2.3824, + "step": 23097 + }, + { + "epoch": 0.9771554277011592, + "grad_norm": 0.1780456155538559, + "learning_rate": 0.001, + "loss": 2.5985, + "step": 23098 + }, + { + "epoch": 0.9771977324646756, + "grad_norm": 0.18881769478321075, + "learning_rate": 0.001, + "loss": 2.6814, + "step": 23099 + }, + { + "epoch": 0.9772400372281919, + "grad_norm": 0.160416379570961, + "learning_rate": 0.001, + "loss": 2.7233, + "step": 23100 + }, + { + "epoch": 0.9772823419917083, + "grad_norm": 0.1704859733581543, + "learning_rate": 0.001, + "loss": 2.3356, + "step": 23101 + }, + { + "epoch": 0.9773246467552247, + "grad_norm": 0.15877144038677216, + "learning_rate": 0.001, + "loss": 1.8551, + "step": 23102 + }, + { + "epoch": 0.977366951518741, + "grad_norm": 0.15213461220264435, + "learning_rate": 0.001, + "loss": 2.2297, + "step": 23103 + }, + { + "epoch": 0.9774092562822574, + "grad_norm": 0.23088282346725464, + "learning_rate": 0.001, + "loss": 2.2761, + "step": 23104 + }, + { + "epoch": 0.9774515610457738, + "grad_norm": 3.229534864425659, + "learning_rate": 0.001, + "loss": 3.1849, + "step": 23105 + }, + { + "epoch": 0.9774938658092901, + "grad_norm": 0.14823846518993378, + "learning_rate": 0.001, + "loss": 2.04, + "step": 23106 + }, + { + "epoch": 0.9775361705728065, + "grad_norm": 0.13392066955566406, + "learning_rate": 0.001, + "loss": 2.1271, + "step": 23107 + }, + { + "epoch": 0.9775784753363229, + "grad_norm": 0.251636266708374, + "learning_rate": 0.001, + "loss": 2.1417, + "step": 23108 + }, + { + "epoch": 0.9776207800998392, + "grad_norm": 0.12352551519870758, + "learning_rate": 0.001, + "loss": 1.8046, + "step": 23109 + }, + { + "epoch": 0.9776630848633556, + "grad_norm": 0.44623270630836487, + "learning_rate": 0.001, + "loss": 3.5603, + "step": 23110 + }, + { + "epoch": 0.977705389626872, + "grad_norm": 0.46664145588874817, + "learning_rate": 0.001, + "loss": 1.9793, + "step": 23111 + }, + { + "epoch": 0.9777476943903883, + "grad_norm": 0.19913674890995026, + "learning_rate": 0.001, + "loss": 2.2404, + "step": 23112 + }, + { + "epoch": 0.9777899991539047, + "grad_norm": 0.14924490451812744, + "learning_rate": 0.001, + "loss": 1.7245, + "step": 23113 + }, + { + "epoch": 0.9778323039174212, + "grad_norm": 0.17213156819343567, + "learning_rate": 0.001, + "loss": 1.6626, + "step": 23114 + }, + { + "epoch": 0.9778746086809375, + "grad_norm": 0.16330088675022125, + "learning_rate": 0.001, + "loss": 1.6231, + "step": 23115 + }, + { + "epoch": 0.9779169134444539, + "grad_norm": 0.5413417816162109, + "learning_rate": 0.001, + "loss": 2.2929, + "step": 23116 + }, + { + "epoch": 0.9779592182079703, + "grad_norm": 0.8194732069969177, + "learning_rate": 0.001, + "loss": 2.0811, + "step": 23117 + }, + { + "epoch": 0.9780015229714866, + "grad_norm": 0.3619847297668457, + "learning_rate": 0.001, + "loss": 2.1488, + "step": 23118 + }, + { + "epoch": 0.978043827735003, + "grad_norm": 0.2249876707792282, + "learning_rate": 0.001, + "loss": 2.3759, + "step": 23119 + }, + { + "epoch": 0.9780861324985193, + "grad_norm": 2.6536123752593994, + "learning_rate": 0.001, + "loss": 2.256, + "step": 23120 + }, + { + "epoch": 0.9781284372620357, + "grad_norm": 0.1619919091463089, + "learning_rate": 0.001, + "loss": 2.3861, + "step": 23121 + }, + { + "epoch": 0.9781707420255521, + "grad_norm": 0.16015075147151947, + "learning_rate": 0.001, + "loss": 1.9889, + "step": 23122 + }, + { + "epoch": 0.9782130467890684, + "grad_norm": 0.14887668192386627, + "learning_rate": 0.001, + "loss": 2.1329, + "step": 23123 + }, + { + "epoch": 0.9782553515525848, + "grad_norm": 0.14117036759853363, + "learning_rate": 0.001, + "loss": 2.0059, + "step": 23124 + }, + { + "epoch": 0.9782976563161012, + "grad_norm": 0.8189803957939148, + "learning_rate": 0.001, + "loss": 2.0877, + "step": 23125 + }, + { + "epoch": 0.9783399610796175, + "grad_norm": 0.93526691198349, + "learning_rate": 0.001, + "loss": 1.3733, + "step": 23126 + }, + { + "epoch": 0.9783822658431339, + "grad_norm": 0.15379607677459717, + "learning_rate": 0.001, + "loss": 2.3471, + "step": 23127 + }, + { + "epoch": 0.9784245706066503, + "grad_norm": 0.1474200189113617, + "learning_rate": 0.001, + "loss": 2.0631, + "step": 23128 + }, + { + "epoch": 0.9784668753701666, + "grad_norm": 0.14249064028263092, + "learning_rate": 0.001, + "loss": 1.8189, + "step": 23129 + }, + { + "epoch": 0.978509180133683, + "grad_norm": 0.17934098839759827, + "learning_rate": 0.001, + "loss": 1.7359, + "step": 23130 + }, + { + "epoch": 0.9785514848971995, + "grad_norm": 0.16001644730567932, + "learning_rate": 0.001, + "loss": 1.8006, + "step": 23131 + }, + { + "epoch": 0.9785937896607158, + "grad_norm": 0.1634327471256256, + "learning_rate": 0.001, + "loss": 1.937, + "step": 23132 + }, + { + "epoch": 0.9786360944242322, + "grad_norm": 0.1568276584148407, + "learning_rate": 0.001, + "loss": 1.8826, + "step": 23133 + }, + { + "epoch": 0.9786783991877486, + "grad_norm": 0.17905938625335693, + "learning_rate": 0.001, + "loss": 1.6522, + "step": 23134 + }, + { + "epoch": 0.9787207039512649, + "grad_norm": 0.2984275817871094, + "learning_rate": 0.001, + "loss": 1.7585, + "step": 23135 + }, + { + "epoch": 0.9787630087147813, + "grad_norm": 0.16698578000068665, + "learning_rate": 0.001, + "loss": 2.5787, + "step": 23136 + }, + { + "epoch": 0.9788053134782977, + "grad_norm": 0.13776206970214844, + "learning_rate": 0.001, + "loss": 1.9259, + "step": 23137 + }, + { + "epoch": 0.978847618241814, + "grad_norm": 0.1826232522726059, + "learning_rate": 0.001, + "loss": 2.3701, + "step": 23138 + }, + { + "epoch": 0.9788899230053304, + "grad_norm": 0.14153793454170227, + "learning_rate": 0.001, + "loss": 1.9833, + "step": 23139 + }, + { + "epoch": 0.9789322277688468, + "grad_norm": 0.19493409991264343, + "learning_rate": 0.001, + "loss": 3.4682, + "step": 23140 + }, + { + "epoch": 0.9789745325323631, + "grad_norm": 0.13115043938159943, + "learning_rate": 0.001, + "loss": 1.2842, + "step": 23141 + }, + { + "epoch": 0.9790168372958795, + "grad_norm": 0.13824576139450073, + "learning_rate": 0.001, + "loss": 2.2034, + "step": 23142 + }, + { + "epoch": 0.9790591420593959, + "grad_norm": 0.17789553105831146, + "learning_rate": 0.001, + "loss": 3.4185, + "step": 23143 + }, + { + "epoch": 0.9791014468229122, + "grad_norm": 0.1795099377632141, + "learning_rate": 0.001, + "loss": 2.0876, + "step": 23144 + }, + { + "epoch": 0.9791437515864286, + "grad_norm": 0.15983949601650238, + "learning_rate": 0.001, + "loss": 1.6265, + "step": 23145 + }, + { + "epoch": 0.979186056349945, + "grad_norm": 0.1699821501970291, + "learning_rate": 0.001, + "loss": 2.6895, + "step": 23146 + }, + { + "epoch": 0.9792283611134613, + "grad_norm": 2.1327931880950928, + "learning_rate": 0.001, + "loss": 1.7483, + "step": 23147 + }, + { + "epoch": 0.9792706658769778, + "grad_norm": 0.14410525560379028, + "learning_rate": 0.001, + "loss": 2.2787, + "step": 23148 + }, + { + "epoch": 0.9793129706404942, + "grad_norm": 0.14737246930599213, + "learning_rate": 0.001, + "loss": 2.1674, + "step": 23149 + }, + { + "epoch": 0.9793552754040105, + "grad_norm": 0.17130975425243378, + "learning_rate": 0.001, + "loss": 2.8032, + "step": 23150 + }, + { + "epoch": 0.9793975801675269, + "grad_norm": 0.1790180206298828, + "learning_rate": 0.001, + "loss": 2.6401, + "step": 23151 + }, + { + "epoch": 0.9794398849310433, + "grad_norm": 0.1794183999300003, + "learning_rate": 0.001, + "loss": 1.896, + "step": 23152 + }, + { + "epoch": 0.9794821896945596, + "grad_norm": 0.12838570773601532, + "learning_rate": 0.001, + "loss": 1.8266, + "step": 23153 + }, + { + "epoch": 0.979524494458076, + "grad_norm": 0.18338772654533386, + "learning_rate": 0.001, + "loss": 1.7244, + "step": 23154 + }, + { + "epoch": 0.9795667992215924, + "grad_norm": 0.17173555493354797, + "learning_rate": 0.001, + "loss": 1.6205, + "step": 23155 + }, + { + "epoch": 0.9796091039851087, + "grad_norm": 0.18049339950084686, + "learning_rate": 0.001, + "loss": 2.2256, + "step": 23156 + }, + { + "epoch": 0.9796514087486251, + "grad_norm": 0.13144956529140472, + "learning_rate": 0.001, + "loss": 2.6915, + "step": 23157 + }, + { + "epoch": 0.9796937135121415, + "grad_norm": 0.1813870072364807, + "learning_rate": 0.001, + "loss": 1.9047, + "step": 23158 + }, + { + "epoch": 0.9797360182756578, + "grad_norm": 1.0048320293426514, + "learning_rate": 0.001, + "loss": 2.8726, + "step": 23159 + }, + { + "epoch": 0.9797783230391742, + "grad_norm": 0.1345098316669464, + "learning_rate": 0.001, + "loss": 1.4787, + "step": 23160 + }, + { + "epoch": 0.9798206278026906, + "grad_norm": 0.1486302614212036, + "learning_rate": 0.001, + "loss": 2.3496, + "step": 23161 + }, + { + "epoch": 0.9798629325662069, + "grad_norm": 0.198981374502182, + "learning_rate": 0.001, + "loss": 1.918, + "step": 23162 + }, + { + "epoch": 0.9799052373297233, + "grad_norm": 0.37297117710113525, + "learning_rate": 0.001, + "loss": 3.412, + "step": 23163 + }, + { + "epoch": 0.9799475420932398, + "grad_norm": 0.18499860167503357, + "learning_rate": 0.001, + "loss": 2.2248, + "step": 23164 + }, + { + "epoch": 0.979989846856756, + "grad_norm": 0.16248784959316254, + "learning_rate": 0.001, + "loss": 2.2262, + "step": 23165 + }, + { + "epoch": 0.9800321516202725, + "grad_norm": 0.13412149250507355, + "learning_rate": 0.001, + "loss": 1.9383, + "step": 23166 + }, + { + "epoch": 0.9800744563837888, + "grad_norm": 0.14490722119808197, + "learning_rate": 0.001, + "loss": 1.7336, + "step": 23167 + }, + { + "epoch": 0.9801167611473052, + "grad_norm": 0.16167306900024414, + "learning_rate": 0.001, + "loss": 3.7023, + "step": 23168 + }, + { + "epoch": 0.9801590659108216, + "grad_norm": 0.2081134021282196, + "learning_rate": 0.001, + "loss": 3.0097, + "step": 23169 + }, + { + "epoch": 0.9802013706743379, + "grad_norm": 0.15773500502109528, + "learning_rate": 0.001, + "loss": 1.7977, + "step": 23170 + }, + { + "epoch": 0.9802436754378543, + "grad_norm": 0.2358037680387497, + "learning_rate": 0.001, + "loss": 2.6206, + "step": 23171 + }, + { + "epoch": 0.9802859802013707, + "grad_norm": 0.17311686277389526, + "learning_rate": 0.001, + "loss": 2.619, + "step": 23172 + }, + { + "epoch": 0.980328284964887, + "grad_norm": 15.802042961120605, + "learning_rate": 0.001, + "loss": 1.6373, + "step": 23173 + }, + { + "epoch": 0.9803705897284034, + "grad_norm": 0.14769822359085083, + "learning_rate": 0.001, + "loss": 1.7526, + "step": 23174 + }, + { + "epoch": 0.9804128944919198, + "grad_norm": 2.210271120071411, + "learning_rate": 0.001, + "loss": 1.7164, + "step": 23175 + }, + { + "epoch": 0.9804551992554361, + "grad_norm": 0.15342247486114502, + "learning_rate": 0.001, + "loss": 3.1296, + "step": 23176 + }, + { + "epoch": 0.9804975040189525, + "grad_norm": 0.1497957855463028, + "learning_rate": 0.001, + "loss": 2.6274, + "step": 23177 + }, + { + "epoch": 0.9805398087824689, + "grad_norm": 0.1614879071712494, + "learning_rate": 0.001, + "loss": 1.4673, + "step": 23178 + }, + { + "epoch": 0.9805821135459852, + "grad_norm": 0.1438165307044983, + "learning_rate": 0.001, + "loss": 3.2337, + "step": 23179 + }, + { + "epoch": 0.9806244183095016, + "grad_norm": 0.1747131198644638, + "learning_rate": 0.001, + "loss": 2.1779, + "step": 23180 + }, + { + "epoch": 0.980666723073018, + "grad_norm": 0.15409283339977264, + "learning_rate": 0.001, + "loss": 2.4793, + "step": 23181 + }, + { + "epoch": 0.9807090278365344, + "grad_norm": 0.1740875393152237, + "learning_rate": 0.001, + "loss": 2.0542, + "step": 23182 + }, + { + "epoch": 0.9807513326000508, + "grad_norm": 0.13922379910945892, + "learning_rate": 0.001, + "loss": 2.1908, + "step": 23183 + }, + { + "epoch": 0.9807936373635672, + "grad_norm": 0.20002827048301697, + "learning_rate": 0.001, + "loss": 1.9136, + "step": 23184 + }, + { + "epoch": 0.9808359421270835, + "grad_norm": 0.13429036736488342, + "learning_rate": 0.001, + "loss": 1.3618, + "step": 23185 + }, + { + "epoch": 0.9808782468905999, + "grad_norm": 0.16126492619514465, + "learning_rate": 0.001, + "loss": 2.7224, + "step": 23186 + }, + { + "epoch": 0.9809205516541163, + "grad_norm": 0.14296741783618927, + "learning_rate": 0.001, + "loss": 3.2186, + "step": 23187 + }, + { + "epoch": 0.9809628564176326, + "grad_norm": 5.455838203430176, + "learning_rate": 0.001, + "loss": 2.5349, + "step": 23188 + }, + { + "epoch": 0.981005161181149, + "grad_norm": 0.13420797884464264, + "learning_rate": 0.001, + "loss": 1.4715, + "step": 23189 + }, + { + "epoch": 0.9810474659446654, + "grad_norm": 0.2915991544723511, + "learning_rate": 0.001, + "loss": 1.9732, + "step": 23190 + }, + { + "epoch": 0.9810897707081817, + "grad_norm": 0.12874186038970947, + "learning_rate": 0.001, + "loss": 1.8352, + "step": 23191 + }, + { + "epoch": 0.9811320754716981, + "grad_norm": 0.14853565394878387, + "learning_rate": 0.001, + "loss": 2.8209, + "step": 23192 + }, + { + "epoch": 0.9811743802352145, + "grad_norm": 0.14589852094650269, + "learning_rate": 0.001, + "loss": 2.0377, + "step": 23193 + }, + { + "epoch": 0.9812166849987308, + "grad_norm": 0.15394212305545807, + "learning_rate": 0.001, + "loss": 1.9553, + "step": 23194 + }, + { + "epoch": 0.9812589897622472, + "grad_norm": 0.1757000833749771, + "learning_rate": 0.001, + "loss": 2.2428, + "step": 23195 + }, + { + "epoch": 0.9813012945257636, + "grad_norm": 0.15882089734077454, + "learning_rate": 0.001, + "loss": 2.0133, + "step": 23196 + }, + { + "epoch": 0.9813435992892799, + "grad_norm": 0.14844751358032227, + "learning_rate": 0.001, + "loss": 1.7509, + "step": 23197 + }, + { + "epoch": 0.9813859040527964, + "grad_norm": 0.16744114458560944, + "learning_rate": 0.001, + "loss": 1.7146, + "step": 23198 + }, + { + "epoch": 0.9814282088163128, + "grad_norm": 0.1688256859779358, + "learning_rate": 0.001, + "loss": 3.276, + "step": 23199 + }, + { + "epoch": 0.9814705135798291, + "grad_norm": 4.072800159454346, + "learning_rate": 0.001, + "loss": 2.5557, + "step": 23200 + }, + { + "epoch": 0.9815128183433455, + "grad_norm": 0.1719663292169571, + "learning_rate": 0.001, + "loss": 1.8419, + "step": 23201 + }, + { + "epoch": 0.9815551231068619, + "grad_norm": 0.15536484122276306, + "learning_rate": 0.001, + "loss": 1.8884, + "step": 23202 + }, + { + "epoch": 0.9815974278703782, + "grad_norm": 1.0643222332000732, + "learning_rate": 0.001, + "loss": 1.9732, + "step": 23203 + }, + { + "epoch": 0.9816397326338946, + "grad_norm": 0.19347727298736572, + "learning_rate": 0.001, + "loss": 2.0092, + "step": 23204 + }, + { + "epoch": 0.981682037397411, + "grad_norm": 0.14213740825653076, + "learning_rate": 0.001, + "loss": 2.3664, + "step": 23205 + }, + { + "epoch": 0.9817243421609273, + "grad_norm": 0.5251598954200745, + "learning_rate": 0.001, + "loss": 2.6287, + "step": 23206 + }, + { + "epoch": 0.9817666469244437, + "grad_norm": 0.19425900280475616, + "learning_rate": 0.001, + "loss": 2.4588, + "step": 23207 + }, + { + "epoch": 0.9818089516879601, + "grad_norm": 1.0385212898254395, + "learning_rate": 0.001, + "loss": 1.961, + "step": 23208 + }, + { + "epoch": 0.9818512564514764, + "grad_norm": 3.3503012657165527, + "learning_rate": 0.001, + "loss": 2.6355, + "step": 23209 + }, + { + "epoch": 0.9818935612149928, + "grad_norm": 0.344450980424881, + "learning_rate": 0.001, + "loss": 3.4627, + "step": 23210 + }, + { + "epoch": 0.9819358659785091, + "grad_norm": 0.3758525252342224, + "learning_rate": 0.001, + "loss": 2.3434, + "step": 23211 + }, + { + "epoch": 0.9819781707420255, + "grad_norm": 0.23000246286392212, + "learning_rate": 0.001, + "loss": 2.1027, + "step": 23212 + }, + { + "epoch": 0.982020475505542, + "grad_norm": 0.21098732948303223, + "learning_rate": 0.001, + "loss": 1.8135, + "step": 23213 + }, + { + "epoch": 0.9820627802690582, + "grad_norm": 0.3435835540294647, + "learning_rate": 0.001, + "loss": 2.3019, + "step": 23214 + }, + { + "epoch": 0.9821050850325747, + "grad_norm": 0.1896451711654663, + "learning_rate": 0.001, + "loss": 3.6827, + "step": 23215 + }, + { + "epoch": 0.9821473897960911, + "grad_norm": 0.9407826066017151, + "learning_rate": 0.001, + "loss": 2.207, + "step": 23216 + }, + { + "epoch": 0.9821896945596074, + "grad_norm": 0.17004285752773285, + "learning_rate": 0.001, + "loss": 1.6758, + "step": 23217 + }, + { + "epoch": 0.9822319993231238, + "grad_norm": 0.21439626812934875, + "learning_rate": 0.001, + "loss": 2.7161, + "step": 23218 + }, + { + "epoch": 0.9822743040866402, + "grad_norm": 0.6642925143241882, + "learning_rate": 0.001, + "loss": 2.7868, + "step": 23219 + }, + { + "epoch": 0.9823166088501565, + "grad_norm": 0.20337320864200592, + "learning_rate": 0.001, + "loss": 1.9839, + "step": 23220 + }, + { + "epoch": 0.9823589136136729, + "grad_norm": 0.15737198293209076, + "learning_rate": 0.001, + "loss": 2.3604, + "step": 23221 + }, + { + "epoch": 0.9824012183771893, + "grad_norm": 0.6608482599258423, + "learning_rate": 0.001, + "loss": 1.6606, + "step": 23222 + }, + { + "epoch": 0.9824435231407056, + "grad_norm": 0.11500653624534607, + "learning_rate": 0.001, + "loss": 2.2689, + "step": 23223 + }, + { + "epoch": 0.982485827904222, + "grad_norm": 0.2476760298013687, + "learning_rate": 0.001, + "loss": 2.4611, + "step": 23224 + }, + { + "epoch": 0.9825281326677384, + "grad_norm": 0.20634648203849792, + "learning_rate": 0.001, + "loss": 3.2298, + "step": 23225 + }, + { + "epoch": 0.9825704374312547, + "grad_norm": 0.174639493227005, + "learning_rate": 0.001, + "loss": 2.3492, + "step": 23226 + }, + { + "epoch": 0.9826127421947711, + "grad_norm": 0.1685517132282257, + "learning_rate": 0.001, + "loss": 3.3922, + "step": 23227 + }, + { + "epoch": 0.9826550469582875, + "grad_norm": 0.17105820775032043, + "learning_rate": 0.001, + "loss": 1.8573, + "step": 23228 + }, + { + "epoch": 0.9826973517218038, + "grad_norm": 0.1790645271539688, + "learning_rate": 0.001, + "loss": 3.3843, + "step": 23229 + }, + { + "epoch": 0.9827396564853202, + "grad_norm": 0.14651429653167725, + "learning_rate": 0.001, + "loss": 2.4584, + "step": 23230 + }, + { + "epoch": 0.9827819612488367, + "grad_norm": 0.15420038998126984, + "learning_rate": 0.001, + "loss": 1.8876, + "step": 23231 + }, + { + "epoch": 0.982824266012353, + "grad_norm": 0.13720165193080902, + "learning_rate": 0.001, + "loss": 1.6773, + "step": 23232 + }, + { + "epoch": 0.9828665707758694, + "grad_norm": 0.3285228908061981, + "learning_rate": 0.001, + "loss": 2.1622, + "step": 23233 + }, + { + "epoch": 0.9829088755393858, + "grad_norm": 0.22455190122127533, + "learning_rate": 0.001, + "loss": 3.7145, + "step": 23234 + }, + { + "epoch": 0.9829511803029021, + "grad_norm": 0.14361530542373657, + "learning_rate": 0.001, + "loss": 2.551, + "step": 23235 + }, + { + "epoch": 0.9829934850664185, + "grad_norm": 0.20710772275924683, + "learning_rate": 0.001, + "loss": 1.8731, + "step": 23236 + }, + { + "epoch": 0.9830357898299349, + "grad_norm": 0.15071751177310944, + "learning_rate": 0.001, + "loss": 1.4985, + "step": 23237 + }, + { + "epoch": 0.9830780945934512, + "grad_norm": 0.15085220336914062, + "learning_rate": 0.001, + "loss": 2.0827, + "step": 23238 + }, + { + "epoch": 0.9831203993569676, + "grad_norm": 0.1446499526500702, + "learning_rate": 0.001, + "loss": 2.4606, + "step": 23239 + }, + { + "epoch": 0.983162704120484, + "grad_norm": 0.1713118702173233, + "learning_rate": 0.001, + "loss": 1.947, + "step": 23240 + }, + { + "epoch": 0.9832050088840003, + "grad_norm": 0.23264853656291962, + "learning_rate": 0.001, + "loss": 1.7186, + "step": 23241 + }, + { + "epoch": 0.9832473136475167, + "grad_norm": 0.15223060548305511, + "learning_rate": 0.001, + "loss": 2.3135, + "step": 23242 + }, + { + "epoch": 0.9832896184110331, + "grad_norm": 2.1504344940185547, + "learning_rate": 0.001, + "loss": 2.345, + "step": 23243 + }, + { + "epoch": 0.9833319231745494, + "grad_norm": 0.2231014370918274, + "learning_rate": 0.001, + "loss": 2.4845, + "step": 23244 + }, + { + "epoch": 0.9833742279380658, + "grad_norm": 30.52190399169922, + "learning_rate": 0.001, + "loss": 1.5587, + "step": 23245 + }, + { + "epoch": 0.9834165327015822, + "grad_norm": 0.1924566775560379, + "learning_rate": 0.001, + "loss": 1.8833, + "step": 23246 + }, + { + "epoch": 0.9834588374650985, + "grad_norm": 0.17296500504016876, + "learning_rate": 0.001, + "loss": 1.9576, + "step": 23247 + }, + { + "epoch": 0.983501142228615, + "grad_norm": 0.17767272889614105, + "learning_rate": 0.001, + "loss": 2.5095, + "step": 23248 + }, + { + "epoch": 0.9835434469921314, + "grad_norm": 16.753442764282227, + "learning_rate": 0.001, + "loss": 2.5697, + "step": 23249 + }, + { + "epoch": 0.9835857517556477, + "grad_norm": 0.19115328788757324, + "learning_rate": 0.001, + "loss": 1.6478, + "step": 23250 + }, + { + "epoch": 0.9836280565191641, + "grad_norm": 22.24527359008789, + "learning_rate": 0.001, + "loss": 2.6725, + "step": 23251 + }, + { + "epoch": 0.9836703612826805, + "grad_norm": 2.2926394939422607, + "learning_rate": 0.001, + "loss": 1.8396, + "step": 23252 + }, + { + "epoch": 0.9837126660461968, + "grad_norm": 0.22507277131080627, + "learning_rate": 0.001, + "loss": 2.7259, + "step": 23253 + }, + { + "epoch": 0.9837549708097132, + "grad_norm": 0.20244504511356354, + "learning_rate": 0.001, + "loss": 1.8665, + "step": 23254 + }, + { + "epoch": 0.9837972755732295, + "grad_norm": 0.17822127044200897, + "learning_rate": 0.001, + "loss": 2.7145, + "step": 23255 + }, + { + "epoch": 0.9838395803367459, + "grad_norm": 0.21497797966003418, + "learning_rate": 0.001, + "loss": 1.9941, + "step": 23256 + }, + { + "epoch": 0.9838818851002623, + "grad_norm": 0.2271234095096588, + "learning_rate": 0.001, + "loss": 2.2163, + "step": 23257 + }, + { + "epoch": 0.9839241898637786, + "grad_norm": 1.1759521961212158, + "learning_rate": 0.001, + "loss": 2.7644, + "step": 23258 + }, + { + "epoch": 0.983966494627295, + "grad_norm": 0.19246691465377808, + "learning_rate": 0.001, + "loss": 1.9231, + "step": 23259 + }, + { + "epoch": 0.9840087993908114, + "grad_norm": 4.506860733032227, + "learning_rate": 0.001, + "loss": 2.4722, + "step": 23260 + }, + { + "epoch": 0.9840511041543277, + "grad_norm": 0.20085294544696808, + "learning_rate": 0.001, + "loss": 3.0157, + "step": 23261 + }, + { + "epoch": 0.9840934089178441, + "grad_norm": 1.484424352645874, + "learning_rate": 0.001, + "loss": 2.2361, + "step": 23262 + }, + { + "epoch": 0.9841357136813605, + "grad_norm": 0.21662937104701996, + "learning_rate": 0.001, + "loss": 2.2051, + "step": 23263 + }, + { + "epoch": 0.9841780184448768, + "grad_norm": 0.20913396775722504, + "learning_rate": 0.001, + "loss": 1.697, + "step": 23264 + }, + { + "epoch": 0.9842203232083933, + "grad_norm": 0.19714033603668213, + "learning_rate": 0.001, + "loss": 2.0469, + "step": 23265 + }, + { + "epoch": 0.9842626279719097, + "grad_norm": 0.18258053064346313, + "learning_rate": 0.001, + "loss": 2.446, + "step": 23266 + }, + { + "epoch": 0.984304932735426, + "grad_norm": 0.43898874521255493, + "learning_rate": 0.001, + "loss": 1.9534, + "step": 23267 + }, + { + "epoch": 0.9843472374989424, + "grad_norm": 0.16025614738464355, + "learning_rate": 0.001, + "loss": 2.2371, + "step": 23268 + }, + { + "epoch": 0.9843895422624588, + "grad_norm": 0.4538215100765228, + "learning_rate": 0.001, + "loss": 1.9265, + "step": 23269 + }, + { + "epoch": 0.9844318470259751, + "grad_norm": 0.18556450307369232, + "learning_rate": 0.001, + "loss": 1.5331, + "step": 23270 + }, + { + "epoch": 0.9844741517894915, + "grad_norm": 0.15075351297855377, + "learning_rate": 0.001, + "loss": 3.1292, + "step": 23271 + }, + { + "epoch": 0.9845164565530079, + "grad_norm": 19.696006774902344, + "learning_rate": 0.001, + "loss": 2.7711, + "step": 23272 + }, + { + "epoch": 0.9845587613165242, + "grad_norm": 0.22151979804039001, + "learning_rate": 0.001, + "loss": 1.6431, + "step": 23273 + }, + { + "epoch": 0.9846010660800406, + "grad_norm": 0.27550455927848816, + "learning_rate": 0.001, + "loss": 2.4787, + "step": 23274 + }, + { + "epoch": 0.984643370843557, + "grad_norm": 0.22198323905467987, + "learning_rate": 0.001, + "loss": 1.6055, + "step": 23275 + }, + { + "epoch": 0.9846856756070733, + "grad_norm": 0.20321761071681976, + "learning_rate": 0.001, + "loss": 2.4743, + "step": 23276 + }, + { + "epoch": 0.9847279803705897, + "grad_norm": 0.19668048620224, + "learning_rate": 0.001, + "loss": 1.825, + "step": 23277 + }, + { + "epoch": 0.9847702851341061, + "grad_norm": 0.7689309120178223, + "learning_rate": 0.001, + "loss": 3.5899, + "step": 23278 + }, + { + "epoch": 0.9848125898976224, + "grad_norm": 0.18673300743103027, + "learning_rate": 0.001, + "loss": 2.8243, + "step": 23279 + }, + { + "epoch": 0.9848548946611388, + "grad_norm": 1.7281594276428223, + "learning_rate": 0.001, + "loss": 1.5403, + "step": 23280 + }, + { + "epoch": 0.9848971994246553, + "grad_norm": 0.15205202996730804, + "learning_rate": 0.001, + "loss": 3.1236, + "step": 23281 + }, + { + "epoch": 0.9849395041881716, + "grad_norm": 0.14774078130722046, + "learning_rate": 0.001, + "loss": 2.1822, + "step": 23282 + }, + { + "epoch": 0.984981808951688, + "grad_norm": 0.15019498765468597, + "learning_rate": 0.001, + "loss": 1.8555, + "step": 23283 + }, + { + "epoch": 0.9850241137152044, + "grad_norm": 0.17441150546073914, + "learning_rate": 0.001, + "loss": 1.8417, + "step": 23284 + }, + { + "epoch": 0.9850664184787207, + "grad_norm": 0.16328319907188416, + "learning_rate": 0.001, + "loss": 1.957, + "step": 23285 + }, + { + "epoch": 0.9851087232422371, + "grad_norm": 1.1682124137878418, + "learning_rate": 0.001, + "loss": 1.9435, + "step": 23286 + }, + { + "epoch": 0.9851510280057535, + "grad_norm": 0.8976200819015503, + "learning_rate": 0.001, + "loss": 2.9577, + "step": 23287 + }, + { + "epoch": 0.9851933327692698, + "grad_norm": 0.18787872791290283, + "learning_rate": 0.001, + "loss": 2.205, + "step": 23288 + }, + { + "epoch": 0.9852356375327862, + "grad_norm": 0.17400221526622772, + "learning_rate": 0.001, + "loss": 1.8947, + "step": 23289 + }, + { + "epoch": 0.9852779422963026, + "grad_norm": 0.16904416680335999, + "learning_rate": 0.001, + "loss": 1.7336, + "step": 23290 + }, + { + "epoch": 0.9853202470598189, + "grad_norm": 0.2862572968006134, + "learning_rate": 0.001, + "loss": 2.0924, + "step": 23291 + }, + { + "epoch": 0.9853625518233353, + "grad_norm": 0.18694454431533813, + "learning_rate": 0.001, + "loss": 2.4069, + "step": 23292 + }, + { + "epoch": 0.9854048565868517, + "grad_norm": 0.17351536452770233, + "learning_rate": 0.001, + "loss": 2.3785, + "step": 23293 + }, + { + "epoch": 0.985447161350368, + "grad_norm": 0.16305550932884216, + "learning_rate": 0.001, + "loss": 2.5295, + "step": 23294 + }, + { + "epoch": 0.9854894661138844, + "grad_norm": 0.28731977939605713, + "learning_rate": 0.001, + "loss": 1.5698, + "step": 23295 + }, + { + "epoch": 0.9855317708774008, + "grad_norm": 0.19942089915275574, + "learning_rate": 0.001, + "loss": 2.6501, + "step": 23296 + }, + { + "epoch": 0.9855740756409171, + "grad_norm": 0.15859198570251465, + "learning_rate": 0.001, + "loss": 1.9334, + "step": 23297 + }, + { + "epoch": 0.9856163804044336, + "grad_norm": 0.17347994446754456, + "learning_rate": 0.001, + "loss": 2.281, + "step": 23298 + }, + { + "epoch": 0.98565868516795, + "grad_norm": 0.1470501720905304, + "learning_rate": 0.001, + "loss": 2.8596, + "step": 23299 + }, + { + "epoch": 0.9857009899314663, + "grad_norm": 0.17378562688827515, + "learning_rate": 0.001, + "loss": 2.6134, + "step": 23300 + }, + { + "epoch": 0.9857432946949827, + "grad_norm": 2.3224494457244873, + "learning_rate": 0.001, + "loss": 1.8163, + "step": 23301 + }, + { + "epoch": 0.985785599458499, + "grad_norm": 0.15434542298316956, + "learning_rate": 0.001, + "loss": 2.0795, + "step": 23302 + }, + { + "epoch": 0.9858279042220154, + "grad_norm": 0.17513182759284973, + "learning_rate": 0.001, + "loss": 1.9282, + "step": 23303 + }, + { + "epoch": 0.9858702089855318, + "grad_norm": 0.18898020684719086, + "learning_rate": 0.001, + "loss": 2.8024, + "step": 23304 + }, + { + "epoch": 0.9859125137490481, + "grad_norm": 0.5918177366256714, + "learning_rate": 0.001, + "loss": 2.3883, + "step": 23305 + }, + { + "epoch": 0.9859548185125645, + "grad_norm": 0.13751918077468872, + "learning_rate": 0.001, + "loss": 1.7054, + "step": 23306 + }, + { + "epoch": 0.9859971232760809, + "grad_norm": 0.1347641944885254, + "learning_rate": 0.001, + "loss": 1.615, + "step": 23307 + }, + { + "epoch": 0.9860394280395972, + "grad_norm": 0.21464622020721436, + "learning_rate": 0.001, + "loss": 2.3099, + "step": 23308 + }, + { + "epoch": 0.9860817328031136, + "grad_norm": 0.15518558025360107, + "learning_rate": 0.001, + "loss": 2.1591, + "step": 23309 + }, + { + "epoch": 0.98612403756663, + "grad_norm": 0.1407310962677002, + "learning_rate": 0.001, + "loss": 1.9204, + "step": 23310 + }, + { + "epoch": 0.9861663423301463, + "grad_norm": 0.11285633593797684, + "learning_rate": 0.001, + "loss": 1.7241, + "step": 23311 + }, + { + "epoch": 0.9862086470936627, + "grad_norm": 8.615754127502441, + "learning_rate": 0.001, + "loss": 2.5709, + "step": 23312 + }, + { + "epoch": 0.9862509518571791, + "grad_norm": 0.13533753156661987, + "learning_rate": 0.001, + "loss": 1.6491, + "step": 23313 + }, + { + "epoch": 0.9862932566206954, + "grad_norm": 0.16837237775325775, + "learning_rate": 0.001, + "loss": 2.4037, + "step": 23314 + }, + { + "epoch": 0.9863355613842119, + "grad_norm": 0.14844800531864166, + "learning_rate": 0.001, + "loss": 1.6873, + "step": 23315 + }, + { + "epoch": 0.9863778661477283, + "grad_norm": 0.13803304731845856, + "learning_rate": 0.001, + "loss": 2.2417, + "step": 23316 + }, + { + "epoch": 0.9864201709112446, + "grad_norm": 0.19187100231647491, + "learning_rate": 0.001, + "loss": 1.9479, + "step": 23317 + }, + { + "epoch": 0.986462475674761, + "grad_norm": 1.3388183116912842, + "learning_rate": 0.001, + "loss": 1.8091, + "step": 23318 + }, + { + "epoch": 0.9865047804382774, + "grad_norm": 0.18258285522460938, + "learning_rate": 0.001, + "loss": 2.8915, + "step": 23319 + }, + { + "epoch": 0.9865470852017937, + "grad_norm": 0.1443546712398529, + "learning_rate": 0.001, + "loss": 2.0421, + "step": 23320 + }, + { + "epoch": 0.9865893899653101, + "grad_norm": 0.16136842966079712, + "learning_rate": 0.001, + "loss": 2.1917, + "step": 23321 + }, + { + "epoch": 0.9866316947288265, + "grad_norm": 0.13147707283496857, + "learning_rate": 0.001, + "loss": 1.6761, + "step": 23322 + }, + { + "epoch": 0.9866739994923428, + "grad_norm": 0.18694503605365753, + "learning_rate": 0.001, + "loss": 2.0926, + "step": 23323 + }, + { + "epoch": 0.9867163042558592, + "grad_norm": 0.1893046796321869, + "learning_rate": 0.001, + "loss": 2.8645, + "step": 23324 + }, + { + "epoch": 0.9867586090193756, + "grad_norm": 0.2780762314796448, + "learning_rate": 0.001, + "loss": 2.0847, + "step": 23325 + }, + { + "epoch": 0.9868009137828919, + "grad_norm": 0.19630302488803864, + "learning_rate": 0.001, + "loss": 2.0885, + "step": 23326 + }, + { + "epoch": 0.9868432185464083, + "grad_norm": 0.1403011828660965, + "learning_rate": 0.001, + "loss": 1.564, + "step": 23327 + }, + { + "epoch": 0.9868855233099247, + "grad_norm": 0.8801196813583374, + "learning_rate": 0.001, + "loss": 2.5189, + "step": 23328 + }, + { + "epoch": 0.986927828073441, + "grad_norm": 0.13722547888755798, + "learning_rate": 0.001, + "loss": 2.0738, + "step": 23329 + }, + { + "epoch": 0.9869701328369574, + "grad_norm": 0.42092952132225037, + "learning_rate": 0.001, + "loss": 1.6655, + "step": 23330 + }, + { + "epoch": 0.9870124376004739, + "grad_norm": 0.19227460026741028, + "learning_rate": 0.001, + "loss": 2.1327, + "step": 23331 + }, + { + "epoch": 0.9870547423639902, + "grad_norm": 0.14953431487083435, + "learning_rate": 0.001, + "loss": 1.5976, + "step": 23332 + }, + { + "epoch": 0.9870970471275066, + "grad_norm": 1.6279288530349731, + "learning_rate": 0.001, + "loss": 2.4999, + "step": 23333 + }, + { + "epoch": 0.987139351891023, + "grad_norm": 0.15288491547107697, + "learning_rate": 0.001, + "loss": 1.9023, + "step": 23334 + }, + { + "epoch": 0.9871816566545393, + "grad_norm": 0.16339105367660522, + "learning_rate": 0.001, + "loss": 1.5201, + "step": 23335 + }, + { + "epoch": 0.9872239614180557, + "grad_norm": 0.19303716719150543, + "learning_rate": 0.001, + "loss": 2.255, + "step": 23336 + }, + { + "epoch": 0.9872662661815721, + "grad_norm": 0.16261610388755798, + "learning_rate": 0.001, + "loss": 1.7459, + "step": 23337 + }, + { + "epoch": 0.9873085709450884, + "grad_norm": 0.2173750400543213, + "learning_rate": 0.001, + "loss": 2.1332, + "step": 23338 + }, + { + "epoch": 0.9873508757086048, + "grad_norm": 0.5034084320068359, + "learning_rate": 0.001, + "loss": 2.0107, + "step": 23339 + }, + { + "epoch": 0.9873931804721212, + "grad_norm": 0.1737852692604065, + "learning_rate": 0.001, + "loss": 1.8748, + "step": 23340 + }, + { + "epoch": 0.9874354852356375, + "grad_norm": 0.19651900231838226, + "learning_rate": 0.001, + "loss": 2.7285, + "step": 23341 + }, + { + "epoch": 0.9874777899991539, + "grad_norm": 0.14730535447597504, + "learning_rate": 0.001, + "loss": 2.8565, + "step": 23342 + }, + { + "epoch": 0.9875200947626703, + "grad_norm": 0.15681281685829163, + "learning_rate": 0.001, + "loss": 2.0837, + "step": 23343 + }, + { + "epoch": 0.9875623995261866, + "grad_norm": 0.17979633808135986, + "learning_rate": 0.001, + "loss": 2.1084, + "step": 23344 + }, + { + "epoch": 0.987604704289703, + "grad_norm": 0.29083535075187683, + "learning_rate": 0.001, + "loss": 2.7915, + "step": 23345 + }, + { + "epoch": 0.9876470090532193, + "grad_norm": 0.16025808453559875, + "learning_rate": 0.001, + "loss": 2.1277, + "step": 23346 + }, + { + "epoch": 0.9876893138167357, + "grad_norm": 0.187299445271492, + "learning_rate": 0.001, + "loss": 2.0213, + "step": 23347 + }, + { + "epoch": 0.9877316185802522, + "grad_norm": 0.1659306287765503, + "learning_rate": 0.001, + "loss": 1.4863, + "step": 23348 + }, + { + "epoch": 0.9877739233437685, + "grad_norm": 0.13653291761875153, + "learning_rate": 0.001, + "loss": 2.0724, + "step": 23349 + }, + { + "epoch": 0.9878162281072849, + "grad_norm": 0.19335125386714935, + "learning_rate": 0.001, + "loss": 2.5707, + "step": 23350 + }, + { + "epoch": 0.9878585328708013, + "grad_norm": 0.17489448189735413, + "learning_rate": 0.001, + "loss": 2.4171, + "step": 23351 + }, + { + "epoch": 0.9879008376343176, + "grad_norm": 0.15927927196025848, + "learning_rate": 0.001, + "loss": 1.9707, + "step": 23352 + }, + { + "epoch": 0.987943142397834, + "grad_norm": 0.13132639229297638, + "learning_rate": 0.001, + "loss": 1.7196, + "step": 23353 + }, + { + "epoch": 0.9879854471613504, + "grad_norm": 0.7293382883071899, + "learning_rate": 0.001, + "loss": 2.0281, + "step": 23354 + }, + { + "epoch": 0.9880277519248667, + "grad_norm": 0.151032954454422, + "learning_rate": 0.001, + "loss": 1.6911, + "step": 23355 + }, + { + "epoch": 0.9880700566883831, + "grad_norm": 0.13139599561691284, + "learning_rate": 0.001, + "loss": 1.4657, + "step": 23356 + }, + { + "epoch": 0.9881123614518995, + "grad_norm": 0.14863912761211395, + "learning_rate": 0.001, + "loss": 1.7769, + "step": 23357 + }, + { + "epoch": 0.9881546662154158, + "grad_norm": 0.14839519560337067, + "learning_rate": 0.001, + "loss": 2.1032, + "step": 23358 + }, + { + "epoch": 0.9881969709789322, + "grad_norm": 0.12207513302564621, + "learning_rate": 0.001, + "loss": 1.7226, + "step": 23359 + }, + { + "epoch": 0.9882392757424486, + "grad_norm": 0.18079492449760437, + "learning_rate": 0.001, + "loss": 1.8141, + "step": 23360 + }, + { + "epoch": 0.9882815805059649, + "grad_norm": 0.1407337784767151, + "learning_rate": 0.001, + "loss": 2.2357, + "step": 23361 + }, + { + "epoch": 0.9883238852694813, + "grad_norm": 0.1415151059627533, + "learning_rate": 0.001, + "loss": 1.8046, + "step": 23362 + }, + { + "epoch": 0.9883661900329977, + "grad_norm": 2.5134902000427246, + "learning_rate": 0.001, + "loss": 2.7073, + "step": 23363 + }, + { + "epoch": 0.988408494796514, + "grad_norm": 0.16898906230926514, + "learning_rate": 0.001, + "loss": 3.0829, + "step": 23364 + }, + { + "epoch": 0.9884507995600305, + "grad_norm": 0.13951623439788818, + "learning_rate": 0.001, + "loss": 2.0555, + "step": 23365 + }, + { + "epoch": 0.9884931043235469, + "grad_norm": 0.14504913985729218, + "learning_rate": 0.001, + "loss": 2.0594, + "step": 23366 + }, + { + "epoch": 0.9885354090870632, + "grad_norm": 0.5770459771156311, + "learning_rate": 0.001, + "loss": 2.0616, + "step": 23367 + }, + { + "epoch": 0.9885777138505796, + "grad_norm": 0.16317375004291534, + "learning_rate": 0.001, + "loss": 2.1634, + "step": 23368 + }, + { + "epoch": 0.988620018614096, + "grad_norm": 0.1316281110048294, + "learning_rate": 0.001, + "loss": 2.3575, + "step": 23369 + }, + { + "epoch": 0.9886623233776123, + "grad_norm": 0.14289572834968567, + "learning_rate": 0.001, + "loss": 1.5892, + "step": 23370 + }, + { + "epoch": 0.9887046281411287, + "grad_norm": 0.16080570220947266, + "learning_rate": 0.001, + "loss": 2.1795, + "step": 23371 + }, + { + "epoch": 0.9887469329046451, + "grad_norm": 0.185518279671669, + "learning_rate": 0.001, + "loss": 3.0211, + "step": 23372 + }, + { + "epoch": 0.9887892376681614, + "grad_norm": 0.13568751513957977, + "learning_rate": 0.001, + "loss": 2.4641, + "step": 23373 + }, + { + "epoch": 0.9888315424316778, + "grad_norm": 0.17552120983600616, + "learning_rate": 0.001, + "loss": 2.334, + "step": 23374 + }, + { + "epoch": 0.9888738471951942, + "grad_norm": 0.15184400975704193, + "learning_rate": 0.001, + "loss": 2.95, + "step": 23375 + }, + { + "epoch": 0.9889161519587105, + "grad_norm": 0.14975741505622864, + "learning_rate": 0.001, + "loss": 2.5531, + "step": 23376 + }, + { + "epoch": 0.9889584567222269, + "grad_norm": 0.12690399587154388, + "learning_rate": 0.001, + "loss": 1.8152, + "step": 23377 + }, + { + "epoch": 0.9890007614857433, + "grad_norm": 0.15675854682922363, + "learning_rate": 0.001, + "loss": 2.5597, + "step": 23378 + }, + { + "epoch": 0.9890430662492596, + "grad_norm": 0.15196675062179565, + "learning_rate": 0.001, + "loss": 2.6782, + "step": 23379 + }, + { + "epoch": 0.989085371012776, + "grad_norm": 0.7298043966293335, + "learning_rate": 0.001, + "loss": 2.3701, + "step": 23380 + }, + { + "epoch": 0.9891276757762925, + "grad_norm": 0.14933039247989655, + "learning_rate": 0.001, + "loss": 1.7975, + "step": 23381 + }, + { + "epoch": 0.9891699805398088, + "grad_norm": 0.15482641756534576, + "learning_rate": 0.001, + "loss": 2.4515, + "step": 23382 + }, + { + "epoch": 0.9892122853033252, + "grad_norm": 1.0311862230300903, + "learning_rate": 0.001, + "loss": 2.0674, + "step": 23383 + }, + { + "epoch": 0.9892545900668416, + "grad_norm": 0.13115628063678741, + "learning_rate": 0.001, + "loss": 2.2563, + "step": 23384 + }, + { + "epoch": 0.9892968948303579, + "grad_norm": 0.15161246061325073, + "learning_rate": 0.001, + "loss": 1.3509, + "step": 23385 + }, + { + "epoch": 0.9893391995938743, + "grad_norm": 0.14778055250644684, + "learning_rate": 0.001, + "loss": 2.6554, + "step": 23386 + }, + { + "epoch": 0.9893815043573907, + "grad_norm": 0.1839752495288849, + "learning_rate": 0.001, + "loss": 2.5877, + "step": 23387 + }, + { + "epoch": 0.989423809120907, + "grad_norm": 0.5433216094970703, + "learning_rate": 0.001, + "loss": 2.4707, + "step": 23388 + }, + { + "epoch": 0.9894661138844234, + "grad_norm": 0.19804272055625916, + "learning_rate": 0.001, + "loss": 1.9891, + "step": 23389 + }, + { + "epoch": 0.9895084186479397, + "grad_norm": 0.21847547590732574, + "learning_rate": 0.001, + "loss": 5.1899, + "step": 23390 + }, + { + "epoch": 0.9895507234114561, + "grad_norm": 0.8081578612327576, + "learning_rate": 0.001, + "loss": 2.3353, + "step": 23391 + }, + { + "epoch": 0.9895930281749725, + "grad_norm": 0.13023246824741364, + "learning_rate": 0.001, + "loss": 2.8709, + "step": 23392 + }, + { + "epoch": 0.9896353329384888, + "grad_norm": 0.16377229988574982, + "learning_rate": 0.001, + "loss": 1.5964, + "step": 23393 + }, + { + "epoch": 0.9896776377020052, + "grad_norm": 0.17955902218818665, + "learning_rate": 0.001, + "loss": 1.7785, + "step": 23394 + }, + { + "epoch": 0.9897199424655216, + "grad_norm": 0.18906940519809723, + "learning_rate": 0.001, + "loss": 2.842, + "step": 23395 + }, + { + "epoch": 0.9897622472290379, + "grad_norm": 0.20560045540332794, + "learning_rate": 0.001, + "loss": 1.9539, + "step": 23396 + }, + { + "epoch": 0.9898045519925543, + "grad_norm": 0.1924160122871399, + "learning_rate": 0.001, + "loss": 1.8572, + "step": 23397 + }, + { + "epoch": 0.9898468567560708, + "grad_norm": 0.1467902511358261, + "learning_rate": 0.001, + "loss": 1.7799, + "step": 23398 + }, + { + "epoch": 0.9898891615195871, + "grad_norm": 0.15988442301750183, + "learning_rate": 0.001, + "loss": 1.7468, + "step": 23399 + }, + { + "epoch": 0.9899314662831035, + "grad_norm": 0.16045702993869781, + "learning_rate": 0.001, + "loss": 2.8367, + "step": 23400 + }, + { + "epoch": 0.9899737710466199, + "grad_norm": 0.16333599388599396, + "learning_rate": 0.001, + "loss": 1.6981, + "step": 23401 + }, + { + "epoch": 0.9900160758101362, + "grad_norm": 0.1779310405254364, + "learning_rate": 0.001, + "loss": 1.8536, + "step": 23402 + }, + { + "epoch": 0.9900583805736526, + "grad_norm": 0.8571863770484924, + "learning_rate": 0.001, + "loss": 2.8216, + "step": 23403 + }, + { + "epoch": 0.990100685337169, + "grad_norm": 0.18227769434452057, + "learning_rate": 0.001, + "loss": 1.8207, + "step": 23404 + }, + { + "epoch": 0.9901429901006853, + "grad_norm": 0.1805211752653122, + "learning_rate": 0.001, + "loss": 2.2027, + "step": 23405 + }, + { + "epoch": 0.9901852948642017, + "grad_norm": 1.146898865699768, + "learning_rate": 0.001, + "loss": 2.0096, + "step": 23406 + }, + { + "epoch": 0.9902275996277181, + "grad_norm": 0.14902471005916595, + "learning_rate": 0.001, + "loss": 1.337, + "step": 23407 + }, + { + "epoch": 0.9902699043912344, + "grad_norm": 0.17255893349647522, + "learning_rate": 0.001, + "loss": 2.4503, + "step": 23408 + }, + { + "epoch": 0.9903122091547508, + "grad_norm": 0.18396355211734772, + "learning_rate": 0.001, + "loss": 2.3969, + "step": 23409 + }, + { + "epoch": 0.9903545139182672, + "grad_norm": 0.17676624655723572, + "learning_rate": 0.001, + "loss": 1.7923, + "step": 23410 + }, + { + "epoch": 0.9903968186817835, + "grad_norm": 0.19076810777187347, + "learning_rate": 0.001, + "loss": 1.9227, + "step": 23411 + }, + { + "epoch": 0.9904391234452999, + "grad_norm": 0.15760193765163422, + "learning_rate": 0.001, + "loss": 2.9747, + "step": 23412 + }, + { + "epoch": 0.9904814282088164, + "grad_norm": 0.20325231552124023, + "learning_rate": 0.001, + "loss": 1.7755, + "step": 23413 + }, + { + "epoch": 0.9905237329723326, + "grad_norm": 0.17613425850868225, + "learning_rate": 0.001, + "loss": 2.0028, + "step": 23414 + }, + { + "epoch": 0.9905660377358491, + "grad_norm": 0.20244133472442627, + "learning_rate": 0.001, + "loss": 2.5725, + "step": 23415 + }, + { + "epoch": 0.9906083424993655, + "grad_norm": 0.18409040570259094, + "learning_rate": 0.001, + "loss": 3.6709, + "step": 23416 + }, + { + "epoch": 0.9906506472628818, + "grad_norm": 0.15753605961799622, + "learning_rate": 0.001, + "loss": 2.0971, + "step": 23417 + }, + { + "epoch": 0.9906929520263982, + "grad_norm": 0.14398153126239777, + "learning_rate": 0.001, + "loss": 2.2371, + "step": 23418 + }, + { + "epoch": 0.9907352567899146, + "grad_norm": 0.1526634693145752, + "learning_rate": 0.001, + "loss": 2.2506, + "step": 23419 + }, + { + "epoch": 0.9907775615534309, + "grad_norm": 0.12150632590055466, + "learning_rate": 0.001, + "loss": 2.0968, + "step": 23420 + }, + { + "epoch": 0.9908198663169473, + "grad_norm": 0.7939785122871399, + "learning_rate": 0.001, + "loss": 2.479, + "step": 23421 + }, + { + "epoch": 0.9908621710804637, + "grad_norm": 0.16594654321670532, + "learning_rate": 0.001, + "loss": 1.6048, + "step": 23422 + }, + { + "epoch": 0.99090447584398, + "grad_norm": 0.1510103940963745, + "learning_rate": 0.001, + "loss": 2.7858, + "step": 23423 + }, + { + "epoch": 0.9909467806074964, + "grad_norm": 0.23928913474082947, + "learning_rate": 0.001, + "loss": 1.7646, + "step": 23424 + }, + { + "epoch": 0.9909890853710128, + "grad_norm": 0.14759401977062225, + "learning_rate": 0.001, + "loss": 1.6538, + "step": 23425 + }, + { + "epoch": 0.9910313901345291, + "grad_norm": 0.19283150136470795, + "learning_rate": 0.001, + "loss": 2.6298, + "step": 23426 + }, + { + "epoch": 0.9910736948980455, + "grad_norm": 3.1644039154052734, + "learning_rate": 0.001, + "loss": 2.4429, + "step": 23427 + }, + { + "epoch": 0.9911159996615619, + "grad_norm": 0.15195727348327637, + "learning_rate": 0.001, + "loss": 2.3545, + "step": 23428 + }, + { + "epoch": 0.9911583044250782, + "grad_norm": 0.13520923256874084, + "learning_rate": 0.001, + "loss": 2.0637, + "step": 23429 + }, + { + "epoch": 0.9912006091885947, + "grad_norm": 0.1306263655424118, + "learning_rate": 0.001, + "loss": 1.5021, + "step": 23430 + }, + { + "epoch": 0.9912429139521111, + "grad_norm": 0.16827377676963806, + "learning_rate": 0.001, + "loss": 3.3265, + "step": 23431 + }, + { + "epoch": 0.9912852187156274, + "grad_norm": 0.14732570946216583, + "learning_rate": 0.001, + "loss": 2.0172, + "step": 23432 + }, + { + "epoch": 0.9913275234791438, + "grad_norm": 3.2398881912231445, + "learning_rate": 0.001, + "loss": 2.523, + "step": 23433 + }, + { + "epoch": 0.9913698282426602, + "grad_norm": 0.14063461124897003, + "learning_rate": 0.001, + "loss": 1.9463, + "step": 23434 + }, + { + "epoch": 0.9914121330061765, + "grad_norm": 0.2017151415348053, + "learning_rate": 0.001, + "loss": 1.9935, + "step": 23435 + }, + { + "epoch": 0.9914544377696929, + "grad_norm": 0.1547040343284607, + "learning_rate": 0.001, + "loss": 1.7171, + "step": 23436 + }, + { + "epoch": 0.9914967425332092, + "grad_norm": 0.1772545576095581, + "learning_rate": 0.001, + "loss": 2.6763, + "step": 23437 + }, + { + "epoch": 0.9915390472967256, + "grad_norm": 0.158152773976326, + "learning_rate": 0.001, + "loss": 1.7772, + "step": 23438 + }, + { + "epoch": 0.991581352060242, + "grad_norm": 0.17712469398975372, + "learning_rate": 0.001, + "loss": 1.6187, + "step": 23439 + }, + { + "epoch": 0.9916236568237583, + "grad_norm": 0.16980746388435364, + "learning_rate": 0.001, + "loss": 2.7735, + "step": 23440 + }, + { + "epoch": 0.9916659615872747, + "grad_norm": 1.1700719594955444, + "learning_rate": 0.001, + "loss": 2.1555, + "step": 23441 + }, + { + "epoch": 0.9917082663507911, + "grad_norm": 0.1690703183412552, + "learning_rate": 0.001, + "loss": 1.6741, + "step": 23442 + }, + { + "epoch": 0.9917505711143074, + "grad_norm": 0.15942858159542084, + "learning_rate": 0.001, + "loss": 1.7488, + "step": 23443 + }, + { + "epoch": 0.9917928758778238, + "grad_norm": 0.15770910680294037, + "learning_rate": 0.001, + "loss": 3.3523, + "step": 23444 + }, + { + "epoch": 0.9918351806413402, + "grad_norm": 11.168841361999512, + "learning_rate": 0.001, + "loss": 2.2278, + "step": 23445 + }, + { + "epoch": 0.9918774854048565, + "grad_norm": 0.173135906457901, + "learning_rate": 0.001, + "loss": 2.5919, + "step": 23446 + }, + { + "epoch": 0.991919790168373, + "grad_norm": 0.13830503821372986, + "learning_rate": 0.001, + "loss": 2.0946, + "step": 23447 + }, + { + "epoch": 0.9919620949318894, + "grad_norm": 0.1732984036207199, + "learning_rate": 0.001, + "loss": 1.8609, + "step": 23448 + }, + { + "epoch": 0.9920043996954057, + "grad_norm": 0.1577487289905548, + "learning_rate": 0.001, + "loss": 2.0619, + "step": 23449 + }, + { + "epoch": 0.9920467044589221, + "grad_norm": 0.9779529571533203, + "learning_rate": 0.001, + "loss": 1.9558, + "step": 23450 + }, + { + "epoch": 0.9920890092224385, + "grad_norm": 0.21312867105007172, + "learning_rate": 0.001, + "loss": 2.2838, + "step": 23451 + }, + { + "epoch": 0.9921313139859548, + "grad_norm": 0.18126481771469116, + "learning_rate": 0.001, + "loss": 1.9558, + "step": 23452 + }, + { + "epoch": 0.9921736187494712, + "grad_norm": 0.9032267928123474, + "learning_rate": 0.001, + "loss": 2.3783, + "step": 23453 + }, + { + "epoch": 0.9922159235129876, + "grad_norm": 0.13526488840579987, + "learning_rate": 0.001, + "loss": 2.0193, + "step": 23454 + }, + { + "epoch": 0.9922582282765039, + "grad_norm": 0.16340215504169464, + "learning_rate": 0.001, + "loss": 2.604, + "step": 23455 + }, + { + "epoch": 0.9923005330400203, + "grad_norm": 0.14608405530452728, + "learning_rate": 0.001, + "loss": 3.0248, + "step": 23456 + }, + { + "epoch": 0.9923428378035367, + "grad_norm": 0.15734529495239258, + "learning_rate": 0.001, + "loss": 1.6765, + "step": 23457 + }, + { + "epoch": 0.992385142567053, + "grad_norm": 0.14332926273345947, + "learning_rate": 0.001, + "loss": 2.2598, + "step": 23458 + }, + { + "epoch": 0.9924274473305694, + "grad_norm": 1.3368622064590454, + "learning_rate": 0.001, + "loss": 3.2208, + "step": 23459 + }, + { + "epoch": 0.9924697520940858, + "grad_norm": 0.16903036832809448, + "learning_rate": 0.001, + "loss": 1.5743, + "step": 23460 + }, + { + "epoch": 0.9925120568576021, + "grad_norm": 0.1450778692960739, + "learning_rate": 0.001, + "loss": 2.1063, + "step": 23461 + }, + { + "epoch": 0.9925543616211185, + "grad_norm": 0.24200314283370972, + "learning_rate": 0.001, + "loss": 2.7203, + "step": 23462 + }, + { + "epoch": 0.992596666384635, + "grad_norm": 0.1364940106868744, + "learning_rate": 0.001, + "loss": 2.2594, + "step": 23463 + }, + { + "epoch": 0.9926389711481513, + "grad_norm": 0.15272387862205505, + "learning_rate": 0.001, + "loss": 1.5758, + "step": 23464 + }, + { + "epoch": 0.9926812759116677, + "grad_norm": 0.1937706619501114, + "learning_rate": 0.001, + "loss": 2.4084, + "step": 23465 + }, + { + "epoch": 0.9927235806751841, + "grad_norm": 0.1765604466199875, + "learning_rate": 0.001, + "loss": 1.7184, + "step": 23466 + }, + { + "epoch": 0.9927658854387004, + "grad_norm": 0.17739205062389374, + "learning_rate": 0.001, + "loss": 1.9615, + "step": 23467 + }, + { + "epoch": 0.9928081902022168, + "grad_norm": 1.2467105388641357, + "learning_rate": 0.001, + "loss": 1.6635, + "step": 23468 + }, + { + "epoch": 0.9928504949657332, + "grad_norm": 1.0034493207931519, + "learning_rate": 0.001, + "loss": 2.3672, + "step": 23469 + }, + { + "epoch": 0.9928927997292495, + "grad_norm": 0.19060944020748138, + "learning_rate": 0.001, + "loss": 2.5745, + "step": 23470 + }, + { + "epoch": 0.9929351044927659, + "grad_norm": 0.18545667827129364, + "learning_rate": 0.001, + "loss": 2.2075, + "step": 23471 + }, + { + "epoch": 0.9929774092562823, + "grad_norm": 0.1679558902978897, + "learning_rate": 0.001, + "loss": 1.7822, + "step": 23472 + }, + { + "epoch": 0.9930197140197986, + "grad_norm": 0.21493469178676605, + "learning_rate": 0.001, + "loss": 3.6993, + "step": 23473 + }, + { + "epoch": 0.993062018783315, + "grad_norm": 0.21583551168441772, + "learning_rate": 0.001, + "loss": 2.2144, + "step": 23474 + }, + { + "epoch": 0.9931043235468314, + "grad_norm": 0.1545204520225525, + "learning_rate": 0.001, + "loss": 1.8569, + "step": 23475 + }, + { + "epoch": 0.9931466283103477, + "grad_norm": 0.17547298967838287, + "learning_rate": 0.001, + "loss": 1.6119, + "step": 23476 + }, + { + "epoch": 0.9931889330738641, + "grad_norm": 0.18335486948490143, + "learning_rate": 0.001, + "loss": 1.8375, + "step": 23477 + }, + { + "epoch": 0.9932312378373805, + "grad_norm": 0.17604441940784454, + "learning_rate": 0.001, + "loss": 1.7185, + "step": 23478 + }, + { + "epoch": 0.9932735426008968, + "grad_norm": 0.1612115502357483, + "learning_rate": 0.001, + "loss": 1.7173, + "step": 23479 + }, + { + "epoch": 0.9933158473644133, + "grad_norm": 0.17903564870357513, + "learning_rate": 0.001, + "loss": 2.2194, + "step": 23480 + }, + { + "epoch": 0.9933581521279296, + "grad_norm": 0.15683114528656006, + "learning_rate": 0.001, + "loss": 1.9522, + "step": 23481 + }, + { + "epoch": 0.993400456891446, + "grad_norm": 0.19979719817638397, + "learning_rate": 0.001, + "loss": 2.8421, + "step": 23482 + }, + { + "epoch": 0.9934427616549624, + "grad_norm": 0.16343528032302856, + "learning_rate": 0.001, + "loss": 2.3331, + "step": 23483 + }, + { + "epoch": 0.9934850664184787, + "grad_norm": 0.14997512102127075, + "learning_rate": 0.001, + "loss": 1.9094, + "step": 23484 + }, + { + "epoch": 0.9935273711819951, + "grad_norm": 0.14654596149921417, + "learning_rate": 0.001, + "loss": 2.325, + "step": 23485 + }, + { + "epoch": 0.9935696759455115, + "grad_norm": 0.15524841845035553, + "learning_rate": 0.001, + "loss": 2.0226, + "step": 23486 + }, + { + "epoch": 0.9936119807090278, + "grad_norm": 0.1682468056678772, + "learning_rate": 0.001, + "loss": 1.9113, + "step": 23487 + }, + { + "epoch": 0.9936542854725442, + "grad_norm": 0.15577398240566254, + "learning_rate": 0.001, + "loss": 2.6625, + "step": 23488 + }, + { + "epoch": 0.9936965902360606, + "grad_norm": 0.1386529803276062, + "learning_rate": 0.001, + "loss": 1.7642, + "step": 23489 + }, + { + "epoch": 0.9937388949995769, + "grad_norm": 0.1561625450849533, + "learning_rate": 0.001, + "loss": 2.7162, + "step": 23490 + }, + { + "epoch": 0.9937811997630933, + "grad_norm": 0.22317662835121155, + "learning_rate": 0.001, + "loss": 2.0305, + "step": 23491 + }, + { + "epoch": 0.9938235045266097, + "grad_norm": 0.1401653289794922, + "learning_rate": 0.001, + "loss": 2.2692, + "step": 23492 + }, + { + "epoch": 0.993865809290126, + "grad_norm": 0.14323166012763977, + "learning_rate": 0.001, + "loss": 1.724, + "step": 23493 + }, + { + "epoch": 0.9939081140536424, + "grad_norm": 0.1282302439212799, + "learning_rate": 0.001, + "loss": 2.036, + "step": 23494 + }, + { + "epoch": 0.9939504188171588, + "grad_norm": 0.1263008862733841, + "learning_rate": 0.001, + "loss": 1.3955, + "step": 23495 + }, + { + "epoch": 0.9939927235806751, + "grad_norm": 0.13142234086990356, + "learning_rate": 0.001, + "loss": 2.1095, + "step": 23496 + }, + { + "epoch": 0.9940350283441916, + "grad_norm": 0.17697520554065704, + "learning_rate": 0.001, + "loss": 2.3401, + "step": 23497 + }, + { + "epoch": 0.994077333107708, + "grad_norm": 1.2804735898971558, + "learning_rate": 0.001, + "loss": 2.5458, + "step": 23498 + }, + { + "epoch": 0.9941196378712243, + "grad_norm": 0.24874600768089294, + "learning_rate": 0.001, + "loss": 2.6729, + "step": 23499 + }, + { + "epoch": 0.9941619426347407, + "grad_norm": 0.2032213658094406, + "learning_rate": 0.001, + "loss": 2.7131, + "step": 23500 + }, + { + "epoch": 0.9942042473982571, + "grad_norm": 0.267898827791214, + "learning_rate": 0.001, + "loss": 2.2249, + "step": 23501 + }, + { + "epoch": 0.9942465521617734, + "grad_norm": 0.1407955288887024, + "learning_rate": 0.001, + "loss": 2.842, + "step": 23502 + }, + { + "epoch": 0.9942888569252898, + "grad_norm": 10.123157501220703, + "learning_rate": 0.001, + "loss": 1.9959, + "step": 23503 + }, + { + "epoch": 0.9943311616888062, + "grad_norm": 0.1459282636642456, + "learning_rate": 0.001, + "loss": 2.1619, + "step": 23504 + }, + { + "epoch": 0.9943734664523225, + "grad_norm": 0.17329807579517365, + "learning_rate": 0.001, + "loss": 2.1785, + "step": 23505 + }, + { + "epoch": 0.9944157712158389, + "grad_norm": 0.11436944454908371, + "learning_rate": 0.001, + "loss": 2.0212, + "step": 23506 + }, + { + "epoch": 0.9944580759793553, + "grad_norm": 0.15068282186985016, + "learning_rate": 0.001, + "loss": 1.7319, + "step": 23507 + }, + { + "epoch": 0.9945003807428716, + "grad_norm": 0.14463482797145844, + "learning_rate": 0.001, + "loss": 1.932, + "step": 23508 + }, + { + "epoch": 0.994542685506388, + "grad_norm": 0.1417880356311798, + "learning_rate": 0.001, + "loss": 2.3686, + "step": 23509 + }, + { + "epoch": 0.9945849902699044, + "grad_norm": 2.411515951156616, + "learning_rate": 0.001, + "loss": 1.9726, + "step": 23510 + }, + { + "epoch": 0.9946272950334207, + "grad_norm": 0.15016627311706543, + "learning_rate": 0.001, + "loss": 2.7317, + "step": 23511 + }, + { + "epoch": 0.9946695997969371, + "grad_norm": 0.16307124495506287, + "learning_rate": 0.001, + "loss": 2.3945, + "step": 23512 + }, + { + "epoch": 0.9947119045604536, + "grad_norm": 0.2472067028284073, + "learning_rate": 0.001, + "loss": 1.323, + "step": 23513 + }, + { + "epoch": 0.9947542093239699, + "grad_norm": 0.14261025190353394, + "learning_rate": 0.001, + "loss": 1.6545, + "step": 23514 + }, + { + "epoch": 0.9947965140874863, + "grad_norm": 0.21985666453838348, + "learning_rate": 0.001, + "loss": 1.6186, + "step": 23515 + }, + { + "epoch": 0.9948388188510027, + "grad_norm": 0.18958121538162231, + "learning_rate": 0.001, + "loss": 3.1224, + "step": 23516 + }, + { + "epoch": 0.994881123614519, + "grad_norm": 0.18746091425418854, + "learning_rate": 0.001, + "loss": 1.7352, + "step": 23517 + }, + { + "epoch": 0.9949234283780354, + "grad_norm": 0.3156006932258606, + "learning_rate": 0.001, + "loss": 3.373, + "step": 23518 + }, + { + "epoch": 0.9949657331415518, + "grad_norm": 0.1298127919435501, + "learning_rate": 0.001, + "loss": 2.0672, + "step": 23519 + }, + { + "epoch": 0.9950080379050681, + "grad_norm": 0.12908710539340973, + "learning_rate": 0.001, + "loss": 1.835, + "step": 23520 + }, + { + "epoch": 0.9950503426685845, + "grad_norm": 0.2133929282426834, + "learning_rate": 0.001, + "loss": 3.1512, + "step": 23521 + }, + { + "epoch": 0.9950926474321009, + "grad_norm": 0.145905002951622, + "learning_rate": 0.001, + "loss": 1.3302, + "step": 23522 + }, + { + "epoch": 0.9951349521956172, + "grad_norm": 0.19927392899990082, + "learning_rate": 0.001, + "loss": 3.0666, + "step": 23523 + }, + { + "epoch": 0.9951772569591336, + "grad_norm": 0.1459832340478897, + "learning_rate": 0.001, + "loss": 2.3249, + "step": 23524 + }, + { + "epoch": 0.99521956172265, + "grad_norm": 0.19735519587993622, + "learning_rate": 0.001, + "loss": 1.8252, + "step": 23525 + }, + { + "epoch": 0.9952618664861663, + "grad_norm": 0.15133242309093475, + "learning_rate": 0.001, + "loss": 1.5066, + "step": 23526 + }, + { + "epoch": 0.9953041712496827, + "grad_norm": 7.522138595581055, + "learning_rate": 0.001, + "loss": 2.1434, + "step": 23527 + }, + { + "epoch": 0.995346476013199, + "grad_norm": 0.1376638412475586, + "learning_rate": 0.001, + "loss": 1.7716, + "step": 23528 + }, + { + "epoch": 0.9953887807767154, + "grad_norm": 1.5856484174728394, + "learning_rate": 0.001, + "loss": 1.7964, + "step": 23529 + }, + { + "epoch": 0.9954310855402319, + "grad_norm": 0.16695480048656464, + "learning_rate": 0.001, + "loss": 1.9568, + "step": 23530 + }, + { + "epoch": 0.9954733903037482, + "grad_norm": 0.6410706043243408, + "learning_rate": 0.001, + "loss": 2.1194, + "step": 23531 + }, + { + "epoch": 0.9955156950672646, + "grad_norm": 0.31505700945854187, + "learning_rate": 0.001, + "loss": 1.8234, + "step": 23532 + }, + { + "epoch": 0.995557999830781, + "grad_norm": 0.1771719753742218, + "learning_rate": 0.001, + "loss": 2.5342, + "step": 23533 + }, + { + "epoch": 0.9956003045942973, + "grad_norm": 0.18417403101921082, + "learning_rate": 0.001, + "loss": 2.5644, + "step": 23534 + }, + { + "epoch": 0.9956426093578137, + "grad_norm": 0.15871447324752808, + "learning_rate": 0.001, + "loss": 2.7864, + "step": 23535 + }, + { + "epoch": 0.9956849141213301, + "grad_norm": 0.2665135860443115, + "learning_rate": 0.001, + "loss": 2.847, + "step": 23536 + }, + { + "epoch": 0.9957272188848464, + "grad_norm": 0.7563045620918274, + "learning_rate": 0.001, + "loss": 3.1899, + "step": 23537 + }, + { + "epoch": 0.9957695236483628, + "grad_norm": 0.1422766149044037, + "learning_rate": 0.001, + "loss": 2.0713, + "step": 23538 + }, + { + "epoch": 0.9958118284118792, + "grad_norm": 0.14153209328651428, + "learning_rate": 0.001, + "loss": 1.7366, + "step": 23539 + }, + { + "epoch": 0.9958541331753955, + "grad_norm": 0.1573881059885025, + "learning_rate": 0.001, + "loss": 1.9069, + "step": 23540 + }, + { + "epoch": 0.9958964379389119, + "grad_norm": 39.94224166870117, + "learning_rate": 0.001, + "loss": 2.0561, + "step": 23541 + }, + { + "epoch": 0.9959387427024283, + "grad_norm": 0.1591683328151703, + "learning_rate": 0.001, + "loss": 3.2608, + "step": 23542 + }, + { + "epoch": 0.9959810474659446, + "grad_norm": 0.19343800842761993, + "learning_rate": 0.001, + "loss": 3.1865, + "step": 23543 + }, + { + "epoch": 0.996023352229461, + "grad_norm": 0.15812444686889648, + "learning_rate": 0.001, + "loss": 2.2566, + "step": 23544 + }, + { + "epoch": 0.9960656569929774, + "grad_norm": 0.3023439049720764, + "learning_rate": 0.001, + "loss": 2.1273, + "step": 23545 + }, + { + "epoch": 0.9961079617564937, + "grad_norm": 0.14426670968532562, + "learning_rate": 0.001, + "loss": 1.5217, + "step": 23546 + }, + { + "epoch": 0.9961502665200102, + "grad_norm": 0.5261450409889221, + "learning_rate": 0.001, + "loss": 1.8896, + "step": 23547 + }, + { + "epoch": 0.9961925712835266, + "grad_norm": 0.1593768447637558, + "learning_rate": 0.001, + "loss": 1.302, + "step": 23548 + }, + { + "epoch": 0.9962348760470429, + "grad_norm": 0.19295907020568848, + "learning_rate": 0.001, + "loss": 2.4438, + "step": 23549 + }, + { + "epoch": 0.9962771808105593, + "grad_norm": 0.1521875560283661, + "learning_rate": 0.001, + "loss": 1.3707, + "step": 23550 + }, + { + "epoch": 0.9963194855740757, + "grad_norm": 0.1482807993888855, + "learning_rate": 0.001, + "loss": 2.3517, + "step": 23551 + }, + { + "epoch": 0.996361790337592, + "grad_norm": 0.1328219175338745, + "learning_rate": 0.001, + "loss": 1.6607, + "step": 23552 + }, + { + "epoch": 0.9964040951011084, + "grad_norm": 0.13568271696567535, + "learning_rate": 0.001, + "loss": 2.0808, + "step": 23553 + }, + { + "epoch": 0.9964463998646248, + "grad_norm": 0.17867201566696167, + "learning_rate": 0.001, + "loss": 1.8838, + "step": 23554 + }, + { + "epoch": 0.9964887046281411, + "grad_norm": 0.14420682191848755, + "learning_rate": 0.001, + "loss": 2.1963, + "step": 23555 + }, + { + "epoch": 0.9965310093916575, + "grad_norm": 0.19156771898269653, + "learning_rate": 0.001, + "loss": 2.6953, + "step": 23556 + }, + { + "epoch": 0.9965733141551739, + "grad_norm": 0.13624778389930725, + "learning_rate": 0.001, + "loss": 2.076, + "step": 23557 + }, + { + "epoch": 0.9966156189186902, + "grad_norm": 0.15911436080932617, + "learning_rate": 0.001, + "loss": 2.2559, + "step": 23558 + }, + { + "epoch": 0.9966579236822066, + "grad_norm": 0.16328518092632294, + "learning_rate": 0.001, + "loss": 1.7107, + "step": 23559 + }, + { + "epoch": 0.996700228445723, + "grad_norm": 0.20304375886917114, + "learning_rate": 0.001, + "loss": 2.0468, + "step": 23560 + }, + { + "epoch": 0.9967425332092393, + "grad_norm": 0.16803617775440216, + "learning_rate": 0.001, + "loss": 2.2345, + "step": 23561 + }, + { + "epoch": 0.9967848379727557, + "grad_norm": 0.13952644169330597, + "learning_rate": 0.001, + "loss": 2.2767, + "step": 23562 + }, + { + "epoch": 0.9968271427362722, + "grad_norm": 0.15917642414569855, + "learning_rate": 0.001, + "loss": 1.8884, + "step": 23563 + }, + { + "epoch": 0.9968694474997885, + "grad_norm": 0.1422685831785202, + "learning_rate": 0.001, + "loss": 1.6789, + "step": 23564 + }, + { + "epoch": 0.9969117522633049, + "grad_norm": 0.9447866082191467, + "learning_rate": 0.001, + "loss": 2.6736, + "step": 23565 + }, + { + "epoch": 0.9969540570268213, + "grad_norm": 0.15833914279937744, + "learning_rate": 0.001, + "loss": 2.3225, + "step": 23566 + }, + { + "epoch": 0.9969963617903376, + "grad_norm": 0.15113773941993713, + "learning_rate": 0.001, + "loss": 2.1877, + "step": 23567 + }, + { + "epoch": 0.997038666553854, + "grad_norm": 0.17661526799201965, + "learning_rate": 0.001, + "loss": 2.0605, + "step": 23568 + }, + { + "epoch": 0.9970809713173704, + "grad_norm": 0.2037663608789444, + "learning_rate": 0.001, + "loss": 2.095, + "step": 23569 + }, + { + "epoch": 0.9971232760808867, + "grad_norm": 0.40272435545921326, + "learning_rate": 0.001, + "loss": 1.8401, + "step": 23570 + }, + { + "epoch": 0.9971655808444031, + "grad_norm": 1.8662751913070679, + "learning_rate": 0.001, + "loss": 3.3036, + "step": 23571 + }, + { + "epoch": 0.9972078856079194, + "grad_norm": 0.14151303470134735, + "learning_rate": 0.001, + "loss": 2.6741, + "step": 23572 + }, + { + "epoch": 0.9972501903714358, + "grad_norm": 0.17152297496795654, + "learning_rate": 0.001, + "loss": 2.5857, + "step": 23573 + }, + { + "epoch": 0.9972924951349522, + "grad_norm": 1.016983151435852, + "learning_rate": 0.001, + "loss": 1.5302, + "step": 23574 + }, + { + "epoch": 0.9973347998984685, + "grad_norm": 0.16691197454929352, + "learning_rate": 0.001, + "loss": 2.3803, + "step": 23575 + }, + { + "epoch": 0.9973771046619849, + "grad_norm": 0.16801585257053375, + "learning_rate": 0.001, + "loss": 1.581, + "step": 23576 + }, + { + "epoch": 0.9974194094255013, + "grad_norm": 0.20410461723804474, + "learning_rate": 0.001, + "loss": 1.7048, + "step": 23577 + }, + { + "epoch": 0.9974617141890176, + "grad_norm": 0.17868375778198242, + "learning_rate": 0.001, + "loss": 2.3364, + "step": 23578 + }, + { + "epoch": 0.997504018952534, + "grad_norm": 0.1568222939968109, + "learning_rate": 0.001, + "loss": 1.8333, + "step": 23579 + }, + { + "epoch": 0.9975463237160505, + "grad_norm": 0.1552361100912094, + "learning_rate": 0.001, + "loss": 1.9154, + "step": 23580 + }, + { + "epoch": 0.9975886284795668, + "grad_norm": 0.15056028962135315, + "learning_rate": 0.001, + "loss": 2.2219, + "step": 23581 + }, + { + "epoch": 0.9976309332430832, + "grad_norm": 0.17635402083396912, + "learning_rate": 0.001, + "loss": 1.6218, + "step": 23582 + }, + { + "epoch": 0.9976732380065996, + "grad_norm": 0.273888498544693, + "learning_rate": 0.001, + "loss": 3.4827, + "step": 23583 + }, + { + "epoch": 0.9977155427701159, + "grad_norm": 0.14739017188549042, + "learning_rate": 0.001, + "loss": 2.1994, + "step": 23584 + }, + { + "epoch": 0.9977578475336323, + "grad_norm": 0.18939383327960968, + "learning_rate": 0.001, + "loss": 2.3165, + "step": 23585 + }, + { + "epoch": 0.9978001522971487, + "grad_norm": 0.15688124299049377, + "learning_rate": 0.001, + "loss": 1.771, + "step": 23586 + }, + { + "epoch": 0.997842457060665, + "grad_norm": 0.15662074089050293, + "learning_rate": 0.001, + "loss": 2.1358, + "step": 23587 + }, + { + "epoch": 0.9978847618241814, + "grad_norm": 0.15011928975582123, + "learning_rate": 0.001, + "loss": 2.084, + "step": 23588 + }, + { + "epoch": 0.9979270665876978, + "grad_norm": 0.1399097889661789, + "learning_rate": 0.001, + "loss": 3.6885, + "step": 23589 + }, + { + "epoch": 0.9979693713512141, + "grad_norm": 0.1603403389453888, + "learning_rate": 0.001, + "loss": 2.0335, + "step": 23590 + }, + { + "epoch": 0.9980116761147305, + "grad_norm": 0.14453408122062683, + "learning_rate": 0.001, + "loss": 2.7993, + "step": 23591 + }, + { + "epoch": 0.9980539808782469, + "grad_norm": 0.2429407685995102, + "learning_rate": 0.001, + "loss": 2.0416, + "step": 23592 + }, + { + "epoch": 0.9980962856417632, + "grad_norm": 0.15473546087741852, + "learning_rate": 0.001, + "loss": 1.5858, + "step": 23593 + }, + { + "epoch": 0.9981385904052796, + "grad_norm": 0.13461685180664062, + "learning_rate": 0.001, + "loss": 2.3416, + "step": 23594 + }, + { + "epoch": 0.998180895168796, + "grad_norm": 0.8460047841072083, + "learning_rate": 0.001, + "loss": 2.1287, + "step": 23595 + }, + { + "epoch": 0.9982231999323123, + "grad_norm": 0.15388406813144684, + "learning_rate": 0.001, + "loss": 1.5546, + "step": 23596 + }, + { + "epoch": 0.9982655046958288, + "grad_norm": 0.15559299290180206, + "learning_rate": 0.001, + "loss": 1.6043, + "step": 23597 + }, + { + "epoch": 0.9983078094593452, + "grad_norm": 0.14206543564796448, + "learning_rate": 0.001, + "loss": 1.5113, + "step": 23598 + }, + { + "epoch": 0.9983501142228615, + "grad_norm": 0.20963889360427856, + "learning_rate": 0.001, + "loss": 1.5628, + "step": 23599 + }, + { + "epoch": 0.9983924189863779, + "grad_norm": 0.20047658681869507, + "learning_rate": 0.001, + "loss": 1.5766, + "step": 23600 + }, + { + "epoch": 0.9984347237498943, + "grad_norm": 0.1295713633298874, + "learning_rate": 0.001, + "loss": 1.6966, + "step": 23601 + }, + { + "epoch": 0.9984770285134106, + "grad_norm": 0.20154249668121338, + "learning_rate": 0.001, + "loss": 1.6119, + "step": 23602 + }, + { + "epoch": 0.998519333276927, + "grad_norm": 0.14119143784046173, + "learning_rate": 0.001, + "loss": 1.534, + "step": 23603 + }, + { + "epoch": 0.9985616380404434, + "grad_norm": 0.15057438611984253, + "learning_rate": 0.001, + "loss": 2.1838, + "step": 23604 + }, + { + "epoch": 0.9986039428039597, + "grad_norm": 0.1763651818037033, + "learning_rate": 0.001, + "loss": 2.8296, + "step": 23605 + }, + { + "epoch": 0.9986462475674761, + "grad_norm": 0.20839767158031464, + "learning_rate": 0.001, + "loss": 2.2355, + "step": 23606 + }, + { + "epoch": 0.9986885523309925, + "grad_norm": 0.20311135053634644, + "learning_rate": 0.001, + "loss": 2.5101, + "step": 23607 + }, + { + "epoch": 0.9987308570945088, + "grad_norm": 0.17892763018608093, + "learning_rate": 0.001, + "loss": 2.4245, + "step": 23608 + }, + { + "epoch": 0.9987731618580252, + "grad_norm": 0.15312178432941437, + "learning_rate": 0.001, + "loss": 2.0084, + "step": 23609 + }, + { + "epoch": 0.9988154666215416, + "grad_norm": 0.15701162815093994, + "learning_rate": 0.001, + "loss": 1.6143, + "step": 23610 + }, + { + "epoch": 0.9988577713850579, + "grad_norm": 0.16364918649196625, + "learning_rate": 0.001, + "loss": 1.4365, + "step": 23611 + }, + { + "epoch": 0.9989000761485743, + "grad_norm": 0.4083903729915619, + "learning_rate": 0.001, + "loss": 1.6857, + "step": 23612 + }, + { + "epoch": 0.9989423809120908, + "grad_norm": 0.13597403466701508, + "learning_rate": 0.001, + "loss": 2.4054, + "step": 23613 + }, + { + "epoch": 0.998984685675607, + "grad_norm": 0.12076670676469803, + "learning_rate": 0.001, + "loss": 1.7387, + "step": 23614 + }, + { + "epoch": 0.9990269904391235, + "grad_norm": 0.20430530607700348, + "learning_rate": 0.001, + "loss": 2.5614, + "step": 23615 + }, + { + "epoch": 0.9990692952026398, + "grad_norm": 0.1506972312927246, + "learning_rate": 0.001, + "loss": 2.3331, + "step": 23616 + }, + { + "epoch": 0.9991115999661562, + "grad_norm": 0.14669986069202423, + "learning_rate": 0.001, + "loss": 1.5766, + "step": 23617 + }, + { + "epoch": 0.9991539047296726, + "grad_norm": 0.15908612310886383, + "learning_rate": 0.001, + "loss": 2.0938, + "step": 23618 + }, + { + "epoch": 0.9991962094931889, + "grad_norm": 0.1240277886390686, + "learning_rate": 0.001, + "loss": 2.1356, + "step": 23619 + }, + { + "epoch": 0.9992385142567053, + "grad_norm": 0.14557062089443207, + "learning_rate": 0.001, + "loss": 2.1815, + "step": 23620 + }, + { + "epoch": 0.9992808190202217, + "grad_norm": 0.16171297430992126, + "learning_rate": 0.001, + "loss": 1.7981, + "step": 23621 + }, + { + "epoch": 0.999323123783738, + "grad_norm": 0.157927006483078, + "learning_rate": 0.001, + "loss": 1.6165, + "step": 23622 + }, + { + "epoch": 0.9993654285472544, + "grad_norm": 0.15406683087348938, + "learning_rate": 0.001, + "loss": 2.1801, + "step": 23623 + }, + { + "epoch": 0.9994077333107708, + "grad_norm": 0.19311675429344177, + "learning_rate": 0.001, + "loss": 1.6993, + "step": 23624 + }, + { + "epoch": 0.9994500380742871, + "grad_norm": 0.24103492498397827, + "learning_rate": 0.001, + "loss": 2.6117, + "step": 23625 + }, + { + "epoch": 0.9994923428378035, + "grad_norm": 0.20797502994537354, + "learning_rate": 0.001, + "loss": 2.2529, + "step": 23626 + }, + { + "epoch": 0.9995346476013199, + "grad_norm": 0.3445920944213867, + "learning_rate": 0.001, + "loss": 2.4338, + "step": 23627 + }, + { + "epoch": 0.9995769523648362, + "grad_norm": 0.1522158980369568, + "learning_rate": 0.001, + "loss": 1.6047, + "step": 23628 + }, + { + "epoch": 0.9996192571283526, + "grad_norm": 1.6727036237716675, + "learning_rate": 0.001, + "loss": 3.4602, + "step": 23629 + }, + { + "epoch": 0.999661561891869, + "grad_norm": 0.14390981197357178, + "learning_rate": 0.001, + "loss": 1.7825, + "step": 23630 + }, + { + "epoch": 0.9997038666553854, + "grad_norm": 0.16790489852428436, + "learning_rate": 0.001, + "loss": 2.0287, + "step": 23631 + }, + { + "epoch": 0.9997461714189018, + "grad_norm": 1.9648386240005493, + "learning_rate": 0.001, + "loss": 1.9811, + "step": 23632 + }, + { + "epoch": 0.9997884761824182, + "grad_norm": 0.17755821347236633, + "learning_rate": 0.001, + "loss": 1.6772, + "step": 23633 + }, + { + "epoch": 0.9998307809459345, + "grad_norm": 0.1543680876493454, + "learning_rate": 0.001, + "loss": 2.2835, + "step": 23634 + }, + { + "epoch": 0.9998730857094509, + "grad_norm": 0.1278516948223114, + "learning_rate": 0.001, + "loss": 2.8439, + "step": 23635 + }, + { + "epoch": 0.9999153904729673, + "grad_norm": 0.15573585033416748, + "learning_rate": 0.001, + "loss": 2.505, + "step": 23636 + }, + { + "epoch": 0.9999576952364836, + "grad_norm": 0.17682018876075745, + "learning_rate": 0.001, + "loss": 1.2282, + "step": 23637 + }, + { + "epoch": 1.0, + "grad_norm": 0.19988803565502167, + "learning_rate": 0.001, + "loss": 1.9946, + "step": 23638 + }, + { + "epoch": 1.0, + "step": 23638, + "total_flos": 1.3698602096404378e+17, + "train_loss": 2.3282218011757556, + "train_runtime": 26726.0005, + "train_samples_per_second": 7.076, + "train_steps_per_second": 0.884 + } + ], + "logging_steps": 1, + "max_steps": 23638, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5910, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3698602096404378e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}