[ { "loss": 36.4684, "grad_norm": 1.4972327947616577, "learning_rate": 9.991248796709548e-05, "epoch": 0.0 }, { "loss": 31.4702, "grad_norm": 1.1359542608261108, "learning_rate": 9.982497593419096e-05, "epoch": 0.01 }, { "loss": 27.0139, "grad_norm": 0.8641749024391174, "learning_rate": 9.973746390128643e-05, "epoch": 0.01 }, { "loss": 25.5819, "grad_norm": 0.7870637774467468, "learning_rate": 9.964995186838191e-05, "epoch": 0.01 }, { "loss": 23.737, "grad_norm": 0.5421485900878906, "learning_rate": 9.956243983547739e-05, "epoch": 0.01 }, { "loss": 22.9208, "grad_norm": 0.6402917504310608, "learning_rate": 9.947492780257286e-05, "epoch": 0.02 }, { "loss": 22.6313, "grad_norm": 4.473743915557861, "learning_rate": 9.938741576966833e-05, "epoch": 0.02 }, { "loss": 21.2445, "grad_norm": 1.5480146408081055, "learning_rate": 9.929990373676381e-05, "epoch": 0.02 }, { "loss": 21.5249, "grad_norm": 1.6474212408065796, "learning_rate": 9.921239170385929e-05, "epoch": 0.02 }, { "loss": 20.416, "grad_norm": 5.127786636352539, "learning_rate": 9.912487967095477e-05, "epoch": 0.03 }, { "loss": 20.4307, "grad_norm": 3.4422194957733154, "learning_rate": 9.903736763805023e-05, "epoch": 0.03 }, { "loss": 20.303, "grad_norm": 4.862687587738037, "learning_rate": 9.894985560514571e-05, "epoch": 0.03 }, { "loss": 18.9257, "grad_norm": 3.3842506408691406, "learning_rate": 9.886234357224119e-05, "epoch": 0.03 }, { "loss": 19.3255, "grad_norm": 2.574763774871826, "learning_rate": 9.877483153933667e-05, "epoch": 0.04 }, { "loss": 18.568, "grad_norm": 3.362725019454956, "learning_rate": 9.868731950643215e-05, "epoch": 0.04 }, { "loss": 19.2249, "grad_norm": 2.039949417114258, "learning_rate": 9.859980747352761e-05, "epoch": 0.04 }, { "loss": 18.892, "grad_norm": 6.239978313446045, "learning_rate": 9.851229544062309e-05, "epoch": 0.04 }, { "loss": 18.1108, "grad_norm": 4.516480445861816, "learning_rate": 9.842478340771857e-05, "epoch": 0.05 }, { "loss": 17.5294, "grad_norm": 2.1653459072113037, "learning_rate": 9.833727137481405e-05, "epoch": 0.05 }, { "loss": 16.4828, "grad_norm": 3.5177245140075684, "learning_rate": 9.824975934190951e-05, "epoch": 0.05 }, { "loss": 17.0135, "grad_norm": 3.044818162918091, "learning_rate": 9.8162247309005e-05, "epoch": 0.06 }, { "loss": 17.2804, "grad_norm": 2.429781198501587, "learning_rate": 9.807473527610047e-05, "epoch": 0.06 }, { "loss": 16.9488, "grad_norm": 4.289863586425781, "learning_rate": 9.798722324319595e-05, "epoch": 0.06 }, { "loss": 16.413, "grad_norm": 2.322678804397583, "learning_rate": 9.789971121029142e-05, "epoch": 0.06 }, { "loss": 15.9682, "grad_norm": 2.0535218715667725, "learning_rate": 9.78121991773869e-05, "epoch": 0.07 }, { "loss": 15.9078, "grad_norm": 1.9909405708312988, "learning_rate": 9.772468714448237e-05, "epoch": 0.07 }, { "loss": 16.3041, "grad_norm": 2.9589896202087402, "learning_rate": 9.763717511157785e-05, "epoch": 0.07 }, { "loss": 15.9803, "grad_norm": 4.50626277923584, "learning_rate": 9.754966307867332e-05, "epoch": 0.07 }, { "loss": 15.3465, "grad_norm": 2.2582998275756836, "learning_rate": 9.74621510457688e-05, "epoch": 0.08 }, { "loss": 15.8187, "grad_norm": 1.8218069076538086, "learning_rate": 9.737463901286428e-05, "epoch": 0.08 }, { "loss": 15.6423, "grad_norm": 3.2282557487487793, "learning_rate": 9.728712697995976e-05, "epoch": 0.08 }, { "loss": 15.2293, "grad_norm": 4.152139663696289, "learning_rate": 9.719961494705523e-05, "epoch": 0.08 }, { "loss": 14.9266, "grad_norm": 2.4058563709259033, "learning_rate": 9.71121029141507e-05, "epoch": 0.09 }, { "loss": 14.9444, "grad_norm": 2.390054941177368, "learning_rate": 9.702459088124618e-05, "epoch": 0.09 }, { "loss": 14.3572, "grad_norm": 2.8402915000915527, "learning_rate": 9.693707884834166e-05, "epoch": 0.09 }, { "loss": 14.2444, "grad_norm": 3.193218469619751, "learning_rate": 9.684956681543714e-05, "epoch": 0.09 }, { "loss": 14.4161, "grad_norm": 3.0809693336486816, "learning_rate": 9.67620547825326e-05, "epoch": 0.1 }, { "loss": 15.3421, "grad_norm": 2.5853307247161865, "learning_rate": 9.667454274962808e-05, "epoch": 0.1 }, { "loss": 14.4311, "grad_norm": 4.195634365081787, "learning_rate": 9.658703071672356e-05, "epoch": 0.1 }, { "loss": 14.6836, "grad_norm": 3.293510913848877, "learning_rate": 9.649951868381904e-05, "epoch": 0.1 }, { "loss": 13.8319, "grad_norm": 2.9272682666778564, "learning_rate": 9.64120066509145e-05, "epoch": 0.11 }, { "loss": 14.566, "grad_norm": 2.871901512145996, "learning_rate": 9.632449461800998e-05, "epoch": 0.11 }, { "loss": 14.1236, "grad_norm": 5.13899564743042, "learning_rate": 9.623698258510546e-05, "epoch": 0.11 }, { "loss": 14.5302, "grad_norm": 2.652040481567383, "learning_rate": 9.614947055220094e-05, "epoch": 0.12 }, { "loss": 13.8852, "grad_norm": 2.9902889728546143, "learning_rate": 9.606195851929642e-05, "epoch": 0.12 }, { "loss": 14.0293, "grad_norm": 2.3652427196502686, "learning_rate": 9.597444648639187e-05, "epoch": 0.12 }, { "loss": 14.2171, "grad_norm": 4.3896484375, "learning_rate": 9.588693445348735e-05, "epoch": 0.12 }, { "loss": 13.8609, "grad_norm": 3.040400505065918, "learning_rate": 9.579942242058283e-05, "epoch": 0.13 }, { "loss": 13.7735, "grad_norm": 4.223784446716309, "learning_rate": 9.571191038767831e-05, "epoch": 0.13 }, { "loss": 13.8871, "grad_norm": 3.0033748149871826, "learning_rate": 9.562439835477379e-05, "epoch": 0.13 }, { "loss": 13.934, "grad_norm": 1.834015965461731, "learning_rate": 9.553688632186925e-05, "epoch": 0.13 }, { "loss": 12.9919, "grad_norm": 4.704802513122559, "learning_rate": 9.544937428896473e-05, "epoch": 0.14 }, { "loss": 12.8476, "grad_norm": 2.105950355529785, "learning_rate": 9.536186225606021e-05, "epoch": 0.14 }, { "loss": 13.0775, "grad_norm": 3.732581615447998, "learning_rate": 9.527435022315569e-05, "epoch": 0.14 }, { "loss": 14.248, "grad_norm": 3.9151251316070557, "learning_rate": 9.518683819025115e-05, "epoch": 0.14 }, { "loss": 13.519, "grad_norm": 2.424039602279663, "learning_rate": 9.509932615734663e-05, "epoch": 0.15 }, { "loss": 12.9035, "grad_norm": 2.8388330936431885, "learning_rate": 9.501181412444211e-05, "epoch": 0.15 }, { "loss": 13.5495, "grad_norm": 4.111719131469727, "learning_rate": 9.492430209153759e-05, "epoch": 0.15 }, { "loss": 13.0792, "grad_norm": 5.089598655700684, "learning_rate": 9.483679005863306e-05, "epoch": 0.15 }, { "loss": 12.7144, "grad_norm": 2.015564203262329, "learning_rate": 9.474927802572854e-05, "epoch": 0.16 }, { "loss": 13.1144, "grad_norm": 1.9832412004470825, "learning_rate": 9.466176599282401e-05, "epoch": 0.16 }, { "loss": 12.9708, "grad_norm": 1.9776819944381714, "learning_rate": 9.457425395991949e-05, "epoch": 0.16 }, { "loss": 13.1545, "grad_norm": 3.082418918609619, "learning_rate": 9.448674192701497e-05, "epoch": 0.17 }, { "loss": 12.8948, "grad_norm": 2.824528217315674, "learning_rate": 9.439922989411044e-05, "epoch": 0.17 }, { "loss": 12.8128, "grad_norm": 3.653470754623413, "learning_rate": 9.431171786120592e-05, "epoch": 0.17 }, { "loss": 12.4615, "grad_norm": 2.4570350646972656, "learning_rate": 9.42242058283014e-05, "epoch": 0.17 }, { "loss": 13.7183, "grad_norm": 1.996759057044983, "learning_rate": 9.413669379539687e-05, "epoch": 0.18 }, { "loss": 13.286, "grad_norm": 2.3849940299987793, "learning_rate": 9.404918176249234e-05, "epoch": 0.18 }, { "loss": 12.6206, "grad_norm": 3.374633550643921, "learning_rate": 9.396166972958782e-05, "epoch": 0.18 }, { "loss": 13.1151, "grad_norm": 4.5953216552734375, "learning_rate": 9.38741576966833e-05, "epoch": 0.18 }, { "loss": 12.6137, "grad_norm": 2.402780532836914, "learning_rate": 9.378664566377878e-05, "epoch": 0.19 }, { "loss": 12.1367, "grad_norm": 5.434263706207275, "learning_rate": 9.369913363087424e-05, "epoch": 0.19 }, { "loss": 12.4959, "grad_norm": 3.73447585105896, "learning_rate": 9.361162159796972e-05, "epoch": 0.19 }, { "loss": 12.1629, "grad_norm": 3.205071449279785, "learning_rate": 9.35241095650652e-05, "epoch": 0.19 }, { "loss": 12.0657, "grad_norm": 4.104920864105225, "learning_rate": 9.343659753216068e-05, "epoch": 0.2 }, { "loss": 11.9909, "grad_norm": 4.132589817047119, "learning_rate": 9.334908549925616e-05, "epoch": 0.2 }, { "loss": 11.9682, "grad_norm": 2.3729248046875, "learning_rate": 9.326157346635162e-05, "epoch": 0.2 }, { "loss": 12.2075, "grad_norm": 3.024388313293457, "learning_rate": 9.31740614334471e-05, "epoch": 0.2 }, { "loss": 11.4629, "grad_norm": 2.923081159591675, "learning_rate": 9.308654940054258e-05, "epoch": 0.21 }, { "loss": 12.6273, "grad_norm": 4.349196434020996, "learning_rate": 9.299903736763806e-05, "epoch": 0.21 }, { "loss": 12.0323, "grad_norm": 3.275175094604492, "learning_rate": 9.291152533473352e-05, "epoch": 0.21 }, { "loss": 12.1019, "grad_norm": 1.8104184865951538, "learning_rate": 9.2824013301829e-05, "epoch": 0.22 }, { "loss": 12.131, "grad_norm": 3.931492567062378, "learning_rate": 9.273650126892448e-05, "epoch": 0.22 }, { "loss": 12.6479, "grad_norm": 4.626213550567627, "learning_rate": 9.264898923601996e-05, "epoch": 0.22 }, { "loss": 12.0639, "grad_norm": 2.5656702518463135, "learning_rate": 9.256147720311543e-05, "epoch": 0.22 }, { "loss": 11.9819, "grad_norm": 3.8051023483276367, "learning_rate": 9.24739651702109e-05, "epoch": 0.23 }, { "loss": 12.7138, "grad_norm": 2.1373887062072754, "learning_rate": 9.238645313730638e-05, "epoch": 0.23 }, { "loss": 12.1889, "grad_norm": 4.774439334869385, "learning_rate": 9.229894110440186e-05, "epoch": 0.23 }, { "loss": 12.3925, "grad_norm": 3.0765390396118164, "learning_rate": 9.221142907149734e-05, "epoch": 0.23 }, { "loss": 11.4008, "grad_norm": 3.136746644973755, "learning_rate": 9.212391703859281e-05, "epoch": 0.24 }, { "loss": 11.8306, "grad_norm": 1.836838722229004, "learning_rate": 9.203640500568829e-05, "epoch": 0.24 }, { "loss": 11.7773, "grad_norm": 3.790940523147583, "learning_rate": 9.194889297278376e-05, "epoch": 0.24 }, { "loss": 11.8051, "grad_norm": 3.1878066062927246, "learning_rate": 9.186138093987924e-05, "epoch": 0.24 }, { "loss": 12.5683, "grad_norm": 3.5691912174224854, "learning_rate": 9.177386890697471e-05, "epoch": 0.25 }, { "loss": 12.0541, "grad_norm": 3.9797616004943848, "learning_rate": 9.168635687407019e-05, "epoch": 0.25 }, { "loss": 11.8673, "grad_norm": 6.183890342712402, "learning_rate": 9.159884484116567e-05, "epoch": 0.25 }, { "loss": 11.3265, "grad_norm": 3.011223316192627, "learning_rate": 9.151133280826115e-05, "epoch": 0.25 }, { "loss": 11.5664, "grad_norm": 2.2235491275787354, "learning_rate": 9.142382077535661e-05, "epoch": 0.26 }, { "loss": 11.5695, "grad_norm": 2.199366807937622, "learning_rate": 9.133630874245209e-05, "epoch": 0.26 }, { "loss": 12.1804, "grad_norm": 2.8299245834350586, "learning_rate": 9.124879670954757e-05, "epoch": 0.26 }, { "loss": 11.4799, "grad_norm": 3.164628744125366, "learning_rate": 9.116128467664305e-05, "epoch": 0.27 }, { "loss": 11.4195, "grad_norm": 4.022547245025635, "learning_rate": 9.107377264373851e-05, "epoch": 0.27 }, { "loss": 11.5764, "grad_norm": 2.569967031478882, "learning_rate": 9.098626061083399e-05, "epoch": 0.27 }, { "loss": 11.4122, "grad_norm": 2.7668631076812744, "learning_rate": 9.089874857792947e-05, "epoch": 0.27 }, { "loss": 11.9738, "grad_norm": 4.17225980758667, "learning_rate": 9.081123654502495e-05, "epoch": 0.28 }, { "loss": 11.2846, "grad_norm": 3.6021440029144287, "learning_rate": 9.072372451212043e-05, "epoch": 0.28 }, { "loss": 11.885, "grad_norm": 5.99414587020874, "learning_rate": 9.06362124792159e-05, "epoch": 0.28 }, { "loss": 11.4829, "grad_norm": 3.0609118938446045, "learning_rate": 9.054870044631137e-05, "epoch": 0.28 }, { "loss": 11.2748, "grad_norm": 3.083606243133545, "learning_rate": 9.046118841340685e-05, "epoch": 0.29 }, { "loss": 10.9556, "grad_norm": 2.1071770191192627, "learning_rate": 9.037367638050233e-05, "epoch": 0.29 }, { "loss": 11.9664, "grad_norm": 3.2089502811431885, "learning_rate": 9.02861643475978e-05, "epoch": 0.29 }, { "loss": 11.0907, "grad_norm": 2.714460611343384, "learning_rate": 9.019865231469327e-05, "epoch": 0.29 }, { "loss": 11.0296, "grad_norm": 4.843391418457031, "learning_rate": 9.011114028178875e-05, "epoch": 0.3 }, { "loss": 10.6882, "grad_norm": 2.8939428329467773, "learning_rate": 9.002362824888423e-05, "epoch": 0.3 }, { "loss": 11.4392, "grad_norm": 5.056521892547607, "learning_rate": 8.99361162159797e-05, "epoch": 0.3 }, { "loss": 11.1842, "grad_norm": 2.7797389030456543, "learning_rate": 8.984860418307518e-05, "epoch": 0.3 }, { "loss": 10.97, "grad_norm": 4.099424839019775, "learning_rate": 8.976109215017066e-05, "epoch": 0.31 }, { "loss": 11.3728, "grad_norm": 3.803455114364624, "learning_rate": 8.967358011726613e-05, "epoch": 0.31 }, { "loss": 11.1566, "grad_norm": 6.033726215362549, "learning_rate": 8.958606808436161e-05, "epoch": 0.31 }, { "loss": 10.635, "grad_norm": 3.339327335357666, "learning_rate": 8.949855605145708e-05, "epoch": 0.31 }, { "loss": 11.2816, "grad_norm": 2.3768680095672607, "learning_rate": 8.941104401855256e-05, "epoch": 0.32 }, { "loss": 10.8236, "grad_norm": 3.4453046321868896, "learning_rate": 8.932353198564804e-05, "epoch": 0.32 }, { "loss": 10.9037, "grad_norm": 3.0895841121673584, "learning_rate": 8.923601995274352e-05, "epoch": 0.32 }, { "loss": 10.4346, "grad_norm": 3.26282000541687, "learning_rate": 8.914850791983898e-05, "epoch": 0.33 }, { "loss": 10.2253, "grad_norm": 3.158858299255371, "learning_rate": 8.906099588693446e-05, "epoch": 0.33 }, { "loss": 10.9327, "grad_norm": 2.569925308227539, "learning_rate": 8.897348385402994e-05, "epoch": 0.33 }, { "loss": 11.1466, "grad_norm": 4.456540107727051, "learning_rate": 8.888597182112542e-05, "epoch": 0.33 }, { "loss": 10.6713, "grad_norm": 2.9973337650299072, "learning_rate": 8.879845978822088e-05, "epoch": 0.34 }, { "loss": 10.6667, "grad_norm": 4.433472156524658, "learning_rate": 8.871094775531636e-05, "epoch": 0.34 }, { "loss": 11.0465, "grad_norm": 3.661515474319458, "learning_rate": 8.862343572241184e-05, "epoch": 0.34 }, { "loss": 10.2861, "grad_norm": 2.8008625507354736, "learning_rate": 8.853592368950732e-05, "epoch": 0.34 }, { "loss": 10.6822, "grad_norm": 3.843266487121582, "learning_rate": 8.84484116566028e-05, "epoch": 0.35 }, { "loss": 10.726, "grad_norm": 3.4649717807769775, "learning_rate": 8.836089962369826e-05, "epoch": 0.35 }, { "loss": 10.6911, "grad_norm": 4.743326187133789, "learning_rate": 8.827338759079374e-05, "epoch": 0.35 }, { "loss": 10.1019, "grad_norm": 2.6317293643951416, "learning_rate": 8.818587555788922e-05, "epoch": 0.35 }, { "loss": 10.4147, "grad_norm": 3.893660306930542, "learning_rate": 8.80983635249847e-05, "epoch": 0.36 }, { "loss": 10.5977, "grad_norm": 2.704558849334717, "learning_rate": 8.801085149208017e-05, "epoch": 0.36 }, { "loss": 10.7126, "grad_norm": 3.4808812141418457, "learning_rate": 8.792333945917563e-05, "epoch": 0.36 }, { "loss": 10.4511, "grad_norm": 2.971688985824585, "learning_rate": 8.783582742627111e-05, "epoch": 0.36 }, { "loss": 10.3621, "grad_norm": 3.7666103839874268, "learning_rate": 8.774831539336659e-05, "epoch": 0.37 }, { "loss": 10.3416, "grad_norm": 2.951805353164673, "learning_rate": 8.766080336046207e-05, "epoch": 0.37 }, { "loss": 10.5537, "grad_norm": 3.5080454349517822, "learning_rate": 8.757329132755753e-05, "epoch": 0.37 }, { "loss": 10.3536, "grad_norm": 3.521519660949707, "learning_rate": 8.748577929465301e-05, "epoch": 0.38 }, { "loss": 10.3231, "grad_norm": 3.646610736846924, "learning_rate": 8.739826726174849e-05, "epoch": 0.38 }, { "loss": 10.6433, "grad_norm": 3.4696707725524902, "learning_rate": 8.731075522884397e-05, "epoch": 0.38 }, { "loss": 10.1692, "grad_norm": 3.852370500564575, "learning_rate": 8.722324319593944e-05, "epoch": 0.38 }, { "loss": 9.7841, "grad_norm": 3.693451404571533, "learning_rate": 8.713573116303491e-05, "epoch": 0.39 }, { "loss": 10.7817, "grad_norm": 3.032994508743286, "learning_rate": 8.704821913013039e-05, "epoch": 0.39 }, { "loss": 10.7416, "grad_norm": 3.537693500518799, "learning_rate": 8.696070709722587e-05, "epoch": 0.39 }, { "loss": 9.9999, "grad_norm": 2.624573230743408, "learning_rate": 8.687319506432135e-05, "epoch": 0.39 }, { "loss": 10.3777, "grad_norm": 2.453648328781128, "learning_rate": 8.678568303141682e-05, "epoch": 0.4 }, { "loss": 10.2463, "grad_norm": 3.5459659099578857, "learning_rate": 8.66981709985123e-05, "epoch": 0.4 }, { "loss": 9.69, "grad_norm": 2.9005305767059326, "learning_rate": 8.661065896560777e-05, "epoch": 0.4 }, { "loss": 10.5555, "grad_norm": 4.305134296417236, "learning_rate": 8.652314693270325e-05, "epoch": 0.4 }, { "loss": 11.4746, "grad_norm": 3.8746566772460938, "learning_rate": 8.643563489979872e-05, "epoch": 0.41 }, { "loss": 10.4112, "grad_norm": 3.0006351470947266, "learning_rate": 8.63481228668942e-05, "epoch": 0.41 }, { "loss": 10.7159, "grad_norm": 3.4273717403411865, "learning_rate": 8.626061083398968e-05, "epoch": 0.41 }, { "loss": 9.8815, "grad_norm": 3.3976597785949707, "learning_rate": 8.617309880108515e-05, "epoch": 0.41 }, { "loss": 10.0256, "grad_norm": 4.364745140075684, "learning_rate": 8.608558676818062e-05, "epoch": 0.42 }, { "loss": 10.2598, "grad_norm": 2.6873209476470947, "learning_rate": 8.59980747352761e-05, "epoch": 0.42 }, { "loss": 10.3362, "grad_norm": 4.00089693069458, "learning_rate": 8.591056270237158e-05, "epoch": 0.42 }, { "loss": 10.2189, "grad_norm": 2.858186721801758, "learning_rate": 8.582305066946706e-05, "epoch": 0.43 }, { "loss": 10.3866, "grad_norm": 3.203000783920288, "learning_rate": 8.573553863656252e-05, "epoch": 0.43 }, { "loss": 10.0813, "grad_norm": 3.210279941558838, "learning_rate": 8.5648026603658e-05, "epoch": 0.43 }, { "loss": 10.5642, "grad_norm": 3.2169432640075684, "learning_rate": 8.556051457075348e-05, "epoch": 0.43 }, { "loss": 9.8901, "grad_norm": 3.107404947280884, "learning_rate": 8.547300253784896e-05, "epoch": 0.44 }, { "loss": 10.1058, "grad_norm": 2.7491989135742188, "learning_rate": 8.538549050494444e-05, "epoch": 0.44 }, { "loss": 10.1777, "grad_norm": 3.140073299407959, "learning_rate": 8.52979784720399e-05, "epoch": 0.44 }, { "loss": 9.4428, "grad_norm": 3.9033658504486084, "learning_rate": 8.521046643913538e-05, "epoch": 0.44 }, { "loss": 10.4304, "grad_norm": 3.4388954639434814, "learning_rate": 8.512295440623086e-05, "epoch": 0.45 }, { "loss": 9.7865, "grad_norm": 2.7577993869781494, "learning_rate": 8.503544237332634e-05, "epoch": 0.45 }, { "loss": 10.5389, "grad_norm": 4.365457534790039, "learning_rate": 8.49479303404218e-05, "epoch": 0.45 }, { "loss": 9.6268, "grad_norm": 4.908252239227295, "learning_rate": 8.486041830751728e-05, "epoch": 0.45 }, { "loss": 9.8142, "grad_norm": 3.5492117404937744, "learning_rate": 8.477290627461276e-05, "epoch": 0.46 }, { "loss": 9.1744, "grad_norm": 3.34104061126709, "learning_rate": 8.468539424170824e-05, "epoch": 0.46 }, { "loss": 9.793, "grad_norm": 5.443964958190918, "learning_rate": 8.459788220880371e-05, "epoch": 0.46 }, { "loss": 9.6955, "grad_norm": 3.092270851135254, "learning_rate": 8.451037017589919e-05, "epoch": 0.46 }, { "loss": 9.7381, "grad_norm": 3.322415828704834, "learning_rate": 8.442285814299467e-05, "epoch": 0.47 }, { "loss": 10.1758, "grad_norm": 3.5836918354034424, "learning_rate": 8.433534611009014e-05, "epoch": 0.47 }, { "loss": 10.0565, "grad_norm": 4.64646053314209, "learning_rate": 8.424783407718562e-05, "epoch": 0.47 }, { "loss": 9.3562, "grad_norm": 2.8691656589508057, "learning_rate": 8.416032204428109e-05, "epoch": 0.48 }, { "loss": 10.1164, "grad_norm": 2.6130857467651367, "learning_rate": 8.407281001137657e-05, "epoch": 0.48 }, { "loss": 9.1353, "grad_norm": 2.950364112854004, "learning_rate": 8.398529797847205e-05, "epoch": 0.48 }, { "loss": 9.2614, "grad_norm": 3.1866071224212646, "learning_rate": 8.389778594556752e-05, "epoch": 0.48 }, { "loss": 9.7335, "grad_norm": 3.584228038787842, "learning_rate": 8.381027391266299e-05, "epoch": 0.49 }, { "loss": 9.7923, "grad_norm": 2.941434860229492, "learning_rate": 8.372276187975847e-05, "epoch": 0.49 }, { "loss": 9.6495, "grad_norm": 3.9578118324279785, "learning_rate": 8.363524984685395e-05, "epoch": 0.49 }, { "loss": 9.7038, "grad_norm": 3.197563648223877, "learning_rate": 8.354773781394943e-05, "epoch": 0.49 }, { "loss": 9.9406, "grad_norm": 3.8146650791168213, "learning_rate": 8.346022578104489e-05, "epoch": 0.5 }, { "loss": 9.8941, "grad_norm": 3.293826103210449, "learning_rate": 8.337271374814037e-05, "epoch": 0.5 }, { "loss": 9.463, "grad_norm": 2.8410701751708984, "learning_rate": 8.328520171523585e-05, "epoch": 0.5 }, { "loss": 9.9774, "grad_norm": 4.301900386810303, "learning_rate": 8.319768968233133e-05, "epoch": 0.5 }, { "loss": 9.8438, "grad_norm": 3.798737049102783, "learning_rate": 8.311017764942681e-05, "epoch": 0.51 }, { "loss": 9.9238, "grad_norm": 3.634910821914673, "learning_rate": 8.302266561652227e-05, "epoch": 0.51 }, { "loss": 9.3031, "grad_norm": 4.557560443878174, "learning_rate": 8.293515358361775e-05, "epoch": 0.51 }, { "loss": 9.7714, "grad_norm": 3.100658893585205, "learning_rate": 8.284764155071323e-05, "epoch": 0.51 }, { "loss": 9.6701, "grad_norm": 3.0376410484313965, "learning_rate": 8.276012951780871e-05, "epoch": 0.52 }, { "loss": 9.7965, "grad_norm": 2.64803147315979, "learning_rate": 8.267261748490418e-05, "epoch": 0.52 }, { "loss": 9.3502, "grad_norm": 3.5259008407592773, "learning_rate": 8.258510545199965e-05, "epoch": 0.52 }, { "loss": 9.3924, "grad_norm": 3.604329824447632, "learning_rate": 8.249759341909513e-05, "epoch": 0.52 }, { "loss": 9.1481, "grad_norm": 2.6112060546875, "learning_rate": 8.241008138619061e-05, "epoch": 0.53 }, { "loss": 10.2096, "grad_norm": 3.718703031539917, "learning_rate": 8.232256935328608e-05, "epoch": 0.53 }, { "loss": 9.0669, "grad_norm": 4.43959903717041, "learning_rate": 8.223505732038156e-05, "epoch": 0.53 }, { "loss": 9.2528, "grad_norm": 3.4342939853668213, "learning_rate": 8.214754528747703e-05, "epoch": 0.54 }, { "loss": 9.4184, "grad_norm": 4.191211700439453, "learning_rate": 8.206003325457251e-05, "epoch": 0.54 }, { "loss": 9.2057, "grad_norm": 3.076712131500244, "learning_rate": 8.197252122166799e-05, "epoch": 0.54 }, { "loss": 9.3174, "grad_norm": 3.668440341949463, "learning_rate": 8.188500918876346e-05, "epoch": 0.54 }, { "loss": 9.8369, "grad_norm": 3.419703483581543, "learning_rate": 8.179749715585894e-05, "epoch": 0.55 }, { "loss": 9.2614, "grad_norm": 4.150201797485352, "learning_rate": 8.170998512295442e-05, "epoch": 0.55 }, { "loss": 9.1372, "grad_norm": 3.2589640617370605, "learning_rate": 8.16224730900499e-05, "epoch": 0.55 }, { "loss": 9.4359, "grad_norm": 3.1012041568756104, "learning_rate": 8.153496105714536e-05, "epoch": 0.55 }, { "loss": 9.6403, "grad_norm": 3.882509708404541, "learning_rate": 8.144744902424084e-05, "epoch": 0.56 }, { "loss": 9.6448, "grad_norm": 2.656543254852295, "learning_rate": 8.135993699133632e-05, "epoch": 0.56 }, { "loss": 9.5709, "grad_norm": 3.2577645778656006, "learning_rate": 8.12724249584318e-05, "epoch": 0.56 }, { "loss": 9.4838, "grad_norm": 2.737210512161255, "learning_rate": 8.118491292552726e-05, "epoch": 0.56 }, { "loss": 9.0991, "grad_norm": 2.2185497283935547, "learning_rate": 8.109740089262274e-05, "epoch": 0.57 }, { "loss": 9.1741, "grad_norm": 2.766544818878174, "learning_rate": 8.100988885971822e-05, "epoch": 0.57 }, { "loss": 9.9664, "grad_norm": 3.627641201019287, "learning_rate": 8.09223768268137e-05, "epoch": 0.57 }, { "loss": 9.3467, "grad_norm": 3.600707769393921, "learning_rate": 8.083486479390916e-05, "epoch": 0.57 }, { "loss": 9.8328, "grad_norm": 5.097866058349609, "learning_rate": 8.074735276100464e-05, "epoch": 0.58 }, { "loss": 9.289, "grad_norm": 3.3913521766662598, "learning_rate": 8.065984072810012e-05, "epoch": 0.58 }, { "loss": 9.2046, "grad_norm": 3.586367130279541, "learning_rate": 8.05723286951956e-05, "epoch": 0.58 }, { "loss": 9.1802, "grad_norm": 5.786179542541504, "learning_rate": 8.048481666229108e-05, "epoch": 0.59 }, { "loss": 9.6482, "grad_norm": 3.158339023590088, "learning_rate": 8.039730462938655e-05, "epoch": 0.59 }, { "loss": 9.0124, "grad_norm": 3.3116583824157715, "learning_rate": 8.030979259648202e-05, "epoch": 0.59 }, { "loss": 8.7543, "grad_norm": 2.555194616317749, "learning_rate": 8.02222805635775e-05, "epoch": 0.59 }, { "loss": 9.043, "grad_norm": 3.2205519676208496, "learning_rate": 8.013476853067298e-05, "epoch": 0.6 }, { "loss": 9.5348, "grad_norm": 3.4175057411193848, "learning_rate": 8.004725649776845e-05, "epoch": 0.6 }, { "loss": 9.177, "grad_norm": 4.694581985473633, "learning_rate": 7.995974446486391e-05, "epoch": 0.6 }, { "loss": 9.2863, "grad_norm": 2.7787346839904785, "learning_rate": 7.987223243195939e-05, "epoch": 0.6 }, { "loss": 8.6984, "grad_norm": 3.4298195838928223, "learning_rate": 7.978472039905487e-05, "epoch": 0.61 }, { "loss": 9.0926, "grad_norm": 4.21417760848999, "learning_rate": 7.969720836615035e-05, "epoch": 0.61 }, { "loss": 8.8851, "grad_norm": 3.0844244956970215, "learning_rate": 7.960969633324581e-05, "epoch": 0.61 }, { "loss": 9.7628, "grad_norm": 3.0156939029693604, "learning_rate": 7.95221843003413e-05, "epoch": 0.61 }, { "loss": 8.6529, "grad_norm": 3.9500784873962402, "learning_rate": 7.943467226743677e-05, "epoch": 0.62 }, { "loss": 9.0502, "grad_norm": 4.802796840667725, "learning_rate": 7.934716023453225e-05, "epoch": 0.62 }, { "loss": 8.6458, "grad_norm": 4.273401260375977, "learning_rate": 7.925964820162772e-05, "epoch": 0.62 }, { "loss": 8.8679, "grad_norm": 4.070954322814941, "learning_rate": 7.91721361687232e-05, "epoch": 0.62 }, { "loss": 8.8355, "grad_norm": 3.3995022773742676, "learning_rate": 7.908462413581867e-05, "epoch": 0.63 }, { "loss": 8.7255, "grad_norm": 2.974888801574707, "learning_rate": 7.899711210291415e-05, "epoch": 0.63 }, { "loss": 9.6256, "grad_norm": 2.521350145339966, "learning_rate": 7.890960007000963e-05, "epoch": 0.63 }, { "loss": 8.9677, "grad_norm": 2.659583330154419, "learning_rate": 7.88220880371051e-05, "epoch": 0.64 }, { "loss": 9.339, "grad_norm": 4.531926155090332, "learning_rate": 7.873457600420058e-05, "epoch": 0.64 }, { "loss": 9.469, "grad_norm": 3.573625087738037, "learning_rate": 7.864706397129606e-05, "epoch": 0.64 }, { "loss": 9.2697, "grad_norm": 3.5155880451202393, "learning_rate": 7.855955193839153e-05, "epoch": 0.64 }, { "loss": 8.5749, "grad_norm": 3.201718330383301, "learning_rate": 7.8472039905487e-05, "epoch": 0.65 }, { "loss": 8.9228, "grad_norm": 3.8670506477355957, "learning_rate": 7.838452787258248e-05, "epoch": 0.65 }, { "loss": 9.5243, "grad_norm": 3.4351415634155273, "learning_rate": 7.829701583967796e-05, "epoch": 0.65 }, { "loss": 8.7689, "grad_norm": 4.182631492614746, "learning_rate": 7.820950380677344e-05, "epoch": 0.65 }, { "loss": 9.2565, "grad_norm": 3.6523499488830566, "learning_rate": 7.81219917738689e-05, "epoch": 0.66 }, { "loss": 8.9147, "grad_norm": 3.6572344303131104, "learning_rate": 7.803447974096438e-05, "epoch": 0.66 }, { "loss": 8.6875, "grad_norm": 4.45376443862915, "learning_rate": 7.794696770805986e-05, "epoch": 0.66 }, { "loss": 9.782, "grad_norm": 4.446099758148193, "learning_rate": 7.785945567515534e-05, "epoch": 0.66 }, { "loss": 9.0401, "grad_norm": 3.134500026702881, "learning_rate": 7.777194364225082e-05, "epoch": 0.67 }, { "loss": 9.3041, "grad_norm": 4.3101325035095215, "learning_rate": 7.768443160934628e-05, "epoch": 0.67 }, { "loss": 8.3818, "grad_norm": 2.935241222381592, "learning_rate": 7.759691957644176e-05, "epoch": 0.67 }, { "loss": 9.3778, "grad_norm": 3.966174364089966, "learning_rate": 7.750940754353724e-05, "epoch": 0.67 }, { "loss": 9.0559, "grad_norm": 3.758314609527588, "learning_rate": 7.742189551063272e-05, "epoch": 0.68 }, { "loss": 8.5828, "grad_norm": 3.2531213760375977, "learning_rate": 7.733438347772818e-05, "epoch": 0.68 }, { "loss": 8.6358, "grad_norm": 3.9096357822418213, "learning_rate": 7.724687144482366e-05, "epoch": 0.68 }, { "loss": 9.0841, "grad_norm": 2.787165641784668, "learning_rate": 7.715935941191914e-05, "epoch": 0.69 }, { "loss": 8.7611, "grad_norm": 3.6336965560913086, "learning_rate": 7.707184737901462e-05, "epoch": 0.69 }, { "loss": 9.3819, "grad_norm": 4.785186290740967, "learning_rate": 7.698433534611009e-05, "epoch": 0.69 }, { "loss": 9.3396, "grad_norm": 3.7301132678985596, "learning_rate": 7.689682331320557e-05, "epoch": 0.69 }, { "loss": 8.7932, "grad_norm": 3.769679307937622, "learning_rate": 7.680931128030104e-05, "epoch": 0.7 }, { "loss": 9.5408, "grad_norm": 3.249382257461548, "learning_rate": 7.672179924739652e-05, "epoch": 0.7 }, { "loss": 9.1383, "grad_norm": 3.562981128692627, "learning_rate": 7.6634287214492e-05, "epoch": 0.7 }, { "loss": 8.5737, "grad_norm": 3.2148962020874023, "learning_rate": 7.654677518158747e-05, "epoch": 0.7 }, { "loss": 8.5483, "grad_norm": 2.9571826457977295, "learning_rate": 7.645926314868295e-05, "epoch": 0.71 }, { "loss": 8.9157, "grad_norm": 3.3202896118164062, "learning_rate": 7.637175111577843e-05, "epoch": 0.71 }, { "loss": 8.9654, "grad_norm": 4.197299957275391, "learning_rate": 7.62842390828739e-05, "epoch": 0.71 }, { "loss": 9.642, "grad_norm": 2.9648005962371826, "learning_rate": 7.619672704996937e-05, "epoch": 0.71 }, { "loss": 9.5579, "grad_norm": 2.793729066848755, "learning_rate": 7.610921501706485e-05, "epoch": 0.72 }, { "loss": 9.0535, "grad_norm": 3.039337158203125, "learning_rate": 7.602170298416033e-05, "epoch": 0.72 }, { "loss": 8.4261, "grad_norm": 3.472973346710205, "learning_rate": 7.59341909512558e-05, "epoch": 0.72 }, { "loss": 8.5207, "grad_norm": 2.588060140609741, "learning_rate": 7.584667891835127e-05, "epoch": 0.72 }, { "loss": 9.4719, "grad_norm": 3.702918529510498, "learning_rate": 7.575916688544675e-05, "epoch": 0.73 }, { "loss": 8.2056, "grad_norm": 3.087986946105957, "learning_rate": 7.567165485254223e-05, "epoch": 0.73 }, { "loss": 8.9777, "grad_norm": 3.231987476348877, "learning_rate": 7.558414281963771e-05, "epoch": 0.73 }, { "loss": 8.946, "grad_norm": 3.1620264053344727, "learning_rate": 7.549663078673317e-05, "epoch": 0.73 }, { "loss": 9.3374, "grad_norm": 3.0438194274902344, "learning_rate": 7.540911875382865e-05, "epoch": 0.74 }, { "loss": 8.4711, "grad_norm": 3.3557493686676025, "learning_rate": 7.532160672092413e-05, "epoch": 0.74 }, { "loss": 8.5348, "grad_norm": 3.693506956100464, "learning_rate": 7.523409468801961e-05, "epoch": 0.74 }, { "loss": 9.2969, "grad_norm": 4.126795291900635, "learning_rate": 7.514658265511509e-05, "epoch": 0.75 }, { "loss": 8.6828, "grad_norm": 3.4798762798309326, "learning_rate": 7.505907062221055e-05, "epoch": 0.75 }, { "loss": 9.885, "grad_norm": 3.5834882259368896, "learning_rate": 7.497155858930603e-05, "epoch": 0.75 }, { "loss": 8.7332, "grad_norm": 3.054962396621704, "learning_rate": 7.488404655640151e-05, "epoch": 0.75 }, { "loss": 8.8859, "grad_norm": 2.3702313899993896, "learning_rate": 7.479653452349699e-05, "epoch": 0.76 }, { "loss": 8.9641, "grad_norm": 3.573233127593994, "learning_rate": 7.470902249059246e-05, "epoch": 0.76 }, { "loss": 9.0286, "grad_norm": 2.7246625423431396, "learning_rate": 7.462151045768794e-05, "epoch": 0.76 }, { "loss": 8.4259, "grad_norm": 3.090899705886841, "learning_rate": 7.453399842478341e-05, "epoch": 0.76 }, { "loss": 8.5746, "grad_norm": 2.8535008430480957, "learning_rate": 7.444648639187889e-05, "epoch": 0.77 }, { "loss": 9.0497, "grad_norm": 3.7636609077453613, "learning_rate": 7.435897435897436e-05, "epoch": 0.77 }, { "loss": 8.7239, "grad_norm": 3.038818597793579, "learning_rate": 7.427146232606984e-05, "epoch": 0.77 }, { "loss": 9.0842, "grad_norm": 3.275329351425171, "learning_rate": 7.418395029316532e-05, "epoch": 0.77 }, { "loss": 8.9054, "grad_norm": 2.4956889152526855, "learning_rate": 7.40964382602608e-05, "epoch": 0.78 }, { "loss": 8.8721, "grad_norm": 2.9423913955688477, "learning_rate": 7.400892622735627e-05, "epoch": 0.78 }, { "loss": 8.9035, "grad_norm": 4.2211785316467285, "learning_rate": 7.392141419445174e-05, "epoch": 0.78 }, { "loss": 8.9983, "grad_norm": 3.3558285236358643, "learning_rate": 7.383390216154722e-05, "epoch": 0.78 }, { "loss": 8.4671, "grad_norm": 3.1967856884002686, "learning_rate": 7.37463901286427e-05, "epoch": 0.79 }, { "loss": 8.4608, "grad_norm": 3.5259337425231934, "learning_rate": 7.365887809573818e-05, "epoch": 0.79 }, { "loss": 9.1684, "grad_norm": 3.226388692855835, "learning_rate": 7.357136606283364e-05, "epoch": 0.79 }, { "loss": 8.4309, "grad_norm": 3.7550246715545654, "learning_rate": 7.348385402992912e-05, "epoch": 0.8 }, { "loss": 8.4427, "grad_norm": 4.338967800140381, "learning_rate": 7.33963419970246e-05, "epoch": 0.8 }, { "loss": 8.9643, "grad_norm": 3.764723777770996, "learning_rate": 7.330882996412008e-05, "epoch": 0.8 }, { "loss": 8.346, "grad_norm": 3.2704851627349854, "learning_rate": 7.322131793121554e-05, "epoch": 0.8 }, { "loss": 9.3154, "grad_norm": 2.8961048126220703, "learning_rate": 7.313380589831102e-05, "epoch": 0.81 }, { "loss": 8.9944, "grad_norm": 3.3732376098632812, "learning_rate": 7.30462938654065e-05, "epoch": 0.81 }, { "loss": 8.4122, "grad_norm": 3.7773525714874268, "learning_rate": 7.295878183250198e-05, "epoch": 0.81 }, { "loss": 9.2683, "grad_norm": 3.716183662414551, "learning_rate": 7.287126979959746e-05, "epoch": 0.81 }, { "loss": 8.6879, "grad_norm": 3.6532111167907715, "learning_rate": 7.278375776669292e-05, "epoch": 0.82 }, { "loss": 8.6653, "grad_norm": 3.4833829402923584, "learning_rate": 7.26962457337884e-05, "epoch": 0.82 }, { "loss": 8.8866, "grad_norm": 4.338618278503418, "learning_rate": 7.260873370088388e-05, "epoch": 0.82 }, { "loss": 8.9239, "grad_norm": 3.3099405765533447, "learning_rate": 7.252122166797936e-05, "epoch": 0.82 }, { "loss": 8.8475, "grad_norm": 3.2691259384155273, "learning_rate": 7.243370963507483e-05, "epoch": 0.83 }, { "loss": 9.0336, "grad_norm": 3.4680464267730713, "learning_rate": 7.23461976021703e-05, "epoch": 0.83 }, { "loss": 9.1274, "grad_norm": 3.6281306743621826, "learning_rate": 7.225868556926578e-05, "epoch": 0.83 }, { "loss": 8.557, "grad_norm": 3.129265546798706, "learning_rate": 7.217117353636126e-05, "epoch": 0.83 }, { "loss": 8.2588, "grad_norm": 2.781096935272217, "learning_rate": 7.208366150345673e-05, "epoch": 0.84 }, { "loss": 8.6785, "grad_norm": 2.085003137588501, "learning_rate": 7.19961494705522e-05, "epoch": 0.84 }, { "loss": 8.2642, "grad_norm": 3.204002618789673, "learning_rate": 7.190863743764767e-05, "epoch": 0.84 }, { "loss": 7.9569, "grad_norm": 2.3955442905426025, "learning_rate": 7.182112540474315e-05, "epoch": 0.85 }, { "loss": 8.3701, "grad_norm": 3.8179776668548584, "learning_rate": 7.173361337183863e-05, "epoch": 0.85 }, { "loss": 8.4552, "grad_norm": 3.572737216949463, "learning_rate": 7.16461013389341e-05, "epoch": 0.85 }, { "loss": 8.5619, "grad_norm": 3.7918508052825928, "learning_rate": 7.155858930602957e-05, "epoch": 0.85 }, { "loss": 8.9817, "grad_norm": 3.5314037799835205, "learning_rate": 7.147107727312505e-05, "epoch": 0.86 }, { "loss": 8.3192, "grad_norm": 3.137615442276001, "learning_rate": 7.138356524022053e-05, "epoch": 0.86 }, { "loss": 9.0373, "grad_norm": 3.58949613571167, "learning_rate": 7.129605320731601e-05, "epoch": 0.86 }, { "loss": 8.5971, "grad_norm": 3.062047243118286, "learning_rate": 7.120854117441148e-05, "epoch": 0.86 }, { "loss": 8.334, "grad_norm": 3.8008644580841064, "learning_rate": 7.112102914150696e-05, "epoch": 0.87 }, { "loss": 8.4934, "grad_norm": 3.4640395641326904, "learning_rate": 7.103351710860243e-05, "epoch": 0.87 }, { "loss": 8.5304, "grad_norm": 3.6595981121063232, "learning_rate": 7.094600507569791e-05, "epoch": 0.87 }, { "loss": 8.6947, "grad_norm": 4.255160331726074, "learning_rate": 7.085849304279338e-05, "epoch": 0.87 }, { "loss": 8.5373, "grad_norm": 2.957233428955078, "learning_rate": 7.077098100988886e-05, "epoch": 0.88 }, { "loss": 8.919, "grad_norm": 4.1049933433532715, "learning_rate": 7.068346897698434e-05, "epoch": 0.88 }, { "loss": 8.6536, "grad_norm": 3.6588120460510254, "learning_rate": 7.059595694407982e-05, "epoch": 0.88 }, { "loss": 8.1786, "grad_norm": 2.536498785018921, "learning_rate": 7.050844491117528e-05, "epoch": 0.88 }, { "loss": 8.9825, "grad_norm": 3.442955255508423, "learning_rate": 7.042093287827076e-05, "epoch": 0.89 }, { "loss": 8.7311, "grad_norm": 2.923522710800171, "learning_rate": 7.033342084536624e-05, "epoch": 0.89 }, { "loss": 8.276, "grad_norm": 3.6458773612976074, "learning_rate": 7.024590881246172e-05, "epoch": 0.89 }, { "loss": 8.5271, "grad_norm": 3.239694356918335, "learning_rate": 7.01583967795572e-05, "epoch": 0.9 }, { "loss": 8.2755, "grad_norm": 3.39280366897583, "learning_rate": 7.007088474665266e-05, "epoch": 0.9 }, { "loss": 9.2045, "grad_norm": 3.4279630184173584, "learning_rate": 6.998337271374814e-05, "epoch": 0.9 }, { "loss": 8.5647, "grad_norm": 2.416999578475952, "learning_rate": 6.989586068084362e-05, "epoch": 0.9 }, { "loss": 8.7283, "grad_norm": 2.8094992637634277, "learning_rate": 6.98083486479391e-05, "epoch": 0.91 }, { "loss": 8.6518, "grad_norm": 4.319655418395996, "learning_rate": 6.972083661503456e-05, "epoch": 0.91 }, { "loss": 8.2852, "grad_norm": 3.9317190647125244, "learning_rate": 6.963332458213004e-05, "epoch": 0.91 }, { "loss": 8.6298, "grad_norm": 4.8585405349731445, "learning_rate": 6.954581254922552e-05, "epoch": 0.91 }, { "loss": 8.2455, "grad_norm": 2.6159684658050537, "learning_rate": 6.9458300516321e-05, "epoch": 0.92 }, { "loss": 8.7473, "grad_norm": 2.344099521636963, "learning_rate": 6.937078848341647e-05, "epoch": 0.92 }, { "loss": 7.9803, "grad_norm": 3.1866703033447266, "learning_rate": 6.928327645051194e-05, "epoch": 0.92 }, { "loss": 8.79, "grad_norm": 3.943319320678711, "learning_rate": 6.919576441760742e-05, "epoch": 0.92 }, { "loss": 8.8112, "grad_norm": 2.919020891189575, "learning_rate": 6.91082523847029e-05, "epoch": 0.93 }, { "loss": 8.5764, "grad_norm": 3.47027325630188, "learning_rate": 6.902074035179837e-05, "epoch": 0.93 }, { "loss": 9.4408, "grad_norm": 3.2260677814483643, "learning_rate": 6.893322831889385e-05, "epoch": 0.93 }, { "loss": 8.9655, "grad_norm": 3.2517478466033936, "learning_rate": 6.884571628598933e-05, "epoch": 0.93 }, { "loss": 8.8759, "grad_norm": 4.705760478973389, "learning_rate": 6.87582042530848e-05, "epoch": 0.94 }, { "loss": 8.8426, "grad_norm": 2.7460803985595703, "learning_rate": 6.867069222018028e-05, "epoch": 0.94 }, { "loss": 8.6733, "grad_norm": 3.944464921951294, "learning_rate": 6.858318018727575e-05, "epoch": 0.94 }, { "loss": 8.56, "grad_norm": 3.393721342086792, "learning_rate": 6.849566815437123e-05, "epoch": 0.94 }, { "loss": 7.8694, "grad_norm": 2.579340696334839, "learning_rate": 6.84081561214667e-05, "epoch": 0.95 }, { "loss": 8.2998, "grad_norm": 3.6678457260131836, "learning_rate": 6.832064408856219e-05, "epoch": 0.95 }, { "loss": 8.5947, "grad_norm": 3.218284845352173, "learning_rate": 6.823313205565765e-05, "epoch": 0.95 }, { "loss": 8.5014, "grad_norm": 3.5185766220092773, "learning_rate": 6.814562002275313e-05, "epoch": 0.96 }, { "loss": 9.1655, "grad_norm": 3.5601882934570312, "learning_rate": 6.805810798984861e-05, "epoch": 0.96 }, { "loss": 8.6889, "grad_norm": 3.317361354827881, "learning_rate": 6.797059595694409e-05, "epoch": 0.96 }, { "loss": 8.9274, "grad_norm": 3.271773338317871, "learning_rate": 6.788308392403955e-05, "epoch": 0.96 }, { "loss": 8.733, "grad_norm": 3.0022764205932617, "learning_rate": 6.779557189113503e-05, "epoch": 0.97 }, { "loss": 8.5665, "grad_norm": 3.5991992950439453, "learning_rate": 6.770805985823051e-05, "epoch": 0.97 }, { "loss": 8.3149, "grad_norm": 3.060124158859253, "learning_rate": 6.762054782532599e-05, "epoch": 0.97 }, { "loss": 7.9419, "grad_norm": 3.116497278213501, "learning_rate": 6.753303579242147e-05, "epoch": 0.97 }, { "loss": 8.456, "grad_norm": 3.201129198074341, "learning_rate": 6.744552375951693e-05, "epoch": 0.98 }, { "loss": 8.5843, "grad_norm": 3.7871932983398438, "learning_rate": 6.735801172661241e-05, "epoch": 0.98 }, { "loss": 8.1042, "grad_norm": 3.8025078773498535, "learning_rate": 6.727049969370789e-05, "epoch": 0.98 }, { "loss": 9.0008, "grad_norm": 2.9040756225585938, "learning_rate": 6.718298766080337e-05, "epoch": 0.98 }, { "loss": 7.971, "grad_norm": 5.227065086364746, "learning_rate": 6.709547562789884e-05, "epoch": 0.99 }, { "loss": 9.1628, "grad_norm": 2.822517156600952, "learning_rate": 6.700796359499431e-05, "epoch": 0.99 }, { "loss": 7.7244, "grad_norm": 2.9904367923736572, "learning_rate": 6.69204515620898e-05, "epoch": 0.99 }, { "loss": 8.0893, "grad_norm": 4.167274475097656, "learning_rate": 6.683293952918527e-05, "epoch": 0.99 }, { "loss": 8.3331, "grad_norm": 3.8134043216705322, "learning_rate": 6.674542749628074e-05, "epoch": 1.0 }, { "loss": 7.8118, "grad_norm": 3.0692172050476074, "learning_rate": 6.665791546337622e-05, "epoch": 1.0 }, { "loss": 8.4811, "grad_norm": 4.14231014251709, "learning_rate": 6.65704034304717e-05, "epoch": 1.0 }, { "loss": 8.0494, "grad_norm": 3.4583489894866943, "learning_rate": 6.648289139756717e-05, "epoch": 1.01 }, { "loss": 8.2668, "grad_norm": 3.468843460083008, "learning_rate": 6.639537936466265e-05, "epoch": 1.01 }, { "loss": 8.384, "grad_norm": 3.0200271606445312, "learning_rate": 6.630786733175812e-05, "epoch": 1.01 }, { "loss": 8.5672, "grad_norm": 3.9895946979522705, "learning_rate": 6.62203552988536e-05, "epoch": 1.01 }, { "loss": 8.4798, "grad_norm": 2.92266583442688, "learning_rate": 6.613284326594908e-05, "epoch": 1.02 }, { "loss": 8.749, "grad_norm": 3.8905258178710938, "learning_rate": 6.604533123304456e-05, "epoch": 1.02 }, { "loss": 7.8761, "grad_norm": 3.545311212539673, "learning_rate": 6.595781920014002e-05, "epoch": 1.02 }, { "loss": 8.3273, "grad_norm": 2.925837516784668, "learning_rate": 6.58703071672355e-05, "epoch": 1.02 }, { "loss": 8.3426, "grad_norm": 3.527435064315796, "learning_rate": 6.578279513433098e-05, "epoch": 1.03 }, { "loss": 7.5962, "grad_norm": 2.926382064819336, "learning_rate": 6.569528310142646e-05, "epoch": 1.03 }, { "loss": 8.2975, "grad_norm": 3.4969446659088135, "learning_rate": 6.560777106852192e-05, "epoch": 1.03 }, { "loss": 8.4417, "grad_norm": 3.466707229614258, "learning_rate": 6.55202590356174e-05, "epoch": 1.03 }, { "loss": 8.1161, "grad_norm": 4.119028091430664, "learning_rate": 6.543274700271288e-05, "epoch": 1.04 }, { "loss": 8.0248, "grad_norm": 3.2728042602539062, "learning_rate": 6.534523496980836e-05, "epoch": 1.04 }, { "loss": 8.0926, "grad_norm": 2.8251736164093018, "learning_rate": 6.525772293690382e-05, "epoch": 1.04 }, { "loss": 8.6109, "grad_norm": 3.521144151687622, "learning_rate": 6.51702109039993e-05, "epoch": 1.04 }, { "loss": 8.6268, "grad_norm": 3.15901780128479, "learning_rate": 6.508269887109478e-05, "epoch": 1.05 }, { "loss": 8.2584, "grad_norm": 3.5901992321014404, "learning_rate": 6.499518683819026e-05, "epoch": 1.05 }, { "loss": 8.5285, "grad_norm": 4.662459850311279, "learning_rate": 6.490767480528574e-05, "epoch": 1.05 }, { "loss": 7.8559, "grad_norm": 2.72666597366333, "learning_rate": 6.48201627723812e-05, "epoch": 1.06 }, { "loss": 8.145, "grad_norm": 3.6170144081115723, "learning_rate": 6.473265073947668e-05, "epoch": 1.06 }, { "loss": 7.838, "grad_norm": 2.9118199348449707, "learning_rate": 6.464513870657216e-05, "epoch": 1.06 }, { "loss": 8.4171, "grad_norm": 3.7052972316741943, "learning_rate": 6.455762667366764e-05, "epoch": 1.06 }, { "loss": 8.2865, "grad_norm": 4.498712062835693, "learning_rate": 6.447011464076311e-05, "epoch": 1.07 }, { "loss": 8.6456, "grad_norm": 3.1900229454040527, "learning_rate": 6.438260260785859e-05, "epoch": 1.07 }, { "loss": 8.1772, "grad_norm": 4.92230224609375, "learning_rate": 6.429509057495407e-05, "epoch": 1.07 }, { "loss": 9.221, "grad_norm": 3.758399724960327, "learning_rate": 6.420757854204954e-05, "epoch": 1.07 }, { "loss": 8.8449, "grad_norm": 3.110145092010498, "learning_rate": 6.412006650914501e-05, "epoch": 1.08 }, { "loss": 8.5491, "grad_norm": 3.1985270977020264, "learning_rate": 6.403255447624049e-05, "epoch": 1.08 }, { "loss": 8.0487, "grad_norm": 4.918299674987793, "learning_rate": 6.394504244333595e-05, "epoch": 1.08 }, { "loss": 8.6751, "grad_norm": 3.328449010848999, "learning_rate": 6.385753041043143e-05, "epoch": 1.08 }, { "loss": 8.0822, "grad_norm": 2.8385417461395264, "learning_rate": 6.377001837752691e-05, "epoch": 1.09 }, { "loss": 8.0838, "grad_norm": 3.5397825241088867, "learning_rate": 6.368250634462238e-05, "epoch": 1.09 }, { "loss": 8.3657, "grad_norm": 3.8638100624084473, "learning_rate": 6.359499431171786e-05, "epoch": 1.09 }, { "loss": 8.5573, "grad_norm": 3.129281759262085, "learning_rate": 6.350748227881333e-05, "epoch": 1.09 }, { "loss": 8.424, "grad_norm": 4.496127605438232, "learning_rate": 6.341997024590881e-05, "epoch": 1.1 }, { "loss": 8.4377, "grad_norm": 5.132551670074463, "learning_rate": 6.333245821300429e-05, "epoch": 1.1 }, { "loss": 8.1084, "grad_norm": 2.994011402130127, "learning_rate": 6.324494618009976e-05, "epoch": 1.1 }, { "loss": 8.0609, "grad_norm": 3.976611375808716, "learning_rate": 6.315743414719524e-05, "epoch": 1.11 }, { "loss": 8.0137, "grad_norm": 4.869803428649902, "learning_rate": 6.306992211429072e-05, "epoch": 1.11 }, { "loss": 8.2885, "grad_norm": 3.4231982231140137, "learning_rate": 6.29824100813862e-05, "epoch": 1.11 }, { "loss": 8.2444, "grad_norm": 5.1861252784729, "learning_rate": 6.289489804848166e-05, "epoch": 1.11 }, { "loss": 7.7026, "grad_norm": 4.288048267364502, "learning_rate": 6.280738601557714e-05, "epoch": 1.12 }, { "loss": 7.9181, "grad_norm": 3.64320969581604, "learning_rate": 6.271987398267262e-05, "epoch": 1.12 }, { "loss": 8.2069, "grad_norm": 4.117647647857666, "learning_rate": 6.26323619497681e-05, "epoch": 1.12 }, { "loss": 8.505, "grad_norm": 3.528850793838501, "learning_rate": 6.254484991686356e-05, "epoch": 1.12 }, { "loss": 8.1734, "grad_norm": 2.9336414337158203, "learning_rate": 6.245733788395904e-05, "epoch": 1.13 }, { "loss": 7.9808, "grad_norm": 4.607523441314697, "learning_rate": 6.236982585105452e-05, "epoch": 1.13 }, { "loss": 8.2398, "grad_norm": 5.0170464515686035, "learning_rate": 6.228231381815e-05, "epoch": 1.13 }, { "loss": 8.3433, "grad_norm": 3.7535080909729004, "learning_rate": 6.219480178524548e-05, "epoch": 1.13 }, { "loss": 7.8144, "grad_norm": 3.409480333328247, "learning_rate": 6.210728975234094e-05, "epoch": 1.14 }, { "loss": 8.5735, "grad_norm": 3.7058238983154297, "learning_rate": 6.201977771943642e-05, "epoch": 1.14 }, { "loss": 8.4169, "grad_norm": 3.1820621490478516, "learning_rate": 6.19322656865319e-05, "epoch": 1.14 }, { "loss": 7.9047, "grad_norm": 2.8989903926849365, "learning_rate": 6.184475365362738e-05, "epoch": 1.14 }, { "loss": 8.1377, "grad_norm": 3.903512477874756, "learning_rate": 6.175724162072284e-05, "epoch": 1.15 }, { "loss": 8.6631, "grad_norm": 2.916041374206543, "learning_rate": 6.166972958781832e-05, "epoch": 1.15 }, { "loss": 8.2377, "grad_norm": 5.932418346405029, "learning_rate": 6.15822175549138e-05, "epoch": 1.15 }, { "loss": 8.9961, "grad_norm": 4.3133368492126465, "learning_rate": 6.149470552200928e-05, "epoch": 1.15 }, { "loss": 8.0339, "grad_norm": 3.1874802112579346, "learning_rate": 6.140719348910475e-05, "epoch": 1.16 }, { "loss": 7.3347, "grad_norm": 3.9368977546691895, "learning_rate": 6.131968145620023e-05, "epoch": 1.16 }, { "loss": 7.723, "grad_norm": 3.000967025756836, "learning_rate": 6.12321694232957e-05, "epoch": 1.16 }, { "loss": 8.3258, "grad_norm": 3.96174693107605, "learning_rate": 6.114465739039118e-05, "epoch": 1.17 }, { "loss": 8.4037, "grad_norm": 3.6735053062438965, "learning_rate": 6.105714535748666e-05, "epoch": 1.17 }, { "loss": 8.2432, "grad_norm": 4.765099048614502, "learning_rate": 6.0969633324582134e-05, "epoch": 1.17 }, { "loss": 8.232, "grad_norm": 3.118907928466797, "learning_rate": 6.0882121291677607e-05, "epoch": 1.17 }, { "loss": 8.4367, "grad_norm": 2.7283709049224854, "learning_rate": 6.0794609258773085e-05, "epoch": 1.18 }, { "loss": 8.062, "grad_norm": 2.874713182449341, "learning_rate": 6.070709722586856e-05, "epoch": 1.18 }, { "loss": 8.2398, "grad_norm": 3.3554372787475586, "learning_rate": 6.0619585192964036e-05, "epoch": 1.18 }, { "loss": 8.0336, "grad_norm": 3.0796210765838623, "learning_rate": 6.053207316005951e-05, "epoch": 1.18 }, { "loss": 8.6318, "grad_norm": 4.566615581512451, "learning_rate": 6.044456112715499e-05, "epoch": 1.19 }, { "loss": 7.8031, "grad_norm": 2.9705634117126465, "learning_rate": 6.035704909425046e-05, "epoch": 1.19 }, { "loss": 8.2148, "grad_norm": 2.8812005519866943, "learning_rate": 6.026953706134594e-05, "epoch": 1.19 }, { "loss": 7.8284, "grad_norm": 3.389988899230957, "learning_rate": 6.018202502844141e-05, "epoch": 1.19 }, { "loss": 7.9364, "grad_norm": 4.094693660736084, "learning_rate": 6.009451299553689e-05, "epoch": 1.2 }, { "loss": 8.1086, "grad_norm": 2.9179611206054688, "learning_rate": 6.000700096263236e-05, "epoch": 1.2 }, { "loss": 8.5533, "grad_norm": 3.0995657444000244, "learning_rate": 5.991948892972784e-05, "epoch": 1.2 }, { "loss": 8.1136, "grad_norm": 4.079578399658203, "learning_rate": 5.983197689682332e-05, "epoch": 1.2 }, { "loss": 8.379, "grad_norm": 3.150442600250244, "learning_rate": 5.974446486391879e-05, "epoch": 1.21 }, { "loss": 7.6851, "grad_norm": 3.904902458190918, "learning_rate": 5.965695283101427e-05, "epoch": 1.21 }, { "loss": 8.5392, "grad_norm": 2.8424036502838135, "learning_rate": 5.956944079810974e-05, "epoch": 1.21 }, { "loss": 7.9675, "grad_norm": 5.174964904785156, "learning_rate": 5.948192876520522e-05, "epoch": 1.22 }, { "loss": 7.8097, "grad_norm": 3.6166417598724365, "learning_rate": 5.9394416732300694e-05, "epoch": 1.22 }, { "loss": 7.981, "grad_norm": 3.1801164150238037, "learning_rate": 5.930690469939617e-05, "epoch": 1.22 }, { "loss": 8.2149, "grad_norm": 3.975576400756836, "learning_rate": 5.9219392666491645e-05, "epoch": 1.22 }, { "loss": 8.5473, "grad_norm": 4.039759159088135, "learning_rate": 5.9131880633587123e-05, "epoch": 1.23 }, { "loss": 8.2659, "grad_norm": 4.9490861892700195, "learning_rate": 5.9044368600682596e-05, "epoch": 1.23 }, { "loss": 8.0904, "grad_norm": 4.2978715896606445, "learning_rate": 5.8956856567778074e-05, "epoch": 1.23 }, { "loss": 8.0685, "grad_norm": 3.037668466567993, "learning_rate": 5.8869344534873547e-05, "epoch": 1.23 }, { "loss": 7.7553, "grad_norm": 3.496307849884033, "learning_rate": 5.8781832501969025e-05, "epoch": 1.24 }, { "loss": 8.5136, "grad_norm": 3.175560712814331, "learning_rate": 5.8694320469064504e-05, "epoch": 1.24 }, { "loss": 7.8844, "grad_norm": 3.7230336666107178, "learning_rate": 5.8606808436159976e-05, "epoch": 1.24 }, { "loss": 8.2197, "grad_norm": 4.2161359786987305, "learning_rate": 5.8519296403255455e-05, "epoch": 1.24 }, { "loss": 8.2073, "grad_norm": 4.122830867767334, "learning_rate": 5.843178437035093e-05, "epoch": 1.25 }, { "loss": 7.9876, "grad_norm": 4.076393127441406, "learning_rate": 5.8344272337446406e-05, "epoch": 1.25 }, { "loss": 7.7404, "grad_norm": 4.162572860717773, "learning_rate": 5.825676030454188e-05, "epoch": 1.25 }, { "loss": 7.9991, "grad_norm": 3.4230401515960693, "learning_rate": 5.816924827163736e-05, "epoch": 1.25 }, { "loss": 8.0793, "grad_norm": 4.0326313972473145, "learning_rate": 5.808173623873283e-05, "epoch": 1.26 }, { "loss": 7.3234, "grad_norm": 2.669609308242798, "learning_rate": 5.799422420582831e-05, "epoch": 1.26 }, { "loss": 8.4489, "grad_norm": 3.926006555557251, "learning_rate": 5.790671217292378e-05, "epoch": 1.26 }, { "loss": 8.2047, "grad_norm": 2.7743630409240723, "learning_rate": 5.781920014001926e-05, "epoch": 1.27 }, { "loss": 7.6027, "grad_norm": 3.9714138507843018, "learning_rate": 5.773168810711473e-05, "epoch": 1.27 }, { "loss": 7.5097, "grad_norm": 3.775052070617676, "learning_rate": 5.764417607421021e-05, "epoch": 1.27 }, { "loss": 8.1056, "grad_norm": 4.158542633056641, "learning_rate": 5.755666404130568e-05, "epoch": 1.27 }, { "loss": 8.2198, "grad_norm": 3.647034168243408, "learning_rate": 5.746915200840116e-05, "epoch": 1.28 }, { "loss": 8.5833, "grad_norm": 3.3458187580108643, "learning_rate": 5.738163997549664e-05, "epoch": 1.28 }, { "loss": 7.8895, "grad_norm": 3.5432512760162354, "learning_rate": 5.729412794259211e-05, "epoch": 1.28 }, { "loss": 7.9663, "grad_norm": 3.1249189376831055, "learning_rate": 5.720661590968759e-05, "epoch": 1.28 }, { "loss": 7.6285, "grad_norm": 2.8004276752471924, "learning_rate": 5.7119103876783063e-05, "epoch": 1.29 }, { "loss": 8.2953, "grad_norm": 3.2479677200317383, "learning_rate": 5.703159184387854e-05, "epoch": 1.29 }, { "loss": 8.0079, "grad_norm": 3.8008508682250977, "learning_rate": 5.6944079810974014e-05, "epoch": 1.29 }, { "loss": 7.8035, "grad_norm": 3.2461721897125244, "learning_rate": 5.685656777806949e-05, "epoch": 1.29 }, { "loss": 8.6322, "grad_norm": 3.8512370586395264, "learning_rate": 5.6769055745164965e-05, "epoch": 1.3 }, { "loss": 8.7099, "grad_norm": 3.9859845638275146, "learning_rate": 5.6681543712260444e-05, "epoch": 1.3 }, { "loss": 7.9929, "grad_norm": 3.455918550491333, "learning_rate": 5.6594031679355916e-05, "epoch": 1.3 }, { "loss": 7.9348, "grad_norm": 3.744387626647949, "learning_rate": 5.6506519646451395e-05, "epoch": 1.3 }, { "loss": 7.9457, "grad_norm": 5.055604934692383, "learning_rate": 5.641900761354687e-05, "epoch": 1.31 }, { "loss": 8.2247, "grad_norm": 3.072326183319092, "learning_rate": 5.6331495580642346e-05, "epoch": 1.31 }, { "loss": 7.7124, "grad_norm": 4.150148868560791, "learning_rate": 5.624398354773782e-05, "epoch": 1.31 }, { "loss": 8.1994, "grad_norm": 5.2460503578186035, "learning_rate": 5.61564715148333e-05, "epoch": 1.32 }, { "loss": 8.1123, "grad_norm": 3.8343966007232666, "learning_rate": 5.6068959481928776e-05, "epoch": 1.32 }, { "loss": 7.9948, "grad_norm": 3.488602638244629, "learning_rate": 5.5981447449024235e-05, "epoch": 1.32 }, { "loss": 7.745, "grad_norm": 2.6132748126983643, "learning_rate": 5.5893935416119714e-05, "epoch": 1.32 }, { "loss": 8.4735, "grad_norm": 3.123828172683716, "learning_rate": 5.580642338321519e-05, "epoch": 1.33 }, { "loss": 7.5695, "grad_norm": 3.747915506362915, "learning_rate": 5.5718911350310665e-05, "epoch": 1.33 }, { "loss": 8.3462, "grad_norm": 4.172099590301514, "learning_rate": 5.5631399317406144e-05, "epoch": 1.33 }, { "loss": 7.639, "grad_norm": 3.172137498855591, "learning_rate": 5.5543887284501616e-05, "epoch": 1.33 }, { "loss": 7.9555, "grad_norm": 4.053969383239746, "learning_rate": 5.5456375251597095e-05, "epoch": 1.34 }, { "loss": 7.2866, "grad_norm": 3.186673164367676, "learning_rate": 5.536886321869257e-05, "epoch": 1.34 }, { "loss": 7.7438, "grad_norm": 3.2737646102905273, "learning_rate": 5.5281351185788046e-05, "epoch": 1.34 }, { "loss": 7.7122, "grad_norm": 3.1801161766052246, "learning_rate": 5.519383915288352e-05, "epoch": 1.34 }, { "loss": 7.6629, "grad_norm": 3.773719072341919, "learning_rate": 5.5106327119979e-05, "epoch": 1.35 }, { "loss": 8.2615, "grad_norm": 4.548736095428467, "learning_rate": 5.501881508707447e-05, "epoch": 1.35 }, { "loss": 8.0535, "grad_norm": 3.921649694442749, "learning_rate": 5.493130305416995e-05, "epoch": 1.35 }, { "loss": 7.4305, "grad_norm": 4.346540451049805, "learning_rate": 5.484379102126542e-05, "epoch": 1.35 }, { "loss": 8.423, "grad_norm": 4.634354114532471, "learning_rate": 5.47562789883609e-05, "epoch": 1.36 }, { "loss": 7.8966, "grad_norm": 3.5531675815582275, "learning_rate": 5.466876695545637e-05, "epoch": 1.36 }, { "loss": 7.7126, "grad_norm": 4.377911567687988, "learning_rate": 5.458125492255185e-05, "epoch": 1.36 }, { "loss": 8.2728, "grad_norm": 3.366030216217041, "learning_rate": 5.449374288964733e-05, "epoch": 1.36 }, { "loss": 8.1801, "grad_norm": 3.4603772163391113, "learning_rate": 5.44062308567428e-05, "epoch": 1.37 }, { "loss": 8.2143, "grad_norm": 4.528195381164551, "learning_rate": 5.431871882383828e-05, "epoch": 1.37 }, { "loss": 7.359, "grad_norm": 2.4803977012634277, "learning_rate": 5.423120679093375e-05, "epoch": 1.37 }, { "loss": 8.1361, "grad_norm": 4.201333999633789, "learning_rate": 5.414369475802923e-05, "epoch": 1.38 }, { "loss": 7.8974, "grad_norm": 4.272532939910889, "learning_rate": 5.40561827251247e-05, "epoch": 1.38 }, { "loss": 7.502, "grad_norm": 3.4006450176239014, "learning_rate": 5.396867069222018e-05, "epoch": 1.38 }, { "loss": 7.78, "grad_norm": 3.902611255645752, "learning_rate": 5.3881158659315654e-05, "epoch": 1.38 }, { "loss": 8.0953, "grad_norm": 2.6970345973968506, "learning_rate": 5.379364662641113e-05, "epoch": 1.39 }, { "loss": 7.7703, "grad_norm": 3.610957145690918, "learning_rate": 5.3706134593506605e-05, "epoch": 1.39 }, { "loss": 8.7352, "grad_norm": 4.159451961517334, "learning_rate": 5.3618622560602084e-05, "epoch": 1.39 }, { "loss": 7.5264, "grad_norm": 2.7696640491485596, "learning_rate": 5.3531110527697556e-05, "epoch": 1.39 }, { "loss": 7.776, "grad_norm": 5.263556003570557, "learning_rate": 5.3443598494793035e-05, "epoch": 1.4 }, { "loss": 7.4893, "grad_norm": 3.3409626483917236, "learning_rate": 5.3356086461888514e-05, "epoch": 1.4 }, { "loss": 7.9241, "grad_norm": 5.305122375488281, "learning_rate": 5.3268574428983986e-05, "epoch": 1.4 }, { "loss": 7.9385, "grad_norm": 4.231367588043213, "learning_rate": 5.3181062396079465e-05, "epoch": 1.4 }, { "loss": 8.1538, "grad_norm": 3.7227704524993896, "learning_rate": 5.309355036317494e-05, "epoch": 1.41 }, { "loss": 7.5085, "grad_norm": 3.6258912086486816, "learning_rate": 5.3006038330270416e-05, "epoch": 1.41 }, { "loss": 7.6825, "grad_norm": 3.3270792961120605, "learning_rate": 5.291852629736589e-05, "epoch": 1.41 }, { "loss": 7.8874, "grad_norm": 2.983099937438965, "learning_rate": 5.2831014264461367e-05, "epoch": 1.41 }, { "loss": 8.0616, "grad_norm": 3.8440752029418945, "learning_rate": 5.274350223155684e-05, "epoch": 1.42 }, { "loss": 7.7685, "grad_norm": 5.8492608070373535, "learning_rate": 5.265599019865232e-05, "epoch": 1.42 }, { "loss": 7.5809, "grad_norm": 3.308460235595703, "learning_rate": 5.256847816574779e-05, "epoch": 1.42 }, { "loss": 8.7381, "grad_norm": 3.017559766769409, "learning_rate": 5.248096613284327e-05, "epoch": 1.43 }, { "loss": 7.9422, "grad_norm": 4.227987766265869, "learning_rate": 5.239345409993874e-05, "epoch": 1.43 }, { "loss": 7.9202, "grad_norm": 3.1066997051239014, "learning_rate": 5.230594206703422e-05, "epoch": 1.43 }, { "loss": 7.7366, "grad_norm": 3.3069207668304443, "learning_rate": 5.221843003412969e-05, "epoch": 1.43 }, { "loss": 8.3042, "grad_norm": 3.065303087234497, "learning_rate": 5.213091800122517e-05, "epoch": 1.44 }, { "loss": 8.2732, "grad_norm": 3.6093387603759766, "learning_rate": 5.204340596832065e-05, "epoch": 1.44 }, { "loss": 7.1332, "grad_norm": 4.356596946716309, "learning_rate": 5.195589393541612e-05, "epoch": 1.44 }, { "loss": 7.9349, "grad_norm": 4.91728401184082, "learning_rate": 5.18683819025116e-05, "epoch": 1.44 }, { "loss": 8.0487, "grad_norm": 4.411836624145508, "learning_rate": 5.178086986960707e-05, "epoch": 1.45 }, { "loss": 7.7545, "grad_norm": 3.488790512084961, "learning_rate": 5.169335783670255e-05, "epoch": 1.45 }, { "loss": 7.7508, "grad_norm": 5.54533576965332, "learning_rate": 5.1605845803798024e-05, "epoch": 1.45 }, { "loss": 7.8403, "grad_norm": 2.8527212142944336, "learning_rate": 5.15183337708935e-05, "epoch": 1.45 }, { "loss": 8.1535, "grad_norm": 3.892737865447998, "learning_rate": 5.1430821737988975e-05, "epoch": 1.46 }, { "loss": 7.6217, "grad_norm": 3.9077818393707275, "learning_rate": 5.1343309705084454e-05, "epoch": 1.46 }, { "loss": 7.6306, "grad_norm": 3.9363648891448975, "learning_rate": 5.1255797672179926e-05, "epoch": 1.46 }, { "loss": 8.5425, "grad_norm": 4.692113399505615, "learning_rate": 5.1168285639275405e-05, "epoch": 1.46 }, { "loss": 7.7507, "grad_norm": 5.601973056793213, "learning_rate": 5.108077360637088e-05, "epoch": 1.47 }, { "loss": 7.8534, "grad_norm": 3.5403494834899902, "learning_rate": 5.0993261573466356e-05, "epoch": 1.47 }, { "loss": 7.9312, "grad_norm": 4.555025100708008, "learning_rate": 5.0905749540561834e-05, "epoch": 1.47 }, { "loss": 7.624, "grad_norm": 5.721600532531738, "learning_rate": 5.0818237507657307e-05, "epoch": 1.48 }, { "loss": 7.9349, "grad_norm": 3.4647514820098877, "learning_rate": 5.0730725474752785e-05, "epoch": 1.48 }, { "loss": 7.5857, "grad_norm": 3.362941026687622, "learning_rate": 5.064321344184826e-05, "epoch": 1.48 }, { "loss": 8.3246, "grad_norm": 5.352531433105469, "learning_rate": 5.0555701408943736e-05, "epoch": 1.48 }, { "loss": 7.8738, "grad_norm": 3.2162294387817383, "learning_rate": 5.046818937603921e-05, "epoch": 1.49 }, { "loss": 8.0784, "grad_norm": 3.607652187347412, "learning_rate": 5.038067734313469e-05, "epoch": 1.49 }, { "loss": 8.0385, "grad_norm": 3.6921122074127197, "learning_rate": 5.029316531023016e-05, "epoch": 1.49 }, { "loss": 7.9548, "grad_norm": 5.187925338745117, "learning_rate": 5.020565327732564e-05, "epoch": 1.49 }, { "loss": 7.9085, "grad_norm": 4.099059581756592, "learning_rate": 5.011814124442111e-05, "epoch": 1.5 }, { "loss": 7.7387, "grad_norm": 3.0415878295898438, "learning_rate": 5.003062921151659e-05, "epoch": 1.5 }, { "loss": 8.5766, "grad_norm": 4.777284622192383, "learning_rate": 4.994311717861206e-05, "epoch": 1.5 }, { "loss": 7.697, "grad_norm": 5.438363075256348, "learning_rate": 4.9855605145707534e-05, "epoch": 1.5 }, { "loss": 8.4595, "grad_norm": 5.054925441741943, "learning_rate": 4.976809311280301e-05, "epoch": 1.51 }, { "loss": 8.3592, "grad_norm": 2.9228146076202393, "learning_rate": 4.9680581079898485e-05, "epoch": 1.51 }, { "loss": 7.7911, "grad_norm": 4.529871940612793, "learning_rate": 4.9593069046993964e-05, "epoch": 1.51 }, { "loss": 8.0316, "grad_norm": 3.9995975494384766, "learning_rate": 4.9505557014089436e-05, "epoch": 1.51 }, { "loss": 7.5984, "grad_norm": 3.9212229251861572, "learning_rate": 4.9418044981184915e-05, "epoch": 1.52 }, { "loss": 8.1606, "grad_norm": 3.479395866394043, "learning_rate": 4.933053294828039e-05, "epoch": 1.52 }, { "loss": 7.7609, "grad_norm": 3.5287656784057617, "learning_rate": 4.9243020915375866e-05, "epoch": 1.52 }, { "loss": 8.2939, "grad_norm": 3.2169201374053955, "learning_rate": 4.915550888247134e-05, "epoch": 1.53 }, { "loss": 7.8161, "grad_norm": 4.046046733856201, "learning_rate": 4.906799684956682e-05, "epoch": 1.53 }, { "loss": 8.1367, "grad_norm": 3.9905033111572266, "learning_rate": 4.898048481666229e-05, "epoch": 1.53 }, { "loss": 7.522, "grad_norm": 3.0949547290802, "learning_rate": 4.889297278375777e-05, "epoch": 1.53 }, { "loss": 7.8009, "grad_norm": 3.2042038440704346, "learning_rate": 4.8805460750853247e-05, "epoch": 1.54 }, { "loss": 7.9891, "grad_norm": 3.542771100997925, "learning_rate": 4.871794871794872e-05, "epoch": 1.54 }, { "loss": 8.1601, "grad_norm": 4.720103740692139, "learning_rate": 4.86304366850442e-05, "epoch": 1.54 }, { "loss": 7.5776, "grad_norm": 3.65787672996521, "learning_rate": 4.854292465213967e-05, "epoch": 1.54 }, { "loss": 7.4047, "grad_norm": 3.9372549057006836, "learning_rate": 4.845541261923515e-05, "epoch": 1.55 }, { "loss": 8.0766, "grad_norm": 3.362112045288086, "learning_rate": 4.836790058633062e-05, "epoch": 1.55 }, { "loss": 7.8371, "grad_norm": 5.547123908996582, "learning_rate": 4.82803885534261e-05, "epoch": 1.55 }, { "loss": 8.0129, "grad_norm": 4.756041526794434, "learning_rate": 4.819287652052157e-05, "epoch": 1.55 }, { "loss": 7.8304, "grad_norm": 3.8089821338653564, "learning_rate": 4.810536448761705e-05, "epoch": 1.56 }, { "loss": 7.7565, "grad_norm": 3.8562700748443604, "learning_rate": 4.801785245471252e-05, "epoch": 1.56 }, { "loss": 7.4297, "grad_norm": 4.8232831954956055, "learning_rate": 4.7930340421808e-05, "epoch": 1.56 }, { "loss": 7.4436, "grad_norm": 4.951693058013916, "learning_rate": 4.7842828388903474e-05, "epoch": 1.56 }, { "loss": 7.9573, "grad_norm": 3.800071954727173, "learning_rate": 4.775531635599895e-05, "epoch": 1.57 }, { "loss": 7.4168, "grad_norm": 4.224662780761719, "learning_rate": 4.766780432309443e-05, "epoch": 1.57 }, { "loss": 8.4404, "grad_norm": 3.3358187675476074, "learning_rate": 4.7580292290189904e-05, "epoch": 1.57 }, { "loss": 7.4616, "grad_norm": 4.352634906768799, "learning_rate": 4.749278025728538e-05, "epoch": 1.57 }, { "loss": 7.8744, "grad_norm": 3.5693962574005127, "learning_rate": 4.7405268224380855e-05, "epoch": 1.58 }, { "loss": 7.5451, "grad_norm": 3.5086276531219482, "learning_rate": 4.7317756191476334e-05, "epoch": 1.58 }, { "loss": 7.4284, "grad_norm": 3.0168793201446533, "learning_rate": 4.7230244158571806e-05, "epoch": 1.58 }, { "loss": 7.7376, "grad_norm": 4.352570056915283, "learning_rate": 4.7142732125667285e-05, "epoch": 1.59 }, { "loss": 7.64, "grad_norm": 5.351820468902588, "learning_rate": 4.705522009276276e-05, "epoch": 1.59 }, { "loss": 7.9318, "grad_norm": 3.993790626525879, "learning_rate": 4.6967708059858236e-05, "epoch": 1.59 }, { "loss": 7.7602, "grad_norm": 3.1628670692443848, "learning_rate": 4.688019602695371e-05, "epoch": 1.59 }, { "loss": 7.2848, "grad_norm": 2.481705665588379, "learning_rate": 4.679268399404919e-05, "epoch": 1.6 }, { "loss": 7.8586, "grad_norm": 3.944296360015869, "learning_rate": 4.670517196114466e-05, "epoch": 1.6 }, { "loss": 7.7223, "grad_norm": 4.099398136138916, "learning_rate": 4.661765992824014e-05, "epoch": 1.6 }, { "loss": 7.6033, "grad_norm": 2.781362533569336, "learning_rate": 4.653014789533561e-05, "epoch": 1.6 }, { "loss": 8.4421, "grad_norm": 4.035131454467773, "learning_rate": 4.644263586243109e-05, "epoch": 1.61 }, { "loss": 7.6095, "grad_norm": 3.3464620113372803, "learning_rate": 4.635512382952657e-05, "epoch": 1.61 }, { "loss": 7.8892, "grad_norm": 4.8561553955078125, "learning_rate": 4.626761179662204e-05, "epoch": 1.61 }, { "loss": 7.7162, "grad_norm": 6.795714378356934, "learning_rate": 4.618009976371752e-05, "epoch": 1.61 }, { "loss": 7.7384, "grad_norm": 3.0965943336486816, "learning_rate": 4.609258773081299e-05, "epoch": 1.62 }, { "loss": 7.9766, "grad_norm": 3.1002793312072754, "learning_rate": 4.600507569790847e-05, "epoch": 1.62 }, { "loss": 7.4392, "grad_norm": 6.083471298217773, "learning_rate": 4.5917563665003935e-05, "epoch": 1.62 }, { "loss": 7.5822, "grad_norm": 4.11601448059082, "learning_rate": 4.5830051632099414e-05, "epoch": 1.62 }, { "loss": 7.5988, "grad_norm": 4.361574172973633, "learning_rate": 4.5742539599194886e-05, "epoch": 1.63 }, { "loss": 8.1273, "grad_norm": 4.6307549476623535, "learning_rate": 4.5655027566290365e-05, "epoch": 1.63 }, { "loss": 7.3601, "grad_norm": 3.4341373443603516, "learning_rate": 4.5567515533385844e-05, "epoch": 1.63 }, { "loss": 7.9766, "grad_norm": 3.7583069801330566, "learning_rate": 4.5480003500481316e-05, "epoch": 1.64 }, { "loss": 7.8755, "grad_norm": 3.212942123413086, "learning_rate": 4.5392491467576795e-05, "epoch": 1.64 }, { "loss": 7.8139, "grad_norm": 2.9877207279205322, "learning_rate": 4.530497943467227e-05, "epoch": 1.64 }, { "loss": 7.8379, "grad_norm": 4.133498191833496, "learning_rate": 4.5217467401767746e-05, "epoch": 1.64 }, { "loss": 7.9657, "grad_norm": 3.252624273300171, "learning_rate": 4.512995536886322e-05, "epoch": 1.65 }, { "loss": 7.7005, "grad_norm": 3.70926833152771, "learning_rate": 4.50424433359587e-05, "epoch": 1.65 }, { "loss": 7.6194, "grad_norm": 4.198193073272705, "learning_rate": 4.495493130305417e-05, "epoch": 1.65 }, { "loss": 8.1874, "grad_norm": 3.5660247802734375, "learning_rate": 4.486741927014965e-05, "epoch": 1.65 }, { "loss": 8.0731, "grad_norm": 3.6867547035217285, "learning_rate": 4.477990723724512e-05, "epoch": 1.66 }, { "loss": 7.4702, "grad_norm": 3.8409180641174316, "learning_rate": 4.46923952043406e-05, "epoch": 1.66 }, { "loss": 8.1333, "grad_norm": 3.7179150581359863, "learning_rate": 4.460488317143607e-05, "epoch": 1.66 }, { "loss": 7.4353, "grad_norm": 4.092810153961182, "learning_rate": 4.451737113853155e-05, "epoch": 1.66 }, { "loss": 8.1888, "grad_norm": 4.3642754554748535, "learning_rate": 4.442985910562702e-05, "epoch": 1.67 }, { "loss": 8.1823, "grad_norm": 3.4664993286132812, "learning_rate": 4.43423470727225e-05, "epoch": 1.67 }, { "loss": 7.6325, "grad_norm": 4.143255710601807, "learning_rate": 4.425483503981798e-05, "epoch": 1.67 }, { "loss": 7.9794, "grad_norm": 3.8068184852600098, "learning_rate": 4.416732300691345e-05, "epoch": 1.67 }, { "loss": 7.5482, "grad_norm": 3.6255953311920166, "learning_rate": 4.407981097400893e-05, "epoch": 1.68 }, { "loss": 7.1437, "grad_norm": 4.526164531707764, "learning_rate": 4.39922989411044e-05, "epoch": 1.68 }, { "loss": 7.3931, "grad_norm": 3.652649402618408, "learning_rate": 4.390478690819988e-05, "epoch": 1.68 }, { "loss": 7.3862, "grad_norm": 4.751399993896484, "learning_rate": 4.3817274875295354e-05, "epoch": 1.69 }, { "loss": 7.8723, "grad_norm": 3.011975049972534, "learning_rate": 4.372976284239083e-05, "epoch": 1.69 }, { "loss": 8.0483, "grad_norm": 4.407155513763428, "learning_rate": 4.3642250809486305e-05, "epoch": 1.69 }, { "loss": 7.6125, "grad_norm": 3.762749195098877, "learning_rate": 4.3554738776581784e-05, "epoch": 1.69 }, { "loss": 7.7699, "grad_norm": 5.391783714294434, "learning_rate": 4.3467226743677256e-05, "epoch": 1.7 }, { "loss": 7.6641, "grad_norm": 3.509794235229492, "learning_rate": 4.3379714710772735e-05, "epoch": 1.7 }, { "loss": 8.4195, "grad_norm": 4.34732723236084, "learning_rate": 4.329220267786821e-05, "epoch": 1.7 }, { "loss": 7.6044, "grad_norm": 4.418550491333008, "learning_rate": 4.3204690644963686e-05, "epoch": 1.7 }, { "loss": 7.8304, "grad_norm": 3.9914748668670654, "learning_rate": 4.3117178612059165e-05, "epoch": 1.71 }, { "loss": 7.4516, "grad_norm": 4.141488075256348, "learning_rate": 4.302966657915464e-05, "epoch": 1.71 }, { "loss": 7.7451, "grad_norm": 3.61734938621521, "learning_rate": 4.2942154546250116e-05, "epoch": 1.71 }, { "loss": 8.0466, "grad_norm": 3.956249475479126, "learning_rate": 4.285464251334559e-05, "epoch": 1.71 }, { "loss": 7.5964, "grad_norm": 3.214452028274536, "learning_rate": 4.276713048044107e-05, "epoch": 1.72 }, { "loss": 7.7055, "grad_norm": 3.8038113117218018, "learning_rate": 4.267961844753654e-05, "epoch": 1.72 }, { "loss": 7.9609, "grad_norm": 4.2961626052856445, "learning_rate": 4.259210641463202e-05, "epoch": 1.72 }, { "loss": 7.7958, "grad_norm": 2.900935649871826, "learning_rate": 4.250459438172749e-05, "epoch": 1.72 }, { "loss": 7.7953, "grad_norm": 3.369781970977783, "learning_rate": 4.241708234882297e-05, "epoch": 1.73 }, { "loss": 7.5888, "grad_norm": 6.093942642211914, "learning_rate": 4.232957031591844e-05, "epoch": 1.73 }, { "loss": 8.1158, "grad_norm": 4.063805103302002, "learning_rate": 4.224205828301392e-05, "epoch": 1.73 }, { "loss": 7.6127, "grad_norm": 3.981023073196411, "learning_rate": 4.215454625010939e-05, "epoch": 1.74 }, { "loss": 7.3727, "grad_norm": 3.273742437362671, "learning_rate": 4.206703421720487e-05, "epoch": 1.74 }, { "loss": 7.7617, "grad_norm": 4.247544765472412, "learning_rate": 4.197952218430034e-05, "epoch": 1.74 }, { "loss": 8.2201, "grad_norm": 5.181518077850342, "learning_rate": 4.1892010151395815e-05, "epoch": 1.74 }, { "loss": 8.0504, "grad_norm": 3.4994397163391113, "learning_rate": 4.1804498118491294e-05, "epoch": 1.75 }, { "loss": 8.2552, "grad_norm": 4.784666061401367, "learning_rate": 4.1716986085586766e-05, "epoch": 1.75 }, { "loss": 7.746, "grad_norm": 4.549380779266357, "learning_rate": 4.1629474052682245e-05, "epoch": 1.75 }, { "loss": 7.6646, "grad_norm": 3.586853504180908, "learning_rate": 4.154196201977772e-05, "epoch": 1.75 }, { "loss": 7.454, "grad_norm": 4.0881500244140625, "learning_rate": 4.1454449986873196e-05, "epoch": 1.76 }, { "loss": 7.6951, "grad_norm": 3.7725558280944824, "learning_rate": 4.136693795396867e-05, "epoch": 1.76 }, { "loss": 7.7406, "grad_norm": 4.566652297973633, "learning_rate": 4.127942592106415e-05, "epoch": 1.76 }, { "loss": 8.0458, "grad_norm": 4.562892913818359, "learning_rate": 4.119191388815962e-05, "epoch": 1.76 }, { "loss": 8.314, "grad_norm": 3.1217896938323975, "learning_rate": 4.11044018552551e-05, "epoch": 1.77 }, { "loss": 7.931, "grad_norm": 3.4693222045898438, "learning_rate": 4.101688982235058e-05, "epoch": 1.77 }, { "loss": 7.6126, "grad_norm": 3.7778282165527344, "learning_rate": 4.092937778944605e-05, "epoch": 1.77 }, { "loss": 7.4983, "grad_norm": 6.494439125061035, "learning_rate": 4.084186575654153e-05, "epoch": 1.77 }, { "loss": 7.801, "grad_norm": 3.602264165878296, "learning_rate": 4.0754353723637e-05, "epoch": 1.78 }, { "loss": 7.405, "grad_norm": 4.2882795333862305, "learning_rate": 4.066684169073248e-05, "epoch": 1.78 }, { "loss": 7.5115, "grad_norm": 4.935623645782471, "learning_rate": 4.057932965782795e-05, "epoch": 1.78 }, { "loss": 7.5315, "grad_norm": 5.16713809967041, "learning_rate": 4.049181762492343e-05, "epoch": 1.78 }, { "loss": 7.8313, "grad_norm": 3.440279960632324, "learning_rate": 4.04043055920189e-05, "epoch": 1.79 }, { "loss": 7.7669, "grad_norm": 4.02671480178833, "learning_rate": 4.031679355911438e-05, "epoch": 1.79 }, { "loss": 7.6988, "grad_norm": 5.945104598999023, "learning_rate": 4.022928152620985e-05, "epoch": 1.79 }, { "loss": 7.2314, "grad_norm": 4.557019233703613, "learning_rate": 4.014176949330533e-05, "epoch": 1.8 }, { "loss": 7.5578, "grad_norm": 3.9793171882629395, "learning_rate": 4.0054257460400804e-05, "epoch": 1.8 }, { "loss": 7.1794, "grad_norm": 3.178558349609375, "learning_rate": 3.996674542749628e-05, "epoch": 1.8 }, { "loss": 7.842, "grad_norm": 4.609609127044678, "learning_rate": 3.987923339459176e-05, "epoch": 1.8 }, { "loss": 7.4484, "grad_norm": 3.5374889373779297, "learning_rate": 3.9791721361687234e-05, "epoch": 1.81 }, { "loss": 7.2917, "grad_norm": 4.768485069274902, "learning_rate": 3.970420932878271e-05, "epoch": 1.81 }, { "loss": 7.4525, "grad_norm": 3.342456102371216, "learning_rate": 3.9616697295878185e-05, "epoch": 1.81 }, { "loss": 7.6611, "grad_norm": 4.111917018890381, "learning_rate": 3.9529185262973664e-05, "epoch": 1.81 }, { "loss": 7.9292, "grad_norm": 5.008895397186279, "learning_rate": 3.9441673230069136e-05, "epoch": 1.82 }, { "loss": 7.9246, "grad_norm": 4.372122287750244, "learning_rate": 3.9354161197164615e-05, "epoch": 1.82 }, { "loss": 7.6795, "grad_norm": 3.406059503555298, "learning_rate": 3.926664916426009e-05, "epoch": 1.82 }, { "loss": 7.5926, "grad_norm": 4.412403583526611, "learning_rate": 3.9179137131355566e-05, "epoch": 1.82 }, { "loss": 8.0002, "grad_norm": 4.203276634216309, "learning_rate": 3.909162509845104e-05, "epoch": 1.83 }, { "loss": 7.8556, "grad_norm": 3.7347216606140137, "learning_rate": 3.900411306554652e-05, "epoch": 1.83 }, { "loss": 8.2028, "grad_norm": 4.552736282348633, "learning_rate": 3.891660103264199e-05, "epoch": 1.83 }, { "loss": 8.1277, "grad_norm": 4.882839679718018, "learning_rate": 3.882908899973747e-05, "epoch": 1.83 }, { "loss": 7.74, "grad_norm": 4.639001846313477, "learning_rate": 3.875032817012339e-05, "epoch": 1.84 }, { "loss": 8.0942, "grad_norm": 5.097876071929932, "learning_rate": 3.866281613721887e-05, "epoch": 1.84 }, { "loss": 7.9073, "grad_norm": 3.200108051300049, "learning_rate": 3.8575304104314344e-05, "epoch": 1.84 }, { "loss": 7.5499, "grad_norm": 3.8395094871520996, "learning_rate": 3.848779207140982e-05, "epoch": 1.85 }, { "loss": 7.5298, "grad_norm": 3.6033782958984375, "learning_rate": 3.8400280038505295e-05, "epoch": 1.85 }, { "loss": 7.3608, "grad_norm": 4.341715335845947, "learning_rate": 3.8312768005600774e-05, "epoch": 1.85 }, { "loss": 8.2837, "grad_norm": 2.746906042098999, "learning_rate": 3.822525597269625e-05, "epoch": 1.85 }, { "loss": 7.0553, "grad_norm": 4.10823392868042, "learning_rate": 3.8137743939791725e-05, "epoch": 1.86 }, { "loss": 7.4852, "grad_norm": 3.21799635887146, "learning_rate": 3.8050231906887204e-05, "epoch": 1.86 }, { "loss": 7.4002, "grad_norm": 4.537161827087402, "learning_rate": 3.796271987398267e-05, "epoch": 1.86 }, { "loss": 7.4377, "grad_norm": 4.020664691925049, "learning_rate": 3.787520784107815e-05, "epoch": 1.86 }, { "loss": 7.6523, "grad_norm": 3.1800293922424316, "learning_rate": 3.778769580817362e-05, "epoch": 1.87 }, { "loss": 7.4824, "grad_norm": 3.2757511138916016, "learning_rate": 3.77001837752691e-05, "epoch": 1.87 }, { "loss": 7.8269, "grad_norm": 3.6784262657165527, "learning_rate": 3.761267174236457e-05, "epoch": 1.87 }, { "loss": 7.9757, "grad_norm": 3.4948902130126953, "learning_rate": 3.752515970946005e-05, "epoch": 1.87 }, { "loss": 7.8251, "grad_norm": 5.0971598625183105, "learning_rate": 3.743764767655553e-05, "epoch": 1.88 }, { "loss": 7.7561, "grad_norm": 4.533854961395264, "learning_rate": 3.7350135643651e-05, "epoch": 1.88 }, { "loss": 7.8986, "grad_norm": 4.550451278686523, "learning_rate": 3.726262361074648e-05, "epoch": 1.88 }, { "loss": 7.2438, "grad_norm": 3.8077099323272705, "learning_rate": 3.717511157784195e-05, "epoch": 1.88 }, { "loss": 7.7242, "grad_norm": 5.2727203369140625, "learning_rate": 3.708759954493743e-05, "epoch": 1.89 }, { "loss": 7.8602, "grad_norm": 2.9006500244140625, "learning_rate": 3.7000087512032903e-05, "epoch": 1.89 }, { "loss": 7.0401, "grad_norm": 4.919744491577148, "learning_rate": 3.691257547912838e-05, "epoch": 1.89 }, { "loss": 7.5799, "grad_norm": 3.297295093536377, "learning_rate": 3.6825063446223854e-05, "epoch": 1.9 }, { "loss": 7.355, "grad_norm": 2.9851813316345215, "learning_rate": 3.673755141331933e-05, "epoch": 1.9 }, { "loss": 7.715, "grad_norm": 3.619997262954712, "learning_rate": 3.6650039380414805e-05, "epoch": 1.9 }, { "loss": 7.8093, "grad_norm": 4.266133785247803, "learning_rate": 3.6562527347510284e-05, "epoch": 1.9 }, { "loss": 7.5723, "grad_norm": 3.513849973678589, "learning_rate": 3.6475015314605756e-05, "epoch": 1.91 }, { "loss": 7.006, "grad_norm": 3.6736350059509277, "learning_rate": 3.6387503281701235e-05, "epoch": 1.91 }, { "loss": 7.8925, "grad_norm": 3.4943020343780518, "learning_rate": 3.629999124879671e-05, "epoch": 1.91 }, { "loss": 7.3886, "grad_norm": 5.898230075836182, "learning_rate": 3.6212479215892186e-05, "epoch": 1.91 }, { "loss": 7.9521, "grad_norm": 3.2569427490234375, "learning_rate": 3.6124967182987665e-05, "epoch": 1.92 }, { "loss": 7.1573, "grad_norm": 6.18344259262085, "learning_rate": 3.603745515008314e-05, "epoch": 1.92 }, { "loss": 7.3595, "grad_norm": 6.704586982727051, "learning_rate": 3.5949943117178616e-05, "epoch": 1.92 }, { "loss": 8.1131, "grad_norm": 3.768490791320801, "learning_rate": 3.586243108427409e-05, "epoch": 1.92 }, { "loss": 7.8278, "grad_norm": 4.432671070098877, "learning_rate": 3.577491905136957e-05, "epoch": 1.93 }, { "loss": 7.4794, "grad_norm": 3.835556745529175, "learning_rate": 3.568740701846504e-05, "epoch": 1.93 }, { "loss": 7.5471, "grad_norm": 5.500497817993164, "learning_rate": 3.559989498556052e-05, "epoch": 1.93 }, { "loss": 7.9875, "grad_norm": 4.727583408355713, "learning_rate": 3.551238295265599e-05, "epoch": 1.93 }, { "loss": 6.9965, "grad_norm": 5.54524040222168, "learning_rate": 3.542487091975147e-05, "epoch": 1.94 }, { "loss": 7.6091, "grad_norm": 3.945673942565918, "learning_rate": 3.533735888684694e-05, "epoch": 1.94 }, { "loss": 7.6881, "grad_norm": 3.220522880554199, "learning_rate": 3.524984685394242e-05, "epoch": 1.94 }, { "loss": 7.5224, "grad_norm": 5.061761856079102, "learning_rate": 3.516233482103789e-05, "epoch": 1.95 }, { "loss": 7.2034, "grad_norm": 4.419524192810059, "learning_rate": 3.507482278813337e-05, "epoch": 1.95 }, { "loss": 7.8167, "grad_norm": 4.390359878540039, "learning_rate": 3.498731075522885e-05, "epoch": 1.95 }, { "loss": 7.9913, "grad_norm": 3.729773998260498, "learning_rate": 3.489979872232432e-05, "epoch": 1.95 }, { "loss": 7.6947, "grad_norm": 4.854176044464111, "learning_rate": 3.48122866894198e-05, "epoch": 1.96 }, { "loss": 7.7003, "grad_norm": 3.3899290561676025, "learning_rate": 3.472477465651527e-05, "epoch": 1.96 }, { "loss": 7.6489, "grad_norm": 4.47396993637085, "learning_rate": 3.463726262361075e-05, "epoch": 1.96 }, { "loss": 7.8908, "grad_norm": 3.3266396522521973, "learning_rate": 3.4549750590706224e-05, "epoch": 1.96 }, { "loss": 7.4745, "grad_norm": 4.091291904449463, "learning_rate": 3.44622385578017e-05, "epoch": 1.97 }, { "loss": 7.5307, "grad_norm": 7.771108627319336, "learning_rate": 3.4374726524897175e-05, "epoch": 1.97 }, { "loss": 7.7252, "grad_norm": 4.1433305740356445, "learning_rate": 3.4287214491992654e-05, "epoch": 1.97 }, { "loss": 8.1189, "grad_norm": 3.5036942958831787, "learning_rate": 3.4199702459088126e-05, "epoch": 1.97 }, { "loss": 7.6836, "grad_norm": 4.437150478363037, "learning_rate": 3.4112190426183605e-05, "epoch": 1.98 }, { "loss": 7.8543, "grad_norm": 6.440913200378418, "learning_rate": 3.402467839327908e-05, "epoch": 1.98 }, { "loss": 7.8673, "grad_norm": 4.657886981964111, "learning_rate": 3.393716636037455e-05, "epoch": 1.98 }, { "loss": 8.4281, "grad_norm": 4.122070789337158, "learning_rate": 3.384965432747003e-05, "epoch": 1.98 }, { "loss": 7.1948, "grad_norm": 3.2325737476348877, "learning_rate": 3.37621422945655e-05, "epoch": 1.99 }, { "loss": 7.3229, "grad_norm": 3.874630928039551, "learning_rate": 3.367463026166098e-05, "epoch": 1.99 }, { "loss": 7.5072, "grad_norm": 4.308450222015381, "learning_rate": 3.358711822875645e-05, "epoch": 1.99 }, { "loss": 7.5892, "grad_norm": 3.9709150791168213, "learning_rate": 3.349960619585193e-05, "epoch": 1.99 }, { "loss": 7.9792, "grad_norm": 6.5298919677734375, "learning_rate": 3.34120941629474e-05, "epoch": 2.0 }, { "loss": 7.688, "grad_norm": 4.508563041687012, "learning_rate": 3.332458213004288e-05, "epoch": 2.0 }, { "loss": 7.1915, "grad_norm": 3.5211637020111084, "learning_rate": 3.3237070097138354e-05, "epoch": 2.0 }, { "loss": 7.4623, "grad_norm": 4.973934173583984, "learning_rate": 3.314955806423383e-05, "epoch": 2.01 }, { "loss": 7.4097, "grad_norm": 4.810267448425293, "learning_rate": 3.3062046031329305e-05, "epoch": 2.01 }, { "loss": 7.5439, "grad_norm": 3.942003011703491, "learning_rate": 3.2974533998424783e-05, "epoch": 2.01 }, { "loss": 7.2904, "grad_norm": 3.7207398414611816, "learning_rate": 3.288702196552026e-05, "epoch": 2.01 }, { "loss": 7.0411, "grad_norm": 3.197200298309326, "learning_rate": 3.2799509932615734e-05, "epoch": 2.02 }, { "loss": 6.7708, "grad_norm": 5.261172294616699, "learning_rate": 3.2711997899711213e-05, "epoch": 2.02 }, { "loss": 7.2953, "grad_norm": 7.287022590637207, "learning_rate": 3.2624485866806686e-05, "epoch": 2.02 }, { "loss": 7.7778, "grad_norm": 3.6490862369537354, "learning_rate": 3.2536973833902164e-05, "epoch": 2.02 }, { "loss": 7.0521, "grad_norm": 3.474090337753296, "learning_rate": 3.2449461800997637e-05, "epoch": 2.03 }, { "loss": 7.6243, "grad_norm": 4.992802143096924, "learning_rate": 3.2361949768093115e-05, "epoch": 2.03 }, { "loss": 7.7397, "grad_norm": 4.16194486618042, "learning_rate": 3.227443773518859e-05, "epoch": 2.03 }, { "loss": 7.8743, "grad_norm": 4.265628814697266, "learning_rate": 3.2186925702284066e-05, "epoch": 2.03 }, { "loss": 7.5843, "grad_norm": 4.442827224731445, "learning_rate": 3.209941366937954e-05, "epoch": 2.04 }, { "loss": 7.1913, "grad_norm": 3.7389514446258545, "learning_rate": 3.201190163647502e-05, "epoch": 2.04 }, { "loss": 7.4241, "grad_norm": 4.544101238250732, "learning_rate": 3.192438960357049e-05, "epoch": 2.04 }, { "loss": 7.4453, "grad_norm": 3.6654653549194336, "learning_rate": 3.183687757066597e-05, "epoch": 2.04 }, { "loss": 7.4288, "grad_norm": 3.525256872177124, "learning_rate": 3.174936553776145e-05, "epoch": 2.05 }, { "loss": 7.387, "grad_norm": 4.041418075561523, "learning_rate": 3.166185350485692e-05, "epoch": 2.05 }, { "loss": 7.4435, "grad_norm": 4.415677547454834, "learning_rate": 3.15743414719524e-05, "epoch": 2.05 }, { "loss": 7.6321, "grad_norm": 3.649733066558838, "learning_rate": 3.148682943904787e-05, "epoch": 2.06 }, { "loss": 7.1816, "grad_norm": 4.361470699310303, "learning_rate": 3.139931740614335e-05, "epoch": 2.06 }, { "loss": 7.8665, "grad_norm": 2.8240556716918945, "learning_rate": 3.131180537323882e-05, "epoch": 2.06 }, { "loss": 8.0303, "grad_norm": 6.444936275482178, "learning_rate": 3.12242933403343e-05, "epoch": 2.06 }, { "loss": 7.834, "grad_norm": 4.267172813415527, "learning_rate": 3.113678130742977e-05, "epoch": 2.07 }, { "loss": 7.5162, "grad_norm": 4.9462480545043945, "learning_rate": 3.104926927452525e-05, "epoch": 2.07 }, { "loss": 7.3831, "grad_norm": 3.944603204727173, "learning_rate": 3.0961757241620723e-05, "epoch": 2.07 }, { "loss": 7.0951, "grad_norm": 4.1821608543396, "learning_rate": 3.08742452087162e-05, "epoch": 2.07 }, { "loss": 7.4743, "grad_norm": 4.054866790771484, "learning_rate": 3.0786733175811675e-05, "epoch": 2.08 }, { "loss": 7.4355, "grad_norm": 4.87803316116333, "learning_rate": 3.0699221142907153e-05, "epoch": 2.08 }, { "loss": 7.8149, "grad_norm": 4.6143388748168945, "learning_rate": 3.0611709110002626e-05, "epoch": 2.08 }, { "loss": 7.2776, "grad_norm": 3.6637542247772217, "learning_rate": 3.0524197077098104e-05, "epoch": 2.08 }, { "loss": 7.5867, "grad_norm": 4.739266872406006, "learning_rate": 3.043668504419358e-05, "epoch": 2.09 }, { "loss": 8.031, "grad_norm": 4.118218898773193, "learning_rate": 3.0349173011289055e-05, "epoch": 2.09 }, { "loss": 7.5086, "grad_norm": 3.7304162979125977, "learning_rate": 3.026166097838453e-05, "epoch": 2.09 }, { "loss": 7.0712, "grad_norm": 3.3575172424316406, "learning_rate": 3.0174148945480006e-05, "epoch": 2.09 }, { "loss": 7.8495, "grad_norm": 3.6715874671936035, "learning_rate": 3.0086636912575482e-05, "epoch": 2.1 }, { "loss": 7.8997, "grad_norm": 3.8344626426696777, "learning_rate": 2.9999124879670954e-05, "epoch": 2.1 }, { "loss": 7.6391, "grad_norm": 5.086608409881592, "learning_rate": 2.991161284676643e-05, "epoch": 2.1 }, { "loss": 7.6541, "grad_norm": 5.020079135894775, "learning_rate": 2.9824100813861905e-05, "epoch": 2.11 }, { "loss": 7.9148, "grad_norm": 4.90994119644165, "learning_rate": 2.973658878095738e-05, "epoch": 2.11 }, { "loss": 7.3306, "grad_norm": 7.108256816864014, "learning_rate": 2.9649076748052856e-05, "epoch": 2.11 }, { "loss": 7.4177, "grad_norm": 3.922966480255127, "learning_rate": 2.956156471514833e-05, "epoch": 2.11 }, { "loss": 7.6952, "grad_norm": 4.077629566192627, "learning_rate": 2.9474052682243807e-05, "epoch": 2.12 }, { "loss": 8.2089, "grad_norm": 3.003819227218628, "learning_rate": 2.9386540649339283e-05, "epoch": 2.12 }, { "loss": 6.8683, "grad_norm": 3.8509228229522705, "learning_rate": 2.9299028616434758e-05, "epoch": 2.12 }, { "loss": 7.1152, "grad_norm": 3.316972017288208, "learning_rate": 2.9211516583530234e-05, "epoch": 2.12 }, { "loss": 7.7425, "grad_norm": 5.465259552001953, "learning_rate": 2.9124004550625713e-05, "epoch": 2.13 }, { "loss": 7.3628, "grad_norm": 3.923509120941162, "learning_rate": 2.9036492517721188e-05, "epoch": 2.13 }, { "loss": 7.382, "grad_norm": 4.779471397399902, "learning_rate": 2.8948980484816664e-05, "epoch": 2.13 }, { "loss": 7.4485, "grad_norm": 6.00252628326416, "learning_rate": 2.886146845191214e-05, "epoch": 2.13 }, { "loss": 7.4238, "grad_norm": 4.734460353851318, "learning_rate": 2.8773956419007615e-05, "epoch": 2.14 }, { "loss": 7.4597, "grad_norm": 4.662705898284912, "learning_rate": 2.868644438610309e-05, "epoch": 2.14 }, { "loss": 7.7309, "grad_norm": 3.3174445629119873, "learning_rate": 2.8598932353198566e-05, "epoch": 2.14 }, { "loss": 7.7715, "grad_norm": 3.2781224250793457, "learning_rate": 2.851142032029404e-05, "epoch": 2.14 }, { "loss": 7.6348, "grad_norm": 5.909160137176514, "learning_rate": 2.8423908287389517e-05, "epoch": 2.15 }, { "loss": 7.2795, "grad_norm": 4.939976215362549, "learning_rate": 2.8336396254484992e-05, "epoch": 2.15 }, { "loss": 7.9346, "grad_norm": 4.42500114440918, "learning_rate": 2.8248884221580468e-05, "epoch": 2.15 }, { "loss": 7.4785, "grad_norm": 3.704190731048584, "learning_rate": 2.8161372188675943e-05, "epoch": 2.16 }, { "loss": 7.456, "grad_norm": 3.73481822013855, "learning_rate": 2.807386015577142e-05, "epoch": 2.16 }, { "loss": 7.4728, "grad_norm": 4.051381587982178, "learning_rate": 2.7986348122866894e-05, "epoch": 2.16 }, { "loss": 7.6646, "grad_norm": 3.674975633621216, "learning_rate": 2.7898836089962373e-05, "epoch": 2.16 }, { "loss": 7.4139, "grad_norm": 4.6207709312438965, "learning_rate": 2.781132405705785e-05, "epoch": 2.17 }, { "loss": 7.3962, "grad_norm": 3.7129499912261963, "learning_rate": 2.7723812024153324e-05, "epoch": 2.17 }, { "loss": 7.423, "grad_norm": 4.65708589553833, "learning_rate": 2.76362999912488e-05, "epoch": 2.17 }, { "loss": 7.577, "grad_norm": 5.05981969833374, "learning_rate": 2.7548787958344275e-05, "epoch": 2.17 }, { "loss": 7.4925, "grad_norm": 4.692249774932861, "learning_rate": 2.746127592543975e-05, "epoch": 2.18 }, { "loss": 7.4587, "grad_norm": 4.2007856369018555, "learning_rate": 2.7373763892535226e-05, "epoch": 2.18 }, { "loss": 7.7241, "grad_norm": 6.081201553344727, "learning_rate": 2.72862518596307e-05, "epoch": 2.18 }, { "loss": 7.6261, "grad_norm": 3.801405429840088, "learning_rate": 2.7198739826726177e-05, "epoch": 2.18 }, { "loss": 7.199, "grad_norm": 4.788170337677002, "learning_rate": 2.7111227793821653e-05, "epoch": 2.19 }, { "loss": 7.6249, "grad_norm": 3.934465169906616, "learning_rate": 2.7023715760917128e-05, "epoch": 2.19 }, { "loss": 7.1697, "grad_norm": 3.270228147506714, "learning_rate": 2.6936203728012604e-05, "epoch": 2.19 }, { "loss": 7.3199, "grad_norm": 4.648608207702637, "learning_rate": 2.684869169510808e-05, "epoch": 2.19 }, { "loss": 7.0883, "grad_norm": 3.7127881050109863, "learning_rate": 2.6761179662203555e-05, "epoch": 2.2 }, { "loss": 7.6402, "grad_norm": 4.310494899749756, "learning_rate": 2.6673667629299033e-05, "epoch": 2.2 }, { "loss": 8.1804, "grad_norm": 3.9658172130584717, "learning_rate": 2.658615559639451e-05, "epoch": 2.2 }, { "loss": 7.4997, "grad_norm": 6.007218837738037, "learning_rate": 2.6498643563489984e-05, "epoch": 2.2 }, { "loss": 7.1157, "grad_norm": 4.3563995361328125, "learning_rate": 2.641113153058546e-05, "epoch": 2.21 }, { "loss": 7.6341, "grad_norm": 3.9191131591796875, "learning_rate": 2.6323619497680935e-05, "epoch": 2.21 }, { "loss": 7.3165, "grad_norm": 6.353770732879639, "learning_rate": 2.623610746477641e-05, "epoch": 2.21 }, { "loss": 7.2788, "grad_norm": 4.23541784286499, "learning_rate": 2.6148595431871886e-05, "epoch": 2.22 }, { "loss": 7.4787, "grad_norm": 7.060284614562988, "learning_rate": 2.6061083398967362e-05, "epoch": 2.22 }, { "loss": 7.7702, "grad_norm": 3.484837055206299, "learning_rate": 2.597357136606283e-05, "epoch": 2.22 }, { "loss": 7.3549, "grad_norm": 6.6010589599609375, "learning_rate": 2.5886059333158306e-05, "epoch": 2.22 }, { "loss": 7.2694, "grad_norm": 4.792263984680176, "learning_rate": 2.5798547300253785e-05, "epoch": 2.23 }, { "loss": 7.479, "grad_norm": 4.992294788360596, "learning_rate": 2.571103526734926e-05, "epoch": 2.23 }, { "loss": 7.5955, "grad_norm": 5.028162956237793, "learning_rate": 2.5623523234444736e-05, "epoch": 2.23 }, { "loss": 7.1964, "grad_norm": 4.13356876373291, "learning_rate": 2.553601120154021e-05, "epoch": 2.23 }, { "loss": 6.9294, "grad_norm": 3.5145249366760254, "learning_rate": 2.5448499168635687e-05, "epoch": 2.24 }, { "loss": 7.4454, "grad_norm": 3.8901588916778564, "learning_rate": 2.5360987135731163e-05, "epoch": 2.24 }, { "loss": 7.738, "grad_norm": 4.009905815124512, "learning_rate": 2.5273475102826638e-05, "epoch": 2.24 }, { "loss": 7.3253, "grad_norm": 4.332956314086914, "learning_rate": 2.5185963069922114e-05, "epoch": 2.24 }, { "loss": 7.1716, "grad_norm": 3.688816785812378, "learning_rate": 2.509845103701759e-05, "epoch": 2.25 }, { "loss": 6.8345, "grad_norm": 7.10718297958374, "learning_rate": 2.5010939004113065e-05, "epoch": 2.25 }, { "loss": 7.7885, "grad_norm": 5.8644585609436035, "learning_rate": 2.492342697120854e-05, "epoch": 2.25 }, { "loss": 7.3728, "grad_norm": 2.958936929702759, "learning_rate": 2.4835914938304016e-05, "epoch": 2.25 }, { "loss": 7.2452, "grad_norm": 3.498347520828247, "learning_rate": 2.474840290539949e-05, "epoch": 2.26 }, { "loss": 7.0227, "grad_norm": 4.527777671813965, "learning_rate": 2.4660890872494967e-05, "epoch": 2.26 }, { "loss": 7.0811, "grad_norm": 4.315553665161133, "learning_rate": 2.4573378839590446e-05, "epoch": 2.26 }, { "loss": 7.5339, "grad_norm": 3.5268032550811768, "learning_rate": 2.448586680668592e-05, "epoch": 2.27 }, { "loss": 7.5172, "grad_norm": 4.606849670410156, "learning_rate": 2.4398354773781397e-05, "epoch": 2.27 }, { "loss": 7.5034, "grad_norm": 3.0682761669158936, "learning_rate": 2.4310842740876872e-05, "epoch": 2.27 }, { "loss": 7.0923, "grad_norm": 3.9300010204315186, "learning_rate": 2.4223330707972348e-05, "epoch": 2.27 }, { "loss": 7.3782, "grad_norm": 5.444020748138428, "learning_rate": 2.4135818675067823e-05, "epoch": 2.28 }, { "loss": 8.0698, "grad_norm": 3.9157919883728027, "learning_rate": 2.40483066421633e-05, "epoch": 2.28 }, { "loss": 7.4659, "grad_norm": 4.808152675628662, "learning_rate": 2.3960794609258774e-05, "epoch": 2.28 }, { "loss": 7.2596, "grad_norm": 4.249693870544434, "learning_rate": 2.387328257635425e-05, "epoch": 2.28 }, { "loss": 7.0133, "grad_norm": 4.091562271118164, "learning_rate": 2.3785770543449725e-05, "epoch": 2.29 }, { "loss": 7.9337, "grad_norm": 3.689053535461426, "learning_rate": 2.36982585105452e-05, "epoch": 2.29 }, { "loss": 7.954, "grad_norm": 3.9822888374328613, "learning_rate": 2.3610746477640676e-05, "epoch": 2.29 }, { "loss": 8.0586, "grad_norm": 4.524798393249512, "learning_rate": 2.352323444473615e-05, "epoch": 2.29 }, { "loss": 7.6348, "grad_norm": 4.638789653778076, "learning_rate": 2.3435722411831627e-05, "epoch": 2.3 }, { "loss": 7.1951, "grad_norm": 3.9606380462646484, "learning_rate": 2.3348210378927106e-05, "epoch": 2.3 }, { "loss": 7.1919, "grad_norm": 4.085976600646973, "learning_rate": 2.326069834602258e-05, "epoch": 2.3 }, { "loss": 7.7985, "grad_norm": 4.817371845245361, "learning_rate": 2.3173186313118057e-05, "epoch": 2.3 }, { "loss": 7.4659, "grad_norm": 4.804962635040283, "learning_rate": 2.3085674280213533e-05, "epoch": 2.31 }, { "loss": 7.6245, "grad_norm": 5.15590763092041, "learning_rate": 2.2998162247309005e-05, "epoch": 2.31 }, { "loss": 7.1799, "grad_norm": 5.4307122230529785, "learning_rate": 2.291065021440448e-05, "epoch": 2.31 }, { "loss": 6.737, "grad_norm": 3.417074680328369, "learning_rate": 2.2823138181499956e-05, "epoch": 2.32 }, { "loss": 7.3044, "grad_norm": 5.047757148742676, "learning_rate": 2.273562614859543e-05, "epoch": 2.32 }, { "loss": 7.0146, "grad_norm": 5.767230033874512, "learning_rate": 2.2648114115690907e-05, "epoch": 2.32 }, { "loss": 7.5375, "grad_norm": 4.92877197265625, "learning_rate": 2.2560602082786382e-05, "epoch": 2.32 }, { "loss": 7.5536, "grad_norm": 3.8499937057495117, "learning_rate": 2.2473090049881858e-05, "epoch": 2.33 }, { "loss": 7.2617, "grad_norm": 3.698652505874634, "learning_rate": 2.2385578016977337e-05, "epoch": 2.33 }, { "loss": 7.3101, "grad_norm": 3.8474197387695312, "learning_rate": 2.2298065984072812e-05, "epoch": 2.33 }, { "loss": 7.4945, "grad_norm": 4.18773889541626, "learning_rate": 2.2210553951168288e-05, "epoch": 2.33 }, { "loss": 7.1942, "grad_norm": 4.604954242706299, "learning_rate": 2.2123041918263763e-05, "epoch": 2.34 }, { "loss": 7.3004, "grad_norm": 4.48193359375, "learning_rate": 2.203552988535924e-05, "epoch": 2.34 }, { "loss": 7.2258, "grad_norm": 3.1619014739990234, "learning_rate": 2.1948017852454714e-05, "epoch": 2.34 }, { "loss": 7.1968, "grad_norm": 4.031898021697998, "learning_rate": 2.186050581955019e-05, "epoch": 2.34 }, { "loss": 7.7404, "grad_norm": 4.030830383300781, "learning_rate": 2.1772993786645665e-05, "epoch": 2.35 }, { "loss": 7.1855, "grad_norm": 3.3764097690582275, "learning_rate": 2.168548175374114e-05, "epoch": 2.35 }, { "loss": 7.0507, "grad_norm": 5.506438732147217, "learning_rate": 2.1597969720836616e-05, "epoch": 2.35 }, { "loss": 6.5909, "grad_norm": 4.797235012054443, "learning_rate": 2.151045768793209e-05, "epoch": 2.35 }, { "loss": 7.418, "grad_norm": 4.5042853355407715, "learning_rate": 2.1422945655027567e-05, "epoch": 2.36 }, { "loss": 7.3702, "grad_norm": 3.449220657348633, "learning_rate": 2.1335433622123043e-05, "epoch": 2.36 }, { "loss": 7.2734, "grad_norm": 5.276688098907471, "learning_rate": 2.1247921589218518e-05, "epoch": 2.36 }, { "loss": 7.1368, "grad_norm": 4.960446834564209, "learning_rate": 2.1160409556313997e-05, "epoch": 2.37 }, { "loss": 7.3468, "grad_norm": 4.041114330291748, "learning_rate": 2.1072897523409473e-05, "epoch": 2.37 }, { "loss": 7.5724, "grad_norm": 5.667148113250732, "learning_rate": 2.0985385490504945e-05, "epoch": 2.37 }, { "loss": 7.1379, "grad_norm": 3.245389223098755, "learning_rate": 2.089787345760042e-05, "epoch": 2.37 }, { "loss": 6.9722, "grad_norm": 4.715411186218262, "learning_rate": 2.0810361424695896e-05, "epoch": 2.38 }, { "loss": 7.4667, "grad_norm": 3.4023447036743164, "learning_rate": 2.072284939179137e-05, "epoch": 2.38 }, { "loss": 7.3368, "grad_norm": 4.798887252807617, "learning_rate": 2.0635337358886847e-05, "epoch": 2.38 }, { "loss": 7.224, "grad_norm": 4.741410255432129, "learning_rate": 2.0547825325982322e-05, "epoch": 2.38 }, { "loss": 7.0656, "grad_norm": 3.5715346336364746, "learning_rate": 2.046906449636825e-05, "epoch": 2.39 }, { "loss": 7.0197, "grad_norm": 4.428717613220215, "learning_rate": 2.0381552463463727e-05, "epoch": 2.39 }, { "loss": 7.4517, "grad_norm": 4.353855133056641, "learning_rate": 2.0294040430559202e-05, "epoch": 2.39 }, { "loss": 7.5488, "grad_norm": 3.331164598464966, "learning_rate": 2.0206528397654678e-05, "epoch": 2.39 }, { "loss": 6.8231, "grad_norm": 4.357122898101807, "learning_rate": 2.0119016364750153e-05, "epoch": 2.4 }, { "loss": 7.8218, "grad_norm": 3.6374125480651855, "learning_rate": 2.003150433184563e-05, "epoch": 2.4 }, { "loss": 7.2913, "grad_norm": 3.414724826812744, "learning_rate": 1.9943992298941104e-05, "epoch": 2.4 }, { "loss": 7.5022, "grad_norm": 3.073855400085449, "learning_rate": 1.985648026603658e-05, "epoch": 2.4 }, { "loss": 7.1292, "grad_norm": 5.69718074798584, "learning_rate": 1.976896823313206e-05, "epoch": 2.41 }, { "loss": 7.5066, "grad_norm": 3.6818926334381104, "learning_rate": 1.9681456200227534e-05, "epoch": 2.41 }, { "loss": 7.1924, "grad_norm": 6.099584579467773, "learning_rate": 1.959394416732301e-05, "epoch": 2.41 }, { "loss": 7.7311, "grad_norm": 4.388739109039307, "learning_rate": 1.9506432134418485e-05, "epoch": 2.41 }, { "loss": 7.8034, "grad_norm": 4.578341007232666, "learning_rate": 1.941892010151396e-05, "epoch": 2.42 }, { "loss": 7.1698, "grad_norm": 3.4639930725097656, "learning_rate": 1.9331408068609436e-05, "epoch": 2.42 }, { "loss": 7.0058, "grad_norm": 4.414987564086914, "learning_rate": 1.924389603570491e-05, "epoch": 2.42 }, { "loss": 7.3363, "grad_norm": 4.268624305725098, "learning_rate": 1.9156384002800387e-05, "epoch": 2.43 }, { "loss": 7.2589, "grad_norm": 6.716452598571777, "learning_rate": 1.9068871969895863e-05, "epoch": 2.43 }, { "loss": 7.2501, "grad_norm": 5.058889865875244, "learning_rate": 1.8981359936991335e-05, "epoch": 2.43 }, { "loss": 7.3893, "grad_norm": 6.656921863555908, "learning_rate": 1.889384790408681e-05, "epoch": 2.43 }, { "loss": 7.2942, "grad_norm": 4.824561595916748, "learning_rate": 1.8806335871182286e-05, "epoch": 2.44 }, { "loss": 7.6477, "grad_norm": 5.925858020782471, "learning_rate": 1.8718823838277765e-05, "epoch": 2.44 }, { "loss": 6.9424, "grad_norm": 3.955688714981079, "learning_rate": 1.863131180537324e-05, "epoch": 2.44 }, { "loss": 6.8707, "grad_norm": 3.95426869392395, "learning_rate": 1.8543799772468716e-05, "epoch": 2.44 }, { "loss": 7.5012, "grad_norm": 5.377491474151611, "learning_rate": 1.845628773956419e-05, "epoch": 2.45 }, { "loss": 7.6028, "grad_norm": 4.264338970184326, "learning_rate": 1.8368775706659667e-05, "epoch": 2.45 }, { "loss": 6.8808, "grad_norm": 5.041021347045898, "learning_rate": 1.8281263673755142e-05, "epoch": 2.45 }, { "loss": 8.0187, "grad_norm": 6.484523773193359, "learning_rate": 1.8193751640850618e-05, "epoch": 2.45 }, { "loss": 6.9991, "grad_norm": 4.5790205001831055, "learning_rate": 1.8106239607946093e-05, "epoch": 2.46 }, { "loss": 7.2012, "grad_norm": 4.204977989196777, "learning_rate": 1.801872757504157e-05, "epoch": 2.46 }, { "loss": 7.6324, "grad_norm": 3.803563356399536, "learning_rate": 1.7931215542137044e-05, "epoch": 2.46 }, { "loss": 6.6331, "grad_norm": 3.6445772647857666, "learning_rate": 1.784370350923252e-05, "epoch": 2.46 }, { "loss": 7.4633, "grad_norm": 3.9381942749023438, "learning_rate": 1.7756191476327995e-05, "epoch": 2.47 }, { "loss": 7.4318, "grad_norm": 3.573315382003784, "learning_rate": 1.766867944342347e-05, "epoch": 2.47 }, { "loss": 7.3045, "grad_norm": 3.7262725830078125, "learning_rate": 1.7581167410518946e-05, "epoch": 2.47 }, { "loss": 6.9975, "grad_norm": 4.73222541809082, "learning_rate": 1.7493655377614425e-05, "epoch": 2.48 }, { "loss": 6.7643, "grad_norm": 4.269005298614502, "learning_rate": 1.74061433447099e-05, "epoch": 2.48 }, { "loss": 7.7472, "grad_norm": 4.969855785369873, "learning_rate": 1.7318631311805376e-05, "epoch": 2.48 }, { "loss": 7.0408, "grad_norm": 4.290554046630859, "learning_rate": 1.723111927890085e-05, "epoch": 2.48 }, { "loss": 7.5848, "grad_norm": 4.593362808227539, "learning_rate": 1.7143607245996327e-05, "epoch": 2.49 }, { "loss": 7.2043, "grad_norm": 3.8505163192749023, "learning_rate": 1.7056095213091803e-05, "epoch": 2.49 }, { "loss": 7.076, "grad_norm": 5.526023864746094, "learning_rate": 1.6968583180187275e-05, "epoch": 2.49 }, { "loss": 7.5282, "grad_norm": 4.70090389251709, "learning_rate": 1.688107114728275e-05, "epoch": 2.49 }, { "loss": 7.1022, "grad_norm": 5.819429397583008, "learning_rate": 1.6793559114378226e-05, "epoch": 2.5 }, { "loss": 7.5449, "grad_norm": 4.2631707191467285, "learning_rate": 1.67060470814737e-05, "epoch": 2.5 }, { "loss": 7.5941, "grad_norm": 5.127431392669678, "learning_rate": 1.6618535048569177e-05, "epoch": 2.5 }, { "loss": 7.1741, "grad_norm": 5.605392932891846, "learning_rate": 1.6531023015664652e-05, "epoch": 2.5 }, { "loss": 7.5186, "grad_norm": 5.392033576965332, "learning_rate": 1.644351098276013e-05, "epoch": 2.51 }, { "loss": 7.4369, "grad_norm": 4.743539810180664, "learning_rate": 1.6355998949855607e-05, "epoch": 2.51 }, { "loss": 7.0646, "grad_norm": 4.0009684562683105, "learning_rate": 1.6268486916951082e-05, "epoch": 2.51 }, { "loss": 7.3658, "grad_norm": 4.551602363586426, "learning_rate": 1.6180974884046558e-05, "epoch": 2.51 }, { "loss": 7.7071, "grad_norm": 3.369328737258911, "learning_rate": 1.6093462851142033e-05, "epoch": 2.52 }, { "loss": 7.5386, "grad_norm": 3.6127750873565674, "learning_rate": 1.600595081823751e-05, "epoch": 2.52 }, { "loss": 7.3196, "grad_norm": 4.915907382965088, "learning_rate": 1.5918438785332984e-05, "epoch": 2.52 }, { "loss": 7.1598, "grad_norm": 5.295419216156006, "learning_rate": 1.583092675242846e-05, "epoch": 2.53 }, { "loss": 7.2027, "grad_norm": 5.066037654876709, "learning_rate": 1.5743414719523935e-05, "epoch": 2.53 }, { "loss": 7.4418, "grad_norm": 5.553489685058594, "learning_rate": 1.565590268661941e-05, "epoch": 2.53 }, { "loss": 6.8532, "grad_norm": 4.176399230957031, "learning_rate": 1.5568390653714886e-05, "epoch": 2.53 }, { "loss": 7.1576, "grad_norm": 5.018221855163574, "learning_rate": 1.5480878620810362e-05, "epoch": 2.54 }, { "loss": 7.5015, "grad_norm": 3.439542293548584, "learning_rate": 1.5393366587905837e-05, "epoch": 2.54 }, { "loss": 6.8427, "grad_norm": 3.641223907470703, "learning_rate": 1.5305854555001313e-05, "epoch": 2.54 }, { "loss": 7.7335, "grad_norm": 5.225297451019287, "learning_rate": 1.521834252209679e-05, "epoch": 2.54 }, { "loss": 7.1007, "grad_norm": 3.5159335136413574, "learning_rate": 1.5130830489192265e-05, "epoch": 2.55 }, { "loss": 7.3267, "grad_norm": 4.219715118408203, "learning_rate": 1.5043318456287741e-05, "epoch": 2.55 }, { "loss": 7.0066, "grad_norm": 4.482273101806641, "learning_rate": 1.4955806423383215e-05, "epoch": 2.55 }, { "loss": 7.4307, "grad_norm": 4.263273239135742, "learning_rate": 1.486829439047869e-05, "epoch": 2.55 }, { "loss": 7.3687, "grad_norm": 4.202017784118652, "learning_rate": 1.4780782357574166e-05, "epoch": 2.56 }, { "loss": 7.6405, "grad_norm": 5.738183498382568, "learning_rate": 1.4693270324669641e-05, "epoch": 2.56 }, { "loss": 7.3089, "grad_norm": 5.287261962890625, "learning_rate": 1.4605758291765117e-05, "epoch": 2.56 }, { "loss": 7.0879, "grad_norm": 5.147162914276123, "learning_rate": 1.4518246258860594e-05, "epoch": 2.56 }, { "loss": 7.145, "grad_norm": 3.873149871826172, "learning_rate": 1.443073422595607e-05, "epoch": 2.57 }, { "loss": 7.1013, "grad_norm": 4.64039945602417, "learning_rate": 1.4343222193051545e-05, "epoch": 2.57 }, { "loss": 7.0642, "grad_norm": 3.6532037258148193, "learning_rate": 1.425571016014702e-05, "epoch": 2.57 }, { "loss": 7.1647, "grad_norm": 3.756361484527588, "learning_rate": 1.4168198127242496e-05, "epoch": 2.58 }, { "loss": 7.0501, "grad_norm": 3.5314979553222656, "learning_rate": 1.4080686094337972e-05, "epoch": 2.58 }, { "loss": 7.4017, "grad_norm": 3.386040687561035, "learning_rate": 1.3993174061433447e-05, "epoch": 2.58 }, { "loss": 7.4412, "grad_norm": 3.566223382949829, "learning_rate": 1.3905662028528924e-05, "epoch": 2.58 }, { "loss": 7.0911, "grad_norm": 5.274896621704102, "learning_rate": 1.38181499956244e-05, "epoch": 2.59 }, { "loss": 7.0295, "grad_norm": 5.615356922149658, "learning_rate": 1.3730637962719875e-05, "epoch": 2.59 }, { "loss": 7.1455, "grad_norm": 4.624752521514893, "learning_rate": 1.364312592981535e-05, "epoch": 2.59 }, { "loss": 7.1833, "grad_norm": 4.156666278839111, "learning_rate": 1.3555613896910826e-05, "epoch": 2.59 }, { "loss": 7.8835, "grad_norm": 3.4591434001922607, "learning_rate": 1.3468101864006302e-05, "epoch": 2.6 }, { "loss": 7.1062, "grad_norm": 3.3804733753204346, "learning_rate": 1.3380589831101777e-05, "epoch": 2.6 }, { "loss": 7.365, "grad_norm": 3.7281017303466797, "learning_rate": 1.3293077798197254e-05, "epoch": 2.6 }, { "loss": 7.0466, "grad_norm": 3.4281463623046875, "learning_rate": 1.320556576529273e-05, "epoch": 2.6 }, { "loss": 7.4435, "grad_norm": 3.9861958026885986, "learning_rate": 1.3118053732388205e-05, "epoch": 2.61 }, { "loss": 7.3772, "grad_norm": 5.1440253257751465, "learning_rate": 1.3030541699483681e-05, "epoch": 2.61 }, { "loss": 7.2704, "grad_norm": 3.7356927394866943, "learning_rate": 1.2943029666579153e-05, "epoch": 2.61 }, { "loss": 7.4473, "grad_norm": 3.138427257537842, "learning_rate": 1.285551763367463e-05, "epoch": 2.61 }, { "loss": 7.654, "grad_norm": 5.250783920288086, "learning_rate": 1.2768005600770106e-05, "epoch": 2.62 }, { "loss": 7.2723, "grad_norm": 3.7493326663970947, "learning_rate": 1.2680493567865581e-05, "epoch": 2.62 }, { "loss": 7.2502, "grad_norm": 4.482826232910156, "learning_rate": 1.2592981534961057e-05, "epoch": 2.62 }, { "loss": 6.9983, "grad_norm": 4.741217613220215, "learning_rate": 1.2505469502056532e-05, "epoch": 2.62 }, { "loss": 7.1721, "grad_norm": 5.053958892822266, "learning_rate": 1.2417957469152008e-05, "epoch": 2.63 }, { "loss": 7.2113, "grad_norm": 5.000698089599609, "learning_rate": 1.2330445436247483e-05, "epoch": 2.63 }, { "loss": 7.0898, "grad_norm": 5.456648826599121, "learning_rate": 1.224293340334296e-05, "epoch": 2.63 }, { "loss": 7.2822, "grad_norm": 3.733816146850586, "learning_rate": 1.2155421370438436e-05, "epoch": 2.64 }, { "loss": 7.4952, "grad_norm": 4.114339351654053, "learning_rate": 1.2067909337533912e-05, "epoch": 2.64 }, { "loss": 6.9912, "grad_norm": 3.963610887527466, "learning_rate": 1.1980397304629387e-05, "epoch": 2.64 }, { "loss": 7.2077, "grad_norm": 4.697625637054443, "learning_rate": 1.1892885271724863e-05, "epoch": 2.64 }, { "loss": 6.9453, "grad_norm": 3.8456337451934814, "learning_rate": 1.1805373238820338e-05, "epoch": 2.65 }, { "loss": 7.6018, "grad_norm": 3.9979872703552246, "learning_rate": 1.1717861205915814e-05, "epoch": 2.65 }, { "loss": 7.078, "grad_norm": 4.1047563552856445, "learning_rate": 1.163034917301129e-05, "epoch": 2.65 }, { "loss": 6.7399, "grad_norm": 5.3073248863220215, "learning_rate": 1.1542837140106766e-05, "epoch": 2.65 }, { "loss": 6.9167, "grad_norm": 5.714503765106201, "learning_rate": 1.145532510720224e-05, "epoch": 2.66 }, { "loss": 7.4903, "grad_norm": 3.9626924991607666, "learning_rate": 1.1367813074297716e-05, "epoch": 2.66 }, { "loss": 7.4558, "grad_norm": 4.751763343811035, "learning_rate": 1.1280301041393191e-05, "epoch": 2.66 }, { "loss": 7.5696, "grad_norm": 3.73614501953125, "learning_rate": 1.1192789008488668e-05, "epoch": 2.66 }, { "loss": 7.1071, "grad_norm": 3.236339569091797, "learning_rate": 1.1105276975584144e-05, "epoch": 2.67 }, { "loss": 7.6115, "grad_norm": 4.271381855010986, "learning_rate": 1.101776494267962e-05, "epoch": 2.67 }, { "loss": 7.1791, "grad_norm": 3.6989824771881104, "learning_rate": 1.0930252909775095e-05, "epoch": 2.67 }, { "loss": 7.0511, "grad_norm": 3.856694221496582, "learning_rate": 1.084274087687057e-05, "epoch": 2.67 }, { "loss": 7.2655, "grad_norm": 4.834972858428955, "learning_rate": 1.0755228843966046e-05, "epoch": 2.68 }, { "loss": 7.0369, "grad_norm": 4.6722211837768555, "learning_rate": 1.0667716811061521e-05, "epoch": 2.68 }, { "loss": 7.1936, "grad_norm": 4.993673324584961, "learning_rate": 1.0580204778156999e-05, "epoch": 2.68 }, { "loss": 7.3347, "grad_norm": 3.4490904808044434, "learning_rate": 1.0492692745252472e-05, "epoch": 2.69 }, { "loss": 7.736, "grad_norm": 3.283051013946533, "learning_rate": 1.0405180712347948e-05, "epoch": 2.69 }, { "loss": 7.0317, "grad_norm": 3.656076431274414, "learning_rate": 1.0317668679443423e-05, "epoch": 2.69 }, { "loss": 6.866, "grad_norm": 3.4769787788391113, "learning_rate": 1.0230156646538899e-05, "epoch": 2.69 }, { "loss": 7.4499, "grad_norm": 3.384229898452759, "learning_rate": 1.0142644613634374e-05, "epoch": 2.7 }, { "loss": 6.9746, "grad_norm": 4.784582614898682, "learning_rate": 1.0055132580729852e-05, "epoch": 2.7 }, { "loss": 7.0138, "grad_norm": 4.076469898223877, "learning_rate": 9.967620547825327e-06, "epoch": 2.7 }, { "loss": 7.2836, "grad_norm": 5.0796709060668945, "learning_rate": 9.880108514920803e-06, "epoch": 2.7 }, { "loss": 7.0612, "grad_norm": 4.263620853424072, "learning_rate": 9.792596482016278e-06, "epoch": 2.71 }, { "loss": 7.0573, "grad_norm": 4.355484485626221, "learning_rate": 9.705084449111754e-06, "epoch": 2.71 }, { "loss": 7.3385, "grad_norm": 6.618645668029785, "learning_rate": 9.617572416207229e-06, "epoch": 2.71 }, { "loss": 7.5994, "grad_norm": 4.804537296295166, "learning_rate": 9.530060383302705e-06, "epoch": 2.71 }, { "loss": 8.1717, "grad_norm": 4.777498722076416, "learning_rate": 9.44254835039818e-06, "epoch": 2.72 }, { "loss": 7.1703, "grad_norm": 3.5699825286865234, "learning_rate": 9.355036317493656e-06, "epoch": 2.72 }, { "loss": 7.0822, "grad_norm": 6.044339179992676, "learning_rate": 9.267524284589131e-06, "epoch": 2.72 }, { "loss": 7.0899, "grad_norm": 3.558217763900757, "learning_rate": 9.180012251684607e-06, "epoch": 2.72 }, { "loss": 6.9441, "grad_norm": 4.0059075355529785, "learning_rate": 9.092500218780082e-06, "epoch": 2.73 }, { "loss": 7.1065, "grad_norm": 5.324728012084961, "learning_rate": 9.004988185875558e-06, "epoch": 2.73 }, { "loss": 6.91, "grad_norm": 3.852426767349243, "learning_rate": 8.917476152971035e-06, "epoch": 2.73 }, { "loss": 7.7985, "grad_norm": 5.7546844482421875, "learning_rate": 8.82996412006651e-06, "epoch": 2.74 }, { "loss": 7.098, "grad_norm": 4.994897842407227, "learning_rate": 8.742452087161986e-06, "epoch": 2.74 }, { "loss": 7.4831, "grad_norm": 4.3503522872924805, "learning_rate": 8.654940054257461e-06, "epoch": 2.74 }, { "loss": 7.3563, "grad_norm": 3.4878551959991455, "learning_rate": 8.567428021352937e-06, "epoch": 2.74 }, { "loss": 7.8125, "grad_norm": 4.518803596496582, "learning_rate": 8.47991598844841e-06, "epoch": 2.75 }, { "loss": 7.1354, "grad_norm": 3.4671084880828857, "learning_rate": 8.392403955543888e-06, "epoch": 2.75 }, { "loss": 7.1782, "grad_norm": 5.328606128692627, "learning_rate": 8.304891922639363e-06, "epoch": 2.75 }, { "loss": 7.7318, "grad_norm": 5.223174095153809, "learning_rate": 8.217379889734839e-06, "epoch": 2.75 }, { "loss": 6.9753, "grad_norm": 3.5544798374176025, "learning_rate": 8.129867856830314e-06, "epoch": 2.76 }, { "loss": 7.5763, "grad_norm": 8.088041305541992, "learning_rate": 8.04235582392579e-06, "epoch": 2.76 }, { "loss": 7.9196, "grad_norm": 4.860823631286621, "learning_rate": 7.954843791021265e-06, "epoch": 2.76 }, { "loss": 6.7199, "grad_norm": 2.9834036827087402, "learning_rate": 7.867331758116741e-06, "epoch": 2.76 }, { "loss": 7.0354, "grad_norm": 3.6943893432617188, "learning_rate": 7.779819725212218e-06, "epoch": 2.77 }, { "loss": 7.7071, "grad_norm": 4.091310977935791, "learning_rate": 7.692307692307694e-06, "epoch": 2.77 }, { "loss": 7.2135, "grad_norm": 3.6830339431762695, "learning_rate": 7.604795659403169e-06, "epoch": 2.77 }, { "loss": 7.5218, "grad_norm": 3.4381253719329834, "learning_rate": 7.517283626498645e-06, "epoch": 2.77 }, { "loss": 7.1017, "grad_norm": 5.597609519958496, "learning_rate": 7.429771593594119e-06, "epoch": 2.78 }, { "loss": 6.7574, "grad_norm": 3.175727128982544, "learning_rate": 7.342259560689595e-06, "epoch": 2.78 }, { "loss": 6.8652, "grad_norm": 3.873260021209717, "learning_rate": 7.25474752778507e-06, "epoch": 2.78 }, { "loss": 7.0743, "grad_norm": 3.3759090900421143, "learning_rate": 7.167235494880546e-06, "epoch": 2.79 }, { "loss": 7.4346, "grad_norm": 4.680045127868652, "learning_rate": 7.079723461976022e-06, "epoch": 2.79 }, { "loss": 7.1374, "grad_norm": 5.331534385681152, "learning_rate": 6.992211429071498e-06, "epoch": 2.79 }, { "loss": 7.1309, "grad_norm": 5.658579349517822, "learning_rate": 6.904699396166973e-06, "epoch": 2.79 }, { "loss": 6.9677, "grad_norm": 4.185191631317139, "learning_rate": 6.8171873632624495e-06, "epoch": 2.8 }, { "loss": 6.9958, "grad_norm": 3.82647967338562, "learning_rate": 6.729675330357925e-06, "epoch": 2.8 }, { "loss": 7.1796, "grad_norm": 4.288081645965576, "learning_rate": 6.6421632974534005e-06, "epoch": 2.8 }, { "loss": 7.0815, "grad_norm": 3.3872106075286865, "learning_rate": 6.554651264548876e-06, "epoch": 2.8 }, { "loss": 6.9035, "grad_norm": 3.4726109504699707, "learning_rate": 6.467139231644351e-06, "epoch": 2.81 }, { "loss": 7.4375, "grad_norm": 5.091712951660156, "learning_rate": 6.379627198739826e-06, "epoch": 2.81 }, { "loss": 7.4423, "grad_norm": 3.271453380584717, "learning_rate": 6.2921151658353025e-06, "epoch": 2.81 }, { "loss": 6.976, "grad_norm": 3.8439278602600098, "learning_rate": 6.204603132930778e-06, "epoch": 2.81 }, { "loss": 7.7446, "grad_norm": 3.4631197452545166, "learning_rate": 6.1170911000262535e-06, "epoch": 2.82 }, { "loss": 7.7139, "grad_norm": 3.5582733154296875, "learning_rate": 6.02957906712173e-06, "epoch": 2.82 }, { "loss": 7.0974, "grad_norm": 4.480440139770508, "learning_rate": 5.942067034217205e-06, "epoch": 2.82 }, { "loss": 7.6834, "grad_norm": 4.127463340759277, "learning_rate": 5.854555001312681e-06, "epoch": 2.82 }, { "loss": 7.0067, "grad_norm": 4.044102191925049, "learning_rate": 5.767042968408156e-06, "epoch": 2.83 }, { "loss": 7.4619, "grad_norm": 5.20751953125, "learning_rate": 5.679530935503632e-06, "epoch": 2.83 }, { "loss": 6.9783, "grad_norm": 4.303256511688232, "learning_rate": 5.592018902599107e-06, "epoch": 2.83 }, { "loss": 7.093, "grad_norm": 5.665140151977539, "learning_rate": 5.504506869694583e-06, "epoch": 2.83 }, { "loss": 7.5041, "grad_norm": 4.066624164581299, "learning_rate": 5.416994836790059e-06, "epoch": 2.84 }, { "loss": 6.906, "grad_norm": 4.449793815612793, "learning_rate": 5.329482803885535e-06, "epoch": 2.84 }, { "loss": 7.4408, "grad_norm": 4.4521074295043945, "learning_rate": 5.2419707709810094e-06, "epoch": 2.84 }, { "loss": 7.1617, "grad_norm": 3.7591211795806885, "learning_rate": 5.154458738076486e-06, "epoch": 2.85 }, { "loss": 6.955, "grad_norm": 5.360795974731445, "learning_rate": 5.066946705171961e-06, "epoch": 2.85 }, { "loss": 7.3291, "grad_norm": 3.872117280960083, "learning_rate": 4.979434672267437e-06, "epoch": 2.85 }, { "loss": 7.7188, "grad_norm": 5.078587055206299, "learning_rate": 4.891922639362913e-06, "epoch": 2.85 }, { "loss": 7.1934, "grad_norm": 5.7633056640625, "learning_rate": 4.804410606458389e-06, "epoch": 2.86 }, { "loss": 7.0959, "grad_norm": 3.960428476333618, "learning_rate": 4.716898573553863e-06, "epoch": 2.86 }, { "loss": 6.9419, "grad_norm": 6.363913536071777, "learning_rate": 4.62938654064934e-06, "epoch": 2.86 }, { "loss": 7.0335, "grad_norm": 4.09603214263916, "learning_rate": 4.541874507744815e-06, "epoch": 2.86 }, { "loss": 7.2072, "grad_norm": 3.217400312423706, "learning_rate": 4.454362474840291e-06, "epoch": 2.87 }, { "loss": 7.3142, "grad_norm": 4.389254570007324, "learning_rate": 4.366850441935766e-06, "epoch": 2.87 }, { "loss": 7.3642, "grad_norm": 4.192555904388428, "learning_rate": 4.2793384090312425e-06, "epoch": 2.87 }, { "loss": 6.8518, "grad_norm": 4.6586809158325195, "learning_rate": 4.191826376126717e-06, "epoch": 2.87 }, { "loss": 7.4924, "grad_norm": 3.969644784927368, "learning_rate": 4.104314343222193e-06, "epoch": 2.88 }, { "loss": 6.8693, "grad_norm": 3.7710835933685303, "learning_rate": 4.016802310317669e-06, "epoch": 2.88 }, { "loss": 6.8957, "grad_norm": 3.549421548843384, "learning_rate": 3.9292902774131446e-06, "epoch": 2.88 }, { "loss": 7.6237, "grad_norm": 3.6362552642822266, "learning_rate": 3.84177824450862e-06, "epoch": 2.88 }, { "loss": 7.2684, "grad_norm": 4.023890972137451, "learning_rate": 3.754266211604096e-06, "epoch": 2.89 }, { "loss": 7.2039, "grad_norm": 3.6492927074432373, "learning_rate": 3.666754178699571e-06, "epoch": 2.89 }, { "loss": 7.7765, "grad_norm": 4.3814191818237305, "learning_rate": 3.579242145795047e-06, "epoch": 2.89 }, { "loss": 7.3461, "grad_norm": 5.455050468444824, "learning_rate": 3.4917301128905225e-06, "epoch": 2.9 }, { "loss": 7.2237, "grad_norm": 4.942239761352539, "learning_rate": 3.4042180799859984e-06, "epoch": 2.9 }, { "loss": 6.8001, "grad_norm": 3.596323251724243, "learning_rate": 3.316706047081474e-06, "epoch": 2.9 }, { "loss": 7.3173, "grad_norm": 3.8507444858551025, "learning_rate": 3.229194014176949e-06, "epoch": 2.9 }, { "loss": 7.1595, "grad_norm": 4.1059651374816895, "learning_rate": 3.141681981272425e-06, "epoch": 2.91 }, { "loss": 7.279, "grad_norm": 3.6779584884643555, "learning_rate": 3.0541699483679005e-06, "epoch": 2.91 }, { "loss": 6.9319, "grad_norm": 3.716569423675537, "learning_rate": 2.9666579154633764e-06, "epoch": 2.91 }, { "loss": 7.1694, "grad_norm": 4.070470333099365, "learning_rate": 2.8791458825588523e-06, "epoch": 2.91 }, { "loss": 7.8244, "grad_norm": 4.256218910217285, "learning_rate": 2.7916338496543274e-06, "epoch": 2.92 }, { "loss": 7.5893, "grad_norm": 3.4462125301361084, "learning_rate": 2.7041218167498033e-06, "epoch": 2.92 }, { "loss": 6.8421, "grad_norm": 3.3473029136657715, "learning_rate": 2.616609783845279e-06, "epoch": 2.92 }, { "loss": 6.735, "grad_norm": 4.084453582763672, "learning_rate": 2.5290977509407543e-06, "epoch": 2.92 }, { "loss": 7.4639, "grad_norm": 4.453617572784424, "learning_rate": 2.4415857180362303e-06, "epoch": 2.93 }, { "loss": 7.1289, "grad_norm": 3.5332283973693848, "learning_rate": 2.3540736851317058e-06, "epoch": 2.93 }, { "loss": 6.9, "grad_norm": 4.8280792236328125, "learning_rate": 2.2665616522271813e-06, "epoch": 2.93 }, { "loss": 7.534, "grad_norm": 4.160041809082031, "learning_rate": 2.179049619322657e-06, "epoch": 2.93 }, { "loss": 7.4365, "grad_norm": 4.7804975509643555, "learning_rate": 2.0915375864181323e-06, "epoch": 2.94 }, { "loss": 7.4622, "grad_norm": 5.68775749206543, "learning_rate": 2.0040255535136082e-06, "epoch": 2.94 }, { "loss": 6.9841, "grad_norm": Infinity, "learning_rate": 1.9252647238995363e-06, "epoch": 2.94 }, { "loss": 7.0386, "grad_norm": 3.1843979358673096, "learning_rate": 1.8377526909950118e-06, "epoch": 2.95 }, { "loss": 7.1348, "grad_norm": 3.309314489364624, "learning_rate": 1.7502406580904876e-06, "epoch": 2.95 }, { "loss": 7.5823, "grad_norm": 3.1254711151123047, "learning_rate": 1.6627286251859633e-06, "epoch": 2.95 }, { "loss": 7.4536, "grad_norm": 4.925593852996826, "learning_rate": 1.5752165922814386e-06, "epoch": 2.95 }, { "loss": 7.7934, "grad_norm": 3.3663341999053955, "learning_rate": 1.4877045593769145e-06, "epoch": 2.96 }, { "loss": 7.2564, "grad_norm": 4.316028594970703, "learning_rate": 1.40019252647239e-06, "epoch": 2.96 }, { "loss": 7.0841, "grad_norm": 3.023416757583618, "learning_rate": 1.3126804935678657e-06, "epoch": 2.96 }, { "loss": 7.0354, "grad_norm": 4.041641712188721, "learning_rate": 1.2251684606633412e-06, "epoch": 2.96 }, { "loss": 7.2408, "grad_norm": 4.036230564117432, "learning_rate": 1.137656427758817e-06, "epoch": 2.97 }, { "loss": 6.8542, "grad_norm": 3.975757360458374, "learning_rate": 1.0501443948542925e-06, "epoch": 2.97 }, { "loss": 7.384, "grad_norm": 4.212265968322754, "learning_rate": 9.626323619497682e-07, "epoch": 2.97 }, { "loss": 7.5291, "grad_norm": 4.709102630615234, "learning_rate": 8.751203290452438e-07, "epoch": 2.97 }, { "loss": 7.2415, "grad_norm": 4.24073600769043, "learning_rate": 7.876082961407193e-07, "epoch": 2.98 }, { "loss": 7.117, "grad_norm": 4.139495849609375, "learning_rate": 7.00096263236195e-07, "epoch": 2.98 }, { "loss": 7.4584, "grad_norm": 3.581001043319702, "learning_rate": 6.125842303316706e-07, "epoch": 2.98 }, { "loss": 7.0442, "grad_norm": 3.6776018142700195, "learning_rate": 5.250721974271462e-07, "epoch": 2.98 }, { "loss": 7.1353, "grad_norm": 3.3257029056549072, "learning_rate": 4.375601645226219e-07, "epoch": 2.99 }, { "loss": 7.4847, "grad_norm": 4.7782697677612305, "learning_rate": 3.500481316180975e-07, "epoch": 2.99 }, { "loss": 7.4365, "grad_norm": 3.6555185317993164, "learning_rate": 2.625360987135731e-07, "epoch": 2.99 }, { "loss": 7.0578, "grad_norm": 3.675234079360962, "learning_rate": 1.7502406580904875e-07, "epoch": 3.0 }, { "loss": 6.8152, "grad_norm": 4.117830276489258, "learning_rate": 8.751203290452438e-08, "epoch": 3.0 }, { "train_runtime": 132410.4061, "train_samples_per_second": 2.762, "train_steps_per_second": 0.086, "train_loss": 8.721853916044362, "epoch": 3.0 } ]