protgpt2-distilled-medium / training_logs.json
littleworth's picture
Upload folder using huggingface_hub
dbe753c verified
raw
history blame
No virus
169 kB
[
{
"loss": 36.4684,
"grad_norm": 1.4972327947616577,
"learning_rate": 9.991248796709548e-05,
"epoch": 0.0
},
{
"loss": 31.4702,
"grad_norm": 1.1359542608261108,
"learning_rate": 9.982497593419096e-05,
"epoch": 0.01
},
{
"loss": 27.0139,
"grad_norm": 0.8641749024391174,
"learning_rate": 9.973746390128643e-05,
"epoch": 0.01
},
{
"loss": 25.5819,
"grad_norm": 0.7870637774467468,
"learning_rate": 9.964995186838191e-05,
"epoch": 0.01
},
{
"loss": 23.737,
"grad_norm": 0.5421485900878906,
"learning_rate": 9.956243983547739e-05,
"epoch": 0.01
},
{
"loss": 22.9208,
"grad_norm": 0.6402917504310608,
"learning_rate": 9.947492780257286e-05,
"epoch": 0.02
},
{
"loss": 22.6313,
"grad_norm": 4.473743915557861,
"learning_rate": 9.938741576966833e-05,
"epoch": 0.02
},
{
"loss": 21.2445,
"grad_norm": 1.5480146408081055,
"learning_rate": 9.929990373676381e-05,
"epoch": 0.02
},
{
"loss": 21.5249,
"grad_norm": 1.6474212408065796,
"learning_rate": 9.921239170385929e-05,
"epoch": 0.02
},
{
"loss": 20.416,
"grad_norm": 5.127786636352539,
"learning_rate": 9.912487967095477e-05,
"epoch": 0.03
},
{
"loss": 20.4307,
"grad_norm": 3.4422194957733154,
"learning_rate": 9.903736763805023e-05,
"epoch": 0.03
},
{
"loss": 20.303,
"grad_norm": 4.862687587738037,
"learning_rate": 9.894985560514571e-05,
"epoch": 0.03
},
{
"loss": 18.9257,
"grad_norm": 3.3842506408691406,
"learning_rate": 9.886234357224119e-05,
"epoch": 0.03
},
{
"loss": 19.3255,
"grad_norm": 2.574763774871826,
"learning_rate": 9.877483153933667e-05,
"epoch": 0.04
},
{
"loss": 18.568,
"grad_norm": 3.362725019454956,
"learning_rate": 9.868731950643215e-05,
"epoch": 0.04
},
{
"loss": 19.2249,
"grad_norm": 2.039949417114258,
"learning_rate": 9.859980747352761e-05,
"epoch": 0.04
},
{
"loss": 18.892,
"grad_norm": 6.239978313446045,
"learning_rate": 9.851229544062309e-05,
"epoch": 0.04
},
{
"loss": 18.1108,
"grad_norm": 4.516480445861816,
"learning_rate": 9.842478340771857e-05,
"epoch": 0.05
},
{
"loss": 17.5294,
"grad_norm": 2.1653459072113037,
"learning_rate": 9.833727137481405e-05,
"epoch": 0.05
},
{
"loss": 16.4828,
"grad_norm": 3.5177245140075684,
"learning_rate": 9.824975934190951e-05,
"epoch": 0.05
},
{
"loss": 17.0135,
"grad_norm": 3.044818162918091,
"learning_rate": 9.8162247309005e-05,
"epoch": 0.06
},
{
"loss": 17.2804,
"grad_norm": 2.429781198501587,
"learning_rate": 9.807473527610047e-05,
"epoch": 0.06
},
{
"loss": 16.9488,
"grad_norm": 4.289863586425781,
"learning_rate": 9.798722324319595e-05,
"epoch": 0.06
},
{
"loss": 16.413,
"grad_norm": 2.322678804397583,
"learning_rate": 9.789971121029142e-05,
"epoch": 0.06
},
{
"loss": 15.9682,
"grad_norm": 2.0535218715667725,
"learning_rate": 9.78121991773869e-05,
"epoch": 0.07
},
{
"loss": 15.9078,
"grad_norm": 1.9909405708312988,
"learning_rate": 9.772468714448237e-05,
"epoch": 0.07
},
{
"loss": 16.3041,
"grad_norm": 2.9589896202087402,
"learning_rate": 9.763717511157785e-05,
"epoch": 0.07
},
{
"loss": 15.9803,
"grad_norm": 4.50626277923584,
"learning_rate": 9.754966307867332e-05,
"epoch": 0.07
},
{
"loss": 15.3465,
"grad_norm": 2.2582998275756836,
"learning_rate": 9.74621510457688e-05,
"epoch": 0.08
},
{
"loss": 15.8187,
"grad_norm": 1.8218069076538086,
"learning_rate": 9.737463901286428e-05,
"epoch": 0.08
},
{
"loss": 15.6423,
"grad_norm": 3.2282557487487793,
"learning_rate": 9.728712697995976e-05,
"epoch": 0.08
},
{
"loss": 15.2293,
"grad_norm": 4.152139663696289,
"learning_rate": 9.719961494705523e-05,
"epoch": 0.08
},
{
"loss": 14.9266,
"grad_norm": 2.4058563709259033,
"learning_rate": 9.71121029141507e-05,
"epoch": 0.09
},
{
"loss": 14.9444,
"grad_norm": 2.390054941177368,
"learning_rate": 9.702459088124618e-05,
"epoch": 0.09
},
{
"loss": 14.3572,
"grad_norm": 2.8402915000915527,
"learning_rate": 9.693707884834166e-05,
"epoch": 0.09
},
{
"loss": 14.2444,
"grad_norm": 3.193218469619751,
"learning_rate": 9.684956681543714e-05,
"epoch": 0.09
},
{
"loss": 14.4161,
"grad_norm": 3.0809693336486816,
"learning_rate": 9.67620547825326e-05,
"epoch": 0.1
},
{
"loss": 15.3421,
"grad_norm": 2.5853307247161865,
"learning_rate": 9.667454274962808e-05,
"epoch": 0.1
},
{
"loss": 14.4311,
"grad_norm": 4.195634365081787,
"learning_rate": 9.658703071672356e-05,
"epoch": 0.1
},
{
"loss": 14.6836,
"grad_norm": 3.293510913848877,
"learning_rate": 9.649951868381904e-05,
"epoch": 0.1
},
{
"loss": 13.8319,
"grad_norm": 2.9272682666778564,
"learning_rate": 9.64120066509145e-05,
"epoch": 0.11
},
{
"loss": 14.566,
"grad_norm": 2.871901512145996,
"learning_rate": 9.632449461800998e-05,
"epoch": 0.11
},
{
"loss": 14.1236,
"grad_norm": 5.13899564743042,
"learning_rate": 9.623698258510546e-05,
"epoch": 0.11
},
{
"loss": 14.5302,
"grad_norm": 2.652040481567383,
"learning_rate": 9.614947055220094e-05,
"epoch": 0.12
},
{
"loss": 13.8852,
"grad_norm": 2.9902889728546143,
"learning_rate": 9.606195851929642e-05,
"epoch": 0.12
},
{
"loss": 14.0293,
"grad_norm": 2.3652427196502686,
"learning_rate": 9.597444648639187e-05,
"epoch": 0.12
},
{
"loss": 14.2171,
"grad_norm": 4.3896484375,
"learning_rate": 9.588693445348735e-05,
"epoch": 0.12
},
{
"loss": 13.8609,
"grad_norm": 3.040400505065918,
"learning_rate": 9.579942242058283e-05,
"epoch": 0.13
},
{
"loss": 13.7735,
"grad_norm": 4.223784446716309,
"learning_rate": 9.571191038767831e-05,
"epoch": 0.13
},
{
"loss": 13.8871,
"grad_norm": 3.0033748149871826,
"learning_rate": 9.562439835477379e-05,
"epoch": 0.13
},
{
"loss": 13.934,
"grad_norm": 1.834015965461731,
"learning_rate": 9.553688632186925e-05,
"epoch": 0.13
},
{
"loss": 12.9919,
"grad_norm": 4.704802513122559,
"learning_rate": 9.544937428896473e-05,
"epoch": 0.14
},
{
"loss": 12.8476,
"grad_norm": 2.105950355529785,
"learning_rate": 9.536186225606021e-05,
"epoch": 0.14
},
{
"loss": 13.0775,
"grad_norm": 3.732581615447998,
"learning_rate": 9.527435022315569e-05,
"epoch": 0.14
},
{
"loss": 14.248,
"grad_norm": 3.9151251316070557,
"learning_rate": 9.518683819025115e-05,
"epoch": 0.14
},
{
"loss": 13.519,
"grad_norm": 2.424039602279663,
"learning_rate": 9.509932615734663e-05,
"epoch": 0.15
},
{
"loss": 12.9035,
"grad_norm": 2.8388330936431885,
"learning_rate": 9.501181412444211e-05,
"epoch": 0.15
},
{
"loss": 13.5495,
"grad_norm": 4.111719131469727,
"learning_rate": 9.492430209153759e-05,
"epoch": 0.15
},
{
"loss": 13.0792,
"grad_norm": 5.089598655700684,
"learning_rate": 9.483679005863306e-05,
"epoch": 0.15
},
{
"loss": 12.7144,
"grad_norm": 2.015564203262329,
"learning_rate": 9.474927802572854e-05,
"epoch": 0.16
},
{
"loss": 13.1144,
"grad_norm": 1.9832412004470825,
"learning_rate": 9.466176599282401e-05,
"epoch": 0.16
},
{
"loss": 12.9708,
"grad_norm": 1.9776819944381714,
"learning_rate": 9.457425395991949e-05,
"epoch": 0.16
},
{
"loss": 13.1545,
"grad_norm": 3.082418918609619,
"learning_rate": 9.448674192701497e-05,
"epoch": 0.17
},
{
"loss": 12.8948,
"grad_norm": 2.824528217315674,
"learning_rate": 9.439922989411044e-05,
"epoch": 0.17
},
{
"loss": 12.8128,
"grad_norm": 3.653470754623413,
"learning_rate": 9.431171786120592e-05,
"epoch": 0.17
},
{
"loss": 12.4615,
"grad_norm": 2.4570350646972656,
"learning_rate": 9.42242058283014e-05,
"epoch": 0.17
},
{
"loss": 13.7183,
"grad_norm": 1.996759057044983,
"learning_rate": 9.413669379539687e-05,
"epoch": 0.18
},
{
"loss": 13.286,
"grad_norm": 2.3849940299987793,
"learning_rate": 9.404918176249234e-05,
"epoch": 0.18
},
{
"loss": 12.6206,
"grad_norm": 3.374633550643921,
"learning_rate": 9.396166972958782e-05,
"epoch": 0.18
},
{
"loss": 13.1151,
"grad_norm": 4.5953216552734375,
"learning_rate": 9.38741576966833e-05,
"epoch": 0.18
},
{
"loss": 12.6137,
"grad_norm": 2.402780532836914,
"learning_rate": 9.378664566377878e-05,
"epoch": 0.19
},
{
"loss": 12.1367,
"grad_norm": 5.434263706207275,
"learning_rate": 9.369913363087424e-05,
"epoch": 0.19
},
{
"loss": 12.4959,
"grad_norm": 3.73447585105896,
"learning_rate": 9.361162159796972e-05,
"epoch": 0.19
},
{
"loss": 12.1629,
"grad_norm": 3.205071449279785,
"learning_rate": 9.35241095650652e-05,
"epoch": 0.19
},
{
"loss": 12.0657,
"grad_norm": 4.104920864105225,
"learning_rate": 9.343659753216068e-05,
"epoch": 0.2
},
{
"loss": 11.9909,
"grad_norm": 4.132589817047119,
"learning_rate": 9.334908549925616e-05,
"epoch": 0.2
},
{
"loss": 11.9682,
"grad_norm": 2.3729248046875,
"learning_rate": 9.326157346635162e-05,
"epoch": 0.2
},
{
"loss": 12.2075,
"grad_norm": 3.024388313293457,
"learning_rate": 9.31740614334471e-05,
"epoch": 0.2
},
{
"loss": 11.4629,
"grad_norm": 2.923081159591675,
"learning_rate": 9.308654940054258e-05,
"epoch": 0.21
},
{
"loss": 12.6273,
"grad_norm": 4.349196434020996,
"learning_rate": 9.299903736763806e-05,
"epoch": 0.21
},
{
"loss": 12.0323,
"grad_norm": 3.275175094604492,
"learning_rate": 9.291152533473352e-05,
"epoch": 0.21
},
{
"loss": 12.1019,
"grad_norm": 1.8104184865951538,
"learning_rate": 9.2824013301829e-05,
"epoch": 0.22
},
{
"loss": 12.131,
"grad_norm": 3.931492567062378,
"learning_rate": 9.273650126892448e-05,
"epoch": 0.22
},
{
"loss": 12.6479,
"grad_norm": 4.626213550567627,
"learning_rate": 9.264898923601996e-05,
"epoch": 0.22
},
{
"loss": 12.0639,
"grad_norm": 2.5656702518463135,
"learning_rate": 9.256147720311543e-05,
"epoch": 0.22
},
{
"loss": 11.9819,
"grad_norm": 3.8051023483276367,
"learning_rate": 9.24739651702109e-05,
"epoch": 0.23
},
{
"loss": 12.7138,
"grad_norm": 2.1373887062072754,
"learning_rate": 9.238645313730638e-05,
"epoch": 0.23
},
{
"loss": 12.1889,
"grad_norm": 4.774439334869385,
"learning_rate": 9.229894110440186e-05,
"epoch": 0.23
},
{
"loss": 12.3925,
"grad_norm": 3.0765390396118164,
"learning_rate": 9.221142907149734e-05,
"epoch": 0.23
},
{
"loss": 11.4008,
"grad_norm": 3.136746644973755,
"learning_rate": 9.212391703859281e-05,
"epoch": 0.24
},
{
"loss": 11.8306,
"grad_norm": 1.836838722229004,
"learning_rate": 9.203640500568829e-05,
"epoch": 0.24
},
{
"loss": 11.7773,
"grad_norm": 3.790940523147583,
"learning_rate": 9.194889297278376e-05,
"epoch": 0.24
},
{
"loss": 11.8051,
"grad_norm": 3.1878066062927246,
"learning_rate": 9.186138093987924e-05,
"epoch": 0.24
},
{
"loss": 12.5683,
"grad_norm": 3.5691912174224854,
"learning_rate": 9.177386890697471e-05,
"epoch": 0.25
},
{
"loss": 12.0541,
"grad_norm": 3.9797616004943848,
"learning_rate": 9.168635687407019e-05,
"epoch": 0.25
},
{
"loss": 11.8673,
"grad_norm": 6.183890342712402,
"learning_rate": 9.159884484116567e-05,
"epoch": 0.25
},
{
"loss": 11.3265,
"grad_norm": 3.011223316192627,
"learning_rate": 9.151133280826115e-05,
"epoch": 0.25
},
{
"loss": 11.5664,
"grad_norm": 2.2235491275787354,
"learning_rate": 9.142382077535661e-05,
"epoch": 0.26
},
{
"loss": 11.5695,
"grad_norm": 2.199366807937622,
"learning_rate": 9.133630874245209e-05,
"epoch": 0.26
},
{
"loss": 12.1804,
"grad_norm": 2.8299245834350586,
"learning_rate": 9.124879670954757e-05,
"epoch": 0.26
},
{
"loss": 11.4799,
"grad_norm": 3.164628744125366,
"learning_rate": 9.116128467664305e-05,
"epoch": 0.27
},
{
"loss": 11.4195,
"grad_norm": 4.022547245025635,
"learning_rate": 9.107377264373851e-05,
"epoch": 0.27
},
{
"loss": 11.5764,
"grad_norm": 2.569967031478882,
"learning_rate": 9.098626061083399e-05,
"epoch": 0.27
},
{
"loss": 11.4122,
"grad_norm": 2.7668631076812744,
"learning_rate": 9.089874857792947e-05,
"epoch": 0.27
},
{
"loss": 11.9738,
"grad_norm": 4.17225980758667,
"learning_rate": 9.081123654502495e-05,
"epoch": 0.28
},
{
"loss": 11.2846,
"grad_norm": 3.6021440029144287,
"learning_rate": 9.072372451212043e-05,
"epoch": 0.28
},
{
"loss": 11.885,
"grad_norm": 5.99414587020874,
"learning_rate": 9.06362124792159e-05,
"epoch": 0.28
},
{
"loss": 11.4829,
"grad_norm": 3.0609118938446045,
"learning_rate": 9.054870044631137e-05,
"epoch": 0.28
},
{
"loss": 11.2748,
"grad_norm": 3.083606243133545,
"learning_rate": 9.046118841340685e-05,
"epoch": 0.29
},
{
"loss": 10.9556,
"grad_norm": 2.1071770191192627,
"learning_rate": 9.037367638050233e-05,
"epoch": 0.29
},
{
"loss": 11.9664,
"grad_norm": 3.2089502811431885,
"learning_rate": 9.02861643475978e-05,
"epoch": 0.29
},
{
"loss": 11.0907,
"grad_norm": 2.714460611343384,
"learning_rate": 9.019865231469327e-05,
"epoch": 0.29
},
{
"loss": 11.0296,
"grad_norm": 4.843391418457031,
"learning_rate": 9.011114028178875e-05,
"epoch": 0.3
},
{
"loss": 10.6882,
"grad_norm": 2.8939428329467773,
"learning_rate": 9.002362824888423e-05,
"epoch": 0.3
},
{
"loss": 11.4392,
"grad_norm": 5.056521892547607,
"learning_rate": 8.99361162159797e-05,
"epoch": 0.3
},
{
"loss": 11.1842,
"grad_norm": 2.7797389030456543,
"learning_rate": 8.984860418307518e-05,
"epoch": 0.3
},
{
"loss": 10.97,
"grad_norm": 4.099424839019775,
"learning_rate": 8.976109215017066e-05,
"epoch": 0.31
},
{
"loss": 11.3728,
"grad_norm": 3.803455114364624,
"learning_rate": 8.967358011726613e-05,
"epoch": 0.31
},
{
"loss": 11.1566,
"grad_norm": 6.033726215362549,
"learning_rate": 8.958606808436161e-05,
"epoch": 0.31
},
{
"loss": 10.635,
"grad_norm": 3.339327335357666,
"learning_rate": 8.949855605145708e-05,
"epoch": 0.31
},
{
"loss": 11.2816,
"grad_norm": 2.3768680095672607,
"learning_rate": 8.941104401855256e-05,
"epoch": 0.32
},
{
"loss": 10.8236,
"grad_norm": 3.4453046321868896,
"learning_rate": 8.932353198564804e-05,
"epoch": 0.32
},
{
"loss": 10.9037,
"grad_norm": 3.0895841121673584,
"learning_rate": 8.923601995274352e-05,
"epoch": 0.32
},
{
"loss": 10.4346,
"grad_norm": 3.26282000541687,
"learning_rate": 8.914850791983898e-05,
"epoch": 0.33
},
{
"loss": 10.2253,
"grad_norm": 3.158858299255371,
"learning_rate": 8.906099588693446e-05,
"epoch": 0.33
},
{
"loss": 10.9327,
"grad_norm": 2.569925308227539,
"learning_rate": 8.897348385402994e-05,
"epoch": 0.33
},
{
"loss": 11.1466,
"grad_norm": 4.456540107727051,
"learning_rate": 8.888597182112542e-05,
"epoch": 0.33
},
{
"loss": 10.6713,
"grad_norm": 2.9973337650299072,
"learning_rate": 8.879845978822088e-05,
"epoch": 0.34
},
{
"loss": 10.6667,
"grad_norm": 4.433472156524658,
"learning_rate": 8.871094775531636e-05,
"epoch": 0.34
},
{
"loss": 11.0465,
"grad_norm": 3.661515474319458,
"learning_rate": 8.862343572241184e-05,
"epoch": 0.34
},
{
"loss": 10.2861,
"grad_norm": 2.8008625507354736,
"learning_rate": 8.853592368950732e-05,
"epoch": 0.34
},
{
"loss": 10.6822,
"grad_norm": 3.843266487121582,
"learning_rate": 8.84484116566028e-05,
"epoch": 0.35
},
{
"loss": 10.726,
"grad_norm": 3.4649717807769775,
"learning_rate": 8.836089962369826e-05,
"epoch": 0.35
},
{
"loss": 10.6911,
"grad_norm": 4.743326187133789,
"learning_rate": 8.827338759079374e-05,
"epoch": 0.35
},
{
"loss": 10.1019,
"grad_norm": 2.6317293643951416,
"learning_rate": 8.818587555788922e-05,
"epoch": 0.35
},
{
"loss": 10.4147,
"grad_norm": 3.893660306930542,
"learning_rate": 8.80983635249847e-05,
"epoch": 0.36
},
{
"loss": 10.5977,
"grad_norm": 2.704558849334717,
"learning_rate": 8.801085149208017e-05,
"epoch": 0.36
},
{
"loss": 10.7126,
"grad_norm": 3.4808812141418457,
"learning_rate": 8.792333945917563e-05,
"epoch": 0.36
},
{
"loss": 10.4511,
"grad_norm": 2.971688985824585,
"learning_rate": 8.783582742627111e-05,
"epoch": 0.36
},
{
"loss": 10.3621,
"grad_norm": 3.7666103839874268,
"learning_rate": 8.774831539336659e-05,
"epoch": 0.37
},
{
"loss": 10.3416,
"grad_norm": 2.951805353164673,
"learning_rate": 8.766080336046207e-05,
"epoch": 0.37
},
{
"loss": 10.5537,
"grad_norm": 3.5080454349517822,
"learning_rate": 8.757329132755753e-05,
"epoch": 0.37
},
{
"loss": 10.3536,
"grad_norm": 3.521519660949707,
"learning_rate": 8.748577929465301e-05,
"epoch": 0.38
},
{
"loss": 10.3231,
"grad_norm": 3.646610736846924,
"learning_rate": 8.739826726174849e-05,
"epoch": 0.38
},
{
"loss": 10.6433,
"grad_norm": 3.4696707725524902,
"learning_rate": 8.731075522884397e-05,
"epoch": 0.38
},
{
"loss": 10.1692,
"grad_norm": 3.852370500564575,
"learning_rate": 8.722324319593944e-05,
"epoch": 0.38
},
{
"loss": 9.7841,
"grad_norm": 3.693451404571533,
"learning_rate": 8.713573116303491e-05,
"epoch": 0.39
},
{
"loss": 10.7817,
"grad_norm": 3.032994508743286,
"learning_rate": 8.704821913013039e-05,
"epoch": 0.39
},
{
"loss": 10.7416,
"grad_norm": 3.537693500518799,
"learning_rate": 8.696070709722587e-05,
"epoch": 0.39
},
{
"loss": 9.9999,
"grad_norm": 2.624573230743408,
"learning_rate": 8.687319506432135e-05,
"epoch": 0.39
},
{
"loss": 10.3777,
"grad_norm": 2.453648328781128,
"learning_rate": 8.678568303141682e-05,
"epoch": 0.4
},
{
"loss": 10.2463,
"grad_norm": 3.5459659099578857,
"learning_rate": 8.66981709985123e-05,
"epoch": 0.4
},
{
"loss": 9.69,
"grad_norm": 2.9005305767059326,
"learning_rate": 8.661065896560777e-05,
"epoch": 0.4
},
{
"loss": 10.5555,
"grad_norm": 4.305134296417236,
"learning_rate": 8.652314693270325e-05,
"epoch": 0.4
},
{
"loss": 11.4746,
"grad_norm": 3.8746566772460938,
"learning_rate": 8.643563489979872e-05,
"epoch": 0.41
},
{
"loss": 10.4112,
"grad_norm": 3.0006351470947266,
"learning_rate": 8.63481228668942e-05,
"epoch": 0.41
},
{
"loss": 10.7159,
"grad_norm": 3.4273717403411865,
"learning_rate": 8.626061083398968e-05,
"epoch": 0.41
},
{
"loss": 9.8815,
"grad_norm": 3.3976597785949707,
"learning_rate": 8.617309880108515e-05,
"epoch": 0.41
},
{
"loss": 10.0256,
"grad_norm": 4.364745140075684,
"learning_rate": 8.608558676818062e-05,
"epoch": 0.42
},
{
"loss": 10.2598,
"grad_norm": 2.6873209476470947,
"learning_rate": 8.59980747352761e-05,
"epoch": 0.42
},
{
"loss": 10.3362,
"grad_norm": 4.00089693069458,
"learning_rate": 8.591056270237158e-05,
"epoch": 0.42
},
{
"loss": 10.2189,
"grad_norm": 2.858186721801758,
"learning_rate": 8.582305066946706e-05,
"epoch": 0.43
},
{
"loss": 10.3866,
"grad_norm": 3.203000783920288,
"learning_rate": 8.573553863656252e-05,
"epoch": 0.43
},
{
"loss": 10.0813,
"grad_norm": 3.210279941558838,
"learning_rate": 8.5648026603658e-05,
"epoch": 0.43
},
{
"loss": 10.5642,
"grad_norm": 3.2169432640075684,
"learning_rate": 8.556051457075348e-05,
"epoch": 0.43
},
{
"loss": 9.8901,
"grad_norm": 3.107404947280884,
"learning_rate": 8.547300253784896e-05,
"epoch": 0.44
},
{
"loss": 10.1058,
"grad_norm": 2.7491989135742188,
"learning_rate": 8.538549050494444e-05,
"epoch": 0.44
},
{
"loss": 10.1777,
"grad_norm": 3.140073299407959,
"learning_rate": 8.52979784720399e-05,
"epoch": 0.44
},
{
"loss": 9.4428,
"grad_norm": 3.9033658504486084,
"learning_rate": 8.521046643913538e-05,
"epoch": 0.44
},
{
"loss": 10.4304,
"grad_norm": 3.4388954639434814,
"learning_rate": 8.512295440623086e-05,
"epoch": 0.45
},
{
"loss": 9.7865,
"grad_norm": 2.7577993869781494,
"learning_rate": 8.503544237332634e-05,
"epoch": 0.45
},
{
"loss": 10.5389,
"grad_norm": 4.365457534790039,
"learning_rate": 8.49479303404218e-05,
"epoch": 0.45
},
{
"loss": 9.6268,
"grad_norm": 4.908252239227295,
"learning_rate": 8.486041830751728e-05,
"epoch": 0.45
},
{
"loss": 9.8142,
"grad_norm": 3.5492117404937744,
"learning_rate": 8.477290627461276e-05,
"epoch": 0.46
},
{
"loss": 9.1744,
"grad_norm": 3.34104061126709,
"learning_rate": 8.468539424170824e-05,
"epoch": 0.46
},
{
"loss": 9.793,
"grad_norm": 5.443964958190918,
"learning_rate": 8.459788220880371e-05,
"epoch": 0.46
},
{
"loss": 9.6955,
"grad_norm": 3.092270851135254,
"learning_rate": 8.451037017589919e-05,
"epoch": 0.46
},
{
"loss": 9.7381,
"grad_norm": 3.322415828704834,
"learning_rate": 8.442285814299467e-05,
"epoch": 0.47
},
{
"loss": 10.1758,
"grad_norm": 3.5836918354034424,
"learning_rate": 8.433534611009014e-05,
"epoch": 0.47
},
{
"loss": 10.0565,
"grad_norm": 4.64646053314209,
"learning_rate": 8.424783407718562e-05,
"epoch": 0.47
},
{
"loss": 9.3562,
"grad_norm": 2.8691656589508057,
"learning_rate": 8.416032204428109e-05,
"epoch": 0.48
},
{
"loss": 10.1164,
"grad_norm": 2.6130857467651367,
"learning_rate": 8.407281001137657e-05,
"epoch": 0.48
},
{
"loss": 9.1353,
"grad_norm": 2.950364112854004,
"learning_rate": 8.398529797847205e-05,
"epoch": 0.48
},
{
"loss": 9.2614,
"grad_norm": 3.1866071224212646,
"learning_rate": 8.389778594556752e-05,
"epoch": 0.48
},
{
"loss": 9.7335,
"grad_norm": 3.584228038787842,
"learning_rate": 8.381027391266299e-05,
"epoch": 0.49
},
{
"loss": 9.7923,
"grad_norm": 2.941434860229492,
"learning_rate": 8.372276187975847e-05,
"epoch": 0.49
},
{
"loss": 9.6495,
"grad_norm": 3.9578118324279785,
"learning_rate": 8.363524984685395e-05,
"epoch": 0.49
},
{
"loss": 9.7038,
"grad_norm": 3.197563648223877,
"learning_rate": 8.354773781394943e-05,
"epoch": 0.49
},
{
"loss": 9.9406,
"grad_norm": 3.8146650791168213,
"learning_rate": 8.346022578104489e-05,
"epoch": 0.5
},
{
"loss": 9.8941,
"grad_norm": 3.293826103210449,
"learning_rate": 8.337271374814037e-05,
"epoch": 0.5
},
{
"loss": 9.463,
"grad_norm": 2.8410701751708984,
"learning_rate": 8.328520171523585e-05,
"epoch": 0.5
},
{
"loss": 9.9774,
"grad_norm": 4.301900386810303,
"learning_rate": 8.319768968233133e-05,
"epoch": 0.5
},
{
"loss": 9.8438,
"grad_norm": 3.798737049102783,
"learning_rate": 8.311017764942681e-05,
"epoch": 0.51
},
{
"loss": 9.9238,
"grad_norm": 3.634910821914673,
"learning_rate": 8.302266561652227e-05,
"epoch": 0.51
},
{
"loss": 9.3031,
"grad_norm": 4.557560443878174,
"learning_rate": 8.293515358361775e-05,
"epoch": 0.51
},
{
"loss": 9.7714,
"grad_norm": 3.100658893585205,
"learning_rate": 8.284764155071323e-05,
"epoch": 0.51
},
{
"loss": 9.6701,
"grad_norm": 3.0376410484313965,
"learning_rate": 8.276012951780871e-05,
"epoch": 0.52
},
{
"loss": 9.7965,
"grad_norm": 2.64803147315979,
"learning_rate": 8.267261748490418e-05,
"epoch": 0.52
},
{
"loss": 9.3502,
"grad_norm": 3.5259008407592773,
"learning_rate": 8.258510545199965e-05,
"epoch": 0.52
},
{
"loss": 9.3924,
"grad_norm": 3.604329824447632,
"learning_rate": 8.249759341909513e-05,
"epoch": 0.52
},
{
"loss": 9.1481,
"grad_norm": 2.6112060546875,
"learning_rate": 8.241008138619061e-05,
"epoch": 0.53
},
{
"loss": 10.2096,
"grad_norm": 3.718703031539917,
"learning_rate": 8.232256935328608e-05,
"epoch": 0.53
},
{
"loss": 9.0669,
"grad_norm": 4.43959903717041,
"learning_rate": 8.223505732038156e-05,
"epoch": 0.53
},
{
"loss": 9.2528,
"grad_norm": 3.4342939853668213,
"learning_rate": 8.214754528747703e-05,
"epoch": 0.54
},
{
"loss": 9.4184,
"grad_norm": 4.191211700439453,
"learning_rate": 8.206003325457251e-05,
"epoch": 0.54
},
{
"loss": 9.2057,
"grad_norm": 3.076712131500244,
"learning_rate": 8.197252122166799e-05,
"epoch": 0.54
},
{
"loss": 9.3174,
"grad_norm": 3.668440341949463,
"learning_rate": 8.188500918876346e-05,
"epoch": 0.54
},
{
"loss": 9.8369,
"grad_norm": 3.419703483581543,
"learning_rate": 8.179749715585894e-05,
"epoch": 0.55
},
{
"loss": 9.2614,
"grad_norm": 4.150201797485352,
"learning_rate": 8.170998512295442e-05,
"epoch": 0.55
},
{
"loss": 9.1372,
"grad_norm": 3.2589640617370605,
"learning_rate": 8.16224730900499e-05,
"epoch": 0.55
},
{
"loss": 9.4359,
"grad_norm": 3.1012041568756104,
"learning_rate": 8.153496105714536e-05,
"epoch": 0.55
},
{
"loss": 9.6403,
"grad_norm": 3.882509708404541,
"learning_rate": 8.144744902424084e-05,
"epoch": 0.56
},
{
"loss": 9.6448,
"grad_norm": 2.656543254852295,
"learning_rate": 8.135993699133632e-05,
"epoch": 0.56
},
{
"loss": 9.5709,
"grad_norm": 3.2577645778656006,
"learning_rate": 8.12724249584318e-05,
"epoch": 0.56
},
{
"loss": 9.4838,
"grad_norm": 2.737210512161255,
"learning_rate": 8.118491292552726e-05,
"epoch": 0.56
},
{
"loss": 9.0991,
"grad_norm": 2.2185497283935547,
"learning_rate": 8.109740089262274e-05,
"epoch": 0.57
},
{
"loss": 9.1741,
"grad_norm": 2.766544818878174,
"learning_rate": 8.100988885971822e-05,
"epoch": 0.57
},
{
"loss": 9.9664,
"grad_norm": 3.627641201019287,
"learning_rate": 8.09223768268137e-05,
"epoch": 0.57
},
{
"loss": 9.3467,
"grad_norm": 3.600707769393921,
"learning_rate": 8.083486479390916e-05,
"epoch": 0.57
},
{
"loss": 9.8328,
"grad_norm": 5.097866058349609,
"learning_rate": 8.074735276100464e-05,
"epoch": 0.58
},
{
"loss": 9.289,
"grad_norm": 3.3913521766662598,
"learning_rate": 8.065984072810012e-05,
"epoch": 0.58
},
{
"loss": 9.2046,
"grad_norm": 3.586367130279541,
"learning_rate": 8.05723286951956e-05,
"epoch": 0.58
},
{
"loss": 9.1802,
"grad_norm": 5.786179542541504,
"learning_rate": 8.048481666229108e-05,
"epoch": 0.59
},
{
"loss": 9.6482,
"grad_norm": 3.158339023590088,
"learning_rate": 8.039730462938655e-05,
"epoch": 0.59
},
{
"loss": 9.0124,
"grad_norm": 3.3116583824157715,
"learning_rate": 8.030979259648202e-05,
"epoch": 0.59
},
{
"loss": 8.7543,
"grad_norm": 2.555194616317749,
"learning_rate": 8.02222805635775e-05,
"epoch": 0.59
},
{
"loss": 9.043,
"grad_norm": 3.2205519676208496,
"learning_rate": 8.013476853067298e-05,
"epoch": 0.6
},
{
"loss": 9.5348,
"grad_norm": 3.4175057411193848,
"learning_rate": 8.004725649776845e-05,
"epoch": 0.6
},
{
"loss": 9.177,
"grad_norm": 4.694581985473633,
"learning_rate": 7.995974446486391e-05,
"epoch": 0.6
},
{
"loss": 9.2863,
"grad_norm": 2.7787346839904785,
"learning_rate": 7.987223243195939e-05,
"epoch": 0.6
},
{
"loss": 8.6984,
"grad_norm": 3.4298195838928223,
"learning_rate": 7.978472039905487e-05,
"epoch": 0.61
},
{
"loss": 9.0926,
"grad_norm": 4.21417760848999,
"learning_rate": 7.969720836615035e-05,
"epoch": 0.61
},
{
"loss": 8.8851,
"grad_norm": 3.0844244956970215,
"learning_rate": 7.960969633324581e-05,
"epoch": 0.61
},
{
"loss": 9.7628,
"grad_norm": 3.0156939029693604,
"learning_rate": 7.95221843003413e-05,
"epoch": 0.61
},
{
"loss": 8.6529,
"grad_norm": 3.9500784873962402,
"learning_rate": 7.943467226743677e-05,
"epoch": 0.62
},
{
"loss": 9.0502,
"grad_norm": 4.802796840667725,
"learning_rate": 7.934716023453225e-05,
"epoch": 0.62
},
{
"loss": 8.6458,
"grad_norm": 4.273401260375977,
"learning_rate": 7.925964820162772e-05,
"epoch": 0.62
},
{
"loss": 8.8679,
"grad_norm": 4.070954322814941,
"learning_rate": 7.91721361687232e-05,
"epoch": 0.62
},
{
"loss": 8.8355,
"grad_norm": 3.3995022773742676,
"learning_rate": 7.908462413581867e-05,
"epoch": 0.63
},
{
"loss": 8.7255,
"grad_norm": 2.974888801574707,
"learning_rate": 7.899711210291415e-05,
"epoch": 0.63
},
{
"loss": 9.6256,
"grad_norm": 2.521350145339966,
"learning_rate": 7.890960007000963e-05,
"epoch": 0.63
},
{
"loss": 8.9677,
"grad_norm": 2.659583330154419,
"learning_rate": 7.88220880371051e-05,
"epoch": 0.64
},
{
"loss": 9.339,
"grad_norm": 4.531926155090332,
"learning_rate": 7.873457600420058e-05,
"epoch": 0.64
},
{
"loss": 9.469,
"grad_norm": 3.573625087738037,
"learning_rate": 7.864706397129606e-05,
"epoch": 0.64
},
{
"loss": 9.2697,
"grad_norm": 3.5155880451202393,
"learning_rate": 7.855955193839153e-05,
"epoch": 0.64
},
{
"loss": 8.5749,
"grad_norm": 3.201718330383301,
"learning_rate": 7.8472039905487e-05,
"epoch": 0.65
},
{
"loss": 8.9228,
"grad_norm": 3.8670506477355957,
"learning_rate": 7.838452787258248e-05,
"epoch": 0.65
},
{
"loss": 9.5243,
"grad_norm": 3.4351415634155273,
"learning_rate": 7.829701583967796e-05,
"epoch": 0.65
},
{
"loss": 8.7689,
"grad_norm": 4.182631492614746,
"learning_rate": 7.820950380677344e-05,
"epoch": 0.65
},
{
"loss": 9.2565,
"grad_norm": 3.6523499488830566,
"learning_rate": 7.81219917738689e-05,
"epoch": 0.66
},
{
"loss": 8.9147,
"grad_norm": 3.6572344303131104,
"learning_rate": 7.803447974096438e-05,
"epoch": 0.66
},
{
"loss": 8.6875,
"grad_norm": 4.45376443862915,
"learning_rate": 7.794696770805986e-05,
"epoch": 0.66
},
{
"loss": 9.782,
"grad_norm": 4.446099758148193,
"learning_rate": 7.785945567515534e-05,
"epoch": 0.66
},
{
"loss": 9.0401,
"grad_norm": 3.134500026702881,
"learning_rate": 7.777194364225082e-05,
"epoch": 0.67
},
{
"loss": 9.3041,
"grad_norm": 4.3101325035095215,
"learning_rate": 7.768443160934628e-05,
"epoch": 0.67
},
{
"loss": 8.3818,
"grad_norm": 2.935241222381592,
"learning_rate": 7.759691957644176e-05,
"epoch": 0.67
},
{
"loss": 9.3778,
"grad_norm": 3.966174364089966,
"learning_rate": 7.750940754353724e-05,
"epoch": 0.67
},
{
"loss": 9.0559,
"grad_norm": 3.758314609527588,
"learning_rate": 7.742189551063272e-05,
"epoch": 0.68
},
{
"loss": 8.5828,
"grad_norm": 3.2531213760375977,
"learning_rate": 7.733438347772818e-05,
"epoch": 0.68
},
{
"loss": 8.6358,
"grad_norm": 3.9096357822418213,
"learning_rate": 7.724687144482366e-05,
"epoch": 0.68
},
{
"loss": 9.0841,
"grad_norm": 2.787165641784668,
"learning_rate": 7.715935941191914e-05,
"epoch": 0.69
},
{
"loss": 8.7611,
"grad_norm": 3.6336965560913086,
"learning_rate": 7.707184737901462e-05,
"epoch": 0.69
},
{
"loss": 9.3819,
"grad_norm": 4.785186290740967,
"learning_rate": 7.698433534611009e-05,
"epoch": 0.69
},
{
"loss": 9.3396,
"grad_norm": 3.7301132678985596,
"learning_rate": 7.689682331320557e-05,
"epoch": 0.69
},
{
"loss": 8.7932,
"grad_norm": 3.769679307937622,
"learning_rate": 7.680931128030104e-05,
"epoch": 0.7
},
{
"loss": 9.5408,
"grad_norm": 3.249382257461548,
"learning_rate": 7.672179924739652e-05,
"epoch": 0.7
},
{
"loss": 9.1383,
"grad_norm": 3.562981128692627,
"learning_rate": 7.6634287214492e-05,
"epoch": 0.7
},
{
"loss": 8.5737,
"grad_norm": 3.2148962020874023,
"learning_rate": 7.654677518158747e-05,
"epoch": 0.7
},
{
"loss": 8.5483,
"grad_norm": 2.9571826457977295,
"learning_rate": 7.645926314868295e-05,
"epoch": 0.71
},
{
"loss": 8.9157,
"grad_norm": 3.3202896118164062,
"learning_rate": 7.637175111577843e-05,
"epoch": 0.71
},
{
"loss": 8.9654,
"grad_norm": 4.197299957275391,
"learning_rate": 7.62842390828739e-05,
"epoch": 0.71
},
{
"loss": 9.642,
"grad_norm": 2.9648005962371826,
"learning_rate": 7.619672704996937e-05,
"epoch": 0.71
},
{
"loss": 9.5579,
"grad_norm": 2.793729066848755,
"learning_rate": 7.610921501706485e-05,
"epoch": 0.72
},
{
"loss": 9.0535,
"grad_norm": 3.039337158203125,
"learning_rate": 7.602170298416033e-05,
"epoch": 0.72
},
{
"loss": 8.4261,
"grad_norm": 3.472973346710205,
"learning_rate": 7.59341909512558e-05,
"epoch": 0.72
},
{
"loss": 8.5207,
"grad_norm": 2.588060140609741,
"learning_rate": 7.584667891835127e-05,
"epoch": 0.72
},
{
"loss": 9.4719,
"grad_norm": 3.702918529510498,
"learning_rate": 7.575916688544675e-05,
"epoch": 0.73
},
{
"loss": 8.2056,
"grad_norm": 3.087986946105957,
"learning_rate": 7.567165485254223e-05,
"epoch": 0.73
},
{
"loss": 8.9777,
"grad_norm": 3.231987476348877,
"learning_rate": 7.558414281963771e-05,
"epoch": 0.73
},
{
"loss": 8.946,
"grad_norm": 3.1620264053344727,
"learning_rate": 7.549663078673317e-05,
"epoch": 0.73
},
{
"loss": 9.3374,
"grad_norm": 3.0438194274902344,
"learning_rate": 7.540911875382865e-05,
"epoch": 0.74
},
{
"loss": 8.4711,
"grad_norm": 3.3557493686676025,
"learning_rate": 7.532160672092413e-05,
"epoch": 0.74
},
{
"loss": 8.5348,
"grad_norm": 3.693506956100464,
"learning_rate": 7.523409468801961e-05,
"epoch": 0.74
},
{
"loss": 9.2969,
"grad_norm": 4.126795291900635,
"learning_rate": 7.514658265511509e-05,
"epoch": 0.75
},
{
"loss": 8.6828,
"grad_norm": 3.4798762798309326,
"learning_rate": 7.505907062221055e-05,
"epoch": 0.75
},
{
"loss": 9.885,
"grad_norm": 3.5834882259368896,
"learning_rate": 7.497155858930603e-05,
"epoch": 0.75
},
{
"loss": 8.7332,
"grad_norm": 3.054962396621704,
"learning_rate": 7.488404655640151e-05,
"epoch": 0.75
},
{
"loss": 8.8859,
"grad_norm": 2.3702313899993896,
"learning_rate": 7.479653452349699e-05,
"epoch": 0.76
},
{
"loss": 8.9641,
"grad_norm": 3.573233127593994,
"learning_rate": 7.470902249059246e-05,
"epoch": 0.76
},
{
"loss": 9.0286,
"grad_norm": 2.7246625423431396,
"learning_rate": 7.462151045768794e-05,
"epoch": 0.76
},
{
"loss": 8.4259,
"grad_norm": 3.090899705886841,
"learning_rate": 7.453399842478341e-05,
"epoch": 0.76
},
{
"loss": 8.5746,
"grad_norm": 2.8535008430480957,
"learning_rate": 7.444648639187889e-05,
"epoch": 0.77
},
{
"loss": 9.0497,
"grad_norm": 3.7636609077453613,
"learning_rate": 7.435897435897436e-05,
"epoch": 0.77
},
{
"loss": 8.7239,
"grad_norm": 3.038818597793579,
"learning_rate": 7.427146232606984e-05,
"epoch": 0.77
},
{
"loss": 9.0842,
"grad_norm": 3.275329351425171,
"learning_rate": 7.418395029316532e-05,
"epoch": 0.77
},
{
"loss": 8.9054,
"grad_norm": 2.4956889152526855,
"learning_rate": 7.40964382602608e-05,
"epoch": 0.78
},
{
"loss": 8.8721,
"grad_norm": 2.9423913955688477,
"learning_rate": 7.400892622735627e-05,
"epoch": 0.78
},
{
"loss": 8.9035,
"grad_norm": 4.2211785316467285,
"learning_rate": 7.392141419445174e-05,
"epoch": 0.78
},
{
"loss": 8.9983,
"grad_norm": 3.3558285236358643,
"learning_rate": 7.383390216154722e-05,
"epoch": 0.78
},
{
"loss": 8.4671,
"grad_norm": 3.1967856884002686,
"learning_rate": 7.37463901286427e-05,
"epoch": 0.79
},
{
"loss": 8.4608,
"grad_norm": 3.5259337425231934,
"learning_rate": 7.365887809573818e-05,
"epoch": 0.79
},
{
"loss": 9.1684,
"grad_norm": 3.226388692855835,
"learning_rate": 7.357136606283364e-05,
"epoch": 0.79
},
{
"loss": 8.4309,
"grad_norm": 3.7550246715545654,
"learning_rate": 7.348385402992912e-05,
"epoch": 0.8
},
{
"loss": 8.4427,
"grad_norm": 4.338967800140381,
"learning_rate": 7.33963419970246e-05,
"epoch": 0.8
},
{
"loss": 8.9643,
"grad_norm": 3.764723777770996,
"learning_rate": 7.330882996412008e-05,
"epoch": 0.8
},
{
"loss": 8.346,
"grad_norm": 3.2704851627349854,
"learning_rate": 7.322131793121554e-05,
"epoch": 0.8
},
{
"loss": 9.3154,
"grad_norm": 2.8961048126220703,
"learning_rate": 7.313380589831102e-05,
"epoch": 0.81
},
{
"loss": 8.9944,
"grad_norm": 3.3732376098632812,
"learning_rate": 7.30462938654065e-05,
"epoch": 0.81
},
{
"loss": 8.4122,
"grad_norm": 3.7773525714874268,
"learning_rate": 7.295878183250198e-05,
"epoch": 0.81
},
{
"loss": 9.2683,
"grad_norm": 3.716183662414551,
"learning_rate": 7.287126979959746e-05,
"epoch": 0.81
},
{
"loss": 8.6879,
"grad_norm": 3.6532111167907715,
"learning_rate": 7.278375776669292e-05,
"epoch": 0.82
},
{
"loss": 8.6653,
"grad_norm": 3.4833829402923584,
"learning_rate": 7.26962457337884e-05,
"epoch": 0.82
},
{
"loss": 8.8866,
"grad_norm": 4.338618278503418,
"learning_rate": 7.260873370088388e-05,
"epoch": 0.82
},
{
"loss": 8.9239,
"grad_norm": 3.3099405765533447,
"learning_rate": 7.252122166797936e-05,
"epoch": 0.82
},
{
"loss": 8.8475,
"grad_norm": 3.2691259384155273,
"learning_rate": 7.243370963507483e-05,
"epoch": 0.83
},
{
"loss": 9.0336,
"grad_norm": 3.4680464267730713,
"learning_rate": 7.23461976021703e-05,
"epoch": 0.83
},
{
"loss": 9.1274,
"grad_norm": 3.6281306743621826,
"learning_rate": 7.225868556926578e-05,
"epoch": 0.83
},
{
"loss": 8.557,
"grad_norm": 3.129265546798706,
"learning_rate": 7.217117353636126e-05,
"epoch": 0.83
},
{
"loss": 8.2588,
"grad_norm": 2.781096935272217,
"learning_rate": 7.208366150345673e-05,
"epoch": 0.84
},
{
"loss": 8.6785,
"grad_norm": 2.085003137588501,
"learning_rate": 7.19961494705522e-05,
"epoch": 0.84
},
{
"loss": 8.2642,
"grad_norm": 3.204002618789673,
"learning_rate": 7.190863743764767e-05,
"epoch": 0.84
},
{
"loss": 7.9569,
"grad_norm": 2.3955442905426025,
"learning_rate": 7.182112540474315e-05,
"epoch": 0.85
},
{
"loss": 8.3701,
"grad_norm": 3.8179776668548584,
"learning_rate": 7.173361337183863e-05,
"epoch": 0.85
},
{
"loss": 8.4552,
"grad_norm": 3.572737216949463,
"learning_rate": 7.16461013389341e-05,
"epoch": 0.85
},
{
"loss": 8.5619,
"grad_norm": 3.7918508052825928,
"learning_rate": 7.155858930602957e-05,
"epoch": 0.85
},
{
"loss": 8.9817,
"grad_norm": 3.5314037799835205,
"learning_rate": 7.147107727312505e-05,
"epoch": 0.86
},
{
"loss": 8.3192,
"grad_norm": 3.137615442276001,
"learning_rate": 7.138356524022053e-05,
"epoch": 0.86
},
{
"loss": 9.0373,
"grad_norm": 3.58949613571167,
"learning_rate": 7.129605320731601e-05,
"epoch": 0.86
},
{
"loss": 8.5971,
"grad_norm": 3.062047243118286,
"learning_rate": 7.120854117441148e-05,
"epoch": 0.86
},
{
"loss": 8.334,
"grad_norm": 3.8008644580841064,
"learning_rate": 7.112102914150696e-05,
"epoch": 0.87
},
{
"loss": 8.4934,
"grad_norm": 3.4640395641326904,
"learning_rate": 7.103351710860243e-05,
"epoch": 0.87
},
{
"loss": 8.5304,
"grad_norm": 3.6595981121063232,
"learning_rate": 7.094600507569791e-05,
"epoch": 0.87
},
{
"loss": 8.6947,
"grad_norm": 4.255160331726074,
"learning_rate": 7.085849304279338e-05,
"epoch": 0.87
},
{
"loss": 8.5373,
"grad_norm": 2.957233428955078,
"learning_rate": 7.077098100988886e-05,
"epoch": 0.88
},
{
"loss": 8.919,
"grad_norm": 4.1049933433532715,
"learning_rate": 7.068346897698434e-05,
"epoch": 0.88
},
{
"loss": 8.6536,
"grad_norm": 3.6588120460510254,
"learning_rate": 7.059595694407982e-05,
"epoch": 0.88
},
{
"loss": 8.1786,
"grad_norm": 2.536498785018921,
"learning_rate": 7.050844491117528e-05,
"epoch": 0.88
},
{
"loss": 8.9825,
"grad_norm": 3.442955255508423,
"learning_rate": 7.042093287827076e-05,
"epoch": 0.89
},
{
"loss": 8.7311,
"grad_norm": 2.923522710800171,
"learning_rate": 7.033342084536624e-05,
"epoch": 0.89
},
{
"loss": 8.276,
"grad_norm": 3.6458773612976074,
"learning_rate": 7.024590881246172e-05,
"epoch": 0.89
},
{
"loss": 8.5271,
"grad_norm": 3.239694356918335,
"learning_rate": 7.01583967795572e-05,
"epoch": 0.9
},
{
"loss": 8.2755,
"grad_norm": 3.39280366897583,
"learning_rate": 7.007088474665266e-05,
"epoch": 0.9
},
{
"loss": 9.2045,
"grad_norm": 3.4279630184173584,
"learning_rate": 6.998337271374814e-05,
"epoch": 0.9
},
{
"loss": 8.5647,
"grad_norm": 2.416999578475952,
"learning_rate": 6.989586068084362e-05,
"epoch": 0.9
},
{
"loss": 8.7283,
"grad_norm": 2.8094992637634277,
"learning_rate": 6.98083486479391e-05,
"epoch": 0.91
},
{
"loss": 8.6518,
"grad_norm": 4.319655418395996,
"learning_rate": 6.972083661503456e-05,
"epoch": 0.91
},
{
"loss": 8.2852,
"grad_norm": 3.9317190647125244,
"learning_rate": 6.963332458213004e-05,
"epoch": 0.91
},
{
"loss": 8.6298,
"grad_norm": 4.8585405349731445,
"learning_rate": 6.954581254922552e-05,
"epoch": 0.91
},
{
"loss": 8.2455,
"grad_norm": 2.6159684658050537,
"learning_rate": 6.9458300516321e-05,
"epoch": 0.92
},
{
"loss": 8.7473,
"grad_norm": 2.344099521636963,
"learning_rate": 6.937078848341647e-05,
"epoch": 0.92
},
{
"loss": 7.9803,
"grad_norm": 3.1866703033447266,
"learning_rate": 6.928327645051194e-05,
"epoch": 0.92
},
{
"loss": 8.79,
"grad_norm": 3.943319320678711,
"learning_rate": 6.919576441760742e-05,
"epoch": 0.92
},
{
"loss": 8.8112,
"grad_norm": 2.919020891189575,
"learning_rate": 6.91082523847029e-05,
"epoch": 0.93
},
{
"loss": 8.5764,
"grad_norm": 3.47027325630188,
"learning_rate": 6.902074035179837e-05,
"epoch": 0.93
},
{
"loss": 9.4408,
"grad_norm": 3.2260677814483643,
"learning_rate": 6.893322831889385e-05,
"epoch": 0.93
},
{
"loss": 8.9655,
"grad_norm": 3.2517478466033936,
"learning_rate": 6.884571628598933e-05,
"epoch": 0.93
},
{
"loss": 8.8759,
"grad_norm": 4.705760478973389,
"learning_rate": 6.87582042530848e-05,
"epoch": 0.94
},
{
"loss": 8.8426,
"grad_norm": 2.7460803985595703,
"learning_rate": 6.867069222018028e-05,
"epoch": 0.94
},
{
"loss": 8.6733,
"grad_norm": 3.944464921951294,
"learning_rate": 6.858318018727575e-05,
"epoch": 0.94
},
{
"loss": 8.56,
"grad_norm": 3.393721342086792,
"learning_rate": 6.849566815437123e-05,
"epoch": 0.94
},
{
"loss": 7.8694,
"grad_norm": 2.579340696334839,
"learning_rate": 6.84081561214667e-05,
"epoch": 0.95
},
{
"loss": 8.2998,
"grad_norm": 3.6678457260131836,
"learning_rate": 6.832064408856219e-05,
"epoch": 0.95
},
{
"loss": 8.5947,
"grad_norm": 3.218284845352173,
"learning_rate": 6.823313205565765e-05,
"epoch": 0.95
},
{
"loss": 8.5014,
"grad_norm": 3.5185766220092773,
"learning_rate": 6.814562002275313e-05,
"epoch": 0.96
},
{
"loss": 9.1655,
"grad_norm": 3.5601882934570312,
"learning_rate": 6.805810798984861e-05,
"epoch": 0.96
},
{
"loss": 8.6889,
"grad_norm": 3.317361354827881,
"learning_rate": 6.797059595694409e-05,
"epoch": 0.96
},
{
"loss": 8.9274,
"grad_norm": 3.271773338317871,
"learning_rate": 6.788308392403955e-05,
"epoch": 0.96
},
{
"loss": 8.733,
"grad_norm": 3.0022764205932617,
"learning_rate": 6.779557189113503e-05,
"epoch": 0.97
},
{
"loss": 8.5665,
"grad_norm": 3.5991992950439453,
"learning_rate": 6.770805985823051e-05,
"epoch": 0.97
},
{
"loss": 8.3149,
"grad_norm": 3.060124158859253,
"learning_rate": 6.762054782532599e-05,
"epoch": 0.97
},
{
"loss": 7.9419,
"grad_norm": 3.116497278213501,
"learning_rate": 6.753303579242147e-05,
"epoch": 0.97
},
{
"loss": 8.456,
"grad_norm": 3.201129198074341,
"learning_rate": 6.744552375951693e-05,
"epoch": 0.98
},
{
"loss": 8.5843,
"grad_norm": 3.7871932983398438,
"learning_rate": 6.735801172661241e-05,
"epoch": 0.98
},
{
"loss": 8.1042,
"grad_norm": 3.8025078773498535,
"learning_rate": 6.727049969370789e-05,
"epoch": 0.98
},
{
"loss": 9.0008,
"grad_norm": 2.9040756225585938,
"learning_rate": 6.718298766080337e-05,
"epoch": 0.98
},
{
"loss": 7.971,
"grad_norm": 5.227065086364746,
"learning_rate": 6.709547562789884e-05,
"epoch": 0.99
},
{
"loss": 9.1628,
"grad_norm": 2.822517156600952,
"learning_rate": 6.700796359499431e-05,
"epoch": 0.99
},
{
"loss": 7.7244,
"grad_norm": 2.9904367923736572,
"learning_rate": 6.69204515620898e-05,
"epoch": 0.99
},
{
"loss": 8.0893,
"grad_norm": 4.167274475097656,
"learning_rate": 6.683293952918527e-05,
"epoch": 0.99
},
{
"loss": 8.3331,
"grad_norm": 3.8134043216705322,
"learning_rate": 6.674542749628074e-05,
"epoch": 1.0
},
{
"loss": 7.8118,
"grad_norm": 3.0692172050476074,
"learning_rate": 6.665791546337622e-05,
"epoch": 1.0
},
{
"loss": 8.4811,
"grad_norm": 4.14231014251709,
"learning_rate": 6.65704034304717e-05,
"epoch": 1.0
},
{
"loss": 8.0494,
"grad_norm": 3.4583489894866943,
"learning_rate": 6.648289139756717e-05,
"epoch": 1.01
},
{
"loss": 8.2668,
"grad_norm": 3.468843460083008,
"learning_rate": 6.639537936466265e-05,
"epoch": 1.01
},
{
"loss": 8.384,
"grad_norm": 3.0200271606445312,
"learning_rate": 6.630786733175812e-05,
"epoch": 1.01
},
{
"loss": 8.5672,
"grad_norm": 3.9895946979522705,
"learning_rate": 6.62203552988536e-05,
"epoch": 1.01
},
{
"loss": 8.4798,
"grad_norm": 2.92266583442688,
"learning_rate": 6.613284326594908e-05,
"epoch": 1.02
},
{
"loss": 8.749,
"grad_norm": 3.8905258178710938,
"learning_rate": 6.604533123304456e-05,
"epoch": 1.02
},
{
"loss": 7.8761,
"grad_norm": 3.545311212539673,
"learning_rate": 6.595781920014002e-05,
"epoch": 1.02
},
{
"loss": 8.3273,
"grad_norm": 2.925837516784668,
"learning_rate": 6.58703071672355e-05,
"epoch": 1.02
},
{
"loss": 8.3426,
"grad_norm": 3.527435064315796,
"learning_rate": 6.578279513433098e-05,
"epoch": 1.03
},
{
"loss": 7.5962,
"grad_norm": 2.926382064819336,
"learning_rate": 6.569528310142646e-05,
"epoch": 1.03
},
{
"loss": 8.2975,
"grad_norm": 3.4969446659088135,
"learning_rate": 6.560777106852192e-05,
"epoch": 1.03
},
{
"loss": 8.4417,
"grad_norm": 3.466707229614258,
"learning_rate": 6.55202590356174e-05,
"epoch": 1.03
},
{
"loss": 8.1161,
"grad_norm": 4.119028091430664,
"learning_rate": 6.543274700271288e-05,
"epoch": 1.04
},
{
"loss": 8.0248,
"grad_norm": 3.2728042602539062,
"learning_rate": 6.534523496980836e-05,
"epoch": 1.04
},
{
"loss": 8.0926,
"grad_norm": 2.8251736164093018,
"learning_rate": 6.525772293690382e-05,
"epoch": 1.04
},
{
"loss": 8.6109,
"grad_norm": 3.521144151687622,
"learning_rate": 6.51702109039993e-05,
"epoch": 1.04
},
{
"loss": 8.6268,
"grad_norm": 3.15901780128479,
"learning_rate": 6.508269887109478e-05,
"epoch": 1.05
},
{
"loss": 8.2584,
"grad_norm": 3.5901992321014404,
"learning_rate": 6.499518683819026e-05,
"epoch": 1.05
},
{
"loss": 8.5285,
"grad_norm": 4.662459850311279,
"learning_rate": 6.490767480528574e-05,
"epoch": 1.05
},
{
"loss": 7.8559,
"grad_norm": 2.72666597366333,
"learning_rate": 6.48201627723812e-05,
"epoch": 1.06
},
{
"loss": 8.145,
"grad_norm": 3.6170144081115723,
"learning_rate": 6.473265073947668e-05,
"epoch": 1.06
},
{
"loss": 7.838,
"grad_norm": 2.9118199348449707,
"learning_rate": 6.464513870657216e-05,
"epoch": 1.06
},
{
"loss": 8.4171,
"grad_norm": 3.7052972316741943,
"learning_rate": 6.455762667366764e-05,
"epoch": 1.06
},
{
"loss": 8.2865,
"grad_norm": 4.498712062835693,
"learning_rate": 6.447011464076311e-05,
"epoch": 1.07
},
{
"loss": 8.6456,
"grad_norm": 3.1900229454040527,
"learning_rate": 6.438260260785859e-05,
"epoch": 1.07
},
{
"loss": 8.1772,
"grad_norm": 4.92230224609375,
"learning_rate": 6.429509057495407e-05,
"epoch": 1.07
},
{
"loss": 9.221,
"grad_norm": 3.758399724960327,
"learning_rate": 6.420757854204954e-05,
"epoch": 1.07
},
{
"loss": 8.8449,
"grad_norm": 3.110145092010498,
"learning_rate": 6.412006650914501e-05,
"epoch": 1.08
},
{
"loss": 8.5491,
"grad_norm": 3.1985270977020264,
"learning_rate": 6.403255447624049e-05,
"epoch": 1.08
},
{
"loss": 8.0487,
"grad_norm": 4.918299674987793,
"learning_rate": 6.394504244333595e-05,
"epoch": 1.08
},
{
"loss": 8.6751,
"grad_norm": 3.328449010848999,
"learning_rate": 6.385753041043143e-05,
"epoch": 1.08
},
{
"loss": 8.0822,
"grad_norm": 2.8385417461395264,
"learning_rate": 6.377001837752691e-05,
"epoch": 1.09
},
{
"loss": 8.0838,
"grad_norm": 3.5397825241088867,
"learning_rate": 6.368250634462238e-05,
"epoch": 1.09
},
{
"loss": 8.3657,
"grad_norm": 3.8638100624084473,
"learning_rate": 6.359499431171786e-05,
"epoch": 1.09
},
{
"loss": 8.5573,
"grad_norm": 3.129281759262085,
"learning_rate": 6.350748227881333e-05,
"epoch": 1.09
},
{
"loss": 8.424,
"grad_norm": 4.496127605438232,
"learning_rate": 6.341997024590881e-05,
"epoch": 1.1
},
{
"loss": 8.4377,
"grad_norm": 5.132551670074463,
"learning_rate": 6.333245821300429e-05,
"epoch": 1.1
},
{
"loss": 8.1084,
"grad_norm": 2.994011402130127,
"learning_rate": 6.324494618009976e-05,
"epoch": 1.1
},
{
"loss": 8.0609,
"grad_norm": 3.976611375808716,
"learning_rate": 6.315743414719524e-05,
"epoch": 1.11
},
{
"loss": 8.0137,
"grad_norm": 4.869803428649902,
"learning_rate": 6.306992211429072e-05,
"epoch": 1.11
},
{
"loss": 8.2885,
"grad_norm": 3.4231982231140137,
"learning_rate": 6.29824100813862e-05,
"epoch": 1.11
},
{
"loss": 8.2444,
"grad_norm": 5.1861252784729,
"learning_rate": 6.289489804848166e-05,
"epoch": 1.11
},
{
"loss": 7.7026,
"grad_norm": 4.288048267364502,
"learning_rate": 6.280738601557714e-05,
"epoch": 1.12
},
{
"loss": 7.9181,
"grad_norm": 3.64320969581604,
"learning_rate": 6.271987398267262e-05,
"epoch": 1.12
},
{
"loss": 8.2069,
"grad_norm": 4.117647647857666,
"learning_rate": 6.26323619497681e-05,
"epoch": 1.12
},
{
"loss": 8.505,
"grad_norm": 3.528850793838501,
"learning_rate": 6.254484991686356e-05,
"epoch": 1.12
},
{
"loss": 8.1734,
"grad_norm": 2.9336414337158203,
"learning_rate": 6.245733788395904e-05,
"epoch": 1.13
},
{
"loss": 7.9808,
"grad_norm": 4.607523441314697,
"learning_rate": 6.236982585105452e-05,
"epoch": 1.13
},
{
"loss": 8.2398,
"grad_norm": 5.0170464515686035,
"learning_rate": 6.228231381815e-05,
"epoch": 1.13
},
{
"loss": 8.3433,
"grad_norm": 3.7535080909729004,
"learning_rate": 6.219480178524548e-05,
"epoch": 1.13
},
{
"loss": 7.8144,
"grad_norm": 3.409480333328247,
"learning_rate": 6.210728975234094e-05,
"epoch": 1.14
},
{
"loss": 8.5735,
"grad_norm": 3.7058238983154297,
"learning_rate": 6.201977771943642e-05,
"epoch": 1.14
},
{
"loss": 8.4169,
"grad_norm": 3.1820621490478516,
"learning_rate": 6.19322656865319e-05,
"epoch": 1.14
},
{
"loss": 7.9047,
"grad_norm": 2.8989903926849365,
"learning_rate": 6.184475365362738e-05,
"epoch": 1.14
},
{
"loss": 8.1377,
"grad_norm": 3.903512477874756,
"learning_rate": 6.175724162072284e-05,
"epoch": 1.15
},
{
"loss": 8.6631,
"grad_norm": 2.916041374206543,
"learning_rate": 6.166972958781832e-05,
"epoch": 1.15
},
{
"loss": 8.2377,
"grad_norm": 5.932418346405029,
"learning_rate": 6.15822175549138e-05,
"epoch": 1.15
},
{
"loss": 8.9961,
"grad_norm": 4.3133368492126465,
"learning_rate": 6.149470552200928e-05,
"epoch": 1.15
},
{
"loss": 8.0339,
"grad_norm": 3.1874802112579346,
"learning_rate": 6.140719348910475e-05,
"epoch": 1.16
},
{
"loss": 7.3347,
"grad_norm": 3.9368977546691895,
"learning_rate": 6.131968145620023e-05,
"epoch": 1.16
},
{
"loss": 7.723,
"grad_norm": 3.000967025756836,
"learning_rate": 6.12321694232957e-05,
"epoch": 1.16
},
{
"loss": 8.3258,
"grad_norm": 3.96174693107605,
"learning_rate": 6.114465739039118e-05,
"epoch": 1.17
},
{
"loss": 8.4037,
"grad_norm": 3.6735053062438965,
"learning_rate": 6.105714535748666e-05,
"epoch": 1.17
},
{
"loss": 8.2432,
"grad_norm": 4.765099048614502,
"learning_rate": 6.0969633324582134e-05,
"epoch": 1.17
},
{
"loss": 8.232,
"grad_norm": 3.118907928466797,
"learning_rate": 6.0882121291677607e-05,
"epoch": 1.17
},
{
"loss": 8.4367,
"grad_norm": 2.7283709049224854,
"learning_rate": 6.0794609258773085e-05,
"epoch": 1.18
},
{
"loss": 8.062,
"grad_norm": 2.874713182449341,
"learning_rate": 6.070709722586856e-05,
"epoch": 1.18
},
{
"loss": 8.2398,
"grad_norm": 3.3554372787475586,
"learning_rate": 6.0619585192964036e-05,
"epoch": 1.18
},
{
"loss": 8.0336,
"grad_norm": 3.0796210765838623,
"learning_rate": 6.053207316005951e-05,
"epoch": 1.18
},
{
"loss": 8.6318,
"grad_norm": 4.566615581512451,
"learning_rate": 6.044456112715499e-05,
"epoch": 1.19
},
{
"loss": 7.8031,
"grad_norm": 2.9705634117126465,
"learning_rate": 6.035704909425046e-05,
"epoch": 1.19
},
{
"loss": 8.2148,
"grad_norm": 2.8812005519866943,
"learning_rate": 6.026953706134594e-05,
"epoch": 1.19
},
{
"loss": 7.8284,
"grad_norm": 3.389988899230957,
"learning_rate": 6.018202502844141e-05,
"epoch": 1.19
},
{
"loss": 7.9364,
"grad_norm": 4.094693660736084,
"learning_rate": 6.009451299553689e-05,
"epoch": 1.2
},
{
"loss": 8.1086,
"grad_norm": 2.9179611206054688,
"learning_rate": 6.000700096263236e-05,
"epoch": 1.2
},
{
"loss": 8.5533,
"grad_norm": 3.0995657444000244,
"learning_rate": 5.991948892972784e-05,
"epoch": 1.2
},
{
"loss": 8.1136,
"grad_norm": 4.079578399658203,
"learning_rate": 5.983197689682332e-05,
"epoch": 1.2
},
{
"loss": 8.379,
"grad_norm": 3.150442600250244,
"learning_rate": 5.974446486391879e-05,
"epoch": 1.21
},
{
"loss": 7.6851,
"grad_norm": 3.904902458190918,
"learning_rate": 5.965695283101427e-05,
"epoch": 1.21
},
{
"loss": 8.5392,
"grad_norm": 2.8424036502838135,
"learning_rate": 5.956944079810974e-05,
"epoch": 1.21
},
{
"loss": 7.9675,
"grad_norm": 5.174964904785156,
"learning_rate": 5.948192876520522e-05,
"epoch": 1.22
},
{
"loss": 7.8097,
"grad_norm": 3.6166417598724365,
"learning_rate": 5.9394416732300694e-05,
"epoch": 1.22
},
{
"loss": 7.981,
"grad_norm": 3.1801164150238037,
"learning_rate": 5.930690469939617e-05,
"epoch": 1.22
},
{
"loss": 8.2149,
"grad_norm": 3.975576400756836,
"learning_rate": 5.9219392666491645e-05,
"epoch": 1.22
},
{
"loss": 8.5473,
"grad_norm": 4.039759159088135,
"learning_rate": 5.9131880633587123e-05,
"epoch": 1.23
},
{
"loss": 8.2659,
"grad_norm": 4.9490861892700195,
"learning_rate": 5.9044368600682596e-05,
"epoch": 1.23
},
{
"loss": 8.0904,
"grad_norm": 4.2978715896606445,
"learning_rate": 5.8956856567778074e-05,
"epoch": 1.23
},
{
"loss": 8.0685,
"grad_norm": 3.037668466567993,
"learning_rate": 5.8869344534873547e-05,
"epoch": 1.23
},
{
"loss": 7.7553,
"grad_norm": 3.496307849884033,
"learning_rate": 5.8781832501969025e-05,
"epoch": 1.24
},
{
"loss": 8.5136,
"grad_norm": 3.175560712814331,
"learning_rate": 5.8694320469064504e-05,
"epoch": 1.24
},
{
"loss": 7.8844,
"grad_norm": 3.7230336666107178,
"learning_rate": 5.8606808436159976e-05,
"epoch": 1.24
},
{
"loss": 8.2197,
"grad_norm": 4.2161359786987305,
"learning_rate": 5.8519296403255455e-05,
"epoch": 1.24
},
{
"loss": 8.2073,
"grad_norm": 4.122830867767334,
"learning_rate": 5.843178437035093e-05,
"epoch": 1.25
},
{
"loss": 7.9876,
"grad_norm": 4.076393127441406,
"learning_rate": 5.8344272337446406e-05,
"epoch": 1.25
},
{
"loss": 7.7404,
"grad_norm": 4.162572860717773,
"learning_rate": 5.825676030454188e-05,
"epoch": 1.25
},
{
"loss": 7.9991,
"grad_norm": 3.4230401515960693,
"learning_rate": 5.816924827163736e-05,
"epoch": 1.25
},
{
"loss": 8.0793,
"grad_norm": 4.0326313972473145,
"learning_rate": 5.808173623873283e-05,
"epoch": 1.26
},
{
"loss": 7.3234,
"grad_norm": 2.669609308242798,
"learning_rate": 5.799422420582831e-05,
"epoch": 1.26
},
{
"loss": 8.4489,
"grad_norm": 3.926006555557251,
"learning_rate": 5.790671217292378e-05,
"epoch": 1.26
},
{
"loss": 8.2047,
"grad_norm": 2.7743630409240723,
"learning_rate": 5.781920014001926e-05,
"epoch": 1.27
},
{
"loss": 7.6027,
"grad_norm": 3.9714138507843018,
"learning_rate": 5.773168810711473e-05,
"epoch": 1.27
},
{
"loss": 7.5097,
"grad_norm": 3.775052070617676,
"learning_rate": 5.764417607421021e-05,
"epoch": 1.27
},
{
"loss": 8.1056,
"grad_norm": 4.158542633056641,
"learning_rate": 5.755666404130568e-05,
"epoch": 1.27
},
{
"loss": 8.2198,
"grad_norm": 3.647034168243408,
"learning_rate": 5.746915200840116e-05,
"epoch": 1.28
},
{
"loss": 8.5833,
"grad_norm": 3.3458187580108643,
"learning_rate": 5.738163997549664e-05,
"epoch": 1.28
},
{
"loss": 7.8895,
"grad_norm": 3.5432512760162354,
"learning_rate": 5.729412794259211e-05,
"epoch": 1.28
},
{
"loss": 7.9663,
"grad_norm": 3.1249189376831055,
"learning_rate": 5.720661590968759e-05,
"epoch": 1.28
},
{
"loss": 7.6285,
"grad_norm": 2.8004276752471924,
"learning_rate": 5.7119103876783063e-05,
"epoch": 1.29
},
{
"loss": 8.2953,
"grad_norm": 3.2479677200317383,
"learning_rate": 5.703159184387854e-05,
"epoch": 1.29
},
{
"loss": 8.0079,
"grad_norm": 3.8008508682250977,
"learning_rate": 5.6944079810974014e-05,
"epoch": 1.29
},
{
"loss": 7.8035,
"grad_norm": 3.2461721897125244,
"learning_rate": 5.685656777806949e-05,
"epoch": 1.29
},
{
"loss": 8.6322,
"grad_norm": 3.8512370586395264,
"learning_rate": 5.6769055745164965e-05,
"epoch": 1.3
},
{
"loss": 8.7099,
"grad_norm": 3.9859845638275146,
"learning_rate": 5.6681543712260444e-05,
"epoch": 1.3
},
{
"loss": 7.9929,
"grad_norm": 3.455918550491333,
"learning_rate": 5.6594031679355916e-05,
"epoch": 1.3
},
{
"loss": 7.9348,
"grad_norm": 3.744387626647949,
"learning_rate": 5.6506519646451395e-05,
"epoch": 1.3
},
{
"loss": 7.9457,
"grad_norm": 5.055604934692383,
"learning_rate": 5.641900761354687e-05,
"epoch": 1.31
},
{
"loss": 8.2247,
"grad_norm": 3.072326183319092,
"learning_rate": 5.6331495580642346e-05,
"epoch": 1.31
},
{
"loss": 7.7124,
"grad_norm": 4.150148868560791,
"learning_rate": 5.624398354773782e-05,
"epoch": 1.31
},
{
"loss": 8.1994,
"grad_norm": 5.2460503578186035,
"learning_rate": 5.61564715148333e-05,
"epoch": 1.32
},
{
"loss": 8.1123,
"grad_norm": 3.8343966007232666,
"learning_rate": 5.6068959481928776e-05,
"epoch": 1.32
},
{
"loss": 7.9948,
"grad_norm": 3.488602638244629,
"learning_rate": 5.5981447449024235e-05,
"epoch": 1.32
},
{
"loss": 7.745,
"grad_norm": 2.6132748126983643,
"learning_rate": 5.5893935416119714e-05,
"epoch": 1.32
},
{
"loss": 8.4735,
"grad_norm": 3.123828172683716,
"learning_rate": 5.580642338321519e-05,
"epoch": 1.33
},
{
"loss": 7.5695,
"grad_norm": 3.747915506362915,
"learning_rate": 5.5718911350310665e-05,
"epoch": 1.33
},
{
"loss": 8.3462,
"grad_norm": 4.172099590301514,
"learning_rate": 5.5631399317406144e-05,
"epoch": 1.33
},
{
"loss": 7.639,
"grad_norm": 3.172137498855591,
"learning_rate": 5.5543887284501616e-05,
"epoch": 1.33
},
{
"loss": 7.9555,
"grad_norm": 4.053969383239746,
"learning_rate": 5.5456375251597095e-05,
"epoch": 1.34
},
{
"loss": 7.2866,
"grad_norm": 3.186673164367676,
"learning_rate": 5.536886321869257e-05,
"epoch": 1.34
},
{
"loss": 7.7438,
"grad_norm": 3.2737646102905273,
"learning_rate": 5.5281351185788046e-05,
"epoch": 1.34
},
{
"loss": 7.7122,
"grad_norm": 3.1801161766052246,
"learning_rate": 5.519383915288352e-05,
"epoch": 1.34
},
{
"loss": 7.6629,
"grad_norm": 3.773719072341919,
"learning_rate": 5.5106327119979e-05,
"epoch": 1.35
},
{
"loss": 8.2615,
"grad_norm": 4.548736095428467,
"learning_rate": 5.501881508707447e-05,
"epoch": 1.35
},
{
"loss": 8.0535,
"grad_norm": 3.921649694442749,
"learning_rate": 5.493130305416995e-05,
"epoch": 1.35
},
{
"loss": 7.4305,
"grad_norm": 4.346540451049805,
"learning_rate": 5.484379102126542e-05,
"epoch": 1.35
},
{
"loss": 8.423,
"grad_norm": 4.634354114532471,
"learning_rate": 5.47562789883609e-05,
"epoch": 1.36
},
{
"loss": 7.8966,
"grad_norm": 3.5531675815582275,
"learning_rate": 5.466876695545637e-05,
"epoch": 1.36
},
{
"loss": 7.7126,
"grad_norm": 4.377911567687988,
"learning_rate": 5.458125492255185e-05,
"epoch": 1.36
},
{
"loss": 8.2728,
"grad_norm": 3.366030216217041,
"learning_rate": 5.449374288964733e-05,
"epoch": 1.36
},
{
"loss": 8.1801,
"grad_norm": 3.4603772163391113,
"learning_rate": 5.44062308567428e-05,
"epoch": 1.37
},
{
"loss": 8.2143,
"grad_norm": 4.528195381164551,
"learning_rate": 5.431871882383828e-05,
"epoch": 1.37
},
{
"loss": 7.359,
"grad_norm": 2.4803977012634277,
"learning_rate": 5.423120679093375e-05,
"epoch": 1.37
},
{
"loss": 8.1361,
"grad_norm": 4.201333999633789,
"learning_rate": 5.414369475802923e-05,
"epoch": 1.38
},
{
"loss": 7.8974,
"grad_norm": 4.272532939910889,
"learning_rate": 5.40561827251247e-05,
"epoch": 1.38
},
{
"loss": 7.502,
"grad_norm": 3.4006450176239014,
"learning_rate": 5.396867069222018e-05,
"epoch": 1.38
},
{
"loss": 7.78,
"grad_norm": 3.902611255645752,
"learning_rate": 5.3881158659315654e-05,
"epoch": 1.38
},
{
"loss": 8.0953,
"grad_norm": 2.6970345973968506,
"learning_rate": 5.379364662641113e-05,
"epoch": 1.39
},
{
"loss": 7.7703,
"grad_norm": 3.610957145690918,
"learning_rate": 5.3706134593506605e-05,
"epoch": 1.39
},
{
"loss": 8.7352,
"grad_norm": 4.159451961517334,
"learning_rate": 5.3618622560602084e-05,
"epoch": 1.39
},
{
"loss": 7.5264,
"grad_norm": 2.7696640491485596,
"learning_rate": 5.3531110527697556e-05,
"epoch": 1.39
},
{
"loss": 7.776,
"grad_norm": 5.263556003570557,
"learning_rate": 5.3443598494793035e-05,
"epoch": 1.4
},
{
"loss": 7.4893,
"grad_norm": 3.3409626483917236,
"learning_rate": 5.3356086461888514e-05,
"epoch": 1.4
},
{
"loss": 7.9241,
"grad_norm": 5.305122375488281,
"learning_rate": 5.3268574428983986e-05,
"epoch": 1.4
},
{
"loss": 7.9385,
"grad_norm": 4.231367588043213,
"learning_rate": 5.3181062396079465e-05,
"epoch": 1.4
},
{
"loss": 8.1538,
"grad_norm": 3.7227704524993896,
"learning_rate": 5.309355036317494e-05,
"epoch": 1.41
},
{
"loss": 7.5085,
"grad_norm": 3.6258912086486816,
"learning_rate": 5.3006038330270416e-05,
"epoch": 1.41
},
{
"loss": 7.6825,
"grad_norm": 3.3270792961120605,
"learning_rate": 5.291852629736589e-05,
"epoch": 1.41
},
{
"loss": 7.8874,
"grad_norm": 2.983099937438965,
"learning_rate": 5.2831014264461367e-05,
"epoch": 1.41
},
{
"loss": 8.0616,
"grad_norm": 3.8440752029418945,
"learning_rate": 5.274350223155684e-05,
"epoch": 1.42
},
{
"loss": 7.7685,
"grad_norm": 5.8492608070373535,
"learning_rate": 5.265599019865232e-05,
"epoch": 1.42
},
{
"loss": 7.5809,
"grad_norm": 3.308460235595703,
"learning_rate": 5.256847816574779e-05,
"epoch": 1.42
},
{
"loss": 8.7381,
"grad_norm": 3.017559766769409,
"learning_rate": 5.248096613284327e-05,
"epoch": 1.43
},
{
"loss": 7.9422,
"grad_norm": 4.227987766265869,
"learning_rate": 5.239345409993874e-05,
"epoch": 1.43
},
{
"loss": 7.9202,
"grad_norm": 3.1066997051239014,
"learning_rate": 5.230594206703422e-05,
"epoch": 1.43
},
{
"loss": 7.7366,
"grad_norm": 3.3069207668304443,
"learning_rate": 5.221843003412969e-05,
"epoch": 1.43
},
{
"loss": 8.3042,
"grad_norm": 3.065303087234497,
"learning_rate": 5.213091800122517e-05,
"epoch": 1.44
},
{
"loss": 8.2732,
"grad_norm": 3.6093387603759766,
"learning_rate": 5.204340596832065e-05,
"epoch": 1.44
},
{
"loss": 7.1332,
"grad_norm": 4.356596946716309,
"learning_rate": 5.195589393541612e-05,
"epoch": 1.44
},
{
"loss": 7.9349,
"grad_norm": 4.91728401184082,
"learning_rate": 5.18683819025116e-05,
"epoch": 1.44
},
{
"loss": 8.0487,
"grad_norm": 4.411836624145508,
"learning_rate": 5.178086986960707e-05,
"epoch": 1.45
},
{
"loss": 7.7545,
"grad_norm": 3.488790512084961,
"learning_rate": 5.169335783670255e-05,
"epoch": 1.45
},
{
"loss": 7.7508,
"grad_norm": 5.54533576965332,
"learning_rate": 5.1605845803798024e-05,
"epoch": 1.45
},
{
"loss": 7.8403,
"grad_norm": 2.8527212142944336,
"learning_rate": 5.15183337708935e-05,
"epoch": 1.45
},
{
"loss": 8.1535,
"grad_norm": 3.892737865447998,
"learning_rate": 5.1430821737988975e-05,
"epoch": 1.46
},
{
"loss": 7.6217,
"grad_norm": 3.9077818393707275,
"learning_rate": 5.1343309705084454e-05,
"epoch": 1.46
},
{
"loss": 7.6306,
"grad_norm": 3.9363648891448975,
"learning_rate": 5.1255797672179926e-05,
"epoch": 1.46
},
{
"loss": 8.5425,
"grad_norm": 4.692113399505615,
"learning_rate": 5.1168285639275405e-05,
"epoch": 1.46
},
{
"loss": 7.7507,
"grad_norm": 5.601973056793213,
"learning_rate": 5.108077360637088e-05,
"epoch": 1.47
},
{
"loss": 7.8534,
"grad_norm": 3.5403494834899902,
"learning_rate": 5.0993261573466356e-05,
"epoch": 1.47
},
{
"loss": 7.9312,
"grad_norm": 4.555025100708008,
"learning_rate": 5.0905749540561834e-05,
"epoch": 1.47
},
{
"loss": 7.624,
"grad_norm": 5.721600532531738,
"learning_rate": 5.0818237507657307e-05,
"epoch": 1.48
},
{
"loss": 7.9349,
"grad_norm": 3.4647514820098877,
"learning_rate": 5.0730725474752785e-05,
"epoch": 1.48
},
{
"loss": 7.5857,
"grad_norm": 3.362941026687622,
"learning_rate": 5.064321344184826e-05,
"epoch": 1.48
},
{
"loss": 8.3246,
"grad_norm": 5.352531433105469,
"learning_rate": 5.0555701408943736e-05,
"epoch": 1.48
},
{
"loss": 7.8738,
"grad_norm": 3.2162294387817383,
"learning_rate": 5.046818937603921e-05,
"epoch": 1.49
},
{
"loss": 8.0784,
"grad_norm": 3.607652187347412,
"learning_rate": 5.038067734313469e-05,
"epoch": 1.49
},
{
"loss": 8.0385,
"grad_norm": 3.6921122074127197,
"learning_rate": 5.029316531023016e-05,
"epoch": 1.49
},
{
"loss": 7.9548,
"grad_norm": 5.187925338745117,
"learning_rate": 5.020565327732564e-05,
"epoch": 1.49
},
{
"loss": 7.9085,
"grad_norm": 4.099059581756592,
"learning_rate": 5.011814124442111e-05,
"epoch": 1.5
},
{
"loss": 7.7387,
"grad_norm": 3.0415878295898438,
"learning_rate": 5.003062921151659e-05,
"epoch": 1.5
},
{
"loss": 8.5766,
"grad_norm": 4.777284622192383,
"learning_rate": 4.994311717861206e-05,
"epoch": 1.5
},
{
"loss": 7.697,
"grad_norm": 5.438363075256348,
"learning_rate": 4.9855605145707534e-05,
"epoch": 1.5
},
{
"loss": 8.4595,
"grad_norm": 5.054925441741943,
"learning_rate": 4.976809311280301e-05,
"epoch": 1.51
},
{
"loss": 8.3592,
"grad_norm": 2.9228146076202393,
"learning_rate": 4.9680581079898485e-05,
"epoch": 1.51
},
{
"loss": 7.7911,
"grad_norm": 4.529871940612793,
"learning_rate": 4.9593069046993964e-05,
"epoch": 1.51
},
{
"loss": 8.0316,
"grad_norm": 3.9995975494384766,
"learning_rate": 4.9505557014089436e-05,
"epoch": 1.51
},
{
"loss": 7.5984,
"grad_norm": 3.9212229251861572,
"learning_rate": 4.9418044981184915e-05,
"epoch": 1.52
},
{
"loss": 8.1606,
"grad_norm": 3.479395866394043,
"learning_rate": 4.933053294828039e-05,
"epoch": 1.52
},
{
"loss": 7.7609,
"grad_norm": 3.5287656784057617,
"learning_rate": 4.9243020915375866e-05,
"epoch": 1.52
},
{
"loss": 8.2939,
"grad_norm": 3.2169201374053955,
"learning_rate": 4.915550888247134e-05,
"epoch": 1.53
},
{
"loss": 7.8161,
"grad_norm": 4.046046733856201,
"learning_rate": 4.906799684956682e-05,
"epoch": 1.53
},
{
"loss": 8.1367,
"grad_norm": 3.9905033111572266,
"learning_rate": 4.898048481666229e-05,
"epoch": 1.53
},
{
"loss": 7.522,
"grad_norm": 3.0949547290802,
"learning_rate": 4.889297278375777e-05,
"epoch": 1.53
},
{
"loss": 7.8009,
"grad_norm": 3.2042038440704346,
"learning_rate": 4.8805460750853247e-05,
"epoch": 1.54
},
{
"loss": 7.9891,
"grad_norm": 3.542771100997925,
"learning_rate": 4.871794871794872e-05,
"epoch": 1.54
},
{
"loss": 8.1601,
"grad_norm": 4.720103740692139,
"learning_rate": 4.86304366850442e-05,
"epoch": 1.54
},
{
"loss": 7.5776,
"grad_norm": 3.65787672996521,
"learning_rate": 4.854292465213967e-05,
"epoch": 1.54
},
{
"loss": 7.4047,
"grad_norm": 3.9372549057006836,
"learning_rate": 4.845541261923515e-05,
"epoch": 1.55
},
{
"loss": 8.0766,
"grad_norm": 3.362112045288086,
"learning_rate": 4.836790058633062e-05,
"epoch": 1.55
},
{
"loss": 7.8371,
"grad_norm": 5.547123908996582,
"learning_rate": 4.82803885534261e-05,
"epoch": 1.55
},
{
"loss": 8.0129,
"grad_norm": 4.756041526794434,
"learning_rate": 4.819287652052157e-05,
"epoch": 1.55
},
{
"loss": 7.8304,
"grad_norm": 3.8089821338653564,
"learning_rate": 4.810536448761705e-05,
"epoch": 1.56
},
{
"loss": 7.7565,
"grad_norm": 3.8562700748443604,
"learning_rate": 4.801785245471252e-05,
"epoch": 1.56
},
{
"loss": 7.4297,
"grad_norm": 4.8232831954956055,
"learning_rate": 4.7930340421808e-05,
"epoch": 1.56
},
{
"loss": 7.4436,
"grad_norm": 4.951693058013916,
"learning_rate": 4.7842828388903474e-05,
"epoch": 1.56
},
{
"loss": 7.9573,
"grad_norm": 3.800071954727173,
"learning_rate": 4.775531635599895e-05,
"epoch": 1.57
},
{
"loss": 7.4168,
"grad_norm": 4.224662780761719,
"learning_rate": 4.766780432309443e-05,
"epoch": 1.57
},
{
"loss": 8.4404,
"grad_norm": 3.3358187675476074,
"learning_rate": 4.7580292290189904e-05,
"epoch": 1.57
},
{
"loss": 7.4616,
"grad_norm": 4.352634906768799,
"learning_rate": 4.749278025728538e-05,
"epoch": 1.57
},
{
"loss": 7.8744,
"grad_norm": 3.5693962574005127,
"learning_rate": 4.7405268224380855e-05,
"epoch": 1.58
},
{
"loss": 7.5451,
"grad_norm": 3.5086276531219482,
"learning_rate": 4.7317756191476334e-05,
"epoch": 1.58
},
{
"loss": 7.4284,
"grad_norm": 3.0168793201446533,
"learning_rate": 4.7230244158571806e-05,
"epoch": 1.58
},
{
"loss": 7.7376,
"grad_norm": 4.352570056915283,
"learning_rate": 4.7142732125667285e-05,
"epoch": 1.59
},
{
"loss": 7.64,
"grad_norm": 5.351820468902588,
"learning_rate": 4.705522009276276e-05,
"epoch": 1.59
},
{
"loss": 7.9318,
"grad_norm": 3.993790626525879,
"learning_rate": 4.6967708059858236e-05,
"epoch": 1.59
},
{
"loss": 7.7602,
"grad_norm": 3.1628670692443848,
"learning_rate": 4.688019602695371e-05,
"epoch": 1.59
},
{
"loss": 7.2848,
"grad_norm": 2.481705665588379,
"learning_rate": 4.679268399404919e-05,
"epoch": 1.6
},
{
"loss": 7.8586,
"grad_norm": 3.944296360015869,
"learning_rate": 4.670517196114466e-05,
"epoch": 1.6
},
{
"loss": 7.7223,
"grad_norm": 4.099398136138916,
"learning_rate": 4.661765992824014e-05,
"epoch": 1.6
},
{
"loss": 7.6033,
"grad_norm": 2.781362533569336,
"learning_rate": 4.653014789533561e-05,
"epoch": 1.6
},
{
"loss": 8.4421,
"grad_norm": 4.035131454467773,
"learning_rate": 4.644263586243109e-05,
"epoch": 1.61
},
{
"loss": 7.6095,
"grad_norm": 3.3464620113372803,
"learning_rate": 4.635512382952657e-05,
"epoch": 1.61
},
{
"loss": 7.8892,
"grad_norm": 4.8561553955078125,
"learning_rate": 4.626761179662204e-05,
"epoch": 1.61
},
{
"loss": 7.7162,
"grad_norm": 6.795714378356934,
"learning_rate": 4.618009976371752e-05,
"epoch": 1.61
},
{
"loss": 7.7384,
"grad_norm": 3.0965943336486816,
"learning_rate": 4.609258773081299e-05,
"epoch": 1.62
},
{
"loss": 7.9766,
"grad_norm": 3.1002793312072754,
"learning_rate": 4.600507569790847e-05,
"epoch": 1.62
},
{
"loss": 7.4392,
"grad_norm": 6.083471298217773,
"learning_rate": 4.5917563665003935e-05,
"epoch": 1.62
},
{
"loss": 7.5822,
"grad_norm": 4.11601448059082,
"learning_rate": 4.5830051632099414e-05,
"epoch": 1.62
},
{
"loss": 7.5988,
"grad_norm": 4.361574172973633,
"learning_rate": 4.5742539599194886e-05,
"epoch": 1.63
},
{
"loss": 8.1273,
"grad_norm": 4.6307549476623535,
"learning_rate": 4.5655027566290365e-05,
"epoch": 1.63
},
{
"loss": 7.3601,
"grad_norm": 3.4341373443603516,
"learning_rate": 4.5567515533385844e-05,
"epoch": 1.63
},
{
"loss": 7.9766,
"grad_norm": 3.7583069801330566,
"learning_rate": 4.5480003500481316e-05,
"epoch": 1.64
},
{
"loss": 7.8755,
"grad_norm": 3.212942123413086,
"learning_rate": 4.5392491467576795e-05,
"epoch": 1.64
},
{
"loss": 7.8139,
"grad_norm": 2.9877207279205322,
"learning_rate": 4.530497943467227e-05,
"epoch": 1.64
},
{
"loss": 7.8379,
"grad_norm": 4.133498191833496,
"learning_rate": 4.5217467401767746e-05,
"epoch": 1.64
},
{
"loss": 7.9657,
"grad_norm": 3.252624273300171,
"learning_rate": 4.512995536886322e-05,
"epoch": 1.65
},
{
"loss": 7.7005,
"grad_norm": 3.70926833152771,
"learning_rate": 4.50424433359587e-05,
"epoch": 1.65
},
{
"loss": 7.6194,
"grad_norm": 4.198193073272705,
"learning_rate": 4.495493130305417e-05,
"epoch": 1.65
},
{
"loss": 8.1874,
"grad_norm": 3.5660247802734375,
"learning_rate": 4.486741927014965e-05,
"epoch": 1.65
},
{
"loss": 8.0731,
"grad_norm": 3.6867547035217285,
"learning_rate": 4.477990723724512e-05,
"epoch": 1.66
},
{
"loss": 7.4702,
"grad_norm": 3.8409180641174316,
"learning_rate": 4.46923952043406e-05,
"epoch": 1.66
},
{
"loss": 8.1333,
"grad_norm": 3.7179150581359863,
"learning_rate": 4.460488317143607e-05,
"epoch": 1.66
},
{
"loss": 7.4353,
"grad_norm": 4.092810153961182,
"learning_rate": 4.451737113853155e-05,
"epoch": 1.66
},
{
"loss": 8.1888,
"grad_norm": 4.3642754554748535,
"learning_rate": 4.442985910562702e-05,
"epoch": 1.67
},
{
"loss": 8.1823,
"grad_norm": 3.4664993286132812,
"learning_rate": 4.43423470727225e-05,
"epoch": 1.67
},
{
"loss": 7.6325,
"grad_norm": 4.143255710601807,
"learning_rate": 4.425483503981798e-05,
"epoch": 1.67
},
{
"loss": 7.9794,
"grad_norm": 3.8068184852600098,
"learning_rate": 4.416732300691345e-05,
"epoch": 1.67
},
{
"loss": 7.5482,
"grad_norm": 3.6255953311920166,
"learning_rate": 4.407981097400893e-05,
"epoch": 1.68
},
{
"loss": 7.1437,
"grad_norm": 4.526164531707764,
"learning_rate": 4.39922989411044e-05,
"epoch": 1.68
},
{
"loss": 7.3931,
"grad_norm": 3.652649402618408,
"learning_rate": 4.390478690819988e-05,
"epoch": 1.68
},
{
"loss": 7.3862,
"grad_norm": 4.751399993896484,
"learning_rate": 4.3817274875295354e-05,
"epoch": 1.69
},
{
"loss": 7.8723,
"grad_norm": 3.011975049972534,
"learning_rate": 4.372976284239083e-05,
"epoch": 1.69
},
{
"loss": 8.0483,
"grad_norm": 4.407155513763428,
"learning_rate": 4.3642250809486305e-05,
"epoch": 1.69
},
{
"loss": 7.6125,
"grad_norm": 3.762749195098877,
"learning_rate": 4.3554738776581784e-05,
"epoch": 1.69
},
{
"loss": 7.7699,
"grad_norm": 5.391783714294434,
"learning_rate": 4.3467226743677256e-05,
"epoch": 1.7
},
{
"loss": 7.6641,
"grad_norm": 3.509794235229492,
"learning_rate": 4.3379714710772735e-05,
"epoch": 1.7
},
{
"loss": 8.4195,
"grad_norm": 4.34732723236084,
"learning_rate": 4.329220267786821e-05,
"epoch": 1.7
},
{
"loss": 7.6044,
"grad_norm": 4.418550491333008,
"learning_rate": 4.3204690644963686e-05,
"epoch": 1.7
},
{
"loss": 7.8304,
"grad_norm": 3.9914748668670654,
"learning_rate": 4.3117178612059165e-05,
"epoch": 1.71
},
{
"loss": 7.4516,
"grad_norm": 4.141488075256348,
"learning_rate": 4.302966657915464e-05,
"epoch": 1.71
},
{
"loss": 7.7451,
"grad_norm": 3.61734938621521,
"learning_rate": 4.2942154546250116e-05,
"epoch": 1.71
},
{
"loss": 8.0466,
"grad_norm": 3.956249475479126,
"learning_rate": 4.285464251334559e-05,
"epoch": 1.71
},
{
"loss": 7.5964,
"grad_norm": 3.214452028274536,
"learning_rate": 4.276713048044107e-05,
"epoch": 1.72
},
{
"loss": 7.7055,
"grad_norm": 3.8038113117218018,
"learning_rate": 4.267961844753654e-05,
"epoch": 1.72
},
{
"loss": 7.9609,
"grad_norm": 4.2961626052856445,
"learning_rate": 4.259210641463202e-05,
"epoch": 1.72
},
{
"loss": 7.7958,
"grad_norm": 2.900935649871826,
"learning_rate": 4.250459438172749e-05,
"epoch": 1.72
},
{
"loss": 7.7953,
"grad_norm": 3.369781970977783,
"learning_rate": 4.241708234882297e-05,
"epoch": 1.73
},
{
"loss": 7.5888,
"grad_norm": 6.093942642211914,
"learning_rate": 4.232957031591844e-05,
"epoch": 1.73
},
{
"loss": 8.1158,
"grad_norm": 4.063805103302002,
"learning_rate": 4.224205828301392e-05,
"epoch": 1.73
},
{
"loss": 7.6127,
"grad_norm": 3.981023073196411,
"learning_rate": 4.215454625010939e-05,
"epoch": 1.74
},
{
"loss": 7.3727,
"grad_norm": 3.273742437362671,
"learning_rate": 4.206703421720487e-05,
"epoch": 1.74
},
{
"loss": 7.7617,
"grad_norm": 4.247544765472412,
"learning_rate": 4.197952218430034e-05,
"epoch": 1.74
},
{
"loss": 8.2201,
"grad_norm": 5.181518077850342,
"learning_rate": 4.1892010151395815e-05,
"epoch": 1.74
},
{
"loss": 8.0504,
"grad_norm": 3.4994397163391113,
"learning_rate": 4.1804498118491294e-05,
"epoch": 1.75
},
{
"loss": 8.2552,
"grad_norm": 4.784666061401367,
"learning_rate": 4.1716986085586766e-05,
"epoch": 1.75
},
{
"loss": 7.746,
"grad_norm": 4.549380779266357,
"learning_rate": 4.1629474052682245e-05,
"epoch": 1.75
},
{
"loss": 7.6646,
"grad_norm": 3.586853504180908,
"learning_rate": 4.154196201977772e-05,
"epoch": 1.75
},
{
"loss": 7.454,
"grad_norm": 4.0881500244140625,
"learning_rate": 4.1454449986873196e-05,
"epoch": 1.76
},
{
"loss": 7.6951,
"grad_norm": 3.7725558280944824,
"learning_rate": 4.136693795396867e-05,
"epoch": 1.76
},
{
"loss": 7.7406,
"grad_norm": 4.566652297973633,
"learning_rate": 4.127942592106415e-05,
"epoch": 1.76
},
{
"loss": 8.0458,
"grad_norm": 4.562892913818359,
"learning_rate": 4.119191388815962e-05,
"epoch": 1.76
},
{
"loss": 8.314,
"grad_norm": 3.1217896938323975,
"learning_rate": 4.11044018552551e-05,
"epoch": 1.77
},
{
"loss": 7.931,
"grad_norm": 3.4693222045898438,
"learning_rate": 4.101688982235058e-05,
"epoch": 1.77
},
{
"loss": 7.6126,
"grad_norm": 3.7778282165527344,
"learning_rate": 4.092937778944605e-05,
"epoch": 1.77
},
{
"loss": 7.4983,
"grad_norm": 6.494439125061035,
"learning_rate": 4.084186575654153e-05,
"epoch": 1.77
},
{
"loss": 7.801,
"grad_norm": 3.602264165878296,
"learning_rate": 4.0754353723637e-05,
"epoch": 1.78
},
{
"loss": 7.405,
"grad_norm": 4.2882795333862305,
"learning_rate": 4.066684169073248e-05,
"epoch": 1.78
},
{
"loss": 7.5115,
"grad_norm": 4.935623645782471,
"learning_rate": 4.057932965782795e-05,
"epoch": 1.78
},
{
"loss": 7.5315,
"grad_norm": 5.16713809967041,
"learning_rate": 4.049181762492343e-05,
"epoch": 1.78
},
{
"loss": 7.8313,
"grad_norm": 3.440279960632324,
"learning_rate": 4.04043055920189e-05,
"epoch": 1.79
},
{
"loss": 7.7669,
"grad_norm": 4.02671480178833,
"learning_rate": 4.031679355911438e-05,
"epoch": 1.79
},
{
"loss": 7.6988,
"grad_norm": 5.945104598999023,
"learning_rate": 4.022928152620985e-05,
"epoch": 1.79
},
{
"loss": 7.2314,
"grad_norm": 4.557019233703613,
"learning_rate": 4.014176949330533e-05,
"epoch": 1.8
},
{
"loss": 7.5578,
"grad_norm": 3.9793171882629395,
"learning_rate": 4.0054257460400804e-05,
"epoch": 1.8
},
{
"loss": 7.1794,
"grad_norm": 3.178558349609375,
"learning_rate": 3.996674542749628e-05,
"epoch": 1.8
},
{
"loss": 7.842,
"grad_norm": 4.609609127044678,
"learning_rate": 3.987923339459176e-05,
"epoch": 1.8
},
{
"loss": 7.4484,
"grad_norm": 3.5374889373779297,
"learning_rate": 3.9791721361687234e-05,
"epoch": 1.81
},
{
"loss": 7.2917,
"grad_norm": 4.768485069274902,
"learning_rate": 3.970420932878271e-05,
"epoch": 1.81
},
{
"loss": 7.4525,
"grad_norm": 3.342456102371216,
"learning_rate": 3.9616697295878185e-05,
"epoch": 1.81
},
{
"loss": 7.6611,
"grad_norm": 4.111917018890381,
"learning_rate": 3.9529185262973664e-05,
"epoch": 1.81
},
{
"loss": 7.9292,
"grad_norm": 5.008895397186279,
"learning_rate": 3.9441673230069136e-05,
"epoch": 1.82
},
{
"loss": 7.9246,
"grad_norm": 4.372122287750244,
"learning_rate": 3.9354161197164615e-05,
"epoch": 1.82
},
{
"loss": 7.6795,
"grad_norm": 3.406059503555298,
"learning_rate": 3.926664916426009e-05,
"epoch": 1.82
},
{
"loss": 7.5926,
"grad_norm": 4.412403583526611,
"learning_rate": 3.9179137131355566e-05,
"epoch": 1.82
},
{
"loss": 8.0002,
"grad_norm": 4.203276634216309,
"learning_rate": 3.909162509845104e-05,
"epoch": 1.83
},
{
"loss": 7.8556,
"grad_norm": 3.7347216606140137,
"learning_rate": 3.900411306554652e-05,
"epoch": 1.83
},
{
"loss": 8.2028,
"grad_norm": 4.552736282348633,
"learning_rate": 3.891660103264199e-05,
"epoch": 1.83
},
{
"loss": 8.1277,
"grad_norm": 4.882839679718018,
"learning_rate": 3.882908899973747e-05,
"epoch": 1.83
},
{
"loss": 7.74,
"grad_norm": 4.639001846313477,
"learning_rate": 3.875032817012339e-05,
"epoch": 1.84
},
{
"loss": 8.0942,
"grad_norm": 5.097876071929932,
"learning_rate": 3.866281613721887e-05,
"epoch": 1.84
},
{
"loss": 7.9073,
"grad_norm": 3.200108051300049,
"learning_rate": 3.8575304104314344e-05,
"epoch": 1.84
},
{
"loss": 7.5499,
"grad_norm": 3.8395094871520996,
"learning_rate": 3.848779207140982e-05,
"epoch": 1.85
},
{
"loss": 7.5298,
"grad_norm": 3.6033782958984375,
"learning_rate": 3.8400280038505295e-05,
"epoch": 1.85
},
{
"loss": 7.3608,
"grad_norm": 4.341715335845947,
"learning_rate": 3.8312768005600774e-05,
"epoch": 1.85
},
{
"loss": 8.2837,
"grad_norm": 2.746906042098999,
"learning_rate": 3.822525597269625e-05,
"epoch": 1.85
},
{
"loss": 7.0553,
"grad_norm": 4.10823392868042,
"learning_rate": 3.8137743939791725e-05,
"epoch": 1.86
},
{
"loss": 7.4852,
"grad_norm": 3.21799635887146,
"learning_rate": 3.8050231906887204e-05,
"epoch": 1.86
},
{
"loss": 7.4002,
"grad_norm": 4.537161827087402,
"learning_rate": 3.796271987398267e-05,
"epoch": 1.86
},
{
"loss": 7.4377,
"grad_norm": 4.020664691925049,
"learning_rate": 3.787520784107815e-05,
"epoch": 1.86
},
{
"loss": 7.6523,
"grad_norm": 3.1800293922424316,
"learning_rate": 3.778769580817362e-05,
"epoch": 1.87
},
{
"loss": 7.4824,
"grad_norm": 3.2757511138916016,
"learning_rate": 3.77001837752691e-05,
"epoch": 1.87
},
{
"loss": 7.8269,
"grad_norm": 3.6784262657165527,
"learning_rate": 3.761267174236457e-05,
"epoch": 1.87
},
{
"loss": 7.9757,
"grad_norm": 3.4948902130126953,
"learning_rate": 3.752515970946005e-05,
"epoch": 1.87
},
{
"loss": 7.8251,
"grad_norm": 5.0971598625183105,
"learning_rate": 3.743764767655553e-05,
"epoch": 1.88
},
{
"loss": 7.7561,
"grad_norm": 4.533854961395264,
"learning_rate": 3.7350135643651e-05,
"epoch": 1.88
},
{
"loss": 7.8986,
"grad_norm": 4.550451278686523,
"learning_rate": 3.726262361074648e-05,
"epoch": 1.88
},
{
"loss": 7.2438,
"grad_norm": 3.8077099323272705,
"learning_rate": 3.717511157784195e-05,
"epoch": 1.88
},
{
"loss": 7.7242,
"grad_norm": 5.2727203369140625,
"learning_rate": 3.708759954493743e-05,
"epoch": 1.89
},
{
"loss": 7.8602,
"grad_norm": 2.9006500244140625,
"learning_rate": 3.7000087512032903e-05,
"epoch": 1.89
},
{
"loss": 7.0401,
"grad_norm": 4.919744491577148,
"learning_rate": 3.691257547912838e-05,
"epoch": 1.89
},
{
"loss": 7.5799,
"grad_norm": 3.297295093536377,
"learning_rate": 3.6825063446223854e-05,
"epoch": 1.9
},
{
"loss": 7.355,
"grad_norm": 2.9851813316345215,
"learning_rate": 3.673755141331933e-05,
"epoch": 1.9
},
{
"loss": 7.715,
"grad_norm": 3.619997262954712,
"learning_rate": 3.6650039380414805e-05,
"epoch": 1.9
},
{
"loss": 7.8093,
"grad_norm": 4.266133785247803,
"learning_rate": 3.6562527347510284e-05,
"epoch": 1.9
},
{
"loss": 7.5723,
"grad_norm": 3.513849973678589,
"learning_rate": 3.6475015314605756e-05,
"epoch": 1.91
},
{
"loss": 7.006,
"grad_norm": 3.6736350059509277,
"learning_rate": 3.6387503281701235e-05,
"epoch": 1.91
},
{
"loss": 7.8925,
"grad_norm": 3.4943020343780518,
"learning_rate": 3.629999124879671e-05,
"epoch": 1.91
},
{
"loss": 7.3886,
"grad_norm": 5.898230075836182,
"learning_rate": 3.6212479215892186e-05,
"epoch": 1.91
},
{
"loss": 7.9521,
"grad_norm": 3.2569427490234375,
"learning_rate": 3.6124967182987665e-05,
"epoch": 1.92
},
{
"loss": 7.1573,
"grad_norm": 6.18344259262085,
"learning_rate": 3.603745515008314e-05,
"epoch": 1.92
},
{
"loss": 7.3595,
"grad_norm": 6.704586982727051,
"learning_rate": 3.5949943117178616e-05,
"epoch": 1.92
},
{
"loss": 8.1131,
"grad_norm": 3.768490791320801,
"learning_rate": 3.586243108427409e-05,
"epoch": 1.92
},
{
"loss": 7.8278,
"grad_norm": 4.432671070098877,
"learning_rate": 3.577491905136957e-05,
"epoch": 1.93
},
{
"loss": 7.4794,
"grad_norm": 3.835556745529175,
"learning_rate": 3.568740701846504e-05,
"epoch": 1.93
},
{
"loss": 7.5471,
"grad_norm": 5.500497817993164,
"learning_rate": 3.559989498556052e-05,
"epoch": 1.93
},
{
"loss": 7.9875,
"grad_norm": 4.727583408355713,
"learning_rate": 3.551238295265599e-05,
"epoch": 1.93
},
{
"loss": 6.9965,
"grad_norm": 5.54524040222168,
"learning_rate": 3.542487091975147e-05,
"epoch": 1.94
},
{
"loss": 7.6091,
"grad_norm": 3.945673942565918,
"learning_rate": 3.533735888684694e-05,
"epoch": 1.94
},
{
"loss": 7.6881,
"grad_norm": 3.220522880554199,
"learning_rate": 3.524984685394242e-05,
"epoch": 1.94
},
{
"loss": 7.5224,
"grad_norm": 5.061761856079102,
"learning_rate": 3.516233482103789e-05,
"epoch": 1.95
},
{
"loss": 7.2034,
"grad_norm": 4.419524192810059,
"learning_rate": 3.507482278813337e-05,
"epoch": 1.95
},
{
"loss": 7.8167,
"grad_norm": 4.390359878540039,
"learning_rate": 3.498731075522885e-05,
"epoch": 1.95
},
{
"loss": 7.9913,
"grad_norm": 3.729773998260498,
"learning_rate": 3.489979872232432e-05,
"epoch": 1.95
},
{
"loss": 7.6947,
"grad_norm": 4.854176044464111,
"learning_rate": 3.48122866894198e-05,
"epoch": 1.96
},
{
"loss": 7.7003,
"grad_norm": 3.3899290561676025,
"learning_rate": 3.472477465651527e-05,
"epoch": 1.96
},
{
"loss": 7.6489,
"grad_norm": 4.47396993637085,
"learning_rate": 3.463726262361075e-05,
"epoch": 1.96
},
{
"loss": 7.8908,
"grad_norm": 3.3266396522521973,
"learning_rate": 3.4549750590706224e-05,
"epoch": 1.96
},
{
"loss": 7.4745,
"grad_norm": 4.091291904449463,
"learning_rate": 3.44622385578017e-05,
"epoch": 1.97
},
{
"loss": 7.5307,
"grad_norm": 7.771108627319336,
"learning_rate": 3.4374726524897175e-05,
"epoch": 1.97
},
{
"loss": 7.7252,
"grad_norm": 4.1433305740356445,
"learning_rate": 3.4287214491992654e-05,
"epoch": 1.97
},
{
"loss": 8.1189,
"grad_norm": 3.5036942958831787,
"learning_rate": 3.4199702459088126e-05,
"epoch": 1.97
},
{
"loss": 7.6836,
"grad_norm": 4.437150478363037,
"learning_rate": 3.4112190426183605e-05,
"epoch": 1.98
},
{
"loss": 7.8543,
"grad_norm": 6.440913200378418,
"learning_rate": 3.402467839327908e-05,
"epoch": 1.98
},
{
"loss": 7.8673,
"grad_norm": 4.657886981964111,
"learning_rate": 3.393716636037455e-05,
"epoch": 1.98
},
{
"loss": 8.4281,
"grad_norm": 4.122070789337158,
"learning_rate": 3.384965432747003e-05,
"epoch": 1.98
},
{
"loss": 7.1948,
"grad_norm": 3.2325737476348877,
"learning_rate": 3.37621422945655e-05,
"epoch": 1.99
},
{
"loss": 7.3229,
"grad_norm": 3.874630928039551,
"learning_rate": 3.367463026166098e-05,
"epoch": 1.99
},
{
"loss": 7.5072,
"grad_norm": 4.308450222015381,
"learning_rate": 3.358711822875645e-05,
"epoch": 1.99
},
{
"loss": 7.5892,
"grad_norm": 3.9709150791168213,
"learning_rate": 3.349960619585193e-05,
"epoch": 1.99
},
{
"loss": 7.9792,
"grad_norm": 6.5298919677734375,
"learning_rate": 3.34120941629474e-05,
"epoch": 2.0
},
{
"loss": 7.688,
"grad_norm": 4.508563041687012,
"learning_rate": 3.332458213004288e-05,
"epoch": 2.0
},
{
"loss": 7.1915,
"grad_norm": 3.5211637020111084,
"learning_rate": 3.3237070097138354e-05,
"epoch": 2.0
},
{
"loss": 7.4623,
"grad_norm": 4.973934173583984,
"learning_rate": 3.314955806423383e-05,
"epoch": 2.01
},
{
"loss": 7.4097,
"grad_norm": 4.810267448425293,
"learning_rate": 3.3062046031329305e-05,
"epoch": 2.01
},
{
"loss": 7.5439,
"grad_norm": 3.942003011703491,
"learning_rate": 3.2974533998424783e-05,
"epoch": 2.01
},
{
"loss": 7.2904,
"grad_norm": 3.7207398414611816,
"learning_rate": 3.288702196552026e-05,
"epoch": 2.01
},
{
"loss": 7.0411,
"grad_norm": 3.197200298309326,
"learning_rate": 3.2799509932615734e-05,
"epoch": 2.02
},
{
"loss": 6.7708,
"grad_norm": 5.261172294616699,
"learning_rate": 3.2711997899711213e-05,
"epoch": 2.02
},
{
"loss": 7.2953,
"grad_norm": 7.287022590637207,
"learning_rate": 3.2624485866806686e-05,
"epoch": 2.02
},
{
"loss": 7.7778,
"grad_norm": 3.6490862369537354,
"learning_rate": 3.2536973833902164e-05,
"epoch": 2.02
},
{
"loss": 7.0521,
"grad_norm": 3.474090337753296,
"learning_rate": 3.2449461800997637e-05,
"epoch": 2.03
},
{
"loss": 7.6243,
"grad_norm": 4.992802143096924,
"learning_rate": 3.2361949768093115e-05,
"epoch": 2.03
},
{
"loss": 7.7397,
"grad_norm": 4.16194486618042,
"learning_rate": 3.227443773518859e-05,
"epoch": 2.03
},
{
"loss": 7.8743,
"grad_norm": 4.265628814697266,
"learning_rate": 3.2186925702284066e-05,
"epoch": 2.03
},
{
"loss": 7.5843,
"grad_norm": 4.442827224731445,
"learning_rate": 3.209941366937954e-05,
"epoch": 2.04
},
{
"loss": 7.1913,
"grad_norm": 3.7389514446258545,
"learning_rate": 3.201190163647502e-05,
"epoch": 2.04
},
{
"loss": 7.4241,
"grad_norm": 4.544101238250732,
"learning_rate": 3.192438960357049e-05,
"epoch": 2.04
},
{
"loss": 7.4453,
"grad_norm": 3.6654653549194336,
"learning_rate": 3.183687757066597e-05,
"epoch": 2.04
},
{
"loss": 7.4288,
"grad_norm": 3.525256872177124,
"learning_rate": 3.174936553776145e-05,
"epoch": 2.05
},
{
"loss": 7.387,
"grad_norm": 4.041418075561523,
"learning_rate": 3.166185350485692e-05,
"epoch": 2.05
},
{
"loss": 7.4435,
"grad_norm": 4.415677547454834,
"learning_rate": 3.15743414719524e-05,
"epoch": 2.05
},
{
"loss": 7.6321,
"grad_norm": 3.649733066558838,
"learning_rate": 3.148682943904787e-05,
"epoch": 2.06
},
{
"loss": 7.1816,
"grad_norm": 4.361470699310303,
"learning_rate": 3.139931740614335e-05,
"epoch": 2.06
},
{
"loss": 7.8665,
"grad_norm": 2.8240556716918945,
"learning_rate": 3.131180537323882e-05,
"epoch": 2.06
},
{
"loss": 8.0303,
"grad_norm": 6.444936275482178,
"learning_rate": 3.12242933403343e-05,
"epoch": 2.06
},
{
"loss": 7.834,
"grad_norm": 4.267172813415527,
"learning_rate": 3.113678130742977e-05,
"epoch": 2.07
},
{
"loss": 7.5162,
"grad_norm": 4.9462480545043945,
"learning_rate": 3.104926927452525e-05,
"epoch": 2.07
},
{
"loss": 7.3831,
"grad_norm": 3.944603204727173,
"learning_rate": 3.0961757241620723e-05,
"epoch": 2.07
},
{
"loss": 7.0951,
"grad_norm": 4.1821608543396,
"learning_rate": 3.08742452087162e-05,
"epoch": 2.07
},
{
"loss": 7.4743,
"grad_norm": 4.054866790771484,
"learning_rate": 3.0786733175811675e-05,
"epoch": 2.08
},
{
"loss": 7.4355,
"grad_norm": 4.87803316116333,
"learning_rate": 3.0699221142907153e-05,
"epoch": 2.08
},
{
"loss": 7.8149,
"grad_norm": 4.6143388748168945,
"learning_rate": 3.0611709110002626e-05,
"epoch": 2.08
},
{
"loss": 7.2776,
"grad_norm": 3.6637542247772217,
"learning_rate": 3.0524197077098104e-05,
"epoch": 2.08
},
{
"loss": 7.5867,
"grad_norm": 4.739266872406006,
"learning_rate": 3.043668504419358e-05,
"epoch": 2.09
},
{
"loss": 8.031,
"grad_norm": 4.118218898773193,
"learning_rate": 3.0349173011289055e-05,
"epoch": 2.09
},
{
"loss": 7.5086,
"grad_norm": 3.7304162979125977,
"learning_rate": 3.026166097838453e-05,
"epoch": 2.09
},
{
"loss": 7.0712,
"grad_norm": 3.3575172424316406,
"learning_rate": 3.0174148945480006e-05,
"epoch": 2.09
},
{
"loss": 7.8495,
"grad_norm": 3.6715874671936035,
"learning_rate": 3.0086636912575482e-05,
"epoch": 2.1
},
{
"loss": 7.8997,
"grad_norm": 3.8344626426696777,
"learning_rate": 2.9999124879670954e-05,
"epoch": 2.1
},
{
"loss": 7.6391,
"grad_norm": 5.086608409881592,
"learning_rate": 2.991161284676643e-05,
"epoch": 2.1
},
{
"loss": 7.6541,
"grad_norm": 5.020079135894775,
"learning_rate": 2.9824100813861905e-05,
"epoch": 2.11
},
{
"loss": 7.9148,
"grad_norm": 4.90994119644165,
"learning_rate": 2.973658878095738e-05,
"epoch": 2.11
},
{
"loss": 7.3306,
"grad_norm": 7.108256816864014,
"learning_rate": 2.9649076748052856e-05,
"epoch": 2.11
},
{
"loss": 7.4177,
"grad_norm": 3.922966480255127,
"learning_rate": 2.956156471514833e-05,
"epoch": 2.11
},
{
"loss": 7.6952,
"grad_norm": 4.077629566192627,
"learning_rate": 2.9474052682243807e-05,
"epoch": 2.12
},
{
"loss": 8.2089,
"grad_norm": 3.003819227218628,
"learning_rate": 2.9386540649339283e-05,
"epoch": 2.12
},
{
"loss": 6.8683,
"grad_norm": 3.8509228229522705,
"learning_rate": 2.9299028616434758e-05,
"epoch": 2.12
},
{
"loss": 7.1152,
"grad_norm": 3.316972017288208,
"learning_rate": 2.9211516583530234e-05,
"epoch": 2.12
},
{
"loss": 7.7425,
"grad_norm": 5.465259552001953,
"learning_rate": 2.9124004550625713e-05,
"epoch": 2.13
},
{
"loss": 7.3628,
"grad_norm": 3.923509120941162,
"learning_rate": 2.9036492517721188e-05,
"epoch": 2.13
},
{
"loss": 7.382,
"grad_norm": 4.779471397399902,
"learning_rate": 2.8948980484816664e-05,
"epoch": 2.13
},
{
"loss": 7.4485,
"grad_norm": 6.00252628326416,
"learning_rate": 2.886146845191214e-05,
"epoch": 2.13
},
{
"loss": 7.4238,
"grad_norm": 4.734460353851318,
"learning_rate": 2.8773956419007615e-05,
"epoch": 2.14
},
{
"loss": 7.4597,
"grad_norm": 4.662705898284912,
"learning_rate": 2.868644438610309e-05,
"epoch": 2.14
},
{
"loss": 7.7309,
"grad_norm": 3.3174445629119873,
"learning_rate": 2.8598932353198566e-05,
"epoch": 2.14
},
{
"loss": 7.7715,
"grad_norm": 3.2781224250793457,
"learning_rate": 2.851142032029404e-05,
"epoch": 2.14
},
{
"loss": 7.6348,
"grad_norm": 5.909160137176514,
"learning_rate": 2.8423908287389517e-05,
"epoch": 2.15
},
{
"loss": 7.2795,
"grad_norm": 4.939976215362549,
"learning_rate": 2.8336396254484992e-05,
"epoch": 2.15
},
{
"loss": 7.9346,
"grad_norm": 4.42500114440918,
"learning_rate": 2.8248884221580468e-05,
"epoch": 2.15
},
{
"loss": 7.4785,
"grad_norm": 3.704190731048584,
"learning_rate": 2.8161372188675943e-05,
"epoch": 2.16
},
{
"loss": 7.456,
"grad_norm": 3.73481822013855,
"learning_rate": 2.807386015577142e-05,
"epoch": 2.16
},
{
"loss": 7.4728,
"grad_norm": 4.051381587982178,
"learning_rate": 2.7986348122866894e-05,
"epoch": 2.16
},
{
"loss": 7.6646,
"grad_norm": 3.674975633621216,
"learning_rate": 2.7898836089962373e-05,
"epoch": 2.16
},
{
"loss": 7.4139,
"grad_norm": 4.6207709312438965,
"learning_rate": 2.781132405705785e-05,
"epoch": 2.17
},
{
"loss": 7.3962,
"grad_norm": 3.7129499912261963,
"learning_rate": 2.7723812024153324e-05,
"epoch": 2.17
},
{
"loss": 7.423,
"grad_norm": 4.65708589553833,
"learning_rate": 2.76362999912488e-05,
"epoch": 2.17
},
{
"loss": 7.577,
"grad_norm": 5.05981969833374,
"learning_rate": 2.7548787958344275e-05,
"epoch": 2.17
},
{
"loss": 7.4925,
"grad_norm": 4.692249774932861,
"learning_rate": 2.746127592543975e-05,
"epoch": 2.18
},
{
"loss": 7.4587,
"grad_norm": 4.2007856369018555,
"learning_rate": 2.7373763892535226e-05,
"epoch": 2.18
},
{
"loss": 7.7241,
"grad_norm": 6.081201553344727,
"learning_rate": 2.72862518596307e-05,
"epoch": 2.18
},
{
"loss": 7.6261,
"grad_norm": 3.801405429840088,
"learning_rate": 2.7198739826726177e-05,
"epoch": 2.18
},
{
"loss": 7.199,
"grad_norm": 4.788170337677002,
"learning_rate": 2.7111227793821653e-05,
"epoch": 2.19
},
{
"loss": 7.6249,
"grad_norm": 3.934465169906616,
"learning_rate": 2.7023715760917128e-05,
"epoch": 2.19
},
{
"loss": 7.1697,
"grad_norm": 3.270228147506714,
"learning_rate": 2.6936203728012604e-05,
"epoch": 2.19
},
{
"loss": 7.3199,
"grad_norm": 4.648608207702637,
"learning_rate": 2.684869169510808e-05,
"epoch": 2.19
},
{
"loss": 7.0883,
"grad_norm": 3.7127881050109863,
"learning_rate": 2.6761179662203555e-05,
"epoch": 2.2
},
{
"loss": 7.6402,
"grad_norm": 4.310494899749756,
"learning_rate": 2.6673667629299033e-05,
"epoch": 2.2
},
{
"loss": 8.1804,
"grad_norm": 3.9658172130584717,
"learning_rate": 2.658615559639451e-05,
"epoch": 2.2
},
{
"loss": 7.4997,
"grad_norm": 6.007218837738037,
"learning_rate": 2.6498643563489984e-05,
"epoch": 2.2
},
{
"loss": 7.1157,
"grad_norm": 4.3563995361328125,
"learning_rate": 2.641113153058546e-05,
"epoch": 2.21
},
{
"loss": 7.6341,
"grad_norm": 3.9191131591796875,
"learning_rate": 2.6323619497680935e-05,
"epoch": 2.21
},
{
"loss": 7.3165,
"grad_norm": 6.353770732879639,
"learning_rate": 2.623610746477641e-05,
"epoch": 2.21
},
{
"loss": 7.2788,
"grad_norm": 4.23541784286499,
"learning_rate": 2.6148595431871886e-05,
"epoch": 2.22
},
{
"loss": 7.4787,
"grad_norm": 7.060284614562988,
"learning_rate": 2.6061083398967362e-05,
"epoch": 2.22
},
{
"loss": 7.7702,
"grad_norm": 3.484837055206299,
"learning_rate": 2.597357136606283e-05,
"epoch": 2.22
},
{
"loss": 7.3549,
"grad_norm": 6.6010589599609375,
"learning_rate": 2.5886059333158306e-05,
"epoch": 2.22
},
{
"loss": 7.2694,
"grad_norm": 4.792263984680176,
"learning_rate": 2.5798547300253785e-05,
"epoch": 2.23
},
{
"loss": 7.479,
"grad_norm": 4.992294788360596,
"learning_rate": 2.571103526734926e-05,
"epoch": 2.23
},
{
"loss": 7.5955,
"grad_norm": 5.028162956237793,
"learning_rate": 2.5623523234444736e-05,
"epoch": 2.23
},
{
"loss": 7.1964,
"grad_norm": 4.13356876373291,
"learning_rate": 2.553601120154021e-05,
"epoch": 2.23
},
{
"loss": 6.9294,
"grad_norm": 3.5145249366760254,
"learning_rate": 2.5448499168635687e-05,
"epoch": 2.24
},
{
"loss": 7.4454,
"grad_norm": 3.8901588916778564,
"learning_rate": 2.5360987135731163e-05,
"epoch": 2.24
},
{
"loss": 7.738,
"grad_norm": 4.009905815124512,
"learning_rate": 2.5273475102826638e-05,
"epoch": 2.24
},
{
"loss": 7.3253,
"grad_norm": 4.332956314086914,
"learning_rate": 2.5185963069922114e-05,
"epoch": 2.24
},
{
"loss": 7.1716,
"grad_norm": 3.688816785812378,
"learning_rate": 2.509845103701759e-05,
"epoch": 2.25
},
{
"loss": 6.8345,
"grad_norm": 7.10718297958374,
"learning_rate": 2.5010939004113065e-05,
"epoch": 2.25
},
{
"loss": 7.7885,
"grad_norm": 5.8644585609436035,
"learning_rate": 2.492342697120854e-05,
"epoch": 2.25
},
{
"loss": 7.3728,
"grad_norm": 2.958936929702759,
"learning_rate": 2.4835914938304016e-05,
"epoch": 2.25
},
{
"loss": 7.2452,
"grad_norm": 3.498347520828247,
"learning_rate": 2.474840290539949e-05,
"epoch": 2.26
},
{
"loss": 7.0227,
"grad_norm": 4.527777671813965,
"learning_rate": 2.4660890872494967e-05,
"epoch": 2.26
},
{
"loss": 7.0811,
"grad_norm": 4.315553665161133,
"learning_rate": 2.4573378839590446e-05,
"epoch": 2.26
},
{
"loss": 7.5339,
"grad_norm": 3.5268032550811768,
"learning_rate": 2.448586680668592e-05,
"epoch": 2.27
},
{
"loss": 7.5172,
"grad_norm": 4.606849670410156,
"learning_rate": 2.4398354773781397e-05,
"epoch": 2.27
},
{
"loss": 7.5034,
"grad_norm": 3.0682761669158936,
"learning_rate": 2.4310842740876872e-05,
"epoch": 2.27
},
{
"loss": 7.0923,
"grad_norm": 3.9300010204315186,
"learning_rate": 2.4223330707972348e-05,
"epoch": 2.27
},
{
"loss": 7.3782,
"grad_norm": 5.444020748138428,
"learning_rate": 2.4135818675067823e-05,
"epoch": 2.28
},
{
"loss": 8.0698,
"grad_norm": 3.9157919883728027,
"learning_rate": 2.40483066421633e-05,
"epoch": 2.28
},
{
"loss": 7.4659,
"grad_norm": 4.808152675628662,
"learning_rate": 2.3960794609258774e-05,
"epoch": 2.28
},
{
"loss": 7.2596,
"grad_norm": 4.249693870544434,
"learning_rate": 2.387328257635425e-05,
"epoch": 2.28
},
{
"loss": 7.0133,
"grad_norm": 4.091562271118164,
"learning_rate": 2.3785770543449725e-05,
"epoch": 2.29
},
{
"loss": 7.9337,
"grad_norm": 3.689053535461426,
"learning_rate": 2.36982585105452e-05,
"epoch": 2.29
},
{
"loss": 7.954,
"grad_norm": 3.9822888374328613,
"learning_rate": 2.3610746477640676e-05,
"epoch": 2.29
},
{
"loss": 8.0586,
"grad_norm": 4.524798393249512,
"learning_rate": 2.352323444473615e-05,
"epoch": 2.29
},
{
"loss": 7.6348,
"grad_norm": 4.638789653778076,
"learning_rate": 2.3435722411831627e-05,
"epoch": 2.3
},
{
"loss": 7.1951,
"grad_norm": 3.9606380462646484,
"learning_rate": 2.3348210378927106e-05,
"epoch": 2.3
},
{
"loss": 7.1919,
"grad_norm": 4.085976600646973,
"learning_rate": 2.326069834602258e-05,
"epoch": 2.3
},
{
"loss": 7.7985,
"grad_norm": 4.817371845245361,
"learning_rate": 2.3173186313118057e-05,
"epoch": 2.3
},
{
"loss": 7.4659,
"grad_norm": 4.804962635040283,
"learning_rate": 2.3085674280213533e-05,
"epoch": 2.31
},
{
"loss": 7.6245,
"grad_norm": 5.15590763092041,
"learning_rate": 2.2998162247309005e-05,
"epoch": 2.31
},
{
"loss": 7.1799,
"grad_norm": 5.4307122230529785,
"learning_rate": 2.291065021440448e-05,
"epoch": 2.31
},
{
"loss": 6.737,
"grad_norm": 3.417074680328369,
"learning_rate": 2.2823138181499956e-05,
"epoch": 2.32
},
{
"loss": 7.3044,
"grad_norm": 5.047757148742676,
"learning_rate": 2.273562614859543e-05,
"epoch": 2.32
},
{
"loss": 7.0146,
"grad_norm": 5.767230033874512,
"learning_rate": 2.2648114115690907e-05,
"epoch": 2.32
},
{
"loss": 7.5375,
"grad_norm": 4.92877197265625,
"learning_rate": 2.2560602082786382e-05,
"epoch": 2.32
},
{
"loss": 7.5536,
"grad_norm": 3.8499937057495117,
"learning_rate": 2.2473090049881858e-05,
"epoch": 2.33
},
{
"loss": 7.2617,
"grad_norm": 3.698652505874634,
"learning_rate": 2.2385578016977337e-05,
"epoch": 2.33
},
{
"loss": 7.3101,
"grad_norm": 3.8474197387695312,
"learning_rate": 2.2298065984072812e-05,
"epoch": 2.33
},
{
"loss": 7.4945,
"grad_norm": 4.18773889541626,
"learning_rate": 2.2210553951168288e-05,
"epoch": 2.33
},
{
"loss": 7.1942,
"grad_norm": 4.604954242706299,
"learning_rate": 2.2123041918263763e-05,
"epoch": 2.34
},
{
"loss": 7.3004,
"grad_norm": 4.48193359375,
"learning_rate": 2.203552988535924e-05,
"epoch": 2.34
},
{
"loss": 7.2258,
"grad_norm": 3.1619014739990234,
"learning_rate": 2.1948017852454714e-05,
"epoch": 2.34
},
{
"loss": 7.1968,
"grad_norm": 4.031898021697998,
"learning_rate": 2.186050581955019e-05,
"epoch": 2.34
},
{
"loss": 7.7404,
"grad_norm": 4.030830383300781,
"learning_rate": 2.1772993786645665e-05,
"epoch": 2.35
},
{
"loss": 7.1855,
"grad_norm": 3.3764097690582275,
"learning_rate": 2.168548175374114e-05,
"epoch": 2.35
},
{
"loss": 7.0507,
"grad_norm": 5.506438732147217,
"learning_rate": 2.1597969720836616e-05,
"epoch": 2.35
},
{
"loss": 6.5909,
"grad_norm": 4.797235012054443,
"learning_rate": 2.151045768793209e-05,
"epoch": 2.35
},
{
"loss": 7.418,
"grad_norm": 4.5042853355407715,
"learning_rate": 2.1422945655027567e-05,
"epoch": 2.36
},
{
"loss": 7.3702,
"grad_norm": 3.449220657348633,
"learning_rate": 2.1335433622123043e-05,
"epoch": 2.36
},
{
"loss": 7.2734,
"grad_norm": 5.276688098907471,
"learning_rate": 2.1247921589218518e-05,
"epoch": 2.36
},
{
"loss": 7.1368,
"grad_norm": 4.960446834564209,
"learning_rate": 2.1160409556313997e-05,
"epoch": 2.37
},
{
"loss": 7.3468,
"grad_norm": 4.041114330291748,
"learning_rate": 2.1072897523409473e-05,
"epoch": 2.37
},
{
"loss": 7.5724,
"grad_norm": 5.667148113250732,
"learning_rate": 2.0985385490504945e-05,
"epoch": 2.37
},
{
"loss": 7.1379,
"grad_norm": 3.245389223098755,
"learning_rate": 2.089787345760042e-05,
"epoch": 2.37
},
{
"loss": 6.9722,
"grad_norm": 4.715411186218262,
"learning_rate": 2.0810361424695896e-05,
"epoch": 2.38
},
{
"loss": 7.4667,
"grad_norm": 3.4023447036743164,
"learning_rate": 2.072284939179137e-05,
"epoch": 2.38
},
{
"loss": 7.3368,
"grad_norm": 4.798887252807617,
"learning_rate": 2.0635337358886847e-05,
"epoch": 2.38
},
{
"loss": 7.224,
"grad_norm": 4.741410255432129,
"learning_rate": 2.0547825325982322e-05,
"epoch": 2.38
},
{
"loss": 7.0656,
"grad_norm": 3.5715346336364746,
"learning_rate": 2.046906449636825e-05,
"epoch": 2.39
},
{
"loss": 7.0197,
"grad_norm": 4.428717613220215,
"learning_rate": 2.0381552463463727e-05,
"epoch": 2.39
},
{
"loss": 7.4517,
"grad_norm": 4.353855133056641,
"learning_rate": 2.0294040430559202e-05,
"epoch": 2.39
},
{
"loss": 7.5488,
"grad_norm": 3.331164598464966,
"learning_rate": 2.0206528397654678e-05,
"epoch": 2.39
},
{
"loss": 6.8231,
"grad_norm": 4.357122898101807,
"learning_rate": 2.0119016364750153e-05,
"epoch": 2.4
},
{
"loss": 7.8218,
"grad_norm": 3.6374125480651855,
"learning_rate": 2.003150433184563e-05,
"epoch": 2.4
},
{
"loss": 7.2913,
"grad_norm": 3.414724826812744,
"learning_rate": 1.9943992298941104e-05,
"epoch": 2.4
},
{
"loss": 7.5022,
"grad_norm": 3.073855400085449,
"learning_rate": 1.985648026603658e-05,
"epoch": 2.4
},
{
"loss": 7.1292,
"grad_norm": 5.69718074798584,
"learning_rate": 1.976896823313206e-05,
"epoch": 2.41
},
{
"loss": 7.5066,
"grad_norm": 3.6818926334381104,
"learning_rate": 1.9681456200227534e-05,
"epoch": 2.41
},
{
"loss": 7.1924,
"grad_norm": 6.099584579467773,
"learning_rate": 1.959394416732301e-05,
"epoch": 2.41
},
{
"loss": 7.7311,
"grad_norm": 4.388739109039307,
"learning_rate": 1.9506432134418485e-05,
"epoch": 2.41
},
{
"loss": 7.8034,
"grad_norm": 4.578341007232666,
"learning_rate": 1.941892010151396e-05,
"epoch": 2.42
},
{
"loss": 7.1698,
"grad_norm": 3.4639930725097656,
"learning_rate": 1.9331408068609436e-05,
"epoch": 2.42
},
{
"loss": 7.0058,
"grad_norm": 4.414987564086914,
"learning_rate": 1.924389603570491e-05,
"epoch": 2.42
},
{
"loss": 7.3363,
"grad_norm": 4.268624305725098,
"learning_rate": 1.9156384002800387e-05,
"epoch": 2.43
},
{
"loss": 7.2589,
"grad_norm": 6.716452598571777,
"learning_rate": 1.9068871969895863e-05,
"epoch": 2.43
},
{
"loss": 7.2501,
"grad_norm": 5.058889865875244,
"learning_rate": 1.8981359936991335e-05,
"epoch": 2.43
},
{
"loss": 7.3893,
"grad_norm": 6.656921863555908,
"learning_rate": 1.889384790408681e-05,
"epoch": 2.43
},
{
"loss": 7.2942,
"grad_norm": 4.824561595916748,
"learning_rate": 1.8806335871182286e-05,
"epoch": 2.44
},
{
"loss": 7.6477,
"grad_norm": 5.925858020782471,
"learning_rate": 1.8718823838277765e-05,
"epoch": 2.44
},
{
"loss": 6.9424,
"grad_norm": 3.955688714981079,
"learning_rate": 1.863131180537324e-05,
"epoch": 2.44
},
{
"loss": 6.8707,
"grad_norm": 3.95426869392395,
"learning_rate": 1.8543799772468716e-05,
"epoch": 2.44
},
{
"loss": 7.5012,
"grad_norm": 5.377491474151611,
"learning_rate": 1.845628773956419e-05,
"epoch": 2.45
},
{
"loss": 7.6028,
"grad_norm": 4.264338970184326,
"learning_rate": 1.8368775706659667e-05,
"epoch": 2.45
},
{
"loss": 6.8808,
"grad_norm": 5.041021347045898,
"learning_rate": 1.8281263673755142e-05,
"epoch": 2.45
},
{
"loss": 8.0187,
"grad_norm": 6.484523773193359,
"learning_rate": 1.8193751640850618e-05,
"epoch": 2.45
},
{
"loss": 6.9991,
"grad_norm": 4.5790205001831055,
"learning_rate": 1.8106239607946093e-05,
"epoch": 2.46
},
{
"loss": 7.2012,
"grad_norm": 4.204977989196777,
"learning_rate": 1.801872757504157e-05,
"epoch": 2.46
},
{
"loss": 7.6324,
"grad_norm": 3.803563356399536,
"learning_rate": 1.7931215542137044e-05,
"epoch": 2.46
},
{
"loss": 6.6331,
"grad_norm": 3.6445772647857666,
"learning_rate": 1.784370350923252e-05,
"epoch": 2.46
},
{
"loss": 7.4633,
"grad_norm": 3.9381942749023438,
"learning_rate": 1.7756191476327995e-05,
"epoch": 2.47
},
{
"loss": 7.4318,
"grad_norm": 3.573315382003784,
"learning_rate": 1.766867944342347e-05,
"epoch": 2.47
},
{
"loss": 7.3045,
"grad_norm": 3.7262725830078125,
"learning_rate": 1.7581167410518946e-05,
"epoch": 2.47
},
{
"loss": 6.9975,
"grad_norm": 4.73222541809082,
"learning_rate": 1.7493655377614425e-05,
"epoch": 2.48
},
{
"loss": 6.7643,
"grad_norm": 4.269005298614502,
"learning_rate": 1.74061433447099e-05,
"epoch": 2.48
},
{
"loss": 7.7472,
"grad_norm": 4.969855785369873,
"learning_rate": 1.7318631311805376e-05,
"epoch": 2.48
},
{
"loss": 7.0408,
"grad_norm": 4.290554046630859,
"learning_rate": 1.723111927890085e-05,
"epoch": 2.48
},
{
"loss": 7.5848,
"grad_norm": 4.593362808227539,
"learning_rate": 1.7143607245996327e-05,
"epoch": 2.49
},
{
"loss": 7.2043,
"grad_norm": 3.8505163192749023,
"learning_rate": 1.7056095213091803e-05,
"epoch": 2.49
},
{
"loss": 7.076,
"grad_norm": 5.526023864746094,
"learning_rate": 1.6968583180187275e-05,
"epoch": 2.49
},
{
"loss": 7.5282,
"grad_norm": 4.70090389251709,
"learning_rate": 1.688107114728275e-05,
"epoch": 2.49
},
{
"loss": 7.1022,
"grad_norm": 5.819429397583008,
"learning_rate": 1.6793559114378226e-05,
"epoch": 2.5
},
{
"loss": 7.5449,
"grad_norm": 4.2631707191467285,
"learning_rate": 1.67060470814737e-05,
"epoch": 2.5
},
{
"loss": 7.5941,
"grad_norm": 5.127431392669678,
"learning_rate": 1.6618535048569177e-05,
"epoch": 2.5
},
{
"loss": 7.1741,
"grad_norm": 5.605392932891846,
"learning_rate": 1.6531023015664652e-05,
"epoch": 2.5
},
{
"loss": 7.5186,
"grad_norm": 5.392033576965332,
"learning_rate": 1.644351098276013e-05,
"epoch": 2.51
},
{
"loss": 7.4369,
"grad_norm": 4.743539810180664,
"learning_rate": 1.6355998949855607e-05,
"epoch": 2.51
},
{
"loss": 7.0646,
"grad_norm": 4.0009684562683105,
"learning_rate": 1.6268486916951082e-05,
"epoch": 2.51
},
{
"loss": 7.3658,
"grad_norm": 4.551602363586426,
"learning_rate": 1.6180974884046558e-05,
"epoch": 2.51
},
{
"loss": 7.7071,
"grad_norm": 3.369328737258911,
"learning_rate": 1.6093462851142033e-05,
"epoch": 2.52
},
{
"loss": 7.5386,
"grad_norm": 3.6127750873565674,
"learning_rate": 1.600595081823751e-05,
"epoch": 2.52
},
{
"loss": 7.3196,
"grad_norm": 4.915907382965088,
"learning_rate": 1.5918438785332984e-05,
"epoch": 2.52
},
{
"loss": 7.1598,
"grad_norm": 5.295419216156006,
"learning_rate": 1.583092675242846e-05,
"epoch": 2.53
},
{
"loss": 7.2027,
"grad_norm": 5.066037654876709,
"learning_rate": 1.5743414719523935e-05,
"epoch": 2.53
},
{
"loss": 7.4418,
"grad_norm": 5.553489685058594,
"learning_rate": 1.565590268661941e-05,
"epoch": 2.53
},
{
"loss": 6.8532,
"grad_norm": 4.176399230957031,
"learning_rate": 1.5568390653714886e-05,
"epoch": 2.53
},
{
"loss": 7.1576,
"grad_norm": 5.018221855163574,
"learning_rate": 1.5480878620810362e-05,
"epoch": 2.54
},
{
"loss": 7.5015,
"grad_norm": 3.439542293548584,
"learning_rate": 1.5393366587905837e-05,
"epoch": 2.54
},
{
"loss": 6.8427,
"grad_norm": 3.641223907470703,
"learning_rate": 1.5305854555001313e-05,
"epoch": 2.54
},
{
"loss": 7.7335,
"grad_norm": 5.225297451019287,
"learning_rate": 1.521834252209679e-05,
"epoch": 2.54
},
{
"loss": 7.1007,
"grad_norm": 3.5159335136413574,
"learning_rate": 1.5130830489192265e-05,
"epoch": 2.55
},
{
"loss": 7.3267,
"grad_norm": 4.219715118408203,
"learning_rate": 1.5043318456287741e-05,
"epoch": 2.55
},
{
"loss": 7.0066,
"grad_norm": 4.482273101806641,
"learning_rate": 1.4955806423383215e-05,
"epoch": 2.55
},
{
"loss": 7.4307,
"grad_norm": 4.263273239135742,
"learning_rate": 1.486829439047869e-05,
"epoch": 2.55
},
{
"loss": 7.3687,
"grad_norm": 4.202017784118652,
"learning_rate": 1.4780782357574166e-05,
"epoch": 2.56
},
{
"loss": 7.6405,
"grad_norm": 5.738183498382568,
"learning_rate": 1.4693270324669641e-05,
"epoch": 2.56
},
{
"loss": 7.3089,
"grad_norm": 5.287261962890625,
"learning_rate": 1.4605758291765117e-05,
"epoch": 2.56
},
{
"loss": 7.0879,
"grad_norm": 5.147162914276123,
"learning_rate": 1.4518246258860594e-05,
"epoch": 2.56
},
{
"loss": 7.145,
"grad_norm": 3.873149871826172,
"learning_rate": 1.443073422595607e-05,
"epoch": 2.57
},
{
"loss": 7.1013,
"grad_norm": 4.64039945602417,
"learning_rate": 1.4343222193051545e-05,
"epoch": 2.57
},
{
"loss": 7.0642,
"grad_norm": 3.6532037258148193,
"learning_rate": 1.425571016014702e-05,
"epoch": 2.57
},
{
"loss": 7.1647,
"grad_norm": 3.756361484527588,
"learning_rate": 1.4168198127242496e-05,
"epoch": 2.58
},
{
"loss": 7.0501,
"grad_norm": 3.5314979553222656,
"learning_rate": 1.4080686094337972e-05,
"epoch": 2.58
},
{
"loss": 7.4017,
"grad_norm": 3.386040687561035,
"learning_rate": 1.3993174061433447e-05,
"epoch": 2.58
},
{
"loss": 7.4412,
"grad_norm": 3.566223382949829,
"learning_rate": 1.3905662028528924e-05,
"epoch": 2.58
},
{
"loss": 7.0911,
"grad_norm": 5.274896621704102,
"learning_rate": 1.38181499956244e-05,
"epoch": 2.59
},
{
"loss": 7.0295,
"grad_norm": 5.615356922149658,
"learning_rate": 1.3730637962719875e-05,
"epoch": 2.59
},
{
"loss": 7.1455,
"grad_norm": 4.624752521514893,
"learning_rate": 1.364312592981535e-05,
"epoch": 2.59
},
{
"loss": 7.1833,
"grad_norm": 4.156666278839111,
"learning_rate": 1.3555613896910826e-05,
"epoch": 2.59
},
{
"loss": 7.8835,
"grad_norm": 3.4591434001922607,
"learning_rate": 1.3468101864006302e-05,
"epoch": 2.6
},
{
"loss": 7.1062,
"grad_norm": 3.3804733753204346,
"learning_rate": 1.3380589831101777e-05,
"epoch": 2.6
},
{
"loss": 7.365,
"grad_norm": 3.7281017303466797,
"learning_rate": 1.3293077798197254e-05,
"epoch": 2.6
},
{
"loss": 7.0466,
"grad_norm": 3.4281463623046875,
"learning_rate": 1.320556576529273e-05,
"epoch": 2.6
},
{
"loss": 7.4435,
"grad_norm": 3.9861958026885986,
"learning_rate": 1.3118053732388205e-05,
"epoch": 2.61
},
{
"loss": 7.3772,
"grad_norm": 5.1440253257751465,
"learning_rate": 1.3030541699483681e-05,
"epoch": 2.61
},
{
"loss": 7.2704,
"grad_norm": 3.7356927394866943,
"learning_rate": 1.2943029666579153e-05,
"epoch": 2.61
},
{
"loss": 7.4473,
"grad_norm": 3.138427257537842,
"learning_rate": 1.285551763367463e-05,
"epoch": 2.61
},
{
"loss": 7.654,
"grad_norm": 5.250783920288086,
"learning_rate": 1.2768005600770106e-05,
"epoch": 2.62
},
{
"loss": 7.2723,
"grad_norm": 3.7493326663970947,
"learning_rate": 1.2680493567865581e-05,
"epoch": 2.62
},
{
"loss": 7.2502,
"grad_norm": 4.482826232910156,
"learning_rate": 1.2592981534961057e-05,
"epoch": 2.62
},
{
"loss": 6.9983,
"grad_norm": 4.741217613220215,
"learning_rate": 1.2505469502056532e-05,
"epoch": 2.62
},
{
"loss": 7.1721,
"grad_norm": 5.053958892822266,
"learning_rate": 1.2417957469152008e-05,
"epoch": 2.63
},
{
"loss": 7.2113,
"grad_norm": 5.000698089599609,
"learning_rate": 1.2330445436247483e-05,
"epoch": 2.63
},
{
"loss": 7.0898,
"grad_norm": 5.456648826599121,
"learning_rate": 1.224293340334296e-05,
"epoch": 2.63
},
{
"loss": 7.2822,
"grad_norm": 3.733816146850586,
"learning_rate": 1.2155421370438436e-05,
"epoch": 2.64
},
{
"loss": 7.4952,
"grad_norm": 4.114339351654053,
"learning_rate": 1.2067909337533912e-05,
"epoch": 2.64
},
{
"loss": 6.9912,
"grad_norm": 3.963610887527466,
"learning_rate": 1.1980397304629387e-05,
"epoch": 2.64
},
{
"loss": 7.2077,
"grad_norm": 4.697625637054443,
"learning_rate": 1.1892885271724863e-05,
"epoch": 2.64
},
{
"loss": 6.9453,
"grad_norm": 3.8456337451934814,
"learning_rate": 1.1805373238820338e-05,
"epoch": 2.65
},
{
"loss": 7.6018,
"grad_norm": 3.9979872703552246,
"learning_rate": 1.1717861205915814e-05,
"epoch": 2.65
},
{
"loss": 7.078,
"grad_norm": 4.1047563552856445,
"learning_rate": 1.163034917301129e-05,
"epoch": 2.65
},
{
"loss": 6.7399,
"grad_norm": 5.3073248863220215,
"learning_rate": 1.1542837140106766e-05,
"epoch": 2.65
},
{
"loss": 6.9167,
"grad_norm": 5.714503765106201,
"learning_rate": 1.145532510720224e-05,
"epoch": 2.66
},
{
"loss": 7.4903,
"grad_norm": 3.9626924991607666,
"learning_rate": 1.1367813074297716e-05,
"epoch": 2.66
},
{
"loss": 7.4558,
"grad_norm": 4.751763343811035,
"learning_rate": 1.1280301041393191e-05,
"epoch": 2.66
},
{
"loss": 7.5696,
"grad_norm": 3.73614501953125,
"learning_rate": 1.1192789008488668e-05,
"epoch": 2.66
},
{
"loss": 7.1071,
"grad_norm": 3.236339569091797,
"learning_rate": 1.1105276975584144e-05,
"epoch": 2.67
},
{
"loss": 7.6115,
"grad_norm": 4.271381855010986,
"learning_rate": 1.101776494267962e-05,
"epoch": 2.67
},
{
"loss": 7.1791,
"grad_norm": 3.6989824771881104,
"learning_rate": 1.0930252909775095e-05,
"epoch": 2.67
},
{
"loss": 7.0511,
"grad_norm": 3.856694221496582,
"learning_rate": 1.084274087687057e-05,
"epoch": 2.67
},
{
"loss": 7.2655,
"grad_norm": 4.834972858428955,
"learning_rate": 1.0755228843966046e-05,
"epoch": 2.68
},
{
"loss": 7.0369,
"grad_norm": 4.6722211837768555,
"learning_rate": 1.0667716811061521e-05,
"epoch": 2.68
},
{
"loss": 7.1936,
"grad_norm": 4.993673324584961,
"learning_rate": 1.0580204778156999e-05,
"epoch": 2.68
},
{
"loss": 7.3347,
"grad_norm": 3.4490904808044434,
"learning_rate": 1.0492692745252472e-05,
"epoch": 2.69
},
{
"loss": 7.736,
"grad_norm": 3.283051013946533,
"learning_rate": 1.0405180712347948e-05,
"epoch": 2.69
},
{
"loss": 7.0317,
"grad_norm": 3.656076431274414,
"learning_rate": 1.0317668679443423e-05,
"epoch": 2.69
},
{
"loss": 6.866,
"grad_norm": 3.4769787788391113,
"learning_rate": 1.0230156646538899e-05,
"epoch": 2.69
},
{
"loss": 7.4499,
"grad_norm": 3.384229898452759,
"learning_rate": 1.0142644613634374e-05,
"epoch": 2.7
},
{
"loss": 6.9746,
"grad_norm": 4.784582614898682,
"learning_rate": 1.0055132580729852e-05,
"epoch": 2.7
},
{
"loss": 7.0138,
"grad_norm": 4.076469898223877,
"learning_rate": 9.967620547825327e-06,
"epoch": 2.7
},
{
"loss": 7.2836,
"grad_norm": 5.0796709060668945,
"learning_rate": 9.880108514920803e-06,
"epoch": 2.7
},
{
"loss": 7.0612,
"grad_norm": 4.263620853424072,
"learning_rate": 9.792596482016278e-06,
"epoch": 2.71
},
{
"loss": 7.0573,
"grad_norm": 4.355484485626221,
"learning_rate": 9.705084449111754e-06,
"epoch": 2.71
},
{
"loss": 7.3385,
"grad_norm": 6.618645668029785,
"learning_rate": 9.617572416207229e-06,
"epoch": 2.71
},
{
"loss": 7.5994,
"grad_norm": 4.804537296295166,
"learning_rate": 9.530060383302705e-06,
"epoch": 2.71
},
{
"loss": 8.1717,
"grad_norm": 4.777498722076416,
"learning_rate": 9.44254835039818e-06,
"epoch": 2.72
},
{
"loss": 7.1703,
"grad_norm": 3.5699825286865234,
"learning_rate": 9.355036317493656e-06,
"epoch": 2.72
},
{
"loss": 7.0822,
"grad_norm": 6.044339179992676,
"learning_rate": 9.267524284589131e-06,
"epoch": 2.72
},
{
"loss": 7.0899,
"grad_norm": 3.558217763900757,
"learning_rate": 9.180012251684607e-06,
"epoch": 2.72
},
{
"loss": 6.9441,
"grad_norm": 4.0059075355529785,
"learning_rate": 9.092500218780082e-06,
"epoch": 2.73
},
{
"loss": 7.1065,
"grad_norm": 5.324728012084961,
"learning_rate": 9.004988185875558e-06,
"epoch": 2.73
},
{
"loss": 6.91,
"grad_norm": 3.852426767349243,
"learning_rate": 8.917476152971035e-06,
"epoch": 2.73
},
{
"loss": 7.7985,
"grad_norm": 5.7546844482421875,
"learning_rate": 8.82996412006651e-06,
"epoch": 2.74
},
{
"loss": 7.098,
"grad_norm": 4.994897842407227,
"learning_rate": 8.742452087161986e-06,
"epoch": 2.74
},
{
"loss": 7.4831,
"grad_norm": 4.3503522872924805,
"learning_rate": 8.654940054257461e-06,
"epoch": 2.74
},
{
"loss": 7.3563,
"grad_norm": 3.4878551959991455,
"learning_rate": 8.567428021352937e-06,
"epoch": 2.74
},
{
"loss": 7.8125,
"grad_norm": 4.518803596496582,
"learning_rate": 8.47991598844841e-06,
"epoch": 2.75
},
{
"loss": 7.1354,
"grad_norm": 3.4671084880828857,
"learning_rate": 8.392403955543888e-06,
"epoch": 2.75
},
{
"loss": 7.1782,
"grad_norm": 5.328606128692627,
"learning_rate": 8.304891922639363e-06,
"epoch": 2.75
},
{
"loss": 7.7318,
"grad_norm": 5.223174095153809,
"learning_rate": 8.217379889734839e-06,
"epoch": 2.75
},
{
"loss": 6.9753,
"grad_norm": 3.5544798374176025,
"learning_rate": 8.129867856830314e-06,
"epoch": 2.76
},
{
"loss": 7.5763,
"grad_norm": 8.088041305541992,
"learning_rate": 8.04235582392579e-06,
"epoch": 2.76
},
{
"loss": 7.9196,
"grad_norm": 4.860823631286621,
"learning_rate": 7.954843791021265e-06,
"epoch": 2.76
},
{
"loss": 6.7199,
"grad_norm": 2.9834036827087402,
"learning_rate": 7.867331758116741e-06,
"epoch": 2.76
},
{
"loss": 7.0354,
"grad_norm": 3.6943893432617188,
"learning_rate": 7.779819725212218e-06,
"epoch": 2.77
},
{
"loss": 7.7071,
"grad_norm": 4.091310977935791,
"learning_rate": 7.692307692307694e-06,
"epoch": 2.77
},
{
"loss": 7.2135,
"grad_norm": 3.6830339431762695,
"learning_rate": 7.604795659403169e-06,
"epoch": 2.77
},
{
"loss": 7.5218,
"grad_norm": 3.4381253719329834,
"learning_rate": 7.517283626498645e-06,
"epoch": 2.77
},
{
"loss": 7.1017,
"grad_norm": 5.597609519958496,
"learning_rate": 7.429771593594119e-06,
"epoch": 2.78
},
{
"loss": 6.7574,
"grad_norm": 3.175727128982544,
"learning_rate": 7.342259560689595e-06,
"epoch": 2.78
},
{
"loss": 6.8652,
"grad_norm": 3.873260021209717,
"learning_rate": 7.25474752778507e-06,
"epoch": 2.78
},
{
"loss": 7.0743,
"grad_norm": 3.3759090900421143,
"learning_rate": 7.167235494880546e-06,
"epoch": 2.79
},
{
"loss": 7.4346,
"grad_norm": 4.680045127868652,
"learning_rate": 7.079723461976022e-06,
"epoch": 2.79
},
{
"loss": 7.1374,
"grad_norm": 5.331534385681152,
"learning_rate": 6.992211429071498e-06,
"epoch": 2.79
},
{
"loss": 7.1309,
"grad_norm": 5.658579349517822,
"learning_rate": 6.904699396166973e-06,
"epoch": 2.79
},
{
"loss": 6.9677,
"grad_norm": 4.185191631317139,
"learning_rate": 6.8171873632624495e-06,
"epoch": 2.8
},
{
"loss": 6.9958,
"grad_norm": 3.82647967338562,
"learning_rate": 6.729675330357925e-06,
"epoch": 2.8
},
{
"loss": 7.1796,
"grad_norm": 4.288081645965576,
"learning_rate": 6.6421632974534005e-06,
"epoch": 2.8
},
{
"loss": 7.0815,
"grad_norm": 3.3872106075286865,
"learning_rate": 6.554651264548876e-06,
"epoch": 2.8
},
{
"loss": 6.9035,
"grad_norm": 3.4726109504699707,
"learning_rate": 6.467139231644351e-06,
"epoch": 2.81
},
{
"loss": 7.4375,
"grad_norm": 5.091712951660156,
"learning_rate": 6.379627198739826e-06,
"epoch": 2.81
},
{
"loss": 7.4423,
"grad_norm": 3.271453380584717,
"learning_rate": 6.2921151658353025e-06,
"epoch": 2.81
},
{
"loss": 6.976,
"grad_norm": 3.8439278602600098,
"learning_rate": 6.204603132930778e-06,
"epoch": 2.81
},
{
"loss": 7.7446,
"grad_norm": 3.4631197452545166,
"learning_rate": 6.1170911000262535e-06,
"epoch": 2.82
},
{
"loss": 7.7139,
"grad_norm": 3.5582733154296875,
"learning_rate": 6.02957906712173e-06,
"epoch": 2.82
},
{
"loss": 7.0974,
"grad_norm": 4.480440139770508,
"learning_rate": 5.942067034217205e-06,
"epoch": 2.82
},
{
"loss": 7.6834,
"grad_norm": 4.127463340759277,
"learning_rate": 5.854555001312681e-06,
"epoch": 2.82
},
{
"loss": 7.0067,
"grad_norm": 4.044102191925049,
"learning_rate": 5.767042968408156e-06,
"epoch": 2.83
},
{
"loss": 7.4619,
"grad_norm": 5.20751953125,
"learning_rate": 5.679530935503632e-06,
"epoch": 2.83
},
{
"loss": 6.9783,
"grad_norm": 4.303256511688232,
"learning_rate": 5.592018902599107e-06,
"epoch": 2.83
},
{
"loss": 7.093,
"grad_norm": 5.665140151977539,
"learning_rate": 5.504506869694583e-06,
"epoch": 2.83
},
{
"loss": 7.5041,
"grad_norm": 4.066624164581299,
"learning_rate": 5.416994836790059e-06,
"epoch": 2.84
},
{
"loss": 6.906,
"grad_norm": 4.449793815612793,
"learning_rate": 5.329482803885535e-06,
"epoch": 2.84
},
{
"loss": 7.4408,
"grad_norm": 4.4521074295043945,
"learning_rate": 5.2419707709810094e-06,
"epoch": 2.84
},
{
"loss": 7.1617,
"grad_norm": 3.7591211795806885,
"learning_rate": 5.154458738076486e-06,
"epoch": 2.85
},
{
"loss": 6.955,
"grad_norm": 5.360795974731445,
"learning_rate": 5.066946705171961e-06,
"epoch": 2.85
},
{
"loss": 7.3291,
"grad_norm": 3.872117280960083,
"learning_rate": 4.979434672267437e-06,
"epoch": 2.85
},
{
"loss": 7.7188,
"grad_norm": 5.078587055206299,
"learning_rate": 4.891922639362913e-06,
"epoch": 2.85
},
{
"loss": 7.1934,
"grad_norm": 5.7633056640625,
"learning_rate": 4.804410606458389e-06,
"epoch": 2.86
},
{
"loss": 7.0959,
"grad_norm": 3.960428476333618,
"learning_rate": 4.716898573553863e-06,
"epoch": 2.86
},
{
"loss": 6.9419,
"grad_norm": 6.363913536071777,
"learning_rate": 4.62938654064934e-06,
"epoch": 2.86
},
{
"loss": 7.0335,
"grad_norm": 4.09603214263916,
"learning_rate": 4.541874507744815e-06,
"epoch": 2.86
},
{
"loss": 7.2072,
"grad_norm": 3.217400312423706,
"learning_rate": 4.454362474840291e-06,
"epoch": 2.87
},
{
"loss": 7.3142,
"grad_norm": 4.389254570007324,
"learning_rate": 4.366850441935766e-06,
"epoch": 2.87
},
{
"loss": 7.3642,
"grad_norm": 4.192555904388428,
"learning_rate": 4.2793384090312425e-06,
"epoch": 2.87
},
{
"loss": 6.8518,
"grad_norm": 4.6586809158325195,
"learning_rate": 4.191826376126717e-06,
"epoch": 2.87
},
{
"loss": 7.4924,
"grad_norm": 3.969644784927368,
"learning_rate": 4.104314343222193e-06,
"epoch": 2.88
},
{
"loss": 6.8693,
"grad_norm": 3.7710835933685303,
"learning_rate": 4.016802310317669e-06,
"epoch": 2.88
},
{
"loss": 6.8957,
"grad_norm": 3.549421548843384,
"learning_rate": 3.9292902774131446e-06,
"epoch": 2.88
},
{
"loss": 7.6237,
"grad_norm": 3.6362552642822266,
"learning_rate": 3.84177824450862e-06,
"epoch": 2.88
},
{
"loss": 7.2684,
"grad_norm": 4.023890972137451,
"learning_rate": 3.754266211604096e-06,
"epoch": 2.89
},
{
"loss": 7.2039,
"grad_norm": 3.6492927074432373,
"learning_rate": 3.666754178699571e-06,
"epoch": 2.89
},
{
"loss": 7.7765,
"grad_norm": 4.3814191818237305,
"learning_rate": 3.579242145795047e-06,
"epoch": 2.89
},
{
"loss": 7.3461,
"grad_norm": 5.455050468444824,
"learning_rate": 3.4917301128905225e-06,
"epoch": 2.9
},
{
"loss": 7.2237,
"grad_norm": 4.942239761352539,
"learning_rate": 3.4042180799859984e-06,
"epoch": 2.9
},
{
"loss": 6.8001,
"grad_norm": 3.596323251724243,
"learning_rate": 3.316706047081474e-06,
"epoch": 2.9
},
{
"loss": 7.3173,
"grad_norm": 3.8507444858551025,
"learning_rate": 3.229194014176949e-06,
"epoch": 2.9
},
{
"loss": 7.1595,
"grad_norm": 4.1059651374816895,
"learning_rate": 3.141681981272425e-06,
"epoch": 2.91
},
{
"loss": 7.279,
"grad_norm": 3.6779584884643555,
"learning_rate": 3.0541699483679005e-06,
"epoch": 2.91
},
{
"loss": 6.9319,
"grad_norm": 3.716569423675537,
"learning_rate": 2.9666579154633764e-06,
"epoch": 2.91
},
{
"loss": 7.1694,
"grad_norm": 4.070470333099365,
"learning_rate": 2.8791458825588523e-06,
"epoch": 2.91
},
{
"loss": 7.8244,
"grad_norm": 4.256218910217285,
"learning_rate": 2.7916338496543274e-06,
"epoch": 2.92
},
{
"loss": 7.5893,
"grad_norm": 3.4462125301361084,
"learning_rate": 2.7041218167498033e-06,
"epoch": 2.92
},
{
"loss": 6.8421,
"grad_norm": 3.3473029136657715,
"learning_rate": 2.616609783845279e-06,
"epoch": 2.92
},
{
"loss": 6.735,
"grad_norm": 4.084453582763672,
"learning_rate": 2.5290977509407543e-06,
"epoch": 2.92
},
{
"loss": 7.4639,
"grad_norm": 4.453617572784424,
"learning_rate": 2.4415857180362303e-06,
"epoch": 2.93
},
{
"loss": 7.1289,
"grad_norm": 3.5332283973693848,
"learning_rate": 2.3540736851317058e-06,
"epoch": 2.93
},
{
"loss": 6.9,
"grad_norm": 4.8280792236328125,
"learning_rate": 2.2665616522271813e-06,
"epoch": 2.93
},
{
"loss": 7.534,
"grad_norm": 4.160041809082031,
"learning_rate": 2.179049619322657e-06,
"epoch": 2.93
},
{
"loss": 7.4365,
"grad_norm": 4.7804975509643555,
"learning_rate": 2.0915375864181323e-06,
"epoch": 2.94
},
{
"loss": 7.4622,
"grad_norm": 5.68775749206543,
"learning_rate": 2.0040255535136082e-06,
"epoch": 2.94
},
{
"loss": 6.9841,
"grad_norm": null,
"learning_rate": 1.9252647238995363e-06,
"epoch": 2.94
},
{
"loss": 7.0386,
"grad_norm": 3.1843979358673096,
"learning_rate": 1.8377526909950118e-06,
"epoch": 2.95
},
{
"loss": 7.1348,
"grad_norm": 3.309314489364624,
"learning_rate": 1.7502406580904876e-06,
"epoch": 2.95
},
{
"loss": 7.5823,
"grad_norm": 3.1254711151123047,
"learning_rate": 1.6627286251859633e-06,
"epoch": 2.95
},
{
"loss": 7.4536,
"grad_norm": 4.925593852996826,
"learning_rate": 1.5752165922814386e-06,
"epoch": 2.95
},
{
"loss": 7.7934,
"grad_norm": 3.3663341999053955,
"learning_rate": 1.4877045593769145e-06,
"epoch": 2.96
},
{
"loss": 7.2564,
"grad_norm": 4.316028594970703,
"learning_rate": 1.40019252647239e-06,
"epoch": 2.96
},
{
"loss": 7.0841,
"grad_norm": 3.023416757583618,
"learning_rate": 1.3126804935678657e-06,
"epoch": 2.96
},
{
"loss": 7.0354,
"grad_norm": 4.041641712188721,
"learning_rate": 1.2251684606633412e-06,
"epoch": 2.96
},
{
"loss": 7.2408,
"grad_norm": 4.036230564117432,
"learning_rate": 1.137656427758817e-06,
"epoch": 2.97
},
{
"loss": 6.8542,
"grad_norm": 3.975757360458374,
"learning_rate": 1.0501443948542925e-06,
"epoch": 2.97
},
{
"loss": 7.384,
"grad_norm": 4.212265968322754,
"learning_rate": 9.626323619497682e-07,
"epoch": 2.97
},
{
"loss": 7.5291,
"grad_norm": 4.709102630615234,
"learning_rate": 8.751203290452438e-07,
"epoch": 2.97
},
{
"loss": 7.2415,
"grad_norm": 4.24073600769043,
"learning_rate": 7.876082961407193e-07,
"epoch": 2.98
},
{
"loss": 7.117,
"grad_norm": 4.139495849609375,
"learning_rate": 7.00096263236195e-07,
"epoch": 2.98
},
{
"loss": 7.4584,
"grad_norm": 3.581001043319702,
"learning_rate": 6.125842303316706e-07,
"epoch": 2.98
},
{
"loss": 7.0442,
"grad_norm": 3.6776018142700195,
"learning_rate": 5.250721974271462e-07,
"epoch": 2.98
},
{
"loss": 7.1353,
"grad_norm": 3.3257029056549072,
"learning_rate": 4.375601645226219e-07,
"epoch": 2.99
},
{
"loss": 7.4847,
"grad_norm": 4.7782697677612305,
"learning_rate": 3.500481316180975e-07,
"epoch": 2.99
},
{
"loss": 7.4365,
"grad_norm": 3.6555185317993164,
"learning_rate": 2.625360987135731e-07,
"epoch": 2.99
},
{
"loss": 7.0578,
"grad_norm": 3.675234079360962,
"learning_rate": 1.7502406580904875e-07,
"epoch": 3.0
},
{
"loss": 6.8152,
"grad_norm": 4.117830276489258,
"learning_rate": 8.751203290452438e-08,
"epoch": 3.0
},
{
"train_runtime": 132410.4061,
"train_samples_per_second": 2.762,
"train_steps_per_second": 0.086,
"train_loss": 8.721853916044362,
"epoch": 3.0
}
]