protgpt2-distilled-tiny / training_logs.json
littleworth's picture
Upload folder using huggingface_hub
391db44 verified
[
{
"loss": 30.8234,
"grad_norm": 0.5755094885826111,
"learning_rate": 0.0009991248796709547,
"epoch": 0.0
},
{
"loss": 24.1229,
"grad_norm": 0.2920963764190674,
"learning_rate": 0.0009982497593419095,
"epoch": 0.01
},
{
"loss": 22.7986,
"grad_norm": 0.5106011629104614,
"learning_rate": 0.0009973746390128642,
"epoch": 0.01
},
{
"loss": 21.4924,
"grad_norm": 0.9322375059127808,
"learning_rate": 0.000996499518683819,
"epoch": 0.01
},
{
"loss": 20.7911,
"grad_norm": 0.8529098629951477,
"learning_rate": 0.0009956243983547737,
"epoch": 0.01
},
{
"loss": 19.338,
"grad_norm": 0.776152491569519,
"learning_rate": 0.0009947492780257286,
"epoch": 0.02
},
{
"loss": 19.0175,
"grad_norm": 2.11796498298645,
"learning_rate": 0.0009938741576966832,
"epoch": 0.02
},
{
"loss": 18.2997,
"grad_norm": 1.3791886568069458,
"learning_rate": 0.0009929990373676381,
"epoch": 0.02
},
{
"loss": 17.2791,
"grad_norm": 1.3849037885665894,
"learning_rate": 0.0009921239170385928,
"epoch": 0.02
},
{
"loss": 17.3609,
"grad_norm": 1.1861941814422607,
"learning_rate": 0.0009912487967095476,
"epoch": 0.03
},
{
"loss": 17.1215,
"grad_norm": 1.494122862815857,
"learning_rate": 0.0009903736763805023,
"epoch": 0.03
},
{
"loss": 16.3944,
"grad_norm": 1.5872834920883179,
"learning_rate": 0.0009894985560514572,
"epoch": 0.03
},
{
"loss": 16.0054,
"grad_norm": 1.2658979892730713,
"learning_rate": 0.0009886234357224118,
"epoch": 0.03
},
{
"loss": 15.5523,
"grad_norm": 0.8640480041503906,
"learning_rate": 0.0009877483153933667,
"epoch": 0.04
},
{
"loss": 16.2465,
"grad_norm": 0.8946548700332642,
"learning_rate": 0.0009868731950643213,
"epoch": 0.04
},
{
"loss": 15.0235,
"grad_norm": 0.9279372692108154,
"learning_rate": 0.0009859980747352762,
"epoch": 0.04
},
{
"loss": 15.7517,
"grad_norm": 0.8807494044303894,
"learning_rate": 0.0009851229544062309,
"epoch": 0.04
},
{
"loss": 14.6884,
"grad_norm": 0.683822751045227,
"learning_rate": 0.0009842478340771857,
"epoch": 0.05
},
{
"loss": 14.0949,
"grad_norm": 1.1334095001220703,
"learning_rate": 0.0009833727137481404,
"epoch": 0.05
},
{
"loss": 14.3378,
"grad_norm": 1.1247657537460327,
"learning_rate": 0.0009824975934190953,
"epoch": 0.05
},
{
"loss": 13.7597,
"grad_norm": 0.9332773685455322,
"learning_rate": 0.00098162247309005,
"epoch": 0.06
},
{
"loss": 14.5567,
"grad_norm": 0.8742538690567017,
"learning_rate": 0.0009807473527610048,
"epoch": 0.06
},
{
"loss": 14.0188,
"grad_norm": 1.5592143535614014,
"learning_rate": 0.0009798722324319594,
"epoch": 0.06
},
{
"loss": 13.9401,
"grad_norm": 0.9473065733909607,
"learning_rate": 0.0009789971121029143,
"epoch": 0.06
},
{
"loss": 13.5177,
"grad_norm": 0.5469663143157959,
"learning_rate": 0.000978121991773869,
"epoch": 0.07
},
{
"loss": 13.513,
"grad_norm": 1.7497597932815552,
"learning_rate": 0.0009772468714448236,
"epoch": 0.07
},
{
"loss": 13.6579,
"grad_norm": 0.7552927136421204,
"learning_rate": 0.0009763717511157785,
"epoch": 0.07
},
{
"loss": 13.0899,
"grad_norm": 0.5602779984474182,
"learning_rate": 0.0009754966307867332,
"epoch": 0.07
},
{
"loss": 13.8637,
"grad_norm": 0.6577705144882202,
"learning_rate": 0.000974621510457688,
"epoch": 0.08
},
{
"loss": 14.2909,
"grad_norm": 1.0710817575454712,
"learning_rate": 0.0009737463901286428,
"epoch": 0.08
},
{
"loss": 13.3632,
"grad_norm": 0.48803457617759705,
"learning_rate": 0.0009728712697995975,
"epoch": 0.08
},
{
"loss": 13.5002,
"grad_norm": 0.9970788359642029,
"learning_rate": 0.0009719961494705523,
"epoch": 0.08
},
{
"loss": 13.6276,
"grad_norm": 0.9624769687652588,
"learning_rate": 0.000971121029141507,
"epoch": 0.09
},
{
"loss": 13.7281,
"grad_norm": 0.8082631230354309,
"learning_rate": 0.0009702459088124618,
"epoch": 0.09
},
{
"loss": 13.0793,
"grad_norm": 0.6732771396636963,
"learning_rate": 0.0009693707884834166,
"epoch": 0.09
},
{
"loss": 12.6621,
"grad_norm": 0.8451002240180969,
"learning_rate": 0.0009684956681543713,
"epoch": 0.09
},
{
"loss": 13.2374,
"grad_norm": 1.1656385660171509,
"learning_rate": 0.0009676205478253261,
"epoch": 0.1
},
{
"loss": 12.7625,
"grad_norm": 0.9667061567306519,
"learning_rate": 0.0009667454274962808,
"epoch": 0.1
},
{
"loss": 13.0046,
"grad_norm": 0.9311497807502747,
"learning_rate": 0.0009658703071672355,
"epoch": 0.1
},
{
"loss": 12.9037,
"grad_norm": 1.1891040802001953,
"learning_rate": 0.0009649951868381903,
"epoch": 0.1
},
{
"loss": 12.6521,
"grad_norm": 1.1127817630767822,
"learning_rate": 0.000964120066509145,
"epoch": 0.11
},
{
"loss": 13.2942,
"grad_norm": 0.6665758490562439,
"learning_rate": 0.0009632449461800998,
"epoch": 0.11
},
{
"loss": 12.4443,
"grad_norm": 0.8878126740455627,
"learning_rate": 0.0009623698258510546,
"epoch": 0.11
},
{
"loss": 13.0001,
"grad_norm": 1.5000464916229248,
"learning_rate": 0.0009614947055220093,
"epoch": 0.12
},
{
"loss": 12.2303,
"grad_norm": 1.1078687906265259,
"learning_rate": 0.0009606195851929641,
"epoch": 0.12
},
{
"loss": 12.1915,
"grad_norm": 0.8044748306274414,
"learning_rate": 0.0009597444648639187,
"epoch": 0.12
},
{
"loss": 12.7246,
"grad_norm": 0.9232500195503235,
"learning_rate": 0.0009588693445348735,
"epoch": 0.12
},
{
"loss": 11.9769,
"grad_norm": 0.7413458824157715,
"learning_rate": 0.0009579942242058283,
"epoch": 0.13
},
{
"loss": 12.8006,
"grad_norm": 1.1132707595825195,
"learning_rate": 0.000957119103876783,
"epoch": 0.13
},
{
"loss": 12.4323,
"grad_norm": 0.7814503312110901,
"learning_rate": 0.0009562439835477378,
"epoch": 0.13
},
{
"loss": 12.3482,
"grad_norm": 0.8854762315750122,
"learning_rate": 0.0009553688632186925,
"epoch": 0.13
},
{
"loss": 12.5045,
"grad_norm": 0.704131007194519,
"learning_rate": 0.0009544937428896473,
"epoch": 0.14
},
{
"loss": 12.1405,
"grad_norm": 0.7020297050476074,
"learning_rate": 0.0009536186225606021,
"epoch": 0.14
},
{
"loss": 11.5427,
"grad_norm": 0.398807168006897,
"learning_rate": 0.0009527435022315568,
"epoch": 0.14
},
{
"loss": 12.655,
"grad_norm": 1.0002299547195435,
"learning_rate": 0.0009518683819025116,
"epoch": 0.14
},
{
"loss": 11.9656,
"grad_norm": 0.7870428562164307,
"learning_rate": 0.0009509932615734664,
"epoch": 0.15
},
{
"loss": 12.4639,
"grad_norm": 0.9154604077339172,
"learning_rate": 0.0009501181412444211,
"epoch": 0.15
},
{
"loss": 11.6344,
"grad_norm": 1.1896569728851318,
"learning_rate": 0.0009492430209153759,
"epoch": 0.15
},
{
"loss": 12.4516,
"grad_norm": 0.8169024586677551,
"learning_rate": 0.0009483679005863306,
"epoch": 0.15
},
{
"loss": 12.1848,
"grad_norm": 0.8429264426231384,
"learning_rate": 0.0009474927802572854,
"epoch": 0.16
},
{
"loss": 11.2014,
"grad_norm": 0.8499436378479004,
"learning_rate": 0.0009466176599282402,
"epoch": 0.16
},
{
"loss": 12.2217,
"grad_norm": 0.8969743251800537,
"learning_rate": 0.0009457425395991948,
"epoch": 0.16
},
{
"loss": 11.7729,
"grad_norm": 1.0959218740463257,
"learning_rate": 0.0009448674192701496,
"epoch": 0.17
},
{
"loss": 11.6254,
"grad_norm": 1.1692876815795898,
"learning_rate": 0.0009439922989411043,
"epoch": 0.17
},
{
"loss": 11.5698,
"grad_norm": 1.9476372003555298,
"learning_rate": 0.0009431171786120591,
"epoch": 0.17
},
{
"loss": 11.4321,
"grad_norm": 1.1742662191390991,
"learning_rate": 0.0009422420582830139,
"epoch": 0.17
},
{
"loss": 11.3224,
"grad_norm": 0.9839737415313721,
"learning_rate": 0.0009413669379539686,
"epoch": 0.18
},
{
"loss": 11.8269,
"grad_norm": 0.9094179272651672,
"learning_rate": 0.0009404918176249234,
"epoch": 0.18
},
{
"loss": 11.8652,
"grad_norm": 0.9139958620071411,
"learning_rate": 0.0009396166972958782,
"epoch": 0.18
},
{
"loss": 11.5493,
"grad_norm": 0.7938945889472961,
"learning_rate": 0.0009387415769668329,
"epoch": 0.18
},
{
"loss": 11.413,
"grad_norm": 0.8102487921714783,
"learning_rate": 0.0009378664566377877,
"epoch": 0.19
},
{
"loss": 11.4015,
"grad_norm": 0.5892770290374756,
"learning_rate": 0.0009369913363087424,
"epoch": 0.19
},
{
"loss": 10.8455,
"grad_norm": 0.7269143462181091,
"learning_rate": 0.0009361162159796972,
"epoch": 0.19
},
{
"loss": 11.5612,
"grad_norm": 0.8169882893562317,
"learning_rate": 0.000935241095650652,
"epoch": 0.19
},
{
"loss": 10.545,
"grad_norm": 0.8424365520477295,
"learning_rate": 0.0009343659753216067,
"epoch": 0.2
},
{
"loss": 10.8486,
"grad_norm": 0.855518102645874,
"learning_rate": 0.0009334908549925615,
"epoch": 0.2
},
{
"loss": 10.3733,
"grad_norm": 1.1463903188705444,
"learning_rate": 0.0009326157346635162,
"epoch": 0.2
},
{
"loss": 10.794,
"grad_norm": 0.7493767142295837,
"learning_rate": 0.000931740614334471,
"epoch": 0.2
},
{
"loss": 10.5943,
"grad_norm": 0.8767346739768982,
"learning_rate": 0.0009308654940054258,
"epoch": 0.21
},
{
"loss": 11.4169,
"grad_norm": 1.0650781393051147,
"learning_rate": 0.0009299903736763805,
"epoch": 0.21
},
{
"loss": 10.8176,
"grad_norm": 0.8954362869262695,
"learning_rate": 0.0009291152533473353,
"epoch": 0.21
},
{
"loss": 10.9644,
"grad_norm": 0.697245180606842,
"learning_rate": 0.0009282401330182901,
"epoch": 0.22
},
{
"loss": 11.0427,
"grad_norm": 1.5471469163894653,
"learning_rate": 0.0009273650126892448,
"epoch": 0.22
},
{
"loss": 10.8293,
"grad_norm": 0.7173879146575928,
"learning_rate": 0.0009264898923601996,
"epoch": 0.22
},
{
"loss": 10.744,
"grad_norm": 1.1271495819091797,
"learning_rate": 0.0009256147720311543,
"epoch": 0.22
},
{
"loss": 10.3733,
"grad_norm": 0.7106486558914185,
"learning_rate": 0.0009247396517021091,
"epoch": 0.23
},
{
"loss": 10.9536,
"grad_norm": 1.1200592517852783,
"learning_rate": 0.0009238645313730638,
"epoch": 0.23
},
{
"loss": 10.4749,
"grad_norm": 1.0028458833694458,
"learning_rate": 0.0009229894110440185,
"epoch": 0.23
},
{
"loss": 11.4667,
"grad_norm": 1.187585711479187,
"learning_rate": 0.0009221142907149733,
"epoch": 0.23
},
{
"loss": 10.3349,
"grad_norm": 0.8691514134407043,
"learning_rate": 0.000921239170385928,
"epoch": 0.24
},
{
"loss": 10.6188,
"grad_norm": 0.8789599537849426,
"learning_rate": 0.0009203640500568828,
"epoch": 0.24
},
{
"loss": 10.454,
"grad_norm": 0.8376362919807434,
"learning_rate": 0.0009194889297278376,
"epoch": 0.24
},
{
"loss": 10.2419,
"grad_norm": 1.0760575532913208,
"learning_rate": 0.0009186138093987923,
"epoch": 0.24
},
{
"loss": 10.8593,
"grad_norm": 0.709028422832489,
"learning_rate": 0.0009177386890697471,
"epoch": 0.25
},
{
"loss": 11.073,
"grad_norm": 1.0934019088745117,
"learning_rate": 0.0009168635687407019,
"epoch": 0.25
},
{
"loss": 10.5596,
"grad_norm": 0.7833492159843445,
"learning_rate": 0.0009159884484116566,
"epoch": 0.25
},
{
"loss": 11.2079,
"grad_norm": 0.8762934803962708,
"learning_rate": 0.0009151133280826114,
"epoch": 0.25
},
{
"loss": 11.2229,
"grad_norm": 0.8059395551681519,
"learning_rate": 0.0009142382077535661,
"epoch": 0.26
},
{
"loss": 10.8706,
"grad_norm": 1.0892099142074585,
"learning_rate": 0.0009133630874245209,
"epoch": 0.26
},
{
"loss": 10.9983,
"grad_norm": 0.7471132278442383,
"learning_rate": 0.0009124879670954757,
"epoch": 0.26
},
{
"loss": 11.4291,
"grad_norm": 0.9766479730606079,
"learning_rate": 0.0009116128467664304,
"epoch": 0.27
},
{
"loss": 10.5895,
"grad_norm": 0.7469794154167175,
"learning_rate": 0.0009107377264373852,
"epoch": 0.27
},
{
"loss": 9.9826,
"grad_norm": 0.9510082602500916,
"learning_rate": 0.00090986260610834,
"epoch": 0.27
},
{
"loss": 10.1785,
"grad_norm": 0.8061089515686035,
"learning_rate": 0.0009089874857792947,
"epoch": 0.27
},
{
"loss": 10.5502,
"grad_norm": 0.7467952966690063,
"learning_rate": 0.0009081123654502495,
"epoch": 0.28
},
{
"loss": 10.4848,
"grad_norm": 0.9167515635490417,
"learning_rate": 0.0009072372451212042,
"epoch": 0.28
},
{
"loss": 10.7841,
"grad_norm": 1.0157630443572998,
"learning_rate": 0.000906362124792159,
"epoch": 0.28
},
{
"loss": 10.6985,
"grad_norm": 0.8764671087265015,
"learning_rate": 0.0009054870044631138,
"epoch": 0.28
},
{
"loss": 10.4706,
"grad_norm": 0.7716103196144104,
"learning_rate": 0.0009046118841340685,
"epoch": 0.29
},
{
"loss": 10.4371,
"grad_norm": 0.83449387550354,
"learning_rate": 0.0009037367638050233,
"epoch": 0.29
},
{
"loss": 10.2414,
"grad_norm": 0.785839855670929,
"learning_rate": 0.000902861643475978,
"epoch": 0.29
},
{
"loss": 10.0213,
"grad_norm": 0.7405595183372498,
"learning_rate": 0.0009019865231469327,
"epoch": 0.29
},
{
"loss": 10.2501,
"grad_norm": 0.929263710975647,
"learning_rate": 0.0009011114028178875,
"epoch": 0.3
},
{
"loss": 10.6749,
"grad_norm": 0.9185034036636353,
"learning_rate": 0.0009002362824888422,
"epoch": 0.3
},
{
"loss": 10.4313,
"grad_norm": 0.7888991832733154,
"learning_rate": 0.000899361162159797,
"epoch": 0.3
},
{
"loss": 10.4389,
"grad_norm": 0.9736090302467346,
"learning_rate": 0.0008984860418307517,
"epoch": 0.3
},
{
"loss": 9.9148,
"grad_norm": 0.7677895426750183,
"learning_rate": 0.0008976109215017065,
"epoch": 0.31
},
{
"loss": 9.7635,
"grad_norm": 0.9090219736099243,
"learning_rate": 0.0008967358011726613,
"epoch": 0.31
},
{
"loss": 10.0211,
"grad_norm": 0.7184523344039917,
"learning_rate": 0.000895860680843616,
"epoch": 0.31
},
{
"loss": 9.9932,
"grad_norm": 1.0859735012054443,
"learning_rate": 0.0008949855605145708,
"epoch": 0.31
},
{
"loss": 10.2804,
"grad_norm": 1.0252892971038818,
"learning_rate": 0.0008941104401855256,
"epoch": 0.32
},
{
"loss": 10.0543,
"grad_norm": 1.1707403659820557,
"learning_rate": 0.0008932353198564803,
"epoch": 0.32
},
{
"loss": 10.6658,
"grad_norm": 0.6616178750991821,
"learning_rate": 0.0008923601995274351,
"epoch": 0.32
},
{
"loss": 9.8623,
"grad_norm": 1.9947571754455566,
"learning_rate": 0.0008914850791983898,
"epoch": 0.33
},
{
"loss": 10.1607,
"grad_norm": 1.3363871574401855,
"learning_rate": 0.0008906099588693446,
"epoch": 0.33
},
{
"loss": 10.1063,
"grad_norm": 1.0039112567901611,
"learning_rate": 0.0008897348385402994,
"epoch": 0.33
},
{
"loss": 9.7059,
"grad_norm": 1.0225836038589478,
"learning_rate": 0.0008888597182112541,
"epoch": 0.33
},
{
"loss": 10.2506,
"grad_norm": 1.1005779504776,
"learning_rate": 0.0008879845978822089,
"epoch": 0.34
},
{
"loss": 10.3011,
"grad_norm": 1.1654433012008667,
"learning_rate": 0.0008871094775531636,
"epoch": 0.34
},
{
"loss": 10.088,
"grad_norm": 0.9155218601226807,
"learning_rate": 0.0008862343572241184,
"epoch": 0.34
},
{
"loss": 9.8835,
"grad_norm": 1.2090198993682861,
"learning_rate": 0.0008853592368950732,
"epoch": 0.34
},
{
"loss": 9.6644,
"grad_norm": 1.5198620557785034,
"learning_rate": 0.0008844841165660279,
"epoch": 0.35
},
{
"loss": 9.6799,
"grad_norm": 1.0043960809707642,
"learning_rate": 0.0008836089962369827,
"epoch": 0.35
},
{
"loss": 10.0658,
"grad_norm": 1.0404608249664307,
"learning_rate": 0.0008827338759079375,
"epoch": 0.35
},
{
"loss": 9.9551,
"grad_norm": 1.0412163734436035,
"learning_rate": 0.0008818587555788922,
"epoch": 0.35
},
{
"loss": 9.4082,
"grad_norm": 0.9032560586929321,
"learning_rate": 0.000880983635249847,
"epoch": 0.36
},
{
"loss": 10.2566,
"grad_norm": 1.2763034105300903,
"learning_rate": 0.0008801085149208016,
"epoch": 0.36
},
{
"loss": 9.8585,
"grad_norm": 0.8143719434738159,
"learning_rate": 0.0008792333945917563,
"epoch": 0.36
},
{
"loss": 9.5974,
"grad_norm": 1.3916654586791992,
"learning_rate": 0.000878358274262711,
"epoch": 0.36
},
{
"loss": 10.611,
"grad_norm": 1.2270894050598145,
"learning_rate": 0.0008774831539336658,
"epoch": 0.37
},
{
"loss": 9.4489,
"grad_norm": 1.339573621749878,
"learning_rate": 0.0008766080336046206,
"epoch": 0.37
},
{
"loss": 9.769,
"grad_norm": 1.023978352546692,
"learning_rate": 0.0008757329132755753,
"epoch": 0.37
},
{
"loss": 9.7854,
"grad_norm": 1.1513617038726807,
"learning_rate": 0.0008748577929465301,
"epoch": 0.38
},
{
"loss": 9.4378,
"grad_norm": 0.9918627142906189,
"learning_rate": 0.0008739826726174849,
"epoch": 0.38
},
{
"loss": 9.6902,
"grad_norm": 0.9365573525428772,
"learning_rate": 0.0008731075522884396,
"epoch": 0.38
},
{
"loss": 9.5533,
"grad_norm": 1.1697934865951538,
"learning_rate": 0.0008722324319593944,
"epoch": 0.38
},
{
"loss": 9.5204,
"grad_norm": 1.2257342338562012,
"learning_rate": 0.0008713573116303491,
"epoch": 0.39
},
{
"loss": 9.636,
"grad_norm": 1.0158884525299072,
"learning_rate": 0.0008704821913013039,
"epoch": 0.39
},
{
"loss": 9.8914,
"grad_norm": 1.4228135347366333,
"learning_rate": 0.0008696070709722587,
"epoch": 0.39
},
{
"loss": 9.3714,
"grad_norm": 1.2829135656356812,
"learning_rate": 0.0008687319506432134,
"epoch": 0.39
},
{
"loss": 9.7498,
"grad_norm": 1.2624573707580566,
"learning_rate": 0.0008678568303141682,
"epoch": 0.4
},
{
"loss": 9.8928,
"grad_norm": 1.3651659488677979,
"learning_rate": 0.000866981709985123,
"epoch": 0.4
},
{
"loss": 10.3697,
"grad_norm": 1.1383252143859863,
"learning_rate": 0.0008661065896560777,
"epoch": 0.4
},
{
"loss": 10.1876,
"grad_norm": 1.1688463687896729,
"learning_rate": 0.0008652314693270325,
"epoch": 0.4
},
{
"loss": 9.7974,
"grad_norm": 1.1377474069595337,
"learning_rate": 0.0008643563489979872,
"epoch": 0.41
},
{
"loss": 9.5742,
"grad_norm": 1.0107587575912476,
"learning_rate": 0.000863481228668942,
"epoch": 0.41
},
{
"loss": 9.9821,
"grad_norm": 1.3488329648971558,
"learning_rate": 0.0008626061083398968,
"epoch": 0.41
},
{
"loss": 9.3107,
"grad_norm": 1.0305010080337524,
"learning_rate": 0.0008617309880108515,
"epoch": 0.41
},
{
"loss": 9.3456,
"grad_norm": 0.8658286929130554,
"learning_rate": 0.0008608558676818063,
"epoch": 0.42
},
{
"loss": 9.3709,
"grad_norm": 1.1033709049224854,
"learning_rate": 0.000859980747352761,
"epoch": 0.42
},
{
"loss": 9.5077,
"grad_norm": 1.1051572561264038,
"learning_rate": 0.0008591056270237157,
"epoch": 0.42
},
{
"loss": 9.1458,
"grad_norm": 1.3423538208007812,
"learning_rate": 0.0008582305066946705,
"epoch": 0.43
},
{
"loss": 9.657,
"grad_norm": 1.1479153633117676,
"learning_rate": 0.0008573553863656252,
"epoch": 0.43
},
{
"loss": 10.5804,
"grad_norm": 1.1615872383117676,
"learning_rate": 0.00085648026603658,
"epoch": 0.43
},
{
"loss": 8.2792,
"grad_norm": 1.212221384048462,
"learning_rate": 0.0008556051457075347,
"epoch": 0.43
},
{
"loss": 9.3785,
"grad_norm": 1.0849367380142212,
"learning_rate": 0.0008547300253784895,
"epoch": 0.44
},
{
"loss": 9.4097,
"grad_norm": 1.119325041770935,
"learning_rate": 0.0008538549050494443,
"epoch": 0.44
},
{
"loss": 9.3308,
"grad_norm": 1.3356918096542358,
"learning_rate": 0.000852979784720399,
"epoch": 0.44
},
{
"loss": 9.4548,
"grad_norm": 0.9954844117164612,
"learning_rate": 0.0008521046643913538,
"epoch": 0.44
},
{
"loss": 8.9297,
"grad_norm": 0.8752724528312683,
"learning_rate": 0.0008512295440623086,
"epoch": 0.45
},
{
"loss": 9.1389,
"grad_norm": 1.2811753749847412,
"learning_rate": 0.0008503544237332633,
"epoch": 0.45
},
{
"loss": 9.3155,
"grad_norm": 1.253055453300476,
"learning_rate": 0.0008494793034042181,
"epoch": 0.45
},
{
"loss": 9.548,
"grad_norm": 1.2081260681152344,
"learning_rate": 0.0008486041830751728,
"epoch": 0.45
},
{
"loss": 9.0236,
"grad_norm": 1.3752362728118896,
"learning_rate": 0.0008477290627461276,
"epoch": 0.46
},
{
"loss": 9.0533,
"grad_norm": 1.057065725326538,
"learning_rate": 0.0008468539424170824,
"epoch": 0.46
},
{
"loss": 9.0675,
"grad_norm": 1.0036309957504272,
"learning_rate": 0.0008459788220880371,
"epoch": 0.46
},
{
"loss": 9.5195,
"grad_norm": 1.3881008625030518,
"learning_rate": 0.0008451037017589919,
"epoch": 0.46
},
{
"loss": 9.3519,
"grad_norm": 1.4355233907699585,
"learning_rate": 0.0008442285814299467,
"epoch": 0.47
},
{
"loss": 9.6383,
"grad_norm": 0.9438649415969849,
"learning_rate": 0.0008433534611009014,
"epoch": 0.47
},
{
"loss": 9.2643,
"grad_norm": 0.8599776029586792,
"learning_rate": 0.0008424783407718562,
"epoch": 0.47
},
{
"loss": 8.9869,
"grad_norm": 1.1090342998504639,
"learning_rate": 0.0008416032204428109,
"epoch": 0.48
},
{
"loss": 9.2475,
"grad_norm": 1.272929310798645,
"learning_rate": 0.0008407281001137657,
"epoch": 0.48
},
{
"loss": 9.5772,
"grad_norm": 0.9889743328094482,
"learning_rate": 0.0008398529797847205,
"epoch": 0.48
},
{
"loss": 9.9227,
"grad_norm": 1.2748692035675049,
"learning_rate": 0.0008389778594556752,
"epoch": 0.48
},
{
"loss": 9.9915,
"grad_norm": 1.4889165163040161,
"learning_rate": 0.00083810273912663,
"epoch": 0.49
},
{
"loss": 9.0012,
"grad_norm": 1.2172118425369263,
"learning_rate": 0.0008372276187975846,
"epoch": 0.49
},
{
"loss": 9.2968,
"grad_norm": 1.0313849449157715,
"learning_rate": 0.0008363524984685394,
"epoch": 0.49
},
{
"loss": 8.9158,
"grad_norm": 1.3325482606887817,
"learning_rate": 0.0008354773781394942,
"epoch": 0.49
},
{
"loss": 9.0097,
"grad_norm": 1.5407133102416992,
"learning_rate": 0.0008346022578104489,
"epoch": 0.5
},
{
"loss": 9.0166,
"grad_norm": 1.1565685272216797,
"learning_rate": 0.0008337271374814037,
"epoch": 0.5
},
{
"loss": 9.1856,
"grad_norm": 1.0405404567718506,
"learning_rate": 0.0008328520171523584,
"epoch": 0.5
},
{
"loss": 9.2405,
"grad_norm": 1.465058445930481,
"learning_rate": 0.0008319768968233132,
"epoch": 0.5
},
{
"loss": 8.835,
"grad_norm": 0.9321463704109192,
"learning_rate": 0.000831101776494268,
"epoch": 0.51
},
{
"loss": 9.4076,
"grad_norm": 1.1780034303665161,
"learning_rate": 0.0008302266561652227,
"epoch": 0.51
},
{
"loss": 9.5994,
"grad_norm": 1.488897681236267,
"learning_rate": 0.0008293515358361775,
"epoch": 0.51
},
{
"loss": 8.6378,
"grad_norm": 1.0508447885513306,
"learning_rate": 0.0008284764155071323,
"epoch": 0.51
},
{
"loss": 8.7946,
"grad_norm": 1.2236040830612183,
"learning_rate": 0.000827601295178087,
"epoch": 0.52
},
{
"loss": 9.4619,
"grad_norm": 1.0602221488952637,
"learning_rate": 0.0008267261748490418,
"epoch": 0.52
},
{
"loss": 8.927,
"grad_norm": 1.476576328277588,
"learning_rate": 0.0008258510545199965,
"epoch": 0.52
},
{
"loss": 8.766,
"grad_norm": 1.2723809480667114,
"learning_rate": 0.0008249759341909513,
"epoch": 0.52
},
{
"loss": 9.1577,
"grad_norm": 1.2955093383789062,
"learning_rate": 0.0008241008138619061,
"epoch": 0.53
},
{
"loss": 8.8254,
"grad_norm": 1.1421802043914795,
"learning_rate": 0.0008232256935328608,
"epoch": 0.53
},
{
"loss": 9.3559,
"grad_norm": 1.2015204429626465,
"learning_rate": 0.0008223505732038156,
"epoch": 0.53
},
{
"loss": 8.7055,
"grad_norm": 1.02347993850708,
"learning_rate": 0.0008214754528747703,
"epoch": 0.54
},
{
"loss": 9.1773,
"grad_norm": 1.0733789205551147,
"learning_rate": 0.0008206003325457251,
"epoch": 0.54
},
{
"loss": 9.4909,
"grad_norm": 1.140329360961914,
"learning_rate": 0.0008197252122166799,
"epoch": 0.54
},
{
"loss": 8.4982,
"grad_norm": 0.8933946490287781,
"learning_rate": 0.0008188500918876346,
"epoch": 0.54
},
{
"loss": 9.4497,
"grad_norm": 1.3848881721496582,
"learning_rate": 0.0008179749715585894,
"epoch": 0.55
},
{
"loss": 9.5758,
"grad_norm": 1.175162672996521,
"learning_rate": 0.0008170998512295442,
"epoch": 0.55
},
{
"loss": 9.5138,
"grad_norm": 1.1983882188796997,
"learning_rate": 0.0008162247309004989,
"epoch": 0.55
},
{
"loss": 9.0283,
"grad_norm": 0.9055472612380981,
"learning_rate": 0.0008153496105714536,
"epoch": 0.55
},
{
"loss": 9.2822,
"grad_norm": 0.8885380029678345,
"learning_rate": 0.0008144744902424083,
"epoch": 0.56
},
{
"loss": 8.9084,
"grad_norm": 1.0463942289352417,
"learning_rate": 0.0008135993699133631,
"epoch": 0.56
},
{
"loss": 9.0612,
"grad_norm": 1.1517601013183594,
"learning_rate": 0.0008127242495843179,
"epoch": 0.56
},
{
"loss": 9.7954,
"grad_norm": 1.6062026023864746,
"learning_rate": 0.0008118491292552726,
"epoch": 0.56
},
{
"loss": 8.823,
"grad_norm": 1.079883098602295,
"learning_rate": 0.0008109740089262274,
"epoch": 0.57
},
{
"loss": 8.6287,
"grad_norm": 0.8593969345092773,
"learning_rate": 0.0008100988885971821,
"epoch": 0.57
},
{
"loss": 9.046,
"grad_norm": 1.5058172941207886,
"learning_rate": 0.0008092237682681369,
"epoch": 0.57
},
{
"loss": 8.4422,
"grad_norm": 1.0326484441757202,
"learning_rate": 0.0008083486479390917,
"epoch": 0.57
},
{
"loss": 9.5016,
"grad_norm": 0.9177812337875366,
"learning_rate": 0.0008074735276100464,
"epoch": 0.58
},
{
"loss": 8.4734,
"grad_norm": 1.1267443895339966,
"learning_rate": 0.0008065984072810012,
"epoch": 0.58
},
{
"loss": 8.5878,
"grad_norm": 0.9788813591003418,
"learning_rate": 0.000805723286951956,
"epoch": 0.58
},
{
"loss": 9.1188,
"grad_norm": 1.1300309896469116,
"learning_rate": 0.0008048481666229107,
"epoch": 0.59
},
{
"loss": 8.7167,
"grad_norm": 0.9951778650283813,
"learning_rate": 0.0008039730462938655,
"epoch": 0.59
},
{
"loss": 9.1088,
"grad_norm": 0.9415300488471985,
"learning_rate": 0.0008030979259648202,
"epoch": 0.59
},
{
"loss": 8.4083,
"grad_norm": 0.990203320980072,
"learning_rate": 0.000802222805635775,
"epoch": 0.59
},
{
"loss": 8.4926,
"grad_norm": 1.0430456399917603,
"learning_rate": 0.0008013476853067298,
"epoch": 0.6
},
{
"loss": 9.3307,
"grad_norm": 0.9623116254806519,
"learning_rate": 0.0008004725649776845,
"epoch": 0.6
},
{
"loss": 8.8633,
"grad_norm": 1.0354257822036743,
"learning_rate": 0.0007995974446486392,
"epoch": 0.6
},
{
"loss": 8.7932,
"grad_norm": 1.1962673664093018,
"learning_rate": 0.0007987223243195939,
"epoch": 0.6
},
{
"loss": 8.4265,
"grad_norm": 1.0186195373535156,
"learning_rate": 0.0007978472039905487,
"epoch": 0.61
},
{
"loss": 8.4596,
"grad_norm": 1.2448772192001343,
"learning_rate": 0.0007969720836615035,
"epoch": 0.61
},
{
"loss": 9.0019,
"grad_norm": 1.11643385887146,
"learning_rate": 0.0007960969633324582,
"epoch": 0.61
},
{
"loss": 8.7469,
"grad_norm": 1.9622658491134644,
"learning_rate": 0.000795221843003413,
"epoch": 0.61
},
{
"loss": 8.208,
"grad_norm": 0.9547304511070251,
"learning_rate": 0.0007943467226743676,
"epoch": 0.62
},
{
"loss": 8.3751,
"grad_norm": 0.8313985466957092,
"learning_rate": 0.0007934716023453224,
"epoch": 0.62
},
{
"loss": 8.6238,
"grad_norm": 0.9323874711990356,
"learning_rate": 0.0007925964820162772,
"epoch": 0.62
},
{
"loss": 9.0078,
"grad_norm": 1.0662554502487183,
"learning_rate": 0.0007917213616872319,
"epoch": 0.62
},
{
"loss": 8.7407,
"grad_norm": 1.197045087814331,
"learning_rate": 0.0007908462413581867,
"epoch": 0.63
},
{
"loss": 8.9698,
"grad_norm": 1.0494697093963623,
"learning_rate": 0.0007899711210291415,
"epoch": 0.63
},
{
"loss": 8.56,
"grad_norm": 0.9860395789146423,
"learning_rate": 0.0007890960007000962,
"epoch": 0.63
},
{
"loss": 8.624,
"grad_norm": 0.8026842474937439,
"learning_rate": 0.000788220880371051,
"epoch": 0.64
},
{
"loss": 9.1911,
"grad_norm": 1.0249046087265015,
"learning_rate": 0.0007873457600420057,
"epoch": 0.64
},
{
"loss": 8.552,
"grad_norm": 1.3037137985229492,
"learning_rate": 0.0007864706397129605,
"epoch": 0.64
},
{
"loss": 8.6872,
"grad_norm": 1.1018158197402954,
"learning_rate": 0.0007855955193839153,
"epoch": 0.64
},
{
"loss": 8.5007,
"grad_norm": 0.9974724054336548,
"learning_rate": 0.00078472039905487,
"epoch": 0.65
},
{
"loss": 9.3866,
"grad_norm": 1.2537139654159546,
"learning_rate": 0.0007838452787258248,
"epoch": 0.65
},
{
"loss": 8.9869,
"grad_norm": 1.2758492231369019,
"learning_rate": 0.0007829701583967795,
"epoch": 0.65
},
{
"loss": 8.266,
"grad_norm": 0.9684768915176392,
"learning_rate": 0.0007820950380677343,
"epoch": 0.65
},
{
"loss": 9.0718,
"grad_norm": 1.0212547779083252,
"learning_rate": 0.0007812199177386891,
"epoch": 0.66
},
{
"loss": 8.1438,
"grad_norm": 1.2493318319320679,
"learning_rate": 0.0007803447974096438,
"epoch": 0.66
},
{
"loss": 8.4132,
"grad_norm": 0.8168124556541443,
"learning_rate": 0.0007794696770805986,
"epoch": 0.66
},
{
"loss": 8.4466,
"grad_norm": 1.2837003469467163,
"learning_rate": 0.0007785945567515534,
"epoch": 0.66
},
{
"loss": 8.6008,
"grad_norm": 1.1589733362197876,
"learning_rate": 0.0007777194364225081,
"epoch": 0.67
},
{
"loss": 8.7002,
"grad_norm": 1.036216378211975,
"learning_rate": 0.0007768443160934629,
"epoch": 0.67
},
{
"loss": 8.9616,
"grad_norm": 0.9488565921783447,
"learning_rate": 0.0007759691957644176,
"epoch": 0.67
},
{
"loss": 8.9011,
"grad_norm": 1.1349655389785767,
"learning_rate": 0.0007750940754353724,
"epoch": 0.67
},
{
"loss": 8.7398,
"grad_norm": 1.3466508388519287,
"learning_rate": 0.0007742189551063272,
"epoch": 0.68
},
{
"loss": 8.1787,
"grad_norm": 1.1343966722488403,
"learning_rate": 0.0007733438347772819,
"epoch": 0.68
},
{
"loss": 8.4513,
"grad_norm": 0.9983484148979187,
"learning_rate": 0.0007724687144482366,
"epoch": 0.68
},
{
"loss": 8.6249,
"grad_norm": 1.4816855192184448,
"learning_rate": 0.0007715935941191913,
"epoch": 0.69
},
{
"loss": 8.9094,
"grad_norm": 1.0790578126907349,
"learning_rate": 0.0007707184737901461,
"epoch": 0.69
},
{
"loss": 8.0177,
"grad_norm": 1.2572119235992432,
"learning_rate": 0.0007698433534611009,
"epoch": 0.69
},
{
"loss": 8.5014,
"grad_norm": 1.123079776763916,
"learning_rate": 0.0007689682331320556,
"epoch": 0.69
},
{
"loss": 8.2177,
"grad_norm": 0.8789654970169067,
"learning_rate": 0.0007680931128030104,
"epoch": 0.7
},
{
"loss": 8.3753,
"grad_norm": 0.9512013792991638,
"learning_rate": 0.0007672179924739651,
"epoch": 0.7
},
{
"loss": 8.5434,
"grad_norm": 1.929919719696045,
"learning_rate": 0.0007663428721449199,
"epoch": 0.7
},
{
"loss": 8.5505,
"grad_norm": 1.1756147146224976,
"learning_rate": 0.0007654677518158747,
"epoch": 0.7
},
{
"loss": 8.8823,
"grad_norm": 1.1833679676055908,
"learning_rate": 0.0007645926314868294,
"epoch": 0.71
},
{
"loss": 8.6715,
"grad_norm": 1.4701839685440063,
"learning_rate": 0.0007637175111577842,
"epoch": 0.71
},
{
"loss": 8.7559,
"grad_norm": 0.9352959990501404,
"learning_rate": 0.0007629299028616435,
"epoch": 0.71
},
{
"loss": 9.5594,
"grad_norm": 1.0391898155212402,
"learning_rate": 0.0007620547825325983,
"epoch": 0.71
},
{
"loss": 8.3431,
"grad_norm": 1.0766905546188354,
"learning_rate": 0.000761179662203553,
"epoch": 0.72
},
{
"loss": 8.3928,
"grad_norm": 1.10299551486969,
"learning_rate": 0.0007603045418745078,
"epoch": 0.72
},
{
"loss": 8.9913,
"grad_norm": 1.1581339836120605,
"learning_rate": 0.0007594294215454624,
"epoch": 0.72
},
{
"loss": 8.5142,
"grad_norm": 1.086441993713379,
"learning_rate": 0.0007585543012164172,
"epoch": 0.72
},
{
"loss": 8.7005,
"grad_norm": 0.9478667974472046,
"learning_rate": 0.000757679180887372,
"epoch": 0.73
},
{
"loss": 8.608,
"grad_norm": 1.0929220914840698,
"learning_rate": 0.0007568040605583267,
"epoch": 0.73
},
{
"loss": 8.1125,
"grad_norm": 1.217629313468933,
"learning_rate": 0.0007559289402292815,
"epoch": 0.73
},
{
"loss": 8.4331,
"grad_norm": 1.2786823511123657,
"learning_rate": 0.0007550538199002362,
"epoch": 0.73
},
{
"loss": 9.1985,
"grad_norm": 1.0184354782104492,
"learning_rate": 0.000754178699571191,
"epoch": 0.74
},
{
"loss": 8.6549,
"grad_norm": 0.93660968542099,
"learning_rate": 0.0007533035792421458,
"epoch": 0.74
},
{
"loss": 8.7819,
"grad_norm": 1.0092636346817017,
"learning_rate": 0.0007524284589131005,
"epoch": 0.74
},
{
"loss": 8.3759,
"grad_norm": 1.2108792066574097,
"learning_rate": 0.0007515533385840553,
"epoch": 0.75
},
{
"loss": 8.4973,
"grad_norm": 0.9994498491287231,
"learning_rate": 0.00075067821825501,
"epoch": 0.75
},
{
"loss": 8.3731,
"grad_norm": 1.153273344039917,
"learning_rate": 0.0007498030979259648,
"epoch": 0.75
},
{
"loss": 8.4148,
"grad_norm": 1.051223874092102,
"learning_rate": 0.0007489279775969196,
"epoch": 0.75
},
{
"loss": 8.6672,
"grad_norm": 1.4810237884521484,
"learning_rate": 0.0007480528572678743,
"epoch": 0.76
},
{
"loss": 8.6439,
"grad_norm": 1.021606206893921,
"learning_rate": 0.0007471777369388291,
"epoch": 0.76
},
{
"loss": 8.7591,
"grad_norm": 0.8680776357650757,
"learning_rate": 0.0007463026166097839,
"epoch": 0.76
},
{
"loss": 9.0187,
"grad_norm": 1.0177042484283447,
"learning_rate": 0.0007454274962807386,
"epoch": 0.76
},
{
"loss": 8.9481,
"grad_norm": 1.2384392023086548,
"learning_rate": 0.0007445523759516934,
"epoch": 0.77
},
{
"loss": 8.6184,
"grad_norm": 1.3748959302902222,
"learning_rate": 0.0007436772556226481,
"epoch": 0.77
},
{
"loss": 8.3906,
"grad_norm": 1.042493462562561,
"learning_rate": 0.0007428021352936029,
"epoch": 0.77
},
{
"loss": 9.3308,
"grad_norm": 1.0647776126861572,
"learning_rate": 0.0007419270149645576,
"epoch": 0.77
},
{
"loss": 8.332,
"grad_norm": 1.2385993003845215,
"learning_rate": 0.0007410518946355123,
"epoch": 0.78
},
{
"loss": 8.3127,
"grad_norm": 1.0191227197647095,
"learning_rate": 0.0007401767743064671,
"epoch": 0.78
},
{
"loss": 8.3151,
"grad_norm": 0.8735216856002808,
"learning_rate": 0.0007393016539774218,
"epoch": 0.78
},
{
"loss": 8.701,
"grad_norm": 1.202993392944336,
"learning_rate": 0.0007384265336483766,
"epoch": 0.78
},
{
"loss": 7.8262,
"grad_norm": 0.9682905673980713,
"learning_rate": 0.0007375514133193314,
"epoch": 0.79
},
{
"loss": 8.4729,
"grad_norm": 1.2290154695510864,
"learning_rate": 0.0007366762929902861,
"epoch": 0.79
},
{
"loss": 8.9253,
"grad_norm": 1.0369175672531128,
"learning_rate": 0.0007358011726612409,
"epoch": 0.79
},
{
"loss": 9.2036,
"grad_norm": 1.0748445987701416,
"learning_rate": 0.0007349260523321957,
"epoch": 0.8
},
{
"loss": 8.2364,
"grad_norm": 1.147964596748352,
"learning_rate": 0.0007340509320031504,
"epoch": 0.8
},
{
"loss": 9.006,
"grad_norm": 1.0363622903823853,
"learning_rate": 0.0007331758116741052,
"epoch": 0.8
},
{
"loss": 8.7969,
"grad_norm": 1.2576889991760254,
"learning_rate": 0.0007323006913450599,
"epoch": 0.8
},
{
"loss": 8.4052,
"grad_norm": 1.1075588464736938,
"learning_rate": 0.0007314255710160147,
"epoch": 0.81
},
{
"loss": 8.5912,
"grad_norm": 1.0697672367095947,
"learning_rate": 0.0007305504506869695,
"epoch": 0.81
},
{
"loss": 8.7837,
"grad_norm": 1.0865002870559692,
"learning_rate": 0.0007296753303579242,
"epoch": 0.81
},
{
"loss": 8.0798,
"grad_norm": 1.3645957708358765,
"learning_rate": 0.000728800210028879,
"epoch": 0.81
},
{
"loss": 8.2649,
"grad_norm": 1.0889688730239868,
"learning_rate": 0.0007279250896998337,
"epoch": 0.82
},
{
"loss": 7.902,
"grad_norm": 0.9943633675575256,
"learning_rate": 0.0007270499693707885,
"epoch": 0.82
},
{
"loss": 8.493,
"grad_norm": 1.3548861742019653,
"learning_rate": 0.0007261748490417433,
"epoch": 0.82
},
{
"loss": 9.2024,
"grad_norm": 1.1603728532791138,
"learning_rate": 0.000725299728712698,
"epoch": 0.82
},
{
"loss": 8.7272,
"grad_norm": 1.2872350215911865,
"learning_rate": 0.0007244246083836528,
"epoch": 0.83
},
{
"loss": 8.8292,
"grad_norm": 1.0431410074234009,
"learning_rate": 0.0007235494880546076,
"epoch": 0.83
},
{
"loss": 8.0473,
"grad_norm": 0.9648978114128113,
"learning_rate": 0.0007226743677255623,
"epoch": 0.83
},
{
"loss": 8.134,
"grad_norm": 0.8962783217430115,
"learning_rate": 0.0007217992473965171,
"epoch": 0.83
},
{
"loss": 8.2796,
"grad_norm": 0.8879069685935974,
"learning_rate": 0.0007209241270674718,
"epoch": 0.84
},
{
"loss": 8.6275,
"grad_norm": 1.0046008825302124,
"learning_rate": 0.0007200490067384265,
"epoch": 0.84
},
{
"loss": 8.2847,
"grad_norm": 1.1034067869186401,
"learning_rate": 0.0007191738864093813,
"epoch": 0.84
},
{
"loss": 8.723,
"grad_norm": 0.9179050326347351,
"learning_rate": 0.000718298766080336,
"epoch": 0.85
},
{
"loss": 8.2843,
"grad_norm": 1.0402296781539917,
"learning_rate": 0.0007174236457512908,
"epoch": 0.85
},
{
"loss": 8.2487,
"grad_norm": 1.2751373052597046,
"learning_rate": 0.0007165485254222455,
"epoch": 0.85
},
{
"loss": 8.3491,
"grad_norm": 0.8596373200416565,
"learning_rate": 0.0007156734050932003,
"epoch": 0.85
},
{
"loss": 8.4695,
"grad_norm": 1.0553058385849,
"learning_rate": 0.0007147982847641551,
"epoch": 0.86
},
{
"loss": 8.74,
"grad_norm": 1.0505644083023071,
"learning_rate": 0.0007139231644351098,
"epoch": 0.86
},
{
"loss": 8.3704,
"grad_norm": 1.4136569499969482,
"learning_rate": 0.0007130480441060646,
"epoch": 0.86
},
{
"loss": 7.9998,
"grad_norm": 0.9397268295288086,
"learning_rate": 0.0007121729237770194,
"epoch": 0.86
},
{
"loss": 8.5978,
"grad_norm": 1.1479915380477905,
"learning_rate": 0.0007112978034479741,
"epoch": 0.87
},
{
"loss": 8.6225,
"grad_norm": 1.0489866733551025,
"learning_rate": 0.0007104226831189289,
"epoch": 0.87
},
{
"loss": 8.3155,
"grad_norm": 0.9371022582054138,
"learning_rate": 0.0007095475627898836,
"epoch": 0.87
},
{
"loss": 8.3844,
"grad_norm": 1.1981381177902222,
"learning_rate": 0.0007086724424608384,
"epoch": 0.87
},
{
"loss": 8.5061,
"grad_norm": 0.8924277424812317,
"learning_rate": 0.0007077973221317932,
"epoch": 0.88
},
{
"loss": 8.1918,
"grad_norm": 1.4077969789505005,
"learning_rate": 0.0007069222018027479,
"epoch": 0.88
},
{
"loss": 8.3377,
"grad_norm": 1.1926066875457764,
"learning_rate": 0.0007060470814737027,
"epoch": 0.88
},
{
"loss": 8.4682,
"grad_norm": 1.1524171829223633,
"learning_rate": 0.0007051719611446574,
"epoch": 0.88
},
{
"loss": 8.5678,
"grad_norm": 1.0660207271575928,
"learning_rate": 0.0007042968408156122,
"epoch": 0.89
},
{
"loss": 7.9908,
"grad_norm": 1.1786776781082153,
"learning_rate": 0.000703421720486567,
"epoch": 0.89
},
{
"loss": 9.0339,
"grad_norm": 0.9970653057098389,
"learning_rate": 0.0007025466001575217,
"epoch": 0.89
},
{
"loss": 8.6511,
"grad_norm": 1.171247124671936,
"learning_rate": 0.0007016714798284765,
"epoch": 0.9
},
{
"loss": 8.0249,
"grad_norm": 1.1036537885665894,
"learning_rate": 0.0007007963594994313,
"epoch": 0.9
},
{
"loss": 8.2895,
"grad_norm": 1.4363912343978882,
"learning_rate": 0.000699921239170386,
"epoch": 0.9
},
{
"loss": 8.4263,
"grad_norm": 1.2977561950683594,
"learning_rate": 0.0006990461188413408,
"epoch": 0.9
},
{
"loss": 8.3236,
"grad_norm": 1.2732399702072144,
"learning_rate": 0.0006981709985122954,
"epoch": 0.91
},
{
"loss": 8.0876,
"grad_norm": 0.8092446327209473,
"learning_rate": 0.0006972958781832502,
"epoch": 0.91
},
{
"loss": 8.3052,
"grad_norm": 1.0607753992080688,
"learning_rate": 0.000696420757854205,
"epoch": 0.91
},
{
"loss": 8.2821,
"grad_norm": 1.2833763360977173,
"learning_rate": 0.0006955456375251597,
"epoch": 0.91
},
{
"loss": 8.0437,
"grad_norm": 1.2291605472564697,
"learning_rate": 0.0006946705171961145,
"epoch": 0.92
},
{
"loss": 7.9172,
"grad_norm": 0.9950680732727051,
"learning_rate": 0.0006937953968670692,
"epoch": 0.92
},
{
"loss": 7.8579,
"grad_norm": 1.170876145362854,
"learning_rate": 0.000692920276538024,
"epoch": 0.92
},
{
"loss": 8.7343,
"grad_norm": 1.0266340970993042,
"learning_rate": 0.0006920451562089788,
"epoch": 0.92
},
{
"loss": 8.3685,
"grad_norm": 1.1194366216659546,
"learning_rate": 0.0006911700358799335,
"epoch": 0.93
},
{
"loss": 8.8983,
"grad_norm": 1.130362868309021,
"learning_rate": 0.0006902949155508883,
"epoch": 0.93
},
{
"loss": 8.3624,
"grad_norm": 1.2582019567489624,
"learning_rate": 0.000689419795221843,
"epoch": 0.93
},
{
"loss": 8.5332,
"grad_norm": 1.0985493659973145,
"learning_rate": 0.0006885446748927978,
"epoch": 0.93
},
{
"loss": 8.263,
"grad_norm": 1.0480501651763916,
"learning_rate": 0.0006876695545637526,
"epoch": 0.94
},
{
"loss": 8.1911,
"grad_norm": 1.085471510887146,
"learning_rate": 0.0006867944342347073,
"epoch": 0.94
},
{
"loss": 8.6767,
"grad_norm": 1.109959602355957,
"learning_rate": 0.0006859193139056621,
"epoch": 0.94
},
{
"loss": 8.1904,
"grad_norm": 0.9299295544624329,
"learning_rate": 0.0006850441935766169,
"epoch": 0.94
},
{
"loss": 7.9858,
"grad_norm": 1.3819242715835571,
"learning_rate": 0.0006841690732475716,
"epoch": 0.95
},
{
"loss": 8.3134,
"grad_norm": 1.499324083328247,
"learning_rate": 0.0006832939529185264,
"epoch": 0.95
},
{
"loss": 8.1389,
"grad_norm": 1.0068879127502441,
"learning_rate": 0.0006824188325894811,
"epoch": 0.95
},
{
"loss": 8.0979,
"grad_norm": 1.232861876487732,
"learning_rate": 0.0006815437122604359,
"epoch": 0.96
},
{
"loss": 8.1456,
"grad_norm": 1.020922064781189,
"learning_rate": 0.0006806685919313907,
"epoch": 0.96
},
{
"loss": 8.1438,
"grad_norm": 1.2880629301071167,
"learning_rate": 0.0006797934716023453,
"epoch": 0.96
},
{
"loss": 7.8589,
"grad_norm": 1.2720872163772583,
"learning_rate": 0.0006789183512733001,
"epoch": 0.96
},
{
"loss": 8.338,
"grad_norm": 1.1569981575012207,
"learning_rate": 0.0006780432309442548,
"epoch": 0.97
},
{
"loss": 7.6167,
"grad_norm": 1.0755385160446167,
"learning_rate": 0.0006771681106152095,
"epoch": 0.97
},
{
"loss": 9.1889,
"grad_norm": 1.1371173858642578,
"learning_rate": 0.0006762929902861643,
"epoch": 0.97
},
{
"loss": 8.1603,
"grad_norm": 1.2543790340423584,
"learning_rate": 0.000675417869957119,
"epoch": 0.97
},
{
"loss": 8.1684,
"grad_norm": 1.665987491607666,
"learning_rate": 0.0006745427496280738,
"epoch": 0.98
},
{
"loss": 8.4957,
"grad_norm": 1.1479765176773071,
"learning_rate": 0.0006736676292990285,
"epoch": 0.98
},
{
"loss": 7.998,
"grad_norm": 1.1416277885437012,
"learning_rate": 0.0006727925089699833,
"epoch": 0.98
},
{
"loss": 8.4458,
"grad_norm": 1.2610832452774048,
"learning_rate": 0.0006719173886409381,
"epoch": 0.98
},
{
"loss": 8.2715,
"grad_norm": 1.2478748559951782,
"learning_rate": 0.0006710422683118928,
"epoch": 0.99
},
{
"loss": 8.0882,
"grad_norm": 0.9021313190460205,
"learning_rate": 0.0006701671479828476,
"epoch": 0.99
},
{
"loss": 8.2404,
"grad_norm": 1.0023951530456543,
"learning_rate": 0.0006692920276538024,
"epoch": 0.99
},
{
"loss": 8.681,
"grad_norm": 1.3342375755310059,
"learning_rate": 0.0006684169073247571,
"epoch": 0.99
},
{
"loss": 8.024,
"grad_norm": 1.0199118852615356,
"learning_rate": 0.0006675417869957119,
"epoch": 1.0
},
{
"loss": 8.3688,
"grad_norm": 0.893786609172821,
"learning_rate": 0.0006666666666666666,
"epoch": 1.0
},
{
"loss": 8.0561,
"grad_norm": 1.2774296998977661,
"learning_rate": 0.0006657915463376214,
"epoch": 1.0
},
{
"loss": 7.8444,
"grad_norm": 1.0824223756790161,
"learning_rate": 0.0006649164260085762,
"epoch": 1.01
},
{
"loss": 8.1771,
"grad_norm": 0.869452178478241,
"learning_rate": 0.0006640413056795309,
"epoch": 1.01
},
{
"loss": 7.6838,
"grad_norm": 1.1132241487503052,
"learning_rate": 0.0006631661853504857,
"epoch": 1.01
},
{
"loss": 7.9475,
"grad_norm": 1.2853749990463257,
"learning_rate": 0.0006622910650214405,
"epoch": 1.01
},
{
"loss": 8.8546,
"grad_norm": 1.2339048385620117,
"learning_rate": 0.0006614159446923952,
"epoch": 1.02
},
{
"loss": 8.1339,
"grad_norm": 1.2211487293243408,
"learning_rate": 0.00066054082436335,
"epoch": 1.02
},
{
"loss": 7.402,
"grad_norm": 1.0966975688934326,
"learning_rate": 0.0006596657040343047,
"epoch": 1.02
},
{
"loss": 8.1777,
"grad_norm": 1.0253325700759888,
"learning_rate": 0.0006587905837052595,
"epoch": 1.02
},
{
"loss": 8.2748,
"grad_norm": 1.2987836599349976,
"learning_rate": 0.0006579154633762143,
"epoch": 1.03
},
{
"loss": 8.3225,
"grad_norm": 0.945371687412262,
"learning_rate": 0.000657040343047169,
"epoch": 1.03
},
{
"loss": 8.4416,
"grad_norm": 1.0868079662322998,
"learning_rate": 0.0006561652227181238,
"epoch": 1.03
},
{
"loss": 8.1007,
"grad_norm": 1.0190479755401611,
"learning_rate": 0.0006552901023890784,
"epoch": 1.03
},
{
"loss": 8.1317,
"grad_norm": 1.0896625518798828,
"learning_rate": 0.0006544149820600332,
"epoch": 1.04
},
{
"loss": 7.7364,
"grad_norm": 1.1690502166748047,
"learning_rate": 0.000653539861730988,
"epoch": 1.04
},
{
"loss": 7.8173,
"grad_norm": 1.0521645545959473,
"learning_rate": 0.0006526647414019427,
"epoch": 1.04
},
{
"loss": 7.6212,
"grad_norm": 1.3057899475097656,
"learning_rate": 0.0006517896210728975,
"epoch": 1.04
},
{
"loss": 8.0228,
"grad_norm": 0.968885064125061,
"learning_rate": 0.0006509145007438522,
"epoch": 1.05
},
{
"loss": 7.8535,
"grad_norm": 1.1838873624801636,
"learning_rate": 0.000650039380414807,
"epoch": 1.05
},
{
"loss": 8.1991,
"grad_norm": 1.0967016220092773,
"learning_rate": 0.0006491642600857618,
"epoch": 1.05
},
{
"loss": 8.1515,
"grad_norm": 1.0798629522323608,
"learning_rate": 0.0006482891397567165,
"epoch": 1.06
},
{
"loss": 8.291,
"grad_norm": 1.1506596803665161,
"learning_rate": 0.0006474140194276713,
"epoch": 1.06
},
{
"loss": 7.956,
"grad_norm": 1.0459505319595337,
"learning_rate": 0.0006465388990986261,
"epoch": 1.06
},
{
"loss": 8.4393,
"grad_norm": 1.070776343345642,
"learning_rate": 0.0006456637787695808,
"epoch": 1.06
},
{
"loss": 8.5445,
"grad_norm": 1.3064284324645996,
"learning_rate": 0.0006447886584405356,
"epoch": 1.07
},
{
"loss": 8.701,
"grad_norm": 1.0707839727401733,
"learning_rate": 0.0006439135381114903,
"epoch": 1.07
},
{
"loss": 7.4342,
"grad_norm": 1.123377799987793,
"learning_rate": 0.0006430384177824451,
"epoch": 1.07
},
{
"loss": 8.4883,
"grad_norm": 1.7230886220932007,
"learning_rate": 0.0006421632974533999,
"epoch": 1.07
},
{
"loss": 8.5288,
"grad_norm": 0.9721227288246155,
"learning_rate": 0.0006412881771243546,
"epoch": 1.08
},
{
"loss": 7.8249,
"grad_norm": 1.2729851007461548,
"learning_rate": 0.0006404130567953094,
"epoch": 1.08
},
{
"loss": 8.3277,
"grad_norm": 0.9693044424057007,
"learning_rate": 0.0006395379364662642,
"epoch": 1.08
},
{
"loss": 7.8798,
"grad_norm": 1.104020118713379,
"learning_rate": 0.0006386628161372189,
"epoch": 1.08
},
{
"loss": 7.899,
"grad_norm": 1.0556141138076782,
"learning_rate": 0.0006377876958081737,
"epoch": 1.09
},
{
"loss": 8.6403,
"grad_norm": 1.227303147315979,
"learning_rate": 0.0006369125754791284,
"epoch": 1.09
},
{
"loss": 8.7407,
"grad_norm": 1.2486103773117065,
"learning_rate": 0.0006360374551500832,
"epoch": 1.09
},
{
"loss": 8.226,
"grad_norm": 1.1452488899230957,
"learning_rate": 0.000635162334821038,
"epoch": 1.09
},
{
"loss": 8.5083,
"grad_norm": 1.466182827949524,
"learning_rate": 0.0006342872144919927,
"epoch": 1.1
},
{
"loss": 7.8041,
"grad_norm": 1.2693302631378174,
"learning_rate": 0.0006334120941629474,
"epoch": 1.1
},
{
"loss": 7.918,
"grad_norm": 1.1236190795898438,
"learning_rate": 0.0006325369738339021,
"epoch": 1.1
},
{
"loss": 7.8792,
"grad_norm": 0.9166776537895203,
"learning_rate": 0.0006316618535048569,
"epoch": 1.11
},
{
"loss": 8.3714,
"grad_norm": 1.2021427154541016,
"learning_rate": 0.0006307867331758117,
"epoch": 1.11
},
{
"loss": 8.5282,
"grad_norm": 1.1508140563964844,
"learning_rate": 0.0006299116128467664,
"epoch": 1.11
},
{
"loss": 7.7235,
"grad_norm": 1.044027328491211,
"learning_rate": 0.0006290364925177212,
"epoch": 1.11
},
{
"loss": 8.0483,
"grad_norm": 1.00051748752594,
"learning_rate": 0.000628161372188676,
"epoch": 1.12
},
{
"loss": 8.0003,
"grad_norm": 1.0397716760635376,
"learning_rate": 0.0006272862518596307,
"epoch": 1.12
},
{
"loss": 8.274,
"grad_norm": 1.0577192306518555,
"learning_rate": 0.0006264111315305855,
"epoch": 1.12
},
{
"loss": 7.8435,
"grad_norm": 1.1829681396484375,
"learning_rate": 0.0006255360112015402,
"epoch": 1.12
},
{
"loss": 8.5019,
"grad_norm": 1.9353641271591187,
"learning_rate": 0.000624660890872495,
"epoch": 1.13
},
{
"loss": 8.4582,
"grad_norm": 1.237269639968872,
"learning_rate": 0.0006237857705434498,
"epoch": 1.13
},
{
"loss": 8.0735,
"grad_norm": 1.1674834489822388,
"learning_rate": 0.0006229106502144045,
"epoch": 1.13
},
{
"loss": 8.3781,
"grad_norm": 1.32883620262146,
"learning_rate": 0.0006220355298853593,
"epoch": 1.13
},
{
"loss": 8.723,
"grad_norm": 1.3197271823883057,
"learning_rate": 0.000621160409556314,
"epoch": 1.14
},
{
"loss": 8.414,
"grad_norm": 1.137764573097229,
"learning_rate": 0.0006202852892272688,
"epoch": 1.14
},
{
"loss": 7.9197,
"grad_norm": 1.1574738025665283,
"learning_rate": 0.0006194101688982236,
"epoch": 1.14
},
{
"loss": 8.09,
"grad_norm": 1.0444676876068115,
"learning_rate": 0.0006185350485691783,
"epoch": 1.14
},
{
"loss": 7.3329,
"grad_norm": 0.8655235767364502,
"learning_rate": 0.0006176599282401331,
"epoch": 1.15
},
{
"loss": 8.4163,
"grad_norm": 0.9860300421714783,
"learning_rate": 0.0006167848079110879,
"epoch": 1.15
},
{
"loss": 8.2608,
"grad_norm": 1.1680139303207397,
"learning_rate": 0.0006159096875820426,
"epoch": 1.15
},
{
"loss": 7.9283,
"grad_norm": 1.545938491821289,
"learning_rate": 0.0006150345672529974,
"epoch": 1.15
},
{
"loss": 8.4113,
"grad_norm": 1.2768994569778442,
"learning_rate": 0.0006141594469239521,
"epoch": 1.16
},
{
"loss": 8.2389,
"grad_norm": 1.0001721382141113,
"learning_rate": 0.0006132843265949069,
"epoch": 1.16
},
{
"loss": 8.397,
"grad_norm": 1.8651808500289917,
"learning_rate": 0.0006124092062658617,
"epoch": 1.16
},
{
"loss": 8.003,
"grad_norm": 0.947693407535553,
"learning_rate": 0.0006115340859368163,
"epoch": 1.17
},
{
"loss": 7.5861,
"grad_norm": 1.1168384552001953,
"learning_rate": 0.0006106589656077711,
"epoch": 1.17
},
{
"loss": 8.7788,
"grad_norm": 1.1341112852096558,
"learning_rate": 0.0006097838452787258,
"epoch": 1.17
},
{
"loss": 7.9428,
"grad_norm": 1.2905473709106445,
"learning_rate": 0.0006089087249496806,
"epoch": 1.17
},
{
"loss": 8.6196,
"grad_norm": 0.9961435794830322,
"learning_rate": 0.0006080336046206354,
"epoch": 1.18
},
{
"loss": 8.224,
"grad_norm": 1.3134316205978394,
"learning_rate": 0.0006071584842915901,
"epoch": 1.18
},
{
"loss": 7.9156,
"grad_norm": 1.5898418426513672,
"learning_rate": 0.0006062833639625449,
"epoch": 1.18
},
{
"loss": 8.2147,
"grad_norm": 0.99250727891922,
"learning_rate": 0.0006054082436334996,
"epoch": 1.18
},
{
"loss": 7.6957,
"grad_norm": 1.2642431259155273,
"learning_rate": 0.0006045331233044544,
"epoch": 1.19
},
{
"loss": 7.7926,
"grad_norm": 1.314082384109497,
"learning_rate": 0.0006036580029754092,
"epoch": 1.19
},
{
"loss": 7.9682,
"grad_norm": 1.1342573165893555,
"learning_rate": 0.0006027828826463639,
"epoch": 1.19
},
{
"loss": 8.0208,
"grad_norm": 1.3015680313110352,
"learning_rate": 0.0006019077623173187,
"epoch": 1.19
},
{
"loss": 8.3608,
"grad_norm": 0.9990431666374207,
"learning_rate": 0.0006010326419882735,
"epoch": 1.2
},
{
"loss": 8.2009,
"grad_norm": 0.9804344773292542,
"learning_rate": 0.0006001575216592282,
"epoch": 1.2
},
{
"loss": 8.0484,
"grad_norm": 1.1591954231262207,
"learning_rate": 0.0005992824013301829,
"epoch": 1.2
},
{
"loss": 8.116,
"grad_norm": 1.042474627494812,
"learning_rate": 0.0005984072810011376,
"epoch": 1.2
},
{
"loss": 7.9246,
"grad_norm": 1.8579179048538208,
"learning_rate": 0.0005975321606720924,
"epoch": 1.21
},
{
"loss": 7.9183,
"grad_norm": 0.8727061748504639,
"learning_rate": 0.0005966570403430472,
"epoch": 1.21
},
{
"loss": 7.675,
"grad_norm": 1.0189380645751953,
"learning_rate": 0.0005957819200140019,
"epoch": 1.21
},
{
"loss": 7.6222,
"grad_norm": 1.0766206979751587,
"learning_rate": 0.0005949067996849567,
"epoch": 1.22
},
{
"loss": 7.6455,
"grad_norm": 1.121745228767395,
"learning_rate": 0.0005940316793559114,
"epoch": 1.22
},
{
"loss": 8.1449,
"grad_norm": 1.2497507333755493,
"learning_rate": 0.0005931565590268662,
"epoch": 1.22
},
{
"loss": 8.3586,
"grad_norm": 1.301903486251831,
"learning_rate": 0.000592281438697821,
"epoch": 1.22
},
{
"loss": 8.163,
"grad_norm": 1.1964079141616821,
"learning_rate": 0.0005914063183687757,
"epoch": 1.23
},
{
"loss": 8.2938,
"grad_norm": 1.1423827409744263,
"learning_rate": 0.0005905311980397304,
"epoch": 1.23
},
{
"loss": 8.165,
"grad_norm": 1.119884967803955,
"learning_rate": 0.0005896560777106851,
"epoch": 1.23
},
{
"loss": 7.7234,
"grad_norm": 1.4375518560409546,
"learning_rate": 0.0005887809573816399,
"epoch": 1.23
},
{
"loss": 8.0758,
"grad_norm": 1.1417185068130493,
"learning_rate": 0.0005879058370525947,
"epoch": 1.24
},
{
"loss": 7.9137,
"grad_norm": 1.048060417175293,
"learning_rate": 0.0005870307167235494,
"epoch": 1.24
},
{
"loss": 8.4029,
"grad_norm": 0.9880658388137817,
"learning_rate": 0.0005861555963945042,
"epoch": 1.24
},
{
"loss": 8.4489,
"grad_norm": 1.000611424446106,
"learning_rate": 0.000585280476065459,
"epoch": 1.24
},
{
"loss": 8.2688,
"grad_norm": 1.3099920749664307,
"learning_rate": 0.0005844053557364137,
"epoch": 1.25
},
{
"loss": 7.7948,
"grad_norm": 0.8548302054405212,
"learning_rate": 0.0005835302354073685,
"epoch": 1.25
},
{
"loss": 8.442,
"grad_norm": 1.1732860803604126,
"learning_rate": 0.0005826551150783232,
"epoch": 1.25
},
{
"loss": 7.6346,
"grad_norm": 0.803125262260437,
"learning_rate": 0.000581779994749278,
"epoch": 1.25
},
{
"loss": 8.0567,
"grad_norm": 1.258419156074524,
"learning_rate": 0.0005809048744202328,
"epoch": 1.26
},
{
"loss": 8.1142,
"grad_norm": 1.1331418752670288,
"learning_rate": 0.0005800297540911875,
"epoch": 1.26
},
{
"loss": 8.5457,
"grad_norm": 1.5619804859161377,
"learning_rate": 0.0005791546337621423,
"epoch": 1.26
},
{
"loss": 7.9416,
"grad_norm": 1.880534052848816,
"learning_rate": 0.000578279513433097,
"epoch": 1.27
},
{
"loss": 7.8216,
"grad_norm": 1.2279471158981323,
"learning_rate": 0.0005774043931040518,
"epoch": 1.27
},
{
"loss": 7.8216,
"grad_norm": 1.1597974300384521,
"learning_rate": 0.0005765292727750066,
"epoch": 1.27
},
{
"loss": 7.9033,
"grad_norm": 1.1710484027862549,
"learning_rate": 0.0005756541524459613,
"epoch": 1.27
},
{
"loss": 7.6036,
"grad_norm": 1.0655231475830078,
"learning_rate": 0.0005747790321169161,
"epoch": 1.28
},
{
"loss": 7.5982,
"grad_norm": 1.0066710710525513,
"learning_rate": 0.0005739039117878709,
"epoch": 1.28
},
{
"loss": 7.738,
"grad_norm": 1.1333460807800293,
"learning_rate": 0.0005730287914588256,
"epoch": 1.28
},
{
"loss": 8.0025,
"grad_norm": 1.468841791152954,
"learning_rate": 0.0005721536711297804,
"epoch": 1.28
},
{
"loss": 7.4888,
"grad_norm": 1.1363178491592407,
"learning_rate": 0.0005712785508007351,
"epoch": 1.29
},
{
"loss": 7.3176,
"grad_norm": 1.1589970588684082,
"learning_rate": 0.0005704034304716899,
"epoch": 1.29
},
{
"loss": 7.6323,
"grad_norm": 0.9033693075180054,
"learning_rate": 0.0005695283101426447,
"epoch": 1.29
},
{
"loss": 7.8839,
"grad_norm": 1.2384039163589478,
"learning_rate": 0.0005686531898135993,
"epoch": 1.29
},
{
"loss": 7.8408,
"grad_norm": 1.3826912641525269,
"learning_rate": 0.0005677780694845541,
"epoch": 1.3
},
{
"loss": 7.4433,
"grad_norm": 1.1403487920761108,
"learning_rate": 0.0005669029491555088,
"epoch": 1.3
},
{
"loss": 8.5407,
"grad_norm": 1.037423014640808,
"learning_rate": 0.0005660278288264636,
"epoch": 1.3
},
{
"loss": 8.0943,
"grad_norm": 1.4421013593673706,
"learning_rate": 0.0005651527084974184,
"epoch": 1.3
},
{
"loss": 7.7771,
"grad_norm": 1.2977713346481323,
"learning_rate": 0.0005642775881683731,
"epoch": 1.31
},
{
"loss": 7.54,
"grad_norm": 1.049196720123291,
"learning_rate": 0.0005634024678393279,
"epoch": 1.31
},
{
"loss": 7.4699,
"grad_norm": 1.0489652156829834,
"learning_rate": 0.0005625273475102827,
"epoch": 1.31
},
{
"loss": 7.9441,
"grad_norm": 1.1373968124389648,
"learning_rate": 0.0005616522271812374,
"epoch": 1.32
},
{
"loss": 7.2627,
"grad_norm": 1.0570902824401855,
"learning_rate": 0.0005607771068521922,
"epoch": 1.32
},
{
"loss": 7.7472,
"grad_norm": 1.0547776222229004,
"learning_rate": 0.0005599019865231469,
"epoch": 1.32
},
{
"loss": 7.8815,
"grad_norm": 1.2481534481048584,
"learning_rate": 0.0005590268661941017,
"epoch": 1.32
},
{
"loss": 8.2547,
"grad_norm": 1.1728442907333374,
"learning_rate": 0.0005581517458650565,
"epoch": 1.33
},
{
"loss": 7.5035,
"grad_norm": 1.0567808151245117,
"learning_rate": 0.0005572766255360112,
"epoch": 1.33
},
{
"loss": 7.9982,
"grad_norm": 0.8234537243843079,
"learning_rate": 0.000556401505206966,
"epoch": 1.33
},
{
"loss": 7.5333,
"grad_norm": 1.09587824344635,
"learning_rate": 0.0005555263848779207,
"epoch": 1.33
},
{
"loss": 7.768,
"grad_norm": 1.3897008895874023,
"learning_rate": 0.0005546512645488755,
"epoch": 1.34
},
{
"loss": 7.7645,
"grad_norm": 1.1089082956314087,
"learning_rate": 0.0005537761442198303,
"epoch": 1.34
},
{
"loss": 7.7809,
"grad_norm": 1.2678576707839966,
"learning_rate": 0.000552901023890785,
"epoch": 1.34
},
{
"loss": 7.7376,
"grad_norm": 1.3946635723114014,
"learning_rate": 0.0005520259035617398,
"epoch": 1.34
},
{
"loss": 8.2773,
"grad_norm": 1.3742512464523315,
"learning_rate": 0.0005511507832326946,
"epoch": 1.35
},
{
"loss": 7.7902,
"grad_norm": 1.416434645652771,
"learning_rate": 0.0005502756629036493,
"epoch": 1.35
},
{
"loss": 7.6157,
"grad_norm": 1.0419012308120728,
"learning_rate": 0.0005494005425746041,
"epoch": 1.35
},
{
"loss": 7.5897,
"grad_norm": 1.7180145978927612,
"learning_rate": 0.0005485254222455588,
"epoch": 1.35
},
{
"loss": 8.0068,
"grad_norm": 1.6651771068572998,
"learning_rate": 0.0005476503019165136,
"epoch": 1.36
},
{
"loss": 7.4023,
"grad_norm": 1.0715596675872803,
"learning_rate": 0.0005467751815874683,
"epoch": 1.36
},
{
"loss": 8.0369,
"grad_norm": 1.208898901939392,
"learning_rate": 0.000545900061258423,
"epoch": 1.36
},
{
"loss": 7.6188,
"grad_norm": 0.9920070767402649,
"learning_rate": 0.0005450249409293778,
"epoch": 1.36
},
{
"loss": 8.6854,
"grad_norm": 1.174086570739746,
"learning_rate": 0.0005441498206003325,
"epoch": 1.37
},
{
"loss": 7.5733,
"grad_norm": 1.244912028312683,
"learning_rate": 0.0005432747002712873,
"epoch": 1.37
},
{
"loss": 7.389,
"grad_norm": 1.5966273546218872,
"learning_rate": 0.0005423995799422421,
"epoch": 1.37
},
{
"loss": 8.1756,
"grad_norm": 1.0320965051651,
"learning_rate": 0.0005415244596131968,
"epoch": 1.38
},
{
"loss": 8.897,
"grad_norm": 1.2478450536727905,
"learning_rate": 0.0005406493392841516,
"epoch": 1.38
},
{
"loss": 7.6083,
"grad_norm": 1.4347364902496338,
"learning_rate": 0.0005397742189551064,
"epoch": 1.38
},
{
"loss": 7.9916,
"grad_norm": 1.1878119707107544,
"learning_rate": 0.0005388990986260611,
"epoch": 1.38
},
{
"loss": 8.1032,
"grad_norm": 1.3169543743133545,
"learning_rate": 0.0005380239782970159,
"epoch": 1.39
},
{
"loss": 7.3094,
"grad_norm": 1.271192193031311,
"learning_rate": 0.0005371488579679706,
"epoch": 1.39
},
{
"loss": 7.2947,
"grad_norm": 1.484824299812317,
"learning_rate": 0.0005362737376389254,
"epoch": 1.39
},
{
"loss": 7.7483,
"grad_norm": 1.0237884521484375,
"learning_rate": 0.0005353986173098802,
"epoch": 1.39
},
{
"loss": 7.7284,
"grad_norm": 1.141897201538086,
"learning_rate": 0.0005345234969808349,
"epoch": 1.4
},
{
"loss": 7.9684,
"grad_norm": 1.2076783180236816,
"learning_rate": 0.0005336483766517897,
"epoch": 1.4
},
{
"loss": 7.4731,
"grad_norm": 1.0815685987472534,
"learning_rate": 0.0005327732563227444,
"epoch": 1.4
},
{
"loss": 7.6468,
"grad_norm": 1.9115163087844849,
"learning_rate": 0.0005318981359936992,
"epoch": 1.4
},
{
"loss": 8.179,
"grad_norm": 1.1872133016586304,
"learning_rate": 0.000531023015664654,
"epoch": 1.41
},
{
"loss": 8.1254,
"grad_norm": 1.144726037979126,
"learning_rate": 0.0005301478953356087,
"epoch": 1.41
},
{
"loss": 7.7947,
"grad_norm": 1.562495231628418,
"learning_rate": 0.0005292727750065635,
"epoch": 1.41
},
{
"loss": 7.2917,
"grad_norm": 1.20420241355896,
"learning_rate": 0.0005283976546775183,
"epoch": 1.41
},
{
"loss": 7.9956,
"grad_norm": 1.0302613973617554,
"learning_rate": 0.000527522534348473,
"epoch": 1.42
},
{
"loss": 7.8058,
"grad_norm": 1.161452293395996,
"learning_rate": 0.0005266474140194278,
"epoch": 1.42
},
{
"loss": 8.2652,
"grad_norm": 1.2876991033554077,
"learning_rate": 0.0005257722936903825,
"epoch": 1.42
},
{
"loss": 8.0375,
"grad_norm": 1.1002925634384155,
"learning_rate": 0.0005248971733613372,
"epoch": 1.43
},
{
"loss": 7.82,
"grad_norm": 1.0201154947280884,
"learning_rate": 0.000524022053032292,
"epoch": 1.43
},
{
"loss": 8.3203,
"grad_norm": 1.1177037954330444,
"learning_rate": 0.0005231469327032467,
"epoch": 1.43
},
{
"loss": 7.9789,
"grad_norm": 1.4295682907104492,
"learning_rate": 0.0005222718123742015,
"epoch": 1.43
},
{
"loss": 8.0088,
"grad_norm": 1.4420737028121948,
"learning_rate": 0.0005213966920451562,
"epoch": 1.44
},
{
"loss": 7.8298,
"grad_norm": 1.1020231246948242,
"learning_rate": 0.000520521571716111,
"epoch": 1.44
},
{
"loss": 7.8801,
"grad_norm": 1.4339189529418945,
"learning_rate": 0.0005196464513870657,
"epoch": 1.44
},
{
"loss": 7.6756,
"grad_norm": 1.5243607759475708,
"learning_rate": 0.0005187713310580204,
"epoch": 1.44
},
{
"loss": 8.1007,
"grad_norm": 0.9880979657173157,
"learning_rate": 0.0005178962107289752,
"epoch": 1.45
},
{
"loss": 7.7396,
"grad_norm": 1.1447367668151855,
"learning_rate": 0.0005170210903999299,
"epoch": 1.45
},
{
"loss": 7.8537,
"grad_norm": 1.384048342704773,
"learning_rate": 0.0005161459700708847,
"epoch": 1.45
},
{
"loss": 7.8855,
"grad_norm": 1.3757721185684204,
"learning_rate": 0.0005152708497418395,
"epoch": 1.45
},
{
"loss": 7.8651,
"grad_norm": 1.1160024404525757,
"learning_rate": 0.0005143957294127942,
"epoch": 1.46
},
{
"loss": 7.8378,
"grad_norm": 0.9774546027183533,
"learning_rate": 0.000513520609083749,
"epoch": 1.46
},
{
"loss": 7.9251,
"grad_norm": 1.5181477069854736,
"learning_rate": 0.0005126454887547038,
"epoch": 1.46
},
{
"loss": 8.6781,
"grad_norm": 1.203229308128357,
"learning_rate": 0.0005117703684256585,
"epoch": 1.46
},
{
"loss": 7.6571,
"grad_norm": 1.0401496887207031,
"learning_rate": 0.0005108952480966133,
"epoch": 1.47
},
{
"loss": 7.3908,
"grad_norm": 1.3228225708007812,
"learning_rate": 0.000510020127767568,
"epoch": 1.47
},
{
"loss": 8.1244,
"grad_norm": 1.3072296380996704,
"learning_rate": 0.0005091450074385228,
"epoch": 1.47
},
{
"loss": 7.7535,
"grad_norm": 1.9105629920959473,
"learning_rate": 0.0005082698871094776,
"epoch": 1.48
},
{
"loss": 8.2387,
"grad_norm": 1.3035160303115845,
"learning_rate": 0.0005073947667804323,
"epoch": 1.48
},
{
"loss": 7.998,
"grad_norm": 0.9805745482444763,
"learning_rate": 0.0005065196464513871,
"epoch": 1.48
},
{
"loss": 8.0499,
"grad_norm": 1.28218412399292,
"learning_rate": 0.0005056445261223418,
"epoch": 1.48
},
{
"loss": 8.0939,
"grad_norm": 1.289697527885437,
"learning_rate": 0.0005047694057932966,
"epoch": 1.49
},
{
"loss": 7.8801,
"grad_norm": 1.3982206583023071,
"learning_rate": 0.0005038942854642513,
"epoch": 1.49
},
{
"loss": 7.5012,
"grad_norm": 1.1884011030197144,
"learning_rate": 0.000503019165135206,
"epoch": 1.49
},
{
"loss": 7.7792,
"grad_norm": 1.2014328241348267,
"learning_rate": 0.0005021440448061608,
"epoch": 1.49
},
{
"loss": 8.3151,
"grad_norm": 1.2958098649978638,
"learning_rate": 0.0005012689244771155,
"epoch": 1.5
},
{
"loss": 7.3702,
"grad_norm": 1.1195346117019653,
"learning_rate": 0.0005003938041480703,
"epoch": 1.5
},
{
"loss": 8.0952,
"grad_norm": 1.2185337543487549,
"learning_rate": 0.0004995186838190251,
"epoch": 1.5
},
{
"loss": 7.6605,
"grad_norm": 1.1054099798202515,
"learning_rate": 0.0004986435634899798,
"epoch": 1.5
},
{
"loss": 7.8926,
"grad_norm": 1.3183029890060425,
"learning_rate": 0.0004977684431609346,
"epoch": 1.51
},
{
"loss": 7.8356,
"grad_norm": 1.3786067962646484,
"learning_rate": 0.0004968933228318894,
"epoch": 1.51
},
{
"loss": 7.7605,
"grad_norm": 1.3373888731002808,
"learning_rate": 0.0004960182025028441,
"epoch": 1.51
},
{
"loss": 7.9272,
"grad_norm": 1.5524091720581055,
"learning_rate": 0.0004951430821737989,
"epoch": 1.51
},
{
"loss": 8.1264,
"grad_norm": 0.927689790725708,
"learning_rate": 0.0004942679618447536,
"epoch": 1.52
},
{
"loss": 8.1456,
"grad_norm": 1.4429559707641602,
"learning_rate": 0.0004933928415157084,
"epoch": 1.52
},
{
"loss": 8.5349,
"grad_norm": 1.17830228805542,
"learning_rate": 0.0004925177211866632,
"epoch": 1.52
},
{
"loss": 8.4138,
"grad_norm": 1.7398778200149536,
"learning_rate": 0.0004916426008576179,
"epoch": 1.53
},
{
"loss": 7.6329,
"grad_norm": 1.101945161819458,
"learning_rate": 0.0004907674805285727,
"epoch": 1.53
},
{
"loss": 8.2694,
"grad_norm": 1.2424931526184082,
"learning_rate": 0.0004898923601995274,
"epoch": 1.53
},
{
"loss": 7.2639,
"grad_norm": 0.8726850748062134,
"learning_rate": 0.0004890172398704822,
"epoch": 1.53
},
{
"loss": 7.5542,
"grad_norm": 1.020978331565857,
"learning_rate": 0.0004881421195414369,
"epoch": 1.54
},
{
"loss": 7.3334,
"grad_norm": 1.058136224746704,
"learning_rate": 0.0004872669992123917,
"epoch": 1.54
},
{
"loss": 7.6285,
"grad_norm": 1.7856310606002808,
"learning_rate": 0.00048639187888334644,
"epoch": 1.54
},
{
"loss": 7.8873,
"grad_norm": 1.1540299654006958,
"learning_rate": 0.0004855167585543012,
"epoch": 1.54
},
{
"loss": 7.5676,
"grad_norm": 1.4844547510147095,
"learning_rate": 0.00048464163822525597,
"epoch": 1.55
},
{
"loss": 8.0284,
"grad_norm": 1.1018364429473877,
"learning_rate": 0.00048376651789621073,
"epoch": 1.55
},
{
"loss": 7.8478,
"grad_norm": 1.4421080350875854,
"learning_rate": 0.0004828913975671655,
"epoch": 1.55
},
{
"loss": 8.0614,
"grad_norm": 1.322413444519043,
"learning_rate": 0.00048201627723812025,
"epoch": 1.55
},
{
"loss": 7.9015,
"grad_norm": 1.1930081844329834,
"learning_rate": 0.000481141156909075,
"epoch": 1.56
},
{
"loss": 7.843,
"grad_norm": 1.2846688032150269,
"learning_rate": 0.0004802660365800298,
"epoch": 1.56
},
{
"loss": 7.8268,
"grad_norm": 2.0413529872894287,
"learning_rate": 0.00047939091625098454,
"epoch": 1.56
},
{
"loss": 7.3241,
"grad_norm": 1.058362364768982,
"learning_rate": 0.0004785157959219393,
"epoch": 1.56
},
{
"loss": 7.8329,
"grad_norm": 1.725417971611023,
"learning_rate": 0.00047764067559289406,
"epoch": 1.57
},
{
"loss": 7.6295,
"grad_norm": 1.1373404264450073,
"learning_rate": 0.00047676555526384877,
"epoch": 1.57
},
{
"loss": 7.4763,
"grad_norm": 1.1107378005981445,
"learning_rate": 0.00047589043493480353,
"epoch": 1.57
},
{
"loss": 7.8846,
"grad_norm": 1.2450941801071167,
"learning_rate": 0.0004750153146057583,
"epoch": 1.57
},
{
"loss": 8.4109,
"grad_norm": 1.0643541812896729,
"learning_rate": 0.00047414019427671305,
"epoch": 1.58
},
{
"loss": 7.9126,
"grad_norm": 1.2940372228622437,
"learning_rate": 0.0004732650739476678,
"epoch": 1.58
},
{
"loss": 7.7132,
"grad_norm": 2.6067655086517334,
"learning_rate": 0.0004723899536186226,
"epoch": 1.58
},
{
"loss": 7.5708,
"grad_norm": 0.9783037304878235,
"learning_rate": 0.00047151483328957734,
"epoch": 1.59
},
{
"loss": 7.2771,
"grad_norm": 1.037582278251648,
"learning_rate": 0.0004706397129605321,
"epoch": 1.59
},
{
"loss": 7.7599,
"grad_norm": 1.0178707838058472,
"learning_rate": 0.00046976459263148686,
"epoch": 1.59
},
{
"loss": 7.1538,
"grad_norm": 1.558307409286499,
"learning_rate": 0.0004688894723024416,
"epoch": 1.59
},
{
"loss": 7.5229,
"grad_norm": 1.1060800552368164,
"learning_rate": 0.0004680143519733964,
"epoch": 1.6
},
{
"loss": 7.5813,
"grad_norm": 1.8988709449768066,
"learning_rate": 0.00046713923164435115,
"epoch": 1.6
},
{
"loss": 7.8319,
"grad_norm": 1.6066781282424927,
"learning_rate": 0.00046626411131530586,
"epoch": 1.6
},
{
"loss": 7.8222,
"grad_norm": 1.4711729288101196,
"learning_rate": 0.0004653889909862606,
"epoch": 1.6
},
{
"loss": 7.6115,
"grad_norm": 1.3585811853408813,
"learning_rate": 0.0004645138706572154,
"epoch": 1.61
},
{
"loss": 7.7618,
"grad_norm": 1.1487444639205933,
"learning_rate": 0.00046363875032817014,
"epoch": 1.61
},
{
"loss": 7.868,
"grad_norm": 1.4386248588562012,
"learning_rate": 0.0004627636299991249,
"epoch": 1.61
},
{
"loss": 7.7931,
"grad_norm": 1.0714224576950073,
"learning_rate": 0.00046188850967007967,
"epoch": 1.61
},
{
"loss": 8.2688,
"grad_norm": 1.6375863552093506,
"learning_rate": 0.00046101338934103443,
"epoch": 1.62
},
{
"loss": 7.621,
"grad_norm": 1.024120807647705,
"learning_rate": 0.0004601382690119892,
"epoch": 1.62
},
{
"loss": 8.2226,
"grad_norm": 1.2234493494033813,
"learning_rate": 0.0004592631486829439,
"epoch": 1.62
},
{
"loss": 7.596,
"grad_norm": 1.0593066215515137,
"learning_rate": 0.00045838802835389866,
"epoch": 1.62
},
{
"loss": 7.7407,
"grad_norm": 1.2529680728912354,
"learning_rate": 0.0004575129080248534,
"epoch": 1.63
},
{
"loss": 7.221,
"grad_norm": 1.1312929391860962,
"learning_rate": 0.0004566377876958082,
"epoch": 1.63
},
{
"loss": 7.6136,
"grad_norm": 1.4004312753677368,
"learning_rate": 0.00045576266736676294,
"epoch": 1.63
},
{
"loss": 7.9718,
"grad_norm": 1.4514151811599731,
"learning_rate": 0.00045488754703771765,
"epoch": 1.64
},
{
"loss": 7.3337,
"grad_norm": 1.1595350503921509,
"learning_rate": 0.0004540124267086724,
"epoch": 1.64
},
{
"loss": 7.7414,
"grad_norm": 1.1403205394744873,
"learning_rate": 0.0004531373063796272,
"epoch": 1.64
},
{
"loss": 7.7323,
"grad_norm": 1.677051305770874,
"learning_rate": 0.00045226218605058194,
"epoch": 1.64
},
{
"loss": 8.0048,
"grad_norm": 1.338146686553955,
"learning_rate": 0.0004513870657215367,
"epoch": 1.65
},
{
"loss": 7.9544,
"grad_norm": 1.0941588878631592,
"learning_rate": 0.00045051194539249146,
"epoch": 1.65
},
{
"loss": 8.1043,
"grad_norm": 1.224746584892273,
"learning_rate": 0.0004496368250634462,
"epoch": 1.65
},
{
"loss": 8.0849,
"grad_norm": 1.5772489309310913,
"learning_rate": 0.000448761704734401,
"epoch": 1.65
},
{
"loss": 7.3165,
"grad_norm": 1.4434912204742432,
"learning_rate": 0.00044788658440535575,
"epoch": 1.66
},
{
"loss": 7.8826,
"grad_norm": 0.9971029162406921,
"learning_rate": 0.0004470114640763105,
"epoch": 1.66
},
{
"loss": 7.7822,
"grad_norm": 1.061712384223938,
"learning_rate": 0.00044613634374726527,
"epoch": 1.66
},
{
"loss": 7.8387,
"grad_norm": 1.6292518377304077,
"learning_rate": 0.00044526122341822003,
"epoch": 1.66
},
{
"loss": 7.3463,
"grad_norm": 1.0507898330688477,
"learning_rate": 0.00044438610308917474,
"epoch": 1.67
},
{
"loss": 7.693,
"grad_norm": 1.332474708557129,
"learning_rate": 0.0004435109827601295,
"epoch": 1.67
},
{
"loss": 7.4542,
"grad_norm": 1.3393101692199707,
"learning_rate": 0.00044263586243108426,
"epoch": 1.67
},
{
"loss": 7.4236,
"grad_norm": 1.4949504137039185,
"learning_rate": 0.000441760742102039,
"epoch": 1.67
},
{
"loss": 7.4087,
"grad_norm": 1.3824454545974731,
"learning_rate": 0.0004408856217729938,
"epoch": 1.68
},
{
"loss": 7.3,
"grad_norm": 1.3991942405700684,
"learning_rate": 0.00044001050144394855,
"epoch": 1.68
},
{
"loss": 7.1648,
"grad_norm": 1.3270092010498047,
"learning_rate": 0.0004391353811149033,
"epoch": 1.68
},
{
"loss": 7.753,
"grad_norm": 1.1912864446640015,
"learning_rate": 0.00043826026078585807,
"epoch": 1.69
},
{
"loss": 7.6531,
"grad_norm": 1.2112165689468384,
"learning_rate": 0.00043738514045681283,
"epoch": 1.69
},
{
"loss": 8.0168,
"grad_norm": 1.0204828977584839,
"learning_rate": 0.0004365100201277676,
"epoch": 1.69
},
{
"loss": 7.8334,
"grad_norm": 1.8065035343170166,
"learning_rate": 0.00043563489979872236,
"epoch": 1.69
},
{
"loss": 7.9395,
"grad_norm": 1.1826367378234863,
"learning_rate": 0.0004347597794696771,
"epoch": 1.7
},
{
"loss": 8.0071,
"grad_norm": 0.9689782857894897,
"learning_rate": 0.00043388465914063183,
"epoch": 1.7
},
{
"loss": 7.5284,
"grad_norm": 0.9889323115348816,
"learning_rate": 0.0004330095388115866,
"epoch": 1.7
},
{
"loss": 7.7318,
"grad_norm": 1.4257516860961914,
"learning_rate": 0.00043213441848254135,
"epoch": 1.7
},
{
"loss": 7.6343,
"grad_norm": 1.623134970664978,
"learning_rate": 0.0004312592981534961,
"epoch": 1.71
},
{
"loss": 7.9645,
"grad_norm": 1.2686361074447632,
"learning_rate": 0.0004303841778244509,
"epoch": 1.71
},
{
"loss": 7.5339,
"grad_norm": 1.5115247964859009,
"learning_rate": 0.00042950905749540564,
"epoch": 1.71
},
{
"loss": 7.7401,
"grad_norm": 1.285506010055542,
"learning_rate": 0.0004286339371663604,
"epoch": 1.71
},
{
"loss": 7.6018,
"grad_norm": 1.4150651693344116,
"learning_rate": 0.00042775881683731516,
"epoch": 1.72
},
{
"loss": 7.4015,
"grad_norm": 1.485231637954712,
"learning_rate": 0.0004268836965082699,
"epoch": 1.72
},
{
"loss": 8.4429,
"grad_norm": 2.1629021167755127,
"learning_rate": 0.0004260085761792247,
"epoch": 1.72
},
{
"loss": 7.8298,
"grad_norm": 1.1586624383926392,
"learning_rate": 0.00042513345585017945,
"epoch": 1.72
},
{
"loss": 7.8121,
"grad_norm": 1.0134670734405518,
"learning_rate": 0.0004242583355211342,
"epoch": 1.73
},
{
"loss": 7.8337,
"grad_norm": 1.257633090019226,
"learning_rate": 0.00042338321519208897,
"epoch": 1.73
},
{
"loss": 7.6701,
"grad_norm": 1.212266445159912,
"learning_rate": 0.0004225080948630437,
"epoch": 1.73
},
{
"loss": 7.706,
"grad_norm": 1.2191237211227417,
"learning_rate": 0.00042163297453399844,
"epoch": 1.74
},
{
"loss": 7.4639,
"grad_norm": 1.476140022277832,
"learning_rate": 0.0004207578542049532,
"epoch": 1.74
},
{
"loss": 7.8126,
"grad_norm": 1.0655369758605957,
"learning_rate": 0.0004198827338759079,
"epoch": 1.74
},
{
"loss": 7.4091,
"grad_norm": 1.3340696096420288,
"learning_rate": 0.00041900761354686267,
"epoch": 1.74
},
{
"loss": 7.5701,
"grad_norm": 1.3290128707885742,
"learning_rate": 0.00041813249321781743,
"epoch": 1.75
},
{
"loss": 7.5513,
"grad_norm": 1.1993497610092163,
"learning_rate": 0.0004172573728887722,
"epoch": 1.75
},
{
"loss": 7.5115,
"grad_norm": 0.9953559041023254,
"learning_rate": 0.00041638225255972696,
"epoch": 1.75
},
{
"loss": 8.0513,
"grad_norm": 1.1929738521575928,
"learning_rate": 0.0004155071322306817,
"epoch": 1.75
},
{
"loss": 7.4431,
"grad_norm": 1.0211223363876343,
"learning_rate": 0.0004146320119016365,
"epoch": 1.76
},
{
"loss": 7.4024,
"grad_norm": 1.0484708547592163,
"learning_rate": 0.00041375689157259124,
"epoch": 1.76
},
{
"loss": 7.4321,
"grad_norm": 1.2012499570846558,
"learning_rate": 0.000412881771243546,
"epoch": 1.76
},
{
"loss": 7.2608,
"grad_norm": 0.9850478768348694,
"learning_rate": 0.0004120066509145007,
"epoch": 1.76
},
{
"loss": 7.4744,
"grad_norm": 1.1142171621322632,
"learning_rate": 0.00041113153058545547,
"epoch": 1.77
},
{
"loss": 7.8258,
"grad_norm": 1.0107368230819702,
"learning_rate": 0.00041025641025641023,
"epoch": 1.77
},
{
"loss": 8.0338,
"grad_norm": 1.3827756643295288,
"learning_rate": 0.000409381289927365,
"epoch": 1.77
},
{
"loss": 7.2029,
"grad_norm": 1.056078553199768,
"learning_rate": 0.00040850616959831976,
"epoch": 1.77
},
{
"loss": 7.9763,
"grad_norm": 1.3796826601028442,
"learning_rate": 0.0004076310492692745,
"epoch": 1.78
},
{
"loss": 7.39,
"grad_norm": 1.5586506128311157,
"learning_rate": 0.0004067559289402293,
"epoch": 1.78
},
{
"loss": 7.7479,
"grad_norm": 1.3467471599578857,
"learning_rate": 0.00040588080861118404,
"epoch": 1.78
},
{
"loss": 7.5281,
"grad_norm": 1.5824648141860962,
"learning_rate": 0.0004050056882821388,
"epoch": 1.78
},
{
"loss": 7.4095,
"grad_norm": 1.5600448846817017,
"learning_rate": 0.00040413056795309357,
"epoch": 1.79
},
{
"loss": 7.6296,
"grad_norm": 1.4003773927688599,
"learning_rate": 0.00040325544762404833,
"epoch": 1.79
},
{
"loss": 7.2299,
"grad_norm": 1.1784484386444092,
"learning_rate": 0.0004023803272950031,
"epoch": 1.79
},
{
"loss": 7.2215,
"grad_norm": 1.0865730047225952,
"learning_rate": 0.0004015052069659578,
"epoch": 1.8
},
{
"loss": 7.619,
"grad_norm": 1.3708497285842896,
"learning_rate": 0.00040063008663691256,
"epoch": 1.8
},
{
"loss": 7.6305,
"grad_norm": 1.3728278875350952,
"learning_rate": 0.0003997549663078673,
"epoch": 1.8
},
{
"loss": 7.4218,
"grad_norm": 1.385901689529419,
"learning_rate": 0.0003988798459788221,
"epoch": 1.8
},
{
"loss": 7.7959,
"grad_norm": 1.5370672941207886,
"learning_rate": 0.00039800472564977685,
"epoch": 1.81
},
{
"loss": 7.8249,
"grad_norm": 1.039469838142395,
"learning_rate": 0.0003971296053207316,
"epoch": 1.81
},
{
"loss": 7.4311,
"grad_norm": 1.4947952032089233,
"learning_rate": 0.00039625448499168637,
"epoch": 1.81
},
{
"loss": 7.7797,
"grad_norm": 1.2262136936187744,
"learning_rate": 0.00039546687669554567,
"epoch": 1.81
},
{
"loss": 7.8844,
"grad_norm": 1.5757509469985962,
"learning_rate": 0.00039459175636650043,
"epoch": 1.82
},
{
"loss": 7.8737,
"grad_norm": 1.2183258533477783,
"learning_rate": 0.0003937166360374552,
"epoch": 1.82
},
{
"loss": 7.3515,
"grad_norm": 1.3697617053985596,
"learning_rate": 0.00039284151570840995,
"epoch": 1.82
},
{
"loss": 7.1169,
"grad_norm": 1.3007692098617554,
"learning_rate": 0.00039196639537936466,
"epoch": 1.82
},
{
"loss": 7.2926,
"grad_norm": 1.3538720607757568,
"learning_rate": 0.0003910912750503194,
"epoch": 1.83
},
{
"loss": 7.8445,
"grad_norm": 1.4245976209640503,
"learning_rate": 0.0003902161547212742,
"epoch": 1.83
},
{
"loss": 7.456,
"grad_norm": 1.323899269104004,
"learning_rate": 0.00038934103439222894,
"epoch": 1.83
},
{
"loss": 7.6163,
"grad_norm": 1.2635420560836792,
"learning_rate": 0.0003884659140631837,
"epoch": 1.83
},
{
"loss": 7.5885,
"grad_norm": 1.4714936017990112,
"learning_rate": 0.00038759079373413847,
"epoch": 1.84
},
{
"loss": 7.8382,
"grad_norm": 1.1696442365646362,
"learning_rate": 0.00038671567340509323,
"epoch": 1.84
},
{
"loss": 7.4885,
"grad_norm": 1.3797491788864136,
"learning_rate": 0.000385840553076048,
"epoch": 1.84
},
{
"loss": 7.4614,
"grad_norm": 1.0410481691360474,
"learning_rate": 0.00038496543274700275,
"epoch": 1.85
},
{
"loss": 6.9584,
"grad_norm": 1.7356559038162231,
"learning_rate": 0.0003840903124179575,
"epoch": 1.85
},
{
"loss": 7.8161,
"grad_norm": 1.326489806175232,
"learning_rate": 0.0003832151920889123,
"epoch": 1.85
},
{
"loss": 7.6985,
"grad_norm": 1.3822075128555298,
"learning_rate": 0.00038234007175986704,
"epoch": 1.85
},
{
"loss": 7.9532,
"grad_norm": 1.2612171173095703,
"learning_rate": 0.00038146495143082175,
"epoch": 1.86
},
{
"loss": 7.3309,
"grad_norm": 1.8743207454681396,
"learning_rate": 0.0003805898311017765,
"epoch": 1.86
},
{
"loss": 7.8573,
"grad_norm": 1.515641212463379,
"learning_rate": 0.0003797147107727312,
"epoch": 1.86
},
{
"loss": 8.0815,
"grad_norm": 1.970818281173706,
"learning_rate": 0.000378839590443686,
"epoch": 1.86
},
{
"loss": 7.7197,
"grad_norm": 1.6418136358261108,
"learning_rate": 0.00037796447011464074,
"epoch": 1.87
},
{
"loss": 7.6527,
"grad_norm": 1.3693944215774536,
"learning_rate": 0.0003770893497855955,
"epoch": 1.87
},
{
"loss": 7.7717,
"grad_norm": 1.311493992805481,
"learning_rate": 0.00037621422945655026,
"epoch": 1.87
},
{
"loss": 8.0735,
"grad_norm": 1.593992829322815,
"learning_rate": 0.000375339109127505,
"epoch": 1.87
},
{
"loss": 7.4285,
"grad_norm": 1.212729573249817,
"learning_rate": 0.0003744639887984598,
"epoch": 1.88
},
{
"loss": 7.7873,
"grad_norm": 1.1326895952224731,
"learning_rate": 0.00037358886846941455,
"epoch": 1.88
},
{
"loss": 7.3515,
"grad_norm": 1.3937299251556396,
"learning_rate": 0.0003727137481403693,
"epoch": 1.88
},
{
"loss": 7.353,
"grad_norm": 1.5152568817138672,
"learning_rate": 0.00037183862781132407,
"epoch": 1.88
},
{
"loss": 7.8015,
"grad_norm": 1.207973599433899,
"learning_rate": 0.0003709635074822788,
"epoch": 1.89
},
{
"loss": 7.6713,
"grad_norm": 1.003139615058899,
"learning_rate": 0.00037008838715323354,
"epoch": 1.89
},
{
"loss": 7.9247,
"grad_norm": 1.1870025396347046,
"learning_rate": 0.0003692132668241883,
"epoch": 1.89
},
{
"loss": 7.4496,
"grad_norm": 1.237275242805481,
"learning_rate": 0.00036833814649514307,
"epoch": 1.9
},
{
"loss": 7.2638,
"grad_norm": 1.7287304401397705,
"learning_rate": 0.00036746302616609783,
"epoch": 1.9
},
{
"loss": 7.7464,
"grad_norm": 1.5875813961029053,
"learning_rate": 0.0003665879058370526,
"epoch": 1.9
},
{
"loss": 7.683,
"grad_norm": 1.7219480276107788,
"learning_rate": 0.00036571278550800735,
"epoch": 1.9
},
{
"loss": 7.6059,
"grad_norm": 1.3815206289291382,
"learning_rate": 0.0003648376651789621,
"epoch": 1.91
},
{
"loss": 7.3258,
"grad_norm": 1.1902978420257568,
"learning_rate": 0.0003639625448499169,
"epoch": 1.91
},
{
"loss": 7.4436,
"grad_norm": 1.6532816886901855,
"learning_rate": 0.00036308742452087164,
"epoch": 1.91
},
{
"loss": 7.438,
"grad_norm": 1.1358212232589722,
"learning_rate": 0.0003622123041918264,
"epoch": 1.91
},
{
"loss": 7.9777,
"grad_norm": 1.3459230661392212,
"learning_rate": 0.00036133718386278116,
"epoch": 1.92
},
{
"loss": 7.9087,
"grad_norm": 1.0352368354797363,
"learning_rate": 0.0003604620635337359,
"epoch": 1.92
},
{
"loss": 7.5855,
"grad_norm": 1.2582918405532837,
"learning_rate": 0.00035958694320469063,
"epoch": 1.92
},
{
"loss": 7.4576,
"grad_norm": 1.1787996292114258,
"learning_rate": 0.0003587118228756454,
"epoch": 1.92
},
{
"loss": 7.2572,
"grad_norm": 1.2917609214782715,
"learning_rate": 0.00035783670254660015,
"epoch": 1.93
},
{
"loss": 7.6433,
"grad_norm": 1.1689330339431763,
"learning_rate": 0.0003569615822175549,
"epoch": 1.93
},
{
"loss": 7.6579,
"grad_norm": 1.2844352722167969,
"learning_rate": 0.0003560864618885097,
"epoch": 1.93
},
{
"loss": 7.5178,
"grad_norm": 1.498838186264038,
"learning_rate": 0.00035521134155946444,
"epoch": 1.93
},
{
"loss": 7.0155,
"grad_norm": 1.3718552589416504,
"learning_rate": 0.0003543362212304192,
"epoch": 1.94
},
{
"loss": 7.7558,
"grad_norm": 1.2343835830688477,
"learning_rate": 0.00035346110090137396,
"epoch": 1.94
},
{
"loss": 7.4386,
"grad_norm": 1.307979702949524,
"learning_rate": 0.0003525859805723287,
"epoch": 1.94
},
{
"loss": 7.4287,
"grad_norm": 1.46335768699646,
"learning_rate": 0.0003517108602432835,
"epoch": 1.95
},
{
"loss": 7.0541,
"grad_norm": 1.4892301559448242,
"learning_rate": 0.00035083573991423825,
"epoch": 1.95
},
{
"loss": 7.6458,
"grad_norm": 1.3297821283340454,
"learning_rate": 0.000349960619585193,
"epoch": 1.95
},
{
"loss": 7.3704,
"grad_norm": 1.9190036058425903,
"learning_rate": 0.0003490854992561477,
"epoch": 1.95
},
{
"loss": 7.9292,
"grad_norm": 1.1013009548187256,
"learning_rate": 0.0003482103789271025,
"epoch": 1.96
},
{
"loss": 7.8039,
"grad_norm": 1.284121036529541,
"learning_rate": 0.00034733525859805724,
"epoch": 1.96
},
{
"loss": 7.7188,
"grad_norm": 1.118995189666748,
"learning_rate": 0.000346460138269012,
"epoch": 1.96
},
{
"loss": 7.3617,
"grad_norm": 1.5446746349334717,
"learning_rate": 0.00034558501793996676,
"epoch": 1.96
},
{
"loss": 7.5614,
"grad_norm": 1.254835844039917,
"learning_rate": 0.0003447098976109215,
"epoch": 1.97
},
{
"loss": 7.9923,
"grad_norm": 2.215224266052246,
"learning_rate": 0.0003438347772818763,
"epoch": 1.97
},
{
"loss": 7.6609,
"grad_norm": 1.2917975187301636,
"learning_rate": 0.00034295965695283105,
"epoch": 1.97
},
{
"loss": 6.9695,
"grad_norm": 1.3251945972442627,
"learning_rate": 0.0003420845366237858,
"epoch": 1.97
},
{
"loss": 7.4109,
"grad_norm": 1.5397628545761108,
"learning_rate": 0.0003412094162947406,
"epoch": 1.98
},
{
"loss": 7.3063,
"grad_norm": 1.1789202690124512,
"learning_rate": 0.00034033429596569534,
"epoch": 1.98
},
{
"loss": 7.8137,
"grad_norm": 1.6068191528320312,
"learning_rate": 0.00033945917563665004,
"epoch": 1.98
},
{
"loss": 7.5466,
"grad_norm": 1.2397950887680054,
"learning_rate": 0.00033858405530760475,
"epoch": 1.98
},
{
"loss": 7.9522,
"grad_norm": 1.5175119638442993,
"learning_rate": 0.0003377089349785595,
"epoch": 1.99
},
{
"loss": 7.6781,
"grad_norm": 1.315258502960205,
"learning_rate": 0.0003368338146495143,
"epoch": 1.99
},
{
"loss": 7.7292,
"grad_norm": 2.664515256881714,
"learning_rate": 0.00033595869432046904,
"epoch": 1.99
},
{
"loss": 8.1965,
"grad_norm": 1.405129313468933,
"learning_rate": 0.0003350835739914238,
"epoch": 1.99
},
{
"loss": 7.4133,
"grad_norm": 1.0774602890014648,
"learning_rate": 0.00033420845366237856,
"epoch": 2.0
},
{
"loss": 8.1777,
"grad_norm": 1.75553560256958,
"learning_rate": 0.0003333333333333333,
"epoch": 2.0
},
{
"loss": 7.5693,
"grad_norm": 1.857081651687622,
"learning_rate": 0.0003324582130042881,
"epoch": 2.0
},
{
"loss": 7.4888,
"grad_norm": 1.0721529722213745,
"learning_rate": 0.00033158309267524285,
"epoch": 2.01
},
{
"loss": 7.1311,
"grad_norm": 1.0766797065734863,
"learning_rate": 0.0003307079723461976,
"epoch": 2.01
},
{
"loss": 7.5107,
"grad_norm": 1.4615150690078735,
"learning_rate": 0.00032983285201715237,
"epoch": 2.01
},
{
"loss": 7.5258,
"grad_norm": 1.4252068996429443,
"learning_rate": 0.00032895773168810713,
"epoch": 2.01
},
{
"loss": 7.6049,
"grad_norm": 1.2926585674285889,
"learning_rate": 0.0003280826113590619,
"epoch": 2.02
},
{
"loss": 7.2436,
"grad_norm": 1.6630724668502808,
"learning_rate": 0.0003272074910300166,
"epoch": 2.02
},
{
"loss": 6.951,
"grad_norm": 1.2705895900726318,
"learning_rate": 0.00032633237070097136,
"epoch": 2.02
},
{
"loss": 7.4782,
"grad_norm": 1.6801918745040894,
"learning_rate": 0.0003254572503719261,
"epoch": 2.02
},
{
"loss": 7.7247,
"grad_norm": 1.2789455652236938,
"learning_rate": 0.0003245821300428809,
"epoch": 2.03
},
{
"loss": 7.65,
"grad_norm": 1.0772324800491333,
"learning_rate": 0.00032370700971383565,
"epoch": 2.03
},
{
"loss": 7.3484,
"grad_norm": 1.218855857849121,
"learning_rate": 0.0003228318893847904,
"epoch": 2.03
},
{
"loss": 7.7201,
"grad_norm": 1.7484831809997559,
"learning_rate": 0.00032195676905574517,
"epoch": 2.03
},
{
"loss": 7.606,
"grad_norm": 1.4081809520721436,
"learning_rate": 0.00032108164872669993,
"epoch": 2.04
},
{
"loss": 7.4735,
"grad_norm": 1.2214211225509644,
"learning_rate": 0.0003202065283976547,
"epoch": 2.04
},
{
"loss": 7.3052,
"grad_norm": 2.243197441101074,
"learning_rate": 0.00031933140806860946,
"epoch": 2.04
},
{
"loss": 7.2611,
"grad_norm": 1.0560696125030518,
"learning_rate": 0.0003184562877395642,
"epoch": 2.04
},
{
"loss": 7.3347,
"grad_norm": 1.3903985023498535,
"learning_rate": 0.000317581167410519,
"epoch": 2.05
},
{
"loss": 7.4106,
"grad_norm": 1.285888910293579,
"learning_rate": 0.0003167060470814737,
"epoch": 2.05
},
{
"loss": 7.3237,
"grad_norm": 1.6455745697021484,
"learning_rate": 0.00031583092675242845,
"epoch": 2.05
},
{
"loss": 7.4445,
"grad_norm": 1.3552714586257935,
"learning_rate": 0.0003149558064233832,
"epoch": 2.06
},
{
"loss": 7.3175,
"grad_norm": 1.4250375032424927,
"learning_rate": 0.000314080686094338,
"epoch": 2.06
},
{
"loss": 7.5334,
"grad_norm": 1.8445017337799072,
"learning_rate": 0.00031320556576529274,
"epoch": 2.06
},
{
"loss": 7.7627,
"grad_norm": 1.1116868257522583,
"learning_rate": 0.0003123304454362475,
"epoch": 2.06
},
{
"loss": 7.5347,
"grad_norm": 1.1636768579483032,
"learning_rate": 0.00031145532510720226,
"epoch": 2.07
},
{
"loss": 7.6581,
"grad_norm": 1.4612860679626465,
"learning_rate": 0.000310580204778157,
"epoch": 2.07
},
{
"loss": 7.6164,
"grad_norm": 1.4403191804885864,
"learning_rate": 0.0003097050844491118,
"epoch": 2.07
},
{
"loss": 7.3776,
"grad_norm": 1.366955041885376,
"learning_rate": 0.00030882996412006655,
"epoch": 2.07
},
{
"loss": 7.556,
"grad_norm": 1.4476971626281738,
"learning_rate": 0.0003079548437910213,
"epoch": 2.08
},
{
"loss": 7.6019,
"grad_norm": 1.4753084182739258,
"learning_rate": 0.00030707972346197607,
"epoch": 2.08
},
{
"loss": 7.8493,
"grad_norm": 1.2335758209228516,
"learning_rate": 0.00030620460313293083,
"epoch": 2.08
},
{
"loss": 7.9252,
"grad_norm": 1.3958989381790161,
"learning_rate": 0.00030532948280388554,
"epoch": 2.08
},
{
"loss": 7.2945,
"grad_norm": 1.4621672630310059,
"learning_rate": 0.0003044543624748403,
"epoch": 2.09
},
{
"loss": 7.3977,
"grad_norm": 1.428195834159851,
"learning_rate": 0.00030357924214579506,
"epoch": 2.09
},
{
"loss": 7.74,
"grad_norm": 1.363600492477417,
"learning_rate": 0.0003027041218167498,
"epoch": 2.09
},
{
"loss": 7.4894,
"grad_norm": 1.2117736339569092,
"learning_rate": 0.0003018290014877046,
"epoch": 2.09
},
{
"loss": 7.5678,
"grad_norm": 1.9844530820846558,
"learning_rate": 0.00030095388115865935,
"epoch": 2.1
},
{
"loss": 7.6681,
"grad_norm": 1.3558523654937744,
"learning_rate": 0.0003000787608296141,
"epoch": 2.1
},
{
"loss": 7.9793,
"grad_norm": 1.3802049160003662,
"learning_rate": 0.0002992036405005688,
"epoch": 2.1
},
{
"loss": 8.1848,
"grad_norm": 1.845702886581421,
"learning_rate": 0.0002983285201715236,
"epoch": 2.11
},
{
"loss": 7.2184,
"grad_norm": 1.4479707479476929,
"learning_rate": 0.00029745339984247834,
"epoch": 2.11
},
{
"loss": 7.4373,
"grad_norm": 1.9233028888702393,
"learning_rate": 0.0002965782795134331,
"epoch": 2.11
},
{
"loss": 7.2478,
"grad_norm": 1.3621513843536377,
"learning_rate": 0.00029570315918438786,
"epoch": 2.11
},
{
"loss": 7.5867,
"grad_norm": 1.449763298034668,
"learning_rate": 0.00029482803885534257,
"epoch": 2.12
},
{
"loss": 7.2909,
"grad_norm": 1.543834924697876,
"learning_rate": 0.00029395291852629733,
"epoch": 2.12
},
{
"loss": 7.5481,
"grad_norm": 1.2582162618637085,
"learning_rate": 0.0002930777981972521,
"epoch": 2.12
},
{
"loss": 7.2092,
"grad_norm": 1.25532865524292,
"learning_rate": 0.00029220267786820686,
"epoch": 2.12
},
{
"loss": 7.5117,
"grad_norm": 1.4368300437927246,
"learning_rate": 0.0002913275575391616,
"epoch": 2.13
},
{
"loss": 7.8661,
"grad_norm": 1.4054632186889648,
"learning_rate": 0.0002904524372101164,
"epoch": 2.13
},
{
"loss": 7.7641,
"grad_norm": 1.4426825046539307,
"learning_rate": 0.00028957731688107114,
"epoch": 2.13
},
{
"loss": 6.9808,
"grad_norm": 1.6069836616516113,
"learning_rate": 0.0002887021965520259,
"epoch": 2.13
},
{
"loss": 8.0412,
"grad_norm": 1.603289246559143,
"learning_rate": 0.00028782707622298067,
"epoch": 2.14
},
{
"loss": 7.7541,
"grad_norm": 1.2069703340530396,
"learning_rate": 0.00028695195589393543,
"epoch": 2.14
},
{
"loss": 7.5413,
"grad_norm": 1.2976186275482178,
"learning_rate": 0.0002860768355648902,
"epoch": 2.14
},
{
"loss": 7.6833,
"grad_norm": 1.4646226167678833,
"learning_rate": 0.00028520171523584495,
"epoch": 2.14
},
{
"loss": 7.3603,
"grad_norm": 1.3783011436462402,
"learning_rate": 0.00028432659490679966,
"epoch": 2.15
},
{
"loss": 7.1131,
"grad_norm": 1.1677837371826172,
"learning_rate": 0.0002834514745777544,
"epoch": 2.15
},
{
"loss": 7.8353,
"grad_norm": 1.5966696739196777,
"learning_rate": 0.0002825763542487092,
"epoch": 2.15
},
{
"loss": 7.651,
"grad_norm": 1.3074275255203247,
"learning_rate": 0.00028170123391966394,
"epoch": 2.16
},
{
"loss": 6.8535,
"grad_norm": 1.2238943576812744,
"learning_rate": 0.0002808261135906187,
"epoch": 2.16
},
{
"loss": 7.1677,
"grad_norm": 1.2107079029083252,
"learning_rate": 0.00027995099326157347,
"epoch": 2.16
},
{
"loss": 7.1232,
"grad_norm": 1.482686996459961,
"learning_rate": 0.00027907587293252823,
"epoch": 2.16
},
{
"loss": 7.6958,
"grad_norm": 1.9235337972640991,
"learning_rate": 0.000278200752603483,
"epoch": 2.17
},
{
"loss": 7.5763,
"grad_norm": 1.0629470348358154,
"learning_rate": 0.00027732563227443775,
"epoch": 2.17
},
{
"loss": 7.417,
"grad_norm": 1.4404977560043335,
"learning_rate": 0.0002764505119453925,
"epoch": 2.17
},
{
"loss": 7.4457,
"grad_norm": 1.6266590356826782,
"learning_rate": 0.0002755753916163473,
"epoch": 2.17
},
{
"loss": 7.6768,
"grad_norm": 1.4418647289276123,
"learning_rate": 0.00027470027128730204,
"epoch": 2.18
},
{
"loss": 7.7301,
"grad_norm": 1.7269823551177979,
"learning_rate": 0.0002738251509582568,
"epoch": 2.18
},
{
"loss": 7.1704,
"grad_norm": 1.9527968168258667,
"learning_rate": 0.0002729500306292115,
"epoch": 2.18
},
{
"loss": 8.0284,
"grad_norm": 1.1195765733718872,
"learning_rate": 0.00027207491030016627,
"epoch": 2.18
},
{
"loss": 7.876,
"grad_norm": 1.381032109260559,
"learning_rate": 0.00027119978997112103,
"epoch": 2.19
},
{
"loss": 7.4609,
"grad_norm": 2.2558112144470215,
"learning_rate": 0.0002703246696420758,
"epoch": 2.19
},
{
"loss": 7.524,
"grad_norm": 1.0892398357391357,
"learning_rate": 0.00026944954931303056,
"epoch": 2.19
},
{
"loss": 7.1756,
"grad_norm": 1.432793140411377,
"learning_rate": 0.0002685744289839853,
"epoch": 2.19
},
{
"loss": 7.4677,
"grad_norm": 2.4381473064422607,
"learning_rate": 0.0002676993086549401,
"epoch": 2.2
},
{
"loss": 7.2004,
"grad_norm": 1.0947704315185547,
"learning_rate": 0.00026682418832589484,
"epoch": 2.2
},
{
"loss": 7.6084,
"grad_norm": 1.1396403312683105,
"learning_rate": 0.0002659490679968496,
"epoch": 2.2
},
{
"loss": 7.4592,
"grad_norm": 1.7132469415664673,
"learning_rate": 0.00026507394766780437,
"epoch": 2.2
},
{
"loss": 7.6666,
"grad_norm": 1.507416844367981,
"learning_rate": 0.00026419882733875913,
"epoch": 2.21
},
{
"loss": 7.9483,
"grad_norm": 1.997502326965332,
"learning_rate": 0.0002633237070097139,
"epoch": 2.21
},
{
"loss": 6.8979,
"grad_norm": 1.180274486541748,
"learning_rate": 0.0002624485866806686,
"epoch": 2.21
},
{
"loss": 7.5387,
"grad_norm": 1.4130629301071167,
"learning_rate": 0.00026157346635162336,
"epoch": 2.22
},
{
"loss": 7.7374,
"grad_norm": 1.9466407299041748,
"learning_rate": 0.0002606983460225781,
"epoch": 2.22
},
{
"loss": 7.2489,
"grad_norm": 1.2844946384429932,
"learning_rate": 0.00025982322569353283,
"epoch": 2.22
},
{
"loss": 7.2583,
"grad_norm": 1.4728493690490723,
"learning_rate": 0.0002589481053644876,
"epoch": 2.22
},
{
"loss": 7.1689,
"grad_norm": 1.505767583847046,
"learning_rate": 0.00025807298503544235,
"epoch": 2.23
},
{
"loss": 7.3824,
"grad_norm": 1.164609432220459,
"learning_rate": 0.0002571978647063971,
"epoch": 2.23
},
{
"loss": 8.208,
"grad_norm": 1.3337666988372803,
"learning_rate": 0.0002563227443773519,
"epoch": 2.23
},
{
"loss": 7.1503,
"grad_norm": 1.2840052843093872,
"learning_rate": 0.00025544762404830664,
"epoch": 2.23
},
{
"loss": 7.7838,
"grad_norm": 1.6767994165420532,
"learning_rate": 0.0002545725037192614,
"epoch": 2.24
},
{
"loss": 7.4818,
"grad_norm": 1.2790688276290894,
"learning_rate": 0.00025369738339021616,
"epoch": 2.24
},
{
"loss": 7.1404,
"grad_norm": 1.9306037425994873,
"learning_rate": 0.0002528222630611709,
"epoch": 2.24
},
{
"loss": 6.9151,
"grad_norm": 1.0568101406097412,
"learning_rate": 0.00025194714273212563,
"epoch": 2.24
},
{
"loss": 7.5813,
"grad_norm": 1.8494940996170044,
"learning_rate": 0.0002510720224030804,
"epoch": 2.25
},
{
"loss": 7.1433,
"grad_norm": 1.2321641445159912,
"learning_rate": 0.00025019690207403515,
"epoch": 2.25
},
{
"loss": 7.0211,
"grad_norm": 1.5231260061264038,
"learning_rate": 0.0002493217817449899,
"epoch": 2.25
},
{
"loss": 7.5108,
"grad_norm": 1.6787548065185547,
"learning_rate": 0.0002484466614159447,
"epoch": 2.25
},
{
"loss": 7.5859,
"grad_norm": 1.8862128257751465,
"learning_rate": 0.00024757154108689944,
"epoch": 2.26
},
{
"loss": 7.0871,
"grad_norm": 1.5295615196228027,
"learning_rate": 0.0002466964207578542,
"epoch": 2.26
},
{
"loss": 7.2151,
"grad_norm": 1.6439179182052612,
"learning_rate": 0.00024582130042880896,
"epoch": 2.26
},
{
"loss": 7.851,
"grad_norm": 1.5902001857757568,
"learning_rate": 0.0002449461800997637,
"epoch": 2.27
},
{
"loss": 7.695,
"grad_norm": 1.447240948677063,
"learning_rate": 0.00024407105977071846,
"epoch": 2.27
},
{
"loss": 7.218,
"grad_norm": 1.7448298931121826,
"learning_rate": 0.00024319593944167322,
"epoch": 2.27
},
{
"loss": 7.4559,
"grad_norm": 1.7815390825271606,
"learning_rate": 0.00024232081911262798,
"epoch": 2.27
},
{
"loss": 7.5519,
"grad_norm": 1.746805191040039,
"learning_rate": 0.00024144569878358275,
"epoch": 2.28
},
{
"loss": 7.4818,
"grad_norm": 1.771155834197998,
"learning_rate": 0.0002405705784545375,
"epoch": 2.28
},
{
"loss": 7.8775,
"grad_norm": 1.2886364459991455,
"learning_rate": 0.00023969545812549227,
"epoch": 2.28
},
{
"loss": 7.0862,
"grad_norm": 1.3562748432159424,
"learning_rate": 0.00023882033779644703,
"epoch": 2.28
},
{
"loss": 7.4458,
"grad_norm": 1.5549288988113403,
"learning_rate": 0.00023794521746740177,
"epoch": 2.29
},
{
"loss": 7.5017,
"grad_norm": 1.3231199979782104,
"learning_rate": 0.00023707009713835653,
"epoch": 2.29
},
{
"loss": 6.9317,
"grad_norm": 1.0973995923995972,
"learning_rate": 0.0002361949768093113,
"epoch": 2.29
},
{
"loss": 7.2512,
"grad_norm": 1.161665916442871,
"learning_rate": 0.00023531985648026605,
"epoch": 2.29
},
{
"loss": 7.3376,
"grad_norm": 1.1249802112579346,
"learning_rate": 0.0002344447361512208,
"epoch": 2.3
},
{
"loss": 7.6856,
"grad_norm": 1.4549752473831177,
"learning_rate": 0.00023356961582217557,
"epoch": 2.3
},
{
"loss": 7.6518,
"grad_norm": 1.2443310022354126,
"learning_rate": 0.0002326944954931303,
"epoch": 2.3
},
{
"loss": 7.9287,
"grad_norm": 1.2414274215698242,
"learning_rate": 0.00023181937516408507,
"epoch": 2.3
},
{
"loss": 7.4844,
"grad_norm": 1.250632882118225,
"learning_rate": 0.00023094425483503983,
"epoch": 2.31
},
{
"loss": 6.9439,
"grad_norm": 1.5678353309631348,
"learning_rate": 0.0002300691345059946,
"epoch": 2.31
},
{
"loss": 7.2214,
"grad_norm": 1.2777363061904907,
"learning_rate": 0.00022919401417694933,
"epoch": 2.31
},
{
"loss": 7.6909,
"grad_norm": 1.1702243089675903,
"learning_rate": 0.0002283188938479041,
"epoch": 2.32
},
{
"loss": 7.843,
"grad_norm": 1.1647387742996216,
"learning_rate": 0.00022744377351885883,
"epoch": 2.32
},
{
"loss": 7.5598,
"grad_norm": 1.5888360738754272,
"learning_rate": 0.0002265686531898136,
"epoch": 2.32
},
{
"loss": 7.4084,
"grad_norm": 1.2132010459899902,
"learning_rate": 0.00022569353286076835,
"epoch": 2.32
},
{
"loss": 8.0077,
"grad_norm": 1.3676106929779053,
"learning_rate": 0.0002248184125317231,
"epoch": 2.33
},
{
"loss": 7.4475,
"grad_norm": 1.4785172939300537,
"learning_rate": 0.00022394329220267787,
"epoch": 2.33
},
{
"loss": 7.4934,
"grad_norm": 1.6854803562164307,
"learning_rate": 0.00022306817187363264,
"epoch": 2.33
},
{
"loss": 7.5371,
"grad_norm": 1.3336540460586548,
"learning_rate": 0.00022219305154458737,
"epoch": 2.33
},
{
"loss": 7.091,
"grad_norm": 1.5374839305877686,
"learning_rate": 0.00022131793121554213,
"epoch": 2.34
},
{
"loss": 7.5715,
"grad_norm": 1.259857177734375,
"learning_rate": 0.0002204428108864969,
"epoch": 2.34
},
{
"loss": 7.5012,
"grad_norm": 1.435889482498169,
"learning_rate": 0.00021956769055745166,
"epoch": 2.34
},
{
"loss": 7.5925,
"grad_norm": 1.6067544221878052,
"learning_rate": 0.00021869257022840642,
"epoch": 2.34
},
{
"loss": 7.2756,
"grad_norm": 1.2057377099990845,
"learning_rate": 0.00021781744989936118,
"epoch": 2.35
},
{
"loss": 7.0737,
"grad_norm": 1.0249065160751343,
"learning_rate": 0.00021694232957031591,
"epoch": 2.35
},
{
"loss": 7.2857,
"grad_norm": 1.1336891651153564,
"learning_rate": 0.00021606720924127068,
"epoch": 2.35
},
{
"loss": 7.0709,
"grad_norm": 1.1853156089782715,
"learning_rate": 0.00021519208891222544,
"epoch": 2.35
},
{
"loss": 6.9118,
"grad_norm": 1.4682341814041138,
"learning_rate": 0.0002143169685831802,
"epoch": 2.36
},
{
"loss": 7.3363,
"grad_norm": 1.3039721250534058,
"learning_rate": 0.00021344184825413496,
"epoch": 2.36
},
{
"loss": 7.2827,
"grad_norm": 1.28932785987854,
"learning_rate": 0.00021256672792508972,
"epoch": 2.36
},
{
"loss": 7.6069,
"grad_norm": 1.7343271970748901,
"learning_rate": 0.00021169160759604448,
"epoch": 2.37
},
{
"loss": 7.3543,
"grad_norm": 1.9730132818222046,
"learning_rate": 0.00021081648726699922,
"epoch": 2.37
},
{
"loss": 7.3351,
"grad_norm": 2.070822238922119,
"learning_rate": 0.00020994136693795395,
"epoch": 2.37
},
{
"loss": 7.3199,
"grad_norm": 1.1327873468399048,
"learning_rate": 0.00020906624660890872,
"epoch": 2.37
},
{
"loss": 7.4058,
"grad_norm": 1.3796617984771729,
"learning_rate": 0.00020819112627986348,
"epoch": 2.38
},
{
"loss": 7.3027,
"grad_norm": 1.8397942781448364,
"learning_rate": 0.00020731600595081824,
"epoch": 2.38
},
{
"loss": 7.6354,
"grad_norm": 1.4503923654556274,
"learning_rate": 0.0002065283976546775,
"epoch": 2.38
},
{
"loss": 7.2284,
"grad_norm": 1.550950527191162,
"learning_rate": 0.00020565327732563227,
"epoch": 2.38
},
{
"loss": 7.3061,
"grad_norm": 1.5306216478347778,
"learning_rate": 0.00020477815699658703,
"epoch": 2.39
},
{
"loss": 7.3337,
"grad_norm": 1.269167184829712,
"learning_rate": 0.0002039030366675418,
"epoch": 2.39
},
{
"loss": 7.7686,
"grad_norm": 1.600019931793213,
"learning_rate": 0.00020302791633849656,
"epoch": 2.39
},
{
"loss": 7.35,
"grad_norm": 1.5773662328720093,
"learning_rate": 0.0002021527960094513,
"epoch": 2.39
},
{
"loss": 7.3691,
"grad_norm": 1.547160029411316,
"learning_rate": 0.00020127767568040605,
"epoch": 2.4
},
{
"loss": 7.4863,
"grad_norm": 1.4968856573104858,
"learning_rate": 0.00020040255535136081,
"epoch": 2.4
},
{
"loss": 7.9482,
"grad_norm": 1.2087891101837158,
"learning_rate": 0.00019952743502231558,
"epoch": 2.4
},
{
"loss": 7.0255,
"grad_norm": 1.290597677230835,
"learning_rate": 0.00019865231469327034,
"epoch": 2.4
},
{
"loss": 7.178,
"grad_norm": 1.5743247270584106,
"learning_rate": 0.0001977771943642251,
"epoch": 2.41
},
{
"loss": 7.6474,
"grad_norm": 1.5197412967681885,
"learning_rate": 0.00019690207403517984,
"epoch": 2.41
},
{
"loss": 7.3527,
"grad_norm": 1.4716495275497437,
"learning_rate": 0.0001960269537061346,
"epoch": 2.41
},
{
"loss": 7.6313,
"grad_norm": 1.9746785163879395,
"learning_rate": 0.00019515183337708936,
"epoch": 2.41
},
{
"loss": 7.6972,
"grad_norm": 1.2683417797088623,
"learning_rate": 0.00019427671304804412,
"epoch": 2.42
},
{
"loss": 7.1378,
"grad_norm": 1.1373748779296875,
"learning_rate": 0.00019340159271899888,
"epoch": 2.42
},
{
"loss": 7.0196,
"grad_norm": 1.4191349744796753,
"learning_rate": 0.00019252647238995364,
"epoch": 2.42
},
{
"loss": 6.9102,
"grad_norm": 1.6580002307891846,
"learning_rate": 0.00019165135206090838,
"epoch": 2.43
},
{
"loss": 7.5105,
"grad_norm": 1.2877469062805176,
"learning_rate": 0.00019077623173186314,
"epoch": 2.43
},
{
"loss": 8.0212,
"grad_norm": 1.2933236360549927,
"learning_rate": 0.00018990111140281788,
"epoch": 2.43
},
{
"loss": 7.2108,
"grad_norm": 1.6515684127807617,
"learning_rate": 0.00018902599107377264,
"epoch": 2.43
},
{
"loss": 7.2944,
"grad_norm": 1.443547010421753,
"learning_rate": 0.0001881508707447274,
"epoch": 2.44
},
{
"loss": 6.9623,
"grad_norm": 1.5022013187408447,
"learning_rate": 0.00018727575041568216,
"epoch": 2.44
},
{
"loss": 7.5751,
"grad_norm": 1.639228343963623,
"learning_rate": 0.0001864006300866369,
"epoch": 2.44
},
{
"loss": 7.6183,
"grad_norm": 1.3685816526412964,
"learning_rate": 0.00018552550975759166,
"epoch": 2.44
},
{
"loss": 7.7862,
"grad_norm": 1.4008909463882446,
"learning_rate": 0.00018465038942854642,
"epoch": 2.45
},
{
"loss": 7.3036,
"grad_norm": 1.4068384170532227,
"learning_rate": 0.00018377526909950118,
"epoch": 2.45
},
{
"loss": 7.3222,
"grad_norm": 1.4874199628829956,
"learning_rate": 0.00018290014877045594,
"epoch": 2.45
},
{
"loss": 7.4538,
"grad_norm": 2.161606788635254,
"learning_rate": 0.0001820250284414107,
"epoch": 2.45
},
{
"loss": 7.099,
"grad_norm": 1.4761602878570557,
"learning_rate": 0.00018114990811236544,
"epoch": 2.46
},
{
"loss": 7.6725,
"grad_norm": 1.3598577976226807,
"learning_rate": 0.0001802747877833202,
"epoch": 2.46
},
{
"loss": 7.4651,
"grad_norm": 1.352389931678772,
"learning_rate": 0.00017939966745427496,
"epoch": 2.46
},
{
"loss": 7.0266,
"grad_norm": 1.302270770072937,
"learning_rate": 0.00017852454712522973,
"epoch": 2.46
},
{
"loss": 7.4879,
"grad_norm": 1.2166621685028076,
"learning_rate": 0.0001776494267961845,
"epoch": 2.47
},
{
"loss": 6.7354,
"grad_norm": 1.4442105293273926,
"learning_rate": 0.00017677430646713925,
"epoch": 2.47
},
{
"loss": 7.1184,
"grad_norm": 1.6301904916763306,
"learning_rate": 0.000175899186138094,
"epoch": 2.47
},
{
"loss": 7.4326,
"grad_norm": 1.2478090524673462,
"learning_rate": 0.00017502406580904875,
"epoch": 2.48
},
{
"loss": 7.6185,
"grad_norm": 1.2676613330841064,
"learning_rate": 0.0001741489454800035,
"epoch": 2.48
},
{
"loss": 7.439,
"grad_norm": 1.4324458837509155,
"learning_rate": 0.00017327382515095827,
"epoch": 2.48
},
{
"loss": 7.7999,
"grad_norm": 1.634446382522583,
"learning_rate": 0.00017239870482191303,
"epoch": 2.48
},
{
"loss": 7.3043,
"grad_norm": 1.2877479791641235,
"learning_rate": 0.0001715235844928678,
"epoch": 2.49
},
{
"loss": 7.054,
"grad_norm": 1.7003803253173828,
"learning_rate": 0.00017064846416382255,
"epoch": 2.49
},
{
"loss": 7.1568,
"grad_norm": 1.8888310194015503,
"learning_rate": 0.00016977334383477726,
"epoch": 2.49
},
{
"loss": 7.3495,
"grad_norm": 1.2593083381652832,
"learning_rate": 0.00016889822350573202,
"epoch": 2.49
},
{
"loss": 7.4716,
"grad_norm": 1.4410508871078491,
"learning_rate": 0.00016802310317668679,
"epoch": 2.5
},
{
"loss": 7.5133,
"grad_norm": 1.20904541015625,
"learning_rate": 0.00016714798284764155,
"epoch": 2.5
},
{
"loss": 7.3222,
"grad_norm": 1.4503611326217651,
"learning_rate": 0.0001662728625185963,
"epoch": 2.5
},
{
"loss": 7.6387,
"grad_norm": 1.3705183267593384,
"learning_rate": 0.00016539774218955107,
"epoch": 2.5
},
{
"loss": 7.0609,
"grad_norm": 1.2106906175613403,
"learning_rate": 0.0001645226218605058,
"epoch": 2.51
},
{
"loss": 7.342,
"grad_norm": 1.5564229488372803,
"learning_rate": 0.00016364750153146057,
"epoch": 2.51
},
{
"loss": 7.8121,
"grad_norm": 1.6493812799453735,
"learning_rate": 0.00016277238120241533,
"epoch": 2.51
},
{
"loss": 7.3909,
"grad_norm": 1.9025623798370361,
"learning_rate": 0.0001618972608733701,
"epoch": 2.51
},
{
"loss": 7.0106,
"grad_norm": 1.2934685945510864,
"learning_rate": 0.00016102214054432485,
"epoch": 2.52
},
{
"loss": 7.5199,
"grad_norm": 1.2549662590026855,
"learning_rate": 0.00016014702021527962,
"epoch": 2.52
},
{
"loss": 7.3509,
"grad_norm": 1.2111480236053467,
"learning_rate": 0.00015927189988623435,
"epoch": 2.52
},
{
"loss": 7.5281,
"grad_norm": 2.2498984336853027,
"learning_rate": 0.0001583967795571891,
"epoch": 2.53
},
{
"loss": 7.5218,
"grad_norm": 1.4710973501205444,
"learning_rate": 0.00015752165922814387,
"epoch": 2.53
},
{
"loss": 7.1575,
"grad_norm": 1.4040391445159912,
"learning_rate": 0.00015664653889909864,
"epoch": 2.53
},
{
"loss": 7.3097,
"grad_norm": 2.3657708168029785,
"learning_rate": 0.0001557714185700534,
"epoch": 2.53
},
{
"loss": 7.3235,
"grad_norm": 1.8456711769104004,
"learning_rate": 0.00015489629824100816,
"epoch": 2.54
},
{
"loss": 7.1772,
"grad_norm": 1.3032398223876953,
"learning_rate": 0.0001540211779119629,
"epoch": 2.54
},
{
"loss": 7.331,
"grad_norm": 1.2472988367080688,
"learning_rate": 0.00015314605758291766,
"epoch": 2.54
},
{
"loss": 6.9758,
"grad_norm": 1.1861238479614258,
"learning_rate": 0.00015227093725387242,
"epoch": 2.54
},
{
"loss": 7.357,
"grad_norm": 1.2937425374984741,
"learning_rate": 0.00015139581692482718,
"epoch": 2.55
},
{
"loss": 7.6132,
"grad_norm": 1.5241109132766724,
"learning_rate": 0.00015052069659578194,
"epoch": 2.55
},
{
"loss": 7.1769,
"grad_norm": 1.2426915168762207,
"learning_rate": 0.00014964557626673668,
"epoch": 2.55
},
{
"loss": 7.2242,
"grad_norm": 1.5336363315582275,
"learning_rate": 0.0001487704559376914,
"epoch": 2.55
},
{
"loss": 8.0839,
"grad_norm": 1.6944379806518555,
"learning_rate": 0.00014789533560864617,
"epoch": 2.56
},
{
"loss": 7.2667,
"grad_norm": 1.6602429151535034,
"learning_rate": 0.00014702021527960093,
"epoch": 2.56
},
{
"loss": 7.4821,
"grad_norm": 1.331986665725708,
"learning_rate": 0.0001461450949505557,
"epoch": 2.56
},
{
"loss": 7.4808,
"grad_norm": 1.4923409223556519,
"learning_rate": 0.00014526997462151046,
"epoch": 2.56
},
{
"loss": 7.3579,
"grad_norm": 1.5323739051818848,
"learning_rate": 0.00014439485429246522,
"epoch": 2.57
},
{
"loss": 7.1833,
"grad_norm": 1.0281411409378052,
"learning_rate": 0.00014351973396341998,
"epoch": 2.57
},
{
"loss": 7.521,
"grad_norm": 1.777385950088501,
"learning_rate": 0.00014264461363437472,
"epoch": 2.57
},
{
"loss": 7.5531,
"grad_norm": 1.7528423070907593,
"learning_rate": 0.00014176949330532948,
"epoch": 2.58
},
{
"loss": 7.3295,
"grad_norm": 1.665503740310669,
"learning_rate": 0.00014089437297628424,
"epoch": 2.58
},
{
"loss": 6.9815,
"grad_norm": 1.4323763847351074,
"learning_rate": 0.000140019252647239,
"epoch": 2.58
},
{
"loss": 7.7957,
"grad_norm": 1.2623038291931152,
"learning_rate": 0.00013914413231819376,
"epoch": 2.58
},
{
"loss": 7.2667,
"grad_norm": 1.3770829439163208,
"learning_rate": 0.00013826901198914853,
"epoch": 2.59
},
{
"loss": 7.2641,
"grad_norm": 1.495597243309021,
"learning_rate": 0.00013739389166010326,
"epoch": 2.59
},
{
"loss": 7.6276,
"grad_norm": 1.0396783351898193,
"learning_rate": 0.00013651877133105802,
"epoch": 2.59
},
{
"loss": 7.4811,
"grad_norm": 1.5590603351593018,
"learning_rate": 0.00013564365100201278,
"epoch": 2.59
},
{
"loss": 6.9941,
"grad_norm": 1.266262173652649,
"learning_rate": 0.00013476853067296755,
"epoch": 2.6
},
{
"loss": 7.0138,
"grad_norm": 1.3331608772277832,
"learning_rate": 0.0001338934103439223,
"epoch": 2.6
},
{
"loss": 7.6792,
"grad_norm": 1.54330575466156,
"learning_rate": 0.00013301829001487707,
"epoch": 2.6
},
{
"loss": 7.5151,
"grad_norm": 1.266360878944397,
"learning_rate": 0.0001321431696858318,
"epoch": 2.6
},
{
"loss": 7.6357,
"grad_norm": 1.1992617845535278,
"learning_rate": 0.00013126804935678657,
"epoch": 2.61
},
{
"loss": 7.6848,
"grad_norm": 1.6269259452819824,
"learning_rate": 0.00013039292902774133,
"epoch": 2.61
},
{
"loss": 7.3941,
"grad_norm": 1.4221471548080444,
"learning_rate": 0.00012951780869869606,
"epoch": 2.61
},
{
"loss": 7.5638,
"grad_norm": 1.31778085231781,
"learning_rate": 0.00012864268836965082,
"epoch": 2.61
},
{
"loss": 7.3716,
"grad_norm": 1.4217979907989502,
"learning_rate": 0.00012776756804060559,
"epoch": 2.62
},
{
"loss": 7.7403,
"grad_norm": 1.549012541770935,
"learning_rate": 0.00012689244771156032,
"epoch": 2.62
},
{
"loss": 7.5079,
"grad_norm": 1.7808821201324463,
"learning_rate": 0.00012601732738251508,
"epoch": 2.62
},
{
"loss": 7.338,
"grad_norm": 1.6030139923095703,
"learning_rate": 0.00012514220705346984,
"epoch": 2.62
},
{
"loss": 7.2113,
"grad_norm": 1.688103437423706,
"learning_rate": 0.0001242670867244246,
"epoch": 2.63
},
{
"loss": 7.5297,
"grad_norm": 1.4482861757278442,
"learning_rate": 0.00012339196639537937,
"epoch": 2.63
},
{
"loss": 7.6226,
"grad_norm": 1.481149435043335,
"learning_rate": 0.00012251684606633413,
"epoch": 2.63
},
{
"loss": 7.1199,
"grad_norm": 1.5914816856384277,
"learning_rate": 0.00012164172573728888,
"epoch": 2.64
},
{
"loss": 7.5294,
"grad_norm": 1.6436686515808105,
"learning_rate": 0.00012076660540824364,
"epoch": 2.64
},
{
"loss": 7.7319,
"grad_norm": 1.422884225845337,
"learning_rate": 0.00011989148507919839,
"epoch": 2.64
},
{
"loss": 7.5878,
"grad_norm": 1.2468681335449219,
"learning_rate": 0.00011901636475015315,
"epoch": 2.64
},
{
"loss": 7.4093,
"grad_norm": 1.6080206632614136,
"learning_rate": 0.00011814124442110791,
"epoch": 2.65
},
{
"loss": 6.927,
"grad_norm": 1.2568819522857666,
"learning_rate": 0.00011726612409206266,
"epoch": 2.65
},
{
"loss": 7.524,
"grad_norm": 1.4558569192886353,
"learning_rate": 0.00011639100376301742,
"epoch": 2.65
},
{
"loss": 6.7721,
"grad_norm": 1.3554805517196655,
"learning_rate": 0.00011551588343397218,
"epoch": 2.65
},
{
"loss": 7.5129,
"grad_norm": 2.061342239379883,
"learning_rate": 0.00011464076310492692,
"epoch": 2.66
},
{
"loss": 7.271,
"grad_norm": 1.7581554651260376,
"learning_rate": 0.00011376564277588168,
"epoch": 2.66
},
{
"loss": 7.4605,
"grad_norm": 1.3818498849868774,
"learning_rate": 0.00011289052244683644,
"epoch": 2.66
},
{
"loss": 7.2747,
"grad_norm": 1.4640157222747803,
"learning_rate": 0.00011201540211779119,
"epoch": 2.66
},
{
"loss": 7.4137,
"grad_norm": 1.628440499305725,
"learning_rate": 0.00011114028178874595,
"epoch": 2.67
},
{
"loss": 7.1947,
"grad_norm": 2.1291253566741943,
"learning_rate": 0.00011026516145970071,
"epoch": 2.67
},
{
"loss": 7.3972,
"grad_norm": 1.53203284740448,
"learning_rate": 0.00010939004113065546,
"epoch": 2.67
},
{
"loss": 7.1343,
"grad_norm": 1.7009447813034058,
"learning_rate": 0.00010851492080161022,
"epoch": 2.67
},
{
"loss": 7.4999,
"grad_norm": 1.981833815574646,
"learning_rate": 0.00010763980047256499,
"epoch": 2.68
},
{
"loss": 7.0649,
"grad_norm": 1.4151135683059692,
"learning_rate": 0.00010676468014351973,
"epoch": 2.68
},
{
"loss": 7.4975,
"grad_norm": 1.8214997053146362,
"learning_rate": 0.0001058895598144745,
"epoch": 2.68
},
{
"loss": 7.1928,
"grad_norm": 1.475014328956604,
"learning_rate": 0.00010501443948542926,
"epoch": 2.69
},
{
"loss": 6.7309,
"grad_norm": 1.500470757484436,
"learning_rate": 0.00010413931915638399,
"epoch": 2.69
},
{
"loss": 7.2154,
"grad_norm": 1.0923032760620117,
"learning_rate": 0.00010326419882733875,
"epoch": 2.69
},
{
"loss": 7.4584,
"grad_norm": 1.476189136505127,
"learning_rate": 0.00010238907849829352,
"epoch": 2.69
},
{
"loss": 7.5696,
"grad_norm": 1.3299099206924438,
"learning_rate": 0.00010151395816924828,
"epoch": 2.7
},
{
"loss": 7.4462,
"grad_norm": 1.248026967048645,
"learning_rate": 0.00010063883784020303,
"epoch": 2.7
},
{
"loss": 7.057,
"grad_norm": 1.5154845714569092,
"learning_rate": 9.976371751115779e-05,
"epoch": 2.7
},
{
"loss": 7.4942,
"grad_norm": 1.504868745803833,
"learning_rate": 9.888859718211255e-05,
"epoch": 2.7
},
{
"loss": 7.7042,
"grad_norm": 1.2087482213974,
"learning_rate": 9.80134768530673e-05,
"epoch": 2.71
},
{
"loss": 7.7138,
"grad_norm": 2.066254138946533,
"learning_rate": 9.713835652402206e-05,
"epoch": 2.71
},
{
"loss": 7.4746,
"grad_norm": 1.2078548669815063,
"learning_rate": 9.626323619497682e-05,
"epoch": 2.71
},
{
"loss": 7.5682,
"grad_norm": 1.2530779838562012,
"learning_rate": 9.538811586593157e-05,
"epoch": 2.71
},
{
"loss": 7.4491,
"grad_norm": 1.5170719623565674,
"learning_rate": 9.451299553688632e-05,
"epoch": 2.72
},
{
"loss": 7.2938,
"grad_norm": 1.2933870553970337,
"learning_rate": 9.363787520784108e-05,
"epoch": 2.72
},
{
"loss": 7.1455,
"grad_norm": 1.212755799293518,
"learning_rate": 9.276275487879583e-05,
"epoch": 2.72
},
{
"loss": 7.3702,
"grad_norm": 1.4118942022323608,
"learning_rate": 9.188763454975059e-05,
"epoch": 2.72
},
{
"loss": 7.1194,
"grad_norm": 1.575276494026184,
"learning_rate": 9.101251422070535e-05,
"epoch": 2.73
},
{
"loss": 7.046,
"grad_norm": 1.3244752883911133,
"learning_rate": 9.01373938916601e-05,
"epoch": 2.73
},
{
"loss": 6.875,
"grad_norm": 1.369280219078064,
"learning_rate": 8.926227356261486e-05,
"epoch": 2.73
},
{
"loss": 7.4045,
"grad_norm": 1.3210042715072632,
"learning_rate": 8.838715323356962e-05,
"epoch": 2.74
},
{
"loss": 7.5159,
"grad_norm": 1.4352552890777588,
"learning_rate": 8.751203290452437e-05,
"epoch": 2.74
},
{
"loss": 7.2315,
"grad_norm": 1.4860197305679321,
"learning_rate": 8.663691257547913e-05,
"epoch": 2.74
},
{
"loss": 6.8597,
"grad_norm": 1.2331523895263672,
"learning_rate": 8.57617922464339e-05,
"epoch": 2.74
},
{
"loss": 7.3485,
"grad_norm": 1.2187525033950806,
"learning_rate": 8.488667191738863e-05,
"epoch": 2.75
},
{
"loss": 7.388,
"grad_norm": 1.1800241470336914,
"learning_rate": 8.401155158834339e-05,
"epoch": 2.75
},
{
"loss": 6.9186,
"grad_norm": 1.3542723655700684,
"learning_rate": 8.313643125929815e-05,
"epoch": 2.75
},
{
"loss": 6.9582,
"grad_norm": 1.3839143514633179,
"learning_rate": 8.22613109302529e-05,
"epoch": 2.75
},
{
"loss": 7.4176,
"grad_norm": 1.4546840190887451,
"learning_rate": 8.138619060120766e-05,
"epoch": 2.76
},
{
"loss": 7.2731,
"grad_norm": 1.3623560667037964,
"learning_rate": 8.051107027216243e-05,
"epoch": 2.76
},
{
"loss": 7.1633,
"grad_norm": 1.9331005811691284,
"learning_rate": 7.963594994311717e-05,
"epoch": 2.76
},
{
"loss": 6.8972,
"grad_norm": 1.2791029214859009,
"learning_rate": 7.876082961407194e-05,
"epoch": 2.76
},
{
"loss": 7.1043,
"grad_norm": 1.6202424764633179,
"learning_rate": 7.78857092850267e-05,
"epoch": 2.77
},
{
"loss": 7.0727,
"grad_norm": 1.0835381746292114,
"learning_rate": 7.701058895598145e-05,
"epoch": 2.77
},
{
"loss": 7.0958,
"grad_norm": 1.2778371572494507,
"learning_rate": 7.613546862693621e-05,
"epoch": 2.77
},
{
"loss": 7.2219,
"grad_norm": 1.9295389652252197,
"learning_rate": 7.526034829789097e-05,
"epoch": 2.77
},
{
"loss": 7.0189,
"grad_norm": 1.9394477605819702,
"learning_rate": 7.43852279688457e-05,
"epoch": 2.78
},
{
"loss": 7.0144,
"grad_norm": 1.4238934516906738,
"learning_rate": 7.351010763980047e-05,
"epoch": 2.78
},
{
"loss": 7.2353,
"grad_norm": 1.350537657737732,
"learning_rate": 7.263498731075523e-05,
"epoch": 2.78
},
{
"loss": 6.7353,
"grad_norm": 1.3214153051376343,
"learning_rate": 7.175986698170999e-05,
"epoch": 2.79
},
{
"loss": 7.4143,
"grad_norm": 2.469216823577881,
"learning_rate": 7.088474665266474e-05,
"epoch": 2.79
},
{
"loss": 7.4276,
"grad_norm": 1.414184808731079,
"learning_rate": 7.00096263236195e-05,
"epoch": 2.79
},
{
"loss": 6.9842,
"grad_norm": 1.4708011150360107,
"learning_rate": 6.913450599457426e-05,
"epoch": 2.79
},
{
"loss": 7.572,
"grad_norm": 1.449560284614563,
"learning_rate": 6.825938566552901e-05,
"epoch": 2.8
},
{
"loss": 7.3449,
"grad_norm": 1.1261264085769653,
"learning_rate": 6.738426533648377e-05,
"epoch": 2.8
},
{
"loss": 7.1776,
"grad_norm": 1.5502110719680786,
"learning_rate": 6.650914500743853e-05,
"epoch": 2.8
},
{
"loss": 7.0565,
"grad_norm": 1.3916562795639038,
"learning_rate": 6.563402467839328e-05,
"epoch": 2.8
},
{
"loss": 7.0882,
"grad_norm": 1.361229658126831,
"learning_rate": 6.475890434934803e-05,
"epoch": 2.81
},
{
"loss": 6.981,
"grad_norm": 1.6100305318832397,
"learning_rate": 6.388378402030279e-05,
"epoch": 2.81
},
{
"loss": 7.2502,
"grad_norm": 1.5449306964874268,
"learning_rate": 6.300866369125754e-05,
"epoch": 2.81
},
{
"loss": 7.4208,
"grad_norm": 1.3188410997390747,
"learning_rate": 6.21335433622123e-05,
"epoch": 2.81
},
{
"loss": 7.2957,
"grad_norm": 1.543289303779602,
"learning_rate": 6.125842303316706e-05,
"epoch": 2.82
},
{
"loss": 7.0319,
"grad_norm": 1.1590594053268433,
"learning_rate": 6.038330270412182e-05,
"epoch": 2.82
},
{
"loss": 7.23,
"grad_norm": 1.1623939275741577,
"learning_rate": 5.9508182375076575e-05,
"epoch": 2.82
},
{
"loss": 7.1254,
"grad_norm": 1.6204333305358887,
"learning_rate": 5.863306204603133e-05,
"epoch": 2.82
},
{
"loss": 7.4319,
"grad_norm": 1.5845638513565063,
"learning_rate": 5.775794171698609e-05,
"epoch": 2.83
},
{
"loss": 7.4574,
"grad_norm": 1.3281787633895874,
"learning_rate": 5.688282138794084e-05,
"epoch": 2.83
},
{
"loss": 6.8629,
"grad_norm": 1.6502999067306519,
"learning_rate": 5.6007701058895595e-05,
"epoch": 2.83
},
{
"loss": 7.1493,
"grad_norm": 1.7768168449401855,
"learning_rate": 5.513258072985036e-05,
"epoch": 2.83
},
{
"loss": 7.1971,
"grad_norm": 1.1763763427734375,
"learning_rate": 5.425746040080511e-05,
"epoch": 2.84
},
{
"loss": 7.4182,
"grad_norm": 1.4033911228179932,
"learning_rate": 5.338234007175987e-05,
"epoch": 2.84
},
{
"loss": 6.8175,
"grad_norm": 1.5407586097717285,
"learning_rate": 5.250721974271463e-05,
"epoch": 2.84
},
{
"loss": 7.5091,
"grad_norm": 1.5829062461853027,
"learning_rate": 5.163209941366938e-05,
"epoch": 2.85
},
{
"loss": 7.0728,
"grad_norm": 1.3185957670211792,
"learning_rate": 5.075697908462414e-05,
"epoch": 2.85
},
{
"loss": 7.1931,
"grad_norm": 1.1996837854385376,
"learning_rate": 4.9881858755578894e-05,
"epoch": 2.85
},
{
"loss": 7.2327,
"grad_norm": 1.6188883781433105,
"learning_rate": 4.900673842653365e-05,
"epoch": 2.85
},
{
"loss": 7.2432,
"grad_norm": 1.7829197645187378,
"learning_rate": 4.813161809748841e-05,
"epoch": 2.86
},
{
"loss": 6.8231,
"grad_norm": 1.3998175859451294,
"learning_rate": 4.725649776844316e-05,
"epoch": 2.86
},
{
"loss": 7.5838,
"grad_norm": 1.6664845943450928,
"learning_rate": 4.6381377439397914e-05,
"epoch": 2.86
},
{
"loss": 7.3804,
"grad_norm": 1.2328096628189087,
"learning_rate": 4.5506257110352676e-05,
"epoch": 2.86
},
{
"loss": 7.1497,
"grad_norm": 1.5543657541275024,
"learning_rate": 4.463113678130743e-05,
"epoch": 2.87
},
{
"loss": 7.5067,
"grad_norm": 2.0711114406585693,
"learning_rate": 4.3756016452262186e-05,
"epoch": 2.87
},
{
"loss": 7.1481,
"grad_norm": 2.340829372406006,
"learning_rate": 4.288089612321695e-05,
"epoch": 2.87
},
{
"loss": 7.2767,
"grad_norm": 1.3014119863510132,
"learning_rate": 4.2005775794171696e-05,
"epoch": 2.87
},
{
"loss": 7.2583,
"grad_norm": 1.186070442199707,
"learning_rate": 4.113065546512645e-05,
"epoch": 2.88
},
{
"loss": 7.7179,
"grad_norm": 1.4286901950836182,
"learning_rate": 4.025553513608121e-05,
"epoch": 2.88
},
{
"loss": 6.9271,
"grad_norm": 1.561988115310669,
"learning_rate": 3.938041480703597e-05,
"epoch": 2.88
},
{
"loss": 6.9378,
"grad_norm": 1.2756584882736206,
"learning_rate": 3.8505294477990723e-05,
"epoch": 2.88
},
{
"loss": 7.8091,
"grad_norm": 1.5452569723129272,
"learning_rate": 3.7630174148945485e-05,
"epoch": 2.89
},
{
"loss": 6.7905,
"grad_norm": 1.2616968154907227,
"learning_rate": 3.6755053819900234e-05,
"epoch": 2.89
},
{
"loss": 7.3958,
"grad_norm": 1.1684807538986206,
"learning_rate": 3.5879933490854995e-05,
"epoch": 2.89
},
{
"loss": 6.9238,
"grad_norm": 1.351366639137268,
"learning_rate": 3.500481316180975e-05,
"epoch": 2.9
},
{
"loss": 7.4026,
"grad_norm": 1.2473573684692383,
"learning_rate": 3.4129692832764505e-05,
"epoch": 2.9
},
{
"loss": 7.4247,
"grad_norm": 1.5123474597930908,
"learning_rate": 3.325457250371927e-05,
"epoch": 2.9
},
{
"loss": 7.0967,
"grad_norm": 1.1452938318252563,
"learning_rate": 3.2379452174674016e-05,
"epoch": 2.9
},
{
"loss": 7.0357,
"grad_norm": 1.1505627632141113,
"learning_rate": 3.150433184562877e-05,
"epoch": 2.91
},
{
"loss": 7.4973,
"grad_norm": 1.438091516494751,
"learning_rate": 3.062921151658353e-05,
"epoch": 2.91
},
{
"loss": 7.4715,
"grad_norm": 1.1489310264587402,
"learning_rate": 2.9754091187538288e-05,
"epoch": 2.91
},
{
"loss": 7.0076,
"grad_norm": 1.3423534631729126,
"learning_rate": 2.8878970858493046e-05,
"epoch": 2.91
},
{
"loss": 7.0935,
"grad_norm": 1.2484374046325684,
"learning_rate": 2.8003850529447798e-05,
"epoch": 2.92
},
{
"loss": 7.1792,
"grad_norm": 1.310231328010559,
"learning_rate": 2.7128730200402556e-05,
"epoch": 2.92
},
{
"loss": 7.3469,
"grad_norm": 1.417974591255188,
"learning_rate": 2.6253609871357314e-05,
"epoch": 2.92
},
{
"loss": 7.2473,
"grad_norm": 1.3878840208053589,
"learning_rate": 2.537848954231207e-05,
"epoch": 2.92
},
{
"loss": 7.1321,
"grad_norm": 1.6403028964996338,
"learning_rate": 2.459088124617135e-05,
"epoch": 2.93
},
{
"loss": 7.6076,
"grad_norm": 1.2110294103622437,
"learning_rate": 2.3715760917126104e-05,
"epoch": 2.93
},
{
"loss": 7.3466,
"grad_norm": 1.203755497932434,
"learning_rate": 2.2840640588080863e-05,
"epoch": 2.93
},
{
"loss": 7.4367,
"grad_norm": 1.2081892490386963,
"learning_rate": 2.1965520259035618e-05,
"epoch": 2.93
},
{
"loss": 7.6191,
"grad_norm": 1.2515225410461426,
"learning_rate": 2.1090399929990373e-05,
"epoch": 2.94
},
{
"loss": 7.2915,
"grad_norm": 1.2461618185043335,
"learning_rate": 2.021527960094513e-05,
"epoch": 2.94
},
{
"loss": 7.0825,
"grad_norm": 1.3424855470657349,
"learning_rate": 1.9340159271899886e-05,
"epoch": 2.94
},
{
"loss": 7.6924,
"grad_norm": 1.2109103202819824,
"learning_rate": 1.846503894285464e-05,
"epoch": 2.95
},
{
"loss": 7.531,
"grad_norm": 1.2161798477172852,
"learning_rate": 1.75899186138094e-05,
"epoch": 2.95
},
{
"loss": 7.1992,
"grad_norm": 1.347778081893921,
"learning_rate": 1.6714798284764158e-05,
"epoch": 2.95
},
{
"loss": 7.7785,
"grad_norm": 1.2869161367416382,
"learning_rate": 1.583967795571891e-05,
"epoch": 2.95
},
{
"loss": 7.6703,
"grad_norm": 1.1452679634094238,
"learning_rate": 1.4964557626673668e-05,
"epoch": 2.96
},
{
"loss": 7.3311,
"grad_norm": 1.7757437229156494,
"learning_rate": 1.4089437297628423e-05,
"epoch": 2.96
},
{
"loss": 7.4272,
"grad_norm": 1.2730258703231812,
"learning_rate": 1.3214316968583182e-05,
"epoch": 2.96
},
{
"loss": 6.8195,
"grad_norm": 1.0826276540756226,
"learning_rate": 1.2339196639537937e-05,
"epoch": 2.96
},
{
"loss": 7.1219,
"grad_norm": 1.3847414255142212,
"learning_rate": 1.1464076310492692e-05,
"epoch": 2.97
},
{
"loss": 7.5912,
"grad_norm": 1.4612926244735718,
"learning_rate": 1.0588955981447449e-05,
"epoch": 2.97
},
{
"loss": 6.9373,
"grad_norm": 1.5692036151885986,
"learning_rate": 9.713835652402205e-06,
"epoch": 2.97
},
{
"loss": 7.7104,
"grad_norm": 1.4740134477615356,
"learning_rate": 8.838715323356962e-06,
"epoch": 2.97
},
{
"loss": 7.1918,
"grad_norm": 1.026573657989502,
"learning_rate": 7.963594994311717e-06,
"epoch": 2.98
},
{
"loss": 6.8717,
"grad_norm": 1.1959487199783325,
"learning_rate": 7.088474665266474e-06,
"epoch": 2.98
},
{
"loss": 7.4154,
"grad_norm": 1.1354584693908691,
"learning_rate": 6.213354336221231e-06,
"epoch": 2.98
},
{
"loss": 7.1622,
"grad_norm": 1.3372441530227661,
"learning_rate": 5.338234007175987e-06,
"epoch": 2.98
},
{
"loss": 6.9564,
"grad_norm": 1.1713366508483887,
"learning_rate": 4.463113678130743e-06,
"epoch": 2.99
},
{
"loss": 7.462,
"grad_norm": 1.8238294124603271,
"learning_rate": 3.587993349085499e-06,
"epoch": 2.99
},
{
"loss": 7.5493,
"grad_norm": 1.3313993215560913,
"learning_rate": 2.7128730200402555e-06,
"epoch": 2.99
},
{
"loss": 7.2399,
"grad_norm": 1.1780248880386353,
"learning_rate": 1.8377526909950118e-06,
"epoch": 3.0
},
{
"loss": 6.879,
"grad_norm": 1.2703826427459717,
"learning_rate": 9.626323619497682e-07,
"epoch": 3.0
},
{
"train_runtime": 104781.7564,
"train_samples_per_second": 3.49,
"train_steps_per_second": 0.109,
"train_loss": 8.437174775609405,
"epoch": 3.0
}
]