1SV69 / checkpoint-900 /trainer_state.json
gotzmann's picture
..
a95e7c5
raw
history blame
144 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.448542237727386,
"eval_steps": 500,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00049838026414154,
"grad_norm": 0.8175273537635803,
"learning_rate": 1e-05,
"loss": 1.8901,
"step": 1
},
{
"epoch": 0.00099676052828308,
"grad_norm": 0.5205090641975403,
"learning_rate": 2e-05,
"loss": 1.8661,
"step": 2
},
{
"epoch": 0.00149514079242462,
"grad_norm": 0.7050982713699341,
"learning_rate": 3e-05,
"loss": 1.884,
"step": 3
},
{
"epoch": 0.00199352105656616,
"grad_norm": 0.3958536684513092,
"learning_rate": 4e-05,
"loss": 1.848,
"step": 4
},
{
"epoch": 0.0024919013207077,
"grad_norm": 0.2910257577896118,
"learning_rate": 5e-05,
"loss": 1.8363,
"step": 5
},
{
"epoch": 0.00299028158484924,
"grad_norm": 1.1061186790466309,
"learning_rate": 6e-05,
"loss": 2.1065,
"step": 6
},
{
"epoch": 0.00348866184899078,
"grad_norm": 0.35989394783973694,
"learning_rate": 7e-05,
"loss": 1.8461,
"step": 7
},
{
"epoch": 0.00398704211313232,
"grad_norm": 0.3001234233379364,
"learning_rate": 8e-05,
"loss": 1.8691,
"step": 8
},
{
"epoch": 0.00448542237727386,
"grad_norm": 0.3210326135158539,
"learning_rate": 9e-05,
"loss": 1.8006,
"step": 9
},
{
"epoch": 0.0049838026414154,
"grad_norm": 0.24240201711654663,
"learning_rate": 0.0001,
"loss": 1.8136,
"step": 10
},
{
"epoch": 0.00548218290555694,
"grad_norm": 0.2921009957790375,
"learning_rate": 0.00011000000000000002,
"loss": 1.7785,
"step": 11
},
{
"epoch": 0.00598056316969848,
"grad_norm": 0.2199179232120514,
"learning_rate": 0.00012,
"loss": 1.8334,
"step": 12
},
{
"epoch": 0.00647894343384002,
"grad_norm": 0.18247301876544952,
"learning_rate": 0.00013000000000000002,
"loss": 1.8171,
"step": 13
},
{
"epoch": 0.00697732369798156,
"grad_norm": 0.16971151530742645,
"learning_rate": 0.00014,
"loss": 1.8838,
"step": 14
},
{
"epoch": 0.0074757039621231,
"grad_norm": 0.19395150244235992,
"learning_rate": 0.00015000000000000001,
"loss": 1.8121,
"step": 15
},
{
"epoch": 0.00797408422626464,
"grad_norm": 0.18596555292606354,
"learning_rate": 0.00016,
"loss": 1.7756,
"step": 16
},
{
"epoch": 0.00847246449040618,
"grad_norm": 0.23639832437038422,
"learning_rate": 0.00017,
"loss": 1.8293,
"step": 17
},
{
"epoch": 0.00897084475454772,
"grad_norm": 0.5992503762245178,
"learning_rate": 0.00018,
"loss": 1.8285,
"step": 18
},
{
"epoch": 0.00946922501868926,
"grad_norm": 0.24062925577163696,
"learning_rate": 0.00019,
"loss": 1.8139,
"step": 19
},
{
"epoch": 0.0099676052828308,
"grad_norm": 0.1615862101316452,
"learning_rate": 0.0002,
"loss": 1.7916,
"step": 20
},
{
"epoch": 0.01046598554697234,
"grad_norm": 0.1461448222398758,
"learning_rate": 0.0002,
"loss": 1.756,
"step": 21
},
{
"epoch": 0.01096436581111388,
"grad_norm": 0.16745099425315857,
"learning_rate": 0.0002,
"loss": 1.7139,
"step": 22
},
{
"epoch": 0.01146274607525542,
"grad_norm": 0.13099125027656555,
"learning_rate": 0.0002,
"loss": 1.7764,
"step": 23
},
{
"epoch": 0.01196112633939696,
"grad_norm": 0.11523797363042831,
"learning_rate": 0.0002,
"loss": 1.6983,
"step": 24
},
{
"epoch": 0.012459506603538499,
"grad_norm": 0.4995543956756592,
"learning_rate": 0.0002,
"loss": 1.7629,
"step": 25
},
{
"epoch": 0.01295788686768004,
"grad_norm": 0.1197713166475296,
"learning_rate": 0.0002,
"loss": 1.6818,
"step": 26
},
{
"epoch": 0.01345626713182158,
"grad_norm": 0.12242875248193741,
"learning_rate": 0.0002,
"loss": 1.7446,
"step": 27
},
{
"epoch": 0.01395464739596312,
"grad_norm": 0.11533704400062561,
"learning_rate": 0.0002,
"loss": 1.7924,
"step": 28
},
{
"epoch": 0.01445302766010466,
"grad_norm": 0.11372833698987961,
"learning_rate": 0.0002,
"loss": 1.8541,
"step": 29
},
{
"epoch": 0.0149514079242462,
"grad_norm": 0.10559230297803879,
"learning_rate": 0.0002,
"loss": 1.727,
"step": 30
},
{
"epoch": 0.01544978818838774,
"grad_norm": 0.1040055975317955,
"learning_rate": 0.0002,
"loss": 1.6867,
"step": 31
},
{
"epoch": 0.01594816845252928,
"grad_norm": 0.09699314832687378,
"learning_rate": 0.0002,
"loss": 1.7119,
"step": 32
},
{
"epoch": 0.016446548716670818,
"grad_norm": 0.09951823949813843,
"learning_rate": 0.0002,
"loss": 1.6883,
"step": 33
},
{
"epoch": 0.01694492898081236,
"grad_norm": 0.09926764667034149,
"learning_rate": 0.0002,
"loss": 1.6828,
"step": 34
},
{
"epoch": 0.0174433092449539,
"grad_norm": 0.11137701570987701,
"learning_rate": 0.0002,
"loss": 1.8129,
"step": 35
},
{
"epoch": 0.01794168950909544,
"grad_norm": 0.09449079632759094,
"learning_rate": 0.0002,
"loss": 1.7351,
"step": 36
},
{
"epoch": 0.01844006977323698,
"grad_norm": 0.10035137832164764,
"learning_rate": 0.0002,
"loss": 1.7835,
"step": 37
},
{
"epoch": 0.01893845003737852,
"grad_norm": 0.0987599715590477,
"learning_rate": 0.0002,
"loss": 1.6905,
"step": 38
},
{
"epoch": 0.01943683030152006,
"grad_norm": 0.1124144196510315,
"learning_rate": 0.0002,
"loss": 1.7833,
"step": 39
},
{
"epoch": 0.0199352105656616,
"grad_norm": 0.10424085706472397,
"learning_rate": 0.0002,
"loss": 1.7308,
"step": 40
},
{
"epoch": 0.02043359082980314,
"grad_norm": 0.10069456696510315,
"learning_rate": 0.0002,
"loss": 1.7756,
"step": 41
},
{
"epoch": 0.02093197109394468,
"grad_norm": 0.096500463783741,
"learning_rate": 0.0002,
"loss": 1.6723,
"step": 42
},
{
"epoch": 0.02143035135808622,
"grad_norm": 0.10054206848144531,
"learning_rate": 0.0002,
"loss": 1.7609,
"step": 43
},
{
"epoch": 0.02192873162222776,
"grad_norm": 0.6995068192481995,
"learning_rate": 0.0002,
"loss": 1.8469,
"step": 44
},
{
"epoch": 0.0224271118863693,
"grad_norm": 0.10629299283027649,
"learning_rate": 0.0002,
"loss": 1.7838,
"step": 45
},
{
"epoch": 0.02292549215051084,
"grad_norm": 0.7601500749588013,
"learning_rate": 0.0002,
"loss": 1.9191,
"step": 46
},
{
"epoch": 0.02342387241465238,
"grad_norm": 0.15130610764026642,
"learning_rate": 0.0002,
"loss": 1.7054,
"step": 47
},
{
"epoch": 0.02392225267879392,
"grad_norm": 0.13523732125759125,
"learning_rate": 0.0002,
"loss": 1.8099,
"step": 48
},
{
"epoch": 0.02442063294293546,
"grad_norm": 0.13607007265090942,
"learning_rate": 0.0002,
"loss": 1.7106,
"step": 49
},
{
"epoch": 0.024919013207076998,
"grad_norm": 0.12477318197488785,
"learning_rate": 0.0002,
"loss": 1.6664,
"step": 50
},
{
"epoch": 0.02541739347121854,
"grad_norm": 0.6004332304000854,
"learning_rate": 0.0002,
"loss": 1.8337,
"step": 51
},
{
"epoch": 0.02591577373536008,
"grad_norm": 0.11952889710664749,
"learning_rate": 0.0002,
"loss": 1.8014,
"step": 52
},
{
"epoch": 0.02641415399950162,
"grad_norm": 0.12411167472600937,
"learning_rate": 0.0002,
"loss": 1.716,
"step": 53
},
{
"epoch": 0.02691253426364316,
"grad_norm": 0.13071775436401367,
"learning_rate": 0.0002,
"loss": 1.8158,
"step": 54
},
{
"epoch": 0.0274109145277847,
"grad_norm": 0.10316825658082962,
"learning_rate": 0.0002,
"loss": 1.7051,
"step": 55
},
{
"epoch": 0.02790929479192624,
"grad_norm": 0.12366951256990433,
"learning_rate": 0.0002,
"loss": 1.7233,
"step": 56
},
{
"epoch": 0.02840767505606778,
"grad_norm": 0.11353752017021179,
"learning_rate": 0.0002,
"loss": 1.7875,
"step": 57
},
{
"epoch": 0.02890605532020932,
"grad_norm": 0.10084105283021927,
"learning_rate": 0.0002,
"loss": 1.8455,
"step": 58
},
{
"epoch": 0.02940443558435086,
"grad_norm": 0.09446979314088821,
"learning_rate": 0.0002,
"loss": 1.6738,
"step": 59
},
{
"epoch": 0.0299028158484924,
"grad_norm": 0.10983336716890335,
"learning_rate": 0.0002,
"loss": 1.7517,
"step": 60
},
{
"epoch": 0.03040119611263394,
"grad_norm": 0.09697376936674118,
"learning_rate": 0.0002,
"loss": 1.7885,
"step": 61
},
{
"epoch": 0.03089957637677548,
"grad_norm": 0.10111090540885925,
"learning_rate": 0.0002,
"loss": 1.7711,
"step": 62
},
{
"epoch": 0.03139795664091702,
"grad_norm": 0.09077231585979462,
"learning_rate": 0.0002,
"loss": 1.6886,
"step": 63
},
{
"epoch": 0.03189633690505856,
"grad_norm": 0.09181386977434158,
"learning_rate": 0.0002,
"loss": 1.7101,
"step": 64
},
{
"epoch": 0.0323947171692001,
"grad_norm": 0.09549912065267563,
"learning_rate": 0.0002,
"loss": 1.727,
"step": 65
},
{
"epoch": 0.032893097433341636,
"grad_norm": 0.09550771117210388,
"learning_rate": 0.0002,
"loss": 1.7627,
"step": 66
},
{
"epoch": 0.03339147769748318,
"grad_norm": 0.09617152065038681,
"learning_rate": 0.0002,
"loss": 1.7195,
"step": 67
},
{
"epoch": 0.03388985796162472,
"grad_norm": 0.08987727761268616,
"learning_rate": 0.0002,
"loss": 1.6672,
"step": 68
},
{
"epoch": 0.03438823822576626,
"grad_norm": 0.1968306601047516,
"learning_rate": 0.0002,
"loss": 1.7743,
"step": 69
},
{
"epoch": 0.0348866184899078,
"grad_norm": 0.11987251788377762,
"learning_rate": 0.0002,
"loss": 1.7883,
"step": 70
},
{
"epoch": 0.03538499875404934,
"grad_norm": 0.09412620961666107,
"learning_rate": 0.0002,
"loss": 1.7965,
"step": 71
},
{
"epoch": 0.03588337901819088,
"grad_norm": 0.09160133451223373,
"learning_rate": 0.0002,
"loss": 1.7451,
"step": 72
},
{
"epoch": 0.03638175928233242,
"grad_norm": 0.08958347886800766,
"learning_rate": 0.0002,
"loss": 1.6991,
"step": 73
},
{
"epoch": 0.03688013954647396,
"grad_norm": 0.08735426515340805,
"learning_rate": 0.0002,
"loss": 1.7267,
"step": 74
},
{
"epoch": 0.0373785198106155,
"grad_norm": 0.09234903752803802,
"learning_rate": 0.0002,
"loss": 1.7363,
"step": 75
},
{
"epoch": 0.03787690007475704,
"grad_norm": 0.3366870582103729,
"learning_rate": 0.0002,
"loss": 1.7519,
"step": 76
},
{
"epoch": 0.03837528033889858,
"grad_norm": 0.11989757418632507,
"learning_rate": 0.0002,
"loss": 1.7388,
"step": 77
},
{
"epoch": 0.03887366060304012,
"grad_norm": 0.09671110659837723,
"learning_rate": 0.0002,
"loss": 1.6955,
"step": 78
},
{
"epoch": 0.03937204086718166,
"grad_norm": 0.3544454276561737,
"learning_rate": 0.0002,
"loss": 1.7123,
"step": 79
},
{
"epoch": 0.0398704211313232,
"grad_norm": 0.36497563123703003,
"learning_rate": 0.0002,
"loss": 1.8832,
"step": 80
},
{
"epoch": 0.04036880139546474,
"grad_norm": 0.1029423251748085,
"learning_rate": 0.0002,
"loss": 1.6739,
"step": 81
},
{
"epoch": 0.04086718165960628,
"grad_norm": 0.13265877962112427,
"learning_rate": 0.0002,
"loss": 1.6735,
"step": 82
},
{
"epoch": 0.041365561923747816,
"grad_norm": 0.10281170904636383,
"learning_rate": 0.0002,
"loss": 1.7079,
"step": 83
},
{
"epoch": 0.04186394218788936,
"grad_norm": 0.9060964584350586,
"learning_rate": 0.0002,
"loss": 2.0666,
"step": 84
},
{
"epoch": 0.0423623224520309,
"grad_norm": 0.6496222615242004,
"learning_rate": 0.0002,
"loss": 1.7719,
"step": 85
},
{
"epoch": 0.04286070271617244,
"grad_norm": 0.20052167773246765,
"learning_rate": 0.0002,
"loss": 1.7717,
"step": 86
},
{
"epoch": 0.04335908298031398,
"grad_norm": 0.20841394364833832,
"learning_rate": 0.0002,
"loss": 1.7548,
"step": 87
},
{
"epoch": 0.04385746324445552,
"grad_norm": 0.14324237406253815,
"learning_rate": 0.0002,
"loss": 1.6689,
"step": 88
},
{
"epoch": 0.04435584350859706,
"grad_norm": 0.1330689936876297,
"learning_rate": 0.0002,
"loss": 1.741,
"step": 89
},
{
"epoch": 0.0448542237727386,
"grad_norm": 0.13436254858970642,
"learning_rate": 0.0002,
"loss": 1.8316,
"step": 90
},
{
"epoch": 0.04535260403688014,
"grad_norm": 0.11558011174201965,
"learning_rate": 0.0002,
"loss": 1.7094,
"step": 91
},
{
"epoch": 0.04585098430102168,
"grad_norm": 0.13997307419776917,
"learning_rate": 0.0002,
"loss": 1.7487,
"step": 92
},
{
"epoch": 0.04634936456516322,
"grad_norm": 0.11401030421257019,
"learning_rate": 0.0002,
"loss": 1.6971,
"step": 93
},
{
"epoch": 0.04684774482930476,
"grad_norm": 0.1490752398967743,
"learning_rate": 0.0002,
"loss": 1.7318,
"step": 94
},
{
"epoch": 0.0473461250934463,
"grad_norm": 0.10417014360427856,
"learning_rate": 0.0002,
"loss": 1.6225,
"step": 95
},
{
"epoch": 0.04784450535758784,
"grad_norm": 0.11896169185638428,
"learning_rate": 0.0002,
"loss": 1.6585,
"step": 96
},
{
"epoch": 0.04834288562172938,
"grad_norm": 0.1187196597456932,
"learning_rate": 0.0002,
"loss": 1.6665,
"step": 97
},
{
"epoch": 0.04884126588587092,
"grad_norm": 0.10665114969015121,
"learning_rate": 0.0002,
"loss": 1.7154,
"step": 98
},
{
"epoch": 0.04933964615001246,
"grad_norm": 0.11822202056646347,
"learning_rate": 0.0002,
"loss": 1.7159,
"step": 99
},
{
"epoch": 0.049838026414153996,
"grad_norm": 0.10062436759471893,
"learning_rate": 0.0002,
"loss": 1.6696,
"step": 100
},
{
"epoch": 0.05033640667829554,
"grad_norm": 0.10343766212463379,
"learning_rate": 0.0002,
"loss": 1.69,
"step": 101
},
{
"epoch": 0.05083478694243708,
"grad_norm": 0.09872441738843918,
"learning_rate": 0.0002,
"loss": 1.7566,
"step": 102
},
{
"epoch": 0.05133316720657862,
"grad_norm": 0.08979122340679169,
"learning_rate": 0.0002,
"loss": 1.6714,
"step": 103
},
{
"epoch": 0.05183154747072016,
"grad_norm": 0.10805679857730865,
"learning_rate": 0.0002,
"loss": 1.7127,
"step": 104
},
{
"epoch": 0.0523299277348617,
"grad_norm": 0.0966518372297287,
"learning_rate": 0.0002,
"loss": 1.6586,
"step": 105
},
{
"epoch": 0.05282830799900324,
"grad_norm": 0.6643556952476501,
"learning_rate": 0.0002,
"loss": 1.906,
"step": 106
},
{
"epoch": 0.05332668826314478,
"grad_norm": 0.14238013327121735,
"learning_rate": 0.0002,
"loss": 1.7367,
"step": 107
},
{
"epoch": 0.05382506852728632,
"grad_norm": 0.2091197371482849,
"learning_rate": 0.0002,
"loss": 1.7879,
"step": 108
},
{
"epoch": 0.05432344879142786,
"grad_norm": 0.11703892797231674,
"learning_rate": 0.0002,
"loss": 1.7743,
"step": 109
},
{
"epoch": 0.0548218290555694,
"grad_norm": 0.15277640521526337,
"learning_rate": 0.0002,
"loss": 1.6906,
"step": 110
},
{
"epoch": 0.05532020931971094,
"grad_norm": 0.11744142323732376,
"learning_rate": 0.0002,
"loss": 1.6935,
"step": 111
},
{
"epoch": 0.05581858958385248,
"grad_norm": 0.10640200227499008,
"learning_rate": 0.0002,
"loss": 1.6654,
"step": 112
},
{
"epoch": 0.05631696984799402,
"grad_norm": 0.10955353826284409,
"learning_rate": 0.0002,
"loss": 1.7095,
"step": 113
},
{
"epoch": 0.05681535011213556,
"grad_norm": 0.3743372857570648,
"learning_rate": 0.0002,
"loss": 1.8212,
"step": 114
},
{
"epoch": 0.0573137303762771,
"grad_norm": 0.11817771941423416,
"learning_rate": 0.0002,
"loss": 1.7246,
"step": 115
},
{
"epoch": 0.05781211064041864,
"grad_norm": 0.10563557595014572,
"learning_rate": 0.0002,
"loss": 1.6554,
"step": 116
},
{
"epoch": 0.058310490904560176,
"grad_norm": 0.11494623869657516,
"learning_rate": 0.0002,
"loss": 1.7563,
"step": 117
},
{
"epoch": 0.05880887116870172,
"grad_norm": 0.12262585759162903,
"learning_rate": 0.0002,
"loss": 1.7416,
"step": 118
},
{
"epoch": 0.05930725143284326,
"grad_norm": 0.09501025080680847,
"learning_rate": 0.0002,
"loss": 1.7068,
"step": 119
},
{
"epoch": 0.0598056316969848,
"grad_norm": 0.15478286147117615,
"learning_rate": 0.0002,
"loss": 1.8005,
"step": 120
},
{
"epoch": 0.06030401196112634,
"grad_norm": 0.5174306631088257,
"learning_rate": 0.0002,
"loss": 1.7736,
"step": 121
},
{
"epoch": 0.06080239222526788,
"grad_norm": 0.37489035725593567,
"learning_rate": 0.0002,
"loss": 1.7367,
"step": 122
},
{
"epoch": 0.06130077248940942,
"grad_norm": 0.10632194578647614,
"learning_rate": 0.0002,
"loss": 1.6754,
"step": 123
},
{
"epoch": 0.06179915275355096,
"grad_norm": 0.5897635817527771,
"learning_rate": 0.0002,
"loss": 1.8483,
"step": 124
},
{
"epoch": 0.0622975330176925,
"grad_norm": 0.1104891449213028,
"learning_rate": 0.0002,
"loss": 1.6705,
"step": 125
},
{
"epoch": 0.06279591328183404,
"grad_norm": 0.171495720744133,
"learning_rate": 0.0002,
"loss": 1.8345,
"step": 126
},
{
"epoch": 0.06329429354597559,
"grad_norm": 0.2864750921726227,
"learning_rate": 0.0002,
"loss": 1.6944,
"step": 127
},
{
"epoch": 0.06379267381011712,
"grad_norm": 0.1258823126554489,
"learning_rate": 0.0002,
"loss": 1.6922,
"step": 128
},
{
"epoch": 0.06429105407425866,
"grad_norm": 0.10813643783330917,
"learning_rate": 0.0002,
"loss": 1.6886,
"step": 129
},
{
"epoch": 0.0647894343384002,
"grad_norm": 0.12285427749156952,
"learning_rate": 0.0002,
"loss": 1.712,
"step": 130
},
{
"epoch": 0.06528781460254174,
"grad_norm": 0.11049698293209076,
"learning_rate": 0.0002,
"loss": 1.7107,
"step": 131
},
{
"epoch": 0.06578619486668327,
"grad_norm": 0.4740373492240906,
"learning_rate": 0.0002,
"loss": 1.8128,
"step": 132
},
{
"epoch": 0.06628457513082482,
"grad_norm": 0.11663281917572021,
"learning_rate": 0.0002,
"loss": 1.7054,
"step": 133
},
{
"epoch": 0.06678295539496636,
"grad_norm": 0.1274426281452179,
"learning_rate": 0.0002,
"loss": 1.7461,
"step": 134
},
{
"epoch": 0.0672813356591079,
"grad_norm": 0.11273318529129028,
"learning_rate": 0.0002,
"loss": 1.6195,
"step": 135
},
{
"epoch": 0.06777971592324944,
"grad_norm": 0.12240920960903168,
"learning_rate": 0.0002,
"loss": 1.7528,
"step": 136
},
{
"epoch": 0.06827809618739097,
"grad_norm": 0.1003924235701561,
"learning_rate": 0.0002,
"loss": 1.5651,
"step": 137
},
{
"epoch": 0.06877647645153252,
"grad_norm": 0.12279325723648071,
"learning_rate": 0.0002,
"loss": 1.7905,
"step": 138
},
{
"epoch": 0.06927485671567406,
"grad_norm": 0.10567662119865417,
"learning_rate": 0.0002,
"loss": 1.7437,
"step": 139
},
{
"epoch": 0.0697732369798156,
"grad_norm": 0.0949968695640564,
"learning_rate": 0.0002,
"loss": 1.7375,
"step": 140
},
{
"epoch": 0.07027161724395714,
"grad_norm": 0.10375083237886429,
"learning_rate": 0.0002,
"loss": 1.713,
"step": 141
},
{
"epoch": 0.07076999750809868,
"grad_norm": 0.0937686413526535,
"learning_rate": 0.0002,
"loss": 1.7152,
"step": 142
},
{
"epoch": 0.07126837777224022,
"grad_norm": 0.0981929674744606,
"learning_rate": 0.0002,
"loss": 1.7116,
"step": 143
},
{
"epoch": 0.07176675803638176,
"grad_norm": 1.1460381746292114,
"learning_rate": 0.0002,
"loss": 1.9091,
"step": 144
},
{
"epoch": 0.0722651383005233,
"grad_norm": 0.1193133145570755,
"learning_rate": 0.0002,
"loss": 1.7387,
"step": 145
},
{
"epoch": 0.07276351856466484,
"grad_norm": 0.13854117691516876,
"learning_rate": 0.0002,
"loss": 1.656,
"step": 146
},
{
"epoch": 0.07326189882880638,
"grad_norm": 0.6005303263664246,
"learning_rate": 0.0002,
"loss": 1.9014,
"step": 147
},
{
"epoch": 0.07376027909294793,
"grad_norm": 0.13879133760929108,
"learning_rate": 0.0002,
"loss": 1.7158,
"step": 148
},
{
"epoch": 0.07425865935708946,
"grad_norm": 0.13073574006557465,
"learning_rate": 0.0002,
"loss": 1.7355,
"step": 149
},
{
"epoch": 0.074757039621231,
"grad_norm": 0.12578125298023224,
"learning_rate": 0.0002,
"loss": 1.7376,
"step": 150
},
{
"epoch": 0.07525541988537254,
"grad_norm": 0.13024558126926422,
"learning_rate": 0.0002,
"loss": 1.7675,
"step": 151
},
{
"epoch": 0.07575380014951408,
"grad_norm": 0.12630225718021393,
"learning_rate": 0.0002,
"loss": 1.6509,
"step": 152
},
{
"epoch": 0.07625218041365561,
"grad_norm": 0.13081084191799164,
"learning_rate": 0.0002,
"loss": 1.7393,
"step": 153
},
{
"epoch": 0.07675056067779716,
"grad_norm": 0.11292438209056854,
"learning_rate": 0.0002,
"loss": 1.6533,
"step": 154
},
{
"epoch": 0.0772489409419387,
"grad_norm": 0.10187578946352005,
"learning_rate": 0.0002,
"loss": 1.6915,
"step": 155
},
{
"epoch": 0.07774732120608024,
"grad_norm": 0.10563293844461441,
"learning_rate": 0.0002,
"loss": 1.7378,
"step": 156
},
{
"epoch": 0.07824570147022178,
"grad_norm": 0.10501443594694138,
"learning_rate": 0.0002,
"loss": 1.6498,
"step": 157
},
{
"epoch": 0.07874408173436331,
"grad_norm": 0.11756912618875504,
"learning_rate": 0.0002,
"loss": 1.7963,
"step": 158
},
{
"epoch": 0.07924246199850486,
"grad_norm": 0.1010415181517601,
"learning_rate": 0.0002,
"loss": 1.6637,
"step": 159
},
{
"epoch": 0.0797408422626464,
"grad_norm": 0.09472226351499557,
"learning_rate": 0.0002,
"loss": 1.6057,
"step": 160
},
{
"epoch": 0.08023922252678795,
"grad_norm": 0.10156677663326263,
"learning_rate": 0.0002,
"loss": 1.7573,
"step": 161
},
{
"epoch": 0.08073760279092948,
"grad_norm": 0.09345332533121109,
"learning_rate": 0.0002,
"loss": 1.6327,
"step": 162
},
{
"epoch": 0.08123598305507101,
"grad_norm": 0.09440191835165024,
"learning_rate": 0.0002,
"loss": 1.6753,
"step": 163
},
{
"epoch": 0.08173436331921256,
"grad_norm": 0.0925949364900589,
"learning_rate": 0.0002,
"loss": 1.6786,
"step": 164
},
{
"epoch": 0.0822327435833541,
"grad_norm": 0.09808436781167984,
"learning_rate": 0.0002,
"loss": 1.75,
"step": 165
},
{
"epoch": 0.08273112384749563,
"grad_norm": 0.10032784938812256,
"learning_rate": 0.0002,
"loss": 1.6463,
"step": 166
},
{
"epoch": 0.08322950411163718,
"grad_norm": 0.769005298614502,
"learning_rate": 0.0002,
"loss": 1.8314,
"step": 167
},
{
"epoch": 0.08372788437577872,
"grad_norm": 1.013753890991211,
"learning_rate": 0.0002,
"loss": 1.9179,
"step": 168
},
{
"epoch": 0.08422626463992026,
"grad_norm": 0.11522974818944931,
"learning_rate": 0.0002,
"loss": 1.8271,
"step": 169
},
{
"epoch": 0.0847246449040618,
"grad_norm": 0.1381683349609375,
"learning_rate": 0.0002,
"loss": 1.7015,
"step": 170
},
{
"epoch": 0.08522302516820333,
"grad_norm": 0.13124744594097137,
"learning_rate": 0.0002,
"loss": 1.7213,
"step": 171
},
{
"epoch": 0.08572140543234488,
"grad_norm": 0.1552695333957672,
"learning_rate": 0.0002,
"loss": 1.6868,
"step": 172
},
{
"epoch": 0.08621978569648642,
"grad_norm": 0.11559716612100601,
"learning_rate": 0.0002,
"loss": 1.7474,
"step": 173
},
{
"epoch": 0.08671816596062797,
"grad_norm": 0.11131990700960159,
"learning_rate": 0.0002,
"loss": 1.6365,
"step": 174
},
{
"epoch": 0.0872165462247695,
"grad_norm": 0.11412417143583298,
"learning_rate": 0.0002,
"loss": 1.6205,
"step": 175
},
{
"epoch": 0.08771492648891104,
"grad_norm": 0.11382830142974854,
"learning_rate": 0.0002,
"loss": 1.7673,
"step": 176
},
{
"epoch": 0.08821330675305258,
"grad_norm": 0.7038962244987488,
"learning_rate": 0.0002,
"loss": 1.8568,
"step": 177
},
{
"epoch": 0.08871168701719412,
"grad_norm": 0.11253572255373001,
"learning_rate": 0.0002,
"loss": 1.7263,
"step": 178
},
{
"epoch": 0.08921006728133565,
"grad_norm": 0.12908123433589935,
"learning_rate": 0.0002,
"loss": 1.7021,
"step": 179
},
{
"epoch": 0.0897084475454772,
"grad_norm": 0.12027324736118317,
"learning_rate": 0.0002,
"loss": 1.7542,
"step": 180
},
{
"epoch": 0.09020682780961874,
"grad_norm": 0.13822880387306213,
"learning_rate": 0.0002,
"loss": 1.7947,
"step": 181
},
{
"epoch": 0.09070520807376029,
"grad_norm": 0.11809349060058594,
"learning_rate": 0.0002,
"loss": 1.7438,
"step": 182
},
{
"epoch": 0.09120358833790182,
"grad_norm": 0.11567198485136032,
"learning_rate": 0.0002,
"loss": 1.7006,
"step": 183
},
{
"epoch": 0.09170196860204335,
"grad_norm": 0.11884818226099014,
"learning_rate": 0.0002,
"loss": 1.7481,
"step": 184
},
{
"epoch": 0.0922003488661849,
"grad_norm": 0.13118627667427063,
"learning_rate": 0.0002,
"loss": 1.7579,
"step": 185
},
{
"epoch": 0.09269872913032644,
"grad_norm": 0.10780288279056549,
"learning_rate": 0.0002,
"loss": 1.7563,
"step": 186
},
{
"epoch": 0.09319710939446797,
"grad_norm": 0.1052689403295517,
"learning_rate": 0.0002,
"loss": 1.7176,
"step": 187
},
{
"epoch": 0.09369548965860952,
"grad_norm": 0.11142247915267944,
"learning_rate": 0.0002,
"loss": 1.6998,
"step": 188
},
{
"epoch": 0.09419386992275106,
"grad_norm": 0.11082904785871506,
"learning_rate": 0.0002,
"loss": 1.7492,
"step": 189
},
{
"epoch": 0.0946922501868926,
"grad_norm": 0.09668837487697601,
"learning_rate": 0.0002,
"loss": 1.6655,
"step": 190
},
{
"epoch": 0.09519063045103414,
"grad_norm": 0.09926537424325943,
"learning_rate": 0.0002,
"loss": 1.7393,
"step": 191
},
{
"epoch": 0.09568901071517567,
"grad_norm": 0.09865368157625198,
"learning_rate": 0.0002,
"loss": 1.7538,
"step": 192
},
{
"epoch": 0.09618739097931722,
"grad_norm": 0.10074108839035034,
"learning_rate": 0.0002,
"loss": 1.7556,
"step": 193
},
{
"epoch": 0.09668577124345876,
"grad_norm": 0.11467942595481873,
"learning_rate": 0.0002,
"loss": 1.7414,
"step": 194
},
{
"epoch": 0.0971841515076003,
"grad_norm": 0.09638036042451859,
"learning_rate": 0.0002,
"loss": 1.7296,
"step": 195
},
{
"epoch": 0.09768253177174184,
"grad_norm": 0.09951262921094894,
"learning_rate": 0.0002,
"loss": 1.6691,
"step": 196
},
{
"epoch": 0.09818091203588337,
"grad_norm": 0.09425103664398193,
"learning_rate": 0.0002,
"loss": 1.6563,
"step": 197
},
{
"epoch": 0.09867929230002492,
"grad_norm": 0.09163974225521088,
"learning_rate": 0.0002,
"loss": 1.6591,
"step": 198
},
{
"epoch": 0.09917767256416646,
"grad_norm": 0.10825615376234055,
"learning_rate": 0.0002,
"loss": 1.6748,
"step": 199
},
{
"epoch": 0.09967605282830799,
"grad_norm": 0.08873865008354187,
"learning_rate": 0.0002,
"loss": 1.7027,
"step": 200
},
{
"epoch": 0.10017443309244954,
"grad_norm": 0.09379550069570541,
"learning_rate": 0.0002,
"loss": 1.7475,
"step": 201
},
{
"epoch": 0.10067281335659108,
"grad_norm": 0.09395930916070938,
"learning_rate": 0.0002,
"loss": 1.7183,
"step": 202
},
{
"epoch": 0.10117119362073262,
"grad_norm": 0.09373954683542252,
"learning_rate": 0.0002,
"loss": 1.7413,
"step": 203
},
{
"epoch": 0.10166957388487416,
"grad_norm": 0.0926884338259697,
"learning_rate": 0.0002,
"loss": 1.7284,
"step": 204
},
{
"epoch": 0.1021679541490157,
"grad_norm": 0.09394028782844543,
"learning_rate": 0.0002,
"loss": 1.6777,
"step": 205
},
{
"epoch": 0.10266633441315724,
"grad_norm": 0.0934232845902443,
"learning_rate": 0.0002,
"loss": 1.6389,
"step": 206
},
{
"epoch": 0.10316471467729878,
"grad_norm": 0.08943123370409012,
"learning_rate": 0.0002,
"loss": 1.7382,
"step": 207
},
{
"epoch": 0.10366309494144033,
"grad_norm": 0.09671316295862198,
"learning_rate": 0.0002,
"loss": 1.7017,
"step": 208
},
{
"epoch": 0.10416147520558186,
"grad_norm": 0.12016978114843369,
"learning_rate": 0.0002,
"loss": 1.7993,
"step": 209
},
{
"epoch": 0.1046598554697234,
"grad_norm": 0.5822897553443909,
"learning_rate": 0.0002,
"loss": 1.6948,
"step": 210
},
{
"epoch": 0.10515823573386494,
"grad_norm": 0.10984666645526886,
"learning_rate": 0.0002,
"loss": 1.703,
"step": 211
},
{
"epoch": 0.10565661599800648,
"grad_norm": 0.661040186882019,
"learning_rate": 0.0002,
"loss": 1.7008,
"step": 212
},
{
"epoch": 0.10615499626214801,
"grad_norm": 0.1641639620065689,
"learning_rate": 0.0002,
"loss": 1.8105,
"step": 213
},
{
"epoch": 0.10665337652628956,
"grad_norm": 0.34271761775016785,
"learning_rate": 0.0002,
"loss": 1.7768,
"step": 214
},
{
"epoch": 0.1071517567904311,
"grad_norm": 0.11224206537008286,
"learning_rate": 0.0002,
"loss": 1.7126,
"step": 215
},
{
"epoch": 0.10765013705457264,
"grad_norm": 0.11788146197795868,
"learning_rate": 0.0002,
"loss": 1.7617,
"step": 216
},
{
"epoch": 0.10814851731871418,
"grad_norm": 0.10918893665075302,
"learning_rate": 0.0002,
"loss": 1.6258,
"step": 217
},
{
"epoch": 0.10864689758285571,
"grad_norm": 0.12023265659809113,
"learning_rate": 0.0002,
"loss": 1.7459,
"step": 218
},
{
"epoch": 0.10914527784699726,
"grad_norm": 0.11474837362766266,
"learning_rate": 0.0002,
"loss": 1.749,
"step": 219
},
{
"epoch": 0.1096436581111388,
"grad_norm": 0.10222747921943665,
"learning_rate": 0.0002,
"loss": 1.696,
"step": 220
},
{
"epoch": 0.11014203837528033,
"grad_norm": 0.1074354350566864,
"learning_rate": 0.0002,
"loss": 1.708,
"step": 221
},
{
"epoch": 0.11064041863942188,
"grad_norm": 0.5447832345962524,
"learning_rate": 0.0002,
"loss": 1.8402,
"step": 222
},
{
"epoch": 0.11113879890356342,
"grad_norm": 0.12009864300489426,
"learning_rate": 0.0002,
"loss": 1.7412,
"step": 223
},
{
"epoch": 0.11163717916770496,
"grad_norm": 0.11686031520366669,
"learning_rate": 0.0002,
"loss": 1.7185,
"step": 224
},
{
"epoch": 0.1121355594318465,
"grad_norm": 0.12914586067199707,
"learning_rate": 0.0002,
"loss": 1.6867,
"step": 225
},
{
"epoch": 0.11263393969598803,
"grad_norm": 0.10797183215618134,
"learning_rate": 0.0002,
"loss": 1.706,
"step": 226
},
{
"epoch": 0.11313231996012958,
"grad_norm": 0.1088324561715126,
"learning_rate": 0.0002,
"loss": 1.6257,
"step": 227
},
{
"epoch": 0.11363070022427112,
"grad_norm": 0.10438574105501175,
"learning_rate": 0.0002,
"loss": 1.6798,
"step": 228
},
{
"epoch": 0.11412908048841267,
"grad_norm": 0.14163640141487122,
"learning_rate": 0.0002,
"loss": 1.785,
"step": 229
},
{
"epoch": 0.1146274607525542,
"grad_norm": 0.10191742330789566,
"learning_rate": 0.0002,
"loss": 1.6979,
"step": 230
},
{
"epoch": 0.11512584101669573,
"grad_norm": 0.11547041684389114,
"learning_rate": 0.0002,
"loss": 1.7793,
"step": 231
},
{
"epoch": 0.11562422128083728,
"grad_norm": 0.10447453707456589,
"learning_rate": 0.0002,
"loss": 1.7791,
"step": 232
},
{
"epoch": 0.11612260154497882,
"grad_norm": 0.10447558760643005,
"learning_rate": 0.0002,
"loss": 1.6799,
"step": 233
},
{
"epoch": 0.11662098180912035,
"grad_norm": 0.10260461270809174,
"learning_rate": 0.0002,
"loss": 1.6561,
"step": 234
},
{
"epoch": 0.1171193620732619,
"grad_norm": 0.10199354588985443,
"learning_rate": 0.0002,
"loss": 1.6476,
"step": 235
},
{
"epoch": 0.11761774233740344,
"grad_norm": 0.09869713336229324,
"learning_rate": 0.0002,
"loss": 1.6183,
"step": 236
},
{
"epoch": 0.11811612260154498,
"grad_norm": 0.9354596138000488,
"learning_rate": 0.0002,
"loss": 1.9584,
"step": 237
},
{
"epoch": 0.11861450286568652,
"grad_norm": 0.15785987675189972,
"learning_rate": 0.0002,
"loss": 1.718,
"step": 238
},
{
"epoch": 0.11911288312982805,
"grad_norm": 0.16236662864685059,
"learning_rate": 0.0002,
"loss": 1.7275,
"step": 239
},
{
"epoch": 0.1196112633939696,
"grad_norm": 0.1407175064086914,
"learning_rate": 0.0002,
"loss": 1.6987,
"step": 240
},
{
"epoch": 0.12010964365811114,
"grad_norm": 0.13428977131843567,
"learning_rate": 0.0002,
"loss": 1.6998,
"step": 241
},
{
"epoch": 0.12060802392225269,
"grad_norm": 0.5954437255859375,
"learning_rate": 0.0002,
"loss": 1.7536,
"step": 242
},
{
"epoch": 0.12110640418639422,
"grad_norm": 0.12084382027387619,
"learning_rate": 0.0002,
"loss": 1.6446,
"step": 243
},
{
"epoch": 0.12160478445053576,
"grad_norm": 0.12887060642242432,
"learning_rate": 0.0002,
"loss": 1.6994,
"step": 244
},
{
"epoch": 0.1221031647146773,
"grad_norm": 0.12585604190826416,
"learning_rate": 0.0002,
"loss": 1.6705,
"step": 245
},
{
"epoch": 0.12260154497881884,
"grad_norm": 0.11495430767536163,
"learning_rate": 0.0002,
"loss": 1.6833,
"step": 246
},
{
"epoch": 0.12309992524296037,
"grad_norm": 0.36918768286705017,
"learning_rate": 0.0002,
"loss": 1.8354,
"step": 247
},
{
"epoch": 0.12359830550710192,
"grad_norm": 0.1330924779176712,
"learning_rate": 0.0002,
"loss": 1.6915,
"step": 248
},
{
"epoch": 0.12409668577124346,
"grad_norm": 0.6573293805122375,
"learning_rate": 0.0002,
"loss": 1.7672,
"step": 249
},
{
"epoch": 0.124595066035385,
"grad_norm": 0.13000234961509705,
"learning_rate": 0.0002,
"loss": 1.6639,
"step": 250
},
{
"epoch": 0.12509344629952654,
"grad_norm": 0.14653077721595764,
"learning_rate": 0.0002,
"loss": 1.7126,
"step": 251
},
{
"epoch": 0.12559182656366807,
"grad_norm": 0.13498292863368988,
"learning_rate": 0.0002,
"loss": 1.6848,
"step": 252
},
{
"epoch": 0.1260902068278096,
"grad_norm": 0.13268351554870605,
"learning_rate": 0.0002,
"loss": 1.7338,
"step": 253
},
{
"epoch": 0.12658858709195117,
"grad_norm": 0.1395343542098999,
"learning_rate": 0.0002,
"loss": 1.7099,
"step": 254
},
{
"epoch": 0.1270869673560927,
"grad_norm": 0.1279151439666748,
"learning_rate": 0.0002,
"loss": 1.7156,
"step": 255
},
{
"epoch": 0.12758534762023424,
"grad_norm": 0.112457275390625,
"learning_rate": 0.0002,
"loss": 1.7054,
"step": 256
},
{
"epoch": 0.12808372788437578,
"grad_norm": 0.11672843992710114,
"learning_rate": 0.0002,
"loss": 1.6895,
"step": 257
},
{
"epoch": 0.1285821081485173,
"grad_norm": 0.1295323520898819,
"learning_rate": 0.0002,
"loss": 1.6738,
"step": 258
},
{
"epoch": 0.12908048841265887,
"grad_norm": 0.10538823157548904,
"learning_rate": 0.0002,
"loss": 1.626,
"step": 259
},
{
"epoch": 0.1295788686768004,
"grad_norm": 0.1093951016664505,
"learning_rate": 0.0002,
"loss": 1.6494,
"step": 260
},
{
"epoch": 0.13007724894094194,
"grad_norm": 0.10753627866506577,
"learning_rate": 0.0002,
"loss": 1.7058,
"step": 261
},
{
"epoch": 0.13057562920508348,
"grad_norm": 0.11015735566616058,
"learning_rate": 0.0002,
"loss": 1.7519,
"step": 262
},
{
"epoch": 0.131074009469225,
"grad_norm": 0.10606027394533157,
"learning_rate": 0.0002,
"loss": 1.6725,
"step": 263
},
{
"epoch": 0.13157238973336655,
"grad_norm": 0.09919940680265427,
"learning_rate": 0.0002,
"loss": 1.6522,
"step": 264
},
{
"epoch": 0.1320707699975081,
"grad_norm": 0.1004357561469078,
"learning_rate": 0.0002,
"loss": 1.7,
"step": 265
},
{
"epoch": 0.13256915026164964,
"grad_norm": 0.1044403687119484,
"learning_rate": 0.0002,
"loss": 1.7131,
"step": 266
},
{
"epoch": 0.13306753052579118,
"grad_norm": 0.09830351173877716,
"learning_rate": 0.0002,
"loss": 1.7057,
"step": 267
},
{
"epoch": 0.1335659107899327,
"grad_norm": 0.09731124341487885,
"learning_rate": 0.0002,
"loss": 1.6696,
"step": 268
},
{
"epoch": 0.13406429105407425,
"grad_norm": 0.09874913096427917,
"learning_rate": 0.0002,
"loss": 1.6704,
"step": 269
},
{
"epoch": 0.1345626713182158,
"grad_norm": 1.0015792846679688,
"learning_rate": 0.0002,
"loss": 1.828,
"step": 270
},
{
"epoch": 0.13506105158235734,
"grad_norm": 0.15942072868347168,
"learning_rate": 0.0002,
"loss": 1.6851,
"step": 271
},
{
"epoch": 0.13555943184649888,
"grad_norm": 0.1272728443145752,
"learning_rate": 0.0002,
"loss": 1.6946,
"step": 272
},
{
"epoch": 0.1360578121106404,
"grad_norm": 0.13415473699569702,
"learning_rate": 0.0002,
"loss": 1.6865,
"step": 273
},
{
"epoch": 0.13655619237478195,
"grad_norm": 0.6600972414016724,
"learning_rate": 0.0002,
"loss": 1.845,
"step": 274
},
{
"epoch": 0.1370545726389235,
"grad_norm": 0.16784119606018066,
"learning_rate": 0.0002,
"loss": 1.8104,
"step": 275
},
{
"epoch": 0.13755295290306505,
"grad_norm": 0.14813649654388428,
"learning_rate": 0.0002,
"loss": 1.7188,
"step": 276
},
{
"epoch": 0.13805133316720658,
"grad_norm": 0.14158020913600922,
"learning_rate": 0.0002,
"loss": 1.7002,
"step": 277
},
{
"epoch": 0.13854971343134811,
"grad_norm": 0.48206424713134766,
"learning_rate": 0.0002,
"loss": 1.8617,
"step": 278
},
{
"epoch": 0.13904809369548965,
"grad_norm": 0.18177767097949982,
"learning_rate": 0.0002,
"loss": 1.7111,
"step": 279
},
{
"epoch": 0.1395464739596312,
"grad_norm": 0.12430819869041443,
"learning_rate": 0.0002,
"loss": 1.6939,
"step": 280
},
{
"epoch": 0.14004485422377275,
"grad_norm": 0.44922658801078796,
"learning_rate": 0.0002,
"loss": 1.7779,
"step": 281
},
{
"epoch": 0.14054323448791428,
"grad_norm": 0.14023765921592712,
"learning_rate": 0.0002,
"loss": 1.6521,
"step": 282
},
{
"epoch": 0.14104161475205582,
"grad_norm": 0.15241369605064392,
"learning_rate": 0.0002,
"loss": 1.6819,
"step": 283
},
{
"epoch": 0.14153999501619735,
"grad_norm": 0.12531667947769165,
"learning_rate": 0.0002,
"loss": 1.7014,
"step": 284
},
{
"epoch": 0.14203837528033889,
"grad_norm": 0.13596689701080322,
"learning_rate": 0.0002,
"loss": 1.6841,
"step": 285
},
{
"epoch": 0.14253675554448045,
"grad_norm": 0.1316744089126587,
"learning_rate": 0.0002,
"loss": 1.7503,
"step": 286
},
{
"epoch": 0.14303513580862198,
"grad_norm": 0.11584890633821487,
"learning_rate": 0.0002,
"loss": 1.6776,
"step": 287
},
{
"epoch": 0.14353351607276352,
"grad_norm": 0.37444308400154114,
"learning_rate": 0.0002,
"loss": 1.7808,
"step": 288
},
{
"epoch": 0.14403189633690505,
"grad_norm": 0.3217577338218689,
"learning_rate": 0.0002,
"loss": 1.6491,
"step": 289
},
{
"epoch": 0.1445302766010466,
"grad_norm": 0.12234029918909073,
"learning_rate": 0.0002,
"loss": 1.7131,
"step": 290
},
{
"epoch": 0.14502865686518815,
"grad_norm": 0.13871504366397858,
"learning_rate": 0.0002,
"loss": 1.7737,
"step": 291
},
{
"epoch": 0.14552703712932968,
"grad_norm": 0.10792572051286697,
"learning_rate": 0.0002,
"loss": 1.7162,
"step": 292
},
{
"epoch": 0.14602541739347122,
"grad_norm": 0.11277946084737778,
"learning_rate": 0.0002,
"loss": 1.666,
"step": 293
},
{
"epoch": 0.14652379765761275,
"grad_norm": 0.11250103265047073,
"learning_rate": 0.0002,
"loss": 1.7334,
"step": 294
},
{
"epoch": 0.1470221779217543,
"grad_norm": 0.10644537955522537,
"learning_rate": 0.0002,
"loss": 1.6836,
"step": 295
},
{
"epoch": 0.14752055818589585,
"grad_norm": 0.12423089891672134,
"learning_rate": 0.0002,
"loss": 1.7349,
"step": 296
},
{
"epoch": 0.14801893845003739,
"grad_norm": 0.10547474026679993,
"learning_rate": 0.0002,
"loss": 1.6783,
"step": 297
},
{
"epoch": 0.14851731871417892,
"grad_norm": 0.10867539793252945,
"learning_rate": 0.0002,
"loss": 1.6709,
"step": 298
},
{
"epoch": 0.14901569897832045,
"grad_norm": 0.21218198537826538,
"learning_rate": 0.0002,
"loss": 1.6717,
"step": 299
},
{
"epoch": 0.149514079242462,
"grad_norm": 0.11373799294233322,
"learning_rate": 0.0002,
"loss": 1.7398,
"step": 300
},
{
"epoch": 0.15001245950660355,
"grad_norm": 0.12452666461467743,
"learning_rate": 0.0002,
"loss": 1.7625,
"step": 301
},
{
"epoch": 0.1505108397707451,
"grad_norm": 0.4068242609500885,
"learning_rate": 0.0002,
"loss": 1.7357,
"step": 302
},
{
"epoch": 0.15100922003488662,
"grad_norm": 0.15395419299602509,
"learning_rate": 0.0002,
"loss": 1.6878,
"step": 303
},
{
"epoch": 0.15150760029902816,
"grad_norm": 0.11441215127706528,
"learning_rate": 0.0002,
"loss": 1.7055,
"step": 304
},
{
"epoch": 0.1520059805631697,
"grad_norm": 0.13675518333911896,
"learning_rate": 0.0002,
"loss": 1.7005,
"step": 305
},
{
"epoch": 0.15250436082731123,
"grad_norm": 0.11606375873088837,
"learning_rate": 0.0002,
"loss": 1.6453,
"step": 306
},
{
"epoch": 0.1530027410914528,
"grad_norm": 0.4435337483882904,
"learning_rate": 0.0002,
"loss": 1.7435,
"step": 307
},
{
"epoch": 0.15350112135559432,
"grad_norm": 0.12212298810482025,
"learning_rate": 0.0002,
"loss": 1.705,
"step": 308
},
{
"epoch": 0.15399950161973586,
"grad_norm": 0.14606495201587677,
"learning_rate": 0.0002,
"loss": 1.6517,
"step": 309
},
{
"epoch": 0.1544978818838774,
"grad_norm": 0.11753024160861969,
"learning_rate": 0.0002,
"loss": 1.7427,
"step": 310
},
{
"epoch": 0.15499626214801893,
"grad_norm": 0.13007789850234985,
"learning_rate": 0.0002,
"loss": 1.7462,
"step": 311
},
{
"epoch": 0.1554946424121605,
"grad_norm": 0.11651528626680374,
"learning_rate": 0.0002,
"loss": 1.7128,
"step": 312
},
{
"epoch": 0.15599302267630202,
"grad_norm": 0.1128389984369278,
"learning_rate": 0.0002,
"loss": 1.6977,
"step": 313
},
{
"epoch": 0.15649140294044356,
"grad_norm": 0.10965872555971146,
"learning_rate": 0.0002,
"loss": 1.6578,
"step": 314
},
{
"epoch": 0.1569897832045851,
"grad_norm": 0.10751237720251083,
"learning_rate": 0.0002,
"loss": 1.6346,
"step": 315
},
{
"epoch": 0.15748816346872663,
"grad_norm": 0.09646358340978622,
"learning_rate": 0.0002,
"loss": 1.6873,
"step": 316
},
{
"epoch": 0.1579865437328682,
"grad_norm": 0.09908836334943771,
"learning_rate": 0.0002,
"loss": 1.6934,
"step": 317
},
{
"epoch": 0.15848492399700972,
"grad_norm": 0.09631779044866562,
"learning_rate": 0.0002,
"loss": 1.6703,
"step": 318
},
{
"epoch": 0.15898330426115126,
"grad_norm": 0.5702200531959534,
"learning_rate": 0.0002,
"loss": 1.7651,
"step": 319
},
{
"epoch": 0.1594816845252928,
"grad_norm": 0.1274351179599762,
"learning_rate": 0.0002,
"loss": 1.6632,
"step": 320
},
{
"epoch": 0.15998006478943433,
"grad_norm": 0.10685572028160095,
"learning_rate": 0.0002,
"loss": 1.6691,
"step": 321
},
{
"epoch": 0.1604784450535759,
"grad_norm": 0.12333345413208008,
"learning_rate": 0.0002,
"loss": 1.6811,
"step": 322
},
{
"epoch": 0.16097682531771743,
"grad_norm": 0.10747205466032028,
"learning_rate": 0.0002,
"loss": 1.6292,
"step": 323
},
{
"epoch": 0.16147520558185896,
"grad_norm": 0.10506169497966766,
"learning_rate": 0.0002,
"loss": 1.7463,
"step": 324
},
{
"epoch": 0.1619735858460005,
"grad_norm": 0.11267457902431488,
"learning_rate": 0.0002,
"loss": 1.7192,
"step": 325
},
{
"epoch": 0.16247196611014203,
"grad_norm": 0.10924848914146423,
"learning_rate": 0.0002,
"loss": 1.7146,
"step": 326
},
{
"epoch": 0.16297034637428356,
"grad_norm": 0.11103785783052444,
"learning_rate": 0.0002,
"loss": 1.6215,
"step": 327
},
{
"epoch": 0.16346872663842513,
"grad_norm": 0.3997076451778412,
"learning_rate": 0.0002,
"loss": 1.8753,
"step": 328
},
{
"epoch": 0.16396710690256666,
"grad_norm": 0.10188498347997665,
"learning_rate": 0.0002,
"loss": 1.7483,
"step": 329
},
{
"epoch": 0.1644654871667082,
"grad_norm": 0.10824645310640335,
"learning_rate": 0.0002,
"loss": 1.6828,
"step": 330
},
{
"epoch": 0.16496386743084973,
"grad_norm": 0.09962976723909378,
"learning_rate": 0.0002,
"loss": 1.7127,
"step": 331
},
{
"epoch": 0.16546224769499127,
"grad_norm": 0.10796276479959488,
"learning_rate": 0.0002,
"loss": 1.6799,
"step": 332
},
{
"epoch": 0.16596062795913283,
"grad_norm": 0.09546298533678055,
"learning_rate": 0.0002,
"loss": 1.736,
"step": 333
},
{
"epoch": 0.16645900822327436,
"grad_norm": 0.3045598864555359,
"learning_rate": 0.0002,
"loss": 1.6192,
"step": 334
},
{
"epoch": 0.1669573884874159,
"grad_norm": 0.10275569558143616,
"learning_rate": 0.0002,
"loss": 1.7551,
"step": 335
},
{
"epoch": 0.16745576875155743,
"grad_norm": 0.14451362192630768,
"learning_rate": 0.0002,
"loss": 1.7094,
"step": 336
},
{
"epoch": 0.16795414901569897,
"grad_norm": 0.0982123464345932,
"learning_rate": 0.0002,
"loss": 1.6996,
"step": 337
},
{
"epoch": 0.16845252927984053,
"grad_norm": 0.11521178483963013,
"learning_rate": 0.0002,
"loss": 1.6409,
"step": 338
},
{
"epoch": 0.16895090954398206,
"grad_norm": 0.2746621072292328,
"learning_rate": 0.0002,
"loss": 1.7035,
"step": 339
},
{
"epoch": 0.1694492898081236,
"grad_norm": 0.0955624207854271,
"learning_rate": 0.0002,
"loss": 1.6689,
"step": 340
},
{
"epoch": 0.16994767007226513,
"grad_norm": 0.10157962888479233,
"learning_rate": 0.0002,
"loss": 1.6561,
"step": 341
},
{
"epoch": 0.17044605033640667,
"grad_norm": 0.0971306711435318,
"learning_rate": 0.0002,
"loss": 1.7626,
"step": 342
},
{
"epoch": 0.17094443060054823,
"grad_norm": 0.10407841205596924,
"learning_rate": 0.0002,
"loss": 1.681,
"step": 343
},
{
"epoch": 0.17144281086468977,
"grad_norm": 0.09228493273258209,
"learning_rate": 0.0002,
"loss": 1.6196,
"step": 344
},
{
"epoch": 0.1719411911288313,
"grad_norm": 0.10309567302465439,
"learning_rate": 0.0002,
"loss": 1.6534,
"step": 345
},
{
"epoch": 0.17243957139297283,
"grad_norm": 0.10019028931856155,
"learning_rate": 0.0002,
"loss": 1.7315,
"step": 346
},
{
"epoch": 0.17293795165711437,
"grad_norm": 0.09051994234323502,
"learning_rate": 0.0002,
"loss": 1.6537,
"step": 347
},
{
"epoch": 0.17343633192125593,
"grad_norm": 0.09501929581165314,
"learning_rate": 0.0002,
"loss": 1.681,
"step": 348
},
{
"epoch": 0.17393471218539747,
"grad_norm": 0.09314325451850891,
"learning_rate": 0.0002,
"loss": 1.6141,
"step": 349
},
{
"epoch": 0.174433092449539,
"grad_norm": 0.09021347016096115,
"learning_rate": 0.0002,
"loss": 1.6864,
"step": 350
},
{
"epoch": 0.17493147271368054,
"grad_norm": 0.27376627922058105,
"learning_rate": 0.0002,
"loss": 1.7223,
"step": 351
},
{
"epoch": 0.17542985297782207,
"grad_norm": 0.11608853936195374,
"learning_rate": 0.0002,
"loss": 1.6974,
"step": 352
},
{
"epoch": 0.1759282332419636,
"grad_norm": 0.09565002471208572,
"learning_rate": 0.0002,
"loss": 1.6925,
"step": 353
},
{
"epoch": 0.17642661350610517,
"grad_norm": 0.10814974457025528,
"learning_rate": 0.0002,
"loss": 1.6349,
"step": 354
},
{
"epoch": 0.1769249937702467,
"grad_norm": 0.09551705420017242,
"learning_rate": 0.0002,
"loss": 1.6715,
"step": 355
},
{
"epoch": 0.17742337403438824,
"grad_norm": 0.10541266202926636,
"learning_rate": 0.0002,
"loss": 1.6592,
"step": 356
},
{
"epoch": 0.17792175429852977,
"grad_norm": 0.09884203970432281,
"learning_rate": 0.0002,
"loss": 1.638,
"step": 357
},
{
"epoch": 0.1784201345626713,
"grad_norm": 0.19244062900543213,
"learning_rate": 0.0002,
"loss": 1.6823,
"step": 358
},
{
"epoch": 0.17891851482681287,
"grad_norm": 0.1312815397977829,
"learning_rate": 0.0002,
"loss": 1.747,
"step": 359
},
{
"epoch": 0.1794168950909544,
"grad_norm": 0.10575084388256073,
"learning_rate": 0.0002,
"loss": 1.6958,
"step": 360
},
{
"epoch": 0.17991527535509594,
"grad_norm": 0.1993856579065323,
"learning_rate": 0.0002,
"loss": 1.5862,
"step": 361
},
{
"epoch": 0.18041365561923747,
"grad_norm": 0.1053745448589325,
"learning_rate": 0.0002,
"loss": 1.705,
"step": 362
},
{
"epoch": 0.180912035883379,
"grad_norm": 0.10017159581184387,
"learning_rate": 0.0002,
"loss": 1.6565,
"step": 363
},
{
"epoch": 0.18141041614752057,
"grad_norm": 0.12066628038883209,
"learning_rate": 0.0002,
"loss": 1.639,
"step": 364
},
{
"epoch": 0.1819087964116621,
"grad_norm": 0.12606841325759888,
"learning_rate": 0.0002,
"loss": 1.8435,
"step": 365
},
{
"epoch": 0.18240717667580364,
"grad_norm": 0.10491355508565903,
"learning_rate": 0.0002,
"loss": 1.5846,
"step": 366
},
{
"epoch": 0.18290555693994517,
"grad_norm": 0.10337149351835251,
"learning_rate": 0.0002,
"loss": 1.6903,
"step": 367
},
{
"epoch": 0.1834039372040867,
"grad_norm": 0.09452168643474579,
"learning_rate": 0.0002,
"loss": 1.6865,
"step": 368
},
{
"epoch": 0.18390231746822827,
"grad_norm": 0.09799271076917648,
"learning_rate": 0.0002,
"loss": 1.6343,
"step": 369
},
{
"epoch": 0.1844006977323698,
"grad_norm": 0.09442919492721558,
"learning_rate": 0.0002,
"loss": 1.6266,
"step": 370
},
{
"epoch": 0.18489907799651134,
"grad_norm": 0.09542658925056458,
"learning_rate": 0.0002,
"loss": 1.612,
"step": 371
},
{
"epoch": 0.18539745826065288,
"grad_norm": 0.0989847183227539,
"learning_rate": 0.0002,
"loss": 1.6957,
"step": 372
},
{
"epoch": 0.1858958385247944,
"grad_norm": 0.09289655089378357,
"learning_rate": 0.0002,
"loss": 1.6501,
"step": 373
},
{
"epoch": 0.18639421878893594,
"grad_norm": 0.10097731649875641,
"learning_rate": 0.0002,
"loss": 1.7114,
"step": 374
},
{
"epoch": 0.1868925990530775,
"grad_norm": 0.09352610260248184,
"learning_rate": 0.0002,
"loss": 1.7375,
"step": 375
},
{
"epoch": 0.18739097931721904,
"grad_norm": 0.0907459631562233,
"learning_rate": 0.0002,
"loss": 1.651,
"step": 376
},
{
"epoch": 0.18788935958136058,
"grad_norm": 0.0915813073515892,
"learning_rate": 0.0002,
"loss": 1.6289,
"step": 377
},
{
"epoch": 0.1883877398455021,
"grad_norm": 0.09011110663414001,
"learning_rate": 0.0002,
"loss": 1.7024,
"step": 378
},
{
"epoch": 0.18888612010964365,
"grad_norm": 0.4069153964519501,
"learning_rate": 0.0002,
"loss": 1.6647,
"step": 379
},
{
"epoch": 0.1893845003737852,
"grad_norm": 0.1351984292268753,
"learning_rate": 0.0002,
"loss": 1.7911,
"step": 380
},
{
"epoch": 0.18988288063792674,
"grad_norm": 0.537133514881134,
"learning_rate": 0.0002,
"loss": 1.75,
"step": 381
},
{
"epoch": 0.19038126090206828,
"grad_norm": 0.10901357978582382,
"learning_rate": 0.0002,
"loss": 1.6767,
"step": 382
},
{
"epoch": 0.1908796411662098,
"grad_norm": 0.19000430405139923,
"learning_rate": 0.0002,
"loss": 1.6682,
"step": 383
},
{
"epoch": 0.19137802143035135,
"grad_norm": 0.12100650370121002,
"learning_rate": 0.0002,
"loss": 1.6844,
"step": 384
},
{
"epoch": 0.1918764016944929,
"grad_norm": 0.12487197667360306,
"learning_rate": 0.0002,
"loss": 1.7239,
"step": 385
},
{
"epoch": 0.19237478195863444,
"grad_norm": 0.12008525431156158,
"learning_rate": 0.0002,
"loss": 1.6443,
"step": 386
},
{
"epoch": 0.19287316222277598,
"grad_norm": 0.119840107858181,
"learning_rate": 0.0002,
"loss": 1.6271,
"step": 387
},
{
"epoch": 0.1933715424869175,
"grad_norm": 0.1126130223274231,
"learning_rate": 0.0002,
"loss": 1.681,
"step": 388
},
{
"epoch": 0.19386992275105905,
"grad_norm": 0.11164896190166473,
"learning_rate": 0.0002,
"loss": 1.6586,
"step": 389
},
{
"epoch": 0.1943683030152006,
"grad_norm": 0.1496819108724594,
"learning_rate": 0.0002,
"loss": 1.6856,
"step": 390
},
{
"epoch": 0.19486668327934215,
"grad_norm": 0.09984704852104187,
"learning_rate": 0.0002,
"loss": 1.6656,
"step": 391
},
{
"epoch": 0.19536506354348368,
"grad_norm": 0.10864219069480896,
"learning_rate": 0.0002,
"loss": 1.659,
"step": 392
},
{
"epoch": 0.19586344380762521,
"grad_norm": 0.09744228422641754,
"learning_rate": 0.0002,
"loss": 1.6162,
"step": 393
},
{
"epoch": 0.19636182407176675,
"grad_norm": 0.11409466713666916,
"learning_rate": 0.0002,
"loss": 1.6646,
"step": 394
},
{
"epoch": 0.19686020433590828,
"grad_norm": 0.096027672290802,
"learning_rate": 0.0002,
"loss": 1.6464,
"step": 395
},
{
"epoch": 0.19735858460004985,
"grad_norm": 0.48993775248527527,
"learning_rate": 0.0002,
"loss": 1.7454,
"step": 396
},
{
"epoch": 0.19785696486419138,
"grad_norm": 0.11972647160291672,
"learning_rate": 0.0002,
"loss": 1.6958,
"step": 397
},
{
"epoch": 0.19835534512833292,
"grad_norm": 0.49595576524734497,
"learning_rate": 0.0002,
"loss": 1.6128,
"step": 398
},
{
"epoch": 0.19885372539247445,
"grad_norm": 0.11590411514043808,
"learning_rate": 0.0002,
"loss": 1.7173,
"step": 399
},
{
"epoch": 0.19935210565661599,
"grad_norm": 0.11584487557411194,
"learning_rate": 0.0002,
"loss": 1.6773,
"step": 400
},
{
"epoch": 0.19985048592075755,
"grad_norm": 0.1017480343580246,
"learning_rate": 0.0002,
"loss": 1.6388,
"step": 401
},
{
"epoch": 0.20034886618489908,
"grad_norm": 0.12011077255010605,
"learning_rate": 0.0002,
"loss": 1.707,
"step": 402
},
{
"epoch": 0.20084724644904062,
"grad_norm": 0.36016201972961426,
"learning_rate": 0.0002,
"loss": 1.8179,
"step": 403
},
{
"epoch": 0.20134562671318215,
"grad_norm": 0.11278028786182404,
"learning_rate": 0.0002,
"loss": 1.6733,
"step": 404
},
{
"epoch": 0.2018440069773237,
"grad_norm": 0.10928738862276077,
"learning_rate": 0.0002,
"loss": 1.6858,
"step": 405
},
{
"epoch": 0.20234238724146525,
"grad_norm": 0.10860306769609451,
"learning_rate": 0.0002,
"loss": 1.6975,
"step": 406
},
{
"epoch": 0.20284076750560678,
"grad_norm": 0.11352024972438812,
"learning_rate": 0.0002,
"loss": 1.7504,
"step": 407
},
{
"epoch": 0.20333914776974832,
"grad_norm": 0.10320567339658737,
"learning_rate": 0.0002,
"loss": 1.6715,
"step": 408
},
{
"epoch": 0.20383752803388985,
"grad_norm": 0.12056868523359299,
"learning_rate": 0.0002,
"loss": 1.7571,
"step": 409
},
{
"epoch": 0.2043359082980314,
"grad_norm": 0.11091714352369308,
"learning_rate": 0.0002,
"loss": 1.6391,
"step": 410
},
{
"epoch": 0.20483428856217295,
"grad_norm": 0.10888761281967163,
"learning_rate": 0.0002,
"loss": 1.6763,
"step": 411
},
{
"epoch": 0.20533266882631449,
"grad_norm": 0.2625375986099243,
"learning_rate": 0.0002,
"loss": 1.58,
"step": 412
},
{
"epoch": 0.20583104909045602,
"grad_norm": 0.12070990353822708,
"learning_rate": 0.0002,
"loss": 1.7437,
"step": 413
},
{
"epoch": 0.20632942935459755,
"grad_norm": 0.09670402854681015,
"learning_rate": 0.0002,
"loss": 1.6502,
"step": 414
},
{
"epoch": 0.2068278096187391,
"grad_norm": 0.10343360900878906,
"learning_rate": 0.0002,
"loss": 1.7273,
"step": 415
},
{
"epoch": 0.20732618988288065,
"grad_norm": 0.10445055365562439,
"learning_rate": 0.0002,
"loss": 1.674,
"step": 416
},
{
"epoch": 0.2078245701470222,
"grad_norm": 0.24325382709503174,
"learning_rate": 0.0002,
"loss": 1.7492,
"step": 417
},
{
"epoch": 0.20832295041116372,
"grad_norm": 0.10541153699159622,
"learning_rate": 0.0002,
"loss": 1.6389,
"step": 418
},
{
"epoch": 0.20882133067530526,
"grad_norm": 0.09688902646303177,
"learning_rate": 0.0002,
"loss": 1.7145,
"step": 419
},
{
"epoch": 0.2093197109394468,
"grad_norm": 0.10568691790103912,
"learning_rate": 0.0002,
"loss": 1.6699,
"step": 420
},
{
"epoch": 0.20981809120358832,
"grad_norm": 0.09683585166931152,
"learning_rate": 0.0002,
"loss": 1.6411,
"step": 421
},
{
"epoch": 0.2103164714677299,
"grad_norm": 0.10286644101142883,
"learning_rate": 0.0002,
"loss": 1.6951,
"step": 422
},
{
"epoch": 0.21081485173187142,
"grad_norm": 0.09786178171634674,
"learning_rate": 0.0002,
"loss": 1.6316,
"step": 423
},
{
"epoch": 0.21131323199601296,
"grad_norm": 0.10202211886644363,
"learning_rate": 0.0002,
"loss": 1.6702,
"step": 424
},
{
"epoch": 0.2118116122601545,
"grad_norm": 0.10444546490907669,
"learning_rate": 0.0002,
"loss": 1.6371,
"step": 425
},
{
"epoch": 0.21230999252429603,
"grad_norm": 0.09346964955329895,
"learning_rate": 0.0002,
"loss": 1.6638,
"step": 426
},
{
"epoch": 0.2128083727884376,
"grad_norm": 0.09578395634889603,
"learning_rate": 0.0002,
"loss": 1.622,
"step": 427
},
{
"epoch": 0.21330675305257912,
"grad_norm": 0.09412133693695068,
"learning_rate": 0.0002,
"loss": 1.6292,
"step": 428
},
{
"epoch": 0.21380513331672066,
"grad_norm": 0.49985215067863464,
"learning_rate": 0.0002,
"loss": 1.7932,
"step": 429
},
{
"epoch": 0.2143035135808622,
"grad_norm": 0.58636075258255,
"learning_rate": 0.0002,
"loss": 1.7671,
"step": 430
},
{
"epoch": 0.21480189384500373,
"grad_norm": 0.12334456294775009,
"learning_rate": 0.0002,
"loss": 1.6392,
"step": 431
},
{
"epoch": 0.2153002741091453,
"grad_norm": 0.13144731521606445,
"learning_rate": 0.0002,
"loss": 1.6686,
"step": 432
},
{
"epoch": 0.21579865437328682,
"grad_norm": 0.14804112911224365,
"learning_rate": 0.0002,
"loss": 1.7357,
"step": 433
},
{
"epoch": 0.21629703463742836,
"grad_norm": 0.7628450393676758,
"learning_rate": 0.0002,
"loss": 1.8465,
"step": 434
},
{
"epoch": 0.2167954149015699,
"grad_norm": 0.18024517595767975,
"learning_rate": 0.0002,
"loss": 1.6732,
"step": 435
},
{
"epoch": 0.21729379516571143,
"grad_norm": 0.195417121052742,
"learning_rate": 0.0002,
"loss": 1.7811,
"step": 436
},
{
"epoch": 0.217792175429853,
"grad_norm": 0.28199324011802673,
"learning_rate": 0.0002,
"loss": 1.6088,
"step": 437
},
{
"epoch": 0.21829055569399453,
"grad_norm": 0.15422897040843964,
"learning_rate": 0.0002,
"loss": 1.7555,
"step": 438
},
{
"epoch": 0.21878893595813606,
"grad_norm": 0.13214194774627686,
"learning_rate": 0.0002,
"loss": 1.6575,
"step": 439
},
{
"epoch": 0.2192873162222776,
"grad_norm": 0.14797765016555786,
"learning_rate": 0.0002,
"loss": 1.7903,
"step": 440
},
{
"epoch": 0.21978569648641913,
"grad_norm": 0.12424055486917496,
"learning_rate": 0.0002,
"loss": 1.7089,
"step": 441
},
{
"epoch": 0.22028407675056066,
"grad_norm": 0.5921161770820618,
"learning_rate": 0.0002,
"loss": 1.7352,
"step": 442
},
{
"epoch": 0.22078245701470223,
"grad_norm": 0.1724957525730133,
"learning_rate": 0.0002,
"loss": 1.7427,
"step": 443
},
{
"epoch": 0.22128083727884376,
"grad_norm": 0.1341264247894287,
"learning_rate": 0.0002,
"loss": 1.6738,
"step": 444
},
{
"epoch": 0.2217792175429853,
"grad_norm": 0.43373820185661316,
"learning_rate": 0.0002,
"loss": 1.7591,
"step": 445
},
{
"epoch": 0.22227759780712683,
"grad_norm": 0.15030571818351746,
"learning_rate": 0.0002,
"loss": 1.7306,
"step": 446
},
{
"epoch": 0.22277597807126837,
"grad_norm": 0.15096893906593323,
"learning_rate": 0.0002,
"loss": 1.7637,
"step": 447
},
{
"epoch": 0.22327435833540993,
"grad_norm": 0.1577889323234558,
"learning_rate": 0.0002,
"loss": 1.6704,
"step": 448
},
{
"epoch": 0.22377273859955146,
"grad_norm": 0.11596284061670303,
"learning_rate": 0.0002,
"loss": 1.5843,
"step": 449
},
{
"epoch": 0.224271118863693,
"grad_norm": 0.14083531498908997,
"learning_rate": 0.0002,
"loss": 1.6502,
"step": 450
},
{
"epoch": 0.22476949912783453,
"grad_norm": 0.11369968950748444,
"learning_rate": 0.0002,
"loss": 1.7063,
"step": 451
},
{
"epoch": 0.22526787939197607,
"grad_norm": 0.12249240279197693,
"learning_rate": 0.0002,
"loss": 1.6041,
"step": 452
},
{
"epoch": 0.22576625965611763,
"grad_norm": 0.13246704638004303,
"learning_rate": 0.0002,
"loss": 1.7227,
"step": 453
},
{
"epoch": 0.22626463992025916,
"grad_norm": 0.15372870862483978,
"learning_rate": 0.0002,
"loss": 1.7364,
"step": 454
},
{
"epoch": 0.2267630201844007,
"grad_norm": 0.10773339122533798,
"learning_rate": 0.0002,
"loss": 1.6797,
"step": 455
},
{
"epoch": 0.22726140044854223,
"grad_norm": 0.10603539645671844,
"learning_rate": 0.0002,
"loss": 1.6608,
"step": 456
},
{
"epoch": 0.22775978071268377,
"grad_norm": 0.11118324100971222,
"learning_rate": 0.0002,
"loss": 1.6659,
"step": 457
},
{
"epoch": 0.22825816097682533,
"grad_norm": 0.10193316638469696,
"learning_rate": 0.0002,
"loss": 1.7149,
"step": 458
},
{
"epoch": 0.22875654124096687,
"grad_norm": 0.118270143866539,
"learning_rate": 0.0002,
"loss": 1.6581,
"step": 459
},
{
"epoch": 0.2292549215051084,
"grad_norm": 0.09839551895856857,
"learning_rate": 0.0002,
"loss": 1.6906,
"step": 460
},
{
"epoch": 0.22975330176924993,
"grad_norm": 0.10430920869112015,
"learning_rate": 0.0002,
"loss": 1.6367,
"step": 461
},
{
"epoch": 0.23025168203339147,
"grad_norm": 0.7883297204971313,
"learning_rate": 0.0002,
"loss": 1.8726,
"step": 462
},
{
"epoch": 0.230750062297533,
"grad_norm": 0.14015096426010132,
"learning_rate": 0.0002,
"loss": 1.6885,
"step": 463
},
{
"epoch": 0.23124844256167457,
"grad_norm": 0.6940969824790955,
"learning_rate": 0.0002,
"loss": 1.8366,
"step": 464
},
{
"epoch": 0.2317468228258161,
"grad_norm": 0.16839167475700378,
"learning_rate": 0.0002,
"loss": 1.6627,
"step": 465
},
{
"epoch": 0.23224520308995764,
"grad_norm": 0.14831361174583435,
"learning_rate": 0.0002,
"loss": 1.6192,
"step": 466
},
{
"epoch": 0.23274358335409917,
"grad_norm": 0.6374949216842651,
"learning_rate": 0.0002,
"loss": 1.8086,
"step": 467
},
{
"epoch": 0.2332419636182407,
"grad_norm": 0.1442909985780716,
"learning_rate": 0.0002,
"loss": 1.6875,
"step": 468
},
{
"epoch": 0.23374034388238227,
"grad_norm": 0.15487882494926453,
"learning_rate": 0.0002,
"loss": 1.6939,
"step": 469
},
{
"epoch": 0.2342387241465238,
"grad_norm": 0.133474662899971,
"learning_rate": 0.0002,
"loss": 1.6011,
"step": 470
},
{
"epoch": 0.23473710441066534,
"grad_norm": 0.15738508105278015,
"learning_rate": 0.0002,
"loss": 1.6801,
"step": 471
},
{
"epoch": 0.23523548467480687,
"grad_norm": 0.13371291756629944,
"learning_rate": 0.0002,
"loss": 1.6454,
"step": 472
},
{
"epoch": 0.2357338649389484,
"grad_norm": 0.12480079382658005,
"learning_rate": 0.0002,
"loss": 1.613,
"step": 473
},
{
"epoch": 0.23623224520308997,
"grad_norm": 0.138162761926651,
"learning_rate": 0.0002,
"loss": 1.6844,
"step": 474
},
{
"epoch": 0.2367306254672315,
"grad_norm": 0.13453134894371033,
"learning_rate": 0.0002,
"loss": 1.7113,
"step": 475
},
{
"epoch": 0.23722900573137304,
"grad_norm": 0.11864453554153442,
"learning_rate": 0.0002,
"loss": 1.7311,
"step": 476
},
{
"epoch": 0.23772738599551457,
"grad_norm": 0.3905930817127228,
"learning_rate": 0.0002,
"loss": 1.7638,
"step": 477
},
{
"epoch": 0.2382257662596561,
"grad_norm": 0.1613403707742691,
"learning_rate": 0.0002,
"loss": 1.6413,
"step": 478
},
{
"epoch": 0.23872414652379767,
"grad_norm": 0.13828811049461365,
"learning_rate": 0.0002,
"loss": 1.7163,
"step": 479
},
{
"epoch": 0.2392225267879392,
"grad_norm": 0.13535858690738678,
"learning_rate": 0.0002,
"loss": 1.6059,
"step": 480
},
{
"epoch": 0.23972090705208074,
"grad_norm": 0.15594834089279175,
"learning_rate": 0.0002,
"loss": 1.7161,
"step": 481
},
{
"epoch": 0.24021928731622227,
"grad_norm": 0.11990589648485184,
"learning_rate": 0.0002,
"loss": 1.7051,
"step": 482
},
{
"epoch": 0.2407176675803638,
"grad_norm": 0.11655411124229431,
"learning_rate": 0.0002,
"loss": 1.6711,
"step": 483
},
{
"epoch": 0.24121604784450537,
"grad_norm": 0.11754405498504639,
"learning_rate": 0.0002,
"loss": 1.7237,
"step": 484
},
{
"epoch": 0.2417144281086469,
"grad_norm": 0.1332051157951355,
"learning_rate": 0.0002,
"loss": 1.7598,
"step": 485
},
{
"epoch": 0.24221280837278844,
"grad_norm": 0.10240749269723892,
"learning_rate": 0.0002,
"loss": 1.6356,
"step": 486
},
{
"epoch": 0.24271118863692998,
"grad_norm": 0.1425447165966034,
"learning_rate": 0.0002,
"loss": 1.7993,
"step": 487
},
{
"epoch": 0.2432095689010715,
"grad_norm": 0.10178319364786148,
"learning_rate": 0.0002,
"loss": 1.6705,
"step": 488
},
{
"epoch": 0.24370794916521304,
"grad_norm": 0.354878306388855,
"learning_rate": 0.0002,
"loss": 1.7251,
"step": 489
},
{
"epoch": 0.2442063294293546,
"grad_norm": 0.10244394838809967,
"learning_rate": 0.0002,
"loss": 1.5874,
"step": 490
},
{
"epoch": 0.24470470969349614,
"grad_norm": 0.10944903641939163,
"learning_rate": 0.0002,
"loss": 1.5817,
"step": 491
},
{
"epoch": 0.24520308995763768,
"grad_norm": 0.11182764172554016,
"learning_rate": 0.0002,
"loss": 1.6859,
"step": 492
},
{
"epoch": 0.2457014702217792,
"grad_norm": 0.11066277325153351,
"learning_rate": 0.0002,
"loss": 1.6275,
"step": 493
},
{
"epoch": 0.24619985048592075,
"grad_norm": 0.6789163947105408,
"learning_rate": 0.0002,
"loss": 1.8408,
"step": 494
},
{
"epoch": 0.2466982307500623,
"grad_norm": 0.15237462520599365,
"learning_rate": 0.0002,
"loss": 1.5969,
"step": 495
},
{
"epoch": 0.24719661101420384,
"grad_norm": 0.14016127586364746,
"learning_rate": 0.0002,
"loss": 1.6325,
"step": 496
},
{
"epoch": 0.24769499127834538,
"grad_norm": 0.12557458877563477,
"learning_rate": 0.0002,
"loss": 1.6745,
"step": 497
},
{
"epoch": 0.2481933715424869,
"grad_norm": 0.12593714892864227,
"learning_rate": 0.0002,
"loss": 1.7337,
"step": 498
},
{
"epoch": 0.24869175180662845,
"grad_norm": 0.12869895994663239,
"learning_rate": 0.0002,
"loss": 1.6982,
"step": 499
},
{
"epoch": 0.24919013207077,
"grad_norm": 0.6727408766746521,
"learning_rate": 0.0002,
"loss": 1.7735,
"step": 500
},
{
"epoch": 0.24968851233491154,
"grad_norm": 0.18164046108722687,
"learning_rate": 0.0002,
"loss": 1.7327,
"step": 501
},
{
"epoch": 0.2501868925990531,
"grad_norm": 0.12988890707492828,
"learning_rate": 0.0002,
"loss": 1.6335,
"step": 502
},
{
"epoch": 0.25068527286319464,
"grad_norm": 0.14229950308799744,
"learning_rate": 0.0002,
"loss": 1.6705,
"step": 503
},
{
"epoch": 0.25118365312733615,
"grad_norm": 0.12232649326324463,
"learning_rate": 0.0002,
"loss": 1.5992,
"step": 504
},
{
"epoch": 0.2516820333914777,
"grad_norm": 0.12053592503070831,
"learning_rate": 0.0002,
"loss": 1.5962,
"step": 505
},
{
"epoch": 0.2521804136556192,
"grad_norm": 0.12370762974023819,
"learning_rate": 0.0002,
"loss": 1.6675,
"step": 506
},
{
"epoch": 0.2526787939197608,
"grad_norm": 0.11628440022468567,
"learning_rate": 0.0002,
"loss": 1.6743,
"step": 507
},
{
"epoch": 0.25317717418390234,
"grad_norm": 0.1284741759300232,
"learning_rate": 0.0002,
"loss": 1.6903,
"step": 508
},
{
"epoch": 0.25367555444804385,
"grad_norm": 0.133184552192688,
"learning_rate": 0.0002,
"loss": 1.6735,
"step": 509
},
{
"epoch": 0.2541739347121854,
"grad_norm": 0.11966334283351898,
"learning_rate": 0.0002,
"loss": 1.6323,
"step": 510
},
{
"epoch": 0.2546723149763269,
"grad_norm": 0.12117716670036316,
"learning_rate": 0.0002,
"loss": 1.6458,
"step": 511
},
{
"epoch": 0.2551706952404685,
"grad_norm": 0.11778345704078674,
"learning_rate": 0.0002,
"loss": 1.6272,
"step": 512
},
{
"epoch": 0.25566907550461004,
"grad_norm": 0.11609595268964767,
"learning_rate": 0.0002,
"loss": 1.6588,
"step": 513
},
{
"epoch": 0.25616745576875155,
"grad_norm": 0.11605001240968704,
"learning_rate": 0.0002,
"loss": 1.6666,
"step": 514
},
{
"epoch": 0.2566658360328931,
"grad_norm": 0.10593124479055405,
"learning_rate": 0.0002,
"loss": 1.6628,
"step": 515
},
{
"epoch": 0.2571642162970346,
"grad_norm": 0.11132659763097763,
"learning_rate": 0.0002,
"loss": 1.7112,
"step": 516
},
{
"epoch": 0.2576625965611762,
"grad_norm": 0.09980247169733047,
"learning_rate": 0.0002,
"loss": 1.6759,
"step": 517
},
{
"epoch": 0.25816097682531775,
"grad_norm": 0.6143377423286438,
"learning_rate": 0.0002,
"loss": 1.6616,
"step": 518
},
{
"epoch": 0.25865935708945925,
"grad_norm": 0.11244726181030273,
"learning_rate": 0.0002,
"loss": 1.7124,
"step": 519
},
{
"epoch": 0.2591577373536008,
"grad_norm": 0.6190444827079773,
"learning_rate": 0.0002,
"loss": 1.7698,
"step": 520
},
{
"epoch": 0.2596561176177423,
"grad_norm": 0.7441633939743042,
"learning_rate": 0.0002,
"loss": 1.8182,
"step": 521
},
{
"epoch": 0.2601544978818839,
"grad_norm": 0.13578347861766815,
"learning_rate": 0.0002,
"loss": 1.6609,
"step": 522
},
{
"epoch": 0.2606528781460254,
"grad_norm": 0.1662416160106659,
"learning_rate": 0.0002,
"loss": 1.7167,
"step": 523
},
{
"epoch": 0.26115125841016695,
"grad_norm": 0.16020916402339935,
"learning_rate": 0.0002,
"loss": 1.6636,
"step": 524
},
{
"epoch": 0.2616496386743085,
"grad_norm": 0.12748084962368011,
"learning_rate": 0.0002,
"loss": 1.6832,
"step": 525
},
{
"epoch": 0.26214801893845,
"grad_norm": 0.13277047872543335,
"learning_rate": 0.0002,
"loss": 1.682,
"step": 526
},
{
"epoch": 0.2626463992025916,
"grad_norm": 0.11746570467948914,
"learning_rate": 0.0002,
"loss": 1.6567,
"step": 527
},
{
"epoch": 0.2631447794667331,
"grad_norm": 0.1124933585524559,
"learning_rate": 0.0002,
"loss": 1.6462,
"step": 528
},
{
"epoch": 0.26364315973087465,
"grad_norm": 0.13045774400234222,
"learning_rate": 0.0002,
"loss": 1.7247,
"step": 529
},
{
"epoch": 0.2641415399950162,
"grad_norm": 0.11953026801347733,
"learning_rate": 0.0002,
"loss": 1.6896,
"step": 530
},
{
"epoch": 0.2646399202591577,
"grad_norm": 0.3236943185329437,
"learning_rate": 0.0002,
"loss": 1.6562,
"step": 531
},
{
"epoch": 0.2651383005232993,
"grad_norm": 0.13000494241714478,
"learning_rate": 0.0002,
"loss": 1.6329,
"step": 532
},
{
"epoch": 0.2656366807874408,
"grad_norm": 0.13072949647903442,
"learning_rate": 0.0002,
"loss": 1.6584,
"step": 533
},
{
"epoch": 0.26613506105158236,
"grad_norm": 0.30452999472618103,
"learning_rate": 0.0002,
"loss": 1.6066,
"step": 534
},
{
"epoch": 0.2666334413157239,
"grad_norm": 0.11118455231189728,
"learning_rate": 0.0002,
"loss": 1.6874,
"step": 535
},
{
"epoch": 0.2671318215798654,
"grad_norm": 0.12459013611078262,
"learning_rate": 0.0002,
"loss": 1.6959,
"step": 536
},
{
"epoch": 0.267630201844007,
"grad_norm": 0.10970738530158997,
"learning_rate": 0.0002,
"loss": 1.6167,
"step": 537
},
{
"epoch": 0.2681285821081485,
"grad_norm": 0.1440659761428833,
"learning_rate": 0.0002,
"loss": 1.7254,
"step": 538
},
{
"epoch": 0.26862696237229006,
"grad_norm": 0.11448108404874802,
"learning_rate": 0.0002,
"loss": 1.6896,
"step": 539
},
{
"epoch": 0.2691253426364316,
"grad_norm": 0.11026275157928467,
"learning_rate": 0.0002,
"loss": 1.6675,
"step": 540
},
{
"epoch": 0.2696237229005731,
"grad_norm": 0.10443202406167984,
"learning_rate": 0.0002,
"loss": 1.7035,
"step": 541
},
{
"epoch": 0.2701221031647147,
"grad_norm": 0.11404629796743393,
"learning_rate": 0.0002,
"loss": 1.727,
"step": 542
},
{
"epoch": 0.2706204834288562,
"grad_norm": 0.12783807516098022,
"learning_rate": 0.0002,
"loss": 1.7468,
"step": 543
},
{
"epoch": 0.27111886369299776,
"grad_norm": 0.1040879487991333,
"learning_rate": 0.0002,
"loss": 1.642,
"step": 544
},
{
"epoch": 0.2716172439571393,
"grad_norm": 0.10120297223329544,
"learning_rate": 0.0002,
"loss": 1.6792,
"step": 545
},
{
"epoch": 0.2721156242212808,
"grad_norm": 0.11116039007902145,
"learning_rate": 0.0002,
"loss": 1.6685,
"step": 546
},
{
"epoch": 0.2726140044854224,
"grad_norm": 0.353816956281662,
"learning_rate": 0.0002,
"loss": 1.7458,
"step": 547
},
{
"epoch": 0.2731123847495639,
"grad_norm": 0.10361409932374954,
"learning_rate": 0.0002,
"loss": 1.583,
"step": 548
},
{
"epoch": 0.27361076501370546,
"grad_norm": 0.10164079070091248,
"learning_rate": 0.0002,
"loss": 1.7219,
"step": 549
},
{
"epoch": 0.274109145277847,
"grad_norm": 0.3576943278312683,
"learning_rate": 0.0002,
"loss": 1.7155,
"step": 550
},
{
"epoch": 0.27460752554198853,
"grad_norm": 0.1307370960712433,
"learning_rate": 0.0002,
"loss": 1.6491,
"step": 551
},
{
"epoch": 0.2751059058061301,
"grad_norm": 0.11267419159412384,
"learning_rate": 0.0002,
"loss": 1.6299,
"step": 552
},
{
"epoch": 0.2756042860702716,
"grad_norm": 0.10955934971570969,
"learning_rate": 0.0002,
"loss": 1.6972,
"step": 553
},
{
"epoch": 0.27610266633441316,
"grad_norm": 0.3629993796348572,
"learning_rate": 0.0002,
"loss": 1.6558,
"step": 554
},
{
"epoch": 0.2766010465985547,
"grad_norm": 0.10678595304489136,
"learning_rate": 0.0002,
"loss": 1.7133,
"step": 555
},
{
"epoch": 0.27709942686269623,
"grad_norm": 0.3551732301712036,
"learning_rate": 0.0002,
"loss": 1.7884,
"step": 556
},
{
"epoch": 0.2775978071268378,
"grad_norm": 0.1157960370182991,
"learning_rate": 0.0002,
"loss": 1.6664,
"step": 557
},
{
"epoch": 0.2780961873909793,
"grad_norm": 0.4219015836715698,
"learning_rate": 0.0002,
"loss": 1.6258,
"step": 558
},
{
"epoch": 0.27859456765512086,
"grad_norm": 0.1442400962114334,
"learning_rate": 0.0002,
"loss": 1.7081,
"step": 559
},
{
"epoch": 0.2790929479192624,
"grad_norm": 0.12307796627283096,
"learning_rate": 0.0002,
"loss": 1.5812,
"step": 560
},
{
"epoch": 0.27959132818340393,
"grad_norm": 0.13523195683956146,
"learning_rate": 0.0002,
"loss": 1.6644,
"step": 561
},
{
"epoch": 0.2800897084475455,
"grad_norm": 0.14576253294944763,
"learning_rate": 0.0002,
"loss": 1.6724,
"step": 562
},
{
"epoch": 0.280588088711687,
"grad_norm": 0.1239597350358963,
"learning_rate": 0.0002,
"loss": 1.6501,
"step": 563
},
{
"epoch": 0.28108646897582856,
"grad_norm": 0.11444118618965149,
"learning_rate": 0.0002,
"loss": 1.6218,
"step": 564
},
{
"epoch": 0.28158484923997007,
"grad_norm": 0.11568321287631989,
"learning_rate": 0.0002,
"loss": 1.622,
"step": 565
},
{
"epoch": 0.28208322950411163,
"grad_norm": 0.1155436560511589,
"learning_rate": 0.0002,
"loss": 1.6856,
"step": 566
},
{
"epoch": 0.2825816097682532,
"grad_norm": 0.10945037007331848,
"learning_rate": 0.0002,
"loss": 1.5764,
"step": 567
},
{
"epoch": 0.2830799900323947,
"grad_norm": 0.5043824315071106,
"learning_rate": 0.0002,
"loss": 1.7022,
"step": 568
},
{
"epoch": 0.28357837029653626,
"grad_norm": 0.7879558801651001,
"learning_rate": 0.0002,
"loss": 1.8313,
"step": 569
},
{
"epoch": 0.28407675056067777,
"grad_norm": 0.13888636231422424,
"learning_rate": 0.0002,
"loss": 1.6418,
"step": 570
},
{
"epoch": 0.28457513082481933,
"grad_norm": 0.16137146949768066,
"learning_rate": 0.0002,
"loss": 1.6884,
"step": 571
},
{
"epoch": 0.2850735110889609,
"grad_norm": 0.2237291783094406,
"learning_rate": 0.0002,
"loss": 1.7934,
"step": 572
},
{
"epoch": 0.2855718913531024,
"grad_norm": 0.14624369144439697,
"learning_rate": 0.0002,
"loss": 1.676,
"step": 573
},
{
"epoch": 0.28607027161724397,
"grad_norm": 0.1463831216096878,
"learning_rate": 0.0002,
"loss": 1.5869,
"step": 574
},
{
"epoch": 0.28656865188138547,
"grad_norm": 0.14725126326084137,
"learning_rate": 0.0002,
"loss": 1.632,
"step": 575
},
{
"epoch": 0.28706703214552703,
"grad_norm": 0.13732214272022247,
"learning_rate": 0.0002,
"loss": 1.7513,
"step": 576
},
{
"epoch": 0.2875654124096686,
"grad_norm": 0.14334504306316376,
"learning_rate": 0.0002,
"loss": 1.6318,
"step": 577
},
{
"epoch": 0.2880637926738101,
"grad_norm": 0.8194677829742432,
"learning_rate": 0.0002,
"loss": 1.8945,
"step": 578
},
{
"epoch": 0.28856217293795167,
"grad_norm": 0.1749170422554016,
"learning_rate": 0.0002,
"loss": 1.6608,
"step": 579
},
{
"epoch": 0.2890605532020932,
"grad_norm": 0.12977321445941925,
"learning_rate": 0.0002,
"loss": 1.6363,
"step": 580
},
{
"epoch": 0.28955893346623474,
"grad_norm": 0.2908933162689209,
"learning_rate": 0.0002,
"loss": 1.8448,
"step": 581
},
{
"epoch": 0.2900573137303763,
"grad_norm": 0.17108629643917084,
"learning_rate": 0.0002,
"loss": 1.6822,
"step": 582
},
{
"epoch": 0.2905556939945178,
"grad_norm": 0.14702463150024414,
"learning_rate": 0.0002,
"loss": 1.7491,
"step": 583
},
{
"epoch": 0.29105407425865937,
"grad_norm": 0.12582743167877197,
"learning_rate": 0.0002,
"loss": 1.6245,
"step": 584
},
{
"epoch": 0.2915524545228009,
"grad_norm": 0.14732137322425842,
"learning_rate": 0.0002,
"loss": 1.6916,
"step": 585
},
{
"epoch": 0.29205083478694244,
"grad_norm": 0.12849657237529755,
"learning_rate": 0.0002,
"loss": 1.6583,
"step": 586
},
{
"epoch": 0.292549215051084,
"grad_norm": 0.11466097086668015,
"learning_rate": 0.0002,
"loss": 1.6306,
"step": 587
},
{
"epoch": 0.2930475953152255,
"grad_norm": 0.12361207604408264,
"learning_rate": 0.0002,
"loss": 1.6765,
"step": 588
},
{
"epoch": 0.29354597557936707,
"grad_norm": 0.1265360414981842,
"learning_rate": 0.0002,
"loss": 1.667,
"step": 589
},
{
"epoch": 0.2940443558435086,
"grad_norm": 0.11903838813304901,
"learning_rate": 0.0002,
"loss": 1.6567,
"step": 590
},
{
"epoch": 0.29454273610765014,
"grad_norm": 0.8345243334770203,
"learning_rate": 0.0002,
"loss": 1.6467,
"step": 591
},
{
"epoch": 0.2950411163717917,
"grad_norm": 0.1365821361541748,
"learning_rate": 0.0002,
"loss": 1.7028,
"step": 592
},
{
"epoch": 0.2955394966359332,
"grad_norm": 0.13564884662628174,
"learning_rate": 0.0002,
"loss": 1.6129,
"step": 593
},
{
"epoch": 0.29603787690007477,
"grad_norm": 0.13604499399662018,
"learning_rate": 0.0002,
"loss": 1.7387,
"step": 594
},
{
"epoch": 0.2965362571642163,
"grad_norm": 0.12102136015892029,
"learning_rate": 0.0002,
"loss": 1.632,
"step": 595
},
{
"epoch": 0.29703463742835784,
"grad_norm": 0.11927222460508347,
"learning_rate": 0.0002,
"loss": 1.7149,
"step": 596
},
{
"epoch": 0.2975330176924994,
"grad_norm": 0.10716401040554047,
"learning_rate": 0.0002,
"loss": 1.6268,
"step": 597
},
{
"epoch": 0.2980313979566409,
"grad_norm": 0.12001641094684601,
"learning_rate": 0.0002,
"loss": 1.6879,
"step": 598
},
{
"epoch": 0.29852977822078247,
"grad_norm": 0.11045756936073303,
"learning_rate": 0.0002,
"loss": 1.6871,
"step": 599
},
{
"epoch": 0.299028158484924,
"grad_norm": 0.7450900077819824,
"learning_rate": 0.0002,
"loss": 1.8146,
"step": 600
},
{
"epoch": 0.29952653874906554,
"grad_norm": 0.16306158900260925,
"learning_rate": 0.0002,
"loss": 1.7092,
"step": 601
},
{
"epoch": 0.3000249190132071,
"grad_norm": 0.43425318598747253,
"learning_rate": 0.0002,
"loss": 1.7405,
"step": 602
},
{
"epoch": 0.3005232992773486,
"grad_norm": 0.16279961168766022,
"learning_rate": 0.0002,
"loss": 1.6,
"step": 603
},
{
"epoch": 0.3010216795414902,
"grad_norm": 0.1403011977672577,
"learning_rate": 0.0002,
"loss": 1.5979,
"step": 604
},
{
"epoch": 0.3015200598056317,
"grad_norm": 0.13146822154521942,
"learning_rate": 0.0002,
"loss": 1.5689,
"step": 605
},
{
"epoch": 0.30201844006977324,
"grad_norm": 0.15902653336524963,
"learning_rate": 0.0002,
"loss": 1.6664,
"step": 606
},
{
"epoch": 0.3025168203339148,
"grad_norm": 0.12351160496473312,
"learning_rate": 0.0002,
"loss": 1.714,
"step": 607
},
{
"epoch": 0.3030152005980563,
"grad_norm": 0.1543518602848053,
"learning_rate": 0.0002,
"loss": 1.6432,
"step": 608
},
{
"epoch": 0.3035135808621979,
"grad_norm": 0.11827117949724197,
"learning_rate": 0.0002,
"loss": 1.6325,
"step": 609
},
{
"epoch": 0.3040119611263394,
"grad_norm": 0.5559304356575012,
"learning_rate": 0.0002,
"loss": 1.6789,
"step": 610
},
{
"epoch": 0.30451034139048094,
"grad_norm": 0.13763754069805145,
"learning_rate": 0.0002,
"loss": 1.6715,
"step": 611
},
{
"epoch": 0.30500872165462245,
"grad_norm": 0.12646999955177307,
"learning_rate": 0.0002,
"loss": 1.7162,
"step": 612
},
{
"epoch": 0.305507101918764,
"grad_norm": 0.34849414229393005,
"learning_rate": 0.0002,
"loss": 1.6708,
"step": 613
},
{
"epoch": 0.3060054821829056,
"grad_norm": 0.11648757755756378,
"learning_rate": 0.0002,
"loss": 1.646,
"step": 614
},
{
"epoch": 0.3065038624470471,
"grad_norm": 0.13477148115634918,
"learning_rate": 0.0002,
"loss": 1.6502,
"step": 615
},
{
"epoch": 0.30700224271118864,
"grad_norm": 0.1102217361330986,
"learning_rate": 0.0002,
"loss": 1.6729,
"step": 616
},
{
"epoch": 0.30750062297533015,
"grad_norm": 0.5752671957015991,
"learning_rate": 0.0002,
"loss": 1.6233,
"step": 617
},
{
"epoch": 0.3079990032394717,
"grad_norm": 0.13107599318027496,
"learning_rate": 0.0002,
"loss": 1.6636,
"step": 618
},
{
"epoch": 0.3084973835036133,
"grad_norm": 0.11860768496990204,
"learning_rate": 0.0002,
"loss": 1.7313,
"step": 619
},
{
"epoch": 0.3089957637677548,
"grad_norm": 0.1229948177933693,
"learning_rate": 0.0002,
"loss": 1.6327,
"step": 620
},
{
"epoch": 0.30949414403189635,
"grad_norm": 0.30836552381515503,
"learning_rate": 0.0002,
"loss": 1.6969,
"step": 621
},
{
"epoch": 0.30999252429603785,
"grad_norm": 0.11798208951950073,
"learning_rate": 0.0002,
"loss": 1.7364,
"step": 622
},
{
"epoch": 0.3104909045601794,
"grad_norm": 0.4807080030441284,
"learning_rate": 0.0002,
"loss": 1.6899,
"step": 623
},
{
"epoch": 0.310989284824321,
"grad_norm": 0.1726754605770111,
"learning_rate": 0.0002,
"loss": 1.8045,
"step": 624
},
{
"epoch": 0.3114876650884625,
"grad_norm": 0.13296914100646973,
"learning_rate": 0.0002,
"loss": 1.6966,
"step": 625
},
{
"epoch": 0.31198604535260405,
"grad_norm": 0.14966656267642975,
"learning_rate": 0.0002,
"loss": 1.6685,
"step": 626
},
{
"epoch": 0.31248442561674555,
"grad_norm": 0.3757789731025696,
"learning_rate": 0.0002,
"loss": 1.7225,
"step": 627
},
{
"epoch": 0.3129828058808871,
"grad_norm": 0.1234004870057106,
"learning_rate": 0.0002,
"loss": 1.6204,
"step": 628
},
{
"epoch": 0.3134811861450287,
"grad_norm": 0.12280552089214325,
"learning_rate": 0.0002,
"loss": 1.6913,
"step": 629
},
{
"epoch": 0.3139795664091702,
"grad_norm": 0.12360548228025436,
"learning_rate": 0.0002,
"loss": 1.6808,
"step": 630
},
{
"epoch": 0.31447794667331175,
"grad_norm": 0.1292014867067337,
"learning_rate": 0.0002,
"loss": 1.6697,
"step": 631
},
{
"epoch": 0.31497632693745325,
"grad_norm": 0.11038494855165482,
"learning_rate": 0.0002,
"loss": 1.6103,
"step": 632
},
{
"epoch": 0.3154747072015948,
"grad_norm": 0.11607655137777328,
"learning_rate": 0.0002,
"loss": 1.6241,
"step": 633
},
{
"epoch": 0.3159730874657364,
"grad_norm": 0.10514742881059647,
"learning_rate": 0.0002,
"loss": 1.6922,
"step": 634
},
{
"epoch": 0.3164714677298779,
"grad_norm": 0.107606902718544,
"learning_rate": 0.0002,
"loss": 1.6975,
"step": 635
},
{
"epoch": 0.31696984799401945,
"grad_norm": 0.20367765426635742,
"learning_rate": 0.0002,
"loss": 1.5704,
"step": 636
},
{
"epoch": 0.31746822825816096,
"grad_norm": 0.10455407947301865,
"learning_rate": 0.0002,
"loss": 1.7109,
"step": 637
},
{
"epoch": 0.3179666085223025,
"grad_norm": 0.48424893617630005,
"learning_rate": 0.0002,
"loss": 1.5871,
"step": 638
},
{
"epoch": 0.3184649887864441,
"grad_norm": 0.16340336203575134,
"learning_rate": 0.0002,
"loss": 1.6856,
"step": 639
},
{
"epoch": 0.3189633690505856,
"grad_norm": 0.1317445933818817,
"learning_rate": 0.0002,
"loss": 1.6904,
"step": 640
},
{
"epoch": 0.31946174931472715,
"grad_norm": 0.12784677743911743,
"learning_rate": 0.0002,
"loss": 1.6983,
"step": 641
},
{
"epoch": 0.31996012957886866,
"grad_norm": 0.10745134204626083,
"learning_rate": 0.0002,
"loss": 1.6353,
"step": 642
},
{
"epoch": 0.3204585098430102,
"grad_norm": 0.1444125920534134,
"learning_rate": 0.0002,
"loss": 1.7109,
"step": 643
},
{
"epoch": 0.3209568901071518,
"grad_norm": 0.3750239908695221,
"learning_rate": 0.0002,
"loss": 1.6571,
"step": 644
},
{
"epoch": 0.3214552703712933,
"grad_norm": 0.11034873872995377,
"learning_rate": 0.0002,
"loss": 1.6547,
"step": 645
},
{
"epoch": 0.32195365063543485,
"grad_norm": 0.10759663581848145,
"learning_rate": 0.0002,
"loss": 1.628,
"step": 646
},
{
"epoch": 0.32245203089957636,
"grad_norm": 0.11017131060361862,
"learning_rate": 0.0002,
"loss": 1.6877,
"step": 647
},
{
"epoch": 0.3229504111637179,
"grad_norm": 0.1253817230463028,
"learning_rate": 0.0002,
"loss": 1.7226,
"step": 648
},
{
"epoch": 0.3234487914278595,
"grad_norm": 0.5153695344924927,
"learning_rate": 0.0002,
"loss": 1.7687,
"step": 649
},
{
"epoch": 0.323947171692001,
"grad_norm": 0.11948184669017792,
"learning_rate": 0.0002,
"loss": 1.7044,
"step": 650
},
{
"epoch": 0.32444555195614255,
"grad_norm": 0.11249465495347977,
"learning_rate": 0.0002,
"loss": 1.6282,
"step": 651
},
{
"epoch": 0.32494393222028406,
"grad_norm": 0.11555810272693634,
"learning_rate": 0.0002,
"loss": 1.7295,
"step": 652
},
{
"epoch": 0.3254423124844256,
"grad_norm": 0.11882718652486801,
"learning_rate": 0.0002,
"loss": 1.6531,
"step": 653
},
{
"epoch": 0.32594069274856713,
"grad_norm": 0.10453632473945618,
"learning_rate": 0.0002,
"loss": 1.6342,
"step": 654
},
{
"epoch": 0.3264390730127087,
"grad_norm": 0.11219029873609543,
"learning_rate": 0.0002,
"loss": 1.6902,
"step": 655
},
{
"epoch": 0.32693745327685025,
"grad_norm": 0.10499835759401321,
"learning_rate": 0.0002,
"loss": 1.5583,
"step": 656
},
{
"epoch": 0.32743583354099176,
"grad_norm": 0.10964427143335342,
"learning_rate": 0.0002,
"loss": 1.5675,
"step": 657
},
{
"epoch": 0.3279342138051333,
"grad_norm": 0.18510489165782928,
"learning_rate": 0.0002,
"loss": 1.6178,
"step": 658
},
{
"epoch": 0.32843259406927483,
"grad_norm": 0.11548275500535965,
"learning_rate": 0.0002,
"loss": 1.6699,
"step": 659
},
{
"epoch": 0.3289309743334164,
"grad_norm": 0.11357063800096512,
"learning_rate": 0.0002,
"loss": 1.6008,
"step": 660
},
{
"epoch": 0.32942935459755796,
"grad_norm": 0.10668730735778809,
"learning_rate": 0.0002,
"loss": 1.6433,
"step": 661
},
{
"epoch": 0.32992773486169946,
"grad_norm": 0.11750250309705734,
"learning_rate": 0.0002,
"loss": 1.6813,
"step": 662
},
{
"epoch": 0.330426115125841,
"grad_norm": 0.8277010321617126,
"learning_rate": 0.0002,
"loss": 1.7333,
"step": 663
},
{
"epoch": 0.33092449538998253,
"grad_norm": 0.165303573012352,
"learning_rate": 0.0002,
"loss": 1.6812,
"step": 664
},
{
"epoch": 0.3314228756541241,
"grad_norm": 0.12780268490314484,
"learning_rate": 0.0002,
"loss": 1.7106,
"step": 665
},
{
"epoch": 0.33192125591826566,
"grad_norm": 0.13066166639328003,
"learning_rate": 0.0002,
"loss": 1.6846,
"step": 666
},
{
"epoch": 0.33241963618240716,
"grad_norm": 0.12650184333324432,
"learning_rate": 0.0002,
"loss": 1.6144,
"step": 667
},
{
"epoch": 0.3329180164465487,
"grad_norm": 0.12420842051506042,
"learning_rate": 0.0002,
"loss": 1.7015,
"step": 668
},
{
"epoch": 0.33341639671069023,
"grad_norm": 0.1261165291070938,
"learning_rate": 0.0002,
"loss": 1.67,
"step": 669
},
{
"epoch": 0.3339147769748318,
"grad_norm": 0.11121337115764618,
"learning_rate": 0.0002,
"loss": 1.6772,
"step": 670
},
{
"epoch": 0.33441315723897336,
"grad_norm": 0.10835525393486023,
"learning_rate": 0.0002,
"loss": 1.6681,
"step": 671
},
{
"epoch": 0.33491153750311486,
"grad_norm": 0.10837749391794205,
"learning_rate": 0.0002,
"loss": 1.6268,
"step": 672
},
{
"epoch": 0.3354099177672564,
"grad_norm": 0.10254842787981033,
"learning_rate": 0.0002,
"loss": 1.5997,
"step": 673
},
{
"epoch": 0.33590829803139793,
"grad_norm": 0.5288554430007935,
"learning_rate": 0.0002,
"loss": 1.7397,
"step": 674
},
{
"epoch": 0.3364066782955395,
"grad_norm": 0.10820039361715317,
"learning_rate": 0.0002,
"loss": 1.6962,
"step": 675
},
{
"epoch": 0.33690505855968106,
"grad_norm": 0.11754646897315979,
"learning_rate": 0.0002,
"loss": 1.6059,
"step": 676
},
{
"epoch": 0.33740343882382257,
"grad_norm": 0.9506744742393494,
"learning_rate": 0.0002,
"loss": 1.8916,
"step": 677
},
{
"epoch": 0.33790181908796413,
"grad_norm": 0.1273750215768814,
"learning_rate": 0.0002,
"loss": 1.6896,
"step": 678
},
{
"epoch": 0.33840019935210564,
"grad_norm": 0.14315767586231232,
"learning_rate": 0.0002,
"loss": 1.6903,
"step": 679
},
{
"epoch": 0.3388985796162472,
"grad_norm": 0.15645241737365723,
"learning_rate": 0.0002,
"loss": 1.6823,
"step": 680
},
{
"epoch": 0.33939695988038876,
"grad_norm": 0.5159462690353394,
"learning_rate": 0.0002,
"loss": 1.6947,
"step": 681
},
{
"epoch": 0.33989534014453027,
"grad_norm": 0.13883577287197113,
"learning_rate": 0.0002,
"loss": 1.7448,
"step": 682
},
{
"epoch": 0.34039372040867183,
"grad_norm": 0.39283788204193115,
"learning_rate": 0.0002,
"loss": 1.6181,
"step": 683
},
{
"epoch": 0.34089210067281334,
"grad_norm": 0.20534516870975494,
"learning_rate": 0.0002,
"loss": 1.721,
"step": 684
},
{
"epoch": 0.3413904809369549,
"grad_norm": 0.14379210770130157,
"learning_rate": 0.0002,
"loss": 1.6955,
"step": 685
},
{
"epoch": 0.34188886120109646,
"grad_norm": 0.1505320966243744,
"learning_rate": 0.0002,
"loss": 1.7168,
"step": 686
},
{
"epoch": 0.34238724146523797,
"grad_norm": 0.1377919316291809,
"learning_rate": 0.0002,
"loss": 1.7001,
"step": 687
},
{
"epoch": 0.34288562172937953,
"grad_norm": 0.1268286257982254,
"learning_rate": 0.0002,
"loss": 1.6405,
"step": 688
},
{
"epoch": 0.34338400199352104,
"grad_norm": 0.11991781741380692,
"learning_rate": 0.0002,
"loss": 1.6862,
"step": 689
},
{
"epoch": 0.3438823822576626,
"grad_norm": 0.12283925712108612,
"learning_rate": 0.0002,
"loss": 1.7222,
"step": 690
},
{
"epoch": 0.34438076252180416,
"grad_norm": 0.11207298189401627,
"learning_rate": 0.0002,
"loss": 1.6477,
"step": 691
},
{
"epoch": 0.34487914278594567,
"grad_norm": 0.11342150717973709,
"learning_rate": 0.0002,
"loss": 1.6907,
"step": 692
},
{
"epoch": 0.34537752305008723,
"grad_norm": 0.1479737013578415,
"learning_rate": 0.0002,
"loss": 1.6982,
"step": 693
},
{
"epoch": 0.34587590331422874,
"grad_norm": 0.11498729884624481,
"learning_rate": 0.0002,
"loss": 1.6604,
"step": 694
},
{
"epoch": 0.3463742835783703,
"grad_norm": 0.12394261360168457,
"learning_rate": 0.0002,
"loss": 1.699,
"step": 695
},
{
"epoch": 0.34687266384251186,
"grad_norm": 0.12563689053058624,
"learning_rate": 0.0002,
"loss": 1.6637,
"step": 696
},
{
"epoch": 0.34737104410665337,
"grad_norm": 0.10661863535642624,
"learning_rate": 0.0002,
"loss": 1.6921,
"step": 697
},
{
"epoch": 0.34786942437079493,
"grad_norm": 0.10778840631246567,
"learning_rate": 0.0002,
"loss": 1.6719,
"step": 698
},
{
"epoch": 0.34836780463493644,
"grad_norm": 0.10504487156867981,
"learning_rate": 0.0002,
"loss": 1.6616,
"step": 699
},
{
"epoch": 0.348866184899078,
"grad_norm": 0.10722413659095764,
"learning_rate": 0.0002,
"loss": 1.6452,
"step": 700
},
{
"epoch": 0.3493645651632195,
"grad_norm": 0.10450419783592224,
"learning_rate": 0.0002,
"loss": 1.6342,
"step": 701
},
{
"epoch": 0.34986294542736107,
"grad_norm": 0.10961712151765823,
"learning_rate": 0.0002,
"loss": 1.68,
"step": 702
},
{
"epoch": 0.35036132569150263,
"grad_norm": 0.10789170861244202,
"learning_rate": 0.0002,
"loss": 1.6662,
"step": 703
},
{
"epoch": 0.35085970595564414,
"grad_norm": 0.10823702067136765,
"learning_rate": 0.0002,
"loss": 1.6733,
"step": 704
},
{
"epoch": 0.3513580862197857,
"grad_norm": 0.11080746352672577,
"learning_rate": 0.0002,
"loss": 1.6332,
"step": 705
},
{
"epoch": 0.3518564664839272,
"grad_norm": 0.10004162788391113,
"learning_rate": 0.0002,
"loss": 1.5841,
"step": 706
},
{
"epoch": 0.3523548467480688,
"grad_norm": 0.10398257523775101,
"learning_rate": 0.0002,
"loss": 1.6735,
"step": 707
},
{
"epoch": 0.35285322701221034,
"grad_norm": 0.10170764476060867,
"learning_rate": 0.0002,
"loss": 1.6584,
"step": 708
},
{
"epoch": 0.35335160727635184,
"grad_norm": 0.8194452524185181,
"learning_rate": 0.0002,
"loss": 1.8272,
"step": 709
},
{
"epoch": 0.3538499875404934,
"grad_norm": 0.15103065967559814,
"learning_rate": 0.0002,
"loss": 1.6954,
"step": 710
},
{
"epoch": 0.3543483678046349,
"grad_norm": 0.12205032259225845,
"learning_rate": 0.0002,
"loss": 1.6823,
"step": 711
},
{
"epoch": 0.3548467480687765,
"grad_norm": 0.1272657811641693,
"learning_rate": 0.0002,
"loss": 1.5557,
"step": 712
},
{
"epoch": 0.35534512833291804,
"grad_norm": 0.503338634967804,
"learning_rate": 0.0002,
"loss": 1.7847,
"step": 713
},
{
"epoch": 0.35584350859705954,
"grad_norm": 0.11442038416862488,
"learning_rate": 0.0002,
"loss": 1.6633,
"step": 714
},
{
"epoch": 0.3563418888612011,
"grad_norm": 0.1573084145784378,
"learning_rate": 0.0002,
"loss": 1.7377,
"step": 715
},
{
"epoch": 0.3568402691253426,
"grad_norm": 0.11450973153114319,
"learning_rate": 0.0002,
"loss": 1.5862,
"step": 716
},
{
"epoch": 0.3573386493894842,
"grad_norm": 0.1249619573354721,
"learning_rate": 0.0002,
"loss": 1.5954,
"step": 717
},
{
"epoch": 0.35783702965362574,
"grad_norm": 0.11494952440261841,
"learning_rate": 0.0002,
"loss": 1.6432,
"step": 718
},
{
"epoch": 0.35833540991776724,
"grad_norm": 0.13213759660720825,
"learning_rate": 0.0002,
"loss": 1.803,
"step": 719
},
{
"epoch": 0.3588337901819088,
"grad_norm": 1.1261271238327026,
"learning_rate": 0.0002,
"loss": 1.818,
"step": 720
},
{
"epoch": 0.3593321704460503,
"grad_norm": 1.338255524635315,
"learning_rate": 0.0002,
"loss": 1.7306,
"step": 721
},
{
"epoch": 0.3598305507101919,
"grad_norm": 0.21815264225006104,
"learning_rate": 0.0002,
"loss": 1.7224,
"step": 722
},
{
"epoch": 0.36032893097433344,
"grad_norm": 0.5178132653236389,
"learning_rate": 0.0002,
"loss": 1.7097,
"step": 723
},
{
"epoch": 0.36082731123847495,
"grad_norm": 0.241803839802742,
"learning_rate": 0.0002,
"loss": 1.7047,
"step": 724
},
{
"epoch": 0.3613256915026165,
"grad_norm": 0.20727293193340302,
"learning_rate": 0.0002,
"loss": 1.7278,
"step": 725
},
{
"epoch": 0.361824071766758,
"grad_norm": 0.16459515690803528,
"learning_rate": 0.0002,
"loss": 1.7204,
"step": 726
},
{
"epoch": 0.3623224520308996,
"grad_norm": 0.16415144503116608,
"learning_rate": 0.0002,
"loss": 1.6764,
"step": 727
},
{
"epoch": 0.36282083229504114,
"grad_norm": 0.16096027195453644,
"learning_rate": 0.0002,
"loss": 1.665,
"step": 728
},
{
"epoch": 0.36331921255918265,
"grad_norm": 0.17240643501281738,
"learning_rate": 0.0002,
"loss": 1.6761,
"step": 729
},
{
"epoch": 0.3638175928233242,
"grad_norm": 0.19763271510601044,
"learning_rate": 0.0002,
"loss": 1.7402,
"step": 730
},
{
"epoch": 0.3643159730874657,
"grad_norm": 0.15238463878631592,
"learning_rate": 0.0002,
"loss": 1.6884,
"step": 731
},
{
"epoch": 0.3648143533516073,
"grad_norm": 0.27482038736343384,
"learning_rate": 0.0002,
"loss": 1.7064,
"step": 732
},
{
"epoch": 0.36531273361574884,
"grad_norm": 0.5192012786865234,
"learning_rate": 0.0002,
"loss": 1.8117,
"step": 733
},
{
"epoch": 0.36581111387989035,
"grad_norm": 0.1510191708803177,
"learning_rate": 0.0002,
"loss": 1.667,
"step": 734
},
{
"epoch": 0.3663094941440319,
"grad_norm": 0.14513470232486725,
"learning_rate": 0.0002,
"loss": 1.6431,
"step": 735
},
{
"epoch": 0.3668078744081734,
"grad_norm": 0.7901990413665771,
"learning_rate": 0.0002,
"loss": 1.764,
"step": 736
},
{
"epoch": 0.367306254672315,
"grad_norm": 0.17642100155353546,
"learning_rate": 0.0002,
"loss": 1.7096,
"step": 737
},
{
"epoch": 0.36780463493645654,
"grad_norm": 0.14719779789447784,
"learning_rate": 0.0002,
"loss": 1.6343,
"step": 738
},
{
"epoch": 0.36830301520059805,
"grad_norm": 0.16173601150512695,
"learning_rate": 0.0002,
"loss": 1.6937,
"step": 739
},
{
"epoch": 0.3688013954647396,
"grad_norm": 0.32359546422958374,
"learning_rate": 0.0002,
"loss": 1.681,
"step": 740
},
{
"epoch": 0.3692997757288811,
"grad_norm": 0.14779435098171234,
"learning_rate": 0.0002,
"loss": 1.6745,
"step": 741
},
{
"epoch": 0.3697981559930227,
"grad_norm": 0.19540923833847046,
"learning_rate": 0.0002,
"loss": 1.5529,
"step": 742
},
{
"epoch": 0.37029653625716424,
"grad_norm": 0.13870155811309814,
"learning_rate": 0.0002,
"loss": 1.6497,
"step": 743
},
{
"epoch": 0.37079491652130575,
"grad_norm": 0.13447612524032593,
"learning_rate": 0.0002,
"loss": 1.7275,
"step": 744
},
{
"epoch": 0.3712932967854473,
"grad_norm": 0.13197576999664307,
"learning_rate": 0.0002,
"loss": 1.6776,
"step": 745
},
{
"epoch": 0.3717916770495888,
"grad_norm": 0.13072870671749115,
"learning_rate": 0.0002,
"loss": 1.6227,
"step": 746
},
{
"epoch": 0.3722900573137304,
"grad_norm": 0.13418208062648773,
"learning_rate": 0.0002,
"loss": 1.6998,
"step": 747
},
{
"epoch": 0.3727884375778719,
"grad_norm": 0.11689562350511551,
"learning_rate": 0.0002,
"loss": 1.6863,
"step": 748
},
{
"epoch": 0.37328681784201345,
"grad_norm": 0.1243453249335289,
"learning_rate": 0.0002,
"loss": 1.6456,
"step": 749
},
{
"epoch": 0.373785198106155,
"grad_norm": 0.11520450562238693,
"learning_rate": 0.0002,
"loss": 1.6815,
"step": 750
},
{
"epoch": 0.3742835783702965,
"grad_norm": 0.13939018547534943,
"learning_rate": 0.0002,
"loss": 1.6556,
"step": 751
},
{
"epoch": 0.3747819586344381,
"grad_norm": 0.11021385341882706,
"learning_rate": 0.0002,
"loss": 1.6923,
"step": 752
},
{
"epoch": 0.3752803388985796,
"grad_norm": 0.11470180004835129,
"learning_rate": 0.0002,
"loss": 1.6402,
"step": 753
},
{
"epoch": 0.37577871916272115,
"grad_norm": 0.12256886065006256,
"learning_rate": 0.0002,
"loss": 1.7271,
"step": 754
},
{
"epoch": 0.3762770994268627,
"grad_norm": 0.11696486920118332,
"learning_rate": 0.0002,
"loss": 1.7069,
"step": 755
},
{
"epoch": 0.3767754796910042,
"grad_norm": 0.11340934783220291,
"learning_rate": 0.0002,
"loss": 1.6261,
"step": 756
},
{
"epoch": 0.3772738599551458,
"grad_norm": 0.10606078803539276,
"learning_rate": 0.0002,
"loss": 1.6425,
"step": 757
},
{
"epoch": 0.3777722402192873,
"grad_norm": 0.12084966152906418,
"learning_rate": 0.0002,
"loss": 1.6273,
"step": 758
},
{
"epoch": 0.37827062048342885,
"grad_norm": 0.1084008663892746,
"learning_rate": 0.0002,
"loss": 1.6471,
"step": 759
},
{
"epoch": 0.3787690007475704,
"grad_norm": 0.11194922029972076,
"learning_rate": 0.0002,
"loss": 1.6478,
"step": 760
},
{
"epoch": 0.3792673810117119,
"grad_norm": 0.48235663771629333,
"learning_rate": 0.0002,
"loss": 1.5982,
"step": 761
},
{
"epoch": 0.3797657612758535,
"grad_norm": 0.586637556552887,
"learning_rate": 0.0002,
"loss": 1.7294,
"step": 762
},
{
"epoch": 0.380264141539995,
"grad_norm": 0.14328181743621826,
"learning_rate": 0.0002,
"loss": 1.7112,
"step": 763
},
{
"epoch": 0.38076252180413656,
"grad_norm": 0.13296020030975342,
"learning_rate": 0.0002,
"loss": 1.7044,
"step": 764
},
{
"epoch": 0.3812609020682781,
"grad_norm": 0.44004350900650024,
"learning_rate": 0.0002,
"loss": 1.6377,
"step": 765
},
{
"epoch": 0.3817592823324196,
"grad_norm": 0.12628889083862305,
"learning_rate": 0.0002,
"loss": 1.6192,
"step": 766
},
{
"epoch": 0.3822576625965612,
"grad_norm": 0.1330346316099167,
"learning_rate": 0.0002,
"loss": 1.6461,
"step": 767
},
{
"epoch": 0.3827560428607027,
"grad_norm": 0.11893340200185776,
"learning_rate": 0.0002,
"loss": 1.6299,
"step": 768
},
{
"epoch": 0.38325442312484426,
"grad_norm": 0.15412816405296326,
"learning_rate": 0.0002,
"loss": 1.7436,
"step": 769
},
{
"epoch": 0.3837528033889858,
"grad_norm": 0.12351204454898834,
"learning_rate": 0.0002,
"loss": 1.6844,
"step": 770
},
{
"epoch": 0.3842511836531273,
"grad_norm": 0.11671744287014008,
"learning_rate": 0.0002,
"loss": 1.6748,
"step": 771
},
{
"epoch": 0.3847495639172689,
"grad_norm": 0.12512736022472382,
"learning_rate": 0.0002,
"loss": 1.6362,
"step": 772
},
{
"epoch": 0.3852479441814104,
"grad_norm": 0.12629447877407074,
"learning_rate": 0.0002,
"loss": 1.6033,
"step": 773
},
{
"epoch": 0.38574632444555196,
"grad_norm": 0.11553051322698593,
"learning_rate": 0.0002,
"loss": 1.6639,
"step": 774
},
{
"epoch": 0.3862447047096935,
"grad_norm": 0.12756189703941345,
"learning_rate": 0.0002,
"loss": 1.6397,
"step": 775
},
{
"epoch": 0.386743084973835,
"grad_norm": 0.11309953778982162,
"learning_rate": 0.0002,
"loss": 1.6098,
"step": 776
},
{
"epoch": 0.3872414652379766,
"grad_norm": 0.164617121219635,
"learning_rate": 0.0002,
"loss": 1.54,
"step": 777
},
{
"epoch": 0.3877398455021181,
"grad_norm": 0.45813101530075073,
"learning_rate": 0.0002,
"loss": 1.7208,
"step": 778
},
{
"epoch": 0.38823822576625966,
"grad_norm": 0.7587694525718689,
"learning_rate": 0.0002,
"loss": 1.6195,
"step": 779
},
{
"epoch": 0.3887366060304012,
"grad_norm": 0.12699078023433685,
"learning_rate": 0.0002,
"loss": 1.6596,
"step": 780
},
{
"epoch": 0.38923498629454273,
"grad_norm": 0.139120951294899,
"learning_rate": 0.0002,
"loss": 1.6511,
"step": 781
},
{
"epoch": 0.3897333665586843,
"grad_norm": 0.13968676328659058,
"learning_rate": 0.0002,
"loss": 1.7033,
"step": 782
},
{
"epoch": 0.3902317468228258,
"grad_norm": 0.28061848878860474,
"learning_rate": 0.0002,
"loss": 1.6016,
"step": 783
},
{
"epoch": 0.39073012708696736,
"grad_norm": 0.11748450994491577,
"learning_rate": 0.0002,
"loss": 1.5984,
"step": 784
},
{
"epoch": 0.3912285073511089,
"grad_norm": 0.7288643717765808,
"learning_rate": 0.0002,
"loss": 1.769,
"step": 785
},
{
"epoch": 0.39172688761525043,
"grad_norm": 0.12540021538734436,
"learning_rate": 0.0002,
"loss": 1.6622,
"step": 786
},
{
"epoch": 0.392225267879392,
"grad_norm": 0.13594292104244232,
"learning_rate": 0.0002,
"loss": 1.6626,
"step": 787
},
{
"epoch": 0.3927236481435335,
"grad_norm": 0.12894773483276367,
"learning_rate": 0.0002,
"loss": 1.5733,
"step": 788
},
{
"epoch": 0.39322202840767506,
"grad_norm": 0.6577300429344177,
"learning_rate": 0.0002,
"loss": 1.8085,
"step": 789
},
{
"epoch": 0.39372040867181657,
"grad_norm": 0.12034627795219421,
"learning_rate": 0.0002,
"loss": 1.5798,
"step": 790
},
{
"epoch": 0.39421878893595813,
"grad_norm": 0.1254388988018036,
"learning_rate": 0.0002,
"loss": 1.6677,
"step": 791
},
{
"epoch": 0.3947171692000997,
"grad_norm": 0.136959508061409,
"learning_rate": 0.0002,
"loss": 1.6108,
"step": 792
},
{
"epoch": 0.3952155494642412,
"grad_norm": 0.37221673130989075,
"learning_rate": 0.0002,
"loss": 1.826,
"step": 793
},
{
"epoch": 0.39571392972838276,
"grad_norm": 0.14947831630706787,
"learning_rate": 0.0002,
"loss": 1.6967,
"step": 794
},
{
"epoch": 0.39621230999252427,
"grad_norm": 0.1409454494714737,
"learning_rate": 0.0002,
"loss": 1.7217,
"step": 795
},
{
"epoch": 0.39671069025666583,
"grad_norm": 0.1448691040277481,
"learning_rate": 0.0002,
"loss": 1.7872,
"step": 796
},
{
"epoch": 0.3972090705208074,
"grad_norm": 0.12816311419010162,
"learning_rate": 0.0002,
"loss": 1.6976,
"step": 797
},
{
"epoch": 0.3977074507849489,
"grad_norm": 0.12581898272037506,
"learning_rate": 0.0002,
"loss": 1.7111,
"step": 798
},
{
"epoch": 0.39820583104909046,
"grad_norm": 0.1256158947944641,
"learning_rate": 0.0002,
"loss": 1.6778,
"step": 799
},
{
"epoch": 0.39870421131323197,
"grad_norm": 0.12009266763925552,
"learning_rate": 0.0002,
"loss": 1.6336,
"step": 800
},
{
"epoch": 0.39920259157737353,
"grad_norm": 0.14727051556110382,
"learning_rate": 0.0002,
"loss": 1.7165,
"step": 801
},
{
"epoch": 0.3997009718415151,
"grad_norm": 1.98500394821167,
"learning_rate": 0.0002,
"loss": 1.9632,
"step": 802
},
{
"epoch": 0.4001993521056566,
"grad_norm": 0.12300129979848862,
"learning_rate": 0.0002,
"loss": 1.6003,
"step": 803
},
{
"epoch": 0.40069773236979817,
"grad_norm": 0.13758836686611176,
"learning_rate": 0.0002,
"loss": 1.6486,
"step": 804
},
{
"epoch": 0.40119611263393967,
"grad_norm": 0.13127754628658295,
"learning_rate": 0.0002,
"loss": 1.6673,
"step": 805
},
{
"epoch": 0.40169449289808123,
"grad_norm": 0.13612794876098633,
"learning_rate": 0.0002,
"loss": 1.7149,
"step": 806
},
{
"epoch": 0.4021928731622228,
"grad_norm": 0.3637385964393616,
"learning_rate": 0.0002,
"loss": 1.6486,
"step": 807
},
{
"epoch": 0.4026912534263643,
"grad_norm": 0.19778436422348022,
"learning_rate": 0.0002,
"loss": 1.5517,
"step": 808
},
{
"epoch": 0.40318963369050587,
"grad_norm": 0.1478605717420578,
"learning_rate": 0.0002,
"loss": 1.7642,
"step": 809
},
{
"epoch": 0.4036880139546474,
"grad_norm": 0.3014202415943146,
"learning_rate": 0.0002,
"loss": 1.6141,
"step": 810
},
{
"epoch": 0.40418639421878894,
"grad_norm": 0.13049842417240143,
"learning_rate": 0.0002,
"loss": 1.6579,
"step": 811
},
{
"epoch": 0.4046847744829305,
"grad_norm": 0.932788610458374,
"learning_rate": 0.0002,
"loss": 1.7722,
"step": 812
},
{
"epoch": 0.405183154747072,
"grad_norm": 0.1687835305929184,
"learning_rate": 0.0002,
"loss": 1.6492,
"step": 813
},
{
"epoch": 0.40568153501121357,
"grad_norm": 0.2024388164281845,
"learning_rate": 0.0002,
"loss": 1.5523,
"step": 814
},
{
"epoch": 0.4061799152753551,
"grad_norm": 0.20838886499404907,
"learning_rate": 0.0002,
"loss": 1.6884,
"step": 815
},
{
"epoch": 0.40667829553949664,
"grad_norm": 0.1490757167339325,
"learning_rate": 0.0002,
"loss": 1.6936,
"step": 816
},
{
"epoch": 0.4071766758036382,
"grad_norm": 1.1997255086898804,
"learning_rate": 0.0002,
"loss": 1.873,
"step": 817
},
{
"epoch": 0.4076750560677797,
"grad_norm": 0.139000803232193,
"learning_rate": 0.0002,
"loss": 1.7303,
"step": 818
},
{
"epoch": 0.40817343633192127,
"grad_norm": 0.14747615158557892,
"learning_rate": 0.0002,
"loss": 1.6558,
"step": 819
},
{
"epoch": 0.4086718165960628,
"grad_norm": 0.15866988897323608,
"learning_rate": 0.0002,
"loss": 1.6991,
"step": 820
},
{
"epoch": 0.40917019686020434,
"grad_norm": 0.14660963416099548,
"learning_rate": 0.0002,
"loss": 1.7233,
"step": 821
},
{
"epoch": 0.4096685771243459,
"grad_norm": 0.14071424305438995,
"learning_rate": 0.0002,
"loss": 1.6434,
"step": 822
},
{
"epoch": 0.4101669573884874,
"grad_norm": 0.1368856132030487,
"learning_rate": 0.0002,
"loss": 1.6415,
"step": 823
},
{
"epoch": 0.41066533765262897,
"grad_norm": 0.14662376046180725,
"learning_rate": 0.0002,
"loss": 1.7111,
"step": 824
},
{
"epoch": 0.4111637179167705,
"grad_norm": 0.14027300477027893,
"learning_rate": 0.0002,
"loss": 1.6698,
"step": 825
},
{
"epoch": 0.41166209818091204,
"grad_norm": 0.5542290210723877,
"learning_rate": 0.0002,
"loss": 1.6551,
"step": 826
},
{
"epoch": 0.4121604784450536,
"grad_norm": 0.15360352396965027,
"learning_rate": 0.0002,
"loss": 1.7313,
"step": 827
},
{
"epoch": 0.4126588587091951,
"grad_norm": 0.14451801776885986,
"learning_rate": 0.0002,
"loss": 1.6481,
"step": 828
},
{
"epoch": 0.41315723897333667,
"grad_norm": 0.1393883228302002,
"learning_rate": 0.0002,
"loss": 1.5922,
"step": 829
},
{
"epoch": 0.4136556192374782,
"grad_norm": 0.13610626757144928,
"learning_rate": 0.0002,
"loss": 1.6347,
"step": 830
},
{
"epoch": 0.41415399950161974,
"grad_norm": 0.12424327433109283,
"learning_rate": 0.0002,
"loss": 1.6563,
"step": 831
},
{
"epoch": 0.4146523797657613,
"grad_norm": 0.127548947930336,
"learning_rate": 0.0002,
"loss": 1.6609,
"step": 832
},
{
"epoch": 0.4151507600299028,
"grad_norm": 0.1881740391254425,
"learning_rate": 0.0002,
"loss": 1.7251,
"step": 833
},
{
"epoch": 0.4156491402940444,
"grad_norm": 0.12144262343645096,
"learning_rate": 0.0002,
"loss": 1.6922,
"step": 834
},
{
"epoch": 0.4161475205581859,
"grad_norm": 0.11799559742212296,
"learning_rate": 0.0002,
"loss": 1.672,
"step": 835
},
{
"epoch": 0.41664590082232744,
"grad_norm": 0.12129071354866028,
"learning_rate": 0.0002,
"loss": 1.6189,
"step": 836
},
{
"epoch": 0.41714428108646895,
"grad_norm": 0.11648084223270416,
"learning_rate": 0.0002,
"loss": 1.636,
"step": 837
},
{
"epoch": 0.4176426613506105,
"grad_norm": 0.11401843279600143,
"learning_rate": 0.0002,
"loss": 1.6266,
"step": 838
},
{
"epoch": 0.4181410416147521,
"grad_norm": 0.11244560778141022,
"learning_rate": 0.0002,
"loss": 1.6338,
"step": 839
},
{
"epoch": 0.4186394218788936,
"grad_norm": 0.11274567991495132,
"learning_rate": 0.0002,
"loss": 1.5518,
"step": 840
},
{
"epoch": 0.41913780214303514,
"grad_norm": 0.11203539371490479,
"learning_rate": 0.0002,
"loss": 1.6372,
"step": 841
},
{
"epoch": 0.41963618240717665,
"grad_norm": 0.11548861116170883,
"learning_rate": 0.0002,
"loss": 1.5787,
"step": 842
},
{
"epoch": 0.4201345626713182,
"grad_norm": 0.10921257734298706,
"learning_rate": 0.0002,
"loss": 1.6457,
"step": 843
},
{
"epoch": 0.4206329429354598,
"grad_norm": 0.10832211375236511,
"learning_rate": 0.0002,
"loss": 1.6613,
"step": 844
},
{
"epoch": 0.4211313231996013,
"grad_norm": 0.11785157024860382,
"learning_rate": 0.0002,
"loss": 1.6687,
"step": 845
},
{
"epoch": 0.42162970346374284,
"grad_norm": 0.1575067639350891,
"learning_rate": 0.0002,
"loss": 1.7148,
"step": 846
},
{
"epoch": 0.42212808372788435,
"grad_norm": 0.5687432885169983,
"learning_rate": 0.0002,
"loss": 1.8016,
"step": 847
},
{
"epoch": 0.4226264639920259,
"grad_norm": 0.887058675289154,
"learning_rate": 0.0002,
"loss": 1.7988,
"step": 848
},
{
"epoch": 0.4231248442561675,
"grad_norm": 0.12778295576572418,
"learning_rate": 0.0002,
"loss": 1.6586,
"step": 849
},
{
"epoch": 0.423623224520309,
"grad_norm": 0.13481804728507996,
"learning_rate": 0.0002,
"loss": 1.696,
"step": 850
},
{
"epoch": 0.42412160478445055,
"grad_norm": 0.1478685438632965,
"learning_rate": 0.0002,
"loss": 1.6758,
"step": 851
},
{
"epoch": 0.42461998504859205,
"grad_norm": 0.13414372503757477,
"learning_rate": 0.0002,
"loss": 1.657,
"step": 852
},
{
"epoch": 0.4251183653127336,
"grad_norm": 0.13211821019649506,
"learning_rate": 0.0002,
"loss": 1.6403,
"step": 853
},
{
"epoch": 0.4256167455768752,
"grad_norm": 0.13594435155391693,
"learning_rate": 0.0002,
"loss": 1.6363,
"step": 854
},
{
"epoch": 0.4261151258410167,
"grad_norm": 0.13266883790493011,
"learning_rate": 0.0002,
"loss": 1.6632,
"step": 855
},
{
"epoch": 0.42661350610515825,
"grad_norm": 0.12024448066949844,
"learning_rate": 0.0002,
"loss": 1.6745,
"step": 856
},
{
"epoch": 0.42711188636929975,
"grad_norm": 0.12828536331653595,
"learning_rate": 0.0002,
"loss": 1.6493,
"step": 857
},
{
"epoch": 0.4276102666334413,
"grad_norm": 0.12315808236598969,
"learning_rate": 0.0002,
"loss": 1.6803,
"step": 858
},
{
"epoch": 0.4281086468975829,
"grad_norm": 0.13026510179042816,
"learning_rate": 0.0002,
"loss": 1.6536,
"step": 859
},
{
"epoch": 0.4286070271617244,
"grad_norm": 0.45274946093559265,
"learning_rate": 0.0002,
"loss": 1.7579,
"step": 860
},
{
"epoch": 0.42910540742586595,
"grad_norm": 0.12899275124073029,
"learning_rate": 0.0002,
"loss": 1.6603,
"step": 861
},
{
"epoch": 0.42960378769000745,
"grad_norm": 0.12414630502462387,
"learning_rate": 0.0002,
"loss": 1.6933,
"step": 862
},
{
"epoch": 0.430102167954149,
"grad_norm": 0.146366149187088,
"learning_rate": 0.0002,
"loss": 1.6799,
"step": 863
},
{
"epoch": 0.4306005482182906,
"grad_norm": 0.11743781715631485,
"learning_rate": 0.0002,
"loss": 1.6395,
"step": 864
},
{
"epoch": 0.4310989284824321,
"grad_norm": 0.15248535573482513,
"learning_rate": 0.0002,
"loss": 1.7598,
"step": 865
},
{
"epoch": 0.43159730874657365,
"grad_norm": 0.11914569139480591,
"learning_rate": 0.0002,
"loss": 1.663,
"step": 866
},
{
"epoch": 0.43209568901071516,
"grad_norm": 0.11982624977827072,
"learning_rate": 0.0002,
"loss": 1.651,
"step": 867
},
{
"epoch": 0.4325940692748567,
"grad_norm": 0.12126267701387405,
"learning_rate": 0.0002,
"loss": 1.7153,
"step": 868
},
{
"epoch": 0.4330924495389983,
"grad_norm": 0.3660570979118347,
"learning_rate": 0.0002,
"loss": 1.6142,
"step": 869
},
{
"epoch": 0.4335908298031398,
"grad_norm": 0.11174522340297699,
"learning_rate": 0.0002,
"loss": 1.6199,
"step": 870
},
{
"epoch": 0.43408921006728135,
"grad_norm": 0.12089698761701584,
"learning_rate": 0.0002,
"loss": 1.7026,
"step": 871
},
{
"epoch": 0.43458759033142286,
"grad_norm": 0.11779413372278214,
"learning_rate": 0.0002,
"loss": 1.6757,
"step": 872
},
{
"epoch": 0.4350859705955644,
"grad_norm": 0.11461353302001953,
"learning_rate": 0.0002,
"loss": 1.6943,
"step": 873
},
{
"epoch": 0.435584350859706,
"grad_norm": 0.1294202357530594,
"learning_rate": 0.0002,
"loss": 1.7078,
"step": 874
},
{
"epoch": 0.4360827311238475,
"grad_norm": 0.1081145629286766,
"learning_rate": 0.0002,
"loss": 1.6078,
"step": 875
},
{
"epoch": 0.43658111138798905,
"grad_norm": 0.11721238493919373,
"learning_rate": 0.0002,
"loss": 1.6056,
"step": 876
},
{
"epoch": 0.43707949165213056,
"grad_norm": 0.11436528712511063,
"learning_rate": 0.0002,
"loss": 1.6806,
"step": 877
},
{
"epoch": 0.4375778719162721,
"grad_norm": 0.11401306092739105,
"learning_rate": 0.0002,
"loss": 1.7225,
"step": 878
},
{
"epoch": 0.4380762521804137,
"grad_norm": 0.11282623559236526,
"learning_rate": 0.0002,
"loss": 1.6614,
"step": 879
},
{
"epoch": 0.4385746324445552,
"grad_norm": 0.11592991650104523,
"learning_rate": 0.0002,
"loss": 1.5984,
"step": 880
},
{
"epoch": 0.43907301270869675,
"grad_norm": 0.10579363256692886,
"learning_rate": 0.0002,
"loss": 1.6349,
"step": 881
},
{
"epoch": 0.43957139297283826,
"grad_norm": 0.1032218486070633,
"learning_rate": 0.0002,
"loss": 1.6017,
"step": 882
},
{
"epoch": 0.4400697732369798,
"grad_norm": 0.10277747362852097,
"learning_rate": 0.0002,
"loss": 1.6396,
"step": 883
},
{
"epoch": 0.44056815350112133,
"grad_norm": 0.12377838790416718,
"learning_rate": 0.0002,
"loss": 1.6298,
"step": 884
},
{
"epoch": 0.4410665337652629,
"grad_norm": 0.10326054692268372,
"learning_rate": 0.0002,
"loss": 1.6335,
"step": 885
},
{
"epoch": 0.44156491402940445,
"grad_norm": 0.10518341511487961,
"learning_rate": 0.0002,
"loss": 1.6343,
"step": 886
},
{
"epoch": 0.44206329429354596,
"grad_norm": 0.10297736525535583,
"learning_rate": 0.0002,
"loss": 1.622,
"step": 887
},
{
"epoch": 0.4425616745576875,
"grad_norm": 0.10891593992710114,
"learning_rate": 0.0002,
"loss": 1.6928,
"step": 888
},
{
"epoch": 0.44306005482182903,
"grad_norm": 0.10570312291383743,
"learning_rate": 0.0002,
"loss": 1.5769,
"step": 889
},
{
"epoch": 0.4435584350859706,
"grad_norm": 0.10274644941091537,
"learning_rate": 0.0002,
"loss": 1.7139,
"step": 890
},
{
"epoch": 0.44405681535011216,
"grad_norm": 0.11095419526100159,
"learning_rate": 0.0002,
"loss": 1.6141,
"step": 891
},
{
"epoch": 0.44455519561425366,
"grad_norm": 0.14802560210227966,
"learning_rate": 0.0002,
"loss": 1.6019,
"step": 892
},
{
"epoch": 0.4450535758783952,
"grad_norm": 0.10468854010105133,
"learning_rate": 0.0002,
"loss": 1.5875,
"step": 893
},
{
"epoch": 0.44555195614253673,
"grad_norm": 0.10267975926399231,
"learning_rate": 0.0002,
"loss": 1.6071,
"step": 894
},
{
"epoch": 0.4460503364066783,
"grad_norm": 0.10226966440677643,
"learning_rate": 0.0002,
"loss": 1.6654,
"step": 895
},
{
"epoch": 0.44654871667081986,
"grad_norm": 0.1046745628118515,
"learning_rate": 0.0002,
"loss": 1.6244,
"step": 896
},
{
"epoch": 0.44704709693496136,
"grad_norm": 0.5514235496520996,
"learning_rate": 0.0002,
"loss": 1.6949,
"step": 897
},
{
"epoch": 0.4475454771991029,
"grad_norm": 0.10770034044981003,
"learning_rate": 0.0002,
"loss": 1.6388,
"step": 898
},
{
"epoch": 0.44804385746324443,
"grad_norm": 0.1274634599685669,
"learning_rate": 0.0002,
"loss": 1.7169,
"step": 899
},
{
"epoch": 0.448542237727386,
"grad_norm": 0.11944198608398438,
"learning_rate": 0.0002,
"loss": 1.635,
"step": 900
}
],
"logging_steps": 1,
"max_steps": 4012,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.024817808581591e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}