diff --git "a/checkpoint-900/trainer_state.json" "b/checkpoint-900/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-900/trainer_state.json" @@ -0,0 +1,6333 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.448542237727386, + "eval_steps": 500, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00049838026414154, + "grad_norm": 0.8175273537635803, + "learning_rate": 1e-05, + "loss": 1.8901, + "step": 1 + }, + { + "epoch": 0.00099676052828308, + "grad_norm": 0.5205090641975403, + "learning_rate": 2e-05, + "loss": 1.8661, + "step": 2 + }, + { + "epoch": 0.00149514079242462, + "grad_norm": 0.7050982713699341, + "learning_rate": 3e-05, + "loss": 1.884, + "step": 3 + }, + { + "epoch": 0.00199352105656616, + "grad_norm": 0.3958536684513092, + "learning_rate": 4e-05, + "loss": 1.848, + "step": 4 + }, + { + "epoch": 0.0024919013207077, + "grad_norm": 0.2910257577896118, + "learning_rate": 5e-05, + "loss": 1.8363, + "step": 5 + }, + { + "epoch": 0.00299028158484924, + "grad_norm": 1.1061186790466309, + "learning_rate": 6e-05, + "loss": 2.1065, + "step": 6 + }, + { + "epoch": 0.00348866184899078, + "grad_norm": 0.35989394783973694, + "learning_rate": 7e-05, + "loss": 1.8461, + "step": 7 + }, + { + "epoch": 0.00398704211313232, + "grad_norm": 0.3001234233379364, + "learning_rate": 8e-05, + "loss": 1.8691, + "step": 8 + }, + { + "epoch": 0.00448542237727386, + "grad_norm": 0.3210326135158539, + "learning_rate": 9e-05, + "loss": 1.8006, + "step": 9 + }, + { + "epoch": 0.0049838026414154, + "grad_norm": 0.24240201711654663, + "learning_rate": 0.0001, + "loss": 1.8136, + "step": 10 + }, + { + "epoch": 0.00548218290555694, + "grad_norm": 0.2921009957790375, + "learning_rate": 0.00011000000000000002, + "loss": 1.7785, + "step": 11 + }, + { + "epoch": 0.00598056316969848, + "grad_norm": 0.2199179232120514, + "learning_rate": 0.00012, + "loss": 1.8334, + "step": 12 + }, + { + "epoch": 0.00647894343384002, + "grad_norm": 0.18247301876544952, + "learning_rate": 0.00013000000000000002, + "loss": 1.8171, + "step": 13 + }, + { + "epoch": 0.00697732369798156, + "grad_norm": 0.16971151530742645, + "learning_rate": 0.00014, + "loss": 1.8838, + "step": 14 + }, + { + "epoch": 0.0074757039621231, + "grad_norm": 0.19395150244235992, + "learning_rate": 0.00015000000000000001, + "loss": 1.8121, + "step": 15 + }, + { + "epoch": 0.00797408422626464, + "grad_norm": 0.18596555292606354, + "learning_rate": 0.00016, + "loss": 1.7756, + "step": 16 + }, + { + "epoch": 0.00847246449040618, + "grad_norm": 0.23639832437038422, + "learning_rate": 0.00017, + "loss": 1.8293, + "step": 17 + }, + { + "epoch": 0.00897084475454772, + "grad_norm": 0.5992503762245178, + "learning_rate": 0.00018, + "loss": 1.8285, + "step": 18 + }, + { + "epoch": 0.00946922501868926, + "grad_norm": 0.24062925577163696, + "learning_rate": 0.00019, + "loss": 1.8139, + "step": 19 + }, + { + "epoch": 0.0099676052828308, + "grad_norm": 0.1615862101316452, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 20 + }, + { + "epoch": 0.01046598554697234, + "grad_norm": 0.1461448222398758, + "learning_rate": 0.0002, + "loss": 1.756, + "step": 21 + }, + { + "epoch": 0.01096436581111388, + "grad_norm": 0.16745099425315857, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 22 + }, + { + "epoch": 0.01146274607525542, + "grad_norm": 0.13099125027656555, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 23 + }, + { + "epoch": 0.01196112633939696, + "grad_norm": 0.11523797363042831, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 24 + }, + { + "epoch": 0.012459506603538499, + "grad_norm": 0.4995543956756592, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 25 + }, + { + "epoch": 0.01295788686768004, + "grad_norm": 0.1197713166475296, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 26 + }, + { + "epoch": 0.01345626713182158, + "grad_norm": 0.12242875248193741, + "learning_rate": 0.0002, + "loss": 1.7446, + "step": 27 + }, + { + "epoch": 0.01395464739596312, + "grad_norm": 0.11533704400062561, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 28 + }, + { + "epoch": 0.01445302766010466, + "grad_norm": 0.11372833698987961, + "learning_rate": 0.0002, + "loss": 1.8541, + "step": 29 + }, + { + "epoch": 0.0149514079242462, + "grad_norm": 0.10559230297803879, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 30 + }, + { + "epoch": 0.01544978818838774, + "grad_norm": 0.1040055975317955, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 31 + }, + { + "epoch": 0.01594816845252928, + "grad_norm": 0.09699314832687378, + "learning_rate": 0.0002, + "loss": 1.7119, + "step": 32 + }, + { + "epoch": 0.016446548716670818, + "grad_norm": 0.09951823949813843, + "learning_rate": 0.0002, + "loss": 1.6883, + "step": 33 + }, + { + "epoch": 0.01694492898081236, + "grad_norm": 0.09926764667034149, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 34 + }, + { + "epoch": 0.0174433092449539, + "grad_norm": 0.11137701570987701, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 35 + }, + { + "epoch": 0.01794168950909544, + "grad_norm": 0.09449079632759094, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 36 + }, + { + "epoch": 0.01844006977323698, + "grad_norm": 0.10035137832164764, + "learning_rate": 0.0002, + "loss": 1.7835, + "step": 37 + }, + { + "epoch": 0.01893845003737852, + "grad_norm": 0.0987599715590477, + "learning_rate": 0.0002, + "loss": 1.6905, + "step": 38 + }, + { + "epoch": 0.01943683030152006, + "grad_norm": 0.1124144196510315, + "learning_rate": 0.0002, + "loss": 1.7833, + "step": 39 + }, + { + "epoch": 0.0199352105656616, + "grad_norm": 0.10424085706472397, + "learning_rate": 0.0002, + "loss": 1.7308, + "step": 40 + }, + { + "epoch": 0.02043359082980314, + "grad_norm": 0.10069456696510315, + "learning_rate": 0.0002, + "loss": 1.7756, + "step": 41 + }, + { + "epoch": 0.02093197109394468, + "grad_norm": 0.096500463783741, + "learning_rate": 0.0002, + "loss": 1.6723, + "step": 42 + }, + { + "epoch": 0.02143035135808622, + "grad_norm": 0.10054206848144531, + "learning_rate": 0.0002, + "loss": 1.7609, + "step": 43 + }, + { + "epoch": 0.02192873162222776, + "grad_norm": 0.6995068192481995, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 44 + }, + { + "epoch": 0.0224271118863693, + "grad_norm": 0.10629299283027649, + "learning_rate": 0.0002, + "loss": 1.7838, + "step": 45 + }, + { + "epoch": 0.02292549215051084, + "grad_norm": 0.7601500749588013, + "learning_rate": 0.0002, + "loss": 1.9191, + "step": 46 + }, + { + "epoch": 0.02342387241465238, + "grad_norm": 0.15130610764026642, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 47 + }, + { + "epoch": 0.02392225267879392, + "grad_norm": 0.13523732125759125, + "learning_rate": 0.0002, + "loss": 1.8099, + "step": 48 + }, + { + "epoch": 0.02442063294293546, + "grad_norm": 0.13607007265090942, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 49 + }, + { + "epoch": 0.024919013207076998, + "grad_norm": 0.12477318197488785, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 50 + }, + { + "epoch": 0.02541739347121854, + "grad_norm": 0.6004332304000854, + "learning_rate": 0.0002, + "loss": 1.8337, + "step": 51 + }, + { + "epoch": 0.02591577373536008, + "grad_norm": 0.11952889710664749, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 52 + }, + { + "epoch": 0.02641415399950162, + "grad_norm": 0.12411167472600937, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 53 + }, + { + "epoch": 0.02691253426364316, + "grad_norm": 0.13071775436401367, + "learning_rate": 0.0002, + "loss": 1.8158, + "step": 54 + }, + { + "epoch": 0.0274109145277847, + "grad_norm": 0.10316825658082962, + "learning_rate": 0.0002, + "loss": 1.7051, + "step": 55 + }, + { + "epoch": 0.02790929479192624, + "grad_norm": 0.12366951256990433, + "learning_rate": 0.0002, + "loss": 1.7233, + "step": 56 + }, + { + "epoch": 0.02840767505606778, + "grad_norm": 0.11353752017021179, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 57 + }, + { + "epoch": 0.02890605532020932, + "grad_norm": 0.10084105283021927, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 58 + }, + { + "epoch": 0.02940443558435086, + "grad_norm": 0.09446979314088821, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 59 + }, + { + "epoch": 0.0299028158484924, + "grad_norm": 0.10983336716890335, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 60 + }, + { + "epoch": 0.03040119611263394, + "grad_norm": 0.09697376936674118, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 61 + }, + { + "epoch": 0.03089957637677548, + "grad_norm": 0.10111090540885925, + "learning_rate": 0.0002, + "loss": 1.7711, + "step": 62 + }, + { + "epoch": 0.03139795664091702, + "grad_norm": 0.09077231585979462, + "learning_rate": 0.0002, + "loss": 1.6886, + "step": 63 + }, + { + "epoch": 0.03189633690505856, + "grad_norm": 0.09181386977434158, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 64 + }, + { + "epoch": 0.0323947171692001, + "grad_norm": 0.09549912065267563, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 65 + }, + { + "epoch": 0.032893097433341636, + "grad_norm": 0.09550771117210388, + "learning_rate": 0.0002, + "loss": 1.7627, + "step": 66 + }, + { + "epoch": 0.03339147769748318, + "grad_norm": 0.09617152065038681, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 67 + }, + { + "epoch": 0.03388985796162472, + "grad_norm": 0.08987727761268616, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 68 + }, + { + "epoch": 0.03438823822576626, + "grad_norm": 0.1968306601047516, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 69 + }, + { + "epoch": 0.0348866184899078, + "grad_norm": 0.11987251788377762, + "learning_rate": 0.0002, + "loss": 1.7883, + "step": 70 + }, + { + "epoch": 0.03538499875404934, + "grad_norm": 0.09412620961666107, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 71 + }, + { + "epoch": 0.03588337901819088, + "grad_norm": 0.09160133451223373, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 72 + }, + { + "epoch": 0.03638175928233242, + "grad_norm": 0.08958347886800766, + "learning_rate": 0.0002, + "loss": 1.6991, + "step": 73 + }, + { + "epoch": 0.03688013954647396, + "grad_norm": 0.08735426515340805, + "learning_rate": 0.0002, + "loss": 1.7267, + "step": 74 + }, + { + "epoch": 0.0373785198106155, + "grad_norm": 0.09234903752803802, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 75 + }, + { + "epoch": 0.03787690007475704, + "grad_norm": 0.3366870582103729, + "learning_rate": 0.0002, + "loss": 1.7519, + "step": 76 + }, + { + "epoch": 0.03837528033889858, + "grad_norm": 0.11989757418632507, + "learning_rate": 0.0002, + "loss": 1.7388, + "step": 77 + }, + { + "epoch": 0.03887366060304012, + "grad_norm": 0.09671110659837723, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 78 + }, + { + "epoch": 0.03937204086718166, + "grad_norm": 0.3544454276561737, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 79 + }, + { + "epoch": 0.0398704211313232, + "grad_norm": 0.36497563123703003, + "learning_rate": 0.0002, + "loss": 1.8832, + "step": 80 + }, + { + "epoch": 0.04036880139546474, + "grad_norm": 0.1029423251748085, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 81 + }, + { + "epoch": 0.04086718165960628, + "grad_norm": 0.13265877962112427, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 82 + }, + { + "epoch": 0.041365561923747816, + "grad_norm": 0.10281170904636383, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 83 + }, + { + "epoch": 0.04186394218788936, + "grad_norm": 0.9060964584350586, + "learning_rate": 0.0002, + "loss": 2.0666, + "step": 84 + }, + { + "epoch": 0.0423623224520309, + "grad_norm": 0.6496222615242004, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 85 + }, + { + "epoch": 0.04286070271617244, + "grad_norm": 0.20052167773246765, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 86 + }, + { + "epoch": 0.04335908298031398, + "grad_norm": 0.20841394364833832, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 87 + }, + { + "epoch": 0.04385746324445552, + "grad_norm": 0.14324237406253815, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 88 + }, + { + "epoch": 0.04435584350859706, + "grad_norm": 0.1330689936876297, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 89 + }, + { + "epoch": 0.0448542237727386, + "grad_norm": 0.13436254858970642, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 90 + }, + { + "epoch": 0.04535260403688014, + "grad_norm": 0.11558011174201965, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 91 + }, + { + "epoch": 0.04585098430102168, + "grad_norm": 0.13997307419776917, + "learning_rate": 0.0002, + "loss": 1.7487, + "step": 92 + }, + { + "epoch": 0.04634936456516322, + "grad_norm": 0.11401030421257019, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 93 + }, + { + "epoch": 0.04684774482930476, + "grad_norm": 0.1490752398967743, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 94 + }, + { + "epoch": 0.0473461250934463, + "grad_norm": 0.10417014360427856, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 95 + }, + { + "epoch": 0.04784450535758784, + "grad_norm": 0.11896169185638428, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 96 + }, + { + "epoch": 0.04834288562172938, + "grad_norm": 0.1187196597456932, + "learning_rate": 0.0002, + "loss": 1.6665, + "step": 97 + }, + { + "epoch": 0.04884126588587092, + "grad_norm": 0.10665114969015121, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 98 + }, + { + "epoch": 0.04933964615001246, + "grad_norm": 0.11822202056646347, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 99 + }, + { + "epoch": 0.049838026414153996, + "grad_norm": 0.10062436759471893, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 100 + }, + { + "epoch": 0.05033640667829554, + "grad_norm": 0.10343766212463379, + "learning_rate": 0.0002, + "loss": 1.69, + "step": 101 + }, + { + "epoch": 0.05083478694243708, + "grad_norm": 0.09872441738843918, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 102 + }, + { + "epoch": 0.05133316720657862, + "grad_norm": 0.08979122340679169, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 103 + }, + { + "epoch": 0.05183154747072016, + "grad_norm": 0.10805679857730865, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 104 + }, + { + "epoch": 0.0523299277348617, + "grad_norm": 0.0966518372297287, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 105 + }, + { + "epoch": 0.05282830799900324, + "grad_norm": 0.6643556952476501, + "learning_rate": 0.0002, + "loss": 1.906, + "step": 106 + }, + { + "epoch": 0.05332668826314478, + "grad_norm": 0.14238013327121735, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 107 + }, + { + "epoch": 0.05382506852728632, + "grad_norm": 0.2091197371482849, + "learning_rate": 0.0002, + "loss": 1.7879, + "step": 108 + }, + { + "epoch": 0.05432344879142786, + "grad_norm": 0.11703892797231674, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 109 + }, + { + "epoch": 0.0548218290555694, + "grad_norm": 0.15277640521526337, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 110 + }, + { + "epoch": 0.05532020931971094, + "grad_norm": 0.11744142323732376, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 111 + }, + { + "epoch": 0.05581858958385248, + "grad_norm": 0.10640200227499008, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 112 + }, + { + "epoch": 0.05631696984799402, + "grad_norm": 0.10955353826284409, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 113 + }, + { + "epoch": 0.05681535011213556, + "grad_norm": 0.3743372857570648, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 114 + }, + { + "epoch": 0.0573137303762771, + "grad_norm": 0.11817771941423416, + "learning_rate": 0.0002, + "loss": 1.7246, + "step": 115 + }, + { + "epoch": 0.05781211064041864, + "grad_norm": 0.10563557595014572, + "learning_rate": 0.0002, + "loss": 1.6554, + "step": 116 + }, + { + "epoch": 0.058310490904560176, + "grad_norm": 0.11494623869657516, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 117 + }, + { + "epoch": 0.05880887116870172, + "grad_norm": 0.12262585759162903, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 118 + }, + { + "epoch": 0.05930725143284326, + "grad_norm": 0.09501025080680847, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 119 + }, + { + "epoch": 0.0598056316969848, + "grad_norm": 0.15478286147117615, + "learning_rate": 0.0002, + "loss": 1.8005, + "step": 120 + }, + { + "epoch": 0.06030401196112634, + "grad_norm": 0.5174306631088257, + "learning_rate": 0.0002, + "loss": 1.7736, + "step": 121 + }, + { + "epoch": 0.06080239222526788, + "grad_norm": 0.37489035725593567, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 122 + }, + { + "epoch": 0.06130077248940942, + "grad_norm": 0.10632194578647614, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 123 + }, + { + "epoch": 0.06179915275355096, + "grad_norm": 0.5897635817527771, + "learning_rate": 0.0002, + "loss": 1.8483, + "step": 124 + }, + { + "epoch": 0.0622975330176925, + "grad_norm": 0.1104891449213028, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 125 + }, + { + "epoch": 0.06279591328183404, + "grad_norm": 0.171495720744133, + "learning_rate": 0.0002, + "loss": 1.8345, + "step": 126 + }, + { + "epoch": 0.06329429354597559, + "grad_norm": 0.2864750921726227, + "learning_rate": 0.0002, + "loss": 1.6944, + "step": 127 + }, + { + "epoch": 0.06379267381011712, + "grad_norm": 0.1258823126554489, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 128 + }, + { + "epoch": 0.06429105407425866, + "grad_norm": 0.10813643783330917, + "learning_rate": 0.0002, + "loss": 1.6886, + "step": 129 + }, + { + "epoch": 0.0647894343384002, + "grad_norm": 0.12285427749156952, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 130 + }, + { + "epoch": 0.06528781460254174, + "grad_norm": 0.11049698293209076, + "learning_rate": 0.0002, + "loss": 1.7107, + "step": 131 + }, + { + "epoch": 0.06578619486668327, + "grad_norm": 0.4740373492240906, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 132 + }, + { + "epoch": 0.06628457513082482, + "grad_norm": 0.11663281917572021, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 133 + }, + { + "epoch": 0.06678295539496636, + "grad_norm": 0.1274426281452179, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 134 + }, + { + "epoch": 0.0672813356591079, + "grad_norm": 0.11273318529129028, + "learning_rate": 0.0002, + "loss": 1.6195, + "step": 135 + }, + { + "epoch": 0.06777971592324944, + "grad_norm": 0.12240920960903168, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 136 + }, + { + "epoch": 0.06827809618739097, + "grad_norm": 0.1003924235701561, + "learning_rate": 0.0002, + "loss": 1.5651, + "step": 137 + }, + { + "epoch": 0.06877647645153252, + "grad_norm": 0.12279325723648071, + "learning_rate": 0.0002, + "loss": 1.7905, + "step": 138 + }, + { + "epoch": 0.06927485671567406, + "grad_norm": 0.10567662119865417, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 139 + }, + { + "epoch": 0.0697732369798156, + "grad_norm": 0.0949968695640564, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 140 + }, + { + "epoch": 0.07027161724395714, + "grad_norm": 0.10375083237886429, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 141 + }, + { + "epoch": 0.07076999750809868, + "grad_norm": 0.0937686413526535, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 142 + }, + { + "epoch": 0.07126837777224022, + "grad_norm": 0.0981929674744606, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 143 + }, + { + "epoch": 0.07176675803638176, + "grad_norm": 1.1460381746292114, + "learning_rate": 0.0002, + "loss": 1.9091, + "step": 144 + }, + { + "epoch": 0.0722651383005233, + "grad_norm": 0.1193133145570755, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 145 + }, + { + "epoch": 0.07276351856466484, + "grad_norm": 0.13854117691516876, + "learning_rate": 0.0002, + "loss": 1.656, + "step": 146 + }, + { + "epoch": 0.07326189882880638, + "grad_norm": 0.6005303263664246, + "learning_rate": 0.0002, + "loss": 1.9014, + "step": 147 + }, + { + "epoch": 0.07376027909294793, + "grad_norm": 0.13879133760929108, + "learning_rate": 0.0002, + "loss": 1.7158, + "step": 148 + }, + { + "epoch": 0.07425865935708946, + "grad_norm": 0.13073574006557465, + "learning_rate": 0.0002, + "loss": 1.7355, + "step": 149 + }, + { + "epoch": 0.074757039621231, + "grad_norm": 0.12578125298023224, + "learning_rate": 0.0002, + "loss": 1.7376, + "step": 150 + }, + { + "epoch": 0.07525541988537254, + "grad_norm": 0.13024558126926422, + "learning_rate": 0.0002, + "loss": 1.7675, + "step": 151 + }, + { + "epoch": 0.07575380014951408, + "grad_norm": 0.12630225718021393, + "learning_rate": 0.0002, + "loss": 1.6509, + "step": 152 + }, + { + "epoch": 0.07625218041365561, + "grad_norm": 0.13081084191799164, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 153 + }, + { + "epoch": 0.07675056067779716, + "grad_norm": 0.11292438209056854, + "learning_rate": 0.0002, + "loss": 1.6533, + "step": 154 + }, + { + "epoch": 0.0772489409419387, + "grad_norm": 0.10187578946352005, + "learning_rate": 0.0002, + "loss": 1.6915, + "step": 155 + }, + { + "epoch": 0.07774732120608024, + "grad_norm": 0.10563293844461441, + "learning_rate": 0.0002, + "loss": 1.7378, + "step": 156 + }, + { + "epoch": 0.07824570147022178, + "grad_norm": 0.10501443594694138, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 157 + }, + { + "epoch": 0.07874408173436331, + "grad_norm": 0.11756912618875504, + "learning_rate": 0.0002, + "loss": 1.7963, + "step": 158 + }, + { + "epoch": 0.07924246199850486, + "grad_norm": 0.1010415181517601, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 159 + }, + { + "epoch": 0.0797408422626464, + "grad_norm": 0.09472226351499557, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 160 + }, + { + "epoch": 0.08023922252678795, + "grad_norm": 0.10156677663326263, + "learning_rate": 0.0002, + "loss": 1.7573, + "step": 161 + }, + { + "epoch": 0.08073760279092948, + "grad_norm": 0.09345332533121109, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 162 + }, + { + "epoch": 0.08123598305507101, + "grad_norm": 0.09440191835165024, + "learning_rate": 0.0002, + "loss": 1.6753, + "step": 163 + }, + { + "epoch": 0.08173436331921256, + "grad_norm": 0.0925949364900589, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 164 + }, + { + "epoch": 0.0822327435833541, + "grad_norm": 0.09808436781167984, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 165 + }, + { + "epoch": 0.08273112384749563, + "grad_norm": 0.10032784938812256, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 166 + }, + { + "epoch": 0.08322950411163718, + "grad_norm": 0.769005298614502, + "learning_rate": 0.0002, + "loss": 1.8314, + "step": 167 + }, + { + "epoch": 0.08372788437577872, + "grad_norm": 1.013753890991211, + "learning_rate": 0.0002, + "loss": 1.9179, + "step": 168 + }, + { + "epoch": 0.08422626463992026, + "grad_norm": 0.11522974818944931, + "learning_rate": 0.0002, + "loss": 1.8271, + "step": 169 + }, + { + "epoch": 0.0847246449040618, + "grad_norm": 0.1381683349609375, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 170 + }, + { + "epoch": 0.08522302516820333, + "grad_norm": 0.13124744594097137, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 171 + }, + { + "epoch": 0.08572140543234488, + "grad_norm": 0.1552695333957672, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 172 + }, + { + "epoch": 0.08621978569648642, + "grad_norm": 0.11559716612100601, + "learning_rate": 0.0002, + "loss": 1.7474, + "step": 173 + }, + { + "epoch": 0.08671816596062797, + "grad_norm": 0.11131990700960159, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 174 + }, + { + "epoch": 0.0872165462247695, + "grad_norm": 0.11412417143583298, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 175 + }, + { + "epoch": 0.08771492648891104, + "grad_norm": 0.11382830142974854, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 176 + }, + { + "epoch": 0.08821330675305258, + "grad_norm": 0.7038962244987488, + "learning_rate": 0.0002, + "loss": 1.8568, + "step": 177 + }, + { + "epoch": 0.08871168701719412, + "grad_norm": 0.11253572255373001, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 178 + }, + { + "epoch": 0.08921006728133565, + "grad_norm": 0.12908123433589935, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 179 + }, + { + "epoch": 0.0897084475454772, + "grad_norm": 0.12027324736118317, + "learning_rate": 0.0002, + "loss": 1.7542, + "step": 180 + }, + { + "epoch": 0.09020682780961874, + "grad_norm": 0.13822880387306213, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 181 + }, + { + "epoch": 0.09070520807376029, + "grad_norm": 0.11809349060058594, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 182 + }, + { + "epoch": 0.09120358833790182, + "grad_norm": 0.11567198485136032, + "learning_rate": 0.0002, + "loss": 1.7006, + "step": 183 + }, + { + "epoch": 0.09170196860204335, + "grad_norm": 0.11884818226099014, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 184 + }, + { + "epoch": 0.0922003488661849, + "grad_norm": 0.13118627667427063, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 185 + }, + { + "epoch": 0.09269872913032644, + "grad_norm": 0.10780288279056549, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 186 + }, + { + "epoch": 0.09319710939446797, + "grad_norm": 0.1052689403295517, + "learning_rate": 0.0002, + "loss": 1.7176, + "step": 187 + }, + { + "epoch": 0.09369548965860952, + "grad_norm": 0.11142247915267944, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 188 + }, + { + "epoch": 0.09419386992275106, + "grad_norm": 0.11082904785871506, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 189 + }, + { + "epoch": 0.0946922501868926, + "grad_norm": 0.09668837487697601, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 190 + }, + { + "epoch": 0.09519063045103414, + "grad_norm": 0.09926537424325943, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 191 + }, + { + "epoch": 0.09568901071517567, + "grad_norm": 0.09865368157625198, + "learning_rate": 0.0002, + "loss": 1.7538, + "step": 192 + }, + { + "epoch": 0.09618739097931722, + "grad_norm": 0.10074108839035034, + "learning_rate": 0.0002, + "loss": 1.7556, + "step": 193 + }, + { + "epoch": 0.09668577124345876, + "grad_norm": 0.11467942595481873, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 194 + }, + { + "epoch": 0.0971841515076003, + "grad_norm": 0.09638036042451859, + "learning_rate": 0.0002, + "loss": 1.7296, + "step": 195 + }, + { + "epoch": 0.09768253177174184, + "grad_norm": 0.09951262921094894, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 196 + }, + { + "epoch": 0.09818091203588337, + "grad_norm": 0.09425103664398193, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 197 + }, + { + "epoch": 0.09867929230002492, + "grad_norm": 0.09163974225521088, + "learning_rate": 0.0002, + "loss": 1.6591, + "step": 198 + }, + { + "epoch": 0.09917767256416646, + "grad_norm": 0.10825615376234055, + "learning_rate": 0.0002, + "loss": 1.6748, + "step": 199 + }, + { + "epoch": 0.09967605282830799, + "grad_norm": 0.08873865008354187, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 200 + }, + { + "epoch": 0.10017443309244954, + "grad_norm": 0.09379550069570541, + "learning_rate": 0.0002, + "loss": 1.7475, + "step": 201 + }, + { + "epoch": 0.10067281335659108, + "grad_norm": 0.09395930916070938, + "learning_rate": 0.0002, + "loss": 1.7183, + "step": 202 + }, + { + "epoch": 0.10117119362073262, + "grad_norm": 0.09373954683542252, + "learning_rate": 0.0002, + "loss": 1.7413, + "step": 203 + }, + { + "epoch": 0.10166957388487416, + "grad_norm": 0.0926884338259697, + "learning_rate": 0.0002, + "loss": 1.7284, + "step": 204 + }, + { + "epoch": 0.1021679541490157, + "grad_norm": 0.09394028782844543, + "learning_rate": 0.0002, + "loss": 1.6777, + "step": 205 + }, + { + "epoch": 0.10266633441315724, + "grad_norm": 0.0934232845902443, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 206 + }, + { + "epoch": 0.10316471467729878, + "grad_norm": 0.08943123370409012, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 207 + }, + { + "epoch": 0.10366309494144033, + "grad_norm": 0.09671316295862198, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 208 + }, + { + "epoch": 0.10416147520558186, + "grad_norm": 0.12016978114843369, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 209 + }, + { + "epoch": 0.1046598554697234, + "grad_norm": 0.5822897553443909, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 210 + }, + { + "epoch": 0.10515823573386494, + "grad_norm": 0.10984666645526886, + "learning_rate": 0.0002, + "loss": 1.703, + "step": 211 + }, + { + "epoch": 0.10565661599800648, + "grad_norm": 0.661040186882019, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 212 + }, + { + "epoch": 0.10615499626214801, + "grad_norm": 0.1641639620065689, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 213 + }, + { + "epoch": 0.10665337652628956, + "grad_norm": 0.34271761775016785, + "learning_rate": 0.0002, + "loss": 1.7768, + "step": 214 + }, + { + "epoch": 0.1071517567904311, + "grad_norm": 0.11224206537008286, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 215 + }, + { + "epoch": 0.10765013705457264, + "grad_norm": 0.11788146197795868, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 216 + }, + { + "epoch": 0.10814851731871418, + "grad_norm": 0.10918893665075302, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 217 + }, + { + "epoch": 0.10864689758285571, + "grad_norm": 0.12023265659809113, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 218 + }, + { + "epoch": 0.10914527784699726, + "grad_norm": 0.11474837362766266, + "learning_rate": 0.0002, + "loss": 1.749, + "step": 219 + }, + { + "epoch": 0.1096436581111388, + "grad_norm": 0.10222747921943665, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 220 + }, + { + "epoch": 0.11014203837528033, + "grad_norm": 0.1074354350566864, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 221 + }, + { + "epoch": 0.11064041863942188, + "grad_norm": 0.5447832345962524, + "learning_rate": 0.0002, + "loss": 1.8402, + "step": 222 + }, + { + "epoch": 0.11113879890356342, + "grad_norm": 0.12009864300489426, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 223 + }, + { + "epoch": 0.11163717916770496, + "grad_norm": 0.11686031520366669, + "learning_rate": 0.0002, + "loss": 1.7185, + "step": 224 + }, + { + "epoch": 0.1121355594318465, + "grad_norm": 0.12914586067199707, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 225 + }, + { + "epoch": 0.11263393969598803, + "grad_norm": 0.10797183215618134, + "learning_rate": 0.0002, + "loss": 1.706, + "step": 226 + }, + { + "epoch": 0.11313231996012958, + "grad_norm": 0.1088324561715126, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 227 + }, + { + "epoch": 0.11363070022427112, + "grad_norm": 0.10438574105501175, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 228 + }, + { + "epoch": 0.11412908048841267, + "grad_norm": 0.14163640141487122, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 229 + }, + { + "epoch": 0.1146274607525542, + "grad_norm": 0.10191742330789566, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 230 + }, + { + "epoch": 0.11512584101669573, + "grad_norm": 0.11547041684389114, + "learning_rate": 0.0002, + "loss": 1.7793, + "step": 231 + }, + { + "epoch": 0.11562422128083728, + "grad_norm": 0.10447453707456589, + "learning_rate": 0.0002, + "loss": 1.7791, + "step": 232 + }, + { + "epoch": 0.11612260154497882, + "grad_norm": 0.10447558760643005, + "learning_rate": 0.0002, + "loss": 1.6799, + "step": 233 + }, + { + "epoch": 0.11662098180912035, + "grad_norm": 0.10260461270809174, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 234 + }, + { + "epoch": 0.1171193620732619, + "grad_norm": 0.10199354588985443, + "learning_rate": 0.0002, + "loss": 1.6476, + "step": 235 + }, + { + "epoch": 0.11761774233740344, + "grad_norm": 0.09869713336229324, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 236 + }, + { + "epoch": 0.11811612260154498, + "grad_norm": 0.9354596138000488, + "learning_rate": 0.0002, + "loss": 1.9584, + "step": 237 + }, + { + "epoch": 0.11861450286568652, + "grad_norm": 0.15785987675189972, + "learning_rate": 0.0002, + "loss": 1.718, + "step": 238 + }, + { + "epoch": 0.11911288312982805, + "grad_norm": 0.16236662864685059, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 239 + }, + { + "epoch": 0.1196112633939696, + "grad_norm": 0.1407175064086914, + "learning_rate": 0.0002, + "loss": 1.6987, + "step": 240 + }, + { + "epoch": 0.12010964365811114, + "grad_norm": 0.13428977131843567, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 241 + }, + { + "epoch": 0.12060802392225269, + "grad_norm": 0.5954437255859375, + "learning_rate": 0.0002, + "loss": 1.7536, + "step": 242 + }, + { + "epoch": 0.12110640418639422, + "grad_norm": 0.12084382027387619, + "learning_rate": 0.0002, + "loss": 1.6446, + "step": 243 + }, + { + "epoch": 0.12160478445053576, + "grad_norm": 0.12887060642242432, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 244 + }, + { + "epoch": 0.1221031647146773, + "grad_norm": 0.12585604190826416, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 245 + }, + { + "epoch": 0.12260154497881884, + "grad_norm": 0.11495430767536163, + "learning_rate": 0.0002, + "loss": 1.6833, + "step": 246 + }, + { + "epoch": 0.12309992524296037, + "grad_norm": 0.36918768286705017, + "learning_rate": 0.0002, + "loss": 1.8354, + "step": 247 + }, + { + "epoch": 0.12359830550710192, + "grad_norm": 0.1330924779176712, + "learning_rate": 0.0002, + "loss": 1.6915, + "step": 248 + }, + { + "epoch": 0.12409668577124346, + "grad_norm": 0.6573293805122375, + "learning_rate": 0.0002, + "loss": 1.7672, + "step": 249 + }, + { + "epoch": 0.124595066035385, + "grad_norm": 0.13000234961509705, + "learning_rate": 0.0002, + "loss": 1.6639, + "step": 250 + }, + { + "epoch": 0.12509344629952654, + "grad_norm": 0.14653077721595764, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 251 + }, + { + "epoch": 0.12559182656366807, + "grad_norm": 0.13498292863368988, + "learning_rate": 0.0002, + "loss": 1.6848, + "step": 252 + }, + { + "epoch": 0.1260902068278096, + "grad_norm": 0.13268351554870605, + "learning_rate": 0.0002, + "loss": 1.7338, + "step": 253 + }, + { + "epoch": 0.12658858709195117, + "grad_norm": 0.1395343542098999, + "learning_rate": 0.0002, + "loss": 1.7099, + "step": 254 + }, + { + "epoch": 0.1270869673560927, + "grad_norm": 0.1279151439666748, + "learning_rate": 0.0002, + "loss": 1.7156, + "step": 255 + }, + { + "epoch": 0.12758534762023424, + "grad_norm": 0.112457275390625, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 256 + }, + { + "epoch": 0.12808372788437578, + "grad_norm": 0.11672843992710114, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 257 + }, + { + "epoch": 0.1285821081485173, + "grad_norm": 0.1295323520898819, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 258 + }, + { + "epoch": 0.12908048841265887, + "grad_norm": 0.10538823157548904, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 259 + }, + { + "epoch": 0.1295788686768004, + "grad_norm": 0.1093951016664505, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 260 + }, + { + "epoch": 0.13007724894094194, + "grad_norm": 0.10753627866506577, + "learning_rate": 0.0002, + "loss": 1.7058, + "step": 261 + }, + { + "epoch": 0.13057562920508348, + "grad_norm": 0.11015735566616058, + "learning_rate": 0.0002, + "loss": 1.7519, + "step": 262 + }, + { + "epoch": 0.131074009469225, + "grad_norm": 0.10606027394533157, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 263 + }, + { + "epoch": 0.13157238973336655, + "grad_norm": 0.09919940680265427, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 264 + }, + { + "epoch": 0.1320707699975081, + "grad_norm": 0.1004357561469078, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 265 + }, + { + "epoch": 0.13256915026164964, + "grad_norm": 0.1044403687119484, + "learning_rate": 0.0002, + "loss": 1.7131, + "step": 266 + }, + { + "epoch": 0.13306753052579118, + "grad_norm": 0.09830351173877716, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 267 + }, + { + "epoch": 0.1335659107899327, + "grad_norm": 0.09731124341487885, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 268 + }, + { + "epoch": 0.13406429105407425, + "grad_norm": 0.09874913096427917, + "learning_rate": 0.0002, + "loss": 1.6704, + "step": 269 + }, + { + "epoch": 0.1345626713182158, + "grad_norm": 1.0015792846679688, + "learning_rate": 0.0002, + "loss": 1.828, + "step": 270 + }, + { + "epoch": 0.13506105158235734, + "grad_norm": 0.15942072868347168, + "learning_rate": 0.0002, + "loss": 1.6851, + "step": 271 + }, + { + "epoch": 0.13555943184649888, + "grad_norm": 0.1272728443145752, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 272 + }, + { + "epoch": 0.1360578121106404, + "grad_norm": 0.13415473699569702, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 273 + }, + { + "epoch": 0.13655619237478195, + "grad_norm": 0.6600972414016724, + "learning_rate": 0.0002, + "loss": 1.845, + "step": 274 + }, + { + "epoch": 0.1370545726389235, + "grad_norm": 0.16784119606018066, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 275 + }, + { + "epoch": 0.13755295290306505, + "grad_norm": 0.14813649654388428, + "learning_rate": 0.0002, + "loss": 1.7188, + "step": 276 + }, + { + "epoch": 0.13805133316720658, + "grad_norm": 0.14158020913600922, + "learning_rate": 0.0002, + "loss": 1.7002, + "step": 277 + }, + { + "epoch": 0.13854971343134811, + "grad_norm": 0.48206424713134766, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 278 + }, + { + "epoch": 0.13904809369548965, + "grad_norm": 0.18177767097949982, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 279 + }, + { + "epoch": 0.1395464739596312, + "grad_norm": 0.12430819869041443, + "learning_rate": 0.0002, + "loss": 1.6939, + "step": 280 + }, + { + "epoch": 0.14004485422377275, + "grad_norm": 0.44922658801078796, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 281 + }, + { + "epoch": 0.14054323448791428, + "grad_norm": 0.14023765921592712, + "learning_rate": 0.0002, + "loss": 1.6521, + "step": 282 + }, + { + "epoch": 0.14104161475205582, + "grad_norm": 0.15241369605064392, + "learning_rate": 0.0002, + "loss": 1.6819, + "step": 283 + }, + { + "epoch": 0.14153999501619735, + "grad_norm": 0.12531667947769165, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 284 + }, + { + "epoch": 0.14203837528033889, + "grad_norm": 0.13596689701080322, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 285 + }, + { + "epoch": 0.14253675554448045, + "grad_norm": 0.1316744089126587, + "learning_rate": 0.0002, + "loss": 1.7503, + "step": 286 + }, + { + "epoch": 0.14303513580862198, + "grad_norm": 0.11584890633821487, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 287 + }, + { + "epoch": 0.14353351607276352, + "grad_norm": 0.37444308400154114, + "learning_rate": 0.0002, + "loss": 1.7808, + "step": 288 + }, + { + "epoch": 0.14403189633690505, + "grad_norm": 0.3217577338218689, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 289 + }, + { + "epoch": 0.1445302766010466, + "grad_norm": 0.12234029918909073, + "learning_rate": 0.0002, + "loss": 1.7131, + "step": 290 + }, + { + "epoch": 0.14502865686518815, + "grad_norm": 0.13871504366397858, + "learning_rate": 0.0002, + "loss": 1.7737, + "step": 291 + }, + { + "epoch": 0.14552703712932968, + "grad_norm": 0.10792572051286697, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 292 + }, + { + "epoch": 0.14602541739347122, + "grad_norm": 0.11277946084737778, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 293 + }, + { + "epoch": 0.14652379765761275, + "grad_norm": 0.11250103265047073, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 294 + }, + { + "epoch": 0.1470221779217543, + "grad_norm": 0.10644537955522537, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 295 + }, + { + "epoch": 0.14752055818589585, + "grad_norm": 0.12423089891672134, + "learning_rate": 0.0002, + "loss": 1.7349, + "step": 296 + }, + { + "epoch": 0.14801893845003739, + "grad_norm": 0.10547474026679993, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 297 + }, + { + "epoch": 0.14851731871417892, + "grad_norm": 0.10867539793252945, + "learning_rate": 0.0002, + "loss": 1.6709, + "step": 298 + }, + { + "epoch": 0.14901569897832045, + "grad_norm": 0.21218198537826538, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 299 + }, + { + "epoch": 0.149514079242462, + "grad_norm": 0.11373799294233322, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 300 + }, + { + "epoch": 0.15001245950660355, + "grad_norm": 0.12452666461467743, + "learning_rate": 0.0002, + "loss": 1.7625, + "step": 301 + }, + { + "epoch": 0.1505108397707451, + "grad_norm": 0.4068242609500885, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 302 + }, + { + "epoch": 0.15100922003488662, + "grad_norm": 0.15395419299602509, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 303 + }, + { + "epoch": 0.15150760029902816, + "grad_norm": 0.11441215127706528, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 304 + }, + { + "epoch": 0.1520059805631697, + "grad_norm": 0.13675518333911896, + "learning_rate": 0.0002, + "loss": 1.7005, + "step": 305 + }, + { + "epoch": 0.15250436082731123, + "grad_norm": 0.11606375873088837, + "learning_rate": 0.0002, + "loss": 1.6453, + "step": 306 + }, + { + "epoch": 0.1530027410914528, + "grad_norm": 0.4435337483882904, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 307 + }, + { + "epoch": 0.15350112135559432, + "grad_norm": 0.12212298810482025, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 308 + }, + { + "epoch": 0.15399950161973586, + "grad_norm": 0.14606495201587677, + "learning_rate": 0.0002, + "loss": 1.6517, + "step": 309 + }, + { + "epoch": 0.1544978818838774, + "grad_norm": 0.11753024160861969, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 310 + }, + { + "epoch": 0.15499626214801893, + "grad_norm": 0.13007789850234985, + "learning_rate": 0.0002, + "loss": 1.7462, + "step": 311 + }, + { + "epoch": 0.1554946424121605, + "grad_norm": 0.11651528626680374, + "learning_rate": 0.0002, + "loss": 1.7128, + "step": 312 + }, + { + "epoch": 0.15599302267630202, + "grad_norm": 0.1128389984369278, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 313 + }, + { + "epoch": 0.15649140294044356, + "grad_norm": 0.10965872555971146, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 314 + }, + { + "epoch": 0.1569897832045851, + "grad_norm": 0.10751237720251083, + "learning_rate": 0.0002, + "loss": 1.6346, + "step": 315 + }, + { + "epoch": 0.15748816346872663, + "grad_norm": 0.09646358340978622, + "learning_rate": 0.0002, + "loss": 1.6873, + "step": 316 + }, + { + "epoch": 0.1579865437328682, + "grad_norm": 0.09908836334943771, + "learning_rate": 0.0002, + "loss": 1.6934, + "step": 317 + }, + { + "epoch": 0.15848492399700972, + "grad_norm": 0.09631779044866562, + "learning_rate": 0.0002, + "loss": 1.6703, + "step": 318 + }, + { + "epoch": 0.15898330426115126, + "grad_norm": 0.5702200531959534, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 319 + }, + { + "epoch": 0.1594816845252928, + "grad_norm": 0.1274351179599762, + "learning_rate": 0.0002, + "loss": 1.6632, + "step": 320 + }, + { + "epoch": 0.15998006478943433, + "grad_norm": 0.10685572028160095, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 321 + }, + { + "epoch": 0.1604784450535759, + "grad_norm": 0.12333345413208008, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 322 + }, + { + "epoch": 0.16097682531771743, + "grad_norm": 0.10747205466032028, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 323 + }, + { + "epoch": 0.16147520558185896, + "grad_norm": 0.10506169497966766, + "learning_rate": 0.0002, + "loss": 1.7463, + "step": 324 + }, + { + "epoch": 0.1619735858460005, + "grad_norm": 0.11267457902431488, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 325 + }, + { + "epoch": 0.16247196611014203, + "grad_norm": 0.10924848914146423, + "learning_rate": 0.0002, + "loss": 1.7146, + "step": 326 + }, + { + "epoch": 0.16297034637428356, + "grad_norm": 0.11103785783052444, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 327 + }, + { + "epoch": 0.16346872663842513, + "grad_norm": 0.3997076451778412, + "learning_rate": 0.0002, + "loss": 1.8753, + "step": 328 + }, + { + "epoch": 0.16396710690256666, + "grad_norm": 0.10188498347997665, + "learning_rate": 0.0002, + "loss": 1.7483, + "step": 329 + }, + { + "epoch": 0.1644654871667082, + "grad_norm": 0.10824645310640335, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 330 + }, + { + "epoch": 0.16496386743084973, + "grad_norm": 0.09962976723909378, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 331 + }, + { + "epoch": 0.16546224769499127, + "grad_norm": 0.10796276479959488, + "learning_rate": 0.0002, + "loss": 1.6799, + "step": 332 + }, + { + "epoch": 0.16596062795913283, + "grad_norm": 0.09546298533678055, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 333 + }, + { + "epoch": 0.16645900822327436, + "grad_norm": 0.3045598864555359, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 334 + }, + { + "epoch": 0.1669573884874159, + "grad_norm": 0.10275569558143616, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 335 + }, + { + "epoch": 0.16745576875155743, + "grad_norm": 0.14451362192630768, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 336 + }, + { + "epoch": 0.16795414901569897, + "grad_norm": 0.0982123464345932, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 337 + }, + { + "epoch": 0.16845252927984053, + "grad_norm": 0.11521178483963013, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 338 + }, + { + "epoch": 0.16895090954398206, + "grad_norm": 0.2746621072292328, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 339 + }, + { + "epoch": 0.1694492898081236, + "grad_norm": 0.0955624207854271, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 340 + }, + { + "epoch": 0.16994767007226513, + "grad_norm": 0.10157962888479233, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 341 + }, + { + "epoch": 0.17044605033640667, + "grad_norm": 0.0971306711435318, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 342 + }, + { + "epoch": 0.17094443060054823, + "grad_norm": 0.10407841205596924, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 343 + }, + { + "epoch": 0.17144281086468977, + "grad_norm": 0.09228493273258209, + "learning_rate": 0.0002, + "loss": 1.6196, + "step": 344 + }, + { + "epoch": 0.1719411911288313, + "grad_norm": 0.10309567302465439, + "learning_rate": 0.0002, + "loss": 1.6534, + "step": 345 + }, + { + "epoch": 0.17243957139297283, + "grad_norm": 0.10019028931856155, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 346 + }, + { + "epoch": 0.17293795165711437, + "grad_norm": 0.09051994234323502, + "learning_rate": 0.0002, + "loss": 1.6537, + "step": 347 + }, + { + "epoch": 0.17343633192125593, + "grad_norm": 0.09501929581165314, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 348 + }, + { + "epoch": 0.17393471218539747, + "grad_norm": 0.09314325451850891, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 349 + }, + { + "epoch": 0.174433092449539, + "grad_norm": 0.09021347016096115, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 350 + }, + { + "epoch": 0.17493147271368054, + "grad_norm": 0.27376627922058105, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 351 + }, + { + "epoch": 0.17542985297782207, + "grad_norm": 0.11608853936195374, + "learning_rate": 0.0002, + "loss": 1.6974, + "step": 352 + }, + { + "epoch": 0.1759282332419636, + "grad_norm": 0.09565002471208572, + "learning_rate": 0.0002, + "loss": 1.6925, + "step": 353 + }, + { + "epoch": 0.17642661350610517, + "grad_norm": 0.10814974457025528, + "learning_rate": 0.0002, + "loss": 1.6349, + "step": 354 + }, + { + "epoch": 0.1769249937702467, + "grad_norm": 0.09551705420017242, + "learning_rate": 0.0002, + "loss": 1.6715, + "step": 355 + }, + { + "epoch": 0.17742337403438824, + "grad_norm": 0.10541266202926636, + "learning_rate": 0.0002, + "loss": 1.6592, + "step": 356 + }, + { + "epoch": 0.17792175429852977, + "grad_norm": 0.09884203970432281, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 357 + }, + { + "epoch": 0.1784201345626713, + "grad_norm": 0.19244062900543213, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 358 + }, + { + "epoch": 0.17891851482681287, + "grad_norm": 0.1312815397977829, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 359 + }, + { + "epoch": 0.1794168950909544, + "grad_norm": 0.10575084388256073, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 360 + }, + { + "epoch": 0.17991527535509594, + "grad_norm": 0.1993856579065323, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 361 + }, + { + "epoch": 0.18041365561923747, + "grad_norm": 0.1053745448589325, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 362 + }, + { + "epoch": 0.180912035883379, + "grad_norm": 0.10017159581184387, + "learning_rate": 0.0002, + "loss": 1.6565, + "step": 363 + }, + { + "epoch": 0.18141041614752057, + "grad_norm": 0.12066628038883209, + "learning_rate": 0.0002, + "loss": 1.639, + "step": 364 + }, + { + "epoch": 0.1819087964116621, + "grad_norm": 0.12606841325759888, + "learning_rate": 0.0002, + "loss": 1.8435, + "step": 365 + }, + { + "epoch": 0.18240717667580364, + "grad_norm": 0.10491355508565903, + "learning_rate": 0.0002, + "loss": 1.5846, + "step": 366 + }, + { + "epoch": 0.18290555693994517, + "grad_norm": 0.10337149351835251, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 367 + }, + { + "epoch": 0.1834039372040867, + "grad_norm": 0.09452168643474579, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 368 + }, + { + "epoch": 0.18390231746822827, + "grad_norm": 0.09799271076917648, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 369 + }, + { + "epoch": 0.1844006977323698, + "grad_norm": 0.09442919492721558, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 370 + }, + { + "epoch": 0.18489907799651134, + "grad_norm": 0.09542658925056458, + "learning_rate": 0.0002, + "loss": 1.612, + "step": 371 + }, + { + "epoch": 0.18539745826065288, + "grad_norm": 0.0989847183227539, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 372 + }, + { + "epoch": 0.1858958385247944, + "grad_norm": 0.09289655089378357, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 373 + }, + { + "epoch": 0.18639421878893594, + "grad_norm": 0.10097731649875641, + "learning_rate": 0.0002, + "loss": 1.7114, + "step": 374 + }, + { + "epoch": 0.1868925990530775, + "grad_norm": 0.09352610260248184, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 375 + }, + { + "epoch": 0.18739097931721904, + "grad_norm": 0.0907459631562233, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 376 + }, + { + "epoch": 0.18788935958136058, + "grad_norm": 0.0915813073515892, + "learning_rate": 0.0002, + "loss": 1.6289, + "step": 377 + }, + { + "epoch": 0.1883877398455021, + "grad_norm": 0.09011110663414001, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 378 + }, + { + "epoch": 0.18888612010964365, + "grad_norm": 0.4069153964519501, + "learning_rate": 0.0002, + "loss": 1.6647, + "step": 379 + }, + { + "epoch": 0.1893845003737852, + "grad_norm": 0.1351984292268753, + "learning_rate": 0.0002, + "loss": 1.7911, + "step": 380 + }, + { + "epoch": 0.18988288063792674, + "grad_norm": 0.537133514881134, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 381 + }, + { + "epoch": 0.19038126090206828, + "grad_norm": 0.10901357978582382, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 382 + }, + { + "epoch": 0.1908796411662098, + "grad_norm": 0.19000430405139923, + "learning_rate": 0.0002, + "loss": 1.6682, + "step": 383 + }, + { + "epoch": 0.19137802143035135, + "grad_norm": 0.12100650370121002, + "learning_rate": 0.0002, + "loss": 1.6844, + "step": 384 + }, + { + "epoch": 0.1918764016944929, + "grad_norm": 0.12487197667360306, + "learning_rate": 0.0002, + "loss": 1.7239, + "step": 385 + }, + { + "epoch": 0.19237478195863444, + "grad_norm": 0.12008525431156158, + "learning_rate": 0.0002, + "loss": 1.6443, + "step": 386 + }, + { + "epoch": 0.19287316222277598, + "grad_norm": 0.119840107858181, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 387 + }, + { + "epoch": 0.1933715424869175, + "grad_norm": 0.1126130223274231, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 388 + }, + { + "epoch": 0.19386992275105905, + "grad_norm": 0.11164896190166473, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 389 + }, + { + "epoch": 0.1943683030152006, + "grad_norm": 0.1496819108724594, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 390 + }, + { + "epoch": 0.19486668327934215, + "grad_norm": 0.09984704852104187, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 391 + }, + { + "epoch": 0.19536506354348368, + "grad_norm": 0.10864219069480896, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 392 + }, + { + "epoch": 0.19586344380762521, + "grad_norm": 0.09744228422641754, + "learning_rate": 0.0002, + "loss": 1.6162, + "step": 393 + }, + { + "epoch": 0.19636182407176675, + "grad_norm": 0.11409466713666916, + "learning_rate": 0.0002, + "loss": 1.6646, + "step": 394 + }, + { + "epoch": 0.19686020433590828, + "grad_norm": 0.096027672290802, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 395 + }, + { + "epoch": 0.19735858460004985, + "grad_norm": 0.48993775248527527, + "learning_rate": 0.0002, + "loss": 1.7454, + "step": 396 + }, + { + "epoch": 0.19785696486419138, + "grad_norm": 0.11972647160291672, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 397 + }, + { + "epoch": 0.19835534512833292, + "grad_norm": 0.49595576524734497, + "learning_rate": 0.0002, + "loss": 1.6128, + "step": 398 + }, + { + "epoch": 0.19885372539247445, + "grad_norm": 0.11590411514043808, + "learning_rate": 0.0002, + "loss": 1.7173, + "step": 399 + }, + { + "epoch": 0.19935210565661599, + "grad_norm": 0.11584487557411194, + "learning_rate": 0.0002, + "loss": 1.6773, + "step": 400 + }, + { + "epoch": 0.19985048592075755, + "grad_norm": 0.1017480343580246, + "learning_rate": 0.0002, + "loss": 1.6388, + "step": 401 + }, + { + "epoch": 0.20034886618489908, + "grad_norm": 0.12011077255010605, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 402 + }, + { + "epoch": 0.20084724644904062, + "grad_norm": 0.36016201972961426, + "learning_rate": 0.0002, + "loss": 1.8179, + "step": 403 + }, + { + "epoch": 0.20134562671318215, + "grad_norm": 0.11278028786182404, + "learning_rate": 0.0002, + "loss": 1.6733, + "step": 404 + }, + { + "epoch": 0.2018440069773237, + "grad_norm": 0.10928738862276077, + "learning_rate": 0.0002, + "loss": 1.6858, + "step": 405 + }, + { + "epoch": 0.20234238724146525, + "grad_norm": 0.10860306769609451, + "learning_rate": 0.0002, + "loss": 1.6975, + "step": 406 + }, + { + "epoch": 0.20284076750560678, + "grad_norm": 0.11352024972438812, + "learning_rate": 0.0002, + "loss": 1.7504, + "step": 407 + }, + { + "epoch": 0.20333914776974832, + "grad_norm": 0.10320567339658737, + "learning_rate": 0.0002, + "loss": 1.6715, + "step": 408 + }, + { + "epoch": 0.20383752803388985, + "grad_norm": 0.12056868523359299, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 409 + }, + { + "epoch": 0.2043359082980314, + "grad_norm": 0.11091714352369308, + "learning_rate": 0.0002, + "loss": 1.6391, + "step": 410 + }, + { + "epoch": 0.20483428856217295, + "grad_norm": 0.10888761281967163, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 411 + }, + { + "epoch": 0.20533266882631449, + "grad_norm": 0.2625375986099243, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 412 + }, + { + "epoch": 0.20583104909045602, + "grad_norm": 0.12070990353822708, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 413 + }, + { + "epoch": 0.20632942935459755, + "grad_norm": 0.09670402854681015, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 414 + }, + { + "epoch": 0.2068278096187391, + "grad_norm": 0.10343360900878906, + "learning_rate": 0.0002, + "loss": 1.7273, + "step": 415 + }, + { + "epoch": 0.20732618988288065, + "grad_norm": 0.10445055365562439, + "learning_rate": 0.0002, + "loss": 1.674, + "step": 416 + }, + { + "epoch": 0.2078245701470222, + "grad_norm": 0.24325382709503174, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 417 + }, + { + "epoch": 0.20832295041116372, + "grad_norm": 0.10541153699159622, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 418 + }, + { + "epoch": 0.20882133067530526, + "grad_norm": 0.09688902646303177, + "learning_rate": 0.0002, + "loss": 1.7145, + "step": 419 + }, + { + "epoch": 0.2093197109394468, + "grad_norm": 0.10568691790103912, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 420 + }, + { + "epoch": 0.20981809120358832, + "grad_norm": 0.09683585166931152, + "learning_rate": 0.0002, + "loss": 1.6411, + "step": 421 + }, + { + "epoch": 0.2103164714677299, + "grad_norm": 0.10286644101142883, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 422 + }, + { + "epoch": 0.21081485173187142, + "grad_norm": 0.09786178171634674, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 423 + }, + { + "epoch": 0.21131323199601296, + "grad_norm": 0.10202211886644363, + "learning_rate": 0.0002, + "loss": 1.6702, + "step": 424 + }, + { + "epoch": 0.2118116122601545, + "grad_norm": 0.10444546490907669, + "learning_rate": 0.0002, + "loss": 1.6371, + "step": 425 + }, + { + "epoch": 0.21230999252429603, + "grad_norm": 0.09346964955329895, + "learning_rate": 0.0002, + "loss": 1.6638, + "step": 426 + }, + { + "epoch": 0.2128083727884376, + "grad_norm": 0.09578395634889603, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 427 + }, + { + "epoch": 0.21330675305257912, + "grad_norm": 0.09412133693695068, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 428 + }, + { + "epoch": 0.21380513331672066, + "grad_norm": 0.49985215067863464, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 429 + }, + { + "epoch": 0.2143035135808622, + "grad_norm": 0.58636075258255, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 430 + }, + { + "epoch": 0.21480189384500373, + "grad_norm": 0.12334456294775009, + "learning_rate": 0.0002, + "loss": 1.6392, + "step": 431 + }, + { + "epoch": 0.2153002741091453, + "grad_norm": 0.13144731521606445, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 432 + }, + { + "epoch": 0.21579865437328682, + "grad_norm": 0.14804112911224365, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 433 + }, + { + "epoch": 0.21629703463742836, + "grad_norm": 0.7628450393676758, + "learning_rate": 0.0002, + "loss": 1.8465, + "step": 434 + }, + { + "epoch": 0.2167954149015699, + "grad_norm": 0.18024517595767975, + "learning_rate": 0.0002, + "loss": 1.6732, + "step": 435 + }, + { + "epoch": 0.21729379516571143, + "grad_norm": 0.195417121052742, + "learning_rate": 0.0002, + "loss": 1.7811, + "step": 436 + }, + { + "epoch": 0.217792175429853, + "grad_norm": 0.28199324011802673, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 437 + }, + { + "epoch": 0.21829055569399453, + "grad_norm": 0.15422897040843964, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 438 + }, + { + "epoch": 0.21878893595813606, + "grad_norm": 0.13214194774627686, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 439 + }, + { + "epoch": 0.2192873162222776, + "grad_norm": 0.14797765016555786, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 440 + }, + { + "epoch": 0.21978569648641913, + "grad_norm": 0.12424055486917496, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 441 + }, + { + "epoch": 0.22028407675056066, + "grad_norm": 0.5921161770820618, + "learning_rate": 0.0002, + "loss": 1.7352, + "step": 442 + }, + { + "epoch": 0.22078245701470223, + "grad_norm": 0.1724957525730133, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 443 + }, + { + "epoch": 0.22128083727884376, + "grad_norm": 0.1341264247894287, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 444 + }, + { + "epoch": 0.2217792175429853, + "grad_norm": 0.43373820185661316, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 445 + }, + { + "epoch": 0.22227759780712683, + "grad_norm": 0.15030571818351746, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 446 + }, + { + "epoch": 0.22277597807126837, + "grad_norm": 0.15096893906593323, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 447 + }, + { + "epoch": 0.22327435833540993, + "grad_norm": 0.1577889323234558, + "learning_rate": 0.0002, + "loss": 1.6704, + "step": 448 + }, + { + "epoch": 0.22377273859955146, + "grad_norm": 0.11596284061670303, + "learning_rate": 0.0002, + "loss": 1.5843, + "step": 449 + }, + { + "epoch": 0.224271118863693, + "grad_norm": 0.14083531498908997, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 450 + }, + { + "epoch": 0.22476949912783453, + "grad_norm": 0.11369968950748444, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 451 + }, + { + "epoch": 0.22526787939197607, + "grad_norm": 0.12249240279197693, + "learning_rate": 0.0002, + "loss": 1.6041, + "step": 452 + }, + { + "epoch": 0.22576625965611763, + "grad_norm": 0.13246704638004303, + "learning_rate": 0.0002, + "loss": 1.7227, + "step": 453 + }, + { + "epoch": 0.22626463992025916, + "grad_norm": 0.15372870862483978, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 454 + }, + { + "epoch": 0.2267630201844007, + "grad_norm": 0.10773339122533798, + "learning_rate": 0.0002, + "loss": 1.6797, + "step": 455 + }, + { + "epoch": 0.22726140044854223, + "grad_norm": 0.10603539645671844, + "learning_rate": 0.0002, + "loss": 1.6608, + "step": 456 + }, + { + "epoch": 0.22775978071268377, + "grad_norm": 0.11118324100971222, + "learning_rate": 0.0002, + "loss": 1.6659, + "step": 457 + }, + { + "epoch": 0.22825816097682533, + "grad_norm": 0.10193316638469696, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 458 + }, + { + "epoch": 0.22875654124096687, + "grad_norm": 0.118270143866539, + "learning_rate": 0.0002, + "loss": 1.6581, + "step": 459 + }, + { + "epoch": 0.2292549215051084, + "grad_norm": 0.09839551895856857, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 460 + }, + { + "epoch": 0.22975330176924993, + "grad_norm": 0.10430920869112015, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 461 + }, + { + "epoch": 0.23025168203339147, + "grad_norm": 0.7883297204971313, + "learning_rate": 0.0002, + "loss": 1.8726, + "step": 462 + }, + { + "epoch": 0.230750062297533, + "grad_norm": 0.14015096426010132, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 463 + }, + { + "epoch": 0.23124844256167457, + "grad_norm": 0.6940969824790955, + "learning_rate": 0.0002, + "loss": 1.8366, + "step": 464 + }, + { + "epoch": 0.2317468228258161, + "grad_norm": 0.16839167475700378, + "learning_rate": 0.0002, + "loss": 1.6627, + "step": 465 + }, + { + "epoch": 0.23224520308995764, + "grad_norm": 0.14831361174583435, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 466 + }, + { + "epoch": 0.23274358335409917, + "grad_norm": 0.6374949216842651, + "learning_rate": 0.0002, + "loss": 1.8086, + "step": 467 + }, + { + "epoch": 0.2332419636182407, + "grad_norm": 0.1442909985780716, + "learning_rate": 0.0002, + "loss": 1.6875, + "step": 468 + }, + { + "epoch": 0.23374034388238227, + "grad_norm": 0.15487882494926453, + "learning_rate": 0.0002, + "loss": 1.6939, + "step": 469 + }, + { + "epoch": 0.2342387241465238, + "grad_norm": 0.133474662899971, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 470 + }, + { + "epoch": 0.23473710441066534, + "grad_norm": 0.15738508105278015, + "learning_rate": 0.0002, + "loss": 1.6801, + "step": 471 + }, + { + "epoch": 0.23523548467480687, + "grad_norm": 0.13371291756629944, + "learning_rate": 0.0002, + "loss": 1.6454, + "step": 472 + }, + { + "epoch": 0.2357338649389484, + "grad_norm": 0.12480079382658005, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 473 + }, + { + "epoch": 0.23623224520308997, + "grad_norm": 0.138162761926651, + "learning_rate": 0.0002, + "loss": 1.6844, + "step": 474 + }, + { + "epoch": 0.2367306254672315, + "grad_norm": 0.13453134894371033, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 475 + }, + { + "epoch": 0.23722900573137304, + "grad_norm": 0.11864453554153442, + "learning_rate": 0.0002, + "loss": 1.7311, + "step": 476 + }, + { + "epoch": 0.23772738599551457, + "grad_norm": 0.3905930817127228, + "learning_rate": 0.0002, + "loss": 1.7638, + "step": 477 + }, + { + "epoch": 0.2382257662596561, + "grad_norm": 0.1613403707742691, + "learning_rate": 0.0002, + "loss": 1.6413, + "step": 478 + }, + { + "epoch": 0.23872414652379767, + "grad_norm": 0.13828811049461365, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 479 + }, + { + "epoch": 0.2392225267879392, + "grad_norm": 0.13535858690738678, + "learning_rate": 0.0002, + "loss": 1.6059, + "step": 480 + }, + { + "epoch": 0.23972090705208074, + "grad_norm": 0.15594834089279175, + "learning_rate": 0.0002, + "loss": 1.7161, + "step": 481 + }, + { + "epoch": 0.24021928731622227, + "grad_norm": 0.11990589648485184, + "learning_rate": 0.0002, + "loss": 1.7051, + "step": 482 + }, + { + "epoch": 0.2407176675803638, + "grad_norm": 0.11655411124229431, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 483 + }, + { + "epoch": 0.24121604784450537, + "grad_norm": 0.11754405498504639, + "learning_rate": 0.0002, + "loss": 1.7237, + "step": 484 + }, + { + "epoch": 0.2417144281086469, + "grad_norm": 0.1332051157951355, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 485 + }, + { + "epoch": 0.24221280837278844, + "grad_norm": 0.10240749269723892, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 486 + }, + { + "epoch": 0.24271118863692998, + "grad_norm": 0.1425447165966034, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 487 + }, + { + "epoch": 0.2432095689010715, + "grad_norm": 0.10178319364786148, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 488 + }, + { + "epoch": 0.24370794916521304, + "grad_norm": 0.354878306388855, + "learning_rate": 0.0002, + "loss": 1.7251, + "step": 489 + }, + { + "epoch": 0.2442063294293546, + "grad_norm": 0.10244394838809967, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 490 + }, + { + "epoch": 0.24470470969349614, + "grad_norm": 0.10944903641939163, + "learning_rate": 0.0002, + "loss": 1.5817, + "step": 491 + }, + { + "epoch": 0.24520308995763768, + "grad_norm": 0.11182764172554016, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 492 + }, + { + "epoch": 0.2457014702217792, + "grad_norm": 0.11066277325153351, + "learning_rate": 0.0002, + "loss": 1.6275, + "step": 493 + }, + { + "epoch": 0.24619985048592075, + "grad_norm": 0.6789163947105408, + "learning_rate": 0.0002, + "loss": 1.8408, + "step": 494 + }, + { + "epoch": 0.2466982307500623, + "grad_norm": 0.15237462520599365, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 495 + }, + { + "epoch": 0.24719661101420384, + "grad_norm": 0.14016127586364746, + "learning_rate": 0.0002, + "loss": 1.6325, + "step": 496 + }, + { + "epoch": 0.24769499127834538, + "grad_norm": 0.12557458877563477, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 497 + }, + { + "epoch": 0.2481933715424869, + "grad_norm": 0.12593714892864227, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 498 + }, + { + "epoch": 0.24869175180662845, + "grad_norm": 0.12869895994663239, + "learning_rate": 0.0002, + "loss": 1.6982, + "step": 499 + }, + { + "epoch": 0.24919013207077, + "grad_norm": 0.6727408766746521, + "learning_rate": 0.0002, + "loss": 1.7735, + "step": 500 + }, + { + "epoch": 0.24968851233491154, + "grad_norm": 0.18164046108722687, + "learning_rate": 0.0002, + "loss": 1.7327, + "step": 501 + }, + { + "epoch": 0.2501868925990531, + "grad_norm": 0.12988890707492828, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 502 + }, + { + "epoch": 0.25068527286319464, + "grad_norm": 0.14229950308799744, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 503 + }, + { + "epoch": 0.25118365312733615, + "grad_norm": 0.12232649326324463, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 504 + }, + { + "epoch": 0.2516820333914777, + "grad_norm": 0.12053592503070831, + "learning_rate": 0.0002, + "loss": 1.5962, + "step": 505 + }, + { + "epoch": 0.2521804136556192, + "grad_norm": 0.12370762974023819, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 506 + }, + { + "epoch": 0.2526787939197608, + "grad_norm": 0.11628440022468567, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 507 + }, + { + "epoch": 0.25317717418390234, + "grad_norm": 0.1284741759300232, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 508 + }, + { + "epoch": 0.25367555444804385, + "grad_norm": 0.133184552192688, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 509 + }, + { + "epoch": 0.2541739347121854, + "grad_norm": 0.11966334283351898, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 510 + }, + { + "epoch": 0.2546723149763269, + "grad_norm": 0.12117716670036316, + "learning_rate": 0.0002, + "loss": 1.6458, + "step": 511 + }, + { + "epoch": 0.2551706952404685, + "grad_norm": 0.11778345704078674, + "learning_rate": 0.0002, + "loss": 1.6272, + "step": 512 + }, + { + "epoch": 0.25566907550461004, + "grad_norm": 0.11609595268964767, + "learning_rate": 0.0002, + "loss": 1.6588, + "step": 513 + }, + { + "epoch": 0.25616745576875155, + "grad_norm": 0.11605001240968704, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 514 + }, + { + "epoch": 0.2566658360328931, + "grad_norm": 0.10593124479055405, + "learning_rate": 0.0002, + "loss": 1.6628, + "step": 515 + }, + { + "epoch": 0.2571642162970346, + "grad_norm": 0.11132659763097763, + "learning_rate": 0.0002, + "loss": 1.7112, + "step": 516 + }, + { + "epoch": 0.2576625965611762, + "grad_norm": 0.09980247169733047, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 517 + }, + { + "epoch": 0.25816097682531775, + "grad_norm": 0.6143377423286438, + "learning_rate": 0.0002, + "loss": 1.6616, + "step": 518 + }, + { + "epoch": 0.25865935708945925, + "grad_norm": 0.11244726181030273, + "learning_rate": 0.0002, + "loss": 1.7124, + "step": 519 + }, + { + "epoch": 0.2591577373536008, + "grad_norm": 0.6190444827079773, + "learning_rate": 0.0002, + "loss": 1.7698, + "step": 520 + }, + { + "epoch": 0.2596561176177423, + "grad_norm": 0.7441633939743042, + "learning_rate": 0.0002, + "loss": 1.8182, + "step": 521 + }, + { + "epoch": 0.2601544978818839, + "grad_norm": 0.13578347861766815, + "learning_rate": 0.0002, + "loss": 1.6609, + "step": 522 + }, + { + "epoch": 0.2606528781460254, + "grad_norm": 0.1662416160106659, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 523 + }, + { + "epoch": 0.26115125841016695, + "grad_norm": 0.16020916402339935, + "learning_rate": 0.0002, + "loss": 1.6636, + "step": 524 + }, + { + "epoch": 0.2616496386743085, + "grad_norm": 0.12748084962368011, + "learning_rate": 0.0002, + "loss": 1.6832, + "step": 525 + }, + { + "epoch": 0.26214801893845, + "grad_norm": 0.13277047872543335, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 526 + }, + { + "epoch": 0.2626463992025916, + "grad_norm": 0.11746570467948914, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 527 + }, + { + "epoch": 0.2631447794667331, + "grad_norm": 0.1124933585524559, + "learning_rate": 0.0002, + "loss": 1.6462, + "step": 528 + }, + { + "epoch": 0.26364315973087465, + "grad_norm": 0.13045774400234222, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 529 + }, + { + "epoch": 0.2641415399950162, + "grad_norm": 0.11953026801347733, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 530 + }, + { + "epoch": 0.2646399202591577, + "grad_norm": 0.3236943185329437, + "learning_rate": 0.0002, + "loss": 1.6562, + "step": 531 + }, + { + "epoch": 0.2651383005232993, + "grad_norm": 0.13000494241714478, + "learning_rate": 0.0002, + "loss": 1.6329, + "step": 532 + }, + { + "epoch": 0.2656366807874408, + "grad_norm": 0.13072949647903442, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 533 + }, + { + "epoch": 0.26613506105158236, + "grad_norm": 0.30452999472618103, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 534 + }, + { + "epoch": 0.2666334413157239, + "grad_norm": 0.11118455231189728, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 535 + }, + { + "epoch": 0.2671318215798654, + "grad_norm": 0.12459013611078262, + "learning_rate": 0.0002, + "loss": 1.6959, + "step": 536 + }, + { + "epoch": 0.267630201844007, + "grad_norm": 0.10970738530158997, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 537 + }, + { + "epoch": 0.2681285821081485, + "grad_norm": 0.1440659761428833, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 538 + }, + { + "epoch": 0.26862696237229006, + "grad_norm": 0.11448108404874802, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 539 + }, + { + "epoch": 0.2691253426364316, + "grad_norm": 0.11026275157928467, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 540 + }, + { + "epoch": 0.2696237229005731, + "grad_norm": 0.10443202406167984, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 541 + }, + { + "epoch": 0.2701221031647147, + "grad_norm": 0.11404629796743393, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 542 + }, + { + "epoch": 0.2706204834288562, + "grad_norm": 0.12783807516098022, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 543 + }, + { + "epoch": 0.27111886369299776, + "grad_norm": 0.1040879487991333, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 544 + }, + { + "epoch": 0.2716172439571393, + "grad_norm": 0.10120297223329544, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 545 + }, + { + "epoch": 0.2721156242212808, + "grad_norm": 0.11116039007902145, + "learning_rate": 0.0002, + "loss": 1.6685, + "step": 546 + }, + { + "epoch": 0.2726140044854224, + "grad_norm": 0.353816956281662, + "learning_rate": 0.0002, + "loss": 1.7458, + "step": 547 + }, + { + "epoch": 0.2731123847495639, + "grad_norm": 0.10361409932374954, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 548 + }, + { + "epoch": 0.27361076501370546, + "grad_norm": 0.10164079070091248, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 549 + }, + { + "epoch": 0.274109145277847, + "grad_norm": 0.3576943278312683, + "learning_rate": 0.0002, + "loss": 1.7155, + "step": 550 + }, + { + "epoch": 0.27460752554198853, + "grad_norm": 0.1307370960712433, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 551 + }, + { + "epoch": 0.2751059058061301, + "grad_norm": 0.11267419159412384, + "learning_rate": 0.0002, + "loss": 1.6299, + "step": 552 + }, + { + "epoch": 0.2756042860702716, + "grad_norm": 0.10955934971570969, + "learning_rate": 0.0002, + "loss": 1.6972, + "step": 553 + }, + { + "epoch": 0.27610266633441316, + "grad_norm": 0.3629993796348572, + "learning_rate": 0.0002, + "loss": 1.6558, + "step": 554 + }, + { + "epoch": 0.2766010465985547, + "grad_norm": 0.10678595304489136, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 555 + }, + { + "epoch": 0.27709942686269623, + "grad_norm": 0.3551732301712036, + "learning_rate": 0.0002, + "loss": 1.7884, + "step": 556 + }, + { + "epoch": 0.2775978071268378, + "grad_norm": 0.1157960370182991, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 557 + }, + { + "epoch": 0.2780961873909793, + "grad_norm": 0.4219015836715698, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 558 + }, + { + "epoch": 0.27859456765512086, + "grad_norm": 0.1442400962114334, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 559 + }, + { + "epoch": 0.2790929479192624, + "grad_norm": 0.12307796627283096, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 560 + }, + { + "epoch": 0.27959132818340393, + "grad_norm": 0.13523195683956146, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 561 + }, + { + "epoch": 0.2800897084475455, + "grad_norm": 0.14576253294944763, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 562 + }, + { + "epoch": 0.280588088711687, + "grad_norm": 0.1239597350358963, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 563 + }, + { + "epoch": 0.28108646897582856, + "grad_norm": 0.11444118618965149, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 564 + }, + { + "epoch": 0.28158484923997007, + "grad_norm": 0.11568321287631989, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 565 + }, + { + "epoch": 0.28208322950411163, + "grad_norm": 0.1155436560511589, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 566 + }, + { + "epoch": 0.2825816097682532, + "grad_norm": 0.10945037007331848, + "learning_rate": 0.0002, + "loss": 1.5764, + "step": 567 + }, + { + "epoch": 0.2830799900323947, + "grad_norm": 0.5043824315071106, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 568 + }, + { + "epoch": 0.28357837029653626, + "grad_norm": 0.7879558801651001, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 569 + }, + { + "epoch": 0.28407675056067777, + "grad_norm": 0.13888636231422424, + "learning_rate": 0.0002, + "loss": 1.6418, + "step": 570 + }, + { + "epoch": 0.28457513082481933, + "grad_norm": 0.16137146949768066, + "learning_rate": 0.0002, + "loss": 1.6884, + "step": 571 + }, + { + "epoch": 0.2850735110889609, + "grad_norm": 0.2237291783094406, + "learning_rate": 0.0002, + "loss": 1.7934, + "step": 572 + }, + { + "epoch": 0.2855718913531024, + "grad_norm": 0.14624369144439697, + "learning_rate": 0.0002, + "loss": 1.676, + "step": 573 + }, + { + "epoch": 0.28607027161724397, + "grad_norm": 0.1463831216096878, + "learning_rate": 0.0002, + "loss": 1.5869, + "step": 574 + }, + { + "epoch": 0.28656865188138547, + "grad_norm": 0.14725126326084137, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 575 + }, + { + "epoch": 0.28706703214552703, + "grad_norm": 0.13732214272022247, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 576 + }, + { + "epoch": 0.2875654124096686, + "grad_norm": 0.14334504306316376, + "learning_rate": 0.0002, + "loss": 1.6318, + "step": 577 + }, + { + "epoch": 0.2880637926738101, + "grad_norm": 0.8194677829742432, + "learning_rate": 0.0002, + "loss": 1.8945, + "step": 578 + }, + { + "epoch": 0.28856217293795167, + "grad_norm": 0.1749170422554016, + "learning_rate": 0.0002, + "loss": 1.6608, + "step": 579 + }, + { + "epoch": 0.2890605532020932, + "grad_norm": 0.12977321445941925, + "learning_rate": 0.0002, + "loss": 1.6363, + "step": 580 + }, + { + "epoch": 0.28955893346623474, + "grad_norm": 0.2908933162689209, + "learning_rate": 0.0002, + "loss": 1.8448, + "step": 581 + }, + { + "epoch": 0.2900573137303763, + "grad_norm": 0.17108629643917084, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 582 + }, + { + "epoch": 0.2905556939945178, + "grad_norm": 0.14702463150024414, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 583 + }, + { + "epoch": 0.29105407425865937, + "grad_norm": 0.12582743167877197, + "learning_rate": 0.0002, + "loss": 1.6245, + "step": 584 + }, + { + "epoch": 0.2915524545228009, + "grad_norm": 0.14732137322425842, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 585 + }, + { + "epoch": 0.29205083478694244, + "grad_norm": 0.12849657237529755, + "learning_rate": 0.0002, + "loss": 1.6583, + "step": 586 + }, + { + "epoch": 0.292549215051084, + "grad_norm": 0.11466097086668015, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 587 + }, + { + "epoch": 0.2930475953152255, + "grad_norm": 0.12361207604408264, + "learning_rate": 0.0002, + "loss": 1.6765, + "step": 588 + }, + { + "epoch": 0.29354597557936707, + "grad_norm": 0.1265360414981842, + "learning_rate": 0.0002, + "loss": 1.667, + "step": 589 + }, + { + "epoch": 0.2940443558435086, + "grad_norm": 0.11903838813304901, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 590 + }, + { + "epoch": 0.29454273610765014, + "grad_norm": 0.8345243334770203, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 591 + }, + { + "epoch": 0.2950411163717917, + "grad_norm": 0.1365821361541748, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 592 + }, + { + "epoch": 0.2955394966359332, + "grad_norm": 0.13564884662628174, + "learning_rate": 0.0002, + "loss": 1.6129, + "step": 593 + }, + { + "epoch": 0.29603787690007477, + "grad_norm": 0.13604499399662018, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 594 + }, + { + "epoch": 0.2965362571642163, + "grad_norm": 0.12102136015892029, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 595 + }, + { + "epoch": 0.29703463742835784, + "grad_norm": 0.11927222460508347, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 596 + }, + { + "epoch": 0.2975330176924994, + "grad_norm": 0.10716401040554047, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 597 + }, + { + "epoch": 0.2980313979566409, + "grad_norm": 0.12001641094684601, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 598 + }, + { + "epoch": 0.29852977822078247, + "grad_norm": 0.11045756936073303, + "learning_rate": 0.0002, + "loss": 1.6871, + "step": 599 + }, + { + "epoch": 0.299028158484924, + "grad_norm": 0.7450900077819824, + "learning_rate": 0.0002, + "loss": 1.8146, + "step": 600 + }, + { + "epoch": 0.29952653874906554, + "grad_norm": 0.16306158900260925, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 601 + }, + { + "epoch": 0.3000249190132071, + "grad_norm": 0.43425318598747253, + "learning_rate": 0.0002, + "loss": 1.7405, + "step": 602 + }, + { + "epoch": 0.3005232992773486, + "grad_norm": 0.16279961168766022, + "learning_rate": 0.0002, + "loss": 1.6, + "step": 603 + }, + { + "epoch": 0.3010216795414902, + "grad_norm": 0.1403011977672577, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 604 + }, + { + "epoch": 0.3015200598056317, + "grad_norm": 0.13146822154521942, + "learning_rate": 0.0002, + "loss": 1.5689, + "step": 605 + }, + { + "epoch": 0.30201844006977324, + "grad_norm": 0.15902653336524963, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 606 + }, + { + "epoch": 0.3025168203339148, + "grad_norm": 0.12351160496473312, + "learning_rate": 0.0002, + "loss": 1.714, + "step": 607 + }, + { + "epoch": 0.3030152005980563, + "grad_norm": 0.1543518602848053, + "learning_rate": 0.0002, + "loss": 1.6432, + "step": 608 + }, + { + "epoch": 0.3035135808621979, + "grad_norm": 0.11827117949724197, + "learning_rate": 0.0002, + "loss": 1.6325, + "step": 609 + }, + { + "epoch": 0.3040119611263394, + "grad_norm": 0.5559304356575012, + "learning_rate": 0.0002, + "loss": 1.6789, + "step": 610 + }, + { + "epoch": 0.30451034139048094, + "grad_norm": 0.13763754069805145, + "learning_rate": 0.0002, + "loss": 1.6715, + "step": 611 + }, + { + "epoch": 0.30500872165462245, + "grad_norm": 0.12646999955177307, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 612 + }, + { + "epoch": 0.305507101918764, + "grad_norm": 0.34849414229393005, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 613 + }, + { + "epoch": 0.3060054821829056, + "grad_norm": 0.11648757755756378, + "learning_rate": 0.0002, + "loss": 1.646, + "step": 614 + }, + { + "epoch": 0.3065038624470471, + "grad_norm": 0.13477148115634918, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 615 + }, + { + "epoch": 0.30700224271118864, + "grad_norm": 0.1102217361330986, + "learning_rate": 0.0002, + "loss": 1.6729, + "step": 616 + }, + { + "epoch": 0.30750062297533015, + "grad_norm": 0.5752671957015991, + "learning_rate": 0.0002, + "loss": 1.6233, + "step": 617 + }, + { + "epoch": 0.3079990032394717, + "grad_norm": 0.13107599318027496, + "learning_rate": 0.0002, + "loss": 1.6636, + "step": 618 + }, + { + "epoch": 0.3084973835036133, + "grad_norm": 0.11860768496990204, + "learning_rate": 0.0002, + "loss": 1.7313, + "step": 619 + }, + { + "epoch": 0.3089957637677548, + "grad_norm": 0.1229948177933693, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 620 + }, + { + "epoch": 0.30949414403189635, + "grad_norm": 0.30836552381515503, + "learning_rate": 0.0002, + "loss": 1.6969, + "step": 621 + }, + { + "epoch": 0.30999252429603785, + "grad_norm": 0.11798208951950073, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 622 + }, + { + "epoch": 0.3104909045601794, + "grad_norm": 0.4807080030441284, + "learning_rate": 0.0002, + "loss": 1.6899, + "step": 623 + }, + { + "epoch": 0.310989284824321, + "grad_norm": 0.1726754605770111, + "learning_rate": 0.0002, + "loss": 1.8045, + "step": 624 + }, + { + "epoch": 0.3114876650884625, + "grad_norm": 0.13296914100646973, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 625 + }, + { + "epoch": 0.31198604535260405, + "grad_norm": 0.14966656267642975, + "learning_rate": 0.0002, + "loss": 1.6685, + "step": 626 + }, + { + "epoch": 0.31248442561674555, + "grad_norm": 0.3757789731025696, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 627 + }, + { + "epoch": 0.3129828058808871, + "grad_norm": 0.1234004870057106, + "learning_rate": 0.0002, + "loss": 1.6204, + "step": 628 + }, + { + "epoch": 0.3134811861450287, + "grad_norm": 0.12280552089214325, + "learning_rate": 0.0002, + "loss": 1.6913, + "step": 629 + }, + { + "epoch": 0.3139795664091702, + "grad_norm": 0.12360548228025436, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 630 + }, + { + "epoch": 0.31447794667331175, + "grad_norm": 0.1292014867067337, + "learning_rate": 0.0002, + "loss": 1.6697, + "step": 631 + }, + { + "epoch": 0.31497632693745325, + "grad_norm": 0.11038494855165482, + "learning_rate": 0.0002, + "loss": 1.6103, + "step": 632 + }, + { + "epoch": 0.3154747072015948, + "grad_norm": 0.11607655137777328, + "learning_rate": 0.0002, + "loss": 1.6241, + "step": 633 + }, + { + "epoch": 0.3159730874657364, + "grad_norm": 0.10514742881059647, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 634 + }, + { + "epoch": 0.3164714677298779, + "grad_norm": 0.107606902718544, + "learning_rate": 0.0002, + "loss": 1.6975, + "step": 635 + }, + { + "epoch": 0.31696984799401945, + "grad_norm": 0.20367765426635742, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 636 + }, + { + "epoch": 0.31746822825816096, + "grad_norm": 0.10455407947301865, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 637 + }, + { + "epoch": 0.3179666085223025, + "grad_norm": 0.48424893617630005, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 638 + }, + { + "epoch": 0.3184649887864441, + "grad_norm": 0.16340336203575134, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 639 + }, + { + "epoch": 0.3189633690505856, + "grad_norm": 0.1317445933818817, + "learning_rate": 0.0002, + "loss": 1.6904, + "step": 640 + }, + { + "epoch": 0.31946174931472715, + "grad_norm": 0.12784677743911743, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 641 + }, + { + "epoch": 0.31996012957886866, + "grad_norm": 0.10745134204626083, + "learning_rate": 0.0002, + "loss": 1.6353, + "step": 642 + }, + { + "epoch": 0.3204585098430102, + "grad_norm": 0.1444125920534134, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 643 + }, + { + "epoch": 0.3209568901071518, + "grad_norm": 0.3750239908695221, + "learning_rate": 0.0002, + "loss": 1.6571, + "step": 644 + }, + { + "epoch": 0.3214552703712933, + "grad_norm": 0.11034873872995377, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 645 + }, + { + "epoch": 0.32195365063543485, + "grad_norm": 0.10759663581848145, + "learning_rate": 0.0002, + "loss": 1.628, + "step": 646 + }, + { + "epoch": 0.32245203089957636, + "grad_norm": 0.11017131060361862, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 647 + }, + { + "epoch": 0.3229504111637179, + "grad_norm": 0.1253817230463028, + "learning_rate": 0.0002, + "loss": 1.7226, + "step": 648 + }, + { + "epoch": 0.3234487914278595, + "grad_norm": 0.5153695344924927, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 649 + }, + { + "epoch": 0.323947171692001, + "grad_norm": 0.11948184669017792, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 650 + }, + { + "epoch": 0.32444555195614255, + "grad_norm": 0.11249465495347977, + "learning_rate": 0.0002, + "loss": 1.6282, + "step": 651 + }, + { + "epoch": 0.32494393222028406, + "grad_norm": 0.11555810272693634, + "learning_rate": 0.0002, + "loss": 1.7295, + "step": 652 + }, + { + "epoch": 0.3254423124844256, + "grad_norm": 0.11882718652486801, + "learning_rate": 0.0002, + "loss": 1.6531, + "step": 653 + }, + { + "epoch": 0.32594069274856713, + "grad_norm": 0.10453632473945618, + "learning_rate": 0.0002, + "loss": 1.6342, + "step": 654 + }, + { + "epoch": 0.3264390730127087, + "grad_norm": 0.11219029873609543, + "learning_rate": 0.0002, + "loss": 1.6902, + "step": 655 + }, + { + "epoch": 0.32693745327685025, + "grad_norm": 0.10499835759401321, + "learning_rate": 0.0002, + "loss": 1.5583, + "step": 656 + }, + { + "epoch": 0.32743583354099176, + "grad_norm": 0.10964427143335342, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 657 + }, + { + "epoch": 0.3279342138051333, + "grad_norm": 0.18510489165782928, + "learning_rate": 0.0002, + "loss": 1.6178, + "step": 658 + }, + { + "epoch": 0.32843259406927483, + "grad_norm": 0.11548275500535965, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 659 + }, + { + "epoch": 0.3289309743334164, + "grad_norm": 0.11357063800096512, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 660 + }, + { + "epoch": 0.32942935459755796, + "grad_norm": 0.10668730735778809, + "learning_rate": 0.0002, + "loss": 1.6433, + "step": 661 + }, + { + "epoch": 0.32992773486169946, + "grad_norm": 0.11750250309705734, + "learning_rate": 0.0002, + "loss": 1.6813, + "step": 662 + }, + { + "epoch": 0.330426115125841, + "grad_norm": 0.8277010321617126, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 663 + }, + { + "epoch": 0.33092449538998253, + "grad_norm": 0.165303573012352, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 664 + }, + { + "epoch": 0.3314228756541241, + "grad_norm": 0.12780268490314484, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 665 + }, + { + "epoch": 0.33192125591826566, + "grad_norm": 0.13066166639328003, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 666 + }, + { + "epoch": 0.33241963618240716, + "grad_norm": 0.12650184333324432, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 667 + }, + { + "epoch": 0.3329180164465487, + "grad_norm": 0.12420842051506042, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 668 + }, + { + "epoch": 0.33341639671069023, + "grad_norm": 0.1261165291070938, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 669 + }, + { + "epoch": 0.3339147769748318, + "grad_norm": 0.11121337115764618, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 670 + }, + { + "epoch": 0.33441315723897336, + "grad_norm": 0.10835525393486023, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 671 + }, + { + "epoch": 0.33491153750311486, + "grad_norm": 0.10837749391794205, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 672 + }, + { + "epoch": 0.3354099177672564, + "grad_norm": 0.10254842787981033, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 673 + }, + { + "epoch": 0.33590829803139793, + "grad_norm": 0.5288554430007935, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 674 + }, + { + "epoch": 0.3364066782955395, + "grad_norm": 0.10820039361715317, + "learning_rate": 0.0002, + "loss": 1.6962, + "step": 675 + }, + { + "epoch": 0.33690505855968106, + "grad_norm": 0.11754646897315979, + "learning_rate": 0.0002, + "loss": 1.6059, + "step": 676 + }, + { + "epoch": 0.33740343882382257, + "grad_norm": 0.9506744742393494, + "learning_rate": 0.0002, + "loss": 1.8916, + "step": 677 + }, + { + "epoch": 0.33790181908796413, + "grad_norm": 0.1273750215768814, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 678 + }, + { + "epoch": 0.33840019935210564, + "grad_norm": 0.14315767586231232, + "learning_rate": 0.0002, + "loss": 1.6903, + "step": 679 + }, + { + "epoch": 0.3388985796162472, + "grad_norm": 0.15645241737365723, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 680 + }, + { + "epoch": 0.33939695988038876, + "grad_norm": 0.5159462690353394, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 681 + }, + { + "epoch": 0.33989534014453027, + "grad_norm": 0.13883577287197113, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 682 + }, + { + "epoch": 0.34039372040867183, + "grad_norm": 0.39283788204193115, + "learning_rate": 0.0002, + "loss": 1.6181, + "step": 683 + }, + { + "epoch": 0.34089210067281334, + "grad_norm": 0.20534516870975494, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 684 + }, + { + "epoch": 0.3413904809369549, + "grad_norm": 0.14379210770130157, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 685 + }, + { + "epoch": 0.34188886120109646, + "grad_norm": 0.1505320966243744, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 686 + }, + { + "epoch": 0.34238724146523797, + "grad_norm": 0.1377919316291809, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 687 + }, + { + "epoch": 0.34288562172937953, + "grad_norm": 0.1268286257982254, + "learning_rate": 0.0002, + "loss": 1.6405, + "step": 688 + }, + { + "epoch": 0.34338400199352104, + "grad_norm": 0.11991781741380692, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 689 + }, + { + "epoch": 0.3438823822576626, + "grad_norm": 0.12283925712108612, + "learning_rate": 0.0002, + "loss": 1.7222, + "step": 690 + }, + { + "epoch": 0.34438076252180416, + "grad_norm": 0.11207298189401627, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 691 + }, + { + "epoch": 0.34487914278594567, + "grad_norm": 0.11342150717973709, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 692 + }, + { + "epoch": 0.34537752305008723, + "grad_norm": 0.1479737013578415, + "learning_rate": 0.0002, + "loss": 1.6982, + "step": 693 + }, + { + "epoch": 0.34587590331422874, + "grad_norm": 0.11498729884624481, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 694 + }, + { + "epoch": 0.3463742835783703, + "grad_norm": 0.12394261360168457, + "learning_rate": 0.0002, + "loss": 1.699, + "step": 695 + }, + { + "epoch": 0.34687266384251186, + "grad_norm": 0.12563689053058624, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 696 + }, + { + "epoch": 0.34737104410665337, + "grad_norm": 0.10661863535642624, + "learning_rate": 0.0002, + "loss": 1.6921, + "step": 697 + }, + { + "epoch": 0.34786942437079493, + "grad_norm": 0.10778840631246567, + "learning_rate": 0.0002, + "loss": 1.6719, + "step": 698 + }, + { + "epoch": 0.34836780463493644, + "grad_norm": 0.10504487156867981, + "learning_rate": 0.0002, + "loss": 1.6616, + "step": 699 + }, + { + "epoch": 0.348866184899078, + "grad_norm": 0.10722413659095764, + "learning_rate": 0.0002, + "loss": 1.6452, + "step": 700 + }, + { + "epoch": 0.3493645651632195, + "grad_norm": 0.10450419783592224, + "learning_rate": 0.0002, + "loss": 1.6342, + "step": 701 + }, + { + "epoch": 0.34986294542736107, + "grad_norm": 0.10961712151765823, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 702 + }, + { + "epoch": 0.35036132569150263, + "grad_norm": 0.10789170861244202, + "learning_rate": 0.0002, + "loss": 1.6662, + "step": 703 + }, + { + "epoch": 0.35085970595564414, + "grad_norm": 0.10823702067136765, + "learning_rate": 0.0002, + "loss": 1.6733, + "step": 704 + }, + { + "epoch": 0.3513580862197857, + "grad_norm": 0.11080746352672577, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 705 + }, + { + "epoch": 0.3518564664839272, + "grad_norm": 0.10004162788391113, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 706 + }, + { + "epoch": 0.3523548467480688, + "grad_norm": 0.10398257523775101, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 707 + }, + { + "epoch": 0.35285322701221034, + "grad_norm": 0.10170764476060867, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 708 + }, + { + "epoch": 0.35335160727635184, + "grad_norm": 0.8194452524185181, + "learning_rate": 0.0002, + "loss": 1.8272, + "step": 709 + }, + { + "epoch": 0.3538499875404934, + "grad_norm": 0.15103065967559814, + "learning_rate": 0.0002, + "loss": 1.6954, + "step": 710 + }, + { + "epoch": 0.3543483678046349, + "grad_norm": 0.12205032259225845, + "learning_rate": 0.0002, + "loss": 1.6823, + "step": 711 + }, + { + "epoch": 0.3548467480687765, + "grad_norm": 0.1272657811641693, + "learning_rate": 0.0002, + "loss": 1.5557, + "step": 712 + }, + { + "epoch": 0.35534512833291804, + "grad_norm": 0.503338634967804, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 713 + }, + { + "epoch": 0.35584350859705954, + "grad_norm": 0.11442038416862488, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 714 + }, + { + "epoch": 0.3563418888612011, + "grad_norm": 0.1573084145784378, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 715 + }, + { + "epoch": 0.3568402691253426, + "grad_norm": 0.11450973153114319, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 716 + }, + { + "epoch": 0.3573386493894842, + "grad_norm": 0.1249619573354721, + "learning_rate": 0.0002, + "loss": 1.5954, + "step": 717 + }, + { + "epoch": 0.35783702965362574, + "grad_norm": 0.11494952440261841, + "learning_rate": 0.0002, + "loss": 1.6432, + "step": 718 + }, + { + "epoch": 0.35833540991776724, + "grad_norm": 0.13213759660720825, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 719 + }, + { + "epoch": 0.3588337901819088, + "grad_norm": 1.1261271238327026, + "learning_rate": 0.0002, + "loss": 1.818, + "step": 720 + }, + { + "epoch": 0.3593321704460503, + "grad_norm": 1.338255524635315, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 721 + }, + { + "epoch": 0.3598305507101919, + "grad_norm": 0.21815264225006104, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 722 + }, + { + "epoch": 0.36032893097433344, + "grad_norm": 0.5178132653236389, + "learning_rate": 0.0002, + "loss": 1.7097, + "step": 723 + }, + { + "epoch": 0.36082731123847495, + "grad_norm": 0.241803839802742, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 724 + }, + { + "epoch": 0.3613256915026165, + "grad_norm": 0.20727293193340302, + "learning_rate": 0.0002, + "loss": 1.7278, + "step": 725 + }, + { + "epoch": 0.361824071766758, + "grad_norm": 0.16459515690803528, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 726 + }, + { + "epoch": 0.3623224520308996, + "grad_norm": 0.16415144503116608, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 727 + }, + { + "epoch": 0.36282083229504114, + "grad_norm": 0.16096027195453644, + "learning_rate": 0.0002, + "loss": 1.665, + "step": 728 + }, + { + "epoch": 0.36331921255918265, + "grad_norm": 0.17240643501281738, + "learning_rate": 0.0002, + "loss": 1.6761, + "step": 729 + }, + { + "epoch": 0.3638175928233242, + "grad_norm": 0.19763271510601044, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 730 + }, + { + "epoch": 0.3643159730874657, + "grad_norm": 0.15238463878631592, + "learning_rate": 0.0002, + "loss": 1.6884, + "step": 731 + }, + { + "epoch": 0.3648143533516073, + "grad_norm": 0.27482038736343384, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 732 + }, + { + "epoch": 0.36531273361574884, + "grad_norm": 0.5192012786865234, + "learning_rate": 0.0002, + "loss": 1.8117, + "step": 733 + }, + { + "epoch": 0.36581111387989035, + "grad_norm": 0.1510191708803177, + "learning_rate": 0.0002, + "loss": 1.667, + "step": 734 + }, + { + "epoch": 0.3663094941440319, + "grad_norm": 0.14513470232486725, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 735 + }, + { + "epoch": 0.3668078744081734, + "grad_norm": 0.7901990413665771, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 736 + }, + { + "epoch": 0.367306254672315, + "grad_norm": 0.17642100155353546, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 737 + }, + { + "epoch": 0.36780463493645654, + "grad_norm": 0.14719779789447784, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 738 + }, + { + "epoch": 0.36830301520059805, + "grad_norm": 0.16173601150512695, + "learning_rate": 0.0002, + "loss": 1.6937, + "step": 739 + }, + { + "epoch": 0.3688013954647396, + "grad_norm": 0.32359546422958374, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 740 + }, + { + "epoch": 0.3692997757288811, + "grad_norm": 0.14779435098171234, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 741 + }, + { + "epoch": 0.3697981559930227, + "grad_norm": 0.19540923833847046, + "learning_rate": 0.0002, + "loss": 1.5529, + "step": 742 + }, + { + "epoch": 0.37029653625716424, + "grad_norm": 0.13870155811309814, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 743 + }, + { + "epoch": 0.37079491652130575, + "grad_norm": 0.13447612524032593, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 744 + }, + { + "epoch": 0.3712932967854473, + "grad_norm": 0.13197576999664307, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 745 + }, + { + "epoch": 0.3717916770495888, + "grad_norm": 0.13072870671749115, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 746 + }, + { + "epoch": 0.3722900573137304, + "grad_norm": 0.13418208062648773, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 747 + }, + { + "epoch": 0.3727884375778719, + "grad_norm": 0.11689562350511551, + "learning_rate": 0.0002, + "loss": 1.6863, + "step": 748 + }, + { + "epoch": 0.37328681784201345, + "grad_norm": 0.1243453249335289, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 749 + }, + { + "epoch": 0.373785198106155, + "grad_norm": 0.11520450562238693, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 750 + }, + { + "epoch": 0.3742835783702965, + "grad_norm": 0.13939018547534943, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 751 + }, + { + "epoch": 0.3747819586344381, + "grad_norm": 0.11021385341882706, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 752 + }, + { + "epoch": 0.3752803388985796, + "grad_norm": 0.11470180004835129, + "learning_rate": 0.0002, + "loss": 1.6402, + "step": 753 + }, + { + "epoch": 0.37577871916272115, + "grad_norm": 0.12256886065006256, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 754 + }, + { + "epoch": 0.3762770994268627, + "grad_norm": 0.11696486920118332, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 755 + }, + { + "epoch": 0.3767754796910042, + "grad_norm": 0.11340934783220291, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 756 + }, + { + "epoch": 0.3772738599551458, + "grad_norm": 0.10606078803539276, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 757 + }, + { + "epoch": 0.3777722402192873, + "grad_norm": 0.12084966152906418, + "learning_rate": 0.0002, + "loss": 1.6273, + "step": 758 + }, + { + "epoch": 0.37827062048342885, + "grad_norm": 0.1084008663892746, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 759 + }, + { + "epoch": 0.3787690007475704, + "grad_norm": 0.11194922029972076, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 760 + }, + { + "epoch": 0.3792673810117119, + "grad_norm": 0.48235663771629333, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 761 + }, + { + "epoch": 0.3797657612758535, + "grad_norm": 0.586637556552887, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 762 + }, + { + "epoch": 0.380264141539995, + "grad_norm": 0.14328181743621826, + "learning_rate": 0.0002, + "loss": 1.7112, + "step": 763 + }, + { + "epoch": 0.38076252180413656, + "grad_norm": 0.13296020030975342, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 764 + }, + { + "epoch": 0.3812609020682781, + "grad_norm": 0.44004350900650024, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 765 + }, + { + "epoch": 0.3817592823324196, + "grad_norm": 0.12628889083862305, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 766 + }, + { + "epoch": 0.3822576625965612, + "grad_norm": 0.1330346316099167, + "learning_rate": 0.0002, + "loss": 1.6461, + "step": 767 + }, + { + "epoch": 0.3827560428607027, + "grad_norm": 0.11893340200185776, + "learning_rate": 0.0002, + "loss": 1.6299, + "step": 768 + }, + { + "epoch": 0.38325442312484426, + "grad_norm": 0.15412816405296326, + "learning_rate": 0.0002, + "loss": 1.7436, + "step": 769 + }, + { + "epoch": 0.3837528033889858, + "grad_norm": 0.12351204454898834, + "learning_rate": 0.0002, + "loss": 1.6844, + "step": 770 + }, + { + "epoch": 0.3842511836531273, + "grad_norm": 0.11671744287014008, + "learning_rate": 0.0002, + "loss": 1.6748, + "step": 771 + }, + { + "epoch": 0.3847495639172689, + "grad_norm": 0.12512736022472382, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 772 + }, + { + "epoch": 0.3852479441814104, + "grad_norm": 0.12629447877407074, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 773 + }, + { + "epoch": 0.38574632444555196, + "grad_norm": 0.11553051322698593, + "learning_rate": 0.0002, + "loss": 1.6639, + "step": 774 + }, + { + "epoch": 0.3862447047096935, + "grad_norm": 0.12756189703941345, + "learning_rate": 0.0002, + "loss": 1.6397, + "step": 775 + }, + { + "epoch": 0.386743084973835, + "grad_norm": 0.11309953778982162, + "learning_rate": 0.0002, + "loss": 1.6098, + "step": 776 + }, + { + "epoch": 0.3872414652379766, + "grad_norm": 0.164617121219635, + "learning_rate": 0.0002, + "loss": 1.54, + "step": 777 + }, + { + "epoch": 0.3877398455021181, + "grad_norm": 0.45813101530075073, + "learning_rate": 0.0002, + "loss": 1.7208, + "step": 778 + }, + { + "epoch": 0.38823822576625966, + "grad_norm": 0.7587694525718689, + "learning_rate": 0.0002, + "loss": 1.6195, + "step": 779 + }, + { + "epoch": 0.3887366060304012, + "grad_norm": 0.12699078023433685, + "learning_rate": 0.0002, + "loss": 1.6596, + "step": 780 + }, + { + "epoch": 0.38923498629454273, + "grad_norm": 0.139120951294899, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 781 + }, + { + "epoch": 0.3897333665586843, + "grad_norm": 0.13968676328659058, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 782 + }, + { + "epoch": 0.3902317468228258, + "grad_norm": 0.28061848878860474, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 783 + }, + { + "epoch": 0.39073012708696736, + "grad_norm": 0.11748450994491577, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 784 + }, + { + "epoch": 0.3912285073511089, + "grad_norm": 0.7288643717765808, + "learning_rate": 0.0002, + "loss": 1.769, + "step": 785 + }, + { + "epoch": 0.39172688761525043, + "grad_norm": 0.12540021538734436, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 786 + }, + { + "epoch": 0.392225267879392, + "grad_norm": 0.13594292104244232, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 787 + }, + { + "epoch": 0.3927236481435335, + "grad_norm": 0.12894773483276367, + "learning_rate": 0.0002, + "loss": 1.5733, + "step": 788 + }, + { + "epoch": 0.39322202840767506, + "grad_norm": 0.6577300429344177, + "learning_rate": 0.0002, + "loss": 1.8085, + "step": 789 + }, + { + "epoch": 0.39372040867181657, + "grad_norm": 0.12034627795219421, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 790 + }, + { + "epoch": 0.39421878893595813, + "grad_norm": 0.1254388988018036, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 791 + }, + { + "epoch": 0.3947171692000997, + "grad_norm": 0.136959508061409, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 792 + }, + { + "epoch": 0.3952155494642412, + "grad_norm": 0.37221673130989075, + "learning_rate": 0.0002, + "loss": 1.826, + "step": 793 + }, + { + "epoch": 0.39571392972838276, + "grad_norm": 0.14947831630706787, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 794 + }, + { + "epoch": 0.39621230999252427, + "grad_norm": 0.1409454494714737, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 795 + }, + { + "epoch": 0.39671069025666583, + "grad_norm": 0.1448691040277481, + "learning_rate": 0.0002, + "loss": 1.7872, + "step": 796 + }, + { + "epoch": 0.3972090705208074, + "grad_norm": 0.12816311419010162, + "learning_rate": 0.0002, + "loss": 1.6976, + "step": 797 + }, + { + "epoch": 0.3977074507849489, + "grad_norm": 0.12581898272037506, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 798 + }, + { + "epoch": 0.39820583104909046, + "grad_norm": 0.1256158947944641, + "learning_rate": 0.0002, + "loss": 1.6778, + "step": 799 + }, + { + "epoch": 0.39870421131323197, + "grad_norm": 0.12009266763925552, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 800 + }, + { + "epoch": 0.39920259157737353, + "grad_norm": 0.14727051556110382, + "learning_rate": 0.0002, + "loss": 1.7165, + "step": 801 + }, + { + "epoch": 0.3997009718415151, + "grad_norm": 1.98500394821167, + "learning_rate": 0.0002, + "loss": 1.9632, + "step": 802 + }, + { + "epoch": 0.4001993521056566, + "grad_norm": 0.12300129979848862, + "learning_rate": 0.0002, + "loss": 1.6003, + "step": 803 + }, + { + "epoch": 0.40069773236979817, + "grad_norm": 0.13758836686611176, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 804 + }, + { + "epoch": 0.40119611263393967, + "grad_norm": 0.13127754628658295, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 805 + }, + { + "epoch": 0.40169449289808123, + "grad_norm": 0.13612794876098633, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 806 + }, + { + "epoch": 0.4021928731622228, + "grad_norm": 0.3637385964393616, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 807 + }, + { + "epoch": 0.4026912534263643, + "grad_norm": 0.19778436422348022, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 808 + }, + { + "epoch": 0.40318963369050587, + "grad_norm": 0.1478605717420578, + "learning_rate": 0.0002, + "loss": 1.7642, + "step": 809 + }, + { + "epoch": 0.4036880139546474, + "grad_norm": 0.3014202415943146, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 810 + }, + { + "epoch": 0.40418639421878894, + "grad_norm": 0.13049842417240143, + "learning_rate": 0.0002, + "loss": 1.6579, + "step": 811 + }, + { + "epoch": 0.4046847744829305, + "grad_norm": 0.932788610458374, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 812 + }, + { + "epoch": 0.405183154747072, + "grad_norm": 0.1687835305929184, + "learning_rate": 0.0002, + "loss": 1.6492, + "step": 813 + }, + { + "epoch": 0.40568153501121357, + "grad_norm": 0.2024388164281845, + "learning_rate": 0.0002, + "loss": 1.5523, + "step": 814 + }, + { + "epoch": 0.4061799152753551, + "grad_norm": 0.20838886499404907, + "learning_rate": 0.0002, + "loss": 1.6884, + "step": 815 + }, + { + "epoch": 0.40667829553949664, + "grad_norm": 0.1490757167339325, + "learning_rate": 0.0002, + "loss": 1.6936, + "step": 816 + }, + { + "epoch": 0.4071766758036382, + "grad_norm": 1.1997255086898804, + "learning_rate": 0.0002, + "loss": 1.873, + "step": 817 + }, + { + "epoch": 0.4076750560677797, + "grad_norm": 0.139000803232193, + "learning_rate": 0.0002, + "loss": 1.7303, + "step": 818 + }, + { + "epoch": 0.40817343633192127, + "grad_norm": 0.14747615158557892, + "learning_rate": 0.0002, + "loss": 1.6558, + "step": 819 + }, + { + "epoch": 0.4086718165960628, + "grad_norm": 0.15866988897323608, + "learning_rate": 0.0002, + "loss": 1.6991, + "step": 820 + }, + { + "epoch": 0.40917019686020434, + "grad_norm": 0.14660963416099548, + "learning_rate": 0.0002, + "loss": 1.7233, + "step": 821 + }, + { + "epoch": 0.4096685771243459, + "grad_norm": 0.14071424305438995, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 822 + }, + { + "epoch": 0.4101669573884874, + "grad_norm": 0.1368856132030487, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 823 + }, + { + "epoch": 0.41066533765262897, + "grad_norm": 0.14662376046180725, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 824 + }, + { + "epoch": 0.4111637179167705, + "grad_norm": 0.14027300477027893, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 825 + }, + { + "epoch": 0.41166209818091204, + "grad_norm": 0.5542290210723877, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 826 + }, + { + "epoch": 0.4121604784450536, + "grad_norm": 0.15360352396965027, + "learning_rate": 0.0002, + "loss": 1.7313, + "step": 827 + }, + { + "epoch": 0.4126588587091951, + "grad_norm": 0.14451801776885986, + "learning_rate": 0.0002, + "loss": 1.6481, + "step": 828 + }, + { + "epoch": 0.41315723897333667, + "grad_norm": 0.1393883228302002, + "learning_rate": 0.0002, + "loss": 1.5922, + "step": 829 + }, + { + "epoch": 0.4136556192374782, + "grad_norm": 0.13610626757144928, + "learning_rate": 0.0002, + "loss": 1.6347, + "step": 830 + }, + { + "epoch": 0.41415399950161974, + "grad_norm": 0.12424327433109283, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 831 + }, + { + "epoch": 0.4146523797657613, + "grad_norm": 0.127548947930336, + "learning_rate": 0.0002, + "loss": 1.6609, + "step": 832 + }, + { + "epoch": 0.4151507600299028, + "grad_norm": 0.1881740391254425, + "learning_rate": 0.0002, + "loss": 1.7251, + "step": 833 + }, + { + "epoch": 0.4156491402940444, + "grad_norm": 0.12144262343645096, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 834 + }, + { + "epoch": 0.4161475205581859, + "grad_norm": 0.11799559742212296, + "learning_rate": 0.0002, + "loss": 1.672, + "step": 835 + }, + { + "epoch": 0.41664590082232744, + "grad_norm": 0.12129071354866028, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 836 + }, + { + "epoch": 0.41714428108646895, + "grad_norm": 0.11648084223270416, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 837 + }, + { + "epoch": 0.4176426613506105, + "grad_norm": 0.11401843279600143, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 838 + }, + { + "epoch": 0.4181410416147521, + "grad_norm": 0.11244560778141022, + "learning_rate": 0.0002, + "loss": 1.6338, + "step": 839 + }, + { + "epoch": 0.4186394218788936, + "grad_norm": 0.11274567991495132, + "learning_rate": 0.0002, + "loss": 1.5518, + "step": 840 + }, + { + "epoch": 0.41913780214303514, + "grad_norm": 0.11203539371490479, + "learning_rate": 0.0002, + "loss": 1.6372, + "step": 841 + }, + { + "epoch": 0.41963618240717665, + "grad_norm": 0.11548861116170883, + "learning_rate": 0.0002, + "loss": 1.5787, + "step": 842 + }, + { + "epoch": 0.4201345626713182, + "grad_norm": 0.10921257734298706, + "learning_rate": 0.0002, + "loss": 1.6457, + "step": 843 + }, + { + "epoch": 0.4206329429354598, + "grad_norm": 0.10832211375236511, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 844 + }, + { + "epoch": 0.4211313231996013, + "grad_norm": 0.11785157024860382, + "learning_rate": 0.0002, + "loss": 1.6687, + "step": 845 + }, + { + "epoch": 0.42162970346374284, + "grad_norm": 0.1575067639350891, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 846 + }, + { + "epoch": 0.42212808372788435, + "grad_norm": 0.5687432885169983, + "learning_rate": 0.0002, + "loss": 1.8016, + "step": 847 + }, + { + "epoch": 0.4226264639920259, + "grad_norm": 0.887058675289154, + "learning_rate": 0.0002, + "loss": 1.7988, + "step": 848 + }, + { + "epoch": 0.4231248442561675, + "grad_norm": 0.12778295576572418, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 849 + }, + { + "epoch": 0.423623224520309, + "grad_norm": 0.13481804728507996, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 850 + }, + { + "epoch": 0.42412160478445055, + "grad_norm": 0.1478685438632965, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 851 + }, + { + "epoch": 0.42461998504859205, + "grad_norm": 0.13414372503757477, + "learning_rate": 0.0002, + "loss": 1.657, + "step": 852 + }, + { + "epoch": 0.4251183653127336, + "grad_norm": 0.13211821019649506, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 853 + }, + { + "epoch": 0.4256167455768752, + "grad_norm": 0.13594435155391693, + "learning_rate": 0.0002, + "loss": 1.6363, + "step": 854 + }, + { + "epoch": 0.4261151258410167, + "grad_norm": 0.13266883790493011, + "learning_rate": 0.0002, + "loss": 1.6632, + "step": 855 + }, + { + "epoch": 0.42661350610515825, + "grad_norm": 0.12024448066949844, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 856 + }, + { + "epoch": 0.42711188636929975, + "grad_norm": 0.12828536331653595, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 857 + }, + { + "epoch": 0.4276102666334413, + "grad_norm": 0.12315808236598969, + "learning_rate": 0.0002, + "loss": 1.6803, + "step": 858 + }, + { + "epoch": 0.4281086468975829, + "grad_norm": 0.13026510179042816, + "learning_rate": 0.0002, + "loss": 1.6536, + "step": 859 + }, + { + "epoch": 0.4286070271617244, + "grad_norm": 0.45274946093559265, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 860 + }, + { + "epoch": 0.42910540742586595, + "grad_norm": 0.12899275124073029, + "learning_rate": 0.0002, + "loss": 1.6603, + "step": 861 + }, + { + "epoch": 0.42960378769000745, + "grad_norm": 0.12414630502462387, + "learning_rate": 0.0002, + "loss": 1.6933, + "step": 862 + }, + { + "epoch": 0.430102167954149, + "grad_norm": 0.146366149187088, + "learning_rate": 0.0002, + "loss": 1.6799, + "step": 863 + }, + { + "epoch": 0.4306005482182906, + "grad_norm": 0.11743781715631485, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 864 + }, + { + "epoch": 0.4310989284824321, + "grad_norm": 0.15248535573482513, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 865 + }, + { + "epoch": 0.43159730874657365, + "grad_norm": 0.11914569139480591, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 866 + }, + { + "epoch": 0.43209568901071516, + "grad_norm": 0.11982624977827072, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 867 + }, + { + "epoch": 0.4325940692748567, + "grad_norm": 0.12126267701387405, + "learning_rate": 0.0002, + "loss": 1.7153, + "step": 868 + }, + { + "epoch": 0.4330924495389983, + "grad_norm": 0.3660570979118347, + "learning_rate": 0.0002, + "loss": 1.6142, + "step": 869 + }, + { + "epoch": 0.4335908298031398, + "grad_norm": 0.11174522340297699, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 870 + }, + { + "epoch": 0.43408921006728135, + "grad_norm": 0.12089698761701584, + "learning_rate": 0.0002, + "loss": 1.7026, + "step": 871 + }, + { + "epoch": 0.43458759033142286, + "grad_norm": 0.11779413372278214, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 872 + }, + { + "epoch": 0.4350859705955644, + "grad_norm": 0.11461353302001953, + "learning_rate": 0.0002, + "loss": 1.6943, + "step": 873 + }, + { + "epoch": 0.435584350859706, + "grad_norm": 0.1294202357530594, + "learning_rate": 0.0002, + "loss": 1.7078, + "step": 874 + }, + { + "epoch": 0.4360827311238475, + "grad_norm": 0.1081145629286766, + "learning_rate": 0.0002, + "loss": 1.6078, + "step": 875 + }, + { + "epoch": 0.43658111138798905, + "grad_norm": 0.11721238493919373, + "learning_rate": 0.0002, + "loss": 1.6056, + "step": 876 + }, + { + "epoch": 0.43707949165213056, + "grad_norm": 0.11436528712511063, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 877 + }, + { + "epoch": 0.4375778719162721, + "grad_norm": 0.11401306092739105, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 878 + }, + { + "epoch": 0.4380762521804137, + "grad_norm": 0.11282623559236526, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 879 + }, + { + "epoch": 0.4385746324445552, + "grad_norm": 0.11592991650104523, + "learning_rate": 0.0002, + "loss": 1.5984, + "step": 880 + }, + { + "epoch": 0.43907301270869675, + "grad_norm": 0.10579363256692886, + "learning_rate": 0.0002, + "loss": 1.6349, + "step": 881 + }, + { + "epoch": 0.43957139297283826, + "grad_norm": 0.1032218486070633, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 882 + }, + { + "epoch": 0.4400697732369798, + "grad_norm": 0.10277747362852097, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 883 + }, + { + "epoch": 0.44056815350112133, + "grad_norm": 0.12377838790416718, + "learning_rate": 0.0002, + "loss": 1.6298, + "step": 884 + }, + { + "epoch": 0.4410665337652629, + "grad_norm": 0.10326054692268372, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 885 + }, + { + "epoch": 0.44156491402940445, + "grad_norm": 0.10518341511487961, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 886 + }, + { + "epoch": 0.44206329429354596, + "grad_norm": 0.10297736525535583, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 887 + }, + { + "epoch": 0.4425616745576875, + "grad_norm": 0.10891593992710114, + "learning_rate": 0.0002, + "loss": 1.6928, + "step": 888 + }, + { + "epoch": 0.44306005482182903, + "grad_norm": 0.10570312291383743, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 889 + }, + { + "epoch": 0.4435584350859706, + "grad_norm": 0.10274644941091537, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 890 + }, + { + "epoch": 0.44405681535011216, + "grad_norm": 0.11095419526100159, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 891 + }, + { + "epoch": 0.44455519561425366, + "grad_norm": 0.14802560210227966, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 892 + }, + { + "epoch": 0.4450535758783952, + "grad_norm": 0.10468854010105133, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 893 + }, + { + "epoch": 0.44555195614253673, + "grad_norm": 0.10267975926399231, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 894 + }, + { + "epoch": 0.4460503364066783, + "grad_norm": 0.10226966440677643, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 895 + }, + { + "epoch": 0.44654871667081986, + "grad_norm": 0.1046745628118515, + "learning_rate": 0.0002, + "loss": 1.6244, + "step": 896 + }, + { + "epoch": 0.44704709693496136, + "grad_norm": 0.5514235496520996, + "learning_rate": 0.0002, + "loss": 1.6949, + "step": 897 + }, + { + "epoch": 0.4475454771991029, + "grad_norm": 0.10770034044981003, + "learning_rate": 0.0002, + "loss": 1.6388, + "step": 898 + }, + { + "epoch": 0.44804385746324443, + "grad_norm": 0.1274634599685669, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 899 + }, + { + "epoch": 0.448542237727386, + "grad_norm": 0.11944198608398438, + "learning_rate": 0.0002, + "loss": 1.635, + "step": 900 + } + ], + "logging_steps": 1, + "max_steps": 4012, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.024817808581591e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}