diff --git "a/checkpoint-1200/trainer_state.json" "b/checkpoint-1200/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1200/trainer_state.json" @@ -0,0 +1,8433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2096774193548387, + "eval_steps": 500, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0010080645161290322, + "grad_norm": 0.9473515748977661, + "learning_rate": 4.000000000000001e-06, + "loss": 1.9769, + "step": 1 + }, + { + "epoch": 0.0020161290322580645, + "grad_norm": 0.9036028981208801, + "learning_rate": 8.000000000000001e-06, + "loss": 1.9331, + "step": 2 + }, + { + "epoch": 0.0030241935483870967, + "grad_norm": 0.9499556422233582, + "learning_rate": 1.2e-05, + "loss": 1.9852, + "step": 3 + }, + { + "epoch": 0.004032258064516129, + "grad_norm": 0.903069019317627, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.9668, + "step": 4 + }, + { + "epoch": 0.005040322580645161, + "grad_norm": 0.5635794997215271, + "learning_rate": 2e-05, + "loss": 1.9327, + "step": 5 + }, + { + "epoch": 0.006048387096774193, + "grad_norm": 0.9521661996841431, + "learning_rate": 2.4e-05, + "loss": 2.0026, + "step": 6 + }, + { + "epoch": 0.007056451612903226, + "grad_norm": 0.4393383860588074, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.8885, + "step": 7 + }, + { + "epoch": 0.008064516129032258, + "grad_norm": 0.36857879161834717, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.8537, + "step": 8 + }, + { + "epoch": 0.009072580645161291, + "grad_norm": 0.3844268321990967, + "learning_rate": 3.6e-05, + "loss": 1.8874, + "step": 9 + }, + { + "epoch": 0.010080645161290322, + "grad_norm": 0.41415101289749146, + "learning_rate": 4e-05, + "loss": 1.9386, + "step": 10 + }, + { + "epoch": 0.011088709677419355, + "grad_norm": 0.3869949281215668, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.9359, + "step": 11 + }, + { + "epoch": 0.012096774193548387, + "grad_norm": 0.3345952033996582, + "learning_rate": 4.8e-05, + "loss": 1.903, + "step": 12 + }, + { + "epoch": 0.01310483870967742, + "grad_norm": 0.3590312600135803, + "learning_rate": 5.2000000000000004e-05, + "loss": 1.9024, + "step": 13 + }, + { + "epoch": 0.014112903225806451, + "grad_norm": 0.2288215309381485, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.8431, + "step": 14 + }, + { + "epoch": 0.015120967741935484, + "grad_norm": 0.20984530448913574, + "learning_rate": 6e-05, + "loss": 1.8522, + "step": 15 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 0.2080329954624176, + "learning_rate": 6.400000000000001e-05, + "loss": 1.9895, + "step": 16 + }, + { + "epoch": 0.017137096774193547, + "grad_norm": 0.20060451328754425, + "learning_rate": 6.800000000000001e-05, + "loss": 1.8289, + "step": 17 + }, + { + "epoch": 0.018145161290322582, + "grad_norm": 0.16062042117118835, + "learning_rate": 7.2e-05, + "loss": 1.8823, + "step": 18 + }, + { + "epoch": 0.019153225806451613, + "grad_norm": 0.15423905849456787, + "learning_rate": 7.6e-05, + "loss": 1.7997, + "step": 19 + }, + { + "epoch": 0.020161290322580645, + "grad_norm": 0.15496863424777985, + "learning_rate": 8e-05, + "loss": 1.8237, + "step": 20 + }, + { + "epoch": 0.021169354838709676, + "grad_norm": 0.16305851936340332, + "learning_rate": 8.4e-05, + "loss": 1.7973, + "step": 21 + }, + { + "epoch": 0.02217741935483871, + "grad_norm": 0.1680663675069809, + "learning_rate": 8.800000000000001e-05, + "loss": 1.82, + "step": 22 + }, + { + "epoch": 0.023185483870967742, + "grad_norm": 0.16471807658672333, + "learning_rate": 9.200000000000001e-05, + "loss": 1.8314, + "step": 23 + }, + { + "epoch": 0.024193548387096774, + "grad_norm": 0.13601982593536377, + "learning_rate": 9.6e-05, + "loss": 1.8488, + "step": 24 + }, + { + "epoch": 0.025201612903225805, + "grad_norm": 0.12553684413433075, + "learning_rate": 0.0001, + "loss": 1.839, + "step": 25 + }, + { + "epoch": 0.02620967741935484, + "grad_norm": 0.12679991126060486, + "learning_rate": 0.00010400000000000001, + "loss": 1.8615, + "step": 26 + }, + { + "epoch": 0.02721774193548387, + "grad_norm": 0.1284348964691162, + "learning_rate": 0.00010800000000000001, + "loss": 1.8215, + "step": 27 + }, + { + "epoch": 0.028225806451612902, + "grad_norm": 0.11629381030797958, + "learning_rate": 0.00011200000000000001, + "loss": 1.8536, + "step": 28 + }, + { + "epoch": 0.029233870967741934, + "grad_norm": 0.10016848891973495, + "learning_rate": 0.000116, + "loss": 1.8095, + "step": 29 + }, + { + "epoch": 0.03024193548387097, + "grad_norm": 0.10154619067907333, + "learning_rate": 0.00012, + "loss": 1.8355, + "step": 30 + }, + { + "epoch": 0.03125, + "grad_norm": 0.11825895309448242, + "learning_rate": 0.000124, + "loss": 1.7984, + "step": 31 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 0.104405976831913, + "learning_rate": 0.00012800000000000002, + "loss": 1.7673, + "step": 32 + }, + { + "epoch": 0.03326612903225806, + "grad_norm": 0.09943860024213791, + "learning_rate": 0.000132, + "loss": 1.813, + "step": 33 + }, + { + "epoch": 0.034274193548387094, + "grad_norm": 0.10970743000507355, + "learning_rate": 0.00013600000000000003, + "loss": 1.9213, + "step": 34 + }, + { + "epoch": 0.03528225806451613, + "grad_norm": 0.1049584224820137, + "learning_rate": 0.00014, + "loss": 1.7818, + "step": 35 + }, + { + "epoch": 0.036290322580645164, + "grad_norm": 0.08986247330904007, + "learning_rate": 0.000144, + "loss": 1.7944, + "step": 36 + }, + { + "epoch": 0.037298387096774195, + "grad_norm": 0.09243710339069366, + "learning_rate": 0.000148, + "loss": 1.7158, + "step": 37 + }, + { + "epoch": 0.038306451612903226, + "grad_norm": 0.10768643021583557, + "learning_rate": 0.000152, + "loss": 1.8295, + "step": 38 + }, + { + "epoch": 0.03931451612903226, + "grad_norm": 0.07883578538894653, + "learning_rate": 0.00015600000000000002, + "loss": 1.757, + "step": 39 + }, + { + "epoch": 0.04032258064516129, + "grad_norm": 0.10219922661781311, + "learning_rate": 0.00016, + "loss": 1.7423, + "step": 40 + }, + { + "epoch": 0.04133064516129032, + "grad_norm": 0.08045803755521774, + "learning_rate": 0.000164, + "loss": 1.7649, + "step": 41 + }, + { + "epoch": 0.04233870967741935, + "grad_norm": 0.07191110402345657, + "learning_rate": 0.000168, + "loss": 1.7441, + "step": 42 + }, + { + "epoch": 0.04334677419354839, + "grad_norm": 0.08571028709411621, + "learning_rate": 0.000172, + "loss": 1.8094, + "step": 43 + }, + { + "epoch": 0.04435483870967742, + "grad_norm": 0.08775891363620758, + "learning_rate": 0.00017600000000000002, + "loss": 1.817, + "step": 44 + }, + { + "epoch": 0.04536290322580645, + "grad_norm": 0.08328275382518768, + "learning_rate": 0.00018, + "loss": 1.7753, + "step": 45 + }, + { + "epoch": 0.046370967741935484, + "grad_norm": 0.08221882581710815, + "learning_rate": 0.00018400000000000003, + "loss": 1.7824, + "step": 46 + }, + { + "epoch": 0.047379032258064516, + "grad_norm": 0.0885847732424736, + "learning_rate": 0.000188, + "loss": 1.7423, + "step": 47 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 0.08126149326562881, + "learning_rate": 0.000192, + "loss": 1.7495, + "step": 48 + }, + { + "epoch": 0.04939516129032258, + "grad_norm": 0.08296285569667816, + "learning_rate": 0.000196, + "loss": 1.6909, + "step": 49 + }, + { + "epoch": 0.05040322580645161, + "grad_norm": 0.09005258232355118, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 50 + }, + { + "epoch": 0.05141129032258065, + "grad_norm": 0.08956532180309296, + "learning_rate": 0.00019999986806600454, + "loss": 1.6662, + "step": 51 + }, + { + "epoch": 0.05241935483870968, + "grad_norm": 0.08471240848302841, + "learning_rate": 0.00019999947226436628, + "loss": 1.8274, + "step": 52 + }, + { + "epoch": 0.05342741935483871, + "grad_norm": 0.09117641299962997, + "learning_rate": 0.00019999881259612963, + "loss": 1.7027, + "step": 53 + }, + { + "epoch": 0.05443548387096774, + "grad_norm": 0.08552085608243942, + "learning_rate": 0.00019999788906303518, + "loss": 1.7738, + "step": 54 + }, + { + "epoch": 0.055443548387096774, + "grad_norm": 0.07708004862070084, + "learning_rate": 0.00019999670166751993, + "loss": 1.7821, + "step": 55 + }, + { + "epoch": 0.056451612903225805, + "grad_norm": 0.07826384156942368, + "learning_rate": 0.000199995250412717, + "loss": 1.7579, + "step": 56 + }, + { + "epoch": 0.057459677419354836, + "grad_norm": 0.0721641331911087, + "learning_rate": 0.00019999353530245572, + "loss": 1.7372, + "step": 57 + }, + { + "epoch": 0.05846774193548387, + "grad_norm": 0.07667742669582367, + "learning_rate": 0.0001999915563412618, + "loss": 1.7323, + "step": 58 + }, + { + "epoch": 0.059475806451612906, + "grad_norm": 0.10455285757780075, + "learning_rate": 0.00019998931353435709, + "loss": 1.8221, + "step": 59 + }, + { + "epoch": 0.06048387096774194, + "grad_norm": 0.07621350884437561, + "learning_rate": 0.00019998680688765959, + "loss": 1.7305, + "step": 60 + }, + { + "epoch": 0.06149193548387097, + "grad_norm": 0.08454013615846634, + "learning_rate": 0.00019998403640778358, + "loss": 1.7558, + "step": 61 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08005455136299133, + "learning_rate": 0.00019998100210203942, + "loss": 1.6703, + "step": 62 + }, + { + "epoch": 0.06350806451612903, + "grad_norm": 0.09527427703142166, + "learning_rate": 0.0001999777039784337, + "loss": 1.7896, + "step": 63 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 0.10536834597587585, + "learning_rate": 0.00019997414204566915, + "loss": 1.7909, + "step": 64 + }, + { + "epoch": 0.0655241935483871, + "grad_norm": 0.08326593041419983, + "learning_rate": 0.0001999703163131445, + "loss": 1.7501, + "step": 65 + }, + { + "epoch": 0.06653225806451613, + "grad_norm": 0.0823182687163353, + "learning_rate": 0.00019996622679095468, + "loss": 1.7625, + "step": 66 + }, + { + "epoch": 0.06754032258064516, + "grad_norm": 0.07878896594047546, + "learning_rate": 0.00019996187348989063, + "loss": 1.7235, + "step": 67 + }, + { + "epoch": 0.06854838709677419, + "grad_norm": 0.0899212434887886, + "learning_rate": 0.0001999572564214393, + "loss": 1.7685, + "step": 68 + }, + { + "epoch": 0.06955645161290322, + "grad_norm": 0.07247278839349747, + "learning_rate": 0.00019995237559778363, + "loss": 1.6281, + "step": 69 + }, + { + "epoch": 0.07056451612903226, + "grad_norm": 0.08588135987520218, + "learning_rate": 0.00019994723103180265, + "loss": 1.7785, + "step": 70 + }, + { + "epoch": 0.0715725806451613, + "grad_norm": 0.12004637718200684, + "learning_rate": 0.00019994182273707107, + "loss": 1.7552, + "step": 71 + }, + { + "epoch": 0.07258064516129033, + "grad_norm": 0.1002095490694046, + "learning_rate": 0.00019993615072785978, + "loss": 1.715, + "step": 72 + }, + { + "epoch": 0.07358870967741936, + "grad_norm": 0.07339724153280258, + "learning_rate": 0.00019993021501913536, + "loss": 1.7019, + "step": 73 + }, + { + "epoch": 0.07459677419354839, + "grad_norm": 0.1305348128080368, + "learning_rate": 0.00019992401562656022, + "loss": 1.8078, + "step": 74 + }, + { + "epoch": 0.07560483870967742, + "grad_norm": 0.09164395183324814, + "learning_rate": 0.0001999175525664926, + "loss": 1.6756, + "step": 75 + }, + { + "epoch": 0.07661290322580645, + "grad_norm": 0.0749751552939415, + "learning_rate": 0.0001999108258559864, + "loss": 1.7616, + "step": 76 + }, + { + "epoch": 0.07762096774193548, + "grad_norm": 0.1132885217666626, + "learning_rate": 0.00019990383551279136, + "loss": 1.8232, + "step": 77 + }, + { + "epoch": 0.07862903225806452, + "grad_norm": 0.0832655057311058, + "learning_rate": 0.00019989658155535262, + "loss": 1.7371, + "step": 78 + }, + { + "epoch": 0.07963709677419355, + "grad_norm": 0.09641417115926743, + "learning_rate": 0.00019988906400281116, + "loss": 1.7989, + "step": 79 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 0.08800283074378967, + "learning_rate": 0.00019988128287500335, + "loss": 1.7235, + "step": 80 + }, + { + "epoch": 0.08165322580645161, + "grad_norm": 0.0772438570857048, + "learning_rate": 0.00019987323819246108, + "loss": 1.7488, + "step": 81 + }, + { + "epoch": 0.08266129032258064, + "grad_norm": 0.09178374707698822, + "learning_rate": 0.00019986492997641175, + "loss": 1.7018, + "step": 82 + }, + { + "epoch": 0.08366935483870967, + "grad_norm": 0.09313932806253433, + "learning_rate": 0.00019985635824877802, + "loss": 1.7914, + "step": 83 + }, + { + "epoch": 0.0846774193548387, + "grad_norm": 0.0906209945678711, + "learning_rate": 0.00019984752303217797, + "loss": 1.7197, + "step": 84 + }, + { + "epoch": 0.08568548387096774, + "grad_norm": 0.09081698209047318, + "learning_rate": 0.0001998384243499249, + "loss": 1.7666, + "step": 85 + }, + { + "epoch": 0.08669354838709678, + "grad_norm": 0.07680635154247284, + "learning_rate": 0.0001998290622260273, + "loss": 1.6946, + "step": 86 + }, + { + "epoch": 0.08770161290322581, + "grad_norm": 0.0743766576051712, + "learning_rate": 0.00019981943668518888, + "loss": 1.7588, + "step": 87 + }, + { + "epoch": 0.08870967741935484, + "grad_norm": 0.07674787193536758, + "learning_rate": 0.00019980954775280832, + "loss": 1.6896, + "step": 88 + }, + { + "epoch": 0.08971774193548387, + "grad_norm": 0.07708673924207687, + "learning_rate": 0.00019979939545497933, + "loss": 1.6944, + "step": 89 + }, + { + "epoch": 0.0907258064516129, + "grad_norm": 0.07248947024345398, + "learning_rate": 0.00019978897981849056, + "loss": 1.7114, + "step": 90 + }, + { + "epoch": 0.09173387096774194, + "grad_norm": 0.07939179986715317, + "learning_rate": 0.0001997783008708256, + "loss": 1.7552, + "step": 91 + }, + { + "epoch": 0.09274193548387097, + "grad_norm": 0.09288234263658524, + "learning_rate": 0.00019976735864016276, + "loss": 1.7554, + "step": 92 + }, + { + "epoch": 0.09375, + "grad_norm": 0.08074582368135452, + "learning_rate": 0.00019975615315537506, + "loss": 1.7209, + "step": 93 + }, + { + "epoch": 0.09475806451612903, + "grad_norm": 0.08087307959794998, + "learning_rate": 0.0001997446844460302, + "loss": 1.7118, + "step": 94 + }, + { + "epoch": 0.09576612903225806, + "grad_norm": 0.08976717293262482, + "learning_rate": 0.00019973295254239044, + "loss": 1.7384, + "step": 95 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 0.08545631170272827, + "learning_rate": 0.0001997209574754125, + "loss": 1.7524, + "step": 96 + }, + { + "epoch": 0.09778225806451613, + "grad_norm": 0.07703512907028198, + "learning_rate": 0.00019970869927674753, + "loss": 1.6947, + "step": 97 + }, + { + "epoch": 0.09879032258064516, + "grad_norm": 0.07614375650882721, + "learning_rate": 0.000199696177978741, + "loss": 1.7135, + "step": 98 + }, + { + "epoch": 0.09979838709677419, + "grad_norm": 0.0809471607208252, + "learning_rate": 0.0001996833936144326, + "loss": 1.727, + "step": 99 + }, + { + "epoch": 0.10080645161290322, + "grad_norm": 0.1023879274725914, + "learning_rate": 0.00019967034621755622, + "loss": 1.7297, + "step": 100 + }, + { + "epoch": 0.10181451612903226, + "grad_norm": 0.07705037295818329, + "learning_rate": 0.00019965703582253965, + "loss": 1.6571, + "step": 101 + }, + { + "epoch": 0.1028225806451613, + "grad_norm": 0.08601151406764984, + "learning_rate": 0.00019964346246450487, + "loss": 1.7404, + "step": 102 + }, + { + "epoch": 0.10383064516129033, + "grad_norm": 0.0756453350186348, + "learning_rate": 0.00019962962617926756, + "loss": 1.7311, + "step": 103 + }, + { + "epoch": 0.10483870967741936, + "grad_norm": 0.10456051677465439, + "learning_rate": 0.00019961552700333734, + "loss": 1.7517, + "step": 104 + }, + { + "epoch": 0.10584677419354839, + "grad_norm": 0.07731463760137558, + "learning_rate": 0.00019960116497391733, + "loss": 1.716, + "step": 105 + }, + { + "epoch": 0.10685483870967742, + "grad_norm": 0.0789295881986618, + "learning_rate": 0.00019958654012890435, + "loss": 1.7233, + "step": 106 + }, + { + "epoch": 0.10786290322580645, + "grad_norm": 0.08179011940956116, + "learning_rate": 0.0001995716525068887, + "loss": 1.6556, + "step": 107 + }, + { + "epoch": 0.10887096774193548, + "grad_norm": 0.08565866947174072, + "learning_rate": 0.00019955650214715406, + "loss": 1.7512, + "step": 108 + }, + { + "epoch": 0.10987903225806452, + "grad_norm": 0.08556907624006271, + "learning_rate": 0.00019954108908967736, + "loss": 1.7522, + "step": 109 + }, + { + "epoch": 0.11088709677419355, + "grad_norm": 0.08097026497125626, + "learning_rate": 0.00019952541337512868, + "loss": 1.6656, + "step": 110 + }, + { + "epoch": 0.11189516129032258, + "grad_norm": 0.07853402197360992, + "learning_rate": 0.0001995094750448713, + "loss": 1.7299, + "step": 111 + }, + { + "epoch": 0.11290322580645161, + "grad_norm": 0.07205012440681458, + "learning_rate": 0.00019949327414096134, + "loss": 1.7118, + "step": 112 + }, + { + "epoch": 0.11391129032258064, + "grad_norm": 0.0683959424495697, + "learning_rate": 0.00019947681070614777, + "loss": 1.6742, + "step": 113 + }, + { + "epoch": 0.11491935483870967, + "grad_norm": 0.07890711724758148, + "learning_rate": 0.00019946008478387238, + "loss": 1.6962, + "step": 114 + }, + { + "epoch": 0.1159274193548387, + "grad_norm": 0.08321288973093033, + "learning_rate": 0.00019944309641826947, + "loss": 1.7552, + "step": 115 + }, + { + "epoch": 0.11693548387096774, + "grad_norm": 0.0974084734916687, + "learning_rate": 0.0001994258456541659, + "loss": 1.7971, + "step": 116 + }, + { + "epoch": 0.11794354838709678, + "grad_norm": 0.08591660857200623, + "learning_rate": 0.00019940833253708097, + "loss": 1.7644, + "step": 117 + }, + { + "epoch": 0.11895161290322581, + "grad_norm": 0.07388189435005188, + "learning_rate": 0.00019939055711322616, + "loss": 1.6513, + "step": 118 + }, + { + "epoch": 0.11995967741935484, + "grad_norm": 0.07635471969842911, + "learning_rate": 0.00019937251942950512, + "loss": 1.7005, + "step": 119 + }, + { + "epoch": 0.12096774193548387, + "grad_norm": 0.08252502232789993, + "learning_rate": 0.0001993542195335135, + "loss": 1.7267, + "step": 120 + }, + { + "epoch": 0.1219758064516129, + "grad_norm": 0.10845799744129181, + "learning_rate": 0.0001993356574735389, + "loss": 1.7756, + "step": 121 + }, + { + "epoch": 0.12298387096774194, + "grad_norm": 0.07942607253789902, + "learning_rate": 0.00019931683329856066, + "loss": 1.6849, + "step": 122 + }, + { + "epoch": 0.12399193548387097, + "grad_norm": 0.08841695636510849, + "learning_rate": 0.00019929774705824973, + "loss": 1.7343, + "step": 123 + }, + { + "epoch": 0.125, + "grad_norm": 0.09001098573207855, + "learning_rate": 0.0001992783988029686, + "loss": 1.7534, + "step": 124 + }, + { + "epoch": 0.12600806451612903, + "grad_norm": 0.07412228733301163, + "learning_rate": 0.00019925878858377113, + "loss": 1.7125, + "step": 125 + }, + { + "epoch": 0.12701612903225806, + "grad_norm": 0.09205227345228195, + "learning_rate": 0.00019923891645240238, + "loss": 1.6712, + "step": 126 + }, + { + "epoch": 0.1280241935483871, + "grad_norm": 0.07850176095962524, + "learning_rate": 0.00019921878246129858, + "loss": 1.6747, + "step": 127 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 0.07801543176174164, + "learning_rate": 0.00019919838666358688, + "loss": 1.6799, + "step": 128 + }, + { + "epoch": 0.13004032258064516, + "grad_norm": 0.08263793587684631, + "learning_rate": 0.00019917772911308524, + "loss": 1.7368, + "step": 129 + }, + { + "epoch": 0.1310483870967742, + "grad_norm": 0.10233369469642639, + "learning_rate": 0.00019915680986430233, + "loss": 1.7377, + "step": 130 + }, + { + "epoch": 0.13205645161290322, + "grad_norm": 0.08960834890604019, + "learning_rate": 0.00019913562897243736, + "loss": 1.7146, + "step": 131 + }, + { + "epoch": 0.13306451612903225, + "grad_norm": 0.07425748556852341, + "learning_rate": 0.00019911418649337997, + "loss": 1.6796, + "step": 132 + }, + { + "epoch": 0.13407258064516128, + "grad_norm": 0.11380482465028763, + "learning_rate": 0.00019909248248370988, + "loss": 1.7688, + "step": 133 + }, + { + "epoch": 0.1350806451612903, + "grad_norm": 0.09946684539318085, + "learning_rate": 0.00019907051700069714, + "loss": 1.7016, + "step": 134 + }, + { + "epoch": 0.13608870967741934, + "grad_norm": 0.07686997205018997, + "learning_rate": 0.0001990482901023016, + "loss": 1.7209, + "step": 135 + }, + { + "epoch": 0.13709677419354838, + "grad_norm": 0.08980387449264526, + "learning_rate": 0.0001990258018471729, + "loss": 1.6922, + "step": 136 + }, + { + "epoch": 0.1381048387096774, + "grad_norm": 0.08946418762207031, + "learning_rate": 0.00019900305229465036, + "loss": 1.7231, + "step": 137 + }, + { + "epoch": 0.13911290322580644, + "grad_norm": 0.07228976488113403, + "learning_rate": 0.00019898004150476278, + "loss": 1.6864, + "step": 138 + }, + { + "epoch": 0.14012096774193547, + "grad_norm": 0.09577012807130814, + "learning_rate": 0.00019895676953822822, + "loss": 1.6812, + "step": 139 + }, + { + "epoch": 0.14112903225806453, + "grad_norm": 0.08688167482614517, + "learning_rate": 0.00019893323645645404, + "loss": 1.738, + "step": 140 + }, + { + "epoch": 0.14213709677419356, + "grad_norm": 0.07488682866096497, + "learning_rate": 0.00019890944232153643, + "loss": 1.6202, + "step": 141 + }, + { + "epoch": 0.1431451612903226, + "grad_norm": 0.09752912074327469, + "learning_rate": 0.00019888538719626053, + "loss": 1.7006, + "step": 142 + }, + { + "epoch": 0.14415322580645162, + "grad_norm": 0.08033961057662964, + "learning_rate": 0.0001988610711441001, + "loss": 1.7119, + "step": 143 + }, + { + "epoch": 0.14516129032258066, + "grad_norm": 0.07507845759391785, + "learning_rate": 0.00019883649422921745, + "loss": 1.6504, + "step": 144 + }, + { + "epoch": 0.1461693548387097, + "grad_norm": 0.07756344974040985, + "learning_rate": 0.00019881165651646317, + "loss": 1.7107, + "step": 145 + }, + { + "epoch": 0.14717741935483872, + "grad_norm": 0.07581036537885666, + "learning_rate": 0.00019878655807137603, + "loss": 1.6777, + "step": 146 + }, + { + "epoch": 0.14818548387096775, + "grad_norm": 0.06943333894014359, + "learning_rate": 0.0001987611989601828, + "loss": 1.6282, + "step": 147 + }, + { + "epoch": 0.14919354838709678, + "grad_norm": 0.07314992696046829, + "learning_rate": 0.00019873557924979804, + "loss": 1.6773, + "step": 148 + }, + { + "epoch": 0.1502016129032258, + "grad_norm": 0.08181635290384293, + "learning_rate": 0.000198709699007824, + "loss": 1.668, + "step": 149 + }, + { + "epoch": 0.15120967741935484, + "grad_norm": 0.07046262919902802, + "learning_rate": 0.00019868355830255033, + "loss": 1.6857, + "step": 150 + }, + { + "epoch": 0.15221774193548387, + "grad_norm": 0.07162804901599884, + "learning_rate": 0.00019865715720295397, + "loss": 1.6299, + "step": 151 + }, + { + "epoch": 0.1532258064516129, + "grad_norm": 0.0785004273056984, + "learning_rate": 0.00019863049577869898, + "loss": 1.6651, + "step": 152 + }, + { + "epoch": 0.15423387096774194, + "grad_norm": 0.06895990669727325, + "learning_rate": 0.00019860357410013638, + "loss": 1.636, + "step": 153 + }, + { + "epoch": 0.15524193548387097, + "grad_norm": 0.0736781507730484, + "learning_rate": 0.00019857639223830377, + "loss": 1.6859, + "step": 154 + }, + { + "epoch": 0.15625, + "grad_norm": 0.07190602272748947, + "learning_rate": 0.00019854895026492545, + "loss": 1.706, + "step": 155 + }, + { + "epoch": 0.15725806451612903, + "grad_norm": 0.07781372219324112, + "learning_rate": 0.00019852124825241201, + "loss": 1.7015, + "step": 156 + }, + { + "epoch": 0.15826612903225806, + "grad_norm": 0.08466929197311401, + "learning_rate": 0.0001984932862738601, + "loss": 1.6684, + "step": 157 + }, + { + "epoch": 0.1592741935483871, + "grad_norm": 0.08189702033996582, + "learning_rate": 0.00019846506440305257, + "loss": 1.6914, + "step": 158 + }, + { + "epoch": 0.16028225806451613, + "grad_norm": 0.08032141625881195, + "learning_rate": 0.00019843658271445776, + "loss": 1.6574, + "step": 159 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 0.08438081294298172, + "learning_rate": 0.00019840784128322985, + "loss": 1.7503, + "step": 160 + }, + { + "epoch": 0.1622983870967742, + "grad_norm": 0.10350456833839417, + "learning_rate": 0.0001983788401852082, + "loss": 1.697, + "step": 161 + }, + { + "epoch": 0.16330645161290322, + "grad_norm": 0.08714311569929123, + "learning_rate": 0.00019834957949691747, + "loss": 1.7595, + "step": 162 + }, + { + "epoch": 0.16431451612903225, + "grad_norm": 0.08562017232179642, + "learning_rate": 0.00019832005929556722, + "loss": 1.7502, + "step": 163 + }, + { + "epoch": 0.16532258064516128, + "grad_norm": 0.0961882621049881, + "learning_rate": 0.00019829027965905186, + "loss": 1.6875, + "step": 164 + }, + { + "epoch": 0.1663306451612903, + "grad_norm": 0.09505471587181091, + "learning_rate": 0.00019826024066595027, + "loss": 1.6958, + "step": 165 + }, + { + "epoch": 0.16733870967741934, + "grad_norm": 0.07493823021650314, + "learning_rate": 0.00019822994239552573, + "loss": 1.6677, + "step": 166 + }, + { + "epoch": 0.16834677419354838, + "grad_norm": 0.09159812331199646, + "learning_rate": 0.00019819938492772568, + "loss": 1.6994, + "step": 167 + }, + { + "epoch": 0.1693548387096774, + "grad_norm": 0.1118432804942131, + "learning_rate": 0.00019816856834318155, + "loss": 1.7143, + "step": 168 + }, + { + "epoch": 0.17036290322580644, + "grad_norm": 0.09199640899896622, + "learning_rate": 0.0001981374927232084, + "loss": 1.6896, + "step": 169 + }, + { + "epoch": 0.17137096774193547, + "grad_norm": 0.0801042765378952, + "learning_rate": 0.00019810615814980483, + "loss": 1.7292, + "step": 170 + }, + { + "epoch": 0.17237903225806453, + "grad_norm": 0.1115993857383728, + "learning_rate": 0.00019807456470565283, + "loss": 1.6995, + "step": 171 + }, + { + "epoch": 0.17338709677419356, + "grad_norm": 0.10155931115150452, + "learning_rate": 0.00019804271247411727, + "loss": 1.6984, + "step": 172 + }, + { + "epoch": 0.1743951612903226, + "grad_norm": 0.07809167355298996, + "learning_rate": 0.00019801060153924608, + "loss": 1.7152, + "step": 173 + }, + { + "epoch": 0.17540322580645162, + "grad_norm": 0.08765136450529099, + "learning_rate": 0.0001979782319857697, + "loss": 1.6451, + "step": 174 + }, + { + "epoch": 0.17641129032258066, + "grad_norm": 0.07360592484474182, + "learning_rate": 0.00019794560389910102, + "loss": 1.6889, + "step": 175 + }, + { + "epoch": 0.1774193548387097, + "grad_norm": 0.09308324754238129, + "learning_rate": 0.00019791271736533512, + "loss": 1.7225, + "step": 176 + }, + { + "epoch": 0.17842741935483872, + "grad_norm": 0.08810586482286453, + "learning_rate": 0.00019787957247124907, + "loss": 1.6808, + "step": 177 + }, + { + "epoch": 0.17943548387096775, + "grad_norm": 0.07750339061021805, + "learning_rate": 0.00019784616930430157, + "loss": 1.6324, + "step": 178 + }, + { + "epoch": 0.18044354838709678, + "grad_norm": 0.08474040031433105, + "learning_rate": 0.00019781250795263295, + "loss": 1.6858, + "step": 179 + }, + { + "epoch": 0.1814516129032258, + "grad_norm": 0.08277326822280884, + "learning_rate": 0.0001977785885050647, + "loss": 1.7043, + "step": 180 + }, + { + "epoch": 0.18245967741935484, + "grad_norm": 0.07668858766555786, + "learning_rate": 0.00019774441105109943, + "loss": 1.6599, + "step": 181 + }, + { + "epoch": 0.18346774193548387, + "grad_norm": 0.07402200996875763, + "learning_rate": 0.00019770997568092046, + "loss": 1.6524, + "step": 182 + }, + { + "epoch": 0.1844758064516129, + "grad_norm": 0.08267819881439209, + "learning_rate": 0.0001976752824853917, + "loss": 1.6973, + "step": 183 + }, + { + "epoch": 0.18548387096774194, + "grad_norm": 0.0688646137714386, + "learning_rate": 0.00019764033155605747, + "loss": 1.63, + "step": 184 + }, + { + "epoch": 0.18649193548387097, + "grad_norm": 0.0818399116396904, + "learning_rate": 0.00019760512298514198, + "loss": 1.6773, + "step": 185 + }, + { + "epoch": 0.1875, + "grad_norm": 0.08086924254894257, + "learning_rate": 0.0001975696568655494, + "loss": 1.7037, + "step": 186 + }, + { + "epoch": 0.18850806451612903, + "grad_norm": 0.08136597275733948, + "learning_rate": 0.00019753393329086354, + "loss": 1.6634, + "step": 187 + }, + { + "epoch": 0.18951612903225806, + "grad_norm": 0.10008742660284042, + "learning_rate": 0.00019749795235534737, + "loss": 1.7139, + "step": 188 + }, + { + "epoch": 0.1905241935483871, + "grad_norm": 0.08657586574554443, + "learning_rate": 0.0001974617141539432, + "loss": 1.6877, + "step": 189 + }, + { + "epoch": 0.19153225806451613, + "grad_norm": 0.09825193136930466, + "learning_rate": 0.0001974252187822719, + "loss": 1.7274, + "step": 190 + }, + { + "epoch": 0.19254032258064516, + "grad_norm": 0.06964825093746185, + "learning_rate": 0.00019738846633663318, + "loss": 1.6431, + "step": 191 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 0.07197541743516922, + "learning_rate": 0.0001973514569140049, + "loss": 1.6532, + "step": 192 + }, + { + "epoch": 0.19455645161290322, + "grad_norm": 0.07691382616758347, + "learning_rate": 0.00019731419061204316, + "loss": 1.6816, + "step": 193 + }, + { + "epoch": 0.19556451612903225, + "grad_norm": 0.08229187875986099, + "learning_rate": 0.00019727666752908173, + "loss": 1.6471, + "step": 194 + }, + { + "epoch": 0.19657258064516128, + "grad_norm": 0.0788332000374794, + "learning_rate": 0.00019723888776413206, + "loss": 1.6745, + "step": 195 + }, + { + "epoch": 0.1975806451612903, + "grad_norm": 0.08446817100048065, + "learning_rate": 0.00019720085141688285, + "loss": 1.6863, + "step": 196 + }, + { + "epoch": 0.19858870967741934, + "grad_norm": 0.0747678205370903, + "learning_rate": 0.00019716255858769982, + "loss": 1.6553, + "step": 197 + }, + { + "epoch": 0.19959677419354838, + "grad_norm": 0.08248293399810791, + "learning_rate": 0.0001971240093776255, + "loss": 1.7021, + "step": 198 + }, + { + "epoch": 0.2006048387096774, + "grad_norm": 0.0832241103053093, + "learning_rate": 0.00019708520388837897, + "loss": 1.6832, + "step": 199 + }, + { + "epoch": 0.20161290322580644, + "grad_norm": 0.10792431235313416, + "learning_rate": 0.00019704614222235543, + "loss": 1.7196, + "step": 200 + }, + { + "epoch": 0.20262096774193547, + "grad_norm": 0.09173596650362015, + "learning_rate": 0.0001970068244826261, + "loss": 1.7039, + "step": 201 + }, + { + "epoch": 0.20362903225806453, + "grad_norm": 0.07657129317522049, + "learning_rate": 0.00019696725077293796, + "loss": 1.6614, + "step": 202 + }, + { + "epoch": 0.20463709677419356, + "grad_norm": 0.08881079405546188, + "learning_rate": 0.00019692742119771338, + "loss": 1.7062, + "step": 203 + }, + { + "epoch": 0.2056451612903226, + "grad_norm": 0.11070767790079117, + "learning_rate": 0.00019688733586204976, + "loss": 1.7759, + "step": 204 + }, + { + "epoch": 0.20665322580645162, + "grad_norm": 0.07556972652673721, + "learning_rate": 0.00019684699487171957, + "loss": 1.6664, + "step": 205 + }, + { + "epoch": 0.20766129032258066, + "grad_norm": 0.11293460428714752, + "learning_rate": 0.00019680639833316975, + "loss": 1.7476, + "step": 206 + }, + { + "epoch": 0.2086693548387097, + "grad_norm": 0.08948105573654175, + "learning_rate": 0.00019676554635352154, + "loss": 1.6933, + "step": 207 + }, + { + "epoch": 0.20967741935483872, + "grad_norm": 0.1004069596529007, + "learning_rate": 0.00019672443904057024, + "loss": 1.6909, + "step": 208 + }, + { + "epoch": 0.21068548387096775, + "grad_norm": 0.0815928652882576, + "learning_rate": 0.00019668307650278492, + "loss": 1.6881, + "step": 209 + }, + { + "epoch": 0.21169354838709678, + "grad_norm": 0.10198971629142761, + "learning_rate": 0.00019664145884930808, + "loss": 1.6653, + "step": 210 + }, + { + "epoch": 0.2127016129032258, + "grad_norm": 0.07174786180257797, + "learning_rate": 0.00019659958618995532, + "loss": 1.6204, + "step": 211 + }, + { + "epoch": 0.21370967741935484, + "grad_norm": 0.09819284826517105, + "learning_rate": 0.0001965574586352153, + "loss": 1.6574, + "step": 212 + }, + { + "epoch": 0.21471774193548387, + "grad_norm": 0.07578348368406296, + "learning_rate": 0.00019651507629624902, + "loss": 1.7096, + "step": 213 + }, + { + "epoch": 0.2157258064516129, + "grad_norm": 0.09160558879375458, + "learning_rate": 0.00019647243928489, + "loss": 1.673, + "step": 214 + }, + { + "epoch": 0.21673387096774194, + "grad_norm": 0.07697172462940216, + "learning_rate": 0.00019642954771364362, + "loss": 1.7069, + "step": 215 + }, + { + "epoch": 0.21774193548387097, + "grad_norm": 0.0956280305981636, + "learning_rate": 0.00019638640169568702, + "loss": 1.6727, + "step": 216 + }, + { + "epoch": 0.21875, + "grad_norm": 0.0775306299328804, + "learning_rate": 0.00019634300134486877, + "loss": 1.6846, + "step": 217 + }, + { + "epoch": 0.21975806451612903, + "grad_norm": 0.11724736541509628, + "learning_rate": 0.00019629934677570848, + "loss": 1.6723, + "step": 218 + }, + { + "epoch": 0.22076612903225806, + "grad_norm": 0.08374209702014923, + "learning_rate": 0.00019625543810339652, + "loss": 1.6552, + "step": 219 + }, + { + "epoch": 0.2217741935483871, + "grad_norm": 0.09895430505275726, + "learning_rate": 0.00019621127544379392, + "loss": 1.6843, + "step": 220 + }, + { + "epoch": 0.22278225806451613, + "grad_norm": 0.07595435529947281, + "learning_rate": 0.00019616685891343173, + "loss": 1.6878, + "step": 221 + }, + { + "epoch": 0.22379032258064516, + "grad_norm": 0.10327397286891937, + "learning_rate": 0.00019612218862951098, + "loss": 1.641, + "step": 222 + }, + { + "epoch": 0.2247983870967742, + "grad_norm": 0.08979543298482895, + "learning_rate": 0.00019607726470990229, + "loss": 1.7116, + "step": 223 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 0.08411210030317307, + "learning_rate": 0.00019603208727314543, + "loss": 1.6503, + "step": 224 + }, + { + "epoch": 0.22681451612903225, + "grad_norm": 0.08849965780973434, + "learning_rate": 0.00019598665643844924, + "loss": 1.7119, + "step": 225 + }, + { + "epoch": 0.22782258064516128, + "grad_norm": 0.08358252048492432, + "learning_rate": 0.00019594097232569118, + "loss": 1.7034, + "step": 226 + }, + { + "epoch": 0.2288306451612903, + "grad_norm": 0.08862830698490143, + "learning_rate": 0.0001958950350554169, + "loss": 1.6937, + "step": 227 + }, + { + "epoch": 0.22983870967741934, + "grad_norm": 0.09029026329517365, + "learning_rate": 0.00019584884474884025, + "loss": 1.6537, + "step": 228 + }, + { + "epoch": 0.23084677419354838, + "grad_norm": 0.0766313225030899, + "learning_rate": 0.00019580240152784265, + "loss": 1.6399, + "step": 229 + }, + { + "epoch": 0.2318548387096774, + "grad_norm": 0.09331216663122177, + "learning_rate": 0.00019575570551497287, + "loss": 1.6876, + "step": 230 + }, + { + "epoch": 0.23286290322580644, + "grad_norm": 0.07506153732538223, + "learning_rate": 0.00019570875683344672, + "loss": 1.6339, + "step": 231 + }, + { + "epoch": 0.23387096774193547, + "grad_norm": 0.08822404593229294, + "learning_rate": 0.0001956615556071468, + "loss": 1.6883, + "step": 232 + }, + { + "epoch": 0.23487903225806453, + "grad_norm": 0.07617950439453125, + "learning_rate": 0.000195614101960622, + "loss": 1.6845, + "step": 233 + }, + { + "epoch": 0.23588709677419356, + "grad_norm": 0.0857347846031189, + "learning_rate": 0.00019556639601908728, + "loss": 1.6769, + "step": 234 + }, + { + "epoch": 0.2368951612903226, + "grad_norm": 0.08155297487974167, + "learning_rate": 0.00019551843790842338, + "loss": 1.7275, + "step": 235 + }, + { + "epoch": 0.23790322580645162, + "grad_norm": 0.08427773416042328, + "learning_rate": 0.00019547022775517645, + "loss": 1.627, + "step": 236 + }, + { + "epoch": 0.23891129032258066, + "grad_norm": 0.0765247493982315, + "learning_rate": 0.00019542176568655757, + "loss": 1.6719, + "step": 237 + }, + { + "epoch": 0.2399193548387097, + "grad_norm": 0.07752780616283417, + "learning_rate": 0.00019537305183044268, + "loss": 1.6307, + "step": 238 + }, + { + "epoch": 0.24092741935483872, + "grad_norm": 0.07956812530755997, + "learning_rate": 0.00019532408631537203, + "loss": 1.6466, + "step": 239 + }, + { + "epoch": 0.24193548387096775, + "grad_norm": 0.07456839084625244, + "learning_rate": 0.00019527486927054994, + "loss": 1.6692, + "step": 240 + }, + { + "epoch": 0.24294354838709678, + "grad_norm": 0.08381907641887665, + "learning_rate": 0.00019522540082584443, + "loss": 1.679, + "step": 241 + }, + { + "epoch": 0.2439516129032258, + "grad_norm": 0.07443513721227646, + "learning_rate": 0.0001951756811117869, + "loss": 1.6867, + "step": 242 + }, + { + "epoch": 0.24495967741935484, + "grad_norm": 0.08541234582662582, + "learning_rate": 0.00019512571025957182, + "loss": 1.6424, + "step": 243 + }, + { + "epoch": 0.24596774193548387, + "grad_norm": 0.07867056876420975, + "learning_rate": 0.00019507548840105618, + "loss": 1.6847, + "step": 244 + }, + { + "epoch": 0.2469758064516129, + "grad_norm": 0.11804165691137314, + "learning_rate": 0.00019502501566875943, + "loss": 1.783, + "step": 245 + }, + { + "epoch": 0.24798387096774194, + "grad_norm": 0.0737847164273262, + "learning_rate": 0.00019497429219586296, + "loss": 1.6644, + "step": 246 + }, + { + "epoch": 0.24899193548387097, + "grad_norm": 0.08608712255954742, + "learning_rate": 0.00019492331811620976, + "loss": 1.6763, + "step": 247 + }, + { + "epoch": 0.25, + "grad_norm": 0.09786904603242874, + "learning_rate": 0.00019487209356430413, + "loss": 1.7245, + "step": 248 + }, + { + "epoch": 0.25100806451612906, + "grad_norm": 0.10795535892248154, + "learning_rate": 0.00019482061867531127, + "loss": 1.7183, + "step": 249 + }, + { + "epoch": 0.25201612903225806, + "grad_norm": 0.0815276950597763, + "learning_rate": 0.0001947688935850569, + "loss": 1.7026, + "step": 250 + }, + { + "epoch": 0.2530241935483871, + "grad_norm": 0.09202085435390472, + "learning_rate": 0.00019471691843002701, + "loss": 1.6327, + "step": 251 + }, + { + "epoch": 0.2540322580645161, + "grad_norm": 0.08682993054389954, + "learning_rate": 0.00019466469334736739, + "loss": 1.6532, + "step": 252 + }, + { + "epoch": 0.2550403225806452, + "grad_norm": 0.08007092773914337, + "learning_rate": 0.00019461221847488333, + "loss": 1.6587, + "step": 253 + }, + { + "epoch": 0.2560483870967742, + "grad_norm": 0.12094767391681671, + "learning_rate": 0.0001945594939510392, + "loss": 1.7491, + "step": 254 + }, + { + "epoch": 0.25705645161290325, + "grad_norm": 0.10074511170387268, + "learning_rate": 0.00019450651991495812, + "loss": 1.7363, + "step": 255 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 0.0891348272562027, + "learning_rate": 0.00019445329650642163, + "loss": 1.6925, + "step": 256 + }, + { + "epoch": 0.2590725806451613, + "grad_norm": 0.1022176444530487, + "learning_rate": 0.00019439982386586932, + "loss": 1.6419, + "step": 257 + }, + { + "epoch": 0.2600806451612903, + "grad_norm": 0.08925571292638779, + "learning_rate": 0.00019434610213439832, + "loss": 1.6575, + "step": 258 + }, + { + "epoch": 0.2610887096774194, + "grad_norm": 0.07562322169542313, + "learning_rate": 0.0001942921314537631, + "loss": 1.6187, + "step": 259 + }, + { + "epoch": 0.2620967741935484, + "grad_norm": 0.09982999414205551, + "learning_rate": 0.000194237911966375, + "loss": 1.6341, + "step": 260 + }, + { + "epoch": 0.26310483870967744, + "grad_norm": 0.08155392110347748, + "learning_rate": 0.0001941834438153019, + "loss": 1.7189, + "step": 261 + }, + { + "epoch": 0.26411290322580644, + "grad_norm": 0.08979921042919159, + "learning_rate": 0.00019412872714426782, + "loss": 1.6556, + "step": 262 + }, + { + "epoch": 0.2651209677419355, + "grad_norm": 0.08493686467409134, + "learning_rate": 0.00019407376209765255, + "loss": 1.6919, + "step": 263 + }, + { + "epoch": 0.2661290322580645, + "grad_norm": 0.0822565034031868, + "learning_rate": 0.0001940185488204912, + "loss": 1.6205, + "step": 264 + }, + { + "epoch": 0.26713709677419356, + "grad_norm": 0.08931294083595276, + "learning_rate": 0.00019396308745847402, + "loss": 1.6848, + "step": 265 + }, + { + "epoch": 0.26814516129032256, + "grad_norm": 0.08736932277679443, + "learning_rate": 0.00019390737815794574, + "loss": 1.6882, + "step": 266 + }, + { + "epoch": 0.2691532258064516, + "grad_norm": 0.09153414517641068, + "learning_rate": 0.00019385142106590535, + "loss": 1.7596, + "step": 267 + }, + { + "epoch": 0.2701612903225806, + "grad_norm": 0.07890645414590836, + "learning_rate": 0.00019379521633000572, + "loss": 1.6987, + "step": 268 + }, + { + "epoch": 0.2711693548387097, + "grad_norm": 0.08790858089923859, + "learning_rate": 0.0001937387640985532, + "loss": 1.6744, + "step": 269 + }, + { + "epoch": 0.2721774193548387, + "grad_norm": 0.0803663581609726, + "learning_rate": 0.00019368206452050713, + "loss": 1.6846, + "step": 270 + }, + { + "epoch": 0.27318548387096775, + "grad_norm": 0.09086322039365768, + "learning_rate": 0.00019362511774547955, + "loss": 1.6878, + "step": 271 + }, + { + "epoch": 0.27419354838709675, + "grad_norm": 0.07199586182832718, + "learning_rate": 0.00019356792392373479, + "loss": 1.6316, + "step": 272 + }, + { + "epoch": 0.2752016129032258, + "grad_norm": 0.08460623025894165, + "learning_rate": 0.00019351048320618896, + "loss": 1.6558, + "step": 273 + }, + { + "epoch": 0.2762096774193548, + "grad_norm": 0.0732608363032341, + "learning_rate": 0.0001934527957444098, + "loss": 1.6752, + "step": 274 + }, + { + "epoch": 0.2772177419354839, + "grad_norm": 0.0906132385134697, + "learning_rate": 0.00019339486169061608, + "loss": 1.7395, + "step": 275 + }, + { + "epoch": 0.2782258064516129, + "grad_norm": 0.07827211916446686, + "learning_rate": 0.00019333668119767716, + "loss": 1.6681, + "step": 276 + }, + { + "epoch": 0.27923387096774194, + "grad_norm": 0.08276840299367905, + "learning_rate": 0.00019327825441911275, + "loss": 1.6645, + "step": 277 + }, + { + "epoch": 0.28024193548387094, + "grad_norm": 0.09114561229944229, + "learning_rate": 0.00019321958150909243, + "loss": 1.6857, + "step": 278 + }, + { + "epoch": 0.28125, + "grad_norm": 0.08729056268930435, + "learning_rate": 0.00019316066262243525, + "loss": 1.6483, + "step": 279 + }, + { + "epoch": 0.28225806451612906, + "grad_norm": 0.08572946488857269, + "learning_rate": 0.00019310149791460925, + "loss": 1.6872, + "step": 280 + }, + { + "epoch": 0.28326612903225806, + "grad_norm": 0.10044838488101959, + "learning_rate": 0.00019304208754173117, + "loss": 1.6935, + "step": 281 + }, + { + "epoch": 0.2842741935483871, + "grad_norm": 0.0785636454820633, + "learning_rate": 0.000192982431660566, + "loss": 1.6613, + "step": 282 + }, + { + "epoch": 0.2852822580645161, + "grad_norm": 0.08499724417924881, + "learning_rate": 0.00019292253042852648, + "loss": 1.6208, + "step": 283 + }, + { + "epoch": 0.2862903225806452, + "grad_norm": 0.09399082511663437, + "learning_rate": 0.00019286238400367277, + "loss": 1.619, + "step": 284 + }, + { + "epoch": 0.2872983870967742, + "grad_norm": 0.07334808260202408, + "learning_rate": 0.0001928019925447121, + "loss": 1.6813, + "step": 285 + }, + { + "epoch": 0.28830645161290325, + "grad_norm": 0.09035395085811615, + "learning_rate": 0.00019274135621099813, + "loss": 1.6265, + "step": 286 + }, + { + "epoch": 0.28931451612903225, + "grad_norm": 0.07861501723527908, + "learning_rate": 0.00019268047516253077, + "loss": 1.6808, + "step": 287 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 0.09788773208856583, + "learning_rate": 0.00019261934955995563, + "loss": 1.708, + "step": 288 + }, + { + "epoch": 0.2913306451612903, + "grad_norm": 0.07571721822023392, + "learning_rate": 0.00019255797956456357, + "loss": 1.6612, + "step": 289 + }, + { + "epoch": 0.2923387096774194, + "grad_norm": 0.0836874321103096, + "learning_rate": 0.00019249636533829042, + "loss": 1.6804, + "step": 290 + }, + { + "epoch": 0.2933467741935484, + "grad_norm": 0.08373916894197464, + "learning_rate": 0.00019243450704371632, + "loss": 1.6317, + "step": 291 + }, + { + "epoch": 0.29435483870967744, + "grad_norm": 0.08029752969741821, + "learning_rate": 0.00019237240484406561, + "loss": 1.6782, + "step": 292 + }, + { + "epoch": 0.29536290322580644, + "grad_norm": 0.08353215456008911, + "learning_rate": 0.00019231005890320602, + "loss": 1.6517, + "step": 293 + }, + { + "epoch": 0.2963709677419355, + "grad_norm": 0.09467596560716629, + "learning_rate": 0.00019224746938564859, + "loss": 1.6862, + "step": 294 + }, + { + "epoch": 0.2973790322580645, + "grad_norm": 0.10909095406532288, + "learning_rate": 0.000192184636456547, + "loss": 1.6579, + "step": 295 + }, + { + "epoch": 0.29838709677419356, + "grad_norm": 0.08434964716434479, + "learning_rate": 0.00019212156028169724, + "loss": 1.6516, + "step": 296 + }, + { + "epoch": 0.29939516129032256, + "grad_norm": 0.09146866202354431, + "learning_rate": 0.00019205824102753717, + "loss": 1.6754, + "step": 297 + }, + { + "epoch": 0.3004032258064516, + "grad_norm": 0.10936370491981506, + "learning_rate": 0.00019199467886114603, + "loss": 1.6495, + "step": 298 + }, + { + "epoch": 0.3014112903225806, + "grad_norm": 0.08099015057086945, + "learning_rate": 0.00019193087395024397, + "loss": 1.6656, + "step": 299 + }, + { + "epoch": 0.3024193548387097, + "grad_norm": 0.09252738207578659, + "learning_rate": 0.0001918668264631918, + "loss": 1.6711, + "step": 300 + }, + { + "epoch": 0.3034274193548387, + "grad_norm": 0.08917499333620071, + "learning_rate": 0.0001918025365689903, + "loss": 1.6356, + "step": 301 + }, + { + "epoch": 0.30443548387096775, + "grad_norm": 0.088597372174263, + "learning_rate": 0.00019173800443727994, + "loss": 1.6659, + "step": 302 + }, + { + "epoch": 0.30544354838709675, + "grad_norm": 0.09308971464633942, + "learning_rate": 0.00019167323023834033, + "loss": 1.7218, + "step": 303 + }, + { + "epoch": 0.3064516129032258, + "grad_norm": 0.07813969999551773, + "learning_rate": 0.00019160821414308988, + "loss": 1.6042, + "step": 304 + }, + { + "epoch": 0.3074596774193548, + "grad_norm": 0.08843039721250534, + "learning_rate": 0.0001915429563230853, + "loss": 1.6409, + "step": 305 + }, + { + "epoch": 0.3084677419354839, + "grad_norm": 0.09537311643362045, + "learning_rate": 0.00019147745695052097, + "loss": 1.6723, + "step": 306 + }, + { + "epoch": 0.3094758064516129, + "grad_norm": 0.08754942566156387, + "learning_rate": 0.00019141171619822882, + "loss": 1.643, + "step": 307 + }, + { + "epoch": 0.31048387096774194, + "grad_norm": 0.07768256217241287, + "learning_rate": 0.0001913457342396777, + "loss": 1.6109, + "step": 308 + }, + { + "epoch": 0.31149193548387094, + "grad_norm": 0.09593945741653442, + "learning_rate": 0.00019127951124897283, + "loss": 1.6756, + "step": 309 + }, + { + "epoch": 0.3125, + "grad_norm": 0.07348258048295975, + "learning_rate": 0.00019121304740085546, + "loss": 1.623, + "step": 310 + }, + { + "epoch": 0.31350806451612906, + "grad_norm": 0.08579769730567932, + "learning_rate": 0.0001911463428707025, + "loss": 1.658, + "step": 311 + }, + { + "epoch": 0.31451612903225806, + "grad_norm": 0.08485422283411026, + "learning_rate": 0.00019107939783452577, + "loss": 1.655, + "step": 312 + }, + { + "epoch": 0.3155241935483871, + "grad_norm": 0.08101114630699158, + "learning_rate": 0.00019101221246897184, + "loss": 1.6391, + "step": 313 + }, + { + "epoch": 0.3165322580645161, + "grad_norm": 0.08206996321678162, + "learning_rate": 0.00019094478695132138, + "loss": 1.6131, + "step": 314 + }, + { + "epoch": 0.3175403225806452, + "grad_norm": 0.07818609476089478, + "learning_rate": 0.00019087712145948868, + "loss": 1.6632, + "step": 315 + }, + { + "epoch": 0.3185483870967742, + "grad_norm": 0.09414539486169815, + "learning_rate": 0.0001908092161720214, + "loss": 1.6717, + "step": 316 + }, + { + "epoch": 0.31955645161290325, + "grad_norm": 0.08382460474967957, + "learning_rate": 0.00019074107126809984, + "loss": 1.6867, + "step": 317 + }, + { + "epoch": 0.32056451612903225, + "grad_norm": 0.07750436663627625, + "learning_rate": 0.00019067268692753655, + "loss": 1.6311, + "step": 318 + }, + { + "epoch": 0.3215725806451613, + "grad_norm": 0.08067768812179565, + "learning_rate": 0.00019060406333077596, + "loss": 1.6681, + "step": 319 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 0.074059396982193, + "learning_rate": 0.00019053520065889375, + "loss": 1.6408, + "step": 320 + }, + { + "epoch": 0.3235887096774194, + "grad_norm": 0.10559958219528198, + "learning_rate": 0.00019046609909359648, + "loss": 1.7342, + "step": 321 + }, + { + "epoch": 0.3245967741935484, + "grad_norm": 0.08121935278177261, + "learning_rate": 0.00019039675881722104, + "loss": 1.6808, + "step": 322 + }, + { + "epoch": 0.32560483870967744, + "grad_norm": 0.08211352676153183, + "learning_rate": 0.00019032718001273427, + "loss": 1.6127, + "step": 323 + }, + { + "epoch": 0.32661290322580644, + "grad_norm": 0.07450398057699203, + "learning_rate": 0.0001902573628637323, + "loss": 1.6555, + "step": 324 + }, + { + "epoch": 0.3276209677419355, + "grad_norm": 0.0976330116391182, + "learning_rate": 0.0001901873075544403, + "loss": 1.6775, + "step": 325 + }, + { + "epoch": 0.3286290322580645, + "grad_norm": 0.08012880384922028, + "learning_rate": 0.00019011701426971178, + "loss": 1.6213, + "step": 326 + }, + { + "epoch": 0.32963709677419356, + "grad_norm": 0.08508668839931488, + "learning_rate": 0.00019004648319502824, + "loss": 1.5809, + "step": 327 + }, + { + "epoch": 0.33064516129032256, + "grad_norm": 0.08622655272483826, + "learning_rate": 0.00018997571451649856, + "loss": 1.666, + "step": 328 + }, + { + "epoch": 0.3316532258064516, + "grad_norm": 0.09803669154644012, + "learning_rate": 0.00018990470842085867, + "loss": 1.6784, + "step": 329 + }, + { + "epoch": 0.3326612903225806, + "grad_norm": 0.08453961461782455, + "learning_rate": 0.0001898334650954709, + "loss": 1.6109, + "step": 330 + }, + { + "epoch": 0.3336693548387097, + "grad_norm": 0.07246208935976028, + "learning_rate": 0.00018976198472832364, + "loss": 1.6117, + "step": 331 + }, + { + "epoch": 0.3346774193548387, + "grad_norm": 0.08284757286310196, + "learning_rate": 0.00018969026750803063, + "loss": 1.6094, + "step": 332 + }, + { + "epoch": 0.33568548387096775, + "grad_norm": 0.08026500046253204, + "learning_rate": 0.00018961831362383067, + "loss": 1.6555, + "step": 333 + }, + { + "epoch": 0.33669354838709675, + "grad_norm": 0.08912428468465805, + "learning_rate": 0.00018954612326558707, + "loss": 1.6602, + "step": 334 + }, + { + "epoch": 0.3377016129032258, + "grad_norm": 0.08738451451063156, + "learning_rate": 0.00018947369662378704, + "loss": 1.6125, + "step": 335 + }, + { + "epoch": 0.3387096774193548, + "grad_norm": 0.07017836719751358, + "learning_rate": 0.00018940103388954133, + "loss": 1.6173, + "step": 336 + }, + { + "epoch": 0.3397177419354839, + "grad_norm": 0.08264176547527313, + "learning_rate": 0.00018932813525458363, + "loss": 1.6716, + "step": 337 + }, + { + "epoch": 0.3407258064516129, + "grad_norm": 0.08516332507133484, + "learning_rate": 0.00018925500091127007, + "loss": 1.6752, + "step": 338 + }, + { + "epoch": 0.34173387096774194, + "grad_norm": 0.07101423293352127, + "learning_rate": 0.00018918163105257883, + "loss": 1.6393, + "step": 339 + }, + { + "epoch": 0.34274193548387094, + "grad_norm": 0.07172892987728119, + "learning_rate": 0.00018910802587210942, + "loss": 1.6116, + "step": 340 + }, + { + "epoch": 0.34375, + "grad_norm": 0.07889813184738159, + "learning_rate": 0.0001890341855640824, + "loss": 1.6107, + "step": 341 + }, + { + "epoch": 0.34475806451612906, + "grad_norm": 0.07734905183315277, + "learning_rate": 0.0001889601103233387, + "loss": 1.6686, + "step": 342 + }, + { + "epoch": 0.34576612903225806, + "grad_norm": 0.09568161517381668, + "learning_rate": 0.00018888580034533915, + "loss": 1.6914, + "step": 343 + }, + { + "epoch": 0.3467741935483871, + "grad_norm": 0.0727929100394249, + "learning_rate": 0.000188811255826164, + "loss": 1.6271, + "step": 344 + }, + { + "epoch": 0.3477822580645161, + "grad_norm": 0.07241855561733246, + "learning_rate": 0.0001887364769625124, + "loss": 1.6514, + "step": 345 + }, + { + "epoch": 0.3487903225806452, + "grad_norm": 0.07215382158756256, + "learning_rate": 0.00018866146395170178, + "loss": 1.6578, + "step": 346 + }, + { + "epoch": 0.3497983870967742, + "grad_norm": 0.07429207116365433, + "learning_rate": 0.00018858621699166755, + "loss": 1.6176, + "step": 347 + }, + { + "epoch": 0.35080645161290325, + "grad_norm": 0.07516060024499893, + "learning_rate": 0.00018851073628096225, + "loss": 1.6735, + "step": 348 + }, + { + "epoch": 0.35181451612903225, + "grad_norm": 0.08864877372980118, + "learning_rate": 0.0001884350220187554, + "loss": 1.6044, + "step": 349 + }, + { + "epoch": 0.3528225806451613, + "grad_norm": 0.0749056488275528, + "learning_rate": 0.00018835907440483267, + "loss": 1.6316, + "step": 350 + }, + { + "epoch": 0.3538306451612903, + "grad_norm": 0.09181974828243256, + "learning_rate": 0.0001882828936395955, + "loss": 1.6834, + "step": 351 + }, + { + "epoch": 0.3548387096774194, + "grad_norm": 0.08013599365949631, + "learning_rate": 0.00018820647992406054, + "loss": 1.6367, + "step": 352 + }, + { + "epoch": 0.3558467741935484, + "grad_norm": 0.0809824988245964, + "learning_rate": 0.00018812983345985914, + "loss": 1.658, + "step": 353 + }, + { + "epoch": 0.35685483870967744, + "grad_norm": 0.1000952199101448, + "learning_rate": 0.0001880529544492368, + "loss": 1.6571, + "step": 354 + }, + { + "epoch": 0.35786290322580644, + "grad_norm": 0.074663445353508, + "learning_rate": 0.00018797584309505254, + "loss": 1.6358, + "step": 355 + }, + { + "epoch": 0.3588709677419355, + "grad_norm": 0.0898260623216629, + "learning_rate": 0.00018789849960077864, + "loss": 1.6496, + "step": 356 + }, + { + "epoch": 0.3598790322580645, + "grad_norm": 0.08878135681152344, + "learning_rate": 0.00018782092417049979, + "loss": 1.6819, + "step": 357 + }, + { + "epoch": 0.36088709677419356, + "grad_norm": 0.07256605476140976, + "learning_rate": 0.00018774311700891269, + "loss": 1.6521, + "step": 358 + }, + { + "epoch": 0.36189516129032256, + "grad_norm": 0.07939675450325012, + "learning_rate": 0.00018766507832132558, + "loss": 1.6898, + "step": 359 + }, + { + "epoch": 0.3629032258064516, + "grad_norm": 0.07508337497711182, + "learning_rate": 0.00018758680831365755, + "loss": 1.6204, + "step": 360 + }, + { + "epoch": 0.3639112903225806, + "grad_norm": 0.07679913192987442, + "learning_rate": 0.00018750830719243812, + "loss": 1.597, + "step": 361 + }, + { + "epoch": 0.3649193548387097, + "grad_norm": 0.07900839298963547, + "learning_rate": 0.00018742957516480657, + "loss": 1.6197, + "step": 362 + }, + { + "epoch": 0.3659274193548387, + "grad_norm": 0.08279551565647125, + "learning_rate": 0.00018735061243851158, + "loss": 1.7151, + "step": 363 + }, + { + "epoch": 0.36693548387096775, + "grad_norm": 0.10616319626569748, + "learning_rate": 0.00018727141922191047, + "loss": 1.7228, + "step": 364 + }, + { + "epoch": 0.36794354838709675, + "grad_norm": 0.08777708560228348, + "learning_rate": 0.00018719199572396882, + "loss": 1.6661, + "step": 365 + }, + { + "epoch": 0.3689516129032258, + "grad_norm": 0.0981433242559433, + "learning_rate": 0.00018711234215425978, + "loss": 1.6331, + "step": 366 + }, + { + "epoch": 0.3699596774193548, + "grad_norm": 0.07754123210906982, + "learning_rate": 0.00018703245872296365, + "loss": 1.6757, + "step": 367 + }, + { + "epoch": 0.3709677419354839, + "grad_norm": 0.09494742751121521, + "learning_rate": 0.00018695234564086724, + "loss": 1.6565, + "step": 368 + }, + { + "epoch": 0.3719758064516129, + "grad_norm": 0.100984126329422, + "learning_rate": 0.00018687200311936328, + "loss": 1.6879, + "step": 369 + }, + { + "epoch": 0.37298387096774194, + "grad_norm": 0.08996261656284332, + "learning_rate": 0.00018679143137045006, + "loss": 1.6579, + "step": 370 + }, + { + "epoch": 0.37399193548387094, + "grad_norm": 0.0966666117310524, + "learning_rate": 0.00018671063060673055, + "loss": 1.5853, + "step": 371 + }, + { + "epoch": 0.375, + "grad_norm": 0.07991211116313934, + "learning_rate": 0.00018662960104141215, + "loss": 1.6355, + "step": 372 + }, + { + "epoch": 0.37600806451612906, + "grad_norm": 0.09592580795288086, + "learning_rate": 0.00018654834288830591, + "loss": 1.6172, + "step": 373 + }, + { + "epoch": 0.37701612903225806, + "grad_norm": 0.07976924628019333, + "learning_rate": 0.00018646685636182614, + "loss": 1.641, + "step": 374 + }, + { + "epoch": 0.3780241935483871, + "grad_norm": 0.08822676539421082, + "learning_rate": 0.00018638514167698965, + "loss": 1.6267, + "step": 375 + }, + { + "epoch": 0.3790322580645161, + "grad_norm": 0.07680735737085342, + "learning_rate": 0.00018630319904941535, + "loss": 1.6484, + "step": 376 + }, + { + "epoch": 0.3800403225806452, + "grad_norm": 0.09095903486013412, + "learning_rate": 0.0001862210286953236, + "loss": 1.6041, + "step": 377 + }, + { + "epoch": 0.3810483870967742, + "grad_norm": 0.07204829901456833, + "learning_rate": 0.0001861386308315357, + "loss": 1.6058, + "step": 378 + }, + { + "epoch": 0.38205645161290325, + "grad_norm": 0.12447134405374527, + "learning_rate": 0.00018605600567547318, + "loss": 1.6528, + "step": 379 + }, + { + "epoch": 0.38306451612903225, + "grad_norm": 0.08234449476003647, + "learning_rate": 0.00018597315344515744, + "loss": 1.6408, + "step": 380 + }, + { + "epoch": 0.3840725806451613, + "grad_norm": 0.0997692123055458, + "learning_rate": 0.00018589007435920892, + "loss": 1.631, + "step": 381 + }, + { + "epoch": 0.3850806451612903, + "grad_norm": 0.10275771468877792, + "learning_rate": 0.0001858067686368468, + "loss": 1.6979, + "step": 382 + }, + { + "epoch": 0.3860887096774194, + "grad_norm": 0.07703027874231339, + "learning_rate": 0.00018572323649788822, + "loss": 1.6037, + "step": 383 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 0.08485141396522522, + "learning_rate": 0.0001856394781627477, + "loss": 1.6027, + "step": 384 + }, + { + "epoch": 0.38810483870967744, + "grad_norm": 0.09312494099140167, + "learning_rate": 0.00018555549385243674, + "loss": 1.6757, + "step": 385 + }, + { + "epoch": 0.38911290322580644, + "grad_norm": 0.09300917387008667, + "learning_rate": 0.000185471283788563, + "loss": 1.6615, + "step": 386 + }, + { + "epoch": 0.3901209677419355, + "grad_norm": 0.07911553978919983, + "learning_rate": 0.0001853868481933299, + "loss": 1.6214, + "step": 387 + }, + { + "epoch": 0.3911290322580645, + "grad_norm": 0.07960621267557144, + "learning_rate": 0.00018530218728953597, + "loss": 1.6709, + "step": 388 + }, + { + "epoch": 0.39213709677419356, + "grad_norm": 0.0723830983042717, + "learning_rate": 0.0001852173013005742, + "loss": 1.6287, + "step": 389 + }, + { + "epoch": 0.39314516129032256, + "grad_norm": 0.08178212493658066, + "learning_rate": 0.00018513219045043156, + "loss": 1.5888, + "step": 390 + }, + { + "epoch": 0.3941532258064516, + "grad_norm": 0.07604778558015823, + "learning_rate": 0.00018504685496368838, + "loss": 1.6097, + "step": 391 + }, + { + "epoch": 0.3951612903225806, + "grad_norm": 0.07833520323038101, + "learning_rate": 0.00018496129506551763, + "loss": 1.6119, + "step": 392 + }, + { + "epoch": 0.3961693548387097, + "grad_norm": 0.0738687738776207, + "learning_rate": 0.00018487551098168452, + "loss": 1.646, + "step": 393 + }, + { + "epoch": 0.3971774193548387, + "grad_norm": 0.08156421035528183, + "learning_rate": 0.0001847895029385458, + "loss": 1.612, + "step": 394 + }, + { + "epoch": 0.39818548387096775, + "grad_norm": 0.0760064423084259, + "learning_rate": 0.00018470327116304916, + "loss": 1.6556, + "step": 395 + }, + { + "epoch": 0.39919354838709675, + "grad_norm": 0.07635514438152313, + "learning_rate": 0.0001846168158827326, + "loss": 1.5948, + "step": 396 + }, + { + "epoch": 0.4002016129032258, + "grad_norm": 0.07415641099214554, + "learning_rate": 0.00018453013732572403, + "loss": 1.6379, + "step": 397 + }, + { + "epoch": 0.4012096774193548, + "grad_norm": 0.07627629488706589, + "learning_rate": 0.00018444323572074035, + "loss": 1.6067, + "step": 398 + }, + { + "epoch": 0.4022177419354839, + "grad_norm": 0.08279147744178772, + "learning_rate": 0.00018435611129708713, + "loss": 1.6152, + "step": 399 + }, + { + "epoch": 0.4032258064516129, + "grad_norm": 0.07391797006130219, + "learning_rate": 0.00018426876428465777, + "loss": 1.6568, + "step": 400 + }, + { + "epoch": 0.40423387096774194, + "grad_norm": 0.07815629243850708, + "learning_rate": 0.00018418119491393312, + "loss": 1.6301, + "step": 401 + }, + { + "epoch": 0.40524193548387094, + "grad_norm": 0.07491758465766907, + "learning_rate": 0.0001840934034159807, + "loss": 1.6668, + "step": 402 + }, + { + "epoch": 0.40625, + "grad_norm": 0.07878877222537994, + "learning_rate": 0.0001840053900224542, + "loss": 1.6305, + "step": 403 + }, + { + "epoch": 0.40725806451612906, + "grad_norm": 0.07592154294252396, + "learning_rate": 0.00018391715496559273, + "loss": 1.6853, + "step": 404 + }, + { + "epoch": 0.40826612903225806, + "grad_norm": 0.082845039665699, + "learning_rate": 0.00018382869847822044, + "loss": 1.6918, + "step": 405 + }, + { + "epoch": 0.4092741935483871, + "grad_norm": 0.07842651754617691, + "learning_rate": 0.00018374002079374569, + "loss": 1.65, + "step": 406 + }, + { + "epoch": 0.4102822580645161, + "grad_norm": 0.07326355576515198, + "learning_rate": 0.0001836511221461604, + "loss": 1.6157, + "step": 407 + }, + { + "epoch": 0.4112903225806452, + "grad_norm": 0.08537916839122772, + "learning_rate": 0.00018356200277003975, + "loss": 1.5959, + "step": 408 + }, + { + "epoch": 0.4122983870967742, + "grad_norm": 0.09612290561199188, + "learning_rate": 0.00018347266290054116, + "loss": 1.6876, + "step": 409 + }, + { + "epoch": 0.41330645161290325, + "grad_norm": 0.07688483595848083, + "learning_rate": 0.00018338310277340406, + "loss": 1.6094, + "step": 410 + }, + { + "epoch": 0.41431451612903225, + "grad_norm": 0.09224136173725128, + "learning_rate": 0.00018329332262494887, + "loss": 1.616, + "step": 411 + }, + { + "epoch": 0.4153225806451613, + "grad_norm": 0.09629214555025101, + "learning_rate": 0.00018320332269207667, + "loss": 1.6197, + "step": 412 + }, + { + "epoch": 0.4163306451612903, + "grad_norm": 0.0956406518816948, + "learning_rate": 0.00018311310321226853, + "loss": 1.6939, + "step": 413 + }, + { + "epoch": 0.4173387096774194, + "grad_norm": 0.11505012959241867, + "learning_rate": 0.00018302266442358472, + "loss": 1.6692, + "step": 414 + }, + { + "epoch": 0.4183467741935484, + "grad_norm": 0.08150719106197357, + "learning_rate": 0.0001829320065646643, + "loss": 1.6428, + "step": 415 + }, + { + "epoch": 0.41935483870967744, + "grad_norm": 0.10705471783876419, + "learning_rate": 0.0001828411298747243, + "loss": 1.7328, + "step": 416 + }, + { + "epoch": 0.42036290322580644, + "grad_norm": 0.10280334204435349, + "learning_rate": 0.00018275003459355924, + "loss": 1.6245, + "step": 417 + }, + { + "epoch": 0.4213709677419355, + "grad_norm": 0.07620084285736084, + "learning_rate": 0.00018265872096154043, + "loss": 1.6317, + "step": 418 + }, + { + "epoch": 0.4223790322580645, + "grad_norm": 0.09292726963758469, + "learning_rate": 0.00018256718921961525, + "loss": 1.6555, + "step": 419 + }, + { + "epoch": 0.42338709677419356, + "grad_norm": 0.07884904742240906, + "learning_rate": 0.00018247543960930672, + "loss": 1.6325, + "step": 420 + }, + { + "epoch": 0.42439516129032256, + "grad_norm": 0.1114020049571991, + "learning_rate": 0.00018238347237271266, + "loss": 1.6861, + "step": 421 + }, + { + "epoch": 0.4254032258064516, + "grad_norm": 0.08363789319992065, + "learning_rate": 0.00018229128775250523, + "loss": 1.6398, + "step": 422 + }, + { + "epoch": 0.4264112903225806, + "grad_norm": 0.10317594558000565, + "learning_rate": 0.00018219888599193008, + "loss": 1.5966, + "step": 423 + }, + { + "epoch": 0.4274193548387097, + "grad_norm": 0.09324808418750763, + "learning_rate": 0.00018210626733480593, + "loss": 1.6463, + "step": 424 + }, + { + "epoch": 0.4284274193548387, + "grad_norm": 0.0866997167468071, + "learning_rate": 0.00018201343202552367, + "loss": 1.5802, + "step": 425 + }, + { + "epoch": 0.42943548387096775, + "grad_norm": 0.09528562426567078, + "learning_rate": 0.00018192038030904608, + "loss": 1.6768, + "step": 426 + }, + { + "epoch": 0.43044354838709675, + "grad_norm": 0.08449150621891022, + "learning_rate": 0.00018182711243090678, + "loss": 1.6323, + "step": 427 + }, + { + "epoch": 0.4314516129032258, + "grad_norm": 0.07713552564382553, + "learning_rate": 0.00018173362863720986, + "loss": 1.6264, + "step": 428 + }, + { + "epoch": 0.4324596774193548, + "grad_norm": 0.08549489825963974, + "learning_rate": 0.00018163992917462918, + "loss": 1.6628, + "step": 429 + }, + { + "epoch": 0.4334677419354839, + "grad_norm": 0.07783807069063187, + "learning_rate": 0.00018154601429040757, + "loss": 1.6892, + "step": 430 + }, + { + "epoch": 0.4344758064516129, + "grad_norm": 0.09653409570455551, + "learning_rate": 0.00018145188423235634, + "loss": 1.6651, + "step": 431 + }, + { + "epoch": 0.43548387096774194, + "grad_norm": 0.08650687336921692, + "learning_rate": 0.00018135753924885465, + "loss": 1.6113, + "step": 432 + }, + { + "epoch": 0.43649193548387094, + "grad_norm": 0.08643219619989395, + "learning_rate": 0.00018126297958884866, + "loss": 1.6111, + "step": 433 + }, + { + "epoch": 0.4375, + "grad_norm": 0.08586744964122772, + "learning_rate": 0.00018116820550185107, + "loss": 1.643, + "step": 434 + }, + { + "epoch": 0.43850806451612906, + "grad_norm": 0.09063699096441269, + "learning_rate": 0.00018107321723794036, + "loss": 1.6422, + "step": 435 + }, + { + "epoch": 0.43951612903225806, + "grad_norm": 0.07849163562059402, + "learning_rate": 0.00018097801504776012, + "loss": 1.6183, + "step": 436 + }, + { + "epoch": 0.4405241935483871, + "grad_norm": 0.07795203477144241, + "learning_rate": 0.00018088259918251846, + "loss": 1.6267, + "step": 437 + }, + { + "epoch": 0.4415322580645161, + "grad_norm": 0.08508776873350143, + "learning_rate": 0.00018078696989398734, + "loss": 1.6581, + "step": 438 + }, + { + "epoch": 0.4425403225806452, + "grad_norm": 0.08001305162906647, + "learning_rate": 0.00018069112743450183, + "loss": 1.6287, + "step": 439 + }, + { + "epoch": 0.4435483870967742, + "grad_norm": 0.07482777535915375, + "learning_rate": 0.0001805950720569595, + "loss": 1.6426, + "step": 440 + }, + { + "epoch": 0.44455645161290325, + "grad_norm": 0.07578035444021225, + "learning_rate": 0.00018049880401481972, + "loss": 1.6294, + "step": 441 + }, + { + "epoch": 0.44556451612903225, + "grad_norm": 0.07782859355211258, + "learning_rate": 0.00018040232356210308, + "loss": 1.5935, + "step": 442 + }, + { + "epoch": 0.4465725806451613, + "grad_norm": 0.07492804527282715, + "learning_rate": 0.00018030563095339062, + "loss": 1.5769, + "step": 443 + }, + { + "epoch": 0.4475806451612903, + "grad_norm": 0.07825621962547302, + "learning_rate": 0.00018020872644382313, + "loss": 1.5786, + "step": 444 + }, + { + "epoch": 0.4485887096774194, + "grad_norm": 0.09208081662654877, + "learning_rate": 0.0001801116102891006, + "loss": 1.6649, + "step": 445 + }, + { + "epoch": 0.4495967741935484, + "grad_norm": 0.07900070399045944, + "learning_rate": 0.00018001428274548156, + "loss": 1.6529, + "step": 446 + }, + { + "epoch": 0.45060483870967744, + "grad_norm": 0.07847368717193604, + "learning_rate": 0.00017991674406978215, + "loss": 1.6133, + "step": 447 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 0.0754162147641182, + "learning_rate": 0.00017981899451937573, + "loss": 1.6478, + "step": 448 + }, + { + "epoch": 0.4526209677419355, + "grad_norm": 0.08314093947410583, + "learning_rate": 0.0001797210343521921, + "loss": 1.5926, + "step": 449 + }, + { + "epoch": 0.4536290322580645, + "grad_norm": 0.07506029307842255, + "learning_rate": 0.00017962286382671678, + "loss": 1.6031, + "step": 450 + }, + { + "epoch": 0.45463709677419356, + "grad_norm": 0.09021966904401779, + "learning_rate": 0.00017952448320199035, + "loss": 1.5805, + "step": 451 + }, + { + "epoch": 0.45564516129032256, + "grad_norm": 0.07435688376426697, + "learning_rate": 0.00017942589273760783, + "loss": 1.6291, + "step": 452 + }, + { + "epoch": 0.4566532258064516, + "grad_norm": 0.07785916328430176, + "learning_rate": 0.00017932709269371784, + "loss": 1.6525, + "step": 453 + }, + { + "epoch": 0.4576612903225806, + "grad_norm": 0.07916136831045151, + "learning_rate": 0.00017922808333102207, + "loss": 1.6301, + "step": 454 + }, + { + "epoch": 0.4586693548387097, + "grad_norm": 0.08399738371372223, + "learning_rate": 0.00017912886491077462, + "loss": 1.6915, + "step": 455 + }, + { + "epoch": 0.4596774193548387, + "grad_norm": 0.08618689328432083, + "learning_rate": 0.000179029437694781, + "loss": 1.6718, + "step": 456 + }, + { + "epoch": 0.46068548387096775, + "grad_norm": 0.07570008933544159, + "learning_rate": 0.00017892980194539798, + "loss": 1.6588, + "step": 457 + }, + { + "epoch": 0.46169354838709675, + "grad_norm": 0.09821120649576187, + "learning_rate": 0.00017882995792553228, + "loss": 1.6914, + "step": 458 + }, + { + "epoch": 0.4627016129032258, + "grad_norm": 0.07994726300239563, + "learning_rate": 0.00017872990589864034, + "loss": 1.6077, + "step": 459 + }, + { + "epoch": 0.4637096774193548, + "grad_norm": 0.08893134444952011, + "learning_rate": 0.00017862964612872748, + "loss": 1.6447, + "step": 460 + }, + { + "epoch": 0.4647177419354839, + "grad_norm": 0.08347106724977493, + "learning_rate": 0.00017852917888034706, + "loss": 1.6501, + "step": 461 + }, + { + "epoch": 0.4657258064516129, + "grad_norm": 0.07879969477653503, + "learning_rate": 0.00017842850441860005, + "loss": 1.643, + "step": 462 + }, + { + "epoch": 0.46673387096774194, + "grad_norm": 0.08305401355028152, + "learning_rate": 0.00017832762300913413, + "loss": 1.677, + "step": 463 + }, + { + "epoch": 0.46774193548387094, + "grad_norm": 0.0827251598238945, + "learning_rate": 0.00017822653491814304, + "loss": 1.6432, + "step": 464 + }, + { + "epoch": 0.46875, + "grad_norm": 0.08472172170877457, + "learning_rate": 0.00017812524041236586, + "loss": 1.654, + "step": 465 + }, + { + "epoch": 0.46975806451612906, + "grad_norm": 0.07689754664897919, + "learning_rate": 0.0001780237397590864, + "loss": 1.5642, + "step": 466 + }, + { + "epoch": 0.47076612903225806, + "grad_norm": 0.10658534616231918, + "learning_rate": 0.00017792203322613236, + "loss": 1.6561, + "step": 467 + }, + { + "epoch": 0.4717741935483871, + "grad_norm": 0.08347711712121964, + "learning_rate": 0.0001778201210818748, + "loss": 1.6595, + "step": 468 + }, + { + "epoch": 0.4727822580645161, + "grad_norm": 0.08595866709947586, + "learning_rate": 0.0001777180035952272, + "loss": 1.6185, + "step": 469 + }, + { + "epoch": 0.4737903225806452, + "grad_norm": 0.08824612945318222, + "learning_rate": 0.00017761568103564487, + "loss": 1.6779, + "step": 470 + }, + { + "epoch": 0.4747983870967742, + "grad_norm": 0.07452390342950821, + "learning_rate": 0.0001775131536731244, + "loss": 1.6252, + "step": 471 + }, + { + "epoch": 0.47580645161290325, + "grad_norm": 0.09783647954463959, + "learning_rate": 0.00017741042177820258, + "loss": 1.6417, + "step": 472 + }, + { + "epoch": 0.47681451612903225, + "grad_norm": 0.07527977973222733, + "learning_rate": 0.0001773074856219561, + "loss": 1.6128, + "step": 473 + }, + { + "epoch": 0.4778225806451613, + "grad_norm": 0.07836946099996567, + "learning_rate": 0.00017720434547600043, + "loss": 1.625, + "step": 474 + }, + { + "epoch": 0.4788306451612903, + "grad_norm": 0.07427874952554703, + "learning_rate": 0.00017710100161248945, + "loss": 1.6261, + "step": 475 + }, + { + "epoch": 0.4798387096774194, + "grad_norm": 0.09168553352355957, + "learning_rate": 0.0001769974543041145, + "loss": 1.702, + "step": 476 + }, + { + "epoch": 0.4808467741935484, + "grad_norm": 0.0791415199637413, + "learning_rate": 0.00017689370382410386, + "loss": 1.6129, + "step": 477 + }, + { + "epoch": 0.48185483870967744, + "grad_norm": 0.07638856768608093, + "learning_rate": 0.00017678975044622174, + "loss": 1.593, + "step": 478 + }, + { + "epoch": 0.48286290322580644, + "grad_norm": 0.08905162662267685, + "learning_rate": 0.00017668559444476793, + "loss": 1.6803, + "step": 479 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.08039755374193192, + "learning_rate": 0.00017658123609457668, + "loss": 1.6624, + "step": 480 + }, + { + "epoch": 0.4848790322580645, + "grad_norm": 0.07831753045320511, + "learning_rate": 0.00017647667567101632, + "loss": 1.6602, + "step": 481 + }, + { + "epoch": 0.48588709677419356, + "grad_norm": 0.07645969092845917, + "learning_rate": 0.00017637191344998837, + "loss": 1.6462, + "step": 482 + }, + { + "epoch": 0.48689516129032256, + "grad_norm": 0.0790887251496315, + "learning_rate": 0.00017626694970792673, + "loss": 1.581, + "step": 483 + }, + { + "epoch": 0.4879032258064516, + "grad_norm": 0.07644886523485184, + "learning_rate": 0.00017616178472179715, + "loss": 1.6035, + "step": 484 + }, + { + "epoch": 0.4889112903225806, + "grad_norm": 0.08160758763551712, + "learning_rate": 0.0001760564187690964, + "loss": 1.6169, + "step": 485 + }, + { + "epoch": 0.4899193548387097, + "grad_norm": 0.09234445542097092, + "learning_rate": 0.00017595085212785146, + "loss": 1.5878, + "step": 486 + }, + { + "epoch": 0.4909274193548387, + "grad_norm": 0.09042947739362717, + "learning_rate": 0.0001758450850766189, + "loss": 1.6629, + "step": 487 + }, + { + "epoch": 0.49193548387096775, + "grad_norm": 0.08583879470825195, + "learning_rate": 0.00017573911789448414, + "loss": 1.6398, + "step": 488 + }, + { + "epoch": 0.49294354838709675, + "grad_norm": 0.07878076285123825, + "learning_rate": 0.00017563295086106063, + "loss": 1.64, + "step": 489 + }, + { + "epoch": 0.4939516129032258, + "grad_norm": 0.08849604427814484, + "learning_rate": 0.00017552658425648923, + "loss": 1.6015, + "step": 490 + }, + { + "epoch": 0.4949596774193548, + "grad_norm": 0.07961837202310562, + "learning_rate": 0.00017542001836143731, + "loss": 1.6392, + "step": 491 + }, + { + "epoch": 0.4959677419354839, + "grad_norm": 0.08883430808782578, + "learning_rate": 0.00017531325345709816, + "loss": 1.6417, + "step": 492 + }, + { + "epoch": 0.4969758064516129, + "grad_norm": 0.07420235127210617, + "learning_rate": 0.00017520628982519023, + "loss": 1.635, + "step": 493 + }, + { + "epoch": 0.49798387096774194, + "grad_norm": 0.08477555215358734, + "learning_rate": 0.0001750991277479563, + "loss": 1.6264, + "step": 494 + }, + { + "epoch": 0.49899193548387094, + "grad_norm": 0.07410185784101486, + "learning_rate": 0.00017499176750816276, + "loss": 1.6414, + "step": 495 + }, + { + "epoch": 0.5, + "grad_norm": 0.08427213877439499, + "learning_rate": 0.00017488420938909893, + "loss": 1.6546, + "step": 496 + }, + { + "epoch": 0.501008064516129, + "grad_norm": 0.0739702582359314, + "learning_rate": 0.00017477645367457628, + "loss": 1.6316, + "step": 497 + }, + { + "epoch": 0.5020161290322581, + "grad_norm": 0.08044146001338959, + "learning_rate": 0.00017466850064892762, + "loss": 1.6256, + "step": 498 + }, + { + "epoch": 0.5030241935483871, + "grad_norm": 0.08690078556537628, + "learning_rate": 0.0001745603505970064, + "loss": 1.589, + "step": 499 + }, + { + "epoch": 0.5040322580645161, + "grad_norm": 0.07842793315649033, + "learning_rate": 0.00017445200380418607, + "loss": 1.6352, + "step": 500 + }, + { + "epoch": 0.5050403225806451, + "grad_norm": 0.08214239776134491, + "learning_rate": 0.00017434346055635912, + "loss": 1.6244, + "step": 501 + }, + { + "epoch": 0.5060483870967742, + "grad_norm": 0.07770374417304993, + "learning_rate": 0.00017423472113993634, + "loss": 1.65, + "step": 502 + }, + { + "epoch": 0.5070564516129032, + "grad_norm": 0.08378950506448746, + "learning_rate": 0.00017412578584184637, + "loss": 1.6129, + "step": 503 + }, + { + "epoch": 0.5080645161290323, + "grad_norm": 0.07839113473892212, + "learning_rate": 0.00017401665494953453, + "loss": 1.6479, + "step": 504 + }, + { + "epoch": 0.5090725806451613, + "grad_norm": 0.0775337815284729, + "learning_rate": 0.00017390732875096227, + "loss": 1.6005, + "step": 505 + }, + { + "epoch": 0.5100806451612904, + "grad_norm": 0.08532094955444336, + "learning_rate": 0.00017379780753460654, + "loss": 1.6669, + "step": 506 + }, + { + "epoch": 0.5110887096774194, + "grad_norm": 0.07484716176986694, + "learning_rate": 0.00017368809158945872, + "loss": 1.6786, + "step": 507 + }, + { + "epoch": 0.5120967741935484, + "grad_norm": 0.08861152827739716, + "learning_rate": 0.00017357818120502402, + "loss": 1.6753, + "step": 508 + }, + { + "epoch": 0.5131048387096774, + "grad_norm": 0.08586420863866806, + "learning_rate": 0.00017346807667132085, + "loss": 1.6483, + "step": 509 + }, + { + "epoch": 0.5141129032258065, + "grad_norm": 0.08970779180526733, + "learning_rate": 0.00017335777827887978, + "loss": 1.6776, + "step": 510 + }, + { + "epoch": 0.5151209677419355, + "grad_norm": 0.08755983412265778, + "learning_rate": 0.00017324728631874298, + "loss": 1.6666, + "step": 511 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 0.08634518831968307, + "learning_rate": 0.00017313660108246337, + "loss": 1.6195, + "step": 512 + }, + { + "epoch": 0.5171370967741935, + "grad_norm": 0.08298657834529877, + "learning_rate": 0.00017302572286210382, + "loss": 1.5564, + "step": 513 + }, + { + "epoch": 0.5181451612903226, + "grad_norm": 0.07834544777870178, + "learning_rate": 0.00017291465195023653, + "loss": 1.6109, + "step": 514 + }, + { + "epoch": 0.5191532258064516, + "grad_norm": 0.09181385487318039, + "learning_rate": 0.000172803388639942, + "loss": 1.6387, + "step": 515 + }, + { + "epoch": 0.5201612903225806, + "grad_norm": 0.07698329538106918, + "learning_rate": 0.00017269193322480856, + "loss": 1.6223, + "step": 516 + }, + { + "epoch": 0.5211693548387096, + "grad_norm": 0.10118810087442398, + "learning_rate": 0.00017258028599893136, + "loss": 1.6365, + "step": 517 + }, + { + "epoch": 0.5221774193548387, + "grad_norm": 0.08565083891153336, + "learning_rate": 0.00017246844725691166, + "loss": 1.5905, + "step": 518 + }, + { + "epoch": 0.5231854838709677, + "grad_norm": 0.08563411980867386, + "learning_rate": 0.00017235641729385615, + "loss": 1.6141, + "step": 519 + }, + { + "epoch": 0.5241935483870968, + "grad_norm": 0.07669138163328171, + "learning_rate": 0.00017224419640537598, + "loss": 1.6278, + "step": 520 + }, + { + "epoch": 0.5252016129032258, + "grad_norm": 0.09773047268390656, + "learning_rate": 0.00017213178488758622, + "loss": 1.7324, + "step": 521 + }, + { + "epoch": 0.5262096774193549, + "grad_norm": 0.07799120247364044, + "learning_rate": 0.00017201918303710482, + "loss": 1.5967, + "step": 522 + }, + { + "epoch": 0.5272177419354839, + "grad_norm": 0.0810832753777504, + "learning_rate": 0.0001719063911510521, + "loss": 1.6204, + "step": 523 + }, + { + "epoch": 0.5282258064516129, + "grad_norm": 0.08055137097835541, + "learning_rate": 0.0001717934095270497, + "loss": 1.6138, + "step": 524 + }, + { + "epoch": 0.5292338709677419, + "grad_norm": 0.08200159668922424, + "learning_rate": 0.0001716802384632199, + "loss": 1.6211, + "step": 525 + }, + { + "epoch": 0.530241935483871, + "grad_norm": 0.0793243944644928, + "learning_rate": 0.00017156687825818504, + "loss": 1.579, + "step": 526 + }, + { + "epoch": 0.53125, + "grad_norm": 0.08332548290491104, + "learning_rate": 0.00017145332921106633, + "loss": 1.5874, + "step": 527 + }, + { + "epoch": 0.532258064516129, + "grad_norm": 0.07582446932792664, + "learning_rate": 0.00017133959162148336, + "loss": 1.5871, + "step": 528 + }, + { + "epoch": 0.5332661290322581, + "grad_norm": 0.0803590714931488, + "learning_rate": 0.00017122566578955324, + "loss": 1.6451, + "step": 529 + }, + { + "epoch": 0.5342741935483871, + "grad_norm": 0.07705288380384445, + "learning_rate": 0.00017111155201588978, + "loss": 1.5892, + "step": 530 + }, + { + "epoch": 0.5352822580645161, + "grad_norm": 0.08003994822502136, + "learning_rate": 0.0001709972506016027, + "loss": 1.6701, + "step": 531 + }, + { + "epoch": 0.5362903225806451, + "grad_norm": 0.07644215226173401, + "learning_rate": 0.00017088276184829685, + "loss": 1.6271, + "step": 532 + }, + { + "epoch": 0.5372983870967742, + "grad_norm": 0.08193427324295044, + "learning_rate": 0.00017076808605807138, + "loss": 1.5906, + "step": 533 + }, + { + "epoch": 0.5383064516129032, + "grad_norm": 0.08339913934469223, + "learning_rate": 0.00017065322353351903, + "loss": 1.6452, + "step": 534 + }, + { + "epoch": 0.5393145161290323, + "grad_norm": 0.08375068008899689, + "learning_rate": 0.0001705381745777252, + "loss": 1.6573, + "step": 535 + }, + { + "epoch": 0.5403225806451613, + "grad_norm": 0.07980147749185562, + "learning_rate": 0.00017042293949426726, + "loss": 1.5999, + "step": 536 + }, + { + "epoch": 0.5413306451612904, + "grad_norm": 0.07945246994495392, + "learning_rate": 0.00017030751858721375, + "loss": 1.6372, + "step": 537 + }, + { + "epoch": 0.5423387096774194, + "grad_norm": 0.07931476086378098, + "learning_rate": 0.00017019191216112342, + "loss": 1.6244, + "step": 538 + }, + { + "epoch": 0.5433467741935484, + "grad_norm": 0.07984746247529984, + "learning_rate": 0.00017007612052104474, + "loss": 1.5592, + "step": 539 + }, + { + "epoch": 0.5443548387096774, + "grad_norm": 0.09376467764377594, + "learning_rate": 0.00016996014397251466, + "loss": 1.6774, + "step": 540 + }, + { + "epoch": 0.5453629032258065, + "grad_norm": 0.08642607182264328, + "learning_rate": 0.00016984398282155825, + "loss": 1.6101, + "step": 541 + }, + { + "epoch": 0.5463709677419355, + "grad_norm": 0.07891902327537537, + "learning_rate": 0.00016972763737468758, + "loss": 1.6109, + "step": 542 + }, + { + "epoch": 0.5473790322580645, + "grad_norm": 0.07893992215394974, + "learning_rate": 0.00016961110793890108, + "loss": 1.643, + "step": 543 + }, + { + "epoch": 0.5483870967741935, + "grad_norm": 0.08107249438762665, + "learning_rate": 0.00016949439482168255, + "loss": 1.6093, + "step": 544 + }, + { + "epoch": 0.5493951612903226, + "grad_norm": 0.08450604975223541, + "learning_rate": 0.00016937749833100064, + "loss": 1.6406, + "step": 545 + }, + { + "epoch": 0.5504032258064516, + "grad_norm": 0.08088622242212296, + "learning_rate": 0.0001692604187753077, + "loss": 1.6293, + "step": 546 + }, + { + "epoch": 0.5514112903225806, + "grad_norm": 0.09227669984102249, + "learning_rate": 0.0001691431564635392, + "loss": 1.6022, + "step": 547 + }, + { + "epoch": 0.5524193548387096, + "grad_norm": 0.08562039583921432, + "learning_rate": 0.00016902571170511292, + "loss": 1.6341, + "step": 548 + }, + { + "epoch": 0.5534274193548387, + "grad_norm": 0.09240545332431793, + "learning_rate": 0.0001689080848099279, + "loss": 1.643, + "step": 549 + }, + { + "epoch": 0.5544354838709677, + "grad_norm": 0.09082893282175064, + "learning_rate": 0.00016879027608836394, + "loss": 1.6132, + "step": 550 + }, + { + "epoch": 0.5554435483870968, + "grad_norm": 0.08730785548686981, + "learning_rate": 0.00016867228585128047, + "loss": 1.631, + "step": 551 + }, + { + "epoch": 0.5564516129032258, + "grad_norm": 0.08937687426805496, + "learning_rate": 0.000168554114410016, + "loss": 1.7034, + "step": 552 + }, + { + "epoch": 0.5574596774193549, + "grad_norm": 0.07652641087770462, + "learning_rate": 0.0001684357620763872, + "loss": 1.6019, + "step": 553 + }, + { + "epoch": 0.5584677419354839, + "grad_norm": 0.08145558089017868, + "learning_rate": 0.00016831722916268787, + "loss": 1.6705, + "step": 554 + }, + { + "epoch": 0.5594758064516129, + "grad_norm": 0.09578656405210495, + "learning_rate": 0.0001681985159816885, + "loss": 1.6889, + "step": 555 + }, + { + "epoch": 0.5604838709677419, + "grad_norm": 0.085781030356884, + "learning_rate": 0.00016807962284663518, + "loss": 1.6362, + "step": 556 + }, + { + "epoch": 0.561491935483871, + "grad_norm": 0.07998887449502945, + "learning_rate": 0.0001679605500712488, + "loss": 1.6045, + "step": 557 + }, + { + "epoch": 0.5625, + "grad_norm": 0.09279566258192062, + "learning_rate": 0.00016784129796972431, + "loss": 1.5786, + "step": 558 + }, + { + "epoch": 0.563508064516129, + "grad_norm": 0.08150017261505127, + "learning_rate": 0.0001677218668567299, + "loss": 1.6313, + "step": 559 + }, + { + "epoch": 0.5645161290322581, + "grad_norm": 0.08562348783016205, + "learning_rate": 0.00016760225704740594, + "loss": 1.6047, + "step": 560 + }, + { + "epoch": 0.5655241935483871, + "grad_norm": 0.09371492266654968, + "learning_rate": 0.00016748246885736452, + "loss": 1.6599, + "step": 561 + }, + { + "epoch": 0.5665322580645161, + "grad_norm": 0.08150923997163773, + "learning_rate": 0.00016736250260268828, + "loss": 1.6556, + "step": 562 + }, + { + "epoch": 0.5675403225806451, + "grad_norm": 0.08109602332115173, + "learning_rate": 0.0001672423585999298, + "loss": 1.6143, + "step": 563 + }, + { + "epoch": 0.5685483870967742, + "grad_norm": 0.07796693593263626, + "learning_rate": 0.0001671220371661106, + "loss": 1.6046, + "step": 564 + }, + { + "epoch": 0.5695564516129032, + "grad_norm": 0.08694635331630707, + "learning_rate": 0.0001670015386187205, + "loss": 1.6564, + "step": 565 + }, + { + "epoch": 0.5705645161290323, + "grad_norm": 0.08142531663179398, + "learning_rate": 0.00016688086327571648, + "loss": 1.6406, + "step": 566 + }, + { + "epoch": 0.5715725806451613, + "grad_norm": 0.07907096296548843, + "learning_rate": 0.00016676001145552228, + "loss": 1.5948, + "step": 567 + }, + { + "epoch": 0.5725806451612904, + "grad_norm": 0.08147318661212921, + "learning_rate": 0.0001666389834770271, + "loss": 1.5789, + "step": 568 + }, + { + "epoch": 0.5735887096774194, + "grad_norm": 0.08041603118181229, + "learning_rate": 0.00016651777965958503, + "loss": 1.6229, + "step": 569 + }, + { + "epoch": 0.5745967741935484, + "grad_norm": 0.07601971924304962, + "learning_rate": 0.00016639640032301413, + "loss": 1.5722, + "step": 570 + }, + { + "epoch": 0.5756048387096774, + "grad_norm": 0.08111369609832764, + "learning_rate": 0.0001662748457875957, + "loss": 1.6485, + "step": 571 + }, + { + "epoch": 0.5766129032258065, + "grad_norm": 0.07956349104642868, + "learning_rate": 0.00016615311637407316, + "loss": 1.6118, + "step": 572 + }, + { + "epoch": 0.5776209677419355, + "grad_norm": 0.08260063081979752, + "learning_rate": 0.00016603121240365152, + "loss": 1.6618, + "step": 573 + }, + { + "epoch": 0.5786290322580645, + "grad_norm": 0.077680803835392, + "learning_rate": 0.00016590913419799633, + "loss": 1.6316, + "step": 574 + }, + { + "epoch": 0.5796370967741935, + "grad_norm": 0.08391865342855453, + "learning_rate": 0.00016578688207923289, + "loss": 1.6273, + "step": 575 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.08210872858762741, + "learning_rate": 0.0001656644563699454, + "loss": 1.6222, + "step": 576 + }, + { + "epoch": 0.5816532258064516, + "grad_norm": 0.07796725630760193, + "learning_rate": 0.00016554185739317616, + "loss": 1.5981, + "step": 577 + }, + { + "epoch": 0.5826612903225806, + "grad_norm": 0.0765356495976448, + "learning_rate": 0.00016541908547242459, + "loss": 1.6164, + "step": 578 + }, + { + "epoch": 0.5836693548387096, + "grad_norm": 0.090540811419487, + "learning_rate": 0.00016529614093164648, + "loss": 1.6994, + "step": 579 + }, + { + "epoch": 0.5846774193548387, + "grad_norm": 0.08444759249687195, + "learning_rate": 0.00016517302409525315, + "loss": 1.6154, + "step": 580 + }, + { + "epoch": 0.5856854838709677, + "grad_norm": 0.0766877606511116, + "learning_rate": 0.0001650497352881105, + "loss": 1.6046, + "step": 581 + }, + { + "epoch": 0.5866935483870968, + "grad_norm": 0.0797574445605278, + "learning_rate": 0.00016492627483553822, + "loss": 1.6298, + "step": 582 + }, + { + "epoch": 0.5877016129032258, + "grad_norm": 0.07783927023410797, + "learning_rate": 0.00016480264306330898, + "loss": 1.5702, + "step": 583 + }, + { + "epoch": 0.5887096774193549, + "grad_norm": 0.08371485024690628, + "learning_rate": 0.0001646788402976474, + "loss": 1.6215, + "step": 584 + }, + { + "epoch": 0.5897177419354839, + "grad_norm": 0.08839402347803116, + "learning_rate": 0.0001645548668652294, + "loss": 1.5996, + "step": 585 + }, + { + "epoch": 0.5907258064516129, + "grad_norm": 0.07832740247249603, + "learning_rate": 0.0001644307230931811, + "loss": 1.6281, + "step": 586 + }, + { + "epoch": 0.5917338709677419, + "grad_norm": 0.07553452998399734, + "learning_rate": 0.00016430640930907827, + "loss": 1.6147, + "step": 587 + }, + { + "epoch": 0.592741935483871, + "grad_norm": 0.07809963822364807, + "learning_rate": 0.00016418192584094515, + "loss": 1.5993, + "step": 588 + }, + { + "epoch": 0.59375, + "grad_norm": 0.07688596844673157, + "learning_rate": 0.00016405727301725377, + "loss": 1.6019, + "step": 589 + }, + { + "epoch": 0.594758064516129, + "grad_norm": 0.07611083984375, + "learning_rate": 0.00016393245116692304, + "loss": 1.5689, + "step": 590 + }, + { + "epoch": 0.5957661290322581, + "grad_norm": 0.08132312446832657, + "learning_rate": 0.00016380746061931786, + "loss": 1.6307, + "step": 591 + }, + { + "epoch": 0.5967741935483871, + "grad_norm": 0.07959824800491333, + "learning_rate": 0.00016368230170424826, + "loss": 1.5851, + "step": 592 + }, + { + "epoch": 0.5977822580645161, + "grad_norm": 0.08210327476263046, + "learning_rate": 0.0001635569747519686, + "loss": 1.6139, + "step": 593 + }, + { + "epoch": 0.5987903225806451, + "grad_norm": 0.1014091745018959, + "learning_rate": 0.00016343148009317657, + "loss": 1.564, + "step": 594 + }, + { + "epoch": 0.5997983870967742, + "grad_norm": 0.08163224905729294, + "learning_rate": 0.00016330581805901239, + "loss": 1.5896, + "step": 595 + }, + { + "epoch": 0.6008064516129032, + "grad_norm": 0.08205213397741318, + "learning_rate": 0.00016317998898105797, + "loss": 1.6271, + "step": 596 + }, + { + "epoch": 0.6018145161290323, + "grad_norm": 0.07970026135444641, + "learning_rate": 0.00016305399319133595, + "loss": 1.6024, + "step": 597 + }, + { + "epoch": 0.6028225806451613, + "grad_norm": 0.07718155533075333, + "learning_rate": 0.00016292783102230888, + "loss": 1.5951, + "step": 598 + }, + { + "epoch": 0.6038306451612904, + "grad_norm": 0.09728401899337769, + "learning_rate": 0.00016280150280687834, + "loss": 1.6838, + "step": 599 + }, + { + "epoch": 0.6048387096774194, + "grad_norm": 0.08184093236923218, + "learning_rate": 0.00016267500887838412, + "loss": 1.5902, + "step": 600 + }, + { + "epoch": 0.6058467741935484, + "grad_norm": 0.08744041621685028, + "learning_rate": 0.00016254834957060309, + "loss": 1.6292, + "step": 601 + }, + { + "epoch": 0.6068548387096774, + "grad_norm": 0.09200835227966309, + "learning_rate": 0.00016242152521774874, + "loss": 1.6393, + "step": 602 + }, + { + "epoch": 0.6078629032258065, + "grad_norm": 0.08810313045978546, + "learning_rate": 0.0001622945361544699, + "loss": 1.6201, + "step": 603 + }, + { + "epoch": 0.6088709677419355, + "grad_norm": 0.09700248390436172, + "learning_rate": 0.00016216738271584999, + "loss": 1.5638, + "step": 604 + }, + { + "epoch": 0.6098790322580645, + "grad_norm": 0.08686663955450058, + "learning_rate": 0.00016204006523740634, + "loss": 1.5734, + "step": 605 + }, + { + "epoch": 0.6108870967741935, + "grad_norm": 0.07873237133026123, + "learning_rate": 0.00016191258405508896, + "loss": 1.5469, + "step": 606 + }, + { + "epoch": 0.6118951612903226, + "grad_norm": 0.08019126206636429, + "learning_rate": 0.0001617849395052799, + "loss": 1.6431, + "step": 607 + }, + { + "epoch": 0.6129032258064516, + "grad_norm": 0.08971964567899704, + "learning_rate": 0.00016165713192479227, + "loss": 1.6535, + "step": 608 + }, + { + "epoch": 0.6139112903225806, + "grad_norm": 0.07752855867147446, + "learning_rate": 0.00016152916165086936, + "loss": 1.5829, + "step": 609 + }, + { + "epoch": 0.6149193548387096, + "grad_norm": 0.08348417282104492, + "learning_rate": 0.00016140102902118377, + "loss": 1.6305, + "step": 610 + }, + { + "epoch": 0.6159274193548387, + "grad_norm": 0.0761261060833931, + "learning_rate": 0.0001612727343738365, + "loss": 1.5835, + "step": 611 + }, + { + "epoch": 0.6169354838709677, + "grad_norm": 0.11013983935117722, + "learning_rate": 0.00016114427804735603, + "loss": 1.6364, + "step": 612 + }, + { + "epoch": 0.6179435483870968, + "grad_norm": 0.086505226790905, + "learning_rate": 0.00016101566038069756, + "loss": 1.61, + "step": 613 + }, + { + "epoch": 0.6189516129032258, + "grad_norm": 0.08692600578069687, + "learning_rate": 0.00016088688171324184, + "loss": 1.6153, + "step": 614 + }, + { + "epoch": 0.6199596774193549, + "grad_norm": 0.09537503123283386, + "learning_rate": 0.0001607579423847946, + "loss": 1.6053, + "step": 615 + }, + { + "epoch": 0.6209677419354839, + "grad_norm": 0.08204115927219391, + "learning_rate": 0.00016062884273558545, + "loss": 1.5939, + "step": 616 + }, + { + "epoch": 0.6219758064516129, + "grad_norm": 0.08595214784145355, + "learning_rate": 0.00016049958310626708, + "loss": 1.6162, + "step": 617 + }, + { + "epoch": 0.6229838709677419, + "grad_norm": 0.08318503201007843, + "learning_rate": 0.00016037016383791425, + "loss": 1.6401, + "step": 618 + }, + { + "epoch": 0.623991935483871, + "grad_norm": 0.08207780867815018, + "learning_rate": 0.00016024058527202298, + "loss": 1.6226, + "step": 619 + }, + { + "epoch": 0.625, + "grad_norm": 0.08268122375011444, + "learning_rate": 0.00016011084775050959, + "loss": 1.6522, + "step": 620 + }, + { + "epoch": 0.626008064516129, + "grad_norm": 0.07751034945249557, + "learning_rate": 0.00015998095161570995, + "loss": 1.5455, + "step": 621 + }, + { + "epoch": 0.6270161290322581, + "grad_norm": 0.08539839088916779, + "learning_rate": 0.00015985089721037832, + "loss": 1.6116, + "step": 622 + }, + { + "epoch": 0.6280241935483871, + "grad_norm": 0.08065900206565857, + "learning_rate": 0.00015972068487768665, + "loss": 1.6102, + "step": 623 + }, + { + "epoch": 0.6290322580645161, + "grad_norm": 0.07968778163194656, + "learning_rate": 0.00015959031496122364, + "loss": 1.6065, + "step": 624 + }, + { + "epoch": 0.6300403225806451, + "grad_norm": 0.08040513843297958, + "learning_rate": 0.00015945978780499375, + "loss": 1.5974, + "step": 625 + }, + { + "epoch": 0.6310483870967742, + "grad_norm": 0.0841718390583992, + "learning_rate": 0.00015932910375341639, + "loss": 1.5943, + "step": 626 + }, + { + "epoch": 0.6320564516129032, + "grad_norm": 0.07834211736917496, + "learning_rate": 0.0001591982631513249, + "loss": 1.5856, + "step": 627 + }, + { + "epoch": 0.6330645161290323, + "grad_norm": 0.08371677994728088, + "learning_rate": 0.00015906726634396575, + "loss": 1.5972, + "step": 628 + }, + { + "epoch": 0.6340725806451613, + "grad_norm": 0.09251397848129272, + "learning_rate": 0.00015893611367699762, + "loss": 1.6529, + "step": 629 + }, + { + "epoch": 0.6350806451612904, + "grad_norm": 0.080534428358078, + "learning_rate": 0.00015880480549649038, + "loss": 1.5786, + "step": 630 + }, + { + "epoch": 0.6360887096774194, + "grad_norm": 0.09134898334741592, + "learning_rate": 0.00015867334214892436, + "loss": 1.6303, + "step": 631 + }, + { + "epoch": 0.6370967741935484, + "grad_norm": 0.08673352748155594, + "learning_rate": 0.00015854172398118913, + "loss": 1.6281, + "step": 632 + }, + { + "epoch": 0.6381048387096774, + "grad_norm": 0.11661474406719208, + "learning_rate": 0.000158409951340583, + "loss": 1.6826, + "step": 633 + }, + { + "epoch": 0.6391129032258065, + "grad_norm": 0.08508265018463135, + "learning_rate": 0.0001582780245748118, + "loss": 1.5785, + "step": 634 + }, + { + "epoch": 0.6401209677419355, + "grad_norm": 0.09865213930606842, + "learning_rate": 0.00015814594403198794, + "loss": 1.619, + "step": 635 + }, + { + "epoch": 0.6411290322580645, + "grad_norm": 0.08882018178701401, + "learning_rate": 0.00015801371006062982, + "loss": 1.6076, + "step": 636 + }, + { + "epoch": 0.6421370967741935, + "grad_norm": 0.10395356267690659, + "learning_rate": 0.00015788132300966046, + "loss": 1.6193, + "step": 637 + }, + { + "epoch": 0.6431451612903226, + "grad_norm": 0.08556309342384338, + "learning_rate": 0.00015774878322840694, + "loss": 1.6313, + "step": 638 + }, + { + "epoch": 0.6441532258064516, + "grad_norm": 0.08463555574417114, + "learning_rate": 0.00015761609106659935, + "loss": 1.5852, + "step": 639 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 0.08253596723079681, + "learning_rate": 0.0001574832468743698, + "loss": 1.65, + "step": 640 + }, + { + "epoch": 0.6461693548387096, + "grad_norm": 0.09345366060733795, + "learning_rate": 0.0001573502510022516, + "loss": 1.5869, + "step": 641 + }, + { + "epoch": 0.6471774193548387, + "grad_norm": 0.08240879327058792, + "learning_rate": 0.00015721710380117826, + "loss": 1.6057, + "step": 642 + }, + { + "epoch": 0.6481854838709677, + "grad_norm": 0.08767805248498917, + "learning_rate": 0.0001570838056224827, + "loss": 1.5864, + "step": 643 + }, + { + "epoch": 0.6491935483870968, + "grad_norm": 0.08595956861972809, + "learning_rate": 0.0001569503568178961, + "loss": 1.593, + "step": 644 + }, + { + "epoch": 0.6502016129032258, + "grad_norm": 0.0859324112534523, + "learning_rate": 0.0001568167577395471, + "loss": 1.6248, + "step": 645 + }, + { + "epoch": 0.6512096774193549, + "grad_norm": 0.07949813455343246, + "learning_rate": 0.00015668300873996095, + "loss": 1.6269, + "step": 646 + }, + { + "epoch": 0.6522177419354839, + "grad_norm": 0.08270735293626785, + "learning_rate": 0.00015654911017205846, + "loss": 1.6161, + "step": 647 + }, + { + "epoch": 0.6532258064516129, + "grad_norm": 0.08057011663913727, + "learning_rate": 0.000156415062389155, + "loss": 1.615, + "step": 648 + }, + { + "epoch": 0.6542338709677419, + "grad_norm": 0.07924232631921768, + "learning_rate": 0.00015628086574495992, + "loss": 1.5898, + "step": 649 + }, + { + "epoch": 0.655241935483871, + "grad_norm": 0.08501306176185608, + "learning_rate": 0.00015614652059357508, + "loss": 1.6709, + "step": 650 + }, + { + "epoch": 0.65625, + "grad_norm": 0.08682959526777267, + "learning_rate": 0.00015601202728949436, + "loss": 1.6214, + "step": 651 + }, + { + "epoch": 0.657258064516129, + "grad_norm": 0.08149803429841995, + "learning_rate": 0.00015587738618760258, + "loss": 1.6337, + "step": 652 + }, + { + "epoch": 0.6582661290322581, + "grad_norm": 0.09022454917430878, + "learning_rate": 0.00015574259764317448, + "loss": 1.5809, + "step": 653 + }, + { + "epoch": 0.6592741935483871, + "grad_norm": 0.08189895004034042, + "learning_rate": 0.00015560766201187386, + "loss": 1.6188, + "step": 654 + }, + { + "epoch": 0.6602822580645161, + "grad_norm": 0.080174021422863, + "learning_rate": 0.00015547257964975273, + "loss": 1.5991, + "step": 655 + }, + { + "epoch": 0.6612903225806451, + "grad_norm": 0.08346089720726013, + "learning_rate": 0.0001553373509132501, + "loss": 1.5734, + "step": 656 + }, + { + "epoch": 0.6622983870967742, + "grad_norm": 0.07657915353775024, + "learning_rate": 0.00015520197615919145, + "loss": 1.5422, + "step": 657 + }, + { + "epoch": 0.6633064516129032, + "grad_norm": 0.08029603213071823, + "learning_rate": 0.0001550664557447873, + "loss": 1.5886, + "step": 658 + }, + { + "epoch": 0.6643145161290323, + "grad_norm": 0.08529450744390488, + "learning_rate": 0.0001549307900276327, + "loss": 1.629, + "step": 659 + }, + { + "epoch": 0.6653225806451613, + "grad_norm": 0.07882041484117508, + "learning_rate": 0.0001547949793657061, + "loss": 1.66, + "step": 660 + }, + { + "epoch": 0.6663306451612904, + "grad_norm": 0.08514705300331116, + "learning_rate": 0.00015465902411736828, + "loss": 1.6113, + "step": 661 + }, + { + "epoch": 0.6673387096774194, + "grad_norm": 0.07738941162824631, + "learning_rate": 0.00015452292464136167, + "loss": 1.5959, + "step": 662 + }, + { + "epoch": 0.6683467741935484, + "grad_norm": 0.08031867444515228, + "learning_rate": 0.0001543866812968092, + "loss": 1.601, + "step": 663 + }, + { + "epoch": 0.6693548387096774, + "grad_norm": 0.08055873215198517, + "learning_rate": 0.00015425029444321347, + "loss": 1.5731, + "step": 664 + }, + { + "epoch": 0.6703629032258065, + "grad_norm": 0.08486857265233994, + "learning_rate": 0.0001541137644404557, + "loss": 1.5703, + "step": 665 + }, + { + "epoch": 0.6713709677419355, + "grad_norm": 0.07934212684631348, + "learning_rate": 0.0001539770916487949, + "loss": 1.6163, + "step": 666 + }, + { + "epoch": 0.6723790322580645, + "grad_norm": 0.08954691141843796, + "learning_rate": 0.0001538402764288668, + "loss": 1.6139, + "step": 667 + }, + { + "epoch": 0.6733870967741935, + "grad_norm": 0.08842763304710388, + "learning_rate": 0.00015370331914168296, + "loss": 1.6322, + "step": 668 + }, + { + "epoch": 0.6743951612903226, + "grad_norm": 0.08686459064483643, + "learning_rate": 0.00015356622014862988, + "loss": 1.59, + "step": 669 + }, + { + "epoch": 0.6754032258064516, + "grad_norm": 0.07980991154909134, + "learning_rate": 0.00015342897981146785, + "loss": 1.576, + "step": 670 + }, + { + "epoch": 0.6764112903225806, + "grad_norm": 0.08613515645265579, + "learning_rate": 0.00015329159849233022, + "loss": 1.6328, + "step": 671 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 0.10668696463108063, + "learning_rate": 0.0001531540765537223, + "loss": 1.6482, + "step": 672 + }, + { + "epoch": 0.6784274193548387, + "grad_norm": 0.07826445251703262, + "learning_rate": 0.00015301641435852046, + "loss": 1.5984, + "step": 673 + }, + { + "epoch": 0.6794354838709677, + "grad_norm": 0.09749854356050491, + "learning_rate": 0.00015287861226997125, + "loss": 1.586, + "step": 674 + }, + { + "epoch": 0.6804435483870968, + "grad_norm": 0.09301649779081345, + "learning_rate": 0.00015274067065169017, + "loss": 1.6806, + "step": 675 + }, + { + "epoch": 0.6814516129032258, + "grad_norm": 0.08719351887702942, + "learning_rate": 0.00015260258986766104, + "loss": 1.5568, + "step": 676 + }, + { + "epoch": 0.6824596774193549, + "grad_norm": 0.08005709946155548, + "learning_rate": 0.00015246437028223486, + "loss": 1.6252, + "step": 677 + }, + { + "epoch": 0.6834677419354839, + "grad_norm": 0.08304545283317566, + "learning_rate": 0.00015232601226012886, + "loss": 1.6137, + "step": 678 + }, + { + "epoch": 0.6844758064516129, + "grad_norm": 0.07949443906545639, + "learning_rate": 0.0001521875161664256, + "loss": 1.5808, + "step": 679 + }, + { + "epoch": 0.6854838709677419, + "grad_norm": 0.08979618549346924, + "learning_rate": 0.00015204888236657188, + "loss": 1.6164, + "step": 680 + }, + { + "epoch": 0.686491935483871, + "grad_norm": 0.07843173295259476, + "learning_rate": 0.00015191011122637796, + "loss": 1.6246, + "step": 681 + }, + { + "epoch": 0.6875, + "grad_norm": 0.09026903659105301, + "learning_rate": 0.00015177120311201647, + "loss": 1.6352, + "step": 682 + }, + { + "epoch": 0.688508064516129, + "grad_norm": 0.09385894238948822, + "learning_rate": 0.00015163215839002146, + "loss": 1.622, + "step": 683 + }, + { + "epoch": 0.6895161290322581, + "grad_norm": 0.07961908727884293, + "learning_rate": 0.0001514929774272874, + "loss": 1.5745, + "step": 684 + }, + { + "epoch": 0.6905241935483871, + "grad_norm": 0.08670490235090256, + "learning_rate": 0.00015135366059106832, + "loss": 1.5945, + "step": 685 + }, + { + "epoch": 0.6915322580645161, + "grad_norm": 0.08476680517196655, + "learning_rate": 0.00015121420824897678, + "loss": 1.6316, + "step": 686 + }, + { + "epoch": 0.6925403225806451, + "grad_norm": 0.0937148854136467, + "learning_rate": 0.00015107462076898289, + "loss": 1.6054, + "step": 687 + }, + { + "epoch": 0.6935483870967742, + "grad_norm": 0.08981835842132568, + "learning_rate": 0.00015093489851941328, + "loss": 1.6683, + "step": 688 + }, + { + "epoch": 0.6945564516129032, + "grad_norm": 0.08677362650632858, + "learning_rate": 0.0001507950418689503, + "loss": 1.6306, + "step": 689 + }, + { + "epoch": 0.6955645161290323, + "grad_norm": 0.07769922912120819, + "learning_rate": 0.00015065505118663078, + "loss": 1.6164, + "step": 690 + }, + { + "epoch": 0.6965725806451613, + "grad_norm": 0.08614321053028107, + "learning_rate": 0.00015051492684184546, + "loss": 1.5615, + "step": 691 + }, + { + "epoch": 0.6975806451612904, + "grad_norm": 0.09230528026819229, + "learning_rate": 0.00015037466920433753, + "loss": 1.6901, + "step": 692 + }, + { + "epoch": 0.6985887096774194, + "grad_norm": 0.09350752830505371, + "learning_rate": 0.00015023427864420202, + "loss": 1.6465, + "step": 693 + }, + { + "epoch": 0.6995967741935484, + "grad_norm": 0.09468571841716766, + "learning_rate": 0.00015009375553188468, + "loss": 1.6485, + "step": 694 + }, + { + "epoch": 0.7006048387096774, + "grad_norm": 0.08464954793453217, + "learning_rate": 0.00014995310023818107, + "loss": 1.5865, + "step": 695 + }, + { + "epoch": 0.7016129032258065, + "grad_norm": 0.09060323238372803, + "learning_rate": 0.00014981231313423545, + "loss": 1.6074, + "step": 696 + }, + { + "epoch": 0.7026209677419355, + "grad_norm": 0.08714771270751953, + "learning_rate": 0.00014967139459153993, + "loss": 1.5824, + "step": 697 + }, + { + "epoch": 0.7036290322580645, + "grad_norm": 0.0776834785938263, + "learning_rate": 0.00014953034498193341, + "loss": 1.5689, + "step": 698 + }, + { + "epoch": 0.7046370967741935, + "grad_norm": 0.08315813541412354, + "learning_rate": 0.0001493891646776007, + "loss": 1.6187, + "step": 699 + }, + { + "epoch": 0.7056451612903226, + "grad_norm": 0.07914920896291733, + "learning_rate": 0.00014924785405107143, + "loss": 1.5417, + "step": 700 + }, + { + "epoch": 0.7066532258064516, + "grad_norm": 0.08314627408981323, + "learning_rate": 0.00014910641347521907, + "loss": 1.6298, + "step": 701 + }, + { + "epoch": 0.7076612903225806, + "grad_norm": 0.07665257155895233, + "learning_rate": 0.0001489648433232601, + "loss": 1.5464, + "step": 702 + }, + { + "epoch": 0.7086693548387096, + "grad_norm": 0.09670589119195938, + "learning_rate": 0.00014882314396875274, + "loss": 1.654, + "step": 703 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 0.08459917455911636, + "learning_rate": 0.00014868131578559633, + "loss": 1.6326, + "step": 704 + }, + { + "epoch": 0.7106854838709677, + "grad_norm": 0.08236029744148254, + "learning_rate": 0.00014853935914802994, + "loss": 1.59, + "step": 705 + }, + { + "epoch": 0.7116935483870968, + "grad_norm": 0.07780009508132935, + "learning_rate": 0.0001483972744306318, + "loss": 1.5801, + "step": 706 + }, + { + "epoch": 0.7127016129032258, + "grad_norm": 0.0835953950881958, + "learning_rate": 0.00014825506200831794, + "loss": 1.5765, + "step": 707 + }, + { + "epoch": 0.7137096774193549, + "grad_norm": 0.08014727383852005, + "learning_rate": 0.00014811272225634145, + "loss": 1.6156, + "step": 708 + }, + { + "epoch": 0.7147177419354839, + "grad_norm": 0.08108653128147125, + "learning_rate": 0.00014797025555029133, + "loss": 1.5825, + "step": 709 + }, + { + "epoch": 0.7157258064516129, + "grad_norm": 0.08455085754394531, + "learning_rate": 0.00014782766226609166, + "loss": 1.6218, + "step": 710 + }, + { + "epoch": 0.7167338709677419, + "grad_norm": 0.07630985975265503, + "learning_rate": 0.00014768494278000048, + "loss": 1.5889, + "step": 711 + }, + { + "epoch": 0.717741935483871, + "grad_norm": 0.08318428695201874, + "learning_rate": 0.00014754209746860878, + "loss": 1.5827, + "step": 712 + }, + { + "epoch": 0.71875, + "grad_norm": 0.08248715102672577, + "learning_rate": 0.00014739912670883967, + "loss": 1.621, + "step": 713 + }, + { + "epoch": 0.719758064516129, + "grad_norm": 0.07857991755008698, + "learning_rate": 0.00014725603087794716, + "loss": 1.5605, + "step": 714 + }, + { + "epoch": 0.7207661290322581, + "grad_norm": 0.08540824055671692, + "learning_rate": 0.0001471128103535154, + "loss": 1.5471, + "step": 715 + }, + { + "epoch": 0.7217741935483871, + "grad_norm": 0.0777583196759224, + "learning_rate": 0.00014696946551345747, + "loss": 1.5029, + "step": 716 + }, + { + "epoch": 0.7227822580645161, + "grad_norm": 0.08295831829309464, + "learning_rate": 0.00014682599673601458, + "loss": 1.5709, + "step": 717 + }, + { + "epoch": 0.7237903225806451, + "grad_norm": 0.08069245517253876, + "learning_rate": 0.00014668240439975482, + "loss": 1.5601, + "step": 718 + }, + { + "epoch": 0.7247983870967742, + "grad_norm": 0.08142071962356567, + "learning_rate": 0.00014653868888357249, + "loss": 1.6004, + "step": 719 + }, + { + "epoch": 0.7258064516129032, + "grad_norm": 0.09048129618167877, + "learning_rate": 0.0001463948505666868, + "loss": 1.6614, + "step": 720 + }, + { + "epoch": 0.7268145161290323, + "grad_norm": 0.09065764397382736, + "learning_rate": 0.00014625088982864098, + "loss": 1.6612, + "step": 721 + }, + { + "epoch": 0.7278225806451613, + "grad_norm": 0.0859372541308403, + "learning_rate": 0.00014610680704930142, + "loss": 1.5914, + "step": 722 + }, + { + "epoch": 0.7288306451612904, + "grad_norm": 0.0821571797132492, + "learning_rate": 0.0001459626026088564, + "loss": 1.5458, + "step": 723 + }, + { + "epoch": 0.7298387096774194, + "grad_norm": 0.08414388447999954, + "learning_rate": 0.0001458182768878153, + "loss": 1.5608, + "step": 724 + }, + { + "epoch": 0.7308467741935484, + "grad_norm": 0.08222994953393936, + "learning_rate": 0.00014567383026700752, + "loss": 1.5943, + "step": 725 + }, + { + "epoch": 0.7318548387096774, + "grad_norm": 0.08996201306581497, + "learning_rate": 0.0001455292631275814, + "loss": 1.5524, + "step": 726 + }, + { + "epoch": 0.7328629032258065, + "grad_norm": 0.08061891794204712, + "learning_rate": 0.0001453845758510034, + "loss": 1.6428, + "step": 727 + }, + { + "epoch": 0.7338709677419355, + "grad_norm": 0.09720771759748459, + "learning_rate": 0.0001452397688190569, + "loss": 1.6538, + "step": 728 + }, + { + "epoch": 0.7348790322580645, + "grad_norm": 0.08087541162967682, + "learning_rate": 0.00014509484241384134, + "loss": 1.6078, + "step": 729 + }, + { + "epoch": 0.7358870967741935, + "grad_norm": 0.09106358885765076, + "learning_rate": 0.00014494979701777102, + "loss": 1.589, + "step": 730 + }, + { + "epoch": 0.7368951612903226, + "grad_norm": 0.07827623188495636, + "learning_rate": 0.00014480463301357445, + "loss": 1.5937, + "step": 731 + }, + { + "epoch": 0.7379032258064516, + "grad_norm": 0.09681122750043869, + "learning_rate": 0.00014465935078429286, + "loss": 1.6308, + "step": 732 + }, + { + "epoch": 0.7389112903225806, + "grad_norm": 0.0876043364405632, + "learning_rate": 0.00014451395071327964, + "loss": 1.6136, + "step": 733 + }, + { + "epoch": 0.7399193548387096, + "grad_norm": 0.10326588153839111, + "learning_rate": 0.00014436843318419896, + "loss": 1.5964, + "step": 734 + }, + { + "epoch": 0.7409274193548387, + "grad_norm": 0.08790312707424164, + "learning_rate": 0.00014422279858102504, + "loss": 1.5992, + "step": 735 + }, + { + "epoch": 0.7419354838709677, + "grad_norm": 0.0805894061923027, + "learning_rate": 0.00014407704728804097, + "loss": 1.5503, + "step": 736 + }, + { + "epoch": 0.7429435483870968, + "grad_norm": 0.0813809409737587, + "learning_rate": 0.00014393117968983777, + "loss": 1.5807, + "step": 737 + }, + { + "epoch": 0.7439516129032258, + "grad_norm": 0.0871429443359375, + "learning_rate": 0.0001437851961713133, + "loss": 1.6493, + "step": 738 + }, + { + "epoch": 0.7449596774193549, + "grad_norm": 0.08929460495710373, + "learning_rate": 0.0001436390971176714, + "loss": 1.58, + "step": 739 + }, + { + "epoch": 0.7459677419354839, + "grad_norm": 0.08278234302997589, + "learning_rate": 0.0001434928829144206, + "loss": 1.6442, + "step": 740 + }, + { + "epoch": 0.7469758064516129, + "grad_norm": 0.09997319430112839, + "learning_rate": 0.00014334655394737355, + "loss": 1.5756, + "step": 741 + }, + { + "epoch": 0.7479838709677419, + "grad_norm": 0.07914005219936371, + "learning_rate": 0.0001432001106026454, + "loss": 1.5642, + "step": 742 + }, + { + "epoch": 0.748991935483871, + "grad_norm": 0.09618489444255829, + "learning_rate": 0.00014305355326665339, + "loss": 1.6108, + "step": 743 + }, + { + "epoch": 0.75, + "grad_norm": 0.09149473160505295, + "learning_rate": 0.00014290688232611526, + "loss": 1.6007, + "step": 744 + }, + { + "epoch": 0.751008064516129, + "grad_norm": 0.08550098538398743, + "learning_rate": 0.00014276009816804885, + "loss": 1.588, + "step": 745 + }, + { + "epoch": 0.7520161290322581, + "grad_norm": 0.08285672217607498, + "learning_rate": 0.00014261320117977042, + "loss": 1.5845, + "step": 746 + }, + { + "epoch": 0.7530241935483871, + "grad_norm": 0.09440962970256805, + "learning_rate": 0.00014246619174889422, + "loss": 1.7127, + "step": 747 + }, + { + "epoch": 0.7540322580645161, + "grad_norm": 0.08045286685228348, + "learning_rate": 0.00014231907026333098, + "loss": 1.6066, + "step": 748 + }, + { + "epoch": 0.7550403225806451, + "grad_norm": 0.08301718533039093, + "learning_rate": 0.0001421718371112873, + "loss": 1.5732, + "step": 749 + }, + { + "epoch": 0.7560483870967742, + "grad_norm": 0.08225584775209427, + "learning_rate": 0.00014202449268126426, + "loss": 1.563, + "step": 750 + }, + { + "epoch": 0.7570564516129032, + "grad_norm": 0.08871738612651825, + "learning_rate": 0.00014187703736205667, + "loss": 1.6364, + "step": 751 + }, + { + "epoch": 0.7580645161290323, + "grad_norm": 0.08189701288938522, + "learning_rate": 0.00014172947154275195, + "loss": 1.5972, + "step": 752 + }, + { + "epoch": 0.7590725806451613, + "grad_norm": 0.08560924977064133, + "learning_rate": 0.00014158179561272907, + "loss": 1.5971, + "step": 753 + }, + { + "epoch": 0.7600806451612904, + "grad_norm": 0.08616410940885544, + "learning_rate": 0.00014143400996165746, + "loss": 1.6331, + "step": 754 + }, + { + "epoch": 0.7610887096774194, + "grad_norm": 0.08963197469711304, + "learning_rate": 0.00014128611497949626, + "loss": 1.5887, + "step": 755 + }, + { + "epoch": 0.7620967741935484, + "grad_norm": 0.09272851049900055, + "learning_rate": 0.0001411381110564929, + "loss": 1.5692, + "step": 756 + }, + { + "epoch": 0.7631048387096774, + "grad_norm": 0.08667407929897308, + "learning_rate": 0.0001409899985831824, + "loss": 1.5852, + "step": 757 + }, + { + "epoch": 0.7641129032258065, + "grad_norm": 0.08354497700929642, + "learning_rate": 0.00014084177795038613, + "loss": 1.6024, + "step": 758 + }, + { + "epoch": 0.7651209677419355, + "grad_norm": 0.09121601283550262, + "learning_rate": 0.00014069344954921096, + "loss": 1.5896, + "step": 759 + }, + { + "epoch": 0.7661290322580645, + "grad_norm": 0.09622003138065338, + "learning_rate": 0.00014054501377104797, + "loss": 1.5781, + "step": 760 + }, + { + "epoch": 0.7671370967741935, + "grad_norm": 0.08506747335195541, + "learning_rate": 0.00014039647100757177, + "loss": 1.5752, + "step": 761 + }, + { + "epoch": 0.7681451612903226, + "grad_norm": 0.09725549817085266, + "learning_rate": 0.00014024782165073912, + "loss": 1.599, + "step": 762 + }, + { + "epoch": 0.7691532258064516, + "grad_norm": 0.08023160696029663, + "learning_rate": 0.00014009906609278806, + "loss": 1.5503, + "step": 763 + }, + { + "epoch": 0.7701612903225806, + "grad_norm": 0.092674620449543, + "learning_rate": 0.00013995020472623693, + "loss": 1.6196, + "step": 764 + }, + { + "epoch": 0.7711693548387096, + "grad_norm": 0.07756571471691132, + "learning_rate": 0.0001398012379438832, + "loss": 1.599, + "step": 765 + }, + { + "epoch": 0.7721774193548387, + "grad_norm": 0.09609861671924591, + "learning_rate": 0.00013965216613880257, + "loss": 1.6356, + "step": 766 + }, + { + "epoch": 0.7731854838709677, + "grad_norm": 0.08073242753744125, + "learning_rate": 0.00013950298970434775, + "loss": 1.5975, + "step": 767 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.08342421054840088, + "learning_rate": 0.00013935370903414768, + "loss": 1.594, + "step": 768 + }, + { + "epoch": 0.7752016129032258, + "grad_norm": 0.07886181771755219, + "learning_rate": 0.00013920432452210619, + "loss": 1.5947, + "step": 769 + }, + { + "epoch": 0.7762096774193549, + "grad_norm": 0.08256496489048004, + "learning_rate": 0.00013905483656240125, + "loss": 1.5772, + "step": 770 + }, + { + "epoch": 0.7772177419354839, + "grad_norm": 0.08527923375368118, + "learning_rate": 0.0001389052455494837, + "loss": 1.5936, + "step": 771 + }, + { + "epoch": 0.7782258064516129, + "grad_norm": 0.08340179920196533, + "learning_rate": 0.00013875555187807637, + "loss": 1.5786, + "step": 772 + }, + { + "epoch": 0.7792338709677419, + "grad_norm": 0.07682585716247559, + "learning_rate": 0.00013860575594317292, + "loss": 1.542, + "step": 773 + }, + { + "epoch": 0.780241935483871, + "grad_norm": 0.08884165436029434, + "learning_rate": 0.00013845585814003684, + "loss": 1.5969, + "step": 774 + }, + { + "epoch": 0.78125, + "grad_norm": 0.07785353809595108, + "learning_rate": 0.00013830585886420054, + "loss": 1.5671, + "step": 775 + }, + { + "epoch": 0.782258064516129, + "grad_norm": 0.08034134656190872, + "learning_rate": 0.000138155758511464, + "loss": 1.5774, + "step": 776 + }, + { + "epoch": 0.7832661290322581, + "grad_norm": 0.0796407014131546, + "learning_rate": 0.0001380055574778941, + "loss": 1.5606, + "step": 777 + }, + { + "epoch": 0.7842741935483871, + "grad_norm": 0.07933478057384491, + "learning_rate": 0.00013785525615982319, + "loss": 1.5651, + "step": 778 + }, + { + "epoch": 0.7852822580645161, + "grad_norm": 0.08734553307294846, + "learning_rate": 0.00013770485495384843, + "loss": 1.6262, + "step": 779 + }, + { + "epoch": 0.7862903225806451, + "grad_norm": 0.08349025249481201, + "learning_rate": 0.0001375543542568304, + "loss": 1.5835, + "step": 780 + }, + { + "epoch": 0.7872983870967742, + "grad_norm": 0.09640732407569885, + "learning_rate": 0.00013740375446589232, + "loss": 1.586, + "step": 781 + }, + { + "epoch": 0.7883064516129032, + "grad_norm": 0.09520639479160309, + "learning_rate": 0.00013725305597841878, + "loss": 1.6521, + "step": 782 + }, + { + "epoch": 0.7893145161290323, + "grad_norm": 0.07939834147691727, + "learning_rate": 0.00013710225919205484, + "loss": 1.5062, + "step": 783 + }, + { + "epoch": 0.7903225806451613, + "grad_norm": 0.08648645132780075, + "learning_rate": 0.000136951364504705, + "loss": 1.6303, + "step": 784 + }, + { + "epoch": 0.7913306451612904, + "grad_norm": 0.09467138350009918, + "learning_rate": 0.00013680037231453203, + "loss": 1.6333, + "step": 785 + }, + { + "epoch": 0.7923387096774194, + "grad_norm": 0.08505504578351974, + "learning_rate": 0.000136649283019956, + "loss": 1.5953, + "step": 786 + }, + { + "epoch": 0.7933467741935484, + "grad_norm": 0.0903257429599762, + "learning_rate": 0.00013649809701965311, + "loss": 1.5841, + "step": 787 + }, + { + "epoch": 0.7943548387096774, + "grad_norm": 0.08327475190162659, + "learning_rate": 0.00013634681471255493, + "loss": 1.578, + "step": 788 + }, + { + "epoch": 0.7953629032258065, + "grad_norm": 0.09311467409133911, + "learning_rate": 0.000136195436497847, + "loss": 1.5911, + "step": 789 + }, + { + "epoch": 0.7963709677419355, + "grad_norm": 0.09214780479669571, + "learning_rate": 0.00013604396277496796, + "loss": 1.6009, + "step": 790 + }, + { + "epoch": 0.7973790322580645, + "grad_norm": 0.08812731504440308, + "learning_rate": 0.00013589239394360848, + "loss": 1.6141, + "step": 791 + }, + { + "epoch": 0.7983870967741935, + "grad_norm": 0.11389174312353134, + "learning_rate": 0.00013574073040371022, + "loss": 1.6369, + "step": 792 + }, + { + "epoch": 0.7993951612903226, + "grad_norm": 0.08469700813293457, + "learning_rate": 0.00013558897255546473, + "loss": 1.6009, + "step": 793 + }, + { + "epoch": 0.8004032258064516, + "grad_norm": 0.08306135982275009, + "learning_rate": 0.0001354371207993123, + "loss": 1.5556, + "step": 794 + }, + { + "epoch": 0.8014112903225806, + "grad_norm": 0.08287226408720016, + "learning_rate": 0.00013528517553594124, + "loss": 1.571, + "step": 795 + }, + { + "epoch": 0.8024193548387096, + "grad_norm": 0.0797332376241684, + "learning_rate": 0.00013513313716628637, + "loss": 1.5679, + "step": 796 + }, + { + "epoch": 0.8034274193548387, + "grad_norm": 0.07978206872940063, + "learning_rate": 0.0001349810060915283, + "loss": 1.5865, + "step": 797 + }, + { + "epoch": 0.8044354838709677, + "grad_norm": 0.07792511582374573, + "learning_rate": 0.00013482878271309226, + "loss": 1.5849, + "step": 798 + }, + { + "epoch": 0.8054435483870968, + "grad_norm": 0.07994278520345688, + "learning_rate": 0.000134676467432647, + "loss": 1.6026, + "step": 799 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 0.08317188918590546, + "learning_rate": 0.00013452406065210382, + "loss": 1.6333, + "step": 800 + }, + { + "epoch": 0.8074596774193549, + "grad_norm": 0.09058106690645218, + "learning_rate": 0.00013437156277361538, + "loss": 1.5936, + "step": 801 + }, + { + "epoch": 0.8084677419354839, + "grad_norm": 0.08963512629270554, + "learning_rate": 0.00013421897419957482, + "loss": 1.6422, + "step": 802 + }, + { + "epoch": 0.8094758064516129, + "grad_norm": 0.09142173826694489, + "learning_rate": 0.0001340662953326145, + "loss": 1.6779, + "step": 803 + }, + { + "epoch": 0.8104838709677419, + "grad_norm": 0.08868789672851562, + "learning_rate": 0.00013391352657560513, + "loss": 1.6594, + "step": 804 + }, + { + "epoch": 0.811491935483871, + "grad_norm": 0.08746343106031418, + "learning_rate": 0.0001337606683316545, + "loss": 1.5312, + "step": 805 + }, + { + "epoch": 0.8125, + "grad_norm": 0.07589108496904373, + "learning_rate": 0.00013360772100410665, + "loss": 1.5462, + "step": 806 + }, + { + "epoch": 0.813508064516129, + "grad_norm": 0.0817432850599289, + "learning_rate": 0.00013345468499654056, + "loss": 1.5393, + "step": 807 + }, + { + "epoch": 0.8145161290322581, + "grad_norm": 0.07965264469385147, + "learning_rate": 0.00013330156071276932, + "loss": 1.5687, + "step": 808 + }, + { + "epoch": 0.8155241935483871, + "grad_norm": 0.08861200511455536, + "learning_rate": 0.00013314834855683886, + "loss": 1.6412, + "step": 809 + }, + { + "epoch": 0.8165322580645161, + "grad_norm": 0.07894746214151382, + "learning_rate": 0.00013299504893302705, + "loss": 1.5738, + "step": 810 + }, + { + "epoch": 0.8175403225806451, + "grad_norm": 0.07987947016954422, + "learning_rate": 0.00013284166224584253, + "loss": 1.6212, + "step": 811 + }, + { + "epoch": 0.8185483870967742, + "grad_norm": 0.09027516096830368, + "learning_rate": 0.0001326881889000236, + "loss": 1.6113, + "step": 812 + }, + { + "epoch": 0.8195564516129032, + "grad_norm": 0.11448541283607483, + "learning_rate": 0.00013253462930053742, + "loss": 1.6315, + "step": 813 + }, + { + "epoch": 0.8205645161290323, + "grad_norm": 0.08771926164627075, + "learning_rate": 0.00013238098385257848, + "loss": 1.5919, + "step": 814 + }, + { + "epoch": 0.8215725806451613, + "grad_norm": 0.09016083925962448, + "learning_rate": 0.00013222725296156807, + "loss": 1.5629, + "step": 815 + }, + { + "epoch": 0.8225806451612904, + "grad_norm": 0.08411089330911636, + "learning_rate": 0.0001320734370331527, + "loss": 1.6037, + "step": 816 + }, + { + "epoch": 0.8235887096774194, + "grad_norm": 0.09559720754623413, + "learning_rate": 0.0001319195364732034, + "loss": 1.5463, + "step": 817 + }, + { + "epoch": 0.8245967741935484, + "grad_norm": 0.10408146679401398, + "learning_rate": 0.00013176555168781451, + "loss": 1.5768, + "step": 818 + }, + { + "epoch": 0.8256048387096774, + "grad_norm": 0.09700962156057358, + "learning_rate": 0.00013161148308330257, + "loss": 1.5739, + "step": 819 + }, + { + "epoch": 0.8266129032258065, + "grad_norm": 0.10024348646402359, + "learning_rate": 0.00013145733106620532, + "loss": 1.6281, + "step": 820 + }, + { + "epoch": 0.8276209677419355, + "grad_norm": 0.09777159988880157, + "learning_rate": 0.00013130309604328057, + "loss": 1.6059, + "step": 821 + }, + { + "epoch": 0.8286290322580645, + "grad_norm": 0.0887807309627533, + "learning_rate": 0.00013114877842150516, + "loss": 1.5857, + "step": 822 + }, + { + "epoch": 0.8296370967741935, + "grad_norm": 0.09031641483306885, + "learning_rate": 0.000130994378608074, + "loss": 1.5523, + "step": 823 + }, + { + "epoch": 0.8306451612903226, + "grad_norm": 0.0985943153500557, + "learning_rate": 0.00013083989701039868, + "loss": 1.5464, + "step": 824 + }, + { + "epoch": 0.8316532258064516, + "grad_norm": 0.09250693768262863, + "learning_rate": 0.0001306853340361067, + "loss": 1.5564, + "step": 825 + }, + { + "epoch": 0.8326612903225806, + "grad_norm": 0.10353913903236389, + "learning_rate": 0.0001305306900930403, + "loss": 1.6126, + "step": 826 + }, + { + "epoch": 0.8336693548387096, + "grad_norm": 0.10408423840999603, + "learning_rate": 0.00013037596558925532, + "loss": 1.5946, + "step": 827 + }, + { + "epoch": 0.8346774193548387, + "grad_norm": 0.09186139702796936, + "learning_rate": 0.00013022116093302022, + "loss": 1.5692, + "step": 828 + }, + { + "epoch": 0.8356854838709677, + "grad_norm": 0.08551473915576935, + "learning_rate": 0.00013006627653281493, + "loss": 1.5486, + "step": 829 + }, + { + "epoch": 0.8366935483870968, + "grad_norm": 0.0928485244512558, + "learning_rate": 0.0001299113127973298, + "loss": 1.5435, + "step": 830 + }, + { + "epoch": 0.8377016129032258, + "grad_norm": 0.08251947164535522, + "learning_rate": 0.00012975627013546453, + "loss": 1.5519, + "step": 831 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 0.09292181581258774, + "learning_rate": 0.0001296011489563271, + "loss": 1.6129, + "step": 832 + }, + { + "epoch": 0.8397177419354839, + "grad_norm": 0.07900629937648773, + "learning_rate": 0.00012944594966923263, + "loss": 1.5951, + "step": 833 + }, + { + "epoch": 0.8407258064516129, + "grad_norm": 0.08966945856809616, + "learning_rate": 0.00012929067268370234, + "loss": 1.5484, + "step": 834 + }, + { + "epoch": 0.8417338709677419, + "grad_norm": 0.08244184404611588, + "learning_rate": 0.00012913531840946248, + "loss": 1.5852, + "step": 835 + }, + { + "epoch": 0.842741935483871, + "grad_norm": 0.0986471101641655, + "learning_rate": 0.00012897988725644335, + "loss": 1.5797, + "step": 836 + }, + { + "epoch": 0.84375, + "grad_norm": 0.09217972308397293, + "learning_rate": 0.0001288243796347779, + "loss": 1.6433, + "step": 837 + }, + { + "epoch": 0.844758064516129, + "grad_norm": 0.07959865033626556, + "learning_rate": 0.00012866879595480098, + "loss": 1.5639, + "step": 838 + }, + { + "epoch": 0.8457661290322581, + "grad_norm": 0.08987965434789658, + "learning_rate": 0.0001285131366270482, + "loss": 1.567, + "step": 839 + }, + { + "epoch": 0.8467741935483871, + "grad_norm": 0.08139210939407349, + "learning_rate": 0.00012835740206225464, + "loss": 1.5881, + "step": 840 + }, + { + "epoch": 0.8477822580645161, + "grad_norm": 0.09342298656702042, + "learning_rate": 0.00012820159267135396, + "loss": 1.6147, + "step": 841 + }, + { + "epoch": 0.8487903225806451, + "grad_norm": 0.08475241810083389, + "learning_rate": 0.0001280457088654773, + "loss": 1.6063, + "step": 842 + }, + { + "epoch": 0.8497983870967742, + "grad_norm": 0.0910174772143364, + "learning_rate": 0.00012788975105595214, + "loss": 1.6055, + "step": 843 + }, + { + "epoch": 0.8508064516129032, + "grad_norm": 0.08082278817892075, + "learning_rate": 0.00012773371965430115, + "loss": 1.5668, + "step": 844 + }, + { + "epoch": 0.8518145161290323, + "grad_norm": 0.0862516313791275, + "learning_rate": 0.00012757761507224132, + "loss": 1.5415, + "step": 845 + }, + { + "epoch": 0.8528225806451613, + "grad_norm": 0.07902859151363373, + "learning_rate": 0.00012742143772168264, + "loss": 1.5333, + "step": 846 + }, + { + "epoch": 0.8538306451612904, + "grad_norm": 0.090780109167099, + "learning_rate": 0.00012726518801472718, + "loss": 1.6311, + "step": 847 + }, + { + "epoch": 0.8548387096774194, + "grad_norm": 0.08239061385393143, + "learning_rate": 0.0001271088663636679, + "loss": 1.5331, + "step": 848 + }, + { + "epoch": 0.8558467741935484, + "grad_norm": 0.08999927341938019, + "learning_rate": 0.0001269524731809875, + "loss": 1.5775, + "step": 849 + }, + { + "epoch": 0.8568548387096774, + "grad_norm": 0.07954005897045135, + "learning_rate": 0.00012679600887935768, + "loss": 1.5969, + "step": 850 + }, + { + "epoch": 0.8578629032258065, + "grad_norm": 0.08286864310503006, + "learning_rate": 0.00012663947387163755, + "loss": 1.551, + "step": 851 + }, + { + "epoch": 0.8588709677419355, + "grad_norm": 0.08236175030469894, + "learning_rate": 0.00012648286857087294, + "loss": 1.5575, + "step": 852 + }, + { + "epoch": 0.8598790322580645, + "grad_norm": 0.08063997328281403, + "learning_rate": 0.00012632619339029508, + "loss": 1.5899, + "step": 853 + }, + { + "epoch": 0.8608870967741935, + "grad_norm": 0.08329153805971146, + "learning_rate": 0.00012616944874331963, + "loss": 1.5523, + "step": 854 + }, + { + "epoch": 0.8618951612903226, + "grad_norm": 0.08181768655776978, + "learning_rate": 0.00012601263504354555, + "loss": 1.5743, + "step": 855 + }, + { + "epoch": 0.8629032258064516, + "grad_norm": 0.07989370822906494, + "learning_rate": 0.00012585575270475402, + "loss": 1.5629, + "step": 856 + }, + { + "epoch": 0.8639112903225806, + "grad_norm": 0.0804544985294342, + "learning_rate": 0.00012569880214090726, + "loss": 1.5573, + "step": 857 + }, + { + "epoch": 0.8649193548387096, + "grad_norm": 0.08739953488111496, + "learning_rate": 0.0001255417837661476, + "loss": 1.5705, + "step": 858 + }, + { + "epoch": 0.8659274193548387, + "grad_norm": 0.08386445045471191, + "learning_rate": 0.00012538469799479627, + "loss": 1.6106, + "step": 859 + }, + { + "epoch": 0.8669354838709677, + "grad_norm": 0.10252925008535385, + "learning_rate": 0.00012522754524135228, + "loss": 1.5472, + "step": 860 + }, + { + "epoch": 0.8679435483870968, + "grad_norm": 0.08197301626205444, + "learning_rate": 0.0001250703259204916, + "loss": 1.5955, + "step": 861 + }, + { + "epoch": 0.8689516129032258, + "grad_norm": 0.09445837140083313, + "learning_rate": 0.00012491304044706553, + "loss": 1.5536, + "step": 862 + }, + { + "epoch": 0.8699596774193549, + "grad_norm": 0.0779092088341713, + "learning_rate": 0.00012475568923610015, + "loss": 1.5235, + "step": 863 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 0.08657954633235931, + "learning_rate": 0.00012459827270279499, + "loss": 1.5306, + "step": 864 + }, + { + "epoch": 0.8719758064516129, + "grad_norm": 0.08000969886779785, + "learning_rate": 0.0001244407912625218, + "loss": 1.5451, + "step": 865 + }, + { + "epoch": 0.8729838709677419, + "grad_norm": 0.1217707023024559, + "learning_rate": 0.00012428324533082376, + "loss": 1.5896, + "step": 866 + }, + { + "epoch": 0.873991935483871, + "grad_norm": 0.09770061075687408, + "learning_rate": 0.00012412563532341413, + "loss": 1.5649, + "step": 867 + }, + { + "epoch": 0.875, + "grad_norm": 0.08925329893827438, + "learning_rate": 0.0001239679616561753, + "loss": 1.59, + "step": 868 + }, + { + "epoch": 0.876008064516129, + "grad_norm": 0.0919514149427414, + "learning_rate": 0.0001238102247451575, + "loss": 1.6517, + "step": 869 + }, + { + "epoch": 0.8770161290322581, + "grad_norm": 0.0922718271613121, + "learning_rate": 0.0001236524250065781, + "loss": 1.6104, + "step": 870 + }, + { + "epoch": 0.8780241935483871, + "grad_norm": 0.08782748132944107, + "learning_rate": 0.00012349456285682002, + "loss": 1.6027, + "step": 871 + }, + { + "epoch": 0.8790322580645161, + "grad_norm": 0.08689384907484055, + "learning_rate": 0.00012333663871243094, + "loss": 1.5969, + "step": 872 + }, + { + "epoch": 0.8800403225806451, + "grad_norm": 0.08294008672237396, + "learning_rate": 0.00012317865299012212, + "loss": 1.5852, + "step": 873 + }, + { + "epoch": 0.8810483870967742, + "grad_norm": 0.1106681302189827, + "learning_rate": 0.00012302060610676737, + "loss": 1.622, + "step": 874 + }, + { + "epoch": 0.8820564516129032, + "grad_norm": 0.10415118932723999, + "learning_rate": 0.00012286249847940178, + "loss": 1.6416, + "step": 875 + }, + { + "epoch": 0.8830645161290323, + "grad_norm": 0.08293262124061584, + "learning_rate": 0.00012270433052522073, + "loss": 1.5963, + "step": 876 + }, + { + "epoch": 0.8840725806451613, + "grad_norm": 0.09230700880289078, + "learning_rate": 0.0001225461026615789, + "loss": 1.6242, + "step": 877 + }, + { + "epoch": 0.8850806451612904, + "grad_norm": 0.08799263834953308, + "learning_rate": 0.00012238781530598896, + "loss": 1.5607, + "step": 878 + }, + { + "epoch": 0.8860887096774194, + "grad_norm": 0.08640427887439728, + "learning_rate": 0.00012222946887612056, + "loss": 1.6114, + "step": 879 + }, + { + "epoch": 0.8870967741935484, + "grad_norm": 0.08553026616573334, + "learning_rate": 0.0001220710637897992, + "loss": 1.5549, + "step": 880 + }, + { + "epoch": 0.8881048387096774, + "grad_norm": 0.0878986194729805, + "learning_rate": 0.00012191260046500525, + "loss": 1.5697, + "step": 881 + }, + { + "epoch": 0.8891129032258065, + "grad_norm": 0.08509572595357895, + "learning_rate": 0.00012175407931987273, + "loss": 1.6237, + "step": 882 + }, + { + "epoch": 0.8901209677419355, + "grad_norm": 0.09629905223846436, + "learning_rate": 0.0001215955007726881, + "loss": 1.5869, + "step": 883 + }, + { + "epoch": 0.8911290322580645, + "grad_norm": 0.07942201942205429, + "learning_rate": 0.00012143686524188954, + "loss": 1.5933, + "step": 884 + }, + { + "epoch": 0.8921370967741935, + "grad_norm": 0.0878920629620552, + "learning_rate": 0.00012127817314606526, + "loss": 1.5485, + "step": 885 + }, + { + "epoch": 0.8931451612903226, + "grad_norm": 0.07961869984865189, + "learning_rate": 0.00012111942490395305, + "loss": 1.571, + "step": 886 + }, + { + "epoch": 0.8941532258064516, + "grad_norm": 0.08690143376588821, + "learning_rate": 0.00012096062093443863, + "loss": 1.5437, + "step": 887 + }, + { + "epoch": 0.8951612903225806, + "grad_norm": 0.08331328630447388, + "learning_rate": 0.00012080176165655488, + "loss": 1.5967, + "step": 888 + }, + { + "epoch": 0.8961693548387096, + "grad_norm": 0.08849766850471497, + "learning_rate": 0.00012064284748948053, + "loss": 1.6156, + "step": 889 + }, + { + "epoch": 0.8971774193548387, + "grad_norm": 0.08413555473089218, + "learning_rate": 0.00012048387885253925, + "loss": 1.5603, + "step": 890 + }, + { + "epoch": 0.8981854838709677, + "grad_norm": 0.08616600930690765, + "learning_rate": 0.0001203248561651984, + "loss": 1.5682, + "step": 891 + }, + { + "epoch": 0.8991935483870968, + "grad_norm": 0.08520584553480148, + "learning_rate": 0.00012016577984706792, + "loss": 1.6327, + "step": 892 + }, + { + "epoch": 0.9002016129032258, + "grad_norm": 0.08620157837867737, + "learning_rate": 0.0001200066503178993, + "loss": 1.6143, + "step": 893 + }, + { + "epoch": 0.9012096774193549, + "grad_norm": 0.07895144820213318, + "learning_rate": 0.00011984746799758442, + "loss": 1.5533, + "step": 894 + }, + { + "epoch": 0.9022177419354839, + "grad_norm": 0.08743470162153244, + "learning_rate": 0.0001196882333061545, + "loss": 1.6004, + "step": 895 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 0.08172673732042313, + "learning_rate": 0.0001195289466637789, + "loss": 1.6032, + "step": 896 + }, + { + "epoch": 0.9042338709677419, + "grad_norm": 0.09668843448162079, + "learning_rate": 0.00011936960849076411, + "loss": 1.6198, + "step": 897 + }, + { + "epoch": 0.905241935483871, + "grad_norm": 0.08503922075033188, + "learning_rate": 0.00011921021920755253, + "loss": 1.5638, + "step": 898 + }, + { + "epoch": 0.90625, + "grad_norm": 0.0889093279838562, + "learning_rate": 0.00011905077923472146, + "loss": 1.624, + "step": 899 + }, + { + "epoch": 0.907258064516129, + "grad_norm": 0.08409906178712845, + "learning_rate": 0.00011889128899298198, + "loss": 1.5562, + "step": 900 + }, + { + "epoch": 0.9082661290322581, + "grad_norm": 0.08293265849351883, + "learning_rate": 0.00011873174890317775, + "loss": 1.5709, + "step": 901 + }, + { + "epoch": 0.9092741935483871, + "grad_norm": 0.09479732066392899, + "learning_rate": 0.00011857215938628403, + "loss": 1.6222, + "step": 902 + }, + { + "epoch": 0.9102822580645161, + "grad_norm": 0.08044169843196869, + "learning_rate": 0.00011841252086340649, + "loss": 1.5862, + "step": 903 + }, + { + "epoch": 0.9112903225806451, + "grad_norm": 0.08543860912322998, + "learning_rate": 0.00011825283375578005, + "loss": 1.5764, + "step": 904 + }, + { + "epoch": 0.9122983870967742, + "grad_norm": 0.08160272240638733, + "learning_rate": 0.0001180930984847679, + "loss": 1.5204, + "step": 905 + }, + { + "epoch": 0.9133064516129032, + "grad_norm": 0.10486453771591187, + "learning_rate": 0.00011793331547186026, + "loss": 1.5921, + "step": 906 + }, + { + "epoch": 0.9143145161290323, + "grad_norm": 0.0780840739607811, + "learning_rate": 0.00011777348513867341, + "loss": 1.5173, + "step": 907 + }, + { + "epoch": 0.9153225806451613, + "grad_norm": 0.08347219228744507, + "learning_rate": 0.00011761360790694837, + "loss": 1.5543, + "step": 908 + }, + { + "epoch": 0.9163306451612904, + "grad_norm": 0.09629109501838684, + "learning_rate": 0.00011745368419855005, + "loss": 1.6039, + "step": 909 + }, + { + "epoch": 0.9173387096774194, + "grad_norm": 0.08534412831068039, + "learning_rate": 0.00011729371443546587, + "loss": 1.5787, + "step": 910 + }, + { + "epoch": 0.9183467741935484, + "grad_norm": 0.08703077584505081, + "learning_rate": 0.00011713369903980485, + "loss": 1.6218, + "step": 911 + }, + { + "epoch": 0.9193548387096774, + "grad_norm": 0.08057136088609695, + "learning_rate": 0.00011697363843379641, + "loss": 1.5475, + "step": 912 + }, + { + "epoch": 0.9203629032258065, + "grad_norm": 0.09287240356206894, + "learning_rate": 0.00011681353303978924, + "loss": 1.5587, + "step": 913 + }, + { + "epoch": 0.9213709677419355, + "grad_norm": 0.08380912989377975, + "learning_rate": 0.00011665338328025027, + "loss": 1.6194, + "step": 914 + }, + { + "epoch": 0.9223790322580645, + "grad_norm": 0.08018894493579865, + "learning_rate": 0.00011649318957776336, + "loss": 1.545, + "step": 915 + }, + { + "epoch": 0.9233870967741935, + "grad_norm": 0.07932014018297195, + "learning_rate": 0.00011633295235502851, + "loss": 1.5688, + "step": 916 + }, + { + "epoch": 0.9243951612903226, + "grad_norm": 0.08409032970666885, + "learning_rate": 0.0001161726720348604, + "loss": 1.5354, + "step": 917 + }, + { + "epoch": 0.9254032258064516, + "grad_norm": 0.07981358468532562, + "learning_rate": 0.00011601234904018751, + "loss": 1.5604, + "step": 918 + }, + { + "epoch": 0.9264112903225806, + "grad_norm": 0.0860762745141983, + "learning_rate": 0.00011585198379405092, + "loss": 1.5857, + "step": 919 + }, + { + "epoch": 0.9274193548387096, + "grad_norm": 0.09491165727376938, + "learning_rate": 0.00011569157671960316, + "loss": 1.5479, + "step": 920 + }, + { + "epoch": 0.9284274193548387, + "grad_norm": 0.08277281373739243, + "learning_rate": 0.00011553112824010716, + "loss": 1.5773, + "step": 921 + }, + { + "epoch": 0.9294354838709677, + "grad_norm": 0.08350729942321777, + "learning_rate": 0.00011537063877893513, + "loss": 1.5508, + "step": 922 + }, + { + "epoch": 0.9304435483870968, + "grad_norm": 0.08306790888309479, + "learning_rate": 0.00011521010875956734, + "loss": 1.5807, + "step": 923 + }, + { + "epoch": 0.9314516129032258, + "grad_norm": 0.07756998389959335, + "learning_rate": 0.00011504953860559116, + "loss": 1.546, + "step": 924 + }, + { + "epoch": 0.9324596774193549, + "grad_norm": 0.08689188212156296, + "learning_rate": 0.00011488892874069981, + "loss": 1.5929, + "step": 925 + }, + { + "epoch": 0.9334677419354839, + "grad_norm": 0.08053242415189743, + "learning_rate": 0.00011472827958869133, + "loss": 1.5578, + "step": 926 + }, + { + "epoch": 0.9344758064516129, + "grad_norm": 0.08326185494661331, + "learning_rate": 0.0001145675915734674, + "loss": 1.544, + "step": 927 + }, + { + "epoch": 0.9354838709677419, + "grad_norm": 0.08363624662160873, + "learning_rate": 0.00011440686511903223, + "loss": 1.5564, + "step": 928 + }, + { + "epoch": 0.936491935483871, + "grad_norm": 0.08229418098926544, + "learning_rate": 0.00011424610064949153, + "loss": 1.5542, + "step": 929 + }, + { + "epoch": 0.9375, + "grad_norm": 0.09776529669761658, + "learning_rate": 0.00011408529858905126, + "loss": 1.5353, + "step": 930 + }, + { + "epoch": 0.938508064516129, + "grad_norm": 0.09616075456142426, + "learning_rate": 0.0001139244593620166, + "loss": 1.6193, + "step": 931 + }, + { + "epoch": 0.9395161290322581, + "grad_norm": 0.1000729650259018, + "learning_rate": 0.00011376358339279076, + "loss": 1.633, + "step": 932 + }, + { + "epoch": 0.9405241935483871, + "grad_norm": 0.08457247912883759, + "learning_rate": 0.00011360267110587393, + "loss": 1.5798, + "step": 933 + }, + { + "epoch": 0.9415322580645161, + "grad_norm": 0.07730599492788315, + "learning_rate": 0.00011344172292586217, + "loss": 1.5163, + "step": 934 + }, + { + "epoch": 0.9425403225806451, + "grad_norm": 0.09660627692937851, + "learning_rate": 0.00011328073927744616, + "loss": 1.6322, + "step": 935 + }, + { + "epoch": 0.9435483870967742, + "grad_norm": 0.08001486957073212, + "learning_rate": 0.00011311972058541023, + "loss": 1.5755, + "step": 936 + }, + { + "epoch": 0.9445564516129032, + "grad_norm": 0.08265230059623718, + "learning_rate": 0.0001129586672746312, + "loss": 1.61, + "step": 937 + }, + { + "epoch": 0.9455645161290323, + "grad_norm": 0.09588516503572464, + "learning_rate": 0.00011279757977007717, + "loss": 1.6023, + "step": 938 + }, + { + "epoch": 0.9465725806451613, + "grad_norm": 0.0791090875864029, + "learning_rate": 0.0001126364584968065, + "loss": 1.5158, + "step": 939 + }, + { + "epoch": 0.9475806451612904, + "grad_norm": 0.09306017309427261, + "learning_rate": 0.00011247530387996668, + "loss": 1.5724, + "step": 940 + }, + { + "epoch": 0.9485887096774194, + "grad_norm": 0.08578615635633469, + "learning_rate": 0.00011231411634479316, + "loss": 1.5692, + "step": 941 + }, + { + "epoch": 0.9495967741935484, + "grad_norm": 0.0851496234536171, + "learning_rate": 0.00011215289631660823, + "loss": 1.5677, + "step": 942 + }, + { + "epoch": 0.9506048387096774, + "grad_norm": 0.08048581331968307, + "learning_rate": 0.00011199164422081995, + "loss": 1.5537, + "step": 943 + }, + { + "epoch": 0.9516129032258065, + "grad_norm": 0.08106819540262222, + "learning_rate": 0.000111830360482921, + "loss": 1.5296, + "step": 944 + }, + { + "epoch": 0.9526209677419355, + "grad_norm": 0.07831558585166931, + "learning_rate": 0.00011166904552848749, + "loss": 1.5503, + "step": 945 + }, + { + "epoch": 0.9536290322580645, + "grad_norm": 0.07931654155254364, + "learning_rate": 0.000111507699783178, + "loss": 1.5592, + "step": 946 + }, + { + "epoch": 0.9546370967741935, + "grad_norm": 0.07992593944072723, + "learning_rate": 0.0001113463236727323, + "loss": 1.5671, + "step": 947 + }, + { + "epoch": 0.9556451612903226, + "grad_norm": 0.08474520593881607, + "learning_rate": 0.00011118491762297027, + "loss": 1.5699, + "step": 948 + }, + { + "epoch": 0.9566532258064516, + "grad_norm": 0.08235491812229156, + "learning_rate": 0.0001110234820597908, + "loss": 1.5671, + "step": 949 + }, + { + "epoch": 0.9576612903225806, + "grad_norm": 0.09822028130292892, + "learning_rate": 0.00011086201740917075, + "loss": 1.6389, + "step": 950 + }, + { + "epoch": 0.9586693548387096, + "grad_norm": 0.08909379690885544, + "learning_rate": 0.00011070052409716354, + "loss": 1.6273, + "step": 951 + }, + { + "epoch": 0.9596774193548387, + "grad_norm": 0.08938673883676529, + "learning_rate": 0.00011053900254989837, + "loss": 1.5894, + "step": 952 + }, + { + "epoch": 0.9606854838709677, + "grad_norm": 0.08622390776872635, + "learning_rate": 0.00011037745319357893, + "loss": 1.6217, + "step": 953 + }, + { + "epoch": 0.9616935483870968, + "grad_norm": 0.08985532820224762, + "learning_rate": 0.00011021587645448222, + "loss": 1.6432, + "step": 954 + }, + { + "epoch": 0.9627016129032258, + "grad_norm": 0.08598313480615616, + "learning_rate": 0.00011005427275895756, + "loss": 1.54, + "step": 955 + }, + { + "epoch": 0.9637096774193549, + "grad_norm": 0.0815306007862091, + "learning_rate": 0.00010989264253342538, + "loss": 1.5172, + "step": 956 + }, + { + "epoch": 0.9647177419354839, + "grad_norm": 0.09671612083911896, + "learning_rate": 0.00010973098620437609, + "loss": 1.6054, + "step": 957 + }, + { + "epoch": 0.9657258064516129, + "grad_norm": 0.0809609442949295, + "learning_rate": 0.00010956930419836899, + "loss": 1.528, + "step": 958 + }, + { + "epoch": 0.9667338709677419, + "grad_norm": 0.08456597477197647, + "learning_rate": 0.0001094075969420312, + "loss": 1.5383, + "step": 959 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.09076231718063354, + "learning_rate": 0.00010924586486205632, + "loss": 1.5948, + "step": 960 + }, + { + "epoch": 0.96875, + "grad_norm": 0.08709228038787842, + "learning_rate": 0.00010908410838520362, + "loss": 1.5425, + "step": 961 + }, + { + "epoch": 0.969758064516129, + "grad_norm": 0.09060946106910706, + "learning_rate": 0.00010892232793829659, + "loss": 1.57, + "step": 962 + }, + { + "epoch": 0.9707661290322581, + "grad_norm": 0.0881752297282219, + "learning_rate": 0.0001087605239482221, + "loss": 1.5874, + "step": 963 + }, + { + "epoch": 0.9717741935483871, + "grad_norm": 0.086030974984169, + "learning_rate": 0.00010859869684192907, + "loss": 1.5792, + "step": 964 + }, + { + "epoch": 0.9727822580645161, + "grad_norm": 0.0817110538482666, + "learning_rate": 0.00010843684704642744, + "loss": 1.5506, + "step": 965 + }, + { + "epoch": 0.9737903225806451, + "grad_norm": 0.08721321821212769, + "learning_rate": 0.00010827497498878703, + "loss": 1.5907, + "step": 966 + }, + { + "epoch": 0.9747983870967742, + "grad_norm": 0.07887570559978485, + "learning_rate": 0.00010811308109613634, + "loss": 1.578, + "step": 967 + }, + { + "epoch": 0.9758064516129032, + "grad_norm": 0.11064060032367706, + "learning_rate": 0.00010795116579566158, + "loss": 1.6, + "step": 968 + }, + { + "epoch": 0.9768145161290323, + "grad_norm": 0.08672841638326645, + "learning_rate": 0.00010778922951460537, + "loss": 1.5872, + "step": 969 + }, + { + "epoch": 0.9778225806451613, + "grad_norm": 0.08424878865480423, + "learning_rate": 0.00010762727268026571, + "loss": 1.5698, + "step": 970 + }, + { + "epoch": 0.9788306451612904, + "grad_norm": 0.08876322209835052, + "learning_rate": 0.00010746529571999491, + "loss": 1.5775, + "step": 971 + }, + { + "epoch": 0.9798387096774194, + "grad_norm": 0.08440111577510834, + "learning_rate": 0.00010730329906119822, + "loss": 1.5574, + "step": 972 + }, + { + "epoch": 0.9808467741935484, + "grad_norm": 0.08397315442562103, + "learning_rate": 0.00010714128313133307, + "loss": 1.6166, + "step": 973 + }, + { + "epoch": 0.9818548387096774, + "grad_norm": 0.09894799441099167, + "learning_rate": 0.00010697924835790758, + "loss": 1.6352, + "step": 974 + }, + { + "epoch": 0.9828629032258065, + "grad_norm": 0.08329147845506668, + "learning_rate": 0.00010681719516847968, + "loss": 1.555, + "step": 975 + }, + { + "epoch": 0.9838709677419355, + "grad_norm": 0.08748366683721542, + "learning_rate": 0.00010665512399065582, + "loss": 1.5519, + "step": 976 + }, + { + "epoch": 0.9848790322580645, + "grad_norm": 0.08558699488639832, + "learning_rate": 0.00010649303525209005, + "loss": 1.5762, + "step": 977 + }, + { + "epoch": 0.9858870967741935, + "grad_norm": 0.11034592986106873, + "learning_rate": 0.00010633092938048257, + "loss": 1.5972, + "step": 978 + }, + { + "epoch": 0.9868951612903226, + "grad_norm": 0.08514732867479324, + "learning_rate": 0.00010616880680357892, + "loss": 1.5625, + "step": 979 + }, + { + "epoch": 0.9879032258064516, + "grad_norm": 0.09123446047306061, + "learning_rate": 0.00010600666794916871, + "loss": 1.5516, + "step": 980 + }, + { + "epoch": 0.9889112903225806, + "grad_norm": 0.08317586034536362, + "learning_rate": 0.00010584451324508444, + "loss": 1.6043, + "step": 981 + }, + { + "epoch": 0.9899193548387096, + "grad_norm": 0.09369304031133652, + "learning_rate": 0.00010568234311920051, + "loss": 1.5575, + "step": 982 + }, + { + "epoch": 0.9909274193548387, + "grad_norm": 0.08730312436819077, + "learning_rate": 0.00010552015799943193, + "loss": 1.5848, + "step": 983 + }, + { + "epoch": 0.9919354838709677, + "grad_norm": 0.08520778268575668, + "learning_rate": 0.00010535795831373337, + "loss": 1.5697, + "step": 984 + }, + { + "epoch": 0.9929435483870968, + "grad_norm": 0.08985403925180435, + "learning_rate": 0.00010519574449009784, + "loss": 1.546, + "step": 985 + }, + { + "epoch": 0.9939516129032258, + "grad_norm": 0.08371421694755554, + "learning_rate": 0.0001050335169565557, + "loss": 1.5724, + "step": 986 + }, + { + "epoch": 0.9949596774193549, + "grad_norm": 0.08613915741443634, + "learning_rate": 0.00010487127614117352, + "loss": 1.5974, + "step": 987 + }, + { + "epoch": 0.9959677419354839, + "grad_norm": 0.09249399602413177, + "learning_rate": 0.00010470902247205283, + "loss": 1.6173, + "step": 988 + }, + { + "epoch": 0.9969758064516129, + "grad_norm": 0.09915943443775177, + "learning_rate": 0.00010454675637732916, + "loss": 1.5947, + "step": 989 + }, + { + "epoch": 0.9979838709677419, + "grad_norm": 0.0846395492553711, + "learning_rate": 0.00010438447828517077, + "loss": 1.5243, + "step": 990 + }, + { + "epoch": 0.998991935483871, + "grad_norm": 0.08313705772161484, + "learning_rate": 0.00010422218862377764, + "loss": 1.5333, + "step": 991 + }, + { + "epoch": 1.0, + "grad_norm": 0.08256080746650696, + "learning_rate": 0.00010405988782138019, + "loss": 1.5527, + "step": 992 + }, + { + "epoch": 1.001008064516129, + "grad_norm": 0.09215422719717026, + "learning_rate": 0.00010389757630623831, + "loss": 1.5035, + "step": 993 + }, + { + "epoch": 1.002016129032258, + "grad_norm": 0.08784796297550201, + "learning_rate": 0.00010373525450664016, + "loss": 1.5397, + "step": 994 + }, + { + "epoch": 1.003024193548387, + "grad_norm": 0.08578605949878693, + "learning_rate": 0.000103572922850901, + "loss": 1.5449, + "step": 995 + }, + { + "epoch": 1.0040322580645162, + "grad_norm": 0.09281399846076965, + "learning_rate": 0.00010341058176736207, + "loss": 1.4507, + "step": 996 + }, + { + "epoch": 1.0050403225806452, + "grad_norm": 0.09404852986335754, + "learning_rate": 0.00010324823168438953, + "loss": 1.4817, + "step": 997 + }, + { + "epoch": 1.0060483870967742, + "grad_norm": 0.0944603756070137, + "learning_rate": 0.00010308587303037334, + "loss": 1.536, + "step": 998 + }, + { + "epoch": 1.0070564516129032, + "grad_norm": 0.11103025823831558, + "learning_rate": 0.00010292350623372598, + "loss": 1.5278, + "step": 999 + }, + { + "epoch": 1.0080645161290323, + "grad_norm": 0.0859605222940445, + "learning_rate": 0.00010276113172288144, + "loss": 1.4855, + "step": 1000 + }, + { + "epoch": 1.0090725806451613, + "grad_norm": 0.08268768340349197, + "learning_rate": 0.0001025987499262941, + "loss": 1.4975, + "step": 1001 + }, + { + "epoch": 1.0100806451612903, + "grad_norm": 0.09450601041316986, + "learning_rate": 0.00010243636127243754, + "loss": 1.5052, + "step": 1002 + }, + { + "epoch": 1.0110887096774193, + "grad_norm": 0.10182943195104599, + "learning_rate": 0.00010227396618980344, + "loss": 1.5889, + "step": 1003 + }, + { + "epoch": 1.0120967741935485, + "grad_norm": 0.10887010395526886, + "learning_rate": 0.00010211156510690043, + "loss": 1.5387, + "step": 1004 + }, + { + "epoch": 1.0131048387096775, + "grad_norm": 0.09432150423526764, + "learning_rate": 0.00010194915845225304, + "loss": 1.51, + "step": 1005 + }, + { + "epoch": 1.0141129032258065, + "grad_norm": 0.0892212763428688, + "learning_rate": 0.00010178674665440034, + "loss": 1.4975, + "step": 1006 + }, + { + "epoch": 1.0151209677419355, + "grad_norm": 0.08749305456876755, + "learning_rate": 0.00010162433014189519, + "loss": 1.5303, + "step": 1007 + }, + { + "epoch": 1.0161290322580645, + "grad_norm": 0.09416648000478745, + "learning_rate": 0.00010146190934330268, + "loss": 1.499, + "step": 1008 + }, + { + "epoch": 1.0171370967741935, + "grad_norm": 0.10288472473621368, + "learning_rate": 0.00010129948468719939, + "loss": 1.4785, + "step": 1009 + }, + { + "epoch": 1.0181451612903225, + "grad_norm": 0.08718498051166534, + "learning_rate": 0.00010113705660217197, + "loss": 1.5045, + "step": 1010 + }, + { + "epoch": 1.0191532258064515, + "grad_norm": 0.08473226428031921, + "learning_rate": 0.00010097462551681612, + "loss": 1.4799, + "step": 1011 + }, + { + "epoch": 1.0201612903225807, + "grad_norm": 0.09531670063734055, + "learning_rate": 0.00010081219185973552, + "loss": 1.545, + "step": 1012 + }, + { + "epoch": 1.0211693548387097, + "grad_norm": 0.08223138749599457, + "learning_rate": 0.00010064975605954054, + "loss": 1.4807, + "step": 1013 + }, + { + "epoch": 1.0221774193548387, + "grad_norm": 0.08815553784370422, + "learning_rate": 0.00010048731854484735, + "loss": 1.47, + "step": 1014 + }, + { + "epoch": 1.0231854838709677, + "grad_norm": 0.09323311597108841, + "learning_rate": 0.00010032487974427645, + "loss": 1.5823, + "step": 1015 + }, + { + "epoch": 1.0241935483870968, + "grad_norm": 0.1007145345211029, + "learning_rate": 0.00010016244008645195, + "loss": 1.4864, + "step": 1016 + }, + { + "epoch": 1.0252016129032258, + "grad_norm": 0.09309312701225281, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 1017 + }, + { + "epoch": 1.0262096774193548, + "grad_norm": 0.08557573705911636, + "learning_rate": 9.983755991354809e-05, + "loss": 1.5165, + "step": 1018 + }, + { + "epoch": 1.0272177419354838, + "grad_norm": 0.10075996816158295, + "learning_rate": 9.967512025572356e-05, + "loss": 1.5106, + "step": 1019 + }, + { + "epoch": 1.028225806451613, + "grad_norm": 0.08483249694108963, + "learning_rate": 9.951268145515269e-05, + "loss": 1.4974, + "step": 1020 + }, + { + "epoch": 1.029233870967742, + "grad_norm": 0.11874374747276306, + "learning_rate": 9.935024394045948e-05, + "loss": 1.5622, + "step": 1021 + }, + { + "epoch": 1.030241935483871, + "grad_norm": 0.11608150601387024, + "learning_rate": 9.918780814026452e-05, + "loss": 1.5636, + "step": 1022 + }, + { + "epoch": 1.03125, + "grad_norm": 0.11097010225057602, + "learning_rate": 9.90253744831839e-05, + "loss": 1.5388, + "step": 1023 + }, + { + "epoch": 1.032258064516129, + "grad_norm": 0.08821584284305573, + "learning_rate": 9.886294339782805e-05, + "loss": 1.4808, + "step": 1024 + }, + { + "epoch": 1.033266129032258, + "grad_norm": 0.08906351774930954, + "learning_rate": 9.870051531280064e-05, + "loss": 1.4567, + "step": 1025 + }, + { + "epoch": 1.034274193548387, + "grad_norm": 0.08993887901306152, + "learning_rate": 9.853809065669733e-05, + "loss": 1.5174, + "step": 1026 + }, + { + "epoch": 1.0352822580645162, + "grad_norm": 0.0829705148935318, + "learning_rate": 9.837566985810484e-05, + "loss": 1.5275, + "step": 1027 + }, + { + "epoch": 1.0362903225806452, + "grad_norm": 0.09338941425085068, + "learning_rate": 9.821325334559967e-05, + "loss": 1.5197, + "step": 1028 + }, + { + "epoch": 1.0372983870967742, + "grad_norm": 0.0843081921339035, + "learning_rate": 9.8050841547747e-05, + "loss": 1.5121, + "step": 1029 + }, + { + "epoch": 1.0383064516129032, + "grad_norm": 0.09108688682317734, + "learning_rate": 9.78884348930996e-05, + "loss": 1.5642, + "step": 1030 + }, + { + "epoch": 1.0393145161290323, + "grad_norm": 0.08404973894357681, + "learning_rate": 9.772603381019658e-05, + "loss": 1.4552, + "step": 1031 + }, + { + "epoch": 1.0403225806451613, + "grad_norm": 0.08852069824934006, + "learning_rate": 9.756363872756249e-05, + "loss": 1.5511, + "step": 1032 + }, + { + "epoch": 1.0413306451612903, + "grad_norm": 0.08855357021093369, + "learning_rate": 9.740125007370592e-05, + "loss": 1.5341, + "step": 1033 + }, + { + "epoch": 1.0423387096774193, + "grad_norm": 0.08306348323822021, + "learning_rate": 9.723886827711857e-05, + "loss": 1.4941, + "step": 1034 + }, + { + "epoch": 1.0433467741935485, + "grad_norm": 0.11460579931735992, + "learning_rate": 9.707649376627406e-05, + "loss": 1.541, + "step": 1035 + }, + { + "epoch": 1.0443548387096775, + "grad_norm": 0.0861547440290451, + "learning_rate": 9.691412696962667e-05, + "loss": 1.5364, + "step": 1036 + }, + { + "epoch": 1.0453629032258065, + "grad_norm": 0.092412069439888, + "learning_rate": 9.675176831561048e-05, + "loss": 1.5179, + "step": 1037 + }, + { + "epoch": 1.0463709677419355, + "grad_norm": 0.08788943290710449, + "learning_rate": 9.658941823263797e-05, + "loss": 1.4936, + "step": 1038 + }, + { + "epoch": 1.0473790322580645, + "grad_norm": 0.08519960939884186, + "learning_rate": 9.642707714909904e-05, + "loss": 1.539, + "step": 1039 + }, + { + "epoch": 1.0483870967741935, + "grad_norm": 0.08832072466611862, + "learning_rate": 9.626474549335986e-05, + "loss": 1.5077, + "step": 1040 + }, + { + "epoch": 1.0493951612903225, + "grad_norm": 0.10505390167236328, + "learning_rate": 9.61024236937617e-05, + "loss": 1.5432, + "step": 1041 + }, + { + "epoch": 1.0504032258064515, + "grad_norm": 0.09197022020816803, + "learning_rate": 9.594011217861982e-05, + "loss": 1.5595, + "step": 1042 + }, + { + "epoch": 1.0514112903225807, + "grad_norm": 0.0843205377459526, + "learning_rate": 9.577781137622238e-05, + "loss": 1.4353, + "step": 1043 + }, + { + "epoch": 1.0524193548387097, + "grad_norm": 0.10806506127119064, + "learning_rate": 9.561552171482925e-05, + "loss": 1.515, + "step": 1044 + }, + { + "epoch": 1.0534274193548387, + "grad_norm": 0.08592282235622406, + "learning_rate": 9.545324362267086e-05, + "loss": 1.5279, + "step": 1045 + }, + { + "epoch": 1.0544354838709677, + "grad_norm": 0.11082509160041809, + "learning_rate": 9.52909775279472e-05, + "loss": 1.5395, + "step": 1046 + }, + { + "epoch": 1.0554435483870968, + "grad_norm": 0.08529554307460785, + "learning_rate": 9.51287238588265e-05, + "loss": 1.4849, + "step": 1047 + }, + { + "epoch": 1.0564516129032258, + "grad_norm": 0.08765090256929398, + "learning_rate": 9.496648304344433e-05, + "loss": 1.4944, + "step": 1048 + }, + { + "epoch": 1.0574596774193548, + "grad_norm": 0.08893377333879471, + "learning_rate": 9.480425550990219e-05, + "loss": 1.5, + "step": 1049 + }, + { + "epoch": 1.0584677419354838, + "grad_norm": 0.09724058210849762, + "learning_rate": 9.464204168626665e-05, + "loss": 1.5281, + "step": 1050 + }, + { + "epoch": 1.059475806451613, + "grad_norm": 0.0883408635854721, + "learning_rate": 9.447984200056808e-05, + "loss": 1.5211, + "step": 1051 + }, + { + "epoch": 1.060483870967742, + "grad_norm": 0.08431454002857208, + "learning_rate": 9.43176568807995e-05, + "loss": 1.5175, + "step": 1052 + }, + { + "epoch": 1.061491935483871, + "grad_norm": 0.09407296776771545, + "learning_rate": 9.415548675491559e-05, + "loss": 1.5722, + "step": 1053 + }, + { + "epoch": 1.0625, + "grad_norm": 0.08895613998174667, + "learning_rate": 9.399333205083131e-05, + "loss": 1.5702, + "step": 1054 + }, + { + "epoch": 1.063508064516129, + "grad_norm": 0.08799167722463608, + "learning_rate": 9.38311931964211e-05, + "loss": 1.5531, + "step": 1055 + }, + { + "epoch": 1.064516129032258, + "grad_norm": 0.08785036206245422, + "learning_rate": 9.366907061951745e-05, + "loss": 1.5398, + "step": 1056 + }, + { + "epoch": 1.065524193548387, + "grad_norm": 0.10027193278074265, + "learning_rate": 9.350696474790999e-05, + "loss": 1.5256, + "step": 1057 + }, + { + "epoch": 1.066532258064516, + "grad_norm": 0.08771440386772156, + "learning_rate": 9.334487600934416e-05, + "loss": 1.5085, + "step": 1058 + }, + { + "epoch": 1.0675403225806452, + "grad_norm": 0.08703982830047607, + "learning_rate": 9.318280483152033e-05, + "loss": 1.4827, + "step": 1059 + }, + { + "epoch": 1.0685483870967742, + "grad_norm": 0.08651833981275558, + "learning_rate": 9.302075164209241e-05, + "loss": 1.5255, + "step": 1060 + }, + { + "epoch": 1.0695564516129032, + "grad_norm": 0.0868133008480072, + "learning_rate": 9.285871686866692e-05, + "loss": 1.4953, + "step": 1061 + }, + { + "epoch": 1.0705645161290323, + "grad_norm": 0.08549060672521591, + "learning_rate": 9.269670093880177e-05, + "loss": 1.5239, + "step": 1062 + }, + { + "epoch": 1.0715725806451613, + "grad_norm": 0.08664209395647049, + "learning_rate": 9.25347042800051e-05, + "loss": 1.5328, + "step": 1063 + }, + { + "epoch": 1.0725806451612903, + "grad_norm": 0.0853060856461525, + "learning_rate": 9.237272731973428e-05, + "loss": 1.4854, + "step": 1064 + }, + { + "epoch": 1.0735887096774193, + "grad_norm": 0.10764405876398087, + "learning_rate": 9.221077048539464e-05, + "loss": 1.5174, + "step": 1065 + }, + { + "epoch": 1.0745967741935485, + "grad_norm": 0.09327509254217148, + "learning_rate": 9.204883420433844e-05, + "loss": 1.5074, + "step": 1066 + }, + { + "epoch": 1.0756048387096775, + "grad_norm": 0.08912849426269531, + "learning_rate": 9.188691890386367e-05, + "loss": 1.4915, + "step": 1067 + }, + { + "epoch": 1.0766129032258065, + "grad_norm": 0.08654549717903137, + "learning_rate": 9.172502501121297e-05, + "loss": 1.4998, + "step": 1068 + }, + { + "epoch": 1.0776209677419355, + "grad_norm": 0.09039713442325592, + "learning_rate": 9.156315295357257e-05, + "loss": 1.5139, + "step": 1069 + }, + { + "epoch": 1.0786290322580645, + "grad_norm": 0.08438859134912491, + "learning_rate": 9.140130315807091e-05, + "loss": 1.4935, + "step": 1070 + }, + { + "epoch": 1.0796370967741935, + "grad_norm": 0.08553072065114975, + "learning_rate": 9.123947605177791e-05, + "loss": 1.508, + "step": 1071 + }, + { + "epoch": 1.0806451612903225, + "grad_norm": 0.08692750334739685, + "learning_rate": 9.107767206170342e-05, + "loss": 1.5114, + "step": 1072 + }, + { + "epoch": 1.0816532258064515, + "grad_norm": 0.09480643272399902, + "learning_rate": 9.09158916147964e-05, + "loss": 1.5726, + "step": 1073 + }, + { + "epoch": 1.0826612903225807, + "grad_norm": 0.0879359245300293, + "learning_rate": 9.075413513794369e-05, + "loss": 1.4962, + "step": 1074 + }, + { + "epoch": 1.0836693548387097, + "grad_norm": 0.09322493523359299, + "learning_rate": 9.059240305796884e-05, + "loss": 1.5454, + "step": 1075 + }, + { + "epoch": 1.0846774193548387, + "grad_norm": 0.09673374146223068, + "learning_rate": 9.043069580163099e-05, + "loss": 1.509, + "step": 1076 + }, + { + "epoch": 1.0856854838709677, + "grad_norm": 0.08707006275653839, + "learning_rate": 9.02690137956239e-05, + "loss": 1.5632, + "step": 1077 + }, + { + "epoch": 1.0866935483870968, + "grad_norm": 0.08686521649360657, + "learning_rate": 9.010735746657462e-05, + "loss": 1.4968, + "step": 1078 + }, + { + "epoch": 1.0877016129032258, + "grad_norm": 0.08472903817892075, + "learning_rate": 8.994572724104242e-05, + "loss": 1.4908, + "step": 1079 + }, + { + "epoch": 1.0887096774193548, + "grad_norm": 0.09030890464782715, + "learning_rate": 8.978412354551779e-05, + "loss": 1.5018, + "step": 1080 + }, + { + "epoch": 1.089717741935484, + "grad_norm": 0.08417510986328125, + "learning_rate": 8.962254680642107e-05, + "loss": 1.4444, + "step": 1081 + }, + { + "epoch": 1.090725806451613, + "grad_norm": 0.09092919528484344, + "learning_rate": 8.946099745010164e-05, + "loss": 1.5303, + "step": 1082 + }, + { + "epoch": 1.091733870967742, + "grad_norm": 0.09100567549467087, + "learning_rate": 8.929947590283647e-05, + "loss": 1.5403, + "step": 1083 + }, + { + "epoch": 1.092741935483871, + "grad_norm": 0.12923839688301086, + "learning_rate": 8.913798259082928e-05, + "loss": 1.4664, + "step": 1084 + }, + { + "epoch": 1.09375, + "grad_norm": 0.09925505518913269, + "learning_rate": 8.897651794020918e-05, + "loss": 1.5229, + "step": 1085 + }, + { + "epoch": 1.094758064516129, + "grad_norm": 0.08671566098928452, + "learning_rate": 8.881508237702973e-05, + "loss": 1.4995, + "step": 1086 + }, + { + "epoch": 1.095766129032258, + "grad_norm": 0.08649452030658722, + "learning_rate": 8.865367632726772e-05, + "loss": 1.4993, + "step": 1087 + }, + { + "epoch": 1.096774193548387, + "grad_norm": 0.0882314071059227, + "learning_rate": 8.849230021682199e-05, + "loss": 1.5442, + "step": 1088 + }, + { + "epoch": 1.097782258064516, + "grad_norm": 0.088679738342762, + "learning_rate": 8.833095447151252e-05, + "loss": 1.5014, + "step": 1089 + }, + { + "epoch": 1.0987903225806452, + "grad_norm": 0.09637542814016342, + "learning_rate": 8.816963951707901e-05, + "loss": 1.5503, + "step": 1090 + }, + { + "epoch": 1.0997983870967742, + "grad_norm": 0.09071476012468338, + "learning_rate": 8.800835577918006e-05, + "loss": 1.5016, + "step": 1091 + }, + { + "epoch": 1.1008064516129032, + "grad_norm": 0.09719227999448776, + "learning_rate": 8.784710368339178e-05, + "loss": 1.4767, + "step": 1092 + }, + { + "epoch": 1.1018145161290323, + "grad_norm": 0.08729701489210129, + "learning_rate": 8.768588365520685e-05, + "loss": 1.5011, + "step": 1093 + }, + { + "epoch": 1.1028225806451613, + "grad_norm": 0.08893397450447083, + "learning_rate": 8.752469612003332e-05, + "loss": 1.5368, + "step": 1094 + }, + { + "epoch": 1.1038306451612903, + "grad_norm": 0.08354583382606506, + "learning_rate": 8.736354150319349e-05, + "loss": 1.5199, + "step": 1095 + }, + { + "epoch": 1.1048387096774193, + "grad_norm": 0.08970467001199722, + "learning_rate": 8.720242022992284e-05, + "loss": 1.5328, + "step": 1096 + }, + { + "epoch": 1.1058467741935485, + "grad_norm": 0.09049658477306366, + "learning_rate": 8.704133272536879e-05, + "loss": 1.5323, + "step": 1097 + }, + { + "epoch": 1.1068548387096775, + "grad_norm": 0.08495205640792847, + "learning_rate": 8.68802794145898e-05, + "loss": 1.4833, + "step": 1098 + }, + { + "epoch": 1.1078629032258065, + "grad_norm": 0.08763737976551056, + "learning_rate": 8.671926072255389e-05, + "loss": 1.5314, + "step": 1099 + }, + { + "epoch": 1.1088709677419355, + "grad_norm": 0.0835312008857727, + "learning_rate": 8.655827707413788e-05, + "loss": 1.5162, + "step": 1100 + }, + { + "epoch": 1.1098790322580645, + "grad_norm": 0.08878222852945328, + "learning_rate": 8.63973288941261e-05, + "loss": 1.4885, + "step": 1101 + }, + { + "epoch": 1.1108870967741935, + "grad_norm": 0.09213855862617493, + "learning_rate": 8.623641660720928e-05, + "loss": 1.5398, + "step": 1102 + }, + { + "epoch": 1.1118951612903225, + "grad_norm": 0.08432666957378387, + "learning_rate": 8.607554063798346e-05, + "loss": 1.4907, + "step": 1103 + }, + { + "epoch": 1.1129032258064515, + "grad_norm": 0.10029254853725433, + "learning_rate": 8.591470141094878e-05, + "loss": 1.5904, + "step": 1104 + }, + { + "epoch": 1.1139112903225807, + "grad_norm": 0.08696424961090088, + "learning_rate": 8.57538993505085e-05, + "loss": 1.5079, + "step": 1105 + }, + { + "epoch": 1.1149193548387097, + "grad_norm": 0.08842870593070984, + "learning_rate": 8.559313488096782e-05, + "loss": 1.5223, + "step": 1106 + }, + { + "epoch": 1.1159274193548387, + "grad_norm": 0.08505623787641525, + "learning_rate": 8.543240842653266e-05, + "loss": 1.4939, + "step": 1107 + }, + { + "epoch": 1.1169354838709677, + "grad_norm": 0.09814995527267456, + "learning_rate": 8.527172041130874e-05, + "loss": 1.5732, + "step": 1108 + }, + { + "epoch": 1.1179435483870968, + "grad_norm": 0.09438839554786682, + "learning_rate": 8.511107125930022e-05, + "loss": 1.5903, + "step": 1109 + }, + { + "epoch": 1.1189516129032258, + "grad_norm": 0.08910852670669556, + "learning_rate": 8.49504613944089e-05, + "loss": 1.5203, + "step": 1110 + }, + { + "epoch": 1.1199596774193548, + "grad_norm": 0.0924610123038292, + "learning_rate": 8.47898912404327e-05, + "loss": 1.5302, + "step": 1111 + }, + { + "epoch": 1.120967741935484, + "grad_norm": 0.08957453072071075, + "learning_rate": 8.462936122106489e-05, + "loss": 1.5179, + "step": 1112 + }, + { + "epoch": 1.121975806451613, + "grad_norm": 0.1187904104590416, + "learning_rate": 8.446887175989286e-05, + "loss": 1.5622, + "step": 1113 + }, + { + "epoch": 1.122983870967742, + "grad_norm": 0.0907069593667984, + "learning_rate": 8.430842328039686e-05, + "loss": 1.502, + "step": 1114 + }, + { + "epoch": 1.123991935483871, + "grad_norm": 0.09245329350233078, + "learning_rate": 8.414801620594912e-05, + "loss": 1.476, + "step": 1115 + }, + { + "epoch": 1.125, + "grad_norm": 0.10100734978914261, + "learning_rate": 8.398765095981251e-05, + "loss": 1.5111, + "step": 1116 + }, + { + "epoch": 1.126008064516129, + "grad_norm": 0.09156333655118942, + "learning_rate": 8.382732796513966e-05, + "loss": 1.4985, + "step": 1117 + }, + { + "epoch": 1.127016129032258, + "grad_norm": 0.11173349618911743, + "learning_rate": 8.366704764497154e-05, + "loss": 1.4869, + "step": 1118 + }, + { + "epoch": 1.128024193548387, + "grad_norm": 0.08984418958425522, + "learning_rate": 8.35068104222367e-05, + "loss": 1.52, + "step": 1119 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 0.11599362641572952, + "learning_rate": 8.33466167197498e-05, + "loss": 1.5154, + "step": 1120 + }, + { + "epoch": 1.1300403225806452, + "grad_norm": 0.09752003848552704, + "learning_rate": 8.318646696021077e-05, + "loss": 1.4838, + "step": 1121 + }, + { + "epoch": 1.1310483870967742, + "grad_norm": 0.09071122109889984, + "learning_rate": 8.302636156620363e-05, + "loss": 1.5761, + "step": 1122 + }, + { + "epoch": 1.1320564516129032, + "grad_norm": 0.08928891271352768, + "learning_rate": 8.286630096019518e-05, + "loss": 1.5265, + "step": 1123 + }, + { + "epoch": 1.1330645161290323, + "grad_norm": 0.08508775383234024, + "learning_rate": 8.270628556453417e-05, + "loss": 1.548, + "step": 1124 + }, + { + "epoch": 1.1340725806451613, + "grad_norm": 0.08637328445911407, + "learning_rate": 8.254631580144999e-05, + "loss": 1.4786, + "step": 1125 + }, + { + "epoch": 1.1350806451612903, + "grad_norm": 0.08538668602705002, + "learning_rate": 8.238639209305166e-05, + "loss": 1.4797, + "step": 1126 + }, + { + "epoch": 1.1360887096774193, + "grad_norm": 0.08973052352666855, + "learning_rate": 8.222651486132664e-05, + "loss": 1.5066, + "step": 1127 + }, + { + "epoch": 1.1370967741935485, + "grad_norm": 0.08729778975248337, + "learning_rate": 8.206668452813978e-05, + "loss": 1.4973, + "step": 1128 + }, + { + "epoch": 1.1381048387096775, + "grad_norm": 0.08795138448476791, + "learning_rate": 8.190690151523215e-05, + "loss": 1.4892, + "step": 1129 + }, + { + "epoch": 1.1391129032258065, + "grad_norm": 0.08695145696401596, + "learning_rate": 8.174716624421997e-05, + "loss": 1.5163, + "step": 1130 + }, + { + "epoch": 1.1401209677419355, + "grad_norm": 0.08848337084054947, + "learning_rate": 8.158747913659355e-05, + "loss": 1.4907, + "step": 1131 + }, + { + "epoch": 1.1411290322580645, + "grad_norm": 0.08827504515647888, + "learning_rate": 8.142784061371598e-05, + "loss": 1.5306, + "step": 1132 + }, + { + "epoch": 1.1421370967741935, + "grad_norm": 0.09366059303283691, + "learning_rate": 8.126825109682228e-05, + "loss": 1.4598, + "step": 1133 + }, + { + "epoch": 1.1431451612903225, + "grad_norm": 0.09082233905792236, + "learning_rate": 8.110871100701807e-05, + "loss": 1.5746, + "step": 1134 + }, + { + "epoch": 1.1441532258064515, + "grad_norm": 0.10159925371408463, + "learning_rate": 8.094922076527859e-05, + "loss": 1.5689, + "step": 1135 + }, + { + "epoch": 1.1451612903225807, + "grad_norm": 0.10202515870332718, + "learning_rate": 8.078978079244752e-05, + "loss": 1.5155, + "step": 1136 + }, + { + "epoch": 1.1461693548387097, + "grad_norm": 0.0907059907913208, + "learning_rate": 8.063039150923595e-05, + "loss": 1.5552, + "step": 1137 + }, + { + "epoch": 1.1471774193548387, + "grad_norm": 0.08588322252035141, + "learning_rate": 8.047105333622112e-05, + "loss": 1.5299, + "step": 1138 + }, + { + "epoch": 1.1481854838709677, + "grad_norm": 0.08953887969255447, + "learning_rate": 8.031176669384552e-05, + "loss": 1.5528, + "step": 1139 + }, + { + "epoch": 1.1491935483870968, + "grad_norm": 0.08963429927825928, + "learning_rate": 8.01525320024156e-05, + "loss": 1.4823, + "step": 1140 + }, + { + "epoch": 1.1502016129032258, + "grad_norm": 0.09360229223966599, + "learning_rate": 7.999334968210073e-05, + "loss": 1.5288, + "step": 1141 + }, + { + "epoch": 1.1512096774193548, + "grad_norm": 0.09653651714324951, + "learning_rate": 7.983422015293212e-05, + "loss": 1.502, + "step": 1142 + }, + { + "epoch": 1.152217741935484, + "grad_norm": 0.0958021953701973, + "learning_rate": 7.967514383480161e-05, + "loss": 1.4772, + "step": 1143 + }, + { + "epoch": 1.153225806451613, + "grad_norm": 0.0900203064084053, + "learning_rate": 7.951612114746076e-05, + "loss": 1.5536, + "step": 1144 + }, + { + "epoch": 1.154233870967742, + "grad_norm": 0.1079091802239418, + "learning_rate": 7.935715251051949e-05, + "loss": 1.482, + "step": 1145 + }, + { + "epoch": 1.155241935483871, + "grad_norm": 0.09951366484165192, + "learning_rate": 7.919823834344516e-05, + "loss": 1.4908, + "step": 1146 + }, + { + "epoch": 1.15625, + "grad_norm": 0.08866190165281296, + "learning_rate": 7.90393790655614e-05, + "loss": 1.5027, + "step": 1147 + }, + { + "epoch": 1.157258064516129, + "grad_norm": 0.09670446068048477, + "learning_rate": 7.888057509604697e-05, + "loss": 1.4905, + "step": 1148 + }, + { + "epoch": 1.158266129032258, + "grad_norm": 0.0998421311378479, + "learning_rate": 7.872182685393475e-05, + "loss": 1.5349, + "step": 1149 + }, + { + "epoch": 1.159274193548387, + "grad_norm": 0.09023125469684601, + "learning_rate": 7.85631347581105e-05, + "loss": 1.5502, + "step": 1150 + }, + { + "epoch": 1.160282258064516, + "grad_norm": 0.09362298995256424, + "learning_rate": 7.84044992273119e-05, + "loss": 1.4587, + "step": 1151 + }, + { + "epoch": 1.1612903225806452, + "grad_norm": 0.09614353626966476, + "learning_rate": 7.82459206801273e-05, + "loss": 1.5398, + "step": 1152 + }, + { + "epoch": 1.1622983870967742, + "grad_norm": 0.08735020458698273, + "learning_rate": 7.808739953499478e-05, + "loss": 1.5106, + "step": 1153 + }, + { + "epoch": 1.1633064516129032, + "grad_norm": 0.11043401807546616, + "learning_rate": 7.792893621020082e-05, + "loss": 1.533, + "step": 1154 + }, + { + "epoch": 1.1643145161290323, + "grad_norm": 0.11868879944086075, + "learning_rate": 7.777053112387949e-05, + "loss": 1.5086, + "step": 1155 + }, + { + "epoch": 1.1653225806451613, + "grad_norm": 0.08818439394235611, + "learning_rate": 7.761218469401108e-05, + "loss": 1.5127, + "step": 1156 + }, + { + "epoch": 1.1663306451612903, + "grad_norm": 0.1308388113975525, + "learning_rate": 7.745389733842112e-05, + "loss": 1.4556, + "step": 1157 + }, + { + "epoch": 1.1673387096774193, + "grad_norm": 0.09634990245103836, + "learning_rate": 7.729566947477928e-05, + "loss": 1.5527, + "step": 1158 + }, + { + "epoch": 1.1683467741935485, + "grad_norm": 0.11291810870170593, + "learning_rate": 7.713750152059826e-05, + "loss": 1.5556, + "step": 1159 + }, + { + "epoch": 1.1693548387096775, + "grad_norm": 0.10674012452363968, + "learning_rate": 7.697939389323267e-05, + "loss": 1.4921, + "step": 1160 + }, + { + "epoch": 1.1703629032258065, + "grad_norm": 0.09948462247848511, + "learning_rate": 7.682134700987789e-05, + "loss": 1.5691, + "step": 1161 + }, + { + "epoch": 1.1713709677419355, + "grad_norm": 0.09521344304084778, + "learning_rate": 7.66633612875691e-05, + "loss": 1.475, + "step": 1162 + }, + { + "epoch": 1.1723790322580645, + "grad_norm": 0.11034592241048813, + "learning_rate": 7.650543714318001e-05, + "loss": 1.5353, + "step": 1163 + }, + { + "epoch": 1.1733870967741935, + "grad_norm": 0.08763138949871063, + "learning_rate": 7.634757499342191e-05, + "loss": 1.4952, + "step": 1164 + }, + { + "epoch": 1.1743951612903225, + "grad_norm": 0.09569991379976273, + "learning_rate": 7.61897752548425e-05, + "loss": 1.5287, + "step": 1165 + }, + { + "epoch": 1.1754032258064515, + "grad_norm": 0.12841151654720306, + "learning_rate": 7.603203834382476e-05, + "loss": 1.6028, + "step": 1166 + }, + { + "epoch": 1.1764112903225807, + "grad_norm": 0.08578557521104813, + "learning_rate": 7.58743646765859e-05, + "loss": 1.4683, + "step": 1167 + }, + { + "epoch": 1.1774193548387097, + "grad_norm": 0.10593171417713165, + "learning_rate": 7.571675466917626e-05, + "loss": 1.5351, + "step": 1168 + }, + { + "epoch": 1.1784274193548387, + "grad_norm": 0.10871924459934235, + "learning_rate": 7.555920873747823e-05, + "loss": 1.5334, + "step": 1169 + }, + { + "epoch": 1.1794354838709677, + "grad_norm": 0.08840969204902649, + "learning_rate": 7.540172729720504e-05, + "loss": 1.5035, + "step": 1170 + }, + { + "epoch": 1.1804435483870968, + "grad_norm": 0.08680961281061172, + "learning_rate": 7.524431076389986e-05, + "loss": 1.4756, + "step": 1171 + }, + { + "epoch": 1.1814516129032258, + "grad_norm": 0.0890466570854187, + "learning_rate": 7.50869595529345e-05, + "loss": 1.5077, + "step": 1172 + }, + { + "epoch": 1.1824596774193548, + "grad_norm": 0.10022439807653427, + "learning_rate": 7.492967407950844e-05, + "loss": 1.5001, + "step": 1173 + }, + { + "epoch": 1.183467741935484, + "grad_norm": 0.12129071354866028, + "learning_rate": 7.477245475864771e-05, + "loss": 1.5234, + "step": 1174 + }, + { + "epoch": 1.184475806451613, + "grad_norm": 0.09167549759149551, + "learning_rate": 7.461530200520377e-05, + "loss": 1.4971, + "step": 1175 + }, + { + "epoch": 1.185483870967742, + "grad_norm": 0.08763924986124039, + "learning_rate": 7.445821623385245e-05, + "loss": 1.5229, + "step": 1176 + }, + { + "epoch": 1.186491935483871, + "grad_norm": 0.13247455656528473, + "learning_rate": 7.430119785909278e-05, + "loss": 1.4973, + "step": 1177 + }, + { + "epoch": 1.1875, + "grad_norm": 0.10564038902521133, + "learning_rate": 7.414424729524602e-05, + "loss": 1.4549, + "step": 1178 + }, + { + "epoch": 1.188508064516129, + "grad_norm": 0.09784973412752151, + "learning_rate": 7.398736495645447e-05, + "loss": 1.514, + "step": 1179 + }, + { + "epoch": 1.189516129032258, + "grad_norm": 0.0994093120098114, + "learning_rate": 7.383055125668038e-05, + "loss": 1.4899, + "step": 1180 + }, + { + "epoch": 1.190524193548387, + "grad_norm": 0.09787564724683762, + "learning_rate": 7.367380660970493e-05, + "loss": 1.5306, + "step": 1181 + }, + { + "epoch": 1.191532258064516, + "grad_norm": 0.09221166372299194, + "learning_rate": 7.351713142912707e-05, + "loss": 1.5314, + "step": 1182 + }, + { + "epoch": 1.1925403225806452, + "grad_norm": 0.10157594084739685, + "learning_rate": 7.336052612836246e-05, + "loss": 1.5738, + "step": 1183 + }, + { + "epoch": 1.1935483870967742, + "grad_norm": 0.09838045388460159, + "learning_rate": 7.320399112064233e-05, + "loss": 1.543, + "step": 1184 + }, + { + "epoch": 1.1945564516129032, + "grad_norm": 0.0991106852889061, + "learning_rate": 7.304752681901251e-05, + "loss": 1.5159, + "step": 1185 + }, + { + "epoch": 1.1955645161290323, + "grad_norm": 0.09091247618198395, + "learning_rate": 7.289113363633215e-05, + "loss": 1.5109, + "step": 1186 + }, + { + "epoch": 1.1965725806451613, + "grad_norm": 0.10874085873365402, + "learning_rate": 7.273481198527285e-05, + "loss": 1.539, + "step": 1187 + }, + { + "epoch": 1.1975806451612903, + "grad_norm": 0.26701486110687256, + "learning_rate": 7.257856227831738e-05, + "loss": 1.5231, + "step": 1188 + }, + { + "epoch": 1.1985887096774193, + "grad_norm": 0.11611919850111008, + "learning_rate": 7.242238492775869e-05, + "loss": 1.5325, + "step": 1189 + }, + { + "epoch": 1.1995967741935485, + "grad_norm": 0.09033048897981644, + "learning_rate": 7.226628034569886e-05, + "loss": 1.5223, + "step": 1190 + }, + { + "epoch": 1.2006048387096775, + "grad_norm": 0.09677241742610931, + "learning_rate": 7.211024894404788e-05, + "loss": 1.5277, + "step": 1191 + }, + { + "epoch": 1.2016129032258065, + "grad_norm": 0.09878189116716385, + "learning_rate": 7.195429113452271e-05, + "loss": 1.525, + "step": 1192 + }, + { + "epoch": 1.2026209677419355, + "grad_norm": 0.09079443663358688, + "learning_rate": 7.179840732864604e-05, + "loss": 1.4836, + "step": 1193 + }, + { + "epoch": 1.2036290322580645, + "grad_norm": 0.10527854412794113, + "learning_rate": 7.16425979377454e-05, + "loss": 1.5068, + "step": 1194 + }, + { + "epoch": 1.2046370967741935, + "grad_norm": 0.1167258769273758, + "learning_rate": 7.148686337295181e-05, + "loss": 1.4535, + "step": 1195 + }, + { + "epoch": 1.2056451612903225, + "grad_norm": 0.09939006716012955, + "learning_rate": 7.133120404519903e-05, + "loss": 1.4873, + "step": 1196 + }, + { + "epoch": 1.2066532258064515, + "grad_norm": 0.09883987158536911, + "learning_rate": 7.117562036522213e-05, + "loss": 1.5022, + "step": 1197 + }, + { + "epoch": 1.2076612903225807, + "grad_norm": 0.10209079831838608, + "learning_rate": 7.102011274355667e-05, + "loss": 1.5432, + "step": 1198 + }, + { + "epoch": 1.2086693548387097, + "grad_norm": 0.10384919494390488, + "learning_rate": 7.086468159053751e-05, + "loss": 1.5568, + "step": 1199 + }, + { + "epoch": 1.2096774193548387, + "grad_norm": 0.1108224168419838, + "learning_rate": 7.070932731629769e-05, + "loss": 1.4903, + "step": 1200 + } + ], + "logging_steps": 1, + "max_steps": 1984, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.728889247690916e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}