diff --git "a/checkpoint-1200/trainer_state.json" "b/checkpoint-1200/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-1200/trainer_state.json"
@@ -0,0 +1,8433 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.2096774193548387,
+  "eval_steps": 500,
+  "global_step": 1200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0010080645161290322,
+      "grad_norm": 0.9473515748977661,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 1.9769,
+      "step": 1
+    },
+    {
+      "epoch": 0.0020161290322580645,
+      "grad_norm": 0.9036028981208801,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 1.9331,
+      "step": 2
+    },
+    {
+      "epoch": 0.0030241935483870967,
+      "grad_norm": 0.9499556422233582,
+      "learning_rate": 1.2e-05,
+      "loss": 1.9852,
+      "step": 3
+    },
+    {
+      "epoch": 0.004032258064516129,
+      "grad_norm": 0.903069019317627,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 1.9668,
+      "step": 4
+    },
+    {
+      "epoch": 0.005040322580645161,
+      "grad_norm": 0.5635794997215271,
+      "learning_rate": 2e-05,
+      "loss": 1.9327,
+      "step": 5
+    },
+    {
+      "epoch": 0.006048387096774193,
+      "grad_norm": 0.9521661996841431,
+      "learning_rate": 2.4e-05,
+      "loss": 2.0026,
+      "step": 6
+    },
+    {
+      "epoch": 0.007056451612903226,
+      "grad_norm": 0.4393383860588074,
+      "learning_rate": 2.8000000000000003e-05,
+      "loss": 1.8885,
+      "step": 7
+    },
+    {
+      "epoch": 0.008064516129032258,
+      "grad_norm": 0.36857879161834717,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 1.8537,
+      "step": 8
+    },
+    {
+      "epoch": 0.009072580645161291,
+      "grad_norm": 0.3844268321990967,
+      "learning_rate": 3.6e-05,
+      "loss": 1.8874,
+      "step": 9
+    },
+    {
+      "epoch": 0.010080645161290322,
+      "grad_norm": 0.41415101289749146,
+      "learning_rate": 4e-05,
+      "loss": 1.9386,
+      "step": 10
+    },
+    {
+      "epoch": 0.011088709677419355,
+      "grad_norm": 0.3869949281215668,
+      "learning_rate": 4.4000000000000006e-05,
+      "loss": 1.9359,
+      "step": 11
+    },
+    {
+      "epoch": 0.012096774193548387,
+      "grad_norm": 0.3345952033996582,
+      "learning_rate": 4.8e-05,
+      "loss": 1.903,
+      "step": 12
+    },
+    {
+      "epoch": 0.01310483870967742,
+      "grad_norm": 0.3590312600135803,
+      "learning_rate": 5.2000000000000004e-05,
+      "loss": 1.9024,
+      "step": 13
+    },
+    {
+      "epoch": 0.014112903225806451,
+      "grad_norm": 0.2288215309381485,
+      "learning_rate": 5.6000000000000006e-05,
+      "loss": 1.8431,
+      "step": 14
+    },
+    {
+      "epoch": 0.015120967741935484,
+      "grad_norm": 0.20984530448913574,
+      "learning_rate": 6e-05,
+      "loss": 1.8522,
+      "step": 15
+    },
+    {
+      "epoch": 0.016129032258064516,
+      "grad_norm": 0.2080329954624176,
+      "learning_rate": 6.400000000000001e-05,
+      "loss": 1.9895,
+      "step": 16
+    },
+    {
+      "epoch": 0.017137096774193547,
+      "grad_norm": 0.20060451328754425,
+      "learning_rate": 6.800000000000001e-05,
+      "loss": 1.8289,
+      "step": 17
+    },
+    {
+      "epoch": 0.018145161290322582,
+      "grad_norm": 0.16062042117118835,
+      "learning_rate": 7.2e-05,
+      "loss": 1.8823,
+      "step": 18
+    },
+    {
+      "epoch": 0.019153225806451613,
+      "grad_norm": 0.15423905849456787,
+      "learning_rate": 7.6e-05,
+      "loss": 1.7997,
+      "step": 19
+    },
+    {
+      "epoch": 0.020161290322580645,
+      "grad_norm": 0.15496863424777985,
+      "learning_rate": 8e-05,
+      "loss": 1.8237,
+      "step": 20
+    },
+    {
+      "epoch": 0.021169354838709676,
+      "grad_norm": 0.16305851936340332,
+      "learning_rate": 8.4e-05,
+      "loss": 1.7973,
+      "step": 21
+    },
+    {
+      "epoch": 0.02217741935483871,
+      "grad_norm": 0.1680663675069809,
+      "learning_rate": 8.800000000000001e-05,
+      "loss": 1.82,
+      "step": 22
+    },
+    {
+      "epoch": 0.023185483870967742,
+      "grad_norm": 0.16471807658672333,
+      "learning_rate": 9.200000000000001e-05,
+      "loss": 1.8314,
+      "step": 23
+    },
+    {
+      "epoch": 0.024193548387096774,
+      "grad_norm": 0.13601982593536377,
+      "learning_rate": 9.6e-05,
+      "loss": 1.8488,
+      "step": 24
+    },
+    {
+      "epoch": 0.025201612903225805,
+      "grad_norm": 0.12553684413433075,
+      "learning_rate": 0.0001,
+      "loss": 1.839,
+      "step": 25
+    },
+    {
+      "epoch": 0.02620967741935484,
+      "grad_norm": 0.12679991126060486,
+      "learning_rate": 0.00010400000000000001,
+      "loss": 1.8615,
+      "step": 26
+    },
+    {
+      "epoch": 0.02721774193548387,
+      "grad_norm": 0.1284348964691162,
+      "learning_rate": 0.00010800000000000001,
+      "loss": 1.8215,
+      "step": 27
+    },
+    {
+      "epoch": 0.028225806451612902,
+      "grad_norm": 0.11629381030797958,
+      "learning_rate": 0.00011200000000000001,
+      "loss": 1.8536,
+      "step": 28
+    },
+    {
+      "epoch": 0.029233870967741934,
+      "grad_norm": 0.10016848891973495,
+      "learning_rate": 0.000116,
+      "loss": 1.8095,
+      "step": 29
+    },
+    {
+      "epoch": 0.03024193548387097,
+      "grad_norm": 0.10154619067907333,
+      "learning_rate": 0.00012,
+      "loss": 1.8355,
+      "step": 30
+    },
+    {
+      "epoch": 0.03125,
+      "grad_norm": 0.11825895309448242,
+      "learning_rate": 0.000124,
+      "loss": 1.7984,
+      "step": 31
+    },
+    {
+      "epoch": 0.03225806451612903,
+      "grad_norm": 0.104405976831913,
+      "learning_rate": 0.00012800000000000002,
+      "loss": 1.7673,
+      "step": 32
+    },
+    {
+      "epoch": 0.03326612903225806,
+      "grad_norm": 0.09943860024213791,
+      "learning_rate": 0.000132,
+      "loss": 1.813,
+      "step": 33
+    },
+    {
+      "epoch": 0.034274193548387094,
+      "grad_norm": 0.10970743000507355,
+      "learning_rate": 0.00013600000000000003,
+      "loss": 1.9213,
+      "step": 34
+    },
+    {
+      "epoch": 0.03528225806451613,
+      "grad_norm": 0.1049584224820137,
+      "learning_rate": 0.00014,
+      "loss": 1.7818,
+      "step": 35
+    },
+    {
+      "epoch": 0.036290322580645164,
+      "grad_norm": 0.08986247330904007,
+      "learning_rate": 0.000144,
+      "loss": 1.7944,
+      "step": 36
+    },
+    {
+      "epoch": 0.037298387096774195,
+      "grad_norm": 0.09243710339069366,
+      "learning_rate": 0.000148,
+      "loss": 1.7158,
+      "step": 37
+    },
+    {
+      "epoch": 0.038306451612903226,
+      "grad_norm": 0.10768643021583557,
+      "learning_rate": 0.000152,
+      "loss": 1.8295,
+      "step": 38
+    },
+    {
+      "epoch": 0.03931451612903226,
+      "grad_norm": 0.07883578538894653,
+      "learning_rate": 0.00015600000000000002,
+      "loss": 1.757,
+      "step": 39
+    },
+    {
+      "epoch": 0.04032258064516129,
+      "grad_norm": 0.10219922661781311,
+      "learning_rate": 0.00016,
+      "loss": 1.7423,
+      "step": 40
+    },
+    {
+      "epoch": 0.04133064516129032,
+      "grad_norm": 0.08045803755521774,
+      "learning_rate": 0.000164,
+      "loss": 1.7649,
+      "step": 41
+    },
+    {
+      "epoch": 0.04233870967741935,
+      "grad_norm": 0.07191110402345657,
+      "learning_rate": 0.000168,
+      "loss": 1.7441,
+      "step": 42
+    },
+    {
+      "epoch": 0.04334677419354839,
+      "grad_norm": 0.08571028709411621,
+      "learning_rate": 0.000172,
+      "loss": 1.8094,
+      "step": 43
+    },
+    {
+      "epoch": 0.04435483870967742,
+      "grad_norm": 0.08775891363620758,
+      "learning_rate": 0.00017600000000000002,
+      "loss": 1.817,
+      "step": 44
+    },
+    {
+      "epoch": 0.04536290322580645,
+      "grad_norm": 0.08328275382518768,
+      "learning_rate": 0.00018,
+      "loss": 1.7753,
+      "step": 45
+    },
+    {
+      "epoch": 0.046370967741935484,
+      "grad_norm": 0.08221882581710815,
+      "learning_rate": 0.00018400000000000003,
+      "loss": 1.7824,
+      "step": 46
+    },
+    {
+      "epoch": 0.047379032258064516,
+      "grad_norm": 0.0885847732424736,
+      "learning_rate": 0.000188,
+      "loss": 1.7423,
+      "step": 47
+    },
+    {
+      "epoch": 0.04838709677419355,
+      "grad_norm": 0.08126149326562881,
+      "learning_rate": 0.000192,
+      "loss": 1.7495,
+      "step": 48
+    },
+    {
+      "epoch": 0.04939516129032258,
+      "grad_norm": 0.08296285569667816,
+      "learning_rate": 0.000196,
+      "loss": 1.6909,
+      "step": 49
+    },
+    {
+      "epoch": 0.05040322580645161,
+      "grad_norm": 0.09005258232355118,
+      "learning_rate": 0.0002,
+      "loss": 1.8159,
+      "step": 50
+    },
+    {
+      "epoch": 0.05141129032258065,
+      "grad_norm": 0.08956532180309296,
+      "learning_rate": 0.00019999986806600454,
+      "loss": 1.6662,
+      "step": 51
+    },
+    {
+      "epoch": 0.05241935483870968,
+      "grad_norm": 0.08471240848302841,
+      "learning_rate": 0.00019999947226436628,
+      "loss": 1.8274,
+      "step": 52
+    },
+    {
+      "epoch": 0.05342741935483871,
+      "grad_norm": 0.09117641299962997,
+      "learning_rate": 0.00019999881259612963,
+      "loss": 1.7027,
+      "step": 53
+    },
+    {
+      "epoch": 0.05443548387096774,
+      "grad_norm": 0.08552085608243942,
+      "learning_rate": 0.00019999788906303518,
+      "loss": 1.7738,
+      "step": 54
+    },
+    {
+      "epoch": 0.055443548387096774,
+      "grad_norm": 0.07708004862070084,
+      "learning_rate": 0.00019999670166751993,
+      "loss": 1.7821,
+      "step": 55
+    },
+    {
+      "epoch": 0.056451612903225805,
+      "grad_norm": 0.07826384156942368,
+      "learning_rate": 0.000199995250412717,
+      "loss": 1.7579,
+      "step": 56
+    },
+    {
+      "epoch": 0.057459677419354836,
+      "grad_norm": 0.0721641331911087,
+      "learning_rate": 0.00019999353530245572,
+      "loss": 1.7372,
+      "step": 57
+    },
+    {
+      "epoch": 0.05846774193548387,
+      "grad_norm": 0.07667742669582367,
+      "learning_rate": 0.0001999915563412618,
+      "loss": 1.7323,
+      "step": 58
+    },
+    {
+      "epoch": 0.059475806451612906,
+      "grad_norm": 0.10455285757780075,
+      "learning_rate": 0.00019998931353435709,
+      "loss": 1.8221,
+      "step": 59
+    },
+    {
+      "epoch": 0.06048387096774194,
+      "grad_norm": 0.07621350884437561,
+      "learning_rate": 0.00019998680688765959,
+      "loss": 1.7305,
+      "step": 60
+    },
+    {
+      "epoch": 0.06149193548387097,
+      "grad_norm": 0.08454013615846634,
+      "learning_rate": 0.00019998403640778358,
+      "loss": 1.7558,
+      "step": 61
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.08005455136299133,
+      "learning_rate": 0.00019998100210203942,
+      "loss": 1.6703,
+      "step": 62
+    },
+    {
+      "epoch": 0.06350806451612903,
+      "grad_norm": 0.09527427703142166,
+      "learning_rate": 0.0001999777039784337,
+      "loss": 1.7896,
+      "step": 63
+    },
+    {
+      "epoch": 0.06451612903225806,
+      "grad_norm": 0.10536834597587585,
+      "learning_rate": 0.00019997414204566915,
+      "loss": 1.7909,
+      "step": 64
+    },
+    {
+      "epoch": 0.0655241935483871,
+      "grad_norm": 0.08326593041419983,
+      "learning_rate": 0.0001999703163131445,
+      "loss": 1.7501,
+      "step": 65
+    },
+    {
+      "epoch": 0.06653225806451613,
+      "grad_norm": 0.0823182687163353,
+      "learning_rate": 0.00019996622679095468,
+      "loss": 1.7625,
+      "step": 66
+    },
+    {
+      "epoch": 0.06754032258064516,
+      "grad_norm": 0.07878896594047546,
+      "learning_rate": 0.00019996187348989063,
+      "loss": 1.7235,
+      "step": 67
+    },
+    {
+      "epoch": 0.06854838709677419,
+      "grad_norm": 0.0899212434887886,
+      "learning_rate": 0.0001999572564214393,
+      "loss": 1.7685,
+      "step": 68
+    },
+    {
+      "epoch": 0.06955645161290322,
+      "grad_norm": 0.07247278839349747,
+      "learning_rate": 0.00019995237559778363,
+      "loss": 1.6281,
+      "step": 69
+    },
+    {
+      "epoch": 0.07056451612903226,
+      "grad_norm": 0.08588135987520218,
+      "learning_rate": 0.00019994723103180265,
+      "loss": 1.7785,
+      "step": 70
+    },
+    {
+      "epoch": 0.0715725806451613,
+      "grad_norm": 0.12004637718200684,
+      "learning_rate": 0.00019994182273707107,
+      "loss": 1.7552,
+      "step": 71
+    },
+    {
+      "epoch": 0.07258064516129033,
+      "grad_norm": 0.1002095490694046,
+      "learning_rate": 0.00019993615072785978,
+      "loss": 1.715,
+      "step": 72
+    },
+    {
+      "epoch": 0.07358870967741936,
+      "grad_norm": 0.07339724153280258,
+      "learning_rate": 0.00019993021501913536,
+      "loss": 1.7019,
+      "step": 73
+    },
+    {
+      "epoch": 0.07459677419354839,
+      "grad_norm": 0.1305348128080368,
+      "learning_rate": 0.00019992401562656022,
+      "loss": 1.8078,
+      "step": 74
+    },
+    {
+      "epoch": 0.07560483870967742,
+      "grad_norm": 0.09164395183324814,
+      "learning_rate": 0.0001999175525664926,
+      "loss": 1.6756,
+      "step": 75
+    },
+    {
+      "epoch": 0.07661290322580645,
+      "grad_norm": 0.0749751552939415,
+      "learning_rate": 0.0001999108258559864,
+      "loss": 1.7616,
+      "step": 76
+    },
+    {
+      "epoch": 0.07762096774193548,
+      "grad_norm": 0.1132885217666626,
+      "learning_rate": 0.00019990383551279136,
+      "loss": 1.8232,
+      "step": 77
+    },
+    {
+      "epoch": 0.07862903225806452,
+      "grad_norm": 0.0832655057311058,
+      "learning_rate": 0.00019989658155535262,
+      "loss": 1.7371,
+      "step": 78
+    },
+    {
+      "epoch": 0.07963709677419355,
+      "grad_norm": 0.09641417115926743,
+      "learning_rate": 0.00019988906400281116,
+      "loss": 1.7989,
+      "step": 79
+    },
+    {
+      "epoch": 0.08064516129032258,
+      "grad_norm": 0.08800283074378967,
+      "learning_rate": 0.00019988128287500335,
+      "loss": 1.7235,
+      "step": 80
+    },
+    {
+      "epoch": 0.08165322580645161,
+      "grad_norm": 0.0772438570857048,
+      "learning_rate": 0.00019987323819246108,
+      "loss": 1.7488,
+      "step": 81
+    },
+    {
+      "epoch": 0.08266129032258064,
+      "grad_norm": 0.09178374707698822,
+      "learning_rate": 0.00019986492997641175,
+      "loss": 1.7018,
+      "step": 82
+    },
+    {
+      "epoch": 0.08366935483870967,
+      "grad_norm": 0.09313932806253433,
+      "learning_rate": 0.00019985635824877802,
+      "loss": 1.7914,
+      "step": 83
+    },
+    {
+      "epoch": 0.0846774193548387,
+      "grad_norm": 0.0906209945678711,
+      "learning_rate": 0.00019984752303217797,
+      "loss": 1.7197,
+      "step": 84
+    },
+    {
+      "epoch": 0.08568548387096774,
+      "grad_norm": 0.09081698209047318,
+      "learning_rate": 0.0001998384243499249,
+      "loss": 1.7666,
+      "step": 85
+    },
+    {
+      "epoch": 0.08669354838709678,
+      "grad_norm": 0.07680635154247284,
+      "learning_rate": 0.0001998290622260273,
+      "loss": 1.6946,
+      "step": 86
+    },
+    {
+      "epoch": 0.08770161290322581,
+      "grad_norm": 0.0743766576051712,
+      "learning_rate": 0.00019981943668518888,
+      "loss": 1.7588,
+      "step": 87
+    },
+    {
+      "epoch": 0.08870967741935484,
+      "grad_norm": 0.07674787193536758,
+      "learning_rate": 0.00019980954775280832,
+      "loss": 1.6896,
+      "step": 88
+    },
+    {
+      "epoch": 0.08971774193548387,
+      "grad_norm": 0.07708673924207687,
+      "learning_rate": 0.00019979939545497933,
+      "loss": 1.6944,
+      "step": 89
+    },
+    {
+      "epoch": 0.0907258064516129,
+      "grad_norm": 0.07248947024345398,
+      "learning_rate": 0.00019978897981849056,
+      "loss": 1.7114,
+      "step": 90
+    },
+    {
+      "epoch": 0.09173387096774194,
+      "grad_norm": 0.07939179986715317,
+      "learning_rate": 0.0001997783008708256,
+      "loss": 1.7552,
+      "step": 91
+    },
+    {
+      "epoch": 0.09274193548387097,
+      "grad_norm": 0.09288234263658524,
+      "learning_rate": 0.00019976735864016276,
+      "loss": 1.7554,
+      "step": 92
+    },
+    {
+      "epoch": 0.09375,
+      "grad_norm": 0.08074582368135452,
+      "learning_rate": 0.00019975615315537506,
+      "loss": 1.7209,
+      "step": 93
+    },
+    {
+      "epoch": 0.09475806451612903,
+      "grad_norm": 0.08087307959794998,
+      "learning_rate": 0.0001997446844460302,
+      "loss": 1.7118,
+      "step": 94
+    },
+    {
+      "epoch": 0.09576612903225806,
+      "grad_norm": 0.08976717293262482,
+      "learning_rate": 0.00019973295254239044,
+      "loss": 1.7384,
+      "step": 95
+    },
+    {
+      "epoch": 0.0967741935483871,
+      "grad_norm": 0.08545631170272827,
+      "learning_rate": 0.0001997209574754125,
+      "loss": 1.7524,
+      "step": 96
+    },
+    {
+      "epoch": 0.09778225806451613,
+      "grad_norm": 0.07703512907028198,
+      "learning_rate": 0.00019970869927674753,
+      "loss": 1.6947,
+      "step": 97
+    },
+    {
+      "epoch": 0.09879032258064516,
+      "grad_norm": 0.07614375650882721,
+      "learning_rate": 0.000199696177978741,
+      "loss": 1.7135,
+      "step": 98
+    },
+    {
+      "epoch": 0.09979838709677419,
+      "grad_norm": 0.0809471607208252,
+      "learning_rate": 0.0001996833936144326,
+      "loss": 1.727,
+      "step": 99
+    },
+    {
+      "epoch": 0.10080645161290322,
+      "grad_norm": 0.1023879274725914,
+      "learning_rate": 0.00019967034621755622,
+      "loss": 1.7297,
+      "step": 100
+    },
+    {
+      "epoch": 0.10181451612903226,
+      "grad_norm": 0.07705037295818329,
+      "learning_rate": 0.00019965703582253965,
+      "loss": 1.6571,
+      "step": 101
+    },
+    {
+      "epoch": 0.1028225806451613,
+      "grad_norm": 0.08601151406764984,
+      "learning_rate": 0.00019964346246450487,
+      "loss": 1.7404,
+      "step": 102
+    },
+    {
+      "epoch": 0.10383064516129033,
+      "grad_norm": 0.0756453350186348,
+      "learning_rate": 0.00019962962617926756,
+      "loss": 1.7311,
+      "step": 103
+    },
+    {
+      "epoch": 0.10483870967741936,
+      "grad_norm": 0.10456051677465439,
+      "learning_rate": 0.00019961552700333734,
+      "loss": 1.7517,
+      "step": 104
+    },
+    {
+      "epoch": 0.10584677419354839,
+      "grad_norm": 0.07731463760137558,
+      "learning_rate": 0.00019960116497391733,
+      "loss": 1.716,
+      "step": 105
+    },
+    {
+      "epoch": 0.10685483870967742,
+      "grad_norm": 0.0789295881986618,
+      "learning_rate": 0.00019958654012890435,
+      "loss": 1.7233,
+      "step": 106
+    },
+    {
+      "epoch": 0.10786290322580645,
+      "grad_norm": 0.08179011940956116,
+      "learning_rate": 0.0001995716525068887,
+      "loss": 1.6556,
+      "step": 107
+    },
+    {
+      "epoch": 0.10887096774193548,
+      "grad_norm": 0.08565866947174072,
+      "learning_rate": 0.00019955650214715406,
+      "loss": 1.7512,
+      "step": 108
+    },
+    {
+      "epoch": 0.10987903225806452,
+      "grad_norm": 0.08556907624006271,
+      "learning_rate": 0.00019954108908967736,
+      "loss": 1.7522,
+      "step": 109
+    },
+    {
+      "epoch": 0.11088709677419355,
+      "grad_norm": 0.08097026497125626,
+      "learning_rate": 0.00019952541337512868,
+      "loss": 1.6656,
+      "step": 110
+    },
+    {
+      "epoch": 0.11189516129032258,
+      "grad_norm": 0.07853402197360992,
+      "learning_rate": 0.0001995094750448713,
+      "loss": 1.7299,
+      "step": 111
+    },
+    {
+      "epoch": 0.11290322580645161,
+      "grad_norm": 0.07205012440681458,
+      "learning_rate": 0.00019949327414096134,
+      "loss": 1.7118,
+      "step": 112
+    },
+    {
+      "epoch": 0.11391129032258064,
+      "grad_norm": 0.0683959424495697,
+      "learning_rate": 0.00019947681070614777,
+      "loss": 1.6742,
+      "step": 113
+    },
+    {
+      "epoch": 0.11491935483870967,
+      "grad_norm": 0.07890711724758148,
+      "learning_rate": 0.00019946008478387238,
+      "loss": 1.6962,
+      "step": 114
+    },
+    {
+      "epoch": 0.1159274193548387,
+      "grad_norm": 0.08321288973093033,
+      "learning_rate": 0.00019944309641826947,
+      "loss": 1.7552,
+      "step": 115
+    },
+    {
+      "epoch": 0.11693548387096774,
+      "grad_norm": 0.0974084734916687,
+      "learning_rate": 0.0001994258456541659,
+      "loss": 1.7971,
+      "step": 116
+    },
+    {
+      "epoch": 0.11794354838709678,
+      "grad_norm": 0.08591660857200623,
+      "learning_rate": 0.00019940833253708097,
+      "loss": 1.7644,
+      "step": 117
+    },
+    {
+      "epoch": 0.11895161290322581,
+      "grad_norm": 0.07388189435005188,
+      "learning_rate": 0.00019939055711322616,
+      "loss": 1.6513,
+      "step": 118
+    },
+    {
+      "epoch": 0.11995967741935484,
+      "grad_norm": 0.07635471969842911,
+      "learning_rate": 0.00019937251942950512,
+      "loss": 1.7005,
+      "step": 119
+    },
+    {
+      "epoch": 0.12096774193548387,
+      "grad_norm": 0.08252502232789993,
+      "learning_rate": 0.0001993542195335135,
+      "loss": 1.7267,
+      "step": 120
+    },
+    {
+      "epoch": 0.1219758064516129,
+      "grad_norm": 0.10845799744129181,
+      "learning_rate": 0.0001993356574735389,
+      "loss": 1.7756,
+      "step": 121
+    },
+    {
+      "epoch": 0.12298387096774194,
+      "grad_norm": 0.07942607253789902,
+      "learning_rate": 0.00019931683329856066,
+      "loss": 1.6849,
+      "step": 122
+    },
+    {
+      "epoch": 0.12399193548387097,
+      "grad_norm": 0.08841695636510849,
+      "learning_rate": 0.00019929774705824973,
+      "loss": 1.7343,
+      "step": 123
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 0.09001098573207855,
+      "learning_rate": 0.0001992783988029686,
+      "loss": 1.7534,
+      "step": 124
+    },
+    {
+      "epoch": 0.12600806451612903,
+      "grad_norm": 0.07412228733301163,
+      "learning_rate": 0.00019925878858377113,
+      "loss": 1.7125,
+      "step": 125
+    },
+    {
+      "epoch": 0.12701612903225806,
+      "grad_norm": 0.09205227345228195,
+      "learning_rate": 0.00019923891645240238,
+      "loss": 1.6712,
+      "step": 126
+    },
+    {
+      "epoch": 0.1280241935483871,
+      "grad_norm": 0.07850176095962524,
+      "learning_rate": 0.00019921878246129858,
+      "loss": 1.6747,
+      "step": 127
+    },
+    {
+      "epoch": 0.12903225806451613,
+      "grad_norm": 0.07801543176174164,
+      "learning_rate": 0.00019919838666358688,
+      "loss": 1.6799,
+      "step": 128
+    },
+    {
+      "epoch": 0.13004032258064516,
+      "grad_norm": 0.08263793587684631,
+      "learning_rate": 0.00019917772911308524,
+      "loss": 1.7368,
+      "step": 129
+    },
+    {
+      "epoch": 0.1310483870967742,
+      "grad_norm": 0.10233369469642639,
+      "learning_rate": 0.00019915680986430233,
+      "loss": 1.7377,
+      "step": 130
+    },
+    {
+      "epoch": 0.13205645161290322,
+      "grad_norm": 0.08960834890604019,
+      "learning_rate": 0.00019913562897243736,
+      "loss": 1.7146,
+      "step": 131
+    },
+    {
+      "epoch": 0.13306451612903225,
+      "grad_norm": 0.07425748556852341,
+      "learning_rate": 0.00019911418649337997,
+      "loss": 1.6796,
+      "step": 132
+    },
+    {
+      "epoch": 0.13407258064516128,
+      "grad_norm": 0.11380482465028763,
+      "learning_rate": 0.00019909248248370988,
+      "loss": 1.7688,
+      "step": 133
+    },
+    {
+      "epoch": 0.1350806451612903,
+      "grad_norm": 0.09946684539318085,
+      "learning_rate": 0.00019907051700069714,
+      "loss": 1.7016,
+      "step": 134
+    },
+    {
+      "epoch": 0.13608870967741934,
+      "grad_norm": 0.07686997205018997,
+      "learning_rate": 0.0001990482901023016,
+      "loss": 1.7209,
+      "step": 135
+    },
+    {
+      "epoch": 0.13709677419354838,
+      "grad_norm": 0.08980387449264526,
+      "learning_rate": 0.0001990258018471729,
+      "loss": 1.6922,
+      "step": 136
+    },
+    {
+      "epoch": 0.1381048387096774,
+      "grad_norm": 0.08946418762207031,
+      "learning_rate": 0.00019900305229465036,
+      "loss": 1.7231,
+      "step": 137
+    },
+    {
+      "epoch": 0.13911290322580644,
+      "grad_norm": 0.07228976488113403,
+      "learning_rate": 0.00019898004150476278,
+      "loss": 1.6864,
+      "step": 138
+    },
+    {
+      "epoch": 0.14012096774193547,
+      "grad_norm": 0.09577012807130814,
+      "learning_rate": 0.00019895676953822822,
+      "loss": 1.6812,
+      "step": 139
+    },
+    {
+      "epoch": 0.14112903225806453,
+      "grad_norm": 0.08688167482614517,
+      "learning_rate": 0.00019893323645645404,
+      "loss": 1.738,
+      "step": 140
+    },
+    {
+      "epoch": 0.14213709677419356,
+      "grad_norm": 0.07488682866096497,
+      "learning_rate": 0.00019890944232153643,
+      "loss": 1.6202,
+      "step": 141
+    },
+    {
+      "epoch": 0.1431451612903226,
+      "grad_norm": 0.09752912074327469,
+      "learning_rate": 0.00019888538719626053,
+      "loss": 1.7006,
+      "step": 142
+    },
+    {
+      "epoch": 0.14415322580645162,
+      "grad_norm": 0.08033961057662964,
+      "learning_rate": 0.0001988610711441001,
+      "loss": 1.7119,
+      "step": 143
+    },
+    {
+      "epoch": 0.14516129032258066,
+      "grad_norm": 0.07507845759391785,
+      "learning_rate": 0.00019883649422921745,
+      "loss": 1.6504,
+      "step": 144
+    },
+    {
+      "epoch": 0.1461693548387097,
+      "grad_norm": 0.07756344974040985,
+      "learning_rate": 0.00019881165651646317,
+      "loss": 1.7107,
+      "step": 145
+    },
+    {
+      "epoch": 0.14717741935483872,
+      "grad_norm": 0.07581036537885666,
+      "learning_rate": 0.00019878655807137603,
+      "loss": 1.6777,
+      "step": 146
+    },
+    {
+      "epoch": 0.14818548387096775,
+      "grad_norm": 0.06943333894014359,
+      "learning_rate": 0.0001987611989601828,
+      "loss": 1.6282,
+      "step": 147
+    },
+    {
+      "epoch": 0.14919354838709678,
+      "grad_norm": 0.07314992696046829,
+      "learning_rate": 0.00019873557924979804,
+      "loss": 1.6773,
+      "step": 148
+    },
+    {
+      "epoch": 0.1502016129032258,
+      "grad_norm": 0.08181635290384293,
+      "learning_rate": 0.000198709699007824,
+      "loss": 1.668,
+      "step": 149
+    },
+    {
+      "epoch": 0.15120967741935484,
+      "grad_norm": 0.07046262919902802,
+      "learning_rate": 0.00019868355830255033,
+      "loss": 1.6857,
+      "step": 150
+    },
+    {
+      "epoch": 0.15221774193548387,
+      "grad_norm": 0.07162804901599884,
+      "learning_rate": 0.00019865715720295397,
+      "loss": 1.6299,
+      "step": 151
+    },
+    {
+      "epoch": 0.1532258064516129,
+      "grad_norm": 0.0785004273056984,
+      "learning_rate": 0.00019863049577869898,
+      "loss": 1.6651,
+      "step": 152
+    },
+    {
+      "epoch": 0.15423387096774194,
+      "grad_norm": 0.06895990669727325,
+      "learning_rate": 0.00019860357410013638,
+      "loss": 1.636,
+      "step": 153
+    },
+    {
+      "epoch": 0.15524193548387097,
+      "grad_norm": 0.0736781507730484,
+      "learning_rate": 0.00019857639223830377,
+      "loss": 1.6859,
+      "step": 154
+    },
+    {
+      "epoch": 0.15625,
+      "grad_norm": 0.07190602272748947,
+      "learning_rate": 0.00019854895026492545,
+      "loss": 1.706,
+      "step": 155
+    },
+    {
+      "epoch": 0.15725806451612903,
+      "grad_norm": 0.07781372219324112,
+      "learning_rate": 0.00019852124825241201,
+      "loss": 1.7015,
+      "step": 156
+    },
+    {
+      "epoch": 0.15826612903225806,
+      "grad_norm": 0.08466929197311401,
+      "learning_rate": 0.0001984932862738601,
+      "loss": 1.6684,
+      "step": 157
+    },
+    {
+      "epoch": 0.1592741935483871,
+      "grad_norm": 0.08189702033996582,
+      "learning_rate": 0.00019846506440305257,
+      "loss": 1.6914,
+      "step": 158
+    },
+    {
+      "epoch": 0.16028225806451613,
+      "grad_norm": 0.08032141625881195,
+      "learning_rate": 0.00019843658271445776,
+      "loss": 1.6574,
+      "step": 159
+    },
+    {
+      "epoch": 0.16129032258064516,
+      "grad_norm": 0.08438081294298172,
+      "learning_rate": 0.00019840784128322985,
+      "loss": 1.7503,
+      "step": 160
+    },
+    {
+      "epoch": 0.1622983870967742,
+      "grad_norm": 0.10350456833839417,
+      "learning_rate": 0.0001983788401852082,
+      "loss": 1.697,
+      "step": 161
+    },
+    {
+      "epoch": 0.16330645161290322,
+      "grad_norm": 0.08714311569929123,
+      "learning_rate": 0.00019834957949691747,
+      "loss": 1.7595,
+      "step": 162
+    },
+    {
+      "epoch": 0.16431451612903225,
+      "grad_norm": 0.08562017232179642,
+      "learning_rate": 0.00019832005929556722,
+      "loss": 1.7502,
+      "step": 163
+    },
+    {
+      "epoch": 0.16532258064516128,
+      "grad_norm": 0.0961882621049881,
+      "learning_rate": 0.00019829027965905186,
+      "loss": 1.6875,
+      "step": 164
+    },
+    {
+      "epoch": 0.1663306451612903,
+      "grad_norm": 0.09505471587181091,
+      "learning_rate": 0.00019826024066595027,
+      "loss": 1.6958,
+      "step": 165
+    },
+    {
+      "epoch": 0.16733870967741934,
+      "grad_norm": 0.07493823021650314,
+      "learning_rate": 0.00019822994239552573,
+      "loss": 1.6677,
+      "step": 166
+    },
+    {
+      "epoch": 0.16834677419354838,
+      "grad_norm": 0.09159812331199646,
+      "learning_rate": 0.00019819938492772568,
+      "loss": 1.6994,
+      "step": 167
+    },
+    {
+      "epoch": 0.1693548387096774,
+      "grad_norm": 0.1118432804942131,
+      "learning_rate": 0.00019816856834318155,
+      "loss": 1.7143,
+      "step": 168
+    },
+    {
+      "epoch": 0.17036290322580644,
+      "grad_norm": 0.09199640899896622,
+      "learning_rate": 0.0001981374927232084,
+      "loss": 1.6896,
+      "step": 169
+    },
+    {
+      "epoch": 0.17137096774193547,
+      "grad_norm": 0.0801042765378952,
+      "learning_rate": 0.00019810615814980483,
+      "loss": 1.7292,
+      "step": 170
+    },
+    {
+      "epoch": 0.17237903225806453,
+      "grad_norm": 0.1115993857383728,
+      "learning_rate": 0.00019807456470565283,
+      "loss": 1.6995,
+      "step": 171
+    },
+    {
+      "epoch": 0.17338709677419356,
+      "grad_norm": 0.10155931115150452,
+      "learning_rate": 0.00019804271247411727,
+      "loss": 1.6984,
+      "step": 172
+    },
+    {
+      "epoch": 0.1743951612903226,
+      "grad_norm": 0.07809167355298996,
+      "learning_rate": 0.00019801060153924608,
+      "loss": 1.7152,
+      "step": 173
+    },
+    {
+      "epoch": 0.17540322580645162,
+      "grad_norm": 0.08765136450529099,
+      "learning_rate": 0.0001979782319857697,
+      "loss": 1.6451,
+      "step": 174
+    },
+    {
+      "epoch": 0.17641129032258066,
+      "grad_norm": 0.07360592484474182,
+      "learning_rate": 0.00019794560389910102,
+      "loss": 1.6889,
+      "step": 175
+    },
+    {
+      "epoch": 0.1774193548387097,
+      "grad_norm": 0.09308324754238129,
+      "learning_rate": 0.00019791271736533512,
+      "loss": 1.7225,
+      "step": 176
+    },
+    {
+      "epoch": 0.17842741935483872,
+      "grad_norm": 0.08810586482286453,
+      "learning_rate": 0.00019787957247124907,
+      "loss": 1.6808,
+      "step": 177
+    },
+    {
+      "epoch": 0.17943548387096775,
+      "grad_norm": 0.07750339061021805,
+      "learning_rate": 0.00019784616930430157,
+      "loss": 1.6324,
+      "step": 178
+    },
+    {
+      "epoch": 0.18044354838709678,
+      "grad_norm": 0.08474040031433105,
+      "learning_rate": 0.00019781250795263295,
+      "loss": 1.6858,
+      "step": 179
+    },
+    {
+      "epoch": 0.1814516129032258,
+      "grad_norm": 0.08277326822280884,
+      "learning_rate": 0.0001977785885050647,
+      "loss": 1.7043,
+      "step": 180
+    },
+    {
+      "epoch": 0.18245967741935484,
+      "grad_norm": 0.07668858766555786,
+      "learning_rate": 0.00019774441105109943,
+      "loss": 1.6599,
+      "step": 181
+    },
+    {
+      "epoch": 0.18346774193548387,
+      "grad_norm": 0.07402200996875763,
+      "learning_rate": 0.00019770997568092046,
+      "loss": 1.6524,
+      "step": 182
+    },
+    {
+      "epoch": 0.1844758064516129,
+      "grad_norm": 0.08267819881439209,
+      "learning_rate": 0.0001976752824853917,
+      "loss": 1.6973,
+      "step": 183
+    },
+    {
+      "epoch": 0.18548387096774194,
+      "grad_norm": 0.0688646137714386,
+      "learning_rate": 0.00019764033155605747,
+      "loss": 1.63,
+      "step": 184
+    },
+    {
+      "epoch": 0.18649193548387097,
+      "grad_norm": 0.0818399116396904,
+      "learning_rate": 0.00019760512298514198,
+      "loss": 1.6773,
+      "step": 185
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 0.08086924254894257,
+      "learning_rate": 0.0001975696568655494,
+      "loss": 1.7037,
+      "step": 186
+    },
+    {
+      "epoch": 0.18850806451612903,
+      "grad_norm": 0.08136597275733948,
+      "learning_rate": 0.00019753393329086354,
+      "loss": 1.6634,
+      "step": 187
+    },
+    {
+      "epoch": 0.18951612903225806,
+      "grad_norm": 0.10008742660284042,
+      "learning_rate": 0.00019749795235534737,
+      "loss": 1.7139,
+      "step": 188
+    },
+    {
+      "epoch": 0.1905241935483871,
+      "grad_norm": 0.08657586574554443,
+      "learning_rate": 0.0001974617141539432,
+      "loss": 1.6877,
+      "step": 189
+    },
+    {
+      "epoch": 0.19153225806451613,
+      "grad_norm": 0.09825193136930466,
+      "learning_rate": 0.0001974252187822719,
+      "loss": 1.7274,
+      "step": 190
+    },
+    {
+      "epoch": 0.19254032258064516,
+      "grad_norm": 0.06964825093746185,
+      "learning_rate": 0.00019738846633663318,
+      "loss": 1.6431,
+      "step": 191
+    },
+    {
+      "epoch": 0.1935483870967742,
+      "grad_norm": 0.07197541743516922,
+      "learning_rate": 0.0001973514569140049,
+      "loss": 1.6532,
+      "step": 192
+    },
+    {
+      "epoch": 0.19455645161290322,
+      "grad_norm": 0.07691382616758347,
+      "learning_rate": 0.00019731419061204316,
+      "loss": 1.6816,
+      "step": 193
+    },
+    {
+      "epoch": 0.19556451612903225,
+      "grad_norm": 0.08229187875986099,
+      "learning_rate": 0.00019727666752908173,
+      "loss": 1.6471,
+      "step": 194
+    },
+    {
+      "epoch": 0.19657258064516128,
+      "grad_norm": 0.0788332000374794,
+      "learning_rate": 0.00019723888776413206,
+      "loss": 1.6745,
+      "step": 195
+    },
+    {
+      "epoch": 0.1975806451612903,
+      "grad_norm": 0.08446817100048065,
+      "learning_rate": 0.00019720085141688285,
+      "loss": 1.6863,
+      "step": 196
+    },
+    {
+      "epoch": 0.19858870967741934,
+      "grad_norm": 0.0747678205370903,
+      "learning_rate": 0.00019716255858769982,
+      "loss": 1.6553,
+      "step": 197
+    },
+    {
+      "epoch": 0.19959677419354838,
+      "grad_norm": 0.08248293399810791,
+      "learning_rate": 0.0001971240093776255,
+      "loss": 1.7021,
+      "step": 198
+    },
+    {
+      "epoch": 0.2006048387096774,
+      "grad_norm": 0.0832241103053093,
+      "learning_rate": 0.00019708520388837897,
+      "loss": 1.6832,
+      "step": 199
+    },
+    {
+      "epoch": 0.20161290322580644,
+      "grad_norm": 0.10792431235313416,
+      "learning_rate": 0.00019704614222235543,
+      "loss": 1.7196,
+      "step": 200
+    },
+    {
+      "epoch": 0.20262096774193547,
+      "grad_norm": 0.09173596650362015,
+      "learning_rate": 0.0001970068244826261,
+      "loss": 1.7039,
+      "step": 201
+    },
+    {
+      "epoch": 0.20362903225806453,
+      "grad_norm": 0.07657129317522049,
+      "learning_rate": 0.00019696725077293796,
+      "loss": 1.6614,
+      "step": 202
+    },
+    {
+      "epoch": 0.20463709677419356,
+      "grad_norm": 0.08881079405546188,
+      "learning_rate": 0.00019692742119771338,
+      "loss": 1.7062,
+      "step": 203
+    },
+    {
+      "epoch": 0.2056451612903226,
+      "grad_norm": 0.11070767790079117,
+      "learning_rate": 0.00019688733586204976,
+      "loss": 1.7759,
+      "step": 204
+    },
+    {
+      "epoch": 0.20665322580645162,
+      "grad_norm": 0.07556972652673721,
+      "learning_rate": 0.00019684699487171957,
+      "loss": 1.6664,
+      "step": 205
+    },
+    {
+      "epoch": 0.20766129032258066,
+      "grad_norm": 0.11293460428714752,
+      "learning_rate": 0.00019680639833316975,
+      "loss": 1.7476,
+      "step": 206
+    },
+    {
+      "epoch": 0.2086693548387097,
+      "grad_norm": 0.08948105573654175,
+      "learning_rate": 0.00019676554635352154,
+      "loss": 1.6933,
+      "step": 207
+    },
+    {
+      "epoch": 0.20967741935483872,
+      "grad_norm": 0.1004069596529007,
+      "learning_rate": 0.00019672443904057024,
+      "loss": 1.6909,
+      "step": 208
+    },
+    {
+      "epoch": 0.21068548387096775,
+      "grad_norm": 0.0815928652882576,
+      "learning_rate": 0.00019668307650278492,
+      "loss": 1.6881,
+      "step": 209
+    },
+    {
+      "epoch": 0.21169354838709678,
+      "grad_norm": 0.10198971629142761,
+      "learning_rate": 0.00019664145884930808,
+      "loss": 1.6653,
+      "step": 210
+    },
+    {
+      "epoch": 0.2127016129032258,
+      "grad_norm": 0.07174786180257797,
+      "learning_rate": 0.00019659958618995532,
+      "loss": 1.6204,
+      "step": 211
+    },
+    {
+      "epoch": 0.21370967741935484,
+      "grad_norm": 0.09819284826517105,
+      "learning_rate": 0.0001965574586352153,
+      "loss": 1.6574,
+      "step": 212
+    },
+    {
+      "epoch": 0.21471774193548387,
+      "grad_norm": 0.07578348368406296,
+      "learning_rate": 0.00019651507629624902,
+      "loss": 1.7096,
+      "step": 213
+    },
+    {
+      "epoch": 0.2157258064516129,
+      "grad_norm": 0.09160558879375458,
+      "learning_rate": 0.00019647243928489,
+      "loss": 1.673,
+      "step": 214
+    },
+    {
+      "epoch": 0.21673387096774194,
+      "grad_norm": 0.07697172462940216,
+      "learning_rate": 0.00019642954771364362,
+      "loss": 1.7069,
+      "step": 215
+    },
+    {
+      "epoch": 0.21774193548387097,
+      "grad_norm": 0.0956280305981636,
+      "learning_rate": 0.00019638640169568702,
+      "loss": 1.6727,
+      "step": 216
+    },
+    {
+      "epoch": 0.21875,
+      "grad_norm": 0.0775306299328804,
+      "learning_rate": 0.00019634300134486877,
+      "loss": 1.6846,
+      "step": 217
+    },
+    {
+      "epoch": 0.21975806451612903,
+      "grad_norm": 0.11724736541509628,
+      "learning_rate": 0.00019629934677570848,
+      "loss": 1.6723,
+      "step": 218
+    },
+    {
+      "epoch": 0.22076612903225806,
+      "grad_norm": 0.08374209702014923,
+      "learning_rate": 0.00019625543810339652,
+      "loss": 1.6552,
+      "step": 219
+    },
+    {
+      "epoch": 0.2217741935483871,
+      "grad_norm": 0.09895430505275726,
+      "learning_rate": 0.00019621127544379392,
+      "loss": 1.6843,
+      "step": 220
+    },
+    {
+      "epoch": 0.22278225806451613,
+      "grad_norm": 0.07595435529947281,
+      "learning_rate": 0.00019616685891343173,
+      "loss": 1.6878,
+      "step": 221
+    },
+    {
+      "epoch": 0.22379032258064516,
+      "grad_norm": 0.10327397286891937,
+      "learning_rate": 0.00019612218862951098,
+      "loss": 1.641,
+      "step": 222
+    },
+    {
+      "epoch": 0.2247983870967742,
+      "grad_norm": 0.08979543298482895,
+      "learning_rate": 0.00019607726470990229,
+      "loss": 1.7116,
+      "step": 223
+    },
+    {
+      "epoch": 0.22580645161290322,
+      "grad_norm": 0.08411210030317307,
+      "learning_rate": 0.00019603208727314543,
+      "loss": 1.6503,
+      "step": 224
+    },
+    {
+      "epoch": 0.22681451612903225,
+      "grad_norm": 0.08849965780973434,
+      "learning_rate": 0.00019598665643844924,
+      "loss": 1.7119,
+      "step": 225
+    },
+    {
+      "epoch": 0.22782258064516128,
+      "grad_norm": 0.08358252048492432,
+      "learning_rate": 0.00019594097232569118,
+      "loss": 1.7034,
+      "step": 226
+    },
+    {
+      "epoch": 0.2288306451612903,
+      "grad_norm": 0.08862830698490143,
+      "learning_rate": 0.0001958950350554169,
+      "loss": 1.6937,
+      "step": 227
+    },
+    {
+      "epoch": 0.22983870967741934,
+      "grad_norm": 0.09029026329517365,
+      "learning_rate": 0.00019584884474884025,
+      "loss": 1.6537,
+      "step": 228
+    },
+    {
+      "epoch": 0.23084677419354838,
+      "grad_norm": 0.0766313225030899,
+      "learning_rate": 0.00019580240152784265,
+      "loss": 1.6399,
+      "step": 229
+    },
+    {
+      "epoch": 0.2318548387096774,
+      "grad_norm": 0.09331216663122177,
+      "learning_rate": 0.00019575570551497287,
+      "loss": 1.6876,
+      "step": 230
+    },
+    {
+      "epoch": 0.23286290322580644,
+      "grad_norm": 0.07506153732538223,
+      "learning_rate": 0.00019570875683344672,
+      "loss": 1.6339,
+      "step": 231
+    },
+    {
+      "epoch": 0.23387096774193547,
+      "grad_norm": 0.08822404593229294,
+      "learning_rate": 0.0001956615556071468,
+      "loss": 1.6883,
+      "step": 232
+    },
+    {
+      "epoch": 0.23487903225806453,
+      "grad_norm": 0.07617950439453125,
+      "learning_rate": 0.000195614101960622,
+      "loss": 1.6845,
+      "step": 233
+    },
+    {
+      "epoch": 0.23588709677419356,
+      "grad_norm": 0.0857347846031189,
+      "learning_rate": 0.00019556639601908728,
+      "loss": 1.6769,
+      "step": 234
+    },
+    {
+      "epoch": 0.2368951612903226,
+      "grad_norm": 0.08155297487974167,
+      "learning_rate": 0.00019551843790842338,
+      "loss": 1.7275,
+      "step": 235
+    },
+    {
+      "epoch": 0.23790322580645162,
+      "grad_norm": 0.08427773416042328,
+      "learning_rate": 0.00019547022775517645,
+      "loss": 1.627,
+      "step": 236
+    },
+    {
+      "epoch": 0.23891129032258066,
+      "grad_norm": 0.0765247493982315,
+      "learning_rate": 0.00019542176568655757,
+      "loss": 1.6719,
+      "step": 237
+    },
+    {
+      "epoch": 0.2399193548387097,
+      "grad_norm": 0.07752780616283417,
+      "learning_rate": 0.00019537305183044268,
+      "loss": 1.6307,
+      "step": 238
+    },
+    {
+      "epoch": 0.24092741935483872,
+      "grad_norm": 0.07956812530755997,
+      "learning_rate": 0.00019532408631537203,
+      "loss": 1.6466,
+      "step": 239
+    },
+    {
+      "epoch": 0.24193548387096775,
+      "grad_norm": 0.07456839084625244,
+      "learning_rate": 0.00019527486927054994,
+      "loss": 1.6692,
+      "step": 240
+    },
+    {
+      "epoch": 0.24294354838709678,
+      "grad_norm": 0.08381907641887665,
+      "learning_rate": 0.00019522540082584443,
+      "loss": 1.679,
+      "step": 241
+    },
+    {
+      "epoch": 0.2439516129032258,
+      "grad_norm": 0.07443513721227646,
+      "learning_rate": 0.0001951756811117869,
+      "loss": 1.6867,
+      "step": 242
+    },
+    {
+      "epoch": 0.24495967741935484,
+      "grad_norm": 0.08541234582662582,
+      "learning_rate": 0.00019512571025957182,
+      "loss": 1.6424,
+      "step": 243
+    },
+    {
+      "epoch": 0.24596774193548387,
+      "grad_norm": 0.07867056876420975,
+      "learning_rate": 0.00019507548840105618,
+      "loss": 1.6847,
+      "step": 244
+    },
+    {
+      "epoch": 0.2469758064516129,
+      "grad_norm": 0.11804165691137314,
+      "learning_rate": 0.00019502501566875943,
+      "loss": 1.783,
+      "step": 245
+    },
+    {
+      "epoch": 0.24798387096774194,
+      "grad_norm": 0.0737847164273262,
+      "learning_rate": 0.00019497429219586296,
+      "loss": 1.6644,
+      "step": 246
+    },
+    {
+      "epoch": 0.24899193548387097,
+      "grad_norm": 0.08608712255954742,
+      "learning_rate": 0.00019492331811620976,
+      "loss": 1.6763,
+      "step": 247
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.09786904603242874,
+      "learning_rate": 0.00019487209356430413,
+      "loss": 1.7245,
+      "step": 248
+    },
+    {
+      "epoch": 0.25100806451612906,
+      "grad_norm": 0.10795535892248154,
+      "learning_rate": 0.00019482061867531127,
+      "loss": 1.7183,
+      "step": 249
+    },
+    {
+      "epoch": 0.25201612903225806,
+      "grad_norm": 0.0815276950597763,
+      "learning_rate": 0.0001947688935850569,
+      "loss": 1.7026,
+      "step": 250
+    },
+    {
+      "epoch": 0.2530241935483871,
+      "grad_norm": 0.09202085435390472,
+      "learning_rate": 0.00019471691843002701,
+      "loss": 1.6327,
+      "step": 251
+    },
+    {
+      "epoch": 0.2540322580645161,
+      "grad_norm": 0.08682993054389954,
+      "learning_rate": 0.00019466469334736739,
+      "loss": 1.6532,
+      "step": 252
+    },
+    {
+      "epoch": 0.2550403225806452,
+      "grad_norm": 0.08007092773914337,
+      "learning_rate": 0.00019461221847488333,
+      "loss": 1.6587,
+      "step": 253
+    },
+    {
+      "epoch": 0.2560483870967742,
+      "grad_norm": 0.12094767391681671,
+      "learning_rate": 0.0001945594939510392,
+      "loss": 1.7491,
+      "step": 254
+    },
+    {
+      "epoch": 0.25705645161290325,
+      "grad_norm": 0.10074511170387268,
+      "learning_rate": 0.00019450651991495812,
+      "loss": 1.7363,
+      "step": 255
+    },
+    {
+      "epoch": 0.25806451612903225,
+      "grad_norm": 0.0891348272562027,
+      "learning_rate": 0.00019445329650642163,
+      "loss": 1.6925,
+      "step": 256
+    },
+    {
+      "epoch": 0.2590725806451613,
+      "grad_norm": 0.1022176444530487,
+      "learning_rate": 0.00019439982386586932,
+      "loss": 1.6419,
+      "step": 257
+    },
+    {
+      "epoch": 0.2600806451612903,
+      "grad_norm": 0.08925571292638779,
+      "learning_rate": 0.00019434610213439832,
+      "loss": 1.6575,
+      "step": 258
+    },
+    {
+      "epoch": 0.2610887096774194,
+      "grad_norm": 0.07562322169542313,
+      "learning_rate": 0.0001942921314537631,
+      "loss": 1.6187,
+      "step": 259
+    },
+    {
+      "epoch": 0.2620967741935484,
+      "grad_norm": 0.09982999414205551,
+      "learning_rate": 0.000194237911966375,
+      "loss": 1.6341,
+      "step": 260
+    },
+    {
+      "epoch": 0.26310483870967744,
+      "grad_norm": 0.08155392110347748,
+      "learning_rate": 0.0001941834438153019,
+      "loss": 1.7189,
+      "step": 261
+    },
+    {
+      "epoch": 0.26411290322580644,
+      "grad_norm": 0.08979921042919159,
+      "learning_rate": 0.00019412872714426782,
+      "loss": 1.6556,
+      "step": 262
+    },
+    {
+      "epoch": 0.2651209677419355,
+      "grad_norm": 0.08493686467409134,
+      "learning_rate": 0.00019407376209765255,
+      "loss": 1.6919,
+      "step": 263
+    },
+    {
+      "epoch": 0.2661290322580645,
+      "grad_norm": 0.0822565034031868,
+      "learning_rate": 0.0001940185488204912,
+      "loss": 1.6205,
+      "step": 264
+    },
+    {
+      "epoch": 0.26713709677419356,
+      "grad_norm": 0.08931294083595276,
+      "learning_rate": 0.00019396308745847402,
+      "loss": 1.6848,
+      "step": 265
+    },
+    {
+      "epoch": 0.26814516129032256,
+      "grad_norm": 0.08736932277679443,
+      "learning_rate": 0.00019390737815794574,
+      "loss": 1.6882,
+      "step": 266
+    },
+    {
+      "epoch": 0.2691532258064516,
+      "grad_norm": 0.09153414517641068,
+      "learning_rate": 0.00019385142106590535,
+      "loss": 1.7596,
+      "step": 267
+    },
+    {
+      "epoch": 0.2701612903225806,
+      "grad_norm": 0.07890645414590836,
+      "learning_rate": 0.00019379521633000572,
+      "loss": 1.6987,
+      "step": 268
+    },
+    {
+      "epoch": 0.2711693548387097,
+      "grad_norm": 0.08790858089923859,
+      "learning_rate": 0.0001937387640985532,
+      "loss": 1.6744,
+      "step": 269
+    },
+    {
+      "epoch": 0.2721774193548387,
+      "grad_norm": 0.0803663581609726,
+      "learning_rate": 0.00019368206452050713,
+      "loss": 1.6846,
+      "step": 270
+    },
+    {
+      "epoch": 0.27318548387096775,
+      "grad_norm": 0.09086322039365768,
+      "learning_rate": 0.00019362511774547955,
+      "loss": 1.6878,
+      "step": 271
+    },
+    {
+      "epoch": 0.27419354838709675,
+      "grad_norm": 0.07199586182832718,
+      "learning_rate": 0.00019356792392373479,
+      "loss": 1.6316,
+      "step": 272
+    },
+    {
+      "epoch": 0.2752016129032258,
+      "grad_norm": 0.08460623025894165,
+      "learning_rate": 0.00019351048320618896,
+      "loss": 1.6558,
+      "step": 273
+    },
+    {
+      "epoch": 0.2762096774193548,
+      "grad_norm": 0.0732608363032341,
+      "learning_rate": 0.0001934527957444098,
+      "loss": 1.6752,
+      "step": 274
+    },
+    {
+      "epoch": 0.2772177419354839,
+      "grad_norm": 0.0906132385134697,
+      "learning_rate": 0.00019339486169061608,
+      "loss": 1.7395,
+      "step": 275
+    },
+    {
+      "epoch": 0.2782258064516129,
+      "grad_norm": 0.07827211916446686,
+      "learning_rate": 0.00019333668119767716,
+      "loss": 1.6681,
+      "step": 276
+    },
+    {
+      "epoch": 0.27923387096774194,
+      "grad_norm": 0.08276840299367905,
+      "learning_rate": 0.00019327825441911275,
+      "loss": 1.6645,
+      "step": 277
+    },
+    {
+      "epoch": 0.28024193548387094,
+      "grad_norm": 0.09114561229944229,
+      "learning_rate": 0.00019321958150909243,
+      "loss": 1.6857,
+      "step": 278
+    },
+    {
+      "epoch": 0.28125,
+      "grad_norm": 0.08729056268930435,
+      "learning_rate": 0.00019316066262243525,
+      "loss": 1.6483,
+      "step": 279
+    },
+    {
+      "epoch": 0.28225806451612906,
+      "grad_norm": 0.08572946488857269,
+      "learning_rate": 0.00019310149791460925,
+      "loss": 1.6872,
+      "step": 280
+    },
+    {
+      "epoch": 0.28326612903225806,
+      "grad_norm": 0.10044838488101959,
+      "learning_rate": 0.00019304208754173117,
+      "loss": 1.6935,
+      "step": 281
+    },
+    {
+      "epoch": 0.2842741935483871,
+      "grad_norm": 0.0785636454820633,
+      "learning_rate": 0.000192982431660566,
+      "loss": 1.6613,
+      "step": 282
+    },
+    {
+      "epoch": 0.2852822580645161,
+      "grad_norm": 0.08499724417924881,
+      "learning_rate": 0.00019292253042852648,
+      "loss": 1.6208,
+      "step": 283
+    },
+    {
+      "epoch": 0.2862903225806452,
+      "grad_norm": 0.09399082511663437,
+      "learning_rate": 0.00019286238400367277,
+      "loss": 1.619,
+      "step": 284
+    },
+    {
+      "epoch": 0.2872983870967742,
+      "grad_norm": 0.07334808260202408,
+      "learning_rate": 0.0001928019925447121,
+      "loss": 1.6813,
+      "step": 285
+    },
+    {
+      "epoch": 0.28830645161290325,
+      "grad_norm": 0.09035395085811615,
+      "learning_rate": 0.00019274135621099813,
+      "loss": 1.6265,
+      "step": 286
+    },
+    {
+      "epoch": 0.28931451612903225,
+      "grad_norm": 0.07861501723527908,
+      "learning_rate": 0.00019268047516253077,
+      "loss": 1.6808,
+      "step": 287
+    },
+    {
+      "epoch": 0.2903225806451613,
+      "grad_norm": 0.09788773208856583,
+      "learning_rate": 0.00019261934955995563,
+      "loss": 1.708,
+      "step": 288
+    },
+    {
+      "epoch": 0.2913306451612903,
+      "grad_norm": 0.07571721822023392,
+      "learning_rate": 0.00019255797956456357,
+      "loss": 1.6612,
+      "step": 289
+    },
+    {
+      "epoch": 0.2923387096774194,
+      "grad_norm": 0.0836874321103096,
+      "learning_rate": 0.00019249636533829042,
+      "loss": 1.6804,
+      "step": 290
+    },
+    {
+      "epoch": 0.2933467741935484,
+      "grad_norm": 0.08373916894197464,
+      "learning_rate": 0.00019243450704371632,
+      "loss": 1.6317,
+      "step": 291
+    },
+    {
+      "epoch": 0.29435483870967744,
+      "grad_norm": 0.08029752969741821,
+      "learning_rate": 0.00019237240484406561,
+      "loss": 1.6782,
+      "step": 292
+    },
+    {
+      "epoch": 0.29536290322580644,
+      "grad_norm": 0.08353215456008911,
+      "learning_rate": 0.00019231005890320602,
+      "loss": 1.6517,
+      "step": 293
+    },
+    {
+      "epoch": 0.2963709677419355,
+      "grad_norm": 0.09467596560716629,
+      "learning_rate": 0.00019224746938564859,
+      "loss": 1.6862,
+      "step": 294
+    },
+    {
+      "epoch": 0.2973790322580645,
+      "grad_norm": 0.10909095406532288,
+      "learning_rate": 0.000192184636456547,
+      "loss": 1.6579,
+      "step": 295
+    },
+    {
+      "epoch": 0.29838709677419356,
+      "grad_norm": 0.08434964716434479,
+      "learning_rate": 0.00019212156028169724,
+      "loss": 1.6516,
+      "step": 296
+    },
+    {
+      "epoch": 0.29939516129032256,
+      "grad_norm": 0.09146866202354431,
+      "learning_rate": 0.00019205824102753717,
+      "loss": 1.6754,
+      "step": 297
+    },
+    {
+      "epoch": 0.3004032258064516,
+      "grad_norm": 0.10936370491981506,
+      "learning_rate": 0.00019199467886114603,
+      "loss": 1.6495,
+      "step": 298
+    },
+    {
+      "epoch": 0.3014112903225806,
+      "grad_norm": 0.08099015057086945,
+      "learning_rate": 0.00019193087395024397,
+      "loss": 1.6656,
+      "step": 299
+    },
+    {
+      "epoch": 0.3024193548387097,
+      "grad_norm": 0.09252738207578659,
+      "learning_rate": 0.0001918668264631918,
+      "loss": 1.6711,
+      "step": 300
+    },
+    {
+      "epoch": 0.3034274193548387,
+      "grad_norm": 0.08917499333620071,
+      "learning_rate": 0.0001918025365689903,
+      "loss": 1.6356,
+      "step": 301
+    },
+    {
+      "epoch": 0.30443548387096775,
+      "grad_norm": 0.088597372174263,
+      "learning_rate": 0.00019173800443727994,
+      "loss": 1.6659,
+      "step": 302
+    },
+    {
+      "epoch": 0.30544354838709675,
+      "grad_norm": 0.09308971464633942,
+      "learning_rate": 0.00019167323023834033,
+      "loss": 1.7218,
+      "step": 303
+    },
+    {
+      "epoch": 0.3064516129032258,
+      "grad_norm": 0.07813969999551773,
+      "learning_rate": 0.00019160821414308988,
+      "loss": 1.6042,
+      "step": 304
+    },
+    {
+      "epoch": 0.3074596774193548,
+      "grad_norm": 0.08843039721250534,
+      "learning_rate": 0.0001915429563230853,
+      "loss": 1.6409,
+      "step": 305
+    },
+    {
+      "epoch": 0.3084677419354839,
+      "grad_norm": 0.09537311643362045,
+      "learning_rate": 0.00019147745695052097,
+      "loss": 1.6723,
+      "step": 306
+    },
+    {
+      "epoch": 0.3094758064516129,
+      "grad_norm": 0.08754942566156387,
+      "learning_rate": 0.00019141171619822882,
+      "loss": 1.643,
+      "step": 307
+    },
+    {
+      "epoch": 0.31048387096774194,
+      "grad_norm": 0.07768256217241287,
+      "learning_rate": 0.0001913457342396777,
+      "loss": 1.6109,
+      "step": 308
+    },
+    {
+      "epoch": 0.31149193548387094,
+      "grad_norm": 0.09593945741653442,
+      "learning_rate": 0.00019127951124897283,
+      "loss": 1.6756,
+      "step": 309
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 0.07348258048295975,
+      "learning_rate": 0.00019121304740085546,
+      "loss": 1.623,
+      "step": 310
+    },
+    {
+      "epoch": 0.31350806451612906,
+      "grad_norm": 0.08579769730567932,
+      "learning_rate": 0.0001911463428707025,
+      "loss": 1.658,
+      "step": 311
+    },
+    {
+      "epoch": 0.31451612903225806,
+      "grad_norm": 0.08485422283411026,
+      "learning_rate": 0.00019107939783452577,
+      "loss": 1.655,
+      "step": 312
+    },
+    {
+      "epoch": 0.3155241935483871,
+      "grad_norm": 0.08101114630699158,
+      "learning_rate": 0.00019101221246897184,
+      "loss": 1.6391,
+      "step": 313
+    },
+    {
+      "epoch": 0.3165322580645161,
+      "grad_norm": 0.08206996321678162,
+      "learning_rate": 0.00019094478695132138,
+      "loss": 1.6131,
+      "step": 314
+    },
+    {
+      "epoch": 0.3175403225806452,
+      "grad_norm": 0.07818609476089478,
+      "learning_rate": 0.00019087712145948868,
+      "loss": 1.6632,
+      "step": 315
+    },
+    {
+      "epoch": 0.3185483870967742,
+      "grad_norm": 0.09414539486169815,
+      "learning_rate": 0.0001908092161720214,
+      "loss": 1.6717,
+      "step": 316
+    },
+    {
+      "epoch": 0.31955645161290325,
+      "grad_norm": 0.08382460474967957,
+      "learning_rate": 0.00019074107126809984,
+      "loss": 1.6867,
+      "step": 317
+    },
+    {
+      "epoch": 0.32056451612903225,
+      "grad_norm": 0.07750436663627625,
+      "learning_rate": 0.00019067268692753655,
+      "loss": 1.6311,
+      "step": 318
+    },
+    {
+      "epoch": 0.3215725806451613,
+      "grad_norm": 0.08067768812179565,
+      "learning_rate": 0.00019060406333077596,
+      "loss": 1.6681,
+      "step": 319
+    },
+    {
+      "epoch": 0.3225806451612903,
+      "grad_norm": 0.074059396982193,
+      "learning_rate": 0.00019053520065889375,
+      "loss": 1.6408,
+      "step": 320
+    },
+    {
+      "epoch": 0.3235887096774194,
+      "grad_norm": 0.10559958219528198,
+      "learning_rate": 0.00019046609909359648,
+      "loss": 1.7342,
+      "step": 321
+    },
+    {
+      "epoch": 0.3245967741935484,
+      "grad_norm": 0.08121935278177261,
+      "learning_rate": 0.00019039675881722104,
+      "loss": 1.6808,
+      "step": 322
+    },
+    {
+      "epoch": 0.32560483870967744,
+      "grad_norm": 0.08211352676153183,
+      "learning_rate": 0.00019032718001273427,
+      "loss": 1.6127,
+      "step": 323
+    },
+    {
+      "epoch": 0.32661290322580644,
+      "grad_norm": 0.07450398057699203,
+      "learning_rate": 0.0001902573628637323,
+      "loss": 1.6555,
+      "step": 324
+    },
+    {
+      "epoch": 0.3276209677419355,
+      "grad_norm": 0.0976330116391182,
+      "learning_rate": 0.0001901873075544403,
+      "loss": 1.6775,
+      "step": 325
+    },
+    {
+      "epoch": 0.3286290322580645,
+      "grad_norm": 0.08012880384922028,
+      "learning_rate": 0.00019011701426971178,
+      "loss": 1.6213,
+      "step": 326
+    },
+    {
+      "epoch": 0.32963709677419356,
+      "grad_norm": 0.08508668839931488,
+      "learning_rate": 0.00019004648319502824,
+      "loss": 1.5809,
+      "step": 327
+    },
+    {
+      "epoch": 0.33064516129032256,
+      "grad_norm": 0.08622655272483826,
+      "learning_rate": 0.00018997571451649856,
+      "loss": 1.666,
+      "step": 328
+    },
+    {
+      "epoch": 0.3316532258064516,
+      "grad_norm": 0.09803669154644012,
+      "learning_rate": 0.00018990470842085867,
+      "loss": 1.6784,
+      "step": 329
+    },
+    {
+      "epoch": 0.3326612903225806,
+      "grad_norm": 0.08453961461782455,
+      "learning_rate": 0.0001898334650954709,
+      "loss": 1.6109,
+      "step": 330
+    },
+    {
+      "epoch": 0.3336693548387097,
+      "grad_norm": 0.07246208935976028,
+      "learning_rate": 0.00018976198472832364,
+      "loss": 1.6117,
+      "step": 331
+    },
+    {
+      "epoch": 0.3346774193548387,
+      "grad_norm": 0.08284757286310196,
+      "learning_rate": 0.00018969026750803063,
+      "loss": 1.6094,
+      "step": 332
+    },
+    {
+      "epoch": 0.33568548387096775,
+      "grad_norm": 0.08026500046253204,
+      "learning_rate": 0.00018961831362383067,
+      "loss": 1.6555,
+      "step": 333
+    },
+    {
+      "epoch": 0.33669354838709675,
+      "grad_norm": 0.08912428468465805,
+      "learning_rate": 0.00018954612326558707,
+      "loss": 1.6602,
+      "step": 334
+    },
+    {
+      "epoch": 0.3377016129032258,
+      "grad_norm": 0.08738451451063156,
+      "learning_rate": 0.00018947369662378704,
+      "loss": 1.6125,
+      "step": 335
+    },
+    {
+      "epoch": 0.3387096774193548,
+      "grad_norm": 0.07017836719751358,
+      "learning_rate": 0.00018940103388954133,
+      "loss": 1.6173,
+      "step": 336
+    },
+    {
+      "epoch": 0.3397177419354839,
+      "grad_norm": 0.08264176547527313,
+      "learning_rate": 0.00018932813525458363,
+      "loss": 1.6716,
+      "step": 337
+    },
+    {
+      "epoch": 0.3407258064516129,
+      "grad_norm": 0.08516332507133484,
+      "learning_rate": 0.00018925500091127007,
+      "loss": 1.6752,
+      "step": 338
+    },
+    {
+      "epoch": 0.34173387096774194,
+      "grad_norm": 0.07101423293352127,
+      "learning_rate": 0.00018918163105257883,
+      "loss": 1.6393,
+      "step": 339
+    },
+    {
+      "epoch": 0.34274193548387094,
+      "grad_norm": 0.07172892987728119,
+      "learning_rate": 0.00018910802587210942,
+      "loss": 1.6116,
+      "step": 340
+    },
+    {
+      "epoch": 0.34375,
+      "grad_norm": 0.07889813184738159,
+      "learning_rate": 0.0001890341855640824,
+      "loss": 1.6107,
+      "step": 341
+    },
+    {
+      "epoch": 0.34475806451612906,
+      "grad_norm": 0.07734905183315277,
+      "learning_rate": 0.0001889601103233387,
+      "loss": 1.6686,
+      "step": 342
+    },
+    {
+      "epoch": 0.34576612903225806,
+      "grad_norm": 0.09568161517381668,
+      "learning_rate": 0.00018888580034533915,
+      "loss": 1.6914,
+      "step": 343
+    },
+    {
+      "epoch": 0.3467741935483871,
+      "grad_norm": 0.0727929100394249,
+      "learning_rate": 0.000188811255826164,
+      "loss": 1.6271,
+      "step": 344
+    },
+    {
+      "epoch": 0.3477822580645161,
+      "grad_norm": 0.07241855561733246,
+      "learning_rate": 0.0001887364769625124,
+      "loss": 1.6514,
+      "step": 345
+    },
+    {
+      "epoch": 0.3487903225806452,
+      "grad_norm": 0.07215382158756256,
+      "learning_rate": 0.00018866146395170178,
+      "loss": 1.6578,
+      "step": 346
+    },
+    {
+      "epoch": 0.3497983870967742,
+      "grad_norm": 0.07429207116365433,
+      "learning_rate": 0.00018858621699166755,
+      "loss": 1.6176,
+      "step": 347
+    },
+    {
+      "epoch": 0.35080645161290325,
+      "grad_norm": 0.07516060024499893,
+      "learning_rate": 0.00018851073628096225,
+      "loss": 1.6735,
+      "step": 348
+    },
+    {
+      "epoch": 0.35181451612903225,
+      "grad_norm": 0.08864877372980118,
+      "learning_rate": 0.0001884350220187554,
+      "loss": 1.6044,
+      "step": 349
+    },
+    {
+      "epoch": 0.3528225806451613,
+      "grad_norm": 0.0749056488275528,
+      "learning_rate": 0.00018835907440483267,
+      "loss": 1.6316,
+      "step": 350
+    },
+    {
+      "epoch": 0.3538306451612903,
+      "grad_norm": 0.09181974828243256,
+      "learning_rate": 0.0001882828936395955,
+      "loss": 1.6834,
+      "step": 351
+    },
+    {
+      "epoch": 0.3548387096774194,
+      "grad_norm": 0.08013599365949631,
+      "learning_rate": 0.00018820647992406054,
+      "loss": 1.6367,
+      "step": 352
+    },
+    {
+      "epoch": 0.3558467741935484,
+      "grad_norm": 0.0809824988245964,
+      "learning_rate": 0.00018812983345985914,
+      "loss": 1.658,
+      "step": 353
+    },
+    {
+      "epoch": 0.35685483870967744,
+      "grad_norm": 0.1000952199101448,
+      "learning_rate": 0.0001880529544492368,
+      "loss": 1.6571,
+      "step": 354
+    },
+    {
+      "epoch": 0.35786290322580644,
+      "grad_norm": 0.074663445353508,
+      "learning_rate": 0.00018797584309505254,
+      "loss": 1.6358,
+      "step": 355
+    },
+    {
+      "epoch": 0.3588709677419355,
+      "grad_norm": 0.0898260623216629,
+      "learning_rate": 0.00018789849960077864,
+      "loss": 1.6496,
+      "step": 356
+    },
+    {
+      "epoch": 0.3598790322580645,
+      "grad_norm": 0.08878135681152344,
+      "learning_rate": 0.00018782092417049979,
+      "loss": 1.6819,
+      "step": 357
+    },
+    {
+      "epoch": 0.36088709677419356,
+      "grad_norm": 0.07256605476140976,
+      "learning_rate": 0.00018774311700891269,
+      "loss": 1.6521,
+      "step": 358
+    },
+    {
+      "epoch": 0.36189516129032256,
+      "grad_norm": 0.07939675450325012,
+      "learning_rate": 0.00018766507832132558,
+      "loss": 1.6898,
+      "step": 359
+    },
+    {
+      "epoch": 0.3629032258064516,
+      "grad_norm": 0.07508337497711182,
+      "learning_rate": 0.00018758680831365755,
+      "loss": 1.6204,
+      "step": 360
+    },
+    {
+      "epoch": 0.3639112903225806,
+      "grad_norm": 0.07679913192987442,
+      "learning_rate": 0.00018750830719243812,
+      "loss": 1.597,
+      "step": 361
+    },
+    {
+      "epoch": 0.3649193548387097,
+      "grad_norm": 0.07900839298963547,
+      "learning_rate": 0.00018742957516480657,
+      "loss": 1.6197,
+      "step": 362
+    },
+    {
+      "epoch": 0.3659274193548387,
+      "grad_norm": 0.08279551565647125,
+      "learning_rate": 0.00018735061243851158,
+      "loss": 1.7151,
+      "step": 363
+    },
+    {
+      "epoch": 0.36693548387096775,
+      "grad_norm": 0.10616319626569748,
+      "learning_rate": 0.00018727141922191047,
+      "loss": 1.7228,
+      "step": 364
+    },
+    {
+      "epoch": 0.36794354838709675,
+      "grad_norm": 0.08777708560228348,
+      "learning_rate": 0.00018719199572396882,
+      "loss": 1.6661,
+      "step": 365
+    },
+    {
+      "epoch": 0.3689516129032258,
+      "grad_norm": 0.0981433242559433,
+      "learning_rate": 0.00018711234215425978,
+      "loss": 1.6331,
+      "step": 366
+    },
+    {
+      "epoch": 0.3699596774193548,
+      "grad_norm": 0.07754123210906982,
+      "learning_rate": 0.00018703245872296365,
+      "loss": 1.6757,
+      "step": 367
+    },
+    {
+      "epoch": 0.3709677419354839,
+      "grad_norm": 0.09494742751121521,
+      "learning_rate": 0.00018695234564086724,
+      "loss": 1.6565,
+      "step": 368
+    },
+    {
+      "epoch": 0.3719758064516129,
+      "grad_norm": 0.100984126329422,
+      "learning_rate": 0.00018687200311936328,
+      "loss": 1.6879,
+      "step": 369
+    },
+    {
+      "epoch": 0.37298387096774194,
+      "grad_norm": 0.08996261656284332,
+      "learning_rate": 0.00018679143137045006,
+      "loss": 1.6579,
+      "step": 370
+    },
+    {
+      "epoch": 0.37399193548387094,
+      "grad_norm": 0.0966666117310524,
+      "learning_rate": 0.00018671063060673055,
+      "loss": 1.5853,
+      "step": 371
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 0.07991211116313934,
+      "learning_rate": 0.00018662960104141215,
+      "loss": 1.6355,
+      "step": 372
+    },
+    {
+      "epoch": 0.37600806451612906,
+      "grad_norm": 0.09592580795288086,
+      "learning_rate": 0.00018654834288830591,
+      "loss": 1.6172,
+      "step": 373
+    },
+    {
+      "epoch": 0.37701612903225806,
+      "grad_norm": 0.07976924628019333,
+      "learning_rate": 0.00018646685636182614,
+      "loss": 1.641,
+      "step": 374
+    },
+    {
+      "epoch": 0.3780241935483871,
+      "grad_norm": 0.08822676539421082,
+      "learning_rate": 0.00018638514167698965,
+      "loss": 1.6267,
+      "step": 375
+    },
+    {
+      "epoch": 0.3790322580645161,
+      "grad_norm": 0.07680735737085342,
+      "learning_rate": 0.00018630319904941535,
+      "loss": 1.6484,
+      "step": 376
+    },
+    {
+      "epoch": 0.3800403225806452,
+      "grad_norm": 0.09095903486013412,
+      "learning_rate": 0.0001862210286953236,
+      "loss": 1.6041,
+      "step": 377
+    },
+    {
+      "epoch": 0.3810483870967742,
+      "grad_norm": 0.07204829901456833,
+      "learning_rate": 0.0001861386308315357,
+      "loss": 1.6058,
+      "step": 378
+    },
+    {
+      "epoch": 0.38205645161290325,
+      "grad_norm": 0.12447134405374527,
+      "learning_rate": 0.00018605600567547318,
+      "loss": 1.6528,
+      "step": 379
+    },
+    {
+      "epoch": 0.38306451612903225,
+      "grad_norm": 0.08234449476003647,
+      "learning_rate": 0.00018597315344515744,
+      "loss": 1.6408,
+      "step": 380
+    },
+    {
+      "epoch": 0.3840725806451613,
+      "grad_norm": 0.0997692123055458,
+      "learning_rate": 0.00018589007435920892,
+      "loss": 1.631,
+      "step": 381
+    },
+    {
+      "epoch": 0.3850806451612903,
+      "grad_norm": 0.10275771468877792,
+      "learning_rate": 0.0001858067686368468,
+      "loss": 1.6979,
+      "step": 382
+    },
+    {
+      "epoch": 0.3860887096774194,
+      "grad_norm": 0.07703027874231339,
+      "learning_rate": 0.00018572323649788822,
+      "loss": 1.6037,
+      "step": 383
+    },
+    {
+      "epoch": 0.3870967741935484,
+      "grad_norm": 0.08485141396522522,
+      "learning_rate": 0.0001856394781627477,
+      "loss": 1.6027,
+      "step": 384
+    },
+    {
+      "epoch": 0.38810483870967744,
+      "grad_norm": 0.09312494099140167,
+      "learning_rate": 0.00018555549385243674,
+      "loss": 1.6757,
+      "step": 385
+    },
+    {
+      "epoch": 0.38911290322580644,
+      "grad_norm": 0.09300917387008667,
+      "learning_rate": 0.000185471283788563,
+      "loss": 1.6615,
+      "step": 386
+    },
+    {
+      "epoch": 0.3901209677419355,
+      "grad_norm": 0.07911553978919983,
+      "learning_rate": 0.0001853868481933299,
+      "loss": 1.6214,
+      "step": 387
+    },
+    {
+      "epoch": 0.3911290322580645,
+      "grad_norm": 0.07960621267557144,
+      "learning_rate": 0.00018530218728953597,
+      "loss": 1.6709,
+      "step": 388
+    },
+    {
+      "epoch": 0.39213709677419356,
+      "grad_norm": 0.0723830983042717,
+      "learning_rate": 0.0001852173013005742,
+      "loss": 1.6287,
+      "step": 389
+    },
+    {
+      "epoch": 0.39314516129032256,
+      "grad_norm": 0.08178212493658066,
+      "learning_rate": 0.00018513219045043156,
+      "loss": 1.5888,
+      "step": 390
+    },
+    {
+      "epoch": 0.3941532258064516,
+      "grad_norm": 0.07604778558015823,
+      "learning_rate": 0.00018504685496368838,
+      "loss": 1.6097,
+      "step": 391
+    },
+    {
+      "epoch": 0.3951612903225806,
+      "grad_norm": 0.07833520323038101,
+      "learning_rate": 0.00018496129506551763,
+      "loss": 1.6119,
+      "step": 392
+    },
+    {
+      "epoch": 0.3961693548387097,
+      "grad_norm": 0.0738687738776207,
+      "learning_rate": 0.00018487551098168452,
+      "loss": 1.646,
+      "step": 393
+    },
+    {
+      "epoch": 0.3971774193548387,
+      "grad_norm": 0.08156421035528183,
+      "learning_rate": 0.0001847895029385458,
+      "loss": 1.612,
+      "step": 394
+    },
+    {
+      "epoch": 0.39818548387096775,
+      "grad_norm": 0.0760064423084259,
+      "learning_rate": 0.00018470327116304916,
+      "loss": 1.6556,
+      "step": 395
+    },
+    {
+      "epoch": 0.39919354838709675,
+      "grad_norm": 0.07635514438152313,
+      "learning_rate": 0.0001846168158827326,
+      "loss": 1.5948,
+      "step": 396
+    },
+    {
+      "epoch": 0.4002016129032258,
+      "grad_norm": 0.07415641099214554,
+      "learning_rate": 0.00018453013732572403,
+      "loss": 1.6379,
+      "step": 397
+    },
+    {
+      "epoch": 0.4012096774193548,
+      "grad_norm": 0.07627629488706589,
+      "learning_rate": 0.00018444323572074035,
+      "loss": 1.6067,
+      "step": 398
+    },
+    {
+      "epoch": 0.4022177419354839,
+      "grad_norm": 0.08279147744178772,
+      "learning_rate": 0.00018435611129708713,
+      "loss": 1.6152,
+      "step": 399
+    },
+    {
+      "epoch": 0.4032258064516129,
+      "grad_norm": 0.07391797006130219,
+      "learning_rate": 0.00018426876428465777,
+      "loss": 1.6568,
+      "step": 400
+    },
+    {
+      "epoch": 0.40423387096774194,
+      "grad_norm": 0.07815629243850708,
+      "learning_rate": 0.00018418119491393312,
+      "loss": 1.6301,
+      "step": 401
+    },
+    {
+      "epoch": 0.40524193548387094,
+      "grad_norm": 0.07491758465766907,
+      "learning_rate": 0.0001840934034159807,
+      "loss": 1.6668,
+      "step": 402
+    },
+    {
+      "epoch": 0.40625,
+      "grad_norm": 0.07878877222537994,
+      "learning_rate": 0.0001840053900224542,
+      "loss": 1.6305,
+      "step": 403
+    },
+    {
+      "epoch": 0.40725806451612906,
+      "grad_norm": 0.07592154294252396,
+      "learning_rate": 0.00018391715496559273,
+      "loss": 1.6853,
+      "step": 404
+    },
+    {
+      "epoch": 0.40826612903225806,
+      "grad_norm": 0.082845039665699,
+      "learning_rate": 0.00018382869847822044,
+      "loss": 1.6918,
+      "step": 405
+    },
+    {
+      "epoch": 0.4092741935483871,
+      "grad_norm": 0.07842651754617691,
+      "learning_rate": 0.00018374002079374569,
+      "loss": 1.65,
+      "step": 406
+    },
+    {
+      "epoch": 0.4102822580645161,
+      "grad_norm": 0.07326355576515198,
+      "learning_rate": 0.0001836511221461604,
+      "loss": 1.6157,
+      "step": 407
+    },
+    {
+      "epoch": 0.4112903225806452,
+      "grad_norm": 0.08537916839122772,
+      "learning_rate": 0.00018356200277003975,
+      "loss": 1.5959,
+      "step": 408
+    },
+    {
+      "epoch": 0.4122983870967742,
+      "grad_norm": 0.09612290561199188,
+      "learning_rate": 0.00018347266290054116,
+      "loss": 1.6876,
+      "step": 409
+    },
+    {
+      "epoch": 0.41330645161290325,
+      "grad_norm": 0.07688483595848083,
+      "learning_rate": 0.00018338310277340406,
+      "loss": 1.6094,
+      "step": 410
+    },
+    {
+      "epoch": 0.41431451612903225,
+      "grad_norm": 0.09224136173725128,
+      "learning_rate": 0.00018329332262494887,
+      "loss": 1.616,
+      "step": 411
+    },
+    {
+      "epoch": 0.4153225806451613,
+      "grad_norm": 0.09629214555025101,
+      "learning_rate": 0.00018320332269207667,
+      "loss": 1.6197,
+      "step": 412
+    },
+    {
+      "epoch": 0.4163306451612903,
+      "grad_norm": 0.0956406518816948,
+      "learning_rate": 0.00018311310321226853,
+      "loss": 1.6939,
+      "step": 413
+    },
+    {
+      "epoch": 0.4173387096774194,
+      "grad_norm": 0.11505012959241867,
+      "learning_rate": 0.00018302266442358472,
+      "loss": 1.6692,
+      "step": 414
+    },
+    {
+      "epoch": 0.4183467741935484,
+      "grad_norm": 0.08150719106197357,
+      "learning_rate": 0.0001829320065646643,
+      "loss": 1.6428,
+      "step": 415
+    },
+    {
+      "epoch": 0.41935483870967744,
+      "grad_norm": 0.10705471783876419,
+      "learning_rate": 0.0001828411298747243,
+      "loss": 1.7328,
+      "step": 416
+    },
+    {
+      "epoch": 0.42036290322580644,
+      "grad_norm": 0.10280334204435349,
+      "learning_rate": 0.00018275003459355924,
+      "loss": 1.6245,
+      "step": 417
+    },
+    {
+      "epoch": 0.4213709677419355,
+      "grad_norm": 0.07620084285736084,
+      "learning_rate": 0.00018265872096154043,
+      "loss": 1.6317,
+      "step": 418
+    },
+    {
+      "epoch": 0.4223790322580645,
+      "grad_norm": 0.09292726963758469,
+      "learning_rate": 0.00018256718921961525,
+      "loss": 1.6555,
+      "step": 419
+    },
+    {
+      "epoch": 0.42338709677419356,
+      "grad_norm": 0.07884904742240906,
+      "learning_rate": 0.00018247543960930672,
+      "loss": 1.6325,
+      "step": 420
+    },
+    {
+      "epoch": 0.42439516129032256,
+      "grad_norm": 0.1114020049571991,
+      "learning_rate": 0.00018238347237271266,
+      "loss": 1.6861,
+      "step": 421
+    },
+    {
+      "epoch": 0.4254032258064516,
+      "grad_norm": 0.08363789319992065,
+      "learning_rate": 0.00018229128775250523,
+      "loss": 1.6398,
+      "step": 422
+    },
+    {
+      "epoch": 0.4264112903225806,
+      "grad_norm": 0.10317594558000565,
+      "learning_rate": 0.00018219888599193008,
+      "loss": 1.5966,
+      "step": 423
+    },
+    {
+      "epoch": 0.4274193548387097,
+      "grad_norm": 0.09324808418750763,
+      "learning_rate": 0.00018210626733480593,
+      "loss": 1.6463,
+      "step": 424
+    },
+    {
+      "epoch": 0.4284274193548387,
+      "grad_norm": 0.0866997167468071,
+      "learning_rate": 0.00018201343202552367,
+      "loss": 1.5802,
+      "step": 425
+    },
+    {
+      "epoch": 0.42943548387096775,
+      "grad_norm": 0.09528562426567078,
+      "learning_rate": 0.00018192038030904608,
+      "loss": 1.6768,
+      "step": 426
+    },
+    {
+      "epoch": 0.43044354838709675,
+      "grad_norm": 0.08449150621891022,
+      "learning_rate": 0.00018182711243090678,
+      "loss": 1.6323,
+      "step": 427
+    },
+    {
+      "epoch": 0.4314516129032258,
+      "grad_norm": 0.07713552564382553,
+      "learning_rate": 0.00018173362863720986,
+      "loss": 1.6264,
+      "step": 428
+    },
+    {
+      "epoch": 0.4324596774193548,
+      "grad_norm": 0.08549489825963974,
+      "learning_rate": 0.00018163992917462918,
+      "loss": 1.6628,
+      "step": 429
+    },
+    {
+      "epoch": 0.4334677419354839,
+      "grad_norm": 0.07783807069063187,
+      "learning_rate": 0.00018154601429040757,
+      "loss": 1.6892,
+      "step": 430
+    },
+    {
+      "epoch": 0.4344758064516129,
+      "grad_norm": 0.09653409570455551,
+      "learning_rate": 0.00018145188423235634,
+      "loss": 1.6651,
+      "step": 431
+    },
+    {
+      "epoch": 0.43548387096774194,
+      "grad_norm": 0.08650687336921692,
+      "learning_rate": 0.00018135753924885465,
+      "loss": 1.6113,
+      "step": 432
+    },
+    {
+      "epoch": 0.43649193548387094,
+      "grad_norm": 0.08643219619989395,
+      "learning_rate": 0.00018126297958884866,
+      "loss": 1.6111,
+      "step": 433
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 0.08586744964122772,
+      "learning_rate": 0.00018116820550185107,
+      "loss": 1.643,
+      "step": 434
+    },
+    {
+      "epoch": 0.43850806451612906,
+      "grad_norm": 0.09063699096441269,
+      "learning_rate": 0.00018107321723794036,
+      "loss": 1.6422,
+      "step": 435
+    },
+    {
+      "epoch": 0.43951612903225806,
+      "grad_norm": 0.07849163562059402,
+      "learning_rate": 0.00018097801504776012,
+      "loss": 1.6183,
+      "step": 436
+    },
+    {
+      "epoch": 0.4405241935483871,
+      "grad_norm": 0.07795203477144241,
+      "learning_rate": 0.00018088259918251846,
+      "loss": 1.6267,
+      "step": 437
+    },
+    {
+      "epoch": 0.4415322580645161,
+      "grad_norm": 0.08508776873350143,
+      "learning_rate": 0.00018078696989398734,
+      "loss": 1.6581,
+      "step": 438
+    },
+    {
+      "epoch": 0.4425403225806452,
+      "grad_norm": 0.08001305162906647,
+      "learning_rate": 0.00018069112743450183,
+      "loss": 1.6287,
+      "step": 439
+    },
+    {
+      "epoch": 0.4435483870967742,
+      "grad_norm": 0.07482777535915375,
+      "learning_rate": 0.0001805950720569595,
+      "loss": 1.6426,
+      "step": 440
+    },
+    {
+      "epoch": 0.44455645161290325,
+      "grad_norm": 0.07578035444021225,
+      "learning_rate": 0.00018049880401481972,
+      "loss": 1.6294,
+      "step": 441
+    },
+    {
+      "epoch": 0.44556451612903225,
+      "grad_norm": 0.07782859355211258,
+      "learning_rate": 0.00018040232356210308,
+      "loss": 1.5935,
+      "step": 442
+    },
+    {
+      "epoch": 0.4465725806451613,
+      "grad_norm": 0.07492804527282715,
+      "learning_rate": 0.00018030563095339062,
+      "loss": 1.5769,
+      "step": 443
+    },
+    {
+      "epoch": 0.4475806451612903,
+      "grad_norm": 0.07825621962547302,
+      "learning_rate": 0.00018020872644382313,
+      "loss": 1.5786,
+      "step": 444
+    },
+    {
+      "epoch": 0.4485887096774194,
+      "grad_norm": 0.09208081662654877,
+      "learning_rate": 0.0001801116102891006,
+      "loss": 1.6649,
+      "step": 445
+    },
+    {
+      "epoch": 0.4495967741935484,
+      "grad_norm": 0.07900070399045944,
+      "learning_rate": 0.00018001428274548156,
+      "loss": 1.6529,
+      "step": 446
+    },
+    {
+      "epoch": 0.45060483870967744,
+      "grad_norm": 0.07847368717193604,
+      "learning_rate": 0.00017991674406978215,
+      "loss": 1.6133,
+      "step": 447
+    },
+    {
+      "epoch": 0.45161290322580644,
+      "grad_norm": 0.0754162147641182,
+      "learning_rate": 0.00017981899451937573,
+      "loss": 1.6478,
+      "step": 448
+    },
+    {
+      "epoch": 0.4526209677419355,
+      "grad_norm": 0.08314093947410583,
+      "learning_rate": 0.0001797210343521921,
+      "loss": 1.5926,
+      "step": 449
+    },
+    {
+      "epoch": 0.4536290322580645,
+      "grad_norm": 0.07506029307842255,
+      "learning_rate": 0.00017962286382671678,
+      "loss": 1.6031,
+      "step": 450
+    },
+    {
+      "epoch": 0.45463709677419356,
+      "grad_norm": 0.09021966904401779,
+      "learning_rate": 0.00017952448320199035,
+      "loss": 1.5805,
+      "step": 451
+    },
+    {
+      "epoch": 0.45564516129032256,
+      "grad_norm": 0.07435688376426697,
+      "learning_rate": 0.00017942589273760783,
+      "loss": 1.6291,
+      "step": 452
+    },
+    {
+      "epoch": 0.4566532258064516,
+      "grad_norm": 0.07785916328430176,
+      "learning_rate": 0.00017932709269371784,
+      "loss": 1.6525,
+      "step": 453
+    },
+    {
+      "epoch": 0.4576612903225806,
+      "grad_norm": 0.07916136831045151,
+      "learning_rate": 0.00017922808333102207,
+      "loss": 1.6301,
+      "step": 454
+    },
+    {
+      "epoch": 0.4586693548387097,
+      "grad_norm": 0.08399738371372223,
+      "learning_rate": 0.00017912886491077462,
+      "loss": 1.6915,
+      "step": 455
+    },
+    {
+      "epoch": 0.4596774193548387,
+      "grad_norm": 0.08618689328432083,
+      "learning_rate": 0.000179029437694781,
+      "loss": 1.6718,
+      "step": 456
+    },
+    {
+      "epoch": 0.46068548387096775,
+      "grad_norm": 0.07570008933544159,
+      "learning_rate": 0.00017892980194539798,
+      "loss": 1.6588,
+      "step": 457
+    },
+    {
+      "epoch": 0.46169354838709675,
+      "grad_norm": 0.09821120649576187,
+      "learning_rate": 0.00017882995792553228,
+      "loss": 1.6914,
+      "step": 458
+    },
+    {
+      "epoch": 0.4627016129032258,
+      "grad_norm": 0.07994726300239563,
+      "learning_rate": 0.00017872990589864034,
+      "loss": 1.6077,
+      "step": 459
+    },
+    {
+      "epoch": 0.4637096774193548,
+      "grad_norm": 0.08893134444952011,
+      "learning_rate": 0.00017862964612872748,
+      "loss": 1.6447,
+      "step": 460
+    },
+    {
+      "epoch": 0.4647177419354839,
+      "grad_norm": 0.08347106724977493,
+      "learning_rate": 0.00017852917888034706,
+      "loss": 1.6501,
+      "step": 461
+    },
+    {
+      "epoch": 0.4657258064516129,
+      "grad_norm": 0.07879969477653503,
+      "learning_rate": 0.00017842850441860005,
+      "loss": 1.643,
+      "step": 462
+    },
+    {
+      "epoch": 0.46673387096774194,
+      "grad_norm": 0.08305401355028152,
+      "learning_rate": 0.00017832762300913413,
+      "loss": 1.677,
+      "step": 463
+    },
+    {
+      "epoch": 0.46774193548387094,
+      "grad_norm": 0.0827251598238945,
+      "learning_rate": 0.00017822653491814304,
+      "loss": 1.6432,
+      "step": 464
+    },
+    {
+      "epoch": 0.46875,
+      "grad_norm": 0.08472172170877457,
+      "learning_rate": 0.00017812524041236586,
+      "loss": 1.654,
+      "step": 465
+    },
+    {
+      "epoch": 0.46975806451612906,
+      "grad_norm": 0.07689754664897919,
+      "learning_rate": 0.0001780237397590864,
+      "loss": 1.5642,
+      "step": 466
+    },
+    {
+      "epoch": 0.47076612903225806,
+      "grad_norm": 0.10658534616231918,
+      "learning_rate": 0.00017792203322613236,
+      "loss": 1.6561,
+      "step": 467
+    },
+    {
+      "epoch": 0.4717741935483871,
+      "grad_norm": 0.08347711712121964,
+      "learning_rate": 0.0001778201210818748,
+      "loss": 1.6595,
+      "step": 468
+    },
+    {
+      "epoch": 0.4727822580645161,
+      "grad_norm": 0.08595866709947586,
+      "learning_rate": 0.0001777180035952272,
+      "loss": 1.6185,
+      "step": 469
+    },
+    {
+      "epoch": 0.4737903225806452,
+      "grad_norm": 0.08824612945318222,
+      "learning_rate": 0.00017761568103564487,
+      "loss": 1.6779,
+      "step": 470
+    },
+    {
+      "epoch": 0.4747983870967742,
+      "grad_norm": 0.07452390342950821,
+      "learning_rate": 0.0001775131536731244,
+      "loss": 1.6252,
+      "step": 471
+    },
+    {
+      "epoch": 0.47580645161290325,
+      "grad_norm": 0.09783647954463959,
+      "learning_rate": 0.00017741042177820258,
+      "loss": 1.6417,
+      "step": 472
+    },
+    {
+      "epoch": 0.47681451612903225,
+      "grad_norm": 0.07527977973222733,
+      "learning_rate": 0.0001773074856219561,
+      "loss": 1.6128,
+      "step": 473
+    },
+    {
+      "epoch": 0.4778225806451613,
+      "grad_norm": 0.07836946099996567,
+      "learning_rate": 0.00017720434547600043,
+      "loss": 1.625,
+      "step": 474
+    },
+    {
+      "epoch": 0.4788306451612903,
+      "grad_norm": 0.07427874952554703,
+      "learning_rate": 0.00017710100161248945,
+      "loss": 1.6261,
+      "step": 475
+    },
+    {
+      "epoch": 0.4798387096774194,
+      "grad_norm": 0.09168553352355957,
+      "learning_rate": 0.0001769974543041145,
+      "loss": 1.702,
+      "step": 476
+    },
+    {
+      "epoch": 0.4808467741935484,
+      "grad_norm": 0.0791415199637413,
+      "learning_rate": 0.00017689370382410386,
+      "loss": 1.6129,
+      "step": 477
+    },
+    {
+      "epoch": 0.48185483870967744,
+      "grad_norm": 0.07638856768608093,
+      "learning_rate": 0.00017678975044622174,
+      "loss": 1.593,
+      "step": 478
+    },
+    {
+      "epoch": 0.48286290322580644,
+      "grad_norm": 0.08905162662267685,
+      "learning_rate": 0.00017668559444476793,
+      "loss": 1.6803,
+      "step": 479
+    },
+    {
+      "epoch": 0.4838709677419355,
+      "grad_norm": 0.08039755374193192,
+      "learning_rate": 0.00017658123609457668,
+      "loss": 1.6624,
+      "step": 480
+    },
+    {
+      "epoch": 0.4848790322580645,
+      "grad_norm": 0.07831753045320511,
+      "learning_rate": 0.00017647667567101632,
+      "loss": 1.6602,
+      "step": 481
+    },
+    {
+      "epoch": 0.48588709677419356,
+      "grad_norm": 0.07645969092845917,
+      "learning_rate": 0.00017637191344998837,
+      "loss": 1.6462,
+      "step": 482
+    },
+    {
+      "epoch": 0.48689516129032256,
+      "grad_norm": 0.0790887251496315,
+      "learning_rate": 0.00017626694970792673,
+      "loss": 1.581,
+      "step": 483
+    },
+    {
+      "epoch": 0.4879032258064516,
+      "grad_norm": 0.07644886523485184,
+      "learning_rate": 0.00017616178472179715,
+      "loss": 1.6035,
+      "step": 484
+    },
+    {
+      "epoch": 0.4889112903225806,
+      "grad_norm": 0.08160758763551712,
+      "learning_rate": 0.0001760564187690964,
+      "loss": 1.6169,
+      "step": 485
+    },
+    {
+      "epoch": 0.4899193548387097,
+      "grad_norm": 0.09234445542097092,
+      "learning_rate": 0.00017595085212785146,
+      "loss": 1.5878,
+      "step": 486
+    },
+    {
+      "epoch": 0.4909274193548387,
+      "grad_norm": 0.09042947739362717,
+      "learning_rate": 0.0001758450850766189,
+      "loss": 1.6629,
+      "step": 487
+    },
+    {
+      "epoch": 0.49193548387096775,
+      "grad_norm": 0.08583879470825195,
+      "learning_rate": 0.00017573911789448414,
+      "loss": 1.6398,
+      "step": 488
+    },
+    {
+      "epoch": 0.49294354838709675,
+      "grad_norm": 0.07878076285123825,
+      "learning_rate": 0.00017563295086106063,
+      "loss": 1.64,
+      "step": 489
+    },
+    {
+      "epoch": 0.4939516129032258,
+      "grad_norm": 0.08849604427814484,
+      "learning_rate": 0.00017552658425648923,
+      "loss": 1.6015,
+      "step": 490
+    },
+    {
+      "epoch": 0.4949596774193548,
+      "grad_norm": 0.07961837202310562,
+      "learning_rate": 0.00017542001836143731,
+      "loss": 1.6392,
+      "step": 491
+    },
+    {
+      "epoch": 0.4959677419354839,
+      "grad_norm": 0.08883430808782578,
+      "learning_rate": 0.00017531325345709816,
+      "loss": 1.6417,
+      "step": 492
+    },
+    {
+      "epoch": 0.4969758064516129,
+      "grad_norm": 0.07420235127210617,
+      "learning_rate": 0.00017520628982519023,
+      "loss": 1.635,
+      "step": 493
+    },
+    {
+      "epoch": 0.49798387096774194,
+      "grad_norm": 0.08477555215358734,
+      "learning_rate": 0.0001750991277479563,
+      "loss": 1.6264,
+      "step": 494
+    },
+    {
+      "epoch": 0.49899193548387094,
+      "grad_norm": 0.07410185784101486,
+      "learning_rate": 0.00017499176750816276,
+      "loss": 1.6414,
+      "step": 495
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.08427213877439499,
+      "learning_rate": 0.00017488420938909893,
+      "loss": 1.6546,
+      "step": 496
+    },
+    {
+      "epoch": 0.501008064516129,
+      "grad_norm": 0.0739702582359314,
+      "learning_rate": 0.00017477645367457628,
+      "loss": 1.6316,
+      "step": 497
+    },
+    {
+      "epoch": 0.5020161290322581,
+      "grad_norm": 0.08044146001338959,
+      "learning_rate": 0.00017466850064892762,
+      "loss": 1.6256,
+      "step": 498
+    },
+    {
+      "epoch": 0.5030241935483871,
+      "grad_norm": 0.08690078556537628,
+      "learning_rate": 0.0001745603505970064,
+      "loss": 1.589,
+      "step": 499
+    },
+    {
+      "epoch": 0.5040322580645161,
+      "grad_norm": 0.07842793315649033,
+      "learning_rate": 0.00017445200380418607,
+      "loss": 1.6352,
+      "step": 500
+    },
+    {
+      "epoch": 0.5050403225806451,
+      "grad_norm": 0.08214239776134491,
+      "learning_rate": 0.00017434346055635912,
+      "loss": 1.6244,
+      "step": 501
+    },
+    {
+      "epoch": 0.5060483870967742,
+      "grad_norm": 0.07770374417304993,
+      "learning_rate": 0.00017423472113993634,
+      "loss": 1.65,
+      "step": 502
+    },
+    {
+      "epoch": 0.5070564516129032,
+      "grad_norm": 0.08378950506448746,
+      "learning_rate": 0.00017412578584184637,
+      "loss": 1.6129,
+      "step": 503
+    },
+    {
+      "epoch": 0.5080645161290323,
+      "grad_norm": 0.07839113473892212,
+      "learning_rate": 0.00017401665494953453,
+      "loss": 1.6479,
+      "step": 504
+    },
+    {
+      "epoch": 0.5090725806451613,
+      "grad_norm": 0.0775337815284729,
+      "learning_rate": 0.00017390732875096227,
+      "loss": 1.6005,
+      "step": 505
+    },
+    {
+      "epoch": 0.5100806451612904,
+      "grad_norm": 0.08532094955444336,
+      "learning_rate": 0.00017379780753460654,
+      "loss": 1.6669,
+      "step": 506
+    },
+    {
+      "epoch": 0.5110887096774194,
+      "grad_norm": 0.07484716176986694,
+      "learning_rate": 0.00017368809158945872,
+      "loss": 1.6786,
+      "step": 507
+    },
+    {
+      "epoch": 0.5120967741935484,
+      "grad_norm": 0.08861152827739716,
+      "learning_rate": 0.00017357818120502402,
+      "loss": 1.6753,
+      "step": 508
+    },
+    {
+      "epoch": 0.5131048387096774,
+      "grad_norm": 0.08586420863866806,
+      "learning_rate": 0.00017346807667132085,
+      "loss": 1.6483,
+      "step": 509
+    },
+    {
+      "epoch": 0.5141129032258065,
+      "grad_norm": 0.08970779180526733,
+      "learning_rate": 0.00017335777827887978,
+      "loss": 1.6776,
+      "step": 510
+    },
+    {
+      "epoch": 0.5151209677419355,
+      "grad_norm": 0.08755983412265778,
+      "learning_rate": 0.00017324728631874298,
+      "loss": 1.6666,
+      "step": 511
+    },
+    {
+      "epoch": 0.5161290322580645,
+      "grad_norm": 0.08634518831968307,
+      "learning_rate": 0.00017313660108246337,
+      "loss": 1.6195,
+      "step": 512
+    },
+    {
+      "epoch": 0.5171370967741935,
+      "grad_norm": 0.08298657834529877,
+      "learning_rate": 0.00017302572286210382,
+      "loss": 1.5564,
+      "step": 513
+    },
+    {
+      "epoch": 0.5181451612903226,
+      "grad_norm": 0.07834544777870178,
+      "learning_rate": 0.00017291465195023653,
+      "loss": 1.6109,
+      "step": 514
+    },
+    {
+      "epoch": 0.5191532258064516,
+      "grad_norm": 0.09181385487318039,
+      "learning_rate": 0.000172803388639942,
+      "loss": 1.6387,
+      "step": 515
+    },
+    {
+      "epoch": 0.5201612903225806,
+      "grad_norm": 0.07698329538106918,
+      "learning_rate": 0.00017269193322480856,
+      "loss": 1.6223,
+      "step": 516
+    },
+    {
+      "epoch": 0.5211693548387096,
+      "grad_norm": 0.10118810087442398,
+      "learning_rate": 0.00017258028599893136,
+      "loss": 1.6365,
+      "step": 517
+    },
+    {
+      "epoch": 0.5221774193548387,
+      "grad_norm": 0.08565083891153336,
+      "learning_rate": 0.00017246844725691166,
+      "loss": 1.5905,
+      "step": 518
+    },
+    {
+      "epoch": 0.5231854838709677,
+      "grad_norm": 0.08563411980867386,
+      "learning_rate": 0.00017235641729385615,
+      "loss": 1.6141,
+      "step": 519
+    },
+    {
+      "epoch": 0.5241935483870968,
+      "grad_norm": 0.07669138163328171,
+      "learning_rate": 0.00017224419640537598,
+      "loss": 1.6278,
+      "step": 520
+    },
+    {
+      "epoch": 0.5252016129032258,
+      "grad_norm": 0.09773047268390656,
+      "learning_rate": 0.00017213178488758622,
+      "loss": 1.7324,
+      "step": 521
+    },
+    {
+      "epoch": 0.5262096774193549,
+      "grad_norm": 0.07799120247364044,
+      "learning_rate": 0.00017201918303710482,
+      "loss": 1.5967,
+      "step": 522
+    },
+    {
+      "epoch": 0.5272177419354839,
+      "grad_norm": 0.0810832753777504,
+      "learning_rate": 0.0001719063911510521,
+      "loss": 1.6204,
+      "step": 523
+    },
+    {
+      "epoch": 0.5282258064516129,
+      "grad_norm": 0.08055137097835541,
+      "learning_rate": 0.0001717934095270497,
+      "loss": 1.6138,
+      "step": 524
+    },
+    {
+      "epoch": 0.5292338709677419,
+      "grad_norm": 0.08200159668922424,
+      "learning_rate": 0.0001716802384632199,
+      "loss": 1.6211,
+      "step": 525
+    },
+    {
+      "epoch": 0.530241935483871,
+      "grad_norm": 0.0793243944644928,
+      "learning_rate": 0.00017156687825818504,
+      "loss": 1.579,
+      "step": 526
+    },
+    {
+      "epoch": 0.53125,
+      "grad_norm": 0.08332548290491104,
+      "learning_rate": 0.00017145332921106633,
+      "loss": 1.5874,
+      "step": 527
+    },
+    {
+      "epoch": 0.532258064516129,
+      "grad_norm": 0.07582446932792664,
+      "learning_rate": 0.00017133959162148336,
+      "loss": 1.5871,
+      "step": 528
+    },
+    {
+      "epoch": 0.5332661290322581,
+      "grad_norm": 0.0803590714931488,
+      "learning_rate": 0.00017122566578955324,
+      "loss": 1.6451,
+      "step": 529
+    },
+    {
+      "epoch": 0.5342741935483871,
+      "grad_norm": 0.07705288380384445,
+      "learning_rate": 0.00017111155201588978,
+      "loss": 1.5892,
+      "step": 530
+    },
+    {
+      "epoch": 0.5352822580645161,
+      "grad_norm": 0.08003994822502136,
+      "learning_rate": 0.0001709972506016027,
+      "loss": 1.6701,
+      "step": 531
+    },
+    {
+      "epoch": 0.5362903225806451,
+      "grad_norm": 0.07644215226173401,
+      "learning_rate": 0.00017088276184829685,
+      "loss": 1.6271,
+      "step": 532
+    },
+    {
+      "epoch": 0.5372983870967742,
+      "grad_norm": 0.08193427324295044,
+      "learning_rate": 0.00017076808605807138,
+      "loss": 1.5906,
+      "step": 533
+    },
+    {
+      "epoch": 0.5383064516129032,
+      "grad_norm": 0.08339913934469223,
+      "learning_rate": 0.00017065322353351903,
+      "loss": 1.6452,
+      "step": 534
+    },
+    {
+      "epoch": 0.5393145161290323,
+      "grad_norm": 0.08375068008899689,
+      "learning_rate": 0.0001705381745777252,
+      "loss": 1.6573,
+      "step": 535
+    },
+    {
+      "epoch": 0.5403225806451613,
+      "grad_norm": 0.07980147749185562,
+      "learning_rate": 0.00017042293949426726,
+      "loss": 1.5999,
+      "step": 536
+    },
+    {
+      "epoch": 0.5413306451612904,
+      "grad_norm": 0.07945246994495392,
+      "learning_rate": 0.00017030751858721375,
+      "loss": 1.6372,
+      "step": 537
+    },
+    {
+      "epoch": 0.5423387096774194,
+      "grad_norm": 0.07931476086378098,
+      "learning_rate": 0.00017019191216112342,
+      "loss": 1.6244,
+      "step": 538
+    },
+    {
+      "epoch": 0.5433467741935484,
+      "grad_norm": 0.07984746247529984,
+      "learning_rate": 0.00017007612052104474,
+      "loss": 1.5592,
+      "step": 539
+    },
+    {
+      "epoch": 0.5443548387096774,
+      "grad_norm": 0.09376467764377594,
+      "learning_rate": 0.00016996014397251466,
+      "loss": 1.6774,
+      "step": 540
+    },
+    {
+      "epoch": 0.5453629032258065,
+      "grad_norm": 0.08642607182264328,
+      "learning_rate": 0.00016984398282155825,
+      "loss": 1.6101,
+      "step": 541
+    },
+    {
+      "epoch": 0.5463709677419355,
+      "grad_norm": 0.07891902327537537,
+      "learning_rate": 0.00016972763737468758,
+      "loss": 1.6109,
+      "step": 542
+    },
+    {
+      "epoch": 0.5473790322580645,
+      "grad_norm": 0.07893992215394974,
+      "learning_rate": 0.00016961110793890108,
+      "loss": 1.643,
+      "step": 543
+    },
+    {
+      "epoch": 0.5483870967741935,
+      "grad_norm": 0.08107249438762665,
+      "learning_rate": 0.00016949439482168255,
+      "loss": 1.6093,
+      "step": 544
+    },
+    {
+      "epoch": 0.5493951612903226,
+      "grad_norm": 0.08450604975223541,
+      "learning_rate": 0.00016937749833100064,
+      "loss": 1.6406,
+      "step": 545
+    },
+    {
+      "epoch": 0.5504032258064516,
+      "grad_norm": 0.08088622242212296,
+      "learning_rate": 0.0001692604187753077,
+      "loss": 1.6293,
+      "step": 546
+    },
+    {
+      "epoch": 0.5514112903225806,
+      "grad_norm": 0.09227669984102249,
+      "learning_rate": 0.0001691431564635392,
+      "loss": 1.6022,
+      "step": 547
+    },
+    {
+      "epoch": 0.5524193548387096,
+      "grad_norm": 0.08562039583921432,
+      "learning_rate": 0.00016902571170511292,
+      "loss": 1.6341,
+      "step": 548
+    },
+    {
+      "epoch": 0.5534274193548387,
+      "grad_norm": 0.09240545332431793,
+      "learning_rate": 0.0001689080848099279,
+      "loss": 1.643,
+      "step": 549
+    },
+    {
+      "epoch": 0.5544354838709677,
+      "grad_norm": 0.09082893282175064,
+      "learning_rate": 0.00016879027608836394,
+      "loss": 1.6132,
+      "step": 550
+    },
+    {
+      "epoch": 0.5554435483870968,
+      "grad_norm": 0.08730785548686981,
+      "learning_rate": 0.00016867228585128047,
+      "loss": 1.631,
+      "step": 551
+    },
+    {
+      "epoch": 0.5564516129032258,
+      "grad_norm": 0.08937687426805496,
+      "learning_rate": 0.000168554114410016,
+      "loss": 1.7034,
+      "step": 552
+    },
+    {
+      "epoch": 0.5574596774193549,
+      "grad_norm": 0.07652641087770462,
+      "learning_rate": 0.0001684357620763872,
+      "loss": 1.6019,
+      "step": 553
+    },
+    {
+      "epoch": 0.5584677419354839,
+      "grad_norm": 0.08145558089017868,
+      "learning_rate": 0.00016831722916268787,
+      "loss": 1.6705,
+      "step": 554
+    },
+    {
+      "epoch": 0.5594758064516129,
+      "grad_norm": 0.09578656405210495,
+      "learning_rate": 0.0001681985159816885,
+      "loss": 1.6889,
+      "step": 555
+    },
+    {
+      "epoch": 0.5604838709677419,
+      "grad_norm": 0.085781030356884,
+      "learning_rate": 0.00016807962284663518,
+      "loss": 1.6362,
+      "step": 556
+    },
+    {
+      "epoch": 0.561491935483871,
+      "grad_norm": 0.07998887449502945,
+      "learning_rate": 0.0001679605500712488,
+      "loss": 1.6045,
+      "step": 557
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 0.09279566258192062,
+      "learning_rate": 0.00016784129796972431,
+      "loss": 1.5786,
+      "step": 558
+    },
+    {
+      "epoch": 0.563508064516129,
+      "grad_norm": 0.08150017261505127,
+      "learning_rate": 0.0001677218668567299,
+      "loss": 1.6313,
+      "step": 559
+    },
+    {
+      "epoch": 0.5645161290322581,
+      "grad_norm": 0.08562348783016205,
+      "learning_rate": 0.00016760225704740594,
+      "loss": 1.6047,
+      "step": 560
+    },
+    {
+      "epoch": 0.5655241935483871,
+      "grad_norm": 0.09371492266654968,
+      "learning_rate": 0.00016748246885736452,
+      "loss": 1.6599,
+      "step": 561
+    },
+    {
+      "epoch": 0.5665322580645161,
+      "grad_norm": 0.08150923997163773,
+      "learning_rate": 0.00016736250260268828,
+      "loss": 1.6556,
+      "step": 562
+    },
+    {
+      "epoch": 0.5675403225806451,
+      "grad_norm": 0.08109602332115173,
+      "learning_rate": 0.0001672423585999298,
+      "loss": 1.6143,
+      "step": 563
+    },
+    {
+      "epoch": 0.5685483870967742,
+      "grad_norm": 0.07796693593263626,
+      "learning_rate": 0.0001671220371661106,
+      "loss": 1.6046,
+      "step": 564
+    },
+    {
+      "epoch": 0.5695564516129032,
+      "grad_norm": 0.08694635331630707,
+      "learning_rate": 0.0001670015386187205,
+      "loss": 1.6564,
+      "step": 565
+    },
+    {
+      "epoch": 0.5705645161290323,
+      "grad_norm": 0.08142531663179398,
+      "learning_rate": 0.00016688086327571648,
+      "loss": 1.6406,
+      "step": 566
+    },
+    {
+      "epoch": 0.5715725806451613,
+      "grad_norm": 0.07907096296548843,
+      "learning_rate": 0.00016676001145552228,
+      "loss": 1.5948,
+      "step": 567
+    },
+    {
+      "epoch": 0.5725806451612904,
+      "grad_norm": 0.08147318661212921,
+      "learning_rate": 0.0001666389834770271,
+      "loss": 1.5789,
+      "step": 568
+    },
+    {
+      "epoch": 0.5735887096774194,
+      "grad_norm": 0.08041603118181229,
+      "learning_rate": 0.00016651777965958503,
+      "loss": 1.6229,
+      "step": 569
+    },
+    {
+      "epoch": 0.5745967741935484,
+      "grad_norm": 0.07601971924304962,
+      "learning_rate": 0.00016639640032301413,
+      "loss": 1.5722,
+      "step": 570
+    },
+    {
+      "epoch": 0.5756048387096774,
+      "grad_norm": 0.08111369609832764,
+      "learning_rate": 0.0001662748457875957,
+      "loss": 1.6485,
+      "step": 571
+    },
+    {
+      "epoch": 0.5766129032258065,
+      "grad_norm": 0.07956349104642868,
+      "learning_rate": 0.00016615311637407316,
+      "loss": 1.6118,
+      "step": 572
+    },
+    {
+      "epoch": 0.5776209677419355,
+      "grad_norm": 0.08260063081979752,
+      "learning_rate": 0.00016603121240365152,
+      "loss": 1.6618,
+      "step": 573
+    },
+    {
+      "epoch": 0.5786290322580645,
+      "grad_norm": 0.077680803835392,
+      "learning_rate": 0.00016590913419799633,
+      "loss": 1.6316,
+      "step": 574
+    },
+    {
+      "epoch": 0.5796370967741935,
+      "grad_norm": 0.08391865342855453,
+      "learning_rate": 0.00016578688207923289,
+      "loss": 1.6273,
+      "step": 575
+    },
+    {
+      "epoch": 0.5806451612903226,
+      "grad_norm": 0.08210872858762741,
+      "learning_rate": 0.0001656644563699454,
+      "loss": 1.6222,
+      "step": 576
+    },
+    {
+      "epoch": 0.5816532258064516,
+      "grad_norm": 0.07796725630760193,
+      "learning_rate": 0.00016554185739317616,
+      "loss": 1.5981,
+      "step": 577
+    },
+    {
+      "epoch": 0.5826612903225806,
+      "grad_norm": 0.0765356495976448,
+      "learning_rate": 0.00016541908547242459,
+      "loss": 1.6164,
+      "step": 578
+    },
+    {
+      "epoch": 0.5836693548387096,
+      "grad_norm": 0.090540811419487,
+      "learning_rate": 0.00016529614093164648,
+      "loss": 1.6994,
+      "step": 579
+    },
+    {
+      "epoch": 0.5846774193548387,
+      "grad_norm": 0.08444759249687195,
+      "learning_rate": 0.00016517302409525315,
+      "loss": 1.6154,
+      "step": 580
+    },
+    {
+      "epoch": 0.5856854838709677,
+      "grad_norm": 0.0766877606511116,
+      "learning_rate": 0.0001650497352881105,
+      "loss": 1.6046,
+      "step": 581
+    },
+    {
+      "epoch": 0.5866935483870968,
+      "grad_norm": 0.0797574445605278,
+      "learning_rate": 0.00016492627483553822,
+      "loss": 1.6298,
+      "step": 582
+    },
+    {
+      "epoch": 0.5877016129032258,
+      "grad_norm": 0.07783927023410797,
+      "learning_rate": 0.00016480264306330898,
+      "loss": 1.5702,
+      "step": 583
+    },
+    {
+      "epoch": 0.5887096774193549,
+      "grad_norm": 0.08371485024690628,
+      "learning_rate": 0.0001646788402976474,
+      "loss": 1.6215,
+      "step": 584
+    },
+    {
+      "epoch": 0.5897177419354839,
+      "grad_norm": 0.08839402347803116,
+      "learning_rate": 0.0001645548668652294,
+      "loss": 1.5996,
+      "step": 585
+    },
+    {
+      "epoch": 0.5907258064516129,
+      "grad_norm": 0.07832740247249603,
+      "learning_rate": 0.0001644307230931811,
+      "loss": 1.6281,
+      "step": 586
+    },
+    {
+      "epoch": 0.5917338709677419,
+      "grad_norm": 0.07553452998399734,
+      "learning_rate": 0.00016430640930907827,
+      "loss": 1.6147,
+      "step": 587
+    },
+    {
+      "epoch": 0.592741935483871,
+      "grad_norm": 0.07809963822364807,
+      "learning_rate": 0.00016418192584094515,
+      "loss": 1.5993,
+      "step": 588
+    },
+    {
+      "epoch": 0.59375,
+      "grad_norm": 0.07688596844673157,
+      "learning_rate": 0.00016405727301725377,
+      "loss": 1.6019,
+      "step": 589
+    },
+    {
+      "epoch": 0.594758064516129,
+      "grad_norm": 0.07611083984375,
+      "learning_rate": 0.00016393245116692304,
+      "loss": 1.5689,
+      "step": 590
+    },
+    {
+      "epoch": 0.5957661290322581,
+      "grad_norm": 0.08132312446832657,
+      "learning_rate": 0.00016380746061931786,
+      "loss": 1.6307,
+      "step": 591
+    },
+    {
+      "epoch": 0.5967741935483871,
+      "grad_norm": 0.07959824800491333,
+      "learning_rate": 0.00016368230170424826,
+      "loss": 1.5851,
+      "step": 592
+    },
+    {
+      "epoch": 0.5977822580645161,
+      "grad_norm": 0.08210327476263046,
+      "learning_rate": 0.0001635569747519686,
+      "loss": 1.6139,
+      "step": 593
+    },
+    {
+      "epoch": 0.5987903225806451,
+      "grad_norm": 0.1014091745018959,
+      "learning_rate": 0.00016343148009317657,
+      "loss": 1.564,
+      "step": 594
+    },
+    {
+      "epoch": 0.5997983870967742,
+      "grad_norm": 0.08163224905729294,
+      "learning_rate": 0.00016330581805901239,
+      "loss": 1.5896,
+      "step": 595
+    },
+    {
+      "epoch": 0.6008064516129032,
+      "grad_norm": 0.08205213397741318,
+      "learning_rate": 0.00016317998898105797,
+      "loss": 1.6271,
+      "step": 596
+    },
+    {
+      "epoch": 0.6018145161290323,
+      "grad_norm": 0.07970026135444641,
+      "learning_rate": 0.00016305399319133595,
+      "loss": 1.6024,
+      "step": 597
+    },
+    {
+      "epoch": 0.6028225806451613,
+      "grad_norm": 0.07718155533075333,
+      "learning_rate": 0.00016292783102230888,
+      "loss": 1.5951,
+      "step": 598
+    },
+    {
+      "epoch": 0.6038306451612904,
+      "grad_norm": 0.09728401899337769,
+      "learning_rate": 0.00016280150280687834,
+      "loss": 1.6838,
+      "step": 599
+    },
+    {
+      "epoch": 0.6048387096774194,
+      "grad_norm": 0.08184093236923218,
+      "learning_rate": 0.00016267500887838412,
+      "loss": 1.5902,
+      "step": 600
+    },
+    {
+      "epoch": 0.6058467741935484,
+      "grad_norm": 0.08744041621685028,
+      "learning_rate": 0.00016254834957060309,
+      "loss": 1.6292,
+      "step": 601
+    },
+    {
+      "epoch": 0.6068548387096774,
+      "grad_norm": 0.09200835227966309,
+      "learning_rate": 0.00016242152521774874,
+      "loss": 1.6393,
+      "step": 602
+    },
+    {
+      "epoch": 0.6078629032258065,
+      "grad_norm": 0.08810313045978546,
+      "learning_rate": 0.0001622945361544699,
+      "loss": 1.6201,
+      "step": 603
+    },
+    {
+      "epoch": 0.6088709677419355,
+      "grad_norm": 0.09700248390436172,
+      "learning_rate": 0.00016216738271584999,
+      "loss": 1.5638,
+      "step": 604
+    },
+    {
+      "epoch": 0.6098790322580645,
+      "grad_norm": 0.08686663955450058,
+      "learning_rate": 0.00016204006523740634,
+      "loss": 1.5734,
+      "step": 605
+    },
+    {
+      "epoch": 0.6108870967741935,
+      "grad_norm": 0.07873237133026123,
+      "learning_rate": 0.00016191258405508896,
+      "loss": 1.5469,
+      "step": 606
+    },
+    {
+      "epoch": 0.6118951612903226,
+      "grad_norm": 0.08019126206636429,
+      "learning_rate": 0.0001617849395052799,
+      "loss": 1.6431,
+      "step": 607
+    },
+    {
+      "epoch": 0.6129032258064516,
+      "grad_norm": 0.08971964567899704,
+      "learning_rate": 0.00016165713192479227,
+      "loss": 1.6535,
+      "step": 608
+    },
+    {
+      "epoch": 0.6139112903225806,
+      "grad_norm": 0.07752855867147446,
+      "learning_rate": 0.00016152916165086936,
+      "loss": 1.5829,
+      "step": 609
+    },
+    {
+      "epoch": 0.6149193548387096,
+      "grad_norm": 0.08348417282104492,
+      "learning_rate": 0.00016140102902118377,
+      "loss": 1.6305,
+      "step": 610
+    },
+    {
+      "epoch": 0.6159274193548387,
+      "grad_norm": 0.0761261060833931,
+      "learning_rate": 0.0001612727343738365,
+      "loss": 1.5835,
+      "step": 611
+    },
+    {
+      "epoch": 0.6169354838709677,
+      "grad_norm": 0.11013983935117722,
+      "learning_rate": 0.00016114427804735603,
+      "loss": 1.6364,
+      "step": 612
+    },
+    {
+      "epoch": 0.6179435483870968,
+      "grad_norm": 0.086505226790905,
+      "learning_rate": 0.00016101566038069756,
+      "loss": 1.61,
+      "step": 613
+    },
+    {
+      "epoch": 0.6189516129032258,
+      "grad_norm": 0.08692600578069687,
+      "learning_rate": 0.00016088688171324184,
+      "loss": 1.6153,
+      "step": 614
+    },
+    {
+      "epoch": 0.6199596774193549,
+      "grad_norm": 0.09537503123283386,
+      "learning_rate": 0.0001607579423847946,
+      "loss": 1.6053,
+      "step": 615
+    },
+    {
+      "epoch": 0.6209677419354839,
+      "grad_norm": 0.08204115927219391,
+      "learning_rate": 0.00016062884273558545,
+      "loss": 1.5939,
+      "step": 616
+    },
+    {
+      "epoch": 0.6219758064516129,
+      "grad_norm": 0.08595214784145355,
+      "learning_rate": 0.00016049958310626708,
+      "loss": 1.6162,
+      "step": 617
+    },
+    {
+      "epoch": 0.6229838709677419,
+      "grad_norm": 0.08318503201007843,
+      "learning_rate": 0.00016037016383791425,
+      "loss": 1.6401,
+      "step": 618
+    },
+    {
+      "epoch": 0.623991935483871,
+      "grad_norm": 0.08207780867815018,
+      "learning_rate": 0.00016024058527202298,
+      "loss": 1.6226,
+      "step": 619
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.08268122375011444,
+      "learning_rate": 0.00016011084775050959,
+      "loss": 1.6522,
+      "step": 620
+    },
+    {
+      "epoch": 0.626008064516129,
+      "grad_norm": 0.07751034945249557,
+      "learning_rate": 0.00015998095161570995,
+      "loss": 1.5455,
+      "step": 621
+    },
+    {
+      "epoch": 0.6270161290322581,
+      "grad_norm": 0.08539839088916779,
+      "learning_rate": 0.00015985089721037832,
+      "loss": 1.6116,
+      "step": 622
+    },
+    {
+      "epoch": 0.6280241935483871,
+      "grad_norm": 0.08065900206565857,
+      "learning_rate": 0.00015972068487768665,
+      "loss": 1.6102,
+      "step": 623
+    },
+    {
+      "epoch": 0.6290322580645161,
+      "grad_norm": 0.07968778163194656,
+      "learning_rate": 0.00015959031496122364,
+      "loss": 1.6065,
+      "step": 624
+    },
+    {
+      "epoch": 0.6300403225806451,
+      "grad_norm": 0.08040513843297958,
+      "learning_rate": 0.00015945978780499375,
+      "loss": 1.5974,
+      "step": 625
+    },
+    {
+      "epoch": 0.6310483870967742,
+      "grad_norm": 0.0841718390583992,
+      "learning_rate": 0.00015932910375341639,
+      "loss": 1.5943,
+      "step": 626
+    },
+    {
+      "epoch": 0.6320564516129032,
+      "grad_norm": 0.07834211736917496,
+      "learning_rate": 0.0001591982631513249,
+      "loss": 1.5856,
+      "step": 627
+    },
+    {
+      "epoch": 0.6330645161290323,
+      "grad_norm": 0.08371677994728088,
+      "learning_rate": 0.00015906726634396575,
+      "loss": 1.5972,
+      "step": 628
+    },
+    {
+      "epoch": 0.6340725806451613,
+      "grad_norm": 0.09251397848129272,
+      "learning_rate": 0.00015893611367699762,
+      "loss": 1.6529,
+      "step": 629
+    },
+    {
+      "epoch": 0.6350806451612904,
+      "grad_norm": 0.080534428358078,
+      "learning_rate": 0.00015880480549649038,
+      "loss": 1.5786,
+      "step": 630
+    },
+    {
+      "epoch": 0.6360887096774194,
+      "grad_norm": 0.09134898334741592,
+      "learning_rate": 0.00015867334214892436,
+      "loss": 1.6303,
+      "step": 631
+    },
+    {
+      "epoch": 0.6370967741935484,
+      "grad_norm": 0.08673352748155594,
+      "learning_rate": 0.00015854172398118913,
+      "loss": 1.6281,
+      "step": 632
+    },
+    {
+      "epoch": 0.6381048387096774,
+      "grad_norm": 0.11661474406719208,
+      "learning_rate": 0.000158409951340583,
+      "loss": 1.6826,
+      "step": 633
+    },
+    {
+      "epoch": 0.6391129032258065,
+      "grad_norm": 0.08508265018463135,
+      "learning_rate": 0.0001582780245748118,
+      "loss": 1.5785,
+      "step": 634
+    },
+    {
+      "epoch": 0.6401209677419355,
+      "grad_norm": 0.09865213930606842,
+      "learning_rate": 0.00015814594403198794,
+      "loss": 1.619,
+      "step": 635
+    },
+    {
+      "epoch": 0.6411290322580645,
+      "grad_norm": 0.08882018178701401,
+      "learning_rate": 0.00015801371006062982,
+      "loss": 1.6076,
+      "step": 636
+    },
+    {
+      "epoch": 0.6421370967741935,
+      "grad_norm": 0.10395356267690659,
+      "learning_rate": 0.00015788132300966046,
+      "loss": 1.6193,
+      "step": 637
+    },
+    {
+      "epoch": 0.6431451612903226,
+      "grad_norm": 0.08556309342384338,
+      "learning_rate": 0.00015774878322840694,
+      "loss": 1.6313,
+      "step": 638
+    },
+    {
+      "epoch": 0.6441532258064516,
+      "grad_norm": 0.08463555574417114,
+      "learning_rate": 0.00015761609106659935,
+      "loss": 1.5852,
+      "step": 639
+    },
+    {
+      "epoch": 0.6451612903225806,
+      "grad_norm": 0.08253596723079681,
+      "learning_rate": 0.0001574832468743698,
+      "loss": 1.65,
+      "step": 640
+    },
+    {
+      "epoch": 0.6461693548387096,
+      "grad_norm": 0.09345366060733795,
+      "learning_rate": 0.0001573502510022516,
+      "loss": 1.5869,
+      "step": 641
+    },
+    {
+      "epoch": 0.6471774193548387,
+      "grad_norm": 0.08240879327058792,
+      "learning_rate": 0.00015721710380117826,
+      "loss": 1.6057,
+      "step": 642
+    },
+    {
+      "epoch": 0.6481854838709677,
+      "grad_norm": 0.08767805248498917,
+      "learning_rate": 0.0001570838056224827,
+      "loss": 1.5864,
+      "step": 643
+    },
+    {
+      "epoch": 0.6491935483870968,
+      "grad_norm": 0.08595956861972809,
+      "learning_rate": 0.0001569503568178961,
+      "loss": 1.593,
+      "step": 644
+    },
+    {
+      "epoch": 0.6502016129032258,
+      "grad_norm": 0.0859324112534523,
+      "learning_rate": 0.0001568167577395471,
+      "loss": 1.6248,
+      "step": 645
+    },
+    {
+      "epoch": 0.6512096774193549,
+      "grad_norm": 0.07949813455343246,
+      "learning_rate": 0.00015668300873996095,
+      "loss": 1.6269,
+      "step": 646
+    },
+    {
+      "epoch": 0.6522177419354839,
+      "grad_norm": 0.08270735293626785,
+      "learning_rate": 0.00015654911017205846,
+      "loss": 1.6161,
+      "step": 647
+    },
+    {
+      "epoch": 0.6532258064516129,
+      "grad_norm": 0.08057011663913727,
+      "learning_rate": 0.000156415062389155,
+      "loss": 1.615,
+      "step": 648
+    },
+    {
+      "epoch": 0.6542338709677419,
+      "grad_norm": 0.07924232631921768,
+      "learning_rate": 0.00015628086574495992,
+      "loss": 1.5898,
+      "step": 649
+    },
+    {
+      "epoch": 0.655241935483871,
+      "grad_norm": 0.08501306176185608,
+      "learning_rate": 0.00015614652059357508,
+      "loss": 1.6709,
+      "step": 650
+    },
+    {
+      "epoch": 0.65625,
+      "grad_norm": 0.08682959526777267,
+      "learning_rate": 0.00015601202728949436,
+      "loss": 1.6214,
+      "step": 651
+    },
+    {
+      "epoch": 0.657258064516129,
+      "grad_norm": 0.08149803429841995,
+      "learning_rate": 0.00015587738618760258,
+      "loss": 1.6337,
+      "step": 652
+    },
+    {
+      "epoch": 0.6582661290322581,
+      "grad_norm": 0.09022454917430878,
+      "learning_rate": 0.00015574259764317448,
+      "loss": 1.5809,
+      "step": 653
+    },
+    {
+      "epoch": 0.6592741935483871,
+      "grad_norm": 0.08189895004034042,
+      "learning_rate": 0.00015560766201187386,
+      "loss": 1.6188,
+      "step": 654
+    },
+    {
+      "epoch": 0.6602822580645161,
+      "grad_norm": 0.080174021422863,
+      "learning_rate": 0.00015547257964975273,
+      "loss": 1.5991,
+      "step": 655
+    },
+    {
+      "epoch": 0.6612903225806451,
+      "grad_norm": 0.08346089720726013,
+      "learning_rate": 0.0001553373509132501,
+      "loss": 1.5734,
+      "step": 656
+    },
+    {
+      "epoch": 0.6622983870967742,
+      "grad_norm": 0.07657915353775024,
+      "learning_rate": 0.00015520197615919145,
+      "loss": 1.5422,
+      "step": 657
+    },
+    {
+      "epoch": 0.6633064516129032,
+      "grad_norm": 0.08029603213071823,
+      "learning_rate": 0.0001550664557447873,
+      "loss": 1.5886,
+      "step": 658
+    },
+    {
+      "epoch": 0.6643145161290323,
+      "grad_norm": 0.08529450744390488,
+      "learning_rate": 0.0001549307900276327,
+      "loss": 1.629,
+      "step": 659
+    },
+    {
+      "epoch": 0.6653225806451613,
+      "grad_norm": 0.07882041484117508,
+      "learning_rate": 0.0001547949793657061,
+      "loss": 1.66,
+      "step": 660
+    },
+    {
+      "epoch": 0.6663306451612904,
+      "grad_norm": 0.08514705300331116,
+      "learning_rate": 0.00015465902411736828,
+      "loss": 1.6113,
+      "step": 661
+    },
+    {
+      "epoch": 0.6673387096774194,
+      "grad_norm": 0.07738941162824631,
+      "learning_rate": 0.00015452292464136167,
+      "loss": 1.5959,
+      "step": 662
+    },
+    {
+      "epoch": 0.6683467741935484,
+      "grad_norm": 0.08031867444515228,
+      "learning_rate": 0.0001543866812968092,
+      "loss": 1.601,
+      "step": 663
+    },
+    {
+      "epoch": 0.6693548387096774,
+      "grad_norm": 0.08055873215198517,
+      "learning_rate": 0.00015425029444321347,
+      "loss": 1.5731,
+      "step": 664
+    },
+    {
+      "epoch": 0.6703629032258065,
+      "grad_norm": 0.08486857265233994,
+      "learning_rate": 0.0001541137644404557,
+      "loss": 1.5703,
+      "step": 665
+    },
+    {
+      "epoch": 0.6713709677419355,
+      "grad_norm": 0.07934212684631348,
+      "learning_rate": 0.0001539770916487949,
+      "loss": 1.6163,
+      "step": 666
+    },
+    {
+      "epoch": 0.6723790322580645,
+      "grad_norm": 0.08954691141843796,
+      "learning_rate": 0.0001538402764288668,
+      "loss": 1.6139,
+      "step": 667
+    },
+    {
+      "epoch": 0.6733870967741935,
+      "grad_norm": 0.08842763304710388,
+      "learning_rate": 0.00015370331914168296,
+      "loss": 1.6322,
+      "step": 668
+    },
+    {
+      "epoch": 0.6743951612903226,
+      "grad_norm": 0.08686459064483643,
+      "learning_rate": 0.00015356622014862988,
+      "loss": 1.59,
+      "step": 669
+    },
+    {
+      "epoch": 0.6754032258064516,
+      "grad_norm": 0.07980991154909134,
+      "learning_rate": 0.00015342897981146785,
+      "loss": 1.576,
+      "step": 670
+    },
+    {
+      "epoch": 0.6764112903225806,
+      "grad_norm": 0.08613515645265579,
+      "learning_rate": 0.00015329159849233022,
+      "loss": 1.6328,
+      "step": 671
+    },
+    {
+      "epoch": 0.6774193548387096,
+      "grad_norm": 0.10668696463108063,
+      "learning_rate": 0.0001531540765537223,
+      "loss": 1.6482,
+      "step": 672
+    },
+    {
+      "epoch": 0.6784274193548387,
+      "grad_norm": 0.07826445251703262,
+      "learning_rate": 0.00015301641435852046,
+      "loss": 1.5984,
+      "step": 673
+    },
+    {
+      "epoch": 0.6794354838709677,
+      "grad_norm": 0.09749854356050491,
+      "learning_rate": 0.00015287861226997125,
+      "loss": 1.586,
+      "step": 674
+    },
+    {
+      "epoch": 0.6804435483870968,
+      "grad_norm": 0.09301649779081345,
+      "learning_rate": 0.00015274067065169017,
+      "loss": 1.6806,
+      "step": 675
+    },
+    {
+      "epoch": 0.6814516129032258,
+      "grad_norm": 0.08719351887702942,
+      "learning_rate": 0.00015260258986766104,
+      "loss": 1.5568,
+      "step": 676
+    },
+    {
+      "epoch": 0.6824596774193549,
+      "grad_norm": 0.08005709946155548,
+      "learning_rate": 0.00015246437028223486,
+      "loss": 1.6252,
+      "step": 677
+    },
+    {
+      "epoch": 0.6834677419354839,
+      "grad_norm": 0.08304545283317566,
+      "learning_rate": 0.00015232601226012886,
+      "loss": 1.6137,
+      "step": 678
+    },
+    {
+      "epoch": 0.6844758064516129,
+      "grad_norm": 0.07949443906545639,
+      "learning_rate": 0.0001521875161664256,
+      "loss": 1.5808,
+      "step": 679
+    },
+    {
+      "epoch": 0.6854838709677419,
+      "grad_norm": 0.08979618549346924,
+      "learning_rate": 0.00015204888236657188,
+      "loss": 1.6164,
+      "step": 680
+    },
+    {
+      "epoch": 0.686491935483871,
+      "grad_norm": 0.07843173295259476,
+      "learning_rate": 0.00015191011122637796,
+      "loss": 1.6246,
+      "step": 681
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 0.09026903659105301,
+      "learning_rate": 0.00015177120311201647,
+      "loss": 1.6352,
+      "step": 682
+    },
+    {
+      "epoch": 0.688508064516129,
+      "grad_norm": 0.09385894238948822,
+      "learning_rate": 0.00015163215839002146,
+      "loss": 1.622,
+      "step": 683
+    },
+    {
+      "epoch": 0.6895161290322581,
+      "grad_norm": 0.07961908727884293,
+      "learning_rate": 0.0001514929774272874,
+      "loss": 1.5745,
+      "step": 684
+    },
+    {
+      "epoch": 0.6905241935483871,
+      "grad_norm": 0.08670490235090256,
+      "learning_rate": 0.00015135366059106832,
+      "loss": 1.5945,
+      "step": 685
+    },
+    {
+      "epoch": 0.6915322580645161,
+      "grad_norm": 0.08476680517196655,
+      "learning_rate": 0.00015121420824897678,
+      "loss": 1.6316,
+      "step": 686
+    },
+    {
+      "epoch": 0.6925403225806451,
+      "grad_norm": 0.0937148854136467,
+      "learning_rate": 0.00015107462076898289,
+      "loss": 1.6054,
+      "step": 687
+    },
+    {
+      "epoch": 0.6935483870967742,
+      "grad_norm": 0.08981835842132568,
+      "learning_rate": 0.00015093489851941328,
+      "loss": 1.6683,
+      "step": 688
+    },
+    {
+      "epoch": 0.6945564516129032,
+      "grad_norm": 0.08677362650632858,
+      "learning_rate": 0.0001507950418689503,
+      "loss": 1.6306,
+      "step": 689
+    },
+    {
+      "epoch": 0.6955645161290323,
+      "grad_norm": 0.07769922912120819,
+      "learning_rate": 0.00015065505118663078,
+      "loss": 1.6164,
+      "step": 690
+    },
+    {
+      "epoch": 0.6965725806451613,
+      "grad_norm": 0.08614321053028107,
+      "learning_rate": 0.00015051492684184546,
+      "loss": 1.5615,
+      "step": 691
+    },
+    {
+      "epoch": 0.6975806451612904,
+      "grad_norm": 0.09230528026819229,
+      "learning_rate": 0.00015037466920433753,
+      "loss": 1.6901,
+      "step": 692
+    },
+    {
+      "epoch": 0.6985887096774194,
+      "grad_norm": 0.09350752830505371,
+      "learning_rate": 0.00015023427864420202,
+      "loss": 1.6465,
+      "step": 693
+    },
+    {
+      "epoch": 0.6995967741935484,
+      "grad_norm": 0.09468571841716766,
+      "learning_rate": 0.00015009375553188468,
+      "loss": 1.6485,
+      "step": 694
+    },
+    {
+      "epoch": 0.7006048387096774,
+      "grad_norm": 0.08464954793453217,
+      "learning_rate": 0.00014995310023818107,
+      "loss": 1.5865,
+      "step": 695
+    },
+    {
+      "epoch": 0.7016129032258065,
+      "grad_norm": 0.09060323238372803,
+      "learning_rate": 0.00014981231313423545,
+      "loss": 1.6074,
+      "step": 696
+    },
+    {
+      "epoch": 0.7026209677419355,
+      "grad_norm": 0.08714771270751953,
+      "learning_rate": 0.00014967139459153993,
+      "loss": 1.5824,
+      "step": 697
+    },
+    {
+      "epoch": 0.7036290322580645,
+      "grad_norm": 0.0776834785938263,
+      "learning_rate": 0.00014953034498193341,
+      "loss": 1.5689,
+      "step": 698
+    },
+    {
+      "epoch": 0.7046370967741935,
+      "grad_norm": 0.08315813541412354,
+      "learning_rate": 0.0001493891646776007,
+      "loss": 1.6187,
+      "step": 699
+    },
+    {
+      "epoch": 0.7056451612903226,
+      "grad_norm": 0.07914920896291733,
+      "learning_rate": 0.00014924785405107143,
+      "loss": 1.5417,
+      "step": 700
+    },
+    {
+      "epoch": 0.7066532258064516,
+      "grad_norm": 0.08314627408981323,
+      "learning_rate": 0.00014910641347521907,
+      "loss": 1.6298,
+      "step": 701
+    },
+    {
+      "epoch": 0.7076612903225806,
+      "grad_norm": 0.07665257155895233,
+      "learning_rate": 0.0001489648433232601,
+      "loss": 1.5464,
+      "step": 702
+    },
+    {
+      "epoch": 0.7086693548387096,
+      "grad_norm": 0.09670589119195938,
+      "learning_rate": 0.00014882314396875274,
+      "loss": 1.654,
+      "step": 703
+    },
+    {
+      "epoch": 0.7096774193548387,
+      "grad_norm": 0.08459917455911636,
+      "learning_rate": 0.00014868131578559633,
+      "loss": 1.6326,
+      "step": 704
+    },
+    {
+      "epoch": 0.7106854838709677,
+      "grad_norm": 0.08236029744148254,
+      "learning_rate": 0.00014853935914802994,
+      "loss": 1.59,
+      "step": 705
+    },
+    {
+      "epoch": 0.7116935483870968,
+      "grad_norm": 0.07780009508132935,
+      "learning_rate": 0.0001483972744306318,
+      "loss": 1.5801,
+      "step": 706
+    },
+    {
+      "epoch": 0.7127016129032258,
+      "grad_norm": 0.0835953950881958,
+      "learning_rate": 0.00014825506200831794,
+      "loss": 1.5765,
+      "step": 707
+    },
+    {
+      "epoch": 0.7137096774193549,
+      "grad_norm": 0.08014727383852005,
+      "learning_rate": 0.00014811272225634145,
+      "loss": 1.6156,
+      "step": 708
+    },
+    {
+      "epoch": 0.7147177419354839,
+      "grad_norm": 0.08108653128147125,
+      "learning_rate": 0.00014797025555029133,
+      "loss": 1.5825,
+      "step": 709
+    },
+    {
+      "epoch": 0.7157258064516129,
+      "grad_norm": 0.08455085754394531,
+      "learning_rate": 0.00014782766226609166,
+      "loss": 1.6218,
+      "step": 710
+    },
+    {
+      "epoch": 0.7167338709677419,
+      "grad_norm": 0.07630985975265503,
+      "learning_rate": 0.00014768494278000048,
+      "loss": 1.5889,
+      "step": 711
+    },
+    {
+      "epoch": 0.717741935483871,
+      "grad_norm": 0.08318428695201874,
+      "learning_rate": 0.00014754209746860878,
+      "loss": 1.5827,
+      "step": 712
+    },
+    {
+      "epoch": 0.71875,
+      "grad_norm": 0.08248715102672577,
+      "learning_rate": 0.00014739912670883967,
+      "loss": 1.621,
+      "step": 713
+    },
+    {
+      "epoch": 0.719758064516129,
+      "grad_norm": 0.07857991755008698,
+      "learning_rate": 0.00014725603087794716,
+      "loss": 1.5605,
+      "step": 714
+    },
+    {
+      "epoch": 0.7207661290322581,
+      "grad_norm": 0.08540824055671692,
+      "learning_rate": 0.0001471128103535154,
+      "loss": 1.5471,
+      "step": 715
+    },
+    {
+      "epoch": 0.7217741935483871,
+      "grad_norm": 0.0777583196759224,
+      "learning_rate": 0.00014696946551345747,
+      "loss": 1.5029,
+      "step": 716
+    },
+    {
+      "epoch": 0.7227822580645161,
+      "grad_norm": 0.08295831829309464,
+      "learning_rate": 0.00014682599673601458,
+      "loss": 1.5709,
+      "step": 717
+    },
+    {
+      "epoch": 0.7237903225806451,
+      "grad_norm": 0.08069245517253876,
+      "learning_rate": 0.00014668240439975482,
+      "loss": 1.5601,
+      "step": 718
+    },
+    {
+      "epoch": 0.7247983870967742,
+      "grad_norm": 0.08142071962356567,
+      "learning_rate": 0.00014653868888357249,
+      "loss": 1.6004,
+      "step": 719
+    },
+    {
+      "epoch": 0.7258064516129032,
+      "grad_norm": 0.09048129618167877,
+      "learning_rate": 0.0001463948505666868,
+      "loss": 1.6614,
+      "step": 720
+    },
+    {
+      "epoch": 0.7268145161290323,
+      "grad_norm": 0.09065764397382736,
+      "learning_rate": 0.00014625088982864098,
+      "loss": 1.6612,
+      "step": 721
+    },
+    {
+      "epoch": 0.7278225806451613,
+      "grad_norm": 0.0859372541308403,
+      "learning_rate": 0.00014610680704930142,
+      "loss": 1.5914,
+      "step": 722
+    },
+    {
+      "epoch": 0.7288306451612904,
+      "grad_norm": 0.0821571797132492,
+      "learning_rate": 0.0001459626026088564,
+      "loss": 1.5458,
+      "step": 723
+    },
+    {
+      "epoch": 0.7298387096774194,
+      "grad_norm": 0.08414388447999954,
+      "learning_rate": 0.0001458182768878153,
+      "loss": 1.5608,
+      "step": 724
+    },
+    {
+      "epoch": 0.7308467741935484,
+      "grad_norm": 0.08222994953393936,
+      "learning_rate": 0.00014567383026700752,
+      "loss": 1.5943,
+      "step": 725
+    },
+    {
+      "epoch": 0.7318548387096774,
+      "grad_norm": 0.08996201306581497,
+      "learning_rate": 0.0001455292631275814,
+      "loss": 1.5524,
+      "step": 726
+    },
+    {
+      "epoch": 0.7328629032258065,
+      "grad_norm": 0.08061891794204712,
+      "learning_rate": 0.0001453845758510034,
+      "loss": 1.6428,
+      "step": 727
+    },
+    {
+      "epoch": 0.7338709677419355,
+      "grad_norm": 0.09720771759748459,
+      "learning_rate": 0.0001452397688190569,
+      "loss": 1.6538,
+      "step": 728
+    },
+    {
+      "epoch": 0.7348790322580645,
+      "grad_norm": 0.08087541162967682,
+      "learning_rate": 0.00014509484241384134,
+      "loss": 1.6078,
+      "step": 729
+    },
+    {
+      "epoch": 0.7358870967741935,
+      "grad_norm": 0.09106358885765076,
+      "learning_rate": 0.00014494979701777102,
+      "loss": 1.589,
+      "step": 730
+    },
+    {
+      "epoch": 0.7368951612903226,
+      "grad_norm": 0.07827623188495636,
+      "learning_rate": 0.00014480463301357445,
+      "loss": 1.5937,
+      "step": 731
+    },
+    {
+      "epoch": 0.7379032258064516,
+      "grad_norm": 0.09681122750043869,
+      "learning_rate": 0.00014465935078429286,
+      "loss": 1.6308,
+      "step": 732
+    },
+    {
+      "epoch": 0.7389112903225806,
+      "grad_norm": 0.0876043364405632,
+      "learning_rate": 0.00014451395071327964,
+      "loss": 1.6136,
+      "step": 733
+    },
+    {
+      "epoch": 0.7399193548387096,
+      "grad_norm": 0.10326588153839111,
+      "learning_rate": 0.00014436843318419896,
+      "loss": 1.5964,
+      "step": 734
+    },
+    {
+      "epoch": 0.7409274193548387,
+      "grad_norm": 0.08790312707424164,
+      "learning_rate": 0.00014422279858102504,
+      "loss": 1.5992,
+      "step": 735
+    },
+    {
+      "epoch": 0.7419354838709677,
+      "grad_norm": 0.0805894061923027,
+      "learning_rate": 0.00014407704728804097,
+      "loss": 1.5503,
+      "step": 736
+    },
+    {
+      "epoch": 0.7429435483870968,
+      "grad_norm": 0.0813809409737587,
+      "learning_rate": 0.00014393117968983777,
+      "loss": 1.5807,
+      "step": 737
+    },
+    {
+      "epoch": 0.7439516129032258,
+      "grad_norm": 0.0871429443359375,
+      "learning_rate": 0.0001437851961713133,
+      "loss": 1.6493,
+      "step": 738
+    },
+    {
+      "epoch": 0.7449596774193549,
+      "grad_norm": 0.08929460495710373,
+      "learning_rate": 0.0001436390971176714,
+      "loss": 1.58,
+      "step": 739
+    },
+    {
+      "epoch": 0.7459677419354839,
+      "grad_norm": 0.08278234302997589,
+      "learning_rate": 0.0001434928829144206,
+      "loss": 1.6442,
+      "step": 740
+    },
+    {
+      "epoch": 0.7469758064516129,
+      "grad_norm": 0.09997319430112839,
+      "learning_rate": 0.00014334655394737355,
+      "loss": 1.5756,
+      "step": 741
+    },
+    {
+      "epoch": 0.7479838709677419,
+      "grad_norm": 0.07914005219936371,
+      "learning_rate": 0.0001432001106026454,
+      "loss": 1.5642,
+      "step": 742
+    },
+    {
+      "epoch": 0.748991935483871,
+      "grad_norm": 0.09618489444255829,
+      "learning_rate": 0.00014305355326665339,
+      "loss": 1.6108,
+      "step": 743
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.09149473160505295,
+      "learning_rate": 0.00014290688232611526,
+      "loss": 1.6007,
+      "step": 744
+    },
+    {
+      "epoch": 0.751008064516129,
+      "grad_norm": 0.08550098538398743,
+      "learning_rate": 0.00014276009816804885,
+      "loss": 1.588,
+      "step": 745
+    },
+    {
+      "epoch": 0.7520161290322581,
+      "grad_norm": 0.08285672217607498,
+      "learning_rate": 0.00014261320117977042,
+      "loss": 1.5845,
+      "step": 746
+    },
+    {
+      "epoch": 0.7530241935483871,
+      "grad_norm": 0.09440962970256805,
+      "learning_rate": 0.00014246619174889422,
+      "loss": 1.7127,
+      "step": 747
+    },
+    {
+      "epoch": 0.7540322580645161,
+      "grad_norm": 0.08045286685228348,
+      "learning_rate": 0.00014231907026333098,
+      "loss": 1.6066,
+      "step": 748
+    },
+    {
+      "epoch": 0.7550403225806451,
+      "grad_norm": 0.08301718533039093,
+      "learning_rate": 0.0001421718371112873,
+      "loss": 1.5732,
+      "step": 749
+    },
+    {
+      "epoch": 0.7560483870967742,
+      "grad_norm": 0.08225584775209427,
+      "learning_rate": 0.00014202449268126426,
+      "loss": 1.563,
+      "step": 750
+    },
+    {
+      "epoch": 0.7570564516129032,
+      "grad_norm": 0.08871738612651825,
+      "learning_rate": 0.00014187703736205667,
+      "loss": 1.6364,
+      "step": 751
+    },
+    {
+      "epoch": 0.7580645161290323,
+      "grad_norm": 0.08189701288938522,
+      "learning_rate": 0.00014172947154275195,
+      "loss": 1.5972,
+      "step": 752
+    },
+    {
+      "epoch": 0.7590725806451613,
+      "grad_norm": 0.08560924977064133,
+      "learning_rate": 0.00014158179561272907,
+      "loss": 1.5971,
+      "step": 753
+    },
+    {
+      "epoch": 0.7600806451612904,
+      "grad_norm": 0.08616410940885544,
+      "learning_rate": 0.00014143400996165746,
+      "loss": 1.6331,
+      "step": 754
+    },
+    {
+      "epoch": 0.7610887096774194,
+      "grad_norm": 0.08963197469711304,
+      "learning_rate": 0.00014128611497949626,
+      "loss": 1.5887,
+      "step": 755
+    },
+    {
+      "epoch": 0.7620967741935484,
+      "grad_norm": 0.09272851049900055,
+      "learning_rate": 0.0001411381110564929,
+      "loss": 1.5692,
+      "step": 756
+    },
+    {
+      "epoch": 0.7631048387096774,
+      "grad_norm": 0.08667407929897308,
+      "learning_rate": 0.0001409899985831824,
+      "loss": 1.5852,
+      "step": 757
+    },
+    {
+      "epoch": 0.7641129032258065,
+      "grad_norm": 0.08354497700929642,
+      "learning_rate": 0.00014084177795038613,
+      "loss": 1.6024,
+      "step": 758
+    },
+    {
+      "epoch": 0.7651209677419355,
+      "grad_norm": 0.09121601283550262,
+      "learning_rate": 0.00014069344954921096,
+      "loss": 1.5896,
+      "step": 759
+    },
+    {
+      "epoch": 0.7661290322580645,
+      "grad_norm": 0.09622003138065338,
+      "learning_rate": 0.00014054501377104797,
+      "loss": 1.5781,
+      "step": 760
+    },
+    {
+      "epoch": 0.7671370967741935,
+      "grad_norm": 0.08506747335195541,
+      "learning_rate": 0.00014039647100757177,
+      "loss": 1.5752,
+      "step": 761
+    },
+    {
+      "epoch": 0.7681451612903226,
+      "grad_norm": 0.09725549817085266,
+      "learning_rate": 0.00014024782165073912,
+      "loss": 1.599,
+      "step": 762
+    },
+    {
+      "epoch": 0.7691532258064516,
+      "grad_norm": 0.08023160696029663,
+      "learning_rate": 0.00014009906609278806,
+      "loss": 1.5503,
+      "step": 763
+    },
+    {
+      "epoch": 0.7701612903225806,
+      "grad_norm": 0.092674620449543,
+      "learning_rate": 0.00013995020472623693,
+      "loss": 1.6196,
+      "step": 764
+    },
+    {
+      "epoch": 0.7711693548387096,
+      "grad_norm": 0.07756571471691132,
+      "learning_rate": 0.0001398012379438832,
+      "loss": 1.599,
+      "step": 765
+    },
+    {
+      "epoch": 0.7721774193548387,
+      "grad_norm": 0.09609861671924591,
+      "learning_rate": 0.00013965216613880257,
+      "loss": 1.6356,
+      "step": 766
+    },
+    {
+      "epoch": 0.7731854838709677,
+      "grad_norm": 0.08073242753744125,
+      "learning_rate": 0.00013950298970434775,
+      "loss": 1.5975,
+      "step": 767
+    },
+    {
+      "epoch": 0.7741935483870968,
+      "grad_norm": 0.08342421054840088,
+      "learning_rate": 0.00013935370903414768,
+      "loss": 1.594,
+      "step": 768
+    },
+    {
+      "epoch": 0.7752016129032258,
+      "grad_norm": 0.07886181771755219,
+      "learning_rate": 0.00013920432452210619,
+      "loss": 1.5947,
+      "step": 769
+    },
+    {
+      "epoch": 0.7762096774193549,
+      "grad_norm": 0.08256496489048004,
+      "learning_rate": 0.00013905483656240125,
+      "loss": 1.5772,
+      "step": 770
+    },
+    {
+      "epoch": 0.7772177419354839,
+      "grad_norm": 0.08527923375368118,
+      "learning_rate": 0.0001389052455494837,
+      "loss": 1.5936,
+      "step": 771
+    },
+    {
+      "epoch": 0.7782258064516129,
+      "grad_norm": 0.08340179920196533,
+      "learning_rate": 0.00013875555187807637,
+      "loss": 1.5786,
+      "step": 772
+    },
+    {
+      "epoch": 0.7792338709677419,
+      "grad_norm": 0.07682585716247559,
+      "learning_rate": 0.00013860575594317292,
+      "loss": 1.542,
+      "step": 773
+    },
+    {
+      "epoch": 0.780241935483871,
+      "grad_norm": 0.08884165436029434,
+      "learning_rate": 0.00013845585814003684,
+      "loss": 1.5969,
+      "step": 774
+    },
+    {
+      "epoch": 0.78125,
+      "grad_norm": 0.07785353809595108,
+      "learning_rate": 0.00013830585886420054,
+      "loss": 1.5671,
+      "step": 775
+    },
+    {
+      "epoch": 0.782258064516129,
+      "grad_norm": 0.08034134656190872,
+      "learning_rate": 0.000138155758511464,
+      "loss": 1.5774,
+      "step": 776
+    },
+    {
+      "epoch": 0.7832661290322581,
+      "grad_norm": 0.0796407014131546,
+      "learning_rate": 0.0001380055574778941,
+      "loss": 1.5606,
+      "step": 777
+    },
+    {
+      "epoch": 0.7842741935483871,
+      "grad_norm": 0.07933478057384491,
+      "learning_rate": 0.00013785525615982319,
+      "loss": 1.5651,
+      "step": 778
+    },
+    {
+      "epoch": 0.7852822580645161,
+      "grad_norm": 0.08734553307294846,
+      "learning_rate": 0.00013770485495384843,
+      "loss": 1.6262,
+      "step": 779
+    },
+    {
+      "epoch": 0.7862903225806451,
+      "grad_norm": 0.08349025249481201,
+      "learning_rate": 0.0001375543542568304,
+      "loss": 1.5835,
+      "step": 780
+    },
+    {
+      "epoch": 0.7872983870967742,
+      "grad_norm": 0.09640732407569885,
+      "learning_rate": 0.00013740375446589232,
+      "loss": 1.586,
+      "step": 781
+    },
+    {
+      "epoch": 0.7883064516129032,
+      "grad_norm": 0.09520639479160309,
+      "learning_rate": 0.00013725305597841878,
+      "loss": 1.6521,
+      "step": 782
+    },
+    {
+      "epoch": 0.7893145161290323,
+      "grad_norm": 0.07939834147691727,
+      "learning_rate": 0.00013710225919205484,
+      "loss": 1.5062,
+      "step": 783
+    },
+    {
+      "epoch": 0.7903225806451613,
+      "grad_norm": 0.08648645132780075,
+      "learning_rate": 0.000136951364504705,
+      "loss": 1.6303,
+      "step": 784
+    },
+    {
+      "epoch": 0.7913306451612904,
+      "grad_norm": 0.09467138350009918,
+      "learning_rate": 0.00013680037231453203,
+      "loss": 1.6333,
+      "step": 785
+    },
+    {
+      "epoch": 0.7923387096774194,
+      "grad_norm": 0.08505504578351974,
+      "learning_rate": 0.000136649283019956,
+      "loss": 1.5953,
+      "step": 786
+    },
+    {
+      "epoch": 0.7933467741935484,
+      "grad_norm": 0.0903257429599762,
+      "learning_rate": 0.00013649809701965311,
+      "loss": 1.5841,
+      "step": 787
+    },
+    {
+      "epoch": 0.7943548387096774,
+      "grad_norm": 0.08327475190162659,
+      "learning_rate": 0.00013634681471255493,
+      "loss": 1.578,
+      "step": 788
+    },
+    {
+      "epoch": 0.7953629032258065,
+      "grad_norm": 0.09311467409133911,
+      "learning_rate": 0.000136195436497847,
+      "loss": 1.5911,
+      "step": 789
+    },
+    {
+      "epoch": 0.7963709677419355,
+      "grad_norm": 0.09214780479669571,
+      "learning_rate": 0.00013604396277496796,
+      "loss": 1.6009,
+      "step": 790
+    },
+    {
+      "epoch": 0.7973790322580645,
+      "grad_norm": 0.08812731504440308,
+      "learning_rate": 0.00013589239394360848,
+      "loss": 1.6141,
+      "step": 791
+    },
+    {
+      "epoch": 0.7983870967741935,
+      "grad_norm": 0.11389174312353134,
+      "learning_rate": 0.00013574073040371022,
+      "loss": 1.6369,
+      "step": 792
+    },
+    {
+      "epoch": 0.7993951612903226,
+      "grad_norm": 0.08469700813293457,
+      "learning_rate": 0.00013558897255546473,
+      "loss": 1.6009,
+      "step": 793
+    },
+    {
+      "epoch": 0.8004032258064516,
+      "grad_norm": 0.08306135982275009,
+      "learning_rate": 0.0001354371207993123,
+      "loss": 1.5556,
+      "step": 794
+    },
+    {
+      "epoch": 0.8014112903225806,
+      "grad_norm": 0.08287226408720016,
+      "learning_rate": 0.00013528517553594124,
+      "loss": 1.571,
+      "step": 795
+    },
+    {
+      "epoch": 0.8024193548387096,
+      "grad_norm": 0.0797332376241684,
+      "learning_rate": 0.00013513313716628637,
+      "loss": 1.5679,
+      "step": 796
+    },
+    {
+      "epoch": 0.8034274193548387,
+      "grad_norm": 0.07978206872940063,
+      "learning_rate": 0.0001349810060915283,
+      "loss": 1.5865,
+      "step": 797
+    },
+    {
+      "epoch": 0.8044354838709677,
+      "grad_norm": 0.07792511582374573,
+      "learning_rate": 0.00013482878271309226,
+      "loss": 1.5849,
+      "step": 798
+    },
+    {
+      "epoch": 0.8054435483870968,
+      "grad_norm": 0.07994278520345688,
+      "learning_rate": 0.000134676467432647,
+      "loss": 1.6026,
+      "step": 799
+    },
+    {
+      "epoch": 0.8064516129032258,
+      "grad_norm": 0.08317188918590546,
+      "learning_rate": 0.00013452406065210382,
+      "loss": 1.6333,
+      "step": 800
+    },
+    {
+      "epoch": 0.8074596774193549,
+      "grad_norm": 0.09058106690645218,
+      "learning_rate": 0.00013437156277361538,
+      "loss": 1.5936,
+      "step": 801
+    },
+    {
+      "epoch": 0.8084677419354839,
+      "grad_norm": 0.08963512629270554,
+      "learning_rate": 0.00013421897419957482,
+      "loss": 1.6422,
+      "step": 802
+    },
+    {
+      "epoch": 0.8094758064516129,
+      "grad_norm": 0.09142173826694489,
+      "learning_rate": 0.0001340662953326145,
+      "loss": 1.6779,
+      "step": 803
+    },
+    {
+      "epoch": 0.8104838709677419,
+      "grad_norm": 0.08868789672851562,
+      "learning_rate": 0.00013391352657560513,
+      "loss": 1.6594,
+      "step": 804
+    },
+    {
+      "epoch": 0.811491935483871,
+      "grad_norm": 0.08746343106031418,
+      "learning_rate": 0.0001337606683316545,
+      "loss": 1.5312,
+      "step": 805
+    },
+    {
+      "epoch": 0.8125,
+      "grad_norm": 0.07589108496904373,
+      "learning_rate": 0.00013360772100410665,
+      "loss": 1.5462,
+      "step": 806
+    },
+    {
+      "epoch": 0.813508064516129,
+      "grad_norm": 0.0817432850599289,
+      "learning_rate": 0.00013345468499654056,
+      "loss": 1.5393,
+      "step": 807
+    },
+    {
+      "epoch": 0.8145161290322581,
+      "grad_norm": 0.07965264469385147,
+      "learning_rate": 0.00013330156071276932,
+      "loss": 1.5687,
+      "step": 808
+    },
+    {
+      "epoch": 0.8155241935483871,
+      "grad_norm": 0.08861200511455536,
+      "learning_rate": 0.00013314834855683886,
+      "loss": 1.6412,
+      "step": 809
+    },
+    {
+      "epoch": 0.8165322580645161,
+      "grad_norm": 0.07894746214151382,
+      "learning_rate": 0.00013299504893302705,
+      "loss": 1.5738,
+      "step": 810
+    },
+    {
+      "epoch": 0.8175403225806451,
+      "grad_norm": 0.07987947016954422,
+      "learning_rate": 0.00013284166224584253,
+      "loss": 1.6212,
+      "step": 811
+    },
+    {
+      "epoch": 0.8185483870967742,
+      "grad_norm": 0.09027516096830368,
+      "learning_rate": 0.0001326881889000236,
+      "loss": 1.6113,
+      "step": 812
+    },
+    {
+      "epoch": 0.8195564516129032,
+      "grad_norm": 0.11448541283607483,
+      "learning_rate": 0.00013253462930053742,
+      "loss": 1.6315,
+      "step": 813
+    },
+    {
+      "epoch": 0.8205645161290323,
+      "grad_norm": 0.08771926164627075,
+      "learning_rate": 0.00013238098385257848,
+      "loss": 1.5919,
+      "step": 814
+    },
+    {
+      "epoch": 0.8215725806451613,
+      "grad_norm": 0.09016083925962448,
+      "learning_rate": 0.00013222725296156807,
+      "loss": 1.5629,
+      "step": 815
+    },
+    {
+      "epoch": 0.8225806451612904,
+      "grad_norm": 0.08411089330911636,
+      "learning_rate": 0.0001320734370331527,
+      "loss": 1.6037,
+      "step": 816
+    },
+    {
+      "epoch": 0.8235887096774194,
+      "grad_norm": 0.09559720754623413,
+      "learning_rate": 0.0001319195364732034,
+      "loss": 1.5463,
+      "step": 817
+    },
+    {
+      "epoch": 0.8245967741935484,
+      "grad_norm": 0.10408146679401398,
+      "learning_rate": 0.00013176555168781451,
+      "loss": 1.5768,
+      "step": 818
+    },
+    {
+      "epoch": 0.8256048387096774,
+      "grad_norm": 0.09700962156057358,
+      "learning_rate": 0.00013161148308330257,
+      "loss": 1.5739,
+      "step": 819
+    },
+    {
+      "epoch": 0.8266129032258065,
+      "grad_norm": 0.10024348646402359,
+      "learning_rate": 0.00013145733106620532,
+      "loss": 1.6281,
+      "step": 820
+    },
+    {
+      "epoch": 0.8276209677419355,
+      "grad_norm": 0.09777159988880157,
+      "learning_rate": 0.00013130309604328057,
+      "loss": 1.6059,
+      "step": 821
+    },
+    {
+      "epoch": 0.8286290322580645,
+      "grad_norm": 0.0887807309627533,
+      "learning_rate": 0.00013114877842150516,
+      "loss": 1.5857,
+      "step": 822
+    },
+    {
+      "epoch": 0.8296370967741935,
+      "grad_norm": 0.09031641483306885,
+      "learning_rate": 0.000130994378608074,
+      "loss": 1.5523,
+      "step": 823
+    },
+    {
+      "epoch": 0.8306451612903226,
+      "grad_norm": 0.0985943153500557,
+      "learning_rate": 0.00013083989701039868,
+      "loss": 1.5464,
+      "step": 824
+    },
+    {
+      "epoch": 0.8316532258064516,
+      "grad_norm": 0.09250693768262863,
+      "learning_rate": 0.0001306853340361067,
+      "loss": 1.5564,
+      "step": 825
+    },
+    {
+      "epoch": 0.8326612903225806,
+      "grad_norm": 0.10353913903236389,
+      "learning_rate": 0.0001305306900930403,
+      "loss": 1.6126,
+      "step": 826
+    },
+    {
+      "epoch": 0.8336693548387096,
+      "grad_norm": 0.10408423840999603,
+      "learning_rate": 0.00013037596558925532,
+      "loss": 1.5946,
+      "step": 827
+    },
+    {
+      "epoch": 0.8346774193548387,
+      "grad_norm": 0.09186139702796936,
+      "learning_rate": 0.00013022116093302022,
+      "loss": 1.5692,
+      "step": 828
+    },
+    {
+      "epoch": 0.8356854838709677,
+      "grad_norm": 0.08551473915576935,
+      "learning_rate": 0.00013006627653281493,
+      "loss": 1.5486,
+      "step": 829
+    },
+    {
+      "epoch": 0.8366935483870968,
+      "grad_norm": 0.0928485244512558,
+      "learning_rate": 0.0001299113127973298,
+      "loss": 1.5435,
+      "step": 830
+    },
+    {
+      "epoch": 0.8377016129032258,
+      "grad_norm": 0.08251947164535522,
+      "learning_rate": 0.00012975627013546453,
+      "loss": 1.5519,
+      "step": 831
+    },
+    {
+      "epoch": 0.8387096774193549,
+      "grad_norm": 0.09292181581258774,
+      "learning_rate": 0.0001296011489563271,
+      "loss": 1.6129,
+      "step": 832
+    },
+    {
+      "epoch": 0.8397177419354839,
+      "grad_norm": 0.07900629937648773,
+      "learning_rate": 0.00012944594966923263,
+      "loss": 1.5951,
+      "step": 833
+    },
+    {
+      "epoch": 0.8407258064516129,
+      "grad_norm": 0.08966945856809616,
+      "learning_rate": 0.00012929067268370234,
+      "loss": 1.5484,
+      "step": 834
+    },
+    {
+      "epoch": 0.8417338709677419,
+      "grad_norm": 0.08244184404611588,
+      "learning_rate": 0.00012913531840946248,
+      "loss": 1.5852,
+      "step": 835
+    },
+    {
+      "epoch": 0.842741935483871,
+      "grad_norm": 0.0986471101641655,
+      "learning_rate": 0.00012897988725644335,
+      "loss": 1.5797,
+      "step": 836
+    },
+    {
+      "epoch": 0.84375,
+      "grad_norm": 0.09217972308397293,
+      "learning_rate": 0.0001288243796347779,
+      "loss": 1.6433,
+      "step": 837
+    },
+    {
+      "epoch": 0.844758064516129,
+      "grad_norm": 0.07959865033626556,
+      "learning_rate": 0.00012866879595480098,
+      "loss": 1.5639,
+      "step": 838
+    },
+    {
+      "epoch": 0.8457661290322581,
+      "grad_norm": 0.08987965434789658,
+      "learning_rate": 0.0001285131366270482,
+      "loss": 1.567,
+      "step": 839
+    },
+    {
+      "epoch": 0.8467741935483871,
+      "grad_norm": 0.08139210939407349,
+      "learning_rate": 0.00012835740206225464,
+      "loss": 1.5881,
+      "step": 840
+    },
+    {
+      "epoch": 0.8477822580645161,
+      "grad_norm": 0.09342298656702042,
+      "learning_rate": 0.00012820159267135396,
+      "loss": 1.6147,
+      "step": 841
+    },
+    {
+      "epoch": 0.8487903225806451,
+      "grad_norm": 0.08475241810083389,
+      "learning_rate": 0.0001280457088654773,
+      "loss": 1.6063,
+      "step": 842
+    },
+    {
+      "epoch": 0.8497983870967742,
+      "grad_norm": 0.0910174772143364,
+      "learning_rate": 0.00012788975105595214,
+      "loss": 1.6055,
+      "step": 843
+    },
+    {
+      "epoch": 0.8508064516129032,
+      "grad_norm": 0.08082278817892075,
+      "learning_rate": 0.00012773371965430115,
+      "loss": 1.5668,
+      "step": 844
+    },
+    {
+      "epoch": 0.8518145161290323,
+      "grad_norm": 0.0862516313791275,
+      "learning_rate": 0.00012757761507224132,
+      "loss": 1.5415,
+      "step": 845
+    },
+    {
+      "epoch": 0.8528225806451613,
+      "grad_norm": 0.07902859151363373,
+      "learning_rate": 0.00012742143772168264,
+      "loss": 1.5333,
+      "step": 846
+    },
+    {
+      "epoch": 0.8538306451612904,
+      "grad_norm": 0.090780109167099,
+      "learning_rate": 0.00012726518801472718,
+      "loss": 1.6311,
+      "step": 847
+    },
+    {
+      "epoch": 0.8548387096774194,
+      "grad_norm": 0.08239061385393143,
+      "learning_rate": 0.0001271088663636679,
+      "loss": 1.5331,
+      "step": 848
+    },
+    {
+      "epoch": 0.8558467741935484,
+      "grad_norm": 0.08999927341938019,
+      "learning_rate": 0.0001269524731809875,
+      "loss": 1.5775,
+      "step": 849
+    },
+    {
+      "epoch": 0.8568548387096774,
+      "grad_norm": 0.07954005897045135,
+      "learning_rate": 0.00012679600887935768,
+      "loss": 1.5969,
+      "step": 850
+    },
+    {
+      "epoch": 0.8578629032258065,
+      "grad_norm": 0.08286864310503006,
+      "learning_rate": 0.00012663947387163755,
+      "loss": 1.551,
+      "step": 851
+    },
+    {
+      "epoch": 0.8588709677419355,
+      "grad_norm": 0.08236175030469894,
+      "learning_rate": 0.00012648286857087294,
+      "loss": 1.5575,
+      "step": 852
+    },
+    {
+      "epoch": 0.8598790322580645,
+      "grad_norm": 0.08063997328281403,
+      "learning_rate": 0.00012632619339029508,
+      "loss": 1.5899,
+      "step": 853
+    },
+    {
+      "epoch": 0.8608870967741935,
+      "grad_norm": 0.08329153805971146,
+      "learning_rate": 0.00012616944874331963,
+      "loss": 1.5523,
+      "step": 854
+    },
+    {
+      "epoch": 0.8618951612903226,
+      "grad_norm": 0.08181768655776978,
+      "learning_rate": 0.00012601263504354555,
+      "loss": 1.5743,
+      "step": 855
+    },
+    {
+      "epoch": 0.8629032258064516,
+      "grad_norm": 0.07989370822906494,
+      "learning_rate": 0.00012585575270475402,
+      "loss": 1.5629,
+      "step": 856
+    },
+    {
+      "epoch": 0.8639112903225806,
+      "grad_norm": 0.0804544985294342,
+      "learning_rate": 0.00012569880214090726,
+      "loss": 1.5573,
+      "step": 857
+    },
+    {
+      "epoch": 0.8649193548387096,
+      "grad_norm": 0.08739953488111496,
+      "learning_rate": 0.0001255417837661476,
+      "loss": 1.5705,
+      "step": 858
+    },
+    {
+      "epoch": 0.8659274193548387,
+      "grad_norm": 0.08386445045471191,
+      "learning_rate": 0.00012538469799479627,
+      "loss": 1.6106,
+      "step": 859
+    },
+    {
+      "epoch": 0.8669354838709677,
+      "grad_norm": 0.10252925008535385,
+      "learning_rate": 0.00012522754524135228,
+      "loss": 1.5472,
+      "step": 860
+    },
+    {
+      "epoch": 0.8679435483870968,
+      "grad_norm": 0.08197301626205444,
+      "learning_rate": 0.0001250703259204916,
+      "loss": 1.5955,
+      "step": 861
+    },
+    {
+      "epoch": 0.8689516129032258,
+      "grad_norm": 0.09445837140083313,
+      "learning_rate": 0.00012491304044706553,
+      "loss": 1.5536,
+      "step": 862
+    },
+    {
+      "epoch": 0.8699596774193549,
+      "grad_norm": 0.0779092088341713,
+      "learning_rate": 0.00012475568923610015,
+      "loss": 1.5235,
+      "step": 863
+    },
+    {
+      "epoch": 0.8709677419354839,
+      "grad_norm": 0.08657954633235931,
+      "learning_rate": 0.00012459827270279499,
+      "loss": 1.5306,
+      "step": 864
+    },
+    {
+      "epoch": 0.8719758064516129,
+      "grad_norm": 0.08000969886779785,
+      "learning_rate": 0.0001244407912625218,
+      "loss": 1.5451,
+      "step": 865
+    },
+    {
+      "epoch": 0.8729838709677419,
+      "grad_norm": 0.1217707023024559,
+      "learning_rate": 0.00012428324533082376,
+      "loss": 1.5896,
+      "step": 866
+    },
+    {
+      "epoch": 0.873991935483871,
+      "grad_norm": 0.09770061075687408,
+      "learning_rate": 0.00012412563532341413,
+      "loss": 1.5649,
+      "step": 867
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 0.08925329893827438,
+      "learning_rate": 0.0001239679616561753,
+      "loss": 1.59,
+      "step": 868
+    },
+    {
+      "epoch": 0.876008064516129,
+      "grad_norm": 0.0919514149427414,
+      "learning_rate": 0.0001238102247451575,
+      "loss": 1.6517,
+      "step": 869
+    },
+    {
+      "epoch": 0.8770161290322581,
+      "grad_norm": 0.0922718271613121,
+      "learning_rate": 0.0001236524250065781,
+      "loss": 1.6104,
+      "step": 870
+    },
+    {
+      "epoch": 0.8780241935483871,
+      "grad_norm": 0.08782748132944107,
+      "learning_rate": 0.00012349456285682002,
+      "loss": 1.6027,
+      "step": 871
+    },
+    {
+      "epoch": 0.8790322580645161,
+      "grad_norm": 0.08689384907484055,
+      "learning_rate": 0.00012333663871243094,
+      "loss": 1.5969,
+      "step": 872
+    },
+    {
+      "epoch": 0.8800403225806451,
+      "grad_norm": 0.08294008672237396,
+      "learning_rate": 0.00012317865299012212,
+      "loss": 1.5852,
+      "step": 873
+    },
+    {
+      "epoch": 0.8810483870967742,
+      "grad_norm": 0.1106681302189827,
+      "learning_rate": 0.00012302060610676737,
+      "loss": 1.622,
+      "step": 874
+    },
+    {
+      "epoch": 0.8820564516129032,
+      "grad_norm": 0.10415118932723999,
+      "learning_rate": 0.00012286249847940178,
+      "loss": 1.6416,
+      "step": 875
+    },
+    {
+      "epoch": 0.8830645161290323,
+      "grad_norm": 0.08293262124061584,
+      "learning_rate": 0.00012270433052522073,
+      "loss": 1.5963,
+      "step": 876
+    },
+    {
+      "epoch": 0.8840725806451613,
+      "grad_norm": 0.09230700880289078,
+      "learning_rate": 0.0001225461026615789,
+      "loss": 1.6242,
+      "step": 877
+    },
+    {
+      "epoch": 0.8850806451612904,
+      "grad_norm": 0.08799263834953308,
+      "learning_rate": 0.00012238781530598896,
+      "loss": 1.5607,
+      "step": 878
+    },
+    {
+      "epoch": 0.8860887096774194,
+      "grad_norm": 0.08640427887439728,
+      "learning_rate": 0.00012222946887612056,
+      "loss": 1.6114,
+      "step": 879
+    },
+    {
+      "epoch": 0.8870967741935484,
+      "grad_norm": 0.08553026616573334,
+      "learning_rate": 0.0001220710637897992,
+      "loss": 1.5549,
+      "step": 880
+    },
+    {
+      "epoch": 0.8881048387096774,
+      "grad_norm": 0.0878986194729805,
+      "learning_rate": 0.00012191260046500525,
+      "loss": 1.5697,
+      "step": 881
+    },
+    {
+      "epoch": 0.8891129032258065,
+      "grad_norm": 0.08509572595357895,
+      "learning_rate": 0.00012175407931987273,
+      "loss": 1.6237,
+      "step": 882
+    },
+    {
+      "epoch": 0.8901209677419355,
+      "grad_norm": 0.09629905223846436,
+      "learning_rate": 0.0001215955007726881,
+      "loss": 1.5869,
+      "step": 883
+    },
+    {
+      "epoch": 0.8911290322580645,
+      "grad_norm": 0.07942201942205429,
+      "learning_rate": 0.00012143686524188954,
+      "loss": 1.5933,
+      "step": 884
+    },
+    {
+      "epoch": 0.8921370967741935,
+      "grad_norm": 0.0878920629620552,
+      "learning_rate": 0.00012127817314606526,
+      "loss": 1.5485,
+      "step": 885
+    },
+    {
+      "epoch": 0.8931451612903226,
+      "grad_norm": 0.07961869984865189,
+      "learning_rate": 0.00012111942490395305,
+      "loss": 1.571,
+      "step": 886
+    },
+    {
+      "epoch": 0.8941532258064516,
+      "grad_norm": 0.08690143376588821,
+      "learning_rate": 0.00012096062093443863,
+      "loss": 1.5437,
+      "step": 887
+    },
+    {
+      "epoch": 0.8951612903225806,
+      "grad_norm": 0.08331328630447388,
+      "learning_rate": 0.00012080176165655488,
+      "loss": 1.5967,
+      "step": 888
+    },
+    {
+      "epoch": 0.8961693548387096,
+      "grad_norm": 0.08849766850471497,
+      "learning_rate": 0.00012064284748948053,
+      "loss": 1.6156,
+      "step": 889
+    },
+    {
+      "epoch": 0.8971774193548387,
+      "grad_norm": 0.08413555473089218,
+      "learning_rate": 0.00012048387885253925,
+      "loss": 1.5603,
+      "step": 890
+    },
+    {
+      "epoch": 0.8981854838709677,
+      "grad_norm": 0.08616600930690765,
+      "learning_rate": 0.0001203248561651984,
+      "loss": 1.5682,
+      "step": 891
+    },
+    {
+      "epoch": 0.8991935483870968,
+      "grad_norm": 0.08520584553480148,
+      "learning_rate": 0.00012016577984706792,
+      "loss": 1.6327,
+      "step": 892
+    },
+    {
+      "epoch": 0.9002016129032258,
+      "grad_norm": 0.08620157837867737,
+      "learning_rate": 0.0001200066503178993,
+      "loss": 1.6143,
+      "step": 893
+    },
+    {
+      "epoch": 0.9012096774193549,
+      "grad_norm": 0.07895144820213318,
+      "learning_rate": 0.00011984746799758442,
+      "loss": 1.5533,
+      "step": 894
+    },
+    {
+      "epoch": 0.9022177419354839,
+      "grad_norm": 0.08743470162153244,
+      "learning_rate": 0.0001196882333061545,
+      "loss": 1.6004,
+      "step": 895
+    },
+    {
+      "epoch": 0.9032258064516129,
+      "grad_norm": 0.08172673732042313,
+      "learning_rate": 0.0001195289466637789,
+      "loss": 1.6032,
+      "step": 896
+    },
+    {
+      "epoch": 0.9042338709677419,
+      "grad_norm": 0.09668843448162079,
+      "learning_rate": 0.00011936960849076411,
+      "loss": 1.6198,
+      "step": 897
+    },
+    {
+      "epoch": 0.905241935483871,
+      "grad_norm": 0.08503922075033188,
+      "learning_rate": 0.00011921021920755253,
+      "loss": 1.5638,
+      "step": 898
+    },
+    {
+      "epoch": 0.90625,
+      "grad_norm": 0.0889093279838562,
+      "learning_rate": 0.00011905077923472146,
+      "loss": 1.624,
+      "step": 899
+    },
+    {
+      "epoch": 0.907258064516129,
+      "grad_norm": 0.08409906178712845,
+      "learning_rate": 0.00011889128899298198,
+      "loss": 1.5562,
+      "step": 900
+    },
+    {
+      "epoch": 0.9082661290322581,
+      "grad_norm": 0.08293265849351883,
+      "learning_rate": 0.00011873174890317775,
+      "loss": 1.5709,
+      "step": 901
+    },
+    {
+      "epoch": 0.9092741935483871,
+      "grad_norm": 0.09479732066392899,
+      "learning_rate": 0.00011857215938628403,
+      "loss": 1.6222,
+      "step": 902
+    },
+    {
+      "epoch": 0.9102822580645161,
+      "grad_norm": 0.08044169843196869,
+      "learning_rate": 0.00011841252086340649,
+      "loss": 1.5862,
+      "step": 903
+    },
+    {
+      "epoch": 0.9112903225806451,
+      "grad_norm": 0.08543860912322998,
+      "learning_rate": 0.00011825283375578005,
+      "loss": 1.5764,
+      "step": 904
+    },
+    {
+      "epoch": 0.9122983870967742,
+      "grad_norm": 0.08160272240638733,
+      "learning_rate": 0.0001180930984847679,
+      "loss": 1.5204,
+      "step": 905
+    },
+    {
+      "epoch": 0.9133064516129032,
+      "grad_norm": 0.10486453771591187,
+      "learning_rate": 0.00011793331547186026,
+      "loss": 1.5921,
+      "step": 906
+    },
+    {
+      "epoch": 0.9143145161290323,
+      "grad_norm": 0.0780840739607811,
+      "learning_rate": 0.00011777348513867341,
+      "loss": 1.5173,
+      "step": 907
+    },
+    {
+      "epoch": 0.9153225806451613,
+      "grad_norm": 0.08347219228744507,
+      "learning_rate": 0.00011761360790694837,
+      "loss": 1.5543,
+      "step": 908
+    },
+    {
+      "epoch": 0.9163306451612904,
+      "grad_norm": 0.09629109501838684,
+      "learning_rate": 0.00011745368419855005,
+      "loss": 1.6039,
+      "step": 909
+    },
+    {
+      "epoch": 0.9173387096774194,
+      "grad_norm": 0.08534412831068039,
+      "learning_rate": 0.00011729371443546587,
+      "loss": 1.5787,
+      "step": 910
+    },
+    {
+      "epoch": 0.9183467741935484,
+      "grad_norm": 0.08703077584505081,
+      "learning_rate": 0.00011713369903980485,
+      "loss": 1.6218,
+      "step": 911
+    },
+    {
+      "epoch": 0.9193548387096774,
+      "grad_norm": 0.08057136088609695,
+      "learning_rate": 0.00011697363843379641,
+      "loss": 1.5475,
+      "step": 912
+    },
+    {
+      "epoch": 0.9203629032258065,
+      "grad_norm": 0.09287240356206894,
+      "learning_rate": 0.00011681353303978924,
+      "loss": 1.5587,
+      "step": 913
+    },
+    {
+      "epoch": 0.9213709677419355,
+      "grad_norm": 0.08380912989377975,
+      "learning_rate": 0.00011665338328025027,
+      "loss": 1.6194,
+      "step": 914
+    },
+    {
+      "epoch": 0.9223790322580645,
+      "grad_norm": 0.08018894493579865,
+      "learning_rate": 0.00011649318957776336,
+      "loss": 1.545,
+      "step": 915
+    },
+    {
+      "epoch": 0.9233870967741935,
+      "grad_norm": 0.07932014018297195,
+      "learning_rate": 0.00011633295235502851,
+      "loss": 1.5688,
+      "step": 916
+    },
+    {
+      "epoch": 0.9243951612903226,
+      "grad_norm": 0.08409032970666885,
+      "learning_rate": 0.0001161726720348604,
+      "loss": 1.5354,
+      "step": 917
+    },
+    {
+      "epoch": 0.9254032258064516,
+      "grad_norm": 0.07981358468532562,
+      "learning_rate": 0.00011601234904018751,
+      "loss": 1.5604,
+      "step": 918
+    },
+    {
+      "epoch": 0.9264112903225806,
+      "grad_norm": 0.0860762745141983,
+      "learning_rate": 0.00011585198379405092,
+      "loss": 1.5857,
+      "step": 919
+    },
+    {
+      "epoch": 0.9274193548387096,
+      "grad_norm": 0.09491165727376938,
+      "learning_rate": 0.00011569157671960316,
+      "loss": 1.5479,
+      "step": 920
+    },
+    {
+      "epoch": 0.9284274193548387,
+      "grad_norm": 0.08277281373739243,
+      "learning_rate": 0.00011553112824010716,
+      "loss": 1.5773,
+      "step": 921
+    },
+    {
+      "epoch": 0.9294354838709677,
+      "grad_norm": 0.08350729942321777,
+      "learning_rate": 0.00011537063877893513,
+      "loss": 1.5508,
+      "step": 922
+    },
+    {
+      "epoch": 0.9304435483870968,
+      "grad_norm": 0.08306790888309479,
+      "learning_rate": 0.00011521010875956734,
+      "loss": 1.5807,
+      "step": 923
+    },
+    {
+      "epoch": 0.9314516129032258,
+      "grad_norm": 0.07756998389959335,
+      "learning_rate": 0.00011504953860559116,
+      "loss": 1.546,
+      "step": 924
+    },
+    {
+      "epoch": 0.9324596774193549,
+      "grad_norm": 0.08689188212156296,
+      "learning_rate": 0.00011488892874069981,
+      "loss": 1.5929,
+      "step": 925
+    },
+    {
+      "epoch": 0.9334677419354839,
+      "grad_norm": 0.08053242415189743,
+      "learning_rate": 0.00011472827958869133,
+      "loss": 1.5578,
+      "step": 926
+    },
+    {
+      "epoch": 0.9344758064516129,
+      "grad_norm": 0.08326185494661331,
+      "learning_rate": 0.0001145675915734674,
+      "loss": 1.544,
+      "step": 927
+    },
+    {
+      "epoch": 0.9354838709677419,
+      "grad_norm": 0.08363624662160873,
+      "learning_rate": 0.00011440686511903223,
+      "loss": 1.5564,
+      "step": 928
+    },
+    {
+      "epoch": 0.936491935483871,
+      "grad_norm": 0.08229418098926544,
+      "learning_rate": 0.00011424610064949153,
+      "loss": 1.5542,
+      "step": 929
+    },
+    {
+      "epoch": 0.9375,
+      "grad_norm": 0.09776529669761658,
+      "learning_rate": 0.00011408529858905126,
+      "loss": 1.5353,
+      "step": 930
+    },
+    {
+      "epoch": 0.938508064516129,
+      "grad_norm": 0.09616075456142426,
+      "learning_rate": 0.0001139244593620166,
+      "loss": 1.6193,
+      "step": 931
+    },
+    {
+      "epoch": 0.9395161290322581,
+      "grad_norm": 0.1000729650259018,
+      "learning_rate": 0.00011376358339279076,
+      "loss": 1.633,
+      "step": 932
+    },
+    {
+      "epoch": 0.9405241935483871,
+      "grad_norm": 0.08457247912883759,
+      "learning_rate": 0.00011360267110587393,
+      "loss": 1.5798,
+      "step": 933
+    },
+    {
+      "epoch": 0.9415322580645161,
+      "grad_norm": 0.07730599492788315,
+      "learning_rate": 0.00011344172292586217,
+      "loss": 1.5163,
+      "step": 934
+    },
+    {
+      "epoch": 0.9425403225806451,
+      "grad_norm": 0.09660627692937851,
+      "learning_rate": 0.00011328073927744616,
+      "loss": 1.6322,
+      "step": 935
+    },
+    {
+      "epoch": 0.9435483870967742,
+      "grad_norm": 0.08001486957073212,
+      "learning_rate": 0.00011311972058541023,
+      "loss": 1.5755,
+      "step": 936
+    },
+    {
+      "epoch": 0.9445564516129032,
+      "grad_norm": 0.08265230059623718,
+      "learning_rate": 0.0001129586672746312,
+      "loss": 1.61,
+      "step": 937
+    },
+    {
+      "epoch": 0.9455645161290323,
+      "grad_norm": 0.09588516503572464,
+      "learning_rate": 0.00011279757977007717,
+      "loss": 1.6023,
+      "step": 938
+    },
+    {
+      "epoch": 0.9465725806451613,
+      "grad_norm": 0.0791090875864029,
+      "learning_rate": 0.0001126364584968065,
+      "loss": 1.5158,
+      "step": 939
+    },
+    {
+      "epoch": 0.9475806451612904,
+      "grad_norm": 0.09306017309427261,
+      "learning_rate": 0.00011247530387996668,
+      "loss": 1.5724,
+      "step": 940
+    },
+    {
+      "epoch": 0.9485887096774194,
+      "grad_norm": 0.08578615635633469,
+      "learning_rate": 0.00011231411634479316,
+      "loss": 1.5692,
+      "step": 941
+    },
+    {
+      "epoch": 0.9495967741935484,
+      "grad_norm": 0.0851496234536171,
+      "learning_rate": 0.00011215289631660823,
+      "loss": 1.5677,
+      "step": 942
+    },
+    {
+      "epoch": 0.9506048387096774,
+      "grad_norm": 0.08048581331968307,
+      "learning_rate": 0.00011199164422081995,
+      "loss": 1.5537,
+      "step": 943
+    },
+    {
+      "epoch": 0.9516129032258065,
+      "grad_norm": 0.08106819540262222,
+      "learning_rate": 0.000111830360482921,
+      "loss": 1.5296,
+      "step": 944
+    },
+    {
+      "epoch": 0.9526209677419355,
+      "grad_norm": 0.07831558585166931,
+      "learning_rate": 0.00011166904552848749,
+      "loss": 1.5503,
+      "step": 945
+    },
+    {
+      "epoch": 0.9536290322580645,
+      "grad_norm": 0.07931654155254364,
+      "learning_rate": 0.000111507699783178,
+      "loss": 1.5592,
+      "step": 946
+    },
+    {
+      "epoch": 0.9546370967741935,
+      "grad_norm": 0.07992593944072723,
+      "learning_rate": 0.0001113463236727323,
+      "loss": 1.5671,
+      "step": 947
+    },
+    {
+      "epoch": 0.9556451612903226,
+      "grad_norm": 0.08474520593881607,
+      "learning_rate": 0.00011118491762297027,
+      "loss": 1.5699,
+      "step": 948
+    },
+    {
+      "epoch": 0.9566532258064516,
+      "grad_norm": 0.08235491812229156,
+      "learning_rate": 0.0001110234820597908,
+      "loss": 1.5671,
+      "step": 949
+    },
+    {
+      "epoch": 0.9576612903225806,
+      "grad_norm": 0.09822028130292892,
+      "learning_rate": 0.00011086201740917075,
+      "loss": 1.6389,
+      "step": 950
+    },
+    {
+      "epoch": 0.9586693548387096,
+      "grad_norm": 0.08909379690885544,
+      "learning_rate": 0.00011070052409716354,
+      "loss": 1.6273,
+      "step": 951
+    },
+    {
+      "epoch": 0.9596774193548387,
+      "grad_norm": 0.08938673883676529,
+      "learning_rate": 0.00011053900254989837,
+      "loss": 1.5894,
+      "step": 952
+    },
+    {
+      "epoch": 0.9606854838709677,
+      "grad_norm": 0.08622390776872635,
+      "learning_rate": 0.00011037745319357893,
+      "loss": 1.6217,
+      "step": 953
+    },
+    {
+      "epoch": 0.9616935483870968,
+      "grad_norm": 0.08985532820224762,
+      "learning_rate": 0.00011021587645448222,
+      "loss": 1.6432,
+      "step": 954
+    },
+    {
+      "epoch": 0.9627016129032258,
+      "grad_norm": 0.08598313480615616,
+      "learning_rate": 0.00011005427275895756,
+      "loss": 1.54,
+      "step": 955
+    },
+    {
+      "epoch": 0.9637096774193549,
+      "grad_norm": 0.0815306007862091,
+      "learning_rate": 0.00010989264253342538,
+      "loss": 1.5172,
+      "step": 956
+    },
+    {
+      "epoch": 0.9647177419354839,
+      "grad_norm": 0.09671612083911896,
+      "learning_rate": 0.00010973098620437609,
+      "loss": 1.6054,
+      "step": 957
+    },
+    {
+      "epoch": 0.9657258064516129,
+      "grad_norm": 0.0809609442949295,
+      "learning_rate": 0.00010956930419836899,
+      "loss": 1.528,
+      "step": 958
+    },
+    {
+      "epoch": 0.9667338709677419,
+      "grad_norm": 0.08456597477197647,
+      "learning_rate": 0.0001094075969420312,
+      "loss": 1.5383,
+      "step": 959
+    },
+    {
+      "epoch": 0.967741935483871,
+      "grad_norm": 0.09076231718063354,
+      "learning_rate": 0.00010924586486205632,
+      "loss": 1.5948,
+      "step": 960
+    },
+    {
+      "epoch": 0.96875,
+      "grad_norm": 0.08709228038787842,
+      "learning_rate": 0.00010908410838520362,
+      "loss": 1.5425,
+      "step": 961
+    },
+    {
+      "epoch": 0.969758064516129,
+      "grad_norm": 0.09060946106910706,
+      "learning_rate": 0.00010892232793829659,
+      "loss": 1.57,
+      "step": 962
+    },
+    {
+      "epoch": 0.9707661290322581,
+      "grad_norm": 0.0881752297282219,
+      "learning_rate": 0.0001087605239482221,
+      "loss": 1.5874,
+      "step": 963
+    },
+    {
+      "epoch": 0.9717741935483871,
+      "grad_norm": 0.086030974984169,
+      "learning_rate": 0.00010859869684192907,
+      "loss": 1.5792,
+      "step": 964
+    },
+    {
+      "epoch": 0.9727822580645161,
+      "grad_norm": 0.0817110538482666,
+      "learning_rate": 0.00010843684704642744,
+      "loss": 1.5506,
+      "step": 965
+    },
+    {
+      "epoch": 0.9737903225806451,
+      "grad_norm": 0.08721321821212769,
+      "learning_rate": 0.00010827497498878703,
+      "loss": 1.5907,
+      "step": 966
+    },
+    {
+      "epoch": 0.9747983870967742,
+      "grad_norm": 0.07887570559978485,
+      "learning_rate": 0.00010811308109613634,
+      "loss": 1.578,
+      "step": 967
+    },
+    {
+      "epoch": 0.9758064516129032,
+      "grad_norm": 0.11064060032367706,
+      "learning_rate": 0.00010795116579566158,
+      "loss": 1.6,
+      "step": 968
+    },
+    {
+      "epoch": 0.9768145161290323,
+      "grad_norm": 0.08672841638326645,
+      "learning_rate": 0.00010778922951460537,
+      "loss": 1.5872,
+      "step": 969
+    },
+    {
+      "epoch": 0.9778225806451613,
+      "grad_norm": 0.08424878865480423,
+      "learning_rate": 0.00010762727268026571,
+      "loss": 1.5698,
+      "step": 970
+    },
+    {
+      "epoch": 0.9788306451612904,
+      "grad_norm": 0.08876322209835052,
+      "learning_rate": 0.00010746529571999491,
+      "loss": 1.5775,
+      "step": 971
+    },
+    {
+      "epoch": 0.9798387096774194,
+      "grad_norm": 0.08440111577510834,
+      "learning_rate": 0.00010730329906119822,
+      "loss": 1.5574,
+      "step": 972
+    },
+    {
+      "epoch": 0.9808467741935484,
+      "grad_norm": 0.08397315442562103,
+      "learning_rate": 0.00010714128313133307,
+      "loss": 1.6166,
+      "step": 973
+    },
+    {
+      "epoch": 0.9818548387096774,
+      "grad_norm": 0.09894799441099167,
+      "learning_rate": 0.00010697924835790758,
+      "loss": 1.6352,
+      "step": 974
+    },
+    {
+      "epoch": 0.9828629032258065,
+      "grad_norm": 0.08329147845506668,
+      "learning_rate": 0.00010681719516847968,
+      "loss": 1.555,
+      "step": 975
+    },
+    {
+      "epoch": 0.9838709677419355,
+      "grad_norm": 0.08748366683721542,
+      "learning_rate": 0.00010665512399065582,
+      "loss": 1.5519,
+      "step": 976
+    },
+    {
+      "epoch": 0.9848790322580645,
+      "grad_norm": 0.08558699488639832,
+      "learning_rate": 0.00010649303525209005,
+      "loss": 1.5762,
+      "step": 977
+    },
+    {
+      "epoch": 0.9858870967741935,
+      "grad_norm": 0.11034592986106873,
+      "learning_rate": 0.00010633092938048257,
+      "loss": 1.5972,
+      "step": 978
+    },
+    {
+      "epoch": 0.9868951612903226,
+      "grad_norm": 0.08514732867479324,
+      "learning_rate": 0.00010616880680357892,
+      "loss": 1.5625,
+      "step": 979
+    },
+    {
+      "epoch": 0.9879032258064516,
+      "grad_norm": 0.09123446047306061,
+      "learning_rate": 0.00010600666794916871,
+      "loss": 1.5516,
+      "step": 980
+    },
+    {
+      "epoch": 0.9889112903225806,
+      "grad_norm": 0.08317586034536362,
+      "learning_rate": 0.00010584451324508444,
+      "loss": 1.6043,
+      "step": 981
+    },
+    {
+      "epoch": 0.9899193548387096,
+      "grad_norm": 0.09369304031133652,
+      "learning_rate": 0.00010568234311920051,
+      "loss": 1.5575,
+      "step": 982
+    },
+    {
+      "epoch": 0.9909274193548387,
+      "grad_norm": 0.08730312436819077,
+      "learning_rate": 0.00010552015799943193,
+      "loss": 1.5848,
+      "step": 983
+    },
+    {
+      "epoch": 0.9919354838709677,
+      "grad_norm": 0.08520778268575668,
+      "learning_rate": 0.00010535795831373337,
+      "loss": 1.5697,
+      "step": 984
+    },
+    {
+      "epoch": 0.9929435483870968,
+      "grad_norm": 0.08985403925180435,
+      "learning_rate": 0.00010519574449009784,
+      "loss": 1.546,
+      "step": 985
+    },
+    {
+      "epoch": 0.9939516129032258,
+      "grad_norm": 0.08371421694755554,
+      "learning_rate": 0.0001050335169565557,
+      "loss": 1.5724,
+      "step": 986
+    },
+    {
+      "epoch": 0.9949596774193549,
+      "grad_norm": 0.08613915741443634,
+      "learning_rate": 0.00010487127614117352,
+      "loss": 1.5974,
+      "step": 987
+    },
+    {
+      "epoch": 0.9959677419354839,
+      "grad_norm": 0.09249399602413177,
+      "learning_rate": 0.00010470902247205283,
+      "loss": 1.6173,
+      "step": 988
+    },
+    {
+      "epoch": 0.9969758064516129,
+      "grad_norm": 0.09915943443775177,
+      "learning_rate": 0.00010454675637732916,
+      "loss": 1.5947,
+      "step": 989
+    },
+    {
+      "epoch": 0.9979838709677419,
+      "grad_norm": 0.0846395492553711,
+      "learning_rate": 0.00010438447828517077,
+      "loss": 1.5243,
+      "step": 990
+    },
+    {
+      "epoch": 0.998991935483871,
+      "grad_norm": 0.08313705772161484,
+      "learning_rate": 0.00010422218862377764,
+      "loss": 1.5333,
+      "step": 991
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.08256080746650696,
+      "learning_rate": 0.00010405988782138019,
+      "loss": 1.5527,
+      "step": 992
+    },
+    {
+      "epoch": 1.001008064516129,
+      "grad_norm": 0.09215422719717026,
+      "learning_rate": 0.00010389757630623831,
+      "loss": 1.5035,
+      "step": 993
+    },
+    {
+      "epoch": 1.002016129032258,
+      "grad_norm": 0.08784796297550201,
+      "learning_rate": 0.00010373525450664016,
+      "loss": 1.5397,
+      "step": 994
+    },
+    {
+      "epoch": 1.003024193548387,
+      "grad_norm": 0.08578605949878693,
+      "learning_rate": 0.000103572922850901,
+      "loss": 1.5449,
+      "step": 995
+    },
+    {
+      "epoch": 1.0040322580645162,
+      "grad_norm": 0.09281399846076965,
+      "learning_rate": 0.00010341058176736207,
+      "loss": 1.4507,
+      "step": 996
+    },
+    {
+      "epoch": 1.0050403225806452,
+      "grad_norm": 0.09404852986335754,
+      "learning_rate": 0.00010324823168438953,
+      "loss": 1.4817,
+      "step": 997
+    },
+    {
+      "epoch": 1.0060483870967742,
+      "grad_norm": 0.0944603756070137,
+      "learning_rate": 0.00010308587303037334,
+      "loss": 1.536,
+      "step": 998
+    },
+    {
+      "epoch": 1.0070564516129032,
+      "grad_norm": 0.11103025823831558,
+      "learning_rate": 0.00010292350623372598,
+      "loss": 1.5278,
+      "step": 999
+    },
+    {
+      "epoch": 1.0080645161290323,
+      "grad_norm": 0.0859605222940445,
+      "learning_rate": 0.00010276113172288144,
+      "loss": 1.4855,
+      "step": 1000
+    },
+    {
+      "epoch": 1.0090725806451613,
+      "grad_norm": 0.08268768340349197,
+      "learning_rate": 0.0001025987499262941,
+      "loss": 1.4975,
+      "step": 1001
+    },
+    {
+      "epoch": 1.0100806451612903,
+      "grad_norm": 0.09450601041316986,
+      "learning_rate": 0.00010243636127243754,
+      "loss": 1.5052,
+      "step": 1002
+    },
+    {
+      "epoch": 1.0110887096774193,
+      "grad_norm": 0.10182943195104599,
+      "learning_rate": 0.00010227396618980344,
+      "loss": 1.5889,
+      "step": 1003
+    },
+    {
+      "epoch": 1.0120967741935485,
+      "grad_norm": 0.10887010395526886,
+      "learning_rate": 0.00010211156510690043,
+      "loss": 1.5387,
+      "step": 1004
+    },
+    {
+      "epoch": 1.0131048387096775,
+      "grad_norm": 0.09432150423526764,
+      "learning_rate": 0.00010194915845225304,
+      "loss": 1.51,
+      "step": 1005
+    },
+    {
+      "epoch": 1.0141129032258065,
+      "grad_norm": 0.0892212763428688,
+      "learning_rate": 0.00010178674665440034,
+      "loss": 1.4975,
+      "step": 1006
+    },
+    {
+      "epoch": 1.0151209677419355,
+      "grad_norm": 0.08749305456876755,
+      "learning_rate": 0.00010162433014189519,
+      "loss": 1.5303,
+      "step": 1007
+    },
+    {
+      "epoch": 1.0161290322580645,
+      "grad_norm": 0.09416648000478745,
+      "learning_rate": 0.00010146190934330268,
+      "loss": 1.499,
+      "step": 1008
+    },
+    {
+      "epoch": 1.0171370967741935,
+      "grad_norm": 0.10288472473621368,
+      "learning_rate": 0.00010129948468719939,
+      "loss": 1.4785,
+      "step": 1009
+    },
+    {
+      "epoch": 1.0181451612903225,
+      "grad_norm": 0.08718498051166534,
+      "learning_rate": 0.00010113705660217197,
+      "loss": 1.5045,
+      "step": 1010
+    },
+    {
+      "epoch": 1.0191532258064515,
+      "grad_norm": 0.08473226428031921,
+      "learning_rate": 0.00010097462551681612,
+      "loss": 1.4799,
+      "step": 1011
+    },
+    {
+      "epoch": 1.0201612903225807,
+      "grad_norm": 0.09531670063734055,
+      "learning_rate": 0.00010081219185973552,
+      "loss": 1.545,
+      "step": 1012
+    },
+    {
+      "epoch": 1.0211693548387097,
+      "grad_norm": 0.08223138749599457,
+      "learning_rate": 0.00010064975605954054,
+      "loss": 1.4807,
+      "step": 1013
+    },
+    {
+      "epoch": 1.0221774193548387,
+      "grad_norm": 0.08815553784370422,
+      "learning_rate": 0.00010048731854484735,
+      "loss": 1.47,
+      "step": 1014
+    },
+    {
+      "epoch": 1.0231854838709677,
+      "grad_norm": 0.09323311597108841,
+      "learning_rate": 0.00010032487974427645,
+      "loss": 1.5823,
+      "step": 1015
+    },
+    {
+      "epoch": 1.0241935483870968,
+      "grad_norm": 0.1007145345211029,
+      "learning_rate": 0.00010016244008645195,
+      "loss": 1.4864,
+      "step": 1016
+    },
+    {
+      "epoch": 1.0252016129032258,
+      "grad_norm": 0.09309312701225281,
+      "learning_rate": 0.0001,
+      "loss": 1.5118,
+      "step": 1017
+    },
+    {
+      "epoch": 1.0262096774193548,
+      "grad_norm": 0.08557573705911636,
+      "learning_rate": 9.983755991354809e-05,
+      "loss": 1.5165,
+      "step": 1018
+    },
+    {
+      "epoch": 1.0272177419354838,
+      "grad_norm": 0.10075996816158295,
+      "learning_rate": 9.967512025572356e-05,
+      "loss": 1.5106,
+      "step": 1019
+    },
+    {
+      "epoch": 1.028225806451613,
+      "grad_norm": 0.08483249694108963,
+      "learning_rate": 9.951268145515269e-05,
+      "loss": 1.4974,
+      "step": 1020
+    },
+    {
+      "epoch": 1.029233870967742,
+      "grad_norm": 0.11874374747276306,
+      "learning_rate": 9.935024394045948e-05,
+      "loss": 1.5622,
+      "step": 1021
+    },
+    {
+      "epoch": 1.030241935483871,
+      "grad_norm": 0.11608150601387024,
+      "learning_rate": 9.918780814026452e-05,
+      "loss": 1.5636,
+      "step": 1022
+    },
+    {
+      "epoch": 1.03125,
+      "grad_norm": 0.11097010225057602,
+      "learning_rate": 9.90253744831839e-05,
+      "loss": 1.5388,
+      "step": 1023
+    },
+    {
+      "epoch": 1.032258064516129,
+      "grad_norm": 0.08821584284305573,
+      "learning_rate": 9.886294339782805e-05,
+      "loss": 1.4808,
+      "step": 1024
+    },
+    {
+      "epoch": 1.033266129032258,
+      "grad_norm": 0.08906351774930954,
+      "learning_rate": 9.870051531280064e-05,
+      "loss": 1.4567,
+      "step": 1025
+    },
+    {
+      "epoch": 1.034274193548387,
+      "grad_norm": 0.08993887901306152,
+      "learning_rate": 9.853809065669733e-05,
+      "loss": 1.5174,
+      "step": 1026
+    },
+    {
+      "epoch": 1.0352822580645162,
+      "grad_norm": 0.0829705148935318,
+      "learning_rate": 9.837566985810484e-05,
+      "loss": 1.5275,
+      "step": 1027
+    },
+    {
+      "epoch": 1.0362903225806452,
+      "grad_norm": 0.09338941425085068,
+      "learning_rate": 9.821325334559967e-05,
+      "loss": 1.5197,
+      "step": 1028
+    },
+    {
+      "epoch": 1.0372983870967742,
+      "grad_norm": 0.0843081921339035,
+      "learning_rate": 9.8050841547747e-05,
+      "loss": 1.5121,
+      "step": 1029
+    },
+    {
+      "epoch": 1.0383064516129032,
+      "grad_norm": 0.09108688682317734,
+      "learning_rate": 9.78884348930996e-05,
+      "loss": 1.5642,
+      "step": 1030
+    },
+    {
+      "epoch": 1.0393145161290323,
+      "grad_norm": 0.08404973894357681,
+      "learning_rate": 9.772603381019658e-05,
+      "loss": 1.4552,
+      "step": 1031
+    },
+    {
+      "epoch": 1.0403225806451613,
+      "grad_norm": 0.08852069824934006,
+      "learning_rate": 9.756363872756249e-05,
+      "loss": 1.5511,
+      "step": 1032
+    },
+    {
+      "epoch": 1.0413306451612903,
+      "grad_norm": 0.08855357021093369,
+      "learning_rate": 9.740125007370592e-05,
+      "loss": 1.5341,
+      "step": 1033
+    },
+    {
+      "epoch": 1.0423387096774193,
+      "grad_norm": 0.08306348323822021,
+      "learning_rate": 9.723886827711857e-05,
+      "loss": 1.4941,
+      "step": 1034
+    },
+    {
+      "epoch": 1.0433467741935485,
+      "grad_norm": 0.11460579931735992,
+      "learning_rate": 9.707649376627406e-05,
+      "loss": 1.541,
+      "step": 1035
+    },
+    {
+      "epoch": 1.0443548387096775,
+      "grad_norm": 0.0861547440290451,
+      "learning_rate": 9.691412696962667e-05,
+      "loss": 1.5364,
+      "step": 1036
+    },
+    {
+      "epoch": 1.0453629032258065,
+      "grad_norm": 0.092412069439888,
+      "learning_rate": 9.675176831561048e-05,
+      "loss": 1.5179,
+      "step": 1037
+    },
+    {
+      "epoch": 1.0463709677419355,
+      "grad_norm": 0.08788943290710449,
+      "learning_rate": 9.658941823263797e-05,
+      "loss": 1.4936,
+      "step": 1038
+    },
+    {
+      "epoch": 1.0473790322580645,
+      "grad_norm": 0.08519960939884186,
+      "learning_rate": 9.642707714909904e-05,
+      "loss": 1.539,
+      "step": 1039
+    },
+    {
+      "epoch": 1.0483870967741935,
+      "grad_norm": 0.08832072466611862,
+      "learning_rate": 9.626474549335986e-05,
+      "loss": 1.5077,
+      "step": 1040
+    },
+    {
+      "epoch": 1.0493951612903225,
+      "grad_norm": 0.10505390167236328,
+      "learning_rate": 9.61024236937617e-05,
+      "loss": 1.5432,
+      "step": 1041
+    },
+    {
+      "epoch": 1.0504032258064515,
+      "grad_norm": 0.09197022020816803,
+      "learning_rate": 9.594011217861982e-05,
+      "loss": 1.5595,
+      "step": 1042
+    },
+    {
+      "epoch": 1.0514112903225807,
+      "grad_norm": 0.0843205377459526,
+      "learning_rate": 9.577781137622238e-05,
+      "loss": 1.4353,
+      "step": 1043
+    },
+    {
+      "epoch": 1.0524193548387097,
+      "grad_norm": 0.10806506127119064,
+      "learning_rate": 9.561552171482925e-05,
+      "loss": 1.515,
+      "step": 1044
+    },
+    {
+      "epoch": 1.0534274193548387,
+      "grad_norm": 0.08592282235622406,
+      "learning_rate": 9.545324362267086e-05,
+      "loss": 1.5279,
+      "step": 1045
+    },
+    {
+      "epoch": 1.0544354838709677,
+      "grad_norm": 0.11082509160041809,
+      "learning_rate": 9.52909775279472e-05,
+      "loss": 1.5395,
+      "step": 1046
+    },
+    {
+      "epoch": 1.0554435483870968,
+      "grad_norm": 0.08529554307460785,
+      "learning_rate": 9.51287238588265e-05,
+      "loss": 1.4849,
+      "step": 1047
+    },
+    {
+      "epoch": 1.0564516129032258,
+      "grad_norm": 0.08765090256929398,
+      "learning_rate": 9.496648304344433e-05,
+      "loss": 1.4944,
+      "step": 1048
+    },
+    {
+      "epoch": 1.0574596774193548,
+      "grad_norm": 0.08893377333879471,
+      "learning_rate": 9.480425550990219e-05,
+      "loss": 1.5,
+      "step": 1049
+    },
+    {
+      "epoch": 1.0584677419354838,
+      "grad_norm": 0.09724058210849762,
+      "learning_rate": 9.464204168626665e-05,
+      "loss": 1.5281,
+      "step": 1050
+    },
+    {
+      "epoch": 1.059475806451613,
+      "grad_norm": 0.0883408635854721,
+      "learning_rate": 9.447984200056808e-05,
+      "loss": 1.5211,
+      "step": 1051
+    },
+    {
+      "epoch": 1.060483870967742,
+      "grad_norm": 0.08431454002857208,
+      "learning_rate": 9.43176568807995e-05,
+      "loss": 1.5175,
+      "step": 1052
+    },
+    {
+      "epoch": 1.061491935483871,
+      "grad_norm": 0.09407296776771545,
+      "learning_rate": 9.415548675491559e-05,
+      "loss": 1.5722,
+      "step": 1053
+    },
+    {
+      "epoch": 1.0625,
+      "grad_norm": 0.08895613998174667,
+      "learning_rate": 9.399333205083131e-05,
+      "loss": 1.5702,
+      "step": 1054
+    },
+    {
+      "epoch": 1.063508064516129,
+      "grad_norm": 0.08799167722463608,
+      "learning_rate": 9.38311931964211e-05,
+      "loss": 1.5531,
+      "step": 1055
+    },
+    {
+      "epoch": 1.064516129032258,
+      "grad_norm": 0.08785036206245422,
+      "learning_rate": 9.366907061951745e-05,
+      "loss": 1.5398,
+      "step": 1056
+    },
+    {
+      "epoch": 1.065524193548387,
+      "grad_norm": 0.10027193278074265,
+      "learning_rate": 9.350696474790999e-05,
+      "loss": 1.5256,
+      "step": 1057
+    },
+    {
+      "epoch": 1.066532258064516,
+      "grad_norm": 0.08771440386772156,
+      "learning_rate": 9.334487600934416e-05,
+      "loss": 1.5085,
+      "step": 1058
+    },
+    {
+      "epoch": 1.0675403225806452,
+      "grad_norm": 0.08703982830047607,
+      "learning_rate": 9.318280483152033e-05,
+      "loss": 1.4827,
+      "step": 1059
+    },
+    {
+      "epoch": 1.0685483870967742,
+      "grad_norm": 0.08651833981275558,
+      "learning_rate": 9.302075164209241e-05,
+      "loss": 1.5255,
+      "step": 1060
+    },
+    {
+      "epoch": 1.0695564516129032,
+      "grad_norm": 0.0868133008480072,
+      "learning_rate": 9.285871686866692e-05,
+      "loss": 1.4953,
+      "step": 1061
+    },
+    {
+      "epoch": 1.0705645161290323,
+      "grad_norm": 0.08549060672521591,
+      "learning_rate": 9.269670093880177e-05,
+      "loss": 1.5239,
+      "step": 1062
+    },
+    {
+      "epoch": 1.0715725806451613,
+      "grad_norm": 0.08664209395647049,
+      "learning_rate": 9.25347042800051e-05,
+      "loss": 1.5328,
+      "step": 1063
+    },
+    {
+      "epoch": 1.0725806451612903,
+      "grad_norm": 0.0853060856461525,
+      "learning_rate": 9.237272731973428e-05,
+      "loss": 1.4854,
+      "step": 1064
+    },
+    {
+      "epoch": 1.0735887096774193,
+      "grad_norm": 0.10764405876398087,
+      "learning_rate": 9.221077048539464e-05,
+      "loss": 1.5174,
+      "step": 1065
+    },
+    {
+      "epoch": 1.0745967741935485,
+      "grad_norm": 0.09327509254217148,
+      "learning_rate": 9.204883420433844e-05,
+      "loss": 1.5074,
+      "step": 1066
+    },
+    {
+      "epoch": 1.0756048387096775,
+      "grad_norm": 0.08912849426269531,
+      "learning_rate": 9.188691890386367e-05,
+      "loss": 1.4915,
+      "step": 1067
+    },
+    {
+      "epoch": 1.0766129032258065,
+      "grad_norm": 0.08654549717903137,
+      "learning_rate": 9.172502501121297e-05,
+      "loss": 1.4998,
+      "step": 1068
+    },
+    {
+      "epoch": 1.0776209677419355,
+      "grad_norm": 0.09039713442325592,
+      "learning_rate": 9.156315295357257e-05,
+      "loss": 1.5139,
+      "step": 1069
+    },
+    {
+      "epoch": 1.0786290322580645,
+      "grad_norm": 0.08438859134912491,
+      "learning_rate": 9.140130315807091e-05,
+      "loss": 1.4935,
+      "step": 1070
+    },
+    {
+      "epoch": 1.0796370967741935,
+      "grad_norm": 0.08553072065114975,
+      "learning_rate": 9.123947605177791e-05,
+      "loss": 1.508,
+      "step": 1071
+    },
+    {
+      "epoch": 1.0806451612903225,
+      "grad_norm": 0.08692750334739685,
+      "learning_rate": 9.107767206170342e-05,
+      "loss": 1.5114,
+      "step": 1072
+    },
+    {
+      "epoch": 1.0816532258064515,
+      "grad_norm": 0.09480643272399902,
+      "learning_rate": 9.09158916147964e-05,
+      "loss": 1.5726,
+      "step": 1073
+    },
+    {
+      "epoch": 1.0826612903225807,
+      "grad_norm": 0.0879359245300293,
+      "learning_rate": 9.075413513794369e-05,
+      "loss": 1.4962,
+      "step": 1074
+    },
+    {
+      "epoch": 1.0836693548387097,
+      "grad_norm": 0.09322493523359299,
+      "learning_rate": 9.059240305796884e-05,
+      "loss": 1.5454,
+      "step": 1075
+    },
+    {
+      "epoch": 1.0846774193548387,
+      "grad_norm": 0.09673374146223068,
+      "learning_rate": 9.043069580163099e-05,
+      "loss": 1.509,
+      "step": 1076
+    },
+    {
+      "epoch": 1.0856854838709677,
+      "grad_norm": 0.08707006275653839,
+      "learning_rate": 9.02690137956239e-05,
+      "loss": 1.5632,
+      "step": 1077
+    },
+    {
+      "epoch": 1.0866935483870968,
+      "grad_norm": 0.08686521649360657,
+      "learning_rate": 9.010735746657462e-05,
+      "loss": 1.4968,
+      "step": 1078
+    },
+    {
+      "epoch": 1.0877016129032258,
+      "grad_norm": 0.08472903817892075,
+      "learning_rate": 8.994572724104242e-05,
+      "loss": 1.4908,
+      "step": 1079
+    },
+    {
+      "epoch": 1.0887096774193548,
+      "grad_norm": 0.09030890464782715,
+      "learning_rate": 8.978412354551779e-05,
+      "loss": 1.5018,
+      "step": 1080
+    },
+    {
+      "epoch": 1.089717741935484,
+      "grad_norm": 0.08417510986328125,
+      "learning_rate": 8.962254680642107e-05,
+      "loss": 1.4444,
+      "step": 1081
+    },
+    {
+      "epoch": 1.090725806451613,
+      "grad_norm": 0.09092919528484344,
+      "learning_rate": 8.946099745010164e-05,
+      "loss": 1.5303,
+      "step": 1082
+    },
+    {
+      "epoch": 1.091733870967742,
+      "grad_norm": 0.09100567549467087,
+      "learning_rate": 8.929947590283647e-05,
+      "loss": 1.5403,
+      "step": 1083
+    },
+    {
+      "epoch": 1.092741935483871,
+      "grad_norm": 0.12923839688301086,
+      "learning_rate": 8.913798259082928e-05,
+      "loss": 1.4664,
+      "step": 1084
+    },
+    {
+      "epoch": 1.09375,
+      "grad_norm": 0.09925505518913269,
+      "learning_rate": 8.897651794020918e-05,
+      "loss": 1.5229,
+      "step": 1085
+    },
+    {
+      "epoch": 1.094758064516129,
+      "grad_norm": 0.08671566098928452,
+      "learning_rate": 8.881508237702973e-05,
+      "loss": 1.4995,
+      "step": 1086
+    },
+    {
+      "epoch": 1.095766129032258,
+      "grad_norm": 0.08649452030658722,
+      "learning_rate": 8.865367632726772e-05,
+      "loss": 1.4993,
+      "step": 1087
+    },
+    {
+      "epoch": 1.096774193548387,
+      "grad_norm": 0.0882314071059227,
+      "learning_rate": 8.849230021682199e-05,
+      "loss": 1.5442,
+      "step": 1088
+    },
+    {
+      "epoch": 1.097782258064516,
+      "grad_norm": 0.088679738342762,
+      "learning_rate": 8.833095447151252e-05,
+      "loss": 1.5014,
+      "step": 1089
+    },
+    {
+      "epoch": 1.0987903225806452,
+      "grad_norm": 0.09637542814016342,
+      "learning_rate": 8.816963951707901e-05,
+      "loss": 1.5503,
+      "step": 1090
+    },
+    {
+      "epoch": 1.0997983870967742,
+      "grad_norm": 0.09071476012468338,
+      "learning_rate": 8.800835577918006e-05,
+      "loss": 1.5016,
+      "step": 1091
+    },
+    {
+      "epoch": 1.1008064516129032,
+      "grad_norm": 0.09719227999448776,
+      "learning_rate": 8.784710368339178e-05,
+      "loss": 1.4767,
+      "step": 1092
+    },
+    {
+      "epoch": 1.1018145161290323,
+      "grad_norm": 0.08729701489210129,
+      "learning_rate": 8.768588365520685e-05,
+      "loss": 1.5011,
+      "step": 1093
+    },
+    {
+      "epoch": 1.1028225806451613,
+      "grad_norm": 0.08893397450447083,
+      "learning_rate": 8.752469612003332e-05,
+      "loss": 1.5368,
+      "step": 1094
+    },
+    {
+      "epoch": 1.1038306451612903,
+      "grad_norm": 0.08354583382606506,
+      "learning_rate": 8.736354150319349e-05,
+      "loss": 1.5199,
+      "step": 1095
+    },
+    {
+      "epoch": 1.1048387096774193,
+      "grad_norm": 0.08970467001199722,
+      "learning_rate": 8.720242022992284e-05,
+      "loss": 1.5328,
+      "step": 1096
+    },
+    {
+      "epoch": 1.1058467741935485,
+      "grad_norm": 0.09049658477306366,
+      "learning_rate": 8.704133272536879e-05,
+      "loss": 1.5323,
+      "step": 1097
+    },
+    {
+      "epoch": 1.1068548387096775,
+      "grad_norm": 0.08495205640792847,
+      "learning_rate": 8.68802794145898e-05,
+      "loss": 1.4833,
+      "step": 1098
+    },
+    {
+      "epoch": 1.1078629032258065,
+      "grad_norm": 0.08763737976551056,
+      "learning_rate": 8.671926072255389e-05,
+      "loss": 1.5314,
+      "step": 1099
+    },
+    {
+      "epoch": 1.1088709677419355,
+      "grad_norm": 0.0835312008857727,
+      "learning_rate": 8.655827707413788e-05,
+      "loss": 1.5162,
+      "step": 1100
+    },
+    {
+      "epoch": 1.1098790322580645,
+      "grad_norm": 0.08878222852945328,
+      "learning_rate": 8.63973288941261e-05,
+      "loss": 1.4885,
+      "step": 1101
+    },
+    {
+      "epoch": 1.1108870967741935,
+      "grad_norm": 0.09213855862617493,
+      "learning_rate": 8.623641660720928e-05,
+      "loss": 1.5398,
+      "step": 1102
+    },
+    {
+      "epoch": 1.1118951612903225,
+      "grad_norm": 0.08432666957378387,
+      "learning_rate": 8.607554063798346e-05,
+      "loss": 1.4907,
+      "step": 1103
+    },
+    {
+      "epoch": 1.1129032258064515,
+      "grad_norm": 0.10029254853725433,
+      "learning_rate": 8.591470141094878e-05,
+      "loss": 1.5904,
+      "step": 1104
+    },
+    {
+      "epoch": 1.1139112903225807,
+      "grad_norm": 0.08696424961090088,
+      "learning_rate": 8.57538993505085e-05,
+      "loss": 1.5079,
+      "step": 1105
+    },
+    {
+      "epoch": 1.1149193548387097,
+      "grad_norm": 0.08842870593070984,
+      "learning_rate": 8.559313488096782e-05,
+      "loss": 1.5223,
+      "step": 1106
+    },
+    {
+      "epoch": 1.1159274193548387,
+      "grad_norm": 0.08505623787641525,
+      "learning_rate": 8.543240842653266e-05,
+      "loss": 1.4939,
+      "step": 1107
+    },
+    {
+      "epoch": 1.1169354838709677,
+      "grad_norm": 0.09814995527267456,
+      "learning_rate": 8.527172041130874e-05,
+      "loss": 1.5732,
+      "step": 1108
+    },
+    {
+      "epoch": 1.1179435483870968,
+      "grad_norm": 0.09438839554786682,
+      "learning_rate": 8.511107125930022e-05,
+      "loss": 1.5903,
+      "step": 1109
+    },
+    {
+      "epoch": 1.1189516129032258,
+      "grad_norm": 0.08910852670669556,
+      "learning_rate": 8.49504613944089e-05,
+      "loss": 1.5203,
+      "step": 1110
+    },
+    {
+      "epoch": 1.1199596774193548,
+      "grad_norm": 0.0924610123038292,
+      "learning_rate": 8.47898912404327e-05,
+      "loss": 1.5302,
+      "step": 1111
+    },
+    {
+      "epoch": 1.120967741935484,
+      "grad_norm": 0.08957453072071075,
+      "learning_rate": 8.462936122106489e-05,
+      "loss": 1.5179,
+      "step": 1112
+    },
+    {
+      "epoch": 1.121975806451613,
+      "grad_norm": 0.1187904104590416,
+      "learning_rate": 8.446887175989286e-05,
+      "loss": 1.5622,
+      "step": 1113
+    },
+    {
+      "epoch": 1.122983870967742,
+      "grad_norm": 0.0907069593667984,
+      "learning_rate": 8.430842328039686e-05,
+      "loss": 1.502,
+      "step": 1114
+    },
+    {
+      "epoch": 1.123991935483871,
+      "grad_norm": 0.09245329350233078,
+      "learning_rate": 8.414801620594912e-05,
+      "loss": 1.476,
+      "step": 1115
+    },
+    {
+      "epoch": 1.125,
+      "grad_norm": 0.10100734978914261,
+      "learning_rate": 8.398765095981251e-05,
+      "loss": 1.5111,
+      "step": 1116
+    },
+    {
+      "epoch": 1.126008064516129,
+      "grad_norm": 0.09156333655118942,
+      "learning_rate": 8.382732796513966e-05,
+      "loss": 1.4985,
+      "step": 1117
+    },
+    {
+      "epoch": 1.127016129032258,
+      "grad_norm": 0.11173349618911743,
+      "learning_rate": 8.366704764497154e-05,
+      "loss": 1.4869,
+      "step": 1118
+    },
+    {
+      "epoch": 1.128024193548387,
+      "grad_norm": 0.08984418958425522,
+      "learning_rate": 8.35068104222367e-05,
+      "loss": 1.52,
+      "step": 1119
+    },
+    {
+      "epoch": 1.129032258064516,
+      "grad_norm": 0.11599362641572952,
+      "learning_rate": 8.33466167197498e-05,
+      "loss": 1.5154,
+      "step": 1120
+    },
+    {
+      "epoch": 1.1300403225806452,
+      "grad_norm": 0.09752003848552704,
+      "learning_rate": 8.318646696021077e-05,
+      "loss": 1.4838,
+      "step": 1121
+    },
+    {
+      "epoch": 1.1310483870967742,
+      "grad_norm": 0.09071122109889984,
+      "learning_rate": 8.302636156620363e-05,
+      "loss": 1.5761,
+      "step": 1122
+    },
+    {
+      "epoch": 1.1320564516129032,
+      "grad_norm": 0.08928891271352768,
+      "learning_rate": 8.286630096019518e-05,
+      "loss": 1.5265,
+      "step": 1123
+    },
+    {
+      "epoch": 1.1330645161290323,
+      "grad_norm": 0.08508775383234024,
+      "learning_rate": 8.270628556453417e-05,
+      "loss": 1.548,
+      "step": 1124
+    },
+    {
+      "epoch": 1.1340725806451613,
+      "grad_norm": 0.08637328445911407,
+      "learning_rate": 8.254631580144999e-05,
+      "loss": 1.4786,
+      "step": 1125
+    },
+    {
+      "epoch": 1.1350806451612903,
+      "grad_norm": 0.08538668602705002,
+      "learning_rate": 8.238639209305166e-05,
+      "loss": 1.4797,
+      "step": 1126
+    },
+    {
+      "epoch": 1.1360887096774193,
+      "grad_norm": 0.08973052352666855,
+      "learning_rate": 8.222651486132664e-05,
+      "loss": 1.5066,
+      "step": 1127
+    },
+    {
+      "epoch": 1.1370967741935485,
+      "grad_norm": 0.08729778975248337,
+      "learning_rate": 8.206668452813978e-05,
+      "loss": 1.4973,
+      "step": 1128
+    },
+    {
+      "epoch": 1.1381048387096775,
+      "grad_norm": 0.08795138448476791,
+      "learning_rate": 8.190690151523215e-05,
+      "loss": 1.4892,
+      "step": 1129
+    },
+    {
+      "epoch": 1.1391129032258065,
+      "grad_norm": 0.08695145696401596,
+      "learning_rate": 8.174716624421997e-05,
+      "loss": 1.5163,
+      "step": 1130
+    },
+    {
+      "epoch": 1.1401209677419355,
+      "grad_norm": 0.08848337084054947,
+      "learning_rate": 8.158747913659355e-05,
+      "loss": 1.4907,
+      "step": 1131
+    },
+    {
+      "epoch": 1.1411290322580645,
+      "grad_norm": 0.08827504515647888,
+      "learning_rate": 8.142784061371598e-05,
+      "loss": 1.5306,
+      "step": 1132
+    },
+    {
+      "epoch": 1.1421370967741935,
+      "grad_norm": 0.09366059303283691,
+      "learning_rate": 8.126825109682228e-05,
+      "loss": 1.4598,
+      "step": 1133
+    },
+    {
+      "epoch": 1.1431451612903225,
+      "grad_norm": 0.09082233905792236,
+      "learning_rate": 8.110871100701807e-05,
+      "loss": 1.5746,
+      "step": 1134
+    },
+    {
+      "epoch": 1.1441532258064515,
+      "grad_norm": 0.10159925371408463,
+      "learning_rate": 8.094922076527859e-05,
+      "loss": 1.5689,
+      "step": 1135
+    },
+    {
+      "epoch": 1.1451612903225807,
+      "grad_norm": 0.10202515870332718,
+      "learning_rate": 8.078978079244752e-05,
+      "loss": 1.5155,
+      "step": 1136
+    },
+    {
+      "epoch": 1.1461693548387097,
+      "grad_norm": 0.0907059907913208,
+      "learning_rate": 8.063039150923595e-05,
+      "loss": 1.5552,
+      "step": 1137
+    },
+    {
+      "epoch": 1.1471774193548387,
+      "grad_norm": 0.08588322252035141,
+      "learning_rate": 8.047105333622112e-05,
+      "loss": 1.5299,
+      "step": 1138
+    },
+    {
+      "epoch": 1.1481854838709677,
+      "grad_norm": 0.08953887969255447,
+      "learning_rate": 8.031176669384552e-05,
+      "loss": 1.5528,
+      "step": 1139
+    },
+    {
+      "epoch": 1.1491935483870968,
+      "grad_norm": 0.08963429927825928,
+      "learning_rate": 8.01525320024156e-05,
+      "loss": 1.4823,
+      "step": 1140
+    },
+    {
+      "epoch": 1.1502016129032258,
+      "grad_norm": 0.09360229223966599,
+      "learning_rate": 7.999334968210073e-05,
+      "loss": 1.5288,
+      "step": 1141
+    },
+    {
+      "epoch": 1.1512096774193548,
+      "grad_norm": 0.09653651714324951,
+      "learning_rate": 7.983422015293212e-05,
+      "loss": 1.502,
+      "step": 1142
+    },
+    {
+      "epoch": 1.152217741935484,
+      "grad_norm": 0.0958021953701973,
+      "learning_rate": 7.967514383480161e-05,
+      "loss": 1.4772,
+      "step": 1143
+    },
+    {
+      "epoch": 1.153225806451613,
+      "grad_norm": 0.0900203064084053,
+      "learning_rate": 7.951612114746076e-05,
+      "loss": 1.5536,
+      "step": 1144
+    },
+    {
+      "epoch": 1.154233870967742,
+      "grad_norm": 0.1079091802239418,
+      "learning_rate": 7.935715251051949e-05,
+      "loss": 1.482,
+      "step": 1145
+    },
+    {
+      "epoch": 1.155241935483871,
+      "grad_norm": 0.09951366484165192,
+      "learning_rate": 7.919823834344516e-05,
+      "loss": 1.4908,
+      "step": 1146
+    },
+    {
+      "epoch": 1.15625,
+      "grad_norm": 0.08866190165281296,
+      "learning_rate": 7.90393790655614e-05,
+      "loss": 1.5027,
+      "step": 1147
+    },
+    {
+      "epoch": 1.157258064516129,
+      "grad_norm": 0.09670446068048477,
+      "learning_rate": 7.888057509604697e-05,
+      "loss": 1.4905,
+      "step": 1148
+    },
+    {
+      "epoch": 1.158266129032258,
+      "grad_norm": 0.0998421311378479,
+      "learning_rate": 7.872182685393475e-05,
+      "loss": 1.5349,
+      "step": 1149
+    },
+    {
+      "epoch": 1.159274193548387,
+      "grad_norm": 0.09023125469684601,
+      "learning_rate": 7.85631347581105e-05,
+      "loss": 1.5502,
+      "step": 1150
+    },
+    {
+      "epoch": 1.160282258064516,
+      "grad_norm": 0.09362298995256424,
+      "learning_rate": 7.84044992273119e-05,
+      "loss": 1.4587,
+      "step": 1151
+    },
+    {
+      "epoch": 1.1612903225806452,
+      "grad_norm": 0.09614353626966476,
+      "learning_rate": 7.82459206801273e-05,
+      "loss": 1.5398,
+      "step": 1152
+    },
+    {
+      "epoch": 1.1622983870967742,
+      "grad_norm": 0.08735020458698273,
+      "learning_rate": 7.808739953499478e-05,
+      "loss": 1.5106,
+      "step": 1153
+    },
+    {
+      "epoch": 1.1633064516129032,
+      "grad_norm": 0.11043401807546616,
+      "learning_rate": 7.792893621020082e-05,
+      "loss": 1.533,
+      "step": 1154
+    },
+    {
+      "epoch": 1.1643145161290323,
+      "grad_norm": 0.11868879944086075,
+      "learning_rate": 7.777053112387949e-05,
+      "loss": 1.5086,
+      "step": 1155
+    },
+    {
+      "epoch": 1.1653225806451613,
+      "grad_norm": 0.08818439394235611,
+      "learning_rate": 7.761218469401108e-05,
+      "loss": 1.5127,
+      "step": 1156
+    },
+    {
+      "epoch": 1.1663306451612903,
+      "grad_norm": 0.1308388113975525,
+      "learning_rate": 7.745389733842112e-05,
+      "loss": 1.4556,
+      "step": 1157
+    },
+    {
+      "epoch": 1.1673387096774193,
+      "grad_norm": 0.09634990245103836,
+      "learning_rate": 7.729566947477928e-05,
+      "loss": 1.5527,
+      "step": 1158
+    },
+    {
+      "epoch": 1.1683467741935485,
+      "grad_norm": 0.11291810870170593,
+      "learning_rate": 7.713750152059826e-05,
+      "loss": 1.5556,
+      "step": 1159
+    },
+    {
+      "epoch": 1.1693548387096775,
+      "grad_norm": 0.10674012452363968,
+      "learning_rate": 7.697939389323267e-05,
+      "loss": 1.4921,
+      "step": 1160
+    },
+    {
+      "epoch": 1.1703629032258065,
+      "grad_norm": 0.09948462247848511,
+      "learning_rate": 7.682134700987789e-05,
+      "loss": 1.5691,
+      "step": 1161
+    },
+    {
+      "epoch": 1.1713709677419355,
+      "grad_norm": 0.09521344304084778,
+      "learning_rate": 7.66633612875691e-05,
+      "loss": 1.475,
+      "step": 1162
+    },
+    {
+      "epoch": 1.1723790322580645,
+      "grad_norm": 0.11034592241048813,
+      "learning_rate": 7.650543714318001e-05,
+      "loss": 1.5353,
+      "step": 1163
+    },
+    {
+      "epoch": 1.1733870967741935,
+      "grad_norm": 0.08763138949871063,
+      "learning_rate": 7.634757499342191e-05,
+      "loss": 1.4952,
+      "step": 1164
+    },
+    {
+      "epoch": 1.1743951612903225,
+      "grad_norm": 0.09569991379976273,
+      "learning_rate": 7.61897752548425e-05,
+      "loss": 1.5287,
+      "step": 1165
+    },
+    {
+      "epoch": 1.1754032258064515,
+      "grad_norm": 0.12841151654720306,
+      "learning_rate": 7.603203834382476e-05,
+      "loss": 1.6028,
+      "step": 1166
+    },
+    {
+      "epoch": 1.1764112903225807,
+      "grad_norm": 0.08578557521104813,
+      "learning_rate": 7.58743646765859e-05,
+      "loss": 1.4683,
+      "step": 1167
+    },
+    {
+      "epoch": 1.1774193548387097,
+      "grad_norm": 0.10593171417713165,
+      "learning_rate": 7.571675466917626e-05,
+      "loss": 1.5351,
+      "step": 1168
+    },
+    {
+      "epoch": 1.1784274193548387,
+      "grad_norm": 0.10871924459934235,
+      "learning_rate": 7.555920873747823e-05,
+      "loss": 1.5334,
+      "step": 1169
+    },
+    {
+      "epoch": 1.1794354838709677,
+      "grad_norm": 0.08840969204902649,
+      "learning_rate": 7.540172729720504e-05,
+      "loss": 1.5035,
+      "step": 1170
+    },
+    {
+      "epoch": 1.1804435483870968,
+      "grad_norm": 0.08680961281061172,
+      "learning_rate": 7.524431076389986e-05,
+      "loss": 1.4756,
+      "step": 1171
+    },
+    {
+      "epoch": 1.1814516129032258,
+      "grad_norm": 0.0890466570854187,
+      "learning_rate": 7.50869595529345e-05,
+      "loss": 1.5077,
+      "step": 1172
+    },
+    {
+      "epoch": 1.1824596774193548,
+      "grad_norm": 0.10022439807653427,
+      "learning_rate": 7.492967407950844e-05,
+      "loss": 1.5001,
+      "step": 1173
+    },
+    {
+      "epoch": 1.183467741935484,
+      "grad_norm": 0.12129071354866028,
+      "learning_rate": 7.477245475864771e-05,
+      "loss": 1.5234,
+      "step": 1174
+    },
+    {
+      "epoch": 1.184475806451613,
+      "grad_norm": 0.09167549759149551,
+      "learning_rate": 7.461530200520377e-05,
+      "loss": 1.4971,
+      "step": 1175
+    },
+    {
+      "epoch": 1.185483870967742,
+      "grad_norm": 0.08763924986124039,
+      "learning_rate": 7.445821623385245e-05,
+      "loss": 1.5229,
+      "step": 1176
+    },
+    {
+      "epoch": 1.186491935483871,
+      "grad_norm": 0.13247455656528473,
+      "learning_rate": 7.430119785909278e-05,
+      "loss": 1.4973,
+      "step": 1177
+    },
+    {
+      "epoch": 1.1875,
+      "grad_norm": 0.10564038902521133,
+      "learning_rate": 7.414424729524602e-05,
+      "loss": 1.4549,
+      "step": 1178
+    },
+    {
+      "epoch": 1.188508064516129,
+      "grad_norm": 0.09784973412752151,
+      "learning_rate": 7.398736495645447e-05,
+      "loss": 1.514,
+      "step": 1179
+    },
+    {
+      "epoch": 1.189516129032258,
+      "grad_norm": 0.0994093120098114,
+      "learning_rate": 7.383055125668038e-05,
+      "loss": 1.4899,
+      "step": 1180
+    },
+    {
+      "epoch": 1.190524193548387,
+      "grad_norm": 0.09787564724683762,
+      "learning_rate": 7.367380660970493e-05,
+      "loss": 1.5306,
+      "step": 1181
+    },
+    {
+      "epoch": 1.191532258064516,
+      "grad_norm": 0.09221166372299194,
+      "learning_rate": 7.351713142912707e-05,
+      "loss": 1.5314,
+      "step": 1182
+    },
+    {
+      "epoch": 1.1925403225806452,
+      "grad_norm": 0.10157594084739685,
+      "learning_rate": 7.336052612836246e-05,
+      "loss": 1.5738,
+      "step": 1183
+    },
+    {
+      "epoch": 1.1935483870967742,
+      "grad_norm": 0.09838045388460159,
+      "learning_rate": 7.320399112064233e-05,
+      "loss": 1.543,
+      "step": 1184
+    },
+    {
+      "epoch": 1.1945564516129032,
+      "grad_norm": 0.0991106852889061,
+      "learning_rate": 7.304752681901251e-05,
+      "loss": 1.5159,
+      "step": 1185
+    },
+    {
+      "epoch": 1.1955645161290323,
+      "grad_norm": 0.09091247618198395,
+      "learning_rate": 7.289113363633215e-05,
+      "loss": 1.5109,
+      "step": 1186
+    },
+    {
+      "epoch": 1.1965725806451613,
+      "grad_norm": 0.10874085873365402,
+      "learning_rate": 7.273481198527285e-05,
+      "loss": 1.539,
+      "step": 1187
+    },
+    {
+      "epoch": 1.1975806451612903,
+      "grad_norm": 0.26701486110687256,
+      "learning_rate": 7.257856227831738e-05,
+      "loss": 1.5231,
+      "step": 1188
+    },
+    {
+      "epoch": 1.1985887096774193,
+      "grad_norm": 0.11611919850111008,
+      "learning_rate": 7.242238492775869e-05,
+      "loss": 1.5325,
+      "step": 1189
+    },
+    {
+      "epoch": 1.1995967741935485,
+      "grad_norm": 0.09033048897981644,
+      "learning_rate": 7.226628034569886e-05,
+      "loss": 1.5223,
+      "step": 1190
+    },
+    {
+      "epoch": 1.2006048387096775,
+      "grad_norm": 0.09677241742610931,
+      "learning_rate": 7.211024894404788e-05,
+      "loss": 1.5277,
+      "step": 1191
+    },
+    {
+      "epoch": 1.2016129032258065,
+      "grad_norm": 0.09878189116716385,
+      "learning_rate": 7.195429113452271e-05,
+      "loss": 1.525,
+      "step": 1192
+    },
+    {
+      "epoch": 1.2026209677419355,
+      "grad_norm": 0.09079443663358688,
+      "learning_rate": 7.179840732864604e-05,
+      "loss": 1.4836,
+      "step": 1193
+    },
+    {
+      "epoch": 1.2036290322580645,
+      "grad_norm": 0.10527854412794113,
+      "learning_rate": 7.16425979377454e-05,
+      "loss": 1.5068,
+      "step": 1194
+    },
+    {
+      "epoch": 1.2046370967741935,
+      "grad_norm": 0.1167258769273758,
+      "learning_rate": 7.148686337295181e-05,
+      "loss": 1.4535,
+      "step": 1195
+    },
+    {
+      "epoch": 1.2056451612903225,
+      "grad_norm": 0.09939006716012955,
+      "learning_rate": 7.133120404519903e-05,
+      "loss": 1.4873,
+      "step": 1196
+    },
+    {
+      "epoch": 1.2066532258064515,
+      "grad_norm": 0.09883987158536911,
+      "learning_rate": 7.117562036522213e-05,
+      "loss": 1.5022,
+      "step": 1197
+    },
+    {
+      "epoch": 1.2076612903225807,
+      "grad_norm": 0.10209079831838608,
+      "learning_rate": 7.102011274355667e-05,
+      "loss": 1.5432,
+      "step": 1198
+    },
+    {
+      "epoch": 1.2086693548387097,
+      "grad_norm": 0.10384919494390488,
+      "learning_rate": 7.086468159053751e-05,
+      "loss": 1.5568,
+      "step": 1199
+    },
+    {
+      "epoch": 1.2096774193548387,
+      "grad_norm": 0.1108224168419838,
+      "learning_rate": 7.070932731629769e-05,
+      "loss": 1.4903,
+      "step": 1200
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1984,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 300,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.728889247690916e+19,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}