diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,25641 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3657, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00027344818156959256, + "grad_norm": 0.5185216069221497, + "learning_rate": 5.000000000000001e-07, + "loss": 1.952, + "step": 1 + }, + { + "epoch": 0.0005468963631391851, + "grad_norm": 0.48264726996421814, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.9283, + "step": 2 + }, + { + "epoch": 0.0008203445447087777, + "grad_norm": 0.3865518867969513, + "learning_rate": 1.5e-06, + "loss": 1.8876, + "step": 3 + }, + { + "epoch": 0.0010937927262783702, + "grad_norm": 0.34167248010635376, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.9148, + "step": 4 + }, + { + "epoch": 0.0013672409078479629, + "grad_norm": 0.6942479014396667, + "learning_rate": 2.5e-06, + "loss": 1.946, + "step": 5 + }, + { + "epoch": 0.0016406890894175555, + "grad_norm": 0.34968069195747375, + "learning_rate": 3e-06, + "loss": 1.9213, + "step": 6 + }, + { + "epoch": 0.0019141372709871479, + "grad_norm": 0.48089227080345154, + "learning_rate": 3.5000000000000004e-06, + "loss": 1.8915, + "step": 7 + }, + { + "epoch": 0.0021875854525567405, + "grad_norm": 0.34136202931404114, + "learning_rate": 4.000000000000001e-06, + "loss": 1.9219, + "step": 8 + }, + { + "epoch": 0.002461033634126333, + "grad_norm": 0.529236912727356, + "learning_rate": 4.5e-06, + "loss": 1.9322, + "step": 9 + }, + { + "epoch": 0.0027344818156959257, + "grad_norm": 0.4663859009742737, + "learning_rate": 5e-06, + "loss": 1.8154, + "step": 10 + }, + { + "epoch": 0.0030079299972655183, + "grad_norm": 0.9236092567443848, + "learning_rate": 5.500000000000001e-06, + "loss": 1.9887, + "step": 11 + }, + { + "epoch": 0.003281378178835111, + "grad_norm": 0.43542373180389404, + "learning_rate": 6e-06, + "loss": 1.963, + "step": 12 + }, + { + "epoch": 0.003554826360404703, + "grad_norm": 0.33968186378479004, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.7446, + "step": 13 + }, + { + "epoch": 0.0038282745419742957, + "grad_norm": 0.2824525535106659, + "learning_rate": 7.000000000000001e-06, + "loss": 1.8923, + "step": 14 + }, + { + "epoch": 0.004101722723543888, + "grad_norm": 0.2588050663471222, + "learning_rate": 7.5e-06, + "loss": 1.7194, + "step": 15 + }, + { + "epoch": 0.004375170905113481, + "grad_norm": 0.3694664537906647, + "learning_rate": 8.000000000000001e-06, + "loss": 1.8807, + "step": 16 + }, + { + "epoch": 0.004648619086683074, + "grad_norm": 0.2177015244960785, + "learning_rate": 8.500000000000002e-06, + "loss": 1.8355, + "step": 17 + }, + { + "epoch": 0.004922067268252666, + "grad_norm": 0.395831435918808, + "learning_rate": 9e-06, + "loss": 1.9272, + "step": 18 + }, + { + "epoch": 0.005195515449822259, + "grad_norm": 0.23232288658618927, + "learning_rate": 9.5e-06, + "loss": 1.8483, + "step": 19 + }, + { + "epoch": 0.005468963631391851, + "grad_norm": 0.211470827460289, + "learning_rate": 1e-05, + "loss": 1.8662, + "step": 20 + }, + { + "epoch": 0.005742411812961444, + "grad_norm": 0.21474166214466095, + "learning_rate": 1.05e-05, + "loss": 1.8321, + "step": 21 + }, + { + "epoch": 0.006015859994531037, + "grad_norm": 0.22423861920833588, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.8677, + "step": 22 + }, + { + "epoch": 0.006289308176100629, + "grad_norm": 0.22832632064819336, + "learning_rate": 1.1500000000000002e-05, + "loss": 1.8584, + "step": 23 + }, + { + "epoch": 0.006562756357670222, + "grad_norm": 0.2289043813943863, + "learning_rate": 1.2e-05, + "loss": 1.9455, + "step": 24 + }, + { + "epoch": 0.006836204539239814, + "grad_norm": 0.21795853972434998, + "learning_rate": 1.25e-05, + "loss": 1.8726, + "step": 25 + }, + { + "epoch": 0.007109652720809406, + "grad_norm": 0.2133929580450058, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.8685, + "step": 26 + }, + { + "epoch": 0.007383100902378999, + "grad_norm": 0.18260058760643005, + "learning_rate": 1.3500000000000001e-05, + "loss": 1.8412, + "step": 27 + }, + { + "epoch": 0.0076565490839485915, + "grad_norm": 0.18040010333061218, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.8779, + "step": 28 + }, + { + "epoch": 0.007929997265518185, + "grad_norm": 0.17110276222229004, + "learning_rate": 1.45e-05, + "loss": 1.8708, + "step": 29 + }, + { + "epoch": 0.008203445447087777, + "grad_norm": 0.17357124388217926, + "learning_rate": 1.5e-05, + "loss": 1.8665, + "step": 30 + }, + { + "epoch": 0.00847689362865737, + "grad_norm": 0.1595795750617981, + "learning_rate": 1.55e-05, + "loss": 1.861, + "step": 31 + }, + { + "epoch": 0.008750341810226962, + "grad_norm": 0.18393680453300476, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.9642, + "step": 32 + }, + { + "epoch": 0.009023789991796555, + "grad_norm": 0.17827364802360535, + "learning_rate": 1.65e-05, + "loss": 1.8671, + "step": 33 + }, + { + "epoch": 0.009297238173366147, + "grad_norm": 0.1620740294456482, + "learning_rate": 1.7000000000000003e-05, + "loss": 1.8421, + "step": 34 + }, + { + "epoch": 0.009570686354935739, + "grad_norm": 0.15932734310626984, + "learning_rate": 1.75e-05, + "loss": 1.6872, + "step": 35 + }, + { + "epoch": 0.009844134536505332, + "grad_norm": 0.1550036072731018, + "learning_rate": 1.8e-05, + "loss": 1.7036, + "step": 36 + }, + { + "epoch": 0.010117582718074924, + "grad_norm": 0.17445598542690277, + "learning_rate": 1.85e-05, + "loss": 1.9004, + "step": 37 + }, + { + "epoch": 0.010391030899644518, + "grad_norm": 0.15758390724658966, + "learning_rate": 1.9e-05, + "loss": 1.8423, + "step": 38 + }, + { + "epoch": 0.01066447908121411, + "grad_norm": 0.16238977015018463, + "learning_rate": 1.9500000000000003e-05, + "loss": 1.7722, + "step": 39 + }, + { + "epoch": 0.010937927262783703, + "grad_norm": 0.14447326958179474, + "learning_rate": 2e-05, + "loss": 1.7876, + "step": 40 + }, + { + "epoch": 0.011211375444353295, + "grad_norm": 0.1725250482559204, + "learning_rate": 2.05e-05, + "loss": 1.7639, + "step": 41 + }, + { + "epoch": 0.011484823625922888, + "grad_norm": 0.14800074696540833, + "learning_rate": 2.1e-05, + "loss": 1.7457, + "step": 42 + }, + { + "epoch": 0.01175827180749248, + "grad_norm": 0.16171656548976898, + "learning_rate": 2.15e-05, + "loss": 1.8488, + "step": 43 + }, + { + "epoch": 0.012031719989062073, + "grad_norm": 0.17418281733989716, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.8017, + "step": 44 + }, + { + "epoch": 0.012305168170631665, + "grad_norm": 0.1688196063041687, + "learning_rate": 2.25e-05, + "loss": 1.9224, + "step": 45 + }, + { + "epoch": 0.012578616352201259, + "grad_norm": 0.1520584672689438, + "learning_rate": 2.3000000000000003e-05, + "loss": 1.8044, + "step": 46 + }, + { + "epoch": 0.01285206453377085, + "grad_norm": 0.1390339732170105, + "learning_rate": 2.35e-05, + "loss": 1.709, + "step": 47 + }, + { + "epoch": 0.013125512715340444, + "grad_norm": 0.17691773176193237, + "learning_rate": 2.4e-05, + "loss": 1.8488, + "step": 48 + }, + { + "epoch": 0.013398960896910036, + "grad_norm": 0.1481151133775711, + "learning_rate": 2.45e-05, + "loss": 1.9118, + "step": 49 + }, + { + "epoch": 0.013672409078479627, + "grad_norm": 0.14990705251693726, + "learning_rate": 2.5e-05, + "loss": 1.7629, + "step": 50 + }, + { + "epoch": 0.01394585726004922, + "grad_norm": 0.15109795331954956, + "learning_rate": 2.5500000000000003e-05, + "loss": 1.7675, + "step": 51 + }, + { + "epoch": 0.014219305441618812, + "grad_norm": 0.15651960670948029, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.7662, + "step": 52 + }, + { + "epoch": 0.014492753623188406, + "grad_norm": 0.15988552570343018, + "learning_rate": 2.6500000000000004e-05, + "loss": 1.7708, + "step": 53 + }, + { + "epoch": 0.014766201804757998, + "grad_norm": 0.1471298187971115, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.7393, + "step": 54 + }, + { + "epoch": 0.015039649986327591, + "grad_norm": 0.1633843183517456, + "learning_rate": 2.7500000000000004e-05, + "loss": 1.8271, + "step": 55 + }, + { + "epoch": 0.015313098167897183, + "grad_norm": 0.1614934802055359, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.7251, + "step": 56 + }, + { + "epoch": 0.015586546349466776, + "grad_norm": 0.15689364075660706, + "learning_rate": 2.8499999999999998e-05, + "loss": 1.7851, + "step": 57 + }, + { + "epoch": 0.01585999453103637, + "grad_norm": 0.1521284133195877, + "learning_rate": 2.9e-05, + "loss": 1.8146, + "step": 58 + }, + { + "epoch": 0.01613344271260596, + "grad_norm": 0.15829654037952423, + "learning_rate": 2.95e-05, + "loss": 1.8003, + "step": 59 + }, + { + "epoch": 0.016406890894175553, + "grad_norm": 0.1448163092136383, + "learning_rate": 3e-05, + "loss": 1.7512, + "step": 60 + }, + { + "epoch": 0.016680339075745145, + "grad_norm": 0.1601008176803589, + "learning_rate": 3.05e-05, + "loss": 1.7764, + "step": 61 + }, + { + "epoch": 0.01695378725731474, + "grad_norm": 0.14987356960773468, + "learning_rate": 3.1e-05, + "loss": 1.7891, + "step": 62 + }, + { + "epoch": 0.017227235438884332, + "grad_norm": 0.16058160364627838, + "learning_rate": 3.15e-05, + "loss": 1.7912, + "step": 63 + }, + { + "epoch": 0.017500683620453924, + "grad_norm": 0.15416061878204346, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.8189, + "step": 64 + }, + { + "epoch": 0.017774131802023516, + "grad_norm": 0.15167462825775146, + "learning_rate": 3.2500000000000004e-05, + "loss": 1.6827, + "step": 65 + }, + { + "epoch": 0.01804757998359311, + "grad_norm": 0.1508171409368515, + "learning_rate": 3.3e-05, + "loss": 1.7364, + "step": 66 + }, + { + "epoch": 0.018321028165162703, + "grad_norm": 0.15617215633392334, + "learning_rate": 3.35e-05, + "loss": 1.7085, + "step": 67 + }, + { + "epoch": 0.018594476346732294, + "grad_norm": 0.15548895299434662, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.8138, + "step": 68 + }, + { + "epoch": 0.018867924528301886, + "grad_norm": 0.17112572491168976, + "learning_rate": 3.45e-05, + "loss": 1.8459, + "step": 69 + }, + { + "epoch": 0.019141372709871478, + "grad_norm": 0.18400102853775024, + "learning_rate": 3.5e-05, + "loss": 1.9113, + "step": 70 + }, + { + "epoch": 0.019414820891441073, + "grad_norm": 0.16024211049079895, + "learning_rate": 3.55e-05, + "loss": 1.7861, + "step": 71 + }, + { + "epoch": 0.019688269073010665, + "grad_norm": 0.16651467978954315, + "learning_rate": 3.6e-05, + "loss": 1.626, + "step": 72 + }, + { + "epoch": 0.019961717254580257, + "grad_norm": 0.15933339297771454, + "learning_rate": 3.65e-05, + "loss": 1.7464, + "step": 73 + }, + { + "epoch": 0.02023516543614985, + "grad_norm": 0.15582682192325592, + "learning_rate": 3.7e-05, + "loss": 1.7713, + "step": 74 + }, + { + "epoch": 0.020508613617719443, + "grad_norm": 0.15848350524902344, + "learning_rate": 3.7500000000000003e-05, + "loss": 1.7918, + "step": 75 + }, + { + "epoch": 0.020782061799289035, + "grad_norm": 0.1563899666070938, + "learning_rate": 3.8e-05, + "loss": 1.7257, + "step": 76 + }, + { + "epoch": 0.021055509980858627, + "grad_norm": 0.17177645862102509, + "learning_rate": 3.85e-05, + "loss": 1.784, + "step": 77 + }, + { + "epoch": 0.02132895816242822, + "grad_norm": 0.15895935893058777, + "learning_rate": 3.9000000000000006e-05, + "loss": 1.7906, + "step": 78 + }, + { + "epoch": 0.021602406343997814, + "grad_norm": 0.15804462134838104, + "learning_rate": 3.9500000000000005e-05, + "loss": 1.6952, + "step": 79 + }, + { + "epoch": 0.021875854525567406, + "grad_norm": 0.15477772057056427, + "learning_rate": 4e-05, + "loss": 1.6538, + "step": 80 + }, + { + "epoch": 0.022149302707136997, + "grad_norm": 0.1593542844057083, + "learning_rate": 4.05e-05, + "loss": 1.727, + "step": 81 + }, + { + "epoch": 0.02242275088870659, + "grad_norm": 0.164969801902771, + "learning_rate": 4.1e-05, + "loss": 1.6443, + "step": 82 + }, + { + "epoch": 0.02269619907027618, + "grad_norm": 0.16219045221805573, + "learning_rate": 4.15e-05, + "loss": 1.7775, + "step": 83 + }, + { + "epoch": 0.022969647251845776, + "grad_norm": 0.15507090091705322, + "learning_rate": 4.2e-05, + "loss": 1.6936, + "step": 84 + }, + { + "epoch": 0.023243095433415368, + "grad_norm": 0.17881833016872406, + "learning_rate": 4.25e-05, + "loss": 1.8355, + "step": 85 + }, + { + "epoch": 0.02351654361498496, + "grad_norm": 0.16070392727851868, + "learning_rate": 4.3e-05, + "loss": 1.6853, + "step": 86 + }, + { + "epoch": 0.02378999179655455, + "grad_norm": 0.16585201025009155, + "learning_rate": 4.35e-05, + "loss": 1.7516, + "step": 87 + }, + { + "epoch": 0.024063439978124147, + "grad_norm": 0.1578633040189743, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.7062, + "step": 88 + }, + { + "epoch": 0.02433688815969374, + "grad_norm": 0.17426982522010803, + "learning_rate": 4.4500000000000004e-05, + "loss": 1.7312, + "step": 89 + }, + { + "epoch": 0.02461033634126333, + "grad_norm": 0.16550804674625397, + "learning_rate": 4.5e-05, + "loss": 1.7708, + "step": 90 + }, + { + "epoch": 0.024883784522832922, + "grad_norm": 0.17182576656341553, + "learning_rate": 4.55e-05, + "loss": 1.7964, + "step": 91 + }, + { + "epoch": 0.025157232704402517, + "grad_norm": 0.1642204076051712, + "learning_rate": 4.600000000000001e-05, + "loss": 1.7858, + "step": 92 + }, + { + "epoch": 0.02543068088597211, + "grad_norm": 0.17677852511405945, + "learning_rate": 4.6500000000000005e-05, + "loss": 1.701, + "step": 93 + }, + { + "epoch": 0.0257041290675417, + "grad_norm": 0.17346423864364624, + "learning_rate": 4.7e-05, + "loss": 1.8136, + "step": 94 + }, + { + "epoch": 0.025977577249111292, + "grad_norm": 0.17641915380954742, + "learning_rate": 4.75e-05, + "loss": 1.7921, + "step": 95 + }, + { + "epoch": 0.026251025430680888, + "grad_norm": 0.19822247326374054, + "learning_rate": 4.8e-05, + "loss": 1.7887, + "step": 96 + }, + { + "epoch": 0.02652447361225048, + "grad_norm": 0.1817287653684616, + "learning_rate": 4.85e-05, + "loss": 1.7063, + "step": 97 + }, + { + "epoch": 0.02679792179382007, + "grad_norm": 0.1644916832447052, + "learning_rate": 4.9e-05, + "loss": 1.684, + "step": 98 + }, + { + "epoch": 0.027071369975389663, + "grad_norm": 0.19094440340995789, + "learning_rate": 4.9500000000000004e-05, + "loss": 1.8106, + "step": 99 + }, + { + "epoch": 0.027344818156959255, + "grad_norm": 0.16572299599647522, + "learning_rate": 5e-05, + "loss": 1.6812, + "step": 100 + }, + { + "epoch": 0.02761826633852885, + "grad_norm": 0.16459035873413086, + "learning_rate": 5e-05, + "loss": 1.697, + "step": 101 + }, + { + "epoch": 0.02789171452009844, + "grad_norm": 0.18967324495315552, + "learning_rate": 5e-05, + "loss": 1.7078, + "step": 102 + }, + { + "epoch": 0.028165162701668033, + "grad_norm": 0.1775873899459839, + "learning_rate": 5e-05, + "loss": 1.7995, + "step": 103 + }, + { + "epoch": 0.028438610883237625, + "grad_norm": 0.18641340732574463, + "learning_rate": 5e-05, + "loss": 1.8392, + "step": 104 + }, + { + "epoch": 0.02871205906480722, + "grad_norm": 0.17424939572811127, + "learning_rate": 5e-05, + "loss": 1.7695, + "step": 105 + }, + { + "epoch": 0.028985507246376812, + "grad_norm": 0.19932492077350616, + "learning_rate": 5e-05, + "loss": 1.6987, + "step": 106 + }, + { + "epoch": 0.029258955427946404, + "grad_norm": 0.1666601300239563, + "learning_rate": 5e-05, + "loss": 1.6425, + "step": 107 + }, + { + "epoch": 0.029532403609515995, + "grad_norm": 0.19102485477924347, + "learning_rate": 5e-05, + "loss": 1.7754, + "step": 108 + }, + { + "epoch": 0.02980585179108559, + "grad_norm": 0.17251844704151154, + "learning_rate": 5e-05, + "loss": 1.7315, + "step": 109 + }, + { + "epoch": 0.030079299972655182, + "grad_norm": 0.1837855726480484, + "learning_rate": 5e-05, + "loss": 1.805, + "step": 110 + }, + { + "epoch": 0.030352748154224774, + "grad_norm": 0.19414560496807098, + "learning_rate": 5e-05, + "loss": 1.6129, + "step": 111 + }, + { + "epoch": 0.030626196335794366, + "grad_norm": 0.16896690428256989, + "learning_rate": 5e-05, + "loss": 1.8306, + "step": 112 + }, + { + "epoch": 0.03089964451736396, + "grad_norm": 0.18901929259300232, + "learning_rate": 5e-05, + "loss": 1.7591, + "step": 113 + }, + { + "epoch": 0.031173092698933553, + "grad_norm": 0.17992505431175232, + "learning_rate": 5e-05, + "loss": 1.7676, + "step": 114 + }, + { + "epoch": 0.031446540880503145, + "grad_norm": 0.1731676608324051, + "learning_rate": 5e-05, + "loss": 1.7749, + "step": 115 + }, + { + "epoch": 0.03171998906207274, + "grad_norm": 0.17806388437747955, + "learning_rate": 5e-05, + "loss": 1.7912, + "step": 116 + }, + { + "epoch": 0.03199343724364233, + "grad_norm": 0.176969513297081, + "learning_rate": 5e-05, + "loss": 1.7966, + "step": 117 + }, + { + "epoch": 0.03226688542521192, + "grad_norm": 0.18301358819007874, + "learning_rate": 5e-05, + "loss": 1.7454, + "step": 118 + }, + { + "epoch": 0.03254033360678151, + "grad_norm": 0.18621793389320374, + "learning_rate": 5e-05, + "loss": 1.8087, + "step": 119 + }, + { + "epoch": 0.03281378178835111, + "grad_norm": 0.17661884427070618, + "learning_rate": 5e-05, + "loss": 1.8529, + "step": 120 + }, + { + "epoch": 0.0330872299699207, + "grad_norm": 0.1590997874736786, + "learning_rate": 5e-05, + "loss": 1.6437, + "step": 121 + }, + { + "epoch": 0.03336067815149029, + "grad_norm": 0.15839175879955292, + "learning_rate": 5e-05, + "loss": 1.6921, + "step": 122 + }, + { + "epoch": 0.033634126333059886, + "grad_norm": 0.1695318967103958, + "learning_rate": 5e-05, + "loss": 1.7514, + "step": 123 + }, + { + "epoch": 0.03390757451462948, + "grad_norm": 0.17511457204818726, + "learning_rate": 5e-05, + "loss": 1.7665, + "step": 124 + }, + { + "epoch": 0.03418102269619907, + "grad_norm": 0.16365903615951538, + "learning_rate": 5e-05, + "loss": 1.7289, + "step": 125 + }, + { + "epoch": 0.034454470877768664, + "grad_norm": 0.16163618862628937, + "learning_rate": 5e-05, + "loss": 1.7243, + "step": 126 + }, + { + "epoch": 0.03472791905933825, + "grad_norm": 0.1713578999042511, + "learning_rate": 5e-05, + "loss": 1.8056, + "step": 127 + }, + { + "epoch": 0.03500136724090785, + "grad_norm": 0.16412892937660217, + "learning_rate": 5e-05, + "loss": 1.7385, + "step": 128 + }, + { + "epoch": 0.03527481542247744, + "grad_norm": 0.16879543662071228, + "learning_rate": 5e-05, + "loss": 1.7961, + "step": 129 + }, + { + "epoch": 0.03554826360404703, + "grad_norm": 0.1659931093454361, + "learning_rate": 5e-05, + "loss": 1.6833, + "step": 130 + }, + { + "epoch": 0.035821711785616626, + "grad_norm": 0.16397275030612946, + "learning_rate": 5e-05, + "loss": 1.7543, + "step": 131 + }, + { + "epoch": 0.03609515996718622, + "grad_norm": 0.17216797173023224, + "learning_rate": 5e-05, + "loss": 1.8237, + "step": 132 + }, + { + "epoch": 0.03636860814875581, + "grad_norm": 0.15959759056568146, + "learning_rate": 5e-05, + "loss": 1.6513, + "step": 133 + }, + { + "epoch": 0.036642056330325405, + "grad_norm": 0.17499953508377075, + "learning_rate": 5e-05, + "loss": 1.7132, + "step": 134 + }, + { + "epoch": 0.03691550451189499, + "grad_norm": 0.17289526760578156, + "learning_rate": 5e-05, + "loss": 1.6642, + "step": 135 + }, + { + "epoch": 0.03718895269346459, + "grad_norm": 0.1796165257692337, + "learning_rate": 5e-05, + "loss": 1.6765, + "step": 136 + }, + { + "epoch": 0.037462400875034184, + "grad_norm": 0.20457540452480316, + "learning_rate": 5e-05, + "loss": 1.7167, + "step": 137 + }, + { + "epoch": 0.03773584905660377, + "grad_norm": 0.18604455888271332, + "learning_rate": 5e-05, + "loss": 1.7398, + "step": 138 + }, + { + "epoch": 0.03800929723817337, + "grad_norm": 0.20968973636627197, + "learning_rate": 5e-05, + "loss": 1.7422, + "step": 139 + }, + { + "epoch": 0.038282745419742956, + "grad_norm": 0.21795013546943665, + "learning_rate": 5e-05, + "loss": 1.7568, + "step": 140 + }, + { + "epoch": 0.03855619360131255, + "grad_norm": 0.19364839792251587, + "learning_rate": 5e-05, + "loss": 1.7591, + "step": 141 + }, + { + "epoch": 0.038829641782882146, + "grad_norm": 0.2641007602214813, + "learning_rate": 5e-05, + "loss": 1.8116, + "step": 142 + }, + { + "epoch": 0.039103089964451734, + "grad_norm": 0.17127980291843414, + "learning_rate": 5e-05, + "loss": 1.7593, + "step": 143 + }, + { + "epoch": 0.03937653814602133, + "grad_norm": 0.2609815001487732, + "learning_rate": 5e-05, + "loss": 1.7366, + "step": 144 + }, + { + "epoch": 0.039649986327590925, + "grad_norm": 0.1736164689064026, + "learning_rate": 5e-05, + "loss": 1.6295, + "step": 145 + }, + { + "epoch": 0.03992343450916051, + "grad_norm": 0.17097944021224976, + "learning_rate": 5e-05, + "loss": 1.6762, + "step": 146 + }, + { + "epoch": 0.04019688269073011, + "grad_norm": 0.19413994252681732, + "learning_rate": 5e-05, + "loss": 1.7808, + "step": 147 + }, + { + "epoch": 0.0404703308722997, + "grad_norm": 0.19383320212364197, + "learning_rate": 5e-05, + "loss": 1.7765, + "step": 148 + }, + { + "epoch": 0.04074377905386929, + "grad_norm": 0.18459807336330414, + "learning_rate": 5e-05, + "loss": 1.7168, + "step": 149 + }, + { + "epoch": 0.04101722723543889, + "grad_norm": 0.18464331328868866, + "learning_rate": 5e-05, + "loss": 1.8481, + "step": 150 + }, + { + "epoch": 0.041290675417008475, + "grad_norm": 0.18524886667728424, + "learning_rate": 5e-05, + "loss": 1.7175, + "step": 151 + }, + { + "epoch": 0.04156412359857807, + "grad_norm": 0.1744994819164276, + "learning_rate": 5e-05, + "loss": 1.7207, + "step": 152 + }, + { + "epoch": 0.04183757178014766, + "grad_norm": 0.1815788894891739, + "learning_rate": 5e-05, + "loss": 1.6548, + "step": 153 + }, + { + "epoch": 0.042111019961717254, + "grad_norm": 0.21097545325756073, + "learning_rate": 5e-05, + "loss": 1.7963, + "step": 154 + }, + { + "epoch": 0.04238446814328685, + "grad_norm": 0.17117224633693695, + "learning_rate": 5e-05, + "loss": 1.7514, + "step": 155 + }, + { + "epoch": 0.04265791632485644, + "grad_norm": 0.17618225514888763, + "learning_rate": 5e-05, + "loss": 1.7029, + "step": 156 + }, + { + "epoch": 0.04293136450642603, + "grad_norm": 0.223564013838768, + "learning_rate": 5e-05, + "loss": 1.8719, + "step": 157 + }, + { + "epoch": 0.04320481268799563, + "grad_norm": 0.17528285086154938, + "learning_rate": 5e-05, + "loss": 1.8264, + "step": 158 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 0.20005618035793304, + "learning_rate": 5e-05, + "loss": 1.7559, + "step": 159 + }, + { + "epoch": 0.04375170905113481, + "grad_norm": 0.18977142870426178, + "learning_rate": 5e-05, + "loss": 1.7066, + "step": 160 + }, + { + "epoch": 0.0440251572327044, + "grad_norm": 0.18722118437290192, + "learning_rate": 5e-05, + "loss": 1.7769, + "step": 161 + }, + { + "epoch": 0.044298605414273995, + "grad_norm": 0.16823123395442963, + "learning_rate": 5e-05, + "loss": 1.7666, + "step": 162 + }, + { + "epoch": 0.04457205359584359, + "grad_norm": 0.18529324233531952, + "learning_rate": 5e-05, + "loss": 1.6337, + "step": 163 + }, + { + "epoch": 0.04484550177741318, + "grad_norm": 0.18550924956798553, + "learning_rate": 5e-05, + "loss": 1.6754, + "step": 164 + }, + { + "epoch": 0.045118949958982774, + "grad_norm": 0.15453596413135529, + "learning_rate": 5e-05, + "loss": 1.6601, + "step": 165 + }, + { + "epoch": 0.04539239814055236, + "grad_norm": 0.17858926951885223, + "learning_rate": 5e-05, + "loss": 1.6343, + "step": 166 + }, + { + "epoch": 0.04566584632212196, + "grad_norm": 0.1769731193780899, + "learning_rate": 5e-05, + "loss": 1.6508, + "step": 167 + }, + { + "epoch": 0.04593929450369155, + "grad_norm": 0.16900819540023804, + "learning_rate": 5e-05, + "loss": 1.7167, + "step": 168 + }, + { + "epoch": 0.04621274268526114, + "grad_norm": 0.18043388426303864, + "learning_rate": 5e-05, + "loss": 1.75, + "step": 169 + }, + { + "epoch": 0.046486190866830736, + "grad_norm": 0.2040599286556244, + "learning_rate": 5e-05, + "loss": 1.7314, + "step": 170 + }, + { + "epoch": 0.04675963904840033, + "grad_norm": 0.17198055982589722, + "learning_rate": 5e-05, + "loss": 1.6897, + "step": 171 + }, + { + "epoch": 0.04703308722996992, + "grad_norm": 0.16982243955135345, + "learning_rate": 5e-05, + "loss": 1.7046, + "step": 172 + }, + { + "epoch": 0.047306535411539515, + "grad_norm": 0.1677250862121582, + "learning_rate": 5e-05, + "loss": 1.7385, + "step": 173 + }, + { + "epoch": 0.0475799835931091, + "grad_norm": 0.16259929537773132, + "learning_rate": 5e-05, + "loss": 1.7417, + "step": 174 + }, + { + "epoch": 0.0478534317746787, + "grad_norm": 0.1767575442790985, + "learning_rate": 5e-05, + "loss": 1.7981, + "step": 175 + }, + { + "epoch": 0.04812687995624829, + "grad_norm": 0.17178016901016235, + "learning_rate": 5e-05, + "loss": 1.7238, + "step": 176 + }, + { + "epoch": 0.04840032813781788, + "grad_norm": 0.1756935715675354, + "learning_rate": 5e-05, + "loss": 1.6331, + "step": 177 + }, + { + "epoch": 0.04867377631938748, + "grad_norm": 0.15742701292037964, + "learning_rate": 5e-05, + "loss": 1.7367, + "step": 178 + }, + { + "epoch": 0.04894722450095707, + "grad_norm": 0.16502848267555237, + "learning_rate": 5e-05, + "loss": 1.6908, + "step": 179 + }, + { + "epoch": 0.04922067268252666, + "grad_norm": 0.1676185131072998, + "learning_rate": 5e-05, + "loss": 1.6611, + "step": 180 + }, + { + "epoch": 0.049494120864096255, + "grad_norm": 0.19482578337192535, + "learning_rate": 5e-05, + "loss": 1.7986, + "step": 181 + }, + { + "epoch": 0.049767569045665844, + "grad_norm": 0.15637359023094177, + "learning_rate": 5e-05, + "loss": 1.7225, + "step": 182 + }, + { + "epoch": 0.05004101722723544, + "grad_norm": 0.17269264161586761, + "learning_rate": 5e-05, + "loss": 1.692, + "step": 183 + }, + { + "epoch": 0.050314465408805034, + "grad_norm": 0.17836996912956238, + "learning_rate": 5e-05, + "loss": 1.7457, + "step": 184 + }, + { + "epoch": 0.05058791359037462, + "grad_norm": 0.1829950511455536, + "learning_rate": 5e-05, + "loss": 1.7991, + "step": 185 + }, + { + "epoch": 0.05086136177194422, + "grad_norm": 0.18094682693481445, + "learning_rate": 5e-05, + "loss": 1.7173, + "step": 186 + }, + { + "epoch": 0.051134809953513806, + "grad_norm": 0.14723382890224457, + "learning_rate": 5e-05, + "loss": 1.6355, + "step": 187 + }, + { + "epoch": 0.0514082581350834, + "grad_norm": 0.18650388717651367, + "learning_rate": 5e-05, + "loss": 1.7328, + "step": 188 + }, + { + "epoch": 0.051681706316652996, + "grad_norm": 0.1545383185148239, + "learning_rate": 5e-05, + "loss": 1.6386, + "step": 189 + }, + { + "epoch": 0.051955154498222585, + "grad_norm": 0.18034450709819794, + "learning_rate": 5e-05, + "loss": 1.7728, + "step": 190 + }, + { + "epoch": 0.05222860267979218, + "grad_norm": 0.17649757862091064, + "learning_rate": 5e-05, + "loss": 1.8434, + "step": 191 + }, + { + "epoch": 0.052502050861361775, + "grad_norm": 0.1520988494157791, + "learning_rate": 5e-05, + "loss": 1.6414, + "step": 192 + }, + { + "epoch": 0.05277549904293136, + "grad_norm": 0.1721171736717224, + "learning_rate": 5e-05, + "loss": 1.7516, + "step": 193 + }, + { + "epoch": 0.05304894722450096, + "grad_norm": 0.16801105439662933, + "learning_rate": 5e-05, + "loss": 1.741, + "step": 194 + }, + { + "epoch": 0.05332239540607055, + "grad_norm": 0.1697547286748886, + "learning_rate": 5e-05, + "loss": 1.7154, + "step": 195 + }, + { + "epoch": 0.05359584358764014, + "grad_norm": 0.163418710231781, + "learning_rate": 5e-05, + "loss": 1.7957, + "step": 196 + }, + { + "epoch": 0.05386929176920974, + "grad_norm": 0.15881727635860443, + "learning_rate": 5e-05, + "loss": 1.71, + "step": 197 + }, + { + "epoch": 0.054142739950779326, + "grad_norm": 0.18331541121006012, + "learning_rate": 5e-05, + "loss": 1.7412, + "step": 198 + }, + { + "epoch": 0.05441618813234892, + "grad_norm": 0.15311068296432495, + "learning_rate": 5e-05, + "loss": 1.7389, + "step": 199 + }, + { + "epoch": 0.05468963631391851, + "grad_norm": 0.17510437965393066, + "learning_rate": 5e-05, + "loss": 1.7061, + "step": 200 + }, + { + "epoch": 0.054963084495488104, + "grad_norm": 0.1618979126214981, + "learning_rate": 5e-05, + "loss": 1.7949, + "step": 201 + }, + { + "epoch": 0.0552365326770577, + "grad_norm": 0.16283638775348663, + "learning_rate": 5e-05, + "loss": 1.6262, + "step": 202 + }, + { + "epoch": 0.05550998085862729, + "grad_norm": 0.17936623096466064, + "learning_rate": 5e-05, + "loss": 1.7559, + "step": 203 + }, + { + "epoch": 0.05578342904019688, + "grad_norm": 0.15585114061832428, + "learning_rate": 5e-05, + "loss": 1.6809, + "step": 204 + }, + { + "epoch": 0.05605687722176648, + "grad_norm": 0.16536429524421692, + "learning_rate": 5e-05, + "loss": 1.8167, + "step": 205 + }, + { + "epoch": 0.056330325403336066, + "grad_norm": 0.16469433903694153, + "learning_rate": 5e-05, + "loss": 1.762, + "step": 206 + }, + { + "epoch": 0.05660377358490566, + "grad_norm": 0.15904970467090607, + "learning_rate": 5e-05, + "loss": 1.7354, + "step": 207 + }, + { + "epoch": 0.05687722176647525, + "grad_norm": 0.15505826473236084, + "learning_rate": 5e-05, + "loss": 1.6751, + "step": 208 + }, + { + "epoch": 0.057150669948044845, + "grad_norm": 0.1706695258617401, + "learning_rate": 5e-05, + "loss": 1.8423, + "step": 209 + }, + { + "epoch": 0.05742411812961444, + "grad_norm": 0.1645784229040146, + "learning_rate": 5e-05, + "loss": 1.7291, + "step": 210 + }, + { + "epoch": 0.05769756631118403, + "grad_norm": 0.15510506927967072, + "learning_rate": 5e-05, + "loss": 1.6877, + "step": 211 + }, + { + "epoch": 0.057971014492753624, + "grad_norm": 0.16437038779258728, + "learning_rate": 5e-05, + "loss": 1.6895, + "step": 212 + }, + { + "epoch": 0.05824446267432322, + "grad_norm": 0.15057580173015594, + "learning_rate": 5e-05, + "loss": 1.6953, + "step": 213 + }, + { + "epoch": 0.05851791085589281, + "grad_norm": 0.17037834227085114, + "learning_rate": 5e-05, + "loss": 1.6958, + "step": 214 + }, + { + "epoch": 0.0587913590374624, + "grad_norm": 0.1691209077835083, + "learning_rate": 5e-05, + "loss": 1.6792, + "step": 215 + }, + { + "epoch": 0.05906480721903199, + "grad_norm": 0.18123316764831543, + "learning_rate": 5e-05, + "loss": 1.7774, + "step": 216 + }, + { + "epoch": 0.059338255400601586, + "grad_norm": 0.17319968342781067, + "learning_rate": 5e-05, + "loss": 1.7516, + "step": 217 + }, + { + "epoch": 0.05961170358217118, + "grad_norm": 0.16802047193050385, + "learning_rate": 5e-05, + "loss": 1.6909, + "step": 218 + }, + { + "epoch": 0.05988515176374077, + "grad_norm": 0.16676077246665955, + "learning_rate": 5e-05, + "loss": 1.613, + "step": 219 + }, + { + "epoch": 0.060158599945310365, + "grad_norm": 0.15997642278671265, + "learning_rate": 5e-05, + "loss": 1.6497, + "step": 220 + }, + { + "epoch": 0.06043204812687995, + "grad_norm": 0.1547921746969223, + "learning_rate": 5e-05, + "loss": 1.6874, + "step": 221 + }, + { + "epoch": 0.06070549630844955, + "grad_norm": 0.17867213487625122, + "learning_rate": 5e-05, + "loss": 1.6957, + "step": 222 + }, + { + "epoch": 0.060978944490019144, + "grad_norm": 0.15124128758907318, + "learning_rate": 5e-05, + "loss": 1.6269, + "step": 223 + }, + { + "epoch": 0.06125239267158873, + "grad_norm": 0.15765389800071716, + "learning_rate": 5e-05, + "loss": 1.7699, + "step": 224 + }, + { + "epoch": 0.06152584085315833, + "grad_norm": 0.18673233687877655, + "learning_rate": 5e-05, + "loss": 1.8988, + "step": 225 + }, + { + "epoch": 0.06179928903472792, + "grad_norm": 0.148275688290596, + "learning_rate": 5e-05, + "loss": 1.6565, + "step": 226 + }, + { + "epoch": 0.06207273721629751, + "grad_norm": 0.17454782128334045, + "learning_rate": 5e-05, + "loss": 1.7273, + "step": 227 + }, + { + "epoch": 0.062346185397867106, + "grad_norm": 0.15484969317913055, + "learning_rate": 5e-05, + "loss": 1.6832, + "step": 228 + }, + { + "epoch": 0.0626196335794367, + "grad_norm": 0.16521553695201874, + "learning_rate": 5e-05, + "loss": 1.886, + "step": 229 + }, + { + "epoch": 0.06289308176100629, + "grad_norm": 0.1734921932220459, + "learning_rate": 5e-05, + "loss": 1.8453, + "step": 230 + }, + { + "epoch": 0.06316652994257588, + "grad_norm": 0.16037142276763916, + "learning_rate": 5e-05, + "loss": 1.6604, + "step": 231 + }, + { + "epoch": 0.06343997812414548, + "grad_norm": 0.194137305021286, + "learning_rate": 5e-05, + "loss": 1.7101, + "step": 232 + }, + { + "epoch": 0.06371342630571507, + "grad_norm": 0.16026362776756287, + "learning_rate": 5e-05, + "loss": 1.7296, + "step": 233 + }, + { + "epoch": 0.06398687448728466, + "grad_norm": 0.1773460954427719, + "learning_rate": 5e-05, + "loss": 1.6812, + "step": 234 + }, + { + "epoch": 0.06426032266885426, + "grad_norm": 0.19645363092422485, + "learning_rate": 5e-05, + "loss": 1.8694, + "step": 235 + }, + { + "epoch": 0.06453377085042385, + "grad_norm": 0.16966943442821503, + "learning_rate": 5e-05, + "loss": 1.6903, + "step": 236 + }, + { + "epoch": 0.06480721903199343, + "grad_norm": 0.21228881180286407, + "learning_rate": 5e-05, + "loss": 1.7405, + "step": 237 + }, + { + "epoch": 0.06508066721356302, + "grad_norm": 0.16073106229305267, + "learning_rate": 5e-05, + "loss": 1.7031, + "step": 238 + }, + { + "epoch": 0.06535411539513263, + "grad_norm": 0.1698150485754013, + "learning_rate": 5e-05, + "loss": 1.6663, + "step": 239 + }, + { + "epoch": 0.06562756357670221, + "grad_norm": 0.1578008383512497, + "learning_rate": 5e-05, + "loss": 1.6899, + "step": 240 + }, + { + "epoch": 0.0659010117582718, + "grad_norm": 0.1813380867242813, + "learning_rate": 5e-05, + "loss": 1.7034, + "step": 241 + }, + { + "epoch": 0.0661744599398414, + "grad_norm": 0.16481561958789825, + "learning_rate": 5e-05, + "loss": 1.7453, + "step": 242 + }, + { + "epoch": 0.06644790812141099, + "grad_norm": 0.15525515377521515, + "learning_rate": 5e-05, + "loss": 1.7296, + "step": 243 + }, + { + "epoch": 0.06672135630298058, + "grad_norm": 0.19922387599945068, + "learning_rate": 5e-05, + "loss": 1.7433, + "step": 244 + }, + { + "epoch": 0.06699480448455018, + "grad_norm": 0.18391215801239014, + "learning_rate": 5e-05, + "loss": 1.7203, + "step": 245 + }, + { + "epoch": 0.06726825266611977, + "grad_norm": 0.15986333787441254, + "learning_rate": 5e-05, + "loss": 1.6375, + "step": 246 + }, + { + "epoch": 0.06754170084768936, + "grad_norm": 0.15006937086582184, + "learning_rate": 5e-05, + "loss": 1.6448, + "step": 247 + }, + { + "epoch": 0.06781514902925896, + "grad_norm": 0.19520802795886993, + "learning_rate": 5e-05, + "loss": 1.6555, + "step": 248 + }, + { + "epoch": 0.06808859721082855, + "grad_norm": 0.14804011583328247, + "learning_rate": 5e-05, + "loss": 1.682, + "step": 249 + }, + { + "epoch": 0.06836204539239814, + "grad_norm": 0.18628214299678802, + "learning_rate": 5e-05, + "loss": 1.7169, + "step": 250 + }, + { + "epoch": 0.06863549357396773, + "grad_norm": 0.15606792271137238, + "learning_rate": 5e-05, + "loss": 1.7105, + "step": 251 + }, + { + "epoch": 0.06890894175553733, + "grad_norm": 0.17179042100906372, + "learning_rate": 5e-05, + "loss": 1.703, + "step": 252 + }, + { + "epoch": 0.06918238993710692, + "grad_norm": 0.1775059700012207, + "learning_rate": 5e-05, + "loss": 1.6497, + "step": 253 + }, + { + "epoch": 0.0694558381186765, + "grad_norm": 0.15647803246974945, + "learning_rate": 5e-05, + "loss": 1.6498, + "step": 254 + }, + { + "epoch": 0.06972928630024611, + "grad_norm": 0.1764082908630371, + "learning_rate": 5e-05, + "loss": 1.7309, + "step": 255 + }, + { + "epoch": 0.0700027344818157, + "grad_norm": 0.1760226935148239, + "learning_rate": 5e-05, + "loss": 1.7507, + "step": 256 + }, + { + "epoch": 0.07027618266338528, + "grad_norm": 0.1542169153690338, + "learning_rate": 5e-05, + "loss": 1.569, + "step": 257 + }, + { + "epoch": 0.07054963084495489, + "grad_norm": 0.21275527775287628, + "learning_rate": 5e-05, + "loss": 1.7928, + "step": 258 + }, + { + "epoch": 0.07082307902652447, + "grad_norm": 0.16845186054706573, + "learning_rate": 5e-05, + "loss": 1.7246, + "step": 259 + }, + { + "epoch": 0.07109652720809406, + "grad_norm": 0.2582179605960846, + "learning_rate": 5e-05, + "loss": 1.6447, + "step": 260 + }, + { + "epoch": 0.07136997538966366, + "grad_norm": 0.16100846230983734, + "learning_rate": 5e-05, + "loss": 1.7214, + "step": 261 + }, + { + "epoch": 0.07164342357123325, + "grad_norm": 0.19200801849365234, + "learning_rate": 5e-05, + "loss": 1.7948, + "step": 262 + }, + { + "epoch": 0.07191687175280284, + "grad_norm": 0.2010166496038437, + "learning_rate": 5e-05, + "loss": 1.6726, + "step": 263 + }, + { + "epoch": 0.07219031993437244, + "grad_norm": 0.15795177221298218, + "learning_rate": 5e-05, + "loss": 1.7046, + "step": 264 + }, + { + "epoch": 0.07246376811594203, + "grad_norm": 0.1857866495847702, + "learning_rate": 5e-05, + "loss": 1.6915, + "step": 265 + }, + { + "epoch": 0.07273721629751162, + "grad_norm": 0.16975003480911255, + "learning_rate": 5e-05, + "loss": 1.6847, + "step": 266 + }, + { + "epoch": 0.07301066447908121, + "grad_norm": 0.14858882129192352, + "learning_rate": 5e-05, + "loss": 1.7224, + "step": 267 + }, + { + "epoch": 0.07328411266065081, + "grad_norm": 0.20861102640628815, + "learning_rate": 5e-05, + "loss": 1.6836, + "step": 268 + }, + { + "epoch": 0.0735575608422204, + "grad_norm": 0.1622302383184433, + "learning_rate": 5e-05, + "loss": 1.6482, + "step": 269 + }, + { + "epoch": 0.07383100902378999, + "grad_norm": 0.1789664328098297, + "learning_rate": 5e-05, + "loss": 1.7508, + "step": 270 + }, + { + "epoch": 0.07410445720535959, + "grad_norm": 0.16430623829364777, + "learning_rate": 5e-05, + "loss": 1.735, + "step": 271 + }, + { + "epoch": 0.07437790538692918, + "grad_norm": 0.15610988438129425, + "learning_rate": 5e-05, + "loss": 1.6569, + "step": 272 + }, + { + "epoch": 0.07465135356849877, + "grad_norm": 0.16325096786022186, + "learning_rate": 5e-05, + "loss": 1.6165, + "step": 273 + }, + { + "epoch": 0.07492480175006837, + "grad_norm": 0.15770940482616425, + "learning_rate": 5e-05, + "loss": 1.727, + "step": 274 + }, + { + "epoch": 0.07519824993163796, + "grad_norm": 0.15711399912834167, + "learning_rate": 5e-05, + "loss": 1.6336, + "step": 275 + }, + { + "epoch": 0.07547169811320754, + "grad_norm": 0.16381201148033142, + "learning_rate": 5e-05, + "loss": 1.69, + "step": 276 + }, + { + "epoch": 0.07574514629477715, + "grad_norm": 0.15251821279525757, + "learning_rate": 5e-05, + "loss": 1.6241, + "step": 277 + }, + { + "epoch": 0.07601859447634673, + "grad_norm": 0.177230566740036, + "learning_rate": 5e-05, + "loss": 1.7341, + "step": 278 + }, + { + "epoch": 0.07629204265791632, + "grad_norm": 0.16340211033821106, + "learning_rate": 5e-05, + "loss": 1.726, + "step": 279 + }, + { + "epoch": 0.07656549083948591, + "grad_norm": 0.15226496756076813, + "learning_rate": 5e-05, + "loss": 1.7703, + "step": 280 + }, + { + "epoch": 0.07683893902105551, + "grad_norm": 0.1857025921344757, + "learning_rate": 5e-05, + "loss": 1.6637, + "step": 281 + }, + { + "epoch": 0.0771123872026251, + "grad_norm": 0.1753539890050888, + "learning_rate": 5e-05, + "loss": 1.8154, + "step": 282 + }, + { + "epoch": 0.07738583538419469, + "grad_norm": 0.16672004759311676, + "learning_rate": 5e-05, + "loss": 1.6617, + "step": 283 + }, + { + "epoch": 0.07765928356576429, + "grad_norm": 0.186684250831604, + "learning_rate": 5e-05, + "loss": 1.6501, + "step": 284 + }, + { + "epoch": 0.07793273174733388, + "grad_norm": 0.18949177861213684, + "learning_rate": 5e-05, + "loss": 1.7384, + "step": 285 + }, + { + "epoch": 0.07820617992890347, + "grad_norm": 0.1607401967048645, + "learning_rate": 5e-05, + "loss": 1.719, + "step": 286 + }, + { + "epoch": 0.07847962811047307, + "grad_norm": 0.15836010873317719, + "learning_rate": 5e-05, + "loss": 1.6533, + "step": 287 + }, + { + "epoch": 0.07875307629204266, + "grad_norm": 0.17572252452373505, + "learning_rate": 5e-05, + "loss": 1.7285, + "step": 288 + }, + { + "epoch": 0.07902652447361225, + "grad_norm": 0.159013569355011, + "learning_rate": 5e-05, + "loss": 1.704, + "step": 289 + }, + { + "epoch": 0.07929997265518185, + "grad_norm": 0.16590869426727295, + "learning_rate": 5e-05, + "loss": 1.7192, + "step": 290 + }, + { + "epoch": 0.07957342083675144, + "grad_norm": 0.1848197877407074, + "learning_rate": 5e-05, + "loss": 1.7157, + "step": 291 + }, + { + "epoch": 0.07984686901832103, + "grad_norm": 0.1681578904390335, + "learning_rate": 5e-05, + "loss": 1.729, + "step": 292 + }, + { + "epoch": 0.08012031719989061, + "grad_norm": 0.18026727437973022, + "learning_rate": 5e-05, + "loss": 1.6524, + "step": 293 + }, + { + "epoch": 0.08039376538146022, + "grad_norm": 0.18624775111675262, + "learning_rate": 5e-05, + "loss": 1.7211, + "step": 294 + }, + { + "epoch": 0.0806672135630298, + "grad_norm": 0.17745369672775269, + "learning_rate": 5e-05, + "loss": 1.7268, + "step": 295 + }, + { + "epoch": 0.0809406617445994, + "grad_norm": 0.196435809135437, + "learning_rate": 5e-05, + "loss": 1.6314, + "step": 296 + }, + { + "epoch": 0.081214109926169, + "grad_norm": 0.15979966521263123, + "learning_rate": 5e-05, + "loss": 1.73, + "step": 297 + }, + { + "epoch": 0.08148755810773858, + "grad_norm": 0.16943073272705078, + "learning_rate": 5e-05, + "loss": 1.7083, + "step": 298 + }, + { + "epoch": 0.08176100628930817, + "grad_norm": 0.18523457646369934, + "learning_rate": 5e-05, + "loss": 1.7286, + "step": 299 + }, + { + "epoch": 0.08203445447087777, + "grad_norm": 0.15192170441150665, + "learning_rate": 5e-05, + "loss": 1.7399, + "step": 300 + }, + { + "epoch": 0.08230790265244736, + "grad_norm": 0.18083369731903076, + "learning_rate": 5e-05, + "loss": 1.7006, + "step": 301 + }, + { + "epoch": 0.08258135083401695, + "grad_norm": 0.16836769878864288, + "learning_rate": 5e-05, + "loss": 1.8089, + "step": 302 + }, + { + "epoch": 0.08285479901558655, + "grad_norm": 0.15962150692939758, + "learning_rate": 5e-05, + "loss": 1.7109, + "step": 303 + }, + { + "epoch": 0.08312824719715614, + "grad_norm": 0.18868067860603333, + "learning_rate": 5e-05, + "loss": 1.7697, + "step": 304 + }, + { + "epoch": 0.08340169537872573, + "grad_norm": 0.16335056722164154, + "learning_rate": 5e-05, + "loss": 1.7866, + "step": 305 + }, + { + "epoch": 0.08367514356029532, + "grad_norm": 0.23511578142642975, + "learning_rate": 5e-05, + "loss": 1.8343, + "step": 306 + }, + { + "epoch": 0.08394859174186492, + "grad_norm": 0.15618833899497986, + "learning_rate": 5e-05, + "loss": 1.633, + "step": 307 + }, + { + "epoch": 0.08422203992343451, + "grad_norm": 0.16993245482444763, + "learning_rate": 5e-05, + "loss": 1.7104, + "step": 308 + }, + { + "epoch": 0.0844954881050041, + "grad_norm": 0.18835750222206116, + "learning_rate": 5e-05, + "loss": 1.7113, + "step": 309 + }, + { + "epoch": 0.0847689362865737, + "grad_norm": 0.16639582812786102, + "learning_rate": 5e-05, + "loss": 1.816, + "step": 310 + }, + { + "epoch": 0.08504238446814329, + "grad_norm": 0.15975739061832428, + "learning_rate": 5e-05, + "loss": 1.5883, + "step": 311 + }, + { + "epoch": 0.08531583264971287, + "grad_norm": 0.20657338201999664, + "learning_rate": 5e-05, + "loss": 1.6521, + "step": 312 + }, + { + "epoch": 0.08558928083128248, + "grad_norm": 0.15396147966384888, + "learning_rate": 5e-05, + "loss": 1.7602, + "step": 313 + }, + { + "epoch": 0.08586272901285207, + "grad_norm": 0.18306365609169006, + "learning_rate": 5e-05, + "loss": 1.6811, + "step": 314 + }, + { + "epoch": 0.08613617719442165, + "grad_norm": 0.1487811654806137, + "learning_rate": 5e-05, + "loss": 1.5397, + "step": 315 + }, + { + "epoch": 0.08640962537599126, + "grad_norm": 0.14201436936855316, + "learning_rate": 5e-05, + "loss": 1.5702, + "step": 316 + }, + { + "epoch": 0.08668307355756084, + "grad_norm": 0.17401176691055298, + "learning_rate": 5e-05, + "loss": 1.6514, + "step": 317 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 0.16217194497585297, + "learning_rate": 5e-05, + "loss": 1.7284, + "step": 318 + }, + { + "epoch": 0.08722996992070002, + "grad_norm": 0.1471739411354065, + "learning_rate": 5e-05, + "loss": 1.6665, + "step": 319 + }, + { + "epoch": 0.08750341810226962, + "grad_norm": 0.1779485046863556, + "learning_rate": 5e-05, + "loss": 1.7358, + "step": 320 + }, + { + "epoch": 0.08777686628383921, + "grad_norm": 0.15634022653102875, + "learning_rate": 5e-05, + "loss": 1.6477, + "step": 321 + }, + { + "epoch": 0.0880503144654088, + "grad_norm": 0.1602734923362732, + "learning_rate": 5e-05, + "loss": 1.7381, + "step": 322 + }, + { + "epoch": 0.0883237626469784, + "grad_norm": 0.1770986020565033, + "learning_rate": 5e-05, + "loss": 1.7719, + "step": 323 + }, + { + "epoch": 0.08859721082854799, + "grad_norm": 0.1577000766992569, + "learning_rate": 5e-05, + "loss": 1.7571, + "step": 324 + }, + { + "epoch": 0.08887065901011758, + "grad_norm": 0.158709317445755, + "learning_rate": 5e-05, + "loss": 1.7026, + "step": 325 + }, + { + "epoch": 0.08914410719168718, + "grad_norm": 0.1561996042728424, + "learning_rate": 5e-05, + "loss": 1.602, + "step": 326 + }, + { + "epoch": 0.08941755537325677, + "grad_norm": 0.14826953411102295, + "learning_rate": 5e-05, + "loss": 1.7566, + "step": 327 + }, + { + "epoch": 0.08969100355482636, + "grad_norm": 0.1718713790178299, + "learning_rate": 5e-05, + "loss": 1.7978, + "step": 328 + }, + { + "epoch": 0.08996445173639596, + "grad_norm": 0.1710497885942459, + "learning_rate": 5e-05, + "loss": 1.7204, + "step": 329 + }, + { + "epoch": 0.09023789991796555, + "grad_norm": 0.1561228483915329, + "learning_rate": 5e-05, + "loss": 1.7124, + "step": 330 + }, + { + "epoch": 0.09051134809953514, + "grad_norm": 0.18225868046283722, + "learning_rate": 5e-05, + "loss": 1.7286, + "step": 331 + }, + { + "epoch": 0.09078479628110472, + "grad_norm": 0.1495756208896637, + "learning_rate": 5e-05, + "loss": 1.6065, + "step": 332 + }, + { + "epoch": 0.09105824446267433, + "grad_norm": 0.15466246008872986, + "learning_rate": 5e-05, + "loss": 1.7078, + "step": 333 + }, + { + "epoch": 0.09133169264424391, + "grad_norm": 0.15771101415157318, + "learning_rate": 5e-05, + "loss": 1.691, + "step": 334 + }, + { + "epoch": 0.0916051408258135, + "grad_norm": 0.1566590517759323, + "learning_rate": 5e-05, + "loss": 1.8164, + "step": 335 + }, + { + "epoch": 0.0918785890073831, + "grad_norm": 0.1567210704088211, + "learning_rate": 5e-05, + "loss": 1.6887, + "step": 336 + }, + { + "epoch": 0.09215203718895269, + "grad_norm": 0.1516759693622589, + "learning_rate": 5e-05, + "loss": 1.5887, + "step": 337 + }, + { + "epoch": 0.09242548537052228, + "grad_norm": 0.15259622037410736, + "learning_rate": 5e-05, + "loss": 1.7337, + "step": 338 + }, + { + "epoch": 0.09269893355209188, + "grad_norm": 0.14548294246196747, + "learning_rate": 5e-05, + "loss": 1.6738, + "step": 339 + }, + { + "epoch": 0.09297238173366147, + "grad_norm": 0.14872747659683228, + "learning_rate": 5e-05, + "loss": 1.7034, + "step": 340 + }, + { + "epoch": 0.09324582991523106, + "grad_norm": 0.1520431935787201, + "learning_rate": 5e-05, + "loss": 1.6948, + "step": 341 + }, + { + "epoch": 0.09351927809680066, + "grad_norm": 0.15893866121768951, + "learning_rate": 5e-05, + "loss": 1.7664, + "step": 342 + }, + { + "epoch": 0.09379272627837025, + "grad_norm": 0.1537715345621109, + "learning_rate": 5e-05, + "loss": 1.6939, + "step": 343 + }, + { + "epoch": 0.09406617445993984, + "grad_norm": 0.15221412479877472, + "learning_rate": 5e-05, + "loss": 1.7216, + "step": 344 + }, + { + "epoch": 0.09433962264150944, + "grad_norm": 0.16553561389446259, + "learning_rate": 5e-05, + "loss": 1.7498, + "step": 345 + }, + { + "epoch": 0.09461307082307903, + "grad_norm": 0.148160919547081, + "learning_rate": 5e-05, + "loss": 1.6938, + "step": 346 + }, + { + "epoch": 0.09488651900464862, + "grad_norm": 0.14013923704624176, + "learning_rate": 5e-05, + "loss": 1.5675, + "step": 347 + }, + { + "epoch": 0.0951599671862182, + "grad_norm": 0.14623858034610748, + "learning_rate": 5e-05, + "loss": 1.6729, + "step": 348 + }, + { + "epoch": 0.09543341536778781, + "grad_norm": 0.16254989802837372, + "learning_rate": 5e-05, + "loss": 1.8071, + "step": 349 + }, + { + "epoch": 0.0957068635493574, + "grad_norm": 0.1588207483291626, + "learning_rate": 5e-05, + "loss": 1.6073, + "step": 350 + }, + { + "epoch": 0.09598031173092698, + "grad_norm": 0.15622512996196747, + "learning_rate": 5e-05, + "loss": 1.7482, + "step": 351 + }, + { + "epoch": 0.09625375991249659, + "grad_norm": 0.15577097237110138, + "learning_rate": 5e-05, + "loss": 1.7231, + "step": 352 + }, + { + "epoch": 0.09652720809406617, + "grad_norm": 0.1480865478515625, + "learning_rate": 5e-05, + "loss": 1.6877, + "step": 353 + }, + { + "epoch": 0.09680065627563576, + "grad_norm": 0.15626661479473114, + "learning_rate": 5e-05, + "loss": 1.7725, + "step": 354 + }, + { + "epoch": 0.09707410445720537, + "grad_norm": 0.17652183771133423, + "learning_rate": 5e-05, + "loss": 1.6995, + "step": 355 + }, + { + "epoch": 0.09734755263877495, + "grad_norm": 0.15815986692905426, + "learning_rate": 5e-05, + "loss": 1.7932, + "step": 356 + }, + { + "epoch": 0.09762100082034454, + "grad_norm": 0.1668958067893982, + "learning_rate": 5e-05, + "loss": 1.7085, + "step": 357 + }, + { + "epoch": 0.09789444900191414, + "grad_norm": 0.15382306277751923, + "learning_rate": 5e-05, + "loss": 1.776, + "step": 358 + }, + { + "epoch": 0.09816789718348373, + "grad_norm": 0.1443256139755249, + "learning_rate": 5e-05, + "loss": 1.6132, + "step": 359 + }, + { + "epoch": 0.09844134536505332, + "grad_norm": 0.14953619241714478, + "learning_rate": 5e-05, + "loss": 1.6197, + "step": 360 + }, + { + "epoch": 0.09871479354662291, + "grad_norm": 0.16327831149101257, + "learning_rate": 5e-05, + "loss": 1.6632, + "step": 361 + }, + { + "epoch": 0.09898824172819251, + "grad_norm": 0.16656070947647095, + "learning_rate": 5e-05, + "loss": 1.732, + "step": 362 + }, + { + "epoch": 0.0992616899097621, + "grad_norm": 0.14592863619327545, + "learning_rate": 5e-05, + "loss": 1.6502, + "step": 363 + }, + { + "epoch": 0.09953513809133169, + "grad_norm": 0.152685284614563, + "learning_rate": 5e-05, + "loss": 1.7162, + "step": 364 + }, + { + "epoch": 0.09980858627290129, + "grad_norm": 0.14510676264762878, + "learning_rate": 5e-05, + "loss": 1.5949, + "step": 365 + }, + { + "epoch": 0.10008203445447088, + "grad_norm": 0.16652299463748932, + "learning_rate": 5e-05, + "loss": 1.7656, + "step": 366 + }, + { + "epoch": 0.10035548263604047, + "grad_norm": 0.1451842486858368, + "learning_rate": 5e-05, + "loss": 1.6449, + "step": 367 + }, + { + "epoch": 0.10062893081761007, + "grad_norm": 0.18037185072898865, + "learning_rate": 5e-05, + "loss": 1.7886, + "step": 368 + }, + { + "epoch": 0.10090237899917966, + "grad_norm": 0.14976035058498383, + "learning_rate": 5e-05, + "loss": 1.5979, + "step": 369 + }, + { + "epoch": 0.10117582718074924, + "grad_norm": 0.15457609295845032, + "learning_rate": 5e-05, + "loss": 1.6899, + "step": 370 + }, + { + "epoch": 0.10144927536231885, + "grad_norm": 0.19134309887886047, + "learning_rate": 5e-05, + "loss": 1.7097, + "step": 371 + }, + { + "epoch": 0.10172272354388844, + "grad_norm": 0.14750947058200836, + "learning_rate": 5e-05, + "loss": 1.6993, + "step": 372 + }, + { + "epoch": 0.10199617172545802, + "grad_norm": 0.1667216420173645, + "learning_rate": 5e-05, + "loss": 1.7053, + "step": 373 + }, + { + "epoch": 0.10226961990702761, + "grad_norm": 0.1675473004579544, + "learning_rate": 5e-05, + "loss": 1.6688, + "step": 374 + }, + { + "epoch": 0.10254306808859721, + "grad_norm": 0.14588837325572968, + "learning_rate": 5e-05, + "loss": 1.6963, + "step": 375 + }, + { + "epoch": 0.1028165162701668, + "grad_norm": 0.17661041021347046, + "learning_rate": 5e-05, + "loss": 1.6563, + "step": 376 + }, + { + "epoch": 0.10308996445173639, + "grad_norm": 0.16083909571170807, + "learning_rate": 5e-05, + "loss": 1.6893, + "step": 377 + }, + { + "epoch": 0.10336341263330599, + "grad_norm": 0.14849358797073364, + "learning_rate": 5e-05, + "loss": 1.6621, + "step": 378 + }, + { + "epoch": 0.10363686081487558, + "grad_norm": 0.15476635098457336, + "learning_rate": 5e-05, + "loss": 1.7159, + "step": 379 + }, + { + "epoch": 0.10391030899644517, + "grad_norm": 0.16712796688079834, + "learning_rate": 5e-05, + "loss": 1.728, + "step": 380 + }, + { + "epoch": 0.10418375717801477, + "grad_norm": 0.15185219049453735, + "learning_rate": 5e-05, + "loss": 1.674, + "step": 381 + }, + { + "epoch": 0.10445720535958436, + "grad_norm": 0.16834664344787598, + "learning_rate": 5e-05, + "loss": 1.7249, + "step": 382 + }, + { + "epoch": 0.10473065354115395, + "grad_norm": 0.15080732107162476, + "learning_rate": 5e-05, + "loss": 1.7108, + "step": 383 + }, + { + "epoch": 0.10500410172272355, + "grad_norm": 0.17864330112934113, + "learning_rate": 5e-05, + "loss": 1.743, + "step": 384 + }, + { + "epoch": 0.10527754990429314, + "grad_norm": 0.1494104564189911, + "learning_rate": 5e-05, + "loss": 1.7065, + "step": 385 + }, + { + "epoch": 0.10555099808586273, + "grad_norm": 0.1590685397386551, + "learning_rate": 5e-05, + "loss": 1.6841, + "step": 386 + }, + { + "epoch": 0.10582444626743231, + "grad_norm": 0.15133905410766602, + "learning_rate": 5e-05, + "loss": 1.7493, + "step": 387 + }, + { + "epoch": 0.10609789444900192, + "grad_norm": 0.16563621163368225, + "learning_rate": 5e-05, + "loss": 1.7181, + "step": 388 + }, + { + "epoch": 0.1063713426305715, + "grad_norm": 0.14327523112297058, + "learning_rate": 5e-05, + "loss": 1.6653, + "step": 389 + }, + { + "epoch": 0.1066447908121411, + "grad_norm": 0.14864635467529297, + "learning_rate": 5e-05, + "loss": 1.5826, + "step": 390 + }, + { + "epoch": 0.1069182389937107, + "grad_norm": 0.15405511856079102, + "learning_rate": 5e-05, + "loss": 1.7333, + "step": 391 + }, + { + "epoch": 0.10719168717528028, + "grad_norm": 0.1498226821422577, + "learning_rate": 5e-05, + "loss": 1.7149, + "step": 392 + }, + { + "epoch": 0.10746513535684987, + "grad_norm": 0.14723950624465942, + "learning_rate": 5e-05, + "loss": 1.7088, + "step": 393 + }, + { + "epoch": 0.10773858353841947, + "grad_norm": 0.15685135126113892, + "learning_rate": 5e-05, + "loss": 1.717, + "step": 394 + }, + { + "epoch": 0.10801203171998906, + "grad_norm": 0.16080626845359802, + "learning_rate": 5e-05, + "loss": 1.7603, + "step": 395 + }, + { + "epoch": 0.10828547990155865, + "grad_norm": 0.17556969821453094, + "learning_rate": 5e-05, + "loss": 1.7453, + "step": 396 + }, + { + "epoch": 0.10855892808312825, + "grad_norm": 0.16690774261951447, + "learning_rate": 5e-05, + "loss": 1.7831, + "step": 397 + }, + { + "epoch": 0.10883237626469784, + "grad_norm": 0.16693341732025146, + "learning_rate": 5e-05, + "loss": 1.7015, + "step": 398 + }, + { + "epoch": 0.10910582444626743, + "grad_norm": 0.15416108071804047, + "learning_rate": 5e-05, + "loss": 1.6887, + "step": 399 + }, + { + "epoch": 0.10937927262783702, + "grad_norm": 0.15936125814914703, + "learning_rate": 5e-05, + "loss": 1.6774, + "step": 400 + }, + { + "epoch": 0.10965272080940662, + "grad_norm": 0.15093334019184113, + "learning_rate": 5e-05, + "loss": 1.6507, + "step": 401 + }, + { + "epoch": 0.10992616899097621, + "grad_norm": 0.1748283952474594, + "learning_rate": 5e-05, + "loss": 1.6893, + "step": 402 + }, + { + "epoch": 0.1101996171725458, + "grad_norm": 0.17557309567928314, + "learning_rate": 5e-05, + "loss": 1.7279, + "step": 403 + }, + { + "epoch": 0.1104730653541154, + "grad_norm": 0.15320509672164917, + "learning_rate": 5e-05, + "loss": 1.6621, + "step": 404 + }, + { + "epoch": 0.11074651353568499, + "grad_norm": 0.1499612033367157, + "learning_rate": 5e-05, + "loss": 1.7715, + "step": 405 + }, + { + "epoch": 0.11101996171725458, + "grad_norm": 0.16688010096549988, + "learning_rate": 5e-05, + "loss": 1.7823, + "step": 406 + }, + { + "epoch": 0.11129340989882418, + "grad_norm": 0.157332643866539, + "learning_rate": 5e-05, + "loss": 1.6124, + "step": 407 + }, + { + "epoch": 0.11156685808039377, + "grad_norm": 0.14416515827178955, + "learning_rate": 5e-05, + "loss": 1.6094, + "step": 408 + }, + { + "epoch": 0.11184030626196335, + "grad_norm": 0.1920742392539978, + "learning_rate": 5e-05, + "loss": 1.7436, + "step": 409 + }, + { + "epoch": 0.11211375444353296, + "grad_norm": 0.1514004021883011, + "learning_rate": 5e-05, + "loss": 1.7049, + "step": 410 + }, + { + "epoch": 0.11238720262510254, + "grad_norm": 0.16829192638397217, + "learning_rate": 5e-05, + "loss": 1.7128, + "step": 411 + }, + { + "epoch": 0.11266065080667213, + "grad_norm": 0.16184952855110168, + "learning_rate": 5e-05, + "loss": 1.697, + "step": 412 + }, + { + "epoch": 0.11293409898824174, + "grad_norm": 0.15670904517173767, + "learning_rate": 5e-05, + "loss": 1.6045, + "step": 413 + }, + { + "epoch": 0.11320754716981132, + "grad_norm": 0.17832054197788239, + "learning_rate": 5e-05, + "loss": 1.7311, + "step": 414 + }, + { + "epoch": 0.11348099535138091, + "grad_norm": 0.15141215920448303, + "learning_rate": 5e-05, + "loss": 1.7269, + "step": 415 + }, + { + "epoch": 0.1137544435329505, + "grad_norm": 0.16881325840950012, + "learning_rate": 5e-05, + "loss": 1.753, + "step": 416 + }, + { + "epoch": 0.1140278917145201, + "grad_norm": 0.14302976429462433, + "learning_rate": 5e-05, + "loss": 1.6762, + "step": 417 + }, + { + "epoch": 0.11430133989608969, + "grad_norm": 0.15213017165660858, + "learning_rate": 5e-05, + "loss": 1.6379, + "step": 418 + }, + { + "epoch": 0.11457478807765928, + "grad_norm": 0.14254313707351685, + "learning_rate": 5e-05, + "loss": 1.702, + "step": 419 + }, + { + "epoch": 0.11484823625922888, + "grad_norm": 0.14474402368068695, + "learning_rate": 5e-05, + "loss": 1.6177, + "step": 420 + }, + { + "epoch": 0.11512168444079847, + "grad_norm": 0.1474299430847168, + "learning_rate": 5e-05, + "loss": 1.6375, + "step": 421 + }, + { + "epoch": 0.11539513262236806, + "grad_norm": 0.15810638666152954, + "learning_rate": 5e-05, + "loss": 1.7308, + "step": 422 + }, + { + "epoch": 0.11566858080393766, + "grad_norm": 0.15957583487033844, + "learning_rate": 5e-05, + "loss": 1.662, + "step": 423 + }, + { + "epoch": 0.11594202898550725, + "grad_norm": 0.1710730344057083, + "learning_rate": 5e-05, + "loss": 1.8709, + "step": 424 + }, + { + "epoch": 0.11621547716707684, + "grad_norm": 0.15612466633319855, + "learning_rate": 5e-05, + "loss": 1.6212, + "step": 425 + }, + { + "epoch": 0.11648892534864644, + "grad_norm": 0.1628880500793457, + "learning_rate": 5e-05, + "loss": 1.8161, + "step": 426 + }, + { + "epoch": 0.11676237353021603, + "grad_norm": 0.15160787105560303, + "learning_rate": 5e-05, + "loss": 1.738, + "step": 427 + }, + { + "epoch": 0.11703582171178561, + "grad_norm": 0.15054672956466675, + "learning_rate": 5e-05, + "loss": 1.6689, + "step": 428 + }, + { + "epoch": 0.1173092698933552, + "grad_norm": 0.16296370327472687, + "learning_rate": 5e-05, + "loss": 1.652, + "step": 429 + }, + { + "epoch": 0.1175827180749248, + "grad_norm": 0.14984969794750214, + "learning_rate": 5e-05, + "loss": 1.678, + "step": 430 + }, + { + "epoch": 0.1178561662564944, + "grad_norm": 0.1585860401391983, + "learning_rate": 5e-05, + "loss": 1.5993, + "step": 431 + }, + { + "epoch": 0.11812961443806398, + "grad_norm": 0.16059096157550812, + "learning_rate": 5e-05, + "loss": 1.6874, + "step": 432 + }, + { + "epoch": 0.11840306261963358, + "grad_norm": 0.16525696218013763, + "learning_rate": 5e-05, + "loss": 1.6795, + "step": 433 + }, + { + "epoch": 0.11867651080120317, + "grad_norm": 0.18887005746364594, + "learning_rate": 5e-05, + "loss": 1.6089, + "step": 434 + }, + { + "epoch": 0.11894995898277276, + "grad_norm": 0.15065988898277283, + "learning_rate": 5e-05, + "loss": 1.6332, + "step": 435 + }, + { + "epoch": 0.11922340716434236, + "grad_norm": 0.19893988966941833, + "learning_rate": 5e-05, + "loss": 1.721, + "step": 436 + }, + { + "epoch": 0.11949685534591195, + "grad_norm": 0.14734850823879242, + "learning_rate": 5e-05, + "loss": 1.6332, + "step": 437 + }, + { + "epoch": 0.11977030352748154, + "grad_norm": 0.18723982572555542, + "learning_rate": 5e-05, + "loss": 1.777, + "step": 438 + }, + { + "epoch": 0.12004375170905114, + "grad_norm": 0.1655622124671936, + "learning_rate": 5e-05, + "loss": 1.835, + "step": 439 + }, + { + "epoch": 0.12031719989062073, + "grad_norm": 0.1495118886232376, + "learning_rate": 5e-05, + "loss": 1.5813, + "step": 440 + }, + { + "epoch": 0.12059064807219032, + "grad_norm": 0.1608804315328598, + "learning_rate": 5e-05, + "loss": 1.6728, + "step": 441 + }, + { + "epoch": 0.1208640962537599, + "grad_norm": 0.1557897925376892, + "learning_rate": 5e-05, + "loss": 1.7919, + "step": 442 + }, + { + "epoch": 0.12113754443532951, + "grad_norm": 0.14694997668266296, + "learning_rate": 5e-05, + "loss": 1.6916, + "step": 443 + }, + { + "epoch": 0.1214109926168991, + "grad_norm": 0.15379074215888977, + "learning_rate": 5e-05, + "loss": 1.7453, + "step": 444 + }, + { + "epoch": 0.12168444079846868, + "grad_norm": 0.1691819578409195, + "learning_rate": 5e-05, + "loss": 1.7035, + "step": 445 + }, + { + "epoch": 0.12195788898003829, + "grad_norm": 0.16844218969345093, + "learning_rate": 5e-05, + "loss": 1.6465, + "step": 446 + }, + { + "epoch": 0.12223133716160788, + "grad_norm": 0.1534705013036728, + "learning_rate": 5e-05, + "loss": 1.7147, + "step": 447 + }, + { + "epoch": 0.12250478534317746, + "grad_norm": 0.16286121308803558, + "learning_rate": 5e-05, + "loss": 1.795, + "step": 448 + }, + { + "epoch": 0.12277823352474707, + "grad_norm": 0.1552685797214508, + "learning_rate": 5e-05, + "loss": 1.7379, + "step": 449 + }, + { + "epoch": 0.12305168170631665, + "grad_norm": 0.15220941603183746, + "learning_rate": 5e-05, + "loss": 1.5912, + "step": 450 + }, + { + "epoch": 0.12332512988788624, + "grad_norm": 0.15760937333106995, + "learning_rate": 5e-05, + "loss": 1.723, + "step": 451 + }, + { + "epoch": 0.12359857806945584, + "grad_norm": 0.14814068377017975, + "learning_rate": 5e-05, + "loss": 1.6482, + "step": 452 + }, + { + "epoch": 0.12387202625102543, + "grad_norm": 0.15846781432628632, + "learning_rate": 5e-05, + "loss": 1.6243, + "step": 453 + }, + { + "epoch": 0.12414547443259502, + "grad_norm": 0.14988292753696442, + "learning_rate": 5e-05, + "loss": 1.6261, + "step": 454 + }, + { + "epoch": 0.12441892261416461, + "grad_norm": 0.16977062821388245, + "learning_rate": 5e-05, + "loss": 1.6638, + "step": 455 + }, + { + "epoch": 0.12469237079573421, + "grad_norm": 0.16247521340847015, + "learning_rate": 5e-05, + "loss": 1.6939, + "step": 456 + }, + { + "epoch": 0.1249658189773038, + "grad_norm": 0.15425889194011688, + "learning_rate": 5e-05, + "loss": 1.6645, + "step": 457 + }, + { + "epoch": 0.1252392671588734, + "grad_norm": 0.18221181631088257, + "learning_rate": 5e-05, + "loss": 1.7, + "step": 458 + }, + { + "epoch": 0.12551271534044298, + "grad_norm": 0.14771276712417603, + "learning_rate": 5e-05, + "loss": 1.7018, + "step": 459 + }, + { + "epoch": 0.12578616352201258, + "grad_norm": 0.175098717212677, + "learning_rate": 5e-05, + "loss": 1.78, + "step": 460 + }, + { + "epoch": 0.12605961170358218, + "grad_norm": 0.16985467076301575, + "learning_rate": 5e-05, + "loss": 1.6305, + "step": 461 + }, + { + "epoch": 0.12633305988515175, + "grad_norm": 0.1552826166152954, + "learning_rate": 5e-05, + "loss": 1.6986, + "step": 462 + }, + { + "epoch": 0.12660650806672136, + "grad_norm": 0.17271657288074493, + "learning_rate": 5e-05, + "loss": 1.7182, + "step": 463 + }, + { + "epoch": 0.12687995624829096, + "grad_norm": 0.1511317491531372, + "learning_rate": 5e-05, + "loss": 1.6274, + "step": 464 + }, + { + "epoch": 0.12715340442986053, + "grad_norm": 0.1526670604944229, + "learning_rate": 5e-05, + "loss": 1.6481, + "step": 465 + }, + { + "epoch": 0.12742685261143014, + "grad_norm": 0.15212751924991608, + "learning_rate": 5e-05, + "loss": 1.7155, + "step": 466 + }, + { + "epoch": 0.12770030079299974, + "grad_norm": 0.16328099370002747, + "learning_rate": 5e-05, + "loss": 1.7052, + "step": 467 + }, + { + "epoch": 0.1279737489745693, + "grad_norm": 0.14235898852348328, + "learning_rate": 5e-05, + "loss": 1.6514, + "step": 468 + }, + { + "epoch": 0.12824719715613891, + "grad_norm": 0.17346210777759552, + "learning_rate": 5e-05, + "loss": 1.6711, + "step": 469 + }, + { + "epoch": 0.12852064533770852, + "grad_norm": 0.17061196267604828, + "learning_rate": 5e-05, + "loss": 1.6815, + "step": 470 + }, + { + "epoch": 0.1287940935192781, + "grad_norm": 0.16796523332595825, + "learning_rate": 5e-05, + "loss": 1.6304, + "step": 471 + }, + { + "epoch": 0.1290675417008477, + "grad_norm": 0.17930328845977783, + "learning_rate": 5e-05, + "loss": 1.7399, + "step": 472 + }, + { + "epoch": 0.1293409898824173, + "grad_norm": 0.16334758698940277, + "learning_rate": 5e-05, + "loss": 1.6031, + "step": 473 + }, + { + "epoch": 0.12961443806398687, + "grad_norm": 0.18290555477142334, + "learning_rate": 5e-05, + "loss": 1.7326, + "step": 474 + }, + { + "epoch": 0.12988788624555647, + "grad_norm": 0.15675663948059082, + "learning_rate": 5e-05, + "loss": 1.6692, + "step": 475 + }, + { + "epoch": 0.13016133442712605, + "grad_norm": 0.1945638507604599, + "learning_rate": 5e-05, + "loss": 1.6891, + "step": 476 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 0.19286639988422394, + "learning_rate": 5e-05, + "loss": 1.7076, + "step": 477 + }, + { + "epoch": 0.13070823079026525, + "grad_norm": 0.17249007523059845, + "learning_rate": 5e-05, + "loss": 1.706, + "step": 478 + }, + { + "epoch": 0.13098167897183483, + "grad_norm": 0.1968424767255783, + "learning_rate": 5e-05, + "loss": 1.6126, + "step": 479 + }, + { + "epoch": 0.13125512715340443, + "grad_norm": 0.19258543848991394, + "learning_rate": 5e-05, + "loss": 1.7572, + "step": 480 + }, + { + "epoch": 0.13152857533497403, + "grad_norm": 0.16817843914031982, + "learning_rate": 5e-05, + "loss": 1.6691, + "step": 481 + }, + { + "epoch": 0.1318020235165436, + "grad_norm": 0.21118032932281494, + "learning_rate": 5e-05, + "loss": 1.6716, + "step": 482 + }, + { + "epoch": 0.1320754716981132, + "grad_norm": 0.15985234081745148, + "learning_rate": 5e-05, + "loss": 1.7348, + "step": 483 + }, + { + "epoch": 0.1323489198796828, + "grad_norm": 0.20505684614181519, + "learning_rate": 5e-05, + "loss": 1.7032, + "step": 484 + }, + { + "epoch": 0.13262236806125238, + "grad_norm": 0.18702927231788635, + "learning_rate": 5e-05, + "loss": 1.6254, + "step": 485 + }, + { + "epoch": 0.13289581624282198, + "grad_norm": 0.17232342064380646, + "learning_rate": 5e-05, + "loss": 1.7039, + "step": 486 + }, + { + "epoch": 0.1331692644243916, + "grad_norm": 0.20174479484558105, + "learning_rate": 5e-05, + "loss": 1.7167, + "step": 487 + }, + { + "epoch": 0.13344271260596116, + "grad_norm": 0.15040408074855804, + "learning_rate": 5e-05, + "loss": 1.7868, + "step": 488 + }, + { + "epoch": 0.13371616078753076, + "grad_norm": 0.2027222216129303, + "learning_rate": 5e-05, + "loss": 1.7302, + "step": 489 + }, + { + "epoch": 0.13398960896910037, + "grad_norm": 0.1705685704946518, + "learning_rate": 5e-05, + "loss": 1.7658, + "step": 490 + }, + { + "epoch": 0.13426305715066994, + "grad_norm": 0.15521638095378876, + "learning_rate": 5e-05, + "loss": 1.7048, + "step": 491 + }, + { + "epoch": 0.13453650533223954, + "grad_norm": 0.16530796885490417, + "learning_rate": 5e-05, + "loss": 1.7096, + "step": 492 + }, + { + "epoch": 0.13480995351380914, + "grad_norm": 0.1526857316493988, + "learning_rate": 5e-05, + "loss": 1.6364, + "step": 493 + }, + { + "epoch": 0.13508340169537872, + "grad_norm": 0.15197399258613586, + "learning_rate": 5e-05, + "loss": 1.7544, + "step": 494 + }, + { + "epoch": 0.13535684987694832, + "grad_norm": 0.17518053948879242, + "learning_rate": 5e-05, + "loss": 1.6008, + "step": 495 + }, + { + "epoch": 0.13563029805851792, + "grad_norm": 0.18786345422267914, + "learning_rate": 5e-05, + "loss": 1.6955, + "step": 496 + }, + { + "epoch": 0.1359037462400875, + "grad_norm": 0.16572856903076172, + "learning_rate": 5e-05, + "loss": 1.8124, + "step": 497 + }, + { + "epoch": 0.1361771944216571, + "grad_norm": 0.20353186130523682, + "learning_rate": 5e-05, + "loss": 1.6766, + "step": 498 + }, + { + "epoch": 0.1364506426032267, + "grad_norm": 0.15591098368167877, + "learning_rate": 5e-05, + "loss": 1.6693, + "step": 499 + }, + { + "epoch": 0.13672409078479628, + "grad_norm": 0.17146877944469452, + "learning_rate": 5e-05, + "loss": 1.5863, + "step": 500 + }, + { + "epoch": 0.13699753896636588, + "grad_norm": 0.1932230293750763, + "learning_rate": 5e-05, + "loss": 1.6442, + "step": 501 + }, + { + "epoch": 0.13727098714793545, + "grad_norm": 0.14423272013664246, + "learning_rate": 5e-05, + "loss": 1.6168, + "step": 502 + }, + { + "epoch": 0.13754443532950505, + "grad_norm": 0.15283560752868652, + "learning_rate": 5e-05, + "loss": 1.661, + "step": 503 + }, + { + "epoch": 0.13781788351107466, + "grad_norm": 0.19705916941165924, + "learning_rate": 5e-05, + "loss": 1.7297, + "step": 504 + }, + { + "epoch": 0.13809133169264423, + "grad_norm": 0.16028250753879547, + "learning_rate": 5e-05, + "loss": 1.764, + "step": 505 + }, + { + "epoch": 0.13836477987421383, + "grad_norm": 0.18517790734767914, + "learning_rate": 5e-05, + "loss": 1.6758, + "step": 506 + }, + { + "epoch": 0.13863822805578344, + "grad_norm": 0.17298881709575653, + "learning_rate": 5e-05, + "loss": 1.6147, + "step": 507 + }, + { + "epoch": 0.138911676237353, + "grad_norm": 0.1618654429912567, + "learning_rate": 5e-05, + "loss": 1.7223, + "step": 508 + }, + { + "epoch": 0.1391851244189226, + "grad_norm": 0.17122521996498108, + "learning_rate": 5e-05, + "loss": 1.6623, + "step": 509 + }, + { + "epoch": 0.13945857260049221, + "grad_norm": 0.1496722251176834, + "learning_rate": 5e-05, + "loss": 1.6599, + "step": 510 + }, + { + "epoch": 0.1397320207820618, + "grad_norm": 0.14283980429172516, + "learning_rate": 5e-05, + "loss": 1.6108, + "step": 511 + }, + { + "epoch": 0.1400054689636314, + "grad_norm": 0.15560725331306458, + "learning_rate": 5e-05, + "loss": 1.6407, + "step": 512 + }, + { + "epoch": 0.140278917145201, + "grad_norm": 0.14533740282058716, + "learning_rate": 5e-05, + "loss": 1.6877, + "step": 513 + }, + { + "epoch": 0.14055236532677057, + "grad_norm": 0.15537096560001373, + "learning_rate": 5e-05, + "loss": 1.6337, + "step": 514 + }, + { + "epoch": 0.14082581350834017, + "grad_norm": 0.14118176698684692, + "learning_rate": 5e-05, + "loss": 1.6312, + "step": 515 + }, + { + "epoch": 0.14109926168990977, + "grad_norm": 0.14146625995635986, + "learning_rate": 5e-05, + "loss": 1.5738, + "step": 516 + }, + { + "epoch": 0.14137270987147935, + "grad_norm": 0.15033139288425446, + "learning_rate": 5e-05, + "loss": 1.7774, + "step": 517 + }, + { + "epoch": 0.14164615805304895, + "grad_norm": 0.141265869140625, + "learning_rate": 5e-05, + "loss": 1.6147, + "step": 518 + }, + { + "epoch": 0.14191960623461855, + "grad_norm": 0.14538274705410004, + "learning_rate": 5e-05, + "loss": 1.6542, + "step": 519 + }, + { + "epoch": 0.14219305441618812, + "grad_norm": 0.15939506888389587, + "learning_rate": 5e-05, + "loss": 1.6933, + "step": 520 + }, + { + "epoch": 0.14246650259775773, + "grad_norm": 0.1489834487438202, + "learning_rate": 5e-05, + "loss": 1.6392, + "step": 521 + }, + { + "epoch": 0.14273995077932733, + "grad_norm": 0.18363149464130402, + "learning_rate": 5e-05, + "loss": 1.7042, + "step": 522 + }, + { + "epoch": 0.1430133989608969, + "grad_norm": 0.16449148952960968, + "learning_rate": 5e-05, + "loss": 1.6792, + "step": 523 + }, + { + "epoch": 0.1432868471424665, + "grad_norm": 0.15585970878601074, + "learning_rate": 5e-05, + "loss": 1.6823, + "step": 524 + }, + { + "epoch": 0.1435602953240361, + "grad_norm": 0.1625368297100067, + "learning_rate": 5e-05, + "loss": 1.6709, + "step": 525 + }, + { + "epoch": 0.14383374350560568, + "grad_norm": 0.16228622198104858, + "learning_rate": 5e-05, + "loss": 1.7291, + "step": 526 + }, + { + "epoch": 0.14410719168717528, + "grad_norm": 0.16529570519924164, + "learning_rate": 5e-05, + "loss": 1.7617, + "step": 527 + }, + { + "epoch": 0.1443806398687449, + "grad_norm": 0.1438167542219162, + "learning_rate": 5e-05, + "loss": 1.6428, + "step": 528 + }, + { + "epoch": 0.14465408805031446, + "grad_norm": 0.16496515274047852, + "learning_rate": 5e-05, + "loss": 1.771, + "step": 529 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.16147810220718384, + "learning_rate": 5e-05, + "loss": 1.6306, + "step": 530 + }, + { + "epoch": 0.14520098441345364, + "grad_norm": 0.14431102573871613, + "learning_rate": 5e-05, + "loss": 1.5672, + "step": 531 + }, + { + "epoch": 0.14547443259502324, + "grad_norm": 0.1534319818019867, + "learning_rate": 5e-05, + "loss": 1.7233, + "step": 532 + }, + { + "epoch": 0.14574788077659284, + "grad_norm": 0.15924040973186493, + "learning_rate": 5e-05, + "loss": 1.6973, + "step": 533 + }, + { + "epoch": 0.14602132895816242, + "grad_norm": 0.14544963836669922, + "learning_rate": 5e-05, + "loss": 1.6773, + "step": 534 + }, + { + "epoch": 0.14629477713973202, + "grad_norm": 0.1628415733575821, + "learning_rate": 5e-05, + "loss": 1.6427, + "step": 535 + }, + { + "epoch": 0.14656822532130162, + "grad_norm": 0.1434401571750641, + "learning_rate": 5e-05, + "loss": 1.6043, + "step": 536 + }, + { + "epoch": 0.1468416735028712, + "grad_norm": 0.17323043942451477, + "learning_rate": 5e-05, + "loss": 1.7487, + "step": 537 + }, + { + "epoch": 0.1471151216844408, + "grad_norm": 0.15013372898101807, + "learning_rate": 5e-05, + "loss": 1.6936, + "step": 538 + }, + { + "epoch": 0.1473885698660104, + "grad_norm": 0.14302760362625122, + "learning_rate": 5e-05, + "loss": 1.6237, + "step": 539 + }, + { + "epoch": 0.14766201804757997, + "grad_norm": 0.16000758111476898, + "learning_rate": 5e-05, + "loss": 1.7007, + "step": 540 + }, + { + "epoch": 0.14793546622914958, + "grad_norm": 0.1614782214164734, + "learning_rate": 5e-05, + "loss": 1.794, + "step": 541 + }, + { + "epoch": 0.14820891441071918, + "grad_norm": 0.17829090356826782, + "learning_rate": 5e-05, + "loss": 1.7155, + "step": 542 + }, + { + "epoch": 0.14848236259228875, + "grad_norm": 0.16465310752391815, + "learning_rate": 5e-05, + "loss": 1.6136, + "step": 543 + }, + { + "epoch": 0.14875581077385835, + "grad_norm": 0.17050433158874512, + "learning_rate": 5e-05, + "loss": 1.6685, + "step": 544 + }, + { + "epoch": 0.14902925895542796, + "grad_norm": 0.1535949409008026, + "learning_rate": 5e-05, + "loss": 1.6712, + "step": 545 + }, + { + "epoch": 0.14930270713699753, + "grad_norm": 0.1774369478225708, + "learning_rate": 5e-05, + "loss": 1.6754, + "step": 546 + }, + { + "epoch": 0.14957615531856713, + "grad_norm": 0.14995476603507996, + "learning_rate": 5e-05, + "loss": 1.6214, + "step": 547 + }, + { + "epoch": 0.14984960350013674, + "grad_norm": 0.16445566713809967, + "learning_rate": 5e-05, + "loss": 1.6705, + "step": 548 + }, + { + "epoch": 0.1501230516817063, + "grad_norm": 0.1804472953081131, + "learning_rate": 5e-05, + "loss": 1.7822, + "step": 549 + }, + { + "epoch": 0.1503964998632759, + "grad_norm": 0.15526773035526276, + "learning_rate": 5e-05, + "loss": 1.6319, + "step": 550 + }, + { + "epoch": 0.15066994804484551, + "grad_norm": 0.1779652237892151, + "learning_rate": 5e-05, + "loss": 1.6667, + "step": 551 + }, + { + "epoch": 0.1509433962264151, + "grad_norm": 0.15680311620235443, + "learning_rate": 5e-05, + "loss": 1.663, + "step": 552 + }, + { + "epoch": 0.1512168444079847, + "grad_norm": 0.15443289279937744, + "learning_rate": 5e-05, + "loss": 1.65, + "step": 553 + }, + { + "epoch": 0.1514902925895543, + "grad_norm": 0.16194204986095428, + "learning_rate": 5e-05, + "loss": 1.7575, + "step": 554 + }, + { + "epoch": 0.15176374077112387, + "grad_norm": 0.1542530059814453, + "learning_rate": 5e-05, + "loss": 1.6408, + "step": 555 + }, + { + "epoch": 0.15203718895269347, + "grad_norm": 0.14716754853725433, + "learning_rate": 5e-05, + "loss": 1.6898, + "step": 556 + }, + { + "epoch": 0.15231063713426304, + "grad_norm": 0.15577493607997894, + "learning_rate": 5e-05, + "loss": 1.6614, + "step": 557 + }, + { + "epoch": 0.15258408531583265, + "grad_norm": 0.14232535660266876, + "learning_rate": 5e-05, + "loss": 1.6025, + "step": 558 + }, + { + "epoch": 0.15285753349740225, + "grad_norm": 0.15629936754703522, + "learning_rate": 5e-05, + "loss": 1.6389, + "step": 559 + }, + { + "epoch": 0.15313098167897182, + "grad_norm": 0.15671175718307495, + "learning_rate": 5e-05, + "loss": 1.6616, + "step": 560 + }, + { + "epoch": 0.15340442986054142, + "grad_norm": 0.15339986979961395, + "learning_rate": 5e-05, + "loss": 1.7227, + "step": 561 + }, + { + "epoch": 0.15367787804211103, + "grad_norm": 0.14904935657978058, + "learning_rate": 5e-05, + "loss": 1.7057, + "step": 562 + }, + { + "epoch": 0.1539513262236806, + "grad_norm": 0.16447263956069946, + "learning_rate": 5e-05, + "loss": 1.7246, + "step": 563 + }, + { + "epoch": 0.1542247744052502, + "grad_norm": 0.15060292184352875, + "learning_rate": 5e-05, + "loss": 1.731, + "step": 564 + }, + { + "epoch": 0.1544982225868198, + "grad_norm": 0.15564222633838654, + "learning_rate": 5e-05, + "loss": 1.662, + "step": 565 + }, + { + "epoch": 0.15477167076838938, + "grad_norm": 0.1540018767118454, + "learning_rate": 5e-05, + "loss": 1.6793, + "step": 566 + }, + { + "epoch": 0.15504511894995898, + "grad_norm": 0.14922669529914856, + "learning_rate": 5e-05, + "loss": 1.6619, + "step": 567 + }, + { + "epoch": 0.15531856713152858, + "grad_norm": 0.1617313027381897, + "learning_rate": 5e-05, + "loss": 1.6571, + "step": 568 + }, + { + "epoch": 0.15559201531309816, + "grad_norm": 0.15420418977737427, + "learning_rate": 5e-05, + "loss": 1.6897, + "step": 569 + }, + { + "epoch": 0.15586546349466776, + "grad_norm": 0.16023240983486176, + "learning_rate": 5e-05, + "loss": 1.7376, + "step": 570 + }, + { + "epoch": 0.15613891167623736, + "grad_norm": 0.17280931770801544, + "learning_rate": 5e-05, + "loss": 1.7339, + "step": 571 + }, + { + "epoch": 0.15641235985780694, + "grad_norm": 0.15177179872989655, + "learning_rate": 5e-05, + "loss": 1.7808, + "step": 572 + }, + { + "epoch": 0.15668580803937654, + "grad_norm": 0.14648132026195526, + "learning_rate": 5e-05, + "loss": 1.6798, + "step": 573 + }, + { + "epoch": 0.15695925622094614, + "grad_norm": 0.1551511436700821, + "learning_rate": 5e-05, + "loss": 1.8416, + "step": 574 + }, + { + "epoch": 0.15723270440251572, + "grad_norm": 0.14707240462303162, + "learning_rate": 5e-05, + "loss": 1.6569, + "step": 575 + }, + { + "epoch": 0.15750615258408532, + "grad_norm": 0.15362726151943207, + "learning_rate": 5e-05, + "loss": 1.6738, + "step": 576 + }, + { + "epoch": 0.15777960076565492, + "grad_norm": 0.1611640602350235, + "learning_rate": 5e-05, + "loss": 1.6615, + "step": 577 + }, + { + "epoch": 0.1580530489472245, + "grad_norm": 0.14445427060127258, + "learning_rate": 5e-05, + "loss": 1.6479, + "step": 578 + }, + { + "epoch": 0.1583264971287941, + "grad_norm": 0.1491660177707672, + "learning_rate": 5e-05, + "loss": 1.6616, + "step": 579 + }, + { + "epoch": 0.1585999453103637, + "grad_norm": 0.16628406941890717, + "learning_rate": 5e-05, + "loss": 1.7884, + "step": 580 + }, + { + "epoch": 0.15887339349193327, + "grad_norm": 0.1535821259021759, + "learning_rate": 5e-05, + "loss": 1.7276, + "step": 581 + }, + { + "epoch": 0.15914684167350288, + "grad_norm": 0.14763818681240082, + "learning_rate": 5e-05, + "loss": 1.526, + "step": 582 + }, + { + "epoch": 0.15942028985507245, + "grad_norm": 0.15529736876487732, + "learning_rate": 5e-05, + "loss": 1.7708, + "step": 583 + }, + { + "epoch": 0.15969373803664205, + "grad_norm": 0.15036877989768982, + "learning_rate": 5e-05, + "loss": 1.7016, + "step": 584 + }, + { + "epoch": 0.15996718621821165, + "grad_norm": 0.15483739972114563, + "learning_rate": 5e-05, + "loss": 1.5965, + "step": 585 + }, + { + "epoch": 0.16024063439978123, + "grad_norm": 0.15531662106513977, + "learning_rate": 5e-05, + "loss": 1.6769, + "step": 586 + }, + { + "epoch": 0.16051408258135083, + "grad_norm": 0.1560511589050293, + "learning_rate": 5e-05, + "loss": 1.7538, + "step": 587 + }, + { + "epoch": 0.16078753076292043, + "grad_norm": 0.1831214427947998, + "learning_rate": 5e-05, + "loss": 1.7097, + "step": 588 + }, + { + "epoch": 0.16106097894449, + "grad_norm": 0.15891355276107788, + "learning_rate": 5e-05, + "loss": 1.6161, + "step": 589 + }, + { + "epoch": 0.1613344271260596, + "grad_norm": 0.1625254601240158, + "learning_rate": 5e-05, + "loss": 1.644, + "step": 590 + }, + { + "epoch": 0.1616078753076292, + "grad_norm": 0.15376971662044525, + "learning_rate": 5e-05, + "loss": 1.701, + "step": 591 + }, + { + "epoch": 0.1618813234891988, + "grad_norm": 0.14270102977752686, + "learning_rate": 5e-05, + "loss": 1.5089, + "step": 592 + }, + { + "epoch": 0.1621547716707684, + "grad_norm": 0.14729395508766174, + "learning_rate": 5e-05, + "loss": 1.6989, + "step": 593 + }, + { + "epoch": 0.162428219852338, + "grad_norm": 0.1675315499305725, + "learning_rate": 5e-05, + "loss": 1.7195, + "step": 594 + }, + { + "epoch": 0.16270166803390756, + "grad_norm": 0.14959470927715302, + "learning_rate": 5e-05, + "loss": 1.678, + "step": 595 + }, + { + "epoch": 0.16297511621547717, + "grad_norm": 0.15854990482330322, + "learning_rate": 5e-05, + "loss": 1.6452, + "step": 596 + }, + { + "epoch": 0.16324856439704677, + "grad_norm": 0.151190385222435, + "learning_rate": 5e-05, + "loss": 1.6751, + "step": 597 + }, + { + "epoch": 0.16352201257861634, + "grad_norm": 0.1632450520992279, + "learning_rate": 5e-05, + "loss": 1.662, + "step": 598 + }, + { + "epoch": 0.16379546076018595, + "grad_norm": 0.1520984023809433, + "learning_rate": 5e-05, + "loss": 1.7377, + "step": 599 + }, + { + "epoch": 0.16406890894175555, + "grad_norm": 0.14863604307174683, + "learning_rate": 5e-05, + "loss": 1.5927, + "step": 600 + }, + { + "epoch": 0.16434235712332512, + "grad_norm": 0.15424251556396484, + "learning_rate": 5e-05, + "loss": 1.6265, + "step": 601 + }, + { + "epoch": 0.16461580530489472, + "grad_norm": 0.15759313106536865, + "learning_rate": 5e-05, + "loss": 1.6659, + "step": 602 + }, + { + "epoch": 0.16488925348646433, + "grad_norm": 0.1515471190214157, + "learning_rate": 5e-05, + "loss": 1.6585, + "step": 603 + }, + { + "epoch": 0.1651627016680339, + "grad_norm": 0.16653236746788025, + "learning_rate": 5e-05, + "loss": 1.6593, + "step": 604 + }, + { + "epoch": 0.1654361498496035, + "grad_norm": 0.15483258664608002, + "learning_rate": 5e-05, + "loss": 1.7138, + "step": 605 + }, + { + "epoch": 0.1657095980311731, + "grad_norm": 0.14448527991771698, + "learning_rate": 5e-05, + "loss": 1.6627, + "step": 606 + }, + { + "epoch": 0.16598304621274268, + "grad_norm": 0.15589672327041626, + "learning_rate": 5e-05, + "loss": 1.5996, + "step": 607 + }, + { + "epoch": 0.16625649439431228, + "grad_norm": 0.16023333370685577, + "learning_rate": 5e-05, + "loss": 1.6351, + "step": 608 + }, + { + "epoch": 0.16652994257588188, + "grad_norm": 0.16108067333698273, + "learning_rate": 5e-05, + "loss": 1.716, + "step": 609 + }, + { + "epoch": 0.16680339075745146, + "grad_norm": 0.14648942649364471, + "learning_rate": 5e-05, + "loss": 1.662, + "step": 610 + }, + { + "epoch": 0.16707683893902106, + "grad_norm": 0.14486436545848846, + "learning_rate": 5e-05, + "loss": 1.6219, + "step": 611 + }, + { + "epoch": 0.16735028712059064, + "grad_norm": 0.15563012659549713, + "learning_rate": 5e-05, + "loss": 1.6803, + "step": 612 + }, + { + "epoch": 0.16762373530216024, + "grad_norm": 0.1571030616760254, + "learning_rate": 5e-05, + "loss": 1.7195, + "step": 613 + }, + { + "epoch": 0.16789718348372984, + "grad_norm": 0.14509615302085876, + "learning_rate": 5e-05, + "loss": 1.6462, + "step": 614 + }, + { + "epoch": 0.1681706316652994, + "grad_norm": 0.15147417783737183, + "learning_rate": 5e-05, + "loss": 1.7463, + "step": 615 + }, + { + "epoch": 0.16844407984686902, + "grad_norm": 0.1549796462059021, + "learning_rate": 5e-05, + "loss": 1.6997, + "step": 616 + }, + { + "epoch": 0.16871752802843862, + "grad_norm": 0.16344091296195984, + "learning_rate": 5e-05, + "loss": 1.6065, + "step": 617 + }, + { + "epoch": 0.1689909762100082, + "grad_norm": 0.1531122624874115, + "learning_rate": 5e-05, + "loss": 1.5878, + "step": 618 + }, + { + "epoch": 0.1692644243915778, + "grad_norm": 0.17184039950370789, + "learning_rate": 5e-05, + "loss": 1.6921, + "step": 619 + }, + { + "epoch": 0.1695378725731474, + "grad_norm": 0.16684702038764954, + "learning_rate": 5e-05, + "loss": 1.7952, + "step": 620 + }, + { + "epoch": 0.16981132075471697, + "grad_norm": 0.1621457189321518, + "learning_rate": 5e-05, + "loss": 1.6935, + "step": 621 + }, + { + "epoch": 0.17008476893628657, + "grad_norm": 0.15157224237918854, + "learning_rate": 5e-05, + "loss": 1.747, + "step": 622 + }, + { + "epoch": 0.17035821711785618, + "grad_norm": 0.1498030573129654, + "learning_rate": 5e-05, + "loss": 1.6138, + "step": 623 + }, + { + "epoch": 0.17063166529942575, + "grad_norm": 0.15279249846935272, + "learning_rate": 5e-05, + "loss": 1.6191, + "step": 624 + }, + { + "epoch": 0.17090511348099535, + "grad_norm": 0.15653859078884125, + "learning_rate": 5e-05, + "loss": 1.6271, + "step": 625 + }, + { + "epoch": 0.17117856166256495, + "grad_norm": 0.17365135252475739, + "learning_rate": 5e-05, + "loss": 1.733, + "step": 626 + }, + { + "epoch": 0.17145200984413453, + "grad_norm": 0.14621610939502716, + "learning_rate": 5e-05, + "loss": 1.5571, + "step": 627 + }, + { + "epoch": 0.17172545802570413, + "grad_norm": 0.1693270057439804, + "learning_rate": 5e-05, + "loss": 1.7436, + "step": 628 + }, + { + "epoch": 0.17199890620727373, + "grad_norm": 0.15240801870822906, + "learning_rate": 5e-05, + "loss": 1.7051, + "step": 629 + }, + { + "epoch": 0.1722723543888433, + "grad_norm": 0.14783060550689697, + "learning_rate": 5e-05, + "loss": 1.6765, + "step": 630 + }, + { + "epoch": 0.1725458025704129, + "grad_norm": 0.15038736164569855, + "learning_rate": 5e-05, + "loss": 1.5878, + "step": 631 + }, + { + "epoch": 0.1728192507519825, + "grad_norm": 0.16009370982646942, + "learning_rate": 5e-05, + "loss": 1.7588, + "step": 632 + }, + { + "epoch": 0.17309269893355209, + "grad_norm": 0.15497462451457977, + "learning_rate": 5e-05, + "loss": 1.6953, + "step": 633 + }, + { + "epoch": 0.1733661471151217, + "grad_norm": 0.18641214072704315, + "learning_rate": 5e-05, + "loss": 1.6218, + "step": 634 + }, + { + "epoch": 0.1736395952966913, + "grad_norm": 0.1536916345357895, + "learning_rate": 5e-05, + "loss": 1.6898, + "step": 635 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.15958872437477112, + "learning_rate": 5e-05, + "loss": 1.6823, + "step": 636 + }, + { + "epoch": 0.17418649165983047, + "grad_norm": 0.16902394592761993, + "learning_rate": 5e-05, + "loss": 1.7029, + "step": 637 + }, + { + "epoch": 0.17445993984140004, + "grad_norm": 0.1553962379693985, + "learning_rate": 5e-05, + "loss": 1.7663, + "step": 638 + }, + { + "epoch": 0.17473338802296964, + "grad_norm": 0.1627410352230072, + "learning_rate": 5e-05, + "loss": 1.5562, + "step": 639 + }, + { + "epoch": 0.17500683620453925, + "grad_norm": 0.16163355112075806, + "learning_rate": 5e-05, + "loss": 1.6688, + "step": 640 + }, + { + "epoch": 0.17528028438610882, + "grad_norm": 0.14881187677383423, + "learning_rate": 5e-05, + "loss": 1.6582, + "step": 641 + }, + { + "epoch": 0.17555373256767842, + "grad_norm": 0.1964666247367859, + "learning_rate": 5e-05, + "loss": 1.6861, + "step": 642 + }, + { + "epoch": 0.17582718074924802, + "grad_norm": 0.15720294415950775, + "learning_rate": 5e-05, + "loss": 1.6249, + "step": 643 + }, + { + "epoch": 0.1761006289308176, + "grad_norm": 0.15254618227481842, + "learning_rate": 5e-05, + "loss": 1.6902, + "step": 644 + }, + { + "epoch": 0.1763740771123872, + "grad_norm": 0.19420553743839264, + "learning_rate": 5e-05, + "loss": 1.7313, + "step": 645 + }, + { + "epoch": 0.1766475252939568, + "grad_norm": 0.1590331643819809, + "learning_rate": 5e-05, + "loss": 1.6135, + "step": 646 + }, + { + "epoch": 0.17692097347552638, + "grad_norm": 0.18741661310195923, + "learning_rate": 5e-05, + "loss": 1.6959, + "step": 647 + }, + { + "epoch": 0.17719442165709598, + "grad_norm": 0.16829299926757812, + "learning_rate": 5e-05, + "loss": 1.7166, + "step": 648 + }, + { + "epoch": 0.17746786983866558, + "grad_norm": 0.16889558732509613, + "learning_rate": 5e-05, + "loss": 1.6289, + "step": 649 + }, + { + "epoch": 0.17774131802023516, + "grad_norm": 0.18458721041679382, + "learning_rate": 5e-05, + "loss": 1.6351, + "step": 650 + }, + { + "epoch": 0.17801476620180476, + "grad_norm": 0.1489873081445694, + "learning_rate": 5e-05, + "loss": 1.6426, + "step": 651 + }, + { + "epoch": 0.17828821438337436, + "grad_norm": 0.1619219332933426, + "learning_rate": 5e-05, + "loss": 1.6849, + "step": 652 + }, + { + "epoch": 0.17856166256494393, + "grad_norm": 0.15107835829257965, + "learning_rate": 5e-05, + "loss": 1.6455, + "step": 653 + }, + { + "epoch": 0.17883511074651354, + "grad_norm": 0.15547437965869904, + "learning_rate": 5e-05, + "loss": 1.7137, + "step": 654 + }, + { + "epoch": 0.17910855892808314, + "grad_norm": 0.18254241347312927, + "learning_rate": 5e-05, + "loss": 1.7565, + "step": 655 + }, + { + "epoch": 0.1793820071096527, + "grad_norm": 0.15430989861488342, + "learning_rate": 5e-05, + "loss": 1.7157, + "step": 656 + }, + { + "epoch": 0.17965545529122232, + "grad_norm": 0.14642038941383362, + "learning_rate": 5e-05, + "loss": 1.6962, + "step": 657 + }, + { + "epoch": 0.17992890347279192, + "grad_norm": 0.15086182951927185, + "learning_rate": 5e-05, + "loss": 1.6187, + "step": 658 + }, + { + "epoch": 0.1802023516543615, + "grad_norm": 0.1483525186777115, + "learning_rate": 5e-05, + "loss": 1.6572, + "step": 659 + }, + { + "epoch": 0.1804757998359311, + "grad_norm": 0.1709260642528534, + "learning_rate": 5e-05, + "loss": 1.6813, + "step": 660 + }, + { + "epoch": 0.1807492480175007, + "grad_norm": 0.17065215110778809, + "learning_rate": 5e-05, + "loss": 1.7901, + "step": 661 + }, + { + "epoch": 0.18102269619907027, + "grad_norm": 0.15385740995407104, + "learning_rate": 5e-05, + "loss": 1.6925, + "step": 662 + }, + { + "epoch": 0.18129614438063987, + "grad_norm": 0.16739514470100403, + "learning_rate": 5e-05, + "loss": 1.6025, + "step": 663 + }, + { + "epoch": 0.18156959256220945, + "grad_norm": 0.16431251168251038, + "learning_rate": 5e-05, + "loss": 1.6316, + "step": 664 + }, + { + "epoch": 0.18184304074377905, + "grad_norm": 0.15166325867176056, + "learning_rate": 5e-05, + "loss": 1.7378, + "step": 665 + }, + { + "epoch": 0.18211648892534865, + "grad_norm": 0.1769624650478363, + "learning_rate": 5e-05, + "loss": 1.6963, + "step": 666 + }, + { + "epoch": 0.18238993710691823, + "grad_norm": 0.17551544308662415, + "learning_rate": 5e-05, + "loss": 1.6722, + "step": 667 + }, + { + "epoch": 0.18266338528848783, + "grad_norm": 0.1519336998462677, + "learning_rate": 5e-05, + "loss": 1.7812, + "step": 668 + }, + { + "epoch": 0.18293683347005743, + "grad_norm": 0.1523965448141098, + "learning_rate": 5e-05, + "loss": 1.6374, + "step": 669 + }, + { + "epoch": 0.183210281651627, + "grad_norm": 0.16304127871990204, + "learning_rate": 5e-05, + "loss": 1.6436, + "step": 670 + }, + { + "epoch": 0.1834837298331966, + "grad_norm": 0.1389431655406952, + "learning_rate": 5e-05, + "loss": 1.6136, + "step": 671 + }, + { + "epoch": 0.1837571780147662, + "grad_norm": 0.15939609706401825, + "learning_rate": 5e-05, + "loss": 1.6407, + "step": 672 + }, + { + "epoch": 0.18403062619633578, + "grad_norm": 0.15122279524803162, + "learning_rate": 5e-05, + "loss": 1.605, + "step": 673 + }, + { + "epoch": 0.18430407437790539, + "grad_norm": 0.14799101650714874, + "learning_rate": 5e-05, + "loss": 1.6199, + "step": 674 + }, + { + "epoch": 0.184577522559475, + "grad_norm": 0.17853890359401703, + "learning_rate": 5e-05, + "loss": 1.7246, + "step": 675 + }, + { + "epoch": 0.18485097074104456, + "grad_norm": 0.14898012578487396, + "learning_rate": 5e-05, + "loss": 1.7212, + "step": 676 + }, + { + "epoch": 0.18512441892261416, + "grad_norm": 0.1488298773765564, + "learning_rate": 5e-05, + "loss": 1.6592, + "step": 677 + }, + { + "epoch": 0.18539786710418377, + "grad_norm": 0.14956548810005188, + "learning_rate": 5e-05, + "loss": 1.6025, + "step": 678 + }, + { + "epoch": 0.18567131528575334, + "grad_norm": 0.14764147996902466, + "learning_rate": 5e-05, + "loss": 1.6695, + "step": 679 + }, + { + "epoch": 0.18594476346732294, + "grad_norm": 0.16988615691661835, + "learning_rate": 5e-05, + "loss": 1.7755, + "step": 680 + }, + { + "epoch": 0.18621821164889255, + "grad_norm": 0.1601964235305786, + "learning_rate": 5e-05, + "loss": 1.7565, + "step": 681 + }, + { + "epoch": 0.18649165983046212, + "grad_norm": 0.17461629211902618, + "learning_rate": 5e-05, + "loss": 1.7552, + "step": 682 + }, + { + "epoch": 0.18676510801203172, + "grad_norm": 0.15619756281375885, + "learning_rate": 5e-05, + "loss": 1.7162, + "step": 683 + }, + { + "epoch": 0.18703855619360132, + "grad_norm": 0.15257790684700012, + "learning_rate": 5e-05, + "loss": 1.7154, + "step": 684 + }, + { + "epoch": 0.1873120043751709, + "grad_norm": 0.15032611787319183, + "learning_rate": 5e-05, + "loss": 1.5825, + "step": 685 + }, + { + "epoch": 0.1875854525567405, + "grad_norm": 0.1578359454870224, + "learning_rate": 5e-05, + "loss": 1.706, + "step": 686 + }, + { + "epoch": 0.1878589007383101, + "grad_norm": 0.16593731939792633, + "learning_rate": 5e-05, + "loss": 1.624, + "step": 687 + }, + { + "epoch": 0.18813234891987968, + "grad_norm": 0.17991593480110168, + "learning_rate": 5e-05, + "loss": 1.7293, + "step": 688 + }, + { + "epoch": 0.18840579710144928, + "grad_norm": 0.15078522264957428, + "learning_rate": 5e-05, + "loss": 1.6419, + "step": 689 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 0.20055970549583435, + "learning_rate": 5e-05, + "loss": 1.7743, + "step": 690 + }, + { + "epoch": 0.18895269346458846, + "grad_norm": 0.14143189787864685, + "learning_rate": 5e-05, + "loss": 1.7, + "step": 691 + }, + { + "epoch": 0.18922614164615806, + "grad_norm": 0.17427237331867218, + "learning_rate": 5e-05, + "loss": 1.6503, + "step": 692 + }, + { + "epoch": 0.18949958982772763, + "grad_norm": 0.18519847095012665, + "learning_rate": 5e-05, + "loss": 1.7039, + "step": 693 + }, + { + "epoch": 0.18977303800929723, + "grad_norm": 0.15196627378463745, + "learning_rate": 5e-05, + "loss": 1.7847, + "step": 694 + }, + { + "epoch": 0.19004648619086684, + "grad_norm": 0.17246317863464355, + "learning_rate": 5e-05, + "loss": 1.7097, + "step": 695 + }, + { + "epoch": 0.1903199343724364, + "grad_norm": 0.16350534558296204, + "learning_rate": 5e-05, + "loss": 1.6783, + "step": 696 + }, + { + "epoch": 0.190593382554006, + "grad_norm": 0.15077216923236847, + "learning_rate": 5e-05, + "loss": 1.7, + "step": 697 + }, + { + "epoch": 0.19086683073557562, + "grad_norm": 0.17161305248737335, + "learning_rate": 5e-05, + "loss": 1.757, + "step": 698 + }, + { + "epoch": 0.1911402789171452, + "grad_norm": 0.15666204690933228, + "learning_rate": 5e-05, + "loss": 1.6094, + "step": 699 + }, + { + "epoch": 0.1914137270987148, + "grad_norm": 0.15116041898727417, + "learning_rate": 5e-05, + "loss": 1.7604, + "step": 700 + }, + { + "epoch": 0.1916871752802844, + "grad_norm": 0.17067068815231323, + "learning_rate": 5e-05, + "loss": 1.6355, + "step": 701 + }, + { + "epoch": 0.19196062346185397, + "grad_norm": 0.1541707068681717, + "learning_rate": 5e-05, + "loss": 1.7279, + "step": 702 + }, + { + "epoch": 0.19223407164342357, + "grad_norm": 0.154357448220253, + "learning_rate": 5e-05, + "loss": 1.5981, + "step": 703 + }, + { + "epoch": 0.19250751982499317, + "grad_norm": 0.1600044220685959, + "learning_rate": 5e-05, + "loss": 1.7863, + "step": 704 + }, + { + "epoch": 0.19278096800656275, + "grad_norm": 0.15702758729457855, + "learning_rate": 5e-05, + "loss": 1.752, + "step": 705 + }, + { + "epoch": 0.19305441618813235, + "grad_norm": 0.15750904381275177, + "learning_rate": 5e-05, + "loss": 1.6763, + "step": 706 + }, + { + "epoch": 0.19332786436970195, + "grad_norm": 0.14942318201065063, + "learning_rate": 5e-05, + "loss": 1.5828, + "step": 707 + }, + { + "epoch": 0.19360131255127153, + "grad_norm": 0.14567220211029053, + "learning_rate": 5e-05, + "loss": 1.647, + "step": 708 + }, + { + "epoch": 0.19387476073284113, + "grad_norm": 0.15003998577594757, + "learning_rate": 5e-05, + "loss": 1.6545, + "step": 709 + }, + { + "epoch": 0.19414820891441073, + "grad_norm": 0.1552526205778122, + "learning_rate": 5e-05, + "loss": 1.7039, + "step": 710 + }, + { + "epoch": 0.1944216570959803, + "grad_norm": 0.16224539279937744, + "learning_rate": 5e-05, + "loss": 1.6792, + "step": 711 + }, + { + "epoch": 0.1946951052775499, + "grad_norm": 0.15637609362602234, + "learning_rate": 5e-05, + "loss": 1.7131, + "step": 712 + }, + { + "epoch": 0.1949685534591195, + "grad_norm": 0.14550897479057312, + "learning_rate": 5e-05, + "loss": 1.6662, + "step": 713 + }, + { + "epoch": 0.19524200164068908, + "grad_norm": 0.14311423897743225, + "learning_rate": 5e-05, + "loss": 1.5609, + "step": 714 + }, + { + "epoch": 0.19551544982225869, + "grad_norm": 0.1632792055606842, + "learning_rate": 5e-05, + "loss": 1.716, + "step": 715 + }, + { + "epoch": 0.1957888980038283, + "grad_norm": 0.14958421885967255, + "learning_rate": 5e-05, + "loss": 1.6868, + "step": 716 + }, + { + "epoch": 0.19606234618539786, + "grad_norm": 0.15640190243721008, + "learning_rate": 5e-05, + "loss": 1.6763, + "step": 717 + }, + { + "epoch": 0.19633579436696746, + "grad_norm": 0.1410387009382248, + "learning_rate": 5e-05, + "loss": 1.6593, + "step": 718 + }, + { + "epoch": 0.19660924254853704, + "grad_norm": 0.14812639355659485, + "learning_rate": 5e-05, + "loss": 1.6907, + "step": 719 + }, + { + "epoch": 0.19688269073010664, + "grad_norm": 0.15623793005943298, + "learning_rate": 5e-05, + "loss": 1.6879, + "step": 720 + }, + { + "epoch": 0.19715613891167624, + "grad_norm": 0.1545989066362381, + "learning_rate": 5e-05, + "loss": 1.7276, + "step": 721 + }, + { + "epoch": 0.19742958709324582, + "grad_norm": 0.15051992237567902, + "learning_rate": 5e-05, + "loss": 1.7184, + "step": 722 + }, + { + "epoch": 0.19770303527481542, + "grad_norm": 0.14712779223918915, + "learning_rate": 5e-05, + "loss": 1.6231, + "step": 723 + }, + { + "epoch": 0.19797648345638502, + "grad_norm": 0.15088264644145966, + "learning_rate": 5e-05, + "loss": 1.662, + "step": 724 + }, + { + "epoch": 0.1982499316379546, + "grad_norm": 0.14665238559246063, + "learning_rate": 5e-05, + "loss": 1.6626, + "step": 725 + }, + { + "epoch": 0.1985233798195242, + "grad_norm": 0.16032913327217102, + "learning_rate": 5e-05, + "loss": 1.7283, + "step": 726 + }, + { + "epoch": 0.1987968280010938, + "grad_norm": 0.15149696171283722, + "learning_rate": 5e-05, + "loss": 1.6387, + "step": 727 + }, + { + "epoch": 0.19907027618266337, + "grad_norm": 0.15610548853874207, + "learning_rate": 5e-05, + "loss": 1.7084, + "step": 728 + }, + { + "epoch": 0.19934372436423298, + "grad_norm": 0.16325855255126953, + "learning_rate": 5e-05, + "loss": 1.567, + "step": 729 + }, + { + "epoch": 0.19961717254580258, + "grad_norm": 0.1529161036014557, + "learning_rate": 5e-05, + "loss": 1.6343, + "step": 730 + }, + { + "epoch": 0.19989062072737215, + "grad_norm": 0.1644391119480133, + "learning_rate": 5e-05, + "loss": 1.6918, + "step": 731 + }, + { + "epoch": 0.20016406890894176, + "grad_norm": 0.14825589954853058, + "learning_rate": 5e-05, + "loss": 1.6954, + "step": 732 + }, + { + "epoch": 0.20043751709051136, + "grad_norm": 0.15434464812278748, + "learning_rate": 5e-05, + "loss": 1.751, + "step": 733 + }, + { + "epoch": 0.20071096527208093, + "grad_norm": 0.14505796134471893, + "learning_rate": 5e-05, + "loss": 1.6398, + "step": 734 + }, + { + "epoch": 0.20098441345365053, + "grad_norm": 0.14808180928230286, + "learning_rate": 5e-05, + "loss": 1.6475, + "step": 735 + }, + { + "epoch": 0.20125786163522014, + "grad_norm": 0.15832236409187317, + "learning_rate": 5e-05, + "loss": 1.6888, + "step": 736 + }, + { + "epoch": 0.2015313098167897, + "grad_norm": 0.15063433349132538, + "learning_rate": 5e-05, + "loss": 1.714, + "step": 737 + }, + { + "epoch": 0.2018047579983593, + "grad_norm": 0.14673678576946259, + "learning_rate": 5e-05, + "loss": 1.6566, + "step": 738 + }, + { + "epoch": 0.20207820617992892, + "grad_norm": 0.16134969890117645, + "learning_rate": 5e-05, + "loss": 1.618, + "step": 739 + }, + { + "epoch": 0.2023516543614985, + "grad_norm": 0.15674619376659393, + "learning_rate": 5e-05, + "loss": 1.6548, + "step": 740 + }, + { + "epoch": 0.2026251025430681, + "grad_norm": 0.16632983088493347, + "learning_rate": 5e-05, + "loss": 1.6608, + "step": 741 + }, + { + "epoch": 0.2028985507246377, + "grad_norm": 0.15681079030036926, + "learning_rate": 5e-05, + "loss": 1.6452, + "step": 742 + }, + { + "epoch": 0.20317199890620727, + "grad_norm": 0.17244625091552734, + "learning_rate": 5e-05, + "loss": 1.6092, + "step": 743 + }, + { + "epoch": 0.20344544708777687, + "grad_norm": 0.17212654650211334, + "learning_rate": 5e-05, + "loss": 1.6497, + "step": 744 + }, + { + "epoch": 0.20371889526934644, + "grad_norm": 0.14502903819084167, + "learning_rate": 5e-05, + "loss": 1.6578, + "step": 745 + }, + { + "epoch": 0.20399234345091605, + "grad_norm": 0.19574536383152008, + "learning_rate": 5e-05, + "loss": 1.7769, + "step": 746 + }, + { + "epoch": 0.20426579163248565, + "grad_norm": 0.16160033643245697, + "learning_rate": 5e-05, + "loss": 1.6885, + "step": 747 + }, + { + "epoch": 0.20453923981405522, + "grad_norm": 0.15171994268894196, + "learning_rate": 5e-05, + "loss": 1.6761, + "step": 748 + }, + { + "epoch": 0.20481268799562483, + "grad_norm": 0.16578879952430725, + "learning_rate": 5e-05, + "loss": 1.6024, + "step": 749 + }, + { + "epoch": 0.20508613617719443, + "grad_norm": 0.1527402251958847, + "learning_rate": 5e-05, + "loss": 1.6214, + "step": 750 + }, + { + "epoch": 0.205359584358764, + "grad_norm": 0.15434326231479645, + "learning_rate": 5e-05, + "loss": 1.591, + "step": 751 + }, + { + "epoch": 0.2056330325403336, + "grad_norm": 0.15459883213043213, + "learning_rate": 5e-05, + "loss": 1.604, + "step": 752 + }, + { + "epoch": 0.2059064807219032, + "grad_norm": 0.17635492980480194, + "learning_rate": 5e-05, + "loss": 1.7674, + "step": 753 + }, + { + "epoch": 0.20617992890347278, + "grad_norm": 0.1530025154352188, + "learning_rate": 5e-05, + "loss": 1.5982, + "step": 754 + }, + { + "epoch": 0.20645337708504238, + "grad_norm": 0.16364556550979614, + "learning_rate": 5e-05, + "loss": 1.5648, + "step": 755 + }, + { + "epoch": 0.20672682526661199, + "grad_norm": 0.1753033846616745, + "learning_rate": 5e-05, + "loss": 1.6603, + "step": 756 + }, + { + "epoch": 0.20700027344818156, + "grad_norm": 0.17838945984840393, + "learning_rate": 5e-05, + "loss": 1.6572, + "step": 757 + }, + { + "epoch": 0.20727372162975116, + "grad_norm": 0.20289616286754608, + "learning_rate": 5e-05, + "loss": 1.713, + "step": 758 + }, + { + "epoch": 0.20754716981132076, + "grad_norm": 0.1460665762424469, + "learning_rate": 5e-05, + "loss": 1.6701, + "step": 759 + }, + { + "epoch": 0.20782061799289034, + "grad_norm": 0.1894819289445877, + "learning_rate": 5e-05, + "loss": 1.6389, + "step": 760 + }, + { + "epoch": 0.20809406617445994, + "grad_norm": 0.17014984786510468, + "learning_rate": 5e-05, + "loss": 1.6229, + "step": 761 + }, + { + "epoch": 0.20836751435602954, + "grad_norm": 0.14133282005786896, + "learning_rate": 5e-05, + "loss": 1.4855, + "step": 762 + }, + { + "epoch": 0.20864096253759912, + "grad_norm": 0.17830294370651245, + "learning_rate": 5e-05, + "loss": 1.8294, + "step": 763 + }, + { + "epoch": 0.20891441071916872, + "grad_norm": 0.17372062802314758, + "learning_rate": 5e-05, + "loss": 1.6214, + "step": 764 + }, + { + "epoch": 0.20918785890073832, + "grad_norm": 0.14572595059871674, + "learning_rate": 5e-05, + "loss": 1.6391, + "step": 765 + }, + { + "epoch": 0.2094613070823079, + "grad_norm": 0.19754233956336975, + "learning_rate": 5e-05, + "loss": 1.7374, + "step": 766 + }, + { + "epoch": 0.2097347552638775, + "grad_norm": 0.17328870296478271, + "learning_rate": 5e-05, + "loss": 1.6893, + "step": 767 + }, + { + "epoch": 0.2100082034454471, + "grad_norm": 0.15320096909999847, + "learning_rate": 5e-05, + "loss": 1.6703, + "step": 768 + }, + { + "epoch": 0.21028165162701667, + "grad_norm": 0.19963279366493225, + "learning_rate": 5e-05, + "loss": 1.8128, + "step": 769 + }, + { + "epoch": 0.21055509980858628, + "grad_norm": 0.15706828236579895, + "learning_rate": 5e-05, + "loss": 1.6998, + "step": 770 + }, + { + "epoch": 0.21082854799015588, + "grad_norm": 0.18131129443645477, + "learning_rate": 5e-05, + "loss": 1.6364, + "step": 771 + }, + { + "epoch": 0.21110199617172545, + "grad_norm": 0.19108878076076508, + "learning_rate": 5e-05, + "loss": 1.7074, + "step": 772 + }, + { + "epoch": 0.21137544435329506, + "grad_norm": 0.14925052225589752, + "learning_rate": 5e-05, + "loss": 1.6908, + "step": 773 + }, + { + "epoch": 0.21164889253486463, + "grad_norm": 0.1783328801393509, + "learning_rate": 5e-05, + "loss": 1.7353, + "step": 774 + }, + { + "epoch": 0.21192234071643423, + "grad_norm": 0.17631973326206207, + "learning_rate": 5e-05, + "loss": 1.683, + "step": 775 + }, + { + "epoch": 0.21219578889800383, + "grad_norm": 0.15554718673229218, + "learning_rate": 5e-05, + "loss": 1.6239, + "step": 776 + }, + { + "epoch": 0.2124692370795734, + "grad_norm": 0.1897638440132141, + "learning_rate": 5e-05, + "loss": 1.713, + "step": 777 + }, + { + "epoch": 0.212742685261143, + "grad_norm": 0.14565777778625488, + "learning_rate": 5e-05, + "loss": 1.5607, + "step": 778 + }, + { + "epoch": 0.2130161334427126, + "grad_norm": 0.14770552515983582, + "learning_rate": 5e-05, + "loss": 1.6054, + "step": 779 + }, + { + "epoch": 0.2132895816242822, + "grad_norm": 0.1784149557352066, + "learning_rate": 5e-05, + "loss": 1.7054, + "step": 780 + }, + { + "epoch": 0.2135630298058518, + "grad_norm": 0.15223895013332367, + "learning_rate": 5e-05, + "loss": 1.756, + "step": 781 + }, + { + "epoch": 0.2138364779874214, + "grad_norm": 0.15329818427562714, + "learning_rate": 5e-05, + "loss": 1.7035, + "step": 782 + }, + { + "epoch": 0.21410992616899097, + "grad_norm": 0.1558707058429718, + "learning_rate": 5e-05, + "loss": 1.6835, + "step": 783 + }, + { + "epoch": 0.21438337435056057, + "grad_norm": 0.15981443226337433, + "learning_rate": 5e-05, + "loss": 1.7418, + "step": 784 + }, + { + "epoch": 0.21465682253213017, + "grad_norm": 0.1494167298078537, + "learning_rate": 5e-05, + "loss": 1.5906, + "step": 785 + }, + { + "epoch": 0.21493027071369974, + "grad_norm": 0.15564508736133575, + "learning_rate": 5e-05, + "loss": 1.6873, + "step": 786 + }, + { + "epoch": 0.21520371889526935, + "grad_norm": 0.1639227420091629, + "learning_rate": 5e-05, + "loss": 1.7083, + "step": 787 + }, + { + "epoch": 0.21547716707683895, + "grad_norm": 0.14128105342388153, + "learning_rate": 5e-05, + "loss": 1.608, + "step": 788 + }, + { + "epoch": 0.21575061525840852, + "grad_norm": 0.16327965259552002, + "learning_rate": 5e-05, + "loss": 1.6022, + "step": 789 + }, + { + "epoch": 0.21602406343997813, + "grad_norm": 0.1559935361146927, + "learning_rate": 5e-05, + "loss": 1.6483, + "step": 790 + }, + { + "epoch": 0.21629751162154773, + "grad_norm": 0.1511768251657486, + "learning_rate": 5e-05, + "loss": 1.5955, + "step": 791 + }, + { + "epoch": 0.2165709598031173, + "grad_norm": 0.16056060791015625, + "learning_rate": 5e-05, + "loss": 1.6471, + "step": 792 + }, + { + "epoch": 0.2168444079846869, + "grad_norm": 0.16120120882987976, + "learning_rate": 5e-05, + "loss": 1.8007, + "step": 793 + }, + { + "epoch": 0.2171178561662565, + "grad_norm": 0.1515311449766159, + "learning_rate": 5e-05, + "loss": 1.6438, + "step": 794 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 0.15457142889499664, + "learning_rate": 5e-05, + "loss": 1.7115, + "step": 795 + }, + { + "epoch": 0.21766475252939568, + "grad_norm": 0.15303310751914978, + "learning_rate": 5e-05, + "loss": 1.6677, + "step": 796 + }, + { + "epoch": 0.21793820071096529, + "grad_norm": 0.1515769511461258, + "learning_rate": 5e-05, + "loss": 1.7371, + "step": 797 + }, + { + "epoch": 0.21821164889253486, + "grad_norm": 0.1483011692762375, + "learning_rate": 5e-05, + "loss": 1.6359, + "step": 798 + }, + { + "epoch": 0.21848509707410446, + "grad_norm": 0.1543753296136856, + "learning_rate": 5e-05, + "loss": 1.7473, + "step": 799 + }, + { + "epoch": 0.21875854525567404, + "grad_norm": 0.14442037045955658, + "learning_rate": 5e-05, + "loss": 1.6086, + "step": 800 + }, + { + "epoch": 0.21903199343724364, + "grad_norm": 0.14965881407260895, + "learning_rate": 5e-05, + "loss": 1.6522, + "step": 801 + }, + { + "epoch": 0.21930544161881324, + "grad_norm": 0.16104258596897125, + "learning_rate": 5e-05, + "loss": 1.5625, + "step": 802 + }, + { + "epoch": 0.21957888980038281, + "grad_norm": 0.16634979844093323, + "learning_rate": 5e-05, + "loss": 1.6574, + "step": 803 + }, + { + "epoch": 0.21985233798195242, + "grad_norm": 0.1422521471977234, + "learning_rate": 5e-05, + "loss": 1.6317, + "step": 804 + }, + { + "epoch": 0.22012578616352202, + "grad_norm": 0.17888352274894714, + "learning_rate": 5e-05, + "loss": 1.7694, + "step": 805 + }, + { + "epoch": 0.2203992343450916, + "grad_norm": 0.16180840134620667, + "learning_rate": 5e-05, + "loss": 1.6889, + "step": 806 + }, + { + "epoch": 0.2206726825266612, + "grad_norm": 0.1583503931760788, + "learning_rate": 5e-05, + "loss": 1.6678, + "step": 807 + }, + { + "epoch": 0.2209461307082308, + "grad_norm": 0.19179575145244598, + "learning_rate": 5e-05, + "loss": 1.7058, + "step": 808 + }, + { + "epoch": 0.22121957888980037, + "grad_norm": 0.14800883829593658, + "learning_rate": 5e-05, + "loss": 1.6305, + "step": 809 + }, + { + "epoch": 0.22149302707136997, + "grad_norm": 0.17706219851970673, + "learning_rate": 5e-05, + "loss": 1.691, + "step": 810 + }, + { + "epoch": 0.22176647525293958, + "grad_norm": 0.1569800078868866, + "learning_rate": 5e-05, + "loss": 1.5772, + "step": 811 + }, + { + "epoch": 0.22203992343450915, + "grad_norm": 0.1444752961397171, + "learning_rate": 5e-05, + "loss": 1.6368, + "step": 812 + }, + { + "epoch": 0.22231337161607875, + "grad_norm": 0.15736012160778046, + "learning_rate": 5e-05, + "loss": 1.5703, + "step": 813 + }, + { + "epoch": 0.22258681979764836, + "grad_norm": 0.18501217663288116, + "learning_rate": 5e-05, + "loss": 1.6198, + "step": 814 + }, + { + "epoch": 0.22286026797921793, + "grad_norm": 0.14157734811306, + "learning_rate": 5e-05, + "loss": 1.672, + "step": 815 + }, + { + "epoch": 0.22313371616078753, + "grad_norm": 0.16897475719451904, + "learning_rate": 5e-05, + "loss": 1.6156, + "step": 816 + }, + { + "epoch": 0.22340716434235713, + "grad_norm": 0.15974189341068268, + "learning_rate": 5e-05, + "loss": 1.5237, + "step": 817 + }, + { + "epoch": 0.2236806125239267, + "grad_norm": 0.14449931681156158, + "learning_rate": 5e-05, + "loss": 1.6376, + "step": 818 + }, + { + "epoch": 0.2239540607054963, + "grad_norm": 0.1783899962902069, + "learning_rate": 5e-05, + "loss": 1.707, + "step": 819 + }, + { + "epoch": 0.2242275088870659, + "grad_norm": 0.15125927329063416, + "learning_rate": 5e-05, + "loss": 1.6233, + "step": 820 + }, + { + "epoch": 0.2245009570686355, + "grad_norm": 0.15262438356876373, + "learning_rate": 5e-05, + "loss": 1.6687, + "step": 821 + }, + { + "epoch": 0.2247744052502051, + "grad_norm": 0.16108544170856476, + "learning_rate": 5e-05, + "loss": 1.647, + "step": 822 + }, + { + "epoch": 0.2250478534317747, + "grad_norm": 0.1504691243171692, + "learning_rate": 5e-05, + "loss": 1.6531, + "step": 823 + }, + { + "epoch": 0.22532130161334427, + "grad_norm": 0.1500689536333084, + "learning_rate": 5e-05, + "loss": 1.5914, + "step": 824 + }, + { + "epoch": 0.22559474979491387, + "grad_norm": 0.15878424048423767, + "learning_rate": 5e-05, + "loss": 1.6758, + "step": 825 + }, + { + "epoch": 0.22586819797648347, + "grad_norm": 0.15859322249889374, + "learning_rate": 5e-05, + "loss": 1.6787, + "step": 826 + }, + { + "epoch": 0.22614164615805304, + "grad_norm": 0.15319658815860748, + "learning_rate": 5e-05, + "loss": 1.6832, + "step": 827 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 0.15381459891796112, + "learning_rate": 5e-05, + "loss": 1.6422, + "step": 828 + }, + { + "epoch": 0.22668854252119222, + "grad_norm": 0.16640834510326385, + "learning_rate": 5e-05, + "loss": 1.7306, + "step": 829 + }, + { + "epoch": 0.22696199070276182, + "grad_norm": 0.1528118997812271, + "learning_rate": 5e-05, + "loss": 1.6858, + "step": 830 + }, + { + "epoch": 0.22723543888433143, + "grad_norm": 0.15058903396129608, + "learning_rate": 5e-05, + "loss": 1.6336, + "step": 831 + }, + { + "epoch": 0.227508887065901, + "grad_norm": 0.1552654355764389, + "learning_rate": 5e-05, + "loss": 1.677, + "step": 832 + }, + { + "epoch": 0.2277823352474706, + "grad_norm": 0.16159558296203613, + "learning_rate": 5e-05, + "loss": 1.682, + "step": 833 + }, + { + "epoch": 0.2280557834290402, + "grad_norm": 0.14523838460445404, + "learning_rate": 5e-05, + "loss": 1.5887, + "step": 834 + }, + { + "epoch": 0.22832923161060978, + "grad_norm": 0.16024388372898102, + "learning_rate": 5e-05, + "loss": 1.7047, + "step": 835 + }, + { + "epoch": 0.22860267979217938, + "grad_norm": 0.15451057255268097, + "learning_rate": 5e-05, + "loss": 1.6663, + "step": 836 + }, + { + "epoch": 0.22887612797374898, + "grad_norm": 0.14806225895881653, + "learning_rate": 5e-05, + "loss": 1.6782, + "step": 837 + }, + { + "epoch": 0.22914957615531856, + "grad_norm": 0.15798652172088623, + "learning_rate": 5e-05, + "loss": 1.7092, + "step": 838 + }, + { + "epoch": 0.22942302433688816, + "grad_norm": 0.15452131628990173, + "learning_rate": 5e-05, + "loss": 1.7137, + "step": 839 + }, + { + "epoch": 0.22969647251845776, + "grad_norm": 0.15365323424339294, + "learning_rate": 5e-05, + "loss": 1.7464, + "step": 840 + }, + { + "epoch": 0.22996992070002734, + "grad_norm": 0.14897583425045013, + "learning_rate": 5e-05, + "loss": 1.6712, + "step": 841 + }, + { + "epoch": 0.23024336888159694, + "grad_norm": 0.1519531011581421, + "learning_rate": 5e-05, + "loss": 1.7296, + "step": 842 + }, + { + "epoch": 0.23051681706316654, + "grad_norm": 0.14781694114208221, + "learning_rate": 5e-05, + "loss": 1.6932, + "step": 843 + }, + { + "epoch": 0.23079026524473611, + "grad_norm": 0.1538834124803543, + "learning_rate": 5e-05, + "loss": 1.6682, + "step": 844 + }, + { + "epoch": 0.23106371342630572, + "grad_norm": 0.14690068364143372, + "learning_rate": 5e-05, + "loss": 1.6194, + "step": 845 + }, + { + "epoch": 0.23133716160787532, + "grad_norm": 0.14810669422149658, + "learning_rate": 5e-05, + "loss": 1.6565, + "step": 846 + }, + { + "epoch": 0.2316106097894449, + "grad_norm": 0.1599397510290146, + "learning_rate": 5e-05, + "loss": 1.7253, + "step": 847 + }, + { + "epoch": 0.2318840579710145, + "grad_norm": 0.14897406101226807, + "learning_rate": 5e-05, + "loss": 1.6383, + "step": 848 + }, + { + "epoch": 0.2321575061525841, + "grad_norm": 0.14833606779575348, + "learning_rate": 5e-05, + "loss": 1.5975, + "step": 849 + }, + { + "epoch": 0.23243095433415367, + "grad_norm": 0.15134021639823914, + "learning_rate": 5e-05, + "loss": 1.6181, + "step": 850 + }, + { + "epoch": 0.23270440251572327, + "grad_norm": 0.1754770278930664, + "learning_rate": 5e-05, + "loss": 1.6905, + "step": 851 + }, + { + "epoch": 0.23297785069729288, + "grad_norm": 0.15956006944179535, + "learning_rate": 5e-05, + "loss": 1.7139, + "step": 852 + }, + { + "epoch": 0.23325129887886245, + "grad_norm": 0.1927836388349533, + "learning_rate": 5e-05, + "loss": 1.7274, + "step": 853 + }, + { + "epoch": 0.23352474706043205, + "grad_norm": 0.17408336699008942, + "learning_rate": 5e-05, + "loss": 1.6502, + "step": 854 + }, + { + "epoch": 0.23379819524200163, + "grad_norm": 0.15393292903900146, + "learning_rate": 5e-05, + "loss": 1.6537, + "step": 855 + }, + { + "epoch": 0.23407164342357123, + "grad_norm": 0.1970151662826538, + "learning_rate": 5e-05, + "loss": 1.6247, + "step": 856 + }, + { + "epoch": 0.23434509160514083, + "grad_norm": 0.14329130947589874, + "learning_rate": 5e-05, + "loss": 1.5819, + "step": 857 + }, + { + "epoch": 0.2346185397867104, + "grad_norm": 0.14831414818763733, + "learning_rate": 5e-05, + "loss": 1.6632, + "step": 858 + }, + { + "epoch": 0.23489198796828, + "grad_norm": 0.16687729954719543, + "learning_rate": 5e-05, + "loss": 1.6668, + "step": 859 + }, + { + "epoch": 0.2351654361498496, + "grad_norm": 0.14544977247714996, + "learning_rate": 5e-05, + "loss": 1.6716, + "step": 860 + }, + { + "epoch": 0.23543888433141918, + "grad_norm": 0.15175144374370575, + "learning_rate": 5e-05, + "loss": 1.7159, + "step": 861 + }, + { + "epoch": 0.2357123325129888, + "grad_norm": 0.17007999122142792, + "learning_rate": 5e-05, + "loss": 1.8236, + "step": 862 + }, + { + "epoch": 0.2359857806945584, + "grad_norm": 0.1416562795639038, + "learning_rate": 5e-05, + "loss": 1.6291, + "step": 863 + }, + { + "epoch": 0.23625922887612796, + "grad_norm": 0.16543252766132355, + "learning_rate": 5e-05, + "loss": 1.695, + "step": 864 + }, + { + "epoch": 0.23653267705769757, + "grad_norm": 0.16213998198509216, + "learning_rate": 5e-05, + "loss": 1.7187, + "step": 865 + }, + { + "epoch": 0.23680612523926717, + "grad_norm": 0.15842589735984802, + "learning_rate": 5e-05, + "loss": 1.7074, + "step": 866 + }, + { + "epoch": 0.23707957342083674, + "grad_norm": 0.16753311455249786, + "learning_rate": 5e-05, + "loss": 1.6487, + "step": 867 + }, + { + "epoch": 0.23735302160240634, + "grad_norm": 0.15180423855781555, + "learning_rate": 5e-05, + "loss": 1.6396, + "step": 868 + }, + { + "epoch": 0.23762646978397595, + "grad_norm": 0.1585211455821991, + "learning_rate": 5e-05, + "loss": 1.6959, + "step": 869 + }, + { + "epoch": 0.23789991796554552, + "grad_norm": 0.1608756184577942, + "learning_rate": 5e-05, + "loss": 1.7013, + "step": 870 + }, + { + "epoch": 0.23817336614711512, + "grad_norm": 0.14348694682121277, + "learning_rate": 5e-05, + "loss": 1.5302, + "step": 871 + }, + { + "epoch": 0.23844681432868473, + "grad_norm": 0.14560513198375702, + "learning_rate": 5e-05, + "loss": 1.6678, + "step": 872 + }, + { + "epoch": 0.2387202625102543, + "grad_norm": 0.1609930694103241, + "learning_rate": 5e-05, + "loss": 1.6599, + "step": 873 + }, + { + "epoch": 0.2389937106918239, + "grad_norm": 0.15456150472164154, + "learning_rate": 5e-05, + "loss": 1.6087, + "step": 874 + }, + { + "epoch": 0.2392671588733935, + "grad_norm": 0.14654546976089478, + "learning_rate": 5e-05, + "loss": 1.6199, + "step": 875 + }, + { + "epoch": 0.23954060705496308, + "grad_norm": 0.14911627769470215, + "learning_rate": 5e-05, + "loss": 1.7099, + "step": 876 + }, + { + "epoch": 0.23981405523653268, + "grad_norm": 0.17598745226860046, + "learning_rate": 5e-05, + "loss": 1.6854, + "step": 877 + }, + { + "epoch": 0.24008750341810228, + "grad_norm": 0.1485157608985901, + "learning_rate": 5e-05, + "loss": 1.652, + "step": 878 + }, + { + "epoch": 0.24036095159967186, + "grad_norm": 0.19263607263565063, + "learning_rate": 5e-05, + "loss": 1.6623, + "step": 879 + }, + { + "epoch": 0.24063439978124146, + "grad_norm": 0.15963439643383026, + "learning_rate": 5e-05, + "loss": 1.7693, + "step": 880 + }, + { + "epoch": 0.24090784796281103, + "grad_norm": 0.1660141795873642, + "learning_rate": 5e-05, + "loss": 1.5823, + "step": 881 + }, + { + "epoch": 0.24118129614438064, + "grad_norm": 0.16761627793312073, + "learning_rate": 5e-05, + "loss": 1.6647, + "step": 882 + }, + { + "epoch": 0.24145474432595024, + "grad_norm": 0.1527351289987564, + "learning_rate": 5e-05, + "loss": 1.5621, + "step": 883 + }, + { + "epoch": 0.2417281925075198, + "grad_norm": 0.1732681542634964, + "learning_rate": 5e-05, + "loss": 1.8039, + "step": 884 + }, + { + "epoch": 0.24200164068908941, + "grad_norm": 0.15724296867847443, + "learning_rate": 5e-05, + "loss": 1.5098, + "step": 885 + }, + { + "epoch": 0.24227508887065902, + "grad_norm": 0.1623707413673401, + "learning_rate": 5e-05, + "loss": 1.7287, + "step": 886 + }, + { + "epoch": 0.2425485370522286, + "grad_norm": 0.15608251094818115, + "learning_rate": 5e-05, + "loss": 1.6904, + "step": 887 + }, + { + "epoch": 0.2428219852337982, + "grad_norm": 0.16214315593242645, + "learning_rate": 5e-05, + "loss": 1.6739, + "step": 888 + }, + { + "epoch": 0.2430954334153678, + "grad_norm": 0.15404769778251648, + "learning_rate": 5e-05, + "loss": 1.6115, + "step": 889 + }, + { + "epoch": 0.24336888159693737, + "grad_norm": 0.17034713923931122, + "learning_rate": 5e-05, + "loss": 1.6412, + "step": 890 + }, + { + "epoch": 0.24364232977850697, + "grad_norm": 0.14412027597427368, + "learning_rate": 5e-05, + "loss": 1.5481, + "step": 891 + }, + { + "epoch": 0.24391577796007657, + "grad_norm": 0.1591709554195404, + "learning_rate": 5e-05, + "loss": 1.7115, + "step": 892 + }, + { + "epoch": 0.24418922614164615, + "grad_norm": 0.18128591775894165, + "learning_rate": 5e-05, + "loss": 1.6538, + "step": 893 + }, + { + "epoch": 0.24446267432321575, + "grad_norm": 0.1539849489927292, + "learning_rate": 5e-05, + "loss": 1.6899, + "step": 894 + }, + { + "epoch": 0.24473612250478535, + "grad_norm": 0.14450113475322723, + "learning_rate": 5e-05, + "loss": 1.5931, + "step": 895 + }, + { + "epoch": 0.24500957068635493, + "grad_norm": 0.152107372879982, + "learning_rate": 5e-05, + "loss": 1.6001, + "step": 896 + }, + { + "epoch": 0.24528301886792453, + "grad_norm": 0.14510129392147064, + "learning_rate": 5e-05, + "loss": 1.6048, + "step": 897 + }, + { + "epoch": 0.24555646704949413, + "grad_norm": 0.1490333080291748, + "learning_rate": 5e-05, + "loss": 1.6107, + "step": 898 + }, + { + "epoch": 0.2458299152310637, + "grad_norm": 0.14951343834400177, + "learning_rate": 5e-05, + "loss": 1.7005, + "step": 899 + }, + { + "epoch": 0.2461033634126333, + "grad_norm": 0.1492195427417755, + "learning_rate": 5e-05, + "loss": 1.6831, + "step": 900 + }, + { + "epoch": 0.2463768115942029, + "grad_norm": 0.15410034358501434, + "learning_rate": 5e-05, + "loss": 1.6933, + "step": 901 + }, + { + "epoch": 0.24665025977577248, + "grad_norm": 0.14823299646377563, + "learning_rate": 5e-05, + "loss": 1.637, + "step": 902 + }, + { + "epoch": 0.2469237079573421, + "grad_norm": 0.14969344437122345, + "learning_rate": 5e-05, + "loss": 1.6546, + "step": 903 + }, + { + "epoch": 0.2471971561389117, + "grad_norm": 0.14941661059856415, + "learning_rate": 5e-05, + "loss": 1.6124, + "step": 904 + }, + { + "epoch": 0.24747060432048126, + "grad_norm": 0.1853816956281662, + "learning_rate": 5e-05, + "loss": 1.6285, + "step": 905 + }, + { + "epoch": 0.24774405250205087, + "grad_norm": 0.15117646753787994, + "learning_rate": 5e-05, + "loss": 1.6274, + "step": 906 + }, + { + "epoch": 0.24801750068362047, + "grad_norm": 0.1522308588027954, + "learning_rate": 5e-05, + "loss": 1.735, + "step": 907 + }, + { + "epoch": 0.24829094886519004, + "grad_norm": 0.13882996141910553, + "learning_rate": 5e-05, + "loss": 1.5867, + "step": 908 + }, + { + "epoch": 0.24856439704675964, + "grad_norm": 0.1577073335647583, + "learning_rate": 5e-05, + "loss": 1.6647, + "step": 909 + }, + { + "epoch": 0.24883784522832922, + "grad_norm": 0.15113559365272522, + "learning_rate": 5e-05, + "loss": 1.6303, + "step": 910 + }, + { + "epoch": 0.24911129340989882, + "grad_norm": 0.15019430220127106, + "learning_rate": 5e-05, + "loss": 1.6976, + "step": 911 + }, + { + "epoch": 0.24938474159146842, + "grad_norm": 0.14112697541713715, + "learning_rate": 5e-05, + "loss": 1.54, + "step": 912 + }, + { + "epoch": 0.249658189773038, + "grad_norm": 0.14087800681591034, + "learning_rate": 5e-05, + "loss": 1.5316, + "step": 913 + }, + { + "epoch": 0.2499316379546076, + "grad_norm": 0.14896658062934875, + "learning_rate": 5e-05, + "loss": 1.561, + "step": 914 + }, + { + "epoch": 0.2502050861361772, + "grad_norm": 0.15148289501667023, + "learning_rate": 5e-05, + "loss": 1.6698, + "step": 915 + }, + { + "epoch": 0.2504785343177468, + "grad_norm": 0.1539052277803421, + "learning_rate": 5e-05, + "loss": 1.6017, + "step": 916 + }, + { + "epoch": 0.2507519824993164, + "grad_norm": 0.16018076241016388, + "learning_rate": 5e-05, + "loss": 1.6956, + "step": 917 + }, + { + "epoch": 0.25102543068088595, + "grad_norm": 0.17696644365787506, + "learning_rate": 5e-05, + "loss": 1.7196, + "step": 918 + }, + { + "epoch": 0.25129887886245555, + "grad_norm": 0.1509835124015808, + "learning_rate": 5e-05, + "loss": 1.6243, + "step": 919 + }, + { + "epoch": 0.25157232704402516, + "grad_norm": 0.15202875435352325, + "learning_rate": 5e-05, + "loss": 1.6868, + "step": 920 + }, + { + "epoch": 0.25184577522559476, + "grad_norm": 0.16320888698101044, + "learning_rate": 5e-05, + "loss": 1.697, + "step": 921 + }, + { + "epoch": 0.25211922340716436, + "grad_norm": 0.15281890332698822, + "learning_rate": 5e-05, + "loss": 1.5824, + "step": 922 + }, + { + "epoch": 0.2523926715887339, + "grad_norm": 0.1686651110649109, + "learning_rate": 5e-05, + "loss": 1.6362, + "step": 923 + }, + { + "epoch": 0.2526661197703035, + "grad_norm": 0.16698114573955536, + "learning_rate": 5e-05, + "loss": 1.6557, + "step": 924 + }, + { + "epoch": 0.2529395679518731, + "grad_norm": 0.16273106634616852, + "learning_rate": 5e-05, + "loss": 1.7441, + "step": 925 + }, + { + "epoch": 0.2532130161334427, + "grad_norm": 0.16961312294006348, + "learning_rate": 5e-05, + "loss": 1.7053, + "step": 926 + }, + { + "epoch": 0.2534864643150123, + "grad_norm": 0.1489211916923523, + "learning_rate": 5e-05, + "loss": 1.6257, + "step": 927 + }, + { + "epoch": 0.2537599124965819, + "grad_norm": 0.15058746933937073, + "learning_rate": 5e-05, + "loss": 1.61, + "step": 928 + }, + { + "epoch": 0.25403336067815147, + "grad_norm": 0.16194166243076324, + "learning_rate": 5e-05, + "loss": 1.6667, + "step": 929 + }, + { + "epoch": 0.25430680885972107, + "grad_norm": 0.1765557825565338, + "learning_rate": 5e-05, + "loss": 1.6572, + "step": 930 + }, + { + "epoch": 0.25458025704129067, + "grad_norm": 0.14746791124343872, + "learning_rate": 5e-05, + "loss": 1.6596, + "step": 931 + }, + { + "epoch": 0.25485370522286027, + "grad_norm": 0.14352820813655853, + "learning_rate": 5e-05, + "loss": 1.5291, + "step": 932 + }, + { + "epoch": 0.2551271534044299, + "grad_norm": 0.15766541659832, + "learning_rate": 5e-05, + "loss": 1.6496, + "step": 933 + }, + { + "epoch": 0.2554006015859995, + "grad_norm": 0.1461249589920044, + "learning_rate": 5e-05, + "loss": 1.6093, + "step": 934 + }, + { + "epoch": 0.255674049767569, + "grad_norm": 0.15647587180137634, + "learning_rate": 5e-05, + "loss": 1.7245, + "step": 935 + }, + { + "epoch": 0.2559474979491386, + "grad_norm": 0.16043874621391296, + "learning_rate": 5e-05, + "loss": 1.7584, + "step": 936 + }, + { + "epoch": 0.2562209461307082, + "grad_norm": 0.16398166120052338, + "learning_rate": 5e-05, + "loss": 1.7888, + "step": 937 + }, + { + "epoch": 0.25649439431227783, + "grad_norm": 0.1568370908498764, + "learning_rate": 5e-05, + "loss": 1.6886, + "step": 938 + }, + { + "epoch": 0.25676784249384743, + "grad_norm": 0.14432042837142944, + "learning_rate": 5e-05, + "loss": 1.5691, + "step": 939 + }, + { + "epoch": 0.25704129067541703, + "grad_norm": 0.15498638153076172, + "learning_rate": 5e-05, + "loss": 1.6414, + "step": 940 + }, + { + "epoch": 0.2573147388569866, + "grad_norm": 0.1573934257030487, + "learning_rate": 5e-05, + "loss": 1.7171, + "step": 941 + }, + { + "epoch": 0.2575881870385562, + "grad_norm": 0.16507598757743835, + "learning_rate": 5e-05, + "loss": 1.7697, + "step": 942 + }, + { + "epoch": 0.2578616352201258, + "grad_norm": 0.16209067404270172, + "learning_rate": 5e-05, + "loss": 1.7732, + "step": 943 + }, + { + "epoch": 0.2581350834016954, + "grad_norm": 0.16477470099925995, + "learning_rate": 5e-05, + "loss": 1.6439, + "step": 944 + }, + { + "epoch": 0.258408531583265, + "grad_norm": 0.15217360854148865, + "learning_rate": 5e-05, + "loss": 1.6341, + "step": 945 + }, + { + "epoch": 0.2586819797648346, + "grad_norm": 0.16021955013275146, + "learning_rate": 5e-05, + "loss": 1.6829, + "step": 946 + }, + { + "epoch": 0.25895542794640414, + "grad_norm": 0.16192196309566498, + "learning_rate": 5e-05, + "loss": 1.6225, + "step": 947 + }, + { + "epoch": 0.25922887612797374, + "grad_norm": 0.15227045118808746, + "learning_rate": 5e-05, + "loss": 1.6503, + "step": 948 + }, + { + "epoch": 0.25950232430954334, + "grad_norm": 0.17690598964691162, + "learning_rate": 5e-05, + "loss": 1.7092, + "step": 949 + }, + { + "epoch": 0.25977577249111294, + "grad_norm": 0.1463916003704071, + "learning_rate": 5e-05, + "loss": 1.5948, + "step": 950 + }, + { + "epoch": 0.26004922067268255, + "grad_norm": 0.16608351469039917, + "learning_rate": 5e-05, + "loss": 1.6339, + "step": 951 + }, + { + "epoch": 0.2603226688542521, + "grad_norm": 0.16047005355358124, + "learning_rate": 5e-05, + "loss": 1.6928, + "step": 952 + }, + { + "epoch": 0.2605961170358217, + "grad_norm": 0.1434023380279541, + "learning_rate": 5e-05, + "loss": 1.5613, + "step": 953 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.1558619737625122, + "learning_rate": 5e-05, + "loss": 1.6868, + "step": 954 + }, + { + "epoch": 0.2611430133989609, + "grad_norm": 0.14743700623512268, + "learning_rate": 5e-05, + "loss": 1.612, + "step": 955 + }, + { + "epoch": 0.2614164615805305, + "grad_norm": 0.15097011625766754, + "learning_rate": 5e-05, + "loss": 1.5625, + "step": 956 + }, + { + "epoch": 0.2616899097621001, + "grad_norm": 0.14291320741176605, + "learning_rate": 5e-05, + "loss": 1.6787, + "step": 957 + }, + { + "epoch": 0.26196335794366965, + "grad_norm": 0.15289993584156036, + "learning_rate": 5e-05, + "loss": 1.754, + "step": 958 + }, + { + "epoch": 0.26223680612523925, + "grad_norm": 0.15618576109409332, + "learning_rate": 5e-05, + "loss": 1.7212, + "step": 959 + }, + { + "epoch": 0.26251025430680885, + "grad_norm": 0.14384949207305908, + "learning_rate": 5e-05, + "loss": 1.6108, + "step": 960 + }, + { + "epoch": 0.26278370248837846, + "grad_norm": 0.15583495795726776, + "learning_rate": 5e-05, + "loss": 1.6998, + "step": 961 + }, + { + "epoch": 0.26305715066994806, + "grad_norm": 0.1544669270515442, + "learning_rate": 5e-05, + "loss": 1.646, + "step": 962 + }, + { + "epoch": 0.26333059885151766, + "grad_norm": 0.15387408435344696, + "learning_rate": 5e-05, + "loss": 1.7415, + "step": 963 + }, + { + "epoch": 0.2636040470330872, + "grad_norm": 0.15197156369686127, + "learning_rate": 5e-05, + "loss": 1.6324, + "step": 964 + }, + { + "epoch": 0.2638774952146568, + "grad_norm": 0.15596544742584229, + "learning_rate": 5e-05, + "loss": 1.6116, + "step": 965 + }, + { + "epoch": 0.2641509433962264, + "grad_norm": 0.15174002945423126, + "learning_rate": 5e-05, + "loss": 1.6552, + "step": 966 + }, + { + "epoch": 0.264424391577796, + "grad_norm": 0.16541074216365814, + "learning_rate": 5e-05, + "loss": 1.7092, + "step": 967 + }, + { + "epoch": 0.2646978397593656, + "grad_norm": 0.15054158866405487, + "learning_rate": 5e-05, + "loss": 1.6584, + "step": 968 + }, + { + "epoch": 0.2649712879409352, + "grad_norm": 0.16845272481441498, + "learning_rate": 5e-05, + "loss": 1.7485, + "step": 969 + }, + { + "epoch": 0.26524473612250477, + "grad_norm": 0.15951582789421082, + "learning_rate": 5e-05, + "loss": 1.6003, + "step": 970 + }, + { + "epoch": 0.26551818430407437, + "grad_norm": 0.15601181983947754, + "learning_rate": 5e-05, + "loss": 1.6222, + "step": 971 + }, + { + "epoch": 0.26579163248564397, + "grad_norm": 0.1529085487127304, + "learning_rate": 5e-05, + "loss": 1.687, + "step": 972 + }, + { + "epoch": 0.26606508066721357, + "grad_norm": 0.15299002826213837, + "learning_rate": 5e-05, + "loss": 1.716, + "step": 973 + }, + { + "epoch": 0.2663385288487832, + "grad_norm": 0.17247562110424042, + "learning_rate": 5e-05, + "loss": 1.6112, + "step": 974 + }, + { + "epoch": 0.2666119770303528, + "grad_norm": 0.16275718808174133, + "learning_rate": 5e-05, + "loss": 1.6481, + "step": 975 + }, + { + "epoch": 0.2668854252119223, + "grad_norm": 0.18116293847560883, + "learning_rate": 5e-05, + "loss": 1.5794, + "step": 976 + }, + { + "epoch": 0.2671588733934919, + "grad_norm": 0.14527344703674316, + "learning_rate": 5e-05, + "loss": 1.6521, + "step": 977 + }, + { + "epoch": 0.2674323215750615, + "grad_norm": 0.15483027696609497, + "learning_rate": 5e-05, + "loss": 1.637, + "step": 978 + }, + { + "epoch": 0.26770576975663113, + "grad_norm": 0.15565639734268188, + "learning_rate": 5e-05, + "loss": 1.6142, + "step": 979 + }, + { + "epoch": 0.26797921793820073, + "grad_norm": 0.15443767607212067, + "learning_rate": 5e-05, + "loss": 1.5859, + "step": 980 + }, + { + "epoch": 0.2682526661197703, + "grad_norm": 0.15313716232776642, + "learning_rate": 5e-05, + "loss": 1.6508, + "step": 981 + }, + { + "epoch": 0.2685261143013399, + "grad_norm": 0.15430563688278198, + "learning_rate": 5e-05, + "loss": 1.6352, + "step": 982 + }, + { + "epoch": 0.2687995624829095, + "grad_norm": 0.15906836092472076, + "learning_rate": 5e-05, + "loss": 1.7201, + "step": 983 + }, + { + "epoch": 0.2690730106644791, + "grad_norm": 0.151002898812294, + "learning_rate": 5e-05, + "loss": 1.7418, + "step": 984 + }, + { + "epoch": 0.2693464588460487, + "grad_norm": 0.1649433970451355, + "learning_rate": 5e-05, + "loss": 1.6364, + "step": 985 + }, + { + "epoch": 0.2696199070276183, + "grad_norm": 0.15000388026237488, + "learning_rate": 5e-05, + "loss": 1.5924, + "step": 986 + }, + { + "epoch": 0.26989335520918784, + "grad_norm": 0.1750802844762802, + "learning_rate": 5e-05, + "loss": 1.621, + "step": 987 + }, + { + "epoch": 0.27016680339075744, + "grad_norm": 0.14742594957351685, + "learning_rate": 5e-05, + "loss": 1.6103, + "step": 988 + }, + { + "epoch": 0.27044025157232704, + "grad_norm": 0.1554790437221527, + "learning_rate": 5e-05, + "loss": 1.6996, + "step": 989 + }, + { + "epoch": 0.27071369975389664, + "grad_norm": 0.16735535860061646, + "learning_rate": 5e-05, + "loss": 1.6705, + "step": 990 + }, + { + "epoch": 0.27098714793546624, + "grad_norm": 0.1675184816122055, + "learning_rate": 5e-05, + "loss": 1.7114, + "step": 991 + }, + { + "epoch": 0.27126059611703585, + "grad_norm": 0.1542404443025589, + "learning_rate": 5e-05, + "loss": 1.5953, + "step": 992 + }, + { + "epoch": 0.2715340442986054, + "grad_norm": 0.1605929136276245, + "learning_rate": 5e-05, + "loss": 1.5541, + "step": 993 + }, + { + "epoch": 0.271807492480175, + "grad_norm": 0.17429675161838531, + "learning_rate": 5e-05, + "loss": 1.6393, + "step": 994 + }, + { + "epoch": 0.2720809406617446, + "grad_norm": 0.16409185528755188, + "learning_rate": 5e-05, + "loss": 1.7244, + "step": 995 + }, + { + "epoch": 0.2723543888433142, + "grad_norm": 0.16131426393985748, + "learning_rate": 5e-05, + "loss": 1.6339, + "step": 996 + }, + { + "epoch": 0.2726278370248838, + "grad_norm": 0.14714978635311127, + "learning_rate": 5e-05, + "loss": 1.5154, + "step": 997 + }, + { + "epoch": 0.2729012852064534, + "grad_norm": 0.14420582354068756, + "learning_rate": 5e-05, + "loss": 1.654, + "step": 998 + }, + { + "epoch": 0.27317473338802295, + "grad_norm": 0.15642261505126953, + "learning_rate": 5e-05, + "loss": 1.5478, + "step": 999 + }, + { + "epoch": 0.27344818156959255, + "grad_norm": 0.14731954038143158, + "learning_rate": 5e-05, + "loss": 1.5659, + "step": 1000 + }, + { + "epoch": 0.27372162975116215, + "grad_norm": 0.16006474196910858, + "learning_rate": 5e-05, + "loss": 1.6048, + "step": 1001 + }, + { + "epoch": 0.27399507793273176, + "grad_norm": 0.15757764875888824, + "learning_rate": 5e-05, + "loss": 1.6572, + "step": 1002 + }, + { + "epoch": 0.27426852611430136, + "grad_norm": 0.15396282076835632, + "learning_rate": 5e-05, + "loss": 1.7424, + "step": 1003 + }, + { + "epoch": 0.2745419742958709, + "grad_norm": 0.15785497426986694, + "learning_rate": 5e-05, + "loss": 1.7142, + "step": 1004 + }, + { + "epoch": 0.2748154224774405, + "grad_norm": 0.14254657924175262, + "learning_rate": 5e-05, + "loss": 1.5812, + "step": 1005 + }, + { + "epoch": 0.2750888706590101, + "grad_norm": 0.1501447856426239, + "learning_rate": 5e-05, + "loss": 1.7112, + "step": 1006 + }, + { + "epoch": 0.2753623188405797, + "grad_norm": 0.16908079385757446, + "learning_rate": 5e-05, + "loss": 1.7688, + "step": 1007 + }, + { + "epoch": 0.2756357670221493, + "grad_norm": 0.1612824648618698, + "learning_rate": 5e-05, + "loss": 1.6473, + "step": 1008 + }, + { + "epoch": 0.2759092152037189, + "grad_norm": 0.1502557396888733, + "learning_rate": 5e-05, + "loss": 1.6008, + "step": 1009 + }, + { + "epoch": 0.27618266338528846, + "grad_norm": 0.1549758017063141, + "learning_rate": 5e-05, + "loss": 1.6473, + "step": 1010 + }, + { + "epoch": 0.27645611156685806, + "grad_norm": 0.1512656807899475, + "learning_rate": 5e-05, + "loss": 1.7336, + "step": 1011 + }, + { + "epoch": 0.27672955974842767, + "grad_norm": 0.1543295830488205, + "learning_rate": 5e-05, + "loss": 1.6293, + "step": 1012 + }, + { + "epoch": 0.27700300792999727, + "grad_norm": 0.15347859263420105, + "learning_rate": 5e-05, + "loss": 1.589, + "step": 1013 + }, + { + "epoch": 0.27727645611156687, + "grad_norm": 0.15677101910114288, + "learning_rate": 5e-05, + "loss": 1.7283, + "step": 1014 + }, + { + "epoch": 0.2775499042931365, + "grad_norm": 0.16513799130916595, + "learning_rate": 5e-05, + "loss": 1.6725, + "step": 1015 + }, + { + "epoch": 0.277823352474706, + "grad_norm": 0.15116244554519653, + "learning_rate": 5e-05, + "loss": 1.639, + "step": 1016 + }, + { + "epoch": 0.2780968006562756, + "grad_norm": 0.1464737504720688, + "learning_rate": 5e-05, + "loss": 1.6598, + "step": 1017 + }, + { + "epoch": 0.2783702488378452, + "grad_norm": 0.1716412454843521, + "learning_rate": 5e-05, + "loss": 1.6941, + "step": 1018 + }, + { + "epoch": 0.2786436970194148, + "grad_norm": 0.15639688074588776, + "learning_rate": 5e-05, + "loss": 1.6334, + "step": 1019 + }, + { + "epoch": 0.27891714520098443, + "grad_norm": 0.1594480276107788, + "learning_rate": 5e-05, + "loss": 1.6916, + "step": 1020 + }, + { + "epoch": 0.27919059338255403, + "grad_norm": 0.14737538993358612, + "learning_rate": 5e-05, + "loss": 1.5393, + "step": 1021 + }, + { + "epoch": 0.2794640415641236, + "grad_norm": 0.15937146544456482, + "learning_rate": 5e-05, + "loss": 1.7071, + "step": 1022 + }, + { + "epoch": 0.2797374897456932, + "grad_norm": 0.15143409371376038, + "learning_rate": 5e-05, + "loss": 1.6276, + "step": 1023 + }, + { + "epoch": 0.2800109379272628, + "grad_norm": 0.16173769533634186, + "learning_rate": 5e-05, + "loss": 1.6131, + "step": 1024 + }, + { + "epoch": 0.2802843861088324, + "grad_norm": 0.14734511077404022, + "learning_rate": 5e-05, + "loss": 1.6051, + "step": 1025 + }, + { + "epoch": 0.280557834290402, + "grad_norm": 0.1571275144815445, + "learning_rate": 5e-05, + "loss": 1.7169, + "step": 1026 + }, + { + "epoch": 0.2808312824719716, + "grad_norm": 0.15668824315071106, + "learning_rate": 5e-05, + "loss": 1.8378, + "step": 1027 + }, + { + "epoch": 0.28110473065354113, + "grad_norm": 0.15768049657344818, + "learning_rate": 5e-05, + "loss": 1.6911, + "step": 1028 + }, + { + "epoch": 0.28137817883511074, + "grad_norm": 0.15700852870941162, + "learning_rate": 5e-05, + "loss": 1.7142, + "step": 1029 + }, + { + "epoch": 0.28165162701668034, + "grad_norm": 0.14887461066246033, + "learning_rate": 5e-05, + "loss": 1.6241, + "step": 1030 + }, + { + "epoch": 0.28192507519824994, + "grad_norm": 0.15155330300331116, + "learning_rate": 5e-05, + "loss": 1.7063, + "step": 1031 + }, + { + "epoch": 0.28219852337981954, + "grad_norm": 0.1655176430940628, + "learning_rate": 5e-05, + "loss": 1.6063, + "step": 1032 + }, + { + "epoch": 0.2824719715613891, + "grad_norm": 0.1478133499622345, + "learning_rate": 5e-05, + "loss": 1.6977, + "step": 1033 + }, + { + "epoch": 0.2827454197429587, + "grad_norm": 0.14731581509113312, + "learning_rate": 5e-05, + "loss": 1.6238, + "step": 1034 + }, + { + "epoch": 0.2830188679245283, + "grad_norm": 0.16165371239185333, + "learning_rate": 5e-05, + "loss": 1.5795, + "step": 1035 + }, + { + "epoch": 0.2832923161060979, + "grad_norm": 0.1721602976322174, + "learning_rate": 5e-05, + "loss": 1.7447, + "step": 1036 + }, + { + "epoch": 0.2835657642876675, + "grad_norm": 0.14430297911167145, + "learning_rate": 5e-05, + "loss": 1.5664, + "step": 1037 + }, + { + "epoch": 0.2838392124692371, + "grad_norm": 0.15474802255630493, + "learning_rate": 5e-05, + "loss": 1.6093, + "step": 1038 + }, + { + "epoch": 0.28411266065080665, + "grad_norm": 0.16973662376403809, + "learning_rate": 5e-05, + "loss": 1.601, + "step": 1039 + }, + { + "epoch": 0.28438610883237625, + "grad_norm": 0.170043483376503, + "learning_rate": 5e-05, + "loss": 1.6629, + "step": 1040 + }, + { + "epoch": 0.28465955701394585, + "grad_norm": 0.1592838615179062, + "learning_rate": 5e-05, + "loss": 1.6638, + "step": 1041 + }, + { + "epoch": 0.28493300519551545, + "grad_norm": 0.1679082214832306, + "learning_rate": 5e-05, + "loss": 1.6667, + "step": 1042 + }, + { + "epoch": 0.28520645337708506, + "grad_norm": 0.17934280633926392, + "learning_rate": 5e-05, + "loss": 1.6224, + "step": 1043 + }, + { + "epoch": 0.28547990155865466, + "grad_norm": 0.14445669949054718, + "learning_rate": 5e-05, + "loss": 1.6861, + "step": 1044 + }, + { + "epoch": 0.2857533497402242, + "grad_norm": 0.15397311747074127, + "learning_rate": 5e-05, + "loss": 1.7506, + "step": 1045 + }, + { + "epoch": 0.2860267979217938, + "grad_norm": 0.15878726541996002, + "learning_rate": 5e-05, + "loss": 1.7283, + "step": 1046 + }, + { + "epoch": 0.2863002461033634, + "grad_norm": 0.15634529292583466, + "learning_rate": 5e-05, + "loss": 1.6385, + "step": 1047 + }, + { + "epoch": 0.286573694284933, + "grad_norm": 0.15060849487781525, + "learning_rate": 5e-05, + "loss": 1.6744, + "step": 1048 + }, + { + "epoch": 0.2868471424665026, + "grad_norm": 0.16258785128593445, + "learning_rate": 5e-05, + "loss": 1.6701, + "step": 1049 + }, + { + "epoch": 0.2871205906480722, + "grad_norm": 0.14815811812877655, + "learning_rate": 5e-05, + "loss": 1.5947, + "step": 1050 + }, + { + "epoch": 0.28739403882964176, + "grad_norm": 0.17626261711120605, + "learning_rate": 5e-05, + "loss": 1.6407, + "step": 1051 + }, + { + "epoch": 0.28766748701121136, + "grad_norm": 0.1558643877506256, + "learning_rate": 5e-05, + "loss": 1.6002, + "step": 1052 + }, + { + "epoch": 0.28794093519278097, + "grad_norm": 0.14758311212062836, + "learning_rate": 5e-05, + "loss": 1.5985, + "step": 1053 + }, + { + "epoch": 0.28821438337435057, + "grad_norm": 0.16778376698493958, + "learning_rate": 5e-05, + "loss": 1.6799, + "step": 1054 + }, + { + "epoch": 0.28848783155592017, + "grad_norm": 0.16925834119319916, + "learning_rate": 5e-05, + "loss": 1.6951, + "step": 1055 + }, + { + "epoch": 0.2887612797374898, + "grad_norm": 0.16058339178562164, + "learning_rate": 5e-05, + "loss": 1.6221, + "step": 1056 + }, + { + "epoch": 0.2890347279190593, + "grad_norm": 0.16010229289531708, + "learning_rate": 5e-05, + "loss": 1.6665, + "step": 1057 + }, + { + "epoch": 0.2893081761006289, + "grad_norm": 0.15044192969799042, + "learning_rate": 5e-05, + "loss": 1.6247, + "step": 1058 + }, + { + "epoch": 0.2895816242821985, + "grad_norm": 0.14547835290431976, + "learning_rate": 5e-05, + "loss": 1.6111, + "step": 1059 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.15208472311496735, + "learning_rate": 5e-05, + "loss": 1.7198, + "step": 1060 + }, + { + "epoch": 0.29012852064533773, + "grad_norm": 0.15193967521190643, + "learning_rate": 5e-05, + "loss": 1.6775, + "step": 1061 + }, + { + "epoch": 0.2904019688269073, + "grad_norm": 0.15141284465789795, + "learning_rate": 5e-05, + "loss": 1.6727, + "step": 1062 + }, + { + "epoch": 0.2906754170084769, + "grad_norm": 0.16151681542396545, + "learning_rate": 5e-05, + "loss": 1.7054, + "step": 1063 + }, + { + "epoch": 0.2909488651900465, + "grad_norm": 0.15598168969154358, + "learning_rate": 5e-05, + "loss": 1.6536, + "step": 1064 + }, + { + "epoch": 0.2912223133716161, + "grad_norm": 0.147295281291008, + "learning_rate": 5e-05, + "loss": 1.658, + "step": 1065 + }, + { + "epoch": 0.2914957615531857, + "grad_norm": 0.1515830159187317, + "learning_rate": 5e-05, + "loss": 1.6548, + "step": 1066 + }, + { + "epoch": 0.2917692097347553, + "grad_norm": 0.1516982913017273, + "learning_rate": 5e-05, + "loss": 1.6866, + "step": 1067 + }, + { + "epoch": 0.29204265791632483, + "grad_norm": 0.16418761014938354, + "learning_rate": 5e-05, + "loss": 1.7676, + "step": 1068 + }, + { + "epoch": 0.29231610609789443, + "grad_norm": 0.1566287726163864, + "learning_rate": 5e-05, + "loss": 1.6747, + "step": 1069 + }, + { + "epoch": 0.29258955427946404, + "grad_norm": 0.16158966720104218, + "learning_rate": 5e-05, + "loss": 1.6606, + "step": 1070 + }, + { + "epoch": 0.29286300246103364, + "grad_norm": 0.1478201299905777, + "learning_rate": 5e-05, + "loss": 1.708, + "step": 1071 + }, + { + "epoch": 0.29313645064260324, + "grad_norm": 0.15915492177009583, + "learning_rate": 5e-05, + "loss": 1.6656, + "step": 1072 + }, + { + "epoch": 0.29340989882417284, + "grad_norm": 0.16170357167720795, + "learning_rate": 5e-05, + "loss": 1.6458, + "step": 1073 + }, + { + "epoch": 0.2936833470057424, + "grad_norm": 0.1563751995563507, + "learning_rate": 5e-05, + "loss": 1.6192, + "step": 1074 + }, + { + "epoch": 0.293956795187312, + "grad_norm": 0.1516135334968567, + "learning_rate": 5e-05, + "loss": 1.6441, + "step": 1075 + }, + { + "epoch": 0.2942302433688816, + "grad_norm": 0.14756453037261963, + "learning_rate": 5e-05, + "loss": 1.5897, + "step": 1076 + }, + { + "epoch": 0.2945036915504512, + "grad_norm": 0.1514975130558014, + "learning_rate": 5e-05, + "loss": 1.6956, + "step": 1077 + }, + { + "epoch": 0.2947771397320208, + "grad_norm": 0.1433197408914566, + "learning_rate": 5e-05, + "loss": 1.5484, + "step": 1078 + }, + { + "epoch": 0.2950505879135904, + "grad_norm": 0.14783865213394165, + "learning_rate": 5e-05, + "loss": 1.6135, + "step": 1079 + }, + { + "epoch": 0.29532403609515995, + "grad_norm": 0.1563805192708969, + "learning_rate": 5e-05, + "loss": 1.6578, + "step": 1080 + }, + { + "epoch": 0.29559748427672955, + "grad_norm": 0.15550269186496735, + "learning_rate": 5e-05, + "loss": 1.6739, + "step": 1081 + }, + { + "epoch": 0.29587093245829915, + "grad_norm": 0.1647748500108719, + "learning_rate": 5e-05, + "loss": 1.8268, + "step": 1082 + }, + { + "epoch": 0.29614438063986875, + "grad_norm": 0.14956046640872955, + "learning_rate": 5e-05, + "loss": 1.6989, + "step": 1083 + }, + { + "epoch": 0.29641782882143836, + "grad_norm": 0.1558298021554947, + "learning_rate": 5e-05, + "loss": 1.6784, + "step": 1084 + }, + { + "epoch": 0.2966912770030079, + "grad_norm": 0.16264335811138153, + "learning_rate": 5e-05, + "loss": 1.6733, + "step": 1085 + }, + { + "epoch": 0.2969647251845775, + "grad_norm": 0.15490394830703735, + "learning_rate": 5e-05, + "loss": 1.6337, + "step": 1086 + }, + { + "epoch": 0.2972381733661471, + "grad_norm": 0.15151673555374146, + "learning_rate": 5e-05, + "loss": 1.6566, + "step": 1087 + }, + { + "epoch": 0.2975116215477167, + "grad_norm": 0.15661917626857758, + "learning_rate": 5e-05, + "loss": 1.6329, + "step": 1088 + }, + { + "epoch": 0.2977850697292863, + "grad_norm": 0.1620190590620041, + "learning_rate": 5e-05, + "loss": 1.6779, + "step": 1089 + }, + { + "epoch": 0.2980585179108559, + "grad_norm": 0.15718665719032288, + "learning_rate": 5e-05, + "loss": 1.6008, + "step": 1090 + }, + { + "epoch": 0.29833196609242546, + "grad_norm": 0.15119051933288574, + "learning_rate": 5e-05, + "loss": 1.6514, + "step": 1091 + }, + { + "epoch": 0.29860541427399506, + "grad_norm": 0.172483429312706, + "learning_rate": 5e-05, + "loss": 1.6253, + "step": 1092 + }, + { + "epoch": 0.29887886245556466, + "grad_norm": 0.17236994206905365, + "learning_rate": 5e-05, + "loss": 1.6927, + "step": 1093 + }, + { + "epoch": 0.29915231063713427, + "grad_norm": 0.1658979058265686, + "learning_rate": 5e-05, + "loss": 1.6684, + "step": 1094 + }, + { + "epoch": 0.29942575881870387, + "grad_norm": 0.16414695978164673, + "learning_rate": 5e-05, + "loss": 1.5933, + "step": 1095 + }, + { + "epoch": 0.29969920700027347, + "grad_norm": 0.156681627035141, + "learning_rate": 5e-05, + "loss": 1.6016, + "step": 1096 + }, + { + "epoch": 0.299972655181843, + "grad_norm": 0.14425235986709595, + "learning_rate": 5e-05, + "loss": 1.5941, + "step": 1097 + }, + { + "epoch": 0.3002461033634126, + "grad_norm": 0.16902539134025574, + "learning_rate": 5e-05, + "loss": 1.5992, + "step": 1098 + }, + { + "epoch": 0.3005195515449822, + "grad_norm": 0.17275625467300415, + "learning_rate": 5e-05, + "loss": 1.7861, + "step": 1099 + }, + { + "epoch": 0.3007929997265518, + "grad_norm": 0.15677183866500854, + "learning_rate": 5e-05, + "loss": 1.6925, + "step": 1100 + }, + { + "epoch": 0.3010664479081214, + "grad_norm": 0.1713411509990692, + "learning_rate": 5e-05, + "loss": 1.6661, + "step": 1101 + }, + { + "epoch": 0.30133989608969103, + "grad_norm": 0.15756134688854218, + "learning_rate": 5e-05, + "loss": 1.6203, + "step": 1102 + }, + { + "epoch": 0.3016133442712606, + "grad_norm": 0.15823891758918762, + "learning_rate": 5e-05, + "loss": 1.6594, + "step": 1103 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 0.15777519345283508, + "learning_rate": 5e-05, + "loss": 1.6548, + "step": 1104 + }, + { + "epoch": 0.3021602406343998, + "grad_norm": 0.15784233808517456, + "learning_rate": 5e-05, + "loss": 1.6849, + "step": 1105 + }, + { + "epoch": 0.3024336888159694, + "grad_norm": 0.1561124622821808, + "learning_rate": 5e-05, + "loss": 1.6548, + "step": 1106 + }, + { + "epoch": 0.302707136997539, + "grad_norm": 0.1538151651620865, + "learning_rate": 5e-05, + "loss": 1.6838, + "step": 1107 + }, + { + "epoch": 0.3029805851791086, + "grad_norm": 0.14938168227672577, + "learning_rate": 5e-05, + "loss": 1.6129, + "step": 1108 + }, + { + "epoch": 0.30325403336067813, + "grad_norm": 0.1569468080997467, + "learning_rate": 5e-05, + "loss": 1.6651, + "step": 1109 + }, + { + "epoch": 0.30352748154224773, + "grad_norm": 0.15352605283260345, + "learning_rate": 5e-05, + "loss": 1.7078, + "step": 1110 + }, + { + "epoch": 0.30380092972381734, + "grad_norm": 0.15539826452732086, + "learning_rate": 5e-05, + "loss": 1.643, + "step": 1111 + }, + { + "epoch": 0.30407437790538694, + "grad_norm": 0.14606958627700806, + "learning_rate": 5e-05, + "loss": 1.6365, + "step": 1112 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 0.15335801243782043, + "learning_rate": 5e-05, + "loss": 1.6818, + "step": 1113 + }, + { + "epoch": 0.3046212742685261, + "grad_norm": 0.15371185541152954, + "learning_rate": 5e-05, + "loss": 1.6413, + "step": 1114 + }, + { + "epoch": 0.3048947224500957, + "grad_norm": 0.15357623994350433, + "learning_rate": 5e-05, + "loss": 1.6887, + "step": 1115 + }, + { + "epoch": 0.3051681706316653, + "grad_norm": 0.14678330719470978, + "learning_rate": 5e-05, + "loss": 1.7126, + "step": 1116 + }, + { + "epoch": 0.3054416188132349, + "grad_norm": 0.1587059050798416, + "learning_rate": 5e-05, + "loss": 1.6373, + "step": 1117 + }, + { + "epoch": 0.3057150669948045, + "grad_norm": 0.1466260552406311, + "learning_rate": 5e-05, + "loss": 1.6006, + "step": 1118 + }, + { + "epoch": 0.3059885151763741, + "grad_norm": 0.14602194726467133, + "learning_rate": 5e-05, + "loss": 1.6627, + "step": 1119 + }, + { + "epoch": 0.30626196335794365, + "grad_norm": 0.1472383588552475, + "learning_rate": 5e-05, + "loss": 1.5288, + "step": 1120 + }, + { + "epoch": 0.30653541153951325, + "grad_norm": 0.1505843549966812, + "learning_rate": 5e-05, + "loss": 1.6624, + "step": 1121 + }, + { + "epoch": 0.30680885972108285, + "grad_norm": 0.15707598626613617, + "learning_rate": 5e-05, + "loss": 1.6482, + "step": 1122 + }, + { + "epoch": 0.30708230790265245, + "grad_norm": 0.15016111731529236, + "learning_rate": 5e-05, + "loss": 1.5778, + "step": 1123 + }, + { + "epoch": 0.30735575608422205, + "grad_norm": 0.1513780951499939, + "learning_rate": 5e-05, + "loss": 1.5779, + "step": 1124 + }, + { + "epoch": 0.30762920426579166, + "grad_norm": 0.15368494391441345, + "learning_rate": 5e-05, + "loss": 1.6531, + "step": 1125 + }, + { + "epoch": 0.3079026524473612, + "grad_norm": 0.1544412523508072, + "learning_rate": 5e-05, + "loss": 1.7159, + "step": 1126 + }, + { + "epoch": 0.3081761006289308, + "grad_norm": 0.14770282804965973, + "learning_rate": 5e-05, + "loss": 1.6338, + "step": 1127 + }, + { + "epoch": 0.3084495488105004, + "grad_norm": 0.14697682857513428, + "learning_rate": 5e-05, + "loss": 1.6817, + "step": 1128 + }, + { + "epoch": 0.30872299699207, + "grad_norm": 0.15029069781303406, + "learning_rate": 5e-05, + "loss": 1.7186, + "step": 1129 + }, + { + "epoch": 0.3089964451736396, + "grad_norm": 0.15909984707832336, + "learning_rate": 5e-05, + "loss": 1.7119, + "step": 1130 + }, + { + "epoch": 0.3092698933552092, + "grad_norm": 0.14838500320911407, + "learning_rate": 5e-05, + "loss": 1.615, + "step": 1131 + }, + { + "epoch": 0.30954334153677876, + "grad_norm": 0.15336006879806519, + "learning_rate": 5e-05, + "loss": 1.6544, + "step": 1132 + }, + { + "epoch": 0.30981678971834836, + "grad_norm": 0.15481799840927124, + "learning_rate": 5e-05, + "loss": 1.6279, + "step": 1133 + }, + { + "epoch": 0.31009023789991796, + "grad_norm": 0.1593121588230133, + "learning_rate": 5e-05, + "loss": 1.6697, + "step": 1134 + }, + { + "epoch": 0.31036368608148757, + "grad_norm": 0.16930562257766724, + "learning_rate": 5e-05, + "loss": 1.6706, + "step": 1135 + }, + { + "epoch": 0.31063713426305717, + "grad_norm": 0.15615466237068176, + "learning_rate": 5e-05, + "loss": 1.6018, + "step": 1136 + }, + { + "epoch": 0.31091058244462677, + "grad_norm": 0.15975496172904968, + "learning_rate": 5e-05, + "loss": 1.696, + "step": 1137 + }, + { + "epoch": 0.3111840306261963, + "grad_norm": 0.14851494133472443, + "learning_rate": 5e-05, + "loss": 1.5931, + "step": 1138 + }, + { + "epoch": 0.3114574788077659, + "grad_norm": 0.1657806932926178, + "learning_rate": 5e-05, + "loss": 1.7412, + "step": 1139 + }, + { + "epoch": 0.3117309269893355, + "grad_norm": 0.15335924923419952, + "learning_rate": 5e-05, + "loss": 1.6903, + "step": 1140 + }, + { + "epoch": 0.3120043751709051, + "grad_norm": 0.15214680135250092, + "learning_rate": 5e-05, + "loss": 1.7338, + "step": 1141 + }, + { + "epoch": 0.3122778233524747, + "grad_norm": 0.16068771481513977, + "learning_rate": 5e-05, + "loss": 1.712, + "step": 1142 + }, + { + "epoch": 0.3125512715340443, + "grad_norm": 0.15814535319805145, + "learning_rate": 5e-05, + "loss": 1.7013, + "step": 1143 + }, + { + "epoch": 0.3128247197156139, + "grad_norm": 0.1487572193145752, + "learning_rate": 5e-05, + "loss": 1.6186, + "step": 1144 + }, + { + "epoch": 0.3130981678971835, + "grad_norm": 0.15032756328582764, + "learning_rate": 5e-05, + "loss": 1.6842, + "step": 1145 + }, + { + "epoch": 0.3133716160787531, + "grad_norm": 0.15075697004795074, + "learning_rate": 5e-05, + "loss": 1.536, + "step": 1146 + }, + { + "epoch": 0.3136450642603227, + "grad_norm": 0.15556646883487701, + "learning_rate": 5e-05, + "loss": 1.6043, + "step": 1147 + }, + { + "epoch": 0.3139185124418923, + "grad_norm": 0.16194835305213928, + "learning_rate": 5e-05, + "loss": 1.7423, + "step": 1148 + }, + { + "epoch": 0.31419196062346183, + "grad_norm": 0.1608458310365677, + "learning_rate": 5e-05, + "loss": 1.6599, + "step": 1149 + }, + { + "epoch": 0.31446540880503143, + "grad_norm": 0.16816149652004242, + "learning_rate": 5e-05, + "loss": 1.7054, + "step": 1150 + }, + { + "epoch": 0.31473885698660103, + "grad_norm": 0.14813366532325745, + "learning_rate": 5e-05, + "loss": 1.7635, + "step": 1151 + }, + { + "epoch": 0.31501230516817064, + "grad_norm": 0.16501514613628387, + "learning_rate": 5e-05, + "loss": 1.6414, + "step": 1152 + }, + { + "epoch": 0.31528575334974024, + "grad_norm": 0.15683704614639282, + "learning_rate": 5e-05, + "loss": 1.6307, + "step": 1153 + }, + { + "epoch": 0.31555920153130984, + "grad_norm": 0.1599043607711792, + "learning_rate": 5e-05, + "loss": 1.678, + "step": 1154 + }, + { + "epoch": 0.3158326497128794, + "grad_norm": 0.15843378007411957, + "learning_rate": 5e-05, + "loss": 1.6729, + "step": 1155 + }, + { + "epoch": 0.316106097894449, + "grad_norm": 0.14226911962032318, + "learning_rate": 5e-05, + "loss": 1.584, + "step": 1156 + }, + { + "epoch": 0.3163795460760186, + "grad_norm": 0.17751207947731018, + "learning_rate": 5e-05, + "loss": 1.6461, + "step": 1157 + }, + { + "epoch": 0.3166529942575882, + "grad_norm": 0.1599876433610916, + "learning_rate": 5e-05, + "loss": 1.698, + "step": 1158 + }, + { + "epoch": 0.3169264424391578, + "grad_norm": 0.14740917086601257, + "learning_rate": 5e-05, + "loss": 1.6424, + "step": 1159 + }, + { + "epoch": 0.3171998906207274, + "grad_norm": 0.15979237854480743, + "learning_rate": 5e-05, + "loss": 1.6335, + "step": 1160 + }, + { + "epoch": 0.31747333880229694, + "grad_norm": 0.16154277324676514, + "learning_rate": 5e-05, + "loss": 1.7442, + "step": 1161 + }, + { + "epoch": 0.31774678698386655, + "grad_norm": 0.15989361703395844, + "learning_rate": 5e-05, + "loss": 1.639, + "step": 1162 + }, + { + "epoch": 0.31802023516543615, + "grad_norm": 0.15814067423343658, + "learning_rate": 5e-05, + "loss": 1.7051, + "step": 1163 + }, + { + "epoch": 0.31829368334700575, + "grad_norm": 0.15459835529327393, + "learning_rate": 5e-05, + "loss": 1.5727, + "step": 1164 + }, + { + "epoch": 0.31856713152857535, + "grad_norm": 0.15153050422668457, + "learning_rate": 5e-05, + "loss": 1.6371, + "step": 1165 + }, + { + "epoch": 0.3188405797101449, + "grad_norm": 0.1500144898891449, + "learning_rate": 5e-05, + "loss": 1.6598, + "step": 1166 + }, + { + "epoch": 0.3191140278917145, + "grad_norm": 0.15817582607269287, + "learning_rate": 5e-05, + "loss": 1.6488, + "step": 1167 + }, + { + "epoch": 0.3193874760732841, + "grad_norm": 0.15822099149227142, + "learning_rate": 5e-05, + "loss": 1.5906, + "step": 1168 + }, + { + "epoch": 0.3196609242548537, + "grad_norm": 0.14896897971630096, + "learning_rate": 5e-05, + "loss": 1.6791, + "step": 1169 + }, + { + "epoch": 0.3199343724364233, + "grad_norm": 0.1585737019777298, + "learning_rate": 5e-05, + "loss": 1.7674, + "step": 1170 + }, + { + "epoch": 0.3202078206179929, + "grad_norm": 0.16357015073299408, + "learning_rate": 5e-05, + "loss": 1.6621, + "step": 1171 + }, + { + "epoch": 0.32048126879956246, + "grad_norm": 0.14537674188613892, + "learning_rate": 5e-05, + "loss": 1.5276, + "step": 1172 + }, + { + "epoch": 0.32075471698113206, + "grad_norm": 0.15648779273033142, + "learning_rate": 5e-05, + "loss": 1.6323, + "step": 1173 + }, + { + "epoch": 0.32102816516270166, + "grad_norm": 0.14875225722789764, + "learning_rate": 5e-05, + "loss": 1.6257, + "step": 1174 + }, + { + "epoch": 0.32130161334427126, + "grad_norm": 0.14988994598388672, + "learning_rate": 5e-05, + "loss": 1.642, + "step": 1175 + }, + { + "epoch": 0.32157506152584087, + "grad_norm": 0.14954978227615356, + "learning_rate": 5e-05, + "loss": 1.6477, + "step": 1176 + }, + { + "epoch": 0.32184850970741047, + "grad_norm": 0.15913893282413483, + "learning_rate": 5e-05, + "loss": 1.7523, + "step": 1177 + }, + { + "epoch": 0.32212195788898, + "grad_norm": 0.17450571060180664, + "learning_rate": 5e-05, + "loss": 1.7723, + "step": 1178 + }, + { + "epoch": 0.3223954060705496, + "grad_norm": 0.14600080251693726, + "learning_rate": 5e-05, + "loss": 1.6497, + "step": 1179 + }, + { + "epoch": 0.3226688542521192, + "grad_norm": 0.14759889245033264, + "learning_rate": 5e-05, + "loss": 1.5792, + "step": 1180 + }, + { + "epoch": 0.3229423024336888, + "grad_norm": 0.17649635672569275, + "learning_rate": 5e-05, + "loss": 1.7302, + "step": 1181 + }, + { + "epoch": 0.3232157506152584, + "grad_norm": 0.15906043350696564, + "learning_rate": 5e-05, + "loss": 1.6046, + "step": 1182 + }, + { + "epoch": 0.323489198796828, + "grad_norm": 0.14562730491161346, + "learning_rate": 5e-05, + "loss": 1.566, + "step": 1183 + }, + { + "epoch": 0.3237626469783976, + "grad_norm": 0.15766160190105438, + "learning_rate": 5e-05, + "loss": 1.6397, + "step": 1184 + }, + { + "epoch": 0.3240360951599672, + "grad_norm": 0.16190050542354584, + "learning_rate": 5e-05, + "loss": 1.5876, + "step": 1185 + }, + { + "epoch": 0.3243095433415368, + "grad_norm": 0.15443246066570282, + "learning_rate": 5e-05, + "loss": 1.6817, + "step": 1186 + }, + { + "epoch": 0.3245829915231064, + "grad_norm": 0.1686965525150299, + "learning_rate": 5e-05, + "loss": 1.5725, + "step": 1187 + }, + { + "epoch": 0.324856439704676, + "grad_norm": 0.16837440431118011, + "learning_rate": 5e-05, + "loss": 1.681, + "step": 1188 + }, + { + "epoch": 0.3251298878862456, + "grad_norm": 0.15975917875766754, + "learning_rate": 5e-05, + "loss": 1.6988, + "step": 1189 + }, + { + "epoch": 0.32540333606781513, + "grad_norm": 0.16166743636131287, + "learning_rate": 5e-05, + "loss": 1.6254, + "step": 1190 + }, + { + "epoch": 0.32567678424938473, + "grad_norm": 0.15348750352859497, + "learning_rate": 5e-05, + "loss": 1.6757, + "step": 1191 + }, + { + "epoch": 0.32595023243095433, + "grad_norm": 0.16833312809467316, + "learning_rate": 5e-05, + "loss": 1.6262, + "step": 1192 + }, + { + "epoch": 0.32622368061252394, + "grad_norm": 0.1633581817150116, + "learning_rate": 5e-05, + "loss": 1.7312, + "step": 1193 + }, + { + "epoch": 0.32649712879409354, + "grad_norm": 0.14953148365020752, + "learning_rate": 5e-05, + "loss": 1.5993, + "step": 1194 + }, + { + "epoch": 0.3267705769756631, + "grad_norm": 0.1685009002685547, + "learning_rate": 5e-05, + "loss": 1.7026, + "step": 1195 + }, + { + "epoch": 0.3270440251572327, + "grad_norm": 0.1520211100578308, + "learning_rate": 5e-05, + "loss": 1.6178, + "step": 1196 + }, + { + "epoch": 0.3273174733388023, + "grad_norm": 0.15548743307590485, + "learning_rate": 5e-05, + "loss": 1.6335, + "step": 1197 + }, + { + "epoch": 0.3275909215203719, + "grad_norm": 0.1540190726518631, + "learning_rate": 5e-05, + "loss": 1.7661, + "step": 1198 + }, + { + "epoch": 0.3278643697019415, + "grad_norm": 0.15100663900375366, + "learning_rate": 5e-05, + "loss": 1.5361, + "step": 1199 + }, + { + "epoch": 0.3281378178835111, + "grad_norm": 0.15432646870613098, + "learning_rate": 5e-05, + "loss": 1.7029, + "step": 1200 + }, + { + "epoch": 0.32841126606508064, + "grad_norm": 0.15209780633449554, + "learning_rate": 5e-05, + "loss": 1.6857, + "step": 1201 + }, + { + "epoch": 0.32868471424665024, + "grad_norm": 0.1498226374387741, + "learning_rate": 5e-05, + "loss": 1.7318, + "step": 1202 + }, + { + "epoch": 0.32895816242821985, + "grad_norm": 0.149881511926651, + "learning_rate": 5e-05, + "loss": 1.6591, + "step": 1203 + }, + { + "epoch": 0.32923161060978945, + "grad_norm": 0.15110871195793152, + "learning_rate": 5e-05, + "loss": 1.6238, + "step": 1204 + }, + { + "epoch": 0.32950505879135905, + "grad_norm": 0.1504354178905487, + "learning_rate": 5e-05, + "loss": 1.6124, + "step": 1205 + }, + { + "epoch": 0.32977850697292865, + "grad_norm": 0.16923676431179047, + "learning_rate": 5e-05, + "loss": 1.7914, + "step": 1206 + }, + { + "epoch": 0.3300519551544982, + "grad_norm": 0.17524860799312592, + "learning_rate": 5e-05, + "loss": 1.6875, + "step": 1207 + }, + { + "epoch": 0.3303254033360678, + "grad_norm": 0.1538068652153015, + "learning_rate": 5e-05, + "loss": 1.6375, + "step": 1208 + }, + { + "epoch": 0.3305988515176374, + "grad_norm": 0.15115374326705933, + "learning_rate": 5e-05, + "loss": 1.6759, + "step": 1209 + }, + { + "epoch": 0.330872299699207, + "grad_norm": 0.1774495393037796, + "learning_rate": 5e-05, + "loss": 1.6865, + "step": 1210 + }, + { + "epoch": 0.3311457478807766, + "grad_norm": 0.16683407127857208, + "learning_rate": 5e-05, + "loss": 1.6031, + "step": 1211 + }, + { + "epoch": 0.3314191960623462, + "grad_norm": 0.16064640879631042, + "learning_rate": 5e-05, + "loss": 1.7608, + "step": 1212 + }, + { + "epoch": 0.33169264424391576, + "grad_norm": 0.16114073991775513, + "learning_rate": 5e-05, + "loss": 1.6658, + "step": 1213 + }, + { + "epoch": 0.33196609242548536, + "grad_norm": 0.161068856716156, + "learning_rate": 5e-05, + "loss": 1.6989, + "step": 1214 + }, + { + "epoch": 0.33223954060705496, + "grad_norm": 0.155606210231781, + "learning_rate": 5e-05, + "loss": 1.6235, + "step": 1215 + }, + { + "epoch": 0.33251298878862456, + "grad_norm": 0.18368175625801086, + "learning_rate": 5e-05, + "loss": 1.6516, + "step": 1216 + }, + { + "epoch": 0.33278643697019417, + "grad_norm": 0.1500716358423233, + "learning_rate": 5e-05, + "loss": 1.6361, + "step": 1217 + }, + { + "epoch": 0.33305988515176377, + "grad_norm": 0.15658661723136902, + "learning_rate": 5e-05, + "loss": 1.5645, + "step": 1218 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.16541090607643127, + "learning_rate": 5e-05, + "loss": 1.7556, + "step": 1219 + }, + { + "epoch": 0.3336067815149029, + "grad_norm": 0.15288498997688293, + "learning_rate": 5e-05, + "loss": 1.6349, + "step": 1220 + }, + { + "epoch": 0.3338802296964725, + "grad_norm": 0.16073960065841675, + "learning_rate": 5e-05, + "loss": 1.6157, + "step": 1221 + }, + { + "epoch": 0.3341536778780421, + "grad_norm": 0.1543978601694107, + "learning_rate": 5e-05, + "loss": 1.6628, + "step": 1222 + }, + { + "epoch": 0.3344271260596117, + "grad_norm": 0.14811809360980988, + "learning_rate": 5e-05, + "loss": 1.5668, + "step": 1223 + }, + { + "epoch": 0.33470057424118127, + "grad_norm": 0.15395627915859222, + "learning_rate": 5e-05, + "loss": 1.6593, + "step": 1224 + }, + { + "epoch": 0.3349740224227509, + "grad_norm": 0.16432489454746246, + "learning_rate": 5e-05, + "loss": 1.6334, + "step": 1225 + }, + { + "epoch": 0.3352474706043205, + "grad_norm": 0.15168853104114532, + "learning_rate": 5e-05, + "loss": 1.6247, + "step": 1226 + }, + { + "epoch": 0.3355209187858901, + "grad_norm": 0.15608245134353638, + "learning_rate": 5e-05, + "loss": 1.6651, + "step": 1227 + }, + { + "epoch": 0.3357943669674597, + "grad_norm": 0.16598603129386902, + "learning_rate": 5e-05, + "loss": 1.7173, + "step": 1228 + }, + { + "epoch": 0.3360678151490293, + "grad_norm": 0.14476749300956726, + "learning_rate": 5e-05, + "loss": 1.6363, + "step": 1229 + }, + { + "epoch": 0.3363412633305988, + "grad_norm": 0.16102361679077148, + "learning_rate": 5e-05, + "loss": 1.6763, + "step": 1230 + }, + { + "epoch": 0.33661471151216843, + "grad_norm": 0.16813768446445465, + "learning_rate": 5e-05, + "loss": 1.6754, + "step": 1231 + }, + { + "epoch": 0.33688815969373803, + "grad_norm": 0.15631164610385895, + "learning_rate": 5e-05, + "loss": 1.6529, + "step": 1232 + }, + { + "epoch": 0.33716160787530763, + "grad_norm": 0.1545805037021637, + "learning_rate": 5e-05, + "loss": 1.7156, + "step": 1233 + }, + { + "epoch": 0.33743505605687724, + "grad_norm": 0.17586275935173035, + "learning_rate": 5e-05, + "loss": 1.6615, + "step": 1234 + }, + { + "epoch": 0.33770850423844684, + "grad_norm": 0.15202665328979492, + "learning_rate": 5e-05, + "loss": 1.6291, + "step": 1235 + }, + { + "epoch": 0.3379819524200164, + "grad_norm": 0.16369853913784027, + "learning_rate": 5e-05, + "loss": 1.6827, + "step": 1236 + }, + { + "epoch": 0.338255400601586, + "grad_norm": 0.15997561812400818, + "learning_rate": 5e-05, + "loss": 1.7339, + "step": 1237 + }, + { + "epoch": 0.3385288487831556, + "grad_norm": 0.14640896022319794, + "learning_rate": 5e-05, + "loss": 1.623, + "step": 1238 + }, + { + "epoch": 0.3388022969647252, + "grad_norm": 0.1671455204486847, + "learning_rate": 5e-05, + "loss": 1.6769, + "step": 1239 + }, + { + "epoch": 0.3390757451462948, + "grad_norm": 0.16016924381256104, + "learning_rate": 5e-05, + "loss": 1.7308, + "step": 1240 + }, + { + "epoch": 0.3393491933278644, + "grad_norm": 0.16020850837230682, + "learning_rate": 5e-05, + "loss": 1.7288, + "step": 1241 + }, + { + "epoch": 0.33962264150943394, + "grad_norm": 0.14866626262664795, + "learning_rate": 5e-05, + "loss": 1.5764, + "step": 1242 + }, + { + "epoch": 0.33989608969100354, + "grad_norm": 0.17916180193424225, + "learning_rate": 5e-05, + "loss": 1.7789, + "step": 1243 + }, + { + "epoch": 0.34016953787257315, + "grad_norm": 0.14788402616977692, + "learning_rate": 5e-05, + "loss": 1.63, + "step": 1244 + }, + { + "epoch": 0.34044298605414275, + "grad_norm": 0.15443041920661926, + "learning_rate": 5e-05, + "loss": 1.6499, + "step": 1245 + }, + { + "epoch": 0.34071643423571235, + "grad_norm": 0.15852518379688263, + "learning_rate": 5e-05, + "loss": 1.6446, + "step": 1246 + }, + { + "epoch": 0.3409898824172819, + "grad_norm": 0.15751297771930695, + "learning_rate": 5e-05, + "loss": 1.6294, + "step": 1247 + }, + { + "epoch": 0.3412633305988515, + "grad_norm": 0.15762586891651154, + "learning_rate": 5e-05, + "loss": 1.6864, + "step": 1248 + }, + { + "epoch": 0.3415367787804211, + "grad_norm": 0.15331421792507172, + "learning_rate": 5e-05, + "loss": 1.6898, + "step": 1249 + }, + { + "epoch": 0.3418102269619907, + "grad_norm": 0.165785014629364, + "learning_rate": 5e-05, + "loss": 1.703, + "step": 1250 + }, + { + "epoch": 0.3420836751435603, + "grad_norm": 0.1514601707458496, + "learning_rate": 5e-05, + "loss": 1.6489, + "step": 1251 + }, + { + "epoch": 0.3423571233251299, + "grad_norm": 0.187610924243927, + "learning_rate": 5e-05, + "loss": 1.6109, + "step": 1252 + }, + { + "epoch": 0.34263057150669946, + "grad_norm": 0.1479722261428833, + "learning_rate": 5e-05, + "loss": 1.6528, + "step": 1253 + }, + { + "epoch": 0.34290401968826906, + "grad_norm": 0.1570201814174652, + "learning_rate": 5e-05, + "loss": 1.6706, + "step": 1254 + }, + { + "epoch": 0.34317746786983866, + "grad_norm": 0.1705133467912674, + "learning_rate": 5e-05, + "loss": 1.7262, + "step": 1255 + }, + { + "epoch": 0.34345091605140826, + "grad_norm": 0.15776453912258148, + "learning_rate": 5e-05, + "loss": 1.6374, + "step": 1256 + }, + { + "epoch": 0.34372436423297786, + "grad_norm": 0.16335074603557587, + "learning_rate": 5e-05, + "loss": 1.6261, + "step": 1257 + }, + { + "epoch": 0.34399781241454747, + "grad_norm": 0.17720459401607513, + "learning_rate": 5e-05, + "loss": 1.5963, + "step": 1258 + }, + { + "epoch": 0.344271260596117, + "grad_norm": 0.14796927571296692, + "learning_rate": 5e-05, + "loss": 1.5923, + "step": 1259 + }, + { + "epoch": 0.3445447087776866, + "grad_norm": 0.16118891537189484, + "learning_rate": 5e-05, + "loss": 1.5986, + "step": 1260 + }, + { + "epoch": 0.3448181569592562, + "grad_norm": 0.16453957557678223, + "learning_rate": 5e-05, + "loss": 1.7498, + "step": 1261 + }, + { + "epoch": 0.3450916051408258, + "grad_norm": 0.15277455747127533, + "learning_rate": 5e-05, + "loss": 1.6196, + "step": 1262 + }, + { + "epoch": 0.3453650533223954, + "grad_norm": 0.1496528536081314, + "learning_rate": 5e-05, + "loss": 1.635, + "step": 1263 + }, + { + "epoch": 0.345638501503965, + "grad_norm": 0.1616893708705902, + "learning_rate": 5e-05, + "loss": 1.591, + "step": 1264 + }, + { + "epoch": 0.34591194968553457, + "grad_norm": 0.1540791094303131, + "learning_rate": 5e-05, + "loss": 1.6713, + "step": 1265 + }, + { + "epoch": 0.34618539786710417, + "grad_norm": 0.15063992142677307, + "learning_rate": 5e-05, + "loss": 1.6739, + "step": 1266 + }, + { + "epoch": 0.3464588460486738, + "grad_norm": 0.15687578916549683, + "learning_rate": 5e-05, + "loss": 1.7105, + "step": 1267 + }, + { + "epoch": 0.3467322942302434, + "grad_norm": 0.14987704157829285, + "learning_rate": 5e-05, + "loss": 1.7308, + "step": 1268 + }, + { + "epoch": 0.347005742411813, + "grad_norm": 0.1480540633201599, + "learning_rate": 5e-05, + "loss": 1.7043, + "step": 1269 + }, + { + "epoch": 0.3472791905933826, + "grad_norm": 0.1529364138841629, + "learning_rate": 5e-05, + "loss": 1.5753, + "step": 1270 + }, + { + "epoch": 0.3475526387749521, + "grad_norm": 0.14511734247207642, + "learning_rate": 5e-05, + "loss": 1.5622, + "step": 1271 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.14741328358650208, + "learning_rate": 5e-05, + "loss": 1.6804, + "step": 1272 + }, + { + "epoch": 0.34809953513809133, + "grad_norm": 0.14820526540279388, + "learning_rate": 5e-05, + "loss": 1.5904, + "step": 1273 + }, + { + "epoch": 0.34837298331966093, + "grad_norm": 0.1521809697151184, + "learning_rate": 5e-05, + "loss": 1.7002, + "step": 1274 + }, + { + "epoch": 0.34864643150123054, + "grad_norm": 0.1577511727809906, + "learning_rate": 5e-05, + "loss": 1.6897, + "step": 1275 + }, + { + "epoch": 0.3489198796828001, + "grad_norm": 0.14861604571342468, + "learning_rate": 5e-05, + "loss": 1.6671, + "step": 1276 + }, + { + "epoch": 0.3491933278643697, + "grad_norm": 0.15119168162345886, + "learning_rate": 5e-05, + "loss": 1.6387, + "step": 1277 + }, + { + "epoch": 0.3494667760459393, + "grad_norm": 0.1554817259311676, + "learning_rate": 5e-05, + "loss": 1.5855, + "step": 1278 + }, + { + "epoch": 0.3497402242275089, + "grad_norm": 0.151669442653656, + "learning_rate": 5e-05, + "loss": 1.6634, + "step": 1279 + }, + { + "epoch": 0.3500136724090785, + "grad_norm": 0.1499747335910797, + "learning_rate": 5e-05, + "loss": 1.627, + "step": 1280 + }, + { + "epoch": 0.3502871205906481, + "grad_norm": 0.15303084254264832, + "learning_rate": 5e-05, + "loss": 1.6594, + "step": 1281 + }, + { + "epoch": 0.35056056877221764, + "grad_norm": 0.1563277691602707, + "learning_rate": 5e-05, + "loss": 1.8, + "step": 1282 + }, + { + "epoch": 0.35083401695378724, + "grad_norm": 0.16208653151988983, + "learning_rate": 5e-05, + "loss": 1.7052, + "step": 1283 + }, + { + "epoch": 0.35110746513535684, + "grad_norm": 0.14594794809818268, + "learning_rate": 5e-05, + "loss": 1.5883, + "step": 1284 + }, + { + "epoch": 0.35138091331692645, + "grad_norm": 0.1515040099620819, + "learning_rate": 5e-05, + "loss": 1.681, + "step": 1285 + }, + { + "epoch": 0.35165436149849605, + "grad_norm": 0.15303464233875275, + "learning_rate": 5e-05, + "loss": 1.6776, + "step": 1286 + }, + { + "epoch": 0.35192780968006565, + "grad_norm": 0.15074992179870605, + "learning_rate": 5e-05, + "loss": 1.6131, + "step": 1287 + }, + { + "epoch": 0.3522012578616352, + "grad_norm": 0.15136325359344482, + "learning_rate": 5e-05, + "loss": 1.6393, + "step": 1288 + }, + { + "epoch": 0.3524747060432048, + "grad_norm": 0.161278635263443, + "learning_rate": 5e-05, + "loss": 1.5921, + "step": 1289 + }, + { + "epoch": 0.3527481542247744, + "grad_norm": 0.16516782343387604, + "learning_rate": 5e-05, + "loss": 1.6514, + "step": 1290 + }, + { + "epoch": 0.353021602406344, + "grad_norm": 0.1504773199558258, + "learning_rate": 5e-05, + "loss": 1.6053, + "step": 1291 + }, + { + "epoch": 0.3532950505879136, + "grad_norm": 0.15545381605625153, + "learning_rate": 5e-05, + "loss": 1.6231, + "step": 1292 + }, + { + "epoch": 0.3535684987694832, + "grad_norm": 0.14545585215091705, + "learning_rate": 5e-05, + "loss": 1.6238, + "step": 1293 + }, + { + "epoch": 0.35384194695105275, + "grad_norm": 0.15290163457393646, + "learning_rate": 5e-05, + "loss": 1.6625, + "step": 1294 + }, + { + "epoch": 0.35411539513262236, + "grad_norm": 0.15580037236213684, + "learning_rate": 5e-05, + "loss": 1.6871, + "step": 1295 + }, + { + "epoch": 0.35438884331419196, + "grad_norm": 0.15643970668315887, + "learning_rate": 5e-05, + "loss": 1.6451, + "step": 1296 + }, + { + "epoch": 0.35466229149576156, + "grad_norm": 0.14689311385154724, + "learning_rate": 5e-05, + "loss": 1.612, + "step": 1297 + }, + { + "epoch": 0.35493573967733116, + "grad_norm": 0.152411088347435, + "learning_rate": 5e-05, + "loss": 1.5722, + "step": 1298 + }, + { + "epoch": 0.35520918785890077, + "grad_norm": 0.16959436237812042, + "learning_rate": 5e-05, + "loss": 1.7114, + "step": 1299 + }, + { + "epoch": 0.3554826360404703, + "grad_norm": 0.15447081625461578, + "learning_rate": 5e-05, + "loss": 1.7206, + "step": 1300 + }, + { + "epoch": 0.3557560842220399, + "grad_norm": 0.17055825889110565, + "learning_rate": 5e-05, + "loss": 1.5612, + "step": 1301 + }, + { + "epoch": 0.3560295324036095, + "grad_norm": 0.157841756939888, + "learning_rate": 5e-05, + "loss": 1.6815, + "step": 1302 + }, + { + "epoch": 0.3563029805851791, + "grad_norm": 0.17073825001716614, + "learning_rate": 5e-05, + "loss": 1.6889, + "step": 1303 + }, + { + "epoch": 0.3565764287667487, + "grad_norm": 0.17082183063030243, + "learning_rate": 5e-05, + "loss": 1.6979, + "step": 1304 + }, + { + "epoch": 0.35684987694831827, + "grad_norm": 0.15137112140655518, + "learning_rate": 5e-05, + "loss": 1.6226, + "step": 1305 + }, + { + "epoch": 0.35712332512988787, + "grad_norm": 0.15363116562366486, + "learning_rate": 5e-05, + "loss": 1.6242, + "step": 1306 + }, + { + "epoch": 0.35739677331145747, + "grad_norm": 0.15610884130001068, + "learning_rate": 5e-05, + "loss": 1.5751, + "step": 1307 + }, + { + "epoch": 0.3576702214930271, + "grad_norm": 0.15649494528770447, + "learning_rate": 5e-05, + "loss": 1.6683, + "step": 1308 + }, + { + "epoch": 0.3579436696745967, + "grad_norm": 0.1602485328912735, + "learning_rate": 5e-05, + "loss": 1.7109, + "step": 1309 + }, + { + "epoch": 0.3582171178561663, + "grad_norm": 0.16601736843585968, + "learning_rate": 5e-05, + "loss": 1.6664, + "step": 1310 + }, + { + "epoch": 0.3584905660377358, + "grad_norm": 0.15012311935424805, + "learning_rate": 5e-05, + "loss": 1.5708, + "step": 1311 + }, + { + "epoch": 0.3587640142193054, + "grad_norm": 0.16618122160434723, + "learning_rate": 5e-05, + "loss": 1.7141, + "step": 1312 + }, + { + "epoch": 0.35903746240087503, + "grad_norm": 0.1614745706319809, + "learning_rate": 5e-05, + "loss": 1.6907, + "step": 1313 + }, + { + "epoch": 0.35931091058244463, + "grad_norm": 0.1518169492483139, + "learning_rate": 5e-05, + "loss": 1.6233, + "step": 1314 + }, + { + "epoch": 0.35958435876401423, + "grad_norm": 0.16219857335090637, + "learning_rate": 5e-05, + "loss": 1.6707, + "step": 1315 + }, + { + "epoch": 0.35985780694558384, + "grad_norm": 0.15628166496753693, + "learning_rate": 5e-05, + "loss": 1.7647, + "step": 1316 + }, + { + "epoch": 0.3601312551271534, + "grad_norm": 0.14191019535064697, + "learning_rate": 5e-05, + "loss": 1.5061, + "step": 1317 + }, + { + "epoch": 0.360404703308723, + "grad_norm": 0.148356094956398, + "learning_rate": 5e-05, + "loss": 1.6642, + "step": 1318 + }, + { + "epoch": 0.3606781514902926, + "grad_norm": 0.14994607865810394, + "learning_rate": 5e-05, + "loss": 1.655, + "step": 1319 + }, + { + "epoch": 0.3609515996718622, + "grad_norm": 0.15752872824668884, + "learning_rate": 5e-05, + "loss": 1.5816, + "step": 1320 + }, + { + "epoch": 0.3612250478534318, + "grad_norm": 0.15151529014110565, + "learning_rate": 5e-05, + "loss": 1.6372, + "step": 1321 + }, + { + "epoch": 0.3614984960350014, + "grad_norm": 0.15662340819835663, + "learning_rate": 5e-05, + "loss": 1.6894, + "step": 1322 + }, + { + "epoch": 0.36177194421657094, + "grad_norm": 0.16049139201641083, + "learning_rate": 5e-05, + "loss": 1.616, + "step": 1323 + }, + { + "epoch": 0.36204539239814054, + "grad_norm": 0.15439613163471222, + "learning_rate": 5e-05, + "loss": 1.6672, + "step": 1324 + }, + { + "epoch": 0.36231884057971014, + "grad_norm": 0.15884248912334442, + "learning_rate": 5e-05, + "loss": 1.6366, + "step": 1325 + }, + { + "epoch": 0.36259228876127975, + "grad_norm": 0.15194907784461975, + "learning_rate": 5e-05, + "loss": 1.7004, + "step": 1326 + }, + { + "epoch": 0.36286573694284935, + "grad_norm": 0.1542571783065796, + "learning_rate": 5e-05, + "loss": 1.6714, + "step": 1327 + }, + { + "epoch": 0.3631391851244189, + "grad_norm": 0.15640923380851746, + "learning_rate": 5e-05, + "loss": 1.7018, + "step": 1328 + }, + { + "epoch": 0.3634126333059885, + "grad_norm": 0.15510603785514832, + "learning_rate": 5e-05, + "loss": 1.6173, + "step": 1329 + }, + { + "epoch": 0.3636860814875581, + "grad_norm": 0.14958932995796204, + "learning_rate": 5e-05, + "loss": 1.6507, + "step": 1330 + }, + { + "epoch": 0.3639595296691277, + "grad_norm": 0.15672361850738525, + "learning_rate": 5e-05, + "loss": 1.6679, + "step": 1331 + }, + { + "epoch": 0.3642329778506973, + "grad_norm": 0.16806292533874512, + "learning_rate": 5e-05, + "loss": 1.6097, + "step": 1332 + }, + { + "epoch": 0.3645064260322669, + "grad_norm": 0.14968685805797577, + "learning_rate": 5e-05, + "loss": 1.6181, + "step": 1333 + }, + { + "epoch": 0.36477987421383645, + "grad_norm": 0.14979314804077148, + "learning_rate": 5e-05, + "loss": 1.705, + "step": 1334 + }, + { + "epoch": 0.36505332239540605, + "grad_norm": 0.14740002155303955, + "learning_rate": 5e-05, + "loss": 1.6532, + "step": 1335 + }, + { + "epoch": 0.36532677057697566, + "grad_norm": 0.15307697653770447, + "learning_rate": 5e-05, + "loss": 1.5616, + "step": 1336 + }, + { + "epoch": 0.36560021875854526, + "grad_norm": 0.165946364402771, + "learning_rate": 5e-05, + "loss": 1.7643, + "step": 1337 + }, + { + "epoch": 0.36587366694011486, + "grad_norm": 0.1671787053346634, + "learning_rate": 5e-05, + "loss": 1.7023, + "step": 1338 + }, + { + "epoch": 0.36614711512168446, + "grad_norm": 0.1597784012556076, + "learning_rate": 5e-05, + "loss": 1.6975, + "step": 1339 + }, + { + "epoch": 0.366420563303254, + "grad_norm": 0.1569102704524994, + "learning_rate": 5e-05, + "loss": 1.7043, + "step": 1340 + }, + { + "epoch": 0.3666940114848236, + "grad_norm": 0.1625957041978836, + "learning_rate": 5e-05, + "loss": 1.6712, + "step": 1341 + }, + { + "epoch": 0.3669674596663932, + "grad_norm": 0.16208802163600922, + "learning_rate": 5e-05, + "loss": 1.6891, + "step": 1342 + }, + { + "epoch": 0.3672409078479628, + "grad_norm": 0.1636815369129181, + "learning_rate": 5e-05, + "loss": 1.7703, + "step": 1343 + }, + { + "epoch": 0.3675143560295324, + "grad_norm": 0.1954984962940216, + "learning_rate": 5e-05, + "loss": 1.6462, + "step": 1344 + }, + { + "epoch": 0.367787804211102, + "grad_norm": 0.16720376908779144, + "learning_rate": 5e-05, + "loss": 1.6952, + "step": 1345 + }, + { + "epoch": 0.36806125239267157, + "grad_norm": 0.1478559672832489, + "learning_rate": 5e-05, + "loss": 1.6585, + "step": 1346 + }, + { + "epoch": 0.36833470057424117, + "grad_norm": 0.18975849449634552, + "learning_rate": 5e-05, + "loss": 1.6765, + "step": 1347 + }, + { + "epoch": 0.36860814875581077, + "grad_norm": 0.1447249799966812, + "learning_rate": 5e-05, + "loss": 1.633, + "step": 1348 + }, + { + "epoch": 0.3688815969373804, + "grad_norm": 0.19498814642429352, + "learning_rate": 5e-05, + "loss": 1.7036, + "step": 1349 + }, + { + "epoch": 0.36915504511895, + "grad_norm": 0.16138319671154022, + "learning_rate": 5e-05, + "loss": 1.6062, + "step": 1350 + }, + { + "epoch": 0.3694284933005196, + "grad_norm": 0.15296588838100433, + "learning_rate": 5e-05, + "loss": 1.6813, + "step": 1351 + }, + { + "epoch": 0.3697019414820891, + "grad_norm": 0.1728629469871521, + "learning_rate": 5e-05, + "loss": 1.723, + "step": 1352 + }, + { + "epoch": 0.3699753896636587, + "grad_norm": 0.17153020203113556, + "learning_rate": 5e-05, + "loss": 1.6255, + "step": 1353 + }, + { + "epoch": 0.37024883784522833, + "grad_norm": 0.1482992023229599, + "learning_rate": 5e-05, + "loss": 1.5935, + "step": 1354 + }, + { + "epoch": 0.37052228602679793, + "grad_norm": 0.19278480112552643, + "learning_rate": 5e-05, + "loss": 1.7194, + "step": 1355 + }, + { + "epoch": 0.37079573420836753, + "grad_norm": 0.15969671308994293, + "learning_rate": 5e-05, + "loss": 1.7102, + "step": 1356 + }, + { + "epoch": 0.3710691823899371, + "grad_norm": 0.1841089278459549, + "learning_rate": 5e-05, + "loss": 1.6842, + "step": 1357 + }, + { + "epoch": 0.3713426305715067, + "grad_norm": 0.1696590632200241, + "learning_rate": 5e-05, + "loss": 1.6301, + "step": 1358 + }, + { + "epoch": 0.3716160787530763, + "grad_norm": 0.15877504646778107, + "learning_rate": 5e-05, + "loss": 1.6211, + "step": 1359 + }, + { + "epoch": 0.3718895269346459, + "grad_norm": 0.17480207979679108, + "learning_rate": 5e-05, + "loss": 1.5707, + "step": 1360 + }, + { + "epoch": 0.3721629751162155, + "grad_norm": 0.1526334136724472, + "learning_rate": 5e-05, + "loss": 1.6062, + "step": 1361 + }, + { + "epoch": 0.3724364232977851, + "grad_norm": 0.1520829051733017, + "learning_rate": 5e-05, + "loss": 1.713, + "step": 1362 + }, + { + "epoch": 0.37270987147935464, + "grad_norm": 0.17460103332996368, + "learning_rate": 5e-05, + "loss": 1.5544, + "step": 1363 + }, + { + "epoch": 0.37298331966092424, + "grad_norm": 0.16600289940834045, + "learning_rate": 5e-05, + "loss": 1.6942, + "step": 1364 + }, + { + "epoch": 0.37325676784249384, + "grad_norm": 0.15874885022640228, + "learning_rate": 5e-05, + "loss": 1.5408, + "step": 1365 + }, + { + "epoch": 0.37353021602406344, + "grad_norm": 0.15118831396102905, + "learning_rate": 5e-05, + "loss": 1.6963, + "step": 1366 + }, + { + "epoch": 0.37380366420563305, + "grad_norm": 0.16530974209308624, + "learning_rate": 5e-05, + "loss": 1.6564, + "step": 1367 + }, + { + "epoch": 0.37407711238720265, + "grad_norm": 0.16382399201393127, + "learning_rate": 5e-05, + "loss": 1.5668, + "step": 1368 + }, + { + "epoch": 0.3743505605687722, + "grad_norm": 0.16075266897678375, + "learning_rate": 5e-05, + "loss": 1.5714, + "step": 1369 + }, + { + "epoch": 0.3746240087503418, + "grad_norm": 0.16619382798671722, + "learning_rate": 5e-05, + "loss": 1.6636, + "step": 1370 + }, + { + "epoch": 0.3748974569319114, + "grad_norm": 0.16388960182666779, + "learning_rate": 5e-05, + "loss": 1.5565, + "step": 1371 + }, + { + "epoch": 0.375170905113481, + "grad_norm": 0.17126598954200745, + "learning_rate": 5e-05, + "loss": 1.6027, + "step": 1372 + }, + { + "epoch": 0.3754443532950506, + "grad_norm": 0.15221962332725525, + "learning_rate": 5e-05, + "loss": 1.7189, + "step": 1373 + }, + { + "epoch": 0.3757178014766202, + "grad_norm": 0.1918559968471527, + "learning_rate": 5e-05, + "loss": 1.6066, + "step": 1374 + }, + { + "epoch": 0.37599124965818975, + "grad_norm": 0.1808595359325409, + "learning_rate": 5e-05, + "loss": 1.7059, + "step": 1375 + }, + { + "epoch": 0.37626469783975935, + "grad_norm": 0.16812893748283386, + "learning_rate": 5e-05, + "loss": 1.8234, + "step": 1376 + }, + { + "epoch": 0.37653814602132896, + "grad_norm": 0.17743167281150818, + "learning_rate": 5e-05, + "loss": 1.6533, + "step": 1377 + }, + { + "epoch": 0.37681159420289856, + "grad_norm": 0.16644933819770813, + "learning_rate": 5e-05, + "loss": 1.6681, + "step": 1378 + }, + { + "epoch": 0.37708504238446816, + "grad_norm": 0.1525644063949585, + "learning_rate": 5e-05, + "loss": 1.6302, + "step": 1379 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 0.1539047807455063, + "learning_rate": 5e-05, + "loss": 1.6559, + "step": 1380 + }, + { + "epoch": 0.3776319387476073, + "grad_norm": 0.17377246916294098, + "learning_rate": 5e-05, + "loss": 1.6563, + "step": 1381 + }, + { + "epoch": 0.3779053869291769, + "grad_norm": 0.1529269814491272, + "learning_rate": 5e-05, + "loss": 1.6278, + "step": 1382 + }, + { + "epoch": 0.3781788351107465, + "grad_norm": 0.17307884991168976, + "learning_rate": 5e-05, + "loss": 1.8326, + "step": 1383 + }, + { + "epoch": 0.3784522832923161, + "grad_norm": 0.15691839158535004, + "learning_rate": 5e-05, + "loss": 1.652, + "step": 1384 + }, + { + "epoch": 0.3787257314738857, + "grad_norm": 0.16453763842582703, + "learning_rate": 5e-05, + "loss": 1.6961, + "step": 1385 + }, + { + "epoch": 0.37899917965545526, + "grad_norm": 0.17871487140655518, + "learning_rate": 5e-05, + "loss": 1.6779, + "step": 1386 + }, + { + "epoch": 0.37927262783702487, + "grad_norm": 0.15350158512592316, + "learning_rate": 5e-05, + "loss": 1.6196, + "step": 1387 + }, + { + "epoch": 0.37954607601859447, + "grad_norm": 0.1600034087896347, + "learning_rate": 5e-05, + "loss": 1.6356, + "step": 1388 + }, + { + "epoch": 0.37981952420016407, + "grad_norm": 0.17216049134731293, + "learning_rate": 5e-05, + "loss": 1.6116, + "step": 1389 + }, + { + "epoch": 0.3800929723817337, + "grad_norm": 0.15600605309009552, + "learning_rate": 5e-05, + "loss": 1.6993, + "step": 1390 + }, + { + "epoch": 0.3803664205633033, + "grad_norm": 0.14978420734405518, + "learning_rate": 5e-05, + "loss": 1.7346, + "step": 1391 + }, + { + "epoch": 0.3806398687448728, + "grad_norm": 0.176108255982399, + "learning_rate": 5e-05, + "loss": 1.6644, + "step": 1392 + }, + { + "epoch": 0.3809133169264424, + "grad_norm": 0.1513052135705948, + "learning_rate": 5e-05, + "loss": 1.648, + "step": 1393 + }, + { + "epoch": 0.381186765108012, + "grad_norm": 0.16158834099769592, + "learning_rate": 5e-05, + "loss": 1.6868, + "step": 1394 + }, + { + "epoch": 0.38146021328958163, + "grad_norm": 0.1669863909482956, + "learning_rate": 5e-05, + "loss": 1.6286, + "step": 1395 + }, + { + "epoch": 0.38173366147115123, + "grad_norm": 0.16049465537071228, + "learning_rate": 5e-05, + "loss": 1.7456, + "step": 1396 + }, + { + "epoch": 0.38200710965272083, + "grad_norm": 0.17042703926563263, + "learning_rate": 5e-05, + "loss": 1.663, + "step": 1397 + }, + { + "epoch": 0.3822805578342904, + "grad_norm": 0.1597413718700409, + "learning_rate": 5e-05, + "loss": 1.6331, + "step": 1398 + }, + { + "epoch": 0.38255400601586, + "grad_norm": 0.15566933155059814, + "learning_rate": 5e-05, + "loss": 1.6423, + "step": 1399 + }, + { + "epoch": 0.3828274541974296, + "grad_norm": 0.15969716012477875, + "learning_rate": 5e-05, + "loss": 1.7039, + "step": 1400 + }, + { + "epoch": 0.3831009023789992, + "grad_norm": 0.15060196816921234, + "learning_rate": 5e-05, + "loss": 1.6484, + "step": 1401 + }, + { + "epoch": 0.3833743505605688, + "grad_norm": 0.15443742275238037, + "learning_rate": 5e-05, + "loss": 1.5378, + "step": 1402 + }, + { + "epoch": 0.3836477987421384, + "grad_norm": 0.15732194483280182, + "learning_rate": 5e-05, + "loss": 1.6646, + "step": 1403 + }, + { + "epoch": 0.38392124692370794, + "grad_norm": 0.15805989503860474, + "learning_rate": 5e-05, + "loss": 1.5314, + "step": 1404 + }, + { + "epoch": 0.38419469510527754, + "grad_norm": 0.1553335189819336, + "learning_rate": 5e-05, + "loss": 1.6566, + "step": 1405 + }, + { + "epoch": 0.38446814328684714, + "grad_norm": 0.16199639439582825, + "learning_rate": 5e-05, + "loss": 1.6392, + "step": 1406 + }, + { + "epoch": 0.38474159146841674, + "grad_norm": 0.15157069265842438, + "learning_rate": 5e-05, + "loss": 1.5499, + "step": 1407 + }, + { + "epoch": 0.38501503964998635, + "grad_norm": 0.15281042456626892, + "learning_rate": 5e-05, + "loss": 1.6623, + "step": 1408 + }, + { + "epoch": 0.3852884878315559, + "grad_norm": 0.16575610637664795, + "learning_rate": 5e-05, + "loss": 1.6758, + "step": 1409 + }, + { + "epoch": 0.3855619360131255, + "grad_norm": 0.17022867500782013, + "learning_rate": 5e-05, + "loss": 1.7497, + "step": 1410 + }, + { + "epoch": 0.3858353841946951, + "grad_norm": 0.1417384147644043, + "learning_rate": 5e-05, + "loss": 1.615, + "step": 1411 + }, + { + "epoch": 0.3861088323762647, + "grad_norm": 0.18117718398571014, + "learning_rate": 5e-05, + "loss": 1.7536, + "step": 1412 + }, + { + "epoch": 0.3863822805578343, + "grad_norm": 0.15728381276130676, + "learning_rate": 5e-05, + "loss": 1.577, + "step": 1413 + }, + { + "epoch": 0.3866557287394039, + "grad_norm": 0.163984015583992, + "learning_rate": 5e-05, + "loss": 1.6305, + "step": 1414 + }, + { + "epoch": 0.38692917692097345, + "grad_norm": 0.16205452382564545, + "learning_rate": 5e-05, + "loss": 1.5719, + "step": 1415 + }, + { + "epoch": 0.38720262510254305, + "grad_norm": 0.16791151463985443, + "learning_rate": 5e-05, + "loss": 1.6911, + "step": 1416 + }, + { + "epoch": 0.38747607328411265, + "grad_norm": 0.165225088596344, + "learning_rate": 5e-05, + "loss": 1.5915, + "step": 1417 + }, + { + "epoch": 0.38774952146568226, + "grad_norm": 0.1673455834388733, + "learning_rate": 5e-05, + "loss": 1.6484, + "step": 1418 + }, + { + "epoch": 0.38802296964725186, + "grad_norm": 0.15316179394721985, + "learning_rate": 5e-05, + "loss": 1.6133, + "step": 1419 + }, + { + "epoch": 0.38829641782882146, + "grad_norm": 0.17565707862377167, + "learning_rate": 5e-05, + "loss": 1.7247, + "step": 1420 + }, + { + "epoch": 0.388569866010391, + "grad_norm": 0.15834634006023407, + "learning_rate": 5e-05, + "loss": 1.6689, + "step": 1421 + }, + { + "epoch": 0.3888433141919606, + "grad_norm": 0.16178841888904572, + "learning_rate": 5e-05, + "loss": 1.6276, + "step": 1422 + }, + { + "epoch": 0.3891167623735302, + "grad_norm": 0.15787442028522491, + "learning_rate": 5e-05, + "loss": 1.6655, + "step": 1423 + }, + { + "epoch": 0.3893902105550998, + "grad_norm": 0.17375800013542175, + "learning_rate": 5e-05, + "loss": 1.6643, + "step": 1424 + }, + { + "epoch": 0.3896636587366694, + "grad_norm": 0.1527184098958969, + "learning_rate": 5e-05, + "loss": 1.5199, + "step": 1425 + }, + { + "epoch": 0.389937106918239, + "grad_norm": 0.17630016803741455, + "learning_rate": 5e-05, + "loss": 1.9053, + "step": 1426 + }, + { + "epoch": 0.39021055509980856, + "grad_norm": 0.16821622848510742, + "learning_rate": 5e-05, + "loss": 1.6229, + "step": 1427 + }, + { + "epoch": 0.39048400328137817, + "grad_norm": 0.1571454554796219, + "learning_rate": 5e-05, + "loss": 1.5945, + "step": 1428 + }, + { + "epoch": 0.39075745146294777, + "grad_norm": 0.16041232645511627, + "learning_rate": 5e-05, + "loss": 1.6161, + "step": 1429 + }, + { + "epoch": 0.39103089964451737, + "grad_norm": 0.15614961087703705, + "learning_rate": 5e-05, + "loss": 1.6234, + "step": 1430 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 0.1785866618156433, + "learning_rate": 5e-05, + "loss": 1.5821, + "step": 1431 + }, + { + "epoch": 0.3915777960076566, + "grad_norm": 0.1583407074213028, + "learning_rate": 5e-05, + "loss": 1.6031, + "step": 1432 + }, + { + "epoch": 0.3918512441892261, + "grad_norm": 0.15513792634010315, + "learning_rate": 5e-05, + "loss": 1.6701, + "step": 1433 + }, + { + "epoch": 0.3921246923707957, + "grad_norm": 0.15645377337932587, + "learning_rate": 5e-05, + "loss": 1.6072, + "step": 1434 + }, + { + "epoch": 0.3923981405523653, + "grad_norm": 0.17222340404987335, + "learning_rate": 5e-05, + "loss": 1.7521, + "step": 1435 + }, + { + "epoch": 0.39267158873393493, + "grad_norm": 0.16205964982509613, + "learning_rate": 5e-05, + "loss": 1.5481, + "step": 1436 + }, + { + "epoch": 0.39294503691550453, + "grad_norm": 0.16628527641296387, + "learning_rate": 5e-05, + "loss": 1.6072, + "step": 1437 + }, + { + "epoch": 0.3932184850970741, + "grad_norm": 0.15461629629135132, + "learning_rate": 5e-05, + "loss": 1.6637, + "step": 1438 + }, + { + "epoch": 0.3934919332786437, + "grad_norm": 0.17771334946155548, + "learning_rate": 5e-05, + "loss": 1.7054, + "step": 1439 + }, + { + "epoch": 0.3937653814602133, + "grad_norm": 0.16035428643226624, + "learning_rate": 5e-05, + "loss": 1.6933, + "step": 1440 + }, + { + "epoch": 0.3940388296417829, + "grad_norm": 0.15316614508628845, + "learning_rate": 5e-05, + "loss": 1.4949, + "step": 1441 + }, + { + "epoch": 0.3943122778233525, + "grad_norm": 0.15236356854438782, + "learning_rate": 5e-05, + "loss": 1.6445, + "step": 1442 + }, + { + "epoch": 0.3945857260049221, + "grad_norm": 0.15374061465263367, + "learning_rate": 5e-05, + "loss": 1.7432, + "step": 1443 + }, + { + "epoch": 0.39485917418649163, + "grad_norm": 0.15344202518463135, + "learning_rate": 5e-05, + "loss": 1.584, + "step": 1444 + }, + { + "epoch": 0.39513262236806124, + "grad_norm": 0.15145696699619293, + "learning_rate": 5e-05, + "loss": 1.6804, + "step": 1445 + }, + { + "epoch": 0.39540607054963084, + "grad_norm": 0.15984676778316498, + "learning_rate": 5e-05, + "loss": 1.7636, + "step": 1446 + }, + { + "epoch": 0.39567951873120044, + "grad_norm": 0.14700670540332794, + "learning_rate": 5e-05, + "loss": 1.5888, + "step": 1447 + }, + { + "epoch": 0.39595296691277004, + "grad_norm": 0.1534331887960434, + "learning_rate": 5e-05, + "loss": 1.5919, + "step": 1448 + }, + { + "epoch": 0.39622641509433965, + "grad_norm": 0.15421104431152344, + "learning_rate": 5e-05, + "loss": 1.5659, + "step": 1449 + }, + { + "epoch": 0.3964998632759092, + "grad_norm": 0.17723333835601807, + "learning_rate": 5e-05, + "loss": 1.6337, + "step": 1450 + }, + { + "epoch": 0.3967733114574788, + "grad_norm": 0.1645742654800415, + "learning_rate": 5e-05, + "loss": 1.6573, + "step": 1451 + }, + { + "epoch": 0.3970467596390484, + "grad_norm": 0.1458151489496231, + "learning_rate": 5e-05, + "loss": 1.4709, + "step": 1452 + }, + { + "epoch": 0.397320207820618, + "grad_norm": 0.15043888986110687, + "learning_rate": 5e-05, + "loss": 1.6267, + "step": 1453 + }, + { + "epoch": 0.3975936560021876, + "grad_norm": 0.16159576177597046, + "learning_rate": 5e-05, + "loss": 1.7, + "step": 1454 + }, + { + "epoch": 0.3978671041837572, + "grad_norm": 0.15626250207424164, + "learning_rate": 5e-05, + "loss": 1.6397, + "step": 1455 + }, + { + "epoch": 0.39814055236532675, + "grad_norm": 0.15309499204158783, + "learning_rate": 5e-05, + "loss": 1.6503, + "step": 1456 + }, + { + "epoch": 0.39841400054689635, + "grad_norm": 0.1638927310705185, + "learning_rate": 5e-05, + "loss": 1.6134, + "step": 1457 + }, + { + "epoch": 0.39868744872846595, + "grad_norm": 0.156220942735672, + "learning_rate": 5e-05, + "loss": 1.6544, + "step": 1458 + }, + { + "epoch": 0.39896089691003556, + "grad_norm": 0.15519675612449646, + "learning_rate": 5e-05, + "loss": 1.6289, + "step": 1459 + }, + { + "epoch": 0.39923434509160516, + "grad_norm": 0.16165190935134888, + "learning_rate": 5e-05, + "loss": 1.6655, + "step": 1460 + }, + { + "epoch": 0.39950779327317476, + "grad_norm": 0.14794325828552246, + "learning_rate": 5e-05, + "loss": 1.6321, + "step": 1461 + }, + { + "epoch": 0.3997812414547443, + "grad_norm": 0.1512671858072281, + "learning_rate": 5e-05, + "loss": 1.6119, + "step": 1462 + }, + { + "epoch": 0.4000546896363139, + "grad_norm": 0.14749833941459656, + "learning_rate": 5e-05, + "loss": 1.6302, + "step": 1463 + }, + { + "epoch": 0.4003281378178835, + "grad_norm": 0.1552843600511551, + "learning_rate": 5e-05, + "loss": 1.6113, + "step": 1464 + }, + { + "epoch": 0.4006015859994531, + "grad_norm": 0.15012173354625702, + "learning_rate": 5e-05, + "loss": 1.5396, + "step": 1465 + }, + { + "epoch": 0.4008750341810227, + "grad_norm": 0.15311282873153687, + "learning_rate": 5e-05, + "loss": 1.6266, + "step": 1466 + }, + { + "epoch": 0.40114848236259226, + "grad_norm": 0.15125569701194763, + "learning_rate": 5e-05, + "loss": 1.5979, + "step": 1467 + }, + { + "epoch": 0.40142193054416186, + "grad_norm": 0.1545860469341278, + "learning_rate": 5e-05, + "loss": 1.6872, + "step": 1468 + }, + { + "epoch": 0.40169537872573147, + "grad_norm": 0.1852022111415863, + "learning_rate": 5e-05, + "loss": 1.6847, + "step": 1469 + }, + { + "epoch": 0.40196882690730107, + "grad_norm": 0.1469736099243164, + "learning_rate": 5e-05, + "loss": 1.58, + "step": 1470 + }, + { + "epoch": 0.40224227508887067, + "grad_norm": 0.17848168313503265, + "learning_rate": 5e-05, + "loss": 1.6899, + "step": 1471 + }, + { + "epoch": 0.4025157232704403, + "grad_norm": 0.16272708773612976, + "learning_rate": 5e-05, + "loss": 1.6655, + "step": 1472 + }, + { + "epoch": 0.4027891714520098, + "grad_norm": 0.1590905487537384, + "learning_rate": 5e-05, + "loss": 1.6239, + "step": 1473 + }, + { + "epoch": 0.4030626196335794, + "grad_norm": 0.16318002343177795, + "learning_rate": 5e-05, + "loss": 1.6324, + "step": 1474 + }, + { + "epoch": 0.403336067815149, + "grad_norm": 0.1604347825050354, + "learning_rate": 5e-05, + "loss": 1.6793, + "step": 1475 + }, + { + "epoch": 0.4036095159967186, + "grad_norm": 0.1481696516275406, + "learning_rate": 5e-05, + "loss": 1.6391, + "step": 1476 + }, + { + "epoch": 0.40388296417828823, + "grad_norm": 0.1675114631652832, + "learning_rate": 5e-05, + "loss": 1.5925, + "step": 1477 + }, + { + "epoch": 0.40415641235985783, + "grad_norm": 0.15092286467552185, + "learning_rate": 5e-05, + "loss": 1.6294, + "step": 1478 + }, + { + "epoch": 0.4044298605414274, + "grad_norm": 0.1638384312391281, + "learning_rate": 5e-05, + "loss": 1.6552, + "step": 1479 + }, + { + "epoch": 0.404703308722997, + "grad_norm": 0.1602986752986908, + "learning_rate": 5e-05, + "loss": 1.6264, + "step": 1480 + }, + { + "epoch": 0.4049767569045666, + "grad_norm": 0.1523655354976654, + "learning_rate": 5e-05, + "loss": 1.6013, + "step": 1481 + }, + { + "epoch": 0.4052502050861362, + "grad_norm": 0.1629789173603058, + "learning_rate": 5e-05, + "loss": 1.6301, + "step": 1482 + }, + { + "epoch": 0.4055236532677058, + "grad_norm": 0.14943981170654297, + "learning_rate": 5e-05, + "loss": 1.5619, + "step": 1483 + }, + { + "epoch": 0.4057971014492754, + "grad_norm": 0.15840235352516174, + "learning_rate": 5e-05, + "loss": 1.6715, + "step": 1484 + }, + { + "epoch": 0.40607054963084493, + "grad_norm": 0.15214814245700836, + "learning_rate": 5e-05, + "loss": 1.6118, + "step": 1485 + }, + { + "epoch": 0.40634399781241454, + "grad_norm": 0.15232683718204498, + "learning_rate": 5e-05, + "loss": 1.604, + "step": 1486 + }, + { + "epoch": 0.40661744599398414, + "grad_norm": 0.15846048295497894, + "learning_rate": 5e-05, + "loss": 1.6588, + "step": 1487 + }, + { + "epoch": 0.40689089417555374, + "grad_norm": 0.15377342700958252, + "learning_rate": 5e-05, + "loss": 1.5945, + "step": 1488 + }, + { + "epoch": 0.40716434235712334, + "grad_norm": 0.14967899024486542, + "learning_rate": 5e-05, + "loss": 1.5724, + "step": 1489 + }, + { + "epoch": 0.4074377905386929, + "grad_norm": 0.15246756374835968, + "learning_rate": 5e-05, + "loss": 1.651, + "step": 1490 + }, + { + "epoch": 0.4077112387202625, + "grad_norm": 0.1686270534992218, + "learning_rate": 5e-05, + "loss": 1.519, + "step": 1491 + }, + { + "epoch": 0.4079846869018321, + "grad_norm": 0.1610393226146698, + "learning_rate": 5e-05, + "loss": 1.6535, + "step": 1492 + }, + { + "epoch": 0.4082581350834017, + "grad_norm": 0.17595553398132324, + "learning_rate": 5e-05, + "loss": 1.6219, + "step": 1493 + }, + { + "epoch": 0.4085315832649713, + "grad_norm": 0.1568283587694168, + "learning_rate": 5e-05, + "loss": 1.6203, + "step": 1494 + }, + { + "epoch": 0.4088050314465409, + "grad_norm": 0.15611572563648224, + "learning_rate": 5e-05, + "loss": 1.7053, + "step": 1495 + }, + { + "epoch": 0.40907847962811045, + "grad_norm": 0.16013391315937042, + "learning_rate": 5e-05, + "loss": 1.6718, + "step": 1496 + }, + { + "epoch": 0.40935192780968005, + "grad_norm": 0.14747720956802368, + "learning_rate": 5e-05, + "loss": 1.5962, + "step": 1497 + }, + { + "epoch": 0.40962537599124965, + "grad_norm": 0.1694934070110321, + "learning_rate": 5e-05, + "loss": 1.6709, + "step": 1498 + }, + { + "epoch": 0.40989882417281925, + "grad_norm": 0.17449548840522766, + "learning_rate": 5e-05, + "loss": 1.6565, + "step": 1499 + }, + { + "epoch": 0.41017227235438886, + "grad_norm": 0.15884292125701904, + "learning_rate": 5e-05, + "loss": 1.6468, + "step": 1500 + }, + { + "epoch": 0.41044572053595846, + "grad_norm": 0.1549280881881714, + "learning_rate": 5e-05, + "loss": 1.5994, + "step": 1501 + }, + { + "epoch": 0.410719168717528, + "grad_norm": 0.15527546405792236, + "learning_rate": 5e-05, + "loss": 1.555, + "step": 1502 + }, + { + "epoch": 0.4109926168990976, + "grad_norm": 0.1679718941450119, + "learning_rate": 5e-05, + "loss": 1.6539, + "step": 1503 + }, + { + "epoch": 0.4112660650806672, + "grad_norm": 0.14265212416648865, + "learning_rate": 5e-05, + "loss": 1.434, + "step": 1504 + }, + { + "epoch": 0.4115395132622368, + "grad_norm": 0.1542886644601822, + "learning_rate": 5e-05, + "loss": 1.5932, + "step": 1505 + }, + { + "epoch": 0.4118129614438064, + "grad_norm": 0.1690497249364853, + "learning_rate": 5e-05, + "loss": 1.6344, + "step": 1506 + }, + { + "epoch": 0.412086409625376, + "grad_norm": 0.1507546305656433, + "learning_rate": 5e-05, + "loss": 1.5773, + "step": 1507 + }, + { + "epoch": 0.41235985780694556, + "grad_norm": 0.15997721254825592, + "learning_rate": 5e-05, + "loss": 1.694, + "step": 1508 + }, + { + "epoch": 0.41263330598851516, + "grad_norm": 0.15315738320350647, + "learning_rate": 5e-05, + "loss": 1.6413, + "step": 1509 + }, + { + "epoch": 0.41290675417008477, + "grad_norm": 0.14834025502204895, + "learning_rate": 5e-05, + "loss": 1.6389, + "step": 1510 + }, + { + "epoch": 0.41318020235165437, + "grad_norm": 0.15537337958812714, + "learning_rate": 5e-05, + "loss": 1.6266, + "step": 1511 + }, + { + "epoch": 0.41345365053322397, + "grad_norm": 0.15986233949661255, + "learning_rate": 5e-05, + "loss": 1.5657, + "step": 1512 + }, + { + "epoch": 0.4137270987147936, + "grad_norm": 0.15568973124027252, + "learning_rate": 5e-05, + "loss": 1.6403, + "step": 1513 + }, + { + "epoch": 0.4140005468963631, + "grad_norm": 0.16333115100860596, + "learning_rate": 5e-05, + "loss": 1.6504, + "step": 1514 + }, + { + "epoch": 0.4142739950779327, + "grad_norm": 0.1546471118927002, + "learning_rate": 5e-05, + "loss": 1.6104, + "step": 1515 + }, + { + "epoch": 0.4145474432595023, + "grad_norm": 0.16248218715190887, + "learning_rate": 5e-05, + "loss": 1.6648, + "step": 1516 + }, + { + "epoch": 0.4148208914410719, + "grad_norm": 0.16074463725090027, + "learning_rate": 5e-05, + "loss": 1.6331, + "step": 1517 + }, + { + "epoch": 0.41509433962264153, + "grad_norm": 0.15557120740413666, + "learning_rate": 5e-05, + "loss": 1.6255, + "step": 1518 + }, + { + "epoch": 0.4153677878042111, + "grad_norm": 0.15570150315761566, + "learning_rate": 5e-05, + "loss": 1.6743, + "step": 1519 + }, + { + "epoch": 0.4156412359857807, + "grad_norm": 0.17130300402641296, + "learning_rate": 5e-05, + "loss": 1.6354, + "step": 1520 + }, + { + "epoch": 0.4159146841673503, + "grad_norm": 0.1627659946680069, + "learning_rate": 5e-05, + "loss": 1.6975, + "step": 1521 + }, + { + "epoch": 0.4161881323489199, + "grad_norm": 0.1583612710237503, + "learning_rate": 5e-05, + "loss": 1.6427, + "step": 1522 + }, + { + "epoch": 0.4164615805304895, + "grad_norm": 0.15877516567707062, + "learning_rate": 5e-05, + "loss": 1.6016, + "step": 1523 + }, + { + "epoch": 0.4167350287120591, + "grad_norm": 0.16064798831939697, + "learning_rate": 5e-05, + "loss": 1.605, + "step": 1524 + }, + { + "epoch": 0.41700847689362863, + "grad_norm": 0.16150295734405518, + "learning_rate": 5e-05, + "loss": 1.7196, + "step": 1525 + }, + { + "epoch": 0.41728192507519823, + "grad_norm": 0.1557587832212448, + "learning_rate": 5e-05, + "loss": 1.6024, + "step": 1526 + }, + { + "epoch": 0.41755537325676784, + "grad_norm": 0.14889568090438843, + "learning_rate": 5e-05, + "loss": 1.6842, + "step": 1527 + }, + { + "epoch": 0.41782882143833744, + "grad_norm": 0.15397529304027557, + "learning_rate": 5e-05, + "loss": 1.6452, + "step": 1528 + }, + { + "epoch": 0.41810226961990704, + "grad_norm": 0.14965930581092834, + "learning_rate": 5e-05, + "loss": 1.599, + "step": 1529 + }, + { + "epoch": 0.41837571780147664, + "grad_norm": 0.14950762689113617, + "learning_rate": 5e-05, + "loss": 1.6073, + "step": 1530 + }, + { + "epoch": 0.4186491659830462, + "grad_norm": 0.1583251804113388, + "learning_rate": 5e-05, + "loss": 1.661, + "step": 1531 + }, + { + "epoch": 0.4189226141646158, + "grad_norm": 0.1470499336719513, + "learning_rate": 5e-05, + "loss": 1.4718, + "step": 1532 + }, + { + "epoch": 0.4191960623461854, + "grad_norm": 0.15115216374397278, + "learning_rate": 5e-05, + "loss": 1.6577, + "step": 1533 + }, + { + "epoch": 0.419469510527755, + "grad_norm": 0.1576387733221054, + "learning_rate": 5e-05, + "loss": 1.6743, + "step": 1534 + }, + { + "epoch": 0.4197429587093246, + "grad_norm": 0.15287528932094574, + "learning_rate": 5e-05, + "loss": 1.619, + "step": 1535 + }, + { + "epoch": 0.4200164068908942, + "grad_norm": 0.15793505311012268, + "learning_rate": 5e-05, + "loss": 1.604, + "step": 1536 + }, + { + "epoch": 0.42028985507246375, + "grad_norm": 0.14916017651557922, + "learning_rate": 5e-05, + "loss": 1.6273, + "step": 1537 + }, + { + "epoch": 0.42056330325403335, + "grad_norm": 0.17765149474143982, + "learning_rate": 5e-05, + "loss": 1.7046, + "step": 1538 + }, + { + "epoch": 0.42083675143560295, + "grad_norm": 0.15169952809810638, + "learning_rate": 5e-05, + "loss": 1.5879, + "step": 1539 + }, + { + "epoch": 0.42111019961717255, + "grad_norm": 0.15626677870750427, + "learning_rate": 5e-05, + "loss": 1.5916, + "step": 1540 + }, + { + "epoch": 0.42138364779874216, + "grad_norm": 0.17149586975574493, + "learning_rate": 5e-05, + "loss": 1.5988, + "step": 1541 + }, + { + "epoch": 0.42165709598031176, + "grad_norm": 0.15350784361362457, + "learning_rate": 5e-05, + "loss": 1.7237, + "step": 1542 + }, + { + "epoch": 0.4219305441618813, + "grad_norm": 0.1620538979768753, + "learning_rate": 5e-05, + "loss": 1.6646, + "step": 1543 + }, + { + "epoch": 0.4222039923434509, + "grad_norm": 0.19055283069610596, + "learning_rate": 5e-05, + "loss": 1.6122, + "step": 1544 + }, + { + "epoch": 0.4224774405250205, + "grad_norm": 0.16404415667057037, + "learning_rate": 5e-05, + "loss": 1.718, + "step": 1545 + }, + { + "epoch": 0.4227508887065901, + "grad_norm": 0.18905016779899597, + "learning_rate": 5e-05, + "loss": 1.6112, + "step": 1546 + }, + { + "epoch": 0.4230243368881597, + "grad_norm": 0.15663960576057434, + "learning_rate": 5e-05, + "loss": 1.5915, + "step": 1547 + }, + { + "epoch": 0.42329778506972926, + "grad_norm": 0.15725626051425934, + "learning_rate": 5e-05, + "loss": 1.6258, + "step": 1548 + }, + { + "epoch": 0.42357123325129886, + "grad_norm": 0.14791657030582428, + "learning_rate": 5e-05, + "loss": 1.6198, + "step": 1549 + }, + { + "epoch": 0.42384468143286846, + "grad_norm": 0.1585807204246521, + "learning_rate": 5e-05, + "loss": 1.6669, + "step": 1550 + }, + { + "epoch": 0.42411812961443807, + "grad_norm": 0.1611957997083664, + "learning_rate": 5e-05, + "loss": 1.6661, + "step": 1551 + }, + { + "epoch": 0.42439157779600767, + "grad_norm": 0.15032406151294708, + "learning_rate": 5e-05, + "loss": 1.6543, + "step": 1552 + }, + { + "epoch": 0.42466502597757727, + "grad_norm": 0.15738679468631744, + "learning_rate": 5e-05, + "loss": 1.6384, + "step": 1553 + }, + { + "epoch": 0.4249384741591468, + "grad_norm": 0.1558157354593277, + "learning_rate": 5e-05, + "loss": 1.6816, + "step": 1554 + }, + { + "epoch": 0.4252119223407164, + "grad_norm": 0.15282602608203888, + "learning_rate": 5e-05, + "loss": 1.6191, + "step": 1555 + }, + { + "epoch": 0.425485370522286, + "grad_norm": 0.16039276123046875, + "learning_rate": 5e-05, + "loss": 1.6248, + "step": 1556 + }, + { + "epoch": 0.4257588187038556, + "grad_norm": 0.1603170931339264, + "learning_rate": 5e-05, + "loss": 1.6796, + "step": 1557 + }, + { + "epoch": 0.4260322668854252, + "grad_norm": 0.16182708740234375, + "learning_rate": 5e-05, + "loss": 1.62, + "step": 1558 + }, + { + "epoch": 0.42630571506699483, + "grad_norm": 0.15992410480976105, + "learning_rate": 5e-05, + "loss": 1.5315, + "step": 1559 + }, + { + "epoch": 0.4265791632485644, + "grad_norm": 0.15438152849674225, + "learning_rate": 5e-05, + "loss": 1.6707, + "step": 1560 + }, + { + "epoch": 0.426852611430134, + "grad_norm": 0.1619657427072525, + "learning_rate": 5e-05, + "loss": 1.6402, + "step": 1561 + }, + { + "epoch": 0.4271260596117036, + "grad_norm": 0.16311468183994293, + "learning_rate": 5e-05, + "loss": 1.6292, + "step": 1562 + }, + { + "epoch": 0.4273995077932732, + "grad_norm": 0.15871946513652802, + "learning_rate": 5e-05, + "loss": 1.7462, + "step": 1563 + }, + { + "epoch": 0.4276729559748428, + "grad_norm": 0.17374621331691742, + "learning_rate": 5e-05, + "loss": 1.6766, + "step": 1564 + }, + { + "epoch": 0.4279464041564124, + "grad_norm": 0.1653493344783783, + "learning_rate": 5e-05, + "loss": 1.6076, + "step": 1565 + }, + { + "epoch": 0.42821985233798193, + "grad_norm": 0.16838444769382477, + "learning_rate": 5e-05, + "loss": 1.754, + "step": 1566 + }, + { + "epoch": 0.42849330051955153, + "grad_norm": 0.14829325675964355, + "learning_rate": 5e-05, + "loss": 1.5052, + "step": 1567 + }, + { + "epoch": 0.42876674870112114, + "grad_norm": 0.1606733649969101, + "learning_rate": 5e-05, + "loss": 1.7269, + "step": 1568 + }, + { + "epoch": 0.42904019688269074, + "grad_norm": 0.16772723197937012, + "learning_rate": 5e-05, + "loss": 1.7469, + "step": 1569 + }, + { + "epoch": 0.42931364506426034, + "grad_norm": 0.1608705222606659, + "learning_rate": 5e-05, + "loss": 1.6371, + "step": 1570 + }, + { + "epoch": 0.42958709324582994, + "grad_norm": 0.16294115781784058, + "learning_rate": 5e-05, + "loss": 1.5675, + "step": 1571 + }, + { + "epoch": 0.4298605414273995, + "grad_norm": 0.16304922103881836, + "learning_rate": 5e-05, + "loss": 1.7321, + "step": 1572 + }, + { + "epoch": 0.4301339896089691, + "grad_norm": 0.18761757016181946, + "learning_rate": 5e-05, + "loss": 1.7382, + "step": 1573 + }, + { + "epoch": 0.4304074377905387, + "grad_norm": 0.18934416770935059, + "learning_rate": 5e-05, + "loss": 1.5394, + "step": 1574 + }, + { + "epoch": 0.4306808859721083, + "grad_norm": 0.16312015056610107, + "learning_rate": 5e-05, + "loss": 1.6435, + "step": 1575 + }, + { + "epoch": 0.4309543341536779, + "grad_norm": 0.20754271745681763, + "learning_rate": 5e-05, + "loss": 1.625, + "step": 1576 + }, + { + "epoch": 0.43122778233524744, + "grad_norm": 0.19495531916618347, + "learning_rate": 5e-05, + "loss": 1.6369, + "step": 1577 + }, + { + "epoch": 0.43150123051681705, + "grad_norm": 0.1650926023721695, + "learning_rate": 5e-05, + "loss": 1.7474, + "step": 1578 + }, + { + "epoch": 0.43177467869838665, + "grad_norm": 0.20554234087467194, + "learning_rate": 5e-05, + "loss": 1.5715, + "step": 1579 + }, + { + "epoch": 0.43204812687995625, + "grad_norm": 0.18859036266803741, + "learning_rate": 5e-05, + "loss": 1.6938, + "step": 1580 + }, + { + "epoch": 0.43232157506152585, + "grad_norm": 0.16272279620170593, + "learning_rate": 5e-05, + "loss": 1.583, + "step": 1581 + }, + { + "epoch": 0.43259502324309546, + "grad_norm": 0.20159992575645447, + "learning_rate": 5e-05, + "loss": 1.594, + "step": 1582 + }, + { + "epoch": 0.432868471424665, + "grad_norm": 0.16306354105472565, + "learning_rate": 5e-05, + "loss": 1.6359, + "step": 1583 + }, + { + "epoch": 0.4331419196062346, + "grad_norm": 0.16837598383426666, + "learning_rate": 5e-05, + "loss": 1.6118, + "step": 1584 + }, + { + "epoch": 0.4334153677878042, + "grad_norm": 0.19100947678089142, + "learning_rate": 5e-05, + "loss": 1.6267, + "step": 1585 + }, + { + "epoch": 0.4336888159693738, + "grad_norm": 0.15260100364685059, + "learning_rate": 5e-05, + "loss": 1.5452, + "step": 1586 + }, + { + "epoch": 0.4339622641509434, + "grad_norm": 0.15616540610790253, + "learning_rate": 5e-05, + "loss": 1.7126, + "step": 1587 + }, + { + "epoch": 0.434235712332513, + "grad_norm": 0.16332991421222687, + "learning_rate": 5e-05, + "loss": 1.6253, + "step": 1588 + }, + { + "epoch": 0.43450916051408256, + "grad_norm": 0.16623741388320923, + "learning_rate": 5e-05, + "loss": 1.6323, + "step": 1589 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.15148966014385223, + "learning_rate": 5e-05, + "loss": 1.6107, + "step": 1590 + }, + { + "epoch": 0.43505605687722176, + "grad_norm": 0.1654033362865448, + "learning_rate": 5e-05, + "loss": 1.5779, + "step": 1591 + }, + { + "epoch": 0.43532950505879137, + "grad_norm": 0.17548300325870514, + "learning_rate": 5e-05, + "loss": 1.6037, + "step": 1592 + }, + { + "epoch": 0.43560295324036097, + "grad_norm": 0.1487036496400833, + "learning_rate": 5e-05, + "loss": 1.5709, + "step": 1593 + }, + { + "epoch": 0.43587640142193057, + "grad_norm": 0.17089228332042694, + "learning_rate": 5e-05, + "loss": 1.6508, + "step": 1594 + }, + { + "epoch": 0.4361498496035001, + "grad_norm": 0.17811010777950287, + "learning_rate": 5e-05, + "loss": 1.5717, + "step": 1595 + }, + { + "epoch": 0.4364232977850697, + "grad_norm": 0.16123969852924347, + "learning_rate": 5e-05, + "loss": 1.657, + "step": 1596 + }, + { + "epoch": 0.4366967459666393, + "grad_norm": 0.15267348289489746, + "learning_rate": 5e-05, + "loss": 1.637, + "step": 1597 + }, + { + "epoch": 0.4369701941482089, + "grad_norm": 0.19433604180812836, + "learning_rate": 5e-05, + "loss": 1.5795, + "step": 1598 + }, + { + "epoch": 0.4372436423297785, + "grad_norm": 0.15938717126846313, + "learning_rate": 5e-05, + "loss": 1.719, + "step": 1599 + }, + { + "epoch": 0.4375170905113481, + "grad_norm": 0.16735820472240448, + "learning_rate": 5e-05, + "loss": 1.7389, + "step": 1600 + }, + { + "epoch": 0.4377905386929177, + "grad_norm": 0.19536766409873962, + "learning_rate": 5e-05, + "loss": 1.7317, + "step": 1601 + }, + { + "epoch": 0.4380639868744873, + "grad_norm": 0.18374434113502502, + "learning_rate": 5e-05, + "loss": 1.541, + "step": 1602 + }, + { + "epoch": 0.4383374350560569, + "grad_norm": 0.15462787449359894, + "learning_rate": 5e-05, + "loss": 1.6388, + "step": 1603 + }, + { + "epoch": 0.4386108832376265, + "grad_norm": 0.18189293146133423, + "learning_rate": 5e-05, + "loss": 1.6179, + "step": 1604 + }, + { + "epoch": 0.4388843314191961, + "grad_norm": 0.17478562891483307, + "learning_rate": 5e-05, + "loss": 1.6332, + "step": 1605 + }, + { + "epoch": 0.43915777960076563, + "grad_norm": 0.16604724526405334, + "learning_rate": 5e-05, + "loss": 1.6758, + "step": 1606 + }, + { + "epoch": 0.43943122778233523, + "grad_norm": 0.14763274788856506, + "learning_rate": 5e-05, + "loss": 1.5179, + "step": 1607 + }, + { + "epoch": 0.43970467596390483, + "grad_norm": 0.18372495472431183, + "learning_rate": 5e-05, + "loss": 1.6489, + "step": 1608 + }, + { + "epoch": 0.43997812414547444, + "grad_norm": 0.19224227964878082, + "learning_rate": 5e-05, + "loss": 1.6223, + "step": 1609 + }, + { + "epoch": 0.44025157232704404, + "grad_norm": 0.1518845111131668, + "learning_rate": 5e-05, + "loss": 1.6314, + "step": 1610 + }, + { + "epoch": 0.44052502050861364, + "grad_norm": 0.18382224440574646, + "learning_rate": 5e-05, + "loss": 1.6356, + "step": 1611 + }, + { + "epoch": 0.4407984686901832, + "grad_norm": 0.18745778501033783, + "learning_rate": 5e-05, + "loss": 1.6189, + "step": 1612 + }, + { + "epoch": 0.4410719168717528, + "grad_norm": 0.1606750637292862, + "learning_rate": 5e-05, + "loss": 1.7485, + "step": 1613 + }, + { + "epoch": 0.4413453650533224, + "grad_norm": 0.176387757062912, + "learning_rate": 5e-05, + "loss": 1.6141, + "step": 1614 + }, + { + "epoch": 0.441618813234892, + "grad_norm": 0.17925933003425598, + "learning_rate": 5e-05, + "loss": 1.6457, + "step": 1615 + }, + { + "epoch": 0.4418922614164616, + "grad_norm": 0.15073414146900177, + "learning_rate": 5e-05, + "loss": 1.5818, + "step": 1616 + }, + { + "epoch": 0.4421657095980312, + "grad_norm": 0.18990382552146912, + "learning_rate": 5e-05, + "loss": 1.6838, + "step": 1617 + }, + { + "epoch": 0.44243915777960074, + "grad_norm": 0.1909227818250656, + "learning_rate": 5e-05, + "loss": 1.7176, + "step": 1618 + }, + { + "epoch": 0.44271260596117035, + "grad_norm": 0.14983290433883667, + "learning_rate": 5e-05, + "loss": 1.5743, + "step": 1619 + }, + { + "epoch": 0.44298605414273995, + "grad_norm": 0.18672508001327515, + "learning_rate": 5e-05, + "loss": 1.7437, + "step": 1620 + }, + { + "epoch": 0.44325950232430955, + "grad_norm": 0.1633402407169342, + "learning_rate": 5e-05, + "loss": 1.6302, + "step": 1621 + }, + { + "epoch": 0.44353295050587915, + "grad_norm": 0.15954262018203735, + "learning_rate": 5e-05, + "loss": 1.6534, + "step": 1622 + }, + { + "epoch": 0.44380639868744876, + "grad_norm": 0.17406605184078217, + "learning_rate": 5e-05, + "loss": 1.6408, + "step": 1623 + }, + { + "epoch": 0.4440798468690183, + "grad_norm": 0.15827330946922302, + "learning_rate": 5e-05, + "loss": 1.7256, + "step": 1624 + }, + { + "epoch": 0.4443532950505879, + "grad_norm": 0.1576947122812271, + "learning_rate": 5e-05, + "loss": 1.6882, + "step": 1625 + }, + { + "epoch": 0.4446267432321575, + "grad_norm": 0.161362424492836, + "learning_rate": 5e-05, + "loss": 1.5669, + "step": 1626 + }, + { + "epoch": 0.4449001914137271, + "grad_norm": 0.15085799992084503, + "learning_rate": 5e-05, + "loss": 1.6761, + "step": 1627 + }, + { + "epoch": 0.4451736395952967, + "grad_norm": 0.14607954025268555, + "learning_rate": 5e-05, + "loss": 1.5689, + "step": 1628 + }, + { + "epoch": 0.44544708777686626, + "grad_norm": 0.149903804063797, + "learning_rate": 5e-05, + "loss": 1.6317, + "step": 1629 + }, + { + "epoch": 0.44572053595843586, + "grad_norm": 0.15304753184318542, + "learning_rate": 5e-05, + "loss": 1.6986, + "step": 1630 + }, + { + "epoch": 0.44599398414000546, + "grad_norm": 0.15582279860973358, + "learning_rate": 5e-05, + "loss": 1.6101, + "step": 1631 + }, + { + "epoch": 0.44626743232157506, + "grad_norm": 0.15496088564395905, + "learning_rate": 5e-05, + "loss": 1.6459, + "step": 1632 + }, + { + "epoch": 0.44654088050314467, + "grad_norm": 0.1606382578611374, + "learning_rate": 5e-05, + "loss": 1.6516, + "step": 1633 + }, + { + "epoch": 0.44681432868471427, + "grad_norm": 0.16134901344776154, + "learning_rate": 5e-05, + "loss": 1.7784, + "step": 1634 + }, + { + "epoch": 0.4470877768662838, + "grad_norm": 0.16512495279312134, + "learning_rate": 5e-05, + "loss": 1.6066, + "step": 1635 + }, + { + "epoch": 0.4473612250478534, + "grad_norm": 0.15310251712799072, + "learning_rate": 5e-05, + "loss": 1.6425, + "step": 1636 + }, + { + "epoch": 0.447634673229423, + "grad_norm": 0.1573248952627182, + "learning_rate": 5e-05, + "loss": 1.6168, + "step": 1637 + }, + { + "epoch": 0.4479081214109926, + "grad_norm": 0.15116660296916962, + "learning_rate": 5e-05, + "loss": 1.5542, + "step": 1638 + }, + { + "epoch": 0.4481815695925622, + "grad_norm": 0.1584998220205307, + "learning_rate": 5e-05, + "loss": 1.6503, + "step": 1639 + }, + { + "epoch": 0.4484550177741318, + "grad_norm": 0.15654048323631287, + "learning_rate": 5e-05, + "loss": 1.6022, + "step": 1640 + }, + { + "epoch": 0.44872846595570137, + "grad_norm": 0.1555994600057602, + "learning_rate": 5e-05, + "loss": 1.5843, + "step": 1641 + }, + { + "epoch": 0.449001914137271, + "grad_norm": 0.1540311872959137, + "learning_rate": 5e-05, + "loss": 1.6218, + "step": 1642 + }, + { + "epoch": 0.4492753623188406, + "grad_norm": 0.16408203542232513, + "learning_rate": 5e-05, + "loss": 1.6489, + "step": 1643 + }, + { + "epoch": 0.4495488105004102, + "grad_norm": 0.15430676937103271, + "learning_rate": 5e-05, + "loss": 1.6169, + "step": 1644 + }, + { + "epoch": 0.4498222586819798, + "grad_norm": 0.15662290155887604, + "learning_rate": 5e-05, + "loss": 1.7067, + "step": 1645 + }, + { + "epoch": 0.4500957068635494, + "grad_norm": 0.16638533771038055, + "learning_rate": 5e-05, + "loss": 1.6786, + "step": 1646 + }, + { + "epoch": 0.45036915504511893, + "grad_norm": 0.15840692818164825, + "learning_rate": 5e-05, + "loss": 1.5811, + "step": 1647 + }, + { + "epoch": 0.45064260322668853, + "grad_norm": 0.15642379224300385, + "learning_rate": 5e-05, + "loss": 1.6151, + "step": 1648 + }, + { + "epoch": 0.45091605140825813, + "grad_norm": 0.15364129841327667, + "learning_rate": 5e-05, + "loss": 1.5745, + "step": 1649 + }, + { + "epoch": 0.45118949958982774, + "grad_norm": 0.16230598092079163, + "learning_rate": 5e-05, + "loss": 1.5954, + "step": 1650 + }, + { + "epoch": 0.45146294777139734, + "grad_norm": 0.16471756994724274, + "learning_rate": 5e-05, + "loss": 1.6509, + "step": 1651 + }, + { + "epoch": 0.45173639595296694, + "grad_norm": 0.16866976022720337, + "learning_rate": 5e-05, + "loss": 1.7927, + "step": 1652 + }, + { + "epoch": 0.4520098441345365, + "grad_norm": 0.16828210651874542, + "learning_rate": 5e-05, + "loss": 1.7027, + "step": 1653 + }, + { + "epoch": 0.4522832923161061, + "grad_norm": 0.15160506963729858, + "learning_rate": 5e-05, + "loss": 1.6926, + "step": 1654 + }, + { + "epoch": 0.4525567404976757, + "grad_norm": 0.1498889923095703, + "learning_rate": 5e-05, + "loss": 1.6022, + "step": 1655 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 0.16272372007369995, + "learning_rate": 5e-05, + "loss": 1.6532, + "step": 1656 + }, + { + "epoch": 0.4531036368608149, + "grad_norm": 0.16485629975795746, + "learning_rate": 5e-05, + "loss": 1.6615, + "step": 1657 + }, + { + "epoch": 0.45337708504238444, + "grad_norm": 0.15402807295322418, + "learning_rate": 5e-05, + "loss": 1.6162, + "step": 1658 + }, + { + "epoch": 0.45365053322395404, + "grad_norm": 0.14957159757614136, + "learning_rate": 5e-05, + "loss": 1.646, + "step": 1659 + }, + { + "epoch": 0.45392398140552365, + "grad_norm": 0.15733934938907623, + "learning_rate": 5e-05, + "loss": 1.5475, + "step": 1660 + }, + { + "epoch": 0.45419742958709325, + "grad_norm": 0.16374224424362183, + "learning_rate": 5e-05, + "loss": 1.6937, + "step": 1661 + }, + { + "epoch": 0.45447087776866285, + "grad_norm": 0.1516236960887909, + "learning_rate": 5e-05, + "loss": 1.5963, + "step": 1662 + }, + { + "epoch": 0.45474432595023245, + "grad_norm": 0.1528540402650833, + "learning_rate": 5e-05, + "loss": 1.5636, + "step": 1663 + }, + { + "epoch": 0.455017774131802, + "grad_norm": 0.1831309050321579, + "learning_rate": 5e-05, + "loss": 1.7391, + "step": 1664 + }, + { + "epoch": 0.4552912223133716, + "grad_norm": 0.1571461260318756, + "learning_rate": 5e-05, + "loss": 1.56, + "step": 1665 + }, + { + "epoch": 0.4555646704949412, + "grad_norm": 0.17489275336265564, + "learning_rate": 5e-05, + "loss": 1.7431, + "step": 1666 + }, + { + "epoch": 0.4558381186765108, + "grad_norm": 0.16362959146499634, + "learning_rate": 5e-05, + "loss": 1.6041, + "step": 1667 + }, + { + "epoch": 0.4561115668580804, + "grad_norm": 0.1590225100517273, + "learning_rate": 5e-05, + "loss": 1.7021, + "step": 1668 + }, + { + "epoch": 0.45638501503965, + "grad_norm": 0.1953715831041336, + "learning_rate": 5e-05, + "loss": 1.7408, + "step": 1669 + }, + { + "epoch": 0.45665846322121956, + "grad_norm": 0.1608944684267044, + "learning_rate": 5e-05, + "loss": 1.6262, + "step": 1670 + }, + { + "epoch": 0.45693191140278916, + "grad_norm": 0.16777822375297546, + "learning_rate": 5e-05, + "loss": 1.6342, + "step": 1671 + }, + { + "epoch": 0.45720535958435876, + "grad_norm": 0.16327768564224243, + "learning_rate": 5e-05, + "loss": 1.6965, + "step": 1672 + }, + { + "epoch": 0.45747880776592836, + "grad_norm": 0.15659572184085846, + "learning_rate": 5e-05, + "loss": 1.6095, + "step": 1673 + }, + { + "epoch": 0.45775225594749797, + "grad_norm": 0.17218998074531555, + "learning_rate": 5e-05, + "loss": 1.6831, + "step": 1674 + }, + { + "epoch": 0.45802570412906757, + "grad_norm": 0.1536104679107666, + "learning_rate": 5e-05, + "loss": 1.5643, + "step": 1675 + }, + { + "epoch": 0.4582991523106371, + "grad_norm": 0.15404142439365387, + "learning_rate": 5e-05, + "loss": 1.5892, + "step": 1676 + }, + { + "epoch": 0.4585726004922067, + "grad_norm": 0.15226425230503082, + "learning_rate": 5e-05, + "loss": 1.582, + "step": 1677 + }, + { + "epoch": 0.4588460486737763, + "grad_norm": 0.1722135990858078, + "learning_rate": 5e-05, + "loss": 1.6726, + "step": 1678 + }, + { + "epoch": 0.4591194968553459, + "grad_norm": 0.14889506995677948, + "learning_rate": 5e-05, + "loss": 1.5599, + "step": 1679 + }, + { + "epoch": 0.4593929450369155, + "grad_norm": 0.15106317400932312, + "learning_rate": 5e-05, + "loss": 1.6128, + "step": 1680 + }, + { + "epoch": 0.45966639321848507, + "grad_norm": 0.1514967978000641, + "learning_rate": 5e-05, + "loss": 1.5257, + "step": 1681 + }, + { + "epoch": 0.45993984140005467, + "grad_norm": 0.1474781632423401, + "learning_rate": 5e-05, + "loss": 1.5199, + "step": 1682 + }, + { + "epoch": 0.4602132895816243, + "grad_norm": 0.15649180114269257, + "learning_rate": 5e-05, + "loss": 1.7461, + "step": 1683 + }, + { + "epoch": 0.4604867377631939, + "grad_norm": 0.15975254774093628, + "learning_rate": 5e-05, + "loss": 1.6344, + "step": 1684 + }, + { + "epoch": 0.4607601859447635, + "grad_norm": 0.15253420174121857, + "learning_rate": 5e-05, + "loss": 1.5446, + "step": 1685 + }, + { + "epoch": 0.4610336341263331, + "grad_norm": 0.16303133964538574, + "learning_rate": 5e-05, + "loss": 1.6806, + "step": 1686 + }, + { + "epoch": 0.4613070823079026, + "grad_norm": 0.15411800146102905, + "learning_rate": 5e-05, + "loss": 1.6656, + "step": 1687 + }, + { + "epoch": 0.46158053048947223, + "grad_norm": 0.15066716074943542, + "learning_rate": 5e-05, + "loss": 1.6403, + "step": 1688 + }, + { + "epoch": 0.46185397867104183, + "grad_norm": 0.15379618108272552, + "learning_rate": 5e-05, + "loss": 1.5969, + "step": 1689 + }, + { + "epoch": 0.46212742685261143, + "grad_norm": 0.15352702140808105, + "learning_rate": 5e-05, + "loss": 1.6403, + "step": 1690 + }, + { + "epoch": 0.46240087503418104, + "grad_norm": 0.158903107047081, + "learning_rate": 5e-05, + "loss": 1.6892, + "step": 1691 + }, + { + "epoch": 0.46267432321575064, + "grad_norm": 0.15684601664543152, + "learning_rate": 5e-05, + "loss": 1.6744, + "step": 1692 + }, + { + "epoch": 0.4629477713973202, + "grad_norm": 0.1492249220609665, + "learning_rate": 5e-05, + "loss": 1.613, + "step": 1693 + }, + { + "epoch": 0.4632212195788898, + "grad_norm": 0.1778024435043335, + "learning_rate": 5e-05, + "loss": 1.7122, + "step": 1694 + }, + { + "epoch": 0.4634946677604594, + "grad_norm": 0.16280174255371094, + "learning_rate": 5e-05, + "loss": 1.6553, + "step": 1695 + }, + { + "epoch": 0.463768115942029, + "grad_norm": 0.1631561666727066, + "learning_rate": 5e-05, + "loss": 1.646, + "step": 1696 + }, + { + "epoch": 0.4640415641235986, + "grad_norm": 0.1574755609035492, + "learning_rate": 5e-05, + "loss": 1.7003, + "step": 1697 + }, + { + "epoch": 0.4643150123051682, + "grad_norm": 0.15220339596271515, + "learning_rate": 5e-05, + "loss": 1.5751, + "step": 1698 + }, + { + "epoch": 0.46458846048673774, + "grad_norm": 0.14914649724960327, + "learning_rate": 5e-05, + "loss": 1.498, + "step": 1699 + }, + { + "epoch": 0.46486190866830734, + "grad_norm": 0.19843582808971405, + "learning_rate": 5e-05, + "loss": 1.7326, + "step": 1700 + }, + { + "epoch": 0.46513535684987695, + "grad_norm": 0.1573631912469864, + "learning_rate": 5e-05, + "loss": 1.6891, + "step": 1701 + }, + { + "epoch": 0.46540880503144655, + "grad_norm": 0.16770999133586884, + "learning_rate": 5e-05, + "loss": 1.6852, + "step": 1702 + }, + { + "epoch": 0.46568225321301615, + "grad_norm": 0.17956487834453583, + "learning_rate": 5e-05, + "loss": 1.6751, + "step": 1703 + }, + { + "epoch": 0.46595570139458575, + "grad_norm": 0.1517421156167984, + "learning_rate": 5e-05, + "loss": 1.6928, + "step": 1704 + }, + { + "epoch": 0.4662291495761553, + "grad_norm": 0.164058655500412, + "learning_rate": 5e-05, + "loss": 1.6313, + "step": 1705 + }, + { + "epoch": 0.4665025977577249, + "grad_norm": 0.16080081462860107, + "learning_rate": 5e-05, + "loss": 1.5901, + "step": 1706 + }, + { + "epoch": 0.4667760459392945, + "grad_norm": 0.149660125374794, + "learning_rate": 5e-05, + "loss": 1.6411, + "step": 1707 + }, + { + "epoch": 0.4670494941208641, + "grad_norm": 0.1580061912536621, + "learning_rate": 5e-05, + "loss": 1.6815, + "step": 1708 + }, + { + "epoch": 0.4673229423024337, + "grad_norm": 0.16102102398872375, + "learning_rate": 5e-05, + "loss": 1.6063, + "step": 1709 + }, + { + "epoch": 0.46759639048400325, + "grad_norm": 0.16004008054733276, + "learning_rate": 5e-05, + "loss": 1.5849, + "step": 1710 + }, + { + "epoch": 0.46786983866557286, + "grad_norm": 0.14842753112316132, + "learning_rate": 5e-05, + "loss": 1.6713, + "step": 1711 + }, + { + "epoch": 0.46814328684714246, + "grad_norm": 0.17115214467048645, + "learning_rate": 5e-05, + "loss": 1.7549, + "step": 1712 + }, + { + "epoch": 0.46841673502871206, + "grad_norm": 0.16055969893932343, + "learning_rate": 5e-05, + "loss": 1.7193, + "step": 1713 + }, + { + "epoch": 0.46869018321028166, + "grad_norm": 0.14992430806159973, + "learning_rate": 5e-05, + "loss": 1.6529, + "step": 1714 + }, + { + "epoch": 0.46896363139185127, + "grad_norm": 0.1594884842634201, + "learning_rate": 5e-05, + "loss": 1.5389, + "step": 1715 + }, + { + "epoch": 0.4692370795734208, + "grad_norm": 0.15089979767799377, + "learning_rate": 5e-05, + "loss": 1.5729, + "step": 1716 + }, + { + "epoch": 0.4695105277549904, + "grad_norm": 0.15678352117538452, + "learning_rate": 5e-05, + "loss": 1.6118, + "step": 1717 + }, + { + "epoch": 0.46978397593656, + "grad_norm": 0.15891622006893158, + "learning_rate": 5e-05, + "loss": 1.6665, + "step": 1718 + }, + { + "epoch": 0.4700574241181296, + "grad_norm": 0.15619421005249023, + "learning_rate": 5e-05, + "loss": 1.657, + "step": 1719 + }, + { + "epoch": 0.4703308722996992, + "grad_norm": 0.15846212208271027, + "learning_rate": 5e-05, + "loss": 1.649, + "step": 1720 + }, + { + "epoch": 0.4706043204812688, + "grad_norm": 0.1617954969406128, + "learning_rate": 5e-05, + "loss": 1.665, + "step": 1721 + }, + { + "epoch": 0.47087776866283837, + "grad_norm": 0.15690676867961884, + "learning_rate": 5e-05, + "loss": 1.5683, + "step": 1722 + }, + { + "epoch": 0.47115121684440797, + "grad_norm": 0.15898874402046204, + "learning_rate": 5e-05, + "loss": 1.5895, + "step": 1723 + }, + { + "epoch": 0.4714246650259776, + "grad_norm": 0.1485942155122757, + "learning_rate": 5e-05, + "loss": 1.5482, + "step": 1724 + }, + { + "epoch": 0.4716981132075472, + "grad_norm": 0.15413321554660797, + "learning_rate": 5e-05, + "loss": 1.6355, + "step": 1725 + }, + { + "epoch": 0.4719715613891168, + "grad_norm": 0.16040156781673431, + "learning_rate": 5e-05, + "loss": 1.6984, + "step": 1726 + }, + { + "epoch": 0.4722450095706864, + "grad_norm": 0.1513047069311142, + "learning_rate": 5e-05, + "loss": 1.5586, + "step": 1727 + }, + { + "epoch": 0.4725184577522559, + "grad_norm": 0.1601109653711319, + "learning_rate": 5e-05, + "loss": 1.7306, + "step": 1728 + }, + { + "epoch": 0.47279190593382553, + "grad_norm": 0.15264980494976044, + "learning_rate": 5e-05, + "loss": 1.6797, + "step": 1729 + }, + { + "epoch": 0.47306535411539513, + "grad_norm": 0.15382279455661774, + "learning_rate": 5e-05, + "loss": 1.7204, + "step": 1730 + }, + { + "epoch": 0.47333880229696473, + "grad_norm": 0.15572312474250793, + "learning_rate": 5e-05, + "loss": 1.5855, + "step": 1731 + }, + { + "epoch": 0.47361225047853434, + "grad_norm": 0.15744370222091675, + "learning_rate": 5e-05, + "loss": 1.6487, + "step": 1732 + }, + { + "epoch": 0.47388569866010394, + "grad_norm": 0.1619214117527008, + "learning_rate": 5e-05, + "loss": 1.6556, + "step": 1733 + }, + { + "epoch": 0.4741591468416735, + "grad_norm": 0.1546183079481125, + "learning_rate": 5e-05, + "loss": 1.585, + "step": 1734 + }, + { + "epoch": 0.4744325950232431, + "grad_norm": 0.1744302213191986, + "learning_rate": 5e-05, + "loss": 1.7555, + "step": 1735 + }, + { + "epoch": 0.4747060432048127, + "grad_norm": 0.16364151239395142, + "learning_rate": 5e-05, + "loss": 1.6622, + "step": 1736 + }, + { + "epoch": 0.4749794913863823, + "grad_norm": 0.14857520163059235, + "learning_rate": 5e-05, + "loss": 1.5803, + "step": 1737 + }, + { + "epoch": 0.4752529395679519, + "grad_norm": 0.1706264317035675, + "learning_rate": 5e-05, + "loss": 1.6314, + "step": 1738 + }, + { + "epoch": 0.47552638774952144, + "grad_norm": 0.15458574891090393, + "learning_rate": 5e-05, + "loss": 1.5577, + "step": 1739 + }, + { + "epoch": 0.47579983593109104, + "grad_norm": 0.1472868025302887, + "learning_rate": 5e-05, + "loss": 1.5581, + "step": 1740 + }, + { + "epoch": 0.47607328411266064, + "grad_norm": 0.16088488698005676, + "learning_rate": 5e-05, + "loss": 1.6711, + "step": 1741 + }, + { + "epoch": 0.47634673229423025, + "grad_norm": 0.16059233248233795, + "learning_rate": 5e-05, + "loss": 1.6239, + "step": 1742 + }, + { + "epoch": 0.47662018047579985, + "grad_norm": 0.15403006970882416, + "learning_rate": 5e-05, + "loss": 1.5916, + "step": 1743 + }, + { + "epoch": 0.47689362865736945, + "grad_norm": 0.15987733006477356, + "learning_rate": 5e-05, + "loss": 1.5877, + "step": 1744 + }, + { + "epoch": 0.477167076838939, + "grad_norm": 0.1711868941783905, + "learning_rate": 5e-05, + "loss": 1.6065, + "step": 1745 + }, + { + "epoch": 0.4774405250205086, + "grad_norm": 0.16569697856903076, + "learning_rate": 5e-05, + "loss": 1.6181, + "step": 1746 + }, + { + "epoch": 0.4777139732020782, + "grad_norm": 0.15485012531280518, + "learning_rate": 5e-05, + "loss": 1.6009, + "step": 1747 + }, + { + "epoch": 0.4779874213836478, + "grad_norm": 0.16187280416488647, + "learning_rate": 5e-05, + "loss": 1.5976, + "step": 1748 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 0.1522330790758133, + "learning_rate": 5e-05, + "loss": 1.7015, + "step": 1749 + }, + { + "epoch": 0.478534317746787, + "grad_norm": 0.1496533900499344, + "learning_rate": 5e-05, + "loss": 1.6114, + "step": 1750 + }, + { + "epoch": 0.47880776592835655, + "grad_norm": 0.16666877269744873, + "learning_rate": 5e-05, + "loss": 1.6728, + "step": 1751 + }, + { + "epoch": 0.47908121410992616, + "grad_norm": 0.1623934507369995, + "learning_rate": 5e-05, + "loss": 1.5624, + "step": 1752 + }, + { + "epoch": 0.47935466229149576, + "grad_norm": 0.1671043336391449, + "learning_rate": 5e-05, + "loss": 1.6066, + "step": 1753 + }, + { + "epoch": 0.47962811047306536, + "grad_norm": 0.16084609925746918, + "learning_rate": 5e-05, + "loss": 1.5419, + "step": 1754 + }, + { + "epoch": 0.47990155865463496, + "grad_norm": 0.18174776434898376, + "learning_rate": 5e-05, + "loss": 1.6554, + "step": 1755 + }, + { + "epoch": 0.48017500683620457, + "grad_norm": 0.15653514862060547, + "learning_rate": 5e-05, + "loss": 1.5938, + "step": 1756 + }, + { + "epoch": 0.4804484550177741, + "grad_norm": 0.16829371452331543, + "learning_rate": 5e-05, + "loss": 1.6566, + "step": 1757 + }, + { + "epoch": 0.4807219031993437, + "grad_norm": 0.16628386080265045, + "learning_rate": 5e-05, + "loss": 1.6325, + "step": 1758 + }, + { + "epoch": 0.4809953513809133, + "grad_norm": 0.15540596842765808, + "learning_rate": 5e-05, + "loss": 1.5648, + "step": 1759 + }, + { + "epoch": 0.4812687995624829, + "grad_norm": 0.18352627754211426, + "learning_rate": 5e-05, + "loss": 1.7056, + "step": 1760 + }, + { + "epoch": 0.4815422477440525, + "grad_norm": 0.16480763256549835, + "learning_rate": 5e-05, + "loss": 1.6272, + "step": 1761 + }, + { + "epoch": 0.48181569592562207, + "grad_norm": 0.16389521956443787, + "learning_rate": 5e-05, + "loss": 1.7246, + "step": 1762 + }, + { + "epoch": 0.48208914410719167, + "grad_norm": 0.14809884130954742, + "learning_rate": 5e-05, + "loss": 1.5785, + "step": 1763 + }, + { + "epoch": 0.48236259228876127, + "grad_norm": 0.1639075130224228, + "learning_rate": 5e-05, + "loss": 1.5561, + "step": 1764 + }, + { + "epoch": 0.4826360404703309, + "grad_norm": 0.17026962339878082, + "learning_rate": 5e-05, + "loss": 1.599, + "step": 1765 + }, + { + "epoch": 0.4829094886519005, + "grad_norm": 0.16182227432727814, + "learning_rate": 5e-05, + "loss": 1.5801, + "step": 1766 + }, + { + "epoch": 0.4831829368334701, + "grad_norm": 0.17417606711387634, + "learning_rate": 5e-05, + "loss": 1.5894, + "step": 1767 + }, + { + "epoch": 0.4834563850150396, + "grad_norm": 0.1606941670179367, + "learning_rate": 5e-05, + "loss": 1.6389, + "step": 1768 + }, + { + "epoch": 0.4837298331966092, + "grad_norm": 0.152814581990242, + "learning_rate": 5e-05, + "loss": 1.6011, + "step": 1769 + }, + { + "epoch": 0.48400328137817883, + "grad_norm": 0.16164222359657288, + "learning_rate": 5e-05, + "loss": 1.5673, + "step": 1770 + }, + { + "epoch": 0.48427672955974843, + "grad_norm": 0.16344451904296875, + "learning_rate": 5e-05, + "loss": 1.6209, + "step": 1771 + }, + { + "epoch": 0.48455017774131803, + "grad_norm": 0.1603250950574875, + "learning_rate": 5e-05, + "loss": 1.6105, + "step": 1772 + }, + { + "epoch": 0.48482362592288764, + "grad_norm": 0.16991026699543, + "learning_rate": 5e-05, + "loss": 1.6723, + "step": 1773 + }, + { + "epoch": 0.4850970741044572, + "grad_norm": 0.1601463109254837, + "learning_rate": 5e-05, + "loss": 1.5577, + "step": 1774 + }, + { + "epoch": 0.4853705222860268, + "grad_norm": 0.16674864292144775, + "learning_rate": 5e-05, + "loss": 1.6598, + "step": 1775 + }, + { + "epoch": 0.4856439704675964, + "grad_norm": 0.16150332987308502, + "learning_rate": 5e-05, + "loss": 1.6948, + "step": 1776 + }, + { + "epoch": 0.485917418649166, + "grad_norm": 0.16552074253559113, + "learning_rate": 5e-05, + "loss": 1.6746, + "step": 1777 + }, + { + "epoch": 0.4861908668307356, + "grad_norm": 0.1552310287952423, + "learning_rate": 5e-05, + "loss": 1.5961, + "step": 1778 + }, + { + "epoch": 0.4864643150123052, + "grad_norm": 0.16018088161945343, + "learning_rate": 5e-05, + "loss": 1.6024, + "step": 1779 + }, + { + "epoch": 0.48673776319387474, + "grad_norm": 0.16856145858764648, + "learning_rate": 5e-05, + "loss": 1.6187, + "step": 1780 + }, + { + "epoch": 0.48701121137544434, + "grad_norm": 0.16507881879806519, + "learning_rate": 5e-05, + "loss": 1.6644, + "step": 1781 + }, + { + "epoch": 0.48728465955701394, + "grad_norm": 0.1524297147989273, + "learning_rate": 5e-05, + "loss": 1.6013, + "step": 1782 + }, + { + "epoch": 0.48755810773858355, + "grad_norm": 0.16545777022838593, + "learning_rate": 5e-05, + "loss": 1.6813, + "step": 1783 + }, + { + "epoch": 0.48783155592015315, + "grad_norm": 0.17331644892692566, + "learning_rate": 5e-05, + "loss": 1.6797, + "step": 1784 + }, + { + "epoch": 0.48810500410172275, + "grad_norm": 0.16021324694156647, + "learning_rate": 5e-05, + "loss": 1.5828, + "step": 1785 + }, + { + "epoch": 0.4883784522832923, + "grad_norm": 0.15434862673282623, + "learning_rate": 5e-05, + "loss": 1.5449, + "step": 1786 + }, + { + "epoch": 0.4886519004648619, + "grad_norm": 0.15682034194469452, + "learning_rate": 5e-05, + "loss": 1.6503, + "step": 1787 + }, + { + "epoch": 0.4889253486464315, + "grad_norm": 0.17640839517116547, + "learning_rate": 5e-05, + "loss": 1.7292, + "step": 1788 + }, + { + "epoch": 0.4891987968280011, + "grad_norm": 0.16206398606300354, + "learning_rate": 5e-05, + "loss": 1.5844, + "step": 1789 + }, + { + "epoch": 0.4894722450095707, + "grad_norm": 0.15871930122375488, + "learning_rate": 5e-05, + "loss": 1.685, + "step": 1790 + }, + { + "epoch": 0.48974569319114025, + "grad_norm": 0.16885024309158325, + "learning_rate": 5e-05, + "loss": 1.7065, + "step": 1791 + }, + { + "epoch": 0.49001914137270985, + "grad_norm": 0.16324368119239807, + "learning_rate": 5e-05, + "loss": 1.6381, + "step": 1792 + }, + { + "epoch": 0.49029258955427946, + "grad_norm": 0.15525028109550476, + "learning_rate": 5e-05, + "loss": 1.6529, + "step": 1793 + }, + { + "epoch": 0.49056603773584906, + "grad_norm": 0.15970031917095184, + "learning_rate": 5e-05, + "loss": 1.751, + "step": 1794 + }, + { + "epoch": 0.49083948591741866, + "grad_norm": 0.16955524682998657, + "learning_rate": 5e-05, + "loss": 1.7307, + "step": 1795 + }, + { + "epoch": 0.49111293409898826, + "grad_norm": 0.153351828455925, + "learning_rate": 5e-05, + "loss": 1.5379, + "step": 1796 + }, + { + "epoch": 0.4913863822805578, + "grad_norm": 0.16103506088256836, + "learning_rate": 5e-05, + "loss": 1.6945, + "step": 1797 + }, + { + "epoch": 0.4916598304621274, + "grad_norm": 0.17233219742774963, + "learning_rate": 5e-05, + "loss": 1.6656, + "step": 1798 + }, + { + "epoch": 0.491933278643697, + "grad_norm": 0.15539239346981049, + "learning_rate": 5e-05, + "loss": 1.6, + "step": 1799 + }, + { + "epoch": 0.4922067268252666, + "grad_norm": 0.1534857302904129, + "learning_rate": 5e-05, + "loss": 1.5498, + "step": 1800 + }, + { + "epoch": 0.4924801750068362, + "grad_norm": 0.18539227545261383, + "learning_rate": 5e-05, + "loss": 1.5413, + "step": 1801 + }, + { + "epoch": 0.4927536231884058, + "grad_norm": 0.15599320828914642, + "learning_rate": 5e-05, + "loss": 1.7108, + "step": 1802 + }, + { + "epoch": 0.49302707136997537, + "grad_norm": 0.15700851380825043, + "learning_rate": 5e-05, + "loss": 1.6198, + "step": 1803 + }, + { + "epoch": 0.49330051955154497, + "grad_norm": 0.15811684727668762, + "learning_rate": 5e-05, + "loss": 1.5085, + "step": 1804 + }, + { + "epoch": 0.49357396773311457, + "grad_norm": 0.15955877304077148, + "learning_rate": 5e-05, + "loss": 1.5673, + "step": 1805 + }, + { + "epoch": 0.4938474159146842, + "grad_norm": 0.15809409320354462, + "learning_rate": 5e-05, + "loss": 1.6075, + "step": 1806 + }, + { + "epoch": 0.4941208640962538, + "grad_norm": 0.17937301099300385, + "learning_rate": 5e-05, + "loss": 1.5832, + "step": 1807 + }, + { + "epoch": 0.4943943122778234, + "grad_norm": 0.16430814564228058, + "learning_rate": 5e-05, + "loss": 1.6945, + "step": 1808 + }, + { + "epoch": 0.4946677604593929, + "grad_norm": 0.1564474105834961, + "learning_rate": 5e-05, + "loss": 1.6259, + "step": 1809 + }, + { + "epoch": 0.4949412086409625, + "grad_norm": 0.17254872620105743, + "learning_rate": 5e-05, + "loss": 1.7017, + "step": 1810 + }, + { + "epoch": 0.49521465682253213, + "grad_norm": 0.16017508506774902, + "learning_rate": 5e-05, + "loss": 1.6124, + "step": 1811 + }, + { + "epoch": 0.49548810500410173, + "grad_norm": 0.15552417933940887, + "learning_rate": 5e-05, + "loss": 1.5943, + "step": 1812 + }, + { + "epoch": 0.49576155318567133, + "grad_norm": 0.14992982149124146, + "learning_rate": 5e-05, + "loss": 1.5989, + "step": 1813 + }, + { + "epoch": 0.49603500136724094, + "grad_norm": 0.14720216393470764, + "learning_rate": 5e-05, + "loss": 1.4911, + "step": 1814 + }, + { + "epoch": 0.4963084495488105, + "grad_norm": 0.15855912864208221, + "learning_rate": 5e-05, + "loss": 1.5896, + "step": 1815 + }, + { + "epoch": 0.4965818977303801, + "grad_norm": 0.1551193743944168, + "learning_rate": 5e-05, + "loss": 1.58, + "step": 1816 + }, + { + "epoch": 0.4968553459119497, + "grad_norm": 0.15198324620723724, + "learning_rate": 5e-05, + "loss": 1.6181, + "step": 1817 + }, + { + "epoch": 0.4971287940935193, + "grad_norm": 0.1556388884782791, + "learning_rate": 5e-05, + "loss": 1.6119, + "step": 1818 + }, + { + "epoch": 0.4974022422750889, + "grad_norm": 0.1501542180776596, + "learning_rate": 5e-05, + "loss": 1.5566, + "step": 1819 + }, + { + "epoch": 0.49767569045665844, + "grad_norm": 0.1648813784122467, + "learning_rate": 5e-05, + "loss": 1.6296, + "step": 1820 + }, + { + "epoch": 0.49794913863822804, + "grad_norm": 0.15707071125507355, + "learning_rate": 5e-05, + "loss": 1.6604, + "step": 1821 + }, + { + "epoch": 0.49822258681979764, + "grad_norm": 0.1539454311132431, + "learning_rate": 5e-05, + "loss": 1.6051, + "step": 1822 + }, + { + "epoch": 0.49849603500136724, + "grad_norm": 0.15157034993171692, + "learning_rate": 5e-05, + "loss": 1.6315, + "step": 1823 + }, + { + "epoch": 0.49876948318293685, + "grad_norm": 0.15487664937973022, + "learning_rate": 5e-05, + "loss": 1.676, + "step": 1824 + }, + { + "epoch": 0.49904293136450645, + "grad_norm": 0.15155303478240967, + "learning_rate": 5e-05, + "loss": 1.5685, + "step": 1825 + }, + { + "epoch": 0.499316379546076, + "grad_norm": 0.14079029858112335, + "learning_rate": 5e-05, + "loss": 1.5199, + "step": 1826 + }, + { + "epoch": 0.4995898277276456, + "grad_norm": 0.15700970590114594, + "learning_rate": 5e-05, + "loss": 1.6312, + "step": 1827 + }, + { + "epoch": 0.4998632759092152, + "grad_norm": 0.1630944162607193, + "learning_rate": 5e-05, + "loss": 1.7161, + "step": 1828 + }, + { + "epoch": 0.5001367240907848, + "grad_norm": 0.15556710958480835, + "learning_rate": 5e-05, + "loss": 1.6417, + "step": 1829 + }, + { + "epoch": 0.5004101722723544, + "grad_norm": 0.15315458178520203, + "learning_rate": 5e-05, + "loss": 1.6343, + "step": 1830 + }, + { + "epoch": 0.500683620453924, + "grad_norm": 0.15903054177761078, + "learning_rate": 5e-05, + "loss": 1.6433, + "step": 1831 + }, + { + "epoch": 0.5009570686354936, + "grad_norm": 0.16064853966236115, + "learning_rate": 5e-05, + "loss": 1.6739, + "step": 1832 + }, + { + "epoch": 0.5012305168170632, + "grad_norm": 0.14736616611480713, + "learning_rate": 5e-05, + "loss": 1.547, + "step": 1833 + }, + { + "epoch": 0.5015039649986328, + "grad_norm": 0.17235925793647766, + "learning_rate": 5e-05, + "loss": 1.5768, + "step": 1834 + }, + { + "epoch": 0.5017774131802023, + "grad_norm": 0.15816731750965118, + "learning_rate": 5e-05, + "loss": 1.6226, + "step": 1835 + }, + { + "epoch": 0.5020508613617719, + "grad_norm": 0.15823869407176971, + "learning_rate": 5e-05, + "loss": 1.6582, + "step": 1836 + }, + { + "epoch": 0.5023243095433415, + "grad_norm": 0.14915357530117035, + "learning_rate": 5e-05, + "loss": 1.5551, + "step": 1837 + }, + { + "epoch": 0.5025977577249111, + "grad_norm": 0.15896442532539368, + "learning_rate": 5e-05, + "loss": 1.6161, + "step": 1838 + }, + { + "epoch": 0.5028712059064807, + "grad_norm": 0.15622593462467194, + "learning_rate": 5e-05, + "loss": 1.5816, + "step": 1839 + }, + { + "epoch": 0.5031446540880503, + "grad_norm": 0.16117733716964722, + "learning_rate": 5e-05, + "loss": 1.6266, + "step": 1840 + }, + { + "epoch": 0.5034181022696199, + "grad_norm": 0.15878431499004364, + "learning_rate": 5e-05, + "loss": 1.6835, + "step": 1841 + }, + { + "epoch": 0.5036915504511895, + "grad_norm": 0.15273039042949677, + "learning_rate": 5e-05, + "loss": 1.6389, + "step": 1842 + }, + { + "epoch": 0.5039649986327591, + "grad_norm": 0.1612463891506195, + "learning_rate": 5e-05, + "loss": 1.6066, + "step": 1843 + }, + { + "epoch": 0.5042384468143287, + "grad_norm": 0.17296503484249115, + "learning_rate": 5e-05, + "loss": 1.664, + "step": 1844 + }, + { + "epoch": 0.5045118949958983, + "grad_norm": 0.1609872728586197, + "learning_rate": 5e-05, + "loss": 1.727, + "step": 1845 + }, + { + "epoch": 0.5047853431774678, + "grad_norm": 0.18367880582809448, + "learning_rate": 5e-05, + "loss": 1.6324, + "step": 1846 + }, + { + "epoch": 0.5050587913590374, + "grad_norm": 0.1622532606124878, + "learning_rate": 5e-05, + "loss": 1.6035, + "step": 1847 + }, + { + "epoch": 0.505332239540607, + "grad_norm": 0.16167710721492767, + "learning_rate": 5e-05, + "loss": 1.7208, + "step": 1848 + }, + { + "epoch": 0.5056056877221766, + "grad_norm": 0.17007885873317719, + "learning_rate": 5e-05, + "loss": 1.4988, + "step": 1849 + }, + { + "epoch": 0.5058791359037462, + "grad_norm": 0.1878514140844345, + "learning_rate": 5e-05, + "loss": 1.698, + "step": 1850 + }, + { + "epoch": 0.5061525840853158, + "grad_norm": 0.1594289392232895, + "learning_rate": 5e-05, + "loss": 1.6375, + "step": 1851 + }, + { + "epoch": 0.5064260322668854, + "grad_norm": 0.17619235813617706, + "learning_rate": 5e-05, + "loss": 1.5621, + "step": 1852 + }, + { + "epoch": 0.506699480448455, + "grad_norm": 0.15811192989349365, + "learning_rate": 5e-05, + "loss": 1.4968, + "step": 1853 + }, + { + "epoch": 0.5069729286300246, + "grad_norm": 0.15754997730255127, + "learning_rate": 5e-05, + "loss": 1.6578, + "step": 1854 + }, + { + "epoch": 0.5072463768115942, + "grad_norm": 0.17825719714164734, + "learning_rate": 5e-05, + "loss": 1.6794, + "step": 1855 + }, + { + "epoch": 0.5075198249931638, + "grad_norm": 0.1577797830104828, + "learning_rate": 5e-05, + "loss": 1.5553, + "step": 1856 + }, + { + "epoch": 0.5077932731747334, + "grad_norm": 0.164150670170784, + "learning_rate": 5e-05, + "loss": 1.6357, + "step": 1857 + }, + { + "epoch": 0.5080667213563029, + "grad_norm": 0.16957907378673553, + "learning_rate": 5e-05, + "loss": 1.6714, + "step": 1858 + }, + { + "epoch": 0.5083401695378725, + "grad_norm": 0.15430040657520294, + "learning_rate": 5e-05, + "loss": 1.5349, + "step": 1859 + }, + { + "epoch": 0.5086136177194421, + "grad_norm": 0.1596827656030655, + "learning_rate": 5e-05, + "loss": 1.6752, + "step": 1860 + }, + { + "epoch": 0.5088870659010117, + "grad_norm": 0.15648046135902405, + "learning_rate": 5e-05, + "loss": 1.5519, + "step": 1861 + }, + { + "epoch": 0.5091605140825813, + "grad_norm": 0.154343381524086, + "learning_rate": 5e-05, + "loss": 1.5832, + "step": 1862 + }, + { + "epoch": 0.5094339622641509, + "grad_norm": 0.15734641253948212, + "learning_rate": 5e-05, + "loss": 1.6465, + "step": 1863 + }, + { + "epoch": 0.5097074104457205, + "grad_norm": 0.16264241933822632, + "learning_rate": 5e-05, + "loss": 1.7062, + "step": 1864 + }, + { + "epoch": 0.5099808586272901, + "grad_norm": 0.1580963432788849, + "learning_rate": 5e-05, + "loss": 1.6073, + "step": 1865 + }, + { + "epoch": 0.5102543068088597, + "grad_norm": 0.15470340847969055, + "learning_rate": 5e-05, + "loss": 1.591, + "step": 1866 + }, + { + "epoch": 0.5105277549904293, + "grad_norm": 0.18182386457920074, + "learning_rate": 5e-05, + "loss": 1.7503, + "step": 1867 + }, + { + "epoch": 0.510801203171999, + "grad_norm": 0.16213788092136383, + "learning_rate": 5e-05, + "loss": 1.5868, + "step": 1868 + }, + { + "epoch": 0.5110746513535686, + "grad_norm": 0.1534777283668518, + "learning_rate": 5e-05, + "loss": 1.5489, + "step": 1869 + }, + { + "epoch": 0.511348099535138, + "grad_norm": 0.1524716168642044, + "learning_rate": 5e-05, + "loss": 1.5555, + "step": 1870 + }, + { + "epoch": 0.5116215477167076, + "grad_norm": 0.15701556205749512, + "learning_rate": 5e-05, + "loss": 1.5622, + "step": 1871 + }, + { + "epoch": 0.5118949958982772, + "grad_norm": 0.1485898792743683, + "learning_rate": 5e-05, + "loss": 1.5619, + "step": 1872 + }, + { + "epoch": 0.5121684440798469, + "grad_norm": 0.16039031744003296, + "learning_rate": 5e-05, + "loss": 1.6172, + "step": 1873 + }, + { + "epoch": 0.5124418922614165, + "grad_norm": 0.167929545044899, + "learning_rate": 5e-05, + "loss": 1.6437, + "step": 1874 + }, + { + "epoch": 0.5127153404429861, + "grad_norm": 0.16350795328617096, + "learning_rate": 5e-05, + "loss": 1.6416, + "step": 1875 + }, + { + "epoch": 0.5129887886245557, + "grad_norm": 0.15972229838371277, + "learning_rate": 5e-05, + "loss": 1.7117, + "step": 1876 + }, + { + "epoch": 0.5132622368061253, + "grad_norm": 0.15788406133651733, + "learning_rate": 5e-05, + "loss": 1.5657, + "step": 1877 + }, + { + "epoch": 0.5135356849876949, + "grad_norm": 0.1540132462978363, + "learning_rate": 5e-05, + "loss": 1.6087, + "step": 1878 + }, + { + "epoch": 0.5138091331692645, + "grad_norm": 0.16247431933879852, + "learning_rate": 5e-05, + "loss": 1.6432, + "step": 1879 + }, + { + "epoch": 0.5140825813508341, + "grad_norm": 0.15716880559921265, + "learning_rate": 5e-05, + "loss": 1.5565, + "step": 1880 + }, + { + "epoch": 0.5143560295324036, + "grad_norm": 0.15379799902439117, + "learning_rate": 5e-05, + "loss": 1.6237, + "step": 1881 + }, + { + "epoch": 0.5146294777139732, + "grad_norm": 0.15428978204727173, + "learning_rate": 5e-05, + "loss": 1.5128, + "step": 1882 + }, + { + "epoch": 0.5149029258955428, + "grad_norm": 0.15110410749912262, + "learning_rate": 5e-05, + "loss": 1.5246, + "step": 1883 + }, + { + "epoch": 0.5151763740771124, + "grad_norm": 0.15420377254486084, + "learning_rate": 5e-05, + "loss": 1.6952, + "step": 1884 + }, + { + "epoch": 0.515449822258682, + "grad_norm": 0.15766222774982452, + "learning_rate": 5e-05, + "loss": 1.6611, + "step": 1885 + }, + { + "epoch": 0.5157232704402516, + "grad_norm": 0.1532919555902481, + "learning_rate": 5e-05, + "loss": 1.5322, + "step": 1886 + }, + { + "epoch": 0.5159967186218212, + "grad_norm": 0.15919700264930725, + "learning_rate": 5e-05, + "loss": 1.6387, + "step": 1887 + }, + { + "epoch": 0.5162701668033908, + "grad_norm": 0.15913300216197968, + "learning_rate": 5e-05, + "loss": 1.6679, + "step": 1888 + }, + { + "epoch": 0.5165436149849604, + "grad_norm": 0.16154128313064575, + "learning_rate": 5e-05, + "loss": 1.72, + "step": 1889 + }, + { + "epoch": 0.51681706316653, + "grad_norm": 0.16438822448253632, + "learning_rate": 5e-05, + "loss": 1.7099, + "step": 1890 + }, + { + "epoch": 0.5170905113480996, + "grad_norm": 0.1546657532453537, + "learning_rate": 5e-05, + "loss": 1.6098, + "step": 1891 + }, + { + "epoch": 0.5173639595296692, + "grad_norm": 0.15779191255569458, + "learning_rate": 5e-05, + "loss": 1.6176, + "step": 1892 + }, + { + "epoch": 0.5176374077112387, + "grad_norm": 0.15966758131980896, + "learning_rate": 5e-05, + "loss": 1.665, + "step": 1893 + }, + { + "epoch": 0.5179108558928083, + "grad_norm": 0.17288130521774292, + "learning_rate": 5e-05, + "loss": 1.6342, + "step": 1894 + }, + { + "epoch": 0.5181843040743779, + "grad_norm": 0.16509395837783813, + "learning_rate": 5e-05, + "loss": 1.649, + "step": 1895 + }, + { + "epoch": 0.5184577522559475, + "grad_norm": 0.14995107054710388, + "learning_rate": 5e-05, + "loss": 1.4966, + "step": 1896 + }, + { + "epoch": 0.5187312004375171, + "grad_norm": 0.15514351427555084, + "learning_rate": 5e-05, + "loss": 1.5691, + "step": 1897 + }, + { + "epoch": 0.5190046486190867, + "grad_norm": 0.15947668254375458, + "learning_rate": 5e-05, + "loss": 1.534, + "step": 1898 + }, + { + "epoch": 0.5192780968006563, + "grad_norm": 0.16156336665153503, + "learning_rate": 5e-05, + "loss": 1.7277, + "step": 1899 + }, + { + "epoch": 0.5195515449822259, + "grad_norm": 0.17509359121322632, + "learning_rate": 5e-05, + "loss": 1.8131, + "step": 1900 + }, + { + "epoch": 0.5198249931637955, + "grad_norm": 0.1574561595916748, + "learning_rate": 5e-05, + "loss": 1.618, + "step": 1901 + }, + { + "epoch": 0.5200984413453651, + "grad_norm": 0.1618000864982605, + "learning_rate": 5e-05, + "loss": 1.5925, + "step": 1902 + }, + { + "epoch": 0.5203718895269347, + "grad_norm": 0.15169738233089447, + "learning_rate": 5e-05, + "loss": 1.6704, + "step": 1903 + }, + { + "epoch": 0.5206453377085042, + "grad_norm": 0.15747463703155518, + "learning_rate": 5e-05, + "loss": 1.6319, + "step": 1904 + }, + { + "epoch": 0.5209187858900738, + "grad_norm": 0.15733812749385834, + "learning_rate": 5e-05, + "loss": 1.6501, + "step": 1905 + }, + { + "epoch": 0.5211922340716434, + "grad_norm": 0.15596060454845428, + "learning_rate": 5e-05, + "loss": 1.6661, + "step": 1906 + }, + { + "epoch": 0.521465682253213, + "grad_norm": 0.15629130601882935, + "learning_rate": 5e-05, + "loss": 1.652, + "step": 1907 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.15206284821033478, + "learning_rate": 5e-05, + "loss": 1.5696, + "step": 1908 + }, + { + "epoch": 0.5220125786163522, + "grad_norm": 0.16571788489818573, + "learning_rate": 5e-05, + "loss": 1.6589, + "step": 1909 + }, + { + "epoch": 0.5222860267979218, + "grad_norm": 0.1548844426870346, + "learning_rate": 5e-05, + "loss": 1.6127, + "step": 1910 + }, + { + "epoch": 0.5225594749794914, + "grad_norm": 0.16375820338726044, + "learning_rate": 5e-05, + "loss": 1.5907, + "step": 1911 + }, + { + "epoch": 0.522832923161061, + "grad_norm": 0.15710307657718658, + "learning_rate": 5e-05, + "loss": 1.5947, + "step": 1912 + }, + { + "epoch": 0.5231063713426306, + "grad_norm": 0.15390416979789734, + "learning_rate": 5e-05, + "loss": 1.6072, + "step": 1913 + }, + { + "epoch": 0.5233798195242002, + "grad_norm": 0.15938615798950195, + "learning_rate": 5e-05, + "loss": 1.6348, + "step": 1914 + }, + { + "epoch": 0.5236532677057698, + "grad_norm": 0.14976823329925537, + "learning_rate": 5e-05, + "loss": 1.5721, + "step": 1915 + }, + { + "epoch": 0.5239267158873393, + "grad_norm": 0.15962772071361542, + "learning_rate": 5e-05, + "loss": 1.6077, + "step": 1916 + }, + { + "epoch": 0.5242001640689089, + "grad_norm": 0.1549055576324463, + "learning_rate": 5e-05, + "loss": 1.5865, + "step": 1917 + }, + { + "epoch": 0.5244736122504785, + "grad_norm": 0.1605559140443802, + "learning_rate": 5e-05, + "loss": 1.5902, + "step": 1918 + }, + { + "epoch": 0.5247470604320481, + "grad_norm": 0.15829849243164062, + "learning_rate": 5e-05, + "loss": 1.6124, + "step": 1919 + }, + { + "epoch": 0.5250205086136177, + "grad_norm": 0.1654774397611618, + "learning_rate": 5e-05, + "loss": 1.6368, + "step": 1920 + }, + { + "epoch": 0.5252939567951873, + "grad_norm": 0.16315412521362305, + "learning_rate": 5e-05, + "loss": 1.6415, + "step": 1921 + }, + { + "epoch": 0.5255674049767569, + "grad_norm": 0.14765046536922455, + "learning_rate": 5e-05, + "loss": 1.6004, + "step": 1922 + }, + { + "epoch": 0.5258408531583265, + "grad_norm": 0.15793031454086304, + "learning_rate": 5e-05, + "loss": 1.6902, + "step": 1923 + }, + { + "epoch": 0.5261143013398961, + "grad_norm": 0.165854349732399, + "learning_rate": 5e-05, + "loss": 1.6598, + "step": 1924 + }, + { + "epoch": 0.5263877495214657, + "grad_norm": 0.15993940830230713, + "learning_rate": 5e-05, + "loss": 1.6446, + "step": 1925 + }, + { + "epoch": 0.5266611977030353, + "grad_norm": 0.1579129993915558, + "learning_rate": 5e-05, + "loss": 1.6392, + "step": 1926 + }, + { + "epoch": 0.5269346458846048, + "grad_norm": 0.1714421659708023, + "learning_rate": 5e-05, + "loss": 1.6083, + "step": 1927 + }, + { + "epoch": 0.5272080940661744, + "grad_norm": 0.15757611393928528, + "learning_rate": 5e-05, + "loss": 1.6344, + "step": 1928 + }, + { + "epoch": 0.527481542247744, + "grad_norm": 0.15477389097213745, + "learning_rate": 5e-05, + "loss": 1.6428, + "step": 1929 + }, + { + "epoch": 0.5277549904293136, + "grad_norm": 0.14917831122875214, + "learning_rate": 5e-05, + "loss": 1.4957, + "step": 1930 + }, + { + "epoch": 0.5280284386108832, + "grad_norm": 0.17004439234733582, + "learning_rate": 5e-05, + "loss": 1.7057, + "step": 1931 + }, + { + "epoch": 0.5283018867924528, + "grad_norm": 0.1737544983625412, + "learning_rate": 5e-05, + "loss": 1.6653, + "step": 1932 + }, + { + "epoch": 0.5285753349740224, + "grad_norm": 0.15638841688632965, + "learning_rate": 5e-05, + "loss": 1.581, + "step": 1933 + }, + { + "epoch": 0.528848783155592, + "grad_norm": 0.19510416686534882, + "learning_rate": 5e-05, + "loss": 1.7202, + "step": 1934 + }, + { + "epoch": 0.5291222313371616, + "grad_norm": 0.16090962290763855, + "learning_rate": 5e-05, + "loss": 1.6374, + "step": 1935 + }, + { + "epoch": 0.5293956795187312, + "grad_norm": 0.1696002036333084, + "learning_rate": 5e-05, + "loss": 1.7889, + "step": 1936 + }, + { + "epoch": 0.5296691277003008, + "grad_norm": 0.16609039902687073, + "learning_rate": 5e-05, + "loss": 1.6189, + "step": 1937 + }, + { + "epoch": 0.5299425758818704, + "grad_norm": 0.1632707715034485, + "learning_rate": 5e-05, + "loss": 1.6715, + "step": 1938 + }, + { + "epoch": 0.5302160240634399, + "grad_norm": 0.1675024926662445, + "learning_rate": 5e-05, + "loss": 1.6406, + "step": 1939 + }, + { + "epoch": 0.5304894722450095, + "grad_norm": 0.1700305938720703, + "learning_rate": 5e-05, + "loss": 1.5702, + "step": 1940 + }, + { + "epoch": 0.5307629204265791, + "grad_norm": 0.16249525547027588, + "learning_rate": 5e-05, + "loss": 1.6847, + "step": 1941 + }, + { + "epoch": 0.5310363686081487, + "grad_norm": 0.16194301843643188, + "learning_rate": 5e-05, + "loss": 1.5392, + "step": 1942 + }, + { + "epoch": 0.5313098167897183, + "grad_norm": 0.16195613145828247, + "learning_rate": 5e-05, + "loss": 1.6185, + "step": 1943 + }, + { + "epoch": 0.5315832649712879, + "grad_norm": 0.15754897892475128, + "learning_rate": 5e-05, + "loss": 1.6188, + "step": 1944 + }, + { + "epoch": 0.5318567131528575, + "grad_norm": 0.16981816291809082, + "learning_rate": 5e-05, + "loss": 1.6742, + "step": 1945 + }, + { + "epoch": 0.5321301613344271, + "grad_norm": 0.16634345054626465, + "learning_rate": 5e-05, + "loss": 1.5054, + "step": 1946 + }, + { + "epoch": 0.5324036095159967, + "grad_norm": 0.16170655190944672, + "learning_rate": 5e-05, + "loss": 1.6678, + "step": 1947 + }, + { + "epoch": 0.5326770576975663, + "grad_norm": 0.1492849886417389, + "learning_rate": 5e-05, + "loss": 1.6136, + "step": 1948 + }, + { + "epoch": 0.532950505879136, + "grad_norm": 0.18234175443649292, + "learning_rate": 5e-05, + "loss": 1.7196, + "step": 1949 + }, + { + "epoch": 0.5332239540607056, + "grad_norm": 0.158365860581398, + "learning_rate": 5e-05, + "loss": 1.4944, + "step": 1950 + }, + { + "epoch": 0.533497402242275, + "grad_norm": 0.1587635576725006, + "learning_rate": 5e-05, + "loss": 1.6334, + "step": 1951 + }, + { + "epoch": 0.5337708504238446, + "grad_norm": 0.18955689668655396, + "learning_rate": 5e-05, + "loss": 1.6253, + "step": 1952 + }, + { + "epoch": 0.5340442986054142, + "grad_norm": 0.1624482125043869, + "learning_rate": 5e-05, + "loss": 1.6511, + "step": 1953 + }, + { + "epoch": 0.5343177467869838, + "grad_norm": 0.1539737582206726, + "learning_rate": 5e-05, + "loss": 1.4702, + "step": 1954 + }, + { + "epoch": 0.5345911949685535, + "grad_norm": 0.17508608102798462, + "learning_rate": 5e-05, + "loss": 1.6406, + "step": 1955 + }, + { + "epoch": 0.534864643150123, + "grad_norm": 0.15725727379322052, + "learning_rate": 5e-05, + "loss": 1.5327, + "step": 1956 + }, + { + "epoch": 0.5351380913316927, + "grad_norm": 0.16343533992767334, + "learning_rate": 5e-05, + "loss": 1.6075, + "step": 1957 + }, + { + "epoch": 0.5354115395132623, + "grad_norm": 0.16122648119926453, + "learning_rate": 5e-05, + "loss": 1.577, + "step": 1958 + }, + { + "epoch": 0.5356849876948319, + "grad_norm": 0.16027076542377472, + "learning_rate": 5e-05, + "loss": 1.6378, + "step": 1959 + }, + { + "epoch": 0.5359584358764015, + "grad_norm": 0.15738053619861603, + "learning_rate": 5e-05, + "loss": 1.702, + "step": 1960 + }, + { + "epoch": 0.5362318840579711, + "grad_norm": 0.15926112234592438, + "learning_rate": 5e-05, + "loss": 1.6522, + "step": 1961 + }, + { + "epoch": 0.5365053322395406, + "grad_norm": 0.15066231787204742, + "learning_rate": 5e-05, + "loss": 1.4936, + "step": 1962 + }, + { + "epoch": 0.5367787804211102, + "grad_norm": 0.14897476136684418, + "learning_rate": 5e-05, + "loss": 1.5098, + "step": 1963 + }, + { + "epoch": 0.5370522286026798, + "grad_norm": 0.1546986699104309, + "learning_rate": 5e-05, + "loss": 1.5904, + "step": 1964 + }, + { + "epoch": 0.5373256767842494, + "grad_norm": 0.15693390369415283, + "learning_rate": 5e-05, + "loss": 1.6098, + "step": 1965 + }, + { + "epoch": 0.537599124965819, + "grad_norm": 0.16184082627296448, + "learning_rate": 5e-05, + "loss": 1.7093, + "step": 1966 + }, + { + "epoch": 0.5378725731473886, + "grad_norm": 0.157283753156662, + "learning_rate": 5e-05, + "loss": 1.7046, + "step": 1967 + }, + { + "epoch": 0.5381460213289582, + "grad_norm": 0.16195395588874817, + "learning_rate": 5e-05, + "loss": 1.6401, + "step": 1968 + }, + { + "epoch": 0.5384194695105278, + "grad_norm": 0.16151578724384308, + "learning_rate": 5e-05, + "loss": 1.6325, + "step": 1969 + }, + { + "epoch": 0.5386929176920974, + "grad_norm": 0.1499180942773819, + "learning_rate": 5e-05, + "loss": 1.5444, + "step": 1970 + }, + { + "epoch": 0.538966365873667, + "grad_norm": 0.16066747903823853, + "learning_rate": 5e-05, + "loss": 1.5921, + "step": 1971 + }, + { + "epoch": 0.5392398140552366, + "grad_norm": 0.15843364596366882, + "learning_rate": 5e-05, + "loss": 1.5952, + "step": 1972 + }, + { + "epoch": 0.5395132622368062, + "grad_norm": 0.15702027082443237, + "learning_rate": 5e-05, + "loss": 1.5061, + "step": 1973 + }, + { + "epoch": 0.5397867104183757, + "grad_norm": 0.15099968016147614, + "learning_rate": 5e-05, + "loss": 1.6227, + "step": 1974 + }, + { + "epoch": 0.5400601585999453, + "grad_norm": 0.15741010010242462, + "learning_rate": 5e-05, + "loss": 1.647, + "step": 1975 + }, + { + "epoch": 0.5403336067815149, + "grad_norm": 0.1735711693763733, + "learning_rate": 5e-05, + "loss": 1.5785, + "step": 1976 + }, + { + "epoch": 0.5406070549630845, + "grad_norm": 0.1666189730167389, + "learning_rate": 5e-05, + "loss": 1.6775, + "step": 1977 + }, + { + "epoch": 0.5408805031446541, + "grad_norm": 0.17314961552619934, + "learning_rate": 5e-05, + "loss": 1.6201, + "step": 1978 + }, + { + "epoch": 0.5411539513262237, + "grad_norm": 0.17330363392829895, + "learning_rate": 5e-05, + "loss": 1.6672, + "step": 1979 + }, + { + "epoch": 0.5414273995077933, + "grad_norm": 0.16131727397441864, + "learning_rate": 5e-05, + "loss": 1.6451, + "step": 1980 + }, + { + "epoch": 0.5417008476893629, + "grad_norm": 0.16308879852294922, + "learning_rate": 5e-05, + "loss": 1.5366, + "step": 1981 + }, + { + "epoch": 0.5419742958709325, + "grad_norm": 0.1608009785413742, + "learning_rate": 5e-05, + "loss": 1.636, + "step": 1982 + }, + { + "epoch": 0.5422477440525021, + "grad_norm": 0.1665000170469284, + "learning_rate": 5e-05, + "loss": 1.6159, + "step": 1983 + }, + { + "epoch": 0.5425211922340717, + "grad_norm": 0.15207409858703613, + "learning_rate": 5e-05, + "loss": 1.5366, + "step": 1984 + }, + { + "epoch": 0.5427946404156412, + "grad_norm": 0.17597833275794983, + "learning_rate": 5e-05, + "loss": 1.6525, + "step": 1985 + }, + { + "epoch": 0.5430680885972108, + "grad_norm": 0.15393896400928497, + "learning_rate": 5e-05, + "loss": 1.5363, + "step": 1986 + }, + { + "epoch": 0.5433415367787804, + "grad_norm": 0.17519448697566986, + "learning_rate": 5e-05, + "loss": 1.6806, + "step": 1987 + }, + { + "epoch": 0.54361498496035, + "grad_norm": 0.1612955778837204, + "learning_rate": 5e-05, + "loss": 1.531, + "step": 1988 + }, + { + "epoch": 0.5438884331419196, + "grad_norm": 0.15321476757526398, + "learning_rate": 5e-05, + "loss": 1.4887, + "step": 1989 + }, + { + "epoch": 0.5441618813234892, + "grad_norm": 0.1556374877691269, + "learning_rate": 5e-05, + "loss": 1.625, + "step": 1990 + }, + { + "epoch": 0.5444353295050588, + "grad_norm": 0.1725417673587799, + "learning_rate": 5e-05, + "loss": 1.6286, + "step": 1991 + }, + { + "epoch": 0.5447087776866284, + "grad_norm": 0.15200626850128174, + "learning_rate": 5e-05, + "loss": 1.5596, + "step": 1992 + }, + { + "epoch": 0.544982225868198, + "grad_norm": 0.1692056953907013, + "learning_rate": 5e-05, + "loss": 1.7184, + "step": 1993 + }, + { + "epoch": 0.5452556740497676, + "grad_norm": 0.15530715882778168, + "learning_rate": 5e-05, + "loss": 1.493, + "step": 1994 + }, + { + "epoch": 0.5455291222313372, + "grad_norm": 0.1559220254421234, + "learning_rate": 5e-05, + "loss": 1.5517, + "step": 1995 + }, + { + "epoch": 0.5458025704129068, + "grad_norm": 0.15481142699718475, + "learning_rate": 5e-05, + "loss": 1.5246, + "step": 1996 + }, + { + "epoch": 0.5460760185944763, + "grad_norm": 0.1693953573703766, + "learning_rate": 5e-05, + "loss": 1.6766, + "step": 1997 + }, + { + "epoch": 0.5463494667760459, + "grad_norm": 0.14999975264072418, + "learning_rate": 5e-05, + "loss": 1.5715, + "step": 1998 + }, + { + "epoch": 0.5466229149576155, + "grad_norm": 0.17395493388175964, + "learning_rate": 5e-05, + "loss": 1.6317, + "step": 1999 + }, + { + "epoch": 0.5468963631391851, + "grad_norm": 0.16458867490291595, + "learning_rate": 5e-05, + "loss": 1.5627, + "step": 2000 + }, + { + "epoch": 0.5471698113207547, + "grad_norm": 0.15648111701011658, + "learning_rate": 5e-05, + "loss": 1.6565, + "step": 2001 + }, + { + "epoch": 0.5474432595023243, + "grad_norm": 0.16169799864292145, + "learning_rate": 5e-05, + "loss": 1.6626, + "step": 2002 + }, + { + "epoch": 0.5477167076838939, + "grad_norm": 0.17330998182296753, + "learning_rate": 5e-05, + "loss": 1.6268, + "step": 2003 + }, + { + "epoch": 0.5479901558654635, + "grad_norm": 0.1588200032711029, + "learning_rate": 5e-05, + "loss": 1.5532, + "step": 2004 + }, + { + "epoch": 0.5482636040470331, + "grad_norm": 0.16601422429084778, + "learning_rate": 5e-05, + "loss": 1.5976, + "step": 2005 + }, + { + "epoch": 0.5485370522286027, + "grad_norm": 0.17725588381290436, + "learning_rate": 5e-05, + "loss": 1.6347, + "step": 2006 + }, + { + "epoch": 0.5488105004101723, + "grad_norm": 0.15480291843414307, + "learning_rate": 5e-05, + "loss": 1.585, + "step": 2007 + }, + { + "epoch": 0.5490839485917418, + "grad_norm": 0.1640474647283554, + "learning_rate": 5e-05, + "loss": 1.4857, + "step": 2008 + }, + { + "epoch": 0.5493573967733114, + "grad_norm": 0.17515790462493896, + "learning_rate": 5e-05, + "loss": 1.6093, + "step": 2009 + }, + { + "epoch": 0.549630844954881, + "grad_norm": 0.15881620347499847, + "learning_rate": 5e-05, + "loss": 1.5243, + "step": 2010 + }, + { + "epoch": 0.5499042931364506, + "grad_norm": 0.1577688604593277, + "learning_rate": 5e-05, + "loss": 1.6015, + "step": 2011 + }, + { + "epoch": 0.5501777413180202, + "grad_norm": 0.16591089963912964, + "learning_rate": 5e-05, + "loss": 1.6122, + "step": 2012 + }, + { + "epoch": 0.5504511894995898, + "grad_norm": 0.16563619673252106, + "learning_rate": 5e-05, + "loss": 1.6371, + "step": 2013 + }, + { + "epoch": 0.5507246376811594, + "grad_norm": 0.17261956632137299, + "learning_rate": 5e-05, + "loss": 1.6834, + "step": 2014 + }, + { + "epoch": 0.550998085862729, + "grad_norm": 0.15990975499153137, + "learning_rate": 5e-05, + "loss": 1.6495, + "step": 2015 + }, + { + "epoch": 0.5512715340442986, + "grad_norm": 0.15920180082321167, + "learning_rate": 5e-05, + "loss": 1.6299, + "step": 2016 + }, + { + "epoch": 0.5515449822258682, + "grad_norm": 0.15567980706691742, + "learning_rate": 5e-05, + "loss": 1.5456, + "step": 2017 + }, + { + "epoch": 0.5518184304074378, + "grad_norm": 0.155466690659523, + "learning_rate": 5e-05, + "loss": 1.6061, + "step": 2018 + }, + { + "epoch": 0.5520918785890074, + "grad_norm": 0.16620704531669617, + "learning_rate": 5e-05, + "loss": 1.6645, + "step": 2019 + }, + { + "epoch": 0.5523653267705769, + "grad_norm": 0.16219502687454224, + "learning_rate": 5e-05, + "loss": 1.7455, + "step": 2020 + }, + { + "epoch": 0.5526387749521465, + "grad_norm": 0.1692618727684021, + "learning_rate": 5e-05, + "loss": 1.6036, + "step": 2021 + }, + { + "epoch": 0.5529122231337161, + "grad_norm": 0.1535937339067459, + "learning_rate": 5e-05, + "loss": 1.614, + "step": 2022 + }, + { + "epoch": 0.5531856713152857, + "grad_norm": 0.16089798510074615, + "learning_rate": 5e-05, + "loss": 1.585, + "step": 2023 + }, + { + "epoch": 0.5534591194968553, + "grad_norm": 0.16011877357959747, + "learning_rate": 5e-05, + "loss": 1.6358, + "step": 2024 + }, + { + "epoch": 0.5537325676784249, + "grad_norm": 0.1536594033241272, + "learning_rate": 5e-05, + "loss": 1.6066, + "step": 2025 + }, + { + "epoch": 0.5540060158599945, + "grad_norm": 0.15482646226882935, + "learning_rate": 5e-05, + "loss": 1.5291, + "step": 2026 + }, + { + "epoch": 0.5542794640415641, + "grad_norm": 0.16264241933822632, + "learning_rate": 5e-05, + "loss": 1.5984, + "step": 2027 + }, + { + "epoch": 0.5545529122231337, + "grad_norm": 0.16012269258499146, + "learning_rate": 5e-05, + "loss": 1.6847, + "step": 2028 + }, + { + "epoch": 0.5548263604047033, + "grad_norm": 0.16615892946720123, + "learning_rate": 5e-05, + "loss": 1.5873, + "step": 2029 + }, + { + "epoch": 0.555099808586273, + "grad_norm": 0.15969812870025635, + "learning_rate": 5e-05, + "loss": 1.5295, + "step": 2030 + }, + { + "epoch": 0.5553732567678425, + "grad_norm": 0.17502804100513458, + "learning_rate": 5e-05, + "loss": 1.6885, + "step": 2031 + }, + { + "epoch": 0.555646704949412, + "grad_norm": 0.15848223865032196, + "learning_rate": 5e-05, + "loss": 1.5479, + "step": 2032 + }, + { + "epoch": 0.5559201531309816, + "grad_norm": 0.16243097186088562, + "learning_rate": 5e-05, + "loss": 1.6315, + "step": 2033 + }, + { + "epoch": 0.5561936013125512, + "grad_norm": 0.15360400080680847, + "learning_rate": 5e-05, + "loss": 1.5927, + "step": 2034 + }, + { + "epoch": 0.5564670494941208, + "grad_norm": 0.17348462343215942, + "learning_rate": 5e-05, + "loss": 1.6338, + "step": 2035 + }, + { + "epoch": 0.5567404976756904, + "grad_norm": 0.14669735729694366, + "learning_rate": 5e-05, + "loss": 1.5976, + "step": 2036 + }, + { + "epoch": 0.55701394585726, + "grad_norm": 0.17374375462532043, + "learning_rate": 5e-05, + "loss": 1.6696, + "step": 2037 + }, + { + "epoch": 0.5572873940388297, + "grad_norm": 0.1698565036058426, + "learning_rate": 5e-05, + "loss": 1.7155, + "step": 2038 + }, + { + "epoch": 0.5575608422203993, + "grad_norm": 0.16311299800872803, + "learning_rate": 5e-05, + "loss": 1.5581, + "step": 2039 + }, + { + "epoch": 0.5578342904019689, + "grad_norm": 0.1789819747209549, + "learning_rate": 5e-05, + "loss": 1.6041, + "step": 2040 + }, + { + "epoch": 0.5581077385835385, + "grad_norm": 0.15918207168579102, + "learning_rate": 5e-05, + "loss": 1.6056, + "step": 2041 + }, + { + "epoch": 0.5583811867651081, + "grad_norm": 0.1748282015323639, + "learning_rate": 5e-05, + "loss": 1.6903, + "step": 2042 + }, + { + "epoch": 0.5586546349466776, + "grad_norm": 0.15822182595729828, + "learning_rate": 5e-05, + "loss": 1.6031, + "step": 2043 + }, + { + "epoch": 0.5589280831282472, + "grad_norm": 0.15568383038043976, + "learning_rate": 5e-05, + "loss": 1.6616, + "step": 2044 + }, + { + "epoch": 0.5592015313098168, + "grad_norm": 0.1490369737148285, + "learning_rate": 5e-05, + "loss": 1.554, + "step": 2045 + }, + { + "epoch": 0.5594749794913864, + "grad_norm": 0.1597105860710144, + "learning_rate": 5e-05, + "loss": 1.5907, + "step": 2046 + }, + { + "epoch": 0.559748427672956, + "grad_norm": 0.15021972358226776, + "learning_rate": 5e-05, + "loss": 1.5364, + "step": 2047 + }, + { + "epoch": 0.5600218758545256, + "grad_norm": 0.16100069880485535, + "learning_rate": 5e-05, + "loss": 1.6302, + "step": 2048 + }, + { + "epoch": 0.5602953240360952, + "grad_norm": 0.1672513782978058, + "learning_rate": 5e-05, + "loss": 1.6362, + "step": 2049 + }, + { + "epoch": 0.5605687722176648, + "grad_norm": 0.14868006110191345, + "learning_rate": 5e-05, + "loss": 1.5026, + "step": 2050 + }, + { + "epoch": 0.5608422203992344, + "grad_norm": 0.15839874744415283, + "learning_rate": 5e-05, + "loss": 1.676, + "step": 2051 + }, + { + "epoch": 0.561115668580804, + "grad_norm": 0.15299920737743378, + "learning_rate": 5e-05, + "loss": 1.6009, + "step": 2052 + }, + { + "epoch": 0.5613891167623736, + "grad_norm": 0.15907810628414154, + "learning_rate": 5e-05, + "loss": 1.6038, + "step": 2053 + }, + { + "epoch": 0.5616625649439432, + "grad_norm": 0.15567612648010254, + "learning_rate": 5e-05, + "loss": 1.6237, + "step": 2054 + }, + { + "epoch": 0.5619360131255127, + "grad_norm": 0.1545831263065338, + "learning_rate": 5e-05, + "loss": 1.5655, + "step": 2055 + }, + { + "epoch": 0.5622094613070823, + "grad_norm": 0.15580891072750092, + "learning_rate": 5e-05, + "loss": 1.5292, + "step": 2056 + }, + { + "epoch": 0.5624829094886519, + "grad_norm": 0.1534157246351242, + "learning_rate": 5e-05, + "loss": 1.5798, + "step": 2057 + }, + { + "epoch": 0.5627563576702215, + "grad_norm": 0.17224334180355072, + "learning_rate": 5e-05, + "loss": 1.705, + "step": 2058 + }, + { + "epoch": 0.5630298058517911, + "grad_norm": 0.15283888578414917, + "learning_rate": 5e-05, + "loss": 1.6189, + "step": 2059 + }, + { + "epoch": 0.5633032540333607, + "grad_norm": 0.1531190425157547, + "learning_rate": 5e-05, + "loss": 1.4706, + "step": 2060 + }, + { + "epoch": 0.5635767022149303, + "grad_norm": 0.15777452290058136, + "learning_rate": 5e-05, + "loss": 1.5925, + "step": 2061 + }, + { + "epoch": 0.5638501503964999, + "grad_norm": 0.16896897554397583, + "learning_rate": 5e-05, + "loss": 1.6667, + "step": 2062 + }, + { + "epoch": 0.5641235985780695, + "grad_norm": 0.15329276025295258, + "learning_rate": 5e-05, + "loss": 1.5491, + "step": 2063 + }, + { + "epoch": 0.5643970467596391, + "grad_norm": 0.16101738810539246, + "learning_rate": 5e-05, + "loss": 1.6393, + "step": 2064 + }, + { + "epoch": 0.5646704949412087, + "grad_norm": 0.1673561930656433, + "learning_rate": 5e-05, + "loss": 1.6073, + "step": 2065 + }, + { + "epoch": 0.5649439431227782, + "grad_norm": 0.1541615128517151, + "learning_rate": 5e-05, + "loss": 1.559, + "step": 2066 + }, + { + "epoch": 0.5652173913043478, + "grad_norm": 0.16007159650325775, + "learning_rate": 5e-05, + "loss": 1.584, + "step": 2067 + }, + { + "epoch": 0.5654908394859174, + "grad_norm": 0.1529233604669571, + "learning_rate": 5e-05, + "loss": 1.6093, + "step": 2068 + }, + { + "epoch": 0.565764287667487, + "grad_norm": 0.1614188849925995, + "learning_rate": 5e-05, + "loss": 1.6406, + "step": 2069 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 0.15437230467796326, + "learning_rate": 5e-05, + "loss": 1.6067, + "step": 2070 + }, + { + "epoch": 0.5663111840306262, + "grad_norm": 0.15617471933364868, + "learning_rate": 5e-05, + "loss": 1.6215, + "step": 2071 + }, + { + "epoch": 0.5665846322121958, + "grad_norm": 0.15771955251693726, + "learning_rate": 5e-05, + "loss": 1.5109, + "step": 2072 + }, + { + "epoch": 0.5668580803937654, + "grad_norm": 0.1565471887588501, + "learning_rate": 5e-05, + "loss": 1.5407, + "step": 2073 + }, + { + "epoch": 0.567131528575335, + "grad_norm": 0.16085505485534668, + "learning_rate": 5e-05, + "loss": 1.6118, + "step": 2074 + }, + { + "epoch": 0.5674049767569046, + "grad_norm": 0.15981802344322205, + "learning_rate": 5e-05, + "loss": 1.6008, + "step": 2075 + }, + { + "epoch": 0.5676784249384742, + "grad_norm": 0.16176685690879822, + "learning_rate": 5e-05, + "loss": 1.6723, + "step": 2076 + }, + { + "epoch": 0.5679518731200438, + "grad_norm": 0.17309653759002686, + "learning_rate": 5e-05, + "loss": 1.7196, + "step": 2077 + }, + { + "epoch": 0.5682253213016133, + "grad_norm": 0.15585723519325256, + "learning_rate": 5e-05, + "loss": 1.5884, + "step": 2078 + }, + { + "epoch": 0.5684987694831829, + "grad_norm": 0.1600128412246704, + "learning_rate": 5e-05, + "loss": 1.6116, + "step": 2079 + }, + { + "epoch": 0.5687722176647525, + "grad_norm": 0.16564618051052094, + "learning_rate": 5e-05, + "loss": 1.5695, + "step": 2080 + }, + { + "epoch": 0.5690456658463221, + "grad_norm": 0.15419046580791473, + "learning_rate": 5e-05, + "loss": 1.5458, + "step": 2081 + }, + { + "epoch": 0.5693191140278917, + "grad_norm": 0.17247512936592102, + "learning_rate": 5e-05, + "loss": 1.6021, + "step": 2082 + }, + { + "epoch": 0.5695925622094613, + "grad_norm": 0.16553881764411926, + "learning_rate": 5e-05, + "loss": 1.5771, + "step": 2083 + }, + { + "epoch": 0.5698660103910309, + "grad_norm": 0.16219867765903473, + "learning_rate": 5e-05, + "loss": 1.6749, + "step": 2084 + }, + { + "epoch": 0.5701394585726005, + "grad_norm": 0.17330633103847504, + "learning_rate": 5e-05, + "loss": 1.6143, + "step": 2085 + }, + { + "epoch": 0.5704129067541701, + "grad_norm": 0.16274374723434448, + "learning_rate": 5e-05, + "loss": 1.6205, + "step": 2086 + }, + { + "epoch": 0.5706863549357397, + "grad_norm": 0.15556305646896362, + "learning_rate": 5e-05, + "loss": 1.5875, + "step": 2087 + }, + { + "epoch": 0.5709598031173093, + "grad_norm": 0.15635040402412415, + "learning_rate": 5e-05, + "loss": 1.6349, + "step": 2088 + }, + { + "epoch": 0.5712332512988788, + "grad_norm": 0.160813108086586, + "learning_rate": 5e-05, + "loss": 1.604, + "step": 2089 + }, + { + "epoch": 0.5715066994804484, + "grad_norm": 0.15628398954868317, + "learning_rate": 5e-05, + "loss": 1.5023, + "step": 2090 + }, + { + "epoch": 0.571780147662018, + "grad_norm": 0.18704648315906525, + "learning_rate": 5e-05, + "loss": 1.6758, + "step": 2091 + }, + { + "epoch": 0.5720535958435876, + "grad_norm": 0.1672285944223404, + "learning_rate": 5e-05, + "loss": 1.643, + "step": 2092 + }, + { + "epoch": 0.5723270440251572, + "grad_norm": 0.15511822700500488, + "learning_rate": 5e-05, + "loss": 1.6143, + "step": 2093 + }, + { + "epoch": 0.5726004922067268, + "grad_norm": 0.16968627274036407, + "learning_rate": 5e-05, + "loss": 1.649, + "step": 2094 + }, + { + "epoch": 0.5728739403882964, + "grad_norm": 0.16604706645011902, + "learning_rate": 5e-05, + "loss": 1.6111, + "step": 2095 + }, + { + "epoch": 0.573147388569866, + "grad_norm": 0.16677343845367432, + "learning_rate": 5e-05, + "loss": 1.6142, + "step": 2096 + }, + { + "epoch": 0.5734208367514356, + "grad_norm": 0.16563932597637177, + "learning_rate": 5e-05, + "loss": 1.6043, + "step": 2097 + }, + { + "epoch": 0.5736942849330052, + "grad_norm": 0.15738388895988464, + "learning_rate": 5e-05, + "loss": 1.5803, + "step": 2098 + }, + { + "epoch": 0.5739677331145748, + "grad_norm": 0.15241163969039917, + "learning_rate": 5e-05, + "loss": 1.5756, + "step": 2099 + }, + { + "epoch": 0.5742411812961444, + "grad_norm": 0.1751902997493744, + "learning_rate": 5e-05, + "loss": 1.6738, + "step": 2100 + }, + { + "epoch": 0.5745146294777139, + "grad_norm": 0.15986758470535278, + "learning_rate": 5e-05, + "loss": 1.5693, + "step": 2101 + }, + { + "epoch": 0.5747880776592835, + "grad_norm": 0.19663068652153015, + "learning_rate": 5e-05, + "loss": 1.59, + "step": 2102 + }, + { + "epoch": 0.5750615258408531, + "grad_norm": 0.15884363651275635, + "learning_rate": 5e-05, + "loss": 1.6378, + "step": 2103 + }, + { + "epoch": 0.5753349740224227, + "grad_norm": 0.1701025813817978, + "learning_rate": 5e-05, + "loss": 1.6226, + "step": 2104 + }, + { + "epoch": 0.5756084222039923, + "grad_norm": 0.1757257878780365, + "learning_rate": 5e-05, + "loss": 1.7625, + "step": 2105 + }, + { + "epoch": 0.5758818703855619, + "grad_norm": 0.15682871639728546, + "learning_rate": 5e-05, + "loss": 1.6039, + "step": 2106 + }, + { + "epoch": 0.5761553185671315, + "grad_norm": 0.15779271721839905, + "learning_rate": 5e-05, + "loss": 1.6915, + "step": 2107 + }, + { + "epoch": 0.5764287667487011, + "grad_norm": 0.15956608951091766, + "learning_rate": 5e-05, + "loss": 1.622, + "step": 2108 + }, + { + "epoch": 0.5767022149302707, + "grad_norm": 0.15776754915714264, + "learning_rate": 5e-05, + "loss": 1.6435, + "step": 2109 + }, + { + "epoch": 0.5769756631118403, + "grad_norm": 0.17491094768047333, + "learning_rate": 5e-05, + "loss": 1.5058, + "step": 2110 + }, + { + "epoch": 0.5772491112934099, + "grad_norm": 0.15431523323059082, + "learning_rate": 5e-05, + "loss": 1.6608, + "step": 2111 + }, + { + "epoch": 0.5775225594749795, + "grad_norm": 0.15970687568187714, + "learning_rate": 5e-05, + "loss": 1.6203, + "step": 2112 + }, + { + "epoch": 0.577796007656549, + "grad_norm": 0.16885408759117126, + "learning_rate": 5e-05, + "loss": 1.6354, + "step": 2113 + }, + { + "epoch": 0.5780694558381186, + "grad_norm": 0.1878814995288849, + "learning_rate": 5e-05, + "loss": 1.6947, + "step": 2114 + }, + { + "epoch": 0.5783429040196882, + "grad_norm": 0.17708611488342285, + "learning_rate": 5e-05, + "loss": 1.5832, + "step": 2115 + }, + { + "epoch": 0.5786163522012578, + "grad_norm": 0.16223332285881042, + "learning_rate": 5e-05, + "loss": 1.7159, + "step": 2116 + }, + { + "epoch": 0.5788898003828274, + "grad_norm": 0.1625894010066986, + "learning_rate": 5e-05, + "loss": 1.6246, + "step": 2117 + }, + { + "epoch": 0.579163248564397, + "grad_norm": 0.1753711998462677, + "learning_rate": 5e-05, + "loss": 1.629, + "step": 2118 + }, + { + "epoch": 0.5794366967459667, + "grad_norm": 0.15611834824085236, + "learning_rate": 5e-05, + "loss": 1.6562, + "step": 2119 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.1676332801580429, + "learning_rate": 5e-05, + "loss": 1.6435, + "step": 2120 + }, + { + "epoch": 0.5799835931091059, + "grad_norm": 0.16473741829395294, + "learning_rate": 5e-05, + "loss": 1.5665, + "step": 2121 + }, + { + "epoch": 0.5802570412906755, + "grad_norm": 0.1505088359117508, + "learning_rate": 5e-05, + "loss": 1.5784, + "step": 2122 + }, + { + "epoch": 0.5805304894722451, + "grad_norm": 0.1565895527601242, + "learning_rate": 5e-05, + "loss": 1.6327, + "step": 2123 + }, + { + "epoch": 0.5808039376538146, + "grad_norm": 0.16196846961975098, + "learning_rate": 5e-05, + "loss": 1.503, + "step": 2124 + }, + { + "epoch": 0.5810773858353842, + "grad_norm": 0.1584513634443283, + "learning_rate": 5e-05, + "loss": 1.6884, + "step": 2125 + }, + { + "epoch": 0.5813508340169538, + "grad_norm": 0.1517631560564041, + "learning_rate": 5e-05, + "loss": 1.4673, + "step": 2126 + }, + { + "epoch": 0.5816242821985234, + "grad_norm": 0.15401685237884521, + "learning_rate": 5e-05, + "loss": 1.5718, + "step": 2127 + }, + { + "epoch": 0.581897730380093, + "grad_norm": 0.1545703262090683, + "learning_rate": 5e-05, + "loss": 1.6156, + "step": 2128 + }, + { + "epoch": 0.5821711785616626, + "grad_norm": 0.1678171306848526, + "learning_rate": 5e-05, + "loss": 1.7368, + "step": 2129 + }, + { + "epoch": 0.5824446267432322, + "grad_norm": 0.15641961991786957, + "learning_rate": 5e-05, + "loss": 1.7208, + "step": 2130 + }, + { + "epoch": 0.5827180749248018, + "grad_norm": 0.16505669057369232, + "learning_rate": 5e-05, + "loss": 1.6648, + "step": 2131 + }, + { + "epoch": 0.5829915231063714, + "grad_norm": 0.15975305438041687, + "learning_rate": 5e-05, + "loss": 1.577, + "step": 2132 + }, + { + "epoch": 0.583264971287941, + "grad_norm": 0.15941214561462402, + "learning_rate": 5e-05, + "loss": 1.6433, + "step": 2133 + }, + { + "epoch": 0.5835384194695106, + "grad_norm": 0.15701207518577576, + "learning_rate": 5e-05, + "loss": 1.5609, + "step": 2134 + }, + { + "epoch": 0.5838118676510802, + "grad_norm": 0.1584734469652176, + "learning_rate": 5e-05, + "loss": 1.5644, + "step": 2135 + }, + { + "epoch": 0.5840853158326497, + "grad_norm": 0.149795264005661, + "learning_rate": 5e-05, + "loss": 1.5459, + "step": 2136 + }, + { + "epoch": 0.5843587640142193, + "grad_norm": 0.16000156104564667, + "learning_rate": 5e-05, + "loss": 1.623, + "step": 2137 + }, + { + "epoch": 0.5846322121957889, + "grad_norm": 0.14752228558063507, + "learning_rate": 5e-05, + "loss": 1.4835, + "step": 2138 + }, + { + "epoch": 0.5849056603773585, + "grad_norm": 0.1613810956478119, + "learning_rate": 5e-05, + "loss": 1.5714, + "step": 2139 + }, + { + "epoch": 0.5851791085589281, + "grad_norm": 0.15406599640846252, + "learning_rate": 5e-05, + "loss": 1.5929, + "step": 2140 + }, + { + "epoch": 0.5854525567404977, + "grad_norm": 0.15958181023597717, + "learning_rate": 5e-05, + "loss": 1.6232, + "step": 2141 + }, + { + "epoch": 0.5857260049220673, + "grad_norm": 0.16171377897262573, + "learning_rate": 5e-05, + "loss": 1.7343, + "step": 2142 + }, + { + "epoch": 0.5859994531036369, + "grad_norm": 0.15244810283184052, + "learning_rate": 5e-05, + "loss": 1.6367, + "step": 2143 + }, + { + "epoch": 0.5862729012852065, + "grad_norm": 0.16295252740383148, + "learning_rate": 5e-05, + "loss": 1.6822, + "step": 2144 + }, + { + "epoch": 0.5865463494667761, + "grad_norm": 0.16270601749420166, + "learning_rate": 5e-05, + "loss": 1.5552, + "step": 2145 + }, + { + "epoch": 0.5868197976483457, + "grad_norm": 0.1615038365125656, + "learning_rate": 5e-05, + "loss": 1.6273, + "step": 2146 + }, + { + "epoch": 0.5870932458299152, + "grad_norm": 0.17501547932624817, + "learning_rate": 5e-05, + "loss": 1.6598, + "step": 2147 + }, + { + "epoch": 0.5873666940114848, + "grad_norm": 0.15854844450950623, + "learning_rate": 5e-05, + "loss": 1.6657, + "step": 2148 + }, + { + "epoch": 0.5876401421930544, + "grad_norm": 0.15156009793281555, + "learning_rate": 5e-05, + "loss": 1.6571, + "step": 2149 + }, + { + "epoch": 0.587913590374624, + "grad_norm": 0.15848124027252197, + "learning_rate": 5e-05, + "loss": 1.6176, + "step": 2150 + }, + { + "epoch": 0.5881870385561936, + "grad_norm": 0.15376807749271393, + "learning_rate": 5e-05, + "loss": 1.659, + "step": 2151 + }, + { + "epoch": 0.5884604867377632, + "grad_norm": 0.15940286219120026, + "learning_rate": 5e-05, + "loss": 1.6093, + "step": 2152 + }, + { + "epoch": 0.5887339349193328, + "grad_norm": 0.1563788503408432, + "learning_rate": 5e-05, + "loss": 1.7226, + "step": 2153 + }, + { + "epoch": 0.5890073831009024, + "grad_norm": 0.14890335500240326, + "learning_rate": 5e-05, + "loss": 1.5368, + "step": 2154 + }, + { + "epoch": 0.589280831282472, + "grad_norm": 0.16119109094142914, + "learning_rate": 5e-05, + "loss": 1.64, + "step": 2155 + }, + { + "epoch": 0.5895542794640416, + "grad_norm": 0.15738506615161896, + "learning_rate": 5e-05, + "loss": 1.6306, + "step": 2156 + }, + { + "epoch": 0.5898277276456112, + "grad_norm": 0.15417338907718658, + "learning_rate": 5e-05, + "loss": 1.5886, + "step": 2157 + }, + { + "epoch": 0.5901011758271808, + "grad_norm": 0.15935613214969635, + "learning_rate": 5e-05, + "loss": 1.7004, + "step": 2158 + }, + { + "epoch": 0.5903746240087503, + "grad_norm": 0.15843814611434937, + "learning_rate": 5e-05, + "loss": 1.684, + "step": 2159 + }, + { + "epoch": 0.5906480721903199, + "grad_norm": 0.15500840544700623, + "learning_rate": 5e-05, + "loss": 1.6215, + "step": 2160 + }, + { + "epoch": 0.5909215203718895, + "grad_norm": 0.15437741577625275, + "learning_rate": 5e-05, + "loss": 1.5473, + "step": 2161 + }, + { + "epoch": 0.5911949685534591, + "grad_norm": 0.17060165107250214, + "learning_rate": 5e-05, + "loss": 1.6937, + "step": 2162 + }, + { + "epoch": 0.5914684167350287, + "grad_norm": 0.15199008584022522, + "learning_rate": 5e-05, + "loss": 1.5557, + "step": 2163 + }, + { + "epoch": 0.5917418649165983, + "grad_norm": 0.16401898860931396, + "learning_rate": 5e-05, + "loss": 1.6955, + "step": 2164 + }, + { + "epoch": 0.5920153130981679, + "grad_norm": 0.16618569195270538, + "learning_rate": 5e-05, + "loss": 1.6962, + "step": 2165 + }, + { + "epoch": 0.5922887612797375, + "grad_norm": 0.1566200703382492, + "learning_rate": 5e-05, + "loss": 1.6495, + "step": 2166 + }, + { + "epoch": 0.5925622094613071, + "grad_norm": 0.15811263024806976, + "learning_rate": 5e-05, + "loss": 1.6775, + "step": 2167 + }, + { + "epoch": 0.5928356576428767, + "grad_norm": 0.16176708042621613, + "learning_rate": 5e-05, + "loss": 1.5412, + "step": 2168 + }, + { + "epoch": 0.5931091058244463, + "grad_norm": 0.15909145772457123, + "learning_rate": 5e-05, + "loss": 1.6401, + "step": 2169 + }, + { + "epoch": 0.5933825540060158, + "grad_norm": 0.15169541537761688, + "learning_rate": 5e-05, + "loss": 1.6311, + "step": 2170 + }, + { + "epoch": 0.5936560021875854, + "grad_norm": 0.1585574597120285, + "learning_rate": 5e-05, + "loss": 1.613, + "step": 2171 + }, + { + "epoch": 0.593929450369155, + "grad_norm": 0.1560421884059906, + "learning_rate": 5e-05, + "loss": 1.581, + "step": 2172 + }, + { + "epoch": 0.5942028985507246, + "grad_norm": 0.1590547412633896, + "learning_rate": 5e-05, + "loss": 1.581, + "step": 2173 + }, + { + "epoch": 0.5944763467322942, + "grad_norm": 0.1633371263742447, + "learning_rate": 5e-05, + "loss": 1.5969, + "step": 2174 + }, + { + "epoch": 0.5947497949138638, + "grad_norm": 0.16914770007133484, + "learning_rate": 5e-05, + "loss": 1.6087, + "step": 2175 + }, + { + "epoch": 0.5950232430954334, + "grad_norm": 0.15823769569396973, + "learning_rate": 5e-05, + "loss": 1.6317, + "step": 2176 + }, + { + "epoch": 0.595296691277003, + "grad_norm": 0.1536397933959961, + "learning_rate": 5e-05, + "loss": 1.5132, + "step": 2177 + }, + { + "epoch": 0.5955701394585726, + "grad_norm": 0.16653746366500854, + "learning_rate": 5e-05, + "loss": 1.5937, + "step": 2178 + }, + { + "epoch": 0.5958435876401422, + "grad_norm": 0.17655491828918457, + "learning_rate": 5e-05, + "loss": 1.7522, + "step": 2179 + }, + { + "epoch": 0.5961170358217118, + "grad_norm": 0.16123391687870026, + "learning_rate": 5e-05, + "loss": 1.6493, + "step": 2180 + }, + { + "epoch": 0.5963904840032814, + "grad_norm": 0.16177600622177124, + "learning_rate": 5e-05, + "loss": 1.6801, + "step": 2181 + }, + { + "epoch": 0.5966639321848509, + "grad_norm": 0.16101764142513275, + "learning_rate": 5e-05, + "loss": 1.6148, + "step": 2182 + }, + { + "epoch": 0.5969373803664205, + "grad_norm": 0.15031690895557404, + "learning_rate": 5e-05, + "loss": 1.5556, + "step": 2183 + }, + { + "epoch": 0.5972108285479901, + "grad_norm": 0.15750014781951904, + "learning_rate": 5e-05, + "loss": 1.6032, + "step": 2184 + }, + { + "epoch": 0.5974842767295597, + "grad_norm": 0.16449135541915894, + "learning_rate": 5e-05, + "loss": 1.6762, + "step": 2185 + }, + { + "epoch": 0.5977577249111293, + "grad_norm": 0.1698635369539261, + "learning_rate": 5e-05, + "loss": 1.6279, + "step": 2186 + }, + { + "epoch": 0.5980311730926989, + "grad_norm": 0.16983488202095032, + "learning_rate": 5e-05, + "loss": 1.5975, + "step": 2187 + }, + { + "epoch": 0.5983046212742685, + "grad_norm": 0.16298316419124603, + "learning_rate": 5e-05, + "loss": 1.603, + "step": 2188 + }, + { + "epoch": 0.5985780694558381, + "grad_norm": 0.16714924573898315, + "learning_rate": 5e-05, + "loss": 1.6296, + "step": 2189 + }, + { + "epoch": 0.5988515176374077, + "grad_norm": 0.16718335449695587, + "learning_rate": 5e-05, + "loss": 1.7187, + "step": 2190 + }, + { + "epoch": 0.5991249658189773, + "grad_norm": 0.16621999442577362, + "learning_rate": 5e-05, + "loss": 1.631, + "step": 2191 + }, + { + "epoch": 0.5993984140005469, + "grad_norm": 0.15571008622646332, + "learning_rate": 5e-05, + "loss": 1.6298, + "step": 2192 + }, + { + "epoch": 0.5996718621821165, + "grad_norm": 0.16420917212963104, + "learning_rate": 5e-05, + "loss": 1.721, + "step": 2193 + }, + { + "epoch": 0.599945310363686, + "grad_norm": 0.174006387591362, + "learning_rate": 5e-05, + "loss": 1.6524, + "step": 2194 + }, + { + "epoch": 0.6002187585452556, + "grad_norm": 0.1774541437625885, + "learning_rate": 5e-05, + "loss": 1.7686, + "step": 2195 + }, + { + "epoch": 0.6004922067268252, + "grad_norm": 0.1665443480014801, + "learning_rate": 5e-05, + "loss": 1.6123, + "step": 2196 + }, + { + "epoch": 0.6007656549083948, + "grad_norm": 0.1589983105659485, + "learning_rate": 5e-05, + "loss": 1.6771, + "step": 2197 + }, + { + "epoch": 0.6010391030899644, + "grad_norm": 0.171598419547081, + "learning_rate": 5e-05, + "loss": 1.6487, + "step": 2198 + }, + { + "epoch": 0.601312551271534, + "grad_norm": 0.16494937241077423, + "learning_rate": 5e-05, + "loss": 1.5661, + "step": 2199 + }, + { + "epoch": 0.6015859994531036, + "grad_norm": 0.16418486833572388, + "learning_rate": 5e-05, + "loss": 1.5924, + "step": 2200 + }, + { + "epoch": 0.6018594476346733, + "grad_norm": 0.16469943523406982, + "learning_rate": 5e-05, + "loss": 1.6722, + "step": 2201 + }, + { + "epoch": 0.6021328958162429, + "grad_norm": 0.1632019281387329, + "learning_rate": 5e-05, + "loss": 1.5975, + "step": 2202 + }, + { + "epoch": 0.6024063439978125, + "grad_norm": 0.1550121158361435, + "learning_rate": 5e-05, + "loss": 1.6071, + "step": 2203 + }, + { + "epoch": 0.6026797921793821, + "grad_norm": 0.16901268064975739, + "learning_rate": 5e-05, + "loss": 1.7557, + "step": 2204 + }, + { + "epoch": 0.6029532403609515, + "grad_norm": 0.16295814514160156, + "learning_rate": 5e-05, + "loss": 1.6243, + "step": 2205 + }, + { + "epoch": 0.6032266885425212, + "grad_norm": 0.15237173438072205, + "learning_rate": 5e-05, + "loss": 1.5015, + "step": 2206 + }, + { + "epoch": 0.6035001367240908, + "grad_norm": 0.17193639278411865, + "learning_rate": 5e-05, + "loss": 1.6746, + "step": 2207 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 0.154008686542511, + "learning_rate": 5e-05, + "loss": 1.5166, + "step": 2208 + }, + { + "epoch": 0.60404703308723, + "grad_norm": 0.1634528785943985, + "learning_rate": 5e-05, + "loss": 1.6208, + "step": 2209 + }, + { + "epoch": 0.6043204812687996, + "grad_norm": 0.16617950797080994, + "learning_rate": 5e-05, + "loss": 1.6649, + "step": 2210 + }, + { + "epoch": 0.6045939294503692, + "grad_norm": 0.1584254801273346, + "learning_rate": 5e-05, + "loss": 1.6875, + "step": 2211 + }, + { + "epoch": 0.6048673776319388, + "grad_norm": 0.15937229990959167, + "learning_rate": 5e-05, + "loss": 1.5211, + "step": 2212 + }, + { + "epoch": 0.6051408258135084, + "grad_norm": 0.15841583907604218, + "learning_rate": 5e-05, + "loss": 1.5409, + "step": 2213 + }, + { + "epoch": 0.605414273995078, + "grad_norm": 0.15823234617710114, + "learning_rate": 5e-05, + "loss": 1.6317, + "step": 2214 + }, + { + "epoch": 0.6056877221766476, + "grad_norm": 0.15863433480262756, + "learning_rate": 5e-05, + "loss": 1.59, + "step": 2215 + }, + { + "epoch": 0.6059611703582172, + "grad_norm": 0.16127796471118927, + "learning_rate": 5e-05, + "loss": 1.6605, + "step": 2216 + }, + { + "epoch": 0.6062346185397867, + "grad_norm": 0.15523792803287506, + "learning_rate": 5e-05, + "loss": 1.551, + "step": 2217 + }, + { + "epoch": 0.6065080667213563, + "grad_norm": 0.16734172403812408, + "learning_rate": 5e-05, + "loss": 1.5959, + "step": 2218 + }, + { + "epoch": 0.6067815149029259, + "grad_norm": 0.16805590689182281, + "learning_rate": 5e-05, + "loss": 1.6665, + "step": 2219 + }, + { + "epoch": 0.6070549630844955, + "grad_norm": 0.1635158360004425, + "learning_rate": 5e-05, + "loss": 1.7377, + "step": 2220 + }, + { + "epoch": 0.6073284112660651, + "grad_norm": 0.17108480632305145, + "learning_rate": 5e-05, + "loss": 1.6587, + "step": 2221 + }, + { + "epoch": 0.6076018594476347, + "grad_norm": 0.1763361096382141, + "learning_rate": 5e-05, + "loss": 1.735, + "step": 2222 + }, + { + "epoch": 0.6078753076292043, + "grad_norm": 0.153673455119133, + "learning_rate": 5e-05, + "loss": 1.5807, + "step": 2223 + }, + { + "epoch": 0.6081487558107739, + "grad_norm": 0.17884378135204315, + "learning_rate": 5e-05, + "loss": 1.7078, + "step": 2224 + }, + { + "epoch": 0.6084222039923435, + "grad_norm": 0.2724404036998749, + "learning_rate": 5e-05, + "loss": 1.72, + "step": 2225 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 0.16155698895454407, + "learning_rate": 5e-05, + "loss": 1.6043, + "step": 2226 + }, + { + "epoch": 0.6089691003554827, + "grad_norm": 0.17532581090927124, + "learning_rate": 5e-05, + "loss": 1.6597, + "step": 2227 + }, + { + "epoch": 0.6092425485370522, + "grad_norm": 0.1655532866716385, + "learning_rate": 5e-05, + "loss": 1.6069, + "step": 2228 + }, + { + "epoch": 0.6095159967186218, + "grad_norm": 0.158722385764122, + "learning_rate": 5e-05, + "loss": 1.5885, + "step": 2229 + }, + { + "epoch": 0.6097894449001914, + "grad_norm": 0.17299902439117432, + "learning_rate": 5e-05, + "loss": 1.5863, + "step": 2230 + }, + { + "epoch": 0.610062893081761, + "grad_norm": 0.17708896100521088, + "learning_rate": 5e-05, + "loss": 1.656, + "step": 2231 + }, + { + "epoch": 0.6103363412633306, + "grad_norm": 0.16440723836421967, + "learning_rate": 5e-05, + "loss": 1.6651, + "step": 2232 + }, + { + "epoch": 0.6106097894449002, + "grad_norm": 0.16679327189922333, + "learning_rate": 5e-05, + "loss": 1.6108, + "step": 2233 + }, + { + "epoch": 0.6108832376264698, + "grad_norm": 0.18106336891651154, + "learning_rate": 5e-05, + "loss": 1.6973, + "step": 2234 + }, + { + "epoch": 0.6111566858080394, + "grad_norm": 0.1723693609237671, + "learning_rate": 5e-05, + "loss": 1.6928, + "step": 2235 + }, + { + "epoch": 0.611430133989609, + "grad_norm": 0.1619867980480194, + "learning_rate": 5e-05, + "loss": 1.5845, + "step": 2236 + }, + { + "epoch": 0.6117035821711786, + "grad_norm": 0.18252336978912354, + "learning_rate": 5e-05, + "loss": 1.6004, + "step": 2237 + }, + { + "epoch": 0.6119770303527482, + "grad_norm": 0.166873037815094, + "learning_rate": 5e-05, + "loss": 1.6284, + "step": 2238 + }, + { + "epoch": 0.6122504785343178, + "grad_norm": 0.16315174102783203, + "learning_rate": 5e-05, + "loss": 1.607, + "step": 2239 + }, + { + "epoch": 0.6125239267158873, + "grad_norm": 0.17975984513759613, + "learning_rate": 5e-05, + "loss": 1.6128, + "step": 2240 + }, + { + "epoch": 0.6127973748974569, + "grad_norm": 0.1571521908044815, + "learning_rate": 5e-05, + "loss": 1.6401, + "step": 2241 + }, + { + "epoch": 0.6130708230790265, + "grad_norm": 0.16314658522605896, + "learning_rate": 5e-05, + "loss": 1.5639, + "step": 2242 + }, + { + "epoch": 0.6133442712605961, + "grad_norm": 0.1598280519247055, + "learning_rate": 5e-05, + "loss": 1.5334, + "step": 2243 + }, + { + "epoch": 0.6136177194421657, + "grad_norm": 0.15419380366802216, + "learning_rate": 5e-05, + "loss": 1.5093, + "step": 2244 + }, + { + "epoch": 0.6138911676237353, + "grad_norm": 0.15684397518634796, + "learning_rate": 5e-05, + "loss": 1.6236, + "step": 2245 + }, + { + "epoch": 0.6141646158053049, + "grad_norm": 0.1650727540254593, + "learning_rate": 5e-05, + "loss": 1.7009, + "step": 2246 + }, + { + "epoch": 0.6144380639868745, + "grad_norm": 0.16277483105659485, + "learning_rate": 5e-05, + "loss": 1.6878, + "step": 2247 + }, + { + "epoch": 0.6147115121684441, + "grad_norm": 0.1605841964483261, + "learning_rate": 5e-05, + "loss": 1.6445, + "step": 2248 + }, + { + "epoch": 0.6149849603500137, + "grad_norm": 0.16627003252506256, + "learning_rate": 5e-05, + "loss": 1.6217, + "step": 2249 + }, + { + "epoch": 0.6152584085315833, + "grad_norm": 0.15815889835357666, + "learning_rate": 5e-05, + "loss": 1.6161, + "step": 2250 + }, + { + "epoch": 0.6155318567131528, + "grad_norm": 0.15406928956508636, + "learning_rate": 5e-05, + "loss": 1.6202, + "step": 2251 + }, + { + "epoch": 0.6158053048947224, + "grad_norm": 0.1534588783979416, + "learning_rate": 5e-05, + "loss": 1.5494, + "step": 2252 + }, + { + "epoch": 0.616078753076292, + "grad_norm": 0.15792147815227509, + "learning_rate": 5e-05, + "loss": 1.5803, + "step": 2253 + }, + { + "epoch": 0.6163522012578616, + "grad_norm": 0.15371176600456238, + "learning_rate": 5e-05, + "loss": 1.5671, + "step": 2254 + }, + { + "epoch": 0.6166256494394312, + "grad_norm": 0.15262968838214874, + "learning_rate": 5e-05, + "loss": 1.5196, + "step": 2255 + }, + { + "epoch": 0.6168990976210008, + "grad_norm": 0.15933872759342194, + "learning_rate": 5e-05, + "loss": 1.5735, + "step": 2256 + }, + { + "epoch": 0.6171725458025704, + "grad_norm": 0.15623560547828674, + "learning_rate": 5e-05, + "loss": 1.529, + "step": 2257 + }, + { + "epoch": 0.61744599398414, + "grad_norm": 0.15916681289672852, + "learning_rate": 5e-05, + "loss": 1.5828, + "step": 2258 + }, + { + "epoch": 0.6177194421657096, + "grad_norm": 0.15046894550323486, + "learning_rate": 5e-05, + "loss": 1.6456, + "step": 2259 + }, + { + "epoch": 0.6179928903472792, + "grad_norm": 0.15784558653831482, + "learning_rate": 5e-05, + "loss": 1.5167, + "step": 2260 + }, + { + "epoch": 0.6182663385288488, + "grad_norm": 0.16395771503448486, + "learning_rate": 5e-05, + "loss": 1.5806, + "step": 2261 + }, + { + "epoch": 0.6185397867104184, + "grad_norm": 0.15254901349544525, + "learning_rate": 5e-05, + "loss": 1.5617, + "step": 2262 + }, + { + "epoch": 0.6188132348919879, + "grad_norm": 0.15680168569087982, + "learning_rate": 5e-05, + "loss": 1.5795, + "step": 2263 + }, + { + "epoch": 0.6190866830735575, + "grad_norm": 0.1724022775888443, + "learning_rate": 5e-05, + "loss": 1.6245, + "step": 2264 + }, + { + "epoch": 0.6193601312551271, + "grad_norm": 0.1564149111509323, + "learning_rate": 5e-05, + "loss": 1.5742, + "step": 2265 + }, + { + "epoch": 0.6196335794366967, + "grad_norm": 0.18149738013744354, + "learning_rate": 5e-05, + "loss": 1.5351, + "step": 2266 + }, + { + "epoch": 0.6199070276182663, + "grad_norm": 0.15708599984645844, + "learning_rate": 5e-05, + "loss": 1.6182, + "step": 2267 + }, + { + "epoch": 0.6201804757998359, + "grad_norm": 0.16995084285736084, + "learning_rate": 5e-05, + "loss": 1.6893, + "step": 2268 + }, + { + "epoch": 0.6204539239814055, + "grad_norm": 0.16211822628974915, + "learning_rate": 5e-05, + "loss": 1.702, + "step": 2269 + }, + { + "epoch": 0.6207273721629751, + "grad_norm": 0.16163219511508942, + "learning_rate": 5e-05, + "loss": 1.6688, + "step": 2270 + }, + { + "epoch": 0.6210008203445447, + "grad_norm": 0.16371332108974457, + "learning_rate": 5e-05, + "loss": 1.5554, + "step": 2271 + }, + { + "epoch": 0.6212742685261143, + "grad_norm": 0.1574903279542923, + "learning_rate": 5e-05, + "loss": 1.6372, + "step": 2272 + }, + { + "epoch": 0.6215477167076839, + "grad_norm": 0.15815797448158264, + "learning_rate": 5e-05, + "loss": 1.5674, + "step": 2273 + }, + { + "epoch": 0.6218211648892535, + "grad_norm": 0.1578124612569809, + "learning_rate": 5e-05, + "loss": 1.5226, + "step": 2274 + }, + { + "epoch": 0.622094613070823, + "grad_norm": 0.1595599204301834, + "learning_rate": 5e-05, + "loss": 1.5657, + "step": 2275 + }, + { + "epoch": 0.6223680612523926, + "grad_norm": 0.15654776990413666, + "learning_rate": 5e-05, + "loss": 1.6009, + "step": 2276 + }, + { + "epoch": 0.6226415094339622, + "grad_norm": 0.1564917415380478, + "learning_rate": 5e-05, + "loss": 1.6166, + "step": 2277 + }, + { + "epoch": 0.6229149576155318, + "grad_norm": 0.1595001518726349, + "learning_rate": 5e-05, + "loss": 1.5389, + "step": 2278 + }, + { + "epoch": 0.6231884057971014, + "grad_norm": 0.16073504090309143, + "learning_rate": 5e-05, + "loss": 1.5948, + "step": 2279 + }, + { + "epoch": 0.623461853978671, + "grad_norm": 0.15986725687980652, + "learning_rate": 5e-05, + "loss": 1.6057, + "step": 2280 + }, + { + "epoch": 0.6237353021602406, + "grad_norm": 0.1577160656452179, + "learning_rate": 5e-05, + "loss": 1.6144, + "step": 2281 + }, + { + "epoch": 0.6240087503418102, + "grad_norm": 0.1593600958585739, + "learning_rate": 5e-05, + "loss": 1.6538, + "step": 2282 + }, + { + "epoch": 0.6242821985233798, + "grad_norm": 0.15497250854969025, + "learning_rate": 5e-05, + "loss": 1.5714, + "step": 2283 + }, + { + "epoch": 0.6245556467049495, + "grad_norm": 0.1659761667251587, + "learning_rate": 5e-05, + "loss": 1.6146, + "step": 2284 + }, + { + "epoch": 0.624829094886519, + "grad_norm": 0.16305339336395264, + "learning_rate": 5e-05, + "loss": 1.6935, + "step": 2285 + }, + { + "epoch": 0.6251025430680885, + "grad_norm": 0.16828620433807373, + "learning_rate": 5e-05, + "loss": 1.6341, + "step": 2286 + }, + { + "epoch": 0.6253759912496581, + "grad_norm": 0.16056501865386963, + "learning_rate": 5e-05, + "loss": 1.6103, + "step": 2287 + }, + { + "epoch": 0.6256494394312277, + "grad_norm": 0.1658848077058792, + "learning_rate": 5e-05, + "loss": 1.6548, + "step": 2288 + }, + { + "epoch": 0.6259228876127974, + "grad_norm": 0.1599251627922058, + "learning_rate": 5e-05, + "loss": 1.5715, + "step": 2289 + }, + { + "epoch": 0.626196335794367, + "grad_norm": 0.15655700862407684, + "learning_rate": 5e-05, + "loss": 1.5145, + "step": 2290 + }, + { + "epoch": 0.6264697839759366, + "grad_norm": 0.16632331907749176, + "learning_rate": 5e-05, + "loss": 1.6599, + "step": 2291 + }, + { + "epoch": 0.6267432321575062, + "grad_norm": 0.16444595158100128, + "learning_rate": 5e-05, + "loss": 1.6591, + "step": 2292 + }, + { + "epoch": 0.6270166803390758, + "grad_norm": 0.1640361100435257, + "learning_rate": 5e-05, + "loss": 1.5751, + "step": 2293 + }, + { + "epoch": 0.6272901285206454, + "grad_norm": 0.16003577411174774, + "learning_rate": 5e-05, + "loss": 1.6096, + "step": 2294 + }, + { + "epoch": 0.627563576702215, + "grad_norm": 0.16625234484672546, + "learning_rate": 5e-05, + "loss": 1.6685, + "step": 2295 + }, + { + "epoch": 0.6278370248837846, + "grad_norm": 0.15295128524303436, + "learning_rate": 5e-05, + "loss": 1.569, + "step": 2296 + }, + { + "epoch": 0.6281104730653542, + "grad_norm": 0.16834108531475067, + "learning_rate": 5e-05, + "loss": 1.6813, + "step": 2297 + }, + { + "epoch": 0.6283839212469237, + "grad_norm": 0.16331374645233154, + "learning_rate": 5e-05, + "loss": 1.5879, + "step": 2298 + }, + { + "epoch": 0.6286573694284933, + "grad_norm": 0.17699094116687775, + "learning_rate": 5e-05, + "loss": 1.673, + "step": 2299 + }, + { + "epoch": 0.6289308176100629, + "grad_norm": 0.1626228392124176, + "learning_rate": 5e-05, + "loss": 1.6703, + "step": 2300 + }, + { + "epoch": 0.6292042657916325, + "grad_norm": 0.16163185238838196, + "learning_rate": 5e-05, + "loss": 1.6249, + "step": 2301 + }, + { + "epoch": 0.6294777139732021, + "grad_norm": 0.17444822192192078, + "learning_rate": 5e-05, + "loss": 1.5542, + "step": 2302 + }, + { + "epoch": 0.6297511621547717, + "grad_norm": 0.1661493182182312, + "learning_rate": 5e-05, + "loss": 1.6295, + "step": 2303 + }, + { + "epoch": 0.6300246103363413, + "grad_norm": 0.1628965586423874, + "learning_rate": 5e-05, + "loss": 1.5952, + "step": 2304 + }, + { + "epoch": 0.6302980585179109, + "grad_norm": 0.1656888723373413, + "learning_rate": 5e-05, + "loss": 1.594, + "step": 2305 + }, + { + "epoch": 0.6305715066994805, + "grad_norm": 0.15425047278404236, + "learning_rate": 5e-05, + "loss": 1.4985, + "step": 2306 + }, + { + "epoch": 0.6308449548810501, + "grad_norm": 0.15567493438720703, + "learning_rate": 5e-05, + "loss": 1.5571, + "step": 2307 + }, + { + "epoch": 0.6311184030626197, + "grad_norm": 0.17899873852729797, + "learning_rate": 5e-05, + "loss": 1.6662, + "step": 2308 + }, + { + "epoch": 0.6313918512441892, + "grad_norm": 0.15900376439094543, + "learning_rate": 5e-05, + "loss": 1.6517, + "step": 2309 + }, + { + "epoch": 0.6316652994257588, + "grad_norm": 0.15737038850784302, + "learning_rate": 5e-05, + "loss": 1.5635, + "step": 2310 + }, + { + "epoch": 0.6319387476073284, + "grad_norm": 0.16273202002048492, + "learning_rate": 5e-05, + "loss": 1.5987, + "step": 2311 + }, + { + "epoch": 0.632212195788898, + "grad_norm": 0.15607643127441406, + "learning_rate": 5e-05, + "loss": 1.5978, + "step": 2312 + }, + { + "epoch": 0.6324856439704676, + "grad_norm": 0.16705310344696045, + "learning_rate": 5e-05, + "loss": 1.6226, + "step": 2313 + }, + { + "epoch": 0.6327590921520372, + "grad_norm": 0.15748460590839386, + "learning_rate": 5e-05, + "loss": 1.5379, + "step": 2314 + }, + { + "epoch": 0.6330325403336068, + "grad_norm": 0.16817143559455872, + "learning_rate": 5e-05, + "loss": 1.6564, + "step": 2315 + }, + { + "epoch": 0.6333059885151764, + "grad_norm": 0.15591007471084595, + "learning_rate": 5e-05, + "loss": 1.539, + "step": 2316 + }, + { + "epoch": 0.633579436696746, + "grad_norm": 0.1618664264678955, + "learning_rate": 5e-05, + "loss": 1.5866, + "step": 2317 + }, + { + "epoch": 0.6338528848783156, + "grad_norm": 0.1688256859779358, + "learning_rate": 5e-05, + "loss": 1.7367, + "step": 2318 + }, + { + "epoch": 0.6341263330598852, + "grad_norm": 0.17073442041873932, + "learning_rate": 5e-05, + "loss": 1.6241, + "step": 2319 + }, + { + "epoch": 0.6343997812414548, + "grad_norm": 0.16293637454509735, + "learning_rate": 5e-05, + "loss": 1.5913, + "step": 2320 + }, + { + "epoch": 0.6346732294230243, + "grad_norm": 0.15301881730556488, + "learning_rate": 5e-05, + "loss": 1.5685, + "step": 2321 + }, + { + "epoch": 0.6349466776045939, + "grad_norm": 0.1594955027103424, + "learning_rate": 5e-05, + "loss": 1.6564, + "step": 2322 + }, + { + "epoch": 0.6352201257861635, + "grad_norm": 0.15445536375045776, + "learning_rate": 5e-05, + "loss": 1.5536, + "step": 2323 + }, + { + "epoch": 0.6354935739677331, + "grad_norm": 0.166269913315773, + "learning_rate": 5e-05, + "loss": 1.6064, + "step": 2324 + }, + { + "epoch": 0.6357670221493027, + "grad_norm": 0.15730029344558716, + "learning_rate": 5e-05, + "loss": 1.5387, + "step": 2325 + }, + { + "epoch": 0.6360404703308723, + "grad_norm": 0.15941552817821503, + "learning_rate": 5e-05, + "loss": 1.6213, + "step": 2326 + }, + { + "epoch": 0.6363139185124419, + "grad_norm": 0.16579844057559967, + "learning_rate": 5e-05, + "loss": 1.5993, + "step": 2327 + }, + { + "epoch": 0.6365873666940115, + "grad_norm": 0.17182214558124542, + "learning_rate": 5e-05, + "loss": 1.6941, + "step": 2328 + }, + { + "epoch": 0.6368608148755811, + "grad_norm": 0.1528700590133667, + "learning_rate": 5e-05, + "loss": 1.5408, + "step": 2329 + }, + { + "epoch": 0.6371342630571507, + "grad_norm": 0.15841877460479736, + "learning_rate": 5e-05, + "loss": 1.5145, + "step": 2330 + }, + { + "epoch": 0.6374077112387203, + "grad_norm": 0.17149239778518677, + "learning_rate": 5e-05, + "loss": 1.7896, + "step": 2331 + }, + { + "epoch": 0.6376811594202898, + "grad_norm": 0.17149433493614197, + "learning_rate": 5e-05, + "loss": 1.6635, + "step": 2332 + }, + { + "epoch": 0.6379546076018594, + "grad_norm": 0.1553071290254593, + "learning_rate": 5e-05, + "loss": 1.6115, + "step": 2333 + }, + { + "epoch": 0.638228055783429, + "grad_norm": 0.17740413546562195, + "learning_rate": 5e-05, + "loss": 1.6419, + "step": 2334 + }, + { + "epoch": 0.6385015039649986, + "grad_norm": 0.16571162641048431, + "learning_rate": 5e-05, + "loss": 1.6287, + "step": 2335 + }, + { + "epoch": 0.6387749521465682, + "grad_norm": 0.16401976346969604, + "learning_rate": 5e-05, + "loss": 1.5881, + "step": 2336 + }, + { + "epoch": 0.6390484003281378, + "grad_norm": 0.16186074912548065, + "learning_rate": 5e-05, + "loss": 1.644, + "step": 2337 + }, + { + "epoch": 0.6393218485097074, + "grad_norm": 0.16843372583389282, + "learning_rate": 5e-05, + "loss": 1.708, + "step": 2338 + }, + { + "epoch": 0.639595296691277, + "grad_norm": 0.1627928465604782, + "learning_rate": 5e-05, + "loss": 1.623, + "step": 2339 + }, + { + "epoch": 0.6398687448728466, + "grad_norm": 0.15548692643642426, + "learning_rate": 5e-05, + "loss": 1.6938, + "step": 2340 + }, + { + "epoch": 0.6401421930544162, + "grad_norm": 0.15752455592155457, + "learning_rate": 5e-05, + "loss": 1.6264, + "step": 2341 + }, + { + "epoch": 0.6404156412359858, + "grad_norm": 0.1618221253156662, + "learning_rate": 5e-05, + "loss": 1.6214, + "step": 2342 + }, + { + "epoch": 0.6406890894175554, + "grad_norm": 0.15189188718795776, + "learning_rate": 5e-05, + "loss": 1.6024, + "step": 2343 + }, + { + "epoch": 0.6409625375991249, + "grad_norm": 0.16147050261497498, + "learning_rate": 5e-05, + "loss": 1.6617, + "step": 2344 + }, + { + "epoch": 0.6412359857806945, + "grad_norm": 0.16319455206394196, + "learning_rate": 5e-05, + "loss": 1.6428, + "step": 2345 + }, + { + "epoch": 0.6415094339622641, + "grad_norm": 0.15911765396595, + "learning_rate": 5e-05, + "loss": 1.5411, + "step": 2346 + }, + { + "epoch": 0.6417828821438337, + "grad_norm": 0.15844199061393738, + "learning_rate": 5e-05, + "loss": 1.5859, + "step": 2347 + }, + { + "epoch": 0.6420563303254033, + "grad_norm": 0.16009950637817383, + "learning_rate": 5e-05, + "loss": 1.6266, + "step": 2348 + }, + { + "epoch": 0.6423297785069729, + "grad_norm": 0.15512198209762573, + "learning_rate": 5e-05, + "loss": 1.6013, + "step": 2349 + }, + { + "epoch": 0.6426032266885425, + "grad_norm": 0.1645142138004303, + "learning_rate": 5e-05, + "loss": 1.545, + "step": 2350 + }, + { + "epoch": 0.6428766748701121, + "grad_norm": 0.15614928305149078, + "learning_rate": 5e-05, + "loss": 1.6131, + "step": 2351 + }, + { + "epoch": 0.6431501230516817, + "grad_norm": 0.16520023345947266, + "learning_rate": 5e-05, + "loss": 1.6631, + "step": 2352 + }, + { + "epoch": 0.6434235712332513, + "grad_norm": 0.164496511220932, + "learning_rate": 5e-05, + "loss": 1.6092, + "step": 2353 + }, + { + "epoch": 0.6436970194148209, + "grad_norm": 0.16124795377254486, + "learning_rate": 5e-05, + "loss": 1.5647, + "step": 2354 + }, + { + "epoch": 0.6439704675963905, + "grad_norm": 0.1593211442232132, + "learning_rate": 5e-05, + "loss": 1.5575, + "step": 2355 + }, + { + "epoch": 0.64424391577796, + "grad_norm": 0.15209631621837616, + "learning_rate": 5e-05, + "loss": 1.5939, + "step": 2356 + }, + { + "epoch": 0.6445173639595296, + "grad_norm": 0.1607188880443573, + "learning_rate": 5e-05, + "loss": 1.6497, + "step": 2357 + }, + { + "epoch": 0.6447908121410992, + "grad_norm": 0.1642126739025116, + "learning_rate": 5e-05, + "loss": 1.6879, + "step": 2358 + }, + { + "epoch": 0.6450642603226688, + "grad_norm": 0.15646396577358246, + "learning_rate": 5e-05, + "loss": 1.5971, + "step": 2359 + }, + { + "epoch": 0.6453377085042384, + "grad_norm": 0.16889351606369019, + "learning_rate": 5e-05, + "loss": 1.6108, + "step": 2360 + }, + { + "epoch": 0.645611156685808, + "grad_norm": 0.15696920454502106, + "learning_rate": 5e-05, + "loss": 1.6325, + "step": 2361 + }, + { + "epoch": 0.6458846048673776, + "grad_norm": 0.17429569363594055, + "learning_rate": 5e-05, + "loss": 1.6995, + "step": 2362 + }, + { + "epoch": 0.6461580530489472, + "grad_norm": 0.16672879457473755, + "learning_rate": 5e-05, + "loss": 1.6846, + "step": 2363 + }, + { + "epoch": 0.6464315012305168, + "grad_norm": 0.15300826728343964, + "learning_rate": 5e-05, + "loss": 1.5222, + "step": 2364 + }, + { + "epoch": 0.6467049494120864, + "grad_norm": 0.16159702837467194, + "learning_rate": 5e-05, + "loss": 1.6988, + "step": 2365 + }, + { + "epoch": 0.646978397593656, + "grad_norm": 0.1599888950586319, + "learning_rate": 5e-05, + "loss": 1.6864, + "step": 2366 + }, + { + "epoch": 0.6472518457752255, + "grad_norm": 0.15492287278175354, + "learning_rate": 5e-05, + "loss": 1.5455, + "step": 2367 + }, + { + "epoch": 0.6475252939567951, + "grad_norm": 0.16030439734458923, + "learning_rate": 5e-05, + "loss": 1.6159, + "step": 2368 + }, + { + "epoch": 0.6477987421383647, + "grad_norm": 0.16615253686904907, + "learning_rate": 5e-05, + "loss": 1.6585, + "step": 2369 + }, + { + "epoch": 0.6480721903199343, + "grad_norm": 0.15947528183460236, + "learning_rate": 5e-05, + "loss": 1.5924, + "step": 2370 + }, + { + "epoch": 0.648345638501504, + "grad_norm": 0.1585034430027008, + "learning_rate": 5e-05, + "loss": 1.5964, + "step": 2371 + }, + { + "epoch": 0.6486190866830736, + "grad_norm": 0.15772749483585358, + "learning_rate": 5e-05, + "loss": 1.5869, + "step": 2372 + }, + { + "epoch": 0.6488925348646432, + "grad_norm": 0.16060665249824524, + "learning_rate": 5e-05, + "loss": 1.6111, + "step": 2373 + }, + { + "epoch": 0.6491659830462128, + "grad_norm": 0.15704810619354248, + "learning_rate": 5e-05, + "loss": 1.6257, + "step": 2374 + }, + { + "epoch": 0.6494394312277824, + "grad_norm": 0.16538581252098083, + "learning_rate": 5e-05, + "loss": 1.5543, + "step": 2375 + }, + { + "epoch": 0.649712879409352, + "grad_norm": 0.15830865502357483, + "learning_rate": 5e-05, + "loss": 1.6593, + "step": 2376 + }, + { + "epoch": 0.6499863275909216, + "grad_norm": 0.17541712522506714, + "learning_rate": 5e-05, + "loss": 1.8108, + "step": 2377 + }, + { + "epoch": 0.6502597757724912, + "grad_norm": 0.15826934576034546, + "learning_rate": 5e-05, + "loss": 1.6312, + "step": 2378 + }, + { + "epoch": 0.6505332239540607, + "grad_norm": 0.1586860567331314, + "learning_rate": 5e-05, + "loss": 1.6282, + "step": 2379 + }, + { + "epoch": 0.6508066721356303, + "grad_norm": 0.15930341184139252, + "learning_rate": 5e-05, + "loss": 1.6039, + "step": 2380 + }, + { + "epoch": 0.6510801203171999, + "grad_norm": 0.16288048028945923, + "learning_rate": 5e-05, + "loss": 1.64, + "step": 2381 + }, + { + "epoch": 0.6513535684987695, + "grad_norm": 0.15390625596046448, + "learning_rate": 5e-05, + "loss": 1.5283, + "step": 2382 + }, + { + "epoch": 0.6516270166803391, + "grad_norm": 0.15672685205936432, + "learning_rate": 5e-05, + "loss": 1.5796, + "step": 2383 + }, + { + "epoch": 0.6519004648619087, + "grad_norm": 0.16665740311145782, + "learning_rate": 5e-05, + "loss": 1.6128, + "step": 2384 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 0.16864562034606934, + "learning_rate": 5e-05, + "loss": 1.5693, + "step": 2385 + }, + { + "epoch": 0.6524473612250479, + "grad_norm": 0.17148347198963165, + "learning_rate": 5e-05, + "loss": 1.6832, + "step": 2386 + }, + { + "epoch": 0.6527208094066175, + "grad_norm": 0.15762430429458618, + "learning_rate": 5e-05, + "loss": 1.5791, + "step": 2387 + }, + { + "epoch": 0.6529942575881871, + "grad_norm": 0.16125012934207916, + "learning_rate": 5e-05, + "loss": 1.6339, + "step": 2388 + }, + { + "epoch": 0.6532677057697567, + "grad_norm": 0.17223118245601654, + "learning_rate": 5e-05, + "loss": 1.6022, + "step": 2389 + }, + { + "epoch": 0.6535411539513262, + "grad_norm": 0.16604146361351013, + "learning_rate": 5e-05, + "loss": 1.6289, + "step": 2390 + }, + { + "epoch": 0.6538146021328958, + "grad_norm": 0.16918644309043884, + "learning_rate": 5e-05, + "loss": 1.6586, + "step": 2391 + }, + { + "epoch": 0.6540880503144654, + "grad_norm": 0.16766510903835297, + "learning_rate": 5e-05, + "loss": 1.6329, + "step": 2392 + }, + { + "epoch": 0.654361498496035, + "grad_norm": 0.1656326800584793, + "learning_rate": 5e-05, + "loss": 1.6551, + "step": 2393 + }, + { + "epoch": 0.6546349466776046, + "grad_norm": 0.16642306745052338, + "learning_rate": 5e-05, + "loss": 1.589, + "step": 2394 + }, + { + "epoch": 0.6549083948591742, + "grad_norm": 0.15996260941028595, + "learning_rate": 5e-05, + "loss": 1.5731, + "step": 2395 + }, + { + "epoch": 0.6551818430407438, + "grad_norm": 0.16654618084430695, + "learning_rate": 5e-05, + "loss": 1.7088, + "step": 2396 + }, + { + "epoch": 0.6554552912223134, + "grad_norm": 0.16626910865306854, + "learning_rate": 5e-05, + "loss": 1.5951, + "step": 2397 + }, + { + "epoch": 0.655728739403883, + "grad_norm": 0.16408471763134003, + "learning_rate": 5e-05, + "loss": 1.5914, + "step": 2398 + }, + { + "epoch": 0.6560021875854526, + "grad_norm": 0.15765917301177979, + "learning_rate": 5e-05, + "loss": 1.5525, + "step": 2399 + }, + { + "epoch": 0.6562756357670222, + "grad_norm": 0.15567491948604584, + "learning_rate": 5e-05, + "loss": 1.5724, + "step": 2400 + }, + { + "epoch": 0.6565490839485918, + "grad_norm": 0.1593087911605835, + "learning_rate": 5e-05, + "loss": 1.6831, + "step": 2401 + }, + { + "epoch": 0.6568225321301613, + "grad_norm": 0.17593322694301605, + "learning_rate": 5e-05, + "loss": 1.6298, + "step": 2402 + }, + { + "epoch": 0.6570959803117309, + "grad_norm": 0.16059966385364532, + "learning_rate": 5e-05, + "loss": 1.6232, + "step": 2403 + }, + { + "epoch": 0.6573694284933005, + "grad_norm": 0.15784414112567902, + "learning_rate": 5e-05, + "loss": 1.5765, + "step": 2404 + }, + { + "epoch": 0.6576428766748701, + "grad_norm": 0.16477881371974945, + "learning_rate": 5e-05, + "loss": 1.6309, + "step": 2405 + }, + { + "epoch": 0.6579163248564397, + "grad_norm": 0.15968461334705353, + "learning_rate": 5e-05, + "loss": 1.5916, + "step": 2406 + }, + { + "epoch": 0.6581897730380093, + "grad_norm": 0.15735867619514465, + "learning_rate": 5e-05, + "loss": 1.5926, + "step": 2407 + }, + { + "epoch": 0.6584632212195789, + "grad_norm": 0.15733473002910614, + "learning_rate": 5e-05, + "loss": 1.6399, + "step": 2408 + }, + { + "epoch": 0.6587366694011485, + "grad_norm": 0.16281574964523315, + "learning_rate": 5e-05, + "loss": 1.7522, + "step": 2409 + }, + { + "epoch": 0.6590101175827181, + "grad_norm": 0.17003872990608215, + "learning_rate": 5e-05, + "loss": 1.6451, + "step": 2410 + }, + { + "epoch": 0.6592835657642877, + "grad_norm": 0.15980958938598633, + "learning_rate": 5e-05, + "loss": 1.6791, + "step": 2411 + }, + { + "epoch": 0.6595570139458573, + "grad_norm": 0.15881507098674774, + "learning_rate": 5e-05, + "loss": 1.6218, + "step": 2412 + }, + { + "epoch": 0.6598304621274268, + "grad_norm": 0.1568790227174759, + "learning_rate": 5e-05, + "loss": 1.6652, + "step": 2413 + }, + { + "epoch": 0.6601039103089964, + "grad_norm": 0.16632792353630066, + "learning_rate": 5e-05, + "loss": 1.6607, + "step": 2414 + }, + { + "epoch": 0.660377358490566, + "grad_norm": 0.15554708242416382, + "learning_rate": 5e-05, + "loss": 1.6423, + "step": 2415 + }, + { + "epoch": 0.6606508066721356, + "grad_norm": 0.16607338190078735, + "learning_rate": 5e-05, + "loss": 1.5636, + "step": 2416 + }, + { + "epoch": 0.6609242548537052, + "grad_norm": 0.1500883400440216, + "learning_rate": 5e-05, + "loss": 1.5665, + "step": 2417 + }, + { + "epoch": 0.6611977030352748, + "grad_norm": 0.1519675999879837, + "learning_rate": 5e-05, + "loss": 1.5243, + "step": 2418 + }, + { + "epoch": 0.6614711512168444, + "grad_norm": 0.1638784110546112, + "learning_rate": 5e-05, + "loss": 1.6462, + "step": 2419 + }, + { + "epoch": 0.661744599398414, + "grad_norm": 0.15792463719844818, + "learning_rate": 5e-05, + "loss": 1.5241, + "step": 2420 + }, + { + "epoch": 0.6620180475799836, + "grad_norm": 0.15407374501228333, + "learning_rate": 5e-05, + "loss": 1.5822, + "step": 2421 + }, + { + "epoch": 0.6622914957615532, + "grad_norm": 0.1588737964630127, + "learning_rate": 5e-05, + "loss": 1.7011, + "step": 2422 + }, + { + "epoch": 0.6625649439431228, + "grad_norm": 0.15952999889850616, + "learning_rate": 5e-05, + "loss": 1.5825, + "step": 2423 + }, + { + "epoch": 0.6628383921246924, + "grad_norm": 0.1603320688009262, + "learning_rate": 5e-05, + "loss": 1.5174, + "step": 2424 + }, + { + "epoch": 0.6631118403062619, + "grad_norm": 0.1592378169298172, + "learning_rate": 5e-05, + "loss": 1.5717, + "step": 2425 + }, + { + "epoch": 0.6633852884878315, + "grad_norm": 0.15770871937274933, + "learning_rate": 5e-05, + "loss": 1.603, + "step": 2426 + }, + { + "epoch": 0.6636587366694011, + "grad_norm": 0.14396196603775024, + "learning_rate": 5e-05, + "loss": 1.4309, + "step": 2427 + }, + { + "epoch": 0.6639321848509707, + "grad_norm": 0.15671992301940918, + "learning_rate": 5e-05, + "loss": 1.6278, + "step": 2428 + }, + { + "epoch": 0.6642056330325403, + "grad_norm": 0.157623752951622, + "learning_rate": 5e-05, + "loss": 1.5608, + "step": 2429 + }, + { + "epoch": 0.6644790812141099, + "grad_norm": 0.1747284233570099, + "learning_rate": 5e-05, + "loss": 1.6987, + "step": 2430 + }, + { + "epoch": 0.6647525293956795, + "grad_norm": 0.16402825713157654, + "learning_rate": 5e-05, + "loss": 1.6901, + "step": 2431 + }, + { + "epoch": 0.6650259775772491, + "grad_norm": 0.15663176774978638, + "learning_rate": 5e-05, + "loss": 1.5185, + "step": 2432 + }, + { + "epoch": 0.6652994257588187, + "grad_norm": 0.14828836917877197, + "learning_rate": 5e-05, + "loss": 1.525, + "step": 2433 + }, + { + "epoch": 0.6655728739403883, + "grad_norm": 0.15145589411258698, + "learning_rate": 5e-05, + "loss": 1.5436, + "step": 2434 + }, + { + "epoch": 0.6658463221219579, + "grad_norm": 0.166320338845253, + "learning_rate": 5e-05, + "loss": 1.6605, + "step": 2435 + }, + { + "epoch": 0.6661197703035275, + "grad_norm": 0.15976975858211517, + "learning_rate": 5e-05, + "loss": 1.6091, + "step": 2436 + }, + { + "epoch": 0.666393218485097, + "grad_norm": 0.1547703742980957, + "learning_rate": 5e-05, + "loss": 1.594, + "step": 2437 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.15781809389591217, + "learning_rate": 5e-05, + "loss": 1.6204, + "step": 2438 + }, + { + "epoch": 0.6669401148482362, + "grad_norm": 0.16567228734493256, + "learning_rate": 5e-05, + "loss": 1.5822, + "step": 2439 + }, + { + "epoch": 0.6672135630298058, + "grad_norm": 0.16364511847496033, + "learning_rate": 5e-05, + "loss": 1.606, + "step": 2440 + }, + { + "epoch": 0.6674870112113754, + "grad_norm": 0.17084093391895294, + "learning_rate": 5e-05, + "loss": 1.6213, + "step": 2441 + }, + { + "epoch": 0.667760459392945, + "grad_norm": 0.16154657304286957, + "learning_rate": 5e-05, + "loss": 1.5344, + "step": 2442 + }, + { + "epoch": 0.6680339075745146, + "grad_norm": 0.1677396446466446, + "learning_rate": 5e-05, + "loss": 1.5043, + "step": 2443 + }, + { + "epoch": 0.6683073557560842, + "grad_norm": 0.16632121801376343, + "learning_rate": 5e-05, + "loss": 1.6781, + "step": 2444 + }, + { + "epoch": 0.6685808039376538, + "grad_norm": 0.16119202971458435, + "learning_rate": 5e-05, + "loss": 1.5862, + "step": 2445 + }, + { + "epoch": 0.6688542521192234, + "grad_norm": 0.16189579665660858, + "learning_rate": 5e-05, + "loss": 1.5532, + "step": 2446 + }, + { + "epoch": 0.669127700300793, + "grad_norm": 0.16972647607326508, + "learning_rate": 5e-05, + "loss": 1.634, + "step": 2447 + }, + { + "epoch": 0.6694011484823625, + "grad_norm": 0.1550510823726654, + "learning_rate": 5e-05, + "loss": 1.6588, + "step": 2448 + }, + { + "epoch": 0.6696745966639321, + "grad_norm": 0.16665950417518616, + "learning_rate": 5e-05, + "loss": 1.6175, + "step": 2449 + }, + { + "epoch": 0.6699480448455017, + "grad_norm": 0.17640990018844604, + "learning_rate": 5e-05, + "loss": 1.5892, + "step": 2450 + }, + { + "epoch": 0.6702214930270713, + "grad_norm": 0.1729980856180191, + "learning_rate": 5e-05, + "loss": 1.6211, + "step": 2451 + }, + { + "epoch": 0.670494941208641, + "grad_norm": 0.1840631514787674, + "learning_rate": 5e-05, + "loss": 1.6796, + "step": 2452 + }, + { + "epoch": 0.6707683893902106, + "grad_norm": 0.16333408653736115, + "learning_rate": 5e-05, + "loss": 1.6035, + "step": 2453 + }, + { + "epoch": 0.6710418375717802, + "grad_norm": 0.15522709488868713, + "learning_rate": 5e-05, + "loss": 1.5799, + "step": 2454 + }, + { + "epoch": 0.6713152857533498, + "grad_norm": 0.1594085544347763, + "learning_rate": 5e-05, + "loss": 1.6033, + "step": 2455 + }, + { + "epoch": 0.6715887339349194, + "grad_norm": 0.174288809299469, + "learning_rate": 5e-05, + "loss": 1.5725, + "step": 2456 + }, + { + "epoch": 0.671862182116489, + "grad_norm": 0.1599048525094986, + "learning_rate": 5e-05, + "loss": 1.5534, + "step": 2457 + }, + { + "epoch": 0.6721356302980586, + "grad_norm": 0.17705102264881134, + "learning_rate": 5e-05, + "loss": 1.6193, + "step": 2458 + }, + { + "epoch": 0.6724090784796282, + "grad_norm": 0.17394457757472992, + "learning_rate": 5e-05, + "loss": 1.5991, + "step": 2459 + }, + { + "epoch": 0.6726825266611977, + "grad_norm": 0.17613448202610016, + "learning_rate": 5e-05, + "loss": 1.6804, + "step": 2460 + }, + { + "epoch": 0.6729559748427673, + "grad_norm": 0.1572728008031845, + "learning_rate": 5e-05, + "loss": 1.6653, + "step": 2461 + }, + { + "epoch": 0.6732294230243369, + "grad_norm": 0.17541466653347015, + "learning_rate": 5e-05, + "loss": 1.5699, + "step": 2462 + }, + { + "epoch": 0.6735028712059065, + "grad_norm": 0.1598716527223587, + "learning_rate": 5e-05, + "loss": 1.5847, + "step": 2463 + }, + { + "epoch": 0.6737763193874761, + "grad_norm": 0.16459186375141144, + "learning_rate": 5e-05, + "loss": 1.5796, + "step": 2464 + }, + { + "epoch": 0.6740497675690457, + "grad_norm": 0.16075541079044342, + "learning_rate": 5e-05, + "loss": 1.6633, + "step": 2465 + }, + { + "epoch": 0.6743232157506153, + "grad_norm": 0.16022174060344696, + "learning_rate": 5e-05, + "loss": 1.574, + "step": 2466 + }, + { + "epoch": 0.6745966639321849, + "grad_norm": 0.16279369592666626, + "learning_rate": 5e-05, + "loss": 1.6139, + "step": 2467 + }, + { + "epoch": 0.6748701121137545, + "grad_norm": 0.15538600087165833, + "learning_rate": 5e-05, + "loss": 1.6165, + "step": 2468 + }, + { + "epoch": 0.6751435602953241, + "grad_norm": 0.15767286717891693, + "learning_rate": 5e-05, + "loss": 1.6236, + "step": 2469 + }, + { + "epoch": 0.6754170084768937, + "grad_norm": 0.15616439282894135, + "learning_rate": 5e-05, + "loss": 1.6096, + "step": 2470 + }, + { + "epoch": 0.6756904566584632, + "grad_norm": 0.1593167930841446, + "learning_rate": 5e-05, + "loss": 1.5664, + "step": 2471 + }, + { + "epoch": 0.6759639048400328, + "grad_norm": 0.15188159048557281, + "learning_rate": 5e-05, + "loss": 1.6384, + "step": 2472 + }, + { + "epoch": 0.6762373530216024, + "grad_norm": 0.1587432473897934, + "learning_rate": 5e-05, + "loss": 1.6228, + "step": 2473 + }, + { + "epoch": 0.676510801203172, + "grad_norm": 0.15247757732868195, + "learning_rate": 5e-05, + "loss": 1.4628, + "step": 2474 + }, + { + "epoch": 0.6767842493847416, + "grad_norm": 0.1572297066450119, + "learning_rate": 5e-05, + "loss": 1.5996, + "step": 2475 + }, + { + "epoch": 0.6770576975663112, + "grad_norm": 0.1596899777650833, + "learning_rate": 5e-05, + "loss": 1.6908, + "step": 2476 + }, + { + "epoch": 0.6773311457478808, + "grad_norm": 0.15775376558303833, + "learning_rate": 5e-05, + "loss": 1.5248, + "step": 2477 + }, + { + "epoch": 0.6776045939294504, + "grad_norm": 0.16116631031036377, + "learning_rate": 5e-05, + "loss": 1.5714, + "step": 2478 + }, + { + "epoch": 0.67787804211102, + "grad_norm": 0.15727153420448303, + "learning_rate": 5e-05, + "loss": 1.5822, + "step": 2479 + }, + { + "epoch": 0.6781514902925896, + "grad_norm": 0.15772849321365356, + "learning_rate": 5e-05, + "loss": 1.64, + "step": 2480 + }, + { + "epoch": 0.6784249384741592, + "grad_norm": 0.15587928891181946, + "learning_rate": 5e-05, + "loss": 1.5437, + "step": 2481 + }, + { + "epoch": 0.6786983866557288, + "grad_norm": 0.16047807037830353, + "learning_rate": 5e-05, + "loss": 1.5917, + "step": 2482 + }, + { + "epoch": 0.6789718348372983, + "grad_norm": 0.1718113124370575, + "learning_rate": 5e-05, + "loss": 1.6332, + "step": 2483 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 0.1675024926662445, + "learning_rate": 5e-05, + "loss": 1.6186, + "step": 2484 + }, + { + "epoch": 0.6795187312004375, + "grad_norm": 0.16398410499095917, + "learning_rate": 5e-05, + "loss": 1.6521, + "step": 2485 + }, + { + "epoch": 0.6797921793820071, + "grad_norm": 0.18062223494052887, + "learning_rate": 5e-05, + "loss": 1.5848, + "step": 2486 + }, + { + "epoch": 0.6800656275635767, + "grad_norm": 0.15654748678207397, + "learning_rate": 5e-05, + "loss": 1.5452, + "step": 2487 + }, + { + "epoch": 0.6803390757451463, + "grad_norm": 0.1599476933479309, + "learning_rate": 5e-05, + "loss": 1.6255, + "step": 2488 + }, + { + "epoch": 0.6806125239267159, + "grad_norm": 0.16449692845344543, + "learning_rate": 5e-05, + "loss": 1.6717, + "step": 2489 + }, + { + "epoch": 0.6808859721082855, + "grad_norm": 0.16174077987670898, + "learning_rate": 5e-05, + "loss": 1.5248, + "step": 2490 + }, + { + "epoch": 0.6811594202898551, + "grad_norm": 0.1614287793636322, + "learning_rate": 5e-05, + "loss": 1.5408, + "step": 2491 + }, + { + "epoch": 0.6814328684714247, + "grad_norm": 0.1738748550415039, + "learning_rate": 5e-05, + "loss": 1.6189, + "step": 2492 + }, + { + "epoch": 0.6817063166529943, + "grad_norm": 0.1673515886068344, + "learning_rate": 5e-05, + "loss": 1.68, + "step": 2493 + }, + { + "epoch": 0.6819797648345638, + "grad_norm": 0.16801506280899048, + "learning_rate": 5e-05, + "loss": 1.6737, + "step": 2494 + }, + { + "epoch": 0.6822532130161334, + "grad_norm": 0.1836167722940445, + "learning_rate": 5e-05, + "loss": 1.6566, + "step": 2495 + }, + { + "epoch": 0.682526661197703, + "grad_norm": 0.1564912497997284, + "learning_rate": 5e-05, + "loss": 1.5649, + "step": 2496 + }, + { + "epoch": 0.6828001093792726, + "grad_norm": 0.161629781126976, + "learning_rate": 5e-05, + "loss": 1.6278, + "step": 2497 + }, + { + "epoch": 0.6830735575608422, + "grad_norm": 0.18047556281089783, + "learning_rate": 5e-05, + "loss": 1.623, + "step": 2498 + }, + { + "epoch": 0.6833470057424118, + "grad_norm": 0.1663326621055603, + "learning_rate": 5e-05, + "loss": 1.6222, + "step": 2499 + }, + { + "epoch": 0.6836204539239814, + "grad_norm": 0.16856786608695984, + "learning_rate": 5e-05, + "loss": 1.6573, + "step": 2500 + }, + { + "epoch": 0.683893902105551, + "grad_norm": 0.17234691977500916, + "learning_rate": 5e-05, + "loss": 1.6514, + "step": 2501 + }, + { + "epoch": 0.6841673502871206, + "grad_norm": 0.15840747952461243, + "learning_rate": 5e-05, + "loss": 1.5428, + "step": 2502 + }, + { + "epoch": 0.6844407984686902, + "grad_norm": 0.17479360103607178, + "learning_rate": 5e-05, + "loss": 1.6271, + "step": 2503 + }, + { + "epoch": 0.6847142466502598, + "grad_norm": 0.16867142915725708, + "learning_rate": 5e-05, + "loss": 1.5434, + "step": 2504 + }, + { + "epoch": 0.6849876948318294, + "grad_norm": 0.16198626160621643, + "learning_rate": 5e-05, + "loss": 1.6075, + "step": 2505 + }, + { + "epoch": 0.6852611430133989, + "grad_norm": 0.16754168272018433, + "learning_rate": 5e-05, + "loss": 1.7111, + "step": 2506 + }, + { + "epoch": 0.6855345911949685, + "grad_norm": 0.1667136549949646, + "learning_rate": 5e-05, + "loss": 1.582, + "step": 2507 + }, + { + "epoch": 0.6858080393765381, + "grad_norm": 0.15942475199699402, + "learning_rate": 5e-05, + "loss": 1.6081, + "step": 2508 + }, + { + "epoch": 0.6860814875581077, + "grad_norm": 0.1539759784936905, + "learning_rate": 5e-05, + "loss": 1.5774, + "step": 2509 + }, + { + "epoch": 0.6863549357396773, + "grad_norm": 0.1718335896730423, + "learning_rate": 5e-05, + "loss": 1.7125, + "step": 2510 + }, + { + "epoch": 0.6866283839212469, + "grad_norm": 0.16027317941188812, + "learning_rate": 5e-05, + "loss": 1.634, + "step": 2511 + }, + { + "epoch": 0.6869018321028165, + "grad_norm": 0.1722375601530075, + "learning_rate": 5e-05, + "loss": 1.6105, + "step": 2512 + }, + { + "epoch": 0.6871752802843861, + "grad_norm": 0.15657594799995422, + "learning_rate": 5e-05, + "loss": 1.5136, + "step": 2513 + }, + { + "epoch": 0.6874487284659557, + "grad_norm": 0.16483189165592194, + "learning_rate": 5e-05, + "loss": 1.629, + "step": 2514 + }, + { + "epoch": 0.6877221766475253, + "grad_norm": 0.1574966162443161, + "learning_rate": 5e-05, + "loss": 1.5831, + "step": 2515 + }, + { + "epoch": 0.6879956248290949, + "grad_norm": 0.16608235239982605, + "learning_rate": 5e-05, + "loss": 1.6769, + "step": 2516 + }, + { + "epoch": 0.6882690730106645, + "grad_norm": 0.1636161059141159, + "learning_rate": 5e-05, + "loss": 1.6551, + "step": 2517 + }, + { + "epoch": 0.688542521192234, + "grad_norm": 0.15895555913448334, + "learning_rate": 5e-05, + "loss": 1.5672, + "step": 2518 + }, + { + "epoch": 0.6888159693738036, + "grad_norm": 0.17441825568675995, + "learning_rate": 5e-05, + "loss": 1.6559, + "step": 2519 + }, + { + "epoch": 0.6890894175553732, + "grad_norm": 0.15818622708320618, + "learning_rate": 5e-05, + "loss": 1.6146, + "step": 2520 + }, + { + "epoch": 0.6893628657369428, + "grad_norm": 0.16466504335403442, + "learning_rate": 5e-05, + "loss": 1.6309, + "step": 2521 + }, + { + "epoch": 0.6896363139185124, + "grad_norm": 0.1637655645608902, + "learning_rate": 5e-05, + "loss": 1.6369, + "step": 2522 + }, + { + "epoch": 0.689909762100082, + "grad_norm": 0.17039266228675842, + "learning_rate": 5e-05, + "loss": 1.6315, + "step": 2523 + }, + { + "epoch": 0.6901832102816516, + "grad_norm": 0.1624341756105423, + "learning_rate": 5e-05, + "loss": 1.5878, + "step": 2524 + }, + { + "epoch": 0.6904566584632212, + "grad_norm": 0.17213301360607147, + "learning_rate": 5e-05, + "loss": 1.6135, + "step": 2525 + }, + { + "epoch": 0.6907301066447908, + "grad_norm": 0.17695990204811096, + "learning_rate": 5e-05, + "loss": 1.6008, + "step": 2526 + }, + { + "epoch": 0.6910035548263604, + "grad_norm": 0.1731920838356018, + "learning_rate": 5e-05, + "loss": 1.7317, + "step": 2527 + }, + { + "epoch": 0.69127700300793, + "grad_norm": 0.16777953505516052, + "learning_rate": 5e-05, + "loss": 1.608, + "step": 2528 + }, + { + "epoch": 0.6915504511894995, + "grad_norm": 0.16122910380363464, + "learning_rate": 5e-05, + "loss": 1.5529, + "step": 2529 + }, + { + "epoch": 0.6918238993710691, + "grad_norm": 0.16527071595191956, + "learning_rate": 5e-05, + "loss": 1.621, + "step": 2530 + }, + { + "epoch": 0.6920973475526387, + "grad_norm": 0.1596560925245285, + "learning_rate": 5e-05, + "loss": 1.6079, + "step": 2531 + }, + { + "epoch": 0.6923707957342083, + "grad_norm": 0.15807481110095978, + "learning_rate": 5e-05, + "loss": 1.5349, + "step": 2532 + }, + { + "epoch": 0.692644243915778, + "grad_norm": 0.16306331753730774, + "learning_rate": 5e-05, + "loss": 1.6461, + "step": 2533 + }, + { + "epoch": 0.6929176920973475, + "grad_norm": 0.16564686596393585, + "learning_rate": 5e-05, + "loss": 1.5821, + "step": 2534 + }, + { + "epoch": 0.6931911402789172, + "grad_norm": 0.1704019457101822, + "learning_rate": 5e-05, + "loss": 1.5851, + "step": 2535 + }, + { + "epoch": 0.6934645884604868, + "grad_norm": 0.15806765854358673, + "learning_rate": 5e-05, + "loss": 1.6363, + "step": 2536 + }, + { + "epoch": 0.6937380366420564, + "grad_norm": 0.15724487602710724, + "learning_rate": 5e-05, + "loss": 1.608, + "step": 2537 + }, + { + "epoch": 0.694011484823626, + "grad_norm": 0.16402623057365417, + "learning_rate": 5e-05, + "loss": 1.5813, + "step": 2538 + }, + { + "epoch": 0.6942849330051956, + "grad_norm": 0.15913555026054382, + "learning_rate": 5e-05, + "loss": 1.6284, + "step": 2539 + }, + { + "epoch": 0.6945583811867652, + "grad_norm": 0.16074508428573608, + "learning_rate": 5e-05, + "loss": 1.528, + "step": 2540 + }, + { + "epoch": 0.6948318293683347, + "grad_norm": 0.15752652287483215, + "learning_rate": 5e-05, + "loss": 1.5023, + "step": 2541 + }, + { + "epoch": 0.6951052775499043, + "grad_norm": 0.15707096457481384, + "learning_rate": 5e-05, + "loss": 1.6251, + "step": 2542 + }, + { + "epoch": 0.6953787257314739, + "grad_norm": 0.15630987286567688, + "learning_rate": 5e-05, + "loss": 1.5003, + "step": 2543 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.15872447192668915, + "learning_rate": 5e-05, + "loss": 1.5698, + "step": 2544 + }, + { + "epoch": 0.6959256220946131, + "grad_norm": 0.16033059358596802, + "learning_rate": 5e-05, + "loss": 1.5535, + "step": 2545 + }, + { + "epoch": 0.6961990702761827, + "grad_norm": 0.1571992188692093, + "learning_rate": 5e-05, + "loss": 1.573, + "step": 2546 + }, + { + "epoch": 0.6964725184577523, + "grad_norm": 0.15957675874233246, + "learning_rate": 5e-05, + "loss": 1.6449, + "step": 2547 + }, + { + "epoch": 0.6967459666393219, + "grad_norm": 0.17184406518936157, + "learning_rate": 5e-05, + "loss": 1.7537, + "step": 2548 + }, + { + "epoch": 0.6970194148208915, + "grad_norm": 0.1593320220708847, + "learning_rate": 5e-05, + "loss": 1.6327, + "step": 2549 + }, + { + "epoch": 0.6972928630024611, + "grad_norm": 0.16837316751480103, + "learning_rate": 5e-05, + "loss": 1.6125, + "step": 2550 + }, + { + "epoch": 0.6975663111840307, + "grad_norm": 0.1569320261478424, + "learning_rate": 5e-05, + "loss": 1.6095, + "step": 2551 + }, + { + "epoch": 0.6978397593656002, + "grad_norm": 0.1526784896850586, + "learning_rate": 5e-05, + "loss": 1.5959, + "step": 2552 + }, + { + "epoch": 0.6981132075471698, + "grad_norm": 0.16312266886234283, + "learning_rate": 5e-05, + "loss": 1.6402, + "step": 2553 + }, + { + "epoch": 0.6983866557287394, + "grad_norm": 0.16702058911323547, + "learning_rate": 5e-05, + "loss": 1.5529, + "step": 2554 + }, + { + "epoch": 0.698660103910309, + "grad_norm": 0.16597692668437958, + "learning_rate": 5e-05, + "loss": 1.6619, + "step": 2555 + }, + { + "epoch": 0.6989335520918786, + "grad_norm": 0.16194956004619598, + "learning_rate": 5e-05, + "loss": 1.7461, + "step": 2556 + }, + { + "epoch": 0.6992070002734482, + "grad_norm": 0.15624722838401794, + "learning_rate": 5e-05, + "loss": 1.6271, + "step": 2557 + }, + { + "epoch": 0.6994804484550178, + "grad_norm": 0.15689989924430847, + "learning_rate": 5e-05, + "loss": 1.5491, + "step": 2558 + }, + { + "epoch": 0.6997538966365874, + "grad_norm": 0.1616222858428955, + "learning_rate": 5e-05, + "loss": 1.5906, + "step": 2559 + }, + { + "epoch": 0.700027344818157, + "grad_norm": 0.15930163860321045, + "learning_rate": 5e-05, + "loss": 1.5583, + "step": 2560 + }, + { + "epoch": 0.7003007929997266, + "grad_norm": 0.16471946239471436, + "learning_rate": 5e-05, + "loss": 1.6327, + "step": 2561 + }, + { + "epoch": 0.7005742411812962, + "grad_norm": 0.16374364495277405, + "learning_rate": 5e-05, + "loss": 1.6228, + "step": 2562 + }, + { + "epoch": 0.7008476893628658, + "grad_norm": 0.1631327122449875, + "learning_rate": 5e-05, + "loss": 1.6547, + "step": 2563 + }, + { + "epoch": 0.7011211375444353, + "grad_norm": 0.15994025766849518, + "learning_rate": 5e-05, + "loss": 1.5833, + "step": 2564 + }, + { + "epoch": 0.7013945857260049, + "grad_norm": 0.16121256351470947, + "learning_rate": 5e-05, + "loss": 1.6305, + "step": 2565 + }, + { + "epoch": 0.7016680339075745, + "grad_norm": 0.16354970633983612, + "learning_rate": 5e-05, + "loss": 1.5552, + "step": 2566 + }, + { + "epoch": 0.7019414820891441, + "grad_norm": 0.161824032664299, + "learning_rate": 5e-05, + "loss": 1.6749, + "step": 2567 + }, + { + "epoch": 0.7022149302707137, + "grad_norm": 0.17040131986141205, + "learning_rate": 5e-05, + "loss": 1.6291, + "step": 2568 + }, + { + "epoch": 0.7024883784522833, + "grad_norm": 0.1799769103527069, + "learning_rate": 5e-05, + "loss": 1.657, + "step": 2569 + }, + { + "epoch": 0.7027618266338529, + "grad_norm": 0.15718813240528107, + "learning_rate": 5e-05, + "loss": 1.5858, + "step": 2570 + }, + { + "epoch": 0.7030352748154225, + "grad_norm": 0.16162623465061188, + "learning_rate": 5e-05, + "loss": 1.6128, + "step": 2571 + }, + { + "epoch": 0.7033087229969921, + "grad_norm": 0.16257646679878235, + "learning_rate": 5e-05, + "loss": 1.624, + "step": 2572 + }, + { + "epoch": 0.7035821711785617, + "grad_norm": 0.16675184667110443, + "learning_rate": 5e-05, + "loss": 1.6738, + "step": 2573 + }, + { + "epoch": 0.7038556193601313, + "grad_norm": 0.17329177260398865, + "learning_rate": 5e-05, + "loss": 1.6952, + "step": 2574 + }, + { + "epoch": 0.7041290675417008, + "grad_norm": 0.15674902498722076, + "learning_rate": 5e-05, + "loss": 1.5858, + "step": 2575 + }, + { + "epoch": 0.7044025157232704, + "grad_norm": 0.15690606832504272, + "learning_rate": 5e-05, + "loss": 1.5772, + "step": 2576 + }, + { + "epoch": 0.70467596390484, + "grad_norm": 0.16210415959358215, + "learning_rate": 5e-05, + "loss": 1.6705, + "step": 2577 + }, + { + "epoch": 0.7049494120864096, + "grad_norm": 0.16525010764598846, + "learning_rate": 5e-05, + "loss": 1.6188, + "step": 2578 + }, + { + "epoch": 0.7052228602679792, + "grad_norm": 0.1738116294145584, + "learning_rate": 5e-05, + "loss": 1.5788, + "step": 2579 + }, + { + "epoch": 0.7054963084495488, + "grad_norm": 0.1632319688796997, + "learning_rate": 5e-05, + "loss": 1.6327, + "step": 2580 + }, + { + "epoch": 0.7057697566311184, + "grad_norm": 0.16341067850589752, + "learning_rate": 5e-05, + "loss": 1.6567, + "step": 2581 + }, + { + "epoch": 0.706043204812688, + "grad_norm": 0.17945139110088348, + "learning_rate": 5e-05, + "loss": 1.6919, + "step": 2582 + }, + { + "epoch": 0.7063166529942576, + "grad_norm": 0.15467429161071777, + "learning_rate": 5e-05, + "loss": 1.618, + "step": 2583 + }, + { + "epoch": 0.7065901011758272, + "grad_norm": 0.16300177574157715, + "learning_rate": 5e-05, + "loss": 1.6526, + "step": 2584 + }, + { + "epoch": 0.7068635493573968, + "grad_norm": 0.15929557383060455, + "learning_rate": 5e-05, + "loss": 1.5512, + "step": 2585 + }, + { + "epoch": 0.7071369975389664, + "grad_norm": 0.16156993806362152, + "learning_rate": 5e-05, + "loss": 1.6352, + "step": 2586 + }, + { + "epoch": 0.7074104457205359, + "grad_norm": 0.15560369193553925, + "learning_rate": 5e-05, + "loss": 1.5727, + "step": 2587 + }, + { + "epoch": 0.7076838939021055, + "grad_norm": 0.16337203979492188, + "learning_rate": 5e-05, + "loss": 1.561, + "step": 2588 + }, + { + "epoch": 0.7079573420836751, + "grad_norm": 0.1646268218755722, + "learning_rate": 5e-05, + "loss": 1.5965, + "step": 2589 + }, + { + "epoch": 0.7082307902652447, + "grad_norm": 0.16066445410251617, + "learning_rate": 5e-05, + "loss": 1.6132, + "step": 2590 + }, + { + "epoch": 0.7085042384468143, + "grad_norm": 0.14791713654994965, + "learning_rate": 5e-05, + "loss": 1.5, + "step": 2591 + }, + { + "epoch": 0.7087776866283839, + "grad_norm": 0.15416625142097473, + "learning_rate": 5e-05, + "loss": 1.5718, + "step": 2592 + }, + { + "epoch": 0.7090511348099535, + "grad_norm": 0.1604636311531067, + "learning_rate": 5e-05, + "loss": 1.6064, + "step": 2593 + }, + { + "epoch": 0.7093245829915231, + "grad_norm": 0.15781953930854797, + "learning_rate": 5e-05, + "loss": 1.5769, + "step": 2594 + }, + { + "epoch": 0.7095980311730927, + "grad_norm": 0.15604273974895477, + "learning_rate": 5e-05, + "loss": 1.588, + "step": 2595 + }, + { + "epoch": 0.7098714793546623, + "grad_norm": 0.15518909692764282, + "learning_rate": 5e-05, + "loss": 1.59, + "step": 2596 + }, + { + "epoch": 0.7101449275362319, + "grad_norm": 0.16104191541671753, + "learning_rate": 5e-05, + "loss": 1.6747, + "step": 2597 + }, + { + "epoch": 0.7104183757178015, + "grad_norm": 0.16382278501987457, + "learning_rate": 5e-05, + "loss": 1.6737, + "step": 2598 + }, + { + "epoch": 0.710691823899371, + "grad_norm": 0.16717152297496796, + "learning_rate": 5e-05, + "loss": 1.6239, + "step": 2599 + }, + { + "epoch": 0.7109652720809406, + "grad_norm": 0.16594311594963074, + "learning_rate": 5e-05, + "loss": 1.7318, + "step": 2600 + }, + { + "epoch": 0.7112387202625102, + "grad_norm": 0.16552476584911346, + "learning_rate": 5e-05, + "loss": 1.7376, + "step": 2601 + }, + { + "epoch": 0.7115121684440798, + "grad_norm": 0.17463426291942596, + "learning_rate": 5e-05, + "loss": 1.6286, + "step": 2602 + }, + { + "epoch": 0.7117856166256494, + "grad_norm": 0.159657821059227, + "learning_rate": 5e-05, + "loss": 1.5664, + "step": 2603 + }, + { + "epoch": 0.712059064807219, + "grad_norm": 0.16044557094573975, + "learning_rate": 5e-05, + "loss": 1.5265, + "step": 2604 + }, + { + "epoch": 0.7123325129887886, + "grad_norm": 0.15536926686763763, + "learning_rate": 5e-05, + "loss": 1.5381, + "step": 2605 + }, + { + "epoch": 0.7126059611703582, + "grad_norm": 0.1617160588502884, + "learning_rate": 5e-05, + "loss": 1.6025, + "step": 2606 + }, + { + "epoch": 0.7128794093519278, + "grad_norm": 0.16367921233177185, + "learning_rate": 5e-05, + "loss": 1.7194, + "step": 2607 + }, + { + "epoch": 0.7131528575334974, + "grad_norm": 0.16112937033176422, + "learning_rate": 5e-05, + "loss": 1.4914, + "step": 2608 + }, + { + "epoch": 0.713426305715067, + "grad_norm": 0.16444730758666992, + "learning_rate": 5e-05, + "loss": 1.625, + "step": 2609 + }, + { + "epoch": 0.7136997538966365, + "grad_norm": 0.16794142127037048, + "learning_rate": 5e-05, + "loss": 1.6348, + "step": 2610 + }, + { + "epoch": 0.7139732020782061, + "grad_norm": 0.1628817617893219, + "learning_rate": 5e-05, + "loss": 1.6083, + "step": 2611 + }, + { + "epoch": 0.7142466502597757, + "grad_norm": 0.18700134754180908, + "learning_rate": 5e-05, + "loss": 1.6996, + "step": 2612 + }, + { + "epoch": 0.7145200984413453, + "grad_norm": 0.15966500341892242, + "learning_rate": 5e-05, + "loss": 1.6113, + "step": 2613 + }, + { + "epoch": 0.7147935466229149, + "grad_norm": 0.18392211198806763, + "learning_rate": 5e-05, + "loss": 1.6337, + "step": 2614 + }, + { + "epoch": 0.7150669948044845, + "grad_norm": 0.17130230367183685, + "learning_rate": 5e-05, + "loss": 1.6511, + "step": 2615 + }, + { + "epoch": 0.7153404429860541, + "grad_norm": 0.16397760808467865, + "learning_rate": 5e-05, + "loss": 1.6283, + "step": 2616 + }, + { + "epoch": 0.7156138911676238, + "grad_norm": 0.16686753928661346, + "learning_rate": 5e-05, + "loss": 1.6066, + "step": 2617 + }, + { + "epoch": 0.7158873393491934, + "grad_norm": 0.1645701676607132, + "learning_rate": 5e-05, + "loss": 1.6268, + "step": 2618 + }, + { + "epoch": 0.716160787530763, + "grad_norm": 0.1619407832622528, + "learning_rate": 5e-05, + "loss": 1.6309, + "step": 2619 + }, + { + "epoch": 0.7164342357123326, + "grad_norm": 0.17755883932113647, + "learning_rate": 5e-05, + "loss": 1.6246, + "step": 2620 + }, + { + "epoch": 0.7167076838939022, + "grad_norm": 0.17687168717384338, + "learning_rate": 5e-05, + "loss": 1.6039, + "step": 2621 + }, + { + "epoch": 0.7169811320754716, + "grad_norm": 0.16282087564468384, + "learning_rate": 5e-05, + "loss": 1.5709, + "step": 2622 + }, + { + "epoch": 0.7172545802570413, + "grad_norm": 0.1705690324306488, + "learning_rate": 5e-05, + "loss": 1.4957, + "step": 2623 + }, + { + "epoch": 0.7175280284386109, + "grad_norm": 0.17099738121032715, + "learning_rate": 5e-05, + "loss": 1.5331, + "step": 2624 + }, + { + "epoch": 0.7178014766201805, + "grad_norm": 0.1703016757965088, + "learning_rate": 5e-05, + "loss": 1.6962, + "step": 2625 + }, + { + "epoch": 0.7180749248017501, + "grad_norm": 0.1631249189376831, + "learning_rate": 5e-05, + "loss": 1.5764, + "step": 2626 + }, + { + "epoch": 0.7183483729833197, + "grad_norm": 0.1670769453048706, + "learning_rate": 5e-05, + "loss": 1.5328, + "step": 2627 + }, + { + "epoch": 0.7186218211648893, + "grad_norm": 0.15617656707763672, + "learning_rate": 5e-05, + "loss": 1.5934, + "step": 2628 + }, + { + "epoch": 0.7188952693464589, + "grad_norm": 0.15787526965141296, + "learning_rate": 5e-05, + "loss": 1.5847, + "step": 2629 + }, + { + "epoch": 0.7191687175280285, + "grad_norm": 0.160158172249794, + "learning_rate": 5e-05, + "loss": 1.5974, + "step": 2630 + }, + { + "epoch": 0.7194421657095981, + "grad_norm": 0.16433510184288025, + "learning_rate": 5e-05, + "loss": 1.573, + "step": 2631 + }, + { + "epoch": 0.7197156138911677, + "grad_norm": 0.15415172278881073, + "learning_rate": 5e-05, + "loss": 1.5221, + "step": 2632 + }, + { + "epoch": 0.7199890620727372, + "grad_norm": 0.16252250969409943, + "learning_rate": 5e-05, + "loss": 1.6093, + "step": 2633 + }, + { + "epoch": 0.7202625102543068, + "grad_norm": 0.1922326236963272, + "learning_rate": 5e-05, + "loss": 1.5648, + "step": 2634 + }, + { + "epoch": 0.7205359584358764, + "grad_norm": 0.16628074645996094, + "learning_rate": 5e-05, + "loss": 1.5623, + "step": 2635 + }, + { + "epoch": 0.720809406617446, + "grad_norm": 0.19099196791648865, + "learning_rate": 5e-05, + "loss": 1.6423, + "step": 2636 + }, + { + "epoch": 0.7210828547990156, + "grad_norm": 0.20283576846122742, + "learning_rate": 5e-05, + "loss": 1.6107, + "step": 2637 + }, + { + "epoch": 0.7213563029805852, + "grad_norm": 0.15855461359024048, + "learning_rate": 5e-05, + "loss": 1.6438, + "step": 2638 + }, + { + "epoch": 0.7216297511621548, + "grad_norm": 0.21607458591461182, + "learning_rate": 5e-05, + "loss": 1.6484, + "step": 2639 + }, + { + "epoch": 0.7219031993437244, + "grad_norm": 0.1732168048620224, + "learning_rate": 5e-05, + "loss": 1.5945, + "step": 2640 + }, + { + "epoch": 0.722176647525294, + "grad_norm": 0.18066106736660004, + "learning_rate": 5e-05, + "loss": 1.6576, + "step": 2641 + }, + { + "epoch": 0.7224500957068636, + "grad_norm": 0.1668356955051422, + "learning_rate": 5e-05, + "loss": 1.6666, + "step": 2642 + }, + { + "epoch": 0.7227235438884332, + "grad_norm": 0.17484630644321442, + "learning_rate": 5e-05, + "loss": 1.5694, + "step": 2643 + }, + { + "epoch": 0.7229969920700028, + "grad_norm": 0.16421037912368774, + "learning_rate": 5e-05, + "loss": 1.5838, + "step": 2644 + }, + { + "epoch": 0.7232704402515723, + "grad_norm": 0.16860564053058624, + "learning_rate": 5e-05, + "loss": 1.6144, + "step": 2645 + }, + { + "epoch": 0.7235438884331419, + "grad_norm": 0.15611010789871216, + "learning_rate": 5e-05, + "loss": 1.5933, + "step": 2646 + }, + { + "epoch": 0.7238173366147115, + "grad_norm": 0.1699523776769638, + "learning_rate": 5e-05, + "loss": 1.6102, + "step": 2647 + }, + { + "epoch": 0.7240907847962811, + "grad_norm": 0.15928548574447632, + "learning_rate": 5e-05, + "loss": 1.5433, + "step": 2648 + }, + { + "epoch": 0.7243642329778507, + "grad_norm": 0.15467146039009094, + "learning_rate": 5e-05, + "loss": 1.5628, + "step": 2649 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 0.1566615253686905, + "learning_rate": 5e-05, + "loss": 1.6284, + "step": 2650 + }, + { + "epoch": 0.7249111293409899, + "grad_norm": 0.1829017996788025, + "learning_rate": 5e-05, + "loss": 1.5719, + "step": 2651 + }, + { + "epoch": 0.7251845775225595, + "grad_norm": 0.20479872822761536, + "learning_rate": 5e-05, + "loss": 1.483, + "step": 2652 + }, + { + "epoch": 0.7254580257041291, + "grad_norm": 0.16582070291042328, + "learning_rate": 5e-05, + "loss": 1.6091, + "step": 2653 + }, + { + "epoch": 0.7257314738856987, + "grad_norm": 0.16045349836349487, + "learning_rate": 5e-05, + "loss": 1.603, + "step": 2654 + }, + { + "epoch": 0.7260049220672683, + "grad_norm": 0.21731440722942352, + "learning_rate": 5e-05, + "loss": 1.5638, + "step": 2655 + }, + { + "epoch": 0.7262783702488378, + "grad_norm": 0.16208265721797943, + "learning_rate": 5e-05, + "loss": 1.5898, + "step": 2656 + }, + { + "epoch": 0.7265518184304074, + "grad_norm": 0.1588432937860489, + "learning_rate": 5e-05, + "loss": 1.5425, + "step": 2657 + }, + { + "epoch": 0.726825266611977, + "grad_norm": 0.1615055352449417, + "learning_rate": 5e-05, + "loss": 1.6128, + "step": 2658 + }, + { + "epoch": 0.7270987147935466, + "grad_norm": 0.14946486055850983, + "learning_rate": 5e-05, + "loss": 1.5624, + "step": 2659 + }, + { + "epoch": 0.7273721629751162, + "grad_norm": 0.15998360514640808, + "learning_rate": 5e-05, + "loss": 1.7129, + "step": 2660 + }, + { + "epoch": 0.7276456111566858, + "grad_norm": 0.15510539710521698, + "learning_rate": 5e-05, + "loss": 1.6051, + "step": 2661 + }, + { + "epoch": 0.7279190593382554, + "grad_norm": 0.15238802134990692, + "learning_rate": 5e-05, + "loss": 1.5503, + "step": 2662 + }, + { + "epoch": 0.728192507519825, + "grad_norm": 0.16379080712795258, + "learning_rate": 5e-05, + "loss": 1.5657, + "step": 2663 + }, + { + "epoch": 0.7284659557013946, + "grad_norm": 0.1604292243719101, + "learning_rate": 5e-05, + "loss": 1.5491, + "step": 2664 + }, + { + "epoch": 0.7287394038829642, + "grad_norm": 0.16709019243717194, + "learning_rate": 5e-05, + "loss": 1.5678, + "step": 2665 + }, + { + "epoch": 0.7290128520645338, + "grad_norm": 0.15374033153057098, + "learning_rate": 5e-05, + "loss": 1.5231, + "step": 2666 + }, + { + "epoch": 0.7292863002461034, + "grad_norm": 0.165635883808136, + "learning_rate": 5e-05, + "loss": 1.6576, + "step": 2667 + }, + { + "epoch": 0.7295597484276729, + "grad_norm": 0.15312857925891876, + "learning_rate": 5e-05, + "loss": 1.5964, + "step": 2668 + }, + { + "epoch": 0.7298331966092425, + "grad_norm": 0.16065338253974915, + "learning_rate": 5e-05, + "loss": 1.6054, + "step": 2669 + }, + { + "epoch": 0.7301066447908121, + "grad_norm": 0.16220992803573608, + "learning_rate": 5e-05, + "loss": 1.6551, + "step": 2670 + }, + { + "epoch": 0.7303800929723817, + "grad_norm": 0.16524186730384827, + "learning_rate": 5e-05, + "loss": 1.5719, + "step": 2671 + }, + { + "epoch": 0.7306535411539513, + "grad_norm": 0.1688978374004364, + "learning_rate": 5e-05, + "loss": 1.5573, + "step": 2672 + }, + { + "epoch": 0.7309269893355209, + "grad_norm": 0.16838806867599487, + "learning_rate": 5e-05, + "loss": 1.6978, + "step": 2673 + }, + { + "epoch": 0.7312004375170905, + "grad_norm": 0.15754219889640808, + "learning_rate": 5e-05, + "loss": 1.5535, + "step": 2674 + }, + { + "epoch": 0.7314738856986601, + "grad_norm": 0.17631256580352783, + "learning_rate": 5e-05, + "loss": 1.6562, + "step": 2675 + }, + { + "epoch": 0.7317473338802297, + "grad_norm": 0.17020723223686218, + "learning_rate": 5e-05, + "loss": 1.6649, + "step": 2676 + }, + { + "epoch": 0.7320207820617993, + "grad_norm": 0.17932482063770294, + "learning_rate": 5e-05, + "loss": 1.6102, + "step": 2677 + }, + { + "epoch": 0.7322942302433689, + "grad_norm": 0.1644601821899414, + "learning_rate": 5e-05, + "loss": 1.5042, + "step": 2678 + }, + { + "epoch": 0.7325676784249385, + "grad_norm": 0.18472503125667572, + "learning_rate": 5e-05, + "loss": 1.5805, + "step": 2679 + }, + { + "epoch": 0.732841126606508, + "grad_norm": 0.16369123756885529, + "learning_rate": 5e-05, + "loss": 1.6079, + "step": 2680 + }, + { + "epoch": 0.7331145747880776, + "grad_norm": 0.16636909544467926, + "learning_rate": 5e-05, + "loss": 1.5607, + "step": 2681 + }, + { + "epoch": 0.7333880229696472, + "grad_norm": 0.16647064685821533, + "learning_rate": 5e-05, + "loss": 1.5819, + "step": 2682 + }, + { + "epoch": 0.7336614711512168, + "grad_norm": 0.17708119750022888, + "learning_rate": 5e-05, + "loss": 1.6366, + "step": 2683 + }, + { + "epoch": 0.7339349193327864, + "grad_norm": 0.15985938906669617, + "learning_rate": 5e-05, + "loss": 1.6187, + "step": 2684 + }, + { + "epoch": 0.734208367514356, + "grad_norm": 0.1802692860364914, + "learning_rate": 5e-05, + "loss": 1.5459, + "step": 2685 + }, + { + "epoch": 0.7344818156959256, + "grad_norm": 0.156411275267601, + "learning_rate": 5e-05, + "loss": 1.5549, + "step": 2686 + }, + { + "epoch": 0.7347552638774952, + "grad_norm": 0.1642201840877533, + "learning_rate": 5e-05, + "loss": 1.6463, + "step": 2687 + }, + { + "epoch": 0.7350287120590648, + "grad_norm": 0.16783371567726135, + "learning_rate": 5e-05, + "loss": 1.5811, + "step": 2688 + }, + { + "epoch": 0.7353021602406344, + "grad_norm": 0.15901906788349152, + "learning_rate": 5e-05, + "loss": 1.5787, + "step": 2689 + }, + { + "epoch": 0.735575608422204, + "grad_norm": 0.1638929694890976, + "learning_rate": 5e-05, + "loss": 1.6526, + "step": 2690 + }, + { + "epoch": 0.7358490566037735, + "grad_norm": 0.15809820592403412, + "learning_rate": 5e-05, + "loss": 1.6082, + "step": 2691 + }, + { + "epoch": 0.7361225047853431, + "grad_norm": 0.1644376665353775, + "learning_rate": 5e-05, + "loss": 1.5305, + "step": 2692 + }, + { + "epoch": 0.7363959529669127, + "grad_norm": 0.16332222521305084, + "learning_rate": 5e-05, + "loss": 1.6534, + "step": 2693 + }, + { + "epoch": 0.7366694011484823, + "grad_norm": 0.18329092860221863, + "learning_rate": 5e-05, + "loss": 1.603, + "step": 2694 + }, + { + "epoch": 0.7369428493300519, + "grad_norm": 0.1716667115688324, + "learning_rate": 5e-05, + "loss": 1.7023, + "step": 2695 + }, + { + "epoch": 0.7372162975116215, + "grad_norm": 0.16460417211055756, + "learning_rate": 5e-05, + "loss": 1.6165, + "step": 2696 + }, + { + "epoch": 0.7374897456931911, + "grad_norm": 0.16401362419128418, + "learning_rate": 5e-05, + "loss": 1.5556, + "step": 2697 + }, + { + "epoch": 0.7377631938747607, + "grad_norm": 0.15876410901546478, + "learning_rate": 5e-05, + "loss": 1.5701, + "step": 2698 + }, + { + "epoch": 0.7380366420563303, + "grad_norm": 0.18006089329719543, + "learning_rate": 5e-05, + "loss": 1.6061, + "step": 2699 + }, + { + "epoch": 0.7383100902379, + "grad_norm": 0.16289302706718445, + "learning_rate": 5e-05, + "loss": 1.6073, + "step": 2700 + }, + { + "epoch": 0.7385835384194696, + "grad_norm": 0.15855976939201355, + "learning_rate": 5e-05, + "loss": 1.5566, + "step": 2701 + }, + { + "epoch": 0.7388569866010392, + "grad_norm": 0.16138465702533722, + "learning_rate": 5e-05, + "loss": 1.6222, + "step": 2702 + }, + { + "epoch": 0.7391304347826086, + "grad_norm": 0.1679658591747284, + "learning_rate": 5e-05, + "loss": 1.6472, + "step": 2703 + }, + { + "epoch": 0.7394038829641782, + "grad_norm": 0.16092632710933685, + "learning_rate": 5e-05, + "loss": 1.6278, + "step": 2704 + }, + { + "epoch": 0.7396773311457479, + "grad_norm": 0.17156141996383667, + "learning_rate": 5e-05, + "loss": 1.6397, + "step": 2705 + }, + { + "epoch": 0.7399507793273175, + "grad_norm": 0.17136305570602417, + "learning_rate": 5e-05, + "loss": 1.6552, + "step": 2706 + }, + { + "epoch": 0.7402242275088871, + "grad_norm": 0.16168268024921417, + "learning_rate": 5e-05, + "loss": 1.5059, + "step": 2707 + }, + { + "epoch": 0.7404976756904567, + "grad_norm": 0.16185155510902405, + "learning_rate": 5e-05, + "loss": 1.5465, + "step": 2708 + }, + { + "epoch": 0.7407711238720263, + "grad_norm": 0.15480349957942963, + "learning_rate": 5e-05, + "loss": 1.5585, + "step": 2709 + }, + { + "epoch": 0.7410445720535959, + "grad_norm": 0.16501635313034058, + "learning_rate": 5e-05, + "loss": 1.6614, + "step": 2710 + }, + { + "epoch": 0.7413180202351655, + "grad_norm": 0.15960432589054108, + "learning_rate": 5e-05, + "loss": 1.5774, + "step": 2711 + }, + { + "epoch": 0.7415914684167351, + "grad_norm": 0.16600382328033447, + "learning_rate": 5e-05, + "loss": 1.6463, + "step": 2712 + }, + { + "epoch": 0.7418649165983047, + "grad_norm": 0.16479994356632233, + "learning_rate": 5e-05, + "loss": 1.6348, + "step": 2713 + }, + { + "epoch": 0.7421383647798742, + "grad_norm": 0.15985356271266937, + "learning_rate": 5e-05, + "loss": 1.5811, + "step": 2714 + }, + { + "epoch": 0.7424118129614438, + "grad_norm": 0.16183726489543915, + "learning_rate": 5e-05, + "loss": 1.6204, + "step": 2715 + }, + { + "epoch": 0.7426852611430134, + "grad_norm": 0.1608124077320099, + "learning_rate": 5e-05, + "loss": 1.5919, + "step": 2716 + }, + { + "epoch": 0.742958709324583, + "grad_norm": 0.16460396349430084, + "learning_rate": 5e-05, + "loss": 1.5784, + "step": 2717 + }, + { + "epoch": 0.7432321575061526, + "grad_norm": 0.1570330709218979, + "learning_rate": 5e-05, + "loss": 1.577, + "step": 2718 + }, + { + "epoch": 0.7435056056877222, + "grad_norm": 0.17111632227897644, + "learning_rate": 5e-05, + "loss": 1.5384, + "step": 2719 + }, + { + "epoch": 0.7437790538692918, + "grad_norm": 0.1649581640958786, + "learning_rate": 5e-05, + "loss": 1.5283, + "step": 2720 + }, + { + "epoch": 0.7440525020508614, + "grad_norm": 0.16623181104660034, + "learning_rate": 5e-05, + "loss": 1.6578, + "step": 2721 + }, + { + "epoch": 0.744325950232431, + "grad_norm": 0.16478942334651947, + "learning_rate": 5e-05, + "loss": 1.6588, + "step": 2722 + }, + { + "epoch": 0.7445993984140006, + "grad_norm": 0.16344524919986725, + "learning_rate": 5e-05, + "loss": 1.6124, + "step": 2723 + }, + { + "epoch": 0.7448728465955702, + "grad_norm": 0.1609012335538864, + "learning_rate": 5e-05, + "loss": 1.6005, + "step": 2724 + }, + { + "epoch": 0.7451462947771398, + "grad_norm": 0.17075218260288239, + "learning_rate": 5e-05, + "loss": 1.7217, + "step": 2725 + }, + { + "epoch": 0.7454197429587093, + "grad_norm": 0.16447240114212036, + "learning_rate": 5e-05, + "loss": 1.6142, + "step": 2726 + }, + { + "epoch": 0.7456931911402789, + "grad_norm": 0.15842659771442413, + "learning_rate": 5e-05, + "loss": 1.566, + "step": 2727 + }, + { + "epoch": 0.7459666393218485, + "grad_norm": 0.1670447736978531, + "learning_rate": 5e-05, + "loss": 1.6299, + "step": 2728 + }, + { + "epoch": 0.7462400875034181, + "grad_norm": 0.1572618931531906, + "learning_rate": 5e-05, + "loss": 1.5555, + "step": 2729 + }, + { + "epoch": 0.7465135356849877, + "grad_norm": 0.16439321637153625, + "learning_rate": 5e-05, + "loss": 1.6149, + "step": 2730 + }, + { + "epoch": 0.7467869838665573, + "grad_norm": 0.15821623802185059, + "learning_rate": 5e-05, + "loss": 1.6452, + "step": 2731 + }, + { + "epoch": 0.7470604320481269, + "grad_norm": 0.16016224026679993, + "learning_rate": 5e-05, + "loss": 1.5371, + "step": 2732 + }, + { + "epoch": 0.7473338802296965, + "grad_norm": 0.1666966676712036, + "learning_rate": 5e-05, + "loss": 1.5912, + "step": 2733 + }, + { + "epoch": 0.7476073284112661, + "grad_norm": 0.1630515605211258, + "learning_rate": 5e-05, + "loss": 1.5903, + "step": 2734 + }, + { + "epoch": 0.7478807765928357, + "grad_norm": 0.1738271415233612, + "learning_rate": 5e-05, + "loss": 1.6498, + "step": 2735 + }, + { + "epoch": 0.7481542247744053, + "grad_norm": 0.16823971271514893, + "learning_rate": 5e-05, + "loss": 1.6878, + "step": 2736 + }, + { + "epoch": 0.7484276729559748, + "grad_norm": 0.16174787282943726, + "learning_rate": 5e-05, + "loss": 1.5538, + "step": 2737 + }, + { + "epoch": 0.7487011211375444, + "grad_norm": 0.15992052853107452, + "learning_rate": 5e-05, + "loss": 1.5307, + "step": 2738 + }, + { + "epoch": 0.748974569319114, + "grad_norm": 0.17191752791404724, + "learning_rate": 5e-05, + "loss": 1.6057, + "step": 2739 + }, + { + "epoch": 0.7492480175006836, + "grad_norm": 0.16447550058364868, + "learning_rate": 5e-05, + "loss": 1.6311, + "step": 2740 + }, + { + "epoch": 0.7495214656822532, + "grad_norm": 0.16319876909255981, + "learning_rate": 5e-05, + "loss": 1.5762, + "step": 2741 + }, + { + "epoch": 0.7497949138638228, + "grad_norm": 0.16644878685474396, + "learning_rate": 5e-05, + "loss": 1.5566, + "step": 2742 + }, + { + "epoch": 0.7500683620453924, + "grad_norm": 0.16339637339115143, + "learning_rate": 5e-05, + "loss": 1.6049, + "step": 2743 + }, + { + "epoch": 0.750341810226962, + "grad_norm": 0.1628854125738144, + "learning_rate": 5e-05, + "loss": 1.5755, + "step": 2744 + }, + { + "epoch": 0.7506152584085316, + "grad_norm": 0.15712201595306396, + "learning_rate": 5e-05, + "loss": 1.5888, + "step": 2745 + }, + { + "epoch": 0.7508887065901012, + "grad_norm": 0.15863806009292603, + "learning_rate": 5e-05, + "loss": 1.5662, + "step": 2746 + }, + { + "epoch": 0.7511621547716708, + "grad_norm": 0.16975349187850952, + "learning_rate": 5e-05, + "loss": 1.6032, + "step": 2747 + }, + { + "epoch": 0.7514356029532404, + "grad_norm": 0.15435737371444702, + "learning_rate": 5e-05, + "loss": 1.5288, + "step": 2748 + }, + { + "epoch": 0.7517090511348099, + "grad_norm": 0.16747300326824188, + "learning_rate": 5e-05, + "loss": 1.59, + "step": 2749 + }, + { + "epoch": 0.7519824993163795, + "grad_norm": 0.17562800645828247, + "learning_rate": 5e-05, + "loss": 1.6135, + "step": 2750 + }, + { + "epoch": 0.7522559474979491, + "grad_norm": 0.1631203293800354, + "learning_rate": 5e-05, + "loss": 1.6197, + "step": 2751 + }, + { + "epoch": 0.7525293956795187, + "grad_norm": 0.17813941836357117, + "learning_rate": 5e-05, + "loss": 1.607, + "step": 2752 + }, + { + "epoch": 0.7528028438610883, + "grad_norm": 0.15916167199611664, + "learning_rate": 5e-05, + "loss": 1.6042, + "step": 2753 + }, + { + "epoch": 0.7530762920426579, + "grad_norm": 0.18315176665782928, + "learning_rate": 5e-05, + "loss": 1.6809, + "step": 2754 + }, + { + "epoch": 0.7533497402242275, + "grad_norm": 0.16371318697929382, + "learning_rate": 5e-05, + "loss": 1.58, + "step": 2755 + }, + { + "epoch": 0.7536231884057971, + "grad_norm": 0.1591997742652893, + "learning_rate": 5e-05, + "loss": 1.5595, + "step": 2756 + }, + { + "epoch": 0.7538966365873667, + "grad_norm": 0.1630491465330124, + "learning_rate": 5e-05, + "loss": 1.6129, + "step": 2757 + }, + { + "epoch": 0.7541700847689363, + "grad_norm": 0.1691833734512329, + "learning_rate": 5e-05, + "loss": 1.6539, + "step": 2758 + }, + { + "epoch": 0.7544435329505059, + "grad_norm": 0.16124382615089417, + "learning_rate": 5e-05, + "loss": 1.6068, + "step": 2759 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.16120778024196625, + "learning_rate": 5e-05, + "loss": 1.6477, + "step": 2760 + }, + { + "epoch": 0.754990429313645, + "grad_norm": 0.16382825374603271, + "learning_rate": 5e-05, + "loss": 1.6095, + "step": 2761 + }, + { + "epoch": 0.7552638774952146, + "grad_norm": 0.16977065801620483, + "learning_rate": 5e-05, + "loss": 1.6844, + "step": 2762 + }, + { + "epoch": 0.7555373256767842, + "grad_norm": 0.16113348305225372, + "learning_rate": 5e-05, + "loss": 1.6008, + "step": 2763 + }, + { + "epoch": 0.7558107738583538, + "grad_norm": 0.17500002682209015, + "learning_rate": 5e-05, + "loss": 1.5729, + "step": 2764 + }, + { + "epoch": 0.7560842220399234, + "grad_norm": 0.16505929827690125, + "learning_rate": 5e-05, + "loss": 1.5522, + "step": 2765 + }, + { + "epoch": 0.756357670221493, + "grad_norm": 0.16675904393196106, + "learning_rate": 5e-05, + "loss": 1.6734, + "step": 2766 + }, + { + "epoch": 0.7566311184030626, + "grad_norm": 0.20538048446178436, + "learning_rate": 5e-05, + "loss": 1.6777, + "step": 2767 + }, + { + "epoch": 0.7569045665846322, + "grad_norm": 0.17019402980804443, + "learning_rate": 5e-05, + "loss": 1.6098, + "step": 2768 + }, + { + "epoch": 0.7571780147662018, + "grad_norm": 0.18559719622135162, + "learning_rate": 5e-05, + "loss": 1.6489, + "step": 2769 + }, + { + "epoch": 0.7574514629477714, + "grad_norm": 0.16804775595664978, + "learning_rate": 5e-05, + "loss": 1.5647, + "step": 2770 + }, + { + "epoch": 0.757724911129341, + "grad_norm": 0.1561867594718933, + "learning_rate": 5e-05, + "loss": 1.5402, + "step": 2771 + }, + { + "epoch": 0.7579983593109105, + "grad_norm": 0.1730283796787262, + "learning_rate": 5e-05, + "loss": 1.6805, + "step": 2772 + }, + { + "epoch": 0.7582718074924801, + "grad_norm": 0.17423667013645172, + "learning_rate": 5e-05, + "loss": 1.6453, + "step": 2773 + }, + { + "epoch": 0.7585452556740497, + "grad_norm": 0.1615607887506485, + "learning_rate": 5e-05, + "loss": 1.5584, + "step": 2774 + }, + { + "epoch": 0.7588187038556193, + "grad_norm": 0.16401253640651703, + "learning_rate": 5e-05, + "loss": 1.5841, + "step": 2775 + }, + { + "epoch": 0.7590921520371889, + "grad_norm": 0.1691298633813858, + "learning_rate": 5e-05, + "loss": 1.6887, + "step": 2776 + }, + { + "epoch": 0.7593656002187585, + "grad_norm": 0.15560850501060486, + "learning_rate": 5e-05, + "loss": 1.4923, + "step": 2777 + }, + { + "epoch": 0.7596390484003281, + "grad_norm": 0.1585303395986557, + "learning_rate": 5e-05, + "loss": 1.5857, + "step": 2778 + }, + { + "epoch": 0.7599124965818977, + "grad_norm": 0.16869275271892548, + "learning_rate": 5e-05, + "loss": 1.5873, + "step": 2779 + }, + { + "epoch": 0.7601859447634673, + "grad_norm": 0.17015528678894043, + "learning_rate": 5e-05, + "loss": 1.7419, + "step": 2780 + }, + { + "epoch": 0.760459392945037, + "grad_norm": 0.1577947735786438, + "learning_rate": 5e-05, + "loss": 1.5961, + "step": 2781 + }, + { + "epoch": 0.7607328411266066, + "grad_norm": 0.16766513884067535, + "learning_rate": 5e-05, + "loss": 1.6623, + "step": 2782 + }, + { + "epoch": 0.7610062893081762, + "grad_norm": 0.15825945138931274, + "learning_rate": 5e-05, + "loss": 1.6, + "step": 2783 + }, + { + "epoch": 0.7612797374897456, + "grad_norm": 0.16338619589805603, + "learning_rate": 5e-05, + "loss": 1.677, + "step": 2784 + }, + { + "epoch": 0.7615531856713152, + "grad_norm": 0.16439124941825867, + "learning_rate": 5e-05, + "loss": 1.6444, + "step": 2785 + }, + { + "epoch": 0.7618266338528848, + "grad_norm": 0.1587378978729248, + "learning_rate": 5e-05, + "loss": 1.6477, + "step": 2786 + }, + { + "epoch": 0.7621000820344545, + "grad_norm": 0.16001760959625244, + "learning_rate": 5e-05, + "loss": 1.638, + "step": 2787 + }, + { + "epoch": 0.762373530216024, + "grad_norm": 0.16488565504550934, + "learning_rate": 5e-05, + "loss": 1.5758, + "step": 2788 + }, + { + "epoch": 0.7626469783975937, + "grad_norm": 0.17125697433948517, + "learning_rate": 5e-05, + "loss": 1.6488, + "step": 2789 + }, + { + "epoch": 0.7629204265791633, + "grad_norm": 0.16552145779132843, + "learning_rate": 5e-05, + "loss": 1.6288, + "step": 2790 + }, + { + "epoch": 0.7631938747607329, + "grad_norm": 0.16142013669013977, + "learning_rate": 5e-05, + "loss": 1.5717, + "step": 2791 + }, + { + "epoch": 0.7634673229423025, + "grad_norm": 0.17424419522285461, + "learning_rate": 5e-05, + "loss": 1.6741, + "step": 2792 + }, + { + "epoch": 0.7637407711238721, + "grad_norm": 0.1656377911567688, + "learning_rate": 5e-05, + "loss": 1.5975, + "step": 2793 + }, + { + "epoch": 0.7640142193054417, + "grad_norm": 0.1610795110464096, + "learning_rate": 5e-05, + "loss": 1.6058, + "step": 2794 + }, + { + "epoch": 0.7642876674870112, + "grad_norm": 0.160441592335701, + "learning_rate": 5e-05, + "loss": 1.5343, + "step": 2795 + }, + { + "epoch": 0.7645611156685808, + "grad_norm": 0.16838759183883667, + "learning_rate": 5e-05, + "loss": 1.5573, + "step": 2796 + }, + { + "epoch": 0.7648345638501504, + "grad_norm": 0.16315698623657227, + "learning_rate": 5e-05, + "loss": 1.5484, + "step": 2797 + }, + { + "epoch": 0.76510801203172, + "grad_norm": 0.17179346084594727, + "learning_rate": 5e-05, + "loss": 1.7635, + "step": 2798 + }, + { + "epoch": 0.7653814602132896, + "grad_norm": 0.15899762511253357, + "learning_rate": 5e-05, + "loss": 1.645, + "step": 2799 + }, + { + "epoch": 0.7656549083948592, + "grad_norm": 0.1585722267627716, + "learning_rate": 5e-05, + "loss": 1.5657, + "step": 2800 + }, + { + "epoch": 0.7659283565764288, + "grad_norm": 0.16446919739246368, + "learning_rate": 5e-05, + "loss": 1.6099, + "step": 2801 + }, + { + "epoch": 0.7662018047579984, + "grad_norm": 0.16195422410964966, + "learning_rate": 5e-05, + "loss": 1.5312, + "step": 2802 + }, + { + "epoch": 0.766475252939568, + "grad_norm": 0.1638331413269043, + "learning_rate": 5e-05, + "loss": 1.573, + "step": 2803 + }, + { + "epoch": 0.7667487011211376, + "grad_norm": 0.16117534041404724, + "learning_rate": 5e-05, + "loss": 1.5535, + "step": 2804 + }, + { + "epoch": 0.7670221493027072, + "grad_norm": 0.1906486600637436, + "learning_rate": 5e-05, + "loss": 1.6745, + "step": 2805 + }, + { + "epoch": 0.7672955974842768, + "grad_norm": 0.1637091189622879, + "learning_rate": 5e-05, + "loss": 1.6076, + "step": 2806 + }, + { + "epoch": 0.7675690456658463, + "grad_norm": 0.16975869238376617, + "learning_rate": 5e-05, + "loss": 1.5807, + "step": 2807 + }, + { + "epoch": 0.7678424938474159, + "grad_norm": 0.16839958727359772, + "learning_rate": 5e-05, + "loss": 1.5136, + "step": 2808 + }, + { + "epoch": 0.7681159420289855, + "grad_norm": 0.16886620223522186, + "learning_rate": 5e-05, + "loss": 1.6107, + "step": 2809 + }, + { + "epoch": 0.7683893902105551, + "grad_norm": 0.16913080215454102, + "learning_rate": 5e-05, + "loss": 1.547, + "step": 2810 + }, + { + "epoch": 0.7686628383921247, + "grad_norm": 0.17105981707572937, + "learning_rate": 5e-05, + "loss": 1.6501, + "step": 2811 + }, + { + "epoch": 0.7689362865736943, + "grad_norm": 0.16617277264595032, + "learning_rate": 5e-05, + "loss": 1.6319, + "step": 2812 + }, + { + "epoch": 0.7692097347552639, + "grad_norm": 0.16640552878379822, + "learning_rate": 5e-05, + "loss": 1.5152, + "step": 2813 + }, + { + "epoch": 0.7694831829368335, + "grad_norm": 0.1717168390750885, + "learning_rate": 5e-05, + "loss": 1.5904, + "step": 2814 + }, + { + "epoch": 0.7697566311184031, + "grad_norm": 0.15507300198078156, + "learning_rate": 5e-05, + "loss": 1.4552, + "step": 2815 + }, + { + "epoch": 0.7700300792999727, + "grad_norm": 0.16422231495380402, + "learning_rate": 5e-05, + "loss": 1.6003, + "step": 2816 + }, + { + "epoch": 0.7703035274815423, + "grad_norm": 0.16993476450443268, + "learning_rate": 5e-05, + "loss": 1.5668, + "step": 2817 + }, + { + "epoch": 0.7705769756631118, + "grad_norm": 0.16527430713176727, + "learning_rate": 5e-05, + "loss": 1.5809, + "step": 2818 + }, + { + "epoch": 0.7708504238446814, + "grad_norm": 0.16234123706817627, + "learning_rate": 5e-05, + "loss": 1.5393, + "step": 2819 + }, + { + "epoch": 0.771123872026251, + "grad_norm": 0.17365649342536926, + "learning_rate": 5e-05, + "loss": 1.5758, + "step": 2820 + }, + { + "epoch": 0.7713973202078206, + "grad_norm": 0.17129479348659515, + "learning_rate": 5e-05, + "loss": 1.6084, + "step": 2821 + }, + { + "epoch": 0.7716707683893902, + "grad_norm": 0.16345179080963135, + "learning_rate": 5e-05, + "loss": 1.5886, + "step": 2822 + }, + { + "epoch": 0.7719442165709598, + "grad_norm": 0.17358049750328064, + "learning_rate": 5e-05, + "loss": 1.6457, + "step": 2823 + }, + { + "epoch": 0.7722176647525294, + "grad_norm": 0.17579670250415802, + "learning_rate": 5e-05, + "loss": 1.7123, + "step": 2824 + }, + { + "epoch": 0.772491112934099, + "grad_norm": 0.1691892147064209, + "learning_rate": 5e-05, + "loss": 1.6226, + "step": 2825 + }, + { + "epoch": 0.7727645611156686, + "grad_norm": 0.17657522857189178, + "learning_rate": 5e-05, + "loss": 1.6072, + "step": 2826 + }, + { + "epoch": 0.7730380092972382, + "grad_norm": 0.17486031353473663, + "learning_rate": 5e-05, + "loss": 1.5497, + "step": 2827 + }, + { + "epoch": 0.7733114574788078, + "grad_norm": 0.16051729023456573, + "learning_rate": 5e-05, + "loss": 1.4969, + "step": 2828 + }, + { + "epoch": 0.7735849056603774, + "grad_norm": 0.16621479392051697, + "learning_rate": 5e-05, + "loss": 1.6151, + "step": 2829 + }, + { + "epoch": 0.7738583538419469, + "grad_norm": 0.16369469463825226, + "learning_rate": 5e-05, + "loss": 1.6707, + "step": 2830 + }, + { + "epoch": 0.7741318020235165, + "grad_norm": 0.1664346307516098, + "learning_rate": 5e-05, + "loss": 1.552, + "step": 2831 + }, + { + "epoch": 0.7744052502050861, + "grad_norm": 0.1688821017742157, + "learning_rate": 5e-05, + "loss": 1.566, + "step": 2832 + }, + { + "epoch": 0.7746786983866557, + "grad_norm": 0.15843422710895538, + "learning_rate": 5e-05, + "loss": 1.6504, + "step": 2833 + }, + { + "epoch": 0.7749521465682253, + "grad_norm": 0.16451962292194366, + "learning_rate": 5e-05, + "loss": 1.5611, + "step": 2834 + }, + { + "epoch": 0.7752255947497949, + "grad_norm": 0.17255376279354095, + "learning_rate": 5e-05, + "loss": 1.5388, + "step": 2835 + }, + { + "epoch": 0.7754990429313645, + "grad_norm": 0.1702892780303955, + "learning_rate": 5e-05, + "loss": 1.6152, + "step": 2836 + }, + { + "epoch": 0.7757724911129341, + "grad_norm": 0.16479864716529846, + "learning_rate": 5e-05, + "loss": 1.677, + "step": 2837 + }, + { + "epoch": 0.7760459392945037, + "grad_norm": 0.16758742928504944, + "learning_rate": 5e-05, + "loss": 1.5406, + "step": 2838 + }, + { + "epoch": 0.7763193874760733, + "grad_norm": 0.1617206186056137, + "learning_rate": 5e-05, + "loss": 1.635, + "step": 2839 + }, + { + "epoch": 0.7765928356576429, + "grad_norm": 0.1629093438386917, + "learning_rate": 5e-05, + "loss": 1.5516, + "step": 2840 + }, + { + "epoch": 0.7768662838392125, + "grad_norm": 0.16901403665542603, + "learning_rate": 5e-05, + "loss": 1.6223, + "step": 2841 + }, + { + "epoch": 0.777139732020782, + "grad_norm": 0.16104839742183685, + "learning_rate": 5e-05, + "loss": 1.6432, + "step": 2842 + }, + { + "epoch": 0.7774131802023516, + "grad_norm": 0.15974918007850647, + "learning_rate": 5e-05, + "loss": 1.5034, + "step": 2843 + }, + { + "epoch": 0.7776866283839212, + "grad_norm": 0.1669033169746399, + "learning_rate": 5e-05, + "loss": 1.6008, + "step": 2844 + }, + { + "epoch": 0.7779600765654908, + "grad_norm": 0.17953120172023773, + "learning_rate": 5e-05, + "loss": 1.6714, + "step": 2845 + }, + { + "epoch": 0.7782335247470604, + "grad_norm": 0.16718675196170807, + "learning_rate": 5e-05, + "loss": 1.5205, + "step": 2846 + }, + { + "epoch": 0.77850697292863, + "grad_norm": 0.16724152863025665, + "learning_rate": 5e-05, + "loss": 1.5884, + "step": 2847 + }, + { + "epoch": 0.7787804211101996, + "grad_norm": 0.16082365810871124, + "learning_rate": 5e-05, + "loss": 1.6615, + "step": 2848 + }, + { + "epoch": 0.7790538692917692, + "grad_norm": 0.15853464603424072, + "learning_rate": 5e-05, + "loss": 1.5306, + "step": 2849 + }, + { + "epoch": 0.7793273174733388, + "grad_norm": 0.1667121946811676, + "learning_rate": 5e-05, + "loss": 1.5941, + "step": 2850 + }, + { + "epoch": 0.7796007656549084, + "grad_norm": 0.1697937250137329, + "learning_rate": 5e-05, + "loss": 1.6127, + "step": 2851 + }, + { + "epoch": 0.779874213836478, + "grad_norm": 0.158981055021286, + "learning_rate": 5e-05, + "loss": 1.5672, + "step": 2852 + }, + { + "epoch": 0.7801476620180475, + "grad_norm": 0.1826319545507431, + "learning_rate": 5e-05, + "loss": 1.6604, + "step": 2853 + }, + { + "epoch": 0.7804211101996171, + "grad_norm": 0.17479664087295532, + "learning_rate": 5e-05, + "loss": 1.6348, + "step": 2854 + }, + { + "epoch": 0.7806945583811867, + "grad_norm": 0.17905382812023163, + "learning_rate": 5e-05, + "loss": 1.5434, + "step": 2855 + }, + { + "epoch": 0.7809680065627563, + "grad_norm": 0.1738382875919342, + "learning_rate": 5e-05, + "loss": 1.6045, + "step": 2856 + }, + { + "epoch": 0.7812414547443259, + "grad_norm": 0.16724707186222076, + "learning_rate": 5e-05, + "loss": 1.5591, + "step": 2857 + }, + { + "epoch": 0.7815149029258955, + "grad_norm": 0.17346110939979553, + "learning_rate": 5e-05, + "loss": 1.5721, + "step": 2858 + }, + { + "epoch": 0.7817883511074651, + "grad_norm": 0.16252191364765167, + "learning_rate": 5e-05, + "loss": 1.5935, + "step": 2859 + }, + { + "epoch": 0.7820617992890347, + "grad_norm": 0.1672922968864441, + "learning_rate": 5e-05, + "loss": 1.6467, + "step": 2860 + }, + { + "epoch": 0.7823352474706043, + "grad_norm": 0.1666378527879715, + "learning_rate": 5e-05, + "loss": 1.6392, + "step": 2861 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 0.1650533229112625, + "learning_rate": 5e-05, + "loss": 1.5737, + "step": 2862 + }, + { + "epoch": 0.7828821438337435, + "grad_norm": 0.16479378938674927, + "learning_rate": 5e-05, + "loss": 1.6066, + "step": 2863 + }, + { + "epoch": 0.7831555920153132, + "grad_norm": 0.16486263275146484, + "learning_rate": 5e-05, + "loss": 1.5617, + "step": 2864 + }, + { + "epoch": 0.7834290401968826, + "grad_norm": 0.1614285111427307, + "learning_rate": 5e-05, + "loss": 1.6226, + "step": 2865 + }, + { + "epoch": 0.7837024883784522, + "grad_norm": 0.176860049366951, + "learning_rate": 5e-05, + "loss": 1.5129, + "step": 2866 + }, + { + "epoch": 0.7839759365600218, + "grad_norm": 0.16381007432937622, + "learning_rate": 5e-05, + "loss": 1.6524, + "step": 2867 + }, + { + "epoch": 0.7842493847415914, + "grad_norm": 0.15981391072273254, + "learning_rate": 5e-05, + "loss": 1.5611, + "step": 2868 + }, + { + "epoch": 0.784522832923161, + "grad_norm": 0.17717792093753815, + "learning_rate": 5e-05, + "loss": 1.691, + "step": 2869 + }, + { + "epoch": 0.7847962811047307, + "grad_norm": 0.15418937802314758, + "learning_rate": 5e-05, + "loss": 1.5502, + "step": 2870 + }, + { + "epoch": 0.7850697292863003, + "grad_norm": 0.17375600337982178, + "learning_rate": 5e-05, + "loss": 1.6708, + "step": 2871 + }, + { + "epoch": 0.7853431774678699, + "grad_norm": 0.16681233048439026, + "learning_rate": 5e-05, + "loss": 1.6323, + "step": 2872 + }, + { + "epoch": 0.7856166256494395, + "grad_norm": 0.15948568284511566, + "learning_rate": 5e-05, + "loss": 1.5582, + "step": 2873 + }, + { + "epoch": 0.7858900738310091, + "grad_norm": 0.16835032403469086, + "learning_rate": 5e-05, + "loss": 1.5979, + "step": 2874 + }, + { + "epoch": 0.7861635220125787, + "grad_norm": 0.16147814691066742, + "learning_rate": 5e-05, + "loss": 1.5618, + "step": 2875 + }, + { + "epoch": 0.7864369701941482, + "grad_norm": 0.15999336540699005, + "learning_rate": 5e-05, + "loss": 1.5351, + "step": 2876 + }, + { + "epoch": 0.7867104183757178, + "grad_norm": 0.17828235030174255, + "learning_rate": 5e-05, + "loss": 1.6319, + "step": 2877 + }, + { + "epoch": 0.7869838665572874, + "grad_norm": 0.16279451549053192, + "learning_rate": 5e-05, + "loss": 1.5923, + "step": 2878 + }, + { + "epoch": 0.787257314738857, + "grad_norm": 0.16363869607448578, + "learning_rate": 5e-05, + "loss": 1.6645, + "step": 2879 + }, + { + "epoch": 0.7875307629204266, + "grad_norm": 0.17845980823040009, + "learning_rate": 5e-05, + "loss": 1.6098, + "step": 2880 + }, + { + "epoch": 0.7878042111019962, + "grad_norm": 0.17278538644313812, + "learning_rate": 5e-05, + "loss": 1.6211, + "step": 2881 + }, + { + "epoch": 0.7880776592835658, + "grad_norm": 0.15572743117809296, + "learning_rate": 5e-05, + "loss": 1.5377, + "step": 2882 + }, + { + "epoch": 0.7883511074651354, + "grad_norm": 0.18251000344753265, + "learning_rate": 5e-05, + "loss": 1.769, + "step": 2883 + }, + { + "epoch": 0.788624555646705, + "grad_norm": 0.17334499955177307, + "learning_rate": 5e-05, + "loss": 1.6428, + "step": 2884 + }, + { + "epoch": 0.7888980038282746, + "grad_norm": 0.17535856366157532, + "learning_rate": 5e-05, + "loss": 1.7187, + "step": 2885 + }, + { + "epoch": 0.7891714520098442, + "grad_norm": 0.16317224502563477, + "learning_rate": 5e-05, + "loss": 1.4863, + "step": 2886 + }, + { + "epoch": 0.7894449001914138, + "grad_norm": 0.18952353298664093, + "learning_rate": 5e-05, + "loss": 1.7284, + "step": 2887 + }, + { + "epoch": 0.7897183483729833, + "grad_norm": 0.17403848469257355, + "learning_rate": 5e-05, + "loss": 1.5511, + "step": 2888 + }, + { + "epoch": 0.7899917965545529, + "grad_norm": 0.17016200721263885, + "learning_rate": 5e-05, + "loss": 1.5673, + "step": 2889 + }, + { + "epoch": 0.7902652447361225, + "grad_norm": 0.16698943078517914, + "learning_rate": 5e-05, + "loss": 1.5506, + "step": 2890 + }, + { + "epoch": 0.7905386929176921, + "grad_norm": 0.1672915667295456, + "learning_rate": 5e-05, + "loss": 1.5712, + "step": 2891 + }, + { + "epoch": 0.7908121410992617, + "grad_norm": 0.17247925698757172, + "learning_rate": 5e-05, + "loss": 1.4931, + "step": 2892 + }, + { + "epoch": 0.7910855892808313, + "grad_norm": 0.15129195153713226, + "learning_rate": 5e-05, + "loss": 1.5219, + "step": 2893 + }, + { + "epoch": 0.7913590374624009, + "grad_norm": 0.16134491562843323, + "learning_rate": 5e-05, + "loss": 1.5749, + "step": 2894 + }, + { + "epoch": 0.7916324856439705, + "grad_norm": 0.17356084287166595, + "learning_rate": 5e-05, + "loss": 1.607, + "step": 2895 + }, + { + "epoch": 0.7919059338255401, + "grad_norm": 0.16742588579654694, + "learning_rate": 5e-05, + "loss": 1.515, + "step": 2896 + }, + { + "epoch": 0.7921793820071097, + "grad_norm": 0.16483165323734283, + "learning_rate": 5e-05, + "loss": 1.5906, + "step": 2897 + }, + { + "epoch": 0.7924528301886793, + "grad_norm": 0.1711149662733078, + "learning_rate": 5e-05, + "loss": 1.6759, + "step": 2898 + }, + { + "epoch": 0.7927262783702488, + "grad_norm": 0.1694236695766449, + "learning_rate": 5e-05, + "loss": 1.5804, + "step": 2899 + }, + { + "epoch": 0.7929997265518184, + "grad_norm": 0.15667274594306946, + "learning_rate": 5e-05, + "loss": 1.5717, + "step": 2900 + }, + { + "epoch": 0.793273174733388, + "grad_norm": 0.18144182860851288, + "learning_rate": 5e-05, + "loss": 1.5945, + "step": 2901 + }, + { + "epoch": 0.7935466229149576, + "grad_norm": 0.16796544194221497, + "learning_rate": 5e-05, + "loss": 1.6503, + "step": 2902 + }, + { + "epoch": 0.7938200710965272, + "grad_norm": 0.16234607994556427, + "learning_rate": 5e-05, + "loss": 1.6427, + "step": 2903 + }, + { + "epoch": 0.7940935192780968, + "grad_norm": 0.16550585627555847, + "learning_rate": 5e-05, + "loss": 1.6079, + "step": 2904 + }, + { + "epoch": 0.7943669674596664, + "grad_norm": 0.1626591831445694, + "learning_rate": 5e-05, + "loss": 1.5215, + "step": 2905 + }, + { + "epoch": 0.794640415641236, + "grad_norm": 0.1648007035255432, + "learning_rate": 5e-05, + "loss": 1.4877, + "step": 2906 + }, + { + "epoch": 0.7949138638228056, + "grad_norm": 0.1771146059036255, + "learning_rate": 5e-05, + "loss": 1.5802, + "step": 2907 + }, + { + "epoch": 0.7951873120043752, + "grad_norm": 0.16548393666744232, + "learning_rate": 5e-05, + "loss": 1.6135, + "step": 2908 + }, + { + "epoch": 0.7954607601859448, + "grad_norm": 0.18224813044071198, + "learning_rate": 5e-05, + "loss": 1.5849, + "step": 2909 + }, + { + "epoch": 0.7957342083675144, + "grad_norm": 0.19087089598178864, + "learning_rate": 5e-05, + "loss": 1.5959, + "step": 2910 + }, + { + "epoch": 0.7960076565490839, + "grad_norm": 0.16123077273368835, + "learning_rate": 5e-05, + "loss": 1.5631, + "step": 2911 + }, + { + "epoch": 0.7962811047306535, + "grad_norm": 0.18250906467437744, + "learning_rate": 5e-05, + "loss": 1.6527, + "step": 2912 + }, + { + "epoch": 0.7965545529122231, + "grad_norm": 0.18301981687545776, + "learning_rate": 5e-05, + "loss": 1.6438, + "step": 2913 + }, + { + "epoch": 0.7968280010937927, + "grad_norm": 0.15846803784370422, + "learning_rate": 5e-05, + "loss": 1.5266, + "step": 2914 + }, + { + "epoch": 0.7971014492753623, + "grad_norm": 0.16750794649124146, + "learning_rate": 5e-05, + "loss": 1.6276, + "step": 2915 + }, + { + "epoch": 0.7973748974569319, + "grad_norm": 0.16995219886302948, + "learning_rate": 5e-05, + "loss": 1.7035, + "step": 2916 + }, + { + "epoch": 0.7976483456385015, + "grad_norm": 0.16649973392486572, + "learning_rate": 5e-05, + "loss": 1.5616, + "step": 2917 + }, + { + "epoch": 0.7979217938200711, + "grad_norm": 0.14843766391277313, + "learning_rate": 5e-05, + "loss": 1.428, + "step": 2918 + }, + { + "epoch": 0.7981952420016407, + "grad_norm": 0.17352357506752014, + "learning_rate": 5e-05, + "loss": 1.6872, + "step": 2919 + }, + { + "epoch": 0.7984686901832103, + "grad_norm": 0.16598494350910187, + "learning_rate": 5e-05, + "loss": 1.6047, + "step": 2920 + }, + { + "epoch": 0.7987421383647799, + "grad_norm": 0.1559354066848755, + "learning_rate": 5e-05, + "loss": 1.5798, + "step": 2921 + }, + { + "epoch": 0.7990155865463495, + "grad_norm": 0.1671651154756546, + "learning_rate": 5e-05, + "loss": 1.599, + "step": 2922 + }, + { + "epoch": 0.799289034727919, + "grad_norm": 0.15780937671661377, + "learning_rate": 5e-05, + "loss": 1.5554, + "step": 2923 + }, + { + "epoch": 0.7995624829094886, + "grad_norm": 0.16018469631671906, + "learning_rate": 5e-05, + "loss": 1.5614, + "step": 2924 + }, + { + "epoch": 0.7998359310910582, + "grad_norm": 0.16393598914146423, + "learning_rate": 5e-05, + "loss": 1.6024, + "step": 2925 + }, + { + "epoch": 0.8001093792726278, + "grad_norm": 0.15699848532676697, + "learning_rate": 5e-05, + "loss": 1.5343, + "step": 2926 + }, + { + "epoch": 0.8003828274541974, + "grad_norm": 0.1754962056875229, + "learning_rate": 5e-05, + "loss": 1.6674, + "step": 2927 + }, + { + "epoch": 0.800656275635767, + "grad_norm": 0.16628246009349823, + "learning_rate": 5e-05, + "loss": 1.6243, + "step": 2928 + }, + { + "epoch": 0.8009297238173366, + "grad_norm": 0.1607159823179245, + "learning_rate": 5e-05, + "loss": 1.6163, + "step": 2929 + }, + { + "epoch": 0.8012031719989062, + "grad_norm": 0.17041060328483582, + "learning_rate": 5e-05, + "loss": 1.6411, + "step": 2930 + }, + { + "epoch": 0.8014766201804758, + "grad_norm": 0.1616741120815277, + "learning_rate": 5e-05, + "loss": 1.4652, + "step": 2931 + }, + { + "epoch": 0.8017500683620454, + "grad_norm": 0.16323620080947876, + "learning_rate": 5e-05, + "loss": 1.6336, + "step": 2932 + }, + { + "epoch": 0.802023516543615, + "grad_norm": 0.16059629619121552, + "learning_rate": 5e-05, + "loss": 1.6735, + "step": 2933 + }, + { + "epoch": 0.8022969647251845, + "grad_norm": 0.17597347497940063, + "learning_rate": 5e-05, + "loss": 1.6076, + "step": 2934 + }, + { + "epoch": 0.8025704129067541, + "grad_norm": 0.16613614559173584, + "learning_rate": 5e-05, + "loss": 1.607, + "step": 2935 + }, + { + "epoch": 0.8028438610883237, + "grad_norm": 0.17486806213855743, + "learning_rate": 5e-05, + "loss": 1.6023, + "step": 2936 + }, + { + "epoch": 0.8031173092698933, + "grad_norm": 0.1755034178495407, + "learning_rate": 5e-05, + "loss": 1.601, + "step": 2937 + }, + { + "epoch": 0.8033907574514629, + "grad_norm": 0.16530238091945648, + "learning_rate": 5e-05, + "loss": 1.5338, + "step": 2938 + }, + { + "epoch": 0.8036642056330325, + "grad_norm": 0.18131126463413239, + "learning_rate": 5e-05, + "loss": 1.7062, + "step": 2939 + }, + { + "epoch": 0.8039376538146021, + "grad_norm": 0.1728467047214508, + "learning_rate": 5e-05, + "loss": 1.6545, + "step": 2940 + }, + { + "epoch": 0.8042111019961717, + "grad_norm": 0.15401597321033478, + "learning_rate": 5e-05, + "loss": 1.4845, + "step": 2941 + }, + { + "epoch": 0.8044845501777413, + "grad_norm": 0.18329520523548126, + "learning_rate": 5e-05, + "loss": 1.6014, + "step": 2942 + }, + { + "epoch": 0.8047579983593109, + "grad_norm": 0.16439451277256012, + "learning_rate": 5e-05, + "loss": 1.5848, + "step": 2943 + }, + { + "epoch": 0.8050314465408805, + "grad_norm": 0.16697899997234344, + "learning_rate": 5e-05, + "loss": 1.6285, + "step": 2944 + }, + { + "epoch": 0.8053048947224501, + "grad_norm": 0.1735549122095108, + "learning_rate": 5e-05, + "loss": 1.7079, + "step": 2945 + }, + { + "epoch": 0.8055783429040196, + "grad_norm": 0.16093853116035461, + "learning_rate": 5e-05, + "loss": 1.5457, + "step": 2946 + }, + { + "epoch": 0.8058517910855892, + "grad_norm": 0.1812943071126938, + "learning_rate": 5e-05, + "loss": 1.7846, + "step": 2947 + }, + { + "epoch": 0.8061252392671588, + "grad_norm": 0.1680847555398941, + "learning_rate": 5e-05, + "loss": 1.6172, + "step": 2948 + }, + { + "epoch": 0.8063986874487284, + "grad_norm": 0.1760333627462387, + "learning_rate": 5e-05, + "loss": 1.6645, + "step": 2949 + }, + { + "epoch": 0.806672135630298, + "grad_norm": 0.16148562729358673, + "learning_rate": 5e-05, + "loss": 1.5527, + "step": 2950 + }, + { + "epoch": 0.8069455838118677, + "grad_norm": 0.16784906387329102, + "learning_rate": 5e-05, + "loss": 1.6316, + "step": 2951 + }, + { + "epoch": 0.8072190319934373, + "grad_norm": 0.16758820414543152, + "learning_rate": 5e-05, + "loss": 1.566, + "step": 2952 + }, + { + "epoch": 0.8074924801750069, + "grad_norm": 0.16588006913661957, + "learning_rate": 5e-05, + "loss": 1.5411, + "step": 2953 + }, + { + "epoch": 0.8077659283565765, + "grad_norm": 0.15759024024009705, + "learning_rate": 5e-05, + "loss": 1.497, + "step": 2954 + }, + { + "epoch": 0.8080393765381461, + "grad_norm": 0.1606815755367279, + "learning_rate": 5e-05, + "loss": 1.5874, + "step": 2955 + }, + { + "epoch": 0.8083128247197157, + "grad_norm": 0.171406090259552, + "learning_rate": 5e-05, + "loss": 1.6286, + "step": 2956 + }, + { + "epoch": 0.8085862729012852, + "grad_norm": 0.1659841537475586, + "learning_rate": 5e-05, + "loss": 1.642, + "step": 2957 + }, + { + "epoch": 0.8088597210828548, + "grad_norm": 0.1642453819513321, + "learning_rate": 5e-05, + "loss": 1.6138, + "step": 2958 + }, + { + "epoch": 0.8091331692644244, + "grad_norm": 0.1737990528345108, + "learning_rate": 5e-05, + "loss": 1.6755, + "step": 2959 + }, + { + "epoch": 0.809406617445994, + "grad_norm": 0.16041895747184753, + "learning_rate": 5e-05, + "loss": 1.5752, + "step": 2960 + }, + { + "epoch": 0.8096800656275636, + "grad_norm": 0.16759610176086426, + "learning_rate": 5e-05, + "loss": 1.5574, + "step": 2961 + }, + { + "epoch": 0.8099535138091332, + "grad_norm": 0.17192721366882324, + "learning_rate": 5e-05, + "loss": 1.6143, + "step": 2962 + }, + { + "epoch": 0.8102269619907028, + "grad_norm": 0.17108719050884247, + "learning_rate": 5e-05, + "loss": 1.668, + "step": 2963 + }, + { + "epoch": 0.8105004101722724, + "grad_norm": 0.1675434559583664, + "learning_rate": 5e-05, + "loss": 1.596, + "step": 2964 + }, + { + "epoch": 0.810773858353842, + "grad_norm": 0.16867220401763916, + "learning_rate": 5e-05, + "loss": 1.6051, + "step": 2965 + }, + { + "epoch": 0.8110473065354116, + "grad_norm": 0.16799405217170715, + "learning_rate": 5e-05, + "loss": 1.6316, + "step": 2966 + }, + { + "epoch": 0.8113207547169812, + "grad_norm": 0.1661425083875656, + "learning_rate": 5e-05, + "loss": 1.6354, + "step": 2967 + }, + { + "epoch": 0.8115942028985508, + "grad_norm": 0.17176170647144318, + "learning_rate": 5e-05, + "loss": 1.5873, + "step": 2968 + }, + { + "epoch": 0.8118676510801203, + "grad_norm": 0.16512513160705566, + "learning_rate": 5e-05, + "loss": 1.5885, + "step": 2969 + }, + { + "epoch": 0.8121410992616899, + "grad_norm": 0.1690187305212021, + "learning_rate": 5e-05, + "loss": 1.6306, + "step": 2970 + }, + { + "epoch": 0.8124145474432595, + "grad_norm": 0.17420509457588196, + "learning_rate": 5e-05, + "loss": 1.49, + "step": 2971 + }, + { + "epoch": 0.8126879956248291, + "grad_norm": 0.16634975373744965, + "learning_rate": 5e-05, + "loss": 1.5559, + "step": 2972 + }, + { + "epoch": 0.8129614438063987, + "grad_norm": 0.1683768481016159, + "learning_rate": 5e-05, + "loss": 1.5532, + "step": 2973 + }, + { + "epoch": 0.8132348919879683, + "grad_norm": 0.16414394974708557, + "learning_rate": 5e-05, + "loss": 1.5992, + "step": 2974 + }, + { + "epoch": 0.8135083401695379, + "grad_norm": 0.16777318716049194, + "learning_rate": 5e-05, + "loss": 1.6227, + "step": 2975 + }, + { + "epoch": 0.8137817883511075, + "grad_norm": 0.16975970566272736, + "learning_rate": 5e-05, + "loss": 1.589, + "step": 2976 + }, + { + "epoch": 0.8140552365326771, + "grad_norm": 0.16181518137454987, + "learning_rate": 5e-05, + "loss": 1.6104, + "step": 2977 + }, + { + "epoch": 0.8143286847142467, + "grad_norm": 0.16820965707302094, + "learning_rate": 5e-05, + "loss": 1.597, + "step": 2978 + }, + { + "epoch": 0.8146021328958163, + "grad_norm": 0.17384564876556396, + "learning_rate": 5e-05, + "loss": 1.61, + "step": 2979 + }, + { + "epoch": 0.8148755810773858, + "grad_norm": 0.17363837361335754, + "learning_rate": 5e-05, + "loss": 1.6574, + "step": 2980 + }, + { + "epoch": 0.8151490292589554, + "grad_norm": 0.17065000534057617, + "learning_rate": 5e-05, + "loss": 1.6128, + "step": 2981 + }, + { + "epoch": 0.815422477440525, + "grad_norm": 0.1675603985786438, + "learning_rate": 5e-05, + "loss": 1.6028, + "step": 2982 + }, + { + "epoch": 0.8156959256220946, + "grad_norm": 0.16641339659690857, + "learning_rate": 5e-05, + "loss": 1.6176, + "step": 2983 + }, + { + "epoch": 0.8159693738036642, + "grad_norm": 0.16285206377506256, + "learning_rate": 5e-05, + "loss": 1.6211, + "step": 2984 + }, + { + "epoch": 0.8162428219852338, + "grad_norm": 0.1820947378873825, + "learning_rate": 5e-05, + "loss": 1.6473, + "step": 2985 + }, + { + "epoch": 0.8165162701668034, + "grad_norm": 0.16042275726795197, + "learning_rate": 5e-05, + "loss": 1.5818, + "step": 2986 + }, + { + "epoch": 0.816789718348373, + "grad_norm": 0.1615065634250641, + "learning_rate": 5e-05, + "loss": 1.5165, + "step": 2987 + }, + { + "epoch": 0.8170631665299426, + "grad_norm": 0.17090201377868652, + "learning_rate": 5e-05, + "loss": 1.653, + "step": 2988 + }, + { + "epoch": 0.8173366147115122, + "grad_norm": 0.1570872813463211, + "learning_rate": 5e-05, + "loss": 1.5234, + "step": 2989 + }, + { + "epoch": 0.8176100628930818, + "grad_norm": 0.16220347583293915, + "learning_rate": 5e-05, + "loss": 1.6026, + "step": 2990 + }, + { + "epoch": 0.8178835110746514, + "grad_norm": 0.17709346115589142, + "learning_rate": 5e-05, + "loss": 1.5405, + "step": 2991 + }, + { + "epoch": 0.8181569592562209, + "grad_norm": 0.15809805691242218, + "learning_rate": 5e-05, + "loss": 1.5744, + "step": 2992 + }, + { + "epoch": 0.8184304074377905, + "grad_norm": 0.17646804451942444, + "learning_rate": 5e-05, + "loss": 1.6328, + "step": 2993 + }, + { + "epoch": 0.8187038556193601, + "grad_norm": 0.19047993421554565, + "learning_rate": 5e-05, + "loss": 1.6451, + "step": 2994 + }, + { + "epoch": 0.8189773038009297, + "grad_norm": 0.16458064317703247, + "learning_rate": 5e-05, + "loss": 1.6942, + "step": 2995 + }, + { + "epoch": 0.8192507519824993, + "grad_norm": 0.17090070247650146, + "learning_rate": 5e-05, + "loss": 1.623, + "step": 2996 + }, + { + "epoch": 0.8195242001640689, + "grad_norm": 0.1800215095281601, + "learning_rate": 5e-05, + "loss": 1.5693, + "step": 2997 + }, + { + "epoch": 0.8197976483456385, + "grad_norm": 0.16883789002895355, + "learning_rate": 5e-05, + "loss": 1.5624, + "step": 2998 + }, + { + "epoch": 0.8200710965272081, + "grad_norm": 0.16943061351776123, + "learning_rate": 5e-05, + "loss": 1.6307, + "step": 2999 + }, + { + "epoch": 0.8203445447087777, + "grad_norm": 0.1675194501876831, + "learning_rate": 5e-05, + "loss": 1.5902, + "step": 3000 + }, + { + "epoch": 0.8206179928903473, + "grad_norm": 0.16143114864826202, + "learning_rate": 5e-05, + "loss": 1.5696, + "step": 3001 + }, + { + "epoch": 0.8208914410719169, + "grad_norm": 0.16645751893520355, + "learning_rate": 5e-05, + "loss": 1.6472, + "step": 3002 + }, + { + "epoch": 0.8211648892534865, + "grad_norm": 0.16916631162166595, + "learning_rate": 5e-05, + "loss": 1.5685, + "step": 3003 + }, + { + "epoch": 0.821438337435056, + "grad_norm": 0.1605367660522461, + "learning_rate": 5e-05, + "loss": 1.6069, + "step": 3004 + }, + { + "epoch": 0.8217117856166256, + "grad_norm": 0.1694014072418213, + "learning_rate": 5e-05, + "loss": 1.6411, + "step": 3005 + }, + { + "epoch": 0.8219852337981952, + "grad_norm": 0.15868619084358215, + "learning_rate": 5e-05, + "loss": 1.5634, + "step": 3006 + }, + { + "epoch": 0.8222586819797648, + "grad_norm": 0.16402362287044525, + "learning_rate": 5e-05, + "loss": 1.6254, + "step": 3007 + }, + { + "epoch": 0.8225321301613344, + "grad_norm": 0.16820335388183594, + "learning_rate": 5e-05, + "loss": 1.6344, + "step": 3008 + }, + { + "epoch": 0.822805578342904, + "grad_norm": 0.1659119576215744, + "learning_rate": 5e-05, + "loss": 1.6044, + "step": 3009 + }, + { + "epoch": 0.8230790265244736, + "grad_norm": 0.177097886800766, + "learning_rate": 5e-05, + "loss": 1.7114, + "step": 3010 + }, + { + "epoch": 0.8233524747060432, + "grad_norm": 0.16565869748592377, + "learning_rate": 5e-05, + "loss": 1.5614, + "step": 3011 + }, + { + "epoch": 0.8236259228876128, + "grad_norm": 0.16346898674964905, + "learning_rate": 5e-05, + "loss": 1.6737, + "step": 3012 + }, + { + "epoch": 0.8238993710691824, + "grad_norm": 0.16942520439624786, + "learning_rate": 5e-05, + "loss": 1.6536, + "step": 3013 + }, + { + "epoch": 0.824172819250752, + "grad_norm": 0.16573195159435272, + "learning_rate": 5e-05, + "loss": 1.5418, + "step": 3014 + }, + { + "epoch": 0.8244462674323215, + "grad_norm": 0.16428519785404205, + "learning_rate": 5e-05, + "loss": 1.5565, + "step": 3015 + }, + { + "epoch": 0.8247197156138911, + "grad_norm": 0.1609538197517395, + "learning_rate": 5e-05, + "loss": 1.5613, + "step": 3016 + }, + { + "epoch": 0.8249931637954607, + "grad_norm": 0.16682593524456024, + "learning_rate": 5e-05, + "loss": 1.6829, + "step": 3017 + }, + { + "epoch": 0.8252666119770303, + "grad_norm": 0.16691312193870544, + "learning_rate": 5e-05, + "loss": 1.5133, + "step": 3018 + }, + { + "epoch": 0.8255400601585999, + "grad_norm": 0.16984447836875916, + "learning_rate": 5e-05, + "loss": 1.6283, + "step": 3019 + }, + { + "epoch": 0.8258135083401695, + "grad_norm": 0.1617315411567688, + "learning_rate": 5e-05, + "loss": 1.5689, + "step": 3020 + }, + { + "epoch": 0.8260869565217391, + "grad_norm": 0.17331373691558838, + "learning_rate": 5e-05, + "loss": 1.635, + "step": 3021 + }, + { + "epoch": 0.8263604047033087, + "grad_norm": 0.1681394726037979, + "learning_rate": 5e-05, + "loss": 1.6635, + "step": 3022 + }, + { + "epoch": 0.8266338528848783, + "grad_norm": 0.15742860734462738, + "learning_rate": 5e-05, + "loss": 1.4818, + "step": 3023 + }, + { + "epoch": 0.8269073010664479, + "grad_norm": 0.18009516596794128, + "learning_rate": 5e-05, + "loss": 1.6404, + "step": 3024 + }, + { + "epoch": 0.8271807492480175, + "grad_norm": 0.16786369681358337, + "learning_rate": 5e-05, + "loss": 1.5971, + "step": 3025 + }, + { + "epoch": 0.8274541974295871, + "grad_norm": 0.16309179365634918, + "learning_rate": 5e-05, + "loss": 1.5124, + "step": 3026 + }, + { + "epoch": 0.8277276456111566, + "grad_norm": 0.15728282928466797, + "learning_rate": 5e-05, + "loss": 1.5749, + "step": 3027 + }, + { + "epoch": 0.8280010937927262, + "grad_norm": 0.16731971502304077, + "learning_rate": 5e-05, + "loss": 1.5051, + "step": 3028 + }, + { + "epoch": 0.8282745419742958, + "grad_norm": 0.17260520160198212, + "learning_rate": 5e-05, + "loss": 1.5307, + "step": 3029 + }, + { + "epoch": 0.8285479901558654, + "grad_norm": 0.16881436109542847, + "learning_rate": 5e-05, + "loss": 1.6312, + "step": 3030 + }, + { + "epoch": 0.828821438337435, + "grad_norm": 0.16781583428382874, + "learning_rate": 5e-05, + "loss": 1.5454, + "step": 3031 + }, + { + "epoch": 0.8290948865190046, + "grad_norm": 0.16846032440662384, + "learning_rate": 5e-05, + "loss": 1.6269, + "step": 3032 + }, + { + "epoch": 0.8293683347005743, + "grad_norm": 0.1699826866388321, + "learning_rate": 5e-05, + "loss": 1.6368, + "step": 3033 + }, + { + "epoch": 0.8296417828821439, + "grad_norm": 0.16753311455249786, + "learning_rate": 5e-05, + "loss": 1.6949, + "step": 3034 + }, + { + "epoch": 0.8299152310637135, + "grad_norm": 0.17179977893829346, + "learning_rate": 5e-05, + "loss": 1.6023, + "step": 3035 + }, + { + "epoch": 0.8301886792452831, + "grad_norm": 0.1626930981874466, + "learning_rate": 5e-05, + "loss": 1.6162, + "step": 3036 + }, + { + "epoch": 0.8304621274268527, + "grad_norm": 0.1600092500448227, + "learning_rate": 5e-05, + "loss": 1.5765, + "step": 3037 + }, + { + "epoch": 0.8307355756084221, + "grad_norm": 0.1597457230091095, + "learning_rate": 5e-05, + "loss": 1.6114, + "step": 3038 + }, + { + "epoch": 0.8310090237899918, + "grad_norm": 0.16018109023571014, + "learning_rate": 5e-05, + "loss": 1.6279, + "step": 3039 + }, + { + "epoch": 0.8312824719715614, + "grad_norm": 0.15814651548862457, + "learning_rate": 5e-05, + "loss": 1.5031, + "step": 3040 + }, + { + "epoch": 0.831555920153131, + "grad_norm": 0.1627386063337326, + "learning_rate": 5e-05, + "loss": 1.5399, + "step": 3041 + }, + { + "epoch": 0.8318293683347006, + "grad_norm": 0.17200924456119537, + "learning_rate": 5e-05, + "loss": 1.6497, + "step": 3042 + }, + { + "epoch": 0.8321028165162702, + "grad_norm": 0.16421593725681305, + "learning_rate": 5e-05, + "loss": 1.653, + "step": 3043 + }, + { + "epoch": 0.8323762646978398, + "grad_norm": 0.16235610842704773, + "learning_rate": 5e-05, + "loss": 1.498, + "step": 3044 + }, + { + "epoch": 0.8326497128794094, + "grad_norm": 0.16004261374473572, + "learning_rate": 5e-05, + "loss": 1.5331, + "step": 3045 + }, + { + "epoch": 0.832923161060979, + "grad_norm": 0.17818929255008698, + "learning_rate": 5e-05, + "loss": 1.6328, + "step": 3046 + }, + { + "epoch": 0.8331966092425486, + "grad_norm": 0.17339615523815155, + "learning_rate": 5e-05, + "loss": 1.6716, + "step": 3047 + }, + { + "epoch": 0.8334700574241182, + "grad_norm": 0.16837354004383087, + "learning_rate": 5e-05, + "loss": 1.6282, + "step": 3048 + }, + { + "epoch": 0.8337435056056878, + "grad_norm": 0.16777902841567993, + "learning_rate": 5e-05, + "loss": 1.596, + "step": 3049 + }, + { + "epoch": 0.8340169537872573, + "grad_norm": 0.16719725728034973, + "learning_rate": 5e-05, + "loss": 1.6055, + "step": 3050 + }, + { + "epoch": 0.8342904019688269, + "grad_norm": 0.16437400877475739, + "learning_rate": 5e-05, + "loss": 1.6341, + "step": 3051 + }, + { + "epoch": 0.8345638501503965, + "grad_norm": 0.1672685593366623, + "learning_rate": 5e-05, + "loss": 1.6135, + "step": 3052 + }, + { + "epoch": 0.8348372983319661, + "grad_norm": 0.16196739673614502, + "learning_rate": 5e-05, + "loss": 1.5584, + "step": 3053 + }, + { + "epoch": 0.8351107465135357, + "grad_norm": 0.15699756145477295, + "learning_rate": 5e-05, + "loss": 1.5569, + "step": 3054 + }, + { + "epoch": 0.8353841946951053, + "grad_norm": 0.1717248409986496, + "learning_rate": 5e-05, + "loss": 1.6093, + "step": 3055 + }, + { + "epoch": 0.8356576428766749, + "grad_norm": 0.17324498295783997, + "learning_rate": 5e-05, + "loss": 1.6929, + "step": 3056 + }, + { + "epoch": 0.8359310910582445, + "grad_norm": 0.17252692580223083, + "learning_rate": 5e-05, + "loss": 1.6724, + "step": 3057 + }, + { + "epoch": 0.8362045392398141, + "grad_norm": 0.17730343341827393, + "learning_rate": 5e-05, + "loss": 1.6094, + "step": 3058 + }, + { + "epoch": 0.8364779874213837, + "grad_norm": 0.15670277178287506, + "learning_rate": 5e-05, + "loss": 1.5384, + "step": 3059 + }, + { + "epoch": 0.8367514356029533, + "grad_norm": 0.171427920460701, + "learning_rate": 5e-05, + "loss": 1.5808, + "step": 3060 + }, + { + "epoch": 0.8370248837845229, + "grad_norm": 0.1834273487329483, + "learning_rate": 5e-05, + "loss": 1.6069, + "step": 3061 + }, + { + "epoch": 0.8372983319660924, + "grad_norm": 0.1576196849346161, + "learning_rate": 5e-05, + "loss": 1.6032, + "step": 3062 + }, + { + "epoch": 0.837571780147662, + "grad_norm": 0.17979919910430908, + "learning_rate": 5e-05, + "loss": 1.6884, + "step": 3063 + }, + { + "epoch": 0.8378452283292316, + "grad_norm": 0.16846168041229248, + "learning_rate": 5e-05, + "loss": 1.4665, + "step": 3064 + }, + { + "epoch": 0.8381186765108012, + "grad_norm": 0.17571622133255005, + "learning_rate": 5e-05, + "loss": 1.5971, + "step": 3065 + }, + { + "epoch": 0.8383921246923708, + "grad_norm": 0.1625003069639206, + "learning_rate": 5e-05, + "loss": 1.6018, + "step": 3066 + }, + { + "epoch": 0.8386655728739404, + "grad_norm": 0.16521279513835907, + "learning_rate": 5e-05, + "loss": 1.5108, + "step": 3067 + }, + { + "epoch": 0.83893902105551, + "grad_norm": 0.19198334217071533, + "learning_rate": 5e-05, + "loss": 1.6102, + "step": 3068 + }, + { + "epoch": 0.8392124692370796, + "grad_norm": 0.16152355074882507, + "learning_rate": 5e-05, + "loss": 1.5801, + "step": 3069 + }, + { + "epoch": 0.8394859174186492, + "grad_norm": 0.159002423286438, + "learning_rate": 5e-05, + "loss": 1.5931, + "step": 3070 + }, + { + "epoch": 0.8397593656002188, + "grad_norm": 0.16774891316890717, + "learning_rate": 5e-05, + "loss": 1.613, + "step": 3071 + }, + { + "epoch": 0.8400328137817884, + "grad_norm": 0.17755338549613953, + "learning_rate": 5e-05, + "loss": 1.6408, + "step": 3072 + }, + { + "epoch": 0.8403062619633579, + "grad_norm": 0.15868759155273438, + "learning_rate": 5e-05, + "loss": 1.6276, + "step": 3073 + }, + { + "epoch": 0.8405797101449275, + "grad_norm": 0.1694575846195221, + "learning_rate": 5e-05, + "loss": 1.6421, + "step": 3074 + }, + { + "epoch": 0.8408531583264971, + "grad_norm": 0.1652621626853943, + "learning_rate": 5e-05, + "loss": 1.5798, + "step": 3075 + }, + { + "epoch": 0.8411266065080667, + "grad_norm": 0.1710357666015625, + "learning_rate": 5e-05, + "loss": 1.5947, + "step": 3076 + }, + { + "epoch": 0.8414000546896363, + "grad_norm": 0.16072213649749756, + "learning_rate": 5e-05, + "loss": 1.5291, + "step": 3077 + }, + { + "epoch": 0.8416735028712059, + "grad_norm": 0.18232187628746033, + "learning_rate": 5e-05, + "loss": 1.6492, + "step": 3078 + }, + { + "epoch": 0.8419469510527755, + "grad_norm": 0.1688283234834671, + "learning_rate": 5e-05, + "loss": 1.6395, + "step": 3079 + }, + { + "epoch": 0.8422203992343451, + "grad_norm": 0.170608788728714, + "learning_rate": 5e-05, + "loss": 1.6103, + "step": 3080 + }, + { + "epoch": 0.8424938474159147, + "grad_norm": 0.16807672381401062, + "learning_rate": 5e-05, + "loss": 1.535, + "step": 3081 + }, + { + "epoch": 0.8427672955974843, + "grad_norm": 0.17390793561935425, + "learning_rate": 5e-05, + "loss": 1.6763, + "step": 3082 + }, + { + "epoch": 0.8430407437790539, + "grad_norm": 0.1748964488506317, + "learning_rate": 5e-05, + "loss": 1.6523, + "step": 3083 + }, + { + "epoch": 0.8433141919606235, + "grad_norm": 0.17339888215065002, + "learning_rate": 5e-05, + "loss": 1.4646, + "step": 3084 + }, + { + "epoch": 0.843587640142193, + "grad_norm": 0.16728384792804718, + "learning_rate": 5e-05, + "loss": 1.5108, + "step": 3085 + }, + { + "epoch": 0.8438610883237626, + "grad_norm": 0.1811836063861847, + "learning_rate": 5e-05, + "loss": 1.6013, + "step": 3086 + }, + { + "epoch": 0.8441345365053322, + "grad_norm": 0.15502838790416718, + "learning_rate": 5e-05, + "loss": 1.5567, + "step": 3087 + }, + { + "epoch": 0.8444079846869018, + "grad_norm": 0.17469522356987, + "learning_rate": 5e-05, + "loss": 1.6128, + "step": 3088 + }, + { + "epoch": 0.8446814328684714, + "grad_norm": 0.1718490719795227, + "learning_rate": 5e-05, + "loss": 1.6778, + "step": 3089 + }, + { + "epoch": 0.844954881050041, + "grad_norm": 0.1672654151916504, + "learning_rate": 5e-05, + "loss": 1.5281, + "step": 3090 + }, + { + "epoch": 0.8452283292316106, + "grad_norm": 0.18899762630462646, + "learning_rate": 5e-05, + "loss": 1.5707, + "step": 3091 + }, + { + "epoch": 0.8455017774131802, + "grad_norm": 0.16991832852363586, + "learning_rate": 5e-05, + "loss": 1.6655, + "step": 3092 + }, + { + "epoch": 0.8457752255947498, + "grad_norm": 0.16526849567890167, + "learning_rate": 5e-05, + "loss": 1.5944, + "step": 3093 + }, + { + "epoch": 0.8460486737763194, + "grad_norm": 0.1665259599685669, + "learning_rate": 5e-05, + "loss": 1.6564, + "step": 3094 + }, + { + "epoch": 0.846322121957889, + "grad_norm": 0.16578637063503265, + "learning_rate": 5e-05, + "loss": 1.5696, + "step": 3095 + }, + { + "epoch": 0.8465955701394585, + "grad_norm": 0.15842179954051971, + "learning_rate": 5e-05, + "loss": 1.5215, + "step": 3096 + }, + { + "epoch": 0.8468690183210281, + "grad_norm": 0.1621725708246231, + "learning_rate": 5e-05, + "loss": 1.6389, + "step": 3097 + }, + { + "epoch": 0.8471424665025977, + "grad_norm": 0.15811792016029358, + "learning_rate": 5e-05, + "loss": 1.5293, + "step": 3098 + }, + { + "epoch": 0.8474159146841673, + "grad_norm": 0.17002242803573608, + "learning_rate": 5e-05, + "loss": 1.5781, + "step": 3099 + }, + { + "epoch": 0.8476893628657369, + "grad_norm": 0.16223548352718353, + "learning_rate": 5e-05, + "loss": 1.5776, + "step": 3100 + }, + { + "epoch": 0.8479628110473065, + "grad_norm": 0.17608049511909485, + "learning_rate": 5e-05, + "loss": 1.5716, + "step": 3101 + }, + { + "epoch": 0.8482362592288761, + "grad_norm": 0.16243280470371246, + "learning_rate": 5e-05, + "loss": 1.582, + "step": 3102 + }, + { + "epoch": 0.8485097074104457, + "grad_norm": 0.1691652089357376, + "learning_rate": 5e-05, + "loss": 1.5969, + "step": 3103 + }, + { + "epoch": 0.8487831555920153, + "grad_norm": 0.17655591666698456, + "learning_rate": 5e-05, + "loss": 1.5828, + "step": 3104 + }, + { + "epoch": 0.8490566037735849, + "grad_norm": 0.16086623072624207, + "learning_rate": 5e-05, + "loss": 1.5631, + "step": 3105 + }, + { + "epoch": 0.8493300519551545, + "grad_norm": 0.1816483736038208, + "learning_rate": 5e-05, + "loss": 1.723, + "step": 3106 + }, + { + "epoch": 0.8496035001367241, + "grad_norm": 0.17240236699581146, + "learning_rate": 5e-05, + "loss": 1.669, + "step": 3107 + }, + { + "epoch": 0.8498769483182936, + "grad_norm": 0.16476856172084808, + "learning_rate": 5e-05, + "loss": 1.5872, + "step": 3108 + }, + { + "epoch": 0.8501503964998632, + "grad_norm": 0.15981562435626984, + "learning_rate": 5e-05, + "loss": 1.5647, + "step": 3109 + }, + { + "epoch": 0.8504238446814328, + "grad_norm": 0.1652696579694748, + "learning_rate": 5e-05, + "loss": 1.5882, + "step": 3110 + }, + { + "epoch": 0.8506972928630024, + "grad_norm": 0.16290616989135742, + "learning_rate": 5e-05, + "loss": 1.6325, + "step": 3111 + }, + { + "epoch": 0.850970741044572, + "grad_norm": 0.16608236730098724, + "learning_rate": 5e-05, + "loss": 1.6638, + "step": 3112 + }, + { + "epoch": 0.8512441892261416, + "grad_norm": 0.16186489164829254, + "learning_rate": 5e-05, + "loss": 1.5525, + "step": 3113 + }, + { + "epoch": 0.8515176374077112, + "grad_norm": 0.1654919683933258, + "learning_rate": 5e-05, + "loss": 1.5305, + "step": 3114 + }, + { + "epoch": 0.8517910855892808, + "grad_norm": 0.16424906253814697, + "learning_rate": 5e-05, + "loss": 1.5848, + "step": 3115 + }, + { + "epoch": 0.8520645337708505, + "grad_norm": 0.16823723912239075, + "learning_rate": 5e-05, + "loss": 1.5737, + "step": 3116 + }, + { + "epoch": 0.85233798195242, + "grad_norm": 0.16373160481452942, + "learning_rate": 5e-05, + "loss": 1.6274, + "step": 3117 + }, + { + "epoch": 0.8526114301339897, + "grad_norm": 0.18197789788246155, + "learning_rate": 5e-05, + "loss": 1.6227, + "step": 3118 + }, + { + "epoch": 0.8528848783155591, + "grad_norm": 0.15607775747776031, + "learning_rate": 5e-05, + "loss": 1.5371, + "step": 3119 + }, + { + "epoch": 0.8531583264971287, + "grad_norm": 0.16798537969589233, + "learning_rate": 5e-05, + "loss": 1.5632, + "step": 3120 + }, + { + "epoch": 0.8534317746786984, + "grad_norm": 0.164012148976326, + "learning_rate": 5e-05, + "loss": 1.5629, + "step": 3121 + }, + { + "epoch": 0.853705222860268, + "grad_norm": 0.176783949136734, + "learning_rate": 5e-05, + "loss": 1.5977, + "step": 3122 + }, + { + "epoch": 0.8539786710418376, + "grad_norm": 0.15592290461063385, + "learning_rate": 5e-05, + "loss": 1.5529, + "step": 3123 + }, + { + "epoch": 0.8542521192234072, + "grad_norm": 0.18195119500160217, + "learning_rate": 5e-05, + "loss": 1.6908, + "step": 3124 + }, + { + "epoch": 0.8545255674049768, + "grad_norm": 0.174040749669075, + "learning_rate": 5e-05, + "loss": 1.5646, + "step": 3125 + }, + { + "epoch": 0.8547990155865464, + "grad_norm": 0.16298840939998627, + "learning_rate": 5e-05, + "loss": 1.5824, + "step": 3126 + }, + { + "epoch": 0.855072463768116, + "grad_norm": 0.16243524849414825, + "learning_rate": 5e-05, + "loss": 1.603, + "step": 3127 + }, + { + "epoch": 0.8553459119496856, + "grad_norm": 0.16204892098903656, + "learning_rate": 5e-05, + "loss": 1.595, + "step": 3128 + }, + { + "epoch": 0.8556193601312552, + "grad_norm": 0.16708557307720184, + "learning_rate": 5e-05, + "loss": 1.6143, + "step": 3129 + }, + { + "epoch": 0.8558928083128248, + "grad_norm": 0.1747390627861023, + "learning_rate": 5e-05, + "loss": 1.5863, + "step": 3130 + }, + { + "epoch": 0.8561662564943943, + "grad_norm": 0.17126700282096863, + "learning_rate": 5e-05, + "loss": 1.6091, + "step": 3131 + }, + { + "epoch": 0.8564397046759639, + "grad_norm": 0.1731352061033249, + "learning_rate": 5e-05, + "loss": 1.6073, + "step": 3132 + }, + { + "epoch": 0.8567131528575335, + "grad_norm": 0.17451922595500946, + "learning_rate": 5e-05, + "loss": 1.7312, + "step": 3133 + }, + { + "epoch": 0.8569866010391031, + "grad_norm": 0.156746968626976, + "learning_rate": 5e-05, + "loss": 1.4434, + "step": 3134 + }, + { + "epoch": 0.8572600492206727, + "grad_norm": 0.1696559190750122, + "learning_rate": 5e-05, + "loss": 1.6185, + "step": 3135 + }, + { + "epoch": 0.8575334974022423, + "grad_norm": 0.1689455807209015, + "learning_rate": 5e-05, + "loss": 1.5066, + "step": 3136 + }, + { + "epoch": 0.8578069455838119, + "grad_norm": 0.16820566356182098, + "learning_rate": 5e-05, + "loss": 1.5294, + "step": 3137 + }, + { + "epoch": 0.8580803937653815, + "grad_norm": 0.18438181281089783, + "learning_rate": 5e-05, + "loss": 1.6513, + "step": 3138 + }, + { + "epoch": 0.8583538419469511, + "grad_norm": 0.1611601859331131, + "learning_rate": 5e-05, + "loss": 1.4604, + "step": 3139 + }, + { + "epoch": 0.8586272901285207, + "grad_norm": 0.16456107795238495, + "learning_rate": 5e-05, + "loss": 1.5664, + "step": 3140 + }, + { + "epoch": 0.8589007383100903, + "grad_norm": 0.16825945675373077, + "learning_rate": 5e-05, + "loss": 1.5777, + "step": 3141 + }, + { + "epoch": 0.8591741864916599, + "grad_norm": 0.16525815427303314, + "learning_rate": 5e-05, + "loss": 1.5464, + "step": 3142 + }, + { + "epoch": 0.8594476346732294, + "grad_norm": 0.17219798266887665, + "learning_rate": 5e-05, + "loss": 1.6255, + "step": 3143 + }, + { + "epoch": 0.859721082854799, + "grad_norm": 0.1633169800043106, + "learning_rate": 5e-05, + "loss": 1.5723, + "step": 3144 + }, + { + "epoch": 0.8599945310363686, + "grad_norm": 0.15741117298603058, + "learning_rate": 5e-05, + "loss": 1.5098, + "step": 3145 + }, + { + "epoch": 0.8602679792179382, + "grad_norm": 0.17558981478214264, + "learning_rate": 5e-05, + "loss": 1.6152, + "step": 3146 + }, + { + "epoch": 0.8605414273995078, + "grad_norm": 0.16872353851795197, + "learning_rate": 5e-05, + "loss": 1.6884, + "step": 3147 + }, + { + "epoch": 0.8608148755810774, + "grad_norm": 0.18673337996006012, + "learning_rate": 5e-05, + "loss": 1.6064, + "step": 3148 + }, + { + "epoch": 0.861088323762647, + "grad_norm": 0.16644282639026642, + "learning_rate": 5e-05, + "loss": 1.518, + "step": 3149 + }, + { + "epoch": 0.8613617719442166, + "grad_norm": 0.16721606254577637, + "learning_rate": 5e-05, + "loss": 1.5891, + "step": 3150 + }, + { + "epoch": 0.8616352201257862, + "grad_norm": 0.17002660036087036, + "learning_rate": 5e-05, + "loss": 1.5312, + "step": 3151 + }, + { + "epoch": 0.8619086683073558, + "grad_norm": 0.16748939454555511, + "learning_rate": 5e-05, + "loss": 1.6485, + "step": 3152 + }, + { + "epoch": 0.8621821164889254, + "grad_norm": 0.15232722461223602, + "learning_rate": 5e-05, + "loss": 1.4527, + "step": 3153 + }, + { + "epoch": 0.8624555646704949, + "grad_norm": 0.16974018514156342, + "learning_rate": 5e-05, + "loss": 1.5227, + "step": 3154 + }, + { + "epoch": 0.8627290128520645, + "grad_norm": 0.16853350400924683, + "learning_rate": 5e-05, + "loss": 1.5233, + "step": 3155 + }, + { + "epoch": 0.8630024610336341, + "grad_norm": 0.16759264469146729, + "learning_rate": 5e-05, + "loss": 1.6178, + "step": 3156 + }, + { + "epoch": 0.8632759092152037, + "grad_norm": 0.18532946705818176, + "learning_rate": 5e-05, + "loss": 1.4664, + "step": 3157 + }, + { + "epoch": 0.8635493573967733, + "grad_norm": 0.16451618075370789, + "learning_rate": 5e-05, + "loss": 1.5393, + "step": 3158 + }, + { + "epoch": 0.8638228055783429, + "grad_norm": 0.17090554535388947, + "learning_rate": 5e-05, + "loss": 1.6055, + "step": 3159 + }, + { + "epoch": 0.8640962537599125, + "grad_norm": 0.16245882213115692, + "learning_rate": 5e-05, + "loss": 1.5864, + "step": 3160 + }, + { + "epoch": 0.8643697019414821, + "grad_norm": 0.16497839987277985, + "learning_rate": 5e-05, + "loss": 1.5725, + "step": 3161 + }, + { + "epoch": 0.8646431501230517, + "grad_norm": 0.1678629368543625, + "learning_rate": 5e-05, + "loss": 1.6436, + "step": 3162 + }, + { + "epoch": 0.8649165983046213, + "grad_norm": 0.17666424810886383, + "learning_rate": 5e-05, + "loss": 1.5639, + "step": 3163 + }, + { + "epoch": 0.8651900464861909, + "grad_norm": 0.16444604098796844, + "learning_rate": 5e-05, + "loss": 1.6184, + "step": 3164 + }, + { + "epoch": 0.8654634946677605, + "grad_norm": 0.1679319143295288, + "learning_rate": 5e-05, + "loss": 1.5206, + "step": 3165 + }, + { + "epoch": 0.86573694284933, + "grad_norm": 0.16246174275875092, + "learning_rate": 5e-05, + "loss": 1.5794, + "step": 3166 + }, + { + "epoch": 0.8660103910308996, + "grad_norm": 0.17036394774913788, + "learning_rate": 5e-05, + "loss": 1.6325, + "step": 3167 + }, + { + "epoch": 0.8662838392124692, + "grad_norm": 0.1656508892774582, + "learning_rate": 5e-05, + "loss": 1.5285, + "step": 3168 + }, + { + "epoch": 0.8665572873940388, + "grad_norm": 0.17411690950393677, + "learning_rate": 5e-05, + "loss": 1.5536, + "step": 3169 + }, + { + "epoch": 0.8668307355756084, + "grad_norm": 0.1696336418390274, + "learning_rate": 5e-05, + "loss": 1.5778, + "step": 3170 + }, + { + "epoch": 0.867104183757178, + "grad_norm": 0.17345334589481354, + "learning_rate": 5e-05, + "loss": 1.5805, + "step": 3171 + }, + { + "epoch": 0.8673776319387476, + "grad_norm": 0.1722104400396347, + "learning_rate": 5e-05, + "loss": 1.6734, + "step": 3172 + }, + { + "epoch": 0.8676510801203172, + "grad_norm": 0.17595550417900085, + "learning_rate": 5e-05, + "loss": 1.6609, + "step": 3173 + }, + { + "epoch": 0.8679245283018868, + "grad_norm": 0.1680113971233368, + "learning_rate": 5e-05, + "loss": 1.6642, + "step": 3174 + }, + { + "epoch": 0.8681979764834564, + "grad_norm": 0.16808103024959564, + "learning_rate": 5e-05, + "loss": 1.5834, + "step": 3175 + }, + { + "epoch": 0.868471424665026, + "grad_norm": 0.16665758192539215, + "learning_rate": 5e-05, + "loss": 1.6229, + "step": 3176 + }, + { + "epoch": 0.8687448728465955, + "grad_norm": 0.16951392590999603, + "learning_rate": 5e-05, + "loss": 1.5884, + "step": 3177 + }, + { + "epoch": 0.8690183210281651, + "grad_norm": 0.16685131192207336, + "learning_rate": 5e-05, + "loss": 1.632, + "step": 3178 + }, + { + "epoch": 0.8692917692097347, + "grad_norm": 0.1711168885231018, + "learning_rate": 5e-05, + "loss": 1.6524, + "step": 3179 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.16593927145004272, + "learning_rate": 5e-05, + "loss": 1.6038, + "step": 3180 + }, + { + "epoch": 0.8698386655728739, + "grad_norm": 0.16577638685703278, + "learning_rate": 5e-05, + "loss": 1.6469, + "step": 3181 + }, + { + "epoch": 0.8701121137544435, + "grad_norm": 0.17342355847358704, + "learning_rate": 5e-05, + "loss": 1.6564, + "step": 3182 + }, + { + "epoch": 0.8703855619360131, + "grad_norm": 0.15923261642456055, + "learning_rate": 5e-05, + "loss": 1.5506, + "step": 3183 + }, + { + "epoch": 0.8706590101175827, + "grad_norm": 0.18326669931411743, + "learning_rate": 5e-05, + "loss": 1.5866, + "step": 3184 + }, + { + "epoch": 0.8709324582991523, + "grad_norm": 0.17581258714199066, + "learning_rate": 5e-05, + "loss": 1.5521, + "step": 3185 + }, + { + "epoch": 0.8712059064807219, + "grad_norm": 0.1683703362941742, + "learning_rate": 5e-05, + "loss": 1.6314, + "step": 3186 + }, + { + "epoch": 0.8714793546622915, + "grad_norm": 0.17400458455085754, + "learning_rate": 5e-05, + "loss": 1.586, + "step": 3187 + }, + { + "epoch": 0.8717528028438611, + "grad_norm": 0.1864955574274063, + "learning_rate": 5e-05, + "loss": 1.6241, + "step": 3188 + }, + { + "epoch": 0.8720262510254306, + "grad_norm": 0.16646967828273773, + "learning_rate": 5e-05, + "loss": 1.568, + "step": 3189 + }, + { + "epoch": 0.8722996992070002, + "grad_norm": 0.17674346268177032, + "learning_rate": 5e-05, + "loss": 1.6457, + "step": 3190 + }, + { + "epoch": 0.8725731473885698, + "grad_norm": 0.18272368609905243, + "learning_rate": 5e-05, + "loss": 1.6196, + "step": 3191 + }, + { + "epoch": 0.8728465955701394, + "grad_norm": 0.16906492412090302, + "learning_rate": 5e-05, + "loss": 1.5682, + "step": 3192 + }, + { + "epoch": 0.873120043751709, + "grad_norm": 0.16809342801570892, + "learning_rate": 5e-05, + "loss": 1.5799, + "step": 3193 + }, + { + "epoch": 0.8733934919332786, + "grad_norm": 0.1753772646188736, + "learning_rate": 5e-05, + "loss": 1.5376, + "step": 3194 + }, + { + "epoch": 0.8736669401148482, + "grad_norm": 0.16110126674175262, + "learning_rate": 5e-05, + "loss": 1.6114, + "step": 3195 + }, + { + "epoch": 0.8739403882964178, + "grad_norm": 0.16913153231143951, + "learning_rate": 5e-05, + "loss": 1.5882, + "step": 3196 + }, + { + "epoch": 0.8742138364779874, + "grad_norm": 0.17413096129894257, + "learning_rate": 5e-05, + "loss": 1.577, + "step": 3197 + }, + { + "epoch": 0.874487284659557, + "grad_norm": 0.16093304753303528, + "learning_rate": 5e-05, + "loss": 1.5553, + "step": 3198 + }, + { + "epoch": 0.8747607328411267, + "grad_norm": 0.1645004004240036, + "learning_rate": 5e-05, + "loss": 1.5537, + "step": 3199 + }, + { + "epoch": 0.8750341810226961, + "grad_norm": 0.1645384430885315, + "learning_rate": 5e-05, + "loss": 1.629, + "step": 3200 + }, + { + "epoch": 0.8753076292042657, + "grad_norm": 0.17203739285469055, + "learning_rate": 5e-05, + "loss": 1.6529, + "step": 3201 + }, + { + "epoch": 0.8755810773858353, + "grad_norm": 0.1711164116859436, + "learning_rate": 5e-05, + "loss": 1.63, + "step": 3202 + }, + { + "epoch": 0.875854525567405, + "grad_norm": 0.17426781356334686, + "learning_rate": 5e-05, + "loss": 1.7013, + "step": 3203 + }, + { + "epoch": 0.8761279737489746, + "grad_norm": 0.1662566065788269, + "learning_rate": 5e-05, + "loss": 1.5933, + "step": 3204 + }, + { + "epoch": 0.8764014219305442, + "grad_norm": 0.17674902081489563, + "learning_rate": 5e-05, + "loss": 1.5578, + "step": 3205 + }, + { + "epoch": 0.8766748701121138, + "grad_norm": 0.15999096632003784, + "learning_rate": 5e-05, + "loss": 1.6195, + "step": 3206 + }, + { + "epoch": 0.8769483182936834, + "grad_norm": 0.16697584092617035, + "learning_rate": 5e-05, + "loss": 1.589, + "step": 3207 + }, + { + "epoch": 0.877221766475253, + "grad_norm": 0.16489756107330322, + "learning_rate": 5e-05, + "loss": 1.6149, + "step": 3208 + }, + { + "epoch": 0.8774952146568226, + "grad_norm": 0.16519208252429962, + "learning_rate": 5e-05, + "loss": 1.5, + "step": 3209 + }, + { + "epoch": 0.8777686628383922, + "grad_norm": 0.17452332377433777, + "learning_rate": 5e-05, + "loss": 1.6418, + "step": 3210 + }, + { + "epoch": 0.8780421110199618, + "grad_norm": 0.16637583076953888, + "learning_rate": 5e-05, + "loss": 1.5775, + "step": 3211 + }, + { + "epoch": 0.8783155592015313, + "grad_norm": 0.15892595052719116, + "learning_rate": 5e-05, + "loss": 1.498, + "step": 3212 + }, + { + "epoch": 0.8785890073831009, + "grad_norm": 0.17607276141643524, + "learning_rate": 5e-05, + "loss": 1.6636, + "step": 3213 + }, + { + "epoch": 0.8788624555646705, + "grad_norm": 0.1674719601869583, + "learning_rate": 5e-05, + "loss": 1.6106, + "step": 3214 + }, + { + "epoch": 0.8791359037462401, + "grad_norm": 0.17102982103824615, + "learning_rate": 5e-05, + "loss": 1.6174, + "step": 3215 + }, + { + "epoch": 0.8794093519278097, + "grad_norm": 0.16860932111740112, + "learning_rate": 5e-05, + "loss": 1.5382, + "step": 3216 + }, + { + "epoch": 0.8796828001093793, + "grad_norm": 0.15637625753879547, + "learning_rate": 5e-05, + "loss": 1.5326, + "step": 3217 + }, + { + "epoch": 0.8799562482909489, + "grad_norm": 0.16792798042297363, + "learning_rate": 5e-05, + "loss": 1.6486, + "step": 3218 + }, + { + "epoch": 0.8802296964725185, + "grad_norm": 0.17826606333255768, + "learning_rate": 5e-05, + "loss": 1.5327, + "step": 3219 + }, + { + "epoch": 0.8805031446540881, + "grad_norm": 0.1626116782426834, + "learning_rate": 5e-05, + "loss": 1.5084, + "step": 3220 + }, + { + "epoch": 0.8807765928356577, + "grad_norm": 0.16171365976333618, + "learning_rate": 5e-05, + "loss": 1.559, + "step": 3221 + }, + { + "epoch": 0.8810500410172273, + "grad_norm": 0.16499905288219452, + "learning_rate": 5e-05, + "loss": 1.4079, + "step": 3222 + }, + { + "epoch": 0.8813234891987969, + "grad_norm": 0.16512024402618408, + "learning_rate": 5e-05, + "loss": 1.5858, + "step": 3223 + }, + { + "epoch": 0.8815969373803664, + "grad_norm": 0.1706002801656723, + "learning_rate": 5e-05, + "loss": 1.6038, + "step": 3224 + }, + { + "epoch": 0.881870385561936, + "grad_norm": 0.17629894614219666, + "learning_rate": 5e-05, + "loss": 1.6537, + "step": 3225 + }, + { + "epoch": 0.8821438337435056, + "grad_norm": 0.16706202924251556, + "learning_rate": 5e-05, + "loss": 1.5556, + "step": 3226 + }, + { + "epoch": 0.8824172819250752, + "grad_norm": 0.1704121083021164, + "learning_rate": 5e-05, + "loss": 1.6341, + "step": 3227 + }, + { + "epoch": 0.8826907301066448, + "grad_norm": 0.17939800024032593, + "learning_rate": 5e-05, + "loss": 1.4981, + "step": 3228 + }, + { + "epoch": 0.8829641782882144, + "grad_norm": 0.17186492681503296, + "learning_rate": 5e-05, + "loss": 1.5591, + "step": 3229 + }, + { + "epoch": 0.883237626469784, + "grad_norm": 0.160826176404953, + "learning_rate": 5e-05, + "loss": 1.5478, + "step": 3230 + }, + { + "epoch": 0.8835110746513536, + "grad_norm": 0.190667062997818, + "learning_rate": 5e-05, + "loss": 1.5104, + "step": 3231 + }, + { + "epoch": 0.8837845228329232, + "grad_norm": 0.18037305772304535, + "learning_rate": 5e-05, + "loss": 1.6009, + "step": 3232 + }, + { + "epoch": 0.8840579710144928, + "grad_norm": 0.15924717485904694, + "learning_rate": 5e-05, + "loss": 1.5045, + "step": 3233 + }, + { + "epoch": 0.8843314191960624, + "grad_norm": 0.19811680912971497, + "learning_rate": 5e-05, + "loss": 1.5899, + "step": 3234 + }, + { + "epoch": 0.8846048673776319, + "grad_norm": 0.1798917055130005, + "learning_rate": 5e-05, + "loss": 1.5403, + "step": 3235 + }, + { + "epoch": 0.8848783155592015, + "grad_norm": 0.17577169835567474, + "learning_rate": 5e-05, + "loss": 1.5191, + "step": 3236 + }, + { + "epoch": 0.8851517637407711, + "grad_norm": 0.18266037106513977, + "learning_rate": 5e-05, + "loss": 1.5779, + "step": 3237 + }, + { + "epoch": 0.8854252119223407, + "grad_norm": 0.1692240834236145, + "learning_rate": 5e-05, + "loss": 1.5353, + "step": 3238 + }, + { + "epoch": 0.8856986601039103, + "grad_norm": 0.1703694760799408, + "learning_rate": 5e-05, + "loss": 1.5582, + "step": 3239 + }, + { + "epoch": 0.8859721082854799, + "grad_norm": 0.1754840612411499, + "learning_rate": 5e-05, + "loss": 1.5573, + "step": 3240 + }, + { + "epoch": 0.8862455564670495, + "grad_norm": 0.16203297674655914, + "learning_rate": 5e-05, + "loss": 1.5276, + "step": 3241 + }, + { + "epoch": 0.8865190046486191, + "grad_norm": 0.15817001461982727, + "learning_rate": 5e-05, + "loss": 1.583, + "step": 3242 + }, + { + "epoch": 0.8867924528301887, + "grad_norm": 0.1767439991235733, + "learning_rate": 5e-05, + "loss": 1.6644, + "step": 3243 + }, + { + "epoch": 0.8870659010117583, + "grad_norm": 0.16701658070087433, + "learning_rate": 5e-05, + "loss": 1.5825, + "step": 3244 + }, + { + "epoch": 0.8873393491933279, + "grad_norm": 0.16390787065029144, + "learning_rate": 5e-05, + "loss": 1.6174, + "step": 3245 + }, + { + "epoch": 0.8876127973748975, + "grad_norm": 0.1741890013217926, + "learning_rate": 5e-05, + "loss": 1.5714, + "step": 3246 + }, + { + "epoch": 0.887886245556467, + "grad_norm": 0.1701374500989914, + "learning_rate": 5e-05, + "loss": 1.6041, + "step": 3247 + }, + { + "epoch": 0.8881596937380366, + "grad_norm": 0.18417386710643768, + "learning_rate": 5e-05, + "loss": 1.6007, + "step": 3248 + }, + { + "epoch": 0.8884331419196062, + "grad_norm": 0.18053127825260162, + "learning_rate": 5e-05, + "loss": 1.5415, + "step": 3249 + }, + { + "epoch": 0.8887065901011758, + "grad_norm": 0.16399511694908142, + "learning_rate": 5e-05, + "loss": 1.5805, + "step": 3250 + }, + { + "epoch": 0.8889800382827454, + "grad_norm": 0.1606212705373764, + "learning_rate": 5e-05, + "loss": 1.5433, + "step": 3251 + }, + { + "epoch": 0.889253486464315, + "grad_norm": 0.16091781854629517, + "learning_rate": 5e-05, + "loss": 1.5801, + "step": 3252 + }, + { + "epoch": 0.8895269346458846, + "grad_norm": 0.17079661786556244, + "learning_rate": 5e-05, + "loss": 1.6032, + "step": 3253 + }, + { + "epoch": 0.8898003828274542, + "grad_norm": 0.16236819326877594, + "learning_rate": 5e-05, + "loss": 1.5309, + "step": 3254 + }, + { + "epoch": 0.8900738310090238, + "grad_norm": 0.16366977989673615, + "learning_rate": 5e-05, + "loss": 1.4854, + "step": 3255 + }, + { + "epoch": 0.8903472791905934, + "grad_norm": 0.1692305952310562, + "learning_rate": 5e-05, + "loss": 1.5546, + "step": 3256 + }, + { + "epoch": 0.890620727372163, + "grad_norm": 0.1591336876153946, + "learning_rate": 5e-05, + "loss": 1.5724, + "step": 3257 + }, + { + "epoch": 0.8908941755537325, + "grad_norm": 0.1661178022623062, + "learning_rate": 5e-05, + "loss": 1.6642, + "step": 3258 + }, + { + "epoch": 0.8911676237353021, + "grad_norm": 0.1741829365491867, + "learning_rate": 5e-05, + "loss": 1.5808, + "step": 3259 + }, + { + "epoch": 0.8914410719168717, + "grad_norm": 0.16046129167079926, + "learning_rate": 5e-05, + "loss": 1.6136, + "step": 3260 + }, + { + "epoch": 0.8917145200984413, + "grad_norm": 0.16220787167549133, + "learning_rate": 5e-05, + "loss": 1.5145, + "step": 3261 + }, + { + "epoch": 0.8919879682800109, + "grad_norm": 0.17300353944301605, + "learning_rate": 5e-05, + "loss": 1.5805, + "step": 3262 + }, + { + "epoch": 0.8922614164615805, + "grad_norm": 0.1636408567428589, + "learning_rate": 5e-05, + "loss": 1.5679, + "step": 3263 + }, + { + "epoch": 0.8925348646431501, + "grad_norm": 0.1789669394493103, + "learning_rate": 5e-05, + "loss": 1.7185, + "step": 3264 + }, + { + "epoch": 0.8928083128247197, + "grad_norm": 0.161798357963562, + "learning_rate": 5e-05, + "loss": 1.4734, + "step": 3265 + }, + { + "epoch": 0.8930817610062893, + "grad_norm": 0.1663116067647934, + "learning_rate": 5e-05, + "loss": 1.559, + "step": 3266 + }, + { + "epoch": 0.8933552091878589, + "grad_norm": 0.16649827361106873, + "learning_rate": 5e-05, + "loss": 1.5498, + "step": 3267 + }, + { + "epoch": 0.8936286573694285, + "grad_norm": 0.1723203808069229, + "learning_rate": 5e-05, + "loss": 1.623, + "step": 3268 + }, + { + "epoch": 0.8939021055509981, + "grad_norm": 0.16643695533275604, + "learning_rate": 5e-05, + "loss": 1.6505, + "step": 3269 + }, + { + "epoch": 0.8941755537325676, + "grad_norm": 0.16589291393756866, + "learning_rate": 5e-05, + "loss": 1.6633, + "step": 3270 + }, + { + "epoch": 0.8944490019141372, + "grad_norm": 0.1624186635017395, + "learning_rate": 5e-05, + "loss": 1.5022, + "step": 3271 + }, + { + "epoch": 0.8947224500957068, + "grad_norm": 0.1685740202665329, + "learning_rate": 5e-05, + "loss": 1.6225, + "step": 3272 + }, + { + "epoch": 0.8949958982772764, + "grad_norm": 0.16466566920280457, + "learning_rate": 5e-05, + "loss": 1.6597, + "step": 3273 + }, + { + "epoch": 0.895269346458846, + "grad_norm": 0.16794006526470184, + "learning_rate": 5e-05, + "loss": 1.5795, + "step": 3274 + }, + { + "epoch": 0.8955427946404156, + "grad_norm": 0.1675807386636734, + "learning_rate": 5e-05, + "loss": 1.6751, + "step": 3275 + }, + { + "epoch": 0.8958162428219852, + "grad_norm": 0.1637728214263916, + "learning_rate": 5e-05, + "loss": 1.5746, + "step": 3276 + }, + { + "epoch": 0.8960896910035548, + "grad_norm": 0.16383953392505646, + "learning_rate": 5e-05, + "loss": 1.5741, + "step": 3277 + }, + { + "epoch": 0.8963631391851244, + "grad_norm": 0.16199614107608795, + "learning_rate": 5e-05, + "loss": 1.5629, + "step": 3278 + }, + { + "epoch": 0.896636587366694, + "grad_norm": 0.17193222045898438, + "learning_rate": 5e-05, + "loss": 1.6165, + "step": 3279 + }, + { + "epoch": 0.8969100355482637, + "grad_norm": 0.16081291437149048, + "learning_rate": 5e-05, + "loss": 1.5044, + "step": 3280 + }, + { + "epoch": 0.8971834837298331, + "grad_norm": 0.16551733016967773, + "learning_rate": 5e-05, + "loss": 1.6052, + "step": 3281 + }, + { + "epoch": 0.8974569319114027, + "grad_norm": 0.1634800136089325, + "learning_rate": 5e-05, + "loss": 1.5435, + "step": 3282 + }, + { + "epoch": 0.8977303800929723, + "grad_norm": 0.17315998673439026, + "learning_rate": 5e-05, + "loss": 1.6515, + "step": 3283 + }, + { + "epoch": 0.898003828274542, + "grad_norm": 0.17361579835414886, + "learning_rate": 5e-05, + "loss": 1.6037, + "step": 3284 + }, + { + "epoch": 0.8982772764561116, + "grad_norm": 0.17537641525268555, + "learning_rate": 5e-05, + "loss": 1.6171, + "step": 3285 + }, + { + "epoch": 0.8985507246376812, + "grad_norm": 0.17007243633270264, + "learning_rate": 5e-05, + "loss": 1.6756, + "step": 3286 + }, + { + "epoch": 0.8988241728192508, + "grad_norm": 0.1736474186182022, + "learning_rate": 5e-05, + "loss": 1.5852, + "step": 3287 + }, + { + "epoch": 0.8990976210008204, + "grad_norm": 0.1645202934741974, + "learning_rate": 5e-05, + "loss": 1.6534, + "step": 3288 + }, + { + "epoch": 0.89937106918239, + "grad_norm": 0.16424302756786346, + "learning_rate": 5e-05, + "loss": 1.5194, + "step": 3289 + }, + { + "epoch": 0.8996445173639596, + "grad_norm": 0.17903803288936615, + "learning_rate": 5e-05, + "loss": 1.5683, + "step": 3290 + }, + { + "epoch": 0.8999179655455292, + "grad_norm": 0.16927549242973328, + "learning_rate": 5e-05, + "loss": 1.6283, + "step": 3291 + }, + { + "epoch": 0.9001914137270988, + "grad_norm": 0.16738973557949066, + "learning_rate": 5e-05, + "loss": 1.6196, + "step": 3292 + }, + { + "epoch": 0.9004648619086683, + "grad_norm": 0.16482320427894592, + "learning_rate": 5e-05, + "loss": 1.5135, + "step": 3293 + }, + { + "epoch": 0.9007383100902379, + "grad_norm": 0.1613280475139618, + "learning_rate": 5e-05, + "loss": 1.6036, + "step": 3294 + }, + { + "epoch": 0.9010117582718075, + "grad_norm": 0.16160857677459717, + "learning_rate": 5e-05, + "loss": 1.5348, + "step": 3295 + }, + { + "epoch": 0.9012852064533771, + "grad_norm": 0.1576218158006668, + "learning_rate": 5e-05, + "loss": 1.5171, + "step": 3296 + }, + { + "epoch": 0.9015586546349467, + "grad_norm": 0.1682513952255249, + "learning_rate": 5e-05, + "loss": 1.5429, + "step": 3297 + }, + { + "epoch": 0.9018321028165163, + "grad_norm": 0.16176675260066986, + "learning_rate": 5e-05, + "loss": 1.5041, + "step": 3298 + }, + { + "epoch": 0.9021055509980859, + "grad_norm": 0.16141022741794586, + "learning_rate": 5e-05, + "loss": 1.5655, + "step": 3299 + }, + { + "epoch": 0.9023789991796555, + "grad_norm": 0.16335152089595795, + "learning_rate": 5e-05, + "loss": 1.6064, + "step": 3300 + }, + { + "epoch": 0.9026524473612251, + "grad_norm": 0.16691720485687256, + "learning_rate": 5e-05, + "loss": 1.6316, + "step": 3301 + }, + { + "epoch": 0.9029258955427947, + "grad_norm": 0.16636385023593903, + "learning_rate": 5e-05, + "loss": 1.59, + "step": 3302 + }, + { + "epoch": 0.9031993437243643, + "grad_norm": 0.1710832118988037, + "learning_rate": 5e-05, + "loss": 1.5199, + "step": 3303 + }, + { + "epoch": 0.9034727919059339, + "grad_norm": 0.169814333319664, + "learning_rate": 5e-05, + "loss": 1.5754, + "step": 3304 + }, + { + "epoch": 0.9037462400875034, + "grad_norm": 0.1706944704055786, + "learning_rate": 5e-05, + "loss": 1.6101, + "step": 3305 + }, + { + "epoch": 0.904019688269073, + "grad_norm": 0.15784287452697754, + "learning_rate": 5e-05, + "loss": 1.5232, + "step": 3306 + }, + { + "epoch": 0.9042931364506426, + "grad_norm": 0.1703675389289856, + "learning_rate": 5e-05, + "loss": 1.6001, + "step": 3307 + }, + { + "epoch": 0.9045665846322122, + "grad_norm": 0.16346101462841034, + "learning_rate": 5e-05, + "loss": 1.5885, + "step": 3308 + }, + { + "epoch": 0.9048400328137818, + "grad_norm": 0.18370665609836578, + "learning_rate": 5e-05, + "loss": 1.6754, + "step": 3309 + }, + { + "epoch": 0.9051134809953514, + "grad_norm": 0.16520695388317108, + "learning_rate": 5e-05, + "loss": 1.5961, + "step": 3310 + }, + { + "epoch": 0.905386929176921, + "grad_norm": 0.17497234046459198, + "learning_rate": 5e-05, + "loss": 1.5912, + "step": 3311 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 0.16902846097946167, + "learning_rate": 5e-05, + "loss": 1.6057, + "step": 3312 + }, + { + "epoch": 0.9059338255400602, + "grad_norm": 0.16770271956920624, + "learning_rate": 5e-05, + "loss": 1.6274, + "step": 3313 + }, + { + "epoch": 0.9062072737216298, + "grad_norm": 0.16241686046123505, + "learning_rate": 5e-05, + "loss": 1.5494, + "step": 3314 + }, + { + "epoch": 0.9064807219031994, + "grad_norm": 0.16672220826148987, + "learning_rate": 5e-05, + "loss": 1.6654, + "step": 3315 + }, + { + "epoch": 0.9067541700847689, + "grad_norm": 0.1626524180173874, + "learning_rate": 5e-05, + "loss": 1.5578, + "step": 3316 + }, + { + "epoch": 0.9070276182663385, + "grad_norm": 0.16794568300247192, + "learning_rate": 5e-05, + "loss": 1.6463, + "step": 3317 + }, + { + "epoch": 0.9073010664479081, + "grad_norm": 0.16510622203350067, + "learning_rate": 5e-05, + "loss": 1.597, + "step": 3318 + }, + { + "epoch": 0.9075745146294777, + "grad_norm": 0.16373200714588165, + "learning_rate": 5e-05, + "loss": 1.6206, + "step": 3319 + }, + { + "epoch": 0.9078479628110473, + "grad_norm": 0.16739732027053833, + "learning_rate": 5e-05, + "loss": 1.6041, + "step": 3320 + }, + { + "epoch": 0.9081214109926169, + "grad_norm": 0.17469698190689087, + "learning_rate": 5e-05, + "loss": 1.6364, + "step": 3321 + }, + { + "epoch": 0.9083948591741865, + "grad_norm": 0.1681319773197174, + "learning_rate": 5e-05, + "loss": 1.5485, + "step": 3322 + }, + { + "epoch": 0.9086683073557561, + "grad_norm": 0.1650519222021103, + "learning_rate": 5e-05, + "loss": 1.5211, + "step": 3323 + }, + { + "epoch": 0.9089417555373257, + "grad_norm": 0.167753666639328, + "learning_rate": 5e-05, + "loss": 1.6338, + "step": 3324 + }, + { + "epoch": 0.9092152037188953, + "grad_norm": 0.164119154214859, + "learning_rate": 5e-05, + "loss": 1.5542, + "step": 3325 + }, + { + "epoch": 0.9094886519004649, + "grad_norm": 0.16457271575927734, + "learning_rate": 5e-05, + "loss": 1.6337, + "step": 3326 + }, + { + "epoch": 0.9097621000820345, + "grad_norm": 0.17210492491722107, + "learning_rate": 5e-05, + "loss": 1.6361, + "step": 3327 + }, + { + "epoch": 0.910035548263604, + "grad_norm": 0.17178331315517426, + "learning_rate": 5e-05, + "loss": 1.6849, + "step": 3328 + }, + { + "epoch": 0.9103089964451736, + "grad_norm": 0.1700570285320282, + "learning_rate": 5e-05, + "loss": 1.6171, + "step": 3329 + }, + { + "epoch": 0.9105824446267432, + "grad_norm": 0.159864142537117, + "learning_rate": 5e-05, + "loss": 1.5302, + "step": 3330 + }, + { + "epoch": 0.9108558928083128, + "grad_norm": 0.16526418924331665, + "learning_rate": 5e-05, + "loss": 1.5488, + "step": 3331 + }, + { + "epoch": 0.9111293409898824, + "grad_norm": 0.16879980266094208, + "learning_rate": 5e-05, + "loss": 1.6115, + "step": 3332 + }, + { + "epoch": 0.911402789171452, + "grad_norm": 0.1696682572364807, + "learning_rate": 5e-05, + "loss": 1.6198, + "step": 3333 + }, + { + "epoch": 0.9116762373530216, + "grad_norm": 0.1723857969045639, + "learning_rate": 5e-05, + "loss": 1.4767, + "step": 3334 + }, + { + "epoch": 0.9119496855345912, + "grad_norm": 0.16016238927841187, + "learning_rate": 5e-05, + "loss": 1.5293, + "step": 3335 + }, + { + "epoch": 0.9122231337161608, + "grad_norm": 0.1660720705986023, + "learning_rate": 5e-05, + "loss": 1.5294, + "step": 3336 + }, + { + "epoch": 0.9124965818977304, + "grad_norm": 0.16862016916275024, + "learning_rate": 5e-05, + "loss": 1.5987, + "step": 3337 + }, + { + "epoch": 0.9127700300793, + "grad_norm": 0.17382635176181793, + "learning_rate": 5e-05, + "loss": 1.6834, + "step": 3338 + }, + { + "epoch": 0.9130434782608695, + "grad_norm": 0.18816977739334106, + "learning_rate": 5e-05, + "loss": 1.5269, + "step": 3339 + }, + { + "epoch": 0.9133169264424391, + "grad_norm": 0.17092393338680267, + "learning_rate": 5e-05, + "loss": 1.5092, + "step": 3340 + }, + { + "epoch": 0.9135903746240087, + "grad_norm": 0.1623528152704239, + "learning_rate": 5e-05, + "loss": 1.4633, + "step": 3341 + }, + { + "epoch": 0.9138638228055783, + "grad_norm": 0.16653479635715485, + "learning_rate": 5e-05, + "loss": 1.5334, + "step": 3342 + }, + { + "epoch": 0.9141372709871479, + "grad_norm": 0.1750710904598236, + "learning_rate": 5e-05, + "loss": 1.6056, + "step": 3343 + }, + { + "epoch": 0.9144107191687175, + "grad_norm": 0.1836736500263214, + "learning_rate": 5e-05, + "loss": 1.6098, + "step": 3344 + }, + { + "epoch": 0.9146841673502871, + "grad_norm": 0.1607818603515625, + "learning_rate": 5e-05, + "loss": 1.5236, + "step": 3345 + }, + { + "epoch": 0.9149576155318567, + "grad_norm": 0.1604374796152115, + "learning_rate": 5e-05, + "loss": 1.4829, + "step": 3346 + }, + { + "epoch": 0.9152310637134263, + "grad_norm": 0.16865043342113495, + "learning_rate": 5e-05, + "loss": 1.6105, + "step": 3347 + }, + { + "epoch": 0.9155045118949959, + "grad_norm": 0.17235668003559113, + "learning_rate": 5e-05, + "loss": 1.6007, + "step": 3348 + }, + { + "epoch": 0.9157779600765655, + "grad_norm": 0.16054973006248474, + "learning_rate": 5e-05, + "loss": 1.5466, + "step": 3349 + }, + { + "epoch": 0.9160514082581351, + "grad_norm": 0.1764586865901947, + "learning_rate": 5e-05, + "loss": 1.6551, + "step": 3350 + }, + { + "epoch": 0.9163248564397046, + "grad_norm": 0.15826088190078735, + "learning_rate": 5e-05, + "loss": 1.538, + "step": 3351 + }, + { + "epoch": 0.9165983046212742, + "grad_norm": 0.16789792478084564, + "learning_rate": 5e-05, + "loss": 1.5845, + "step": 3352 + }, + { + "epoch": 0.9168717528028438, + "grad_norm": 0.1768852025270462, + "learning_rate": 5e-05, + "loss": 1.5847, + "step": 3353 + }, + { + "epoch": 0.9171452009844134, + "grad_norm": 0.16304756700992584, + "learning_rate": 5e-05, + "loss": 1.5508, + "step": 3354 + }, + { + "epoch": 0.917418649165983, + "grad_norm": 0.16250407695770264, + "learning_rate": 5e-05, + "loss": 1.5004, + "step": 3355 + }, + { + "epoch": 0.9176920973475526, + "grad_norm": 0.18279612064361572, + "learning_rate": 5e-05, + "loss": 1.6446, + "step": 3356 + }, + { + "epoch": 0.9179655455291222, + "grad_norm": 0.16867072880268097, + "learning_rate": 5e-05, + "loss": 1.5235, + "step": 3357 + }, + { + "epoch": 0.9182389937106918, + "grad_norm": 0.16388383507728577, + "learning_rate": 5e-05, + "loss": 1.5635, + "step": 3358 + }, + { + "epoch": 0.9185124418922614, + "grad_norm": 0.16723370552062988, + "learning_rate": 5e-05, + "loss": 1.6193, + "step": 3359 + }, + { + "epoch": 0.918785890073831, + "grad_norm": 0.1766999363899231, + "learning_rate": 5e-05, + "loss": 1.6716, + "step": 3360 + }, + { + "epoch": 0.9190593382554006, + "grad_norm": 0.16779550909996033, + "learning_rate": 5e-05, + "loss": 1.5566, + "step": 3361 + }, + { + "epoch": 0.9193327864369701, + "grad_norm": 0.16861297190189362, + "learning_rate": 5e-05, + "loss": 1.6456, + "step": 3362 + }, + { + "epoch": 0.9196062346185397, + "grad_norm": 0.18011167645454407, + "learning_rate": 5e-05, + "loss": 1.6536, + "step": 3363 + }, + { + "epoch": 0.9198796828001093, + "grad_norm": 0.1704559475183487, + "learning_rate": 5e-05, + "loss": 1.6436, + "step": 3364 + }, + { + "epoch": 0.920153130981679, + "grad_norm": 0.16970516741275787, + "learning_rate": 5e-05, + "loss": 1.5132, + "step": 3365 + }, + { + "epoch": 0.9204265791632485, + "grad_norm": 0.1771494746208191, + "learning_rate": 5e-05, + "loss": 1.569, + "step": 3366 + }, + { + "epoch": 0.9207000273448182, + "grad_norm": 0.17308154702186584, + "learning_rate": 5e-05, + "loss": 1.6102, + "step": 3367 + }, + { + "epoch": 0.9209734755263878, + "grad_norm": 0.1670798808336258, + "learning_rate": 5e-05, + "loss": 1.5677, + "step": 3368 + }, + { + "epoch": 0.9212469237079574, + "grad_norm": 0.1702473908662796, + "learning_rate": 5e-05, + "loss": 1.56, + "step": 3369 + }, + { + "epoch": 0.921520371889527, + "grad_norm": 0.1635974645614624, + "learning_rate": 5e-05, + "loss": 1.5166, + "step": 3370 + }, + { + "epoch": 0.9217938200710966, + "grad_norm": 0.16810756921768188, + "learning_rate": 5e-05, + "loss": 1.5151, + "step": 3371 + }, + { + "epoch": 0.9220672682526662, + "grad_norm": 0.15942378342151642, + "learning_rate": 5e-05, + "loss": 1.5737, + "step": 3372 + }, + { + "epoch": 0.9223407164342358, + "grad_norm": 0.17249250411987305, + "learning_rate": 5e-05, + "loss": 1.5527, + "step": 3373 + }, + { + "epoch": 0.9226141646158053, + "grad_norm": 0.16102191805839539, + "learning_rate": 5e-05, + "loss": 1.5104, + "step": 3374 + }, + { + "epoch": 0.9228876127973749, + "grad_norm": 0.17065097391605377, + "learning_rate": 5e-05, + "loss": 1.5749, + "step": 3375 + }, + { + "epoch": 0.9231610609789445, + "grad_norm": 0.17183524370193481, + "learning_rate": 5e-05, + "loss": 1.5149, + "step": 3376 + }, + { + "epoch": 0.9234345091605141, + "grad_norm": 0.17372307181358337, + "learning_rate": 5e-05, + "loss": 1.6725, + "step": 3377 + }, + { + "epoch": 0.9237079573420837, + "grad_norm": 0.17314964532852173, + "learning_rate": 5e-05, + "loss": 1.5872, + "step": 3378 + }, + { + "epoch": 0.9239814055236533, + "grad_norm": 0.17013640701770782, + "learning_rate": 5e-05, + "loss": 1.6004, + "step": 3379 + }, + { + "epoch": 0.9242548537052229, + "grad_norm": 0.16398966312408447, + "learning_rate": 5e-05, + "loss": 1.5362, + "step": 3380 + }, + { + "epoch": 0.9245283018867925, + "grad_norm": 0.16314879059791565, + "learning_rate": 5e-05, + "loss": 1.473, + "step": 3381 + }, + { + "epoch": 0.9248017500683621, + "grad_norm": 0.164910227060318, + "learning_rate": 5e-05, + "loss": 1.5161, + "step": 3382 + }, + { + "epoch": 0.9250751982499317, + "grad_norm": 0.16402222216129303, + "learning_rate": 5e-05, + "loss": 1.5689, + "step": 3383 + }, + { + "epoch": 0.9253486464315013, + "grad_norm": 0.1658565104007721, + "learning_rate": 5e-05, + "loss": 1.5275, + "step": 3384 + }, + { + "epoch": 0.9256220946130709, + "grad_norm": 0.1655322015285492, + "learning_rate": 5e-05, + "loss": 1.5571, + "step": 3385 + }, + { + "epoch": 0.9258955427946404, + "grad_norm": 0.16581609845161438, + "learning_rate": 5e-05, + "loss": 1.6255, + "step": 3386 + }, + { + "epoch": 0.92616899097621, + "grad_norm": 0.1806444674730301, + "learning_rate": 5e-05, + "loss": 1.6025, + "step": 3387 + }, + { + "epoch": 0.9264424391577796, + "grad_norm": 0.1640556901693344, + "learning_rate": 5e-05, + "loss": 1.5615, + "step": 3388 + }, + { + "epoch": 0.9267158873393492, + "grad_norm": 0.15630288422107697, + "learning_rate": 5e-05, + "loss": 1.5421, + "step": 3389 + }, + { + "epoch": 0.9269893355209188, + "grad_norm": 0.16430017352104187, + "learning_rate": 5e-05, + "loss": 1.594, + "step": 3390 + }, + { + "epoch": 0.9272627837024884, + "grad_norm": 0.17217886447906494, + "learning_rate": 5e-05, + "loss": 1.5472, + "step": 3391 + }, + { + "epoch": 0.927536231884058, + "grad_norm": 0.17046159505844116, + "learning_rate": 5e-05, + "loss": 1.5272, + "step": 3392 + }, + { + "epoch": 0.9278096800656276, + "grad_norm": 0.16787946224212646, + "learning_rate": 5e-05, + "loss": 1.5965, + "step": 3393 + }, + { + "epoch": 0.9280831282471972, + "grad_norm": 0.1614540070295334, + "learning_rate": 5e-05, + "loss": 1.4654, + "step": 3394 + }, + { + "epoch": 0.9283565764287668, + "grad_norm": 0.16552212834358215, + "learning_rate": 5e-05, + "loss": 1.5901, + "step": 3395 + }, + { + "epoch": 0.9286300246103364, + "grad_norm": 0.1740752011537552, + "learning_rate": 5e-05, + "loss": 1.6107, + "step": 3396 + }, + { + "epoch": 0.9289034727919059, + "grad_norm": 0.16223378479480743, + "learning_rate": 5e-05, + "loss": 1.5006, + "step": 3397 + }, + { + "epoch": 0.9291769209734755, + "grad_norm": 0.18281352519989014, + "learning_rate": 5e-05, + "loss": 1.6221, + "step": 3398 + }, + { + "epoch": 0.9294503691550451, + "grad_norm": 0.17504733800888062, + "learning_rate": 5e-05, + "loss": 1.5975, + "step": 3399 + }, + { + "epoch": 0.9297238173366147, + "grad_norm": 0.1651788204908371, + "learning_rate": 5e-05, + "loss": 1.5912, + "step": 3400 + }, + { + "epoch": 0.9299972655181843, + "grad_norm": 0.172093003988266, + "learning_rate": 5e-05, + "loss": 1.6516, + "step": 3401 + }, + { + "epoch": 0.9302707136997539, + "grad_norm": 0.17026016116142273, + "learning_rate": 5e-05, + "loss": 1.5742, + "step": 3402 + }, + { + "epoch": 0.9305441618813235, + "grad_norm": 0.17041485011577606, + "learning_rate": 5e-05, + "loss": 1.6175, + "step": 3403 + }, + { + "epoch": 0.9308176100628931, + "grad_norm": 0.17498993873596191, + "learning_rate": 5e-05, + "loss": 1.554, + "step": 3404 + }, + { + "epoch": 0.9310910582444627, + "grad_norm": 0.1785575896501541, + "learning_rate": 5e-05, + "loss": 1.5705, + "step": 3405 + }, + { + "epoch": 0.9313645064260323, + "grad_norm": 0.17882578074932098, + "learning_rate": 5e-05, + "loss": 1.5676, + "step": 3406 + }, + { + "epoch": 0.9316379546076019, + "grad_norm": 0.16166472434997559, + "learning_rate": 5e-05, + "loss": 1.5654, + "step": 3407 + }, + { + "epoch": 0.9319114027891715, + "grad_norm": 0.1717950403690338, + "learning_rate": 5e-05, + "loss": 1.5881, + "step": 3408 + }, + { + "epoch": 0.932184850970741, + "grad_norm": 0.17605219781398773, + "learning_rate": 5e-05, + "loss": 1.5802, + "step": 3409 + }, + { + "epoch": 0.9324582991523106, + "grad_norm": 0.16096574068069458, + "learning_rate": 5e-05, + "loss": 1.51, + "step": 3410 + }, + { + "epoch": 0.9327317473338802, + "grad_norm": 0.16242116689682007, + "learning_rate": 5e-05, + "loss": 1.5665, + "step": 3411 + }, + { + "epoch": 0.9330051955154498, + "grad_norm": 0.17469222843647003, + "learning_rate": 5e-05, + "loss": 1.601, + "step": 3412 + }, + { + "epoch": 0.9332786436970194, + "grad_norm": 0.16520720720291138, + "learning_rate": 5e-05, + "loss": 1.5964, + "step": 3413 + }, + { + "epoch": 0.933552091878589, + "grad_norm": 0.17878501117229462, + "learning_rate": 5e-05, + "loss": 1.7176, + "step": 3414 + }, + { + "epoch": 0.9338255400601586, + "grad_norm": 0.16792459785938263, + "learning_rate": 5e-05, + "loss": 1.5655, + "step": 3415 + }, + { + "epoch": 0.9340989882417282, + "grad_norm": 0.1701999306678772, + "learning_rate": 5e-05, + "loss": 1.5766, + "step": 3416 + }, + { + "epoch": 0.9343724364232978, + "grad_norm": 0.17062143981456757, + "learning_rate": 5e-05, + "loss": 1.6155, + "step": 3417 + }, + { + "epoch": 0.9346458846048674, + "grad_norm": 0.17067895829677582, + "learning_rate": 5e-05, + "loss": 1.5856, + "step": 3418 + }, + { + "epoch": 0.934919332786437, + "grad_norm": 0.17343777418136597, + "learning_rate": 5e-05, + "loss": 1.5841, + "step": 3419 + }, + { + "epoch": 0.9351927809680065, + "grad_norm": 0.16160957515239716, + "learning_rate": 5e-05, + "loss": 1.5255, + "step": 3420 + }, + { + "epoch": 0.9354662291495761, + "grad_norm": 0.15786494314670563, + "learning_rate": 5e-05, + "loss": 1.507, + "step": 3421 + }, + { + "epoch": 0.9357396773311457, + "grad_norm": 0.18263064324855804, + "learning_rate": 5e-05, + "loss": 1.7325, + "step": 3422 + }, + { + "epoch": 0.9360131255127153, + "grad_norm": 0.16316089034080505, + "learning_rate": 5e-05, + "loss": 1.421, + "step": 3423 + }, + { + "epoch": 0.9362865736942849, + "grad_norm": 0.16691453754901886, + "learning_rate": 5e-05, + "loss": 1.6236, + "step": 3424 + }, + { + "epoch": 0.9365600218758545, + "grad_norm": 0.16807349026203156, + "learning_rate": 5e-05, + "loss": 1.5508, + "step": 3425 + }, + { + "epoch": 0.9368334700574241, + "grad_norm": 0.16728545725345612, + "learning_rate": 5e-05, + "loss": 1.4871, + "step": 3426 + }, + { + "epoch": 0.9371069182389937, + "grad_norm": 0.1841699630022049, + "learning_rate": 5e-05, + "loss": 1.5571, + "step": 3427 + }, + { + "epoch": 0.9373803664205633, + "grad_norm": 0.17346476018428802, + "learning_rate": 5e-05, + "loss": 1.5599, + "step": 3428 + }, + { + "epoch": 0.9376538146021329, + "grad_norm": 0.17243662476539612, + "learning_rate": 5e-05, + "loss": 1.6099, + "step": 3429 + }, + { + "epoch": 0.9379272627837025, + "grad_norm": 0.16930198669433594, + "learning_rate": 5e-05, + "loss": 1.6239, + "step": 3430 + }, + { + "epoch": 0.9382007109652721, + "grad_norm": 0.17017267644405365, + "learning_rate": 5e-05, + "loss": 1.6719, + "step": 3431 + }, + { + "epoch": 0.9384741591468416, + "grad_norm": 0.16782057285308838, + "learning_rate": 5e-05, + "loss": 1.6005, + "step": 3432 + }, + { + "epoch": 0.9387476073284112, + "grad_norm": 0.16345928609371185, + "learning_rate": 5e-05, + "loss": 1.6027, + "step": 3433 + }, + { + "epoch": 0.9390210555099808, + "grad_norm": 0.16896604001522064, + "learning_rate": 5e-05, + "loss": 1.6702, + "step": 3434 + }, + { + "epoch": 0.9392945036915504, + "grad_norm": 0.16361366212368011, + "learning_rate": 5e-05, + "loss": 1.6016, + "step": 3435 + }, + { + "epoch": 0.93956795187312, + "grad_norm": 0.1642320603132248, + "learning_rate": 5e-05, + "loss": 1.5547, + "step": 3436 + }, + { + "epoch": 0.9398414000546896, + "grad_norm": 0.1652129888534546, + "learning_rate": 5e-05, + "loss": 1.5415, + "step": 3437 + }, + { + "epoch": 0.9401148482362592, + "grad_norm": 0.16417425870895386, + "learning_rate": 5e-05, + "loss": 1.5715, + "step": 3438 + }, + { + "epoch": 0.9403882964178288, + "grad_norm": 0.16755367815494537, + "learning_rate": 5e-05, + "loss": 1.4629, + "step": 3439 + }, + { + "epoch": 0.9406617445993984, + "grad_norm": 0.1799638420343399, + "learning_rate": 5e-05, + "loss": 1.5797, + "step": 3440 + }, + { + "epoch": 0.940935192780968, + "grad_norm": 0.17268924415111542, + "learning_rate": 5e-05, + "loss": 1.4751, + "step": 3441 + }, + { + "epoch": 0.9412086409625376, + "grad_norm": 0.16716521978378296, + "learning_rate": 5e-05, + "loss": 1.5751, + "step": 3442 + }, + { + "epoch": 0.9414820891441071, + "grad_norm": 0.17508552968502045, + "learning_rate": 5e-05, + "loss": 1.5959, + "step": 3443 + }, + { + "epoch": 0.9417555373256767, + "grad_norm": 0.1688762605190277, + "learning_rate": 5e-05, + "loss": 1.6451, + "step": 3444 + }, + { + "epoch": 0.9420289855072463, + "grad_norm": 0.17425693571567535, + "learning_rate": 5e-05, + "loss": 1.6965, + "step": 3445 + }, + { + "epoch": 0.9423024336888159, + "grad_norm": 0.16022168099880219, + "learning_rate": 5e-05, + "loss": 1.5167, + "step": 3446 + }, + { + "epoch": 0.9425758818703855, + "grad_norm": 0.16879145801067352, + "learning_rate": 5e-05, + "loss": 1.5844, + "step": 3447 + }, + { + "epoch": 0.9428493300519551, + "grad_norm": 0.17180216312408447, + "learning_rate": 5e-05, + "loss": 1.5252, + "step": 3448 + }, + { + "epoch": 0.9431227782335248, + "grad_norm": 0.17670823633670807, + "learning_rate": 5e-05, + "loss": 1.5545, + "step": 3449 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 0.1751013547182083, + "learning_rate": 5e-05, + "loss": 1.6204, + "step": 3450 + }, + { + "epoch": 0.943669674596664, + "grad_norm": 0.16705964505672455, + "learning_rate": 5e-05, + "loss": 1.5458, + "step": 3451 + }, + { + "epoch": 0.9439431227782336, + "grad_norm": 0.17305395007133484, + "learning_rate": 5e-05, + "loss": 1.5924, + "step": 3452 + }, + { + "epoch": 0.9442165709598032, + "grad_norm": 0.17383334040641785, + "learning_rate": 5e-05, + "loss": 1.6049, + "step": 3453 + }, + { + "epoch": 0.9444900191413728, + "grad_norm": 0.16880303621292114, + "learning_rate": 5e-05, + "loss": 1.5726, + "step": 3454 + }, + { + "epoch": 0.9447634673229423, + "grad_norm": 0.17519088089466095, + "learning_rate": 5e-05, + "loss": 1.5866, + "step": 3455 + }, + { + "epoch": 0.9450369155045119, + "grad_norm": 0.1687891036272049, + "learning_rate": 5e-05, + "loss": 1.5832, + "step": 3456 + }, + { + "epoch": 0.9453103636860815, + "grad_norm": 0.16175159811973572, + "learning_rate": 5e-05, + "loss": 1.6019, + "step": 3457 + }, + { + "epoch": 0.9455838118676511, + "grad_norm": 0.17194457352161407, + "learning_rate": 5e-05, + "loss": 1.6486, + "step": 3458 + }, + { + "epoch": 0.9458572600492207, + "grad_norm": 0.17335595190525055, + "learning_rate": 5e-05, + "loss": 1.6298, + "step": 3459 + }, + { + "epoch": 0.9461307082307903, + "grad_norm": 0.17388460040092468, + "learning_rate": 5e-05, + "loss": 1.6029, + "step": 3460 + }, + { + "epoch": 0.9464041564123599, + "grad_norm": 0.15883868932724, + "learning_rate": 5e-05, + "loss": 1.5193, + "step": 3461 + }, + { + "epoch": 0.9466776045939295, + "grad_norm": 0.16564515233039856, + "learning_rate": 5e-05, + "loss": 1.5726, + "step": 3462 + }, + { + "epoch": 0.9469510527754991, + "grad_norm": 0.1713998019695282, + "learning_rate": 5e-05, + "loss": 1.619, + "step": 3463 + }, + { + "epoch": 0.9472245009570687, + "grad_norm": 0.16872616112232208, + "learning_rate": 5e-05, + "loss": 1.6437, + "step": 3464 + }, + { + "epoch": 0.9474979491386383, + "grad_norm": 0.17497317492961884, + "learning_rate": 5e-05, + "loss": 1.533, + "step": 3465 + }, + { + "epoch": 0.9477713973202079, + "grad_norm": 0.15996572375297546, + "learning_rate": 5e-05, + "loss": 1.4727, + "step": 3466 + }, + { + "epoch": 0.9480448455017774, + "grad_norm": 0.1631600260734558, + "learning_rate": 5e-05, + "loss": 1.5921, + "step": 3467 + }, + { + "epoch": 0.948318293683347, + "grad_norm": 0.1694975644350052, + "learning_rate": 5e-05, + "loss": 1.6251, + "step": 3468 + }, + { + "epoch": 0.9485917418649166, + "grad_norm": 0.1703629493713379, + "learning_rate": 5e-05, + "loss": 1.6061, + "step": 3469 + }, + { + "epoch": 0.9488651900464862, + "grad_norm": 0.16327685117721558, + "learning_rate": 5e-05, + "loss": 1.5616, + "step": 3470 + }, + { + "epoch": 0.9491386382280558, + "grad_norm": 0.16655370593070984, + "learning_rate": 5e-05, + "loss": 1.6223, + "step": 3471 + }, + { + "epoch": 0.9494120864096254, + "grad_norm": 0.18266244232654572, + "learning_rate": 5e-05, + "loss": 1.5366, + "step": 3472 + }, + { + "epoch": 0.949685534591195, + "grad_norm": 0.1686059832572937, + "learning_rate": 5e-05, + "loss": 1.6063, + "step": 3473 + }, + { + "epoch": 0.9499589827727646, + "grad_norm": 0.17475081980228424, + "learning_rate": 5e-05, + "loss": 1.5283, + "step": 3474 + }, + { + "epoch": 0.9502324309543342, + "grad_norm": 0.17627473175525665, + "learning_rate": 5e-05, + "loss": 1.5529, + "step": 3475 + }, + { + "epoch": 0.9505058791359038, + "grad_norm": 0.1728144884109497, + "learning_rate": 5e-05, + "loss": 1.4832, + "step": 3476 + }, + { + "epoch": 0.9507793273174734, + "grad_norm": 0.19535605609416962, + "learning_rate": 5e-05, + "loss": 1.5747, + "step": 3477 + }, + { + "epoch": 0.9510527754990429, + "grad_norm": 0.2011018991470337, + "learning_rate": 5e-05, + "loss": 1.5601, + "step": 3478 + }, + { + "epoch": 0.9513262236806125, + "grad_norm": 0.17405074834823608, + "learning_rate": 5e-05, + "loss": 1.6053, + "step": 3479 + }, + { + "epoch": 0.9515996718621821, + "grad_norm": 0.20217041671276093, + "learning_rate": 5e-05, + "loss": 1.607, + "step": 3480 + }, + { + "epoch": 0.9518731200437517, + "grad_norm": 0.17352132499217987, + "learning_rate": 5e-05, + "loss": 1.5501, + "step": 3481 + }, + { + "epoch": 0.9521465682253213, + "grad_norm": 0.17558707296848297, + "learning_rate": 5e-05, + "loss": 1.6479, + "step": 3482 + }, + { + "epoch": 0.9524200164068909, + "grad_norm": 0.17117784917354584, + "learning_rate": 5e-05, + "loss": 1.5656, + "step": 3483 + }, + { + "epoch": 0.9526934645884605, + "grad_norm": 0.16838552057743073, + "learning_rate": 5e-05, + "loss": 1.5381, + "step": 3484 + }, + { + "epoch": 0.9529669127700301, + "grad_norm": 0.17733819782733917, + "learning_rate": 5e-05, + "loss": 1.6453, + "step": 3485 + }, + { + "epoch": 0.9532403609515997, + "grad_norm": 0.1712433099746704, + "learning_rate": 5e-05, + "loss": 1.5944, + "step": 3486 + }, + { + "epoch": 0.9535138091331693, + "grad_norm": 0.19022230803966522, + "learning_rate": 5e-05, + "loss": 1.6319, + "step": 3487 + }, + { + "epoch": 0.9537872573147389, + "grad_norm": 0.17896117269992828, + "learning_rate": 5e-05, + "loss": 1.6098, + "step": 3488 + }, + { + "epoch": 0.9540607054963085, + "grad_norm": 0.17199327051639557, + "learning_rate": 5e-05, + "loss": 1.589, + "step": 3489 + }, + { + "epoch": 0.954334153677878, + "grad_norm": 0.19092747569084167, + "learning_rate": 5e-05, + "loss": 1.522, + "step": 3490 + }, + { + "epoch": 0.9546076018594476, + "grad_norm": 0.17024657130241394, + "learning_rate": 5e-05, + "loss": 1.6758, + "step": 3491 + }, + { + "epoch": 0.9548810500410172, + "grad_norm": 0.17147701978683472, + "learning_rate": 5e-05, + "loss": 1.5168, + "step": 3492 + }, + { + "epoch": 0.9551544982225868, + "grad_norm": 0.18577666580677032, + "learning_rate": 5e-05, + "loss": 1.5943, + "step": 3493 + }, + { + "epoch": 0.9554279464041564, + "grad_norm": 0.16658198833465576, + "learning_rate": 5e-05, + "loss": 1.6217, + "step": 3494 + }, + { + "epoch": 0.955701394585726, + "grad_norm": 0.17051103711128235, + "learning_rate": 5e-05, + "loss": 1.521, + "step": 3495 + }, + { + "epoch": 0.9559748427672956, + "grad_norm": 0.1750299036502838, + "learning_rate": 5e-05, + "loss": 1.541, + "step": 3496 + }, + { + "epoch": 0.9562482909488652, + "grad_norm": 0.185097336769104, + "learning_rate": 5e-05, + "loss": 1.664, + "step": 3497 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 0.17204828560352325, + "learning_rate": 5e-05, + "loss": 1.6225, + "step": 3498 + }, + { + "epoch": 0.9567951873120044, + "grad_norm": 0.17451748251914978, + "learning_rate": 5e-05, + "loss": 1.5786, + "step": 3499 + }, + { + "epoch": 0.957068635493574, + "grad_norm": 0.20204168558120728, + "learning_rate": 5e-05, + "loss": 1.7499, + "step": 3500 + }, + { + "epoch": 0.9573420836751435, + "grad_norm": 0.17379069328308105, + "learning_rate": 5e-05, + "loss": 1.585, + "step": 3501 + }, + { + "epoch": 0.9576155318567131, + "grad_norm": 0.1748073250055313, + "learning_rate": 5e-05, + "loss": 1.5567, + "step": 3502 + }, + { + "epoch": 0.9578889800382827, + "grad_norm": 0.20730933547019958, + "learning_rate": 5e-05, + "loss": 1.7386, + "step": 3503 + }, + { + "epoch": 0.9581624282198523, + "grad_norm": 0.1670350432395935, + "learning_rate": 5e-05, + "loss": 1.5471, + "step": 3504 + }, + { + "epoch": 0.9584358764014219, + "grad_norm": 0.16766297817230225, + "learning_rate": 5e-05, + "loss": 1.4913, + "step": 3505 + }, + { + "epoch": 0.9587093245829915, + "grad_norm": 0.18351885676383972, + "learning_rate": 5e-05, + "loss": 1.5318, + "step": 3506 + }, + { + "epoch": 0.9589827727645611, + "grad_norm": 0.15731576085090637, + "learning_rate": 5e-05, + "loss": 1.4561, + "step": 3507 + }, + { + "epoch": 0.9592562209461307, + "grad_norm": 0.18964198231697083, + "learning_rate": 5e-05, + "loss": 1.6603, + "step": 3508 + }, + { + "epoch": 0.9595296691277003, + "grad_norm": 0.19028975069522858, + "learning_rate": 5e-05, + "loss": 1.6721, + "step": 3509 + }, + { + "epoch": 0.9598031173092699, + "grad_norm": 0.16938328742980957, + "learning_rate": 5e-05, + "loss": 1.5652, + "step": 3510 + }, + { + "epoch": 0.9600765654908395, + "grad_norm": 0.17705866694450378, + "learning_rate": 5e-05, + "loss": 1.5832, + "step": 3511 + }, + { + "epoch": 0.9603500136724091, + "grad_norm": 0.1780325025320053, + "learning_rate": 5e-05, + "loss": 1.5572, + "step": 3512 + }, + { + "epoch": 0.9606234618539786, + "grad_norm": 0.18230192363262177, + "learning_rate": 5e-05, + "loss": 1.6136, + "step": 3513 + }, + { + "epoch": 0.9608969100355482, + "grad_norm": 0.18078622221946716, + "learning_rate": 5e-05, + "loss": 1.5788, + "step": 3514 + }, + { + "epoch": 0.9611703582171178, + "grad_norm": 0.17788712680339813, + "learning_rate": 5e-05, + "loss": 1.4601, + "step": 3515 + }, + { + "epoch": 0.9614438063986874, + "grad_norm": 0.17222706973552704, + "learning_rate": 5e-05, + "loss": 1.586, + "step": 3516 + }, + { + "epoch": 0.961717254580257, + "grad_norm": 0.1748836636543274, + "learning_rate": 5e-05, + "loss": 1.6527, + "step": 3517 + }, + { + "epoch": 0.9619907027618266, + "grad_norm": 0.18419338762760162, + "learning_rate": 5e-05, + "loss": 1.6447, + "step": 3518 + }, + { + "epoch": 0.9622641509433962, + "grad_norm": 0.17671184241771698, + "learning_rate": 5e-05, + "loss": 1.6371, + "step": 3519 + }, + { + "epoch": 0.9625375991249658, + "grad_norm": 0.1626741737127304, + "learning_rate": 5e-05, + "loss": 1.6034, + "step": 3520 + }, + { + "epoch": 0.9628110473065354, + "grad_norm": 0.17750827968120575, + "learning_rate": 5e-05, + "loss": 1.5709, + "step": 3521 + }, + { + "epoch": 0.963084495488105, + "grad_norm": 0.18092156946659088, + "learning_rate": 5e-05, + "loss": 1.5488, + "step": 3522 + }, + { + "epoch": 0.9633579436696746, + "grad_norm": 0.20844420790672302, + "learning_rate": 5e-05, + "loss": 1.6209, + "step": 3523 + }, + { + "epoch": 0.9636313918512441, + "grad_norm": 0.1704307496547699, + "learning_rate": 5e-05, + "loss": 1.6154, + "step": 3524 + }, + { + "epoch": 0.9639048400328137, + "grad_norm": 0.17518380284309387, + "learning_rate": 5e-05, + "loss": 1.4978, + "step": 3525 + }, + { + "epoch": 0.9641782882143833, + "grad_norm": 0.18488885462284088, + "learning_rate": 5e-05, + "loss": 1.5973, + "step": 3526 + }, + { + "epoch": 0.9644517363959529, + "grad_norm": 0.1840328723192215, + "learning_rate": 5e-05, + "loss": 1.6942, + "step": 3527 + }, + { + "epoch": 0.9647251845775225, + "grad_norm": 0.16851232945919037, + "learning_rate": 5e-05, + "loss": 1.5614, + "step": 3528 + }, + { + "epoch": 0.9649986327590921, + "grad_norm": 0.1788729429244995, + "learning_rate": 5e-05, + "loss": 1.5971, + "step": 3529 + }, + { + "epoch": 0.9652720809406617, + "grad_norm": 0.17521923780441284, + "learning_rate": 5e-05, + "loss": 1.5597, + "step": 3530 + }, + { + "epoch": 0.9655455291222313, + "grad_norm": 0.16558045148849487, + "learning_rate": 5e-05, + "loss": 1.5802, + "step": 3531 + }, + { + "epoch": 0.965818977303801, + "grad_norm": 0.16922442615032196, + "learning_rate": 5e-05, + "loss": 1.5096, + "step": 3532 + }, + { + "epoch": 0.9660924254853706, + "grad_norm": 0.16690833866596222, + "learning_rate": 5e-05, + "loss": 1.5354, + "step": 3533 + }, + { + "epoch": 0.9663658736669402, + "grad_norm": 0.17629151046276093, + "learning_rate": 5e-05, + "loss": 1.5951, + "step": 3534 + }, + { + "epoch": 0.9666393218485098, + "grad_norm": 0.1715892106294632, + "learning_rate": 5e-05, + "loss": 1.6381, + "step": 3535 + }, + { + "epoch": 0.9669127700300792, + "grad_norm": 0.17085449397563934, + "learning_rate": 5e-05, + "loss": 1.5276, + "step": 3536 + }, + { + "epoch": 0.9671862182116489, + "grad_norm": 0.17155250906944275, + "learning_rate": 5e-05, + "loss": 1.5824, + "step": 3537 + }, + { + "epoch": 0.9674596663932185, + "grad_norm": 0.16368617117404938, + "learning_rate": 5e-05, + "loss": 1.5595, + "step": 3538 + }, + { + "epoch": 0.9677331145747881, + "grad_norm": 0.17515048384666443, + "learning_rate": 5e-05, + "loss": 1.5234, + "step": 3539 + }, + { + "epoch": 0.9680065627563577, + "grad_norm": 0.17045453190803528, + "learning_rate": 5e-05, + "loss": 1.5847, + "step": 3540 + }, + { + "epoch": 0.9682800109379273, + "grad_norm": 0.17592334747314453, + "learning_rate": 5e-05, + "loss": 1.681, + "step": 3541 + }, + { + "epoch": 0.9685534591194969, + "grad_norm": 0.174262136220932, + "learning_rate": 5e-05, + "loss": 1.6834, + "step": 3542 + }, + { + "epoch": 0.9688269073010665, + "grad_norm": 0.16702409088611603, + "learning_rate": 5e-05, + "loss": 1.5888, + "step": 3543 + }, + { + "epoch": 0.9691003554826361, + "grad_norm": 0.16639576852321625, + "learning_rate": 5e-05, + "loss": 1.57, + "step": 3544 + }, + { + "epoch": 0.9693738036642057, + "grad_norm": 0.16943658888339996, + "learning_rate": 5e-05, + "loss": 1.605, + "step": 3545 + }, + { + "epoch": 0.9696472518457753, + "grad_norm": 0.16538041830062866, + "learning_rate": 5e-05, + "loss": 1.562, + "step": 3546 + }, + { + "epoch": 0.9699207000273449, + "grad_norm": 0.16645492613315582, + "learning_rate": 5e-05, + "loss": 1.5321, + "step": 3547 + }, + { + "epoch": 0.9701941482089144, + "grad_norm": 0.17388178408145905, + "learning_rate": 5e-05, + "loss": 1.576, + "step": 3548 + }, + { + "epoch": 0.970467596390484, + "grad_norm": 0.17088045179843903, + "learning_rate": 5e-05, + "loss": 1.5195, + "step": 3549 + }, + { + "epoch": 0.9707410445720536, + "grad_norm": 0.17386609315872192, + "learning_rate": 5e-05, + "loss": 1.5194, + "step": 3550 + }, + { + "epoch": 0.9710144927536232, + "grad_norm": 0.1669374406337738, + "learning_rate": 5e-05, + "loss": 1.4916, + "step": 3551 + }, + { + "epoch": 0.9712879409351928, + "grad_norm": 0.18806956708431244, + "learning_rate": 5e-05, + "loss": 1.4877, + "step": 3552 + }, + { + "epoch": 0.9715613891167624, + "grad_norm": 0.1629197746515274, + "learning_rate": 5e-05, + "loss": 1.5365, + "step": 3553 + }, + { + "epoch": 0.971834837298332, + "grad_norm": 0.16897207498550415, + "learning_rate": 5e-05, + "loss": 1.556, + "step": 3554 + }, + { + "epoch": 0.9721082854799016, + "grad_norm": 0.16960465908050537, + "learning_rate": 5e-05, + "loss": 1.5432, + "step": 3555 + }, + { + "epoch": 0.9723817336614712, + "grad_norm": 0.16694605350494385, + "learning_rate": 5e-05, + "loss": 1.6413, + "step": 3556 + }, + { + "epoch": 0.9726551818430408, + "grad_norm": 0.162724107503891, + "learning_rate": 5e-05, + "loss": 1.491, + "step": 3557 + }, + { + "epoch": 0.9729286300246104, + "grad_norm": 0.1615595519542694, + "learning_rate": 5e-05, + "loss": 1.5386, + "step": 3558 + }, + { + "epoch": 0.9732020782061799, + "grad_norm": 0.15930253267288208, + "learning_rate": 5e-05, + "loss": 1.5991, + "step": 3559 + }, + { + "epoch": 0.9734755263877495, + "grad_norm": 0.16046123206615448, + "learning_rate": 5e-05, + "loss": 1.5796, + "step": 3560 + }, + { + "epoch": 0.9737489745693191, + "grad_norm": 0.16169172525405884, + "learning_rate": 5e-05, + "loss": 1.5435, + "step": 3561 + }, + { + "epoch": 0.9740224227508887, + "grad_norm": 0.17044523358345032, + "learning_rate": 5e-05, + "loss": 1.5045, + "step": 3562 + }, + { + "epoch": 0.9742958709324583, + "grad_norm": 0.16750258207321167, + "learning_rate": 5e-05, + "loss": 1.6184, + "step": 3563 + }, + { + "epoch": 0.9745693191140279, + "grad_norm": 0.16423483192920685, + "learning_rate": 5e-05, + "loss": 1.5574, + "step": 3564 + }, + { + "epoch": 0.9748427672955975, + "grad_norm": 0.16018086671829224, + "learning_rate": 5e-05, + "loss": 1.4814, + "step": 3565 + }, + { + "epoch": 0.9751162154771671, + "grad_norm": 0.16692084074020386, + "learning_rate": 5e-05, + "loss": 1.565, + "step": 3566 + }, + { + "epoch": 0.9753896636587367, + "grad_norm": 0.16623759269714355, + "learning_rate": 5e-05, + "loss": 1.5177, + "step": 3567 + }, + { + "epoch": 0.9756631118403063, + "grad_norm": 0.1683354675769806, + "learning_rate": 5e-05, + "loss": 1.5655, + "step": 3568 + }, + { + "epoch": 0.9759365600218759, + "grad_norm": 0.17353872954845428, + "learning_rate": 5e-05, + "loss": 1.5703, + "step": 3569 + }, + { + "epoch": 0.9762100082034455, + "grad_norm": 0.1665719896554947, + "learning_rate": 5e-05, + "loss": 1.5936, + "step": 3570 + }, + { + "epoch": 0.976483456385015, + "grad_norm": 0.17849120497703552, + "learning_rate": 5e-05, + "loss": 1.6144, + "step": 3571 + }, + { + "epoch": 0.9767569045665846, + "grad_norm": 0.17340973019599915, + "learning_rate": 5e-05, + "loss": 1.6221, + "step": 3572 + }, + { + "epoch": 0.9770303527481542, + "grad_norm": 0.17012323439121246, + "learning_rate": 5e-05, + "loss": 1.5603, + "step": 3573 + }, + { + "epoch": 0.9773038009297238, + "grad_norm": 0.1630171239376068, + "learning_rate": 5e-05, + "loss": 1.4353, + "step": 3574 + }, + { + "epoch": 0.9775772491112934, + "grad_norm": 0.16621048748493195, + "learning_rate": 5e-05, + "loss": 1.5738, + "step": 3575 + }, + { + "epoch": 0.977850697292863, + "grad_norm": 0.16702908277511597, + "learning_rate": 5e-05, + "loss": 1.5689, + "step": 3576 + }, + { + "epoch": 0.9781241454744326, + "grad_norm": 0.18123649060726166, + "learning_rate": 5e-05, + "loss": 1.687, + "step": 3577 + }, + { + "epoch": 0.9783975936560022, + "grad_norm": 0.1809684783220291, + "learning_rate": 5e-05, + "loss": 1.5811, + "step": 3578 + }, + { + "epoch": 0.9786710418375718, + "grad_norm": 0.18062925338745117, + "learning_rate": 5e-05, + "loss": 1.6317, + "step": 3579 + }, + { + "epoch": 0.9789444900191414, + "grad_norm": 0.1767573207616806, + "learning_rate": 5e-05, + "loss": 1.6907, + "step": 3580 + }, + { + "epoch": 0.979217938200711, + "grad_norm": 0.17363345623016357, + "learning_rate": 5e-05, + "loss": 1.6019, + "step": 3581 + }, + { + "epoch": 0.9794913863822805, + "grad_norm": 0.173218235373497, + "learning_rate": 5e-05, + "loss": 1.5115, + "step": 3582 + }, + { + "epoch": 0.9797648345638501, + "grad_norm": 0.16840875148773193, + "learning_rate": 5e-05, + "loss": 1.5806, + "step": 3583 + }, + { + "epoch": 0.9800382827454197, + "grad_norm": 0.17666743695735931, + "learning_rate": 5e-05, + "loss": 1.6649, + "step": 3584 + }, + { + "epoch": 0.9803117309269893, + "grad_norm": 0.1744329184293747, + "learning_rate": 5e-05, + "loss": 1.5703, + "step": 3585 + }, + { + "epoch": 0.9805851791085589, + "grad_norm": 0.1757236123085022, + "learning_rate": 5e-05, + "loss": 1.6247, + "step": 3586 + }, + { + "epoch": 0.9808586272901285, + "grad_norm": 0.17235401272773743, + "learning_rate": 5e-05, + "loss": 1.6508, + "step": 3587 + }, + { + "epoch": 0.9811320754716981, + "grad_norm": 0.1712830811738968, + "learning_rate": 5e-05, + "loss": 1.6403, + "step": 3588 + }, + { + "epoch": 0.9814055236532677, + "grad_norm": 0.16565677523612976, + "learning_rate": 5e-05, + "loss": 1.5304, + "step": 3589 + }, + { + "epoch": 0.9816789718348373, + "grad_norm": 0.1652953326702118, + "learning_rate": 5e-05, + "loss": 1.553, + "step": 3590 + }, + { + "epoch": 0.9819524200164069, + "grad_norm": 0.17286698520183563, + "learning_rate": 5e-05, + "loss": 1.5573, + "step": 3591 + }, + { + "epoch": 0.9822258681979765, + "grad_norm": 0.17360328137874603, + "learning_rate": 5e-05, + "loss": 1.6319, + "step": 3592 + }, + { + "epoch": 0.9824993163795461, + "grad_norm": 0.1673014909029007, + "learning_rate": 5e-05, + "loss": 1.5778, + "step": 3593 + }, + { + "epoch": 0.9827727645611156, + "grad_norm": 0.17071346938610077, + "learning_rate": 5e-05, + "loss": 1.6138, + "step": 3594 + }, + { + "epoch": 0.9830462127426852, + "grad_norm": 0.1741405427455902, + "learning_rate": 5e-05, + "loss": 1.5566, + "step": 3595 + }, + { + "epoch": 0.9833196609242548, + "grad_norm": 0.17240159213542938, + "learning_rate": 5e-05, + "loss": 1.6181, + "step": 3596 + }, + { + "epoch": 0.9835931091058244, + "grad_norm": 0.1678202599287033, + "learning_rate": 5e-05, + "loss": 1.5552, + "step": 3597 + }, + { + "epoch": 0.983866557287394, + "grad_norm": 0.17836597561836243, + "learning_rate": 5e-05, + "loss": 1.5859, + "step": 3598 + }, + { + "epoch": 0.9841400054689636, + "grad_norm": 0.17026908695697784, + "learning_rate": 5e-05, + "loss": 1.6616, + "step": 3599 + }, + { + "epoch": 0.9844134536505332, + "grad_norm": 0.17007942497730255, + "learning_rate": 5e-05, + "loss": 1.48, + "step": 3600 + }, + { + "epoch": 0.9846869018321028, + "grad_norm": 0.1805422455072403, + "learning_rate": 5e-05, + "loss": 1.6306, + "step": 3601 + }, + { + "epoch": 0.9849603500136724, + "grad_norm": 0.16335126757621765, + "learning_rate": 5e-05, + "loss": 1.5206, + "step": 3602 + }, + { + "epoch": 0.985233798195242, + "grad_norm": 0.1635698527097702, + "learning_rate": 5e-05, + "loss": 1.5101, + "step": 3603 + }, + { + "epoch": 0.9855072463768116, + "grad_norm": 0.18003027141094208, + "learning_rate": 5e-05, + "loss": 1.643, + "step": 3604 + }, + { + "epoch": 0.9857806945583811, + "grad_norm": 0.17599767446517944, + "learning_rate": 5e-05, + "loss": 1.6374, + "step": 3605 + }, + { + "epoch": 0.9860541427399507, + "grad_norm": 0.1730010062456131, + "learning_rate": 5e-05, + "loss": 1.5686, + "step": 3606 + }, + { + "epoch": 0.9863275909215203, + "grad_norm": 0.17399701476097107, + "learning_rate": 5e-05, + "loss": 1.6065, + "step": 3607 + }, + { + "epoch": 0.9866010391030899, + "grad_norm": 0.18367905914783478, + "learning_rate": 5e-05, + "loss": 1.7215, + "step": 3608 + }, + { + "epoch": 0.9868744872846595, + "grad_norm": 0.18548168241977692, + "learning_rate": 5e-05, + "loss": 1.5337, + "step": 3609 + }, + { + "epoch": 0.9871479354662291, + "grad_norm": 0.1609538048505783, + "learning_rate": 5e-05, + "loss": 1.5586, + "step": 3610 + }, + { + "epoch": 0.9874213836477987, + "grad_norm": 0.17349039018154144, + "learning_rate": 5e-05, + "loss": 1.659, + "step": 3611 + }, + { + "epoch": 0.9876948318293683, + "grad_norm": 0.17284166812896729, + "learning_rate": 5e-05, + "loss": 1.6581, + "step": 3612 + }, + { + "epoch": 0.987968280010938, + "grad_norm": 0.16577011346817017, + "learning_rate": 5e-05, + "loss": 1.4734, + "step": 3613 + }, + { + "epoch": 0.9882417281925076, + "grad_norm": 0.17548039555549622, + "learning_rate": 5e-05, + "loss": 1.664, + "step": 3614 + }, + { + "epoch": 0.9885151763740772, + "grad_norm": 0.16056503355503082, + "learning_rate": 5e-05, + "loss": 1.5385, + "step": 3615 + }, + { + "epoch": 0.9887886245556468, + "grad_norm": 0.1729920357465744, + "learning_rate": 5e-05, + "loss": 1.6822, + "step": 3616 + }, + { + "epoch": 0.9890620727372162, + "grad_norm": 0.1658102124929428, + "learning_rate": 5e-05, + "loss": 1.5311, + "step": 3617 + }, + { + "epoch": 0.9893355209187858, + "grad_norm": 0.15829437971115112, + "learning_rate": 5e-05, + "loss": 1.5528, + "step": 3618 + }, + { + "epoch": 0.9896089691003555, + "grad_norm": 0.160248264670372, + "learning_rate": 5e-05, + "loss": 1.5386, + "step": 3619 + }, + { + "epoch": 0.989882417281925, + "grad_norm": 0.1627732813358307, + "learning_rate": 5e-05, + "loss": 1.5654, + "step": 3620 + }, + { + "epoch": 0.9901558654634947, + "grad_norm": 0.17312705516815186, + "learning_rate": 5e-05, + "loss": 1.4819, + "step": 3621 + }, + { + "epoch": 0.9904293136450643, + "grad_norm": 0.1666928231716156, + "learning_rate": 5e-05, + "loss": 1.5008, + "step": 3622 + }, + { + "epoch": 0.9907027618266339, + "grad_norm": 0.16598451137542725, + "learning_rate": 5e-05, + "loss": 1.4918, + "step": 3623 + }, + { + "epoch": 0.9909762100082035, + "grad_norm": 0.17471113801002502, + "learning_rate": 5e-05, + "loss": 1.6568, + "step": 3624 + }, + { + "epoch": 0.9912496581897731, + "grad_norm": 0.16479487717151642, + "learning_rate": 5e-05, + "loss": 1.5205, + "step": 3625 + }, + { + "epoch": 0.9915231063713427, + "grad_norm": 0.18188925087451935, + "learning_rate": 5e-05, + "loss": 1.6185, + "step": 3626 + }, + { + "epoch": 0.9917965545529123, + "grad_norm": 0.1632193922996521, + "learning_rate": 5e-05, + "loss": 1.5691, + "step": 3627 + }, + { + "epoch": 0.9920700027344819, + "grad_norm": 0.16891783475875854, + "learning_rate": 5e-05, + "loss": 1.53, + "step": 3628 + }, + { + "epoch": 0.9923434509160514, + "grad_norm": 0.17035579681396484, + "learning_rate": 5e-05, + "loss": 1.5631, + "step": 3629 + }, + { + "epoch": 0.992616899097621, + "grad_norm": 0.16007426381111145, + "learning_rate": 5e-05, + "loss": 1.414, + "step": 3630 + }, + { + "epoch": 0.9928903472791906, + "grad_norm": 0.17161479592323303, + "learning_rate": 5e-05, + "loss": 1.5988, + "step": 3631 + }, + { + "epoch": 0.9931637954607602, + "grad_norm": 0.16341786086559296, + "learning_rate": 5e-05, + "loss": 1.5492, + "step": 3632 + }, + { + "epoch": 0.9934372436423298, + "grad_norm": 0.17597921192646027, + "learning_rate": 5e-05, + "loss": 1.6547, + "step": 3633 + }, + { + "epoch": 0.9937106918238994, + "grad_norm": 0.18249720335006714, + "learning_rate": 5e-05, + "loss": 1.7218, + "step": 3634 + }, + { + "epoch": 0.993984140005469, + "grad_norm": 0.17179979383945465, + "learning_rate": 5e-05, + "loss": 1.6261, + "step": 3635 + }, + { + "epoch": 0.9942575881870386, + "grad_norm": 0.16686037182807922, + "learning_rate": 5e-05, + "loss": 1.5381, + "step": 3636 + }, + { + "epoch": 0.9945310363686082, + "grad_norm": 0.17167183756828308, + "learning_rate": 5e-05, + "loss": 1.6132, + "step": 3637 + }, + { + "epoch": 0.9948044845501778, + "grad_norm": 0.1685621589422226, + "learning_rate": 5e-05, + "loss": 1.5806, + "step": 3638 + }, + { + "epoch": 0.9950779327317474, + "grad_norm": 0.17752127349376678, + "learning_rate": 5e-05, + "loss": 1.5842, + "step": 3639 + }, + { + "epoch": 0.9953513809133169, + "grad_norm": 0.18407252430915833, + "learning_rate": 5e-05, + "loss": 1.5769, + "step": 3640 + }, + { + "epoch": 0.9956248290948865, + "grad_norm": 0.17489740252494812, + "learning_rate": 5e-05, + "loss": 1.6766, + "step": 3641 + }, + { + "epoch": 0.9958982772764561, + "grad_norm": 0.1652628481388092, + "learning_rate": 5e-05, + "loss": 1.5431, + "step": 3642 + }, + { + "epoch": 0.9961717254580257, + "grad_norm": 0.16765595972537994, + "learning_rate": 5e-05, + "loss": 1.5277, + "step": 3643 + }, + { + "epoch": 0.9964451736395953, + "grad_norm": 0.18051624298095703, + "learning_rate": 5e-05, + "loss": 1.7006, + "step": 3644 + }, + { + "epoch": 0.9967186218211649, + "grad_norm": 0.16528740525245667, + "learning_rate": 5e-05, + "loss": 1.5304, + "step": 3645 + }, + { + "epoch": 0.9969920700027345, + "grad_norm": 0.18011535704135895, + "learning_rate": 5e-05, + "loss": 1.6645, + "step": 3646 + }, + { + "epoch": 0.9972655181843041, + "grad_norm": 0.17780528962612152, + "learning_rate": 5e-05, + "loss": 1.5679, + "step": 3647 + }, + { + "epoch": 0.9975389663658737, + "grad_norm": 0.16724787652492523, + "learning_rate": 5e-05, + "loss": 1.5967, + "step": 3648 + }, + { + "epoch": 0.9978124145474433, + "grad_norm": 0.1816672682762146, + "learning_rate": 5e-05, + "loss": 1.5915, + "step": 3649 + }, + { + "epoch": 0.9980858627290129, + "grad_norm": 0.1704689860343933, + "learning_rate": 5e-05, + "loss": 1.4793, + "step": 3650 + }, + { + "epoch": 0.9983593109105825, + "grad_norm": 0.16653746366500854, + "learning_rate": 5e-05, + "loss": 1.5062, + "step": 3651 + }, + { + "epoch": 0.998632759092152, + "grad_norm": 0.17958907783031464, + "learning_rate": 5e-05, + "loss": 1.5776, + "step": 3652 + }, + { + "epoch": 0.9989062072737216, + "grad_norm": 0.1751994490623474, + "learning_rate": 5e-05, + "loss": 1.5787, + "step": 3653 + }, + { + "epoch": 0.9991796554552912, + "grad_norm": 0.16929645836353302, + "learning_rate": 5e-05, + "loss": 1.5965, + "step": 3654 + }, + { + "epoch": 0.9994531036368608, + "grad_norm": 0.19185209274291992, + "learning_rate": 5e-05, + "loss": 1.5234, + "step": 3655 + }, + { + "epoch": 0.9997265518184304, + "grad_norm": 0.17126961052417755, + "learning_rate": 5e-05, + "loss": 1.5285, + "step": 3656 + }, + { + "epoch": 1.0, + "grad_norm": 0.17371578514575958, + "learning_rate": 5e-05, + "loss": 1.5768, + "step": 3657 + }, + { + "epoch": 1.0, + "step": 3657, + "total_flos": 2.81863078907845e+18, + "train_loss": 1.6374119961558387, + "train_runtime": 72812.9252, + "train_samples_per_second": 0.402, + "train_steps_per_second": 0.05 + } + ], + "logging_steps": 1, + "max_steps": 3657, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.81863078907845e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}