{ "best_metric": null, "best_model_checkpoint": null, "epoch": 21.595116581607126, "eval_steps": 1000, "global_step": 215800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020014009806864807, "grad_norm": 0.03836650773882866, "learning_rate": 4.999998120874287e-05, "loss": 1.1065, "step": 200 }, { "epoch": 0.040028019613729614, "grad_norm": 4.9917144775390625, "learning_rate": 4.99999228953385e-05, "loss": 1.0957, "step": 400 }, { "epoch": 0.06004202942059442, "grad_norm": 2.3958218097686768, "learning_rate": 4.999982504752454e-05, "loss": 1.09, "step": 600 }, { "epoch": 0.08005603922745923, "grad_norm": 1.6214464902877808, "learning_rate": 4.9999687665455754e-05, "loss": 1.0892, "step": 800 }, { "epoch": 0.10007004903432402, "grad_norm": 0.44337132573127747, "learning_rate": 4.999951074934936e-05, "loss": 1.0706, "step": 1000 }, { "epoch": 0.10007004903432402, "eval_loss": 1.074905276298523, "eval_runtime": 1515.87, "eval_samples_per_second": 6.534, "eval_steps_per_second": 1.089, "step": 1000 }, { "epoch": 0.12008405884118883, "grad_norm": 2.2500057220458984, "learning_rate": 4.999929429948517e-05, "loss": 1.0665, "step": 1200 }, { "epoch": 0.14009806864805363, "grad_norm": 4.672375679016113, "learning_rate": 4.999903831620545e-05, "loss": 1.0557, "step": 1400 }, { "epoch": 0.16011207845491846, "grad_norm": 1.5358741283416748, "learning_rate": 4.999874279991501e-05, "loss": 1.0587, "step": 1600 }, { "epoch": 0.18012608826178325, "grad_norm": 1.451069951057434, "learning_rate": 4.9998407751081175e-05, "loss": 1.0304, "step": 1800 }, { "epoch": 0.20014009806864805, "grad_norm": 2.5960638523101807, "learning_rate": 4.999803317023378e-05, "loss": 1.0072, "step": 2000 }, { "epoch": 0.20014009806864805, "eval_loss": 1.0809282064437866, "eval_runtime": 1519.7995, "eval_samples_per_second": 6.517, "eval_steps_per_second": 1.086, "step": 2000 }, { "epoch": 0.22015410787551287, "grad_norm": 3.5109949111938477, "learning_rate": 4.99976190579652e-05, "loss": 1.0295, "step": 2200 }, { "epoch": 0.24016811768237767, "grad_norm": 0.3527772128582001, "learning_rate": 4.999716541493028e-05, "loss": 1.0211, "step": 2400 }, { "epoch": 0.26018212748924247, "grad_norm": 3.8305537700653076, "learning_rate": 4.999667224184641e-05, "loss": 1.0186, "step": 2600 }, { "epoch": 0.28019613729610726, "grad_norm": 6.743955612182617, "learning_rate": 4.999613953949349e-05, "loss": 0.988, "step": 2800 }, { "epoch": 0.30021014710297206, "grad_norm": 5.491413116455078, "learning_rate": 4.999556730871393e-05, "loss": 0.9906, "step": 3000 }, { "epoch": 0.30021014710297206, "eval_loss": 1.0688618421554565, "eval_runtime": 1521.3335, "eval_samples_per_second": 6.51, "eval_steps_per_second": 1.085, "step": 3000 }, { "epoch": 0.3202241569098369, "grad_norm": 0.7390512228012085, "learning_rate": 4.999495555041262e-05, "loss": 0.984, "step": 3200 }, { "epoch": 0.3402381667167017, "grad_norm": 1.398917317390442, "learning_rate": 4.999430426555702e-05, "loss": 0.9834, "step": 3400 }, { "epoch": 0.3602521765235665, "grad_norm": 6.698788166046143, "learning_rate": 4.999361345517703e-05, "loss": 0.9851, "step": 3600 }, { "epoch": 0.3802661863304313, "grad_norm": 4.882030963897705, "learning_rate": 4.99928831203651e-05, "loss": 0.9979, "step": 3800 }, { "epoch": 0.4002801961372961, "grad_norm": 0.8999706506729126, "learning_rate": 4.999211326227616e-05, "loss": 0.977, "step": 4000 }, { "epoch": 0.4002801961372961, "eval_loss": 1.0179067850112915, "eval_runtime": 1524.9954, "eval_samples_per_second": 6.494, "eval_steps_per_second": 1.083, "step": 4000 }, { "epoch": 0.4202942059441609, "grad_norm": 5.606805801391602, "learning_rate": 4.999130388212765e-05, "loss": 0.9906, "step": 4200 }, { "epoch": 0.44030821575102574, "grad_norm": 2.99131178855896, "learning_rate": 4.999045498119951e-05, "loss": 0.9669, "step": 4400 }, { "epoch": 0.46032222555789054, "grad_norm": 1.6574604511260986, "learning_rate": 4.998956656083418e-05, "loss": 0.9689, "step": 4600 }, { "epoch": 0.48033623536475534, "grad_norm": 1.9680750370025635, "learning_rate": 4.998863862243659e-05, "loss": 0.9419, "step": 4800 }, { "epoch": 0.5003502451716202, "grad_norm": 3.6111326217651367, "learning_rate": 4.998767116747416e-05, "loss": 0.9466, "step": 5000 }, { "epoch": 0.5003502451716202, "eval_loss": 0.98724764585495, "eval_runtime": 1519.8095, "eval_samples_per_second": 6.517, "eval_steps_per_second": 1.086, "step": 5000 }, { "epoch": 0.5203642549784849, "grad_norm": 1.7990385293960571, "learning_rate": 4.998666419747681e-05, "loss": 0.943, "step": 5200 }, { "epoch": 0.5403782647853498, "grad_norm": 2.986767053604126, "learning_rate": 4.9985617714036934e-05, "loss": 0.9391, "step": 5400 }, { "epoch": 0.5603922745922145, "grad_norm": 1.2289016246795654, "learning_rate": 4.998453171880943e-05, "loss": 0.9193, "step": 5600 }, { "epoch": 0.5804062843990794, "grad_norm": 6.417152404785156, "learning_rate": 4.998340621351167e-05, "loss": 0.9002, "step": 5800 }, { "epoch": 0.6004202942059441, "grad_norm": 2.3924219608306885, "learning_rate": 4.9982241199923516e-05, "loss": 0.9356, "step": 6000 }, { "epoch": 0.6004202942059441, "eval_loss": 0.9291344881057739, "eval_runtime": 1526.7644, "eval_samples_per_second": 6.487, "eval_steps_per_second": 1.081, "step": 6000 }, { "epoch": 0.620434304012809, "grad_norm": 2.170485496520996, "learning_rate": 4.9981036679887285e-05, "loss": 0.901, "step": 6200 }, { "epoch": 0.6404483138196738, "grad_norm": 6.00177526473999, "learning_rate": 4.9979792655307775e-05, "loss": 0.9298, "step": 6400 }, { "epoch": 0.6604623236265386, "grad_norm": 4.213263034820557, "learning_rate": 4.9978509128152284e-05, "loss": 0.8793, "step": 6600 }, { "epoch": 0.6804763334334034, "grad_norm": 3.818934440612793, "learning_rate": 4.997718610045054e-05, "loss": 0.8877, "step": 6800 }, { "epoch": 0.7004903432402682, "grad_norm": 4.24846076965332, "learning_rate": 4.997582357429477e-05, "loss": 0.87, "step": 7000 }, { "epoch": 0.7004903432402682, "eval_loss": 0.9745267033576965, "eval_runtime": 1527.2059, "eval_samples_per_second": 6.485, "eval_steps_per_second": 1.081, "step": 7000 }, { "epoch": 0.720504353047133, "grad_norm": 1.9713335037231445, "learning_rate": 4.997442155183963e-05, "loss": 0.8778, "step": 7200 }, { "epoch": 0.7405183628539977, "grad_norm": 3.707040786743164, "learning_rate": 4.997298003530226e-05, "loss": 0.8979, "step": 7400 }, { "epoch": 0.7605323726608626, "grad_norm": 7.6451215744018555, "learning_rate": 4.997149902696226e-05, "loss": 0.8911, "step": 7600 }, { "epoch": 0.7805463824677275, "grad_norm": 6.48955774307251, "learning_rate": 4.996997852916165e-05, "loss": 0.8726, "step": 7800 }, { "epoch": 0.8005603922745922, "grad_norm": 3.440643787384033, "learning_rate": 4.996841854430493e-05, "loss": 0.8568, "step": 8000 }, { "epoch": 0.8005603922745922, "eval_loss": 0.9558325409889221, "eval_runtime": 1525.1569, "eval_samples_per_second": 6.494, "eval_steps_per_second": 1.083, "step": 8000 }, { "epoch": 0.820574402081457, "grad_norm": 0.7619641423225403, "learning_rate": 4.996681907485902e-05, "loss": 0.8819, "step": 8200 }, { "epoch": 0.8405884118883218, "grad_norm": 2.227626085281372, "learning_rate": 4.99651801233533e-05, "loss": 0.8594, "step": 8400 }, { "epoch": 0.8606024216951866, "grad_norm": 2.33223032951355, "learning_rate": 4.996350169237957e-05, "loss": 0.8412, "step": 8600 }, { "epoch": 0.8806164315020515, "grad_norm": 3.2668964862823486, "learning_rate": 4.996178378459208e-05, "loss": 0.8723, "step": 8800 }, { "epoch": 0.9006304413089162, "grad_norm": 2.3762640953063965, "learning_rate": 4.996002640270748e-05, "loss": 0.8657, "step": 9000 }, { "epoch": 0.9006304413089162, "eval_loss": 0.9002355337142944, "eval_runtime": 1532.6568, "eval_samples_per_second": 6.462, "eval_steps_per_second": 1.077, "step": 9000 }, { "epoch": 0.9206444511157811, "grad_norm": 1.9243931770324707, "learning_rate": 4.995822954950487e-05, "loss": 0.8509, "step": 9200 }, { "epoch": 0.9406584609226458, "grad_norm": 2.0189273357391357, "learning_rate": 4.995639322782576e-05, "loss": 0.8388, "step": 9400 }, { "epoch": 0.9606724707295107, "grad_norm": 4.521066665649414, "learning_rate": 4.995451744057408e-05, "loss": 0.7915, "step": 9600 }, { "epoch": 0.9806864805363754, "grad_norm": 2.9316565990448, "learning_rate": 4.995260219071616e-05, "loss": 0.8352, "step": 9800 }, { "epoch": 1.0007004903432404, "grad_norm": 3.372856616973877, "learning_rate": 4.9950647481280744e-05, "loss": 0.8119, "step": 10000 }, { "epoch": 1.0007004903432404, "eval_loss": 0.8663386106491089, "eval_runtime": 1530.8489, "eval_samples_per_second": 6.47, "eval_steps_per_second": 1.078, "step": 10000 }, { "epoch": 1.020714500150105, "grad_norm": 9.139496803283691, "learning_rate": 4.994865331535896e-05, "loss": 0.7974, "step": 10200 }, { "epoch": 1.0407285099569699, "grad_norm": 2.9714930057525635, "learning_rate": 4.994661969610436e-05, "loss": 0.8195, "step": 10400 }, { "epoch": 1.0607425197638347, "grad_norm": 5.354647636413574, "learning_rate": 4.9944546626732866e-05, "loss": 0.8617, "step": 10600 }, { "epoch": 1.0807565295706996, "grad_norm": 4.419401168823242, "learning_rate": 4.994243411052278e-05, "loss": 0.7929, "step": 10800 }, { "epoch": 1.1007705393775642, "grad_norm": 1.4354485273361206, "learning_rate": 4.994028215081482e-05, "loss": 0.8218, "step": 11000 }, { "epoch": 1.1007705393775642, "eval_loss": 0.8851364254951477, "eval_runtime": 1462.9369, "eval_samples_per_second": 6.77, "eval_steps_per_second": 1.129, "step": 11000 }, { "epoch": 1.120784549184429, "grad_norm": 3.1663262844085693, "learning_rate": 4.993809075101203e-05, "loss": 0.8058, "step": 11200 }, { "epoch": 1.140798558991294, "grad_norm": 3.3653388023376465, "learning_rate": 4.9935859914579866e-05, "loss": 0.8298, "step": 11400 }, { "epoch": 1.1608125687981588, "grad_norm": 5.4315972328186035, "learning_rate": 4.993358964504612e-05, "loss": 0.7907, "step": 11600 }, { "epoch": 1.1808265786050236, "grad_norm": 3.9618091583251953, "learning_rate": 4.993127994600095e-05, "loss": 0.7843, "step": 11800 }, { "epoch": 1.2008405884118882, "grad_norm": 6.105351448059082, "learning_rate": 4.9928930821096884e-05, "loss": 0.7895, "step": 12000 }, { "epoch": 1.2008405884118882, "eval_loss": 0.871867299079895, "eval_runtime": 1463.2034, "eval_samples_per_second": 6.769, "eval_steps_per_second": 1.128, "step": 12000 }, { "epoch": 1.220854598218753, "grad_norm": 7.426138877868652, "learning_rate": 4.992654227404878e-05, "loss": 0.8016, "step": 12200 }, { "epoch": 1.240868608025618, "grad_norm": 3.5146498680114746, "learning_rate": 4.9924114308633846e-05, "loss": 0.8064, "step": 12400 }, { "epoch": 1.2608826178324828, "grad_norm": 5.279008388519287, "learning_rate": 4.992164692869161e-05, "loss": 0.7923, "step": 12600 }, { "epoch": 1.2808966276393474, "grad_norm": 3.450472116470337, "learning_rate": 4.9919140138123946e-05, "loss": 0.8052, "step": 12800 }, { "epoch": 1.3009106374462123, "grad_norm": 5.5161614418029785, "learning_rate": 4.991659394089504e-05, "loss": 0.7813, "step": 13000 }, { "epoch": 1.3009106374462123, "eval_loss": 0.902167797088623, "eval_runtime": 1461.6814, "eval_samples_per_second": 6.776, "eval_steps_per_second": 1.13, "step": 13000 }, { "epoch": 1.3209246472530771, "grad_norm": 7.929429531097412, "learning_rate": 4.9914008341031415e-05, "loss": 0.8144, "step": 13200 }, { "epoch": 1.340938657059942, "grad_norm": 7.625912189483643, "learning_rate": 4.991138334262188e-05, "loss": 0.8057, "step": 13400 }, { "epoch": 1.3609526668668068, "grad_norm": 3.3602981567382812, "learning_rate": 4.9908718949817544e-05, "loss": 0.7972, "step": 13600 }, { "epoch": 1.3809666766736717, "grad_norm": 6.379916667938232, "learning_rate": 4.9906015166831854e-05, "loss": 0.7847, "step": 13800 }, { "epoch": 1.4009806864805363, "grad_norm": 5.260190010070801, "learning_rate": 4.9903271997940514e-05, "loss": 0.749, "step": 14000 }, { "epoch": 1.4009806864805363, "eval_loss": 0.8986017107963562, "eval_runtime": 1463.929, "eval_samples_per_second": 6.765, "eval_steps_per_second": 1.128, "step": 14000 }, { "epoch": 1.4209946962874012, "grad_norm": 2.692965269088745, "learning_rate": 4.9900489447481513e-05, "loss": 0.8036, "step": 14200 }, { "epoch": 1.441008706094266, "grad_norm": 16.46511459350586, "learning_rate": 4.9897667519855126e-05, "loss": 0.7982, "step": 14400 }, { "epoch": 1.4610227159011309, "grad_norm": 4.93552827835083, "learning_rate": 4.98948062195239e-05, "loss": 0.8008, "step": 14600 }, { "epoch": 1.4810367257079955, "grad_norm": 6.806095600128174, "learning_rate": 4.989190555101264e-05, "loss": 0.7916, "step": 14800 }, { "epoch": 1.5010507355148603, "grad_norm": 6.310318470001221, "learning_rate": 4.988896551890841e-05, "loss": 0.7361, "step": 15000 }, { "epoch": 1.5010507355148603, "eval_loss": 0.9008051156997681, "eval_runtime": 1464.5394, "eval_samples_per_second": 6.763, "eval_steps_per_second": 1.127, "step": 15000 }, { "epoch": 1.5210647453217252, "grad_norm": 6.064602375030518, "learning_rate": 4.9885986127860516e-05, "loss": 0.7789, "step": 15200 }, { "epoch": 1.54107875512859, "grad_norm": 5.148590087890625, "learning_rate": 4.988296738258052e-05, "loss": 0.7849, "step": 15400 }, { "epoch": 1.561092764935455, "grad_norm": 9.876203536987305, "learning_rate": 4.987990928784221e-05, "loss": 0.7665, "step": 15600 }, { "epoch": 1.5811067747423198, "grad_norm": 3.4477570056915283, "learning_rate": 4.9876811848481584e-05, "loss": 0.7705, "step": 15800 }, { "epoch": 1.6011207845491844, "grad_norm": 4.232241153717041, "learning_rate": 4.987367506939688e-05, "loss": 0.791, "step": 16000 }, { "epoch": 1.6011207845491844, "eval_loss": 0.8321856260299683, "eval_runtime": 1467.0437, "eval_samples_per_second": 6.751, "eval_steps_per_second": 1.125, "step": 16000 }, { "epoch": 1.6211347943560492, "grad_norm": 6.282638072967529, "learning_rate": 4.987049895554856e-05, "loss": 0.7649, "step": 16200 }, { "epoch": 1.641148804162914, "grad_norm": 3.4875385761260986, "learning_rate": 4.986728351195926e-05, "loss": 0.791, "step": 16400 }, { "epoch": 1.6611628139697787, "grad_norm": 6.374185085296631, "learning_rate": 4.986402874371381e-05, "loss": 0.7536, "step": 16600 }, { "epoch": 1.6811768237766436, "grad_norm": 7.087797164916992, "learning_rate": 4.986073465595925e-05, "loss": 0.7307, "step": 16800 }, { "epoch": 1.7011908335835084, "grad_norm": 5.365784168243408, "learning_rate": 4.98574012539048e-05, "loss": 0.7964, "step": 17000 }, { "epoch": 1.7011908335835084, "eval_loss": 0.8461858630180359, "eval_runtime": 1464.8353, "eval_samples_per_second": 6.761, "eval_steps_per_second": 1.127, "step": 17000 }, { "epoch": 1.7212048433903733, "grad_norm": 7.748868942260742, "learning_rate": 4.985402854282182e-05, "loss": 0.7526, "step": 17200 }, { "epoch": 1.7412188531972381, "grad_norm": 3.441117525100708, "learning_rate": 4.985061652804385e-05, "loss": 0.7479, "step": 17400 }, { "epoch": 1.761232863004103, "grad_norm": 2.670192003250122, "learning_rate": 4.9847165214966605e-05, "loss": 0.7654, "step": 17600 }, { "epoch": 1.7812468728109678, "grad_norm": 4.161070346832275, "learning_rate": 4.984367460904792e-05, "loss": 0.7343, "step": 17800 }, { "epoch": 1.8012608826178325, "grad_norm": 6.927440643310547, "learning_rate": 4.984014471580777e-05, "loss": 0.7563, "step": 18000 }, { "epoch": 1.8012608826178325, "eval_loss": 0.9024050831794739, "eval_runtime": 1464.0228, "eval_samples_per_second": 6.765, "eval_steps_per_second": 1.128, "step": 18000 }, { "epoch": 1.8212748924246973, "grad_norm": 8.026199340820312, "learning_rate": 4.9836575540828255e-05, "loss": 0.7269, "step": 18200 }, { "epoch": 1.841288902231562, "grad_norm": 5.15052604675293, "learning_rate": 4.983296708975362e-05, "loss": 0.7508, "step": 18400 }, { "epoch": 1.8613029120384268, "grad_norm": 4.978198528289795, "learning_rate": 4.9829319368290196e-05, "loss": 0.7218, "step": 18600 }, { "epoch": 1.8813169218452916, "grad_norm": 5.513315677642822, "learning_rate": 4.9825632382206406e-05, "loss": 0.7457, "step": 18800 }, { "epoch": 1.9013309316521565, "grad_norm": 5.887680530548096, "learning_rate": 4.9821906137332794e-05, "loss": 0.753, "step": 19000 }, { "epoch": 1.9013309316521565, "eval_loss": 0.8066152334213257, "eval_runtime": 1464.1036, "eval_samples_per_second": 6.765, "eval_steps_per_second": 1.128, "step": 19000 }, { "epoch": 1.9213449414590213, "grad_norm": 3.8684074878692627, "learning_rate": 4.981814063956197e-05, "loss": 0.736, "step": 19200 }, { "epoch": 1.9413589512658862, "grad_norm": 4.095367431640625, "learning_rate": 4.981433589484863e-05, "loss": 0.7854, "step": 19400 }, { "epoch": 1.961372961072751, "grad_norm": 3.773890733718872, "learning_rate": 4.9810491909209525e-05, "loss": 0.7481, "step": 19600 }, { "epoch": 1.9813869708796157, "grad_norm": 7.259924411773682, "learning_rate": 4.9806608688723445e-05, "loss": 0.7543, "step": 19800 }, { "epoch": 2.0014009806864808, "grad_norm": 4.742369651794434, "learning_rate": 4.980268623953125e-05, "loss": 0.6996, "step": 20000 }, { "epoch": 2.0014009806864808, "eval_loss": 0.8363164663314819, "eval_runtime": 1466.1234, "eval_samples_per_second": 6.755, "eval_steps_per_second": 1.126, "step": 20000 }, { "epoch": 2.021414990493345, "grad_norm": 24.398244857788086, "learning_rate": 4.9798724567835845e-05, "loss": 0.7153, "step": 20200 }, { "epoch": 2.04142900030021, "grad_norm": 3.7045798301696777, "learning_rate": 4.979472367990212e-05, "loss": 0.7318, "step": 20400 }, { "epoch": 2.061443010107075, "grad_norm": 3.9732093811035156, "learning_rate": 4.979068358205702e-05, "loss": 0.7306, "step": 20600 }, { "epoch": 2.0814570199139397, "grad_norm": 14.974584579467773, "learning_rate": 4.9786604280689466e-05, "loss": 0.7059, "step": 20800 }, { "epoch": 2.1014710297208046, "grad_norm": 8.874074935913086, "learning_rate": 4.9782485782250396e-05, "loss": 0.7117, "step": 21000 }, { "epoch": 2.1014710297208046, "eval_loss": 0.842494785785675, "eval_runtime": 1479.9678, "eval_samples_per_second": 6.692, "eval_steps_per_second": 1.116, "step": 21000 }, { "epoch": 2.1214850395276694, "grad_norm": 2.043355703353882, "learning_rate": 4.977832809325274e-05, "loss": 0.7045, "step": 21200 }, { "epoch": 2.1414990493345343, "grad_norm": 5.032010078430176, "learning_rate": 4.977413122027136e-05, "loss": 0.6842, "step": 21400 }, { "epoch": 2.161513059141399, "grad_norm": 3.8701581954956055, "learning_rate": 4.976989516994313e-05, "loss": 0.745, "step": 21600 }, { "epoch": 2.181527068948264, "grad_norm": 3.017009735107422, "learning_rate": 4.9765619948966866e-05, "loss": 0.7226, "step": 21800 }, { "epoch": 2.2015410787551284, "grad_norm": 4.095740795135498, "learning_rate": 4.976130556410331e-05, "loss": 0.7, "step": 22000 }, { "epoch": 2.2015410787551284, "eval_loss": 0.831932783126831, "eval_runtime": 1469.3596, "eval_samples_per_second": 6.74, "eval_steps_per_second": 1.124, "step": 22000 }, { "epoch": 2.2215550885619932, "grad_norm": 2.3981704711914062, "learning_rate": 4.975695202217516e-05, "loss": 0.7065, "step": 22200 }, { "epoch": 2.241569098368858, "grad_norm": 3.694225311279297, "learning_rate": 4.975255933006702e-05, "loss": 0.7352, "step": 22400 }, { "epoch": 2.261583108175723, "grad_norm": 21.724483489990234, "learning_rate": 4.974812749472541e-05, "loss": 0.6801, "step": 22600 }, { "epoch": 2.281597117982588, "grad_norm": 1.6748656034469604, "learning_rate": 4.974365652315874e-05, "loss": 0.7022, "step": 22800 }, { "epoch": 2.3016111277894526, "grad_norm": 12.608467102050781, "learning_rate": 4.973914642243735e-05, "loss": 0.7257, "step": 23000 }, { "epoch": 2.3016111277894526, "eval_loss": 0.7793951034545898, "eval_runtime": 1471.057, "eval_samples_per_second": 6.733, "eval_steps_per_second": 1.122, "step": 23000 }, { "epoch": 2.3216251375963175, "grad_norm": 3.929051160812378, "learning_rate": 4.973459719969341e-05, "loss": 0.6974, "step": 23200 }, { "epoch": 2.3416391474031824, "grad_norm": 12.414912223815918, "learning_rate": 4.973000886212097e-05, "loss": 0.6991, "step": 23400 }, { "epoch": 2.361653157210047, "grad_norm": 4.481961727142334, "learning_rate": 4.972538141697596e-05, "loss": 0.6671, "step": 23600 }, { "epoch": 2.3816671670169116, "grad_norm": 14.903956413269043, "learning_rate": 4.972071487157611e-05, "loss": 0.6982, "step": 23800 }, { "epoch": 2.4016811768237765, "grad_norm": 6.630885124206543, "learning_rate": 4.971600923330103e-05, "loss": 0.6985, "step": 24000 }, { "epoch": 2.4016811768237765, "eval_loss": 0.8149631023406982, "eval_runtime": 1468.6934, "eval_samples_per_second": 6.743, "eval_steps_per_second": 1.124, "step": 24000 }, { "epoch": 2.4216951866306413, "grad_norm": 7.6192851066589355, "learning_rate": 4.971126450959211e-05, "loss": 0.7377, "step": 24200 }, { "epoch": 2.441709196437506, "grad_norm": 4.382999420166016, "learning_rate": 4.97064807079526e-05, "loss": 0.6735, "step": 24400 }, { "epoch": 2.461723206244371, "grad_norm": 5.252045154571533, "learning_rate": 4.970165783594747e-05, "loss": 0.6851, "step": 24600 }, { "epoch": 2.481737216051236, "grad_norm": 3.434926748275757, "learning_rate": 4.969679590120354e-05, "loss": 0.6951, "step": 24800 }, { "epoch": 2.5017512258581007, "grad_norm": 3.28517484664917, "learning_rate": 4.9691894911409384e-05, "loss": 0.6972, "step": 25000 }, { "epoch": 2.5017512258581007, "eval_loss": 0.7769864797592163, "eval_runtime": 1469.3966, "eval_samples_per_second": 6.74, "eval_steps_per_second": 1.124, "step": 25000 }, { "epoch": 2.5217652356649656, "grad_norm": 10.94172477722168, "learning_rate": 4.968695487431534e-05, "loss": 0.6444, "step": 25200 }, { "epoch": 2.5417792454718304, "grad_norm": 8.870444297790527, "learning_rate": 4.968197579773348e-05, "loss": 0.7219, "step": 25400 }, { "epoch": 2.561793255278695, "grad_norm": 2.93217134475708, "learning_rate": 4.967695768953763e-05, "loss": 0.6777, "step": 25600 }, { "epoch": 2.58180726508556, "grad_norm": 2.14872145652771, "learning_rate": 4.967190055766333e-05, "loss": 0.6701, "step": 25800 }, { "epoch": 2.6018212748924245, "grad_norm": 13.219706535339355, "learning_rate": 4.966680441010783e-05, "loss": 0.7368, "step": 26000 }, { "epoch": 2.6018212748924245, "eval_loss": 0.7869371175765991, "eval_runtime": 1466.9136, "eval_samples_per_second": 6.752, "eval_steps_per_second": 1.125, "step": 26000 }, { "epoch": 2.6218352846992894, "grad_norm": 1.99358069896698, "learning_rate": 4.9661669254930084e-05, "loss": 0.7419, "step": 26200 }, { "epoch": 2.6418492945061542, "grad_norm": 7.771709442138672, "learning_rate": 4.9656495100250735e-05, "loss": 0.6972, "step": 26400 }, { "epoch": 2.661863304313019, "grad_norm": 3.1652045249938965, "learning_rate": 4.9651281954252083e-05, "loss": 0.703, "step": 26600 }, { "epoch": 2.681877314119884, "grad_norm": 3.5205235481262207, "learning_rate": 4.9646029825178106e-05, "loss": 0.7268, "step": 26800 }, { "epoch": 2.701891323926749, "grad_norm": 2.3725974559783936, "learning_rate": 4.964073872133443e-05, "loss": 0.6774, "step": 27000 }, { "epoch": 2.701891323926749, "eval_loss": 0.7754766941070557, "eval_runtime": 1470.7591, "eval_samples_per_second": 6.734, "eval_steps_per_second": 1.123, "step": 27000 }, { "epoch": 2.7219053337336137, "grad_norm": 3.8851709365844727, "learning_rate": 4.963540865108829e-05, "loss": 0.6629, "step": 27200 }, { "epoch": 2.741919343540478, "grad_norm": 6.006371974945068, "learning_rate": 4.963003962286858e-05, "loss": 0.6773, "step": 27400 }, { "epoch": 2.7619333533473434, "grad_norm": 8.04957389831543, "learning_rate": 4.962463164516577e-05, "loss": 0.6692, "step": 27600 }, { "epoch": 2.7819473631542078, "grad_norm": 8.697158813476562, "learning_rate": 4.961918472653193e-05, "loss": 0.679, "step": 27800 }, { "epoch": 2.8019613729610726, "grad_norm": 6.187361717224121, "learning_rate": 4.961369887558072e-05, "loss": 0.7075, "step": 28000 }, { "epoch": 2.8019613729610726, "eval_loss": 0.7774937748908997, "eval_runtime": 1470.9031, "eval_samples_per_second": 6.733, "eval_steps_per_second": 1.122, "step": 28000 }, { "epoch": 2.8219753827679375, "grad_norm": 10.004706382751465, "learning_rate": 4.960817410098737e-05, "loss": 0.7333, "step": 28200 }, { "epoch": 2.8419893925748023, "grad_norm": 5.531788349151611, "learning_rate": 4.960261041148864e-05, "loss": 0.721, "step": 28400 }, { "epoch": 2.862003402381667, "grad_norm": 5.793222427368164, "learning_rate": 4.959700781588285e-05, "loss": 0.7407, "step": 28600 }, { "epoch": 2.882017412188532, "grad_norm": 2.2222025394439697, "learning_rate": 4.959136632302984e-05, "loss": 0.6991, "step": 28800 }, { "epoch": 2.902031421995397, "grad_norm": 6.433917045593262, "learning_rate": 4.958568594185096e-05, "loss": 0.7114, "step": 29000 }, { "epoch": 2.902031421995397, "eval_loss": 0.8205936551094055, "eval_runtime": 1469.4658, "eval_samples_per_second": 6.74, "eval_steps_per_second": 1.124, "step": 29000 }, { "epoch": 2.9220454318022617, "grad_norm": 3.507596969604492, "learning_rate": 4.9579966681329074e-05, "loss": 0.7476, "step": 29200 }, { "epoch": 2.9420594416091266, "grad_norm": 2.8263449668884277, "learning_rate": 4.9574208550508484e-05, "loss": 0.694, "step": 29400 }, { "epoch": 2.962073451415991, "grad_norm": 8.172489166259766, "learning_rate": 4.956841155849501e-05, "loss": 0.6869, "step": 29600 }, { "epoch": 2.982087461222856, "grad_norm": 3.555558681488037, "learning_rate": 4.95625757144559e-05, "loss": 0.6729, "step": 29800 }, { "epoch": 3.0021014710297207, "grad_norm": 8.730535507202148, "learning_rate": 4.9556701027619856e-05, "loss": 0.6775, "step": 30000 }, { "epoch": 3.0021014710297207, "eval_loss": 0.7652115225791931, "eval_runtime": 1471.9622, "eval_samples_per_second": 6.728, "eval_steps_per_second": 1.122, "step": 30000 }, { "epoch": 3.0221154808365855, "grad_norm": 5.184886932373047, "learning_rate": 4.9550787507277e-05, "loss": 0.6504, "step": 30200 }, { "epoch": 3.0421294906434504, "grad_norm": 2.356971502304077, "learning_rate": 4.9544835162778844e-05, "loss": 0.6625, "step": 30400 }, { "epoch": 3.0621435004503152, "grad_norm": 4.148653030395508, "learning_rate": 4.953884400353832e-05, "loss": 0.6295, "step": 30600 }, { "epoch": 3.08215751025718, "grad_norm": 1.5591318607330322, "learning_rate": 4.953281403902975e-05, "loss": 0.6831, "step": 30800 }, { "epoch": 3.102171520064045, "grad_norm": 5.893868446350098, "learning_rate": 4.952674527878879e-05, "loss": 0.6753, "step": 31000 }, { "epoch": 3.102171520064045, "eval_loss": 0.8020298480987549, "eval_runtime": 1469.2225, "eval_samples_per_second": 6.741, "eval_steps_per_second": 1.124, "step": 31000 }, { "epoch": 3.12218552987091, "grad_norm": 4.421782493591309, "learning_rate": 4.952063773241248e-05, "loss": 0.6702, "step": 31200 }, { "epoch": 3.142199539677774, "grad_norm": 7.965831756591797, "learning_rate": 4.951449140955916e-05, "loss": 0.6494, "step": 31400 }, { "epoch": 3.162213549484639, "grad_norm": 1.913954496383667, "learning_rate": 4.950830631994851e-05, "loss": 0.6679, "step": 31600 }, { "epoch": 3.182227559291504, "grad_norm": 2.5493805408477783, "learning_rate": 4.950208247336154e-05, "loss": 0.6578, "step": 31800 }, { "epoch": 3.2022415690983688, "grad_norm": 3.94891095161438, "learning_rate": 4.949581987964051e-05, "loss": 0.6612, "step": 32000 }, { "epoch": 3.2022415690983688, "eval_loss": 0.8482847213745117, "eval_runtime": 1471.1418, "eval_samples_per_second": 6.732, "eval_steps_per_second": 1.122, "step": 32000 }, { "epoch": 3.2222555789052336, "grad_norm": 19.275236129760742, "learning_rate": 4.948951854868896e-05, "loss": 0.6679, "step": 32200 }, { "epoch": 3.2422695887120985, "grad_norm": 6.727810859680176, "learning_rate": 4.9483178490471705e-05, "loss": 0.6669, "step": 32400 }, { "epoch": 3.2622835985189633, "grad_norm": 7.788300514221191, "learning_rate": 4.9476799715014796e-05, "loss": 0.6468, "step": 32600 }, { "epoch": 3.282297608325828, "grad_norm": 17.329851150512695, "learning_rate": 4.94703822324055e-05, "loss": 0.6775, "step": 32800 }, { "epoch": 3.302311618132693, "grad_norm": 3.4661388397216797, "learning_rate": 4.946392605279232e-05, "loss": 0.6593, "step": 33000 }, { "epoch": 3.302311618132693, "eval_loss": 0.8139415383338928, "eval_runtime": 1575.3242, "eval_samples_per_second": 6.287, "eval_steps_per_second": 1.048, "step": 33000 }, { "epoch": 3.322325627939558, "grad_norm": 3.6273305416107178, "learning_rate": 4.9457431186384914e-05, "loss": 0.6688, "step": 33200 }, { "epoch": 3.3423396377464227, "grad_norm": 6.19839334487915, "learning_rate": 4.9450897643454165e-05, "loss": 0.6615, "step": 33400 }, { "epoch": 3.362353647553287, "grad_norm": 3.127351760864258, "learning_rate": 4.944432543433208e-05, "loss": 0.6506, "step": 33600 }, { "epoch": 3.382367657360152, "grad_norm": 4.932734489440918, "learning_rate": 4.943771456941183e-05, "loss": 0.6522, "step": 33800 }, { "epoch": 3.402381667167017, "grad_norm": 9.829474449157715, "learning_rate": 4.943106505914772e-05, "loss": 0.6398, "step": 34000 }, { "epoch": 3.402381667167017, "eval_loss": 0.7665857076644897, "eval_runtime": 1577.5761, "eval_samples_per_second": 6.278, "eval_steps_per_second": 1.047, "step": 34000 }, { "epoch": 3.4223956769738817, "grad_norm": 2.363632917404175, "learning_rate": 4.942437691405516e-05, "loss": 0.6432, "step": 34200 }, { "epoch": 3.4424096867807465, "grad_norm": 5.529567241668701, "learning_rate": 4.941765014471065e-05, "loss": 0.6242, "step": 34400 }, { "epoch": 3.4624236965876114, "grad_norm": 3.5687777996063232, "learning_rate": 4.941088476175178e-05, "loss": 0.7094, "step": 34600 }, { "epoch": 3.4824377063944763, "grad_norm": 2.710512399673462, "learning_rate": 4.94040807758772e-05, "loss": 0.6733, "step": 34800 }, { "epoch": 3.502451716201341, "grad_norm": 6.525320053100586, "learning_rate": 4.939723819784661e-05, "loss": 0.6346, "step": 35000 }, { "epoch": 3.502451716201341, "eval_loss": 0.782295823097229, "eval_runtime": 1574.8202, "eval_samples_per_second": 6.289, "eval_steps_per_second": 1.048, "step": 35000 }, { "epoch": 3.522465726008206, "grad_norm": 2.604841470718384, "learning_rate": 4.939035703848073e-05, "loss": 0.6854, "step": 35200 }, { "epoch": 3.5424797358150704, "grad_norm": 7.024875164031982, "learning_rate": 4.938343730866129e-05, "loss": 0.6783, "step": 35400 }, { "epoch": 3.562493745621935, "grad_norm": 1.6358895301818848, "learning_rate": 4.937647901933105e-05, "loss": 0.671, "step": 35600 }, { "epoch": 3.5825077554288, "grad_norm": 3.727923631668091, "learning_rate": 4.9369482181493675e-05, "loss": 0.6389, "step": 35800 }, { "epoch": 3.602521765235665, "grad_norm": 2.4830257892608643, "learning_rate": 4.9362446806213866e-05, "loss": 0.6441, "step": 36000 }, { "epoch": 3.602521765235665, "eval_loss": 0.7771650552749634, "eval_runtime": 1570.923, "eval_samples_per_second": 6.305, "eval_steps_per_second": 1.051, "step": 36000 }, { "epoch": 3.6225357750425298, "grad_norm": 6.460010051727295, "learning_rate": 4.935537290461722e-05, "loss": 0.6464, "step": 36200 }, { "epoch": 3.6425497848493946, "grad_norm": 3.2908380031585693, "learning_rate": 4.934826048789027e-05, "loss": 0.6337, "step": 36400 }, { "epoch": 3.6625637946562595, "grad_norm": 3.8624677658081055, "learning_rate": 4.9341109567280474e-05, "loss": 0.6301, "step": 36600 }, { "epoch": 3.6825778044631243, "grad_norm": 4.898980617523193, "learning_rate": 4.9333920154096144e-05, "loss": 0.6381, "step": 36800 }, { "epoch": 3.702591814269989, "grad_norm": 13.398076057434082, "learning_rate": 4.9326692259706496e-05, "loss": 0.6455, "step": 37000 }, { "epoch": 3.702591814269989, "eval_loss": 0.7631074786186218, "eval_runtime": 1573.8856, "eval_samples_per_second": 6.293, "eval_steps_per_second": 1.049, "step": 37000 }, { "epoch": 3.7226058240768536, "grad_norm": 2.525438070297241, "learning_rate": 4.9319425895541585e-05, "loss": 0.6752, "step": 37200 }, { "epoch": 3.7426198338837184, "grad_norm": 5.1247382164001465, "learning_rate": 4.9312121073092305e-05, "loss": 0.662, "step": 37400 }, { "epoch": 3.7626338436905833, "grad_norm": 4.910254001617432, "learning_rate": 4.9304777803910364e-05, "loss": 0.6529, "step": 37600 }, { "epoch": 3.782647853497448, "grad_norm": 5.506941318511963, "learning_rate": 4.9297396099608265e-05, "loss": 0.6542, "step": 37800 }, { "epoch": 3.802661863304313, "grad_norm": 6.370262145996094, "learning_rate": 4.9289975971859294e-05, "loss": 0.6481, "step": 38000 }, { "epoch": 3.802661863304313, "eval_loss": 0.7701865434646606, "eval_runtime": 1575.5655, "eval_samples_per_second": 6.286, "eval_steps_per_second": 1.048, "step": 38000 }, { "epoch": 3.822675873111178, "grad_norm": 3.6959519386291504, "learning_rate": 4.9282517432397513e-05, "loss": 0.6368, "step": 38200 }, { "epoch": 3.8426898829180427, "grad_norm": 5.103184700012207, "learning_rate": 4.927502049301771e-05, "loss": 0.6436, "step": 38400 }, { "epoch": 3.8627038927249076, "grad_norm": 3.0977602005004883, "learning_rate": 4.926748516557541e-05, "loss": 0.6456, "step": 38600 }, { "epoch": 3.8827179025317724, "grad_norm": 3.459282159805298, "learning_rate": 4.925991146198683e-05, "loss": 0.6707, "step": 38800 }, { "epoch": 3.902731912338637, "grad_norm": 1.971540927886963, "learning_rate": 4.925229939422889e-05, "loss": 0.6459, "step": 39000 }, { "epoch": 3.902731912338637, "eval_loss": 0.771842360496521, "eval_runtime": 1575.4138, "eval_samples_per_second": 6.287, "eval_steps_per_second": 1.048, "step": 39000 }, { "epoch": 3.922745922145502, "grad_norm": 6.059391021728516, "learning_rate": 4.924464897433918e-05, "loss": 0.6525, "step": 39200 }, { "epoch": 3.9427599319523665, "grad_norm": 3.5738885402679443, "learning_rate": 4.923696021441591e-05, "loss": 0.6314, "step": 39400 }, { "epoch": 3.9627739417592314, "grad_norm": 8.004569053649902, "learning_rate": 4.9229233126617944e-05, "loss": 0.6528, "step": 39600 }, { "epoch": 3.982787951566096, "grad_norm": 3.3736953735351562, "learning_rate": 4.9221467723164766e-05, "loss": 0.6764, "step": 39800 }, { "epoch": 4.0028019613729615, "grad_norm": 2.3032097816467285, "learning_rate": 4.9213664016336434e-05, "loss": 0.6159, "step": 40000 }, { "epoch": 4.0028019613729615, "eval_loss": 0.768229067325592, "eval_runtime": 1574.1722, "eval_samples_per_second": 6.292, "eval_steps_per_second": 1.049, "step": 40000 }, { "epoch": 4.022815971179826, "grad_norm": 11.645203590393066, "learning_rate": 4.920582201847359e-05, "loss": 0.6393, "step": 40200 }, { "epoch": 4.04282998098669, "grad_norm": 6.980269908905029, "learning_rate": 4.919794174197741e-05, "loss": 0.6209, "step": 40400 }, { "epoch": 4.062843990793556, "grad_norm": 7.686216354370117, "learning_rate": 4.919002319930962e-05, "loss": 0.6388, "step": 40600 }, { "epoch": 4.08285800060042, "grad_norm": 1.9562063217163086, "learning_rate": 4.918206640299246e-05, "loss": 0.63, "step": 40800 }, { "epoch": 4.102872010407285, "grad_norm": 1.8460928201675415, "learning_rate": 4.9174071365608654e-05, "loss": 0.6559, "step": 41000 }, { "epoch": 4.102872010407285, "eval_loss": 0.7917781472206116, "eval_runtime": 1572.616, "eval_samples_per_second": 6.298, "eval_steps_per_second": 1.05, "step": 41000 }, { "epoch": 4.12288602021415, "grad_norm": 3.9306538105010986, "learning_rate": 4.916603809980141e-05, "loss": 0.6194, "step": 41200 }, { "epoch": 4.142900030021015, "grad_norm": 9.051627159118652, "learning_rate": 4.9157966618274366e-05, "loss": 0.5982, "step": 41400 }, { "epoch": 4.162914039827879, "grad_norm": 5.957490921020508, "learning_rate": 4.914985693379164e-05, "loss": 0.6357, "step": 41600 }, { "epoch": 4.182928049634745, "grad_norm": 3.302683115005493, "learning_rate": 4.914170905917771e-05, "loss": 0.638, "step": 41800 }, { "epoch": 4.202942059441609, "grad_norm": 11.077726364135742, "learning_rate": 4.9133523007317486e-05, "loss": 0.6237, "step": 42000 }, { "epoch": 4.202942059441609, "eval_loss": 0.771676778793335, "eval_runtime": 1580.8689, "eval_samples_per_second": 6.265, "eval_steps_per_second": 1.044, "step": 42000 }, { "epoch": 4.2229560692484736, "grad_norm": 11.065333366394043, "learning_rate": 4.912529879115624e-05, "loss": 0.5834, "step": 42200 }, { "epoch": 4.242970079055339, "grad_norm": 2.1685075759887695, "learning_rate": 4.911703642369958e-05, "loss": 0.6069, "step": 42400 }, { "epoch": 4.262984088862203, "grad_norm": 2.1144607067108154, "learning_rate": 4.910873591801346e-05, "loss": 0.571, "step": 42600 }, { "epoch": 4.2829980986690686, "grad_norm": 5.7213029861450195, "learning_rate": 4.910039728722416e-05, "loss": 0.6817, "step": 42800 }, { "epoch": 4.303012108475933, "grad_norm": 2.383558511734009, "learning_rate": 4.9092020544518225e-05, "loss": 0.5806, "step": 43000 }, { "epoch": 4.303012108475933, "eval_loss": 0.8465139865875244, "eval_runtime": 1593.9873, "eval_samples_per_second": 6.213, "eval_steps_per_second": 1.036, "step": 43000 }, { "epoch": 4.323026118282798, "grad_norm": 11.154216766357422, "learning_rate": 4.908360570314248e-05, "loss": 0.6506, "step": 43200 }, { "epoch": 4.343040128089663, "grad_norm": 5.561831951141357, "learning_rate": 4.907515277640399e-05, "loss": 0.5722, "step": 43400 }, { "epoch": 4.363054137896528, "grad_norm": 10.376618385314941, "learning_rate": 4.906666177767005e-05, "loss": 0.6389, "step": 43600 }, { "epoch": 4.383068147703392, "grad_norm": 1.3942511081695557, "learning_rate": 4.90581327203682e-05, "loss": 0.6533, "step": 43800 }, { "epoch": 4.403082157510257, "grad_norm": 7.246211051940918, "learning_rate": 4.9049565617986093e-05, "loss": 0.6192, "step": 44000 }, { "epoch": 4.403082157510257, "eval_loss": 0.855816662311554, "eval_runtime": 1592.0784, "eval_samples_per_second": 6.221, "eval_steps_per_second": 1.037, "step": 44000 }, { "epoch": 4.423096167317122, "grad_norm": 1.6817080974578857, "learning_rate": 4.904096048407161e-05, "loss": 0.6486, "step": 44200 }, { "epoch": 4.4431101771239865, "grad_norm": 9.532196044921875, "learning_rate": 4.9032317332232724e-05, "loss": 0.6431, "step": 44400 }, { "epoch": 4.463124186930852, "grad_norm": 7.840386390686035, "learning_rate": 4.9023636176137575e-05, "loss": 0.6219, "step": 44600 }, { "epoch": 4.483138196737716, "grad_norm": 3.651336193084717, "learning_rate": 4.901491702951437e-05, "loss": 0.5985, "step": 44800 }, { "epoch": 4.5031522065445815, "grad_norm": 4.822089195251465, "learning_rate": 4.9006159906151415e-05, "loss": 0.6196, "step": 45000 }, { "epoch": 4.5031522065445815, "eval_loss": 0.8108223676681519, "eval_runtime": 1595.3882, "eval_samples_per_second": 6.208, "eval_steps_per_second": 1.035, "step": 45000 }, { "epoch": 4.523166216351446, "grad_norm": 2.1222033500671387, "learning_rate": 4.899736481989705e-05, "loss": 0.674, "step": 45200 }, { "epoch": 4.543180226158311, "grad_norm": 1.322166085243225, "learning_rate": 4.898853178465966e-05, "loss": 0.6321, "step": 45400 }, { "epoch": 4.563194235965176, "grad_norm": 6.389608383178711, "learning_rate": 4.897966081440765e-05, "loss": 0.6531, "step": 45600 }, { "epoch": 4.58320824577204, "grad_norm": 5.005728721618652, "learning_rate": 4.897075192316939e-05, "loss": 0.6016, "step": 45800 }, { "epoch": 4.603222255578905, "grad_norm": 2.4323718547821045, "learning_rate": 4.8961805125033254e-05, "loss": 0.6182, "step": 46000 }, { "epoch": 4.603222255578905, "eval_loss": 0.8169678449630737, "eval_runtime": 1590.4327, "eval_samples_per_second": 6.227, "eval_steps_per_second": 1.038, "step": 46000 }, { "epoch": 4.62323626538577, "grad_norm": 17.969669342041016, "learning_rate": 4.895282043414753e-05, "loss": 0.6142, "step": 46200 }, { "epoch": 4.643250275192635, "grad_norm": 24.616683959960938, "learning_rate": 4.8943797864720434e-05, "loss": 0.6427, "step": 46400 }, { "epoch": 4.663264284999499, "grad_norm": 3.0616493225097656, "learning_rate": 4.893473743102009e-05, "loss": 0.6221, "step": 46600 }, { "epoch": 4.683278294806365, "grad_norm": 5.405192852020264, "learning_rate": 4.8925639147374515e-05, "loss": 0.644, "step": 46800 }, { "epoch": 4.703292304613229, "grad_norm": 20.367870330810547, "learning_rate": 4.891650302817154e-05, "loss": 0.6255, "step": 47000 }, { "epoch": 4.703292304613229, "eval_loss": 0.7534742951393127, "eval_runtime": 1591.6836, "eval_samples_per_second": 6.222, "eval_steps_per_second": 1.037, "step": 47000 }, { "epoch": 4.723306314420094, "grad_norm": 8.428770065307617, "learning_rate": 4.8907329087858876e-05, "loss": 0.6053, "step": 47200 }, { "epoch": 4.743320324226959, "grad_norm": 3.0602355003356934, "learning_rate": 4.889811734094401e-05, "loss": 0.6584, "step": 47400 }, { "epoch": 4.763334334033823, "grad_norm": 7.801473617553711, "learning_rate": 4.888886780199421e-05, "loss": 0.5903, "step": 47600 }, { "epoch": 4.7833483438406885, "grad_norm": 12.095342636108398, "learning_rate": 4.8879580485636546e-05, "loss": 0.6252, "step": 47800 }, { "epoch": 4.803362353647553, "grad_norm": 2.2458577156066895, "learning_rate": 4.88702554065578e-05, "loss": 0.6262, "step": 48000 }, { "epoch": 4.803362353647553, "eval_loss": 0.7720254063606262, "eval_runtime": 1590.6096, "eval_samples_per_second": 6.227, "eval_steps_per_second": 1.038, "step": 48000 }, { "epoch": 4.823376363454418, "grad_norm": 12.90213394165039, "learning_rate": 4.886089257950448e-05, "loss": 0.5954, "step": 48200 }, { "epoch": 4.843390373261283, "grad_norm": 6.05360746383667, "learning_rate": 4.8851492019282776e-05, "loss": 0.6258, "step": 48400 }, { "epoch": 4.863404383068148, "grad_norm": 8.981491088867188, "learning_rate": 4.884205374075856e-05, "loss": 0.6176, "step": 48600 }, { "epoch": 4.883418392875012, "grad_norm": 4.065619468688965, "learning_rate": 4.8832577758857354e-05, "loss": 0.641, "step": 48800 }, { "epoch": 4.903432402681878, "grad_norm": 6.65540075302124, "learning_rate": 4.88230640885643e-05, "loss": 0.582, "step": 49000 }, { "epoch": 4.903432402681878, "eval_loss": 0.7783287167549133, "eval_runtime": 1590.3549, "eval_samples_per_second": 6.228, "eval_steps_per_second": 1.038, "step": 49000 }, { "epoch": 4.923446412488742, "grad_norm": 4.533128261566162, "learning_rate": 4.881351274492413e-05, "loss": 0.6118, "step": 49200 }, { "epoch": 4.9434604222956064, "grad_norm": 7.281586647033691, "learning_rate": 4.880392374304116e-05, "loss": 0.5972, "step": 49400 }, { "epoch": 4.963474432102472, "grad_norm": 5.0835347175598145, "learning_rate": 4.8794297098079266e-05, "loss": 0.5734, "step": 49600 }, { "epoch": 4.983488441909336, "grad_norm": 3.1324243545532227, "learning_rate": 4.878463282526184e-05, "loss": 0.626, "step": 49800 }, { "epoch": 5.0035024517162014, "grad_norm": 3.5139007568359375, "learning_rate": 4.877493093987178e-05, "loss": 0.6057, "step": 50000 }, { "epoch": 5.0035024517162014, "eval_loss": 0.7316200137138367, "eval_runtime": 1591.7629, "eval_samples_per_second": 6.222, "eval_steps_per_second": 1.037, "step": 50000 }, { "epoch": 5.023516461523066, "grad_norm": 5.95849609375, "learning_rate": 4.876519145725147e-05, "loss": 0.5763, "step": 50200 }, { "epoch": 5.043530471329931, "grad_norm": 3.809885263442993, "learning_rate": 4.8755414392802736e-05, "loss": 0.5746, "step": 50400 }, { "epoch": 5.063544481136796, "grad_norm": 4.327802658081055, "learning_rate": 4.874559976198685e-05, "loss": 0.5603, "step": 50600 }, { "epoch": 5.083558490943661, "grad_norm": 6.0695672035217285, "learning_rate": 4.8735747580324495e-05, "loss": 0.6253, "step": 50800 }, { "epoch": 5.103572500750525, "grad_norm": 11.268665313720703, "learning_rate": 4.872585786339571e-05, "loss": 0.5842, "step": 51000 }, { "epoch": 5.103572500750525, "eval_loss": 0.7857053279876709, "eval_runtime": 1590.4536, "eval_samples_per_second": 6.227, "eval_steps_per_second": 1.038, "step": 51000 }, { "epoch": 5.123586510557391, "grad_norm": 10.165565490722656, "learning_rate": 4.871593062683992e-05, "loss": 0.5785, "step": 51200 }, { "epoch": 5.143600520364255, "grad_norm": 5.183705806732178, "learning_rate": 4.8705965886355864e-05, "loss": 0.6241, "step": 51400 }, { "epoch": 5.163614530171119, "grad_norm": 4.588796615600586, "learning_rate": 4.8695963657701596e-05, "loss": 0.5663, "step": 51600 }, { "epoch": 5.183628539977985, "grad_norm": 6.818345069885254, "learning_rate": 4.868592395669446e-05, "loss": 0.5976, "step": 51800 }, { "epoch": 5.203642549784849, "grad_norm": 10.186352729797363, "learning_rate": 4.8675846799211044e-05, "loss": 0.5884, "step": 52000 }, { "epoch": 5.203642549784849, "eval_loss": 0.7691501975059509, "eval_runtime": 1599.2487, "eval_samples_per_second": 6.193, "eval_steps_per_second": 1.032, "step": 52000 }, { "epoch": 5.223656559591714, "grad_norm": 4.758870601654053, "learning_rate": 4.866573220118718e-05, "loss": 0.566, "step": 52200 }, { "epoch": 5.243670569398579, "grad_norm": 2.222989559173584, "learning_rate": 4.86555801786179e-05, "loss": 0.5793, "step": 52400 }, { "epoch": 5.263684579205444, "grad_norm": 0.7242764234542847, "learning_rate": 4.864539074755743e-05, "loss": 0.5782, "step": 52600 }, { "epoch": 5.2836985890123085, "grad_norm": 5.347113132476807, "learning_rate": 4.863516392411913e-05, "loss": 0.564, "step": 52800 }, { "epoch": 5.303712598819174, "grad_norm": 8.143167495727539, "learning_rate": 4.8624899724475526e-05, "loss": 0.6029, "step": 53000 }, { "epoch": 5.303712598819174, "eval_loss": 0.8285964131355286, "eval_runtime": 1548.0585, "eval_samples_per_second": 6.398, "eval_steps_per_second": 1.066, "step": 53000 }, { "epoch": 5.323726608626038, "grad_norm": 11.232599258422852, "learning_rate": 4.8614598164858214e-05, "loss": 0.606, "step": 53200 }, { "epoch": 5.3437406184329035, "grad_norm": 1.3047754764556885, "learning_rate": 4.860425926155789e-05, "loss": 0.6121, "step": 53400 }, { "epoch": 5.363754628239768, "grad_norm": 4.630054950714111, "learning_rate": 4.859388303092432e-05, "loss": 0.6047, "step": 53600 }, { "epoch": 5.383768638046632, "grad_norm": 7.616769790649414, "learning_rate": 4.858346948936626e-05, "loss": 0.5915, "step": 53800 }, { "epoch": 5.403782647853498, "grad_norm": 1.5812709331512451, "learning_rate": 4.85730186533515e-05, "loss": 0.5965, "step": 54000 }, { "epoch": 5.403782647853498, "eval_loss": 0.7853936553001404, "eval_runtime": 1545.3026, "eval_samples_per_second": 6.409, "eval_steps_per_second": 1.068, "step": 54000 }, { "epoch": 5.423796657660362, "grad_norm": 4.173960208892822, "learning_rate": 4.856253053940679e-05, "loss": 0.6323, "step": 54200 }, { "epoch": 5.443810667467227, "grad_norm": 5.145197868347168, "learning_rate": 4.8552005164117843e-05, "loss": 0.5809, "step": 54400 }, { "epoch": 5.463824677274092, "grad_norm": 5.654176235198975, "learning_rate": 4.854144254412929e-05, "loss": 0.5896, "step": 54600 }, { "epoch": 5.483838687080957, "grad_norm": 14.16973876953125, "learning_rate": 4.8530842696144654e-05, "loss": 0.5673, "step": 54800 }, { "epoch": 5.503852696887821, "grad_norm": 3.832796573638916, "learning_rate": 4.852020563692634e-05, "loss": 0.5671, "step": 55000 }, { "epoch": 5.503852696887821, "eval_loss": 0.7880047559738159, "eval_runtime": 1542.8956, "eval_samples_per_second": 6.419, "eval_steps_per_second": 1.07, "step": 55000 }, { "epoch": 5.523866706694687, "grad_norm": 14.580747604370117, "learning_rate": 4.8509531383295594e-05, "loss": 0.6165, "step": 55200 }, { "epoch": 5.543880716501551, "grad_norm": 6.7741475105285645, "learning_rate": 4.849881995213248e-05, "loss": 0.6314, "step": 55400 }, { "epoch": 5.5638947263084155, "grad_norm": 5.056469440460205, "learning_rate": 4.8488071360375854e-05, "loss": 0.6093, "step": 55600 }, { "epoch": 5.583908736115281, "grad_norm": 6.04001522064209, "learning_rate": 4.847728562502334e-05, "loss": 0.619, "step": 55800 }, { "epoch": 5.603922745922145, "grad_norm": 5.1041646003723145, "learning_rate": 4.8466462763131295e-05, "loss": 0.6255, "step": 56000 }, { "epoch": 5.603922745922145, "eval_loss": 0.741844117641449, "eval_runtime": 1543.6342, "eval_samples_per_second": 6.416, "eval_steps_per_second": 1.07, "step": 56000 }, { "epoch": 5.6239367557290105, "grad_norm": 3.6125340461730957, "learning_rate": 4.8455602791814784e-05, "loss": 0.6042, "step": 56200 }, { "epoch": 5.643950765535875, "grad_norm": 8.569421768188477, "learning_rate": 4.8444705728247575e-05, "loss": 0.6173, "step": 56400 }, { "epoch": 5.66396477534274, "grad_norm": 0.38532838225364685, "learning_rate": 4.843377158966208e-05, "loss": 0.5801, "step": 56600 }, { "epoch": 5.683978785149605, "grad_norm": 6.338273048400879, "learning_rate": 4.8422800393349334e-05, "loss": 0.5685, "step": 56800 }, { "epoch": 5.70399279495647, "grad_norm": 0.5274912118911743, "learning_rate": 4.8411792156658975e-05, "loss": 0.6331, "step": 57000 }, { "epoch": 5.70399279495647, "eval_loss": 0.7559316754341125, "eval_runtime": 1544.4576, "eval_samples_per_second": 6.413, "eval_steps_per_second": 1.069, "step": 57000 }, { "epoch": 5.724006804763334, "grad_norm": 5.574749946594238, "learning_rate": 4.8400746896999236e-05, "loss": 0.5861, "step": 57200 }, { "epoch": 5.744020814570199, "grad_norm": 10.209929466247559, "learning_rate": 4.838966463183687e-05, "loss": 0.595, "step": 57400 }, { "epoch": 5.764034824377064, "grad_norm": 7.6002044677734375, "learning_rate": 4.8378545378697176e-05, "loss": 0.602, "step": 57600 }, { "epoch": 5.7840488341839285, "grad_norm": 12.900764465332031, "learning_rate": 4.836738915516393e-05, "loss": 0.6035, "step": 57800 }, { "epoch": 5.804062843990794, "grad_norm": 3.203505039215088, "learning_rate": 4.835619597887937e-05, "loss": 0.5631, "step": 58000 }, { "epoch": 5.804062843990794, "eval_loss": 0.7846883535385132, "eval_runtime": 1544.5761, "eval_samples_per_second": 6.412, "eval_steps_per_second": 1.069, "step": 58000 }, { "epoch": 5.824076853797658, "grad_norm": 3.8713245391845703, "learning_rate": 4.834496586754418e-05, "loss": 0.5991, "step": 58200 }, { "epoch": 5.8440908636045235, "grad_norm": 2.455888509750366, "learning_rate": 4.833369883891744e-05, "loss": 0.6084, "step": 58400 }, { "epoch": 5.864104873411388, "grad_norm": 12.519994735717773, "learning_rate": 4.832239491081662e-05, "loss": 0.613, "step": 58600 }, { "epoch": 5.884118883218253, "grad_norm": 10.638232231140137, "learning_rate": 4.8311054101117546e-05, "loss": 0.5766, "step": 58800 }, { "epoch": 5.904132893025118, "grad_norm": 3.805115222930908, "learning_rate": 4.8299676427754365e-05, "loss": 0.6025, "step": 59000 }, { "epoch": 5.904132893025118, "eval_loss": 0.7480840682983398, "eval_runtime": 1547.3966, "eval_samples_per_second": 6.4, "eval_steps_per_second": 1.067, "step": 59000 }, { "epoch": 5.924146902831982, "grad_norm": 5.378095626831055, "learning_rate": 4.828826190871951e-05, "loss": 0.6051, "step": 59200 }, { "epoch": 5.944160912638847, "grad_norm": 3.4008994102478027, "learning_rate": 4.827681056206368e-05, "loss": 0.6076, "step": 59400 }, { "epoch": 5.964174922445712, "grad_norm": 4.712219715118408, "learning_rate": 4.826532240589583e-05, "loss": 0.5843, "step": 59600 }, { "epoch": 5.984188932252577, "grad_norm": 16.525747299194336, "learning_rate": 4.8253797458383115e-05, "loss": 0.5992, "step": 59800 }, { "epoch": 6.004202942059441, "grad_norm": 7.130965232849121, "learning_rate": 4.824223573775087e-05, "loss": 0.5935, "step": 60000 }, { "epoch": 6.004202942059441, "eval_loss": 0.7637354135513306, "eval_runtime": 1544.2409, "eval_samples_per_second": 6.414, "eval_steps_per_second": 1.069, "step": 60000 }, { "epoch": 6.024216951866307, "grad_norm": 2.9639017581939697, "learning_rate": 4.823063726228258e-05, "loss": 0.5514, "step": 60200 }, { "epoch": 6.044230961673171, "grad_norm": 11.580534934997559, "learning_rate": 4.821900205031986e-05, "loss": 0.5665, "step": 60400 }, { "epoch": 6.064244971480036, "grad_norm": 6.879183292388916, "learning_rate": 4.820733012026242e-05, "loss": 0.5523, "step": 60600 }, { "epoch": 6.084258981286901, "grad_norm": 2.010526180267334, "learning_rate": 4.819562149056801e-05, "loss": 0.5371, "step": 60800 }, { "epoch": 6.104272991093765, "grad_norm": 6.071661472320557, "learning_rate": 4.8183876179752466e-05, "loss": 0.5276, "step": 61000 }, { "epoch": 6.104272991093765, "eval_loss": 0.830379068851471, "eval_runtime": 1542.137, "eval_samples_per_second": 6.422, "eval_steps_per_second": 1.071, "step": 61000 }, { "epoch": 6.1242870009006305, "grad_norm": 3.4491350650787354, "learning_rate": 4.8172094206389596e-05, "loss": 0.5491, "step": 61200 }, { "epoch": 6.144301010707495, "grad_norm": 4.337362289428711, "learning_rate": 4.8160275589111176e-05, "loss": 0.6327, "step": 61400 }, { "epoch": 6.16431502051436, "grad_norm": 4.566754341125488, "learning_rate": 4.814842034660696e-05, "loss": 0.5164, "step": 61600 }, { "epoch": 6.184329030321225, "grad_norm": 14.335103988647461, "learning_rate": 4.81365284976246e-05, "loss": 0.56, "step": 61800 }, { "epoch": 6.20434304012809, "grad_norm": 2.529834032058716, "learning_rate": 4.812460006096965e-05, "loss": 0.5705, "step": 62000 }, { "epoch": 6.20434304012809, "eval_loss": 0.8106730580329895, "eval_runtime": 1543.5914, "eval_samples_per_second": 6.416, "eval_steps_per_second": 1.07, "step": 62000 }, { "epoch": 6.224357049934954, "grad_norm": 9.699427604675293, "learning_rate": 4.811263505550551e-05, "loss": 0.5354, "step": 62200 }, { "epoch": 6.24437105974182, "grad_norm": 12.089444160461426, "learning_rate": 4.810063350015342e-05, "loss": 0.5613, "step": 62400 }, { "epoch": 6.264385069548684, "grad_norm": 4.127741813659668, "learning_rate": 4.808859541389241e-05, "loss": 0.6164, "step": 62600 }, { "epoch": 6.284399079355548, "grad_norm": 2.735119581222534, "learning_rate": 4.8076520815759286e-05, "loss": 0.6074, "step": 62800 }, { "epoch": 6.304413089162414, "grad_norm": 2.50549054145813, "learning_rate": 4.806440972484859e-05, "loss": 0.5237, "step": 63000 }, { "epoch": 6.304413089162414, "eval_loss": 0.8066494464874268, "eval_runtime": 1461.2531, "eval_samples_per_second": 6.778, "eval_steps_per_second": 1.13, "step": 63000 }, { "epoch": 6.324427098969278, "grad_norm": 2.4227826595306396, "learning_rate": 4.8052262160312576e-05, "loss": 0.5526, "step": 63200 }, { "epoch": 6.344441108776143, "grad_norm": 10.553482055664062, "learning_rate": 4.804007814136118e-05, "loss": 0.5748, "step": 63400 }, { "epoch": 6.364455118583008, "grad_norm": 8.875799179077148, "learning_rate": 4.802785768726197e-05, "loss": 0.5871, "step": 63600 }, { "epoch": 6.384469128389873, "grad_norm": 6.988894939422607, "learning_rate": 4.801560081734015e-05, "loss": 0.599, "step": 63800 }, { "epoch": 6.4044831381967375, "grad_norm": 5.192503452301025, "learning_rate": 4.80033075509785e-05, "loss": 0.6018, "step": 64000 }, { "epoch": 6.4044831381967375, "eval_loss": 0.7544043064117432, "eval_runtime": 1463.1673, "eval_samples_per_second": 6.769, "eval_steps_per_second": 1.128, "step": 64000 }, { "epoch": 6.424497148003603, "grad_norm": 6.199488162994385, "learning_rate": 4.7990977907617364e-05, "loss": 0.578, "step": 64200 }, { "epoch": 6.444511157810467, "grad_norm": 7.2241010665893555, "learning_rate": 4.797861190675461e-05, "loss": 0.5955, "step": 64400 }, { "epoch": 6.4645251676173325, "grad_norm": 11.323012351989746, "learning_rate": 4.7966209567945606e-05, "loss": 0.54, "step": 64600 }, { "epoch": 6.484539177424197, "grad_norm": 11.22043228149414, "learning_rate": 4.7953770910803164e-05, "loss": 0.5659, "step": 64800 }, { "epoch": 6.504553187231062, "grad_norm": 4.568957805633545, "learning_rate": 4.7941295954997557e-05, "loss": 0.5719, "step": 65000 }, { "epoch": 6.504553187231062, "eval_loss": 0.7700421214103699, "eval_runtime": 1464.4475, "eval_samples_per_second": 6.763, "eval_steps_per_second": 1.127, "step": 65000 }, { "epoch": 6.524567197037927, "grad_norm": 8.035478591918945, "learning_rate": 4.792878472025644e-05, "loss": 0.5697, "step": 65200 }, { "epoch": 6.544581206844791, "grad_norm": 7.443570613861084, "learning_rate": 4.7916237226364834e-05, "loss": 0.5896, "step": 65400 }, { "epoch": 6.564595216651656, "grad_norm": 4.9627861976623535, "learning_rate": 4.790365349316513e-05, "loss": 0.5897, "step": 65600 }, { "epoch": 6.584609226458521, "grad_norm": 5.448978424072266, "learning_rate": 4.789103354055701e-05, "loss": 0.5888, "step": 65800 }, { "epoch": 6.604623236265386, "grad_norm": 6.586881637573242, "learning_rate": 4.7878377388497403e-05, "loss": 0.5675, "step": 66000 }, { "epoch": 6.604623236265386, "eval_loss": 0.7565573453903198, "eval_runtime": 1467.7836, "eval_samples_per_second": 6.748, "eval_steps_per_second": 1.125, "step": 66000 }, { "epoch": 6.6246372460722505, "grad_norm": 4.141233921051025, "learning_rate": 4.786568505700053e-05, "loss": 0.5453, "step": 66200 }, { "epoch": 6.644651255879116, "grad_norm": 1.1646991968154907, "learning_rate": 4.785295656613781e-05, "loss": 0.5556, "step": 66400 }, { "epoch": 6.66466526568598, "grad_norm": 5.191328048706055, "learning_rate": 4.784019193603784e-05, "loss": 0.5566, "step": 66600 }, { "epoch": 6.6846792754928455, "grad_norm": 9.895342826843262, "learning_rate": 4.782739118688635e-05, "loss": 0.6004, "step": 66800 }, { "epoch": 6.70469328529971, "grad_norm": 1.419704794883728, "learning_rate": 4.781455433892622e-05, "loss": 0.5925, "step": 67000 }, { "epoch": 6.70469328529971, "eval_loss": 0.758189857006073, "eval_runtime": 1487.1051, "eval_samples_per_second": 6.66, "eval_steps_per_second": 1.11, "step": 67000 }, { "epoch": 6.724707295106574, "grad_norm": 6.3590521812438965, "learning_rate": 4.78016814124574e-05, "loss": 0.5688, "step": 67200 }, { "epoch": 6.74472130491344, "grad_norm": 1.2542431354522705, "learning_rate": 4.7788772427836886e-05, "loss": 0.5245, "step": 67400 }, { "epoch": 6.764735314720304, "grad_norm": 2.1889560222625732, "learning_rate": 4.7775827405478715e-05, "loss": 0.5868, "step": 67600 }, { "epoch": 6.784749324527169, "grad_norm": 17.025833129882812, "learning_rate": 4.776284636585389e-05, "loss": 0.5573, "step": 67800 }, { "epoch": 6.804763334334034, "grad_norm": 5.567526817321777, "learning_rate": 4.774982932949039e-05, "loss": 0.5704, "step": 68000 }, { "epoch": 6.804763334334034, "eval_loss": 0.8135498762130737, "eval_runtime": 1481.5421, "eval_samples_per_second": 6.685, "eval_steps_per_second": 1.114, "step": 68000 }, { "epoch": 6.824777344140899, "grad_norm": 9.542104721069336, "learning_rate": 4.773677631697312e-05, "loss": 0.5989, "step": 68200 }, { "epoch": 6.844791353947763, "grad_norm": 3.8040010929107666, "learning_rate": 4.7723687348943865e-05, "loss": 0.5879, "step": 68400 }, { "epoch": 6.864805363754629, "grad_norm": 10.797565460205078, "learning_rate": 4.771056244610127e-05, "loss": 0.5773, "step": 68600 }, { "epoch": 6.884819373561493, "grad_norm": 2.8293638229370117, "learning_rate": 4.769740162920081e-05, "loss": 0.5717, "step": 68800 }, { "epoch": 6.9048333833683575, "grad_norm": 7.189076900482178, "learning_rate": 4.7684204919054754e-05, "loss": 0.5581, "step": 69000 }, { "epoch": 6.9048333833683575, "eval_loss": 0.7789385318756104, "eval_runtime": 1462.5074, "eval_samples_per_second": 6.772, "eval_steps_per_second": 1.129, "step": 69000 }, { "epoch": 6.924847393175223, "grad_norm": 3.3455846309661865, "learning_rate": 4.767097233653214e-05, "loss": 0.5843, "step": 69200 }, { "epoch": 6.944861402982087, "grad_norm": 6.965803146362305, "learning_rate": 4.765770390255871e-05, "loss": 0.6092, "step": 69400 }, { "epoch": 6.9648754127889525, "grad_norm": 11.405823707580566, "learning_rate": 4.7644399638116944e-05, "loss": 0.5399, "step": 69600 }, { "epoch": 6.984889422595817, "grad_norm": 4.625624179840088, "learning_rate": 4.763105956424593e-05, "loss": 0.5775, "step": 69800 }, { "epoch": 7.004903432402682, "grad_norm": 4.556118488311768, "learning_rate": 4.761768370204142e-05, "loss": 0.6012, "step": 70000 }, { "epoch": 7.004903432402682, "eval_loss": 0.7578462958335876, "eval_runtime": 1460.9931, "eval_samples_per_second": 6.779, "eval_steps_per_second": 1.13, "step": 70000 }, { "epoch": 7.024917442209547, "grad_norm": 6.490872859954834, "learning_rate": 4.760427207265575e-05, "loss": 0.5317, "step": 70200 }, { "epoch": 7.044931452016412, "grad_norm": 3.015681266784668, "learning_rate": 4.759082469729783e-05, "loss": 0.5291, "step": 70400 }, { "epoch": 7.064945461823276, "grad_norm": 7.003421306610107, "learning_rate": 4.757734159723308e-05, "loss": 0.5158, "step": 70600 }, { "epoch": 7.084959471630141, "grad_norm": 11.199054718017578, "learning_rate": 4.756382279378341e-05, "loss": 0.5371, "step": 70800 }, { "epoch": 7.104973481437006, "grad_norm": 6.298213481903076, "learning_rate": 4.755026830832722e-05, "loss": 0.5415, "step": 71000 }, { "epoch": 7.104973481437006, "eval_loss": 0.7836835384368896, "eval_runtime": 1462.4594, "eval_samples_per_second": 6.772, "eval_steps_per_second": 1.129, "step": 71000 }, { "epoch": 7.12498749124387, "grad_norm": 7.163265228271484, "learning_rate": 4.7536678162299314e-05, "loss": 0.541, "step": 71200 }, { "epoch": 7.145001501050736, "grad_norm": 3.766996145248413, "learning_rate": 4.752305237719089e-05, "loss": 0.5465, "step": 71400 }, { "epoch": 7.1650155108576, "grad_norm": 7.500974655151367, "learning_rate": 4.750939097454952e-05, "loss": 0.5464, "step": 71600 }, { "epoch": 7.185029520664465, "grad_norm": 6.896329402923584, "learning_rate": 4.749569397597907e-05, "loss": 0.5555, "step": 71800 }, { "epoch": 7.20504353047133, "grad_norm": 0.9183210134506226, "learning_rate": 4.748196140313974e-05, "loss": 0.582, "step": 72000 }, { "epoch": 7.20504353047133, "eval_loss": 0.7605819702148438, "eval_runtime": 1462.6082, "eval_samples_per_second": 6.771, "eval_steps_per_second": 1.129, "step": 72000 }, { "epoch": 7.225057540278195, "grad_norm": 3.8166685104370117, "learning_rate": 4.746819327774794e-05, "loss": 0.5281, "step": 72200 }, { "epoch": 7.2450715500850595, "grad_norm": 5.982170104980469, "learning_rate": 4.745438962157635e-05, "loss": 0.5135, "step": 72400 }, { "epoch": 7.265085559891924, "grad_norm": 14.572467803955078, "learning_rate": 4.7440550456453795e-05, "loss": 0.5609, "step": 72600 }, { "epoch": 7.285099569698789, "grad_norm": 7.50581693649292, "learning_rate": 4.742667580426528e-05, "loss": 0.545, "step": 72800 }, { "epoch": 7.305113579505654, "grad_norm": 2.362046003341675, "learning_rate": 4.741276568695192e-05, "loss": 0.5538, "step": 73000 }, { "epoch": 7.305113579505654, "eval_loss": 0.7861754894256592, "eval_runtime": 1462.0527, "eval_samples_per_second": 6.774, "eval_steps_per_second": 1.129, "step": 73000 }, { "epoch": 7.325127589312519, "grad_norm": 2.1391143798828125, "learning_rate": 4.739882012651091e-05, "loss": 0.5507, "step": 73200 }, { "epoch": 7.345141599119383, "grad_norm": 12.303117752075195, "learning_rate": 4.7384839144995494e-05, "loss": 0.5144, "step": 73400 }, { "epoch": 7.365155608926249, "grad_norm": 9.266813278198242, "learning_rate": 4.737082276451494e-05, "loss": 0.5759, "step": 73600 }, { "epoch": 7.385169618733113, "grad_norm": 8.287341117858887, "learning_rate": 4.7356771007234496e-05, "loss": 0.5188, "step": 73800 }, { "epoch": 7.405183628539978, "grad_norm": 29.918621063232422, "learning_rate": 4.734268389537534e-05, "loss": 0.5372, "step": 74000 }, { "epoch": 7.405183628539978, "eval_loss": 0.8055480718612671, "eval_runtime": 1523.6179, "eval_samples_per_second": 6.5, "eval_steps_per_second": 1.084, "step": 74000 }, { "epoch": 7.425197638346843, "grad_norm": 17.910940170288086, "learning_rate": 4.732856145121456e-05, "loss": 0.5209, "step": 74200 }, { "epoch": 7.445211648153707, "grad_norm": 7.601934432983398, "learning_rate": 4.731440369708514e-05, "loss": 0.5229, "step": 74400 }, { "epoch": 7.4652256579605725, "grad_norm": 11.329294204711914, "learning_rate": 4.730021065537588e-05, "loss": 0.5669, "step": 74600 }, { "epoch": 7.485239667767437, "grad_norm": 16.27219009399414, "learning_rate": 4.7285982348531376e-05, "loss": 0.5669, "step": 74800 }, { "epoch": 7.505253677574302, "grad_norm": 1.7959884405136108, "learning_rate": 4.7271718799052033e-05, "loss": 0.5522, "step": 75000 }, { "epoch": 7.505253677574302, "eval_loss": 0.765407919883728, "eval_runtime": 1524.483, "eval_samples_per_second": 6.497, "eval_steps_per_second": 1.083, "step": 75000 }, { "epoch": 7.525267687381167, "grad_norm": 6.736725807189941, "learning_rate": 4.725742002949394e-05, "loss": 0.5787, "step": 75200 }, { "epoch": 7.545281697188032, "grad_norm": 25.726804733276367, "learning_rate": 4.7243086062468914e-05, "loss": 0.5368, "step": 75400 }, { "epoch": 7.565295706994896, "grad_norm": 5.139993667602539, "learning_rate": 4.7228716920644414e-05, "loss": 0.5312, "step": 75600 }, { "epoch": 7.585309716801762, "grad_norm": 11.758186340332031, "learning_rate": 4.721431262674353e-05, "loss": 0.5423, "step": 75800 }, { "epoch": 7.605323726608626, "grad_norm": 6.783292770385742, "learning_rate": 4.719987320354495e-05, "loss": 0.5765, "step": 76000 }, { "epoch": 7.605323726608626, "eval_loss": 0.7755579948425293, "eval_runtime": 1525.6457, "eval_samples_per_second": 6.492, "eval_steps_per_second": 1.082, "step": 76000 }, { "epoch": 7.62533773641549, "grad_norm": 5.204184055328369, "learning_rate": 4.718539867388292e-05, "loss": 0.5568, "step": 76200 }, { "epoch": 7.645351746222356, "grad_norm": 12.7410249710083, "learning_rate": 4.717088906064716e-05, "loss": 0.5752, "step": 76400 }, { "epoch": 7.66536575602922, "grad_norm": 5.71970272064209, "learning_rate": 4.715634438678292e-05, "loss": 0.5802, "step": 76600 }, { "epoch": 7.685379765836085, "grad_norm": 4.99209451675415, "learning_rate": 4.714176467529087e-05, "loss": 0.5241, "step": 76800 }, { "epoch": 7.70539377564295, "grad_norm": 6.552138328552246, "learning_rate": 4.712714994922709e-05, "loss": 0.5578, "step": 77000 }, { "epoch": 7.70539377564295, "eval_loss": 0.75287926197052, "eval_runtime": 1525.7719, "eval_samples_per_second": 6.491, "eval_steps_per_second": 1.082, "step": 77000 }, { "epoch": 7.725407785449815, "grad_norm": 0.49038270115852356, "learning_rate": 4.711250023170304e-05, "loss": 0.5464, "step": 77200 }, { "epoch": 7.7454217952566795, "grad_norm": 28.516048431396484, "learning_rate": 4.70978155458855e-05, "loss": 0.5412, "step": 77400 }, { "epoch": 7.765435805063545, "grad_norm": 8.932271957397461, "learning_rate": 4.708309591499657e-05, "loss": 0.5436, "step": 77600 }, { "epoch": 7.785449814870409, "grad_norm": 8.901341438293457, "learning_rate": 4.7068341362313597e-05, "loss": 0.5608, "step": 77800 }, { "epoch": 7.805463824677274, "grad_norm": 5.064136981964111, "learning_rate": 4.7053551911169136e-05, "loss": 0.5296, "step": 78000 }, { "epoch": 7.805463824677274, "eval_loss": 0.7763922810554504, "eval_runtime": 1525.7829, "eval_samples_per_second": 6.491, "eval_steps_per_second": 1.082, "step": 78000 }, { "epoch": 7.825477834484139, "grad_norm": 6.408000946044922, "learning_rate": 4.703872758495096e-05, "loss": 0.5533, "step": 78200 }, { "epoch": 7.845491844291003, "grad_norm": 11.75259017944336, "learning_rate": 4.702386840710199e-05, "loss": 0.548, "step": 78400 }, { "epoch": 7.865505854097869, "grad_norm": 5.475959777832031, "learning_rate": 4.700897440112023e-05, "loss": 0.5836, "step": 78600 }, { "epoch": 7.885519863904733, "grad_norm": 5.4115471839904785, "learning_rate": 4.69940455905588e-05, "loss": 0.5846, "step": 78800 }, { "epoch": 7.905533873711598, "grad_norm": 4.801541805267334, "learning_rate": 4.6979081999025834e-05, "loss": 0.5972, "step": 79000 }, { "epoch": 7.905533873711598, "eval_loss": 0.7697009444236755, "eval_runtime": 1524.6109, "eval_samples_per_second": 6.496, "eval_steps_per_second": 1.083, "step": 79000 }, { "epoch": 7.925547883518463, "grad_norm": 5.107807159423828, "learning_rate": 4.6964083650184476e-05, "loss": 0.559, "step": 79200 }, { "epoch": 7.945561893325328, "grad_norm": 11.719144821166992, "learning_rate": 4.694905056775284e-05, "loss": 0.5216, "step": 79400 }, { "epoch": 7.965575903132192, "grad_norm": 6.384688854217529, "learning_rate": 4.693398277550395e-05, "loss": 0.5475, "step": 79600 }, { "epoch": 7.985589912939057, "grad_norm": 4.395055294036865, "learning_rate": 4.691888029726573e-05, "loss": 0.5615, "step": 79800 }, { "epoch": 8.005603922745923, "grad_norm": 4.827722549438477, "learning_rate": 4.690374315692098e-05, "loss": 0.5395, "step": 80000 }, { "epoch": 8.005603922745923, "eval_loss": 0.763211190700531, "eval_runtime": 1524.6653, "eval_samples_per_second": 6.496, "eval_steps_per_second": 1.083, "step": 80000 }, { "epoch": 8.025617932552787, "grad_norm": 1.9977535009384155, "learning_rate": 4.688857137840725e-05, "loss": 0.5345, "step": 80200 }, { "epoch": 8.045631942359652, "grad_norm": 3.336663246154785, "learning_rate": 4.687336498571694e-05, "loss": 0.4857, "step": 80400 }, { "epoch": 8.065645952166516, "grad_norm": 112.60790252685547, "learning_rate": 4.6858124002897134e-05, "loss": 0.4998, "step": 80600 }, { "epoch": 8.08565996197338, "grad_norm": 5.354961395263672, "learning_rate": 4.6842848454049656e-05, "loss": 0.5029, "step": 80800 }, { "epoch": 8.105673971780247, "grad_norm": 4.052475929260254, "learning_rate": 4.682753836333095e-05, "loss": 0.5464, "step": 81000 }, { "epoch": 8.105673971780247, "eval_loss": 0.7941437363624573, "eval_runtime": 1524.3149, "eval_samples_per_second": 6.497, "eval_steps_per_second": 1.083, "step": 81000 }, { "epoch": 8.125687981587111, "grad_norm": 8.936397552490234, "learning_rate": 4.6812193754952124e-05, "loss": 0.5339, "step": 81200 }, { "epoch": 8.145701991393976, "grad_norm": 10.229630470275879, "learning_rate": 4.679681465317884e-05, "loss": 0.5269, "step": 81400 }, { "epoch": 8.16571600120084, "grad_norm": 12.118572235107422, "learning_rate": 4.678140108233135e-05, "loss": 0.5189, "step": 81600 }, { "epoch": 8.185730011007706, "grad_norm": 7.392326354980469, "learning_rate": 4.6765953066784344e-05, "loss": 0.536, "step": 81800 }, { "epoch": 8.20574402081457, "grad_norm": 25.872758865356445, "learning_rate": 4.675047063096706e-05, "loss": 0.5105, "step": 82000 }, { "epoch": 8.20574402081457, "eval_loss": 0.8114084005355835, "eval_runtime": 1528.1359, "eval_samples_per_second": 6.481, "eval_steps_per_second": 1.08, "step": 82000 }, { "epoch": 8.225758030621435, "grad_norm": 8.335721015930176, "learning_rate": 4.673495379936311e-05, "loss": 0.5407, "step": 82200 }, { "epoch": 8.2457720404283, "grad_norm": 12.653794288635254, "learning_rate": 4.671940259651053e-05, "loss": 0.5282, "step": 82400 }, { "epoch": 8.265786050235164, "grad_norm": 4.7570343017578125, "learning_rate": 4.6703817047001694e-05, "loss": 0.5617, "step": 82600 }, { "epoch": 8.28580006004203, "grad_norm": 11.030816078186035, "learning_rate": 4.668819717548331e-05, "loss": 0.5141, "step": 82800 }, { "epoch": 8.305814069848894, "grad_norm": 1.7761846780776978, "learning_rate": 4.6672543006656346e-05, "loss": 0.5385, "step": 83000 }, { "epoch": 8.305814069848894, "eval_loss": 0.788813591003418, "eval_runtime": 1528.5893, "eval_samples_per_second": 6.479, "eval_steps_per_second": 1.08, "step": 83000 }, { "epoch": 8.325828079655759, "grad_norm": 4.854828357696533, "learning_rate": 4.6656854565276e-05, "loss": 0.5497, "step": 83200 }, { "epoch": 8.345842089462623, "grad_norm": 6.066869258880615, "learning_rate": 4.664113187615169e-05, "loss": 0.5309, "step": 83400 }, { "epoch": 8.36585609926949, "grad_norm": 8.572700500488281, "learning_rate": 4.662537496414699e-05, "loss": 0.5177, "step": 83600 }, { "epoch": 8.385870109076354, "grad_norm": 6.054539203643799, "learning_rate": 4.660958385417956e-05, "loss": 0.5174, "step": 83800 }, { "epoch": 8.405884118883218, "grad_norm": 5.9689507484436035, "learning_rate": 4.659375857122119e-05, "loss": 0.5554, "step": 84000 }, { "epoch": 8.405884118883218, "eval_loss": 0.7740781307220459, "eval_runtime": 1525.3004, "eval_samples_per_second": 6.493, "eval_steps_per_second": 1.082, "step": 84000 }, { "epoch": 8.425898128690083, "grad_norm": 7.059330940246582, "learning_rate": 4.657789914029767e-05, "loss": 0.5345, "step": 84200 }, { "epoch": 8.445912138496947, "grad_norm": 17.09292984008789, "learning_rate": 4.6562005586488824e-05, "loss": 0.515, "step": 84400 }, { "epoch": 8.465926148303813, "grad_norm": 3.8907713890075684, "learning_rate": 4.654607793492839e-05, "loss": 0.493, "step": 84600 }, { "epoch": 8.485940158110678, "grad_norm": 15.289313316345215, "learning_rate": 4.65301162108041e-05, "loss": 0.4971, "step": 84800 }, { "epoch": 8.505954167917542, "grad_norm": 14.269034385681152, "learning_rate": 4.651412043935749e-05, "loss": 0.5157, "step": 85000 }, { "epoch": 8.505954167917542, "eval_loss": 0.7897810935974121, "eval_runtime": 1590.8796, "eval_samples_per_second": 6.225, "eval_steps_per_second": 1.038, "step": 85000 }, { "epoch": 8.525968177724407, "grad_norm": 8.801824569702148, "learning_rate": 4.649809064588398e-05, "loss": 0.5471, "step": 85200 }, { "epoch": 8.545982187531273, "grad_norm": 21.51582145690918, "learning_rate": 4.648202685573279e-05, "loss": 0.518, "step": 85400 }, { "epoch": 8.565996197338137, "grad_norm": 11.740971565246582, "learning_rate": 4.646592909430692e-05, "loss": 0.541, "step": 85600 }, { "epoch": 8.586010207145002, "grad_norm": 7.054279804229736, "learning_rate": 4.6449797387063024e-05, "loss": 0.5653, "step": 85800 }, { "epoch": 8.606024216951866, "grad_norm": 7.152441501617432, "learning_rate": 4.6433631759511506e-05, "loss": 0.5334, "step": 86000 }, { "epoch": 8.606024216951866, "eval_loss": 0.7463188767433167, "eval_runtime": 1587.988, "eval_samples_per_second": 6.237, "eval_steps_per_second": 1.04, "step": 86000 }, { "epoch": 8.62603822675873, "grad_norm": 1.2795531749725342, "learning_rate": 4.641743223721639e-05, "loss": 0.5296, "step": 86200 }, { "epoch": 8.646052236565597, "grad_norm": 11.169625282287598, "learning_rate": 4.640119884579529e-05, "loss": 0.5036, "step": 86400 }, { "epoch": 8.666066246372461, "grad_norm": 13.228611946105957, "learning_rate": 4.638493161091938e-05, "loss": 0.5056, "step": 86600 }, { "epoch": 8.686080256179325, "grad_norm": 8.148116111755371, "learning_rate": 4.6368630558313375e-05, "loss": 0.5213, "step": 86800 }, { "epoch": 8.70609426598619, "grad_norm": 2.704084873199463, "learning_rate": 4.635229571375544e-05, "loss": 0.4924, "step": 87000 }, { "epoch": 8.70609426598619, "eval_loss": 0.8092161417007446, "eval_runtime": 1587.0234, "eval_samples_per_second": 6.241, "eval_steps_per_second": 1.04, "step": 87000 }, { "epoch": 8.726108275793056, "grad_norm": 4.078919410705566, "learning_rate": 4.6335927103077206e-05, "loss": 0.5437, "step": 87200 }, { "epoch": 8.74612228559992, "grad_norm": 12.336515426635742, "learning_rate": 4.631952475216368e-05, "loss": 0.5439, "step": 87400 }, { "epoch": 8.766136295406785, "grad_norm": 5.8206305503845215, "learning_rate": 4.6303088686953235e-05, "loss": 0.5619, "step": 87600 }, { "epoch": 8.78615030521365, "grad_norm": 14.04720687866211, "learning_rate": 4.628661893343755e-05, "loss": 0.5509, "step": 87800 }, { "epoch": 8.806164315020514, "grad_norm": 2.2983529567718506, "learning_rate": 4.627011551766159e-05, "loss": 0.5367, "step": 88000 }, { "epoch": 8.806164315020514, "eval_loss": 0.7914876341819763, "eval_runtime": 1590.4715, "eval_samples_per_second": 6.227, "eval_steps_per_second": 1.038, "step": 88000 }, { "epoch": 8.82617832482738, "grad_norm": 13.600398063659668, "learning_rate": 4.625357846572354e-05, "loss": 0.4874, "step": 88200 }, { "epoch": 8.846192334634244, "grad_norm": 12.058847427368164, "learning_rate": 4.62370078037748e-05, "loss": 0.516, "step": 88400 }, { "epoch": 8.866206344441109, "grad_norm": 10.35982608795166, "learning_rate": 4.622040355801989e-05, "loss": 0.5258, "step": 88600 }, { "epoch": 8.886220354247973, "grad_norm": 8.732118606567383, "learning_rate": 4.620376575471646e-05, "loss": 0.5425, "step": 88800 }, { "epoch": 8.90623436405484, "grad_norm": 5.525943756103516, "learning_rate": 4.6187094420175214e-05, "loss": 0.5562, "step": 89000 }, { "epoch": 8.90623436405484, "eval_loss": 0.7789895534515381, "eval_runtime": 1588.1313, "eval_samples_per_second": 6.236, "eval_steps_per_second": 1.04, "step": 89000 }, { "epoch": 8.926248373861704, "grad_norm": 5.7132086753845215, "learning_rate": 4.617038958075989e-05, "loss": 0.5384, "step": 89200 }, { "epoch": 8.946262383668568, "grad_norm": 1.0245046615600586, "learning_rate": 4.6153651262887217e-05, "loss": 0.5571, "step": 89400 }, { "epoch": 8.966276393475432, "grad_norm": 9.889337539672852, "learning_rate": 4.613687949302685e-05, "loss": 0.4943, "step": 89600 }, { "epoch": 8.986290403282297, "grad_norm": 2.876408338546753, "learning_rate": 4.6120074297701345e-05, "loss": 0.5231, "step": 89800 }, { "epoch": 9.006304413089163, "grad_norm": 1.6252576112747192, "learning_rate": 4.6103235703486137e-05, "loss": 0.4856, "step": 90000 }, { "epoch": 9.006304413089163, "eval_loss": 0.7352625727653503, "eval_runtime": 1589.5567, "eval_samples_per_second": 6.231, "eval_steps_per_second": 1.039, "step": 90000 }, { "epoch": 9.026318422896027, "grad_norm": 4.923126697540283, "learning_rate": 4.608636373700945e-05, "loss": 0.4943, "step": 90200 }, { "epoch": 9.046332432702892, "grad_norm": 10.076866149902344, "learning_rate": 4.6069458424952305e-05, "loss": 0.478, "step": 90400 }, { "epoch": 9.066346442509756, "grad_norm": 3.539828300476074, "learning_rate": 4.6052519794048446e-05, "loss": 0.5033, "step": 90600 }, { "epoch": 9.086360452316622, "grad_norm": 11.253156661987305, "learning_rate": 4.60355478710843e-05, "loss": 0.5427, "step": 90800 }, { "epoch": 9.106374462123487, "grad_norm": 7.337778568267822, "learning_rate": 4.601854268289894e-05, "loss": 0.5072, "step": 91000 }, { "epoch": 9.106374462123487, "eval_loss": 0.7597683668136597, "eval_runtime": 1589.0193, "eval_samples_per_second": 6.233, "eval_steps_per_second": 1.039, "step": 91000 }, { "epoch": 9.126388471930351, "grad_norm": 7.646730899810791, "learning_rate": 4.6001504256384074e-05, "loss": 0.5143, "step": 91200 }, { "epoch": 9.146402481737216, "grad_norm": 3.592362403869629, "learning_rate": 4.5984432618483936e-05, "loss": 0.4747, "step": 91400 }, { "epoch": 9.16641649154408, "grad_norm": 8.276998519897461, "learning_rate": 4.59673277961953e-05, "loss": 0.4878, "step": 91600 }, { "epoch": 9.186430501350946, "grad_norm": 4.521116733551025, "learning_rate": 4.5950189816567404e-05, "loss": 0.5148, "step": 91800 }, { "epoch": 9.20644451115781, "grad_norm": 15.691106796264648, "learning_rate": 4.5933018706701934e-05, "loss": 0.5092, "step": 92000 }, { "epoch": 9.20644451115781, "eval_loss": 0.7598811388015747, "eval_runtime": 1599.9096, "eval_samples_per_second": 6.19, "eval_steps_per_second": 1.032, "step": 92000 }, { "epoch": 9.226458520964675, "grad_norm": 16.27528953552246, "learning_rate": 4.5915814493752964e-05, "loss": 0.4998, "step": 92200 }, { "epoch": 9.24647253077154, "grad_norm": 8.023094177246094, "learning_rate": 4.589857720492691e-05, "loss": 0.5147, "step": 92400 }, { "epoch": 9.266486540578406, "grad_norm": 11.121880531311035, "learning_rate": 4.5881306867482485e-05, "loss": 0.4913, "step": 92600 }, { "epoch": 9.28650055038527, "grad_norm": 7.749110698699951, "learning_rate": 4.58640035087307e-05, "loss": 0.5111, "step": 92800 }, { "epoch": 9.306514560192134, "grad_norm": 6.6985578536987305, "learning_rate": 4.584666715603476e-05, "loss": 0.5072, "step": 93000 }, { "epoch": 9.306514560192134, "eval_loss": 0.7944608926773071, "eval_runtime": 1585.0977, "eval_samples_per_second": 6.248, "eval_steps_per_second": 1.042, "step": 93000 }, { "epoch": 9.326528569998999, "grad_norm": 5.164041996002197, "learning_rate": 4.582929783681003e-05, "loss": 0.4746, "step": 93200 }, { "epoch": 9.346542579805863, "grad_norm": 4.971350193023682, "learning_rate": 4.581189557852403e-05, "loss": 0.4924, "step": 93400 }, { "epoch": 9.36655658961273, "grad_norm": 3.1172475814819336, "learning_rate": 4.579446040869638e-05, "loss": 0.524, "step": 93600 }, { "epoch": 9.386570599419594, "grad_norm": 8.019272804260254, "learning_rate": 4.577699235489872e-05, "loss": 0.5235, "step": 93800 }, { "epoch": 9.406584609226458, "grad_norm": 5.51103401184082, "learning_rate": 4.575949144475471e-05, "loss": 0.5157, "step": 94000 }, { "epoch": 9.406584609226458, "eval_loss": 0.7691966891288757, "eval_runtime": 1612.1848, "eval_samples_per_second": 6.143, "eval_steps_per_second": 1.024, "step": 94000 }, { "epoch": 9.426598619033323, "grad_norm": 10.15511417388916, "learning_rate": 4.5741957705939944e-05, "loss": 0.4843, "step": 94200 }, { "epoch": 9.446612628840189, "grad_norm": 8.398979187011719, "learning_rate": 4.572439116618197e-05, "loss": 0.4603, "step": 94400 }, { "epoch": 9.466626638647053, "grad_norm": 21.175609588623047, "learning_rate": 4.5706791853260174e-05, "loss": 0.5067, "step": 94600 }, { "epoch": 9.486640648453918, "grad_norm": 4.593827247619629, "learning_rate": 4.5689159795005775e-05, "loss": 0.4838, "step": 94800 }, { "epoch": 9.506654658260782, "grad_norm": 7.515419006347656, "learning_rate": 4.567149501930179e-05, "loss": 0.4676, "step": 95000 }, { "epoch": 9.506654658260782, "eval_loss": 0.7987233400344849, "eval_runtime": 1508.8972, "eval_samples_per_second": 6.564, "eval_steps_per_second": 1.094, "step": 95000 }, { "epoch": 9.526668668067646, "grad_norm": 26.512752532958984, "learning_rate": 4.5653797554082955e-05, "loss": 0.5362, "step": 95200 }, { "epoch": 9.546682677874513, "grad_norm": 11.467738151550293, "learning_rate": 4.563606742733572e-05, "loss": 0.5161, "step": 95400 }, { "epoch": 9.566696687681377, "grad_norm": 2.2783453464508057, "learning_rate": 4.5618304667098166e-05, "loss": 0.5175, "step": 95600 }, { "epoch": 9.586710697488241, "grad_norm": 15.340252876281738, "learning_rate": 4.560050930145999e-05, "loss": 0.494, "step": 95800 }, { "epoch": 9.606724707295106, "grad_norm": 10.904950141906738, "learning_rate": 4.5582681358562465e-05, "loss": 0.5234, "step": 96000 }, { "epoch": 9.606724707295106, "eval_loss": 0.7605124711990356, "eval_runtime": 1508.8527, "eval_samples_per_second": 6.564, "eval_steps_per_second": 1.094, "step": 96000 }, { "epoch": 9.626738717101972, "grad_norm": 1.5022099018096924, "learning_rate": 4.5564820866598334e-05, "loss": 0.5009, "step": 96200 }, { "epoch": 9.646752726908836, "grad_norm": 12.933259010314941, "learning_rate": 4.554692785381187e-05, "loss": 0.5047, "step": 96400 }, { "epoch": 9.6667667367157, "grad_norm": 4.616204738616943, "learning_rate": 4.552900234849875e-05, "loss": 0.5267, "step": 96600 }, { "epoch": 9.686780746522565, "grad_norm": 2.5323028564453125, "learning_rate": 4.5511044379006016e-05, "loss": 0.5314, "step": 96800 }, { "epoch": 9.70679475632943, "grad_norm": 4.653432846069336, "learning_rate": 4.549305397373207e-05, "loss": 0.5177, "step": 97000 }, { "epoch": 9.70679475632943, "eval_loss": 0.7546625137329102, "eval_runtime": 1507.8085, "eval_samples_per_second": 6.568, "eval_steps_per_second": 1.095, "step": 97000 }, { "epoch": 9.726808766136296, "grad_norm": 7.31988525390625, "learning_rate": 4.547503116112661e-05, "loss": 0.4964, "step": 97200 }, { "epoch": 9.74682277594316, "grad_norm": 1.9884003400802612, "learning_rate": 4.5456975969690576e-05, "loss": 0.5405, "step": 97400 }, { "epoch": 9.766836785750025, "grad_norm": 10.859720230102539, "learning_rate": 4.5438888427976085e-05, "loss": 0.523, "step": 97600 }, { "epoch": 9.786850795556889, "grad_norm": 9.16154956817627, "learning_rate": 4.542076856458646e-05, "loss": 0.5299, "step": 97800 }, { "epoch": 9.806864805363755, "grad_norm": 7.3798723220825195, "learning_rate": 4.54026164081761e-05, "loss": 0.5194, "step": 98000 }, { "epoch": 9.806864805363755, "eval_loss": 0.7950179576873779, "eval_runtime": 1511.0925, "eval_samples_per_second": 6.554, "eval_steps_per_second": 1.093, "step": 98000 }, { "epoch": 9.82687881517062, "grad_norm": 6.394034385681152, "learning_rate": 4.53844319874505e-05, "loss": 0.4639, "step": 98200 }, { "epoch": 9.846892824977484, "grad_norm": 11.254334449768066, "learning_rate": 4.536621533116615e-05, "loss": 0.4844, "step": 98400 }, { "epoch": 9.866906834784348, "grad_norm": 7.895632743835449, "learning_rate": 4.534796646813053e-05, "loss": 0.5239, "step": 98600 }, { "epoch": 9.886920844591215, "grad_norm": 10.168183326721191, "learning_rate": 4.5329685427202056e-05, "loss": 0.4958, "step": 98800 }, { "epoch": 9.906934854398079, "grad_norm": 8.56737232208252, "learning_rate": 4.531137223729002e-05, "loss": 0.5209, "step": 99000 }, { "epoch": 9.906934854398079, "eval_loss": 0.7643815279006958, "eval_runtime": 1514.3484, "eval_samples_per_second": 6.54, "eval_steps_per_second": 1.09, "step": 99000 }, { "epoch": 9.926948864204943, "grad_norm": 6.205153942108154, "learning_rate": 4.5293026927354553e-05, "loss": 0.5176, "step": 99200 }, { "epoch": 9.946962874011808, "grad_norm": 8.276895523071289, "learning_rate": 4.52746495264066e-05, "loss": 0.5382, "step": 99400 }, { "epoch": 9.966976883818672, "grad_norm": 51.11056137084961, "learning_rate": 4.525624006350781e-05, "loss": 0.5202, "step": 99600 }, { "epoch": 9.986990893625538, "grad_norm": 12.367562294006348, "learning_rate": 4.523779856777059e-05, "loss": 0.5138, "step": 99800 }, { "epoch": 10.007004903432403, "grad_norm": 4.005044460296631, "learning_rate": 4.521932506835796e-05, "loss": 0.5083, "step": 100000 }, { "epoch": 10.007004903432403, "eval_loss": 0.7911326885223389, "eval_runtime": 1515.5876, "eval_samples_per_second": 6.535, "eval_steps_per_second": 1.089, "step": 100000 }, { "epoch": 10.027018913239267, "grad_norm": 1.4350804090499878, "learning_rate": 4.520081959448358e-05, "loss": 0.4841, "step": 100200 }, { "epoch": 10.047032923046132, "grad_norm": 8.367425918579102, "learning_rate": 4.518228217541164e-05, "loss": 0.4907, "step": 100400 }, { "epoch": 10.067046932852998, "grad_norm": 6.665814399719238, "learning_rate": 4.5163712840456884e-05, "loss": 0.4708, "step": 100600 }, { "epoch": 10.087060942659862, "grad_norm": 5.1859869956970215, "learning_rate": 4.5145111618984504e-05, "loss": 0.4744, "step": 100800 }, { "epoch": 10.107074952466727, "grad_norm": 11.06208610534668, "learning_rate": 4.512647854041012e-05, "loss": 0.4886, "step": 101000 }, { "epoch": 10.107074952466727, "eval_loss": 0.811222493648529, "eval_runtime": 1510.4035, "eval_samples_per_second": 6.557, "eval_steps_per_second": 1.093, "step": 101000 }, { "epoch": 10.127088962273591, "grad_norm": 9.01505184173584, "learning_rate": 4.510781363419975e-05, "loss": 0.5011, "step": 101200 }, { "epoch": 10.147102972080456, "grad_norm": 5.905966758728027, "learning_rate": 4.50891169298697e-05, "loss": 0.5062, "step": 101400 }, { "epoch": 10.167116981887322, "grad_norm": 5.097990989685059, "learning_rate": 4.50703884569866e-05, "loss": 0.4815, "step": 101600 }, { "epoch": 10.187130991694186, "grad_norm": 10.612571716308594, "learning_rate": 4.5051628245167314e-05, "loss": 0.4887, "step": 101800 }, { "epoch": 10.20714500150105, "grad_norm": 13.730401039123535, "learning_rate": 4.5032836324078884e-05, "loss": 0.4672, "step": 102000 }, { "epoch": 10.20714500150105, "eval_loss": 0.7495100498199463, "eval_runtime": 1509.1934, "eval_samples_per_second": 6.562, "eval_steps_per_second": 1.094, "step": 102000 }, { "epoch": 10.227159011307915, "grad_norm": 12.568594932556152, "learning_rate": 4.501401272343849e-05, "loss": 0.5161, "step": 102200 }, { "epoch": 10.247173021114781, "grad_norm": 7.425437927246094, "learning_rate": 4.499515747301344e-05, "loss": 0.4711, "step": 102400 }, { "epoch": 10.267187030921646, "grad_norm": 8.484464645385742, "learning_rate": 4.497627060262107e-05, "loss": 0.5292, "step": 102600 }, { "epoch": 10.28720104072851, "grad_norm": 9.892447471618652, "learning_rate": 4.495735214212871e-05, "loss": 0.484, "step": 102800 }, { "epoch": 10.307215050535374, "grad_norm": 7.636592388153076, "learning_rate": 4.493840212145367e-05, "loss": 0.4848, "step": 103000 }, { "epoch": 10.307215050535374, "eval_loss": 0.7500751614570618, "eval_runtime": 1509.2623, "eval_samples_per_second": 6.562, "eval_steps_per_second": 1.094, "step": 103000 }, { "epoch": 10.327229060342239, "grad_norm": 2.424403429031372, "learning_rate": 4.4919420570563166e-05, "loss": 0.4641, "step": 103200 }, { "epoch": 10.347243070149105, "grad_norm": 12.456744194030762, "learning_rate": 4.490040751947424e-05, "loss": 0.5024, "step": 103400 }, { "epoch": 10.36725707995597, "grad_norm": 0.4191274046897888, "learning_rate": 4.4881362998253797e-05, "loss": 0.4822, "step": 103600 }, { "epoch": 10.387271089762834, "grad_norm": 21.666641235351562, "learning_rate": 4.486228703701848e-05, "loss": 0.5248, "step": 103800 }, { "epoch": 10.407285099569698, "grad_norm": 9.053685188293457, "learning_rate": 4.4843179665934654e-05, "loss": 0.4893, "step": 104000 }, { "epoch": 10.407285099569698, "eval_loss": 0.772674560546875, "eval_runtime": 1507.7841, "eval_samples_per_second": 6.569, "eval_steps_per_second": 1.095, "step": 104000 }, { "epoch": 10.427299109376564, "grad_norm": 4.747574806213379, "learning_rate": 4.482404091521836e-05, "loss": 0.5203, "step": 104200 }, { "epoch": 10.447313119183429, "grad_norm": 7.969445705413818, "learning_rate": 4.4804870815135265e-05, "loss": 0.4643, "step": 104400 }, { "epoch": 10.467327128990293, "grad_norm": 10.557317733764648, "learning_rate": 4.47856693960006e-05, "loss": 0.4548, "step": 104600 }, { "epoch": 10.487341138797158, "grad_norm": 0.17864495515823364, "learning_rate": 4.476643668817912e-05, "loss": 0.5143, "step": 104800 }, { "epoch": 10.507355148604024, "grad_norm": 6.341115474700928, "learning_rate": 4.4747172722085085e-05, "loss": 0.4656, "step": 105000 }, { "epoch": 10.507355148604024, "eval_loss": 0.7513960003852844, "eval_runtime": 1504.5721, "eval_samples_per_second": 6.583, "eval_steps_per_second": 1.097, "step": 105000 }, { "epoch": 10.527369158410888, "grad_norm": 6.542806148529053, "learning_rate": 4.472787752818216e-05, "loss": 0.5119, "step": 105200 }, { "epoch": 10.547383168217753, "grad_norm": 7.5298357009887695, "learning_rate": 4.4708551136983415e-05, "loss": 0.483, "step": 105400 }, { "epoch": 10.567397178024617, "grad_norm": 3.8048598766326904, "learning_rate": 4.4689193579051225e-05, "loss": 0.4454, "step": 105600 }, { "epoch": 10.587411187831481, "grad_norm": 13.350851058959961, "learning_rate": 4.466980488499729e-05, "loss": 0.5467, "step": 105800 }, { "epoch": 10.607425197638348, "grad_norm": 9.044102668762207, "learning_rate": 4.4650385085482505e-05, "loss": 0.5193, "step": 106000 }, { "epoch": 10.607425197638348, "eval_loss": 0.7555210590362549, "eval_runtime": 1519.9796, "eval_samples_per_second": 6.516, "eval_steps_per_second": 1.086, "step": 106000 }, { "epoch": 10.627439207445212, "grad_norm": 8.711231231689453, "learning_rate": 4.463093421121699e-05, "loss": 0.4983, "step": 106200 }, { "epoch": 10.647453217252076, "grad_norm": 11.417213439941406, "learning_rate": 4.461145229296e-05, "loss": 0.5123, "step": 106400 }, { "epoch": 10.66746722705894, "grad_norm": 3.407996416091919, "learning_rate": 4.4591939361519865e-05, "loss": 0.4985, "step": 106600 }, { "epoch": 10.687481236865807, "grad_norm": 5.544752597808838, "learning_rate": 4.457239544775396e-05, "loss": 0.4764, "step": 106800 }, { "epoch": 10.707495246672671, "grad_norm": 11.374485969543457, "learning_rate": 4.455282058256869e-05, "loss": 0.5038, "step": 107000 }, { "epoch": 10.707495246672671, "eval_loss": 0.7464540004730225, "eval_runtime": 1529.2032, "eval_samples_per_second": 6.477, "eval_steps_per_second": 1.08, "step": 107000 }, { "epoch": 10.727509256479536, "grad_norm": 14.991117477416992, "learning_rate": 4.4533214796919355e-05, "loss": 0.5325, "step": 107200 }, { "epoch": 10.7475232662864, "grad_norm": 4.14377498626709, "learning_rate": 4.451357812181018e-05, "loss": 0.5351, "step": 107400 }, { "epoch": 10.767537276093265, "grad_norm": 3.1982901096343994, "learning_rate": 4.449391058829426e-05, "loss": 0.4935, "step": 107600 }, { "epoch": 10.78755128590013, "grad_norm": 17.85526466369629, "learning_rate": 4.447421222747343e-05, "loss": 0.4842, "step": 107800 }, { "epoch": 10.807565295706995, "grad_norm": 5.987515926361084, "learning_rate": 4.4454483070498335e-05, "loss": 0.4955, "step": 108000 }, { "epoch": 10.807565295706995, "eval_loss": 0.7446533441543579, "eval_runtime": 1529.6172, "eval_samples_per_second": 6.475, "eval_steps_per_second": 1.079, "step": 108000 }, { "epoch": 10.82757930551386, "grad_norm": 7.852579116821289, "learning_rate": 4.443472314856828e-05, "loss": 0.4471, "step": 108200 }, { "epoch": 10.847593315320724, "grad_norm": 12.757951736450195, "learning_rate": 4.4414932492931245e-05, "loss": 0.4848, "step": 108400 }, { "epoch": 10.86760732512759, "grad_norm": 3.4348502159118652, "learning_rate": 4.439511113488379e-05, "loss": 0.5177, "step": 108600 }, { "epoch": 10.887621334934455, "grad_norm": 14.383734703063965, "learning_rate": 4.437525910577105e-05, "loss": 0.5035, "step": 108800 }, { "epoch": 10.907635344741319, "grad_norm": 6.70511531829834, "learning_rate": 4.435537643698664e-05, "loss": 0.5209, "step": 109000 }, { "epoch": 10.907635344741319, "eval_loss": 0.752037763595581, "eval_runtime": 1530.9558, "eval_samples_per_second": 6.469, "eval_steps_per_second": 1.078, "step": 109000 }, { "epoch": 10.927649354548183, "grad_norm": 42.864105224609375, "learning_rate": 4.433546315997264e-05, "loss": 0.5213, "step": 109200 }, { "epoch": 10.947663364355048, "grad_norm": 7.345578193664551, "learning_rate": 4.431551930621955e-05, "loss": 0.5255, "step": 109400 }, { "epoch": 10.967677374161914, "grad_norm": 9.278435707092285, "learning_rate": 4.4295544907266195e-05, "loss": 0.5447, "step": 109600 }, { "epoch": 10.987691383968778, "grad_norm": 6.9013848304748535, "learning_rate": 4.4275539994699724e-05, "loss": 0.4934, "step": 109800 }, { "epoch": 11.007705393775643, "grad_norm": 9.29002857208252, "learning_rate": 4.425550460015552e-05, "loss": 0.5, "step": 110000 }, { "epoch": 11.007705393775643, "eval_loss": 0.7502064108848572, "eval_runtime": 1536.073, "eval_samples_per_second": 6.448, "eval_steps_per_second": 1.075, "step": 110000 }, { "epoch": 11.027719403582507, "grad_norm": 5.391198635101318, "learning_rate": 4.423543875531717e-05, "loss": 0.45, "step": 110200 }, { "epoch": 11.047733413389373, "grad_norm": 4.4135637283325195, "learning_rate": 4.4215342491916456e-05, "loss": 0.4742, "step": 110400 }, { "epoch": 11.067747423196238, "grad_norm": 8.184747695922852, "learning_rate": 4.4195215841733205e-05, "loss": 0.4625, "step": 110600 }, { "epoch": 11.087761433003102, "grad_norm": 2.6932077407836914, "learning_rate": 4.417505883659534e-05, "loss": 0.4729, "step": 110800 }, { "epoch": 11.107775442809967, "grad_norm": 22.752954483032227, "learning_rate": 4.415487150837877e-05, "loss": 0.4665, "step": 111000 }, { "epoch": 11.107775442809967, "eval_loss": 0.7782527208328247, "eval_runtime": 1534.4984, "eval_samples_per_second": 6.454, "eval_steps_per_second": 1.076, "step": 111000 }, { "epoch": 11.127789452616831, "grad_norm": 9.615676879882812, "learning_rate": 4.413465388900734e-05, "loss": 0.4568, "step": 111200 }, { "epoch": 11.147803462423697, "grad_norm": 11.395662307739258, "learning_rate": 4.4114406010452836e-05, "loss": 0.4314, "step": 111400 }, { "epoch": 11.167817472230562, "grad_norm": 6.622340202331543, "learning_rate": 4.409412790473487e-05, "loss": 0.4532, "step": 111600 }, { "epoch": 11.187831482037426, "grad_norm": 1.393022894859314, "learning_rate": 4.407381960392085e-05, "loss": 0.4991, "step": 111800 }, { "epoch": 11.20784549184429, "grad_norm": 4.066559314727783, "learning_rate": 4.4053481140125944e-05, "loss": 0.4682, "step": 112000 }, { "epoch": 11.20784549184429, "eval_loss": 0.7852290272712708, "eval_runtime": 1533.3791, "eval_samples_per_second": 6.459, "eval_steps_per_second": 1.077, "step": 112000 }, { "epoch": 11.227859501651157, "grad_norm": 6.776269912719727, "learning_rate": 4.403311254551302e-05, "loss": 0.4644, "step": 112200 }, { "epoch": 11.247873511458021, "grad_norm": 11.697609901428223, "learning_rate": 4.401271385229259e-05, "loss": 0.4794, "step": 112400 }, { "epoch": 11.267887521264885, "grad_norm": 7.506731033325195, "learning_rate": 4.399228509272277e-05, "loss": 0.4846, "step": 112600 }, { "epoch": 11.28790153107175, "grad_norm": 3.080813407897949, "learning_rate": 4.397182629910921e-05, "loss": 0.4685, "step": 112800 }, { "epoch": 11.307915540878614, "grad_norm": 9.698938369750977, "learning_rate": 4.3951337503805075e-05, "loss": 0.4909, "step": 113000 }, { "epoch": 11.307915540878614, "eval_loss": 0.7894698977470398, "eval_runtime": 1531.8504, "eval_samples_per_second": 6.465, "eval_steps_per_second": 1.078, "step": 113000 }, { "epoch": 11.32792955068548, "grad_norm": 3.730142116546631, "learning_rate": 4.3930818739210944e-05, "loss": 0.4878, "step": 113200 }, { "epoch": 11.347943560492345, "grad_norm": 6.763973712921143, "learning_rate": 4.391027003777483e-05, "loss": 0.4508, "step": 113400 }, { "epoch": 11.36795757029921, "grad_norm": 5.431915760040283, "learning_rate": 4.3889691431992044e-05, "loss": 0.4845, "step": 113600 }, { "epoch": 11.387971580106074, "grad_norm": 7.291522979736328, "learning_rate": 4.386908295440521e-05, "loss": 0.4566, "step": 113800 }, { "epoch": 11.40798558991294, "grad_norm": 2.217823028564453, "learning_rate": 4.3848444637604206e-05, "loss": 0.4797, "step": 114000 }, { "epoch": 11.40798558991294, "eval_loss": 0.7697126269340515, "eval_runtime": 1531.4606, "eval_samples_per_second": 6.467, "eval_steps_per_second": 1.078, "step": 114000 }, { "epoch": 11.427999599719804, "grad_norm": 12.138418197631836, "learning_rate": 4.382777651422605e-05, "loss": 0.4908, "step": 114200 }, { "epoch": 11.448013609526669, "grad_norm": 4.048282623291016, "learning_rate": 4.3807078616954956e-05, "loss": 0.4282, "step": 114400 }, { "epoch": 11.468027619333533, "grad_norm": 5.013810157775879, "learning_rate": 4.378635097852216e-05, "loss": 0.4401, "step": 114600 }, { "epoch": 11.488041629140398, "grad_norm": 4.272950649261475, "learning_rate": 4.376559363170599e-05, "loss": 0.4996, "step": 114800 }, { "epoch": 11.508055638947264, "grad_norm": 16.383968353271484, "learning_rate": 4.374480660933171e-05, "loss": 0.5071, "step": 115000 }, { "epoch": 11.508055638947264, "eval_loss": 0.7520093321800232, "eval_runtime": 1528.0468, "eval_samples_per_second": 6.481, "eval_steps_per_second": 1.08, "step": 115000 }, { "epoch": 11.528069648754128, "grad_norm": 9.370074272155762, "learning_rate": 4.372398994427154e-05, "loss": 0.4814, "step": 115200 }, { "epoch": 11.548083658560993, "grad_norm": 4.452792167663574, "learning_rate": 4.370314366944454e-05, "loss": 0.481, "step": 115400 }, { "epoch": 11.568097668367857, "grad_norm": 6.910656452178955, "learning_rate": 4.3682267817816636e-05, "loss": 0.4328, "step": 115600 }, { "epoch": 11.588111678174723, "grad_norm": 4.114524841308594, "learning_rate": 4.366136242240051e-05, "loss": 0.4859, "step": 115800 }, { "epoch": 11.608125687981588, "grad_norm": 16.97528076171875, "learning_rate": 4.364042751625555e-05, "loss": 0.4489, "step": 116000 }, { "epoch": 11.608125687981588, "eval_loss": 0.7805171608924866, "eval_runtime": 1549.2394, "eval_samples_per_second": 6.393, "eval_steps_per_second": 1.066, "step": 116000 }, { "epoch": 11.628139697788452, "grad_norm": 8.208148956298828, "learning_rate": 4.3619463132487835e-05, "loss": 0.474, "step": 116200 }, { "epoch": 11.648153707595316, "grad_norm": 16.751638412475586, "learning_rate": 4.359846930425005e-05, "loss": 0.4805, "step": 116400 }, { "epoch": 11.66816771740218, "grad_norm": 7.855900287628174, "learning_rate": 4.357744606474143e-05, "loss": 0.5069, "step": 116600 }, { "epoch": 11.688181727209047, "grad_norm": 9.67066764831543, "learning_rate": 4.355639344720775e-05, "loss": 0.4671, "step": 116800 }, { "epoch": 11.708195737015911, "grad_norm": 11.64322280883789, "learning_rate": 4.35353114849412e-05, "loss": 0.4739, "step": 117000 }, { "epoch": 11.708195737015911, "eval_loss": 0.7411287426948547, "eval_runtime": 1547.121, "eval_samples_per_second": 6.402, "eval_steps_per_second": 1.067, "step": 117000 }, { "epoch": 11.728209746822776, "grad_norm": 11.756793022155762, "learning_rate": 4.351420021128043e-05, "loss": 0.4241, "step": 117200 }, { "epoch": 11.74822375662964, "grad_norm": 2.9145941734313965, "learning_rate": 4.349305965961039e-05, "loss": 0.4777, "step": 117400 }, { "epoch": 11.768237766436506, "grad_norm": 14.672354698181152, "learning_rate": 4.3471889863362356e-05, "loss": 0.4883, "step": 117600 }, { "epoch": 11.78825177624337, "grad_norm": 11.193058013916016, "learning_rate": 4.345069085601385e-05, "loss": 0.4797, "step": 117800 }, { "epoch": 11.808265786050235, "grad_norm": 3.7404911518096924, "learning_rate": 4.342946267108858e-05, "loss": 0.4315, "step": 118000 }, { "epoch": 11.808265786050235, "eval_loss": 0.7627429366111755, "eval_runtime": 1548.0435, "eval_samples_per_second": 6.398, "eval_steps_per_second": 1.067, "step": 118000 }, { "epoch": 11.8282797958571, "grad_norm": 1.4443763494491577, "learning_rate": 4.3408205342156414e-05, "loss": 0.4628, "step": 118200 }, { "epoch": 11.848293805663964, "grad_norm": 1.5931999683380127, "learning_rate": 4.338691890283329e-05, "loss": 0.4985, "step": 118400 }, { "epoch": 11.86830781547083, "grad_norm": 3.04166316986084, "learning_rate": 4.336560338678116e-05, "loss": 0.4633, "step": 118600 }, { "epoch": 11.888321825277695, "grad_norm": 8.934111595153809, "learning_rate": 4.3344258827708015e-05, "loss": 0.5072, "step": 118800 }, { "epoch": 11.908335835084559, "grad_norm": 14.092962265014648, "learning_rate": 4.332288525936772e-05, "loss": 0.4936, "step": 119000 }, { "epoch": 11.908335835084559, "eval_loss": 0.7909422516822815, "eval_runtime": 1546.1115, "eval_samples_per_second": 6.406, "eval_steps_per_second": 1.068, "step": 119000 }, { "epoch": 11.928349844891423, "grad_norm": 14.235006332397461, "learning_rate": 4.330148271556004e-05, "loss": 0.5486, "step": 119200 }, { "epoch": 11.94836385469829, "grad_norm": 6.035211563110352, "learning_rate": 4.328005123013056e-05, "loss": 0.4823, "step": 119400 }, { "epoch": 11.968377864505154, "grad_norm": 25.300004959106445, "learning_rate": 4.325859083697063e-05, "loss": 0.5143, "step": 119600 }, { "epoch": 11.988391874312018, "grad_norm": 6.9751877784729, "learning_rate": 4.3237101570017316e-05, "loss": 0.4901, "step": 119800 }, { "epoch": 12.008405884118883, "grad_norm": 7.40806245803833, "learning_rate": 4.321558346325333e-05, "loss": 0.4633, "step": 120000 }, { "epoch": 12.008405884118883, "eval_loss": 0.766183614730835, "eval_runtime": 1551.1264, "eval_samples_per_second": 6.385, "eval_steps_per_second": 1.064, "step": 120000 }, { "epoch": 12.028419893925747, "grad_norm": 6.593225955963135, "learning_rate": 4.319403655070701e-05, "loss": 0.4087, "step": 120200 }, { "epoch": 12.048433903732613, "grad_norm": 3.6601810455322266, "learning_rate": 4.317246086645223e-05, "loss": 0.4331, "step": 120400 }, { "epoch": 12.068447913539478, "grad_norm": 5.580539703369141, "learning_rate": 4.31508564446084e-05, "loss": 0.4598, "step": 120600 }, { "epoch": 12.088461923346342, "grad_norm": 5.284858226776123, "learning_rate": 4.312922331934033e-05, "loss": 0.4338, "step": 120800 }, { "epoch": 12.108475933153207, "grad_norm": 9.368062973022461, "learning_rate": 4.310756152485823e-05, "loss": 0.4232, "step": 121000 }, { "epoch": 12.108475933153207, "eval_loss": 0.7667014598846436, "eval_runtime": 1549.6084, "eval_samples_per_second": 6.391, "eval_steps_per_second": 1.065, "step": 121000 }, { "epoch": 12.128489942960073, "grad_norm": 6.790826797485352, "learning_rate": 4.308587109541766e-05, "loss": 0.4317, "step": 121200 }, { "epoch": 12.148503952766937, "grad_norm": 7.251533031463623, "learning_rate": 4.3064152065319476e-05, "loss": 0.4195, "step": 121400 }, { "epoch": 12.168517962573802, "grad_norm": 5.731253623962402, "learning_rate": 4.3042404468909726e-05, "loss": 0.4671, "step": 121600 }, { "epoch": 12.188531972380666, "grad_norm": 8.78419017791748, "learning_rate": 4.302062834057966e-05, "loss": 0.4737, "step": 121800 }, { "epoch": 12.20854598218753, "grad_norm": 16.491430282592773, "learning_rate": 4.299882371476564e-05, "loss": 0.4232, "step": 122000 }, { "epoch": 12.20854598218753, "eval_loss": 0.7764489650726318, "eval_runtime": 1548.74, "eval_samples_per_second": 6.395, "eval_steps_per_second": 1.066, "step": 122000 }, { "epoch": 12.228559991994397, "grad_norm": 2.5727787017822266, "learning_rate": 4.2976990625949086e-05, "loss": 0.4571, "step": 122200 }, { "epoch": 12.248574001801261, "grad_norm": 3.1625683307647705, "learning_rate": 4.295512910865644e-05, "loss": 0.4718, "step": 122400 }, { "epoch": 12.268588011608125, "grad_norm": 9.511300086975098, "learning_rate": 4.2933239197459105e-05, "loss": 0.4192, "step": 122600 }, { "epoch": 12.28860202141499, "grad_norm": 10.674617767333984, "learning_rate": 4.2911320926973354e-05, "loss": 0.4653, "step": 122800 }, { "epoch": 12.308616031221856, "grad_norm": 7.583080768585205, "learning_rate": 4.288937433186035e-05, "loss": 0.4617, "step": 123000 }, { "epoch": 12.308616031221856, "eval_loss": 0.7778533697128296, "eval_runtime": 1550.4554, "eval_samples_per_second": 6.388, "eval_steps_per_second": 1.065, "step": 123000 }, { "epoch": 12.32863004102872, "grad_norm": 7.099635124206543, "learning_rate": 4.286739944682602e-05, "loss": 0.429, "step": 123200 }, { "epoch": 12.348644050835585, "grad_norm": 17.548032760620117, "learning_rate": 4.284539630662103e-05, "loss": 0.4929, "step": 123400 }, { "epoch": 12.36865806064245, "grad_norm": 10.165279388427734, "learning_rate": 4.282336494604073e-05, "loss": 0.4751, "step": 123600 }, { "epoch": 12.388672070449314, "grad_norm": 23.920074462890625, "learning_rate": 4.280130539992513e-05, "loss": 0.4607, "step": 123800 }, { "epoch": 12.40868608025618, "grad_norm": 5.454583168029785, "learning_rate": 4.277921770315875e-05, "loss": 0.4701, "step": 124000 }, { "epoch": 12.40868608025618, "eval_loss": 0.7843158841133118, "eval_runtime": 1550.4507, "eval_samples_per_second": 6.388, "eval_steps_per_second": 1.065, "step": 124000 }, { "epoch": 12.428700090063044, "grad_norm": 10.337953567504883, "learning_rate": 4.275710189067067e-05, "loss": 0.3914, "step": 124200 }, { "epoch": 12.448714099869909, "grad_norm": 10.430801391601562, "learning_rate": 4.273495799743442e-05, "loss": 0.4692, "step": 124400 }, { "epoch": 12.468728109676773, "grad_norm": 5.08858585357666, "learning_rate": 4.271278605846795e-05, "loss": 0.4286, "step": 124600 }, { "epoch": 12.48874211948364, "grad_norm": 5.919161319732666, "learning_rate": 4.269058610883354e-05, "loss": 0.4447, "step": 124800 }, { "epoch": 12.508756129290504, "grad_norm": 10.656771659851074, "learning_rate": 4.266835818363777e-05, "loss": 0.4886, "step": 125000 }, { "epoch": 12.508756129290504, "eval_loss": 0.771526575088501, "eval_runtime": 1549.4513, "eval_samples_per_second": 6.392, "eval_steps_per_second": 1.066, "step": 125000 }, { "epoch": 12.528770139097368, "grad_norm": 10.407238960266113, "learning_rate": 4.2646102318031466e-05, "loss": 0.435, "step": 125200 }, { "epoch": 12.548784148904232, "grad_norm": 3.8951525688171387, "learning_rate": 4.262381854720964e-05, "loss": 0.4733, "step": 125400 }, { "epoch": 12.568798158711097, "grad_norm": 7.700132846832275, "learning_rate": 4.260150690641143e-05, "loss": 0.4644, "step": 125600 }, { "epoch": 12.588812168517963, "grad_norm": 11.427380561828613, "learning_rate": 4.257916743092004e-05, "loss": 0.4508, "step": 125800 }, { "epoch": 12.608826178324827, "grad_norm": 2.5501906871795654, "learning_rate": 4.2556800156062716e-05, "loss": 0.4657, "step": 126000 }, { "epoch": 12.608826178324827, "eval_loss": 0.7523473501205444, "eval_runtime": 1626.2597, "eval_samples_per_second": 6.09, "eval_steps_per_second": 1.015, "step": 126000 }, { "epoch": 12.628840188131692, "grad_norm": 20.439237594604492, "learning_rate": 4.253440511721063e-05, "loss": 0.4907, "step": 126200 }, { "epoch": 12.648854197938556, "grad_norm": 3.4529783725738525, "learning_rate": 4.2511982349778874e-05, "loss": 0.4746, "step": 126400 }, { "epoch": 12.668868207745422, "grad_norm": 7.690820217132568, "learning_rate": 4.248953188922641e-05, "loss": 0.4553, "step": 126600 }, { "epoch": 12.688882217552287, "grad_norm": 4.355913162231445, "learning_rate": 4.246705377105595e-05, "loss": 0.4836, "step": 126800 }, { "epoch": 12.708896227359151, "grad_norm": 6.1771416664123535, "learning_rate": 4.2444548030813996e-05, "loss": 0.4754, "step": 127000 }, { "epoch": 12.708896227359151, "eval_loss": 0.7619630098342896, "eval_runtime": 1621.6474, "eval_samples_per_second": 6.107, "eval_steps_per_second": 1.018, "step": 127000 }, { "epoch": 12.728910237166016, "grad_norm": 0.5682120323181152, "learning_rate": 4.242201470409069e-05, "loss": 0.4214, "step": 127200 }, { "epoch": 12.748924246972882, "grad_norm": 3.2245278358459473, "learning_rate": 4.239945382651981e-05, "loss": 0.4646, "step": 127400 }, { "epoch": 12.768938256779746, "grad_norm": 12.788905143737793, "learning_rate": 4.237686543377872e-05, "loss": 0.4325, "step": 127600 }, { "epoch": 12.78895226658661, "grad_norm": 14.074710845947266, "learning_rate": 4.235424956158827e-05, "loss": 0.4877, "step": 127800 }, { "epoch": 12.808966276393475, "grad_norm": 17.93095588684082, "learning_rate": 4.233160624571276e-05, "loss": 0.4903, "step": 128000 }, { "epoch": 12.808966276393475, "eval_loss": 0.7653047442436218, "eval_runtime": 1624.4172, "eval_samples_per_second": 6.097, "eval_steps_per_second": 1.016, "step": 128000 }, { "epoch": 12.82898028620034, "grad_norm": 4.710815906524658, "learning_rate": 4.230893552195994e-05, "loss": 0.474, "step": 128200 }, { "epoch": 12.848994296007206, "grad_norm": 4.588099956512451, "learning_rate": 4.228623742618084e-05, "loss": 0.4179, "step": 128400 }, { "epoch": 12.86900830581407, "grad_norm": 15.0830659866333, "learning_rate": 4.2263511994269814e-05, "loss": 0.4899, "step": 128600 }, { "epoch": 12.889022315620934, "grad_norm": 14.8272123336792, "learning_rate": 4.224075926216444e-05, "loss": 0.423, "step": 128800 }, { "epoch": 12.909036325427799, "grad_norm": 3.084211587905884, "learning_rate": 4.221797926584545e-05, "loss": 0.4539, "step": 129000 }, { "epoch": 12.909036325427799, "eval_loss": 0.7529559135437012, "eval_runtime": 1623.1081, "eval_samples_per_second": 6.102, "eval_steps_per_second": 1.017, "step": 129000 }, { "epoch": 12.929050335234665, "grad_norm": 10.327719688415527, "learning_rate": 4.219517204133671e-05, "loss": 0.4581, "step": 129200 }, { "epoch": 12.94906434504153, "grad_norm": 6.88381290435791, "learning_rate": 4.217233762470514e-05, "loss": 0.4566, "step": 129400 }, { "epoch": 12.969078354848394, "grad_norm": 14.138056755065918, "learning_rate": 4.214947605206065e-05, "loss": 0.4964, "step": 129600 }, { "epoch": 12.989092364655258, "grad_norm": 4.556768894195557, "learning_rate": 4.212658735955611e-05, "loss": 0.4718, "step": 129800 }, { "epoch": 13.009106374462123, "grad_norm": 3.681356430053711, "learning_rate": 4.210367158338726e-05, "loss": 0.4613, "step": 130000 }, { "epoch": 13.009106374462123, "eval_loss": 0.7476616501808167, "eval_runtime": 1621.7171, "eval_samples_per_second": 6.107, "eval_steps_per_second": 1.018, "step": 130000 }, { "epoch": 13.029120384268989, "grad_norm": 7.354981899261475, "learning_rate": 4.2080728759792696e-05, "loss": 0.4683, "step": 130200 }, { "epoch": 13.049134394075853, "grad_norm": 5.324392318725586, "learning_rate": 4.2057758925053756e-05, "loss": 0.4237, "step": 130400 }, { "epoch": 13.069148403882718, "grad_norm": 13.376620292663574, "learning_rate": 4.203476211549451e-05, "loss": 0.4596, "step": 130600 }, { "epoch": 13.089162413689582, "grad_norm": 5.131417274475098, "learning_rate": 4.201173836748168e-05, "loss": 0.4228, "step": 130800 }, { "epoch": 13.109176423496448, "grad_norm": 9.362079620361328, "learning_rate": 4.198868771742459e-05, "loss": 0.4056, "step": 131000 }, { "epoch": 13.109176423496448, "eval_loss": 0.7539029717445374, "eval_runtime": 1623.5084, "eval_samples_per_second": 6.1, "eval_steps_per_second": 1.017, "step": 131000 }, { "epoch": 13.129190433303313, "grad_norm": 5.9107666015625, "learning_rate": 4.196561020177511e-05, "loss": 0.4188, "step": 131200 }, { "epoch": 13.149204443110177, "grad_norm": 7.184723377227783, "learning_rate": 4.194250585702758e-05, "loss": 0.436, "step": 131400 }, { "epoch": 13.169218452917042, "grad_norm": 5.684628486633301, "learning_rate": 4.191937471971879e-05, "loss": 0.4237, "step": 131600 }, { "epoch": 13.189232462723906, "grad_norm": 4.227687358856201, "learning_rate": 4.189621682642788e-05, "loss": 0.4147, "step": 131800 }, { "epoch": 13.209246472530772, "grad_norm": 6.2200446128845215, "learning_rate": 4.187303221377632e-05, "loss": 0.4231, "step": 132000 }, { "epoch": 13.209246472530772, "eval_loss": 0.7648545503616333, "eval_runtime": 1624.7998, "eval_samples_per_second": 6.096, "eval_steps_per_second": 1.016, "step": 132000 }, { "epoch": 13.229260482337637, "grad_norm": 9.214865684509277, "learning_rate": 4.1849820918427804e-05, "loss": 0.4586, "step": 132200 }, { "epoch": 13.249274492144501, "grad_norm": 11.640349388122559, "learning_rate": 4.182658297708825e-05, "loss": 0.4523, "step": 132400 }, { "epoch": 13.269288501951365, "grad_norm": 0.7906355857849121, "learning_rate": 4.18033184265057e-05, "loss": 0.421, "step": 132600 }, { "epoch": 13.289302511758232, "grad_norm": 0.7716972231864929, "learning_rate": 4.178002730347028e-05, "loss": 0.4304, "step": 132800 }, { "epoch": 13.309316521565096, "grad_norm": 13.78469181060791, "learning_rate": 4.175670964481414e-05, "loss": 0.4726, "step": 133000 }, { "epoch": 13.309316521565096, "eval_loss": 0.7501928806304932, "eval_runtime": 1624.9861, "eval_samples_per_second": 6.095, "eval_steps_per_second": 1.016, "step": 133000 }, { "epoch": 13.32933053137196, "grad_norm": 8.089627265930176, "learning_rate": 4.173336548741139e-05, "loss": 0.4463, "step": 133200 }, { "epoch": 13.349344541178825, "grad_norm": 7.025575160980225, "learning_rate": 4.170999486817803e-05, "loss": 0.4292, "step": 133400 }, { "epoch": 13.36935855098569, "grad_norm": 9.765005111694336, "learning_rate": 4.168659782407192e-05, "loss": 0.4364, "step": 133600 }, { "epoch": 13.389372560792555, "grad_norm": 12.521211624145508, "learning_rate": 4.166317439209271e-05, "loss": 0.4193, "step": 133800 }, { "epoch": 13.40938657059942, "grad_norm": 8.010429382324219, "learning_rate": 4.1639724609281796e-05, "loss": 0.4516, "step": 134000 }, { "epoch": 13.40938657059942, "eval_loss": 0.7473704218864441, "eval_runtime": 1621.5397, "eval_samples_per_second": 6.108, "eval_steps_per_second": 1.018, "step": 134000 }, { "epoch": 13.429400580406284, "grad_norm": 0.5313342809677124, "learning_rate": 4.161624851272219e-05, "loss": 0.4307, "step": 134200 }, { "epoch": 13.449414590213149, "grad_norm": 0.5368172526359558, "learning_rate": 4.159274613953858e-05, "loss": 0.4547, "step": 134400 }, { "epoch": 13.469428600020015, "grad_norm": 0.6423940062522888, "learning_rate": 4.1569217526897165e-05, "loss": 0.4481, "step": 134600 }, { "epoch": 13.48944260982688, "grad_norm": 2.5826308727264404, "learning_rate": 4.154566271200566e-05, "loss": 0.4646, "step": 134800 }, { "epoch": 13.509456619633744, "grad_norm": 7.7838945388793945, "learning_rate": 4.15220817321132e-05, "loss": 0.4254, "step": 135000 }, { "epoch": 13.509456619633744, "eval_loss": 0.7682405114173889, "eval_runtime": 1625.5319, "eval_samples_per_second": 6.093, "eval_steps_per_second": 1.016, "step": 135000 }, { "epoch": 13.529470629440608, "grad_norm": 10.070534706115723, "learning_rate": 4.14984746245103e-05, "loss": 0.4623, "step": 135200 }, { "epoch": 13.549484639247474, "grad_norm": 11.282571792602539, "learning_rate": 4.147484142652882e-05, "loss": 0.4783, "step": 135400 }, { "epoch": 13.569498649054339, "grad_norm": 10.082357406616211, "learning_rate": 4.145118217554183e-05, "loss": 0.4295, "step": 135600 }, { "epoch": 13.589512658861203, "grad_norm": 9.939977645874023, "learning_rate": 4.1427496908963644e-05, "loss": 0.4612, "step": 135800 }, { "epoch": 13.609526668668067, "grad_norm": 8.689457893371582, "learning_rate": 4.140378566424969e-05, "loss": 0.4402, "step": 136000 }, { "epoch": 13.609526668668067, "eval_loss": 0.7990610003471375, "eval_runtime": 1488.7594, "eval_samples_per_second": 6.653, "eval_steps_per_second": 1.109, "step": 136000 }, { "epoch": 13.629540678474932, "grad_norm": 4.258101463317871, "learning_rate": 4.1380048478896496e-05, "loss": 0.435, "step": 136200 }, { "epoch": 13.649554688281798, "grad_norm": 21.015066146850586, "learning_rate": 4.135628539044159e-05, "loss": 0.4391, "step": 136400 }, { "epoch": 13.669568698088662, "grad_norm": 11.38297176361084, "learning_rate": 4.1332496436463474e-05, "loss": 0.4983, "step": 136600 }, { "epoch": 13.689582707895527, "grad_norm": 21.613985061645508, "learning_rate": 4.130868165458157e-05, "loss": 0.4393, "step": 136800 }, { "epoch": 13.709596717702391, "grad_norm": 0.07275316119194031, "learning_rate": 4.128484108245611e-05, "loss": 0.432, "step": 137000 }, { "epoch": 13.709596717702391, "eval_loss": 0.7579061985015869, "eval_runtime": 1490.1198, "eval_samples_per_second": 6.646, "eval_steps_per_second": 1.108, "step": 137000 }, { "epoch": 13.729610727509257, "grad_norm": 6.296494007110596, "learning_rate": 4.126097475778814e-05, "loss": 0.471, "step": 137200 }, { "epoch": 13.749624737316122, "grad_norm": 9.794976234436035, "learning_rate": 4.123708271831941e-05, "loss": 0.4585, "step": 137400 }, { "epoch": 13.769638747122986, "grad_norm": 19.100250244140625, "learning_rate": 4.121316500183234e-05, "loss": 0.445, "step": 137600 }, { "epoch": 13.78965275692985, "grad_norm": 14.55346393585205, "learning_rate": 4.118922164614998e-05, "loss": 0.4497, "step": 137800 }, { "epoch": 13.809666766736715, "grad_norm": 9.02545166015625, "learning_rate": 4.116525268913589e-05, "loss": 0.4388, "step": 138000 }, { "epoch": 13.809666766736715, "eval_loss": 0.7618295550346375, "eval_runtime": 1488.5465, "eval_samples_per_second": 6.653, "eval_steps_per_second": 1.109, "step": 138000 }, { "epoch": 13.829680776543581, "grad_norm": 26.34003257751465, "learning_rate": 4.1141258168694117e-05, "loss": 0.451, "step": 138200 }, { "epoch": 13.849694786350446, "grad_norm": 4.816425800323486, "learning_rate": 4.1117238122769176e-05, "loss": 0.4691, "step": 138400 }, { "epoch": 13.86970879615731, "grad_norm": 3.177556037902832, "learning_rate": 4.109319258934589e-05, "loss": 0.4494, "step": 138600 }, { "epoch": 13.889722805964174, "grad_norm": 24.689815521240234, "learning_rate": 4.106912160644942e-05, "loss": 0.4207, "step": 138800 }, { "epoch": 13.90973681577104, "grad_norm": 16.401344299316406, "learning_rate": 4.104502521214518e-05, "loss": 0.4617, "step": 139000 }, { "epoch": 13.90973681577104, "eval_loss": 0.7686560153961182, "eval_runtime": 1490.8369, "eval_samples_per_second": 6.643, "eval_steps_per_second": 1.107, "step": 139000 }, { "epoch": 13.929750825577905, "grad_norm": 5.846863269805908, "learning_rate": 4.102090344453875e-05, "loss": 0.4284, "step": 139200 }, { "epoch": 13.94976483538477, "grad_norm": 18.123268127441406, "learning_rate": 4.0996756341775836e-05, "loss": 0.4323, "step": 139400 }, { "epoch": 13.969778845191634, "grad_norm": 17.534378051757812, "learning_rate": 4.0972583942042215e-05, "loss": 0.4386, "step": 139600 }, { "epoch": 13.989792854998498, "grad_norm": 5.356030464172363, "learning_rate": 4.094838628356368e-05, "loss": 0.438, "step": 139800 }, { "epoch": 14.009806864805364, "grad_norm": 8.589960098266602, "learning_rate": 4.092416340460593e-05, "loss": 0.4332, "step": 140000 }, { "epoch": 14.009806864805364, "eval_loss": 0.7382287383079529, "eval_runtime": 1487.992, "eval_samples_per_second": 6.656, "eval_steps_per_second": 1.11, "step": 140000 }, { "epoch": 14.029820874612229, "grad_norm": 5.245506763458252, "learning_rate": 4.089991534347459e-05, "loss": 0.4142, "step": 140200 }, { "epoch": 14.049834884419093, "grad_norm": 18.552839279174805, "learning_rate": 4.087564213851509e-05, "loss": 0.4048, "step": 140400 }, { "epoch": 14.069848894225958, "grad_norm": 4.969577312469482, "learning_rate": 4.085134382811262e-05, "loss": 0.4078, "step": 140600 }, { "epoch": 14.089862904032824, "grad_norm": 2.864461660385132, "learning_rate": 4.0827020450692064e-05, "loss": 0.3986, "step": 140800 }, { "epoch": 14.109876913839688, "grad_norm": 6.569096088409424, "learning_rate": 4.080267204471796e-05, "loss": 0.4496, "step": 141000 }, { "epoch": 14.109876913839688, "eval_loss": 0.7448933720588684, "eval_runtime": 1491.2217, "eval_samples_per_second": 6.642, "eval_steps_per_second": 1.107, "step": 141000 }, { "epoch": 14.129890923646553, "grad_norm": 4.149287700653076, "learning_rate": 4.0778298648694426e-05, "loss": 0.4032, "step": 141200 }, { "epoch": 14.149904933453417, "grad_norm": 4.667185306549072, "learning_rate": 4.0753900301165095e-05, "loss": 0.4176, "step": 141400 }, { "epoch": 14.169918943260281, "grad_norm": 11.26301383972168, "learning_rate": 4.072947704071304e-05, "loss": 0.4406, "step": 141600 }, { "epoch": 14.189932953067148, "grad_norm": 7.106633186340332, "learning_rate": 4.0705028905960766e-05, "loss": 0.4136, "step": 141800 }, { "epoch": 14.209946962874012, "grad_norm": 13.648582458496094, "learning_rate": 4.0680555935570084e-05, "loss": 0.4023, "step": 142000 }, { "epoch": 14.209946962874012, "eval_loss": 0.7900242805480957, "eval_runtime": 1490.0957, "eval_samples_per_second": 6.647, "eval_steps_per_second": 1.108, "step": 142000 }, { "epoch": 14.229960972680876, "grad_norm": 7.681081295013428, "learning_rate": 4.065605816824209e-05, "loss": 0.4314, "step": 142200 }, { "epoch": 14.24997498248774, "grad_norm": 8.42757511138916, "learning_rate": 4.063153564271711e-05, "loss": 0.3877, "step": 142400 }, { "epoch": 14.269988992294607, "grad_norm": 10.055173873901367, "learning_rate": 4.060698839777459e-05, "loss": 0.4052, "step": 142600 }, { "epoch": 14.290003002101471, "grad_norm": 6.625060081481934, "learning_rate": 4.058241647223309e-05, "loss": 0.4019, "step": 142800 }, { "epoch": 14.310017011908336, "grad_norm": 8.984930992126465, "learning_rate": 4.055781990495019e-05, "loss": 0.422, "step": 143000 }, { "epoch": 14.310017011908336, "eval_loss": 0.7608165740966797, "eval_runtime": 1488.9717, "eval_samples_per_second": 6.652, "eval_steps_per_second": 1.109, "step": 143000 }, { "epoch": 14.3300310217152, "grad_norm": 4.40515661239624, "learning_rate": 4.0533198734822435e-05, "loss": 0.4131, "step": 143200 }, { "epoch": 14.350045031522065, "grad_norm": 12.642192840576172, "learning_rate": 4.05085530007853e-05, "loss": 0.4189, "step": 143400 }, { "epoch": 14.37005904132893, "grad_norm": 8.683585166931152, "learning_rate": 4.048388274181307e-05, "loss": 0.4286, "step": 143600 }, { "epoch": 14.390073051135795, "grad_norm": 5.533873081207275, "learning_rate": 4.045918799691883e-05, "loss": 0.4429, "step": 143800 }, { "epoch": 14.41008706094266, "grad_norm": 6.7790303230285645, "learning_rate": 4.0434468805154405e-05, "loss": 0.4451, "step": 144000 }, { "epoch": 14.41008706094266, "eval_loss": 0.7714783549308777, "eval_runtime": 1488.0845, "eval_samples_per_second": 6.656, "eval_steps_per_second": 1.109, "step": 144000 }, { "epoch": 14.430101070749524, "grad_norm": 9.325886726379395, "learning_rate": 4.040972520561023e-05, "loss": 0.4337, "step": 144200 }, { "epoch": 14.45011508055639, "grad_norm": 2.4591455459594727, "learning_rate": 4.038495723741541e-05, "loss": 0.4409, "step": 144400 }, { "epoch": 14.470129090363255, "grad_norm": 4.4016218185424805, "learning_rate": 4.0360164939737514e-05, "loss": 0.4477, "step": 144600 }, { "epoch": 14.490143100170119, "grad_norm": 4.2608642578125, "learning_rate": 4.033534835178262e-05, "loss": 0.4408, "step": 144800 }, { "epoch": 14.510157109976983, "grad_norm": 6.486283779144287, "learning_rate": 4.0310507512795236e-05, "loss": 0.4291, "step": 145000 }, { "epoch": 14.510157109976983, "eval_loss": 0.7712586522102356, "eval_runtime": 1487.987, "eval_samples_per_second": 6.656, "eval_steps_per_second": 1.11, "step": 145000 }, { "epoch": 14.530171119783848, "grad_norm": 16.854297637939453, "learning_rate": 4.0285642462058174e-05, "loss": 0.447, "step": 145200 }, { "epoch": 14.550185129590714, "grad_norm": 4.47225284576416, "learning_rate": 4.026075323889258e-05, "loss": 0.4165, "step": 145400 }, { "epoch": 14.570199139397578, "grad_norm": 23.305692672729492, "learning_rate": 4.0235839882657796e-05, "loss": 0.4153, "step": 145600 }, { "epoch": 14.590213149204443, "grad_norm": 7.9691267013549805, "learning_rate": 4.021090243275133e-05, "loss": 0.4107, "step": 145800 }, { "epoch": 14.610227159011307, "grad_norm": 6.718746185302734, "learning_rate": 4.018594092860881e-05, "loss": 0.4443, "step": 146000 }, { "epoch": 14.610227159011307, "eval_loss": 0.7566423416137695, "eval_runtime": 1490.6783, "eval_samples_per_second": 6.644, "eval_steps_per_second": 1.108, "step": 146000 }, { "epoch": 14.630241168818173, "grad_norm": 4.313332557678223, "learning_rate": 4.016095540970388e-05, "loss": 0.4504, "step": 146200 }, { "epoch": 14.650255178625038, "grad_norm": 9.547863960266113, "learning_rate": 4.013594591554818e-05, "loss": 0.4414, "step": 146400 }, { "epoch": 14.670269188431902, "grad_norm": 8.416645050048828, "learning_rate": 4.0110912485691255e-05, "loss": 0.4507, "step": 146600 }, { "epoch": 14.690283198238767, "grad_norm": 0.9708487391471863, "learning_rate": 4.00858551597205e-05, "loss": 0.4209, "step": 146800 }, { "epoch": 14.710297208045631, "grad_norm": 6.661731243133545, "learning_rate": 4.00607739772611e-05, "loss": 0.4535, "step": 147000 }, { "epoch": 14.710297208045631, "eval_loss": 0.756863534450531, "eval_runtime": 1516.4693, "eval_samples_per_second": 6.531, "eval_steps_per_second": 1.089, "step": 147000 }, { "epoch": 14.730311217852497, "grad_norm": 7.775916576385498, "learning_rate": 4.003566897797596e-05, "loss": 0.4, "step": 147200 }, { "epoch": 14.750325227659362, "grad_norm": 16.315149307250977, "learning_rate": 4.0010540201565675e-05, "loss": 0.4524, "step": 147400 }, { "epoch": 14.770339237466226, "grad_norm": 2.5067367553710938, "learning_rate": 3.9985387687768394e-05, "loss": 0.4374, "step": 147600 }, { "epoch": 14.79035324727309, "grad_norm": 15.096090316772461, "learning_rate": 3.996021147635985e-05, "loss": 0.4465, "step": 147800 }, { "epoch": 14.810367257079957, "grad_norm": 9.167848587036133, "learning_rate": 3.993501160715324e-05, "loss": 0.4392, "step": 148000 }, { "epoch": 14.810367257079957, "eval_loss": 0.7609195113182068, "eval_runtime": 1514.3858, "eval_samples_per_second": 6.54, "eval_steps_per_second": 1.09, "step": 148000 }, { "epoch": 14.830381266886821, "grad_norm": 1.623733401298523, "learning_rate": 3.990978811999913e-05, "loss": 0.4187, "step": 148200 }, { "epoch": 14.850395276693686, "grad_norm": 13.449614524841309, "learning_rate": 3.988454105478551e-05, "loss": 0.4536, "step": 148400 }, { "epoch": 14.87040928650055, "grad_norm": 0.7873796820640564, "learning_rate": 3.985927045143759e-05, "loss": 0.4443, "step": 148600 }, { "epoch": 14.890423296307414, "grad_norm": 13.830516815185547, "learning_rate": 3.983397634991784e-05, "loss": 0.4417, "step": 148800 }, { "epoch": 14.91043730611428, "grad_norm": 1.4764608144760132, "learning_rate": 3.980865879022587e-05, "loss": 0.4357, "step": 149000 }, { "epoch": 14.91043730611428, "eval_loss": 0.7731632590293884, "eval_runtime": 1514.303, "eval_samples_per_second": 6.54, "eval_steps_per_second": 1.09, "step": 149000 }, { "epoch": 14.930451315921145, "grad_norm": 25.803640365600586, "learning_rate": 3.978331781239841e-05, "loss": 0.4192, "step": 149200 }, { "epoch": 14.95046532572801, "grad_norm": 12.558375358581543, "learning_rate": 3.9757953456509185e-05, "loss": 0.4282, "step": 149400 }, { "epoch": 14.970479335534874, "grad_norm": 8.578922271728516, "learning_rate": 3.973256576266893e-05, "loss": 0.4082, "step": 149600 }, { "epoch": 14.99049334534174, "grad_norm": 11.158002853393555, "learning_rate": 3.970715477102526e-05, "loss": 0.4517, "step": 149800 }, { "epoch": 15.010507355148604, "grad_norm": 1.4982997179031372, "learning_rate": 3.968172052176264e-05, "loss": 0.4127, "step": 150000 }, { "epoch": 15.010507355148604, "eval_loss": 0.7653890252113342, "eval_runtime": 1514.1489, "eval_samples_per_second": 6.541, "eval_steps_per_second": 1.09, "step": 150000 }, { "epoch": 15.030521364955469, "grad_norm": 21.794002532958984, "learning_rate": 3.965626305510232e-05, "loss": 0.4003, "step": 150200 }, { "epoch": 15.050535374762333, "grad_norm": 3.7466158866882324, "learning_rate": 3.9630782411302256e-05, "loss": 0.3859, "step": 150400 }, { "epoch": 15.070549384569198, "grad_norm": 10.125372886657715, "learning_rate": 3.960527863065707e-05, "loss": 0.4043, "step": 150600 }, { "epoch": 15.090563394376064, "grad_norm": 10.071447372436523, "learning_rate": 3.9579751753497954e-05, "loss": 0.3909, "step": 150800 }, { "epoch": 15.110577404182928, "grad_norm": 5.57199239730835, "learning_rate": 3.955420182019264e-05, "loss": 0.4037, "step": 151000 }, { "epoch": 15.110577404182928, "eval_loss": 0.7820418477058411, "eval_runtime": 1510.9579, "eval_samples_per_second": 6.555, "eval_steps_per_second": 1.093, "step": 151000 }, { "epoch": 15.130591413989793, "grad_norm": 7.519534111022949, "learning_rate": 3.9528628871145295e-05, "loss": 0.395, "step": 151200 }, { "epoch": 15.150605423796657, "grad_norm": 16.392948150634766, "learning_rate": 3.950303294679653e-05, "loss": 0.4206, "step": 151400 }, { "epoch": 15.170619433603523, "grad_norm": 8.056946754455566, "learning_rate": 3.9477414087623235e-05, "loss": 0.4336, "step": 151600 }, { "epoch": 15.190633443410388, "grad_norm": 21.798471450805664, "learning_rate": 3.9451772334138605e-05, "loss": 0.3902, "step": 151800 }, { "epoch": 15.210647453217252, "grad_norm": 9.109590530395508, "learning_rate": 3.9426107726892035e-05, "loss": 0.4121, "step": 152000 }, { "epoch": 15.210647453217252, "eval_loss": 0.7755745053291321, "eval_runtime": 1510.2656, "eval_samples_per_second": 6.558, "eval_steps_per_second": 1.093, "step": 152000 }, { "epoch": 15.230661463024116, "grad_norm": 5.246907711029053, "learning_rate": 3.940042030646904e-05, "loss": 0.3851, "step": 152200 }, { "epoch": 15.25067547283098, "grad_norm": 2.519624710083008, "learning_rate": 3.9374710113491236e-05, "loss": 0.4079, "step": 152400 }, { "epoch": 15.270689482637847, "grad_norm": 26.723350524902344, "learning_rate": 3.934897718861624e-05, "loss": 0.4131, "step": 152600 }, { "epoch": 15.290703492444711, "grad_norm": 3.7904245853424072, "learning_rate": 3.932322157253761e-05, "loss": 0.4022, "step": 152800 }, { "epoch": 15.310717502251576, "grad_norm": 3.840501308441162, "learning_rate": 3.9297443305984796e-05, "loss": 0.4042, "step": 153000 }, { "epoch": 15.310717502251576, "eval_loss": 0.7703304886817932, "eval_runtime": 1512.2252, "eval_samples_per_second": 6.549, "eval_steps_per_second": 1.092, "step": 153000 }, { "epoch": 15.33073151205844, "grad_norm": 6.853086471557617, "learning_rate": 3.927164242972307e-05, "loss": 0.444, "step": 153200 }, { "epoch": 15.350745521865306, "grad_norm": 12.067425727844238, "learning_rate": 3.924581898455345e-05, "loss": 0.3805, "step": 153400 }, { "epoch": 15.37075953167217, "grad_norm": 16.926166534423828, "learning_rate": 3.921997301131265e-05, "loss": 0.425, "step": 153600 }, { "epoch": 15.390773541479035, "grad_norm": 9.092694282531738, "learning_rate": 3.919410455087301e-05, "loss": 0.3973, "step": 153800 }, { "epoch": 15.4107875512859, "grad_norm": 7.52393102645874, "learning_rate": 3.9168213644142425e-05, "loss": 0.4109, "step": 154000 }, { "epoch": 15.4107875512859, "eval_loss": 0.7817687392234802, "eval_runtime": 1517.2138, "eval_samples_per_second": 6.528, "eval_steps_per_second": 1.088, "step": 154000 }, { "epoch": 15.430801561092766, "grad_norm": 16.566139221191406, "learning_rate": 3.91423003320643e-05, "loss": 0.4461, "step": 154200 }, { "epoch": 15.45081557089963, "grad_norm": 10.28718376159668, "learning_rate": 3.9116364655617445e-05, "loss": 0.3897, "step": 154400 }, { "epoch": 15.470829580706495, "grad_norm": 4.450534343719482, "learning_rate": 3.9090406655816056e-05, "loss": 0.3844, "step": 154600 }, { "epoch": 15.490843590513359, "grad_norm": 6.733483791351318, "learning_rate": 3.906442637370964e-05, "loss": 0.4516, "step": 154800 }, { "epoch": 15.510857600320223, "grad_norm": 1.3769174814224243, "learning_rate": 3.903842385038292e-05, "loss": 0.4225, "step": 155000 }, { "epoch": 15.510857600320223, "eval_loss": 0.7734164595603943, "eval_runtime": 1515.9792, "eval_samples_per_second": 6.533, "eval_steps_per_second": 1.089, "step": 155000 }, { "epoch": 15.53087161012709, "grad_norm": 5.082502365112305, "learning_rate": 3.9012399126955795e-05, "loss": 0.3885, "step": 155200 }, { "epoch": 15.550885619933954, "grad_norm": 29.592838287353516, "learning_rate": 3.898635224458328e-05, "loss": 0.426, "step": 155400 }, { "epoch": 15.570899629740818, "grad_norm": 10.67177677154541, "learning_rate": 3.8960283244455423e-05, "loss": 0.3821, "step": 155600 }, { "epoch": 15.590913639547683, "grad_norm": 19.47475814819336, "learning_rate": 3.8934192167797256e-05, "loss": 0.4423, "step": 155800 }, { "epoch": 15.610927649354547, "grad_norm": 2.8575806617736816, "learning_rate": 3.8908079055868725e-05, "loss": 0.4291, "step": 156000 }, { "epoch": 15.610927649354547, "eval_loss": 0.786810040473938, "eval_runtime": 1512.289, "eval_samples_per_second": 6.549, "eval_steps_per_second": 1.092, "step": 156000 }, { "epoch": 15.630941659161413, "grad_norm": 7.890353202819824, "learning_rate": 3.88819439499646e-05, "loss": 0.4429, "step": 156200 }, { "epoch": 15.650955668968278, "grad_norm": 1.6731680631637573, "learning_rate": 3.885578689141446e-05, "loss": 0.421, "step": 156400 }, { "epoch": 15.670969678775142, "grad_norm": 6.232865810394287, "learning_rate": 3.882960792158258e-05, "loss": 0.4481, "step": 156600 }, { "epoch": 15.690983688582007, "grad_norm": 9.522499084472656, "learning_rate": 3.88034070818679e-05, "loss": 0.4607, "step": 156800 }, { "epoch": 15.710997698388873, "grad_norm": 4.936123847961426, "learning_rate": 3.8777184413703924e-05, "loss": 0.4427, "step": 157000 }, { "epoch": 15.710997698388873, "eval_loss": 0.7637055516242981, "eval_runtime": 1472.2968, "eval_samples_per_second": 6.727, "eval_steps_per_second": 1.121, "step": 157000 }, { "epoch": 15.731011708195737, "grad_norm": 6.290923595428467, "learning_rate": 3.875093995855871e-05, "loss": 0.3829, "step": 157200 }, { "epoch": 15.751025718002602, "grad_norm": 20.11770248413086, "learning_rate": 3.872467375793473e-05, "loss": 0.4088, "step": 157400 }, { "epoch": 15.771039727809466, "grad_norm": 8.758841514587402, "learning_rate": 3.8698385853368856e-05, "loss": 0.4441, "step": 157600 }, { "epoch": 15.791053737616332, "grad_norm": 1.5055787563323975, "learning_rate": 3.86720762864323e-05, "loss": 0.3931, "step": 157800 }, { "epoch": 15.811067747423197, "grad_norm": 15.013199806213379, "learning_rate": 3.8645745098730524e-05, "loss": 0.4275, "step": 158000 }, { "epoch": 15.811067747423197, "eval_loss": 0.7680337429046631, "eval_runtime": 1458.2535, "eval_samples_per_second": 6.792, "eval_steps_per_second": 1.132, "step": 158000 }, { "epoch": 15.831081757230061, "grad_norm": 5.662177562713623, "learning_rate": 3.861939233190316e-05, "loss": 0.4245, "step": 158200 }, { "epoch": 15.851095767036925, "grad_norm": 6.628124237060547, "learning_rate": 3.8593018027623985e-05, "loss": 0.3899, "step": 158400 }, { "epoch": 15.87110977684379, "grad_norm": 3.5307018756866455, "learning_rate": 3.8566622227600834e-05, "loss": 0.4502, "step": 158600 }, { "epoch": 15.891123786650656, "grad_norm": 6.381788730621338, "learning_rate": 3.854020497357552e-05, "loss": 0.4627, "step": 158800 }, { "epoch": 15.91113779645752, "grad_norm": 3.2594547271728516, "learning_rate": 3.851376630732381e-05, "loss": 0.3952, "step": 159000 }, { "epoch": 15.91113779645752, "eval_loss": 0.7483944892883301, "eval_runtime": 1468.4753, "eval_samples_per_second": 6.744, "eval_steps_per_second": 1.124, "step": 159000 }, { "epoch": 15.931151806264385, "grad_norm": 16.968040466308594, "learning_rate": 3.84873062706553e-05, "loss": 0.4179, "step": 159200 }, { "epoch": 15.95116581607125, "grad_norm": 5.338058948516846, "learning_rate": 3.846082490541341e-05, "loss": 0.4195, "step": 159400 }, { "epoch": 15.971179825878115, "grad_norm": 32.63713455200195, "learning_rate": 3.843432225347525e-05, "loss": 0.42, "step": 159600 }, { "epoch": 15.99119383568498, "grad_norm": 0.11113325506448746, "learning_rate": 3.840779835675165e-05, "loss": 0.4148, "step": 159800 }, { "epoch": 16.011207845491846, "grad_norm": 6.0584635734558105, "learning_rate": 3.8381253257186975e-05, "loss": 0.4113, "step": 160000 }, { "epoch": 16.011207845491846, "eval_loss": 0.7617592215538025, "eval_runtime": 1470.7039, "eval_samples_per_second": 6.734, "eval_steps_per_second": 1.123, "step": 160000 }, { "epoch": 16.03122185529871, "grad_norm": 18.655542373657227, "learning_rate": 3.835468699675917e-05, "loss": 0.3813, "step": 160200 }, { "epoch": 16.051235865105575, "grad_norm": 10.799280166625977, "learning_rate": 3.8328099617479606e-05, "loss": 0.3732, "step": 160400 }, { "epoch": 16.07124987491244, "grad_norm": 15.690262794494629, "learning_rate": 3.830149116139306e-05, "loss": 0.4248, "step": 160600 }, { "epoch": 16.091263884719304, "grad_norm": 1.9755300283432007, "learning_rate": 3.827486167057768e-05, "loss": 0.371, "step": 160800 }, { "epoch": 16.111277894526168, "grad_norm": 16.134281158447266, "learning_rate": 3.824821118714481e-05, "loss": 0.39, "step": 161000 }, { "epoch": 16.111277894526168, "eval_loss": 0.7807313203811646, "eval_runtime": 1472.0, "eval_samples_per_second": 6.728, "eval_steps_per_second": 1.122, "step": 161000 }, { "epoch": 16.131291904333033, "grad_norm": 26.669353485107422, "learning_rate": 3.822153975323904e-05, "loss": 0.3942, "step": 161200 }, { "epoch": 16.151305914139897, "grad_norm": 6.815261363983154, "learning_rate": 3.819484741103807e-05, "loss": 0.3949, "step": 161400 }, { "epoch": 16.17131992394676, "grad_norm": 0.662516713142395, "learning_rate": 3.816813420275267e-05, "loss": 0.4042, "step": 161600 }, { "epoch": 16.19133393375363, "grad_norm": 4.4575629234313965, "learning_rate": 3.8141400170626615e-05, "loss": 0.4047, "step": 161800 }, { "epoch": 16.211347943560494, "grad_norm": 7.2186994552612305, "learning_rate": 3.8114645356936586e-05, "loss": 0.4196, "step": 162000 }, { "epoch": 16.211347943560494, "eval_loss": 0.7669842839241028, "eval_runtime": 1470.4529, "eval_samples_per_second": 6.735, "eval_steps_per_second": 1.123, "step": 162000 }, { "epoch": 16.231361953367358, "grad_norm": 5.904379367828369, "learning_rate": 3.808786980399216e-05, "loss": 0.4251, "step": 162200 }, { "epoch": 16.251375963174223, "grad_norm": 7.073555946350098, "learning_rate": 3.8061073554135696e-05, "loss": 0.3884, "step": 162400 }, { "epoch": 16.271389972981087, "grad_norm": 5.102229118347168, "learning_rate": 3.803425664974226e-05, "loss": 0.3592, "step": 162600 }, { "epoch": 16.29140398278795, "grad_norm": 8.483026504516602, "learning_rate": 3.8007419133219624e-05, "loss": 0.3845, "step": 162800 }, { "epoch": 16.311417992594816, "grad_norm": 12.943239212036133, "learning_rate": 3.798056104700812e-05, "loss": 0.3746, "step": 163000 }, { "epoch": 16.311417992594816, "eval_loss": 0.7768390774726868, "eval_runtime": 1468.6036, "eval_samples_per_second": 6.744, "eval_steps_per_second": 1.124, "step": 163000 }, { "epoch": 16.33143200240168, "grad_norm": 26.573715209960938, "learning_rate": 3.795368243358064e-05, "loss": 0.3926, "step": 163200 }, { "epoch": 16.351446012208545, "grad_norm": 5.9382195472717285, "learning_rate": 3.7926783335442494e-05, "loss": 0.3795, "step": 163400 }, { "epoch": 16.371460022015413, "grad_norm": 10.47982120513916, "learning_rate": 3.7899863795131435e-05, "loss": 0.3729, "step": 163600 }, { "epoch": 16.391474031822277, "grad_norm": 15.343215942382812, "learning_rate": 3.787292385521751e-05, "loss": 0.3695, "step": 163800 }, { "epoch": 16.41148804162914, "grad_norm": 19.13174057006836, "learning_rate": 3.784596355830303e-05, "loss": 0.4074, "step": 164000 }, { "epoch": 16.41148804162914, "eval_loss": 0.7845814228057861, "eval_runtime": 1469.4542, "eval_samples_per_second": 6.74, "eval_steps_per_second": 1.124, "step": 164000 }, { "epoch": 16.431502051436006, "grad_norm": 8.790281295776367, "learning_rate": 3.781898294702251e-05, "loss": 0.4324, "step": 164200 }, { "epoch": 16.45151606124287, "grad_norm": 7.333425521850586, "learning_rate": 3.7791982064042586e-05, "loss": 0.3628, "step": 164400 }, { "epoch": 16.471530071049735, "grad_norm": 4.856706619262695, "learning_rate": 3.7764960952061935e-05, "loss": 0.3923, "step": 164600 }, { "epoch": 16.4915440808566, "grad_norm": 36.636962890625, "learning_rate": 3.773791965381126e-05, "loss": 0.3935, "step": 164800 }, { "epoch": 16.511558090663463, "grad_norm": 10.400918960571289, "learning_rate": 3.771085821205314e-05, "loss": 0.4135, "step": 165000 }, { "epoch": 16.511558090663463, "eval_loss": 0.7822126150131226, "eval_runtime": 1462.8728, "eval_samples_per_second": 6.77, "eval_steps_per_second": 1.129, "step": 165000 }, { "epoch": 16.531572100470328, "grad_norm": 6.877062797546387, "learning_rate": 3.768377666958204e-05, "loss": 0.4173, "step": 165200 }, { "epoch": 16.551586110277196, "grad_norm": 14.624258041381836, "learning_rate": 3.7656675069224215e-05, "loss": 0.3964, "step": 165400 }, { "epoch": 16.57160012008406, "grad_norm": 12.826676368713379, "learning_rate": 3.7629553453837605e-05, "loss": 0.3726, "step": 165600 }, { "epoch": 16.591614129890925, "grad_norm": 11.437576293945312, "learning_rate": 3.760241186631184e-05, "loss": 0.4233, "step": 165800 }, { "epoch": 16.61162813969779, "grad_norm": 0.6441407799720764, "learning_rate": 3.757525034956812e-05, "loss": 0.3945, "step": 166000 }, { "epoch": 16.61162813969779, "eval_loss": 0.7656811475753784, "eval_runtime": 1464.1701, "eval_samples_per_second": 6.764, "eval_steps_per_second": 1.128, "step": 166000 }, { "epoch": 16.631642149504653, "grad_norm": 10.267215728759766, "learning_rate": 3.7548068946559156e-05, "loss": 0.4239, "step": 166200 }, { "epoch": 16.651656159311518, "grad_norm": 1.8072429895401, "learning_rate": 3.7520867700269105e-05, "loss": 0.411, "step": 166400 }, { "epoch": 16.671670169118382, "grad_norm": 1.888088583946228, "learning_rate": 3.74936466537135e-05, "loss": 0.4112, "step": 166600 }, { "epoch": 16.691684178925247, "grad_norm": 9.826933860778809, "learning_rate": 3.746640584993922e-05, "loss": 0.4219, "step": 166800 }, { "epoch": 16.71169818873211, "grad_norm": 10.402721405029297, "learning_rate": 3.743914533202434e-05, "loss": 0.4266, "step": 167000 }, { "epoch": 16.71169818873211, "eval_loss": 0.7653430700302124, "eval_runtime": 1465.5815, "eval_samples_per_second": 6.758, "eval_steps_per_second": 1.127, "step": 167000 }, { "epoch": 16.73171219853898, "grad_norm": 11.12868881225586, "learning_rate": 3.7411865143078126e-05, "loss": 0.3673, "step": 167200 }, { "epoch": 16.751726208345843, "grad_norm": 5.077221393585205, "learning_rate": 3.7384565326240986e-05, "loss": 0.4265, "step": 167400 }, { "epoch": 16.771740218152708, "grad_norm": 6.271111488342285, "learning_rate": 3.735724592468432e-05, "loss": 0.37, "step": 167600 }, { "epoch": 16.791754227959572, "grad_norm": 18.724306106567383, "learning_rate": 3.732990698161051e-05, "loss": 0.4286, "step": 167800 }, { "epoch": 16.811768237766437, "grad_norm": 18.32990264892578, "learning_rate": 3.730254854025286e-05, "loss": 0.451, "step": 168000 }, { "epoch": 16.811768237766437, "eval_loss": 0.7359278202056885, "eval_runtime": 1566.4575, "eval_samples_per_second": 6.323, "eval_steps_per_second": 1.054, "step": 168000 }, { "epoch": 16.8317822475733, "grad_norm": 1.822493314743042, "learning_rate": 3.7275170643875506e-05, "loss": 0.4114, "step": 168200 }, { "epoch": 16.851796257380165, "grad_norm": 12.630325317382812, "learning_rate": 3.724777333577332e-05, "loss": 0.393, "step": 168400 }, { "epoch": 16.87181026718703, "grad_norm": 10.155108451843262, "learning_rate": 3.72203566592719e-05, "loss": 0.4043, "step": 168600 }, { "epoch": 16.891824276993894, "grad_norm": 4.053652763366699, "learning_rate": 3.719292065772747e-05, "loss": 0.3815, "step": 168800 }, { "epoch": 16.911838286800762, "grad_norm": 2.4216363430023193, "learning_rate": 3.7165465374526805e-05, "loss": 0.4208, "step": 169000 }, { "epoch": 16.911838286800762, "eval_loss": 0.750212550163269, "eval_runtime": 1563.0404, "eval_samples_per_second": 6.336, "eval_steps_per_second": 1.056, "step": 169000 }, { "epoch": 16.931852296607627, "grad_norm": 23.250818252563477, "learning_rate": 3.713799085308716e-05, "loss": 0.4513, "step": 169200 }, { "epoch": 16.95186630641449, "grad_norm": 1.0921449661254883, "learning_rate": 3.711049713685626e-05, "loss": 0.3963, "step": 169400 }, { "epoch": 16.971880316221355, "grad_norm": 25.49650001525879, "learning_rate": 3.708298426931213e-05, "loss": 0.3916, "step": 169600 }, { "epoch": 16.99189432602822, "grad_norm": 31.79463768005371, "learning_rate": 3.70554522939631e-05, "loss": 0.3846, "step": 169800 }, { "epoch": 17.011908335835084, "grad_norm": 0.188314750790596, "learning_rate": 3.702790125434773e-05, "loss": 0.3902, "step": 170000 }, { "epoch": 17.011908335835084, "eval_loss": 0.7739624977111816, "eval_runtime": 1565.3238, "eval_samples_per_second": 6.327, "eval_steps_per_second": 1.055, "step": 170000 }, { "epoch": 17.03192234564195, "grad_norm": 4.140522480010986, "learning_rate": 3.7000331194034727e-05, "loss": 0.3818, "step": 170200 }, { "epoch": 17.051936355448813, "grad_norm": 10.223337173461914, "learning_rate": 3.6972742156622844e-05, "loss": 0.3892, "step": 170400 }, { "epoch": 17.071950365255677, "grad_norm": 4.8125810623168945, "learning_rate": 3.694513418574089e-05, "loss": 0.3871, "step": 170600 }, { "epoch": 17.091964375062545, "grad_norm": 30.56576156616211, "learning_rate": 3.691750732504757e-05, "loss": 0.3949, "step": 170800 }, { "epoch": 17.11197838486941, "grad_norm": 14.917612075805664, "learning_rate": 3.68898616182315e-05, "loss": 0.3478, "step": 171000 }, { "epoch": 17.11197838486941, "eval_loss": 0.7645723819732666, "eval_runtime": 1512.9349, "eval_samples_per_second": 6.546, "eval_steps_per_second": 1.091, "step": 171000 }, { "epoch": 17.131992394676274, "grad_norm": 4.803834915161133, "learning_rate": 3.686219710901109e-05, "loss": 0.3743, "step": 171200 }, { "epoch": 17.15200640448314, "grad_norm": 3.2290098667144775, "learning_rate": 3.6834513841134464e-05, "loss": 0.3887, "step": 171400 }, { "epoch": 17.172020414290003, "grad_norm": 11.044529914855957, "learning_rate": 3.680681185837942e-05, "loss": 0.3891, "step": 171600 }, { "epoch": 17.192034424096867, "grad_norm": 6.4509596824646, "learning_rate": 3.6779091204553365e-05, "loss": 0.3663, "step": 171800 }, { "epoch": 17.212048433903732, "grad_norm": 8.678690910339355, "learning_rate": 3.67513519234932e-05, "loss": 0.3707, "step": 172000 }, { "epoch": 17.212048433903732, "eval_loss": 0.7939319610595703, "eval_runtime": 1516.7403, "eval_samples_per_second": 6.53, "eval_steps_per_second": 1.089, "step": 172000 }, { "epoch": 17.232062443710596, "grad_norm": 12.71422290802002, "learning_rate": 3.6723594059065334e-05, "loss": 0.3884, "step": 172200 }, { "epoch": 17.25207645351746, "grad_norm": 5.6347832679748535, "learning_rate": 3.669581765516552e-05, "loss": 0.3861, "step": 172400 }, { "epoch": 17.27209046332433, "grad_norm": 20.509811401367188, "learning_rate": 3.666802275571884e-05, "loss": 0.3875, "step": 172600 }, { "epoch": 17.292104473131193, "grad_norm": 3.946704864501953, "learning_rate": 3.664020940467963e-05, "loss": 0.3885, "step": 172800 }, { "epoch": 17.312118482938057, "grad_norm": 7.720773220062256, "learning_rate": 3.6612377646031405e-05, "loss": 0.397, "step": 173000 }, { "epoch": 17.312118482938057, "eval_loss": 0.798268735408783, "eval_runtime": 1501.3336, "eval_samples_per_second": 6.597, "eval_steps_per_second": 1.1, "step": 173000 }, { "epoch": 17.332132492744922, "grad_norm": 12.45269775390625, "learning_rate": 3.658452752378678e-05, "loss": 0.3768, "step": 173200 }, { "epoch": 17.352146502551786, "grad_norm": 9.128340721130371, "learning_rate": 3.655665908198742e-05, "loss": 0.4101, "step": 173400 }, { "epoch": 17.37216051235865, "grad_norm": 29.633726119995117, "learning_rate": 3.652877236470395e-05, "loss": 0.3924, "step": 173600 }, { "epoch": 17.392174522165515, "grad_norm": 33.25758361816406, "learning_rate": 3.650086741603591e-05, "loss": 0.3719, "step": 173800 }, { "epoch": 17.41218853197238, "grad_norm": 1.5027943849563599, "learning_rate": 3.647294428011167e-05, "loss": 0.3742, "step": 174000 }, { "epoch": 17.41218853197238, "eval_loss": 0.7619876861572266, "eval_runtime": 1555.8838, "eval_samples_per_second": 6.366, "eval_steps_per_second": 1.061, "step": 174000 }, { "epoch": 17.432202541779244, "grad_norm": 17.718544006347656, "learning_rate": 3.644500300108834e-05, "loss": 0.39, "step": 174200 }, { "epoch": 17.452216551586112, "grad_norm": 11.692049980163574, "learning_rate": 3.641704362315174e-05, "loss": 0.4157, "step": 174400 }, { "epoch": 17.472230561392976, "grad_norm": 9.977456092834473, "learning_rate": 3.6389066190516305e-05, "loss": 0.3877, "step": 174600 }, { "epoch": 17.49224457119984, "grad_norm": 19.431852340698242, "learning_rate": 3.636107074742503e-05, "loss": 0.4016, "step": 174800 }, { "epoch": 17.512258581006705, "grad_norm": 6.457306861877441, "learning_rate": 3.6333057338149364e-05, "loss": 0.382, "step": 175000 }, { "epoch": 17.512258581006705, "eval_loss": 0.7596673965454102, "eval_runtime": 1514.3753, "eval_samples_per_second": 6.54, "eval_steps_per_second": 1.09, "step": 175000 }, { "epoch": 17.53227259081357, "grad_norm": 19.771194458007812, "learning_rate": 3.630502600698922e-05, "loss": 0.3891, "step": 175200 }, { "epoch": 17.552286600620434, "grad_norm": 5.6119608879089355, "learning_rate": 3.627697679827279e-05, "loss": 0.3988, "step": 175400 }, { "epoch": 17.5723006104273, "grad_norm": 10.194558143615723, "learning_rate": 3.624890975635658e-05, "loss": 0.3854, "step": 175600 }, { "epoch": 17.592314620234163, "grad_norm": 10.260909080505371, "learning_rate": 3.622082492562529e-05, "loss": 0.3844, "step": 175800 }, { "epoch": 17.612328630041027, "grad_norm": 17.542762756347656, "learning_rate": 3.619272235049173e-05, "loss": 0.3772, "step": 176000 }, { "epoch": 17.612328630041027, "eval_loss": 0.7801889181137085, "eval_runtime": 1517.7481, "eval_samples_per_second": 6.525, "eval_steps_per_second": 1.088, "step": 176000 }, { "epoch": 17.632342639847895, "grad_norm": 22.70116424560547, "learning_rate": 3.616460207539679e-05, "loss": 0.4001, "step": 176200 }, { "epoch": 17.65235664965476, "grad_norm": 7.569830894470215, "learning_rate": 3.613646414480936e-05, "loss": 0.3709, "step": 176400 }, { "epoch": 17.672370659461624, "grad_norm": 8.641268730163574, "learning_rate": 3.610830860322621e-05, "loss": 0.4012, "step": 176600 }, { "epoch": 17.69238466926849, "grad_norm": 2.2792892456054688, "learning_rate": 3.608013549517202e-05, "loss": 0.4028, "step": 176800 }, { "epoch": 17.712398679075353, "grad_norm": 8.134289741516113, "learning_rate": 3.605194486519919e-05, "loss": 0.3918, "step": 177000 }, { "epoch": 17.712398679075353, "eval_loss": 0.793817400932312, "eval_runtime": 1527.4622, "eval_samples_per_second": 6.484, "eval_steps_per_second": 1.081, "step": 177000 }, { "epoch": 17.732412688882217, "grad_norm": 2.9860496520996094, "learning_rate": 3.6023736757887856e-05, "loss": 0.3777, "step": 177200 }, { "epoch": 17.75242669868908, "grad_norm": 13.93966007232666, "learning_rate": 3.599551121784579e-05, "loss": 0.3983, "step": 177400 }, { "epoch": 17.772440708495946, "grad_norm": 9.91971206665039, "learning_rate": 3.596726828970835e-05, "loss": 0.4491, "step": 177600 }, { "epoch": 17.79245471830281, "grad_norm": 17.03766632080078, "learning_rate": 3.593900801813835e-05, "loss": 0.4157, "step": 177800 }, { "epoch": 17.81246872810968, "grad_norm": 3.748239278793335, "learning_rate": 3.5910730447826065e-05, "loss": 0.3816, "step": 178000 }, { "epoch": 17.81246872810968, "eval_loss": 0.765974760055542, "eval_runtime": 1509.3165, "eval_samples_per_second": 6.562, "eval_steps_per_second": 1.094, "step": 178000 }, { "epoch": 17.832482737916543, "grad_norm": 5.083103179931641, "learning_rate": 3.5882435623489124e-05, "loss": 0.3714, "step": 178200 }, { "epoch": 17.852496747723407, "grad_norm": 6.431742191314697, "learning_rate": 3.585412358987241e-05, "loss": 0.4274, "step": 178400 }, { "epoch": 17.87251075753027, "grad_norm": 5.3167877197265625, "learning_rate": 3.5825794391748065e-05, "loss": 0.3574, "step": 178600 }, { "epoch": 17.892524767337136, "grad_norm": 18.877614974975586, "learning_rate": 3.579744807391533e-05, "loss": 0.4278, "step": 178800 }, { "epoch": 17.912538777144, "grad_norm": 23.500747680664062, "learning_rate": 3.576908468120055e-05, "loss": 0.3966, "step": 179000 }, { "epoch": 17.912538777144, "eval_loss": 0.8019661903381348, "eval_runtime": 1490.4604, "eval_samples_per_second": 6.645, "eval_steps_per_second": 1.108, "step": 179000 }, { "epoch": 17.932552786950865, "grad_norm": 2.8193018436431885, "learning_rate": 3.574070425845707e-05, "loss": 0.4075, "step": 179200 }, { "epoch": 17.95256679675773, "grad_norm": 10.545781135559082, "learning_rate": 3.5712306850565166e-05, "loss": 0.4059, "step": 179400 }, { "epoch": 17.972580806564594, "grad_norm": 12.210260391235352, "learning_rate": 3.5683892502431935e-05, "loss": 0.4082, "step": 179600 }, { "epoch": 17.99259481637146, "grad_norm": 7.547031879425049, "learning_rate": 3.565546125899133e-05, "loss": 0.4099, "step": 179800 }, { "epoch": 18.012608826178326, "grad_norm": 2.5298640727996826, "learning_rate": 3.5627013165203956e-05, "loss": 0.3678, "step": 180000 }, { "epoch": 18.012608826178326, "eval_loss": 0.7820643782615662, "eval_runtime": 1490.9156, "eval_samples_per_second": 6.643, "eval_steps_per_second": 1.107, "step": 180000 }, { "epoch": 18.03262283598519, "grad_norm": 6.786888599395752, "learning_rate": 3.5598548266057114e-05, "loss": 0.3125, "step": 180200 }, { "epoch": 18.052636845792055, "grad_norm": 6.623667240142822, "learning_rate": 3.5570066606564665e-05, "loss": 0.3686, "step": 180400 }, { "epoch": 18.07265085559892, "grad_norm": 2.2229738235473633, "learning_rate": 3.5541568231766955e-05, "loss": 0.3613, "step": 180600 }, { "epoch": 18.092664865405784, "grad_norm": 8.915009498596191, "learning_rate": 3.5513053186730794e-05, "loss": 0.3613, "step": 180800 }, { "epoch": 18.112678875212648, "grad_norm": 1.1803689002990723, "learning_rate": 3.5484521516549334e-05, "loss": 0.3584, "step": 181000 }, { "epoch": 18.112678875212648, "eval_loss": 0.7887444496154785, "eval_runtime": 1515.6295, "eval_samples_per_second": 6.535, "eval_steps_per_second": 1.089, "step": 181000 }, { "epoch": 18.132692885019512, "grad_norm": 12.71410083770752, "learning_rate": 3.545597326634202e-05, "loss": 0.36, "step": 181200 }, { "epoch": 18.152706894826377, "grad_norm": 20.662046432495117, "learning_rate": 3.542740848125453e-05, "loss": 0.3569, "step": 181400 }, { "epoch": 18.172720904633245, "grad_norm": 5.769546985626221, "learning_rate": 3.5398827206458675e-05, "loss": 0.3483, "step": 181600 }, { "epoch": 18.19273491444011, "grad_norm": 24.562068939208984, "learning_rate": 3.537022948715234e-05, "loss": 0.3554, "step": 181800 }, { "epoch": 18.212748924246974, "grad_norm": 11.538078308105469, "learning_rate": 3.534161536855944e-05, "loss": 0.3809, "step": 182000 }, { "epoch": 18.212748924246974, "eval_loss": 0.7801101207733154, "eval_runtime": 1548.4296, "eval_samples_per_second": 6.396, "eval_steps_per_second": 1.066, "step": 182000 }, { "epoch": 18.232762934053838, "grad_norm": 13.441089630126953, "learning_rate": 3.531298489592979e-05, "loss": 0.3647, "step": 182200 }, { "epoch": 18.252776943860702, "grad_norm": 6.621340274810791, "learning_rate": 3.528433811453908e-05, "loss": 0.3641, "step": 182400 }, { "epoch": 18.272790953667567, "grad_norm": 7.562870025634766, "learning_rate": 3.525567506968881e-05, "loss": 0.3384, "step": 182600 }, { "epoch": 18.29280496347443, "grad_norm": 18.719125747680664, "learning_rate": 3.5226995806706176e-05, "loss": 0.3766, "step": 182800 }, { "epoch": 18.312818973281296, "grad_norm": 22.917043685913086, "learning_rate": 3.5198300370944017e-05, "loss": 0.362, "step": 183000 }, { "epoch": 18.312818973281296, "eval_loss": 0.770767331123352, "eval_runtime": 1550.5514, "eval_samples_per_second": 6.387, "eval_steps_per_second": 1.065, "step": 183000 }, { "epoch": 18.33283298308816, "grad_norm": 6.396115303039551, "learning_rate": 3.5169588807780766e-05, "loss": 0.3535, "step": 183200 }, { "epoch": 18.352846992895028, "grad_norm": 5.804123401641846, "learning_rate": 3.514086116262036e-05, "loss": 0.3613, "step": 183400 }, { "epoch": 18.372861002701892, "grad_norm": 2.7069480419158936, "learning_rate": 3.511211748089214e-05, "loss": 0.3664, "step": 183600 }, { "epoch": 18.392875012508757, "grad_norm": 1.0863311290740967, "learning_rate": 3.508335780805082e-05, "loss": 0.4007, "step": 183800 }, { "epoch": 18.41288902231562, "grad_norm": 4.851414203643799, "learning_rate": 3.505458218957644e-05, "loss": 0.3779, "step": 184000 }, { "epoch": 18.41288902231562, "eval_loss": 0.7830953001976013, "eval_runtime": 1548.7746, "eval_samples_per_second": 6.395, "eval_steps_per_second": 1.066, "step": 184000 }, { "epoch": 18.432903032122486, "grad_norm": 12.451931953430176, "learning_rate": 3.50257906709742e-05, "loss": 0.4012, "step": 184200 }, { "epoch": 18.45291704192935, "grad_norm": 14.96850872039795, "learning_rate": 3.499698329777448e-05, "loss": 0.3618, "step": 184400 }, { "epoch": 18.472931051736214, "grad_norm": 0.2603464126586914, "learning_rate": 3.4968160115532704e-05, "loss": 0.3871, "step": 184600 }, { "epoch": 18.49294506154308, "grad_norm": 3.26134991645813, "learning_rate": 3.493932116982932e-05, "loss": 0.3965, "step": 184800 }, { "epoch": 18.512959071349947, "grad_norm": 6.965743064880371, "learning_rate": 3.4910466506269706e-05, "loss": 0.3967, "step": 185000 }, { "epoch": 18.512959071349947, "eval_loss": 0.7649522423744202, "eval_runtime": 1547.8939, "eval_samples_per_second": 6.398, "eval_steps_per_second": 1.067, "step": 185000 }, { "epoch": 18.53297308115681, "grad_norm": 3.061868667602539, "learning_rate": 3.4881596170484074e-05, "loss": 0.3557, "step": 185200 }, { "epoch": 18.552987090963676, "grad_norm": 7.551391124725342, "learning_rate": 3.485271020812744e-05, "loss": 0.4003, "step": 185400 }, { "epoch": 18.57300110077054, "grad_norm": 16.62161636352539, "learning_rate": 3.4823808664879514e-05, "loss": 0.3715, "step": 185600 }, { "epoch": 18.593015110577404, "grad_norm": 22.237520217895508, "learning_rate": 3.479489158644467e-05, "loss": 0.4018, "step": 185800 }, { "epoch": 18.61302912038427, "grad_norm": 17.28827667236328, "learning_rate": 3.476595901855181e-05, "loss": 0.4066, "step": 186000 }, { "epoch": 18.61302912038427, "eval_loss": 0.8051779270172119, "eval_runtime": 1546.1396, "eval_samples_per_second": 6.406, "eval_steps_per_second": 1.068, "step": 186000 }, { "epoch": 18.633043130191133, "grad_norm": 5.126684665679932, "learning_rate": 3.4737011006954384e-05, "loss": 0.3617, "step": 186200 }, { "epoch": 18.653057139997998, "grad_norm": 14.021966934204102, "learning_rate": 3.470804759743021e-05, "loss": 0.4295, "step": 186400 }, { "epoch": 18.673071149804862, "grad_norm": 7.209155082702637, "learning_rate": 3.4679068835781494e-05, "loss": 0.374, "step": 186600 }, { "epoch": 18.693085159611726, "grad_norm": 9.020947456359863, "learning_rate": 3.465007476783471e-05, "loss": 0.3995, "step": 186800 }, { "epoch": 18.713099169418594, "grad_norm": 0.15609002113342285, "learning_rate": 3.462106543944052e-05, "loss": 0.3761, "step": 187000 }, { "epoch": 18.713099169418594, "eval_loss": 0.7985454201698303, "eval_runtime": 1547.8637, "eval_samples_per_second": 6.398, "eval_steps_per_second": 1.067, "step": 187000 }, { "epoch": 18.73311317922546, "grad_norm": 10.072789192199707, "learning_rate": 3.4592040896473745e-05, "loss": 0.3879, "step": 187200 }, { "epoch": 18.753127189032323, "grad_norm": 7.9964919090271, "learning_rate": 3.456300118483325e-05, "loss": 0.3962, "step": 187400 }, { "epoch": 18.773141198839188, "grad_norm": 20.1501522064209, "learning_rate": 3.453394635044188e-05, "loss": 0.3908, "step": 187600 }, { "epoch": 18.793155208646052, "grad_norm": 11.060686111450195, "learning_rate": 3.450487643924642e-05, "loss": 0.3736, "step": 187800 }, { "epoch": 18.813169218452916, "grad_norm": 2.367464542388916, "learning_rate": 3.447579149721748e-05, "loss": 0.3959, "step": 188000 }, { "epoch": 18.813169218452916, "eval_loss": 0.7910016775131226, "eval_runtime": 1525.4894, "eval_samples_per_second": 6.492, "eval_steps_per_second": 1.082, "step": 188000 }, { "epoch": 18.83318322825978, "grad_norm": 12.449917793273926, "learning_rate": 3.444669157034944e-05, "loss": 0.3878, "step": 188200 }, { "epoch": 18.853197238066645, "grad_norm": 10.483795166015625, "learning_rate": 3.441757670466038e-05, "loss": 0.3717, "step": 188400 }, { "epoch": 18.873211247873513, "grad_norm": 4.330296516418457, "learning_rate": 3.4388446946192e-05, "loss": 0.3911, "step": 188600 }, { "epoch": 18.893225257680378, "grad_norm": 6.799245357513428, "learning_rate": 3.4359302341009556e-05, "loss": 0.3601, "step": 188800 }, { "epoch": 18.913239267487242, "grad_norm": 9.315254211425781, "learning_rate": 3.433014293520177e-05, "loss": 0.3807, "step": 189000 }, { "epoch": 18.913239267487242, "eval_loss": 0.8016336560249329, "eval_runtime": 1514.9739, "eval_samples_per_second": 6.537, "eval_steps_per_second": 1.09, "step": 189000 }, { "epoch": 18.933253277294106, "grad_norm": 5.664348602294922, "learning_rate": 3.4300968774880805e-05, "loss": 0.3855, "step": 189200 }, { "epoch": 18.95326728710097, "grad_norm": 13.287593841552734, "learning_rate": 3.4271779906182114e-05, "loss": 0.3544, "step": 189400 }, { "epoch": 18.973281296907835, "grad_norm": 8.481534957885742, "learning_rate": 3.4242576375264426e-05, "loss": 0.415, "step": 189600 }, { "epoch": 18.9932953067147, "grad_norm": 10.081475257873535, "learning_rate": 3.4213358228309666e-05, "loss": 0.4443, "step": 189800 }, { "epoch": 19.013309316521564, "grad_norm": 3.9933559894561768, "learning_rate": 3.418412551152286e-05, "loss": 0.341, "step": 190000 }, { "epoch": 19.013309316521564, "eval_loss": 0.7955721020698547, "eval_runtime": 1533.133, "eval_samples_per_second": 6.46, "eval_steps_per_second": 1.077, "step": 190000 }, { "epoch": 19.03332332632843, "grad_norm": 13.757511138916016, "learning_rate": 3.4154878271132076e-05, "loss": 0.356, "step": 190200 }, { "epoch": 19.053337336135296, "grad_norm": 6.414867877960205, "learning_rate": 3.4125616553388344e-05, "loss": 0.3606, "step": 190400 }, { "epoch": 19.07335134594216, "grad_norm": 8.018113136291504, "learning_rate": 3.409634040456561e-05, "loss": 0.3423, "step": 190600 }, { "epoch": 19.093365355749025, "grad_norm": 2.9298839569091797, "learning_rate": 3.406704987096063e-05, "loss": 0.3644, "step": 190800 }, { "epoch": 19.11337936555589, "grad_norm": 17.1110782623291, "learning_rate": 3.40377449988929e-05, "loss": 0.3658, "step": 191000 }, { "epoch": 19.11337936555589, "eval_loss": 0.8007732629776001, "eval_runtime": 1522.3819, "eval_samples_per_second": 6.506, "eval_steps_per_second": 1.084, "step": 191000 }, { "epoch": 19.133393375362754, "grad_norm": 3.623276948928833, "learning_rate": 3.4008425834704586e-05, "loss": 0.3254, "step": 191200 }, { "epoch": 19.15340738516962, "grad_norm": 8.487483978271484, "learning_rate": 3.397909242476049e-05, "loss": 0.3441, "step": 191400 }, { "epoch": 19.173421394976483, "grad_norm": 8.737314224243164, "learning_rate": 3.394974481544787e-05, "loss": 0.3539, "step": 191600 }, { "epoch": 19.193435404783347, "grad_norm": 15.70198917388916, "learning_rate": 3.392038305317653e-05, "loss": 0.3791, "step": 191800 }, { "epoch": 19.21344941459021, "grad_norm": 10.665410995483398, "learning_rate": 3.38910071843786e-05, "loss": 0.3552, "step": 192000 }, { "epoch": 19.21344941459021, "eval_loss": 0.7815977931022644, "eval_runtime": 1502.999, "eval_samples_per_second": 6.589, "eval_steps_per_second": 1.098, "step": 192000 }, { "epoch": 19.23346342439708, "grad_norm": 4.155167579650879, "learning_rate": 3.386161725550851e-05, "loss": 0.3414, "step": 192200 }, { "epoch": 19.253477434203944, "grad_norm": 2.58325457572937, "learning_rate": 3.3832213313042974e-05, "loss": 0.3477, "step": 192400 }, { "epoch": 19.27349144401081, "grad_norm": 12.262226104736328, "learning_rate": 3.3802795403480804e-05, "loss": 0.3734, "step": 192600 }, { "epoch": 19.293505453817673, "grad_norm": 11.248457908630371, "learning_rate": 3.377336357334295e-05, "loss": 0.3619, "step": 192800 }, { "epoch": 19.313519463624537, "grad_norm": 28.491479873657227, "learning_rate": 3.374391786917234e-05, "loss": 0.3733, "step": 193000 }, { "epoch": 19.313519463624537, "eval_loss": 0.8074179291725159, "eval_runtime": 1503.7789, "eval_samples_per_second": 6.586, "eval_steps_per_second": 1.098, "step": 193000 }, { "epoch": 19.3335334734314, "grad_norm": 17.691810607910156, "learning_rate": 3.3714458337533884e-05, "loss": 0.3981, "step": 193200 }, { "epoch": 19.353547483238266, "grad_norm": 7.061100006103516, "learning_rate": 3.368498502501431e-05, "loss": 0.3424, "step": 193400 }, { "epoch": 19.37356149304513, "grad_norm": 6.130334854125977, "learning_rate": 3.3655497978222184e-05, "loss": 0.3262, "step": 193600 }, { "epoch": 19.393575502851995, "grad_norm": 7.46610689163208, "learning_rate": 3.3625997243787756e-05, "loss": 0.3899, "step": 193800 }, { "epoch": 19.413589512658863, "grad_norm": 3.6629462242126465, "learning_rate": 3.3596482868362924e-05, "loss": 0.3542, "step": 194000 }, { "epoch": 19.413589512658863, "eval_loss": 0.7756953239440918, "eval_runtime": 1503.9432, "eval_samples_per_second": 6.585, "eval_steps_per_second": 1.098, "step": 194000 }, { "epoch": 19.433603522465727, "grad_norm": 10.277660369873047, "learning_rate": 3.3566954898621184e-05, "loss": 0.3693, "step": 194200 }, { "epoch": 19.45361753227259, "grad_norm": 3.940843105316162, "learning_rate": 3.353741338125751e-05, "loss": 0.3453, "step": 194400 }, { "epoch": 19.473631542079456, "grad_norm": 8.944561004638672, "learning_rate": 3.35078583629883e-05, "loss": 0.3912, "step": 194600 }, { "epoch": 19.49364555188632, "grad_norm": 12.63878059387207, "learning_rate": 3.3478289890551316e-05, "loss": 0.3636, "step": 194800 }, { "epoch": 19.513659561693185, "grad_norm": 9.266249656677246, "learning_rate": 3.344870801070558e-05, "loss": 0.3541, "step": 195000 }, { "epoch": 19.513659561693185, "eval_loss": 0.8040087223052979, "eval_runtime": 1504.2561, "eval_samples_per_second": 6.584, "eval_steps_per_second": 1.098, "step": 195000 }, { "epoch": 19.53367357150005, "grad_norm": 2.7073543071746826, "learning_rate": 3.341911277023132e-05, "loss": 0.3675, "step": 195200 }, { "epoch": 19.553687581306914, "grad_norm": 3.7014424800872803, "learning_rate": 3.33895042159299e-05, "loss": 0.355, "step": 195400 }, { "epoch": 19.573701591113778, "grad_norm": 2.8240573406219482, "learning_rate": 3.335988239462372e-05, "loss": 0.3711, "step": 195600 }, { "epoch": 19.593715600920646, "grad_norm": 27.749778747558594, "learning_rate": 3.333024735315619e-05, "loss": 0.3421, "step": 195800 }, { "epoch": 19.61372961072751, "grad_norm": 2.5404410362243652, "learning_rate": 3.3300599138391606e-05, "loss": 0.3624, "step": 196000 }, { "epoch": 19.61372961072751, "eval_loss": 0.8013516664505005, "eval_runtime": 1503.0076, "eval_samples_per_second": 6.589, "eval_steps_per_second": 1.098, "step": 196000 }, { "epoch": 19.633743620534375, "grad_norm": 11.92870807647705, "learning_rate": 3.3270937797215085e-05, "loss": 0.3376, "step": 196200 }, { "epoch": 19.65375763034124, "grad_norm": 1.9202059507369995, "learning_rate": 3.3241263376532534e-05, "loss": 0.3503, "step": 196400 }, { "epoch": 19.673771640148104, "grad_norm": 10.40019702911377, "learning_rate": 3.3211575923270524e-05, "loss": 0.3701, "step": 196600 }, { "epoch": 19.693785649954968, "grad_norm": 7.772400856018066, "learning_rate": 3.318187548437622e-05, "loss": 0.3584, "step": 196800 }, { "epoch": 19.713799659761833, "grad_norm": 26.906827926635742, "learning_rate": 3.3152162106817347e-05, "loss": 0.4058, "step": 197000 }, { "epoch": 19.713799659761833, "eval_loss": 0.798649787902832, "eval_runtime": 1503.8161, "eval_samples_per_second": 6.586, "eval_steps_per_second": 1.098, "step": 197000 }, { "epoch": 19.733813669568697, "grad_norm": 8.826122283935547, "learning_rate": 3.31224358375821e-05, "loss": 0.3702, "step": 197200 }, { "epoch": 19.75382767937556, "grad_norm": 0.42952945828437805, "learning_rate": 3.309269672367902e-05, "loss": 0.372, "step": 197400 }, { "epoch": 19.77384168918243, "grad_norm": 15.32080364227295, "learning_rate": 3.306294481213701e-05, "loss": 0.3894, "step": 197600 }, { "epoch": 19.793855698989294, "grad_norm": 14.298660278320312, "learning_rate": 3.3033180150005164e-05, "loss": 0.3363, "step": 197800 }, { "epoch": 19.813869708796158, "grad_norm": 22.742708206176758, "learning_rate": 3.300340278435277e-05, "loss": 0.4016, "step": 198000 }, { "epoch": 19.813869708796158, "eval_loss": 0.7975320816040039, "eval_runtime": 1502.4777, "eval_samples_per_second": 6.592, "eval_steps_per_second": 1.099, "step": 198000 }, { "epoch": 19.833883718603023, "grad_norm": 7.672244548797607, "learning_rate": 3.297361276226919e-05, "loss": 0.3616, "step": 198200 }, { "epoch": 19.853897728409887, "grad_norm": 13.984142303466797, "learning_rate": 3.2943810130863814e-05, "loss": 0.3907, "step": 198400 }, { "epoch": 19.87391173821675, "grad_norm": 1.248093605041504, "learning_rate": 3.291399493726596e-05, "loss": 0.3748, "step": 198600 }, { "epoch": 19.893925748023616, "grad_norm": 0.9091699123382568, "learning_rate": 3.288416722862483e-05, "loss": 0.361, "step": 198800 }, { "epoch": 19.91393975783048, "grad_norm": 63.1245231628418, "learning_rate": 3.285432705210938e-05, "loss": 0.3741, "step": 199000 }, { "epoch": 19.91393975783048, "eval_loss": 0.797353208065033, "eval_runtime": 1501.406, "eval_samples_per_second": 6.596, "eval_steps_per_second": 1.1, "step": 199000 }, { "epoch": 19.933953767637345, "grad_norm": 7.82739782333374, "learning_rate": 3.2824474454908314e-05, "loss": 0.369, "step": 199200 }, { "epoch": 19.953967777444213, "grad_norm": 2.7716870307922363, "learning_rate": 3.2794609484229977e-05, "loss": 0.3794, "step": 199400 }, { "epoch": 19.973981787251077, "grad_norm": 8.549223899841309, "learning_rate": 3.276473218730227e-05, "loss": 0.348, "step": 199600 }, { "epoch": 19.99399579705794, "grad_norm": 10.51595687866211, "learning_rate": 3.27348426113726e-05, "loss": 0.3686, "step": 199800 }, { "epoch": 20.014009806864806, "grad_norm": 0.18855886161327362, "learning_rate": 3.2704940803707756e-05, "loss": 0.3178, "step": 200000 }, { "epoch": 20.014009806864806, "eval_loss": 0.8048191666603088, "eval_runtime": 1502.4774, "eval_samples_per_second": 6.592, "eval_steps_per_second": 1.099, "step": 200000 }, { "epoch": 20.03402381667167, "grad_norm": 17.050275802612305, "learning_rate": 3.267502681159392e-05, "loss": 0.3158, "step": 200200 }, { "epoch": 20.054037826478535, "grad_norm": 3.8888516426086426, "learning_rate": 3.264510068233653e-05, "loss": 0.3169, "step": 200400 }, { "epoch": 20.0740518362854, "grad_norm": 12.56888198852539, "learning_rate": 3.261516246326016e-05, "loss": 0.3174, "step": 200600 }, { "epoch": 20.094065846092263, "grad_norm": 3.1336772441864014, "learning_rate": 3.25852122017086e-05, "loss": 0.3469, "step": 200800 }, { "epoch": 20.114079855899128, "grad_norm": 4.384590148925781, "learning_rate": 3.255524994504459e-05, "loss": 0.3523, "step": 201000 }, { "epoch": 20.114079855899128, "eval_loss": 0.8155721426010132, "eval_runtime": 1501.0781, "eval_samples_per_second": 6.598, "eval_steps_per_second": 1.1, "step": 201000 }, { "epoch": 20.134093865705996, "grad_norm": 13.947051048278809, "learning_rate": 3.2525275740649915e-05, "loss": 0.3348, "step": 201200 }, { "epoch": 20.15410787551286, "grad_norm": 21.520389556884766, "learning_rate": 3.2495289635925195e-05, "loss": 0.3472, "step": 201400 }, { "epoch": 20.174121885319725, "grad_norm": 8.147628784179688, "learning_rate": 3.246529167828991e-05, "loss": 0.3669, "step": 201600 }, { "epoch": 20.19413589512659, "grad_norm": 4.401460647583008, "learning_rate": 3.243528191518226e-05, "loss": 0.3202, "step": 201800 }, { "epoch": 20.214149904933453, "grad_norm": 13.096490859985352, "learning_rate": 3.2405260394059124e-05, "loss": 0.3391, "step": 202000 }, { "epoch": 20.214149904933453, "eval_loss": 0.8122527003288269, "eval_runtime": 1498.4638, "eval_samples_per_second": 6.609, "eval_steps_per_second": 1.102, "step": 202000 }, { "epoch": 20.234163914740318, "grad_norm": 8.165190696716309, "learning_rate": 3.237522716239597e-05, "loss": 0.3495, "step": 202200 }, { "epoch": 20.254177924547182, "grad_norm": 8.230537414550781, "learning_rate": 3.23451822676868e-05, "loss": 0.3281, "step": 202400 }, { "epoch": 20.274191934354047, "grad_norm": 8.653314590454102, "learning_rate": 3.2315125757444034e-05, "loss": 0.3391, "step": 202600 }, { "epoch": 20.29420594416091, "grad_norm": 18.007892608642578, "learning_rate": 3.228505767919848e-05, "loss": 0.3474, "step": 202800 }, { "epoch": 20.31421995396778, "grad_norm": 6.068183898925781, "learning_rate": 3.2254978080499235e-05, "loss": 0.338, "step": 203000 }, { "epoch": 20.31421995396778, "eval_loss": 0.7908835411071777, "eval_runtime": 1488.7024, "eval_samples_per_second": 6.653, "eval_steps_per_second": 1.109, "step": 203000 }, { "epoch": 20.334233963774643, "grad_norm": 55.26036834716797, "learning_rate": 3.22248870089136e-05, "loss": 0.3399, "step": 203200 }, { "epoch": 20.354247973581508, "grad_norm": 1.6583762168884277, "learning_rate": 3.219478451202704e-05, "loss": 0.3561, "step": 203400 }, { "epoch": 20.374261983388372, "grad_norm": 6.456911087036133, "learning_rate": 3.216467063744309e-05, "loss": 0.3321, "step": 203600 }, { "epoch": 20.394275993195237, "grad_norm": 9.357166290283203, "learning_rate": 3.2134545432783256e-05, "loss": 0.3052, "step": 203800 }, { "epoch": 20.4142900030021, "grad_norm": 8.31332778930664, "learning_rate": 3.2104408945686975e-05, "loss": 0.3606, "step": 204000 }, { "epoch": 20.4142900030021, "eval_loss": 0.8134813904762268, "eval_runtime": 1488.5222, "eval_samples_per_second": 6.654, "eval_steps_per_second": 1.109, "step": 204000 }, { "epoch": 20.434304012808965, "grad_norm": 7.425869464874268, "learning_rate": 3.207426122381152e-05, "loss": 0.3477, "step": 204200 }, { "epoch": 20.45431802261583, "grad_norm": 10.714173316955566, "learning_rate": 3.2044102314831934e-05, "loss": 0.361, "step": 204400 }, { "epoch": 20.474332032422694, "grad_norm": 15.123747825622559, "learning_rate": 3.201393226644095e-05, "loss": 0.3687, "step": 204600 }, { "epoch": 20.494346042229562, "grad_norm": 3.1687936782836914, "learning_rate": 3.1983751126348926e-05, "loss": 0.3396, "step": 204800 }, { "epoch": 20.514360052036427, "grad_norm": 22.85626220703125, "learning_rate": 3.195355894228374e-05, "loss": 0.3388, "step": 205000 }, { "epoch": 20.514360052036427, "eval_loss": 0.8053442239761353, "eval_runtime": 1490.4896, "eval_samples_per_second": 6.645, "eval_steps_per_second": 1.108, "step": 205000 }, { "epoch": 20.53437406184329, "grad_norm": 1.0241858959197998, "learning_rate": 3.192335576199075e-05, "loss": 0.3449, "step": 205200 }, { "epoch": 20.554388071650155, "grad_norm": 4.589802265167236, "learning_rate": 3.18931416332327e-05, "loss": 0.3723, "step": 205400 }, { "epoch": 20.57440208145702, "grad_norm": 7.255118370056152, "learning_rate": 3.186291660378965e-05, "loss": 0.3104, "step": 205600 }, { "epoch": 20.594416091263884, "grad_norm": 6.926268577575684, "learning_rate": 3.1832680721458886e-05, "loss": 0.3821, "step": 205800 }, { "epoch": 20.61443010107075, "grad_norm": 11.157844543457031, "learning_rate": 3.180243403405487e-05, "loss": 0.3985, "step": 206000 }, { "epoch": 20.61443010107075, "eval_loss": 0.776350200176239, "eval_runtime": 1487.2312, "eval_samples_per_second": 6.659, "eval_steps_per_second": 1.11, "step": 206000 }, { "epoch": 20.634444110877613, "grad_norm": 5.26931619644165, "learning_rate": 3.177217658940915e-05, "loss": 0.3673, "step": 206200 }, { "epoch": 20.654458120684478, "grad_norm": 8.617594718933105, "learning_rate": 3.174190843537028e-05, "loss": 0.3253, "step": 206400 }, { "epoch": 20.674472130491345, "grad_norm": 10.43480110168457, "learning_rate": 3.1711629619803765e-05, "loss": 0.3513, "step": 206600 }, { "epoch": 20.69448614029821, "grad_norm": 6.719945430755615, "learning_rate": 3.168134019059193e-05, "loss": 0.3903, "step": 206800 }, { "epoch": 20.714500150105074, "grad_norm": 13.390800476074219, "learning_rate": 3.165104019563393e-05, "loss": 0.402, "step": 207000 }, { "epoch": 20.714500150105074, "eval_loss": 0.8006730675697327, "eval_runtime": 1489.7747, "eval_samples_per_second": 6.648, "eval_steps_per_second": 1.108, "step": 207000 }, { "epoch": 20.73451415991194, "grad_norm": 4.664961814880371, "learning_rate": 3.16207296828456e-05, "loss": 0.3583, "step": 207200 }, { "epoch": 20.754528169718803, "grad_norm": 1.4413669109344482, "learning_rate": 3.159040870015941e-05, "loss": 0.3806, "step": 207400 }, { "epoch": 20.774542179525668, "grad_norm": 2.065563201904297, "learning_rate": 3.156007729552442e-05, "loss": 0.3868, "step": 207600 }, { "epoch": 20.794556189332532, "grad_norm": 10.00201416015625, "learning_rate": 3.1529735516906125e-05, "loss": 0.3383, "step": 207800 }, { "epoch": 20.814570199139396, "grad_norm": 5.354613304138184, "learning_rate": 3.149938341228644e-05, "loss": 0.3516, "step": 208000 }, { "epoch": 20.814570199139396, "eval_loss": 0.7904466390609741, "eval_runtime": 1489.9858, "eval_samples_per_second": 6.647, "eval_steps_per_second": 1.108, "step": 208000 }, { "epoch": 20.83458420894626, "grad_norm": 10.926655769348145, "learning_rate": 3.146902102966364e-05, "loss": 0.3538, "step": 208200 }, { "epoch": 20.85459821875313, "grad_norm": 6.700216770172119, "learning_rate": 3.1438648417052195e-05, "loss": 0.3566, "step": 208400 }, { "epoch": 20.874612228559993, "grad_norm": 2.214862585067749, "learning_rate": 3.1408265622482805e-05, "loss": 0.3763, "step": 208600 }, { "epoch": 20.894626238366858, "grad_norm": 9.555449485778809, "learning_rate": 3.1377872694002256e-05, "loss": 0.3576, "step": 208800 }, { "epoch": 20.914640248173722, "grad_norm": 12.834331512451172, "learning_rate": 3.134746967967334e-05, "loss": 0.3744, "step": 209000 }, { "epoch": 20.914640248173722, "eval_loss": 0.7839940786361694, "eval_runtime": 1487.0967, "eval_samples_per_second": 6.66, "eval_steps_per_second": 1.11, "step": 209000 }, { "epoch": 20.934654257980586, "grad_norm": 7.232730388641357, "learning_rate": 3.131705662757482e-05, "loss": 0.3314, "step": 209200 }, { "epoch": 20.95466826778745, "grad_norm": 11.088382720947266, "learning_rate": 3.128663358580134e-05, "loss": 0.3674, "step": 209400 }, { "epoch": 20.974682277594315, "grad_norm": 1.0415066480636597, "learning_rate": 3.125620060246332e-05, "loss": 0.3456, "step": 209600 }, { "epoch": 20.99469628740118, "grad_norm": 6.149880886077881, "learning_rate": 3.122575772568689e-05, "loss": 0.3666, "step": 209800 }, { "epoch": 21.014710297208044, "grad_norm": 14.650206565856934, "learning_rate": 3.119530500361387e-05, "loss": 0.3221, "step": 210000 }, { "epoch": 21.014710297208044, "eval_loss": 0.7715635895729065, "eval_runtime": 1492.6044, "eval_samples_per_second": 6.635, "eval_steps_per_second": 1.106, "step": 210000 }, { "epoch": 21.034724307014912, "grad_norm": 19.30446434020996, "learning_rate": 3.116484248440162e-05, "loss": 0.3146, "step": 210200 }, { "epoch": 21.054738316821776, "grad_norm": 4.13395881652832, "learning_rate": 3.113437021622298e-05, "loss": 0.324, "step": 210400 }, { "epoch": 21.07475232662864, "grad_norm": 17.416893005371094, "learning_rate": 3.110388824726625e-05, "loss": 0.3068, "step": 210600 }, { "epoch": 21.094766336435505, "grad_norm": 10.548822402954102, "learning_rate": 3.107339662573503e-05, "loss": 0.3582, "step": 210800 }, { "epoch": 21.11478034624237, "grad_norm": 16.216642379760742, "learning_rate": 3.104289539984819e-05, "loss": 0.3302, "step": 211000 }, { "epoch": 21.11478034624237, "eval_loss": 0.8083071112632751, "eval_runtime": 1497.7878, "eval_samples_per_second": 6.612, "eval_steps_per_second": 1.102, "step": 211000 }, { "epoch": 21.134794356049234, "grad_norm": 5.75538969039917, "learning_rate": 3.1012384617839816e-05, "loss": 0.3639, "step": 211200 }, { "epoch": 21.1548083658561, "grad_norm": 1.7366374731063843, "learning_rate": 3.098186432795907e-05, "loss": 0.3334, "step": 211400 }, { "epoch": 21.174822375662963, "grad_norm": 9.619279861450195, "learning_rate": 3.0951334578470176e-05, "loss": 0.3508, "step": 211600 }, { "epoch": 21.194836385469827, "grad_norm": 5.790093421936035, "learning_rate": 3.092079541765231e-05, "loss": 0.3335, "step": 211800 }, { "epoch": 21.214850395276695, "grad_norm": 8.613719940185547, "learning_rate": 3.0890246893799524e-05, "loss": 0.3225, "step": 212000 }, { "epoch": 21.214850395276695, "eval_loss": 0.8243083357810974, "eval_runtime": 1496.305, "eval_samples_per_second": 6.619, "eval_steps_per_second": 1.103, "step": 212000 }, { "epoch": 21.23486440508356, "grad_norm": 6.689505100250244, "learning_rate": 3.085968905522068e-05, "loss": 0.3187, "step": 212200 }, { "epoch": 21.254878414890424, "grad_norm": 10.098431587219238, "learning_rate": 3.0829121950239375e-05, "loss": 0.3483, "step": 212400 }, { "epoch": 21.27489242469729, "grad_norm": 14.08651065826416, "learning_rate": 3.079854562719385e-05, "loss": 0.3174, "step": 212600 }, { "epoch": 21.294906434504153, "grad_norm": 12.810739517211914, "learning_rate": 3.0767960134436934e-05, "loss": 0.3261, "step": 212800 }, { "epoch": 21.314920444311017, "grad_norm": 32.22780990600586, "learning_rate": 3.073736552033595e-05, "loss": 0.3203, "step": 213000 }, { "epoch": 21.314920444311017, "eval_loss": 0.8051511645317078, "eval_runtime": 1495.8225, "eval_samples_per_second": 6.621, "eval_steps_per_second": 1.104, "step": 213000 }, { "epoch": 21.33493445411788, "grad_norm": 7.6782546043396, "learning_rate": 3.0706761833272655e-05, "loss": 0.3525, "step": 213200 }, { "epoch": 21.354948463924746, "grad_norm": 5.389456272125244, "learning_rate": 3.067614912164314e-05, "loss": 0.3063, "step": 213400 }, { "epoch": 21.37496247373161, "grad_norm": 16.243350982666016, "learning_rate": 3.064552743385777e-05, "loss": 0.3435, "step": 213600 }, { "epoch": 21.39497648353848, "grad_norm": 24.745712280273438, "learning_rate": 3.06148968183411e-05, "loss": 0.3454, "step": 213800 }, { "epoch": 21.414990493345343, "grad_norm": 9.496084213256836, "learning_rate": 3.0584257323531844e-05, "loss": 0.3332, "step": 214000 }, { "epoch": 21.414990493345343, "eval_loss": 0.7834358215332031, "eval_runtime": 1629.6467, "eval_samples_per_second": 6.077, "eval_steps_per_second": 1.013, "step": 214000 }, { "epoch": 21.435004503152207, "grad_norm": 3.151773452758789, "learning_rate": 3.0553608997882685e-05, "loss": 0.3443, "step": 214200 }, { "epoch": 21.45501851295907, "grad_norm": 14.667357444763184, "learning_rate": 3.052295188986034e-05, "loss": 0.3508, "step": 214400 }, { "epoch": 21.475032522765936, "grad_norm": 7.5599775314331055, "learning_rate": 3.049228604794538e-05, "loss": 0.344, "step": 214600 }, { "epoch": 21.4950465325728, "grad_norm": 2.0568230152130127, "learning_rate": 3.0461611520632182e-05, "loss": 0.3348, "step": 214800 }, { "epoch": 21.515060542379665, "grad_norm": 19.43706512451172, "learning_rate": 3.0430928356428873e-05, "loss": 0.3196, "step": 215000 }, { "epoch": 21.515060542379665, "eval_loss": 0.8127709627151489, "eval_runtime": 1627.6863, "eval_samples_per_second": 6.085, "eval_steps_per_second": 1.014, "step": 215000 }, { "epoch": 21.53507455218653, "grad_norm": 14.796923637390137, "learning_rate": 3.040023660385724e-05, "loss": 0.349, "step": 215200 }, { "epoch": 21.555088561993394, "grad_norm": 13.904855728149414, "learning_rate": 3.0369536311452635e-05, "loss": 0.3645, "step": 215400 }, { "epoch": 21.57510257180026, "grad_norm": 4.179819107055664, "learning_rate": 3.0338827527763924e-05, "loss": 0.3548, "step": 215600 }, { "epoch": 21.595116581607126, "grad_norm": 2.6156089305877686, "learning_rate": 3.030811030135342e-05, "loss": 0.3513, "step": 215800 } ], "logging_steps": 200, "max_steps": 499650, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.0937185381113856e+18, "train_batch_size": 6, "trial_name": null, "trial_params": null }