diff --git "a/checkpoint-1500/trainer_state.json" "b/checkpoint-1500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1500/trainer_state.json" @@ -0,0 +1,10533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8317161075686166, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005544774050457444, + "grad_norm": 1.7043471336364746, + "learning_rate": 8.000000000000001e-07, + "loss": 1.8725, + "step": 1 + }, + { + "epoch": 0.0011089548100914888, + "grad_norm": 1.2943507432937622, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.882, + "step": 2 + }, + { + "epoch": 0.0016634322151372332, + "grad_norm": 1.2082455158233643, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.8304, + "step": 3 + }, + { + "epoch": 0.0022179096201829776, + "grad_norm": 0.9585644602775574, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.8958, + "step": 4 + }, + { + "epoch": 0.0027723870252287217, + "grad_norm": 1.1721417903900146, + "learning_rate": 4.000000000000001e-06, + "loss": 1.9696, + "step": 5 + }, + { + "epoch": 0.0033268644302744664, + "grad_norm": 0.9636610150337219, + "learning_rate": 4.800000000000001e-06, + "loss": 1.8921, + "step": 6 + }, + { + "epoch": 0.0038813418353202105, + "grad_norm": 1.1441367864608765, + "learning_rate": 5.600000000000001e-06, + "loss": 1.8852, + "step": 7 + }, + { + "epoch": 0.004435819240365955, + "grad_norm": 0.7332591414451599, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.8977, + "step": 8 + }, + { + "epoch": 0.0049902966454117, + "grad_norm": 0.736126184463501, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.8914, + "step": 9 + }, + { + "epoch": 0.0055447740504574435, + "grad_norm": 0.5888351798057556, + "learning_rate": 8.000000000000001e-06, + "loss": 1.959, + "step": 10 + }, + { + "epoch": 0.006099251455503188, + "grad_norm": 0.5506263971328735, + "learning_rate": 8.8e-06, + "loss": 1.8849, + "step": 11 + }, + { + "epoch": 0.006653728860548933, + "grad_norm": 0.5089067220687866, + "learning_rate": 9.600000000000001e-06, + "loss": 1.8318, + "step": 12 + }, + { + "epoch": 0.007208206265594677, + "grad_norm": 0.5495124459266663, + "learning_rate": 1.04e-05, + "loss": 1.8303, + "step": 13 + }, + { + "epoch": 0.007762683670640421, + "grad_norm": 0.5372406840324402, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.837, + "step": 14 + }, + { + "epoch": 0.008317161075686166, + "grad_norm": 0.4969247877597809, + "learning_rate": 1.2e-05, + "loss": 1.8305, + "step": 15 + }, + { + "epoch": 0.00887163848073191, + "grad_norm": 0.43892425298690796, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.8299, + "step": 16 + }, + { + "epoch": 0.009426115885777655, + "grad_norm": 0.4186934530735016, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.8882, + "step": 17 + }, + { + "epoch": 0.0099805932908234, + "grad_norm": 0.36783891916275024, + "learning_rate": 1.4400000000000001e-05, + "loss": 1.7997, + "step": 18 + }, + { + "epoch": 0.010535070695869144, + "grad_norm": 0.3592284023761749, + "learning_rate": 1.5200000000000002e-05, + "loss": 1.7928, + "step": 19 + }, + { + "epoch": 0.011089548100914887, + "grad_norm": 0.4156821072101593, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.8382, + "step": 20 + }, + { + "epoch": 0.011644025505960632, + "grad_norm": 0.36838528513908386, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.765, + "step": 21 + }, + { + "epoch": 0.012198502911006376, + "grad_norm": 0.4337383508682251, + "learning_rate": 1.76e-05, + "loss": 1.7961, + "step": 22 + }, + { + "epoch": 0.01275298031605212, + "grad_norm": 0.34529271721839905, + "learning_rate": 1.8400000000000003e-05, + "loss": 1.7795, + "step": 23 + }, + { + "epoch": 0.013307457721097865, + "grad_norm": 0.3445529341697693, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.8074, + "step": 24 + }, + { + "epoch": 0.01386193512614361, + "grad_norm": 0.32090720534324646, + "learning_rate": 2e-05, + "loss": 1.7261, + "step": 25 + }, + { + "epoch": 0.014416412531189355, + "grad_norm": 0.30814656615257263, + "learning_rate": 2.08e-05, + "loss": 1.7856, + "step": 26 + }, + { + "epoch": 0.0149708899362351, + "grad_norm": 0.3047591745853424, + "learning_rate": 2.1600000000000003e-05, + "loss": 1.7895, + "step": 27 + }, + { + "epoch": 0.015525367341280842, + "grad_norm": 0.2949264347553253, + "learning_rate": 2.2400000000000002e-05, + "loss": 1.8077, + "step": 28 + }, + { + "epoch": 0.016079844746326587, + "grad_norm": 0.2805996239185333, + "learning_rate": 2.32e-05, + "loss": 1.7806, + "step": 29 + }, + { + "epoch": 0.01663432215137233, + "grad_norm": 0.2810855805873871, + "learning_rate": 2.4e-05, + "loss": 1.7969, + "step": 30 + }, + { + "epoch": 0.017188799556418076, + "grad_norm": 0.2863365411758423, + "learning_rate": 2.4800000000000003e-05, + "loss": 1.8053, + "step": 31 + }, + { + "epoch": 0.01774327696146382, + "grad_norm": 0.28471997380256653, + "learning_rate": 2.5600000000000002e-05, + "loss": 1.7866, + "step": 32 + }, + { + "epoch": 0.018297754366509565, + "grad_norm": 0.2715449631214142, + "learning_rate": 2.6400000000000005e-05, + "loss": 1.7546, + "step": 33 + }, + { + "epoch": 0.01885223177155531, + "grad_norm": 0.256256639957428, + "learning_rate": 2.7200000000000004e-05, + "loss": 1.7548, + "step": 34 + }, + { + "epoch": 0.019406709176601054, + "grad_norm": 0.26935723423957825, + "learning_rate": 2.8e-05, + "loss": 1.7742, + "step": 35 + }, + { + "epoch": 0.0199611865816468, + "grad_norm": 0.24071869254112244, + "learning_rate": 2.8800000000000002e-05, + "loss": 1.7903, + "step": 36 + }, + { + "epoch": 0.020515663986692544, + "grad_norm": 0.26816120743751526, + "learning_rate": 2.96e-05, + "loss": 1.83, + "step": 37 + }, + { + "epoch": 0.021070141391738288, + "grad_norm": 0.2523341774940491, + "learning_rate": 3.0400000000000004e-05, + "loss": 1.7478, + "step": 38 + }, + { + "epoch": 0.02162461879678403, + "grad_norm": 0.25324249267578125, + "learning_rate": 3.1200000000000006e-05, + "loss": 1.763, + "step": 39 + }, + { + "epoch": 0.022179096201829774, + "grad_norm": 0.2640393376350403, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.7722, + "step": 40 + }, + { + "epoch": 0.02273357360687552, + "grad_norm": 0.2618652880191803, + "learning_rate": 3.28e-05, + "loss": 1.693, + "step": 41 + }, + { + "epoch": 0.023288051011921263, + "grad_norm": 0.2571040093898773, + "learning_rate": 3.3600000000000004e-05, + "loss": 1.6989, + "step": 42 + }, + { + "epoch": 0.023842528416967008, + "grad_norm": 0.23714734613895416, + "learning_rate": 3.44e-05, + "loss": 1.769, + "step": 43 + }, + { + "epoch": 0.024397005822012752, + "grad_norm": 0.2353614717721939, + "learning_rate": 3.52e-05, + "loss": 1.7387, + "step": 44 + }, + { + "epoch": 0.024951483227058497, + "grad_norm": 0.2511078715324402, + "learning_rate": 3.6e-05, + "loss": 1.7478, + "step": 45 + }, + { + "epoch": 0.02550596063210424, + "grad_norm": 0.2509211003780365, + "learning_rate": 3.680000000000001e-05, + "loss": 1.7231, + "step": 46 + }, + { + "epoch": 0.026060438037149986, + "grad_norm": 0.2628484070301056, + "learning_rate": 3.76e-05, + "loss": 1.8243, + "step": 47 + }, + { + "epoch": 0.02661491544219573, + "grad_norm": 0.23542599380016327, + "learning_rate": 3.8400000000000005e-05, + "loss": 1.6862, + "step": 48 + }, + { + "epoch": 0.027169392847241475, + "grad_norm": 0.2473030835390091, + "learning_rate": 3.9200000000000004e-05, + "loss": 1.6858, + "step": 49 + }, + { + "epoch": 0.02772387025228722, + "grad_norm": 0.24878697097301483, + "learning_rate": 4e-05, + "loss": 1.7847, + "step": 50 + }, + { + "epoch": 0.028278347657332965, + "grad_norm": 0.23445457220077515, + "learning_rate": 4.08e-05, + "loss": 1.7048, + "step": 51 + }, + { + "epoch": 0.02883282506237871, + "grad_norm": 0.23827125132083893, + "learning_rate": 4.16e-05, + "loss": 1.6579, + "step": 52 + }, + { + "epoch": 0.029387302467424454, + "grad_norm": 0.2579360902309418, + "learning_rate": 4.240000000000001e-05, + "loss": 1.7823, + "step": 53 + }, + { + "epoch": 0.0299417798724702, + "grad_norm": 0.2626456320285797, + "learning_rate": 4.3200000000000007e-05, + "loss": 1.7795, + "step": 54 + }, + { + "epoch": 0.03049625727751594, + "grad_norm": 0.24642065167427063, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.7575, + "step": 55 + }, + { + "epoch": 0.031050734682561684, + "grad_norm": 0.24688799679279327, + "learning_rate": 4.4800000000000005e-05, + "loss": 1.7619, + "step": 56 + }, + { + "epoch": 0.03160521208760743, + "grad_norm": 0.24448993802070618, + "learning_rate": 4.56e-05, + "loss": 1.7114, + "step": 57 + }, + { + "epoch": 0.03215968949265317, + "grad_norm": 0.24135121703147888, + "learning_rate": 4.64e-05, + "loss": 1.7762, + "step": 58 + }, + { + "epoch": 0.03271416689769892, + "grad_norm": 0.24882449209690094, + "learning_rate": 4.72e-05, + "loss": 1.6934, + "step": 59 + }, + { + "epoch": 0.03326864430274466, + "grad_norm": 0.24142266809940338, + "learning_rate": 4.8e-05, + "loss": 1.6882, + "step": 60 + }, + { + "epoch": 0.03382312170779041, + "grad_norm": 0.2599814236164093, + "learning_rate": 4.88e-05, + "loss": 1.755, + "step": 61 + }, + { + "epoch": 0.03437759911283615, + "grad_norm": 0.2435712218284607, + "learning_rate": 4.9600000000000006e-05, + "loss": 1.7064, + "step": 62 + }, + { + "epoch": 0.034932076517881896, + "grad_norm": 0.25077876448631287, + "learning_rate": 5.0400000000000005e-05, + "loss": 1.6786, + "step": 63 + }, + { + "epoch": 0.03548655392292764, + "grad_norm": 0.25001490116119385, + "learning_rate": 5.1200000000000004e-05, + "loss": 1.6983, + "step": 64 + }, + { + "epoch": 0.036041031327973386, + "grad_norm": 0.24909815192222595, + "learning_rate": 5.2000000000000004e-05, + "loss": 1.6403, + "step": 65 + }, + { + "epoch": 0.03659550873301913, + "grad_norm": 0.2543540298938751, + "learning_rate": 5.280000000000001e-05, + "loss": 1.6812, + "step": 66 + }, + { + "epoch": 0.037149986138064875, + "grad_norm": 0.26552706956863403, + "learning_rate": 5.360000000000001e-05, + "loss": 1.7326, + "step": 67 + }, + { + "epoch": 0.03770446354311062, + "grad_norm": 0.24254681169986725, + "learning_rate": 5.440000000000001e-05, + "loss": 1.7514, + "step": 68 + }, + { + "epoch": 0.038258940948156364, + "grad_norm": 0.2548975944519043, + "learning_rate": 5.52e-05, + "loss": 1.6883, + "step": 69 + }, + { + "epoch": 0.03881341835320211, + "grad_norm": 0.25074952840805054, + "learning_rate": 5.6e-05, + "loss": 1.7236, + "step": 70 + }, + { + "epoch": 0.03936789575824785, + "grad_norm": 0.26199671626091003, + "learning_rate": 5.6800000000000005e-05, + "loss": 1.6943, + "step": 71 + }, + { + "epoch": 0.0399223731632936, + "grad_norm": 0.27566343545913696, + "learning_rate": 5.7600000000000004e-05, + "loss": 1.6859, + "step": 72 + }, + { + "epoch": 0.04047685056833934, + "grad_norm": 0.26068150997161865, + "learning_rate": 5.84e-05, + "loss": 1.7378, + "step": 73 + }, + { + "epoch": 0.04103132797338509, + "grad_norm": 0.2568175196647644, + "learning_rate": 5.92e-05, + "loss": 1.6894, + "step": 74 + }, + { + "epoch": 0.04158580537843083, + "grad_norm": 0.2596546709537506, + "learning_rate": 6.000000000000001e-05, + "loss": 1.7519, + "step": 75 + }, + { + "epoch": 0.042140282783476576, + "grad_norm": 0.2714129686355591, + "learning_rate": 6.080000000000001e-05, + "loss": 1.7159, + "step": 76 + }, + { + "epoch": 0.04269476018852232, + "grad_norm": 0.2649083733558655, + "learning_rate": 6.16e-05, + "loss": 1.7031, + "step": 77 + }, + { + "epoch": 0.04324923759356806, + "grad_norm": 0.30052971839904785, + "learning_rate": 6.240000000000001e-05, + "loss": 1.7358, + "step": 78 + }, + { + "epoch": 0.0438037149986138, + "grad_norm": 0.30009427666664124, + "learning_rate": 6.32e-05, + "loss": 1.6739, + "step": 79 + }, + { + "epoch": 0.04435819240365955, + "grad_norm": 0.27122125029563904, + "learning_rate": 6.400000000000001e-05, + "loss": 1.721, + "step": 80 + }, + { + "epoch": 0.04491266980870529, + "grad_norm": 0.30771657824516296, + "learning_rate": 6.48e-05, + "loss": 1.8069, + "step": 81 + }, + { + "epoch": 0.04546714721375104, + "grad_norm": 0.24620802700519562, + "learning_rate": 6.56e-05, + "loss": 1.6847, + "step": 82 + }, + { + "epoch": 0.04602162461879678, + "grad_norm": 0.2720058262348175, + "learning_rate": 6.64e-05, + "loss": 1.7179, + "step": 83 + }, + { + "epoch": 0.046576102023842526, + "grad_norm": 0.24807985126972198, + "learning_rate": 6.720000000000001e-05, + "loss": 1.6593, + "step": 84 + }, + { + "epoch": 0.04713057942888827, + "grad_norm": 0.2534205913543701, + "learning_rate": 6.8e-05, + "loss": 1.7475, + "step": 85 + }, + { + "epoch": 0.047685056833934016, + "grad_norm": 0.28452959656715393, + "learning_rate": 6.88e-05, + "loss": 1.6402, + "step": 86 + }, + { + "epoch": 0.04823953423897976, + "grad_norm": 0.2749129831790924, + "learning_rate": 6.960000000000001e-05, + "loss": 1.7025, + "step": 87 + }, + { + "epoch": 0.048794011644025505, + "grad_norm": 0.26466885209083557, + "learning_rate": 7.04e-05, + "loss": 1.7151, + "step": 88 + }, + { + "epoch": 0.04934848904907125, + "grad_norm": 0.32394999265670776, + "learning_rate": 7.120000000000001e-05, + "loss": 1.6569, + "step": 89 + }, + { + "epoch": 0.049902966454116994, + "grad_norm": 0.23801451921463013, + "learning_rate": 7.2e-05, + "loss": 1.7009, + "step": 90 + }, + { + "epoch": 0.05045744385916274, + "grad_norm": 0.33740007877349854, + "learning_rate": 7.280000000000001e-05, + "loss": 1.6932, + "step": 91 + }, + { + "epoch": 0.05101192126420848, + "grad_norm": 0.2623404562473297, + "learning_rate": 7.360000000000001e-05, + "loss": 1.7311, + "step": 92 + }, + { + "epoch": 0.05156639866925423, + "grad_norm": 0.292054146528244, + "learning_rate": 7.44e-05, + "loss": 1.6297, + "step": 93 + }, + { + "epoch": 0.05212087607429997, + "grad_norm": 0.3140726387500763, + "learning_rate": 7.52e-05, + "loss": 1.7442, + "step": 94 + }, + { + "epoch": 0.05267535347934572, + "grad_norm": 0.2532927989959717, + "learning_rate": 7.6e-05, + "loss": 1.736, + "step": 95 + }, + { + "epoch": 0.05322983088439146, + "grad_norm": 0.31100720167160034, + "learning_rate": 7.680000000000001e-05, + "loss": 1.6931, + "step": 96 + }, + { + "epoch": 0.053784308289437206, + "grad_norm": 0.275055855512619, + "learning_rate": 7.76e-05, + "loss": 1.6661, + "step": 97 + }, + { + "epoch": 0.05433878569448295, + "grad_norm": 0.3106204569339752, + "learning_rate": 7.840000000000001e-05, + "loss": 1.7421, + "step": 98 + }, + { + "epoch": 0.054893263099528695, + "grad_norm": 0.30006468296051025, + "learning_rate": 7.92e-05, + "loss": 1.686, + "step": 99 + }, + { + "epoch": 0.05544774050457444, + "grad_norm": 0.2701585292816162, + "learning_rate": 8e-05, + "loss": 1.6339, + "step": 100 + }, + { + "epoch": 0.056002217909620185, + "grad_norm": 0.31820419430732727, + "learning_rate": 7.999993193868717e-05, + "loss": 1.772, + "step": 101 + }, + { + "epoch": 0.05655669531466593, + "grad_norm": 0.25506505370140076, + "learning_rate": 7.999972775498027e-05, + "loss": 1.6857, + "step": 102 + }, + { + "epoch": 0.057111172719711674, + "grad_norm": 0.27725860476493835, + "learning_rate": 7.999938744957418e-05, + "loss": 1.756, + "step": 103 + }, + { + "epoch": 0.05766565012475742, + "grad_norm": 0.237404003739357, + "learning_rate": 7.999891102362694e-05, + "loss": 1.6907, + "step": 104 + }, + { + "epoch": 0.05822012752980316, + "grad_norm": 0.26032620668411255, + "learning_rate": 7.999829847875989e-05, + "loss": 1.6277, + "step": 105 + }, + { + "epoch": 0.05877460493484891, + "grad_norm": 0.23683027923107147, + "learning_rate": 7.999754981705756e-05, + "loss": 1.6519, + "step": 106 + }, + { + "epoch": 0.05932908233989465, + "grad_norm": 0.2389203906059265, + "learning_rate": 7.999666504106769e-05, + "loss": 1.6287, + "step": 107 + }, + { + "epoch": 0.0598835597449404, + "grad_norm": 0.25654640793800354, + "learning_rate": 7.999564415380122e-05, + "loss": 1.7226, + "step": 108 + }, + { + "epoch": 0.060438037149986135, + "grad_norm": 0.25321975350379944, + "learning_rate": 7.99944871587323e-05, + "loss": 1.6724, + "step": 109 + }, + { + "epoch": 0.06099251455503188, + "grad_norm": 0.25723859667778015, + "learning_rate": 7.999319405979828e-05, + "loss": 1.7587, + "step": 110 + }, + { + "epoch": 0.061546991960077624, + "grad_norm": 0.23742374777793884, + "learning_rate": 7.999176486139964e-05, + "loss": 1.7245, + "step": 111 + }, + { + "epoch": 0.06210146936512337, + "grad_norm": 0.23199070990085602, + "learning_rate": 7.999019956840004e-05, + "loss": 1.7447, + "step": 112 + }, + { + "epoch": 0.06265594677016911, + "grad_norm": 0.2442471832036972, + "learning_rate": 7.998849818612628e-05, + "loss": 1.8053, + "step": 113 + }, + { + "epoch": 0.06321042417521486, + "grad_norm": 0.22759103775024414, + "learning_rate": 7.998666072036827e-05, + "loss": 1.6593, + "step": 114 + }, + { + "epoch": 0.0637649015802606, + "grad_norm": 0.23776134848594666, + "learning_rate": 7.998468717737903e-05, + "loss": 1.7303, + "step": 115 + }, + { + "epoch": 0.06431937898530635, + "grad_norm": 0.2140997052192688, + "learning_rate": 7.998257756387466e-05, + "loss": 1.6146, + "step": 116 + }, + { + "epoch": 0.06487385639035209, + "grad_norm": 0.2312529981136322, + "learning_rate": 7.99803318870343e-05, + "loss": 1.691, + "step": 117 + }, + { + "epoch": 0.06542833379539784, + "grad_norm": 0.23457646369934082, + "learning_rate": 7.997795015450015e-05, + "loss": 1.6844, + "step": 118 + }, + { + "epoch": 0.06598281120044358, + "grad_norm": 0.25258150696754456, + "learning_rate": 7.997543237437738e-05, + "loss": 1.7305, + "step": 119 + }, + { + "epoch": 0.06653728860548933, + "grad_norm": 0.21662160754203796, + "learning_rate": 7.99727785552342e-05, + "loss": 1.6637, + "step": 120 + }, + { + "epoch": 0.06709176601053507, + "grad_norm": 0.23025107383728027, + "learning_rate": 7.99699887061017e-05, + "loss": 1.7312, + "step": 121 + }, + { + "epoch": 0.06764624341558081, + "grad_norm": 0.2787688374519348, + "learning_rate": 7.996706283647393e-05, + "loss": 1.7159, + "step": 122 + }, + { + "epoch": 0.06820072082062656, + "grad_norm": 0.26639994978904724, + "learning_rate": 7.996400095630781e-05, + "loss": 1.7311, + "step": 123 + }, + { + "epoch": 0.0687551982256723, + "grad_norm": 0.28086450695991516, + "learning_rate": 7.996080307602312e-05, + "loss": 1.6401, + "step": 124 + }, + { + "epoch": 0.06930967563071805, + "grad_norm": 0.24936339259147644, + "learning_rate": 7.995746920650248e-05, + "loss": 1.6197, + "step": 125 + }, + { + "epoch": 0.06986415303576379, + "grad_norm": 0.263729065656662, + "learning_rate": 7.995399935909122e-05, + "loss": 1.6693, + "step": 126 + }, + { + "epoch": 0.07041863044080954, + "grad_norm": 0.3544696271419525, + "learning_rate": 7.99503935455975e-05, + "loss": 1.6949, + "step": 127 + }, + { + "epoch": 0.07097310784585528, + "grad_norm": 0.23669736087322235, + "learning_rate": 7.994665177829211e-05, + "loss": 1.6262, + "step": 128 + }, + { + "epoch": 0.07152758525090103, + "grad_norm": 0.3165992498397827, + "learning_rate": 7.994277406990857e-05, + "loss": 1.6604, + "step": 129 + }, + { + "epoch": 0.07208206265594677, + "grad_norm": 0.29037636518478394, + "learning_rate": 7.993876043364294e-05, + "loss": 1.5977, + "step": 130 + }, + { + "epoch": 0.07263654006099252, + "grad_norm": 0.26414918899536133, + "learning_rate": 7.993461088315389e-05, + "loss": 1.7664, + "step": 131 + }, + { + "epoch": 0.07319101746603826, + "grad_norm": 0.3492623567581177, + "learning_rate": 7.993032543256263e-05, + "loss": 1.7055, + "step": 132 + }, + { + "epoch": 0.073745494871084, + "grad_norm": 0.2867588698863983, + "learning_rate": 7.992590409645282e-05, + "loss": 1.8092, + "step": 133 + }, + { + "epoch": 0.07429997227612975, + "grad_norm": 0.248403400182724, + "learning_rate": 7.992134688987056e-05, + "loss": 1.6885, + "step": 134 + }, + { + "epoch": 0.0748544496811755, + "grad_norm": 0.306547075510025, + "learning_rate": 7.991665382832433e-05, + "loss": 1.7218, + "step": 135 + }, + { + "epoch": 0.07540892708622124, + "grad_norm": 0.25006142258644104, + "learning_rate": 7.99118249277849e-05, + "loss": 1.6728, + "step": 136 + }, + { + "epoch": 0.07596340449126698, + "grad_norm": 0.29479724168777466, + "learning_rate": 7.990686020468536e-05, + "loss": 1.6751, + "step": 137 + }, + { + "epoch": 0.07651788189631273, + "grad_norm": 0.23760248720645905, + "learning_rate": 7.990175967592098e-05, + "loss": 1.625, + "step": 138 + }, + { + "epoch": 0.07707235930135847, + "grad_norm": 0.30315011739730835, + "learning_rate": 7.98965233588492e-05, + "loss": 1.6881, + "step": 139 + }, + { + "epoch": 0.07762683670640422, + "grad_norm": 0.22332574427127838, + "learning_rate": 7.989115127128955e-05, + "loss": 1.6179, + "step": 140 + }, + { + "epoch": 0.07818131411144996, + "grad_norm": 0.28878095746040344, + "learning_rate": 7.98856434315236e-05, + "loss": 1.7134, + "step": 141 + }, + { + "epoch": 0.0787357915164957, + "grad_norm": 0.231205016374588, + "learning_rate": 7.987999985829486e-05, + "loss": 1.6805, + "step": 142 + }, + { + "epoch": 0.07929026892154145, + "grad_norm": 0.29243049025535583, + "learning_rate": 7.987422057080881e-05, + "loss": 1.7045, + "step": 143 + }, + { + "epoch": 0.0798447463265872, + "grad_norm": 0.23502500355243683, + "learning_rate": 7.986830558873275e-05, + "loss": 1.6984, + "step": 144 + }, + { + "epoch": 0.08039922373163294, + "grad_norm": 0.23500199615955353, + "learning_rate": 7.986225493219573e-05, + "loss": 1.5982, + "step": 145 + }, + { + "epoch": 0.08095370113667869, + "grad_norm": 0.2353154569864273, + "learning_rate": 7.985606862178855e-05, + "loss": 1.7826, + "step": 146 + }, + { + "epoch": 0.08150817854172443, + "grad_norm": 0.2630767524242401, + "learning_rate": 7.984974667856362e-05, + "loss": 1.7355, + "step": 147 + }, + { + "epoch": 0.08206265594677017, + "grad_norm": 0.221221461892128, + "learning_rate": 7.984328912403494e-05, + "loss": 1.7203, + "step": 148 + }, + { + "epoch": 0.08261713335181592, + "grad_norm": 0.23368635773658752, + "learning_rate": 7.983669598017798e-05, + "loss": 1.6725, + "step": 149 + }, + { + "epoch": 0.08317161075686166, + "grad_norm": 0.22409488260746002, + "learning_rate": 7.982996726942963e-05, + "loss": 1.685, + "step": 150 + }, + { + "epoch": 0.08372608816190741, + "grad_norm": 0.2248145341873169, + "learning_rate": 7.982310301468815e-05, + "loss": 1.6743, + "step": 151 + }, + { + "epoch": 0.08428056556695315, + "grad_norm": 0.2202092409133911, + "learning_rate": 7.981610323931306e-05, + "loss": 1.6568, + "step": 152 + }, + { + "epoch": 0.0848350429719989, + "grad_norm": 0.2243170440196991, + "learning_rate": 7.980896796712504e-05, + "loss": 1.7107, + "step": 153 + }, + { + "epoch": 0.08538952037704464, + "grad_norm": 0.23059862852096558, + "learning_rate": 7.980169722240589e-05, + "loss": 1.6998, + "step": 154 + }, + { + "epoch": 0.08594399778209039, + "grad_norm": 0.2216973900794983, + "learning_rate": 7.979429102989842e-05, + "loss": 1.6527, + "step": 155 + }, + { + "epoch": 0.08649847518713612, + "grad_norm": 0.23553825914859772, + "learning_rate": 7.978674941480643e-05, + "loss": 1.6883, + "step": 156 + }, + { + "epoch": 0.08705295259218186, + "grad_norm": 0.2300165742635727, + "learning_rate": 7.977907240279449e-05, + "loss": 1.6638, + "step": 157 + }, + { + "epoch": 0.0876074299972276, + "grad_norm": 0.23822426795959473, + "learning_rate": 7.977126001998798e-05, + "loss": 1.6435, + "step": 158 + }, + { + "epoch": 0.08816190740227335, + "grad_norm": 0.2425059974193573, + "learning_rate": 7.976331229297298e-05, + "loss": 1.6745, + "step": 159 + }, + { + "epoch": 0.0887163848073191, + "grad_norm": 0.26218435168266296, + "learning_rate": 7.975522924879609e-05, + "loss": 1.6546, + "step": 160 + }, + { + "epoch": 0.08927086221236484, + "grad_norm": 0.23087595403194427, + "learning_rate": 7.974701091496448e-05, + "loss": 1.6581, + "step": 161 + }, + { + "epoch": 0.08982533961741059, + "grad_norm": 0.24700067937374115, + "learning_rate": 7.973865731944565e-05, + "loss": 1.7501, + "step": 162 + }, + { + "epoch": 0.09037981702245633, + "grad_norm": 0.22620323300361633, + "learning_rate": 7.973016849066742e-05, + "loss": 1.698, + "step": 163 + }, + { + "epoch": 0.09093429442750207, + "grad_norm": 0.21732601523399353, + "learning_rate": 7.972154445751788e-05, + "loss": 1.622, + "step": 164 + }, + { + "epoch": 0.09148877183254782, + "grad_norm": 0.22213229537010193, + "learning_rate": 7.971278524934515e-05, + "loss": 1.7224, + "step": 165 + }, + { + "epoch": 0.09204324923759356, + "grad_norm": 0.27407029271125793, + "learning_rate": 7.970389089595738e-05, + "loss": 1.679, + "step": 166 + }, + { + "epoch": 0.09259772664263931, + "grad_norm": 0.24043461680412292, + "learning_rate": 7.969486142762266e-05, + "loss": 1.6364, + "step": 167 + }, + { + "epoch": 0.09315220404768505, + "grad_norm": 0.22668756544589996, + "learning_rate": 7.968569687506886e-05, + "loss": 1.6504, + "step": 168 + }, + { + "epoch": 0.0937066814527308, + "grad_norm": 0.2548286020755768, + "learning_rate": 7.967639726948355e-05, + "loss": 1.6703, + "step": 169 + }, + { + "epoch": 0.09426115885777654, + "grad_norm": 0.2330275923013687, + "learning_rate": 7.96669626425139e-05, + "loss": 1.691, + "step": 170 + }, + { + "epoch": 0.09481563626282229, + "grad_norm": 0.2295777052640915, + "learning_rate": 7.965739302626656e-05, + "loss": 1.66, + "step": 171 + }, + { + "epoch": 0.09537011366786803, + "grad_norm": 0.23224836587905884, + "learning_rate": 7.964768845330756e-05, + "loss": 1.6317, + "step": 172 + }, + { + "epoch": 0.09592459107291378, + "grad_norm": 0.21784305572509766, + "learning_rate": 7.963784895666221e-05, + "loss": 1.674, + "step": 173 + }, + { + "epoch": 0.09647906847795952, + "grad_norm": 0.25221794843673706, + "learning_rate": 7.962787456981498e-05, + "loss": 1.6848, + "step": 174 + }, + { + "epoch": 0.09703354588300526, + "grad_norm": 0.2345496118068695, + "learning_rate": 7.961776532670931e-05, + "loss": 1.6349, + "step": 175 + }, + { + "epoch": 0.09758802328805101, + "grad_norm": 0.23866072297096252, + "learning_rate": 7.960752126174765e-05, + "loss": 1.6078, + "step": 176 + }, + { + "epoch": 0.09814250069309675, + "grad_norm": 0.2709258794784546, + "learning_rate": 7.959714240979124e-05, + "loss": 1.6811, + "step": 177 + }, + { + "epoch": 0.0986969780981425, + "grad_norm": 0.22360965609550476, + "learning_rate": 7.958662880615997e-05, + "loss": 1.633, + "step": 178 + }, + { + "epoch": 0.09925145550318824, + "grad_norm": 0.25358742475509644, + "learning_rate": 7.957598048663234e-05, + "loss": 1.6449, + "step": 179 + }, + { + "epoch": 0.09980593290823399, + "grad_norm": 0.2268538922071457, + "learning_rate": 7.956519748744525e-05, + "loss": 1.6675, + "step": 180 + }, + { + "epoch": 0.10036041031327973, + "grad_norm": 0.26430633664131165, + "learning_rate": 7.9554279845294e-05, + "loss": 1.7128, + "step": 181 + }, + { + "epoch": 0.10091488771832548, + "grad_norm": 0.22235046327114105, + "learning_rate": 7.9543227597332e-05, + "loss": 1.6033, + "step": 182 + }, + { + "epoch": 0.10146936512337122, + "grad_norm": 0.26423388719558716, + "learning_rate": 7.953204078117081e-05, + "loss": 1.6274, + "step": 183 + }, + { + "epoch": 0.10202384252841697, + "grad_norm": 0.22570672631263733, + "learning_rate": 7.952071943487987e-05, + "loss": 1.6938, + "step": 184 + }, + { + "epoch": 0.10257831993346271, + "grad_norm": 0.25327280163764954, + "learning_rate": 7.95092635969865e-05, + "loss": 1.7074, + "step": 185 + }, + { + "epoch": 0.10313279733850846, + "grad_norm": 0.21103128790855408, + "learning_rate": 7.949767330647562e-05, + "loss": 1.6202, + "step": 186 + }, + { + "epoch": 0.1036872747435542, + "grad_norm": 0.24825690686702728, + "learning_rate": 7.94859486027898e-05, + "loss": 1.6757, + "step": 187 + }, + { + "epoch": 0.10424175214859994, + "grad_norm": 0.2204912304878235, + "learning_rate": 7.947408952582892e-05, + "loss": 1.6779, + "step": 188 + }, + { + "epoch": 0.10479622955364569, + "grad_norm": 0.26558375358581543, + "learning_rate": 7.946209611595026e-05, + "loss": 1.6829, + "step": 189 + }, + { + "epoch": 0.10535070695869143, + "grad_norm": 0.2372429221868515, + "learning_rate": 7.944996841396815e-05, + "loss": 1.6825, + "step": 190 + }, + { + "epoch": 0.10590518436373718, + "grad_norm": 0.21156929433345795, + "learning_rate": 7.943770646115396e-05, + "loss": 1.6063, + "step": 191 + }, + { + "epoch": 0.10645966176878292, + "grad_norm": 0.21675823628902435, + "learning_rate": 7.94253102992359e-05, + "loss": 1.612, + "step": 192 + }, + { + "epoch": 0.10701413917382867, + "grad_norm": 0.2156941145658493, + "learning_rate": 7.941277997039894e-05, + "loss": 1.6382, + "step": 193 + }, + { + "epoch": 0.10756861657887441, + "grad_norm": 0.20940068364143372, + "learning_rate": 7.940011551728463e-05, + "loss": 1.6607, + "step": 194 + }, + { + "epoch": 0.10812309398392016, + "grad_norm": 0.21804174780845642, + "learning_rate": 7.93873169829909e-05, + "loss": 1.6386, + "step": 195 + }, + { + "epoch": 0.1086775713889659, + "grad_norm": 0.21801279485225677, + "learning_rate": 7.937438441107203e-05, + "loss": 1.6612, + "step": 196 + }, + { + "epoch": 0.10923204879401165, + "grad_norm": 0.254651814699173, + "learning_rate": 7.93613178455384e-05, + "loss": 1.6548, + "step": 197 + }, + { + "epoch": 0.10978652619905739, + "grad_norm": 0.20725683867931366, + "learning_rate": 7.934811733085637e-05, + "loss": 1.6508, + "step": 198 + }, + { + "epoch": 0.11034100360410314, + "grad_norm": 0.25397831201553345, + "learning_rate": 7.933478291194821e-05, + "loss": 1.6434, + "step": 199 + }, + { + "epoch": 0.11089548100914888, + "grad_norm": 0.21976791322231293, + "learning_rate": 7.932131463419177e-05, + "loss": 1.6073, + "step": 200 + }, + { + "epoch": 0.11144995841419462, + "grad_norm": 0.23310157656669617, + "learning_rate": 7.930771254342051e-05, + "loss": 1.6755, + "step": 201 + }, + { + "epoch": 0.11200443581924037, + "grad_norm": 0.24408988654613495, + "learning_rate": 7.929397668592323e-05, + "loss": 1.6681, + "step": 202 + }, + { + "epoch": 0.11255891322428611, + "grad_norm": 0.22306199371814728, + "learning_rate": 7.928010710844397e-05, + "loss": 1.6328, + "step": 203 + }, + { + "epoch": 0.11311339062933186, + "grad_norm": 0.2562360465526581, + "learning_rate": 7.926610385818179e-05, + "loss": 1.639, + "step": 204 + }, + { + "epoch": 0.1136678680343776, + "grad_norm": 0.22006537020206451, + "learning_rate": 7.925196698279068e-05, + "loss": 1.6636, + "step": 205 + }, + { + "epoch": 0.11422234543942335, + "grad_norm": 0.23495469987392426, + "learning_rate": 7.923769653037935e-05, + "loss": 1.6408, + "step": 206 + }, + { + "epoch": 0.11477682284446909, + "grad_norm": 0.215364009141922, + "learning_rate": 7.92232925495111e-05, + "loss": 1.6994, + "step": 207 + }, + { + "epoch": 0.11533130024951484, + "grad_norm": 0.2487768530845642, + "learning_rate": 7.920875508920361e-05, + "loss": 1.6307, + "step": 208 + }, + { + "epoch": 0.11588577765456058, + "grad_norm": 0.2132566124200821, + "learning_rate": 7.919408419892881e-05, + "loss": 1.6538, + "step": 209 + }, + { + "epoch": 0.11644025505960633, + "grad_norm": 0.22761502861976624, + "learning_rate": 7.917927992861272e-05, + "loss": 1.6135, + "step": 210 + }, + { + "epoch": 0.11699473246465207, + "grad_norm": 0.22848926484584808, + "learning_rate": 7.916434232863522e-05, + "loss": 1.7293, + "step": 211 + }, + { + "epoch": 0.11754920986969782, + "grad_norm": 0.21311335265636444, + "learning_rate": 7.914927144982995e-05, + "loss": 1.6792, + "step": 212 + }, + { + "epoch": 0.11810368727474356, + "grad_norm": 0.23164795339107513, + "learning_rate": 7.913406734348412e-05, + "loss": 1.5472, + "step": 213 + }, + { + "epoch": 0.1186581646797893, + "grad_norm": 0.21141847968101501, + "learning_rate": 7.911873006133827e-05, + "loss": 1.6259, + "step": 214 + }, + { + "epoch": 0.11921264208483505, + "grad_norm": 0.2537563443183899, + "learning_rate": 7.910325965558621e-05, + "loss": 1.7058, + "step": 215 + }, + { + "epoch": 0.1197671194898808, + "grad_norm": 0.2353929728269577, + "learning_rate": 7.908765617887473e-05, + "loss": 1.6479, + "step": 216 + }, + { + "epoch": 0.12032159689492654, + "grad_norm": 0.20736263692378998, + "learning_rate": 7.907191968430347e-05, + "loss": 1.603, + "step": 217 + }, + { + "epoch": 0.12087607429997227, + "grad_norm": 0.23365424573421478, + "learning_rate": 7.905605022542478e-05, + "loss": 1.6792, + "step": 218 + }, + { + "epoch": 0.12143055170501801, + "grad_norm": 0.22553478181362152, + "learning_rate": 7.904004785624345e-05, + "loss": 1.7089, + "step": 219 + }, + { + "epoch": 0.12198502911006376, + "grad_norm": 0.2107611447572708, + "learning_rate": 7.902391263121662e-05, + "loss": 1.6541, + "step": 220 + }, + { + "epoch": 0.1225395065151095, + "grad_norm": 0.21160966157913208, + "learning_rate": 7.900764460525349e-05, + "loss": 1.6459, + "step": 221 + }, + { + "epoch": 0.12309398392015525, + "grad_norm": 0.21517297625541687, + "learning_rate": 7.899124383371524e-05, + "loss": 1.6354, + "step": 222 + }, + { + "epoch": 0.12364846132520099, + "grad_norm": 0.21948044002056122, + "learning_rate": 7.897471037241476e-05, + "loss": 1.7011, + "step": 223 + }, + { + "epoch": 0.12420293873024674, + "grad_norm": 0.21453773975372314, + "learning_rate": 7.895804427761651e-05, + "loss": 1.6508, + "step": 224 + }, + { + "epoch": 0.12475741613529248, + "grad_norm": 0.23449410498142242, + "learning_rate": 7.894124560603631e-05, + "loss": 1.6891, + "step": 225 + }, + { + "epoch": 0.12531189354033823, + "grad_norm": 0.22934886813163757, + "learning_rate": 7.892431441484113e-05, + "loss": 1.6989, + "step": 226 + }, + { + "epoch": 0.12586637094538397, + "grad_norm": 0.24871671199798584, + "learning_rate": 7.890725076164894e-05, + "loss": 1.6596, + "step": 227 + }, + { + "epoch": 0.12642084835042972, + "grad_norm": 0.21111659705638885, + "learning_rate": 7.889005470452845e-05, + "loss": 1.6158, + "step": 228 + }, + { + "epoch": 0.12697532575547546, + "grad_norm": 0.24391771852970123, + "learning_rate": 7.8872726301999e-05, + "loss": 1.6943, + "step": 229 + }, + { + "epoch": 0.1275298031605212, + "grad_norm": 0.2066636085510254, + "learning_rate": 7.885526561303024e-05, + "loss": 1.6627, + "step": 230 + }, + { + "epoch": 0.12808428056556695, + "grad_norm": 0.2221907377243042, + "learning_rate": 7.883767269704209e-05, + "loss": 1.6358, + "step": 231 + }, + { + "epoch": 0.1286387579706127, + "grad_norm": 0.21195903420448303, + "learning_rate": 7.881994761390437e-05, + "loss": 1.6259, + "step": 232 + }, + { + "epoch": 0.12919323537565844, + "grad_norm": 0.22241343557834625, + "learning_rate": 7.88020904239367e-05, + "loss": 1.7442, + "step": 233 + }, + { + "epoch": 0.12974771278070418, + "grad_norm": 0.20912249386310577, + "learning_rate": 7.878410118790827e-05, + "loss": 1.6546, + "step": 234 + }, + { + "epoch": 0.13030219018574993, + "grad_norm": 0.21954858303070068, + "learning_rate": 7.876597996703763e-05, + "loss": 1.6339, + "step": 235 + }, + { + "epoch": 0.13085666759079567, + "grad_norm": 0.23016227781772614, + "learning_rate": 7.874772682299251e-05, + "loss": 1.6333, + "step": 236 + }, + { + "epoch": 0.13141114499584142, + "grad_norm": 0.22401848435401917, + "learning_rate": 7.872934181788953e-05, + "loss": 1.7135, + "step": 237 + }, + { + "epoch": 0.13196562240088716, + "grad_norm": 0.23134422302246094, + "learning_rate": 7.871082501429409e-05, + "loss": 1.704, + "step": 238 + }, + { + "epoch": 0.1325200998059329, + "grad_norm": 0.24263045191764832, + "learning_rate": 7.869217647522006e-05, + "loss": 1.7007, + "step": 239 + }, + { + "epoch": 0.13307457721097865, + "grad_norm": 0.21756958961486816, + "learning_rate": 7.867339626412965e-05, + "loss": 1.6961, + "step": 240 + }, + { + "epoch": 0.1336290546160244, + "grad_norm": 0.23123596608638763, + "learning_rate": 7.865448444493317e-05, + "loss": 1.6698, + "step": 241 + }, + { + "epoch": 0.13418353202107014, + "grad_norm": 0.21551208198070526, + "learning_rate": 7.863544108198877e-05, + "loss": 1.6982, + "step": 242 + }, + { + "epoch": 0.13473800942611588, + "grad_norm": 0.22842957079410553, + "learning_rate": 7.861626624010226e-05, + "loss": 1.5909, + "step": 243 + }, + { + "epoch": 0.13529248683116163, + "grad_norm": 0.22002427279949188, + "learning_rate": 7.85969599845269e-05, + "loss": 1.649, + "step": 244 + }, + { + "epoch": 0.13584696423620737, + "grad_norm": 0.23170796036720276, + "learning_rate": 7.857752238096313e-05, + "loss": 1.685, + "step": 245 + }, + { + "epoch": 0.13640144164125312, + "grad_norm": 0.21300175786018372, + "learning_rate": 7.855795349555839e-05, + "loss": 1.6501, + "step": 246 + }, + { + "epoch": 0.13695591904629886, + "grad_norm": 0.2336280345916748, + "learning_rate": 7.853825339490689e-05, + "loss": 1.6482, + "step": 247 + }, + { + "epoch": 0.1375103964513446, + "grad_norm": 0.23497241735458374, + "learning_rate": 7.851842214604937e-05, + "loss": 1.5985, + "step": 248 + }, + { + "epoch": 0.13806487385639035, + "grad_norm": 0.21944530308246613, + "learning_rate": 7.849845981647285e-05, + "loss": 1.6658, + "step": 249 + }, + { + "epoch": 0.1386193512614361, + "grad_norm": 0.21688306331634521, + "learning_rate": 7.847836647411049e-05, + "loss": 1.6014, + "step": 250 + }, + { + "epoch": 0.13917382866648184, + "grad_norm": 0.21929487586021423, + "learning_rate": 7.84581421873412e-05, + "loss": 1.6299, + "step": 251 + }, + { + "epoch": 0.13972830607152759, + "grad_norm": 0.21958981454372406, + "learning_rate": 7.843778702498961e-05, + "loss": 1.6961, + "step": 252 + }, + { + "epoch": 0.14028278347657333, + "grad_norm": 0.21822874248027802, + "learning_rate": 7.841730105632563e-05, + "loss": 1.6779, + "step": 253 + }, + { + "epoch": 0.14083726088161908, + "grad_norm": 0.21482256054878235, + "learning_rate": 7.839668435106437e-05, + "loss": 1.6304, + "step": 254 + }, + { + "epoch": 0.14139173828666482, + "grad_norm": 0.22191710770130157, + "learning_rate": 7.837593697936582e-05, + "loss": 1.6461, + "step": 255 + }, + { + "epoch": 0.14194621569171056, + "grad_norm": 0.227654829621315, + "learning_rate": 7.835505901183468e-05, + "loss": 1.7217, + "step": 256 + }, + { + "epoch": 0.1425006930967563, + "grad_norm": 0.2576013505458832, + "learning_rate": 7.833405051952002e-05, + "loss": 1.7567, + "step": 257 + }, + { + "epoch": 0.14305517050180205, + "grad_norm": 0.20585085451602936, + "learning_rate": 7.831291157391513e-05, + "loss": 1.6422, + "step": 258 + }, + { + "epoch": 0.1436096479068478, + "grad_norm": 0.22537145018577576, + "learning_rate": 7.82916422469572e-05, + "loss": 1.6604, + "step": 259 + }, + { + "epoch": 0.14416412531189354, + "grad_norm": 0.23208315670490265, + "learning_rate": 7.827024261102718e-05, + "loss": 1.615, + "step": 260 + }, + { + "epoch": 0.1447186027169393, + "grad_norm": 0.2133701741695404, + "learning_rate": 7.824871273894943e-05, + "loss": 1.6413, + "step": 261 + }, + { + "epoch": 0.14527308012198503, + "grad_norm": 0.2303103655576706, + "learning_rate": 7.82270527039915e-05, + "loss": 1.661, + "step": 262 + }, + { + "epoch": 0.14582755752703078, + "grad_norm": 0.22526273131370544, + "learning_rate": 7.820526257986393e-05, + "loss": 1.6806, + "step": 263 + }, + { + "epoch": 0.14638203493207652, + "grad_norm": 0.22275319695472717, + "learning_rate": 7.818334244071994e-05, + "loss": 1.7037, + "step": 264 + }, + { + "epoch": 0.14693651233712227, + "grad_norm": 0.22790545225143433, + "learning_rate": 7.81612923611552e-05, + "loss": 1.654, + "step": 265 + }, + { + "epoch": 0.147490989742168, + "grad_norm": 0.22253917157649994, + "learning_rate": 7.813911241620755e-05, + "loss": 1.6492, + "step": 266 + }, + { + "epoch": 0.14804546714721376, + "grad_norm": 0.22290851175785065, + "learning_rate": 7.811680268135684e-05, + "loss": 1.6355, + "step": 267 + }, + { + "epoch": 0.1485999445522595, + "grad_norm": 0.22694170475006104, + "learning_rate": 7.809436323252456e-05, + "loss": 1.622, + "step": 268 + }, + { + "epoch": 0.14915442195730524, + "grad_norm": 0.21007151901721954, + "learning_rate": 7.80717941460736e-05, + "loss": 1.6312, + "step": 269 + }, + { + "epoch": 0.149708899362351, + "grad_norm": 0.2363370954990387, + "learning_rate": 7.804909549880806e-05, + "loss": 1.6222, + "step": 270 + }, + { + "epoch": 0.15026337676739673, + "grad_norm": 0.26099053025245667, + "learning_rate": 7.802626736797292e-05, + "loss": 1.6424, + "step": 271 + }, + { + "epoch": 0.15081785417244248, + "grad_norm": 0.21376170217990875, + "learning_rate": 7.800330983125381e-05, + "loss": 1.6055, + "step": 272 + }, + { + "epoch": 0.15137233157748822, + "grad_norm": 0.26935890316963196, + "learning_rate": 7.798022296677675e-05, + "loss": 1.6858, + "step": 273 + }, + { + "epoch": 0.15192680898253397, + "grad_norm": 0.22003325819969177, + "learning_rate": 7.795700685310783e-05, + "loss": 1.6803, + "step": 274 + }, + { + "epoch": 0.1524812863875797, + "grad_norm": 0.25127747654914856, + "learning_rate": 7.793366156925302e-05, + "loss": 1.5456, + "step": 275 + }, + { + "epoch": 0.15303576379262546, + "grad_norm": 0.21749310195446014, + "learning_rate": 7.791018719465785e-05, + "loss": 1.6904, + "step": 276 + }, + { + "epoch": 0.1535902411976712, + "grad_norm": 0.2340991199016571, + "learning_rate": 7.788658380920716e-05, + "loss": 1.6946, + "step": 277 + }, + { + "epoch": 0.15414471860271695, + "grad_norm": 0.21612657606601715, + "learning_rate": 7.786285149322483e-05, + "loss": 1.5836, + "step": 278 + }, + { + "epoch": 0.1546991960077627, + "grad_norm": 0.2601798474788666, + "learning_rate": 7.783899032747346e-05, + "loss": 1.679, + "step": 279 + }, + { + "epoch": 0.15525367341280844, + "grad_norm": 0.24051488935947418, + "learning_rate": 7.78150003931542e-05, + "loss": 1.6504, + "step": 280 + }, + { + "epoch": 0.15580815081785418, + "grad_norm": 0.22417373955249786, + "learning_rate": 7.779088177190636e-05, + "loss": 1.6767, + "step": 281 + }, + { + "epoch": 0.15636262822289992, + "grad_norm": 0.23958833515644073, + "learning_rate": 7.776663454580718e-05, + "loss": 1.6636, + "step": 282 + }, + { + "epoch": 0.15691710562794567, + "grad_norm": 0.22947099804878235, + "learning_rate": 7.774225879737156e-05, + "loss": 1.6653, + "step": 283 + }, + { + "epoch": 0.1574715830329914, + "grad_norm": 0.23993003368377686, + "learning_rate": 7.771775460955178e-05, + "loss": 1.6637, + "step": 284 + }, + { + "epoch": 0.15802606043803716, + "grad_norm": 0.2175922393798828, + "learning_rate": 7.76931220657372e-05, + "loss": 1.6086, + "step": 285 + }, + { + "epoch": 0.1585805378430829, + "grad_norm": 0.22795039415359497, + "learning_rate": 7.766836124975399e-05, + "loss": 1.6274, + "step": 286 + }, + { + "epoch": 0.15913501524812865, + "grad_norm": 0.2554066479206085, + "learning_rate": 7.764347224586482e-05, + "loss": 1.6059, + "step": 287 + }, + { + "epoch": 0.1596894926531744, + "grad_norm": 0.2622605860233307, + "learning_rate": 7.761845513876861e-05, + "loss": 1.6815, + "step": 288 + }, + { + "epoch": 0.16024397005822014, + "grad_norm": 0.25482630729675293, + "learning_rate": 7.759331001360021e-05, + "loss": 1.7245, + "step": 289 + }, + { + "epoch": 0.16079844746326588, + "grad_norm": 0.23500695824623108, + "learning_rate": 7.756803695593015e-05, + "loss": 1.6039, + "step": 290 + }, + { + "epoch": 0.16135292486831163, + "grad_norm": 0.24643713235855103, + "learning_rate": 7.754263605176429e-05, + "loss": 1.6804, + "step": 291 + }, + { + "epoch": 0.16190740227335737, + "grad_norm": 0.301843523979187, + "learning_rate": 7.751710738754357e-05, + "loss": 1.6837, + "step": 292 + }, + { + "epoch": 0.16246187967840311, + "grad_norm": 0.22402901947498322, + "learning_rate": 7.749145105014372e-05, + "loss": 1.7017, + "step": 293 + }, + { + "epoch": 0.16301635708344886, + "grad_norm": 0.30275362730026245, + "learning_rate": 7.746566712687493e-05, + "loss": 1.7249, + "step": 294 + }, + { + "epoch": 0.1635708344884946, + "grad_norm": 0.20885106921195984, + "learning_rate": 7.74397557054816e-05, + "loss": 1.6964, + "step": 295 + }, + { + "epoch": 0.16412531189354035, + "grad_norm": 0.28100892901420593, + "learning_rate": 7.741371687414198e-05, + "loss": 1.6428, + "step": 296 + }, + { + "epoch": 0.1646797892985861, + "grad_norm": 0.24655266106128693, + "learning_rate": 7.738755072146794e-05, + "loss": 1.638, + "step": 297 + }, + { + "epoch": 0.16523426670363184, + "grad_norm": 0.2542823255062103, + "learning_rate": 7.736125733650461e-05, + "loss": 1.6958, + "step": 298 + }, + { + "epoch": 0.16578874410867758, + "grad_norm": 0.24665483832359314, + "learning_rate": 7.733483680873009e-05, + "loss": 1.6095, + "step": 299 + }, + { + "epoch": 0.16634322151372333, + "grad_norm": 0.25384095311164856, + "learning_rate": 7.730828922805517e-05, + "loss": 1.6943, + "step": 300 + }, + { + "epoch": 0.16689769891876907, + "grad_norm": 0.26568636298179626, + "learning_rate": 7.728161468482304e-05, + "loss": 1.693, + "step": 301 + }, + { + "epoch": 0.16745217632381482, + "grad_norm": 0.2455059140920639, + "learning_rate": 7.72548132698089e-05, + "loss": 1.6415, + "step": 302 + }, + { + "epoch": 0.16800665372886056, + "grad_norm": 0.24293363094329834, + "learning_rate": 7.722788507421971e-05, + "loss": 1.6402, + "step": 303 + }, + { + "epoch": 0.1685611311339063, + "grad_norm": 0.22879062592983246, + "learning_rate": 7.720083018969393e-05, + "loss": 1.6153, + "step": 304 + }, + { + "epoch": 0.16911560853895205, + "grad_norm": 0.21951867640018463, + "learning_rate": 7.717364870830107e-05, + "loss": 1.6482, + "step": 305 + }, + { + "epoch": 0.1696700859439978, + "grad_norm": 0.2079135924577713, + "learning_rate": 7.71463407225415e-05, + "loss": 1.5899, + "step": 306 + }, + { + "epoch": 0.17022456334904354, + "grad_norm": 0.20707400143146515, + "learning_rate": 7.71189063253461e-05, + "loss": 1.6667, + "step": 307 + }, + { + "epoch": 0.17077904075408928, + "grad_norm": 0.22458939254283905, + "learning_rate": 7.709134561007592e-05, + "loss": 1.5537, + "step": 308 + }, + { + "epoch": 0.17133351815913503, + "grad_norm": 0.21969960629940033, + "learning_rate": 7.706365867052188e-05, + "loss": 1.6693, + "step": 309 + }, + { + "epoch": 0.17188799556418077, + "grad_norm": 0.22047635912895203, + "learning_rate": 7.703584560090447e-05, + "loss": 1.5548, + "step": 310 + }, + { + "epoch": 0.1724424729692265, + "grad_norm": 0.22892658412456512, + "learning_rate": 7.700790649587336e-05, + "loss": 1.6342, + "step": 311 + }, + { + "epoch": 0.17299695037427223, + "grad_norm": 0.23613335192203522, + "learning_rate": 7.697984145050718e-05, + "loss": 1.6324, + "step": 312 + }, + { + "epoch": 0.17355142777931798, + "grad_norm": 0.2484220266342163, + "learning_rate": 7.695165056031313e-05, + "loss": 1.6467, + "step": 313 + }, + { + "epoch": 0.17410590518436372, + "grad_norm": 0.2812964618206024, + "learning_rate": 7.692333392122663e-05, + "loss": 1.6744, + "step": 314 + }, + { + "epoch": 0.17466038258940947, + "grad_norm": 0.2211124300956726, + "learning_rate": 7.689489162961109e-05, + "loss": 1.6213, + "step": 315 + }, + { + "epoch": 0.1752148599944552, + "grad_norm": 0.22334423661231995, + "learning_rate": 7.686632378225748e-05, + "loss": 1.6909, + "step": 316 + }, + { + "epoch": 0.17576933739950096, + "grad_norm": 0.2386290282011032, + "learning_rate": 7.683763047638407e-05, + "loss": 1.6193, + "step": 317 + }, + { + "epoch": 0.1763238148045467, + "grad_norm": 0.20850642025470734, + "learning_rate": 7.680881180963605e-05, + "loss": 1.6541, + "step": 318 + }, + { + "epoch": 0.17687829220959245, + "grad_norm": 0.22296078503131866, + "learning_rate": 7.677986788008524e-05, + "loss": 1.6285, + "step": 319 + }, + { + "epoch": 0.1774327696146382, + "grad_norm": 0.20673969388008118, + "learning_rate": 7.675079878622974e-05, + "loss": 1.6835, + "step": 320 + }, + { + "epoch": 0.17798724701968394, + "grad_norm": 0.21489155292510986, + "learning_rate": 7.672160462699359e-05, + "loss": 1.6354, + "step": 321 + }, + { + "epoch": 0.17854172442472968, + "grad_norm": 0.21602220833301544, + "learning_rate": 7.669228550172639e-05, + "loss": 1.5907, + "step": 322 + }, + { + "epoch": 0.17909620182977543, + "grad_norm": 0.2056579738855362, + "learning_rate": 7.666284151020309e-05, + "loss": 1.6304, + "step": 323 + }, + { + "epoch": 0.17965067923482117, + "grad_norm": 0.22408543527126312, + "learning_rate": 7.663327275262353e-05, + "loss": 1.636, + "step": 324 + }, + { + "epoch": 0.18020515663986691, + "grad_norm": 0.20583994686603546, + "learning_rate": 7.66035793296121e-05, + "loss": 1.6401, + "step": 325 + }, + { + "epoch": 0.18075963404491266, + "grad_norm": 0.21680684387683868, + "learning_rate": 7.657376134221749e-05, + "loss": 1.6827, + "step": 326 + }, + { + "epoch": 0.1813141114499584, + "grad_norm": 0.218284010887146, + "learning_rate": 7.654381889191225e-05, + "loss": 1.585, + "step": 327 + }, + { + "epoch": 0.18186858885500415, + "grad_norm": 0.20787712931632996, + "learning_rate": 7.651375208059252e-05, + "loss": 1.6359, + "step": 328 + }, + { + "epoch": 0.1824230662600499, + "grad_norm": 0.216603085398674, + "learning_rate": 7.648356101057764e-05, + "loss": 1.6884, + "step": 329 + }, + { + "epoch": 0.18297754366509564, + "grad_norm": 0.2224433422088623, + "learning_rate": 7.645324578460978e-05, + "loss": 1.6869, + "step": 330 + }, + { + "epoch": 0.18353202107014138, + "grad_norm": 0.21461333334445953, + "learning_rate": 7.642280650585366e-05, + "loss": 1.5748, + "step": 331 + }, + { + "epoch": 0.18408649847518713, + "grad_norm": 0.20232297480106354, + "learning_rate": 7.639224327789613e-05, + "loss": 1.6181, + "step": 332 + }, + { + "epoch": 0.18464097588023287, + "grad_norm": 0.2159082591533661, + "learning_rate": 7.636155620474589e-05, + "loss": 1.6346, + "step": 333 + }, + { + "epoch": 0.18519545328527862, + "grad_norm": 0.21551144123077393, + "learning_rate": 7.633074539083302e-05, + "loss": 1.6951, + "step": 334 + }, + { + "epoch": 0.18574993069032436, + "grad_norm": 0.2233298122882843, + "learning_rate": 7.629981094100878e-05, + "loss": 1.6491, + "step": 335 + }, + { + "epoch": 0.1863044080953701, + "grad_norm": 0.21482495963573456, + "learning_rate": 7.626875296054512e-05, + "loss": 1.6596, + "step": 336 + }, + { + "epoch": 0.18685888550041585, + "grad_norm": 0.2133273333311081, + "learning_rate": 7.623757155513439e-05, + "loss": 1.6508, + "step": 337 + }, + { + "epoch": 0.1874133629054616, + "grad_norm": 0.20683740079402924, + "learning_rate": 7.620626683088894e-05, + "loss": 1.6145, + "step": 338 + }, + { + "epoch": 0.18796784031050734, + "grad_norm": 0.20707263052463531, + "learning_rate": 7.617483889434083e-05, + "loss": 1.6415, + "step": 339 + }, + { + "epoch": 0.18852231771555308, + "grad_norm": 0.2513361871242523, + "learning_rate": 7.614328785244138e-05, + "loss": 1.7166, + "step": 340 + }, + { + "epoch": 0.18907679512059883, + "grad_norm": 0.21265187859535217, + "learning_rate": 7.611161381256084e-05, + "loss": 1.6454, + "step": 341 + }, + { + "epoch": 0.18963127252564457, + "grad_norm": 0.28207194805145264, + "learning_rate": 7.607981688248807e-05, + "loss": 1.6473, + "step": 342 + }, + { + "epoch": 0.19018574993069032, + "grad_norm": 0.256237268447876, + "learning_rate": 7.604789717043011e-05, + "loss": 1.7602, + "step": 343 + }, + { + "epoch": 0.19074022733573606, + "grad_norm": 0.2375311702489853, + "learning_rate": 7.601585478501181e-05, + "loss": 1.6399, + "step": 344 + }, + { + "epoch": 0.1912947047407818, + "grad_norm": 0.23850996792316437, + "learning_rate": 7.598368983527554e-05, + "loss": 1.6538, + "step": 345 + }, + { + "epoch": 0.19184918214582755, + "grad_norm": 0.2116571068763733, + "learning_rate": 7.595140243068072e-05, + "loss": 1.6129, + "step": 346 + }, + { + "epoch": 0.1924036595508733, + "grad_norm": 0.2089197337627411, + "learning_rate": 7.591899268110352e-05, + "loss": 1.6216, + "step": 347 + }, + { + "epoch": 0.19295813695591904, + "grad_norm": 0.20319993793964386, + "learning_rate": 7.588646069683642e-05, + "loss": 1.6086, + "step": 348 + }, + { + "epoch": 0.19351261436096479, + "grad_norm": 0.2238406538963318, + "learning_rate": 7.585380658858793e-05, + "loss": 1.6992, + "step": 349 + }, + { + "epoch": 0.19406709176601053, + "grad_norm": 0.21192164719104767, + "learning_rate": 7.58210304674821e-05, + "loss": 1.6756, + "step": 350 + }, + { + "epoch": 0.19462156917105627, + "grad_norm": 0.21115197241306305, + "learning_rate": 7.578813244505823e-05, + "loss": 1.5859, + "step": 351 + }, + { + "epoch": 0.19517604657610202, + "grad_norm": 0.20488910377025604, + "learning_rate": 7.575511263327044e-05, + "loss": 1.6174, + "step": 352 + }, + { + "epoch": 0.19573052398114776, + "grad_norm": 0.21583472192287445, + "learning_rate": 7.572197114448733e-05, + "loss": 1.639, + "step": 353 + }, + { + "epoch": 0.1962850013861935, + "grad_norm": 0.22085356712341309, + "learning_rate": 7.568870809149155e-05, + "loss": 1.7244, + "step": 354 + }, + { + "epoch": 0.19683947879123925, + "grad_norm": 0.20893336832523346, + "learning_rate": 7.565532358747949e-05, + "loss": 1.6678, + "step": 355 + }, + { + "epoch": 0.197393956196285, + "grad_norm": 0.20535239577293396, + "learning_rate": 7.562181774606075e-05, + "loss": 1.5594, + "step": 356 + }, + { + "epoch": 0.19794843360133074, + "grad_norm": 0.23027966916561127, + "learning_rate": 7.558819068125796e-05, + "loss": 1.7349, + "step": 357 + }, + { + "epoch": 0.1985029110063765, + "grad_norm": 0.20420025289058685, + "learning_rate": 7.555444250750618e-05, + "loss": 1.6259, + "step": 358 + }, + { + "epoch": 0.19905738841142223, + "grad_norm": 0.21578669548034668, + "learning_rate": 7.552057333965271e-05, + "loss": 1.652, + "step": 359 + }, + { + "epoch": 0.19961186581646798, + "grad_norm": 0.21920384466648102, + "learning_rate": 7.548658329295651e-05, + "loss": 1.6564, + "step": 360 + }, + { + "epoch": 0.20016634322151372, + "grad_norm": 0.23153354227542877, + "learning_rate": 7.545247248308798e-05, + "loss": 1.6196, + "step": 361 + }, + { + "epoch": 0.20072082062655947, + "grad_norm": 0.22002752125263214, + "learning_rate": 7.541824102612839e-05, + "loss": 1.6524, + "step": 362 + }, + { + "epoch": 0.2012752980316052, + "grad_norm": 0.21228551864624023, + "learning_rate": 7.53838890385697e-05, + "loss": 1.7047, + "step": 363 + }, + { + "epoch": 0.20182977543665095, + "grad_norm": 0.21426419913768768, + "learning_rate": 7.534941663731394e-05, + "loss": 1.6433, + "step": 364 + }, + { + "epoch": 0.2023842528416967, + "grad_norm": 0.20719939470291138, + "learning_rate": 7.531482393967295e-05, + "loss": 1.6065, + "step": 365 + }, + { + "epoch": 0.20293873024674244, + "grad_norm": 0.21919012069702148, + "learning_rate": 7.528011106336797e-05, + "loss": 1.6418, + "step": 366 + }, + { + "epoch": 0.2034932076517882, + "grad_norm": 0.201614648103714, + "learning_rate": 7.524527812652917e-05, + "loss": 1.5683, + "step": 367 + }, + { + "epoch": 0.20404768505683393, + "grad_norm": 0.20898142457008362, + "learning_rate": 7.521032524769537e-05, + "loss": 1.6671, + "step": 368 + }, + { + "epoch": 0.20460216246187968, + "grad_norm": 0.2096984088420868, + "learning_rate": 7.517525254581346e-05, + "loss": 1.5993, + "step": 369 + }, + { + "epoch": 0.20515663986692542, + "grad_norm": 0.21319280564785004, + "learning_rate": 7.514006014023817e-05, + "loss": 1.5941, + "step": 370 + }, + { + "epoch": 0.20571111727197117, + "grad_norm": 0.2070310115814209, + "learning_rate": 7.510474815073157e-05, + "loss": 1.6734, + "step": 371 + }, + { + "epoch": 0.2062655946770169, + "grad_norm": 0.20351865887641907, + "learning_rate": 7.506931669746266e-05, + "loss": 1.6756, + "step": 372 + }, + { + "epoch": 0.20682007208206266, + "grad_norm": 0.20868165791034698, + "learning_rate": 7.503376590100702e-05, + "loss": 1.6391, + "step": 373 + }, + { + "epoch": 0.2073745494871084, + "grad_norm": 0.24216507375240326, + "learning_rate": 7.499809588234634e-05, + "loss": 1.7327, + "step": 374 + }, + { + "epoch": 0.20792902689215415, + "grad_norm": 0.21316882967948914, + "learning_rate": 7.496230676286802e-05, + "loss": 1.6154, + "step": 375 + }, + { + "epoch": 0.2084835042971999, + "grad_norm": 0.23029237985610962, + "learning_rate": 7.492639866436479e-05, + "loss": 1.6068, + "step": 376 + }, + { + "epoch": 0.20903798170224563, + "grad_norm": 0.21493342518806458, + "learning_rate": 7.489037170903429e-05, + "loss": 1.6474, + "step": 377 + }, + { + "epoch": 0.20959245910729138, + "grad_norm": 0.2048824578523636, + "learning_rate": 7.485422601947858e-05, + "loss": 1.5741, + "step": 378 + }, + { + "epoch": 0.21014693651233712, + "grad_norm": 0.20953327417373657, + "learning_rate": 7.481796171870383e-05, + "loss": 1.609, + "step": 379 + }, + { + "epoch": 0.21070141391738287, + "grad_norm": 0.21259397268295288, + "learning_rate": 7.478157893011984e-05, + "loss": 1.681, + "step": 380 + }, + { + "epoch": 0.2112558913224286, + "grad_norm": 0.2035396844148636, + "learning_rate": 7.474507777753962e-05, + "loss": 1.5764, + "step": 381 + }, + { + "epoch": 0.21181036872747436, + "grad_norm": 0.21311511099338531, + "learning_rate": 7.470845838517899e-05, + "loss": 1.6694, + "step": 382 + }, + { + "epoch": 0.2123648461325201, + "grad_norm": 0.21380078792572021, + "learning_rate": 7.467172087765616e-05, + "loss": 1.6693, + "step": 383 + }, + { + "epoch": 0.21291932353756585, + "grad_norm": 0.20704282820224762, + "learning_rate": 7.463486537999125e-05, + "loss": 1.6214, + "step": 384 + }, + { + "epoch": 0.2134738009426116, + "grad_norm": 0.20259855687618256, + "learning_rate": 7.459789201760596e-05, + "loss": 1.6286, + "step": 385 + }, + { + "epoch": 0.21402827834765734, + "grad_norm": 0.2159855216741562, + "learning_rate": 7.456080091632305e-05, + "loss": 1.6627, + "step": 386 + }, + { + "epoch": 0.21458275575270308, + "grad_norm": 0.21545730531215668, + "learning_rate": 7.452359220236601e-05, + "loss": 1.6175, + "step": 387 + }, + { + "epoch": 0.21513723315774883, + "grad_norm": 0.21161332726478577, + "learning_rate": 7.44862660023585e-05, + "loss": 1.6789, + "step": 388 + }, + { + "epoch": 0.21569171056279457, + "grad_norm": 0.2151462882757187, + "learning_rate": 7.444882244332403e-05, + "loss": 1.6574, + "step": 389 + }, + { + "epoch": 0.21624618796784031, + "grad_norm": 0.23342940211296082, + "learning_rate": 7.441126165268552e-05, + "loss": 1.6155, + "step": 390 + }, + { + "epoch": 0.21680066537288606, + "grad_norm": 0.2156396359205246, + "learning_rate": 7.437358375826476e-05, + "loss": 1.6173, + "step": 391 + }, + { + "epoch": 0.2173551427779318, + "grad_norm": 0.2337697595357895, + "learning_rate": 7.433578888828215e-05, + "loss": 1.6365, + "step": 392 + }, + { + "epoch": 0.21790962018297755, + "grad_norm": 0.21427282691001892, + "learning_rate": 7.429787717135608e-05, + "loss": 1.6423, + "step": 393 + }, + { + "epoch": 0.2184640975880233, + "grad_norm": 0.23731502890586853, + "learning_rate": 7.425984873650262e-05, + "loss": 1.6152, + "step": 394 + }, + { + "epoch": 0.21901857499306904, + "grad_norm": 0.21108436584472656, + "learning_rate": 7.422170371313501e-05, + "loss": 1.6023, + "step": 395 + }, + { + "epoch": 0.21957305239811478, + "grad_norm": 0.24385766685009003, + "learning_rate": 7.418344223106331e-05, + "loss": 1.6968, + "step": 396 + }, + { + "epoch": 0.22012752980316053, + "grad_norm": 0.22708410024642944, + "learning_rate": 7.414506442049382e-05, + "loss": 1.6314, + "step": 397 + }, + { + "epoch": 0.22068200720820627, + "grad_norm": 0.2248944193124771, + "learning_rate": 7.410657041202877e-05, + "loss": 1.6085, + "step": 398 + }, + { + "epoch": 0.22123648461325202, + "grad_norm": 0.2051655650138855, + "learning_rate": 7.406796033666577e-05, + "loss": 1.6075, + "step": 399 + }, + { + "epoch": 0.22179096201829776, + "grad_norm": 0.24941745400428772, + "learning_rate": 7.402923432579749e-05, + "loss": 1.7027, + "step": 400 + }, + { + "epoch": 0.2223454394233435, + "grad_norm": 0.20685605704784393, + "learning_rate": 7.399039251121104e-05, + "loss": 1.5774, + "step": 401 + }, + { + "epoch": 0.22289991682838925, + "grad_norm": 0.25279372930526733, + "learning_rate": 7.395143502508767e-05, + "loss": 1.659, + "step": 402 + }, + { + "epoch": 0.223454394233435, + "grad_norm": 0.21988433599472046, + "learning_rate": 7.391236200000227e-05, + "loss": 1.6034, + "step": 403 + }, + { + "epoch": 0.22400887163848074, + "grad_norm": 0.25263452529907227, + "learning_rate": 7.387317356892294e-05, + "loss": 1.6467, + "step": 404 + }, + { + "epoch": 0.22456334904352648, + "grad_norm": 0.2200496941804886, + "learning_rate": 7.383386986521044e-05, + "loss": 1.5981, + "step": 405 + }, + { + "epoch": 0.22511782644857223, + "grad_norm": 0.22581440210342407, + "learning_rate": 7.379445102261787e-05, + "loss": 1.6179, + "step": 406 + }, + { + "epoch": 0.22567230385361797, + "grad_norm": 0.2082045078277588, + "learning_rate": 7.375491717529014e-05, + "loss": 1.6553, + "step": 407 + }, + { + "epoch": 0.22622678125866372, + "grad_norm": 0.21761249005794525, + "learning_rate": 7.371526845776351e-05, + "loss": 1.5159, + "step": 408 + }, + { + "epoch": 0.22678125866370946, + "grad_norm": 0.22059603035449982, + "learning_rate": 7.36755050049652e-05, + "loss": 1.6327, + "step": 409 + }, + { + "epoch": 0.2273357360687552, + "grad_norm": 0.21581657230854034, + "learning_rate": 7.363562695221285e-05, + "loss": 1.6646, + "step": 410 + }, + { + "epoch": 0.22789021347380095, + "grad_norm": 0.21198253333568573, + "learning_rate": 7.359563443521407e-05, + "loss": 1.6374, + "step": 411 + }, + { + "epoch": 0.2284446908788467, + "grad_norm": 0.21386800706386566, + "learning_rate": 7.3555527590066e-05, + "loss": 1.6085, + "step": 412 + }, + { + "epoch": 0.22899916828389244, + "grad_norm": 0.21344758570194244, + "learning_rate": 7.351530655325492e-05, + "loss": 1.6691, + "step": 413 + }, + { + "epoch": 0.22955364568893818, + "grad_norm": 0.21234236657619476, + "learning_rate": 7.347497146165562e-05, + "loss": 1.5652, + "step": 414 + }, + { + "epoch": 0.23010812309398393, + "grad_norm": 0.21020184457302094, + "learning_rate": 7.343452245253108e-05, + "loss": 1.6386, + "step": 415 + }, + { + "epoch": 0.23066260049902967, + "grad_norm": 0.22808673977851868, + "learning_rate": 7.339395966353193e-05, + "loss": 1.6169, + "step": 416 + }, + { + "epoch": 0.23121707790407542, + "grad_norm": 0.21194703876972198, + "learning_rate": 7.335328323269599e-05, + "loss": 1.6279, + "step": 417 + }, + { + "epoch": 0.23177155530912116, + "grad_norm": 0.2219015657901764, + "learning_rate": 7.331249329844784e-05, + "loss": 1.6187, + "step": 418 + }, + { + "epoch": 0.2323260327141669, + "grad_norm": 0.2099056988954544, + "learning_rate": 7.327158999959831e-05, + "loss": 1.6131, + "step": 419 + }, + { + "epoch": 0.23288051011921265, + "grad_norm": 0.2129475176334381, + "learning_rate": 7.323057347534401e-05, + "loss": 1.7079, + "step": 420 + }, + { + "epoch": 0.2334349875242584, + "grad_norm": 0.20943042635917664, + "learning_rate": 7.318944386526683e-05, + "loss": 1.6559, + "step": 421 + }, + { + "epoch": 0.23398946492930414, + "grad_norm": 0.21280522644519806, + "learning_rate": 7.314820130933358e-05, + "loss": 1.6158, + "step": 422 + }, + { + "epoch": 0.2345439423343499, + "grad_norm": 0.21108831465244293, + "learning_rate": 7.310684594789535e-05, + "loss": 1.5913, + "step": 423 + }, + { + "epoch": 0.23509841973939563, + "grad_norm": 0.21344444155693054, + "learning_rate": 7.306537792168717e-05, + "loss": 1.6157, + "step": 424 + }, + { + "epoch": 0.23565289714444138, + "grad_norm": 0.20498408377170563, + "learning_rate": 7.302379737182746e-05, + "loss": 1.6026, + "step": 425 + }, + { + "epoch": 0.23620737454948712, + "grad_norm": 0.21187523007392883, + "learning_rate": 7.298210443981754e-05, + "loss": 1.6558, + "step": 426 + }, + { + "epoch": 0.23676185195453286, + "grad_norm": 0.2079460322856903, + "learning_rate": 7.29402992675412e-05, + "loss": 1.7147, + "step": 427 + }, + { + "epoch": 0.2373163293595786, + "grad_norm": 0.22233690321445465, + "learning_rate": 7.289838199726419e-05, + "loss": 1.681, + "step": 428 + }, + { + "epoch": 0.23787080676462435, + "grad_norm": 0.21120913326740265, + "learning_rate": 7.285635277163373e-05, + "loss": 1.6054, + "step": 429 + }, + { + "epoch": 0.2384252841696701, + "grad_norm": 0.22804483771324158, + "learning_rate": 7.281421173367805e-05, + "loss": 1.716, + "step": 430 + }, + { + "epoch": 0.23897976157471584, + "grad_norm": 0.21392664313316345, + "learning_rate": 7.277195902680584e-05, + "loss": 1.6429, + "step": 431 + }, + { + "epoch": 0.2395342389797616, + "grad_norm": 0.21854659914970398, + "learning_rate": 7.272959479480584e-05, + "loss": 1.6437, + "step": 432 + }, + { + "epoch": 0.24008871638480733, + "grad_norm": 0.21218635141849518, + "learning_rate": 7.268711918184635e-05, + "loss": 1.7084, + "step": 433 + }, + { + "epoch": 0.24064319378985308, + "grad_norm": 0.21876314282417297, + "learning_rate": 7.26445323324746e-05, + "loss": 1.6482, + "step": 434 + }, + { + "epoch": 0.24119767119489882, + "grad_norm": 0.2061976045370102, + "learning_rate": 7.260183439161651e-05, + "loss": 1.6213, + "step": 435 + }, + { + "epoch": 0.24175214859994454, + "grad_norm": 0.22555728256702423, + "learning_rate": 7.255902550457592e-05, + "loss": 1.7004, + "step": 436 + }, + { + "epoch": 0.24230662600499028, + "grad_norm": 0.20105816423892975, + "learning_rate": 7.251610581703432e-05, + "loss": 1.5968, + "step": 437 + }, + { + "epoch": 0.24286110341003603, + "grad_norm": 0.22831952571868896, + "learning_rate": 7.24730754750502e-05, + "loss": 1.6453, + "step": 438 + }, + { + "epoch": 0.24341558081508177, + "grad_norm": 0.20313981175422668, + "learning_rate": 7.242993462505861e-05, + "loss": 1.6452, + "step": 439 + }, + { + "epoch": 0.24397005822012752, + "grad_norm": 0.21168267726898193, + "learning_rate": 7.238668341387078e-05, + "loss": 1.6479, + "step": 440 + }, + { + "epoch": 0.24452453562517326, + "grad_norm": 0.20957493782043457, + "learning_rate": 7.234332198867334e-05, + "loss": 1.63, + "step": 441 + }, + { + "epoch": 0.245079013030219, + "grad_norm": 0.2083057314157486, + "learning_rate": 7.22998504970281e-05, + "loss": 1.6362, + "step": 442 + }, + { + "epoch": 0.24563349043526475, + "grad_norm": 0.23519518971443176, + "learning_rate": 7.22562690868714e-05, + "loss": 1.7074, + "step": 443 + }, + { + "epoch": 0.2461879678403105, + "grad_norm": 0.21451757848262787, + "learning_rate": 7.221257790651364e-05, + "loss": 1.5854, + "step": 444 + }, + { + "epoch": 0.24674244524535624, + "grad_norm": 0.21790243685245514, + "learning_rate": 7.216877710463877e-05, + "loss": 1.5891, + "step": 445 + }, + { + "epoch": 0.24729692265040198, + "grad_norm": 0.2403048574924469, + "learning_rate": 7.212486683030377e-05, + "loss": 1.6683, + "step": 446 + }, + { + "epoch": 0.24785140005544773, + "grad_norm": 0.252909779548645, + "learning_rate": 7.208084723293823e-05, + "loss": 1.5939, + "step": 447 + }, + { + "epoch": 0.24840587746049347, + "grad_norm": 0.22861145436763763, + "learning_rate": 7.203671846234371e-05, + "loss": 1.6284, + "step": 448 + }, + { + "epoch": 0.24896035486553922, + "grad_norm": 0.2679903209209442, + "learning_rate": 7.199248066869331e-05, + "loss": 1.6749, + "step": 449 + }, + { + "epoch": 0.24951483227058496, + "grad_norm": 0.19942152500152588, + "learning_rate": 7.194813400253114e-05, + "loss": 1.5894, + "step": 450 + }, + { + "epoch": 0.25006930967563074, + "grad_norm": 0.23093563318252563, + "learning_rate": 7.190367861477183e-05, + "loss": 1.5947, + "step": 451 + }, + { + "epoch": 0.25062378708067645, + "grad_norm": 0.22997967898845673, + "learning_rate": 7.185911465669998e-05, + "loss": 1.6138, + "step": 452 + }, + { + "epoch": 0.2511782644857222, + "grad_norm": 0.2587045729160309, + "learning_rate": 7.181444227996966e-05, + "loss": 1.6803, + "step": 453 + }, + { + "epoch": 0.25173274189076794, + "grad_norm": 0.2321499139070511, + "learning_rate": 7.17696616366039e-05, + "loss": 1.7357, + "step": 454 + }, + { + "epoch": 0.2522872192958137, + "grad_norm": 0.24048465490341187, + "learning_rate": 7.172477287899418e-05, + "loss": 1.6514, + "step": 455 + }, + { + "epoch": 0.25284169670085943, + "grad_norm": 0.22340674698352814, + "learning_rate": 7.167977615989985e-05, + "loss": 1.6029, + "step": 456 + }, + { + "epoch": 0.2533961741059052, + "grad_norm": 0.21953530609607697, + "learning_rate": 7.163467163244775e-05, + "loss": 1.5879, + "step": 457 + }, + { + "epoch": 0.2539506515109509, + "grad_norm": 0.22932195663452148, + "learning_rate": 7.158945945013151e-05, + "loss": 1.6812, + "step": 458 + }, + { + "epoch": 0.2545051289159967, + "grad_norm": 0.21497461199760437, + "learning_rate": 7.15441397668112e-05, + "loss": 1.6981, + "step": 459 + }, + { + "epoch": 0.2550596063210424, + "grad_norm": 0.22530704736709595, + "learning_rate": 7.149871273671262e-05, + "loss": 1.6335, + "step": 460 + }, + { + "epoch": 0.2556140837260882, + "grad_norm": 0.20990800857543945, + "learning_rate": 7.145317851442696e-05, + "loss": 1.6352, + "step": 461 + }, + { + "epoch": 0.2561685611311339, + "grad_norm": 0.2138289213180542, + "learning_rate": 7.140753725491019e-05, + "loss": 1.6216, + "step": 462 + }, + { + "epoch": 0.25672303853617967, + "grad_norm": 0.2041904479265213, + "learning_rate": 7.136178911348248e-05, + "loss": 1.6162, + "step": 463 + }, + { + "epoch": 0.2572775159412254, + "grad_norm": 0.22262172400951385, + "learning_rate": 7.131593424582777e-05, + "loss": 1.633, + "step": 464 + }, + { + "epoch": 0.25783199334627116, + "grad_norm": 0.20571322739124298, + "learning_rate": 7.12699728079932e-05, + "loss": 1.6332, + "step": 465 + }, + { + "epoch": 0.2583864707513169, + "grad_norm": 0.22316017746925354, + "learning_rate": 7.122390495638853e-05, + "loss": 1.6925, + "step": 466 + }, + { + "epoch": 0.25894094815636265, + "grad_norm": 0.20561595261096954, + "learning_rate": 7.117773084778568e-05, + "loss": 1.5525, + "step": 467 + }, + { + "epoch": 0.25949542556140837, + "grad_norm": 0.2053271234035492, + "learning_rate": 7.113145063931821e-05, + "loss": 1.6606, + "step": 468 + }, + { + "epoch": 0.26004990296645414, + "grad_norm": 0.2116890847682953, + "learning_rate": 7.108506448848069e-05, + "loss": 1.5761, + "step": 469 + }, + { + "epoch": 0.26060438037149986, + "grad_norm": 0.20428834855556488, + "learning_rate": 7.103857255312823e-05, + "loss": 1.6077, + "step": 470 + }, + { + "epoch": 0.2611588577765456, + "grad_norm": 0.20274734497070312, + "learning_rate": 7.099197499147594e-05, + "loss": 1.5833, + "step": 471 + }, + { + "epoch": 0.26171333518159134, + "grad_norm": 0.20062290132045746, + "learning_rate": 7.094527196209838e-05, + "loss": 1.5884, + "step": 472 + }, + { + "epoch": 0.2622678125866371, + "grad_norm": 0.2172449678182602, + "learning_rate": 7.089846362392904e-05, + "loss": 1.7173, + "step": 473 + }, + { + "epoch": 0.26282228999168283, + "grad_norm": 0.21342062950134277, + "learning_rate": 7.085155013625974e-05, + "loss": 1.6288, + "step": 474 + }, + { + "epoch": 0.2633767673967286, + "grad_norm": 0.2020542323589325, + "learning_rate": 7.080453165874018e-05, + "loss": 1.5916, + "step": 475 + }, + { + "epoch": 0.2639312448017743, + "grad_norm": 0.20829129219055176, + "learning_rate": 7.07574083513773e-05, + "loss": 1.6227, + "step": 476 + }, + { + "epoch": 0.2644857222068201, + "grad_norm": 0.20422028005123138, + "learning_rate": 7.071018037453485e-05, + "loss": 1.63, + "step": 477 + }, + { + "epoch": 0.2650401996118658, + "grad_norm": 0.19959598779678345, + "learning_rate": 7.066284788893268e-05, + "loss": 1.6, + "step": 478 + }, + { + "epoch": 0.2655946770169116, + "grad_norm": 0.21311348676681519, + "learning_rate": 7.061541105564642e-05, + "loss": 1.6359, + "step": 479 + }, + { + "epoch": 0.2661491544219573, + "grad_norm": 0.21512793004512787, + "learning_rate": 7.056787003610667e-05, + "loss": 1.6067, + "step": 480 + }, + { + "epoch": 0.2667036318270031, + "grad_norm": 0.2027646005153656, + "learning_rate": 7.052022499209864e-05, + "loss": 1.6362, + "step": 481 + }, + { + "epoch": 0.2672581092320488, + "grad_norm": 0.23081980645656586, + "learning_rate": 7.047247608576157e-05, + "loss": 1.6021, + "step": 482 + }, + { + "epoch": 0.26781258663709456, + "grad_norm": 0.20609967410564423, + "learning_rate": 7.04246234795881e-05, + "loss": 1.6012, + "step": 483 + }, + { + "epoch": 0.2683670640421403, + "grad_norm": 0.20472081005573273, + "learning_rate": 7.03766673364238e-05, + "loss": 1.5602, + "step": 484 + }, + { + "epoch": 0.26892154144718605, + "grad_norm": 0.2030676007270813, + "learning_rate": 7.032860781946657e-05, + "loss": 1.6053, + "step": 485 + }, + { + "epoch": 0.26947601885223177, + "grad_norm": 0.22049987316131592, + "learning_rate": 7.028044509226612e-05, + "loss": 1.6074, + "step": 486 + }, + { + "epoch": 0.27003049625727754, + "grad_norm": 0.20548531413078308, + "learning_rate": 7.023217931872334e-05, + "loss": 1.5976, + "step": 487 + }, + { + "epoch": 0.27058497366232326, + "grad_norm": 0.22220125794410706, + "learning_rate": 7.018381066308984e-05, + "loss": 1.589, + "step": 488 + }, + { + "epoch": 0.27113945106736903, + "grad_norm": 0.209039568901062, + "learning_rate": 7.013533928996733e-05, + "loss": 1.6254, + "step": 489 + }, + { + "epoch": 0.27169392847241475, + "grad_norm": 0.21839891374111176, + "learning_rate": 7.00867653643071e-05, + "loss": 1.6726, + "step": 490 + }, + { + "epoch": 0.2722484058774605, + "grad_norm": 0.20095285773277283, + "learning_rate": 7.003808905140936e-05, + "loss": 1.5836, + "step": 491 + }, + { + "epoch": 0.27280288328250624, + "grad_norm": 0.19979004561901093, + "learning_rate": 6.998931051692283e-05, + "loss": 1.5875, + "step": 492 + }, + { + "epoch": 0.273357360687552, + "grad_norm": 0.20679202675819397, + "learning_rate": 6.994042992684406e-05, + "loss": 1.5333, + "step": 493 + }, + { + "epoch": 0.2739118380925977, + "grad_norm": 0.1942681074142456, + "learning_rate": 6.989144744751689e-05, + "loss": 1.5397, + "step": 494 + }, + { + "epoch": 0.2744663154976435, + "grad_norm": 0.210038959980011, + "learning_rate": 6.984236324563192e-05, + "loss": 1.6923, + "step": 495 + }, + { + "epoch": 0.2750207929026892, + "grad_norm": 0.213044673204422, + "learning_rate": 6.979317748822594e-05, + "loss": 1.6881, + "step": 496 + }, + { + "epoch": 0.275575270307735, + "grad_norm": 0.21259163320064545, + "learning_rate": 6.974389034268127e-05, + "loss": 1.6463, + "step": 497 + }, + { + "epoch": 0.2761297477127807, + "grad_norm": 0.2128421813249588, + "learning_rate": 6.969450197672534e-05, + "loss": 1.6319, + "step": 498 + }, + { + "epoch": 0.2766842251178264, + "grad_norm": 0.21015048027038574, + "learning_rate": 6.964501255842995e-05, + "loss": 1.5996, + "step": 499 + }, + { + "epoch": 0.2772387025228722, + "grad_norm": 0.20773227512836456, + "learning_rate": 6.959542225621087e-05, + "loss": 1.6476, + "step": 500 + }, + { + "epoch": 0.2777931799279179, + "grad_norm": 0.21952128410339355, + "learning_rate": 6.954573123882718e-05, + "loss": 1.5959, + "step": 501 + }, + { + "epoch": 0.2783476573329637, + "grad_norm": 0.20505724847316742, + "learning_rate": 6.949593967538062e-05, + "loss": 1.583, + "step": 502 + }, + { + "epoch": 0.2789021347380094, + "grad_norm": 0.21379536390304565, + "learning_rate": 6.94460477353152e-05, + "loss": 1.6331, + "step": 503 + }, + { + "epoch": 0.27945661214305517, + "grad_norm": 0.22115319967269897, + "learning_rate": 6.939605558841644e-05, + "loss": 1.5612, + "step": 504 + }, + { + "epoch": 0.2800110895481009, + "grad_norm": 0.2217654585838318, + "learning_rate": 6.934596340481088e-05, + "loss": 1.5818, + "step": 505 + }, + { + "epoch": 0.28056556695314666, + "grad_norm": 0.21580982208251953, + "learning_rate": 6.929577135496556e-05, + "loss": 1.6032, + "step": 506 + }, + { + "epoch": 0.2811200443581924, + "grad_norm": 0.2084866166114807, + "learning_rate": 6.924547960968726e-05, + "loss": 1.5739, + "step": 507 + }, + { + "epoch": 0.28167452176323815, + "grad_norm": 0.20981299877166748, + "learning_rate": 6.919508834012213e-05, + "loss": 1.6299, + "step": 508 + }, + { + "epoch": 0.28222899916828387, + "grad_norm": 0.20049329102039337, + "learning_rate": 6.914459771775496e-05, + "loss": 1.6606, + "step": 509 + }, + { + "epoch": 0.28278347657332964, + "grad_norm": 0.22596615552902222, + "learning_rate": 6.909400791440864e-05, + "loss": 1.5644, + "step": 510 + }, + { + "epoch": 0.28333795397837536, + "grad_norm": 0.20176082849502563, + "learning_rate": 6.904331910224361e-05, + "loss": 1.5579, + "step": 511 + }, + { + "epoch": 0.28389243138342113, + "grad_norm": 0.22138462960720062, + "learning_rate": 6.899253145375723e-05, + "loss": 1.6555, + "step": 512 + }, + { + "epoch": 0.28444690878846685, + "grad_norm": 0.2055101990699768, + "learning_rate": 6.894164514178317e-05, + "loss": 1.591, + "step": 513 + }, + { + "epoch": 0.2850013861935126, + "grad_norm": 0.2184118628501892, + "learning_rate": 6.88906603394909e-05, + "loss": 1.6579, + "step": 514 + }, + { + "epoch": 0.28555586359855833, + "grad_norm": 0.21609501540660858, + "learning_rate": 6.883957722038508e-05, + "loss": 1.6977, + "step": 515 + }, + { + "epoch": 0.2861103410036041, + "grad_norm": 0.20950822532176971, + "learning_rate": 6.878839595830489e-05, + "loss": 1.6842, + "step": 516 + }, + { + "epoch": 0.2866648184086498, + "grad_norm": 0.20638826489448547, + "learning_rate": 6.873711672742353e-05, + "loss": 1.5596, + "step": 517 + }, + { + "epoch": 0.2872192958136956, + "grad_norm": 0.20869210362434387, + "learning_rate": 6.868573970224757e-05, + "loss": 1.5563, + "step": 518 + }, + { + "epoch": 0.2877737732187413, + "grad_norm": 0.20834721624851227, + "learning_rate": 6.863426505761645e-05, + "loss": 1.6254, + "step": 519 + }, + { + "epoch": 0.2883282506237871, + "grad_norm": 0.20829017460346222, + "learning_rate": 6.85826929687017e-05, + "loss": 1.6216, + "step": 520 + }, + { + "epoch": 0.2888827280288328, + "grad_norm": 0.20338068902492523, + "learning_rate": 6.853102361100657e-05, + "loss": 1.5475, + "step": 521 + }, + { + "epoch": 0.2894372054338786, + "grad_norm": 0.21138431131839752, + "learning_rate": 6.847925716036526e-05, + "loss": 1.625, + "step": 522 + }, + { + "epoch": 0.2899916828389243, + "grad_norm": 0.21660204231739044, + "learning_rate": 6.842739379294241e-05, + "loss": 1.5498, + "step": 523 + }, + { + "epoch": 0.29054616024397006, + "grad_norm": 0.2074098438024521, + "learning_rate": 6.837543368523244e-05, + "loss": 1.6134, + "step": 524 + }, + { + "epoch": 0.2911006376490158, + "grad_norm": 0.20286381244659424, + "learning_rate": 6.832337701405904e-05, + "loss": 1.6212, + "step": 525 + }, + { + "epoch": 0.29165511505406155, + "grad_norm": 0.21887677907943726, + "learning_rate": 6.827122395657445e-05, + "loss": 1.6228, + "step": 526 + }, + { + "epoch": 0.29220959245910727, + "grad_norm": 0.20947344601154327, + "learning_rate": 6.821897469025895e-05, + "loss": 1.6106, + "step": 527 + }, + { + "epoch": 0.29276406986415304, + "grad_norm": 0.20572635531425476, + "learning_rate": 6.816662939292024e-05, + "loss": 1.5996, + "step": 528 + }, + { + "epoch": 0.29331854726919876, + "grad_norm": 0.20719175040721893, + "learning_rate": 6.81141882426928e-05, + "loss": 1.6113, + "step": 529 + }, + { + "epoch": 0.29387302467424453, + "grad_norm": 0.2026231288909912, + "learning_rate": 6.80616514180373e-05, + "loss": 1.5744, + "step": 530 + }, + { + "epoch": 0.29442750207929025, + "grad_norm": 0.19878005981445312, + "learning_rate": 6.800901909774e-05, + "loss": 1.5897, + "step": 531 + }, + { + "epoch": 0.294981979484336, + "grad_norm": 0.2110818773508072, + "learning_rate": 6.795629146091215e-05, + "loss": 1.5741, + "step": 532 + }, + { + "epoch": 0.29553645688938174, + "grad_norm": 0.20320414006710052, + "learning_rate": 6.790346868698936e-05, + "loss": 1.6345, + "step": 533 + }, + { + "epoch": 0.2960909342944275, + "grad_norm": 0.19918134808540344, + "learning_rate": 6.785055095573098e-05, + "loss": 1.5811, + "step": 534 + }, + { + "epoch": 0.2966454116994732, + "grad_norm": 0.20493772625923157, + "learning_rate": 6.779753844721955e-05, + "loss": 1.5529, + "step": 535 + }, + { + "epoch": 0.297199889104519, + "grad_norm": 0.20805048942565918, + "learning_rate": 6.774443134186008e-05, + "loss": 1.5974, + "step": 536 + }, + { + "epoch": 0.2977543665095647, + "grad_norm": 0.21310682594776154, + "learning_rate": 6.769122982037959e-05, + "loss": 1.6811, + "step": 537 + }, + { + "epoch": 0.2983088439146105, + "grad_norm": 0.21554915606975555, + "learning_rate": 6.763793406382629e-05, + "loss": 1.6707, + "step": 538 + }, + { + "epoch": 0.2988633213196562, + "grad_norm": 0.21235989034175873, + "learning_rate": 6.758454425356917e-05, + "loss": 1.6141, + "step": 539 + }, + { + "epoch": 0.299417798724702, + "grad_norm": 0.2248426377773285, + "learning_rate": 6.753106057129725e-05, + "loss": 1.6711, + "step": 540 + }, + { + "epoch": 0.2999722761297477, + "grad_norm": 0.20382218062877655, + "learning_rate": 6.747748319901902e-05, + "loss": 1.5494, + "step": 541 + }, + { + "epoch": 0.30052675353479347, + "grad_norm": 0.21593333780765533, + "learning_rate": 6.74238123190618e-05, + "loss": 1.6111, + "step": 542 + }, + { + "epoch": 0.3010812309398392, + "grad_norm": 0.20532597601413727, + "learning_rate": 6.737004811407109e-05, + "loss": 1.5824, + "step": 543 + }, + { + "epoch": 0.30163570834488496, + "grad_norm": 0.23563292622566223, + "learning_rate": 6.731619076701002e-05, + "loss": 1.5641, + "step": 544 + }, + { + "epoch": 0.3021901857499307, + "grad_norm": 0.2265416830778122, + "learning_rate": 6.726224046115871e-05, + "loss": 1.7597, + "step": 545 + }, + { + "epoch": 0.30274466315497645, + "grad_norm": 0.21692894399166107, + "learning_rate": 6.720819738011355e-05, + "loss": 1.5962, + "step": 546 + }, + { + "epoch": 0.30329914056002216, + "grad_norm": 0.2314797043800354, + "learning_rate": 6.715406170778671e-05, + "loss": 1.5945, + "step": 547 + }, + { + "epoch": 0.30385361796506793, + "grad_norm": 0.21994145214557648, + "learning_rate": 6.709983362840544e-05, + "loss": 1.6904, + "step": 548 + }, + { + "epoch": 0.30440809537011365, + "grad_norm": 0.21997472643852234, + "learning_rate": 6.704551332651144e-05, + "loss": 1.6322, + "step": 549 + }, + { + "epoch": 0.3049625727751594, + "grad_norm": 0.22159717977046967, + "learning_rate": 6.699110098696029e-05, + "loss": 1.6348, + "step": 550 + }, + { + "epoch": 0.30551705018020514, + "grad_norm": 0.20532500743865967, + "learning_rate": 6.693659679492072e-05, + "loss": 1.5946, + "step": 551 + }, + { + "epoch": 0.3060715275852509, + "grad_norm": 0.22436174750328064, + "learning_rate": 6.688200093587409e-05, + "loss": 1.5677, + "step": 552 + }, + { + "epoch": 0.30662600499029663, + "grad_norm": 0.20958715677261353, + "learning_rate": 6.682731359561369e-05, + "loss": 1.5468, + "step": 553 + }, + { + "epoch": 0.3071804823953424, + "grad_norm": 0.23304277658462524, + "learning_rate": 6.677253496024412e-05, + "loss": 1.6636, + "step": 554 + }, + { + "epoch": 0.3077349598003881, + "grad_norm": 0.2050829380750656, + "learning_rate": 6.671766521618069e-05, + "loss": 1.5629, + "step": 555 + }, + { + "epoch": 0.3082894372054339, + "grad_norm": 0.22310495376586914, + "learning_rate": 6.666270455014874e-05, + "loss": 1.6744, + "step": 556 + }, + { + "epoch": 0.3088439146104796, + "grad_norm": 0.21029001474380493, + "learning_rate": 6.6607653149183e-05, + "loss": 1.6397, + "step": 557 + }, + { + "epoch": 0.3093983920155254, + "grad_norm": 0.21131139993667603, + "learning_rate": 6.655251120062702e-05, + "loss": 1.62, + "step": 558 + }, + { + "epoch": 0.3099528694205711, + "grad_norm": 0.21726414561271667, + "learning_rate": 6.649727889213246e-05, + "loss": 1.601, + "step": 559 + }, + { + "epoch": 0.31050734682561687, + "grad_norm": 0.2165653556585312, + "learning_rate": 6.644195641165851e-05, + "loss": 1.577, + "step": 560 + }, + { + "epoch": 0.3110618242306626, + "grad_norm": 0.20710572600364685, + "learning_rate": 6.63865439474712e-05, + "loss": 1.5241, + "step": 561 + }, + { + "epoch": 0.31161630163570836, + "grad_norm": 0.20715083181858063, + "learning_rate": 6.633104168814276e-05, + "loss": 1.5959, + "step": 562 + }, + { + "epoch": 0.3121707790407541, + "grad_norm": 0.2072436511516571, + "learning_rate": 6.627544982255105e-05, + "loss": 1.669, + "step": 563 + }, + { + "epoch": 0.31272525644579985, + "grad_norm": 0.22219659388065338, + "learning_rate": 6.62197685398788e-05, + "loss": 1.6437, + "step": 564 + }, + { + "epoch": 0.31327973385084557, + "grad_norm": 0.209031879901886, + "learning_rate": 6.616399802961312e-05, + "loss": 1.6819, + "step": 565 + }, + { + "epoch": 0.31383421125589134, + "grad_norm": 0.21048156917095184, + "learning_rate": 6.610813848154467e-05, + "loss": 1.6213, + "step": 566 + }, + { + "epoch": 0.31438868866093705, + "grad_norm": 0.20893770456314087, + "learning_rate": 6.605219008576718e-05, + "loss": 1.6092, + "step": 567 + }, + { + "epoch": 0.3149431660659828, + "grad_norm": 0.20855359733104706, + "learning_rate": 6.599615303267672e-05, + "loss": 1.6285, + "step": 568 + }, + { + "epoch": 0.31549764347102854, + "grad_norm": 0.20627255737781525, + "learning_rate": 6.594002751297106e-05, + "loss": 1.5787, + "step": 569 + }, + { + "epoch": 0.3160521208760743, + "grad_norm": 0.21437032520771027, + "learning_rate": 6.588381371764903e-05, + "loss": 1.5824, + "step": 570 + }, + { + "epoch": 0.31660659828112003, + "grad_norm": 0.21238544583320618, + "learning_rate": 6.582751183800983e-05, + "loss": 1.6169, + "step": 571 + }, + { + "epoch": 0.3171610756861658, + "grad_norm": 0.20815801620483398, + "learning_rate": 6.57711220656525e-05, + "loss": 1.5866, + "step": 572 + }, + { + "epoch": 0.3177155530912115, + "grad_norm": 0.21221747994422913, + "learning_rate": 6.57146445924751e-05, + "loss": 1.6005, + "step": 573 + }, + { + "epoch": 0.3182700304962573, + "grad_norm": 0.22167359292507172, + "learning_rate": 6.565807961067421e-05, + "loss": 1.661, + "step": 574 + }, + { + "epoch": 0.318824507901303, + "grad_norm": 0.22634592652320862, + "learning_rate": 6.560142731274416e-05, + "loss": 1.6441, + "step": 575 + }, + { + "epoch": 0.3193789853063488, + "grad_norm": 0.20446883141994476, + "learning_rate": 6.554468789147644e-05, + "loss": 1.5857, + "step": 576 + }, + { + "epoch": 0.3199334627113945, + "grad_norm": 0.22486351430416107, + "learning_rate": 6.548786153995901e-05, + "loss": 1.6204, + "step": 577 + }, + { + "epoch": 0.3204879401164403, + "grad_norm": 0.2034136950969696, + "learning_rate": 6.543094845157569e-05, + "loss": 1.5765, + "step": 578 + }, + { + "epoch": 0.321042417521486, + "grad_norm": 0.21498997509479523, + "learning_rate": 6.537394882000545e-05, + "loss": 1.5481, + "step": 579 + }, + { + "epoch": 0.32159689492653176, + "grad_norm": 0.2029666006565094, + "learning_rate": 6.531686283922179e-05, + "loss": 1.578, + "step": 580 + }, + { + "epoch": 0.3221513723315775, + "grad_norm": 0.20419777929782867, + "learning_rate": 6.525969070349205e-05, + "loss": 1.5908, + "step": 581 + }, + { + "epoch": 0.32270584973662325, + "grad_norm": 0.2099248170852661, + "learning_rate": 6.520243260737674e-05, + "loss": 1.5928, + "step": 582 + }, + { + "epoch": 0.32326032714166897, + "grad_norm": 0.21225160360336304, + "learning_rate": 6.514508874572893e-05, + "loss": 1.5871, + "step": 583 + }, + { + "epoch": 0.32381480454671474, + "grad_norm": 0.20643840730190277, + "learning_rate": 6.508765931369356e-05, + "loss": 1.5607, + "step": 584 + }, + { + "epoch": 0.32436928195176046, + "grad_norm": 0.2086772918701172, + "learning_rate": 6.503014450670674e-05, + "loss": 1.5538, + "step": 585 + }, + { + "epoch": 0.32492375935680623, + "grad_norm": 0.2010130137205124, + "learning_rate": 6.497254452049513e-05, + "loss": 1.5856, + "step": 586 + }, + { + "epoch": 0.32547823676185195, + "grad_norm": 0.2193068265914917, + "learning_rate": 6.491485955107526e-05, + "loss": 1.7064, + "step": 587 + }, + { + "epoch": 0.3260327141668977, + "grad_norm": 0.2186783403158188, + "learning_rate": 6.485708979475291e-05, + "loss": 1.5548, + "step": 588 + }, + { + "epoch": 0.32658719157194344, + "grad_norm": 0.213240385055542, + "learning_rate": 6.47992354481223e-05, + "loss": 1.6239, + "step": 589 + }, + { + "epoch": 0.3271416689769892, + "grad_norm": 0.2521916627883911, + "learning_rate": 6.474129670806561e-05, + "loss": 1.6636, + "step": 590 + }, + { + "epoch": 0.3276961463820349, + "grad_norm": 0.21178847551345825, + "learning_rate": 6.468327377175214e-05, + "loss": 1.5943, + "step": 591 + }, + { + "epoch": 0.3282506237870807, + "grad_norm": 0.2146124243736267, + "learning_rate": 6.462516683663778e-05, + "loss": 1.615, + "step": 592 + }, + { + "epoch": 0.3288051011921264, + "grad_norm": 0.20442825555801392, + "learning_rate": 6.456697610046423e-05, + "loss": 1.5938, + "step": 593 + }, + { + "epoch": 0.3293595785971722, + "grad_norm": 0.2278599590063095, + "learning_rate": 6.450870176125838e-05, + "loss": 1.6331, + "step": 594 + }, + { + "epoch": 0.3299140560022179, + "grad_norm": 0.2228085696697235, + "learning_rate": 6.445034401733164e-05, + "loss": 1.6489, + "step": 595 + }, + { + "epoch": 0.3304685334072637, + "grad_norm": 0.24472852051258087, + "learning_rate": 6.439190306727926e-05, + "loss": 1.5416, + "step": 596 + }, + { + "epoch": 0.3310230108123094, + "grad_norm": 0.22040173411369324, + "learning_rate": 6.433337910997958e-05, + "loss": 1.5253, + "step": 597 + }, + { + "epoch": 0.33157748821735517, + "grad_norm": 0.27518245577812195, + "learning_rate": 6.427477234459353e-05, + "loss": 1.6411, + "step": 598 + }, + { + "epoch": 0.3321319656224009, + "grad_norm": 0.2157251238822937, + "learning_rate": 6.421608297056374e-05, + "loss": 1.6153, + "step": 599 + }, + { + "epoch": 0.33268644302744665, + "grad_norm": 0.2443874329328537, + "learning_rate": 6.415731118761401e-05, + "loss": 1.5696, + "step": 600 + }, + { + "epoch": 0.33324092043249237, + "grad_norm": 0.22603949904441833, + "learning_rate": 6.409845719574857e-05, + "loss": 1.6176, + "step": 601 + }, + { + "epoch": 0.33379539783753814, + "grad_norm": 0.20818479359149933, + "learning_rate": 6.403952119525143e-05, + "loss": 1.5308, + "step": 602 + }, + { + "epoch": 0.33434987524258386, + "grad_norm": 0.2425011247396469, + "learning_rate": 6.398050338668567e-05, + "loss": 1.5735, + "step": 603 + }, + { + "epoch": 0.33490435264762963, + "grad_norm": 0.21690739691257477, + "learning_rate": 6.392140397089275e-05, + "loss": 1.6721, + "step": 604 + }, + { + "epoch": 0.33545883005267535, + "grad_norm": 0.24877040088176727, + "learning_rate": 6.386222314899187e-05, + "loss": 1.6139, + "step": 605 + }, + { + "epoch": 0.3360133074577211, + "grad_norm": 0.2150987684726715, + "learning_rate": 6.380296112237926e-05, + "loss": 1.5697, + "step": 606 + }, + { + "epoch": 0.33656778486276684, + "grad_norm": 0.22079655528068542, + "learning_rate": 6.374361809272749e-05, + "loss": 1.6363, + "step": 607 + }, + { + "epoch": 0.3371222622678126, + "grad_norm": 0.20685946941375732, + "learning_rate": 6.368419426198475e-05, + "loss": 1.6146, + "step": 608 + }, + { + "epoch": 0.33767673967285833, + "grad_norm": 0.21263012290000916, + "learning_rate": 6.362468983237427e-05, + "loss": 1.5774, + "step": 609 + }, + { + "epoch": 0.3382312170779041, + "grad_norm": 0.21780216693878174, + "learning_rate": 6.356510500639353e-05, + "loss": 1.6147, + "step": 610 + }, + { + "epoch": 0.3387856944829498, + "grad_norm": 0.21480686962604523, + "learning_rate": 6.350543998681358e-05, + "loss": 1.6554, + "step": 611 + }, + { + "epoch": 0.3393401718879956, + "grad_norm": 0.2028435915708542, + "learning_rate": 6.344569497667843e-05, + "loss": 1.546, + "step": 612 + }, + { + "epoch": 0.3398946492930413, + "grad_norm": 0.22665052115917206, + "learning_rate": 6.338587017930425e-05, + "loss": 1.571, + "step": 613 + }, + { + "epoch": 0.3404491266980871, + "grad_norm": 0.20434421300888062, + "learning_rate": 6.332596579827876e-05, + "loss": 1.586, + "step": 614 + }, + { + "epoch": 0.3410036041031328, + "grad_norm": 0.20660161972045898, + "learning_rate": 6.326598203746049e-05, + "loss": 1.5352, + "step": 615 + }, + { + "epoch": 0.34155808150817857, + "grad_norm": 0.19511684775352478, + "learning_rate": 6.320591910097813e-05, + "loss": 1.5336, + "step": 616 + }, + { + "epoch": 0.3421125589132243, + "grad_norm": 0.21570967137813568, + "learning_rate": 6.314577719322978e-05, + "loss": 1.625, + "step": 617 + }, + { + "epoch": 0.34266703631827006, + "grad_norm": 0.20801737904548645, + "learning_rate": 6.308555651888233e-05, + "loss": 1.5529, + "step": 618 + }, + { + "epoch": 0.3432215137233158, + "grad_norm": 0.21180784702301025, + "learning_rate": 6.302525728287064e-05, + "loss": 1.6666, + "step": 619 + }, + { + "epoch": 0.34377599112836155, + "grad_norm": 0.20075224339962006, + "learning_rate": 6.296487969039701e-05, + "loss": 1.6188, + "step": 620 + }, + { + "epoch": 0.34433046853340726, + "grad_norm": 0.22708876430988312, + "learning_rate": 6.290442394693033e-05, + "loss": 1.6372, + "step": 621 + }, + { + "epoch": 0.344884945938453, + "grad_norm": 0.20980872213840485, + "learning_rate": 6.284389025820547e-05, + "loss": 1.584, + "step": 622 + }, + { + "epoch": 0.34543942334349875, + "grad_norm": 0.20637759566307068, + "learning_rate": 6.278327883022255e-05, + "loss": 1.5419, + "step": 623 + }, + { + "epoch": 0.34599390074854447, + "grad_norm": 0.2151980698108673, + "learning_rate": 6.272258986924624e-05, + "loss": 1.6066, + "step": 624 + }, + { + "epoch": 0.34654837815359024, + "grad_norm": 0.21410825848579407, + "learning_rate": 6.266182358180504e-05, + "loss": 1.657, + "step": 625 + }, + { + "epoch": 0.34710285555863596, + "grad_norm": 0.20414431393146515, + "learning_rate": 6.260098017469063e-05, + "loss": 1.6131, + "step": 626 + }, + { + "epoch": 0.34765733296368173, + "grad_norm": 0.19871225953102112, + "learning_rate": 6.254005985495711e-05, + "loss": 1.5843, + "step": 627 + }, + { + "epoch": 0.34821181036872745, + "grad_norm": 0.20605245232582092, + "learning_rate": 6.247906282992034e-05, + "loss": 1.6049, + "step": 628 + }, + { + "epoch": 0.3487662877737732, + "grad_norm": 0.20710204541683197, + "learning_rate": 6.241798930715719e-05, + "loss": 1.6005, + "step": 629 + }, + { + "epoch": 0.34932076517881894, + "grad_norm": 0.212701216340065, + "learning_rate": 6.235683949450486e-05, + "loss": 1.5958, + "step": 630 + }, + { + "epoch": 0.3498752425838647, + "grad_norm": 0.2133304476737976, + "learning_rate": 6.229561360006019e-05, + "loss": 1.6924, + "step": 631 + }, + { + "epoch": 0.3504297199889104, + "grad_norm": 0.208997905254364, + "learning_rate": 6.223431183217892e-05, + "loss": 1.6412, + "step": 632 + }, + { + "epoch": 0.3509841973939562, + "grad_norm": 0.2044183760881424, + "learning_rate": 6.217293439947498e-05, + "loss": 1.6015, + "step": 633 + }, + { + "epoch": 0.3515386747990019, + "grad_norm": 0.2030702829360962, + "learning_rate": 6.211148151081978e-05, + "loss": 1.5971, + "step": 634 + }, + { + "epoch": 0.3520931522040477, + "grad_norm": 0.20268899202346802, + "learning_rate": 6.204995337534159e-05, + "loss": 1.5271, + "step": 635 + }, + { + "epoch": 0.3526476296090934, + "grad_norm": 0.2157873958349228, + "learning_rate": 6.198835020242467e-05, + "loss": 1.6008, + "step": 636 + }, + { + "epoch": 0.3532021070141392, + "grad_norm": 0.20912663638591766, + "learning_rate": 6.192667220170863e-05, + "loss": 1.5696, + "step": 637 + }, + { + "epoch": 0.3537565844191849, + "grad_norm": 0.21048259735107422, + "learning_rate": 6.18649195830878e-05, + "loss": 1.5968, + "step": 638 + }, + { + "epoch": 0.35431106182423067, + "grad_norm": 0.21538151800632477, + "learning_rate": 6.180309255671035e-05, + "loss": 1.5876, + "step": 639 + }, + { + "epoch": 0.3548655392292764, + "grad_norm": 0.22569550573825836, + "learning_rate": 6.174119133297775e-05, + "loss": 1.5357, + "step": 640 + }, + { + "epoch": 0.35542001663432216, + "grad_norm": 0.20763731002807617, + "learning_rate": 6.167921612254391e-05, + "loss": 1.6395, + "step": 641 + }, + { + "epoch": 0.3559744940393679, + "grad_norm": 0.23909436166286469, + "learning_rate": 6.161716713631453e-05, + "loss": 1.6401, + "step": 642 + }, + { + "epoch": 0.35652897144441364, + "grad_norm": 0.20901797711849213, + "learning_rate": 6.155504458544641e-05, + "loss": 1.6965, + "step": 643 + }, + { + "epoch": 0.35708344884945936, + "grad_norm": 0.21390089392662048, + "learning_rate": 6.149284868134663e-05, + "loss": 1.5991, + "step": 644 + }, + { + "epoch": 0.35763792625450513, + "grad_norm": 0.2071395069360733, + "learning_rate": 6.143057963567198e-05, + "loss": 1.6255, + "step": 645 + }, + { + "epoch": 0.35819240365955085, + "grad_norm": 0.23318380117416382, + "learning_rate": 6.136823766032808e-05, + "loss": 1.6149, + "step": 646 + }, + { + "epoch": 0.3587468810645966, + "grad_norm": 0.21669737994670868, + "learning_rate": 6.130582296746876e-05, + "loss": 1.6068, + "step": 647 + }, + { + "epoch": 0.35930135846964234, + "grad_norm": 0.20202122628688812, + "learning_rate": 6.124333576949533e-05, + "loss": 1.5232, + "step": 648 + }, + { + "epoch": 0.3598558358746881, + "grad_norm": 0.20784695446491241, + "learning_rate": 6.118077627905584e-05, + "loss": 1.5963, + "step": 649 + }, + { + "epoch": 0.36041031327973383, + "grad_norm": 0.20467756688594818, + "learning_rate": 6.111814470904431e-05, + "loss": 1.5672, + "step": 650 + }, + { + "epoch": 0.3609647906847796, + "grad_norm": 0.20664702355861664, + "learning_rate": 6.105544127260012e-05, + "loss": 1.5897, + "step": 651 + }, + { + "epoch": 0.3615192680898253, + "grad_norm": 0.21211595833301544, + "learning_rate": 6.0992666183107134e-05, + "loss": 1.5792, + "step": 652 + }, + { + "epoch": 0.3620737454948711, + "grad_norm": 0.22114971280097961, + "learning_rate": 6.092981965419313e-05, + "loss": 1.5907, + "step": 653 + }, + { + "epoch": 0.3626282228999168, + "grad_norm": 0.22703395783901215, + "learning_rate": 6.086690189972898e-05, + "loss": 1.5498, + "step": 654 + }, + { + "epoch": 0.3631827003049626, + "grad_norm": 0.22778987884521484, + "learning_rate": 6.080391313382793e-05, + "loss": 1.6132, + "step": 655 + }, + { + "epoch": 0.3637371777100083, + "grad_norm": 0.21941795945167542, + "learning_rate": 6.074085357084487e-05, + "loss": 1.6606, + "step": 656 + }, + { + "epoch": 0.36429165511505407, + "grad_norm": 0.20801624655723572, + "learning_rate": 6.0677723425375636e-05, + "loss": 1.558, + "step": 657 + }, + { + "epoch": 0.3648461325200998, + "grad_norm": 0.22957980632781982, + "learning_rate": 6.061452291225627e-05, + "loss": 1.4949, + "step": 658 + }, + { + "epoch": 0.36540060992514556, + "grad_norm": 0.20350852608680725, + "learning_rate": 6.055125224656225e-05, + "loss": 1.5406, + "step": 659 + }, + { + "epoch": 0.3659550873301913, + "grad_norm": 0.21100889146327972, + "learning_rate": 6.048791164360781e-05, + "loss": 1.5888, + "step": 660 + }, + { + "epoch": 0.36650956473523705, + "grad_norm": 0.21262866258621216, + "learning_rate": 6.0424501318945194e-05, + "loss": 1.61, + "step": 661 + }, + { + "epoch": 0.36706404214028276, + "grad_norm": 0.22323627769947052, + "learning_rate": 6.036102148836387e-05, + "loss": 1.6143, + "step": 662 + }, + { + "epoch": 0.36761851954532854, + "grad_norm": 0.22186584770679474, + "learning_rate": 6.02974723678899e-05, + "loss": 1.5533, + "step": 663 + }, + { + "epoch": 0.36817299695037425, + "grad_norm": 0.2202465534210205, + "learning_rate": 6.0233854173785086e-05, + "loss": 1.6063, + "step": 664 + }, + { + "epoch": 0.36872747435542, + "grad_norm": 0.2212614119052887, + "learning_rate": 6.017016712254635e-05, + "loss": 1.5558, + "step": 665 + }, + { + "epoch": 0.36928195176046574, + "grad_norm": 0.21273072063922882, + "learning_rate": 6.0106411430904865e-05, + "loss": 1.5717, + "step": 666 + }, + { + "epoch": 0.3698364291655115, + "grad_norm": 0.24066902697086334, + "learning_rate": 6.004258731582546e-05, + "loss": 1.6385, + "step": 667 + }, + { + "epoch": 0.37039090657055723, + "grad_norm": 0.2065984606742859, + "learning_rate": 5.997869499450581e-05, + "loss": 1.6573, + "step": 668 + }, + { + "epoch": 0.370945383975603, + "grad_norm": 0.29595062136650085, + "learning_rate": 5.991473468437562e-05, + "loss": 1.6945, + "step": 669 + }, + { + "epoch": 0.3714998613806487, + "grad_norm": 0.20868165791034698, + "learning_rate": 5.985070660309609e-05, + "loss": 1.5274, + "step": 670 + }, + { + "epoch": 0.3720543387856945, + "grad_norm": 0.2694888412952423, + "learning_rate": 5.978661096855893e-05, + "loss": 1.5829, + "step": 671 + }, + { + "epoch": 0.3726088161907402, + "grad_norm": 0.21415413916110992, + "learning_rate": 5.972244799888583e-05, + "loss": 1.6068, + "step": 672 + }, + { + "epoch": 0.373163293595786, + "grad_norm": 0.2538868486881256, + "learning_rate": 5.9658217912427554e-05, + "loss": 1.586, + "step": 673 + }, + { + "epoch": 0.3737177710008317, + "grad_norm": 0.22458183765411377, + "learning_rate": 5.959392092776333e-05, + "loss": 1.5573, + "step": 674 + }, + { + "epoch": 0.3742722484058775, + "grad_norm": 0.23578275740146637, + "learning_rate": 5.952955726370001e-05, + "loss": 1.5725, + "step": 675 + }, + { + "epoch": 0.3748267258109232, + "grad_norm": 0.24922388792037964, + "learning_rate": 5.946512713927135e-05, + "loss": 1.6212, + "step": 676 + }, + { + "epoch": 0.37538120321596896, + "grad_norm": 0.21490588784217834, + "learning_rate": 5.940063077373732e-05, + "loss": 1.61, + "step": 677 + }, + { + "epoch": 0.3759356806210147, + "grad_norm": 0.26785025000572205, + "learning_rate": 5.933606838658328e-05, + "loss": 1.5897, + "step": 678 + }, + { + "epoch": 0.37649015802606045, + "grad_norm": 0.2139958143234253, + "learning_rate": 5.927144019751925e-05, + "loss": 1.5807, + "step": 679 + }, + { + "epoch": 0.37704463543110617, + "grad_norm": 0.25820040702819824, + "learning_rate": 5.9206746426479215e-05, + "loss": 1.5907, + "step": 680 + }, + { + "epoch": 0.37759911283615194, + "grad_norm": 0.21500596404075623, + "learning_rate": 5.9141987293620334e-05, + "loss": 1.5283, + "step": 681 + }, + { + "epoch": 0.37815359024119766, + "grad_norm": 0.21151086688041687, + "learning_rate": 5.907716301932217e-05, + "loss": 1.644, + "step": 682 + }, + { + "epoch": 0.37870806764624343, + "grad_norm": 0.24681684374809265, + "learning_rate": 5.901227382418599e-05, + "loss": 1.55, + "step": 683 + }, + { + "epoch": 0.37926254505128915, + "grad_norm": 0.20953518152236938, + "learning_rate": 5.894731992903399e-05, + "loss": 1.5991, + "step": 684 + }, + { + "epoch": 0.3798170224563349, + "grad_norm": 0.2509237825870514, + "learning_rate": 5.888230155490853e-05, + "loss": 1.6488, + "step": 685 + }, + { + "epoch": 0.38037149986138064, + "grad_norm": 0.22760917246341705, + "learning_rate": 5.8817218923071406e-05, + "loss": 1.5948, + "step": 686 + }, + { + "epoch": 0.3809259772664264, + "grad_norm": 0.20528970658779144, + "learning_rate": 5.875207225500308e-05, + "loss": 1.53, + "step": 687 + }, + { + "epoch": 0.3814804546714721, + "grad_norm": 0.3011578619480133, + "learning_rate": 5.8686861772401974e-05, + "loss": 1.6489, + "step": 688 + }, + { + "epoch": 0.3820349320765179, + "grad_norm": 0.20146943628787994, + "learning_rate": 5.8621587697183595e-05, + "loss": 1.5495, + "step": 689 + }, + { + "epoch": 0.3825894094815636, + "grad_norm": 0.24754847586154938, + "learning_rate": 5.8556250251479945e-05, + "loss": 1.6115, + "step": 690 + }, + { + "epoch": 0.3831438868866094, + "grad_norm": 0.21535150706768036, + "learning_rate": 5.84908496576386e-05, + "loss": 1.6324, + "step": 691 + }, + { + "epoch": 0.3836983642916551, + "grad_norm": 0.21527227759361267, + "learning_rate": 5.8425386138222116e-05, + "loss": 1.5944, + "step": 692 + }, + { + "epoch": 0.3842528416967009, + "grad_norm": 0.2230699062347412, + "learning_rate": 5.8359859916007116e-05, + "loss": 1.5704, + "step": 693 + }, + { + "epoch": 0.3848073191017466, + "grad_norm": 0.20711487531661987, + "learning_rate": 5.8294271213983646e-05, + "loss": 1.614, + "step": 694 + }, + { + "epoch": 0.38536179650679236, + "grad_norm": 0.23301945626735687, + "learning_rate": 5.822862025535436e-05, + "loss": 1.6114, + "step": 695 + }, + { + "epoch": 0.3859162739118381, + "grad_norm": 0.22132247686386108, + "learning_rate": 5.816290726353378e-05, + "loss": 1.6219, + "step": 696 + }, + { + "epoch": 0.38647075131688385, + "grad_norm": 0.22144806385040283, + "learning_rate": 5.809713246214756e-05, + "loss": 1.6163, + "step": 697 + }, + { + "epoch": 0.38702522872192957, + "grad_norm": 0.21476450562477112, + "learning_rate": 5.8031296075031625e-05, + "loss": 1.5815, + "step": 698 + }, + { + "epoch": 0.38757970612697534, + "grad_norm": 0.2073630541563034, + "learning_rate": 5.7965398326231535e-05, + "loss": 1.5726, + "step": 699 + }, + { + "epoch": 0.38813418353202106, + "grad_norm": 0.23282212018966675, + "learning_rate": 5.7899439440001656e-05, + "loss": 1.5905, + "step": 700 + }, + { + "epoch": 0.38868866093706683, + "grad_norm": 0.2030622363090515, + "learning_rate": 5.7833419640804426e-05, + "loss": 1.606, + "step": 701 + }, + { + "epoch": 0.38924313834211255, + "grad_norm": 0.22797855734825134, + "learning_rate": 5.7767339153309526e-05, + "loss": 1.6297, + "step": 702 + }, + { + "epoch": 0.3897976157471583, + "grad_norm": 0.21430207788944244, + "learning_rate": 5.770119820239321e-05, + "loss": 1.647, + "step": 703 + }, + { + "epoch": 0.39035209315220404, + "grad_norm": 0.21015015244483948, + "learning_rate": 5.7634997013137465e-05, + "loss": 1.5667, + "step": 704 + }, + { + "epoch": 0.3909065705572498, + "grad_norm": 0.2301322966814041, + "learning_rate": 5.7568735810829294e-05, + "loss": 1.6313, + "step": 705 + }, + { + "epoch": 0.3914610479622955, + "grad_norm": 0.21517878770828247, + "learning_rate": 5.750241482095993e-05, + "loss": 1.671, + "step": 706 + }, + { + "epoch": 0.3920155253673413, + "grad_norm": 0.2164376974105835, + "learning_rate": 5.743603426922401e-05, + "loss": 1.6186, + "step": 707 + }, + { + "epoch": 0.392570002772387, + "grad_norm": 0.2114233523607254, + "learning_rate": 5.736959438151895e-05, + "loss": 1.7116, + "step": 708 + }, + { + "epoch": 0.3931244801774328, + "grad_norm": 0.22350315749645233, + "learning_rate": 5.730309538394404e-05, + "loss": 1.6869, + "step": 709 + }, + { + "epoch": 0.3936789575824785, + "grad_norm": 0.2107817828655243, + "learning_rate": 5.723653750279974e-05, + "loss": 1.6738, + "step": 710 + }, + { + "epoch": 0.3942334349875243, + "grad_norm": 0.20846299827098846, + "learning_rate": 5.716992096458686e-05, + "loss": 1.5339, + "step": 711 + }, + { + "epoch": 0.39478791239257, + "grad_norm": 0.2204621285200119, + "learning_rate": 5.710324599600589e-05, + "loss": 1.5552, + "step": 712 + }, + { + "epoch": 0.39534238979761577, + "grad_norm": 0.23123124241828918, + "learning_rate": 5.7036512823956085e-05, + "loss": 1.7236, + "step": 713 + }, + { + "epoch": 0.3958968672026615, + "grad_norm": 0.24838416278362274, + "learning_rate": 5.696972167553485e-05, + "loss": 1.6249, + "step": 714 + }, + { + "epoch": 0.39645134460770726, + "grad_norm": 0.2113085836172104, + "learning_rate": 5.6902872778036825e-05, + "loss": 1.5164, + "step": 715 + }, + { + "epoch": 0.397005822012753, + "grad_norm": 0.2129892259836197, + "learning_rate": 5.6835966358953186e-05, + "loss": 1.6242, + "step": 716 + }, + { + "epoch": 0.39756029941779875, + "grad_norm": 0.21519723534584045, + "learning_rate": 5.67690026459709e-05, + "loss": 1.5984, + "step": 717 + }, + { + "epoch": 0.39811477682284446, + "grad_norm": 0.20214593410491943, + "learning_rate": 5.670198186697185e-05, + "loss": 1.6229, + "step": 718 + }, + { + "epoch": 0.39866925422789024, + "grad_norm": 0.21002614498138428, + "learning_rate": 5.6634904250032166e-05, + "loss": 1.6142, + "step": 719 + }, + { + "epoch": 0.39922373163293595, + "grad_norm": 0.20920030772686005, + "learning_rate": 5.656777002342136e-05, + "loss": 1.5819, + "step": 720 + }, + { + "epoch": 0.3997782090379817, + "grad_norm": 0.21228238940238953, + "learning_rate": 5.650057941560164e-05, + "loss": 1.6284, + "step": 721 + }, + { + "epoch": 0.40033268644302744, + "grad_norm": 0.331759512424469, + "learning_rate": 5.643333265522702e-05, + "loss": 1.5552, + "step": 722 + }, + { + "epoch": 0.4008871638480732, + "grad_norm": 0.20200534164905548, + "learning_rate": 5.636602997114268e-05, + "loss": 1.545, + "step": 723 + }, + { + "epoch": 0.40144164125311893, + "grad_norm": 0.2071356177330017, + "learning_rate": 5.629867159238404e-05, + "loss": 1.5618, + "step": 724 + }, + { + "epoch": 0.4019961186581647, + "grad_norm": 0.2137397825717926, + "learning_rate": 5.62312577481761e-05, + "loss": 1.5858, + "step": 725 + }, + { + "epoch": 0.4025505960632104, + "grad_norm": 0.2188277244567871, + "learning_rate": 5.616378866793259e-05, + "loss": 1.6123, + "step": 726 + }, + { + "epoch": 0.4031050734682562, + "grad_norm": 0.21393775939941406, + "learning_rate": 5.609626458125521e-05, + "loss": 1.6253, + "step": 727 + }, + { + "epoch": 0.4036595508733019, + "grad_norm": 0.2320735603570938, + "learning_rate": 5.6028685717932895e-05, + "loss": 1.5904, + "step": 728 + }, + { + "epoch": 0.4042140282783477, + "grad_norm": 0.22155635058879852, + "learning_rate": 5.596105230794091e-05, + "loss": 1.6295, + "step": 729 + }, + { + "epoch": 0.4047685056833934, + "grad_norm": 0.21663862466812134, + "learning_rate": 5.589336458144023e-05, + "loss": 1.5455, + "step": 730 + }, + { + "epoch": 0.40532298308843917, + "grad_norm": 0.2116219699382782, + "learning_rate": 5.582562276877659e-05, + "loss": 1.6089, + "step": 731 + }, + { + "epoch": 0.4058774604934849, + "grad_norm": 0.21755720674991608, + "learning_rate": 5.575782710047985e-05, + "loss": 1.5895, + "step": 732 + }, + { + "epoch": 0.40643193789853066, + "grad_norm": 0.21230702102184296, + "learning_rate": 5.5689977807263105e-05, + "loss": 1.5425, + "step": 733 + }, + { + "epoch": 0.4069864153035764, + "grad_norm": 0.21914036571979523, + "learning_rate": 5.5622075120021976e-05, + "loss": 1.659, + "step": 734 + }, + { + "epoch": 0.40754089270862215, + "grad_norm": 0.2092135101556778, + "learning_rate": 5.5554119269833746e-05, + "loss": 1.5671, + "step": 735 + }, + { + "epoch": 0.40809537011366787, + "grad_norm": 0.20937521755695343, + "learning_rate": 5.548611048795663e-05, + "loss": 1.552, + "step": 736 + }, + { + "epoch": 0.40864984751871364, + "grad_norm": 0.20987878739833832, + "learning_rate": 5.5418049005828994e-05, + "loss": 1.565, + "step": 737 + }, + { + "epoch": 0.40920432492375935, + "grad_norm": 0.22819262742996216, + "learning_rate": 5.534993505506851e-05, + "loss": 1.5928, + "step": 738 + }, + { + "epoch": 0.4097588023288051, + "grad_norm": 0.2156732827425003, + "learning_rate": 5.5281768867471455e-05, + "loss": 1.6158, + "step": 739 + }, + { + "epoch": 0.41031327973385084, + "grad_norm": 0.21924850344657898, + "learning_rate": 5.521355067501181e-05, + "loss": 1.5497, + "step": 740 + }, + { + "epoch": 0.4108677571388966, + "grad_norm": 0.20754189789295197, + "learning_rate": 5.5145280709840566e-05, + "loss": 1.5645, + "step": 741 + }, + { + "epoch": 0.41142223454394233, + "grad_norm": 0.22220148146152496, + "learning_rate": 5.5076959204284915e-05, + "loss": 1.5868, + "step": 742 + }, + { + "epoch": 0.4119767119489881, + "grad_norm": 0.20873475074768066, + "learning_rate": 5.5008586390847404e-05, + "loss": 1.616, + "step": 743 + }, + { + "epoch": 0.4125311893540338, + "grad_norm": 0.2225092649459839, + "learning_rate": 5.494016250220521e-05, + "loss": 1.6132, + "step": 744 + }, + { + "epoch": 0.4130856667590796, + "grad_norm": 0.20154620707035065, + "learning_rate": 5.487168777120932e-05, + "loss": 1.5352, + "step": 745 + }, + { + "epoch": 0.4136401441641253, + "grad_norm": 0.20975901186466217, + "learning_rate": 5.480316243088375e-05, + "loss": 1.5762, + "step": 746 + }, + { + "epoch": 0.41419462156917103, + "grad_norm": 0.2048894613981247, + "learning_rate": 5.4734586714424706e-05, + "loss": 1.5744, + "step": 747 + }, + { + "epoch": 0.4147490989742168, + "grad_norm": 0.2058396339416504, + "learning_rate": 5.466596085519988e-05, + "loss": 1.6035, + "step": 748 + }, + { + "epoch": 0.4153035763792625, + "grad_norm": 0.21916894614696503, + "learning_rate": 5.459728508674756e-05, + "loss": 1.6108, + "step": 749 + }, + { + "epoch": 0.4158580537843083, + "grad_norm": 0.2189890742301941, + "learning_rate": 5.4528559642775885e-05, + "loss": 1.6031, + "step": 750 + }, + { + "epoch": 0.416412531189354, + "grad_norm": 0.2107512503862381, + "learning_rate": 5.445978475716207e-05, + "loss": 1.5418, + "step": 751 + }, + { + "epoch": 0.4169670085943998, + "grad_norm": 0.20556119084358215, + "learning_rate": 5.4390960663951565e-05, + "loss": 1.597, + "step": 752 + }, + { + "epoch": 0.4175214859994455, + "grad_norm": 0.20787164568901062, + "learning_rate": 5.4322087597357264e-05, + "loss": 1.6246, + "step": 753 + }, + { + "epoch": 0.41807596340449127, + "grad_norm": 0.22257111966609955, + "learning_rate": 5.4253165791758743e-05, + "loss": 1.5532, + "step": 754 + }, + { + "epoch": 0.418630440809537, + "grad_norm": 0.20410916209220886, + "learning_rate": 5.4184195481701425e-05, + "loss": 1.5668, + "step": 755 + }, + { + "epoch": 0.41918491821458276, + "grad_norm": 0.234087273478508, + "learning_rate": 5.411517690189581e-05, + "loss": 1.5941, + "step": 756 + }, + { + "epoch": 0.4197393956196285, + "grad_norm": 0.21471847593784332, + "learning_rate": 5.404611028721665e-05, + "loss": 1.5839, + "step": 757 + }, + { + "epoch": 0.42029387302467425, + "grad_norm": 0.2194783240556717, + "learning_rate": 5.3976995872702174e-05, + "loss": 1.664, + "step": 758 + }, + { + "epoch": 0.42084835042971996, + "grad_norm": 0.21059168875217438, + "learning_rate": 5.390783389355326e-05, + "loss": 1.5688, + "step": 759 + }, + { + "epoch": 0.42140282783476574, + "grad_norm": 0.22869731485843658, + "learning_rate": 5.3838624585132666e-05, + "loss": 1.5823, + "step": 760 + }, + { + "epoch": 0.42195730523981145, + "grad_norm": 0.21715888381004333, + "learning_rate": 5.3769368182964226e-05, + "loss": 1.6105, + "step": 761 + }, + { + "epoch": 0.4225117826448572, + "grad_norm": 0.2125747799873352, + "learning_rate": 5.3700064922732e-05, + "loss": 1.5806, + "step": 762 + }, + { + "epoch": 0.42306626004990294, + "grad_norm": 0.20356424152851105, + "learning_rate": 5.363071504027956e-05, + "loss": 1.5695, + "step": 763 + }, + { + "epoch": 0.4236207374549487, + "grad_norm": 0.21644245088100433, + "learning_rate": 5.3561318771609076e-05, + "loss": 1.6602, + "step": 764 + }, + { + "epoch": 0.42417521485999443, + "grad_norm": 0.21127082407474518, + "learning_rate": 5.349187635288063e-05, + "loss": 1.554, + "step": 765 + }, + { + "epoch": 0.4247296922650402, + "grad_norm": 0.22828276455402374, + "learning_rate": 5.3422388020411325e-05, + "loss": 1.6498, + "step": 766 + }, + { + "epoch": 0.4252841696700859, + "grad_norm": 0.22633306682109833, + "learning_rate": 5.3352854010674515e-05, + "loss": 1.6079, + "step": 767 + }, + { + "epoch": 0.4258386470751317, + "grad_norm": 0.2162792831659317, + "learning_rate": 5.3283274560299e-05, + "loss": 1.6315, + "step": 768 + }, + { + "epoch": 0.4263931244801774, + "grad_norm": 0.23089319467544556, + "learning_rate": 5.321364990606821e-05, + "loss": 1.638, + "step": 769 + }, + { + "epoch": 0.4269476018852232, + "grad_norm": 0.2013845294713974, + "learning_rate": 5.3143980284919424e-05, + "loss": 1.5903, + "step": 770 + }, + { + "epoch": 0.4275020792902689, + "grad_norm": 0.21818867325782776, + "learning_rate": 5.307426593394294e-05, + "loss": 1.5645, + "step": 771 + }, + { + "epoch": 0.42805655669531467, + "grad_norm": 0.2172323316335678, + "learning_rate": 5.300450709038126e-05, + "loss": 1.5598, + "step": 772 + }, + { + "epoch": 0.4286110341003604, + "grad_norm": 0.21557918190956116, + "learning_rate": 5.29347039916283e-05, + "loss": 1.5937, + "step": 773 + }, + { + "epoch": 0.42916551150540616, + "grad_norm": 0.6955245733261108, + "learning_rate": 5.286485687522861e-05, + "loss": 1.6414, + "step": 774 + }, + { + "epoch": 0.4297199889104519, + "grad_norm": 0.21430703997612, + "learning_rate": 5.27949659788765e-05, + "loss": 1.5782, + "step": 775 + }, + { + "epoch": 0.43027446631549765, + "grad_norm": 0.2163209319114685, + "learning_rate": 5.272503154041527e-05, + "loss": 1.5992, + "step": 776 + }, + { + "epoch": 0.43082894372054337, + "grad_norm": 0.21228471398353577, + "learning_rate": 5.265505379783642e-05, + "loss": 1.5764, + "step": 777 + }, + { + "epoch": 0.43138342112558914, + "grad_norm": 0.21286910772323608, + "learning_rate": 5.258503298927879e-05, + "loss": 1.5892, + "step": 778 + }, + { + "epoch": 0.43193789853063486, + "grad_norm": 0.24243953824043274, + "learning_rate": 5.25149693530278e-05, + "loss": 1.6385, + "step": 779 + }, + { + "epoch": 0.43249237593568063, + "grad_norm": 0.20691627264022827, + "learning_rate": 5.244486312751459e-05, + "loss": 1.5992, + "step": 780 + }, + { + "epoch": 0.43304685334072635, + "grad_norm": 0.21702216565608978, + "learning_rate": 5.237471455131526e-05, + "loss": 1.5866, + "step": 781 + }, + { + "epoch": 0.4336013307457721, + "grad_norm": 0.22178849577903748, + "learning_rate": 5.230452386315e-05, + "loss": 1.6211, + "step": 782 + }, + { + "epoch": 0.43415580815081783, + "grad_norm": 0.2103152573108673, + "learning_rate": 5.223429130188235e-05, + "loss": 1.5822, + "step": 783 + }, + { + "epoch": 0.4347102855558636, + "grad_norm": 0.2260008603334427, + "learning_rate": 5.216401710651831e-05, + "loss": 1.5565, + "step": 784 + }, + { + "epoch": 0.4352647629609093, + "grad_norm": 0.22004656493663788, + "learning_rate": 5.20937015162056e-05, + "loss": 1.6813, + "step": 785 + }, + { + "epoch": 0.4358192403659551, + "grad_norm": 0.21554167568683624, + "learning_rate": 5.202334477023277e-05, + "loss": 1.6137, + "step": 786 + }, + { + "epoch": 0.4363737177710008, + "grad_norm": 0.2462761402130127, + "learning_rate": 5.195294710802845e-05, + "loss": 1.6396, + "step": 787 + }, + { + "epoch": 0.4369281951760466, + "grad_norm": 0.20096971094608307, + "learning_rate": 5.1882508769160504e-05, + "loss": 1.5586, + "step": 788 + }, + { + "epoch": 0.4374826725810923, + "grad_norm": 0.21837452054023743, + "learning_rate": 5.1812029993335226e-05, + "loss": 1.5882, + "step": 789 + }, + { + "epoch": 0.4380371499861381, + "grad_norm": 0.20460233092308044, + "learning_rate": 5.174151102039653e-05, + "loss": 1.5031, + "step": 790 + }, + { + "epoch": 0.4385916273911838, + "grad_norm": 0.2106686383485794, + "learning_rate": 5.167095209032509e-05, + "loss": 1.5377, + "step": 791 + }, + { + "epoch": 0.43914610479622956, + "grad_norm": 0.24171209335327148, + "learning_rate": 5.160035344323758e-05, + "loss": 1.6242, + "step": 792 + }, + { + "epoch": 0.4397005822012753, + "grad_norm": 0.21198348701000214, + "learning_rate": 5.152971531938583e-05, + "loss": 1.5975, + "step": 793 + }, + { + "epoch": 0.44025505960632105, + "grad_norm": 0.2270112782716751, + "learning_rate": 5.145903795915603e-05, + "loss": 1.5772, + "step": 794 + }, + { + "epoch": 0.44080953701136677, + "grad_norm": 0.20378631353378296, + "learning_rate": 5.138832160306785e-05, + "loss": 1.5778, + "step": 795 + }, + { + "epoch": 0.44136401441641254, + "grad_norm": 0.23949572443962097, + "learning_rate": 5.1317566491773714e-05, + "loss": 1.6321, + "step": 796 + }, + { + "epoch": 0.44191849182145826, + "grad_norm": 0.20986564457416534, + "learning_rate": 5.1246772866057885e-05, + "loss": 1.5273, + "step": 797 + }, + { + "epoch": 0.44247296922650403, + "grad_norm": 0.20840664207935333, + "learning_rate": 5.117594096683574e-05, + "loss": 1.5354, + "step": 798 + }, + { + "epoch": 0.44302744663154975, + "grad_norm": 0.24688231945037842, + "learning_rate": 5.1105071035152884e-05, + "loss": 1.637, + "step": 799 + }, + { + "epoch": 0.4435819240365955, + "grad_norm": 0.20614393055438995, + "learning_rate": 5.1034163312184325e-05, + "loss": 1.5573, + "step": 800 + }, + { + "epoch": 0.44413640144164124, + "grad_norm": 0.22784316539764404, + "learning_rate": 5.096321803923372e-05, + "loss": 1.6117, + "step": 801 + }, + { + "epoch": 0.444690878846687, + "grad_norm": 0.22619880735874176, + "learning_rate": 5.089223545773248e-05, + "loss": 1.5151, + "step": 802 + }, + { + "epoch": 0.4452453562517327, + "grad_norm": 0.2142421305179596, + "learning_rate": 5.082121580923899e-05, + "loss": 1.6123, + "step": 803 + }, + { + "epoch": 0.4457998336567785, + "grad_norm": 0.21790969371795654, + "learning_rate": 5.0750159335437775e-05, + "loss": 1.5859, + "step": 804 + }, + { + "epoch": 0.4463543110618242, + "grad_norm": 0.21449251472949982, + "learning_rate": 5.067906627813868e-05, + "loss": 1.6656, + "step": 805 + }, + { + "epoch": 0.44690878846687, + "grad_norm": 0.221047043800354, + "learning_rate": 5.0607936879276055e-05, + "loss": 1.5889, + "step": 806 + }, + { + "epoch": 0.4474632658719157, + "grad_norm": 0.22916845977306366, + "learning_rate": 5.0536771380907885e-05, + "loss": 1.5901, + "step": 807 + }, + { + "epoch": 0.4480177432769615, + "grad_norm": 0.20636655390262604, + "learning_rate": 5.046557002521507e-05, + "loss": 1.5883, + "step": 808 + }, + { + "epoch": 0.4485722206820072, + "grad_norm": 0.23739656805992126, + "learning_rate": 5.039433305450047e-05, + "loss": 1.6089, + "step": 809 + }, + { + "epoch": 0.44912669808705297, + "grad_norm": 0.2077774852514267, + "learning_rate": 5.03230607111882e-05, + "loss": 1.5868, + "step": 810 + }, + { + "epoch": 0.4496811754920987, + "grad_norm": 0.21799373626708984, + "learning_rate": 5.025175323782267e-05, + "loss": 1.565, + "step": 811 + }, + { + "epoch": 0.45023565289714446, + "grad_norm": 0.21249794960021973, + "learning_rate": 5.0180410877067955e-05, + "loss": 1.5063, + "step": 812 + }, + { + "epoch": 0.4507901303021902, + "grad_norm": 0.21143661439418793, + "learning_rate": 5.0109033871706754e-05, + "loss": 1.62, + "step": 813 + }, + { + "epoch": 0.45134460770723595, + "grad_norm": 0.21807655692100525, + "learning_rate": 5.0037622464639724e-05, + "loss": 1.585, + "step": 814 + }, + { + "epoch": 0.45189908511228166, + "grad_norm": 0.2183266580104828, + "learning_rate": 4.996617689888455e-05, + "loss": 1.6266, + "step": 815 + }, + { + "epoch": 0.45245356251732743, + "grad_norm": 0.20334866642951965, + "learning_rate": 4.989469741757519e-05, + "loss": 1.6247, + "step": 816 + }, + { + "epoch": 0.45300803992237315, + "grad_norm": 0.21051491796970367, + "learning_rate": 4.9823184263961e-05, + "loss": 1.5917, + "step": 817 + }, + { + "epoch": 0.4535625173274189, + "grad_norm": 0.2286101132631302, + "learning_rate": 4.975163768140596e-05, + "loss": 1.6318, + "step": 818 + }, + { + "epoch": 0.45411699473246464, + "grad_norm": 0.20518524944782257, + "learning_rate": 4.9680057913387775e-05, + "loss": 1.5723, + "step": 819 + }, + { + "epoch": 0.4546714721375104, + "grad_norm": 0.2158391773700714, + "learning_rate": 4.960844520349709e-05, + "loss": 1.6655, + "step": 820 + }, + { + "epoch": 0.45522594954255613, + "grad_norm": 0.20380595326423645, + "learning_rate": 4.953679979543666e-05, + "loss": 1.5681, + "step": 821 + }, + { + "epoch": 0.4557804269476019, + "grad_norm": 0.2011675089597702, + "learning_rate": 4.946512193302051e-05, + "loss": 1.5654, + "step": 822 + }, + { + "epoch": 0.4563349043526476, + "grad_norm": 0.21012163162231445, + "learning_rate": 4.939341186017313e-05, + "loss": 1.6037, + "step": 823 + }, + { + "epoch": 0.4568893817576934, + "grad_norm": 0.2087511122226715, + "learning_rate": 4.932166982092858e-05, + "loss": 1.5656, + "step": 824 + }, + { + "epoch": 0.4574438591627391, + "grad_norm": 0.21227777004241943, + "learning_rate": 4.924989605942973e-05, + "loss": 1.6027, + "step": 825 + }, + { + "epoch": 0.4579983365677849, + "grad_norm": 0.20791299641132355, + "learning_rate": 4.9178090819927414e-05, + "loss": 1.5582, + "step": 826 + }, + { + "epoch": 0.4585528139728306, + "grad_norm": 0.2057400643825531, + "learning_rate": 4.910625434677956e-05, + "loss": 1.6189, + "step": 827 + }, + { + "epoch": 0.45910729137787637, + "grad_norm": 0.20824283361434937, + "learning_rate": 4.903438688445043e-05, + "loss": 1.5573, + "step": 828 + }, + { + "epoch": 0.4596617687829221, + "grad_norm": 0.20382678508758545, + "learning_rate": 4.896248867750969e-05, + "loss": 1.5942, + "step": 829 + }, + { + "epoch": 0.46021624618796786, + "grad_norm": 0.21372130513191223, + "learning_rate": 4.8890559970631667e-05, + "loss": 1.5624, + "step": 830 + }, + { + "epoch": 0.4607707235930136, + "grad_norm": 0.20644085109233856, + "learning_rate": 4.881860100859446e-05, + "loss": 1.6326, + "step": 831 + }, + { + "epoch": 0.46132520099805935, + "grad_norm": 0.20801600813865662, + "learning_rate": 4.874661203627917e-05, + "loss": 1.5424, + "step": 832 + }, + { + "epoch": 0.46187967840310507, + "grad_norm": 0.21283288300037384, + "learning_rate": 4.867459329866897e-05, + "loss": 1.6228, + "step": 833 + }, + { + "epoch": 0.46243415580815084, + "grad_norm": 0.21098175644874573, + "learning_rate": 4.860254504084835e-05, + "loss": 1.5839, + "step": 834 + }, + { + "epoch": 0.46298863321319655, + "grad_norm": 0.20932459831237793, + "learning_rate": 4.853046750800228e-05, + "loss": 1.5911, + "step": 835 + }, + { + "epoch": 0.4635431106182423, + "grad_norm": 0.2054491639137268, + "learning_rate": 4.8458360945415317e-05, + "loss": 1.6222, + "step": 836 + }, + { + "epoch": 0.46409758802328804, + "grad_norm": 0.19949305057525635, + "learning_rate": 4.838622559847084e-05, + "loss": 1.5386, + "step": 837 + }, + { + "epoch": 0.4646520654283338, + "grad_norm": 0.21647600829601288, + "learning_rate": 4.831406171265015e-05, + "loss": 1.5216, + "step": 838 + }, + { + "epoch": 0.46520654283337953, + "grad_norm": 0.20176930725574493, + "learning_rate": 4.824186953353171e-05, + "loss": 1.5054, + "step": 839 + }, + { + "epoch": 0.4657610202384253, + "grad_norm": 0.20189246535301208, + "learning_rate": 4.816964930679024e-05, + "loss": 1.5422, + "step": 840 + }, + { + "epoch": 0.466315497643471, + "grad_norm": 0.2124892771244049, + "learning_rate": 4.8097401278195904e-05, + "loss": 1.6112, + "step": 841 + }, + { + "epoch": 0.4668699750485168, + "grad_norm": 0.20635344088077545, + "learning_rate": 4.8025125693613485e-05, + "loss": 1.55, + "step": 842 + }, + { + "epoch": 0.4674244524535625, + "grad_norm": 0.20471568405628204, + "learning_rate": 4.7952822799001564e-05, + "loss": 1.5934, + "step": 843 + }, + { + "epoch": 0.4679789298586083, + "grad_norm": 0.20640766620635986, + "learning_rate": 4.78804928404116e-05, + "loss": 1.5859, + "step": 844 + }, + { + "epoch": 0.468533407263654, + "grad_norm": 0.2087356001138687, + "learning_rate": 4.780813606398722e-05, + "loss": 1.6468, + "step": 845 + }, + { + "epoch": 0.4690878846686998, + "grad_norm": 0.20477420091629028, + "learning_rate": 4.7735752715963265e-05, + "loss": 1.5926, + "step": 846 + }, + { + "epoch": 0.4696423620737455, + "grad_norm": 0.20607273280620575, + "learning_rate": 4.766334304266503e-05, + "loss": 1.5688, + "step": 847 + }, + { + "epoch": 0.47019683947879126, + "grad_norm": 0.20167583227157593, + "learning_rate": 4.7590907290507396e-05, + "loss": 1.5408, + "step": 848 + }, + { + "epoch": 0.470751316883837, + "grad_norm": 0.22671446204185486, + "learning_rate": 4.751844570599395e-05, + "loss": 1.5693, + "step": 849 + }, + { + "epoch": 0.47130579428888275, + "grad_norm": 0.2094809114933014, + "learning_rate": 4.7445958535716265e-05, + "loss": 1.586, + "step": 850 + }, + { + "epoch": 0.47186027169392847, + "grad_norm": 0.21534985303878784, + "learning_rate": 4.73734460263529e-05, + "loss": 1.6699, + "step": 851 + }, + { + "epoch": 0.47241474909897424, + "grad_norm": 0.21628102660179138, + "learning_rate": 4.730090842466871e-05, + "loss": 1.6307, + "step": 852 + }, + { + "epoch": 0.47296922650401996, + "grad_norm": 0.21070408821105957, + "learning_rate": 4.7228345977513886e-05, + "loss": 1.5859, + "step": 853 + }, + { + "epoch": 0.47352370390906573, + "grad_norm": 0.21383161842823029, + "learning_rate": 4.715575893182324e-05, + "loss": 1.672, + "step": 854 + }, + { + "epoch": 0.47407818131411145, + "grad_norm": 0.2086363434791565, + "learning_rate": 4.7083147534615224e-05, + "loss": 1.5431, + "step": 855 + }, + { + "epoch": 0.4746326587191572, + "grad_norm": 0.1970445066690445, + "learning_rate": 4.70105120329912e-05, + "loss": 1.4742, + "step": 856 + }, + { + "epoch": 0.47518713612420294, + "grad_norm": 0.20317339897155762, + "learning_rate": 4.6937852674134555e-05, + "loss": 1.5643, + "step": 857 + }, + { + "epoch": 0.4757416135292487, + "grad_norm": 0.20548830926418304, + "learning_rate": 4.6865169705309815e-05, + "loss": 1.5369, + "step": 858 + }, + { + "epoch": 0.4762960909342944, + "grad_norm": 0.20916157960891724, + "learning_rate": 4.679246337386195e-05, + "loss": 1.544, + "step": 859 + }, + { + "epoch": 0.4768505683393402, + "grad_norm": 0.2089296132326126, + "learning_rate": 4.671973392721535e-05, + "loss": 1.5906, + "step": 860 + }, + { + "epoch": 0.4774050457443859, + "grad_norm": 0.21110884845256805, + "learning_rate": 4.6646981612873105e-05, + "loss": 1.6617, + "step": 861 + }, + { + "epoch": 0.4779595231494317, + "grad_norm": 0.20762455463409424, + "learning_rate": 4.65742066784161e-05, + "loss": 1.5733, + "step": 862 + }, + { + "epoch": 0.4785140005544774, + "grad_norm": 0.20924974977970123, + "learning_rate": 4.650140937150222e-05, + "loss": 1.6104, + "step": 863 + }, + { + "epoch": 0.4790684779595232, + "grad_norm": 0.21603722870349884, + "learning_rate": 4.642858993986549e-05, + "loss": 1.5838, + "step": 864 + }, + { + "epoch": 0.4796229553645689, + "grad_norm": 0.20661696791648865, + "learning_rate": 4.635574863131522e-05, + "loss": 1.6063, + "step": 865 + }, + { + "epoch": 0.48017743276961466, + "grad_norm": 0.2147568166255951, + "learning_rate": 4.6282885693735145e-05, + "loss": 1.5424, + "step": 866 + }, + { + "epoch": 0.4807319101746604, + "grad_norm": 0.20484501123428345, + "learning_rate": 4.621000137508263e-05, + "loss": 1.5754, + "step": 867 + }, + { + "epoch": 0.48128638757970615, + "grad_norm": 0.22287361323833466, + "learning_rate": 4.61370959233878e-05, + "loss": 1.6868, + "step": 868 + }, + { + "epoch": 0.48184086498475187, + "grad_norm": 0.21111956238746643, + "learning_rate": 4.6064169586752706e-05, + "loss": 1.4827, + "step": 869 + }, + { + "epoch": 0.48239534238979764, + "grad_norm": 0.20634272694587708, + "learning_rate": 4.599122261335044e-05, + "loss": 1.5634, + "step": 870 + }, + { + "epoch": 0.48294981979484336, + "grad_norm": 0.2151550054550171, + "learning_rate": 4.591825525142433e-05, + "loss": 1.5334, + "step": 871 + }, + { + "epoch": 0.4835042971998891, + "grad_norm": 0.21538786590099335, + "learning_rate": 4.584526774928713e-05, + "loss": 1.563, + "step": 872 + }, + { + "epoch": 0.48405877460493485, + "grad_norm": 0.2079581320285797, + "learning_rate": 4.5772260355320075e-05, + "loss": 1.5912, + "step": 873 + }, + { + "epoch": 0.48461325200998057, + "grad_norm": 0.21243925392627716, + "learning_rate": 4.5699233317972145e-05, + "loss": 1.5245, + "step": 874 + }, + { + "epoch": 0.48516772941502634, + "grad_norm": 0.20607562363147736, + "learning_rate": 4.562618688575911e-05, + "loss": 1.5385, + "step": 875 + }, + { + "epoch": 0.48572220682007206, + "grad_norm": 0.2081446647644043, + "learning_rate": 4.555312130726279e-05, + "loss": 1.589, + "step": 876 + }, + { + "epoch": 0.48627668422511783, + "grad_norm": 0.20849229395389557, + "learning_rate": 4.5480036831130144e-05, + "loss": 1.6432, + "step": 877 + }, + { + "epoch": 0.48683116163016354, + "grad_norm": 0.220262348651886, + "learning_rate": 4.540693370607244e-05, + "loss": 1.5364, + "step": 878 + }, + { + "epoch": 0.4873856390352093, + "grad_norm": 0.2163216471672058, + "learning_rate": 4.533381218086443e-05, + "loss": 1.6256, + "step": 879 + }, + { + "epoch": 0.48794011644025503, + "grad_norm": 0.21877123415470123, + "learning_rate": 4.5260672504343436e-05, + "loss": 1.6153, + "step": 880 + }, + { + "epoch": 0.4884945938453008, + "grad_norm": 0.20430409908294678, + "learning_rate": 4.518751492540859e-05, + "loss": 1.5887, + "step": 881 + }, + { + "epoch": 0.4890490712503465, + "grad_norm": 0.21504053473472595, + "learning_rate": 4.5114339693019924e-05, + "loss": 1.6251, + "step": 882 + }, + { + "epoch": 0.4896035486553923, + "grad_norm": 0.21704961359500885, + "learning_rate": 4.504114705619758e-05, + "loss": 1.6214, + "step": 883 + }, + { + "epoch": 0.490158026060438, + "grad_norm": 0.2103128433227539, + "learning_rate": 4.4967937264020896e-05, + "loss": 1.5078, + "step": 884 + }, + { + "epoch": 0.4907125034654838, + "grad_norm": 0.21497170627117157, + "learning_rate": 4.4894710565627585e-05, + "loss": 1.6292, + "step": 885 + }, + { + "epoch": 0.4912669808705295, + "grad_norm": 0.20846691727638245, + "learning_rate": 4.4821467210212924e-05, + "loss": 1.5809, + "step": 886 + }, + { + "epoch": 0.4918214582755753, + "grad_norm": 0.20751187205314636, + "learning_rate": 4.474820744702887e-05, + "loss": 1.5428, + "step": 887 + }, + { + "epoch": 0.492375935680621, + "grad_norm": 0.22106949985027313, + "learning_rate": 4.4674931525383176e-05, + "loss": 1.57, + "step": 888 + }, + { + "epoch": 0.49293041308566676, + "grad_norm": 0.21874569356441498, + "learning_rate": 4.460163969463864e-05, + "loss": 1.565, + "step": 889 + }, + { + "epoch": 0.4934848904907125, + "grad_norm": 0.21572619676589966, + "learning_rate": 4.452833220421216e-05, + "loss": 1.5302, + "step": 890 + }, + { + "epoch": 0.49403936789575825, + "grad_norm": 0.2403019368648529, + "learning_rate": 4.445500930357393e-05, + "loss": 1.5533, + "step": 891 + }, + { + "epoch": 0.49459384530080397, + "grad_norm": 0.2276349812746048, + "learning_rate": 4.438167124224663e-05, + "loss": 1.5833, + "step": 892 + }, + { + "epoch": 0.49514832270584974, + "grad_norm": 0.20668740570545197, + "learning_rate": 4.430831826980445e-05, + "loss": 1.5467, + "step": 893 + }, + { + "epoch": 0.49570280011089546, + "grad_norm": 0.21800528466701508, + "learning_rate": 4.4234950635872406e-05, + "loss": 1.5423, + "step": 894 + }, + { + "epoch": 0.49625727751594123, + "grad_norm": 0.21564136445522308, + "learning_rate": 4.416156859012534e-05, + "loss": 1.5414, + "step": 895 + }, + { + "epoch": 0.49681175492098695, + "grad_norm": 0.21869921684265137, + "learning_rate": 4.4088172382287205e-05, + "loss": 1.5703, + "step": 896 + }, + { + "epoch": 0.4973662323260327, + "grad_norm": 0.2215128093957901, + "learning_rate": 4.401476226213009e-05, + "loss": 1.5246, + "step": 897 + }, + { + "epoch": 0.49792070973107844, + "grad_norm": 0.24564118683338165, + "learning_rate": 4.394133847947346e-05, + "loss": 1.6237, + "step": 898 + }, + { + "epoch": 0.4984751871361242, + "grad_norm": 0.20941196382045746, + "learning_rate": 4.386790128418328e-05, + "loss": 1.5816, + "step": 899 + }, + { + "epoch": 0.4990296645411699, + "grad_norm": 0.21743817627429962, + "learning_rate": 4.3794450926171106e-05, + "loss": 1.5763, + "step": 900 + }, + { + "epoch": 0.4995841419462157, + "grad_norm": 0.2064601182937622, + "learning_rate": 4.3720987655393384e-05, + "loss": 1.549, + "step": 901 + }, + { + "epoch": 0.5001386193512615, + "grad_norm": 0.2088851034641266, + "learning_rate": 4.36475117218504e-05, + "loss": 1.5859, + "step": 902 + }, + { + "epoch": 0.5006930967563071, + "grad_norm": 0.21022847294807434, + "learning_rate": 4.357402337558561e-05, + "loss": 1.5749, + "step": 903 + }, + { + "epoch": 0.5012475741613529, + "grad_norm": 0.21769824624061584, + "learning_rate": 4.350052286668466e-05, + "loss": 1.6526, + "step": 904 + }, + { + "epoch": 0.5018020515663987, + "grad_norm": 0.2308962196111679, + "learning_rate": 4.342701044527461e-05, + "loss": 1.6642, + "step": 905 + }, + { + "epoch": 0.5023565289714444, + "grad_norm": 0.21604010462760925, + "learning_rate": 4.335348636152306e-05, + "loss": 1.6659, + "step": 906 + }, + { + "epoch": 0.5029110063764901, + "grad_norm": 0.20606866478919983, + "learning_rate": 4.3279950865637296e-05, + "loss": 1.5198, + "step": 907 + }, + { + "epoch": 0.5034654837815359, + "grad_norm": 0.20940081775188446, + "learning_rate": 4.320640420786344e-05, + "loss": 1.572, + "step": 908 + }, + { + "epoch": 0.5040199611865817, + "grad_norm": 0.22230613231658936, + "learning_rate": 4.313284663848558e-05, + "loss": 1.5601, + "step": 909 + }, + { + "epoch": 0.5045744385916274, + "grad_norm": 0.21619489789009094, + "learning_rate": 4.305927840782497e-05, + "loss": 1.6177, + "step": 910 + }, + { + "epoch": 0.5051289159966731, + "grad_norm": 0.2147083431482315, + "learning_rate": 4.298569976623912e-05, + "loss": 1.6277, + "step": 911 + }, + { + "epoch": 0.5056833934017189, + "grad_norm": 0.21617257595062256, + "learning_rate": 4.291211096412099e-05, + "loss": 1.6043, + "step": 912 + }, + { + "epoch": 0.5062378708067646, + "grad_norm": 0.20666974782943726, + "learning_rate": 4.283851225189807e-05, + "loss": 1.5511, + "step": 913 + }, + { + "epoch": 0.5067923482118104, + "grad_norm": 0.22238971292972565, + "learning_rate": 4.276490388003164e-05, + "loss": 1.6442, + "step": 914 + }, + { + "epoch": 0.5073468256168561, + "grad_norm": 0.21716102957725525, + "learning_rate": 4.269128609901581e-05, + "loss": 1.5887, + "step": 915 + }, + { + "epoch": 0.5079013030219018, + "grad_norm": 0.20268242061138153, + "learning_rate": 4.261765915937674e-05, + "loss": 1.5302, + "step": 916 + }, + { + "epoch": 0.5084557804269476, + "grad_norm": 0.21744856238365173, + "learning_rate": 4.254402331167171e-05, + "loss": 1.5459, + "step": 917 + }, + { + "epoch": 0.5090102578319934, + "grad_norm": 0.20796500146389008, + "learning_rate": 4.247037880648836e-05, + "loss": 1.5942, + "step": 918 + }, + { + "epoch": 0.509564735237039, + "grad_norm": 0.22836215794086456, + "learning_rate": 4.239672589444376e-05, + "loss": 1.6449, + "step": 919 + }, + { + "epoch": 0.5101192126420848, + "grad_norm": 0.20672906935214996, + "learning_rate": 4.232306482618362e-05, + "loss": 1.5657, + "step": 920 + }, + { + "epoch": 0.5106736900471306, + "grad_norm": 0.21046310663223267, + "learning_rate": 4.22493958523814e-05, + "loss": 1.5618, + "step": 921 + }, + { + "epoch": 0.5112281674521764, + "grad_norm": 0.2109476923942566, + "learning_rate": 4.2175719223737426e-05, + "loss": 1.5723, + "step": 922 + }, + { + "epoch": 0.511782644857222, + "grad_norm": 0.2226995974779129, + "learning_rate": 4.210203519097813e-05, + "loss": 1.6005, + "step": 923 + }, + { + "epoch": 0.5123371222622678, + "grad_norm": 0.21203988790512085, + "learning_rate": 4.202834400485508e-05, + "loss": 1.5898, + "step": 924 + }, + { + "epoch": 0.5128915996673136, + "grad_norm": 0.21835650503635406, + "learning_rate": 4.1954645916144245e-05, + "loss": 1.5458, + "step": 925 + }, + { + "epoch": 0.5134460770723593, + "grad_norm": 0.22072114050388336, + "learning_rate": 4.188094117564505e-05, + "loss": 1.5888, + "step": 926 + }, + { + "epoch": 0.514000554477405, + "grad_norm": 0.20924319326877594, + "learning_rate": 4.1807230034179566e-05, + "loss": 1.5856, + "step": 927 + }, + { + "epoch": 0.5145550318824508, + "grad_norm": 0.21937541663646698, + "learning_rate": 4.1733512742591646e-05, + "loss": 1.5418, + "step": 928 + }, + { + "epoch": 0.5151095092874965, + "grad_norm": 0.2233019769191742, + "learning_rate": 4.165978955174606e-05, + "loss": 1.6723, + "step": 929 + }, + { + "epoch": 0.5156639866925423, + "grad_norm": 0.22729487717151642, + "learning_rate": 4.15860607125277e-05, + "loss": 1.5608, + "step": 930 + }, + { + "epoch": 0.516218464097588, + "grad_norm": 0.20513324439525604, + "learning_rate": 4.151232647584061e-05, + "loss": 1.5652, + "step": 931 + }, + { + "epoch": 0.5167729415026338, + "grad_norm": 0.2118518203496933, + "learning_rate": 4.143858709260726e-05, + "loss": 1.6201, + "step": 932 + }, + { + "epoch": 0.5173274189076795, + "grad_norm": 0.20294101536273956, + "learning_rate": 4.136484281376758e-05, + "loss": 1.5273, + "step": 933 + }, + { + "epoch": 0.5178818963127253, + "grad_norm": 0.21510812640190125, + "learning_rate": 4.1291093890278244e-05, + "loss": 1.5741, + "step": 934 + }, + { + "epoch": 0.518436373717771, + "grad_norm": 0.20787851512432098, + "learning_rate": 4.1217340573111625e-05, + "loss": 1.6002, + "step": 935 + }, + { + "epoch": 0.5189908511228167, + "grad_norm": 0.20799653232097626, + "learning_rate": 4.114358311325513e-05, + "loss": 1.541, + "step": 936 + }, + { + "epoch": 0.5195453285278625, + "grad_norm": 0.20286118984222412, + "learning_rate": 4.106982176171025e-05, + "loss": 1.4923, + "step": 937 + }, + { + "epoch": 0.5200998059329083, + "grad_norm": 0.2020486742258072, + "learning_rate": 4.099605676949169e-05, + "loss": 1.4772, + "step": 938 + }, + { + "epoch": 0.5206542833379539, + "grad_norm": 0.21534226834774017, + "learning_rate": 4.0922288387626536e-05, + "loss": 1.5663, + "step": 939 + }, + { + "epoch": 0.5212087607429997, + "grad_norm": 0.24164481461048126, + "learning_rate": 4.0848516867153474e-05, + "loss": 1.6155, + "step": 940 + }, + { + "epoch": 0.5217632381480455, + "grad_norm": 0.20027059316635132, + "learning_rate": 4.077474245912182e-05, + "loss": 1.5143, + "step": 941 + }, + { + "epoch": 0.5223177155530913, + "grad_norm": 0.2217823565006256, + "learning_rate": 4.070096541459071e-05, + "loss": 1.5583, + "step": 942 + }, + { + "epoch": 0.5228721929581369, + "grad_norm": 0.21679724752902985, + "learning_rate": 4.0627185984628295e-05, + "loss": 1.5588, + "step": 943 + }, + { + "epoch": 0.5234266703631827, + "grad_norm": 0.20818306505680084, + "learning_rate": 4.055340442031079e-05, + "loss": 1.5417, + "step": 944 + }, + { + "epoch": 0.5239811477682285, + "grad_norm": 0.21085157990455627, + "learning_rate": 4.0479620972721726e-05, + "loss": 1.5735, + "step": 945 + }, + { + "epoch": 0.5245356251732742, + "grad_norm": 0.21629397571086884, + "learning_rate": 4.040583589295101e-05, + "loss": 1.6296, + "step": 946 + }, + { + "epoch": 0.5250901025783199, + "grad_norm": 0.2124132364988327, + "learning_rate": 4.03320494320941e-05, + "loss": 1.5781, + "step": 947 + }, + { + "epoch": 0.5256445799833657, + "grad_norm": 0.20531564950942993, + "learning_rate": 4.0258261841251175e-05, + "loss": 1.5726, + "step": 948 + }, + { + "epoch": 0.5261990573884114, + "grad_norm": 0.20956727862358093, + "learning_rate": 4.018447337152626e-05, + "loss": 1.5653, + "step": 949 + }, + { + "epoch": 0.5267535347934572, + "grad_norm": 0.20362792909145355, + "learning_rate": 4.0110684274026365e-05, + "loss": 1.576, + "step": 950 + }, + { + "epoch": 0.5273080121985029, + "grad_norm": 0.20399406552314758, + "learning_rate": 4.00368947998606e-05, + "loss": 1.5732, + "step": 951 + }, + { + "epoch": 0.5278624896035486, + "grad_norm": 0.21037329733371735, + "learning_rate": 3.9963105200139415e-05, + "loss": 1.5909, + "step": 952 + }, + { + "epoch": 0.5284169670085944, + "grad_norm": 0.2091919481754303, + "learning_rate": 3.9889315725973655e-05, + "loss": 1.5478, + "step": 953 + }, + { + "epoch": 0.5289714444136402, + "grad_norm": 0.2109626978635788, + "learning_rate": 3.981552662847375e-05, + "loss": 1.5515, + "step": 954 + }, + { + "epoch": 0.5295259218186859, + "grad_norm": 0.21235403418540955, + "learning_rate": 3.9741738158748824e-05, + "loss": 1.5435, + "step": 955 + }, + { + "epoch": 0.5300803992237316, + "grad_norm": 0.2205905020236969, + "learning_rate": 3.966795056790591e-05, + "loss": 1.6312, + "step": 956 + }, + { + "epoch": 0.5306348766287774, + "grad_norm": 0.2057129591703415, + "learning_rate": 3.959416410704901e-05, + "loss": 1.5718, + "step": 957 + }, + { + "epoch": 0.5311893540338232, + "grad_norm": 0.2083214968442917, + "learning_rate": 3.952037902727829e-05, + "loss": 1.5801, + "step": 958 + }, + { + "epoch": 0.5317438314388688, + "grad_norm": 0.21389059722423553, + "learning_rate": 3.944659557968922e-05, + "loss": 1.5781, + "step": 959 + }, + { + "epoch": 0.5322983088439146, + "grad_norm": 0.20008264482021332, + "learning_rate": 3.937281401537171e-05, + "loss": 1.5221, + "step": 960 + }, + { + "epoch": 0.5328527862489604, + "grad_norm": 0.22723457217216492, + "learning_rate": 3.92990345854093e-05, + "loss": 1.6409, + "step": 961 + }, + { + "epoch": 0.5334072636540061, + "grad_norm": 0.21602332592010498, + "learning_rate": 3.922525754087819e-05, + "loss": 1.624, + "step": 962 + }, + { + "epoch": 0.5339617410590518, + "grad_norm": 0.2093724012374878, + "learning_rate": 3.915148313284653e-05, + "loss": 1.5583, + "step": 963 + }, + { + "epoch": 0.5345162184640976, + "grad_norm": 0.20897488296031952, + "learning_rate": 3.907771161237347e-05, + "loss": 1.6248, + "step": 964 + }, + { + "epoch": 0.5350706958691434, + "grad_norm": 0.21805384755134583, + "learning_rate": 3.900394323050833e-05, + "loss": 1.6227, + "step": 965 + }, + { + "epoch": 0.5356251732741891, + "grad_norm": 0.22409921884536743, + "learning_rate": 3.893017823828977e-05, + "loss": 1.5582, + "step": 966 + }, + { + "epoch": 0.5361796506792348, + "grad_norm": 0.22005674242973328, + "learning_rate": 3.8856416886744874e-05, + "loss": 1.6032, + "step": 967 + }, + { + "epoch": 0.5367341280842806, + "grad_norm": 0.20452825725078583, + "learning_rate": 3.878265942688838e-05, + "loss": 1.5501, + "step": 968 + }, + { + "epoch": 0.5372886054893263, + "grad_norm": 0.21274332702159882, + "learning_rate": 3.8708906109721776e-05, + "loss": 1.5386, + "step": 969 + }, + { + "epoch": 0.5378430828943721, + "grad_norm": 0.21413595974445343, + "learning_rate": 3.863515718623242e-05, + "loss": 1.6382, + "step": 970 + }, + { + "epoch": 0.5383975602994178, + "grad_norm": 0.22043178975582123, + "learning_rate": 3.856141290739276e-05, + "loss": 1.5306, + "step": 971 + }, + { + "epoch": 0.5389520377044635, + "grad_norm": 0.2085334211587906, + "learning_rate": 3.8487673524159404e-05, + "loss": 1.5452, + "step": 972 + }, + { + "epoch": 0.5395065151095093, + "grad_norm": 0.2164732664823532, + "learning_rate": 3.8413939287472305e-05, + "loss": 1.6273, + "step": 973 + }, + { + "epoch": 0.5400609925145551, + "grad_norm": 0.23173940181732178, + "learning_rate": 3.8340210448253945e-05, + "loss": 1.629, + "step": 974 + }, + { + "epoch": 0.5406154699196007, + "grad_norm": 0.20250822603702545, + "learning_rate": 3.826648725740836e-05, + "loss": 1.5313, + "step": 975 + }, + { + "epoch": 0.5411699473246465, + "grad_norm": 0.20862893760204315, + "learning_rate": 3.819276996582045e-05, + "loss": 1.574, + "step": 976 + }, + { + "epoch": 0.5417244247296923, + "grad_norm": 0.2386544793844223, + "learning_rate": 3.8119058824354966e-05, + "loss": 1.6064, + "step": 977 + }, + { + "epoch": 0.5422789021347381, + "grad_norm": 0.2093050479888916, + "learning_rate": 3.804535408385577e-05, + "loss": 1.5596, + "step": 978 + }, + { + "epoch": 0.5428333795397837, + "grad_norm": 0.22440102696418762, + "learning_rate": 3.7971655995144937e-05, + "loss": 1.5967, + "step": 979 + }, + { + "epoch": 0.5433878569448295, + "grad_norm": 0.21898169815540314, + "learning_rate": 3.789796480902188e-05, + "loss": 1.5639, + "step": 980 + }, + { + "epoch": 0.5439423343498753, + "grad_norm": 0.21143299341201782, + "learning_rate": 3.782428077626259e-05, + "loss": 1.5549, + "step": 981 + }, + { + "epoch": 0.544496811754921, + "grad_norm": 0.21122704446315765, + "learning_rate": 3.775060414761861e-05, + "loss": 1.5778, + "step": 982 + }, + { + "epoch": 0.5450512891599667, + "grad_norm": 0.22585324943065643, + "learning_rate": 3.767693517381638e-05, + "loss": 1.6119, + "step": 983 + }, + { + "epoch": 0.5456057665650125, + "grad_norm": 0.1983765810728073, + "learning_rate": 3.760327410555625e-05, + "loss": 1.4554, + "step": 984 + }, + { + "epoch": 0.5461602439700582, + "grad_norm": 0.210382342338562, + "learning_rate": 3.752962119351166e-05, + "loss": 1.562, + "step": 985 + }, + { + "epoch": 0.546714721375104, + "grad_norm": 0.2129284292459488, + "learning_rate": 3.745597668832831e-05, + "loss": 1.6004, + "step": 986 + }, + { + "epoch": 0.5472691987801497, + "grad_norm": 0.20643486082553864, + "learning_rate": 3.738234084062327e-05, + "loss": 1.5137, + "step": 987 + }, + { + "epoch": 0.5478236761851955, + "grad_norm": 0.20947743952274323, + "learning_rate": 3.730871390098419e-05, + "loss": 1.5833, + "step": 988 + }, + { + "epoch": 0.5483781535902412, + "grad_norm": 0.2061213254928589, + "learning_rate": 3.723509611996837e-05, + "loss": 1.5231, + "step": 989 + }, + { + "epoch": 0.548932630995287, + "grad_norm": 0.21855583786964417, + "learning_rate": 3.716148774810194e-05, + "loss": 1.5806, + "step": 990 + }, + { + "epoch": 0.5494871084003327, + "grad_norm": 0.21223531663417816, + "learning_rate": 3.708788903587904e-05, + "loss": 1.5188, + "step": 991 + }, + { + "epoch": 0.5500415858053784, + "grad_norm": 0.20913442969322205, + "learning_rate": 3.701430023376089e-05, + "loss": 1.6229, + "step": 992 + }, + { + "epoch": 0.5505960632104242, + "grad_norm": 0.2168632298707962, + "learning_rate": 3.6940721592175026e-05, + "loss": 1.5483, + "step": 993 + }, + { + "epoch": 0.55115054061547, + "grad_norm": 0.21936000883579254, + "learning_rate": 3.686715336151443e-05, + "loss": 1.5724, + "step": 994 + }, + { + "epoch": 0.5517050180205156, + "grad_norm": 0.21921208500862122, + "learning_rate": 3.6793595792136565e-05, + "loss": 1.6074, + "step": 995 + }, + { + "epoch": 0.5522594954255614, + "grad_norm": 0.21475403010845184, + "learning_rate": 3.672004913436271e-05, + "loss": 1.6166, + "step": 996 + }, + { + "epoch": 0.5528139728306072, + "grad_norm": 0.2194800078868866, + "learning_rate": 3.664651363847695e-05, + "loss": 1.5691, + "step": 997 + }, + { + "epoch": 0.5533684502356528, + "grad_norm": 0.21276038885116577, + "learning_rate": 3.65729895547254e-05, + "loss": 1.5916, + "step": 998 + }, + { + "epoch": 0.5539229276406986, + "grad_norm": 0.2259768396615982, + "learning_rate": 3.649947713331536e-05, + "loss": 1.5617, + "step": 999 + }, + { + "epoch": 0.5544774050457444, + "grad_norm": 0.21104209125041962, + "learning_rate": 3.64259766244144e-05, + "loss": 1.5882, + "step": 1000 + }, + { + "epoch": 0.5550318824507902, + "grad_norm": 0.21075722575187683, + "learning_rate": 3.63524882781496e-05, + "loss": 1.6089, + "step": 1001 + }, + { + "epoch": 0.5555863598558358, + "grad_norm": 0.23340259492397308, + "learning_rate": 3.627901234460663e-05, + "loss": 1.6235, + "step": 1002 + }, + { + "epoch": 0.5561408372608816, + "grad_norm": 0.22706355154514313, + "learning_rate": 3.6205549073828894e-05, + "loss": 1.5388, + "step": 1003 + }, + { + "epoch": 0.5566953146659274, + "grad_norm": 0.22355878353118896, + "learning_rate": 3.613209871581674e-05, + "loss": 1.5329, + "step": 1004 + }, + { + "epoch": 0.5572497920709731, + "grad_norm": 0.22762571275234222, + "learning_rate": 3.6058661520526555e-05, + "loss": 1.5367, + "step": 1005 + }, + { + "epoch": 0.5578042694760188, + "grad_norm": 0.21231630444526672, + "learning_rate": 3.598523773786991e-05, + "loss": 1.5366, + "step": 1006 + }, + { + "epoch": 0.5583587468810646, + "grad_norm": 0.22039973735809326, + "learning_rate": 3.591182761771281e-05, + "loss": 1.6085, + "step": 1007 + }, + { + "epoch": 0.5589132242861103, + "grad_norm": 0.22262850403785706, + "learning_rate": 3.5838431409874666e-05, + "loss": 1.5297, + "step": 1008 + }, + { + "epoch": 0.5594677016911561, + "grad_norm": 0.20765410363674164, + "learning_rate": 3.576504936412762e-05, + "loss": 1.5488, + "step": 1009 + }, + { + "epoch": 0.5600221790962018, + "grad_norm": 0.21379908919334412, + "learning_rate": 3.5691681730195564e-05, + "loss": 1.581, + "step": 1010 + }, + { + "epoch": 0.5605766565012475, + "grad_norm": 0.2365419566631317, + "learning_rate": 3.561832875775338e-05, + "loss": 1.6571, + "step": 1011 + }, + { + "epoch": 0.5611311339062933, + "grad_norm": 0.2150985300540924, + "learning_rate": 3.5544990696426077e-05, + "loss": 1.546, + "step": 1012 + }, + { + "epoch": 0.5616856113113391, + "grad_norm": 0.20792889595031738, + "learning_rate": 3.5471667795787845e-05, + "loss": 1.5404, + "step": 1013 + }, + { + "epoch": 0.5622400887163848, + "grad_norm": 0.23230226337909698, + "learning_rate": 3.5398360305361375e-05, + "loss": 1.5393, + "step": 1014 + }, + { + "epoch": 0.5627945661214305, + "grad_norm": 0.20976661145687103, + "learning_rate": 3.532506847461684e-05, + "loss": 1.5753, + "step": 1015 + }, + { + "epoch": 0.5633490435264763, + "grad_norm": 0.21504442393779755, + "learning_rate": 3.5251792552971146e-05, + "loss": 1.5944, + "step": 1016 + }, + { + "epoch": 0.5639035209315221, + "grad_norm": 0.21869289875030518, + "learning_rate": 3.517853278978708e-05, + "loss": 1.5166, + "step": 1017 + }, + { + "epoch": 0.5644579983365677, + "grad_norm": 0.22892804443836212, + "learning_rate": 3.510528943437243e-05, + "loss": 1.6643, + "step": 1018 + }, + { + "epoch": 0.5650124757416135, + "grad_norm": 0.21030141413211823, + "learning_rate": 3.503206273597913e-05, + "loss": 1.5408, + "step": 1019 + }, + { + "epoch": 0.5655669531466593, + "grad_norm": 0.23448987305164337, + "learning_rate": 3.495885294380243e-05, + "loss": 1.5352, + "step": 1020 + }, + { + "epoch": 0.566121430551705, + "grad_norm": 0.20386260747909546, + "learning_rate": 3.488566030698008e-05, + "loss": 1.5978, + "step": 1021 + }, + { + "epoch": 0.5666759079567507, + "grad_norm": 0.22522738575935364, + "learning_rate": 3.481248507459143e-05, + "loss": 1.6675, + "step": 1022 + }, + { + "epoch": 0.5672303853617965, + "grad_norm": 0.2048107385635376, + "learning_rate": 3.4739327495656584e-05, + "loss": 1.4263, + "step": 1023 + }, + { + "epoch": 0.5677848627668423, + "grad_norm": 0.21060484647750854, + "learning_rate": 3.466618781913559e-05, + "loss": 1.5268, + "step": 1024 + }, + { + "epoch": 0.568339340171888, + "grad_norm": 0.22075191140174866, + "learning_rate": 3.459306629392757e-05, + "loss": 1.6015, + "step": 1025 + }, + { + "epoch": 0.5688938175769337, + "grad_norm": 0.20704005658626556, + "learning_rate": 3.4519963168869855e-05, + "loss": 1.4969, + "step": 1026 + }, + { + "epoch": 0.5694482949819795, + "grad_norm": 0.21155935525894165, + "learning_rate": 3.444687869273722e-05, + "loss": 1.5355, + "step": 1027 + }, + { + "epoch": 0.5700027723870252, + "grad_norm": 0.21162542700767517, + "learning_rate": 3.437381311424091e-05, + "loss": 1.5899, + "step": 1028 + }, + { + "epoch": 0.570557249792071, + "grad_norm": 0.2058079093694687, + "learning_rate": 3.4300766682027875e-05, + "loss": 1.4967, + "step": 1029 + }, + { + "epoch": 0.5711117271971167, + "grad_norm": 0.21342620253562927, + "learning_rate": 3.422773964467994e-05, + "loss": 1.5941, + "step": 1030 + }, + { + "epoch": 0.5716662046021624, + "grad_norm": 0.21482619643211365, + "learning_rate": 3.4154732250712876e-05, + "loss": 1.5887, + "step": 1031 + }, + { + "epoch": 0.5722206820072082, + "grad_norm": 0.20948712527751923, + "learning_rate": 3.4081744748575684e-05, + "loss": 1.5695, + "step": 1032 + }, + { + "epoch": 0.572775159412254, + "grad_norm": 0.2241201102733612, + "learning_rate": 3.400877738664958e-05, + "loss": 1.5542, + "step": 1033 + }, + { + "epoch": 0.5733296368172996, + "grad_norm": 0.21154385805130005, + "learning_rate": 3.39358304132473e-05, + "loss": 1.5349, + "step": 1034 + }, + { + "epoch": 0.5738841142223454, + "grad_norm": 0.22654007375240326, + "learning_rate": 3.386290407661221e-05, + "loss": 1.6839, + "step": 1035 + }, + { + "epoch": 0.5744385916273912, + "grad_norm": 0.2175297737121582, + "learning_rate": 3.3789998624917376e-05, + "loss": 1.5791, + "step": 1036 + }, + { + "epoch": 0.574993069032437, + "grad_norm": 0.2157989740371704, + "learning_rate": 3.3717114306264875e-05, + "loss": 1.5716, + "step": 1037 + }, + { + "epoch": 0.5755475464374826, + "grad_norm": 0.2078414410352707, + "learning_rate": 3.364425136868479e-05, + "loss": 1.586, + "step": 1038 + }, + { + "epoch": 0.5761020238425284, + "grad_norm": 0.2155274897813797, + "learning_rate": 3.357141006013451e-05, + "loss": 1.5882, + "step": 1039 + }, + { + "epoch": 0.5766565012475742, + "grad_norm": 0.21952180564403534, + "learning_rate": 3.349859062849779e-05, + "loss": 1.6528, + "step": 1040 + }, + { + "epoch": 0.5772109786526199, + "grad_norm": 0.2106204628944397, + "learning_rate": 3.3425793321583914e-05, + "loss": 1.5356, + "step": 1041 + }, + { + "epoch": 0.5777654560576656, + "grad_norm": 0.2236543446779251, + "learning_rate": 3.335301838712692e-05, + "loss": 1.5639, + "step": 1042 + }, + { + "epoch": 0.5783199334627114, + "grad_norm": 0.21134650707244873, + "learning_rate": 3.328026607278466e-05, + "loss": 1.5635, + "step": 1043 + }, + { + "epoch": 0.5788744108677571, + "grad_norm": 0.21437637507915497, + "learning_rate": 3.3207536626138046e-05, + "loss": 1.557, + "step": 1044 + }, + { + "epoch": 0.5794288882728029, + "grad_norm": 0.21564576029777527, + "learning_rate": 3.313483029469019e-05, + "loss": 1.548, + "step": 1045 + }, + { + "epoch": 0.5799833656778486, + "grad_norm": 0.21482598781585693, + "learning_rate": 3.306214732586546e-05, + "loss": 1.5844, + "step": 1046 + }, + { + "epoch": 0.5805378430828944, + "grad_norm": 0.20883773267269135, + "learning_rate": 3.2989487967008806e-05, + "loss": 1.5653, + "step": 1047 + }, + { + "epoch": 0.5810923204879401, + "grad_norm": 0.20583604276180267, + "learning_rate": 3.291685246538478e-05, + "loss": 1.5298, + "step": 1048 + }, + { + "epoch": 0.5816467978929859, + "grad_norm": 0.20545633137226105, + "learning_rate": 3.2844241068176766e-05, + "loss": 1.5625, + "step": 1049 + }, + { + "epoch": 0.5822012752980316, + "grad_norm": 0.2127029448747635, + "learning_rate": 3.277165402248612e-05, + "loss": 1.6048, + "step": 1050 + }, + { + "epoch": 0.5827557527030773, + "grad_norm": 0.20467516779899597, + "learning_rate": 3.26990915753313e-05, + "loss": 1.5292, + "step": 1051 + }, + { + "epoch": 0.5833102301081231, + "grad_norm": 0.20428596436977386, + "learning_rate": 3.2626553973647115e-05, + "loss": 1.5542, + "step": 1052 + }, + { + "epoch": 0.5838647075131689, + "grad_norm": 0.21240785717964172, + "learning_rate": 3.255404146428375e-05, + "loss": 1.5601, + "step": 1053 + }, + { + "epoch": 0.5844191849182145, + "grad_norm": 0.20584693551063538, + "learning_rate": 3.248155429400605e-05, + "loss": 1.5476, + "step": 1054 + }, + { + "epoch": 0.5849736623232603, + "grad_norm": 0.21363112330436707, + "learning_rate": 3.2409092709492624e-05, + "loss": 1.6318, + "step": 1055 + }, + { + "epoch": 0.5855281397283061, + "grad_norm": 0.22125588357448578, + "learning_rate": 3.233665695733498e-05, + "loss": 1.5496, + "step": 1056 + }, + { + "epoch": 0.5860826171333519, + "grad_norm": 0.22506000101566315, + "learning_rate": 3.2264247284036755e-05, + "loss": 1.547, + "step": 1057 + }, + { + "epoch": 0.5866370945383975, + "grad_norm": 0.21581967175006866, + "learning_rate": 3.2191863936012794e-05, + "loss": 1.5397, + "step": 1058 + }, + { + "epoch": 0.5871915719434433, + "grad_norm": 0.21308638155460358, + "learning_rate": 3.211950715958841e-05, + "loss": 1.5873, + "step": 1059 + }, + { + "epoch": 0.5877460493484891, + "grad_norm": 0.20769579708576202, + "learning_rate": 3.2047177200998456e-05, + "loss": 1.5548, + "step": 1060 + }, + { + "epoch": 0.5883005267535348, + "grad_norm": 0.20457741618156433, + "learning_rate": 3.197487430638652e-05, + "loss": 1.55, + "step": 1061 + }, + { + "epoch": 0.5888550041585805, + "grad_norm": 0.21090541779994965, + "learning_rate": 3.1902598721804096e-05, + "loss": 1.6617, + "step": 1062 + }, + { + "epoch": 0.5894094815636263, + "grad_norm": 0.21099694073200226, + "learning_rate": 3.1830350693209774e-05, + "loss": 1.5077, + "step": 1063 + }, + { + "epoch": 0.589963958968672, + "grad_norm": 0.21446220576763153, + "learning_rate": 3.1758130466468293e-05, + "loss": 1.6078, + "step": 1064 + }, + { + "epoch": 0.5905184363737178, + "grad_norm": 0.21016094088554382, + "learning_rate": 3.168593828734986e-05, + "loss": 1.5688, + "step": 1065 + }, + { + "epoch": 0.5910729137787635, + "grad_norm": 0.21141645312309265, + "learning_rate": 3.161377440152918e-05, + "loss": 1.5527, + "step": 1066 + }, + { + "epoch": 0.5916273911838092, + "grad_norm": 0.2074812948703766, + "learning_rate": 3.154163905458469e-05, + "loss": 1.5399, + "step": 1067 + }, + { + "epoch": 0.592181868588855, + "grad_norm": 0.2177177220582962, + "learning_rate": 3.146953249199774e-05, + "loss": 1.599, + "step": 1068 + }, + { + "epoch": 0.5927363459939008, + "grad_norm": 0.20859509706497192, + "learning_rate": 3.139745495915166e-05, + "loss": 1.5818, + "step": 1069 + }, + { + "epoch": 0.5932908233989465, + "grad_norm": 0.20710986852645874, + "learning_rate": 3.1325406701331056e-05, + "loss": 1.559, + "step": 1070 + }, + { + "epoch": 0.5938453008039922, + "grad_norm": 0.21879532933235168, + "learning_rate": 3.1253387963720835e-05, + "loss": 1.5392, + "step": 1071 + }, + { + "epoch": 0.594399778209038, + "grad_norm": 0.2117648422718048, + "learning_rate": 3.118139899140553e-05, + "loss": 1.6009, + "step": 1072 + }, + { + "epoch": 0.5949542556140838, + "grad_norm": 0.21254301071166992, + "learning_rate": 3.110944002936835e-05, + "loss": 1.5663, + "step": 1073 + }, + { + "epoch": 0.5955087330191294, + "grad_norm": 0.2058122456073761, + "learning_rate": 3.1037511322490324e-05, + "loss": 1.5066, + "step": 1074 + }, + { + "epoch": 0.5960632104241752, + "grad_norm": 0.2119075357913971, + "learning_rate": 3.096561311554959e-05, + "loss": 1.5925, + "step": 1075 + }, + { + "epoch": 0.596617687829221, + "grad_norm": 0.2224556803703308, + "learning_rate": 3.089374565322045e-05, + "loss": 1.6041, + "step": 1076 + }, + { + "epoch": 0.5971721652342667, + "grad_norm": 0.21044711768627167, + "learning_rate": 3.082190918007259e-05, + "loss": 1.5778, + "step": 1077 + }, + { + "epoch": 0.5977266426393124, + "grad_norm": 0.21917963027954102, + "learning_rate": 3.0750103940570284e-05, + "loss": 1.668, + "step": 1078 + }, + { + "epoch": 0.5982811200443582, + "grad_norm": 0.21519285440444946, + "learning_rate": 3.067833017907144e-05, + "loss": 1.567, + "step": 1079 + }, + { + "epoch": 0.598835597449404, + "grad_norm": 0.21112053096294403, + "learning_rate": 3.0606588139826884e-05, + "loss": 1.5426, + "step": 1080 + }, + { + "epoch": 0.5993900748544497, + "grad_norm": 0.21877877414226532, + "learning_rate": 3.0534878066979494e-05, + "loss": 1.5749, + "step": 1081 + }, + { + "epoch": 0.5999445522594954, + "grad_norm": 0.2124583125114441, + "learning_rate": 3.046320020456334e-05, + "loss": 1.4941, + "step": 1082 + }, + { + "epoch": 0.6004990296645412, + "grad_norm": 0.21378760039806366, + "learning_rate": 3.0391554796502925e-05, + "loss": 1.5903, + "step": 1083 + }, + { + "epoch": 0.6010535070695869, + "grad_norm": 0.21284620463848114, + "learning_rate": 3.031994208661223e-05, + "loss": 1.5439, + "step": 1084 + }, + { + "epoch": 0.6016079844746327, + "grad_norm": 0.2211058884859085, + "learning_rate": 3.0248362318594055e-05, + "loss": 1.5932, + "step": 1085 + }, + { + "epoch": 0.6021624618796784, + "grad_norm": 0.20590293407440186, + "learning_rate": 3.0176815736039007e-05, + "loss": 1.5591, + "step": 1086 + }, + { + "epoch": 0.6027169392847241, + "grad_norm": 0.21580100059509277, + "learning_rate": 3.010530258242483e-05, + "loss": 1.5505, + "step": 1087 + }, + { + "epoch": 0.6032714166897699, + "grad_norm": 0.2178206592798233, + "learning_rate": 3.0033823101115473e-05, + "loss": 1.4999, + "step": 1088 + }, + { + "epoch": 0.6038258940948157, + "grad_norm": 0.22579710185527802, + "learning_rate": 2.9962377535360286e-05, + "loss": 1.622, + "step": 1089 + }, + { + "epoch": 0.6043803714998613, + "grad_norm": 0.22102071344852448, + "learning_rate": 2.9890966128293243e-05, + "loss": 1.5054, + "step": 1090 + }, + { + "epoch": 0.6049348489049071, + "grad_norm": 0.2073606252670288, + "learning_rate": 2.9819589122932055e-05, + "loss": 1.4963, + "step": 1091 + }, + { + "epoch": 0.6054893263099529, + "grad_norm": 0.20773765444755554, + "learning_rate": 2.9748246762177326e-05, + "loss": 1.5046, + "step": 1092 + }, + { + "epoch": 0.6060438037149987, + "grad_norm": 0.2178242802619934, + "learning_rate": 2.9676939288811825e-05, + "loss": 1.623, + "step": 1093 + }, + { + "epoch": 0.6065982811200443, + "grad_norm": 0.21794721484184265, + "learning_rate": 2.960566694549954e-05, + "loss": 1.5074, + "step": 1094 + }, + { + "epoch": 0.6071527585250901, + "grad_norm": 0.22172760963439941, + "learning_rate": 2.953442997478494e-05, + "loss": 1.6091, + "step": 1095 + }, + { + "epoch": 0.6077072359301359, + "grad_norm": 0.20793460309505463, + "learning_rate": 2.9463228619092132e-05, + "loss": 1.5604, + "step": 1096 + }, + { + "epoch": 0.6082617133351816, + "grad_norm": 0.2183390110731125, + "learning_rate": 2.9392063120723962e-05, + "loss": 1.4959, + "step": 1097 + }, + { + "epoch": 0.6088161907402273, + "grad_norm": 0.2197534292936325, + "learning_rate": 2.9320933721861335e-05, + "loss": 1.536, + "step": 1098 + }, + { + "epoch": 0.6093706681452731, + "grad_norm": 0.2130371481180191, + "learning_rate": 2.9249840664562242e-05, + "loss": 1.6402, + "step": 1099 + }, + { + "epoch": 0.6099251455503188, + "grad_norm": 0.21440336108207703, + "learning_rate": 2.917878419076102e-05, + "loss": 1.5631, + "step": 1100 + }, + { + "epoch": 0.6104796229553646, + "grad_norm": 0.22678157687187195, + "learning_rate": 2.9107764542267536e-05, + "loss": 1.5603, + "step": 1101 + }, + { + "epoch": 0.6110341003604103, + "grad_norm": 0.21499857306480408, + "learning_rate": 2.903678196076628e-05, + "loss": 1.642, + "step": 1102 + }, + { + "epoch": 0.611588577765456, + "grad_norm": 0.20951659977436066, + "learning_rate": 2.896583668781569e-05, + "loss": 1.5311, + "step": 1103 + }, + { + "epoch": 0.6121430551705018, + "grad_norm": 0.2207638919353485, + "learning_rate": 2.8894928964847133e-05, + "loss": 1.5827, + "step": 1104 + }, + { + "epoch": 0.6126975325755476, + "grad_norm": 0.2097187638282776, + "learning_rate": 2.8824059033164262e-05, + "loss": 1.5433, + "step": 1105 + }, + { + "epoch": 0.6132520099805933, + "grad_norm": 0.20598512887954712, + "learning_rate": 2.875322713394213e-05, + "loss": 1.5966, + "step": 1106 + }, + { + "epoch": 0.613806487385639, + "grad_norm": 0.20717155933380127, + "learning_rate": 2.8682433508226303e-05, + "loss": 1.4861, + "step": 1107 + }, + { + "epoch": 0.6143609647906848, + "grad_norm": 0.20514699816703796, + "learning_rate": 2.8611678396932164e-05, + "loss": 1.5664, + "step": 1108 + }, + { + "epoch": 0.6149154421957306, + "grad_norm": 0.21211731433868408, + "learning_rate": 2.8540962040843982e-05, + "loss": 1.5026, + "step": 1109 + }, + { + "epoch": 0.6154699196007762, + "grad_norm": 0.21061904728412628, + "learning_rate": 2.8470284680614172e-05, + "loss": 1.5795, + "step": 1110 + }, + { + "epoch": 0.616024397005822, + "grad_norm": 0.2107418179512024, + "learning_rate": 2.8399646556762435e-05, + "loss": 1.6298, + "step": 1111 + }, + { + "epoch": 0.6165788744108678, + "grad_norm": 0.20848070085048676, + "learning_rate": 2.832904790967492e-05, + "loss": 1.5619, + "step": 1112 + }, + { + "epoch": 0.6171333518159136, + "grad_norm": 0.21443457901477814, + "learning_rate": 2.8258488979603488e-05, + "loss": 1.5624, + "step": 1113 + }, + { + "epoch": 0.6176878292209592, + "grad_norm": 0.21567028760910034, + "learning_rate": 2.8187970006664784e-05, + "loss": 1.5691, + "step": 1114 + }, + { + "epoch": 0.618242306626005, + "grad_norm": 0.20712579786777496, + "learning_rate": 2.8117491230839502e-05, + "loss": 1.5106, + "step": 1115 + }, + { + "epoch": 0.6187967840310508, + "grad_norm": 0.2100321501493454, + "learning_rate": 2.8047052891971565e-05, + "loss": 1.5948, + "step": 1116 + }, + { + "epoch": 0.6193512614360965, + "grad_norm": 0.21011658012866974, + "learning_rate": 2.7976655229767248e-05, + "loss": 1.5616, + "step": 1117 + }, + { + "epoch": 0.6199057388411422, + "grad_norm": 0.22047735750675201, + "learning_rate": 2.790629848379442e-05, + "loss": 1.6123, + "step": 1118 + }, + { + "epoch": 0.620460216246188, + "grad_norm": 0.2138945609331131, + "learning_rate": 2.78359828934817e-05, + "loss": 1.5528, + "step": 1119 + }, + { + "epoch": 0.6210146936512337, + "grad_norm": 0.216939777135849, + "learning_rate": 2.7765708698117663e-05, + "loss": 1.5472, + "step": 1120 + }, + { + "epoch": 0.6215691710562794, + "grad_norm": 0.22868293523788452, + "learning_rate": 2.7695476136850018e-05, + "loss": 1.5588, + "step": 1121 + }, + { + "epoch": 0.6221236484613252, + "grad_norm": 0.2176186591386795, + "learning_rate": 2.7625285448684757e-05, + "loss": 1.5469, + "step": 1122 + }, + { + "epoch": 0.622678125866371, + "grad_norm": 0.21710044145584106, + "learning_rate": 2.7555136872485415e-05, + "loss": 1.5845, + "step": 1123 + }, + { + "epoch": 0.6232326032714167, + "grad_norm": 0.21951667964458466, + "learning_rate": 2.748503064697221e-05, + "loss": 1.5436, + "step": 1124 + }, + { + "epoch": 0.6237870806764624, + "grad_norm": 0.2149830311536789, + "learning_rate": 2.7414967010721212e-05, + "loss": 1.5896, + "step": 1125 + }, + { + "epoch": 0.6243415580815082, + "grad_norm": 0.21103326976299286, + "learning_rate": 2.7344946202163592e-05, + "loss": 1.6166, + "step": 1126 + }, + { + "epoch": 0.6248960354865539, + "grad_norm": 0.22287732362747192, + "learning_rate": 2.727496845958474e-05, + "loss": 1.5062, + "step": 1127 + }, + { + "epoch": 0.6254505128915997, + "grad_norm": 0.2150767594575882, + "learning_rate": 2.7205034021123505e-05, + "loss": 1.5854, + "step": 1128 + }, + { + "epoch": 0.6260049902966454, + "grad_norm": 0.21351411938667297, + "learning_rate": 2.71351431247714e-05, + "loss": 1.6161, + "step": 1129 + }, + { + "epoch": 0.6265594677016911, + "grad_norm": 0.2130012810230255, + "learning_rate": 2.7065296008371703e-05, + "loss": 1.5533, + "step": 1130 + }, + { + "epoch": 0.6271139451067369, + "grad_norm": 0.22186146676540375, + "learning_rate": 2.699549290961876e-05, + "loss": 1.5779, + "step": 1131 + }, + { + "epoch": 0.6276684225117827, + "grad_norm": 0.2098151296377182, + "learning_rate": 2.6925734066057076e-05, + "loss": 1.5365, + "step": 1132 + }, + { + "epoch": 0.6282228999168283, + "grad_norm": 0.22018581628799438, + "learning_rate": 2.6856019715080576e-05, + "loss": 1.5884, + "step": 1133 + }, + { + "epoch": 0.6287773773218741, + "grad_norm": 0.2144719660282135, + "learning_rate": 2.6786350093931805e-05, + "loss": 1.5707, + "step": 1134 + }, + { + "epoch": 0.6293318547269199, + "grad_norm": 0.2225160151720047, + "learning_rate": 2.6716725439701013e-05, + "loss": 1.6019, + "step": 1135 + }, + { + "epoch": 0.6298863321319657, + "grad_norm": 0.21820110082626343, + "learning_rate": 2.66471459893255e-05, + "loss": 1.6112, + "step": 1136 + }, + { + "epoch": 0.6304408095370113, + "grad_norm": 0.2195034772157669, + "learning_rate": 2.6577611979588685e-05, + "loss": 1.5993, + "step": 1137 + }, + { + "epoch": 0.6309952869420571, + "grad_norm": 0.214851975440979, + "learning_rate": 2.6508123647119376e-05, + "loss": 1.544, + "step": 1138 + }, + { + "epoch": 0.6315497643471029, + "grad_norm": 0.21557554602622986, + "learning_rate": 2.643868122839093e-05, + "loss": 1.569, + "step": 1139 + }, + { + "epoch": 0.6321042417521486, + "grad_norm": 0.21042174100875854, + "learning_rate": 2.6369284959720447e-05, + "loss": 1.5386, + "step": 1140 + }, + { + "epoch": 0.6326587191571943, + "grad_norm": 0.20624153316020966, + "learning_rate": 2.629993507726801e-05, + "loss": 1.5237, + "step": 1141 + }, + { + "epoch": 0.6332131965622401, + "grad_norm": 0.208335280418396, + "learning_rate": 2.6230631817035784e-05, + "loss": 1.5465, + "step": 1142 + }, + { + "epoch": 0.6337676739672858, + "grad_norm": 0.21543239057064056, + "learning_rate": 2.6161375414867337e-05, + "loss": 1.5521, + "step": 1143 + }, + { + "epoch": 0.6343221513723316, + "grad_norm": 0.21806393563747406, + "learning_rate": 2.6092166106446753e-05, + "loss": 1.504, + "step": 1144 + }, + { + "epoch": 0.6348766287773773, + "grad_norm": 0.20788079500198364, + "learning_rate": 2.602300412729784e-05, + "loss": 1.5175, + "step": 1145 + }, + { + "epoch": 0.635431106182423, + "grad_norm": 0.21668802201747894, + "learning_rate": 2.5953889712783364e-05, + "loss": 1.5238, + "step": 1146 + }, + { + "epoch": 0.6359855835874688, + "grad_norm": 0.219620943069458, + "learning_rate": 2.5884823098104202e-05, + "loss": 1.5186, + "step": 1147 + }, + { + "epoch": 0.6365400609925146, + "grad_norm": 0.22174449265003204, + "learning_rate": 2.5815804518298575e-05, + "loss": 1.6057, + "step": 1148 + }, + { + "epoch": 0.6370945383975603, + "grad_norm": 0.2142484486103058, + "learning_rate": 2.5746834208241266e-05, + "loss": 1.5701, + "step": 1149 + }, + { + "epoch": 0.637649015802606, + "grad_norm": 0.21401485800743103, + "learning_rate": 2.5677912402642742e-05, + "loss": 1.5556, + "step": 1150 + }, + { + "epoch": 0.6382034932076518, + "grad_norm": 0.21097297966480255, + "learning_rate": 2.560903933604844e-05, + "loss": 1.561, + "step": 1151 + }, + { + "epoch": 0.6387579706126976, + "grad_norm": 0.21315309405326843, + "learning_rate": 2.554021524283794e-05, + "loss": 1.5939, + "step": 1152 + }, + { + "epoch": 0.6393124480177432, + "grad_norm": 0.21164047718048096, + "learning_rate": 2.5471440357224112e-05, + "loss": 1.5832, + "step": 1153 + }, + { + "epoch": 0.639866925422789, + "grad_norm": 0.1997397392988205, + "learning_rate": 2.5402714913252463e-05, + "loss": 1.4702, + "step": 1154 + }, + { + "epoch": 0.6404214028278348, + "grad_norm": 0.2216353565454483, + "learning_rate": 2.5334039144800128e-05, + "loss": 1.6679, + "step": 1155 + }, + { + "epoch": 0.6409758802328805, + "grad_norm": 0.20689931511878967, + "learning_rate": 2.5265413285575297e-05, + "loss": 1.5088, + "step": 1156 + }, + { + "epoch": 0.6415303576379262, + "grad_norm": 0.21255846321582794, + "learning_rate": 2.5196837569116267e-05, + "loss": 1.5848, + "step": 1157 + }, + { + "epoch": 0.642084835042972, + "grad_norm": 0.19903656840324402, + "learning_rate": 2.5128312228790684e-05, + "loss": 1.5309, + "step": 1158 + }, + { + "epoch": 0.6426393124480178, + "grad_norm": 0.2166842818260193, + "learning_rate": 2.505983749779481e-05, + "loss": 1.6536, + "step": 1159 + }, + { + "epoch": 0.6431937898530635, + "grad_norm": 0.21221613883972168, + "learning_rate": 2.4991413609152606e-05, + "loss": 1.5502, + "step": 1160 + }, + { + "epoch": 0.6437482672581092, + "grad_norm": 0.2121618390083313, + "learning_rate": 2.4923040795715095e-05, + "loss": 1.566, + "step": 1161 + }, + { + "epoch": 0.644302744663155, + "grad_norm": 0.21857164800167084, + "learning_rate": 2.485471929015944e-05, + "loss": 1.6212, + "step": 1162 + }, + { + "epoch": 0.6448572220682007, + "grad_norm": 0.21056576073169708, + "learning_rate": 2.4786449324988203e-05, + "loss": 1.5362, + "step": 1163 + }, + { + "epoch": 0.6454116994732465, + "grad_norm": 0.21527813374996185, + "learning_rate": 2.4718231132528562e-05, + "loss": 1.5372, + "step": 1164 + }, + { + "epoch": 0.6459661768782922, + "grad_norm": 0.20520104467868805, + "learning_rate": 2.4650064944931495e-05, + "loss": 1.5219, + "step": 1165 + }, + { + "epoch": 0.6465206542833379, + "grad_norm": 0.20994901657104492, + "learning_rate": 2.4581950994171013e-05, + "loss": 1.5247, + "step": 1166 + }, + { + "epoch": 0.6470751316883837, + "grad_norm": 0.21351756155490875, + "learning_rate": 2.4513889512043383e-05, + "loss": 1.572, + "step": 1167 + }, + { + "epoch": 0.6476296090934295, + "grad_norm": 0.21030496060848236, + "learning_rate": 2.444588073016627e-05, + "loss": 1.5433, + "step": 1168 + }, + { + "epoch": 0.6481840864984751, + "grad_norm": 0.2144717425107956, + "learning_rate": 2.4377924879978045e-05, + "loss": 1.563, + "step": 1169 + }, + { + "epoch": 0.6487385639035209, + "grad_norm": 0.20969878137111664, + "learning_rate": 2.4310022192736905e-05, + "loss": 1.5527, + "step": 1170 + }, + { + "epoch": 0.6492930413085667, + "grad_norm": 0.20833313465118408, + "learning_rate": 2.4242172899520164e-05, + "loss": 1.5475, + "step": 1171 + }, + { + "epoch": 0.6498475187136125, + "grad_norm": 0.21155394613742828, + "learning_rate": 2.417437723122343e-05, + "loss": 1.565, + "step": 1172 + }, + { + "epoch": 0.6504019961186581, + "grad_norm": 0.2123716175556183, + "learning_rate": 2.4106635418559786e-05, + "loss": 1.5333, + "step": 1173 + }, + { + "epoch": 0.6509564735237039, + "grad_norm": 0.21838416159152985, + "learning_rate": 2.40389476920591e-05, + "loss": 1.6055, + "step": 1174 + }, + { + "epoch": 0.6515109509287497, + "grad_norm": 0.21394363045692444, + "learning_rate": 2.397131428206711e-05, + "loss": 1.5401, + "step": 1175 + }, + { + "epoch": 0.6520654283337954, + "grad_norm": 0.21762172877788544, + "learning_rate": 2.3903735418744783e-05, + "loss": 1.6162, + "step": 1176 + }, + { + "epoch": 0.6526199057388411, + "grad_norm": 0.21325650811195374, + "learning_rate": 2.3836211332067426e-05, + "loss": 1.5997, + "step": 1177 + }, + { + "epoch": 0.6531743831438869, + "grad_norm": 0.2168748527765274, + "learning_rate": 2.376874225182391e-05, + "loss": 1.5998, + "step": 1178 + }, + { + "epoch": 0.6537288605489326, + "grad_norm": 0.22087283432483673, + "learning_rate": 2.3701328407615977e-05, + "loss": 1.6222, + "step": 1179 + }, + { + "epoch": 0.6542833379539784, + "grad_norm": 0.21224388480186462, + "learning_rate": 2.3633970028857327e-05, + "loss": 1.4607, + "step": 1180 + }, + { + "epoch": 0.6548378153590241, + "grad_norm": 0.21814611554145813, + "learning_rate": 2.356666734477298e-05, + "loss": 1.575, + "step": 1181 + }, + { + "epoch": 0.6553922927640699, + "grad_norm": 0.21967004239559174, + "learning_rate": 2.3499420584398377e-05, + "loss": 1.5645, + "step": 1182 + }, + { + "epoch": 0.6559467701691156, + "grad_norm": 0.2046661376953125, + "learning_rate": 2.343222997657865e-05, + "loss": 1.5153, + "step": 1183 + }, + { + "epoch": 0.6565012475741614, + "grad_norm": 0.21323862671852112, + "learning_rate": 2.336509574996784e-05, + "loss": 1.5637, + "step": 1184 + }, + { + "epoch": 0.6570557249792071, + "grad_norm": 0.21644696593284607, + "learning_rate": 2.3298018133028166e-05, + "loss": 1.5501, + "step": 1185 + }, + { + "epoch": 0.6576102023842528, + "grad_norm": 0.22033213078975677, + "learning_rate": 2.3230997354029116e-05, + "loss": 1.5611, + "step": 1186 + }, + { + "epoch": 0.6581646797892986, + "grad_norm": 0.2171270102262497, + "learning_rate": 2.3164033641046824e-05, + "loss": 1.578, + "step": 1187 + }, + { + "epoch": 0.6587191571943444, + "grad_norm": 0.2252931147813797, + "learning_rate": 2.309712722196319e-05, + "loss": 1.62, + "step": 1188 + }, + { + "epoch": 0.65927363459939, + "grad_norm": 0.20670726895332336, + "learning_rate": 2.303027832446516e-05, + "loss": 1.4831, + "step": 1189 + }, + { + "epoch": 0.6598281120044358, + "grad_norm": 0.2236548513174057, + "learning_rate": 2.296348717604392e-05, + "loss": 1.6453, + "step": 1190 + }, + { + "epoch": 0.6603825894094816, + "grad_norm": 0.22350645065307617, + "learning_rate": 2.2896754003994124e-05, + "loss": 1.5666, + "step": 1191 + }, + { + "epoch": 0.6609370668145274, + "grad_norm": 0.21167486906051636, + "learning_rate": 2.2830079035413153e-05, + "loss": 1.5366, + "step": 1192 + }, + { + "epoch": 0.661491544219573, + "grad_norm": 0.2091861069202423, + "learning_rate": 2.2763462497200282e-05, + "loss": 1.5729, + "step": 1193 + }, + { + "epoch": 0.6620460216246188, + "grad_norm": 0.20955854654312134, + "learning_rate": 2.2696904616055953e-05, + "loss": 1.5978, + "step": 1194 + }, + { + "epoch": 0.6626004990296646, + "grad_norm": 0.20902609825134277, + "learning_rate": 2.2630405618481052e-05, + "loss": 1.5898, + "step": 1195 + }, + { + "epoch": 0.6631549764347103, + "grad_norm": 0.2072347104549408, + "learning_rate": 2.2563965730775995e-05, + "loss": 1.517, + "step": 1196 + }, + { + "epoch": 0.663709453839756, + "grad_norm": 0.20107337832450867, + "learning_rate": 2.2497585179040087e-05, + "loss": 1.5093, + "step": 1197 + }, + { + "epoch": 0.6642639312448018, + "grad_norm": 0.2148445099592209, + "learning_rate": 2.243126418917071e-05, + "loss": 1.5597, + "step": 1198 + }, + { + "epoch": 0.6648184086498475, + "grad_norm": 0.20870907604694366, + "learning_rate": 2.236500298686253e-05, + "loss": 1.5352, + "step": 1199 + }, + { + "epoch": 0.6653728860548933, + "grad_norm": 0.21237953007221222, + "learning_rate": 2.229880179760681e-05, + "loss": 1.5499, + "step": 1200 + }, + { + "epoch": 0.665927363459939, + "grad_norm": 0.21513739228248596, + "learning_rate": 2.2232660846690477e-05, + "loss": 1.5859, + "step": 1201 + }, + { + "epoch": 0.6664818408649847, + "grad_norm": 0.22024357318878174, + "learning_rate": 2.2166580359195594e-05, + "loss": 1.6493, + "step": 1202 + }, + { + "epoch": 0.6670363182700305, + "grad_norm": 0.21224506199359894, + "learning_rate": 2.210056055999835e-05, + "loss": 1.588, + "step": 1203 + }, + { + "epoch": 0.6675907956750763, + "grad_norm": 0.21365848183631897, + "learning_rate": 2.2034601673768475e-05, + "loss": 1.5165, + "step": 1204 + }, + { + "epoch": 0.668145273080122, + "grad_norm": 0.21693235635757446, + "learning_rate": 2.1968703924968388e-05, + "loss": 1.6435, + "step": 1205 + }, + { + "epoch": 0.6686997504851677, + "grad_norm": 0.2145548313856125, + "learning_rate": 2.1902867537852453e-05, + "loss": 1.5788, + "step": 1206 + }, + { + "epoch": 0.6692542278902135, + "grad_norm": 0.21968676149845123, + "learning_rate": 2.1837092736466233e-05, + "loss": 1.6045, + "step": 1207 + }, + { + "epoch": 0.6698087052952593, + "grad_norm": 0.2109319120645523, + "learning_rate": 2.1771379744645643e-05, + "loss": 1.5182, + "step": 1208 + }, + { + "epoch": 0.6703631827003049, + "grad_norm": 0.21400365233421326, + "learning_rate": 2.170572878601636e-05, + "loss": 1.5532, + "step": 1209 + }, + { + "epoch": 0.6709176601053507, + "grad_norm": 0.20740005373954773, + "learning_rate": 2.1640140083992908e-05, + "loss": 1.5601, + "step": 1210 + }, + { + "epoch": 0.6714721375103965, + "grad_norm": 0.21574941277503967, + "learning_rate": 2.1574613861777904e-05, + "loss": 1.5462, + "step": 1211 + }, + { + "epoch": 0.6720266149154422, + "grad_norm": 0.2179066389799118, + "learning_rate": 2.1509150342361407e-05, + "loss": 1.6072, + "step": 1212 + }, + { + "epoch": 0.6725810923204879, + "grad_norm": 0.21342717111110687, + "learning_rate": 2.144374974852007e-05, + "loss": 1.6028, + "step": 1213 + }, + { + "epoch": 0.6731355697255337, + "grad_norm": 0.21398906409740448, + "learning_rate": 2.1378412302816408e-05, + "loss": 1.595, + "step": 1214 + }, + { + "epoch": 0.6736900471305795, + "grad_norm": 0.20966409146785736, + "learning_rate": 2.1313138227598053e-05, + "loss": 1.5391, + "step": 1215 + }, + { + "epoch": 0.6742445245356252, + "grad_norm": 0.20586159825325012, + "learning_rate": 2.1247927744996913e-05, + "loss": 1.5392, + "step": 1216 + }, + { + "epoch": 0.6747990019406709, + "grad_norm": 0.21044990420341492, + "learning_rate": 2.11827810769286e-05, + "loss": 1.6125, + "step": 1217 + }, + { + "epoch": 0.6753534793457167, + "grad_norm": 0.22008857131004333, + "learning_rate": 2.111769844509149e-05, + "loss": 1.5878, + "step": 1218 + }, + { + "epoch": 0.6759079567507624, + "grad_norm": 0.21583212912082672, + "learning_rate": 2.105268007096603e-05, + "loss": 1.5553, + "step": 1219 + }, + { + "epoch": 0.6764624341558082, + "grad_norm": 0.20590542256832123, + "learning_rate": 2.0987726175814025e-05, + "loss": 1.5257, + "step": 1220 + }, + { + "epoch": 0.6770169115608539, + "grad_norm": 0.21506734192371368, + "learning_rate": 2.0922836980677844e-05, + "loss": 1.6258, + "step": 1221 + }, + { + "epoch": 0.6775713889658996, + "grad_norm": 0.21825283765792847, + "learning_rate": 2.085801270637968e-05, + "loss": 1.5429, + "step": 1222 + }, + { + "epoch": 0.6781258663709454, + "grad_norm": 0.2100907862186432, + "learning_rate": 2.0793253573520785e-05, + "loss": 1.5905, + "step": 1223 + }, + { + "epoch": 0.6786803437759912, + "grad_norm": 0.21609370410442352, + "learning_rate": 2.0728559802480754e-05, + "loss": 1.6102, + "step": 1224 + }, + { + "epoch": 0.6792348211810368, + "grad_norm": 0.21235951781272888, + "learning_rate": 2.0663931613416746e-05, + "loss": 1.4997, + "step": 1225 + }, + { + "epoch": 0.6797892985860826, + "grad_norm": 0.20869648456573486, + "learning_rate": 2.0599369226262693e-05, + "loss": 1.5351, + "step": 1226 + }, + { + "epoch": 0.6803437759911284, + "grad_norm": 0.21568715572357178, + "learning_rate": 2.053487286072865e-05, + "loss": 1.5294, + "step": 1227 + }, + { + "epoch": 0.6808982533961742, + "grad_norm": 0.21656538546085358, + "learning_rate": 2.0470442736300013e-05, + "loss": 1.5462, + "step": 1228 + }, + { + "epoch": 0.6814527308012198, + "grad_norm": 0.21754758059978485, + "learning_rate": 2.0406079072236684e-05, + "loss": 1.5732, + "step": 1229 + }, + { + "epoch": 0.6820072082062656, + "grad_norm": 0.21004228293895721, + "learning_rate": 2.0341782087572453e-05, + "loss": 1.5723, + "step": 1230 + }, + { + "epoch": 0.6825616856113114, + "grad_norm": 0.22100019454956055, + "learning_rate": 2.0277552001114183e-05, + "loss": 1.5375, + "step": 1231 + }, + { + "epoch": 0.6831161630163571, + "grad_norm": 0.21162226796150208, + "learning_rate": 2.0213389031441072e-05, + "loss": 1.5535, + "step": 1232 + }, + { + "epoch": 0.6836706404214028, + "grad_norm": 0.21510981023311615, + "learning_rate": 2.0149293396903936e-05, + "loss": 1.5419, + "step": 1233 + }, + { + "epoch": 0.6842251178264486, + "grad_norm": 0.22378797829151154, + "learning_rate": 2.0085265315624375e-05, + "loss": 1.5729, + "step": 1234 + }, + { + "epoch": 0.6847795952314943, + "grad_norm": 0.2192632257938385, + "learning_rate": 2.002130500549422e-05, + "loss": 1.4287, + "step": 1235 + }, + { + "epoch": 0.6853340726365401, + "grad_norm": 0.22077576816082, + "learning_rate": 1.995741268417455e-05, + "loss": 1.5635, + "step": 1236 + }, + { + "epoch": 0.6858885500415858, + "grad_norm": 0.22820354998111725, + "learning_rate": 1.9893588569095148e-05, + "loss": 1.5514, + "step": 1237 + }, + { + "epoch": 0.6864430274466315, + "grad_norm": 0.22799494862556458, + "learning_rate": 1.9829832877453673e-05, + "loss": 1.6064, + "step": 1238 + }, + { + "epoch": 0.6869975048516773, + "grad_norm": 0.21330177783966064, + "learning_rate": 1.976614582621492e-05, + "loss": 1.5972, + "step": 1239 + }, + { + "epoch": 0.6875519822567231, + "grad_norm": 0.22794507443904877, + "learning_rate": 1.9702527632110128e-05, + "loss": 1.5362, + "step": 1240 + }, + { + "epoch": 0.6881064596617688, + "grad_norm": 0.2161223590373993, + "learning_rate": 1.9638978511636133e-05, + "loss": 1.5349, + "step": 1241 + }, + { + "epoch": 0.6886609370668145, + "grad_norm": 0.21102319657802582, + "learning_rate": 1.9575498681054816e-05, + "loss": 1.5777, + "step": 1242 + }, + { + "epoch": 0.6892154144718603, + "grad_norm": 0.2153400331735611, + "learning_rate": 1.9512088356392206e-05, + "loss": 1.5413, + "step": 1243 + }, + { + "epoch": 0.689769891876906, + "grad_norm": 0.23227231204509735, + "learning_rate": 1.9448747753437766e-05, + "loss": 1.5721, + "step": 1244 + }, + { + "epoch": 0.6903243692819517, + "grad_norm": 0.21719689667224884, + "learning_rate": 1.938547708774373e-05, + "loss": 1.5343, + "step": 1245 + }, + { + "epoch": 0.6908788466869975, + "grad_norm": 0.20928537845611572, + "learning_rate": 1.9322276574624374e-05, + "loss": 1.5005, + "step": 1246 + }, + { + "epoch": 0.6914333240920433, + "grad_norm": 0.23064006865024567, + "learning_rate": 1.925914642915514e-05, + "loss": 1.5943, + "step": 1247 + }, + { + "epoch": 0.6919878014970889, + "grad_norm": 0.22050850093364716, + "learning_rate": 1.919608686617208e-05, + "loss": 1.5496, + "step": 1248 + }, + { + "epoch": 0.6925422789021347, + "grad_norm": 0.22092626988887787, + "learning_rate": 1.9133098100271018e-05, + "loss": 1.6028, + "step": 1249 + }, + { + "epoch": 0.6930967563071805, + "grad_norm": 0.23645174503326416, + "learning_rate": 1.9070180345806867e-05, + "loss": 1.5723, + "step": 1250 + }, + { + "epoch": 0.6936512337122263, + "grad_norm": 0.22588878870010376, + "learning_rate": 1.9007333816892886e-05, + "loss": 1.59, + "step": 1251 + }, + { + "epoch": 0.6942057111172719, + "grad_norm": 0.2189507782459259, + "learning_rate": 1.8944558727399894e-05, + "loss": 1.5932, + "step": 1252 + }, + { + "epoch": 0.6947601885223177, + "grad_norm": 0.21700263023376465, + "learning_rate": 1.8881855290955702e-05, + "loss": 1.5475, + "step": 1253 + }, + { + "epoch": 0.6953146659273635, + "grad_norm": 0.21820667386054993, + "learning_rate": 1.8819223720944176e-05, + "loss": 1.534, + "step": 1254 + }, + { + "epoch": 0.6958691433324092, + "grad_norm": 0.22174763679504395, + "learning_rate": 1.875666423050467e-05, + "loss": 1.6515, + "step": 1255 + }, + { + "epoch": 0.6964236207374549, + "grad_norm": 0.22205999493598938, + "learning_rate": 1.8694177032531247e-05, + "loss": 1.6275, + "step": 1256 + }, + { + "epoch": 0.6969780981425007, + "grad_norm": 0.21358875930309296, + "learning_rate": 1.863176233967193e-05, + "loss": 1.6198, + "step": 1257 + }, + { + "epoch": 0.6975325755475464, + "grad_norm": 0.21394740045070648, + "learning_rate": 1.856942036432804e-05, + "loss": 1.5314, + "step": 1258 + }, + { + "epoch": 0.6980870529525922, + "grad_norm": 0.21883390843868256, + "learning_rate": 1.850715131865336e-05, + "loss": 1.6133, + "step": 1259 + }, + { + "epoch": 0.6986415303576379, + "grad_norm": 0.21168088912963867, + "learning_rate": 1.8444955414553595e-05, + "loss": 1.6031, + "step": 1260 + }, + { + "epoch": 0.6991960077626836, + "grad_norm": 0.2189856618642807, + "learning_rate": 1.8382832863685477e-05, + "loss": 1.5858, + "step": 1261 + }, + { + "epoch": 0.6997504851677294, + "grad_norm": 0.20865415036678314, + "learning_rate": 1.8320783877456107e-05, + "loss": 1.5458, + "step": 1262 + }, + { + "epoch": 0.7003049625727752, + "grad_norm": 0.21896782517433167, + "learning_rate": 1.825880866702226e-05, + "loss": 1.5763, + "step": 1263 + }, + { + "epoch": 0.7008594399778209, + "grad_norm": 0.2206234335899353, + "learning_rate": 1.8196907443289656e-05, + "loss": 1.5721, + "step": 1264 + }, + { + "epoch": 0.7014139173828666, + "grad_norm": 0.217427596449852, + "learning_rate": 1.813508041691222e-05, + "loss": 1.6216, + "step": 1265 + }, + { + "epoch": 0.7019683947879124, + "grad_norm": 0.2126351296901703, + "learning_rate": 1.8073327798291387e-05, + "loss": 1.5936, + "step": 1266 + }, + { + "epoch": 0.7025228721929582, + "grad_norm": 0.2115585058927536, + "learning_rate": 1.8011649797575343e-05, + "loss": 1.5893, + "step": 1267 + }, + { + "epoch": 0.7030773495980038, + "grad_norm": 0.21458736062049866, + "learning_rate": 1.7950046624658418e-05, + "loss": 1.4703, + "step": 1268 + }, + { + "epoch": 0.7036318270030496, + "grad_norm": 0.21214710175991058, + "learning_rate": 1.788851848918022e-05, + "loss": 1.6151, + "step": 1269 + }, + { + "epoch": 0.7041863044080954, + "grad_norm": 0.21619988977909088, + "learning_rate": 1.7827065600525043e-05, + "loss": 1.4581, + "step": 1270 + }, + { + "epoch": 0.7047407818131411, + "grad_norm": 0.21274301409721375, + "learning_rate": 1.77656881678211e-05, + "loss": 1.5052, + "step": 1271 + }, + { + "epoch": 0.7052952592181868, + "grad_norm": 0.2131747454404831, + "learning_rate": 1.7704386399939818e-05, + "loss": 1.5616, + "step": 1272 + }, + { + "epoch": 0.7058497366232326, + "grad_norm": 0.22459973394870758, + "learning_rate": 1.7643160505495146e-05, + "loss": 1.6162, + "step": 1273 + }, + { + "epoch": 0.7064042140282784, + "grad_norm": 0.21642784774303436, + "learning_rate": 1.7582010692842823e-05, + "loss": 1.5565, + "step": 1274 + }, + { + "epoch": 0.7069586914333241, + "grad_norm": 0.22068682312965393, + "learning_rate": 1.7520937170079667e-05, + "loss": 1.5865, + "step": 1275 + }, + { + "epoch": 0.7075131688383698, + "grad_norm": 0.2210376113653183, + "learning_rate": 1.7459940145042904e-05, + "loss": 1.5605, + "step": 1276 + }, + { + "epoch": 0.7080676462434156, + "grad_norm": 0.21524764597415924, + "learning_rate": 1.7399019825309387e-05, + "loss": 1.568, + "step": 1277 + }, + { + "epoch": 0.7086221236484613, + "grad_norm": 0.20720575749874115, + "learning_rate": 1.733817641819496e-05, + "loss": 1.5723, + "step": 1278 + }, + { + "epoch": 0.7091766010535071, + "grad_norm": 0.22238507866859436, + "learning_rate": 1.7277410130753775e-05, + "loss": 1.5773, + "step": 1279 + }, + { + "epoch": 0.7097310784585528, + "grad_norm": 0.20956286787986755, + "learning_rate": 1.7216721169777452e-05, + "loss": 1.5252, + "step": 1280 + }, + { + "epoch": 0.7102855558635985, + "grad_norm": 0.21438319981098175, + "learning_rate": 1.7156109741794533e-05, + "loss": 1.4835, + "step": 1281 + }, + { + "epoch": 0.7108400332686443, + "grad_norm": 0.21574389934539795, + "learning_rate": 1.709557605306967e-05, + "loss": 1.588, + "step": 1282 + }, + { + "epoch": 0.7113945106736901, + "grad_norm": 0.2138158082962036, + "learning_rate": 1.7035120309602994e-05, + "loss": 1.5567, + "step": 1283 + }, + { + "epoch": 0.7119489880787357, + "grad_norm": 0.21723328530788422, + "learning_rate": 1.6974742717129373e-05, + "loss": 1.5732, + "step": 1284 + }, + { + "epoch": 0.7125034654837815, + "grad_norm": 0.2203451544046402, + "learning_rate": 1.6914443481117678e-05, + "loss": 1.6133, + "step": 1285 + }, + { + "epoch": 0.7130579428888273, + "grad_norm": 0.20896215736865997, + "learning_rate": 1.6854222806770228e-05, + "loss": 1.5527, + "step": 1286 + }, + { + "epoch": 0.7136124202938731, + "grad_norm": 0.20967014133930206, + "learning_rate": 1.679408089902188e-05, + "loss": 1.616, + "step": 1287 + }, + { + "epoch": 0.7141668976989187, + "grad_norm": 0.21301911771297455, + "learning_rate": 1.673401796253952e-05, + "loss": 1.5695, + "step": 1288 + }, + { + "epoch": 0.7147213751039645, + "grad_norm": 0.22176018357276917, + "learning_rate": 1.667403420172125e-05, + "loss": 1.5707, + "step": 1289 + }, + { + "epoch": 0.7152758525090103, + "grad_norm": 0.20806194841861725, + "learning_rate": 1.6614129820695755e-05, + "loss": 1.547, + "step": 1290 + }, + { + "epoch": 0.715830329914056, + "grad_norm": 0.21885931491851807, + "learning_rate": 1.6554305023321587e-05, + "loss": 1.5618, + "step": 1291 + }, + { + "epoch": 0.7163848073191017, + "grad_norm": 0.21963781118392944, + "learning_rate": 1.6494560013186413e-05, + "loss": 1.5697, + "step": 1292 + }, + { + "epoch": 0.7169392847241475, + "grad_norm": 0.20642498135566711, + "learning_rate": 1.6434894993606474e-05, + "loss": 1.5343, + "step": 1293 + }, + { + "epoch": 0.7174937621291932, + "grad_norm": 0.21944020688533783, + "learning_rate": 1.6375310167625736e-05, + "loss": 1.589, + "step": 1294 + }, + { + "epoch": 0.718048239534239, + "grad_norm": 0.20910033583641052, + "learning_rate": 1.631580573801526e-05, + "loss": 1.5539, + "step": 1295 + }, + { + "epoch": 0.7186027169392847, + "grad_norm": 0.21879972517490387, + "learning_rate": 1.625638190727253e-05, + "loss": 1.557, + "step": 1296 + }, + { + "epoch": 0.7191571943443305, + "grad_norm": 0.21549879014492035, + "learning_rate": 1.6197038877620745e-05, + "loss": 1.5955, + "step": 1297 + }, + { + "epoch": 0.7197116717493762, + "grad_norm": 0.21491822600364685, + "learning_rate": 1.6137776851008135e-05, + "loss": 1.5934, + "step": 1298 + }, + { + "epoch": 0.720266149154422, + "grad_norm": 0.21684475243091583, + "learning_rate": 1.607859602910726e-05, + "loss": 1.5928, + "step": 1299 + }, + { + "epoch": 0.7208206265594677, + "grad_norm": 0.2135036587715149, + "learning_rate": 1.601949661331434e-05, + "loss": 1.5901, + "step": 1300 + }, + { + "epoch": 0.7213751039645134, + "grad_norm": 0.21174384653568268, + "learning_rate": 1.596047880474859e-05, + "loss": 1.5478, + "step": 1301 + }, + { + "epoch": 0.7219295813695592, + "grad_norm": 0.21973606944084167, + "learning_rate": 1.5901542804251446e-05, + "loss": 1.5873, + "step": 1302 + }, + { + "epoch": 0.722484058774605, + "grad_norm": 0.21199335157871246, + "learning_rate": 1.5842688812385997e-05, + "loss": 1.5681, + "step": 1303 + }, + { + "epoch": 0.7230385361796506, + "grad_norm": 0.20926666259765625, + "learning_rate": 1.578391702943628e-05, + "loss": 1.5635, + "step": 1304 + }, + { + "epoch": 0.7235930135846964, + "grad_norm": 0.20591633021831512, + "learning_rate": 1.5725227655406485e-05, + "loss": 1.5811, + "step": 1305 + }, + { + "epoch": 0.7241474909897422, + "grad_norm": 0.21061578392982483, + "learning_rate": 1.5666620890020423e-05, + "loss": 1.5401, + "step": 1306 + }, + { + "epoch": 0.724701968394788, + "grad_norm": 0.21330875158309937, + "learning_rate": 1.5608096932720758e-05, + "loss": 1.5631, + "step": 1307 + }, + { + "epoch": 0.7252564457998336, + "grad_norm": 0.21160577237606049, + "learning_rate": 1.5549655982668365e-05, + "loss": 1.5541, + "step": 1308 + }, + { + "epoch": 0.7258109232048794, + "grad_norm": 0.21944500505924225, + "learning_rate": 1.549129823874164e-05, + "loss": 1.5407, + "step": 1309 + }, + { + "epoch": 0.7263654006099252, + "grad_norm": 0.21115024387836456, + "learning_rate": 1.543302389953578e-05, + "loss": 1.5101, + "step": 1310 + }, + { + "epoch": 0.7269198780149709, + "grad_norm": 0.21656592190265656, + "learning_rate": 1.5374833163362228e-05, + "loss": 1.5505, + "step": 1311 + }, + { + "epoch": 0.7274743554200166, + "grad_norm": 0.20774325728416443, + "learning_rate": 1.5316726228247874e-05, + "loss": 1.5188, + "step": 1312 + }, + { + "epoch": 0.7280288328250624, + "grad_norm": 0.22355897724628448, + "learning_rate": 1.525870329193441e-05, + "loss": 1.5638, + "step": 1313 + }, + { + "epoch": 0.7285833102301081, + "grad_norm": 0.20786559581756592, + "learning_rate": 1.5200764551877707e-05, + "loss": 1.5139, + "step": 1314 + }, + { + "epoch": 0.7291377876351539, + "grad_norm": 0.21378779411315918, + "learning_rate": 1.51429102052471e-05, + "loss": 1.5272, + "step": 1315 + }, + { + "epoch": 0.7296922650401996, + "grad_norm": 0.2113804817199707, + "learning_rate": 1.5085140448924738e-05, + "loss": 1.5503, + "step": 1316 + }, + { + "epoch": 0.7302467424452453, + "grad_norm": 0.2093944549560547, + "learning_rate": 1.5027455479504896e-05, + "loss": 1.5027, + "step": 1317 + }, + { + "epoch": 0.7308012198502911, + "grad_norm": 0.2111719399690628, + "learning_rate": 1.4969855493293275e-05, + "loss": 1.4586, + "step": 1318 + }, + { + "epoch": 0.7313556972553369, + "grad_norm": 0.21029268205165863, + "learning_rate": 1.491234068630646e-05, + "loss": 1.6253, + "step": 1319 + }, + { + "epoch": 0.7319101746603826, + "grad_norm": 0.2066301554441452, + "learning_rate": 1.4854911254271086e-05, + "loss": 1.4414, + "step": 1320 + }, + { + "epoch": 0.7324646520654283, + "grad_norm": 0.2162986844778061, + "learning_rate": 1.4797567392623275e-05, + "loss": 1.5728, + "step": 1321 + }, + { + "epoch": 0.7330191294704741, + "grad_norm": 0.21040524542331696, + "learning_rate": 1.4740309296507969e-05, + "loss": 1.5839, + "step": 1322 + }, + { + "epoch": 0.7335736068755199, + "grad_norm": 0.22399760782718658, + "learning_rate": 1.4683137160778218e-05, + "loss": 1.6573, + "step": 1323 + }, + { + "epoch": 0.7341280842805655, + "grad_norm": 0.20881177484989166, + "learning_rate": 1.4626051179994569e-05, + "loss": 1.5418, + "step": 1324 + }, + { + "epoch": 0.7346825616856113, + "grad_norm": 0.21787725389003754, + "learning_rate": 1.4569051548424323e-05, + "loss": 1.6005, + "step": 1325 + }, + { + "epoch": 0.7352370390906571, + "grad_norm": 0.2111303210258484, + "learning_rate": 1.4512138460041004e-05, + "loss": 1.5171, + "step": 1326 + }, + { + "epoch": 0.7357915164957028, + "grad_norm": 0.20768657326698303, + "learning_rate": 1.4455312108523587e-05, + "loss": 1.5723, + "step": 1327 + }, + { + "epoch": 0.7363459939007485, + "grad_norm": 0.20773515105247498, + "learning_rate": 1.4398572687255858e-05, + "loss": 1.5349, + "step": 1328 + }, + { + "epoch": 0.7369004713057943, + "grad_norm": 0.20800316333770752, + "learning_rate": 1.43419203893258e-05, + "loss": 1.5316, + "step": 1329 + }, + { + "epoch": 0.73745494871084, + "grad_norm": 0.21494507789611816, + "learning_rate": 1.42853554075249e-05, + "loss": 1.5631, + "step": 1330 + }, + { + "epoch": 0.7380094261158858, + "grad_norm": 0.2120676040649414, + "learning_rate": 1.4228877934347506e-05, + "loss": 1.5709, + "step": 1331 + }, + { + "epoch": 0.7385639035209315, + "grad_norm": 0.20988710224628448, + "learning_rate": 1.4172488161990168e-05, + "loss": 1.5334, + "step": 1332 + }, + { + "epoch": 0.7391183809259773, + "grad_norm": 0.21031689643859863, + "learning_rate": 1.4116186282350981e-05, + "loss": 1.5124, + "step": 1333 + }, + { + "epoch": 0.739672858331023, + "grad_norm": 0.21639370918273926, + "learning_rate": 1.4059972487028936e-05, + "loss": 1.567, + "step": 1334 + }, + { + "epoch": 0.7402273357360688, + "grad_norm": 0.21508349478244781, + "learning_rate": 1.4003846967323283e-05, + "loss": 1.5132, + "step": 1335 + }, + { + "epoch": 0.7407818131411145, + "grad_norm": 0.21374095976352692, + "learning_rate": 1.3947809914232808e-05, + "loss": 1.555, + "step": 1336 + }, + { + "epoch": 0.7413362905461602, + "grad_norm": 0.22325333952903748, + "learning_rate": 1.3891861518455342e-05, + "loss": 1.5703, + "step": 1337 + }, + { + "epoch": 0.741890767951206, + "grad_norm": 0.22245821356773376, + "learning_rate": 1.3836001970386898e-05, + "loss": 1.5424, + "step": 1338 + }, + { + "epoch": 0.7424452453562518, + "grad_norm": 0.2141106277704239, + "learning_rate": 1.3780231460121204e-05, + "loss": 1.601, + "step": 1339 + }, + { + "epoch": 0.7429997227612974, + "grad_norm": 0.21222813427448273, + "learning_rate": 1.3724550177448968e-05, + "loss": 1.5908, + "step": 1340 + }, + { + "epoch": 0.7435542001663432, + "grad_norm": 0.21440914273262024, + "learning_rate": 1.3668958311857247e-05, + "loss": 1.6119, + "step": 1341 + }, + { + "epoch": 0.744108677571389, + "grad_norm": 0.21355807781219482, + "learning_rate": 1.3613456052528822e-05, + "loss": 1.5494, + "step": 1342 + }, + { + "epoch": 0.7446631549764348, + "grad_norm": 0.22582007944583893, + "learning_rate": 1.3558043588341488e-05, + "loss": 1.6013, + "step": 1343 + }, + { + "epoch": 0.7452176323814804, + "grad_norm": 0.22058305144309998, + "learning_rate": 1.3502721107867536e-05, + "loss": 1.6472, + "step": 1344 + }, + { + "epoch": 0.7457721097865262, + "grad_norm": 0.21767617762088776, + "learning_rate": 1.3447488799372997e-05, + "loss": 1.5594, + "step": 1345 + }, + { + "epoch": 0.746326587191572, + "grad_norm": 0.21455064415931702, + "learning_rate": 1.339234685081702e-05, + "loss": 1.4785, + "step": 1346 + }, + { + "epoch": 0.7468810645966177, + "grad_norm": 0.2150326520204544, + "learning_rate": 1.3337295449851282e-05, + "loss": 1.5622, + "step": 1347 + }, + { + "epoch": 0.7474355420016634, + "grad_norm": 0.21228265762329102, + "learning_rate": 1.328233478381932e-05, + "loss": 1.5579, + "step": 1348 + }, + { + "epoch": 0.7479900194067092, + "grad_norm": 0.21863175928592682, + "learning_rate": 1.3227465039755889e-05, + "loss": 1.5563, + "step": 1349 + }, + { + "epoch": 0.748544496811755, + "grad_norm": 0.22516675293445587, + "learning_rate": 1.3172686404386323e-05, + "loss": 1.5843, + "step": 1350 + }, + { + "epoch": 0.7490989742168007, + "grad_norm": 0.21071338653564453, + "learning_rate": 1.3117999064125923e-05, + "loss": 1.4893, + "step": 1351 + }, + { + "epoch": 0.7496534516218464, + "grad_norm": 0.21058571338653564, + "learning_rate": 1.3063403205079302e-05, + "loss": 1.4929, + "step": 1352 + }, + { + "epoch": 0.7502079290268922, + "grad_norm": 0.2110683172941208, + "learning_rate": 1.300889901303973e-05, + "loss": 1.5716, + "step": 1353 + }, + { + "epoch": 0.7507624064319379, + "grad_norm": 0.22310392558574677, + "learning_rate": 1.2954486673488554e-05, + "loss": 1.5591, + "step": 1354 + }, + { + "epoch": 0.7513168838369837, + "grad_norm": 0.21150308847427368, + "learning_rate": 1.290016637159457e-05, + "loss": 1.5552, + "step": 1355 + }, + { + "epoch": 0.7518713612420294, + "grad_norm": 0.2172410786151886, + "learning_rate": 1.2845938292213296e-05, + "loss": 1.6018, + "step": 1356 + }, + { + "epoch": 0.7524258386470751, + "grad_norm": 0.21438592672348022, + "learning_rate": 1.2791802619886457e-05, + "loss": 1.5254, + "step": 1357 + }, + { + "epoch": 0.7529803160521209, + "grad_norm": 0.2097012847661972, + "learning_rate": 1.2737759538841297e-05, + "loss": 1.4957, + "step": 1358 + }, + { + "epoch": 0.7535347934571667, + "grad_norm": 0.22424669563770294, + "learning_rate": 1.2683809232989975e-05, + "loss": 1.5994, + "step": 1359 + }, + { + "epoch": 0.7540892708622123, + "grad_norm": 0.23129421472549438, + "learning_rate": 1.2629951885928931e-05, + "loss": 1.6463, + "step": 1360 + }, + { + "epoch": 0.7546437482672581, + "grad_norm": 0.2219121903181076, + "learning_rate": 1.2576187680938214e-05, + "loss": 1.641, + "step": 1361 + }, + { + "epoch": 0.7551982256723039, + "grad_norm": 0.21605202555656433, + "learning_rate": 1.2522516800980995e-05, + "loss": 1.6419, + "step": 1362 + }, + { + "epoch": 0.7557527030773497, + "grad_norm": 0.2168269157409668, + "learning_rate": 1.2468939428702762e-05, + "loss": 1.5664, + "step": 1363 + }, + { + "epoch": 0.7563071804823953, + "grad_norm": 0.21760636568069458, + "learning_rate": 1.2415455746430846e-05, + "loss": 1.5758, + "step": 1364 + }, + { + "epoch": 0.7568616578874411, + "grad_norm": 0.2120659053325653, + "learning_rate": 1.2362065936173728e-05, + "loss": 1.584, + "step": 1365 + }, + { + "epoch": 0.7574161352924869, + "grad_norm": 0.20472942292690277, + "learning_rate": 1.230877017962043e-05, + "loss": 1.5304, + "step": 1366 + }, + { + "epoch": 0.7579706126975326, + "grad_norm": 0.21528665721416473, + "learning_rate": 1.2255568658139918e-05, + "loss": 1.6002, + "step": 1367 + }, + { + "epoch": 0.7585250901025783, + "grad_norm": 0.21428853273391724, + "learning_rate": 1.2202461552780473e-05, + "loss": 1.6109, + "step": 1368 + }, + { + "epoch": 0.7590795675076241, + "grad_norm": 0.21304574608802795, + "learning_rate": 1.214944904426902e-05, + "loss": 1.5883, + "step": 1369 + }, + { + "epoch": 0.7596340449126698, + "grad_norm": 0.22913344204425812, + "learning_rate": 1.209653131301066e-05, + "loss": 1.6736, + "step": 1370 + }, + { + "epoch": 0.7601885223177155, + "grad_norm": 0.2191346287727356, + "learning_rate": 1.2043708539087865e-05, + "loss": 1.5518, + "step": 1371 + }, + { + "epoch": 0.7607429997227613, + "grad_norm": 0.21301528811454773, + "learning_rate": 1.1990980902260008e-05, + "loss": 1.5366, + "step": 1372 + }, + { + "epoch": 0.761297477127807, + "grad_norm": 0.21601028740406036, + "learning_rate": 1.1938348581962713e-05, + "loss": 1.5593, + "step": 1373 + }, + { + "epoch": 0.7618519545328528, + "grad_norm": 0.21673454344272614, + "learning_rate": 1.1885811757307209e-05, + "loss": 1.589, + "step": 1374 + }, + { + "epoch": 0.7624064319378985, + "grad_norm": 0.20797033607959747, + "learning_rate": 1.1833370607079778e-05, + "loss": 1.5379, + "step": 1375 + }, + { + "epoch": 0.7629609093429442, + "grad_norm": 0.22476620972156525, + "learning_rate": 1.1781025309741056e-05, + "loss": 1.5536, + "step": 1376 + }, + { + "epoch": 0.76351538674799, + "grad_norm": 0.2215532660484314, + "learning_rate": 1.1728776043425563e-05, + "loss": 1.5604, + "step": 1377 + }, + { + "epoch": 0.7640698641530358, + "grad_norm": 0.214621901512146, + "learning_rate": 1.1676622985940983e-05, + "loss": 1.5223, + "step": 1378 + }, + { + "epoch": 0.7646243415580815, + "grad_norm": 0.2120085209608078, + "learning_rate": 1.1624566314767573e-05, + "loss": 1.5189, + "step": 1379 + }, + { + "epoch": 0.7651788189631272, + "grad_norm": 0.21431514620780945, + "learning_rate": 1.1572606207057607e-05, + "loss": 1.5565, + "step": 1380 + }, + { + "epoch": 0.765733296368173, + "grad_norm": 0.21584293246269226, + "learning_rate": 1.152074283963475e-05, + "loss": 1.5717, + "step": 1381 + }, + { + "epoch": 0.7662877737732188, + "grad_norm": 0.22375768423080444, + "learning_rate": 1.1468976388993438e-05, + "loss": 1.6198, + "step": 1382 + }, + { + "epoch": 0.7668422511782644, + "grad_norm": 0.22090241312980652, + "learning_rate": 1.1417307031298304e-05, + "loss": 1.5913, + "step": 1383 + }, + { + "epoch": 0.7673967285833102, + "grad_norm": 0.20872661471366882, + "learning_rate": 1.1365734942383565e-05, + "loss": 1.5244, + "step": 1384 + }, + { + "epoch": 0.767951205988356, + "grad_norm": 0.218302920460701, + "learning_rate": 1.131426029775244e-05, + "loss": 1.5787, + "step": 1385 + }, + { + "epoch": 0.7685056833934018, + "grad_norm": 0.21635451912879944, + "learning_rate": 1.1262883272576492e-05, + "loss": 1.5958, + "step": 1386 + }, + { + "epoch": 0.7690601607984474, + "grad_norm": 0.21018511056900024, + "learning_rate": 1.1211604041695114e-05, + "loss": 1.5487, + "step": 1387 + }, + { + "epoch": 0.7696146382034932, + "grad_norm": 0.23798473179340363, + "learning_rate": 1.1160422779614928e-05, + "loss": 1.6402, + "step": 1388 + }, + { + "epoch": 0.770169115608539, + "grad_norm": 0.21271711587905884, + "learning_rate": 1.1109339660509098e-05, + "loss": 1.5796, + "step": 1389 + }, + { + "epoch": 0.7707235930135847, + "grad_norm": 0.20341956615447998, + "learning_rate": 1.1058354858216842e-05, + "loss": 1.5225, + "step": 1390 + }, + { + "epoch": 0.7712780704186304, + "grad_norm": 0.23549644649028778, + "learning_rate": 1.1007468546242786e-05, + "loss": 1.5797, + "step": 1391 + }, + { + "epoch": 0.7718325478236762, + "grad_norm": 0.22186161577701569, + "learning_rate": 1.0956680897756394e-05, + "loss": 1.5828, + "step": 1392 + }, + { + "epoch": 0.7723870252287219, + "grad_norm": 0.22768884897232056, + "learning_rate": 1.0905992085591373e-05, + "loss": 1.5781, + "step": 1393 + }, + { + "epoch": 0.7729415026337677, + "grad_norm": 0.20948225259780884, + "learning_rate": 1.0855402282245047e-05, + "loss": 1.5476, + "step": 1394 + }, + { + "epoch": 0.7734959800388134, + "grad_norm": 0.21078969538211823, + "learning_rate": 1.0804911659877874e-05, + "loss": 1.4978, + "step": 1395 + }, + { + "epoch": 0.7740504574438591, + "grad_norm": 0.22138863801956177, + "learning_rate": 1.0754520390312755e-05, + "loss": 1.5183, + "step": 1396 + }, + { + "epoch": 0.7746049348489049, + "grad_norm": 0.2178879678249359, + "learning_rate": 1.0704228645034464e-05, + "loss": 1.5743, + "step": 1397 + }, + { + "epoch": 0.7751594122539507, + "grad_norm": 0.20594577491283417, + "learning_rate": 1.0654036595189124e-05, + "loss": 1.4646, + "step": 1398 + }, + { + "epoch": 0.7757138896589963, + "grad_norm": 0.21161887049674988, + "learning_rate": 1.0603944411583576e-05, + "loss": 1.5743, + "step": 1399 + }, + { + "epoch": 0.7762683670640421, + "grad_norm": 0.21460282802581787, + "learning_rate": 1.0553952264684804e-05, + "loss": 1.628, + "step": 1400 + }, + { + "epoch": 0.7768228444690879, + "grad_norm": 0.2193065732717514, + "learning_rate": 1.0504060324619374e-05, + "loss": 1.5382, + "step": 1401 + }, + { + "epoch": 0.7773773218741337, + "grad_norm": 0.21738368272781372, + "learning_rate": 1.0454268761172824e-05, + "loss": 1.5957, + "step": 1402 + }, + { + "epoch": 0.7779317992791793, + "grad_norm": 0.21127985417842865, + "learning_rate": 1.040457774378913e-05, + "loss": 1.5551, + "step": 1403 + }, + { + "epoch": 0.7784862766842251, + "grad_norm": 0.20940497517585754, + "learning_rate": 1.0354987441570064e-05, + "loss": 1.5498, + "step": 1404 + }, + { + "epoch": 0.7790407540892709, + "grad_norm": 0.21508832275867462, + "learning_rate": 1.0305498023274674e-05, + "loss": 1.6172, + "step": 1405 + }, + { + "epoch": 0.7795952314943166, + "grad_norm": 0.21197104454040527, + "learning_rate": 1.025610965731874e-05, + "loss": 1.5321, + "step": 1406 + }, + { + "epoch": 0.7801497088993623, + "grad_norm": 0.21619556844234467, + "learning_rate": 1.0206822511774073e-05, + "loss": 1.5089, + "step": 1407 + }, + { + "epoch": 0.7807041863044081, + "grad_norm": 0.2153014987707138, + "learning_rate": 1.015763675436808e-05, + "loss": 1.5755, + "step": 1408 + }, + { + "epoch": 0.7812586637094538, + "grad_norm": 0.20888642966747284, + "learning_rate": 1.0108552552483122e-05, + "loss": 1.5995, + "step": 1409 + }, + { + "epoch": 0.7818131411144996, + "grad_norm": 0.2115897834300995, + "learning_rate": 1.0059570073155953e-05, + "loss": 1.5222, + "step": 1410 + }, + { + "epoch": 0.7823676185195453, + "grad_norm": 0.20816390216350555, + "learning_rate": 1.0010689483077187e-05, + "loss": 1.517, + "step": 1411 + }, + { + "epoch": 0.782922095924591, + "grad_norm": 0.21498888731002808, + "learning_rate": 9.961910948590643e-06, + "loss": 1.5944, + "step": 1412 + }, + { + "epoch": 0.7834765733296368, + "grad_norm": 0.2085815966129303, + "learning_rate": 9.91323463569292e-06, + "loss": 1.4606, + "step": 1413 + }, + { + "epoch": 0.7840310507346826, + "grad_norm": 0.21854795515537262, + "learning_rate": 9.864660710032669e-06, + "loss": 1.6048, + "step": 1414 + }, + { + "epoch": 0.7845855281397283, + "grad_norm": 0.21158936619758606, + "learning_rate": 9.816189336910166e-06, + "loss": 1.5845, + "step": 1415 + }, + { + "epoch": 0.785140005544774, + "grad_norm": 0.21523204445838928, + "learning_rate": 9.767820681276671e-06, + "loss": 1.5289, + "step": 1416 + }, + { + "epoch": 0.7856944829498198, + "grad_norm": 0.2180699110031128, + "learning_rate": 9.719554907733895e-06, + "loss": 1.5658, + "step": 1417 + }, + { + "epoch": 0.7862489603548656, + "grad_norm": 0.21391287446022034, + "learning_rate": 9.671392180533443e-06, + "loss": 1.579, + "step": 1418 + }, + { + "epoch": 0.7868034377599112, + "grad_norm": 0.20858129858970642, + "learning_rate": 9.62333266357622e-06, + "loss": 1.5127, + "step": 1419 + }, + { + "epoch": 0.787357915164957, + "grad_norm": 0.2106478065252304, + "learning_rate": 9.575376520411907e-06, + "loss": 1.5466, + "step": 1420 + }, + { + "epoch": 0.7879123925700028, + "grad_norm": 0.20766615867614746, + "learning_rate": 9.527523914238452e-06, + "loss": 1.5704, + "step": 1421 + }, + { + "epoch": 0.7884668699750486, + "grad_norm": 0.21175894141197205, + "learning_rate": 9.479775007901378e-06, + "loss": 1.5494, + "step": 1422 + }, + { + "epoch": 0.7890213473800942, + "grad_norm": 0.2133675217628479, + "learning_rate": 9.43212996389335e-06, + "loss": 1.5659, + "step": 1423 + }, + { + "epoch": 0.78957582478514, + "grad_norm": 0.2181132733821869, + "learning_rate": 9.384588944353594e-06, + "loss": 1.6126, + "step": 1424 + }, + { + "epoch": 0.7901303021901858, + "grad_norm": 0.22013451159000397, + "learning_rate": 9.337152111067312e-06, + "loss": 1.5657, + "step": 1425 + }, + { + "epoch": 0.7906847795952315, + "grad_norm": 0.22332167625427246, + "learning_rate": 9.28981962546517e-06, + "loss": 1.5943, + "step": 1426 + }, + { + "epoch": 0.7912392570002772, + "grad_norm": 0.20821382105350494, + "learning_rate": 9.242591648622698e-06, + "loss": 1.5187, + "step": 1427 + }, + { + "epoch": 0.791793734405323, + "grad_norm": 0.2182924449443817, + "learning_rate": 9.19546834125983e-06, + "loss": 1.585, + "step": 1428 + }, + { + "epoch": 0.7923482118103687, + "grad_norm": 0.20459789037704468, + "learning_rate": 9.148449863740279e-06, + "loss": 1.52, + "step": 1429 + }, + { + "epoch": 0.7929026892154145, + "grad_norm": 0.218237042427063, + "learning_rate": 9.101536376070981e-06, + "loss": 1.5027, + "step": 1430 + }, + { + "epoch": 0.7934571666204602, + "grad_norm": 0.2164810746908188, + "learning_rate": 9.05472803790163e-06, + "loss": 1.5413, + "step": 1431 + }, + { + "epoch": 0.794011644025506, + "grad_norm": 0.21726112067699432, + "learning_rate": 9.008025008524068e-06, + "loss": 1.5963, + "step": 1432 + }, + { + "epoch": 0.7945661214305517, + "grad_norm": 0.22692659497261047, + "learning_rate": 8.96142744687178e-06, + "loss": 1.5632, + "step": 1433 + }, + { + "epoch": 0.7951205988355975, + "grad_norm": 0.21242159605026245, + "learning_rate": 8.914935511519313e-06, + "loss": 1.5366, + "step": 1434 + }, + { + "epoch": 0.7956750762406432, + "grad_norm": 0.214253231883049, + "learning_rate": 8.868549360681791e-06, + "loss": 1.5982, + "step": 1435 + }, + { + "epoch": 0.7962295536456889, + "grad_norm": 0.21711012721061707, + "learning_rate": 8.822269152214327e-06, + "loss": 1.5109, + "step": 1436 + }, + { + "epoch": 0.7967840310507347, + "grad_norm": 0.22008074820041656, + "learning_rate": 8.776095043611494e-06, + "loss": 1.5869, + "step": 1437 + }, + { + "epoch": 0.7973385084557805, + "grad_norm": 0.21308183670043945, + "learning_rate": 8.730027192006809e-06, + "loss": 1.5816, + "step": 1438 + }, + { + "epoch": 0.7978929858608261, + "grad_norm": 0.21756571531295776, + "learning_rate": 8.684065754172235e-06, + "loss": 1.5787, + "step": 1439 + }, + { + "epoch": 0.7984474632658719, + "grad_norm": 0.21786224842071533, + "learning_rate": 8.638210886517524e-06, + "loss": 1.5704, + "step": 1440 + }, + { + "epoch": 0.7990019406709177, + "grad_norm": 0.21699659526348114, + "learning_rate": 8.592462745089815e-06, + "loss": 1.5663, + "step": 1441 + }, + { + "epoch": 0.7995564180759634, + "grad_norm": 0.21736855804920197, + "learning_rate": 8.546821485573033e-06, + "loss": 1.5873, + "step": 1442 + }, + { + "epoch": 0.8001108954810091, + "grad_norm": 0.20840026438236237, + "learning_rate": 8.501287263287383e-06, + "loss": 1.543, + "step": 1443 + }, + { + "epoch": 0.8006653728860549, + "grad_norm": 0.21741265058517456, + "learning_rate": 8.45586023318882e-06, + "loss": 1.5909, + "step": 1444 + }, + { + "epoch": 0.8012198502911007, + "grad_norm": 0.20903275907039642, + "learning_rate": 8.410540549868478e-06, + "loss": 1.5251, + "step": 1445 + }, + { + "epoch": 0.8017743276961464, + "grad_norm": 0.21058255434036255, + "learning_rate": 8.365328367552257e-06, + "loss": 1.5511, + "step": 1446 + }, + { + "epoch": 0.8023288051011921, + "grad_norm": 0.21151955425739288, + "learning_rate": 8.320223840100152e-06, + "loss": 1.6012, + "step": 1447 + }, + { + "epoch": 0.8028832825062379, + "grad_norm": 0.2185509204864502, + "learning_rate": 8.27522712100584e-06, + "loss": 1.5945, + "step": 1448 + }, + { + "epoch": 0.8034377599112836, + "grad_norm": 0.22160445153713226, + "learning_rate": 8.230338363396107e-06, + "loss": 1.5107, + "step": 1449 + }, + { + "epoch": 0.8039922373163294, + "grad_norm": 0.2089175581932068, + "learning_rate": 8.185557720030347e-06, + "loss": 1.5586, + "step": 1450 + }, + { + "epoch": 0.8045467147213751, + "grad_norm": 0.20976027846336365, + "learning_rate": 8.140885343300034e-06, + "loss": 1.5384, + "step": 1451 + }, + { + "epoch": 0.8051011921264208, + "grad_norm": 0.21759721636772156, + "learning_rate": 8.09632138522817e-06, + "loss": 1.5683, + "step": 1452 + }, + { + "epoch": 0.8056556695314666, + "grad_norm": 0.20990976691246033, + "learning_rate": 8.051865997468856e-06, + "loss": 1.5553, + "step": 1453 + }, + { + "epoch": 0.8062101469365124, + "grad_norm": 0.21652382612228394, + "learning_rate": 8.007519331306701e-06, + "loss": 1.5073, + "step": 1454 + }, + { + "epoch": 0.806764624341558, + "grad_norm": 0.21122099459171295, + "learning_rate": 7.9632815376563e-06, + "loss": 1.5442, + "step": 1455 + }, + { + "epoch": 0.8073191017466038, + "grad_norm": 0.21625083684921265, + "learning_rate": 7.919152767061765e-06, + "loss": 1.5177, + "step": 1456 + }, + { + "epoch": 0.8078735791516496, + "grad_norm": 0.21200041472911835, + "learning_rate": 7.875133169696231e-06, + "loss": 1.62, + "step": 1457 + }, + { + "epoch": 0.8084280565566954, + "grad_norm": 0.21699056029319763, + "learning_rate": 7.83122289536125e-06, + "loss": 1.6298, + "step": 1458 + }, + { + "epoch": 0.808982533961741, + "grad_norm": 0.21260713040828705, + "learning_rate": 7.78742209348637e-06, + "loss": 1.5463, + "step": 1459 + }, + { + "epoch": 0.8095370113667868, + "grad_norm": 0.21496307849884033, + "learning_rate": 7.743730913128606e-06, + "loss": 1.5319, + "step": 1460 + }, + { + "epoch": 0.8100914887718326, + "grad_norm": 0.2123148888349533, + "learning_rate": 7.700149502971901e-06, + "loss": 1.5688, + "step": 1461 + }, + { + "epoch": 0.8106459661768783, + "grad_norm": 0.2114924192428589, + "learning_rate": 7.656678011326674e-06, + "loss": 1.4625, + "step": 1462 + }, + { + "epoch": 0.811200443581924, + "grad_norm": 0.21606843173503876, + "learning_rate": 7.613316586129231e-06, + "loss": 1.6005, + "step": 1463 + }, + { + "epoch": 0.8117549209869698, + "grad_norm": 0.21045103669166565, + "learning_rate": 7.570065374941386e-06, + "loss": 1.5143, + "step": 1464 + }, + { + "epoch": 0.8123093983920155, + "grad_norm": 0.21124376356601715, + "learning_rate": 7.526924524949826e-06, + "loss": 1.5344, + "step": 1465 + }, + { + "epoch": 0.8128638757970613, + "grad_norm": 0.2107442021369934, + "learning_rate": 7.483894182965699e-06, + "loss": 1.5411, + "step": 1466 + }, + { + "epoch": 0.813418353202107, + "grad_norm": 0.21770091354846954, + "learning_rate": 7.440974495424087e-06, + "loss": 1.6174, + "step": 1467 + }, + { + "epoch": 0.8139728306071528, + "grad_norm": 0.21479737758636475, + "learning_rate": 7.398165608383499e-06, + "loss": 1.5734, + "step": 1468 + }, + { + "epoch": 0.8145273080121985, + "grad_norm": 0.22530944645404816, + "learning_rate": 7.355467667525404e-06, + "loss": 1.5711, + "step": 1469 + }, + { + "epoch": 0.8150817854172443, + "grad_norm": 0.2182978242635727, + "learning_rate": 7.312880818153676e-06, + "loss": 1.5933, + "step": 1470 + }, + { + "epoch": 0.81563626282229, + "grad_norm": 0.20854482054710388, + "learning_rate": 7.270405205194158e-06, + "loss": 1.5411, + "step": 1471 + }, + { + "epoch": 0.8161907402273357, + "grad_norm": 0.21064384281635284, + "learning_rate": 7.228040973194175e-06, + "loss": 1.5622, + "step": 1472 + }, + { + "epoch": 0.8167452176323815, + "grad_norm": 0.2184503674507141, + "learning_rate": 7.185788266321969e-06, + "loss": 1.5769, + "step": 1473 + }, + { + "epoch": 0.8172996950374273, + "grad_norm": 0.2059141993522644, + "learning_rate": 7.143647228366277e-06, + "loss": 1.5357, + "step": 1474 + }, + { + "epoch": 0.8178541724424729, + "grad_norm": 0.21199941635131836, + "learning_rate": 7.10161800273582e-06, + "loss": 1.5796, + "step": 1475 + }, + { + "epoch": 0.8184086498475187, + "grad_norm": 0.2183256447315216, + "learning_rate": 7.059700732458807e-06, + "loss": 1.5835, + "step": 1476 + }, + { + "epoch": 0.8189631272525645, + "grad_norm": 0.21991661190986633, + "learning_rate": 7.0178955601824816e-06, + "loss": 1.6238, + "step": 1477 + }, + { + "epoch": 0.8195176046576103, + "grad_norm": 0.2150619775056839, + "learning_rate": 6.976202628172548e-06, + "loss": 1.6039, + "step": 1478 + }, + { + "epoch": 0.8200720820626559, + "grad_norm": 0.21119360625743866, + "learning_rate": 6.934622078312836e-06, + "loss": 1.5284, + "step": 1479 + }, + { + "epoch": 0.8206265594677017, + "grad_norm": 0.21574904024600983, + "learning_rate": 6.8931540521046536e-06, + "loss": 1.5784, + "step": 1480 + }, + { + "epoch": 0.8211810368727475, + "grad_norm": 0.21076872944831848, + "learning_rate": 6.851798690666429e-06, + "loss": 1.5671, + "step": 1481 + }, + { + "epoch": 0.8217355142777932, + "grad_norm": 0.21518339216709137, + "learning_rate": 6.810556134733173e-06, + "loss": 1.5842, + "step": 1482 + }, + { + "epoch": 0.8222899916828389, + "grad_norm": 0.21064461767673492, + "learning_rate": 6.769426524656007e-06, + "loss": 1.5794, + "step": 1483 + }, + { + "epoch": 0.8228444690878847, + "grad_norm": 0.20996730029582977, + "learning_rate": 6.728410000401693e-06, + "loss": 1.5662, + "step": 1484 + }, + { + "epoch": 0.8233989464929304, + "grad_norm": 0.21565823256969452, + "learning_rate": 6.687506701552156e-06, + "loss": 1.5227, + "step": 1485 + }, + { + "epoch": 0.8239534238979762, + "grad_norm": 0.20833303034305573, + "learning_rate": 6.6467167673040134e-06, + "loss": 1.5601, + "step": 1486 + }, + { + "epoch": 0.8245079013030219, + "grad_norm": 0.2161036729812622, + "learning_rate": 6.606040336468091e-06, + "loss": 1.5762, + "step": 1487 + }, + { + "epoch": 0.8250623787080676, + "grad_norm": 0.20862828195095062, + "learning_rate": 6.565477547468933e-06, + "loss": 1.5346, + "step": 1488 + }, + { + "epoch": 0.8256168561131134, + "grad_norm": 0.20955637097358704, + "learning_rate": 6.52502853834438e-06, + "loss": 1.5954, + "step": 1489 + }, + { + "epoch": 0.8261713335181592, + "grad_norm": 0.21076586842536926, + "learning_rate": 6.484693446745089e-06, + "loss": 1.5716, + "step": 1490 + }, + { + "epoch": 0.8267258109232049, + "grad_norm": 0.21574625372886658, + "learning_rate": 6.4444724099339996e-06, + "loss": 1.5426, + "step": 1491 + }, + { + "epoch": 0.8272802883282506, + "grad_norm": 0.207157164812088, + "learning_rate": 6.404365564785946e-06, + "loss": 1.5192, + "step": 1492 + }, + { + "epoch": 0.8278347657332964, + "grad_norm": 0.21191108226776123, + "learning_rate": 6.364373047787156e-06, + "loss": 1.6491, + "step": 1493 + }, + { + "epoch": 0.8283892431383421, + "grad_norm": 0.21367955207824707, + "learning_rate": 6.324494995034789e-06, + "loss": 1.5497, + "step": 1494 + }, + { + "epoch": 0.8289437205433878, + "grad_norm": 0.20905102789402008, + "learning_rate": 6.28473154223649e-06, + "loss": 1.5512, + "step": 1495 + }, + { + "epoch": 0.8294981979484336, + "grad_norm": 0.21540366113185883, + "learning_rate": 6.245082824709863e-06, + "loss": 1.5571, + "step": 1496 + }, + { + "epoch": 0.8300526753534794, + "grad_norm": 0.21667438745498657, + "learning_rate": 6.205548977382143e-06, + "loss": 1.6235, + "step": 1497 + }, + { + "epoch": 0.830607152758525, + "grad_norm": 0.215713232755661, + "learning_rate": 6.166130134789572e-06, + "loss": 1.5784, + "step": 1498 + }, + { + "epoch": 0.8311616301635708, + "grad_norm": 0.20865316689014435, + "learning_rate": 6.126826431077071e-06, + "loss": 1.5567, + "step": 1499 + }, + { + "epoch": 0.8317161075686166, + "grad_norm": 0.2152477353811264, + "learning_rate": 6.087637999997729e-06, + "loss": 1.6069, + "step": 1500 + } + ], + "logging_steps": 1, + "max_steps": 1803, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.575018759487488e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}