diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16120 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 100, + "global_step": 24030, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008322929671244279, + "grad_norm": 0.0830078125, + "learning_rate": 2.08073241781107e-08, + "loss": 0.2994, + "step": 20 + }, + { + "epoch": 0.016645859342488557, + "grad_norm": 0.0849609375, + "learning_rate": 4.16146483562214e-08, + "loss": 0.2962, + "step": 40 + }, + { + "epoch": 0.024968789013732832, + "grad_norm": 0.0947265625, + "learning_rate": 6.242197253433209e-08, + "loss": 0.2974, + "step": 60 + }, + { + "epoch": 0.033291718684977115, + "grad_norm": 0.08544921875, + "learning_rate": 8.32292967124428e-08, + "loss": 0.3034, + "step": 80 + }, + { + "epoch": 0.04161464835622139, + "grad_norm": 0.07861328125, + "learning_rate": 1.0403662089055348e-07, + "loss": 0.2985, + "step": 100 + }, + { + "epoch": 0.04161464835622139, + "eval_main_loss": 0.3075891137123108, + "eval_main_runtime": 6.3529, + "eval_main_samples_per_second": 29.908, + "eval_main_steps_per_second": 3.778, + "step": 100 + }, + { + "epoch": 0.04161464835622139, + "eval_anatomy_loss": 2.9698662757873535, + "eval_anatomy_runtime": 0.2668, + "eval_anatomy_samples_per_second": 7.496, + "eval_anatomy_steps_per_second": 3.748, + "step": 100 + }, + { + "epoch": 0.04161464835622139, + "eval_college_mathematics_loss": 2.178929567337036, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.511, + "eval_college_mathematics_steps_per_second": 3.755, + "step": 100 + }, + { + "epoch": 0.04161464835622139, + "eval_international_law_loss": 3.19402813911438, + "eval_international_law_runtime": 0.2669, + "eval_international_law_samples_per_second": 7.493, + "eval_international_law_steps_per_second": 3.746, + "step": 100 + }, + { + "epoch": 0.049937578027465665, + "grad_norm": 0.076171875, + "learning_rate": 1.2484394506866418e-07, + "loss": 0.2971, + "step": 120 + }, + { + "epoch": 0.05826050769870995, + "grad_norm": 0.08349609375, + "learning_rate": 1.4565126924677488e-07, + "loss": 0.2983, + "step": 140 + }, + { + "epoch": 0.06658343736995423, + "grad_norm": 0.083984375, + "learning_rate": 1.664585934248856e-07, + "loss": 0.2998, + "step": 160 + }, + { + "epoch": 0.0749063670411985, + "grad_norm": 0.0791015625, + "learning_rate": 1.8726591760299626e-07, + "loss": 0.2974, + "step": 180 + }, + { + "epoch": 0.08322929671244278, + "grad_norm": 0.07568359375, + "learning_rate": 2.0807324178110696e-07, + "loss": 0.2981, + "step": 200 + }, + { + "epoch": 0.08322929671244278, + "eval_main_loss": 0.307395339012146, + "eval_main_runtime": 6.3616, + "eval_main_samples_per_second": 29.866, + "eval_main_steps_per_second": 3.773, + "step": 200 + }, + { + "epoch": 0.08322929671244278, + "eval_anatomy_loss": 2.9686527252197266, + "eval_anatomy_runtime": 0.2674, + "eval_anatomy_samples_per_second": 7.48, + "eval_anatomy_steps_per_second": 3.74, + "step": 200 + }, + { + "epoch": 0.08322929671244278, + "eval_college_mathematics_loss": 2.1776983737945557, + "eval_college_mathematics_runtime": 0.2667, + "eval_college_mathematics_samples_per_second": 7.499, + "eval_college_mathematics_steps_per_second": 3.75, + "step": 200 + }, + { + "epoch": 0.08322929671244278, + "eval_international_law_loss": 3.1930840015411377, + "eval_international_law_runtime": 0.268, + "eval_international_law_samples_per_second": 7.463, + "eval_international_law_steps_per_second": 3.732, + "step": 200 + }, + { + "epoch": 0.09155222638368705, + "grad_norm": 0.08251953125, + "learning_rate": 2.2888056595921765e-07, + "loss": 0.3004, + "step": 220 + }, + { + "epoch": 0.09987515605493133, + "grad_norm": 0.08837890625, + "learning_rate": 2.4968789013732837e-07, + "loss": 0.3023, + "step": 240 + }, + { + "epoch": 0.10819808572617562, + "grad_norm": 0.0888671875, + "learning_rate": 2.704952143154391e-07, + "loss": 0.3018, + "step": 260 + }, + { + "epoch": 0.1165210153974199, + "grad_norm": 0.09130859375, + "learning_rate": 2.9130253849354976e-07, + "loss": 0.3018, + "step": 280 + }, + { + "epoch": 0.12484394506866417, + "grad_norm": 0.08642578125, + "learning_rate": 3.121098626716604e-07, + "loss": 0.2971, + "step": 300 + }, + { + "epoch": 0.12484394506866417, + "eval_main_loss": 0.3072234094142914, + "eval_main_runtime": 6.363, + "eval_main_samples_per_second": 29.86, + "eval_main_steps_per_second": 3.772, + "step": 300 + }, + { + "epoch": 0.12484394506866417, + "eval_anatomy_loss": 2.9682118892669678, + "eval_anatomy_runtime": 0.2675, + "eval_anatomy_samples_per_second": 7.476, + "eval_anatomy_steps_per_second": 3.738, + "step": 300 + }, + { + "epoch": 0.12484394506866417, + "eval_college_mathematics_loss": 2.1782639026641846, + "eval_college_mathematics_runtime": 0.267, + "eval_college_mathematics_samples_per_second": 7.492, + "eval_college_mathematics_steps_per_second": 3.746, + "step": 300 + }, + { + "epoch": 0.12484394506866417, + "eval_international_law_loss": 3.1912684440612793, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.505, + "eval_international_law_steps_per_second": 3.752, + "step": 300 + }, + { + "epoch": 0.13316687473990846, + "grad_norm": 0.087890625, + "learning_rate": 3.329171868497712e-07, + "loss": 0.3034, + "step": 320 + }, + { + "epoch": 0.14148980441115272, + "grad_norm": 0.08837890625, + "learning_rate": 3.537245110278818e-07, + "loss": 0.299, + "step": 340 + }, + { + "epoch": 0.149812734082397, + "grad_norm": 0.08251953125, + "learning_rate": 3.7453183520599253e-07, + "loss": 0.2994, + "step": 360 + }, + { + "epoch": 0.15813566375364127, + "grad_norm": 0.09326171875, + "learning_rate": 3.953391593841032e-07, + "loss": 0.3001, + "step": 380 + }, + { + "epoch": 0.16645859342488556, + "grad_norm": 0.1025390625, + "learning_rate": 4.161464835622139e-07, + "loss": 0.2971, + "step": 400 + }, + { + "epoch": 0.16645859342488556, + "eval_main_loss": 0.3068091869354248, + "eval_main_runtime": 6.3609, + "eval_main_samples_per_second": 29.87, + "eval_main_steps_per_second": 3.773, + "step": 400 + }, + { + "epoch": 0.16645859342488556, + "eval_anatomy_loss": 2.969444990158081, + "eval_anatomy_runtime": 0.2677, + "eval_anatomy_samples_per_second": 7.472, + "eval_anatomy_steps_per_second": 3.736, + "step": 400 + }, + { + "epoch": 0.16645859342488556, + "eval_college_mathematics_loss": 2.1761982440948486, + "eval_college_mathematics_runtime": 0.2684, + "eval_college_mathematics_samples_per_second": 7.45, + "eval_college_mathematics_steps_per_second": 3.725, + "step": 400 + }, + { + "epoch": 0.16645859342488556, + "eval_international_law_loss": 3.1895108222961426, + "eval_international_law_runtime": 0.2681, + "eval_international_law_samples_per_second": 7.461, + "eval_international_law_steps_per_second": 3.731, + "step": 400 + }, + { + "epoch": 0.17478152309612985, + "grad_norm": 0.11279296875, + "learning_rate": 4.3695380774032463e-07, + "loss": 0.2971, + "step": 420 + }, + { + "epoch": 0.1831044527673741, + "grad_norm": 0.10400390625, + "learning_rate": 4.577611319184353e-07, + "loss": 0.3002, + "step": 440 + }, + { + "epoch": 0.1914273824386184, + "grad_norm": 0.10498046875, + "learning_rate": 4.785684560965461e-07, + "loss": 0.3001, + "step": 460 + }, + { + "epoch": 0.19975031210986266, + "grad_norm": 0.1171875, + "learning_rate": 4.993757802746567e-07, + "loss": 0.2933, + "step": 480 + }, + { + "epoch": 0.20807324178110695, + "grad_norm": 0.11767578125, + "learning_rate": 5.201831044527674e-07, + "loss": 0.2953, + "step": 500 + }, + { + "epoch": 0.20807324178110695, + "eval_main_loss": 0.3060111999511719, + "eval_main_runtime": 6.3306, + "eval_main_samples_per_second": 30.013, + "eval_main_steps_per_second": 3.791, + "step": 500 + }, + { + "epoch": 0.20807324178110695, + "eval_anatomy_loss": 2.966276168823242, + "eval_anatomy_runtime": 0.2661, + "eval_anatomy_samples_per_second": 7.515, + "eval_anatomy_steps_per_second": 3.758, + "step": 500 + }, + { + "epoch": 0.20807324178110695, + "eval_college_mathematics_loss": 2.173703193664551, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.498, + "eval_college_mathematics_steps_per_second": 3.749, + "step": 500 + }, + { + "epoch": 0.20807324178110695, + "eval_international_law_loss": 3.1900217533111572, + "eval_international_law_runtime": 0.2668, + "eval_international_law_samples_per_second": 7.496, + "eval_international_law_steps_per_second": 3.748, + "step": 500 + }, + { + "epoch": 0.21639617145235124, + "grad_norm": 0.11962890625, + "learning_rate": 5.409904286308782e-07, + "loss": 0.3011, + "step": 520 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.12890625, + "learning_rate": 5.617977528089888e-07, + "loss": 0.2986, + "step": 540 + }, + { + "epoch": 0.2330420307948398, + "grad_norm": 0.142578125, + "learning_rate": 5.826050769870995e-07, + "loss": 0.3017, + "step": 560 + }, + { + "epoch": 0.24136496046608405, + "grad_norm": 0.1455078125, + "learning_rate": 6.034124011652102e-07, + "loss": 0.2957, + "step": 580 + }, + { + "epoch": 0.24968789013732834, + "grad_norm": 0.166015625, + "learning_rate": 6.242197253433208e-07, + "loss": 0.2969, + "step": 600 + }, + { + "epoch": 0.24968789013732834, + "eval_main_loss": 0.3044416904449463, + "eval_main_runtime": 6.333, + "eval_main_samples_per_second": 30.002, + "eval_main_steps_per_second": 3.79, + "step": 600 + }, + { + "epoch": 0.24968789013732834, + "eval_anatomy_loss": 2.9617347717285156, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.505, + "eval_anatomy_steps_per_second": 3.753, + "step": 600 + }, + { + "epoch": 0.24968789013732834, + "eval_college_mathematics_loss": 2.169576644897461, + "eval_college_mathematics_runtime": 0.2655, + "eval_college_mathematics_samples_per_second": 7.534, + "eval_college_mathematics_steps_per_second": 3.767, + "step": 600 + }, + { + "epoch": 0.24968789013732834, + "eval_international_law_loss": 3.184370756149292, + "eval_international_law_runtime": 0.2656, + "eval_international_law_samples_per_second": 7.53, + "eval_international_law_steps_per_second": 3.765, + "step": 600 + }, + { + "epoch": 0.2580108198085726, + "grad_norm": 0.16796875, + "learning_rate": 6.450270495214315e-07, + "loss": 0.2953, + "step": 620 + }, + { + "epoch": 0.2663337494798169, + "grad_norm": 0.1650390625, + "learning_rate": 6.658343736995424e-07, + "loss": 0.2994, + "step": 640 + }, + { + "epoch": 0.2746566791510612, + "grad_norm": 0.1796875, + "learning_rate": 6.86641697877653e-07, + "loss": 0.2941, + "step": 660 + }, + { + "epoch": 0.28297960882230544, + "grad_norm": 0.1611328125, + "learning_rate": 7.074490220557636e-07, + "loss": 0.2982, + "step": 680 + }, + { + "epoch": 0.29130253849354976, + "grad_norm": 0.1845703125, + "learning_rate": 7.282563462338745e-07, + "loss": 0.2956, + "step": 700 + }, + { + "epoch": 0.29130253849354976, + "eval_main_loss": 0.3035320043563843, + "eval_main_runtime": 6.3633, + "eval_main_samples_per_second": 29.859, + "eval_main_steps_per_second": 3.772, + "step": 700 + }, + { + "epoch": 0.29130253849354976, + "eval_anatomy_loss": 2.9591825008392334, + "eval_anatomy_runtime": 0.2671, + "eval_anatomy_samples_per_second": 7.487, + "eval_anatomy_steps_per_second": 3.744, + "step": 700 + }, + { + "epoch": 0.29130253849354976, + "eval_college_mathematics_loss": 2.1641225814819336, + "eval_college_mathematics_runtime": 0.267, + "eval_college_mathematics_samples_per_second": 7.491, + "eval_college_mathematics_steps_per_second": 3.746, + "step": 700 + }, + { + "epoch": 0.29130253849354976, + "eval_international_law_loss": 3.183608293533325, + "eval_international_law_runtime": 0.2677, + "eval_international_law_samples_per_second": 7.47, + "eval_international_law_steps_per_second": 3.735, + "step": 700 + }, + { + "epoch": 0.299625468164794, + "grad_norm": 0.1806640625, + "learning_rate": 7.490636704119851e-07, + "loss": 0.2968, + "step": 720 + }, + { + "epoch": 0.3079483978360383, + "grad_norm": 0.1748046875, + "learning_rate": 7.698709945900957e-07, + "loss": 0.2924, + "step": 740 + }, + { + "epoch": 0.31627132750728254, + "grad_norm": 0.1787109375, + "learning_rate": 7.906783187682064e-07, + "loss": 0.2948, + "step": 760 + }, + { + "epoch": 0.32459425717852686, + "grad_norm": 0.173828125, + "learning_rate": 8.114856429463172e-07, + "loss": 0.2987, + "step": 780 + }, + { + "epoch": 0.3329171868497711, + "grad_norm": 0.16015625, + "learning_rate": 8.322929671244278e-07, + "loss": 0.3004, + "step": 800 + }, + { + "epoch": 0.3329171868497711, + "eval_main_loss": 0.3030804991722107, + "eval_main_runtime": 6.3678, + "eval_main_samples_per_second": 29.838, + "eval_main_steps_per_second": 3.769, + "step": 800 + }, + { + "epoch": 0.3329171868497711, + "eval_anatomy_loss": 2.958465337753296, + "eval_anatomy_runtime": 0.2675, + "eval_anatomy_samples_per_second": 7.476, + "eval_anatomy_steps_per_second": 3.738, + "step": 800 + }, + { + "epoch": 0.3329171868497711, + "eval_college_mathematics_loss": 2.1627776622772217, + "eval_college_mathematics_runtime": 0.2676, + "eval_college_mathematics_samples_per_second": 7.474, + "eval_college_mathematics_steps_per_second": 3.737, + "step": 800 + }, + { + "epoch": 0.3329171868497711, + "eval_international_law_loss": 3.1812000274658203, + "eval_international_law_runtime": 0.2664, + "eval_international_law_samples_per_second": 7.507, + "eval_international_law_steps_per_second": 3.754, + "step": 800 + }, + { + "epoch": 0.3412401165210154, + "grad_norm": 0.1650390625, + "learning_rate": 8.531002913025385e-07, + "loss": 0.2941, + "step": 820 + }, + { + "epoch": 0.3495630461922597, + "grad_norm": 0.1806640625, + "learning_rate": 8.739076154806493e-07, + "loss": 0.2961, + "step": 840 + }, + { + "epoch": 0.35788597586350396, + "grad_norm": 0.189453125, + "learning_rate": 8.947149396587599e-07, + "loss": 0.2957, + "step": 860 + }, + { + "epoch": 0.3662089055347482, + "grad_norm": 0.1708984375, + "learning_rate": 9.155222638368706e-07, + "loss": 0.2941, + "step": 880 + }, + { + "epoch": 0.37453183520599254, + "grad_norm": 0.203125, + "learning_rate": 9.363295880149814e-07, + "loss": 0.2956, + "step": 900 + }, + { + "epoch": 0.37453183520599254, + "eval_main_loss": 0.30232974886894226, + "eval_main_runtime": 6.3643, + "eval_main_samples_per_second": 29.854, + "eval_main_steps_per_second": 3.771, + "step": 900 + }, + { + "epoch": 0.37453183520599254, + "eval_anatomy_loss": 2.9550912380218506, + "eval_anatomy_runtime": 0.2674, + "eval_anatomy_samples_per_second": 7.478, + "eval_anatomy_steps_per_second": 3.739, + "step": 900 + }, + { + "epoch": 0.37453183520599254, + "eval_college_mathematics_loss": 2.162787914276123, + "eval_college_mathematics_runtime": 0.266, + "eval_college_mathematics_samples_per_second": 7.518, + "eval_college_mathematics_steps_per_second": 3.759, + "step": 900 + }, + { + "epoch": 0.37453183520599254, + "eval_international_law_loss": 3.1821956634521484, + "eval_international_law_runtime": 0.2683, + "eval_international_law_samples_per_second": 7.456, + "eval_international_law_steps_per_second": 3.728, + "step": 900 + }, + { + "epoch": 0.3828547648772368, + "grad_norm": 0.21875, + "learning_rate": 9.571369121930921e-07, + "loss": 0.2989, + "step": 920 + }, + { + "epoch": 0.39117769454848106, + "grad_norm": 0.23046875, + "learning_rate": 9.779442363712028e-07, + "loss": 0.2965, + "step": 940 + }, + { + "epoch": 0.3995006242197253, + "grad_norm": 0.2001953125, + "learning_rate": 9.987515605493135e-07, + "loss": 0.2969, + "step": 960 + }, + { + "epoch": 0.40782355389096964, + "grad_norm": 0.2275390625, + "learning_rate": 1.0195588847274241e-06, + "loss": 0.2913, + "step": 980 + }, + { + "epoch": 0.4161464835622139, + "grad_norm": 0.275390625, + "learning_rate": 1.0403662089055348e-06, + "loss": 0.2945, + "step": 1000 + }, + { + "epoch": 0.4161464835622139, + "eval_main_loss": 0.3009730279445648, + "eval_main_runtime": 6.3379, + "eval_main_samples_per_second": 29.978, + "eval_main_steps_per_second": 3.787, + "step": 1000 + }, + { + "epoch": 0.4161464835622139, + "eval_anatomy_loss": 2.9502501487731934, + "eval_anatomy_runtime": 0.2662, + "eval_anatomy_samples_per_second": 7.512, + "eval_anatomy_steps_per_second": 3.756, + "step": 1000 + }, + { + "epoch": 0.4161464835622139, + "eval_college_mathematics_loss": 2.1539924144744873, + "eval_college_mathematics_runtime": 0.2669, + "eval_college_mathematics_samples_per_second": 7.493, + "eval_college_mathematics_steps_per_second": 3.746, + "step": 1000 + }, + { + "epoch": 0.4161464835622139, + "eval_international_law_loss": 3.1735572814941406, + "eval_international_law_runtime": 0.268, + "eval_international_law_samples_per_second": 7.464, + "eval_international_law_steps_per_second": 3.732, + "step": 1000 + }, + { + "epoch": 0.42446941323345816, + "grad_norm": 0.2294921875, + "learning_rate": 1.0611735330836455e-06, + "loss": 0.2908, + "step": 1020 + }, + { + "epoch": 0.4327923429047025, + "grad_norm": 0.26953125, + "learning_rate": 1.0819808572617564e-06, + "loss": 0.2923, + "step": 1040 + }, + { + "epoch": 0.44111527257594674, + "grad_norm": 0.25390625, + "learning_rate": 1.102788181439867e-06, + "loss": 0.291, + "step": 1060 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 0.255859375, + "learning_rate": 1.1235955056179777e-06, + "loss": 0.2914, + "step": 1080 + }, + { + "epoch": 0.4577611319184353, + "grad_norm": 0.275390625, + "learning_rate": 1.1444028297960884e-06, + "loss": 0.2921, + "step": 1100 + }, + { + "epoch": 0.4577611319184353, + "eval_main_loss": 0.29984337091445923, + "eval_main_runtime": 6.3525, + "eval_main_samples_per_second": 29.909, + "eval_main_steps_per_second": 3.778, + "step": 1100 + }, + { + "epoch": 0.4577611319184353, + "eval_anatomy_loss": 2.947199821472168, + "eval_anatomy_runtime": 0.27, + "eval_anatomy_samples_per_second": 7.407, + "eval_anatomy_steps_per_second": 3.703, + "step": 1100 + }, + { + "epoch": 0.4577611319184353, + "eval_college_mathematics_loss": 2.1513562202453613, + "eval_college_mathematics_runtime": 0.2683, + "eval_college_mathematics_samples_per_second": 7.454, + "eval_college_mathematics_steps_per_second": 3.727, + "step": 1100 + }, + { + "epoch": 0.4577611319184353, + "eval_international_law_loss": 3.1728124618530273, + "eval_international_law_runtime": 0.2664, + "eval_international_law_samples_per_second": 7.507, + "eval_international_law_steps_per_second": 3.754, + "step": 1100 + }, + { + "epoch": 0.4660840615896796, + "grad_norm": 0.271484375, + "learning_rate": 1.165210153974199e-06, + "loss": 0.2917, + "step": 1120 + }, + { + "epoch": 0.47440699126092384, + "grad_norm": 0.30078125, + "learning_rate": 1.1860174781523097e-06, + "loss": 0.2874, + "step": 1140 + }, + { + "epoch": 0.4827299209321681, + "grad_norm": 0.298828125, + "learning_rate": 1.2068248023304204e-06, + "loss": 0.2926, + "step": 1160 + }, + { + "epoch": 0.4910528506034124, + "grad_norm": 0.294921875, + "learning_rate": 1.2276321265085312e-06, + "loss": 0.2913, + "step": 1180 + }, + { + "epoch": 0.4993757802746567, + "grad_norm": 0.2890625, + "learning_rate": 1.2484394506866417e-06, + "loss": 0.2926, + "step": 1200 + }, + { + "epoch": 0.4993757802746567, + "eval_main_loss": 0.29899466037750244, + "eval_main_runtime": 6.3585, + "eval_main_samples_per_second": 29.881, + "eval_main_steps_per_second": 3.774, + "step": 1200 + }, + { + "epoch": 0.4993757802746567, + "eval_anatomy_loss": 2.9453182220458984, + "eval_anatomy_runtime": 0.2675, + "eval_anatomy_samples_per_second": 7.477, + "eval_anatomy_steps_per_second": 3.738, + "step": 1200 + }, + { + "epoch": 0.4993757802746567, + "eval_college_mathematics_loss": 2.1483311653137207, + "eval_college_mathematics_runtime": 0.2675, + "eval_college_mathematics_samples_per_second": 7.477, + "eval_college_mathematics_steps_per_second": 3.739, + "step": 1200 + }, + { + "epoch": 0.4993757802746567, + "eval_international_law_loss": 3.1692144870758057, + "eval_international_law_runtime": 0.2669, + "eval_international_law_samples_per_second": 7.493, + "eval_international_law_steps_per_second": 3.746, + "step": 1200 + }, + { + "epoch": 0.5076987099459009, + "grad_norm": 0.294921875, + "learning_rate": 1.2692467748647524e-06, + "loss": 0.2921, + "step": 1220 + }, + { + "epoch": 0.5160216396171452, + "grad_norm": 0.29296875, + "learning_rate": 1.290054099042863e-06, + "loss": 0.2888, + "step": 1240 + }, + { + "epoch": 0.5243445692883895, + "grad_norm": 0.2734375, + "learning_rate": 1.3108614232209737e-06, + "loss": 0.2914, + "step": 1260 + }, + { + "epoch": 0.5326674989596338, + "grad_norm": 0.306640625, + "learning_rate": 1.3316687473990848e-06, + "loss": 0.2912, + "step": 1280 + }, + { + "epoch": 0.5409904286308781, + "grad_norm": 0.2890625, + "learning_rate": 1.3524760715771954e-06, + "loss": 0.2909, + "step": 1300 + }, + { + "epoch": 0.5409904286308781, + "eval_main_loss": 0.29816609621047974, + "eval_main_runtime": 6.3672, + "eval_main_samples_per_second": 29.84, + "eval_main_steps_per_second": 3.769, + "step": 1300 + }, + { + "epoch": 0.5409904286308781, + "eval_anatomy_loss": 2.942121744155884, + "eval_anatomy_runtime": 0.2697, + "eval_anatomy_samples_per_second": 7.416, + "eval_anatomy_steps_per_second": 3.708, + "step": 1300 + }, + { + "epoch": 0.5409904286308781, + "eval_college_mathematics_loss": 2.1458935737609863, + "eval_college_mathematics_runtime": 0.2675, + "eval_college_mathematics_samples_per_second": 7.476, + "eval_college_mathematics_steps_per_second": 3.738, + "step": 1300 + }, + { + "epoch": 0.5409904286308781, + "eval_international_law_loss": 3.1678075790405273, + "eval_international_law_runtime": 0.2673, + "eval_international_law_samples_per_second": 7.482, + "eval_international_law_steps_per_second": 3.741, + "step": 1300 + }, + { + "epoch": 0.5493133583021224, + "grad_norm": 0.291015625, + "learning_rate": 1.373283395755306e-06, + "loss": 0.2928, + "step": 1320 + }, + { + "epoch": 0.5576362879733666, + "grad_norm": 0.263671875, + "learning_rate": 1.3940907199334166e-06, + "loss": 0.2915, + "step": 1340 + }, + { + "epoch": 0.5659592176446109, + "grad_norm": 0.26953125, + "learning_rate": 1.4148980441115272e-06, + "loss": 0.2893, + "step": 1360 + }, + { + "epoch": 0.5742821473158551, + "grad_norm": 0.283203125, + "learning_rate": 1.435705368289638e-06, + "loss": 0.2907, + "step": 1380 + }, + { + "epoch": 0.5826050769870995, + "grad_norm": 0.28125, + "learning_rate": 1.456512692467749e-06, + "loss": 0.2938, + "step": 1400 + }, + { + "epoch": 0.5826050769870995, + "eval_main_loss": 0.2975090444087982, + "eval_main_runtime": 6.3608, + "eval_main_samples_per_second": 29.87, + "eval_main_steps_per_second": 3.773, + "step": 1400 + }, + { + "epoch": 0.5826050769870995, + "eval_anatomy_loss": 2.940919876098633, + "eval_anatomy_runtime": 0.2668, + "eval_anatomy_samples_per_second": 7.495, + "eval_anatomy_steps_per_second": 3.747, + "step": 1400 + }, + { + "epoch": 0.5826050769870995, + "eval_college_mathematics_loss": 2.1454479694366455, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.495, + "eval_college_mathematics_steps_per_second": 3.748, + "step": 1400 + }, + { + "epoch": 0.5826050769870995, + "eval_international_law_loss": 3.1660165786743164, + "eval_international_law_runtime": 0.2674, + "eval_international_law_samples_per_second": 7.48, + "eval_international_law_steps_per_second": 3.74, + "step": 1400 + }, + { + "epoch": 0.5909280066583438, + "grad_norm": 0.310546875, + "learning_rate": 1.4773200166458597e-06, + "loss": 0.2922, + "step": 1420 + }, + { + "epoch": 0.599250936329588, + "grad_norm": 0.29296875, + "learning_rate": 1.4981273408239701e-06, + "loss": 0.2957, + "step": 1440 + }, + { + "epoch": 0.6075738660008323, + "grad_norm": 0.302734375, + "learning_rate": 1.5189346650020808e-06, + "loss": 0.2891, + "step": 1460 + }, + { + "epoch": 0.6158967956720766, + "grad_norm": 0.291015625, + "learning_rate": 1.5397419891801914e-06, + "loss": 0.2913, + "step": 1480 + }, + { + "epoch": 0.6242197253433208, + "grad_norm": 0.3125, + "learning_rate": 1.5605493133583021e-06, + "loss": 0.2864, + "step": 1500 + }, + { + "epoch": 0.6242197253433208, + "eval_main_loss": 0.29658398032188416, + "eval_main_runtime": 6.3423, + "eval_main_samples_per_second": 29.958, + "eval_main_steps_per_second": 3.784, + "step": 1500 + }, + { + "epoch": 0.6242197253433208, + "eval_anatomy_loss": 2.939924478530884, + "eval_anatomy_runtime": 0.2668, + "eval_anatomy_samples_per_second": 7.496, + "eval_anatomy_steps_per_second": 3.748, + "step": 1500 + }, + { + "epoch": 0.6242197253433208, + "eval_college_mathematics_loss": 2.138369560241699, + "eval_college_mathematics_runtime": 0.2665, + "eval_college_mathematics_samples_per_second": 7.505, + "eval_college_mathematics_steps_per_second": 3.753, + "step": 1500 + }, + { + "epoch": 0.6242197253433208, + "eval_international_law_loss": 3.16491436958313, + "eval_international_law_runtime": 0.2662, + "eval_international_law_samples_per_second": 7.512, + "eval_international_law_steps_per_second": 3.756, + "step": 1500 + }, + { + "epoch": 0.6325426550145651, + "grad_norm": 0.294921875, + "learning_rate": 1.5813566375364128e-06, + "loss": 0.2904, + "step": 1520 + }, + { + "epoch": 0.6408655846858095, + "grad_norm": 0.294921875, + "learning_rate": 1.6021639617145237e-06, + "loss": 0.291, + "step": 1540 + }, + { + "epoch": 0.6491885143570537, + "grad_norm": 0.3203125, + "learning_rate": 1.6229712858926343e-06, + "loss": 0.287, + "step": 1560 + }, + { + "epoch": 0.657511444028298, + "grad_norm": 0.369140625, + "learning_rate": 1.643778610070745e-06, + "loss": 0.2892, + "step": 1580 + }, + { + "epoch": 0.6658343736995422, + "grad_norm": 0.328125, + "learning_rate": 1.6645859342488557e-06, + "loss": 0.2882, + "step": 1600 + }, + { + "epoch": 0.6658343736995422, + "eval_main_loss": 0.29542383551597595, + "eval_main_runtime": 6.3441, + "eval_main_samples_per_second": 29.949, + "eval_main_steps_per_second": 3.783, + "step": 1600 + }, + { + "epoch": 0.6658343736995422, + "eval_anatomy_loss": 2.932673692703247, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.508, + "eval_anatomy_steps_per_second": 3.754, + "step": 1600 + }, + { + "epoch": 0.6658343736995422, + "eval_college_mathematics_loss": 2.137777328491211, + "eval_college_mathematics_runtime": 0.2665, + "eval_college_mathematics_samples_per_second": 7.505, + "eval_college_mathematics_steps_per_second": 3.753, + "step": 1600 + }, + { + "epoch": 0.6658343736995422, + "eval_international_law_loss": 3.161029577255249, + "eval_international_law_runtime": 0.2659, + "eval_international_law_samples_per_second": 7.523, + "eval_international_law_steps_per_second": 3.761, + "step": 1600 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 0.302734375, + "learning_rate": 1.6853932584269663e-06, + "loss": 0.2896, + "step": 1620 + }, + { + "epoch": 0.6824802330420308, + "grad_norm": 0.349609375, + "learning_rate": 1.706200582605077e-06, + "loss": 0.29, + "step": 1640 + }, + { + "epoch": 0.690803162713275, + "grad_norm": 0.3671875, + "learning_rate": 1.7270079067831877e-06, + "loss": 0.2867, + "step": 1660 + }, + { + "epoch": 0.6991260923845194, + "grad_norm": 0.3671875, + "learning_rate": 1.7478152309612985e-06, + "loss": 0.2883, + "step": 1680 + }, + { + "epoch": 0.7074490220557637, + "grad_norm": 0.341796875, + "learning_rate": 1.7686225551394092e-06, + "loss": 0.2843, + "step": 1700 + }, + { + "epoch": 0.7074490220557637, + "eval_main_loss": 0.29412609338760376, + "eval_main_runtime": 6.3391, + "eval_main_samples_per_second": 29.973, + "eval_main_steps_per_second": 3.786, + "step": 1700 + }, + { + "epoch": 0.7074490220557637, + "eval_anatomy_loss": 2.92899751663208, + "eval_anatomy_runtime": 0.267, + "eval_anatomy_samples_per_second": 7.49, + "eval_anatomy_steps_per_second": 3.745, + "step": 1700 + }, + { + "epoch": 0.7074490220557637, + "eval_college_mathematics_loss": 2.132533311843872, + "eval_college_mathematics_runtime": 0.2655, + "eval_college_mathematics_samples_per_second": 7.532, + "eval_college_mathematics_steps_per_second": 3.766, + "step": 1700 + }, + { + "epoch": 0.7074490220557637, + "eval_international_law_loss": 3.1562838554382324, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.516, + "eval_international_law_steps_per_second": 3.758, + "step": 1700 + }, + { + "epoch": 0.7157719517270079, + "grad_norm": 0.373046875, + "learning_rate": 1.7894298793175199e-06, + "loss": 0.2916, + "step": 1720 + }, + { + "epoch": 0.7240948813982522, + "grad_norm": 0.373046875, + "learning_rate": 1.8102372034956305e-06, + "loss": 0.2844, + "step": 1740 + }, + { + "epoch": 0.7324178110694964, + "grad_norm": 0.408203125, + "learning_rate": 1.8310445276737412e-06, + "loss": 0.2877, + "step": 1760 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.3984375, + "learning_rate": 1.8518518518518519e-06, + "loss": 0.2862, + "step": 1780 + }, + { + "epoch": 0.7490636704119851, + "grad_norm": 0.421875, + "learning_rate": 1.8726591760299627e-06, + "loss": 0.2826, + "step": 1800 + }, + { + "epoch": 0.7490636704119851, + "eval_main_loss": 0.2929327189922333, + "eval_main_runtime": 6.3377, + "eval_main_samples_per_second": 29.979, + "eval_main_steps_per_second": 3.787, + "step": 1800 + }, + { + "epoch": 0.7490636704119851, + "eval_anatomy_loss": 2.923774480819702, + "eval_anatomy_runtime": 0.2657, + "eval_anatomy_samples_per_second": 7.527, + "eval_anatomy_steps_per_second": 3.763, + "step": 1800 + }, + { + "epoch": 0.7490636704119851, + "eval_college_mathematics_loss": 2.1297199726104736, + "eval_college_mathematics_runtime": 0.2661, + "eval_college_mathematics_samples_per_second": 7.516, + "eval_college_mathematics_steps_per_second": 3.758, + "step": 1800 + }, + { + "epoch": 0.7490636704119851, + "eval_international_law_loss": 3.153568744659424, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.505, + "eval_international_law_steps_per_second": 3.753, + "step": 1800 + }, + { + "epoch": 0.7573866000832293, + "grad_norm": 0.36328125, + "learning_rate": 1.8934665002080734e-06, + "loss": 0.2883, + "step": 1820 + }, + { + "epoch": 0.7657095297544736, + "grad_norm": 0.4140625, + "learning_rate": 1.9142738243861843e-06, + "loss": 0.2827, + "step": 1840 + }, + { + "epoch": 0.7740324594257179, + "grad_norm": 0.45703125, + "learning_rate": 1.9350811485642947e-06, + "loss": 0.2892, + "step": 1860 + }, + { + "epoch": 0.7823553890969621, + "grad_norm": 0.380859375, + "learning_rate": 1.9558884727424056e-06, + "loss": 0.2816, + "step": 1880 + }, + { + "epoch": 0.7906783187682064, + "grad_norm": 0.3671875, + "learning_rate": 1.976695796920516e-06, + "loss": 0.2859, + "step": 1900 + }, + { + "epoch": 0.7906783187682064, + "eval_main_loss": 0.2918354272842407, + "eval_main_runtime": 6.3351, + "eval_main_samples_per_second": 29.992, + "eval_main_steps_per_second": 3.788, + "step": 1900 + }, + { + "epoch": 0.7906783187682064, + "eval_anatomy_loss": 2.922013759613037, + "eval_anatomy_runtime": 0.2669, + "eval_anatomy_samples_per_second": 7.495, + "eval_anatomy_steps_per_second": 3.747, + "step": 1900 + }, + { + "epoch": 0.7906783187682064, + "eval_college_mathematics_loss": 2.127079963684082, + "eval_college_mathematics_runtime": 0.2667, + "eval_college_mathematics_samples_per_second": 7.498, + "eval_college_mathematics_steps_per_second": 3.749, + "step": 1900 + }, + { + "epoch": 0.7906783187682064, + "eval_international_law_loss": 3.1497292518615723, + "eval_international_law_runtime": 0.2681, + "eval_international_law_samples_per_second": 7.459, + "eval_international_law_steps_per_second": 3.729, + "step": 1900 + }, + { + "epoch": 0.7990012484394506, + "grad_norm": 0.44140625, + "learning_rate": 1.997503121098627e-06, + "loss": 0.2827, + "step": 1920 + }, + { + "epoch": 0.807324178110695, + "grad_norm": 0.396484375, + "learning_rate": 2.018310445276738e-06, + "loss": 0.2835, + "step": 1940 + }, + { + "epoch": 0.8156471077819393, + "grad_norm": 0.4375, + "learning_rate": 2.0391177694548483e-06, + "loss": 0.2824, + "step": 1960 + }, + { + "epoch": 0.8239700374531835, + "grad_norm": 0.4453125, + "learning_rate": 2.059925093632959e-06, + "loss": 0.2834, + "step": 1980 + }, + { + "epoch": 0.8322929671244278, + "grad_norm": 0.3984375, + "learning_rate": 2.0807324178110696e-06, + "loss": 0.285, + "step": 2000 + }, + { + "epoch": 0.8322929671244278, + "eval_main_loss": 0.29074886441230774, + "eval_main_runtime": 6.3397, + "eval_main_samples_per_second": 29.97, + "eval_main_steps_per_second": 3.786, + "step": 2000 + }, + { + "epoch": 0.8322929671244278, + "eval_anatomy_loss": 2.92093825340271, + "eval_anatomy_runtime": 0.2669, + "eval_anatomy_samples_per_second": 7.494, + "eval_anatomy_steps_per_second": 3.747, + "step": 2000 + }, + { + "epoch": 0.8322929671244278, + "eval_college_mathematics_loss": 2.1216940879821777, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.511, + "eval_college_mathematics_steps_per_second": 3.756, + "step": 2000 + }, + { + "epoch": 0.8322929671244278, + "eval_international_law_loss": 3.146960973739624, + "eval_international_law_runtime": 0.2679, + "eval_international_law_samples_per_second": 7.465, + "eval_international_law_steps_per_second": 3.733, + "step": 2000 + }, + { + "epoch": 0.8406158967956721, + "grad_norm": 0.41015625, + "learning_rate": 2.1015397419891805e-06, + "loss": 0.2793, + "step": 2020 + }, + { + "epoch": 0.8489388264669163, + "grad_norm": 0.443359375, + "learning_rate": 2.122347066167291e-06, + "loss": 0.2855, + "step": 2040 + }, + { + "epoch": 0.8572617561381606, + "grad_norm": 0.45703125, + "learning_rate": 2.1431543903454014e-06, + "loss": 0.2789, + "step": 2060 + }, + { + "epoch": 0.865584685809405, + "grad_norm": 0.4765625, + "learning_rate": 2.1639617145235127e-06, + "loss": 0.2807, + "step": 2080 + }, + { + "epoch": 0.8739076154806492, + "grad_norm": 0.431640625, + "learning_rate": 2.184769038701623e-06, + "loss": 0.2851, + "step": 2100 + }, + { + "epoch": 0.8739076154806492, + "eval_main_loss": 0.2896941602230072, + "eval_main_runtime": 6.3421, + "eval_main_samples_per_second": 29.958, + "eval_main_steps_per_second": 3.784, + "step": 2100 + }, + { + "epoch": 0.8739076154806492, + "eval_anatomy_loss": 2.916968822479248, + "eval_anatomy_runtime": 0.2673, + "eval_anatomy_samples_per_second": 7.481, + "eval_anatomy_steps_per_second": 3.741, + "step": 2100 + }, + { + "epoch": 0.8739076154806492, + "eval_college_mathematics_loss": 2.119070053100586, + "eval_college_mathematics_runtime": 0.2667, + "eval_college_mathematics_samples_per_second": 7.498, + "eval_college_mathematics_steps_per_second": 3.749, + "step": 2100 + }, + { + "epoch": 0.8739076154806492, + "eval_international_law_loss": 3.1451635360717773, + "eval_international_law_runtime": 0.2673, + "eval_international_law_samples_per_second": 7.481, + "eval_international_law_steps_per_second": 3.74, + "step": 2100 + }, + { + "epoch": 0.8822305451518935, + "grad_norm": 0.455078125, + "learning_rate": 2.205576362879734e-06, + "loss": 0.2894, + "step": 2120 + }, + { + "epoch": 0.8905534748231377, + "grad_norm": 0.431640625, + "learning_rate": 2.2263836870578445e-06, + "loss": 0.2872, + "step": 2140 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 0.455078125, + "learning_rate": 2.2471910112359554e-06, + "loss": 0.2842, + "step": 2160 + }, + { + "epoch": 0.9071993341656263, + "grad_norm": 0.42578125, + "learning_rate": 2.267998335414066e-06, + "loss": 0.282, + "step": 2180 + }, + { + "epoch": 0.9155222638368706, + "grad_norm": 0.435546875, + "learning_rate": 2.2888056595921767e-06, + "loss": 0.2787, + "step": 2200 + }, + { + "epoch": 0.9155222638368706, + "eval_main_loss": 0.28874531388282776, + "eval_main_runtime": 6.3297, + "eval_main_samples_per_second": 30.017, + "eval_main_steps_per_second": 3.792, + "step": 2200 + }, + { + "epoch": 0.9155222638368706, + "eval_anatomy_loss": 2.9137964248657227, + "eval_anatomy_runtime": 0.2659, + "eval_anatomy_samples_per_second": 7.522, + "eval_anatomy_steps_per_second": 3.761, + "step": 2200 + }, + { + "epoch": 0.9155222638368706, + "eval_college_mathematics_loss": 2.115910530090332, + "eval_college_mathematics_runtime": 0.267, + "eval_college_mathematics_samples_per_second": 7.492, + "eval_college_mathematics_steps_per_second": 3.746, + "step": 2200 + }, + { + "epoch": 0.9155222638368706, + "eval_international_law_loss": 3.14198637008667, + "eval_international_law_runtime": 0.2668, + "eval_international_law_samples_per_second": 7.497, + "eval_international_law_steps_per_second": 3.749, + "step": 2200 + }, + { + "epoch": 0.9238451935081149, + "grad_norm": 0.474609375, + "learning_rate": 2.3096129837702876e-06, + "loss": 0.2844, + "step": 2220 + }, + { + "epoch": 0.9321681231793592, + "grad_norm": 0.44921875, + "learning_rate": 2.330420307948398e-06, + "loss": 0.2822, + "step": 2240 + }, + { + "epoch": 0.9404910528506034, + "grad_norm": 0.466796875, + "learning_rate": 2.351227632126509e-06, + "loss": 0.2776, + "step": 2260 + }, + { + "epoch": 0.9488139825218477, + "grad_norm": 0.388671875, + "learning_rate": 2.3720349563046194e-06, + "loss": 0.2788, + "step": 2280 + }, + { + "epoch": 0.9571369121930919, + "grad_norm": 0.486328125, + "learning_rate": 2.39284228048273e-06, + "loss": 0.2806, + "step": 2300 + }, + { + "epoch": 0.9571369121930919, + "eval_main_loss": 0.2877212464809418, + "eval_main_runtime": 6.3349, + "eval_main_samples_per_second": 29.993, + "eval_main_steps_per_second": 3.789, + "step": 2300 + }, + { + "epoch": 0.9571369121930919, + "eval_anatomy_loss": 2.9125289916992188, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.507, + "eval_anatomy_steps_per_second": 3.753, + "step": 2300 + }, + { + "epoch": 0.9571369121930919, + "eval_college_mathematics_loss": 2.1152408123016357, + "eval_college_mathematics_runtime": 0.2681, + "eval_college_mathematics_samples_per_second": 7.459, + "eval_college_mathematics_steps_per_second": 3.729, + "step": 2300 + }, + { + "epoch": 0.9571369121930919, + "eval_international_law_loss": 3.1422488689422607, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.516, + "eval_international_law_steps_per_second": 3.758, + "step": 2300 + }, + { + "epoch": 0.9654598418643362, + "grad_norm": 0.490234375, + "learning_rate": 2.4136496046608407e-06, + "loss": 0.2781, + "step": 2320 + }, + { + "epoch": 0.9737827715355806, + "grad_norm": 0.4296875, + "learning_rate": 2.4344569288389516e-06, + "loss": 0.2808, + "step": 2340 + }, + { + "epoch": 0.9821057012068248, + "grad_norm": 0.4609375, + "learning_rate": 2.4552642530170625e-06, + "loss": 0.2829, + "step": 2360 + }, + { + "epoch": 0.9904286308780691, + "grad_norm": 0.466796875, + "learning_rate": 2.476071577195173e-06, + "loss": 0.2789, + "step": 2380 + }, + { + "epoch": 0.9987515605493134, + "grad_norm": 0.44921875, + "learning_rate": 2.4968789013732834e-06, + "loss": 0.283, + "step": 2400 + }, + { + "epoch": 0.9987515605493134, + "eval_main_loss": 0.2861831784248352, + "eval_main_runtime": 6.3371, + "eval_main_samples_per_second": 29.982, + "eval_main_steps_per_second": 3.787, + "step": 2400 + }, + { + "epoch": 0.9987515605493134, + "eval_anatomy_loss": 2.9071733951568604, + "eval_anatomy_runtime": 0.2662, + "eval_anatomy_samples_per_second": 7.513, + "eval_anatomy_steps_per_second": 3.756, + "step": 2400 + }, + { + "epoch": 0.9987515605493134, + "eval_college_mathematics_loss": 2.1126503944396973, + "eval_college_mathematics_runtime": 0.2665, + "eval_college_mathematics_samples_per_second": 7.503, + "eval_college_mathematics_steps_per_second": 3.752, + "step": 2400 + }, + { + "epoch": 0.9987515605493134, + "eval_international_law_loss": 3.1375622749328613, + "eval_international_law_runtime": 0.267, + "eval_international_law_samples_per_second": 7.492, + "eval_international_law_steps_per_second": 3.746, + "step": 2400 + }, + { + "epoch": 1.0070744902205577, + "grad_norm": 0.474609375, + "learning_rate": 2.5176862255513947e-06, + "loss": 0.2779, + "step": 2420 + }, + { + "epoch": 1.0153974198918019, + "grad_norm": 0.51171875, + "learning_rate": 2.5384935497295047e-06, + "loss": 0.282, + "step": 2440 + }, + { + "epoch": 1.0237203495630463, + "grad_norm": 0.435546875, + "learning_rate": 2.559300873907616e-06, + "loss": 0.2757, + "step": 2460 + }, + { + "epoch": 1.0320432792342904, + "grad_norm": 0.431640625, + "learning_rate": 2.580108198085726e-06, + "loss": 0.2803, + "step": 2480 + }, + { + "epoch": 1.0403662089055348, + "grad_norm": 0.46484375, + "learning_rate": 2.6009155222638374e-06, + "loss": 0.2768, + "step": 2500 + }, + { + "epoch": 1.0403662089055348, + "eval_main_loss": 0.283886194229126, + "eval_main_runtime": 6.3349, + "eval_main_samples_per_second": 29.992, + "eval_main_steps_per_second": 3.789, + "step": 2500 + }, + { + "epoch": 1.0403662089055348, + "eval_anatomy_loss": 2.9029533863067627, + "eval_anatomy_runtime": 0.266, + "eval_anatomy_samples_per_second": 7.519, + "eval_anatomy_steps_per_second": 3.76, + "step": 2500 + }, + { + "epoch": 1.0403662089055348, + "eval_college_mathematics_loss": 2.1050500869750977, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.51, + "eval_college_mathematics_steps_per_second": 3.755, + "step": 2500 + }, + { + "epoch": 1.0403662089055348, + "eval_international_law_loss": 3.1294405460357666, + "eval_international_law_runtime": 0.2672, + "eval_international_law_samples_per_second": 7.484, + "eval_international_law_steps_per_second": 3.742, + "step": 2500 + }, + { + "epoch": 1.048689138576779, + "grad_norm": 0.462890625, + "learning_rate": 2.6217228464419474e-06, + "loss": 0.2817, + "step": 2520 + }, + { + "epoch": 1.0570120682480233, + "grad_norm": 0.5078125, + "learning_rate": 2.6425301706200583e-06, + "loss": 0.2763, + "step": 2540 + }, + { + "epoch": 1.0653349979192677, + "grad_norm": 0.46484375, + "learning_rate": 2.6633374947981696e-06, + "loss": 0.2757, + "step": 2560 + }, + { + "epoch": 1.0736579275905118, + "grad_norm": 0.51953125, + "learning_rate": 2.6841448189762796e-06, + "loss": 0.2764, + "step": 2580 + }, + { + "epoch": 1.0819808572617562, + "grad_norm": 0.48828125, + "learning_rate": 2.704952143154391e-06, + "loss": 0.2747, + "step": 2600 + }, + { + "epoch": 1.0819808572617562, + "eval_main_loss": 0.2814878225326538, + "eval_main_runtime": 6.3334, + "eval_main_samples_per_second": 30.0, + "eval_main_steps_per_second": 3.789, + "step": 2600 + }, + { + "epoch": 1.0819808572617562, + "eval_anatomy_loss": 2.898045778274536, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.503, + "eval_anatomy_steps_per_second": 3.752, + "step": 2600 + }, + { + "epoch": 1.0819808572617562, + "eval_college_mathematics_loss": 2.1027755737304688, + "eval_college_mathematics_runtime": 0.2664, + "eval_college_mathematics_samples_per_second": 7.509, + "eval_college_mathematics_steps_per_second": 3.754, + "step": 2600 + }, + { + "epoch": 1.0819808572617562, + "eval_international_law_loss": 3.1270911693573, + "eval_international_law_runtime": 0.2666, + "eval_international_law_samples_per_second": 7.503, + "eval_international_law_steps_per_second": 3.751, + "step": 2600 + }, + { + "epoch": 1.0903037869330003, + "grad_norm": 0.474609375, + "learning_rate": 2.725759467332501e-06, + "loss": 0.2752, + "step": 2620 + }, + { + "epoch": 1.0986267166042447, + "grad_norm": 0.49609375, + "learning_rate": 2.746566791510612e-06, + "loss": 0.2709, + "step": 2640 + }, + { + "epoch": 1.1069496462754889, + "grad_norm": 0.490234375, + "learning_rate": 2.7673741156887223e-06, + "loss": 0.2741, + "step": 2660 + }, + { + "epoch": 1.1152725759467332, + "grad_norm": 0.44140625, + "learning_rate": 2.788181439866833e-06, + "loss": 0.2693, + "step": 2680 + }, + { + "epoch": 1.1235955056179776, + "grad_norm": 0.474609375, + "learning_rate": 2.8089887640449444e-06, + "loss": 0.2796, + "step": 2700 + }, + { + "epoch": 1.1235955056179776, + "eval_main_loss": 0.2786170542240143, + "eval_main_runtime": 6.3383, + "eval_main_samples_per_second": 29.977, + "eval_main_steps_per_second": 3.787, + "step": 2700 + }, + { + "epoch": 1.1235955056179776, + "eval_anatomy_loss": 2.8903613090515137, + "eval_anatomy_runtime": 0.2661, + "eval_anatomy_samples_per_second": 7.517, + "eval_anatomy_steps_per_second": 3.758, + "step": 2700 + }, + { + "epoch": 1.1235955056179776, + "eval_college_mathematics_loss": 2.0952742099761963, + "eval_college_mathematics_runtime": 0.2653, + "eval_college_mathematics_samples_per_second": 7.537, + "eval_college_mathematics_steps_per_second": 3.769, + "step": 2700 + }, + { + "epoch": 1.1235955056179776, + "eval_international_law_loss": 3.120177745819092, + "eval_international_law_runtime": 0.2677, + "eval_international_law_samples_per_second": 7.47, + "eval_international_law_steps_per_second": 3.735, + "step": 2700 + }, + { + "epoch": 1.1319184352892218, + "grad_norm": 0.50390625, + "learning_rate": 2.8297960882230545e-06, + "loss": 0.2717, + "step": 2720 + }, + { + "epoch": 1.1402413649604661, + "grad_norm": 0.498046875, + "learning_rate": 2.8506034124011653e-06, + "loss": 0.2704, + "step": 2740 + }, + { + "epoch": 1.1485642946317103, + "grad_norm": 0.4765625, + "learning_rate": 2.871410736579276e-06, + "loss": 0.2688, + "step": 2760 + }, + { + "epoch": 1.1568872243029547, + "grad_norm": 0.458984375, + "learning_rate": 2.8922180607573867e-06, + "loss": 0.2654, + "step": 2780 + }, + { + "epoch": 1.1652101539741988, + "grad_norm": 0.419921875, + "learning_rate": 2.913025384935498e-06, + "loss": 0.2646, + "step": 2800 + }, + { + "epoch": 1.1652101539741988, + "eval_main_loss": 0.27353334426879883, + "eval_main_runtime": 6.3411, + "eval_main_samples_per_second": 29.963, + "eval_main_steps_per_second": 3.785, + "step": 2800 + }, + { + "epoch": 1.1652101539741988, + "eval_anatomy_loss": 2.8842265605926514, + "eval_anatomy_runtime": 0.2659, + "eval_anatomy_samples_per_second": 7.521, + "eval_anatomy_steps_per_second": 3.76, + "step": 2800 + }, + { + "epoch": 1.1652101539741988, + "eval_college_mathematics_loss": 2.086782693862915, + "eval_college_mathematics_runtime": 0.2656, + "eval_college_mathematics_samples_per_second": 7.531, + "eval_college_mathematics_steps_per_second": 3.766, + "step": 2800 + }, + { + "epoch": 1.1652101539741988, + "eval_international_law_loss": 3.11441707611084, + "eval_international_law_runtime": 0.2666, + "eval_international_law_samples_per_second": 7.501, + "eval_international_law_steps_per_second": 3.75, + "step": 2800 + }, + { + "epoch": 1.1735330836454432, + "grad_norm": 0.43359375, + "learning_rate": 2.933832709113608e-06, + "loss": 0.2618, + "step": 2820 + }, + { + "epoch": 1.1818560133166875, + "grad_norm": 0.453125, + "learning_rate": 2.9546400332917193e-06, + "loss": 0.2636, + "step": 2840 + }, + { + "epoch": 1.1901789429879317, + "grad_norm": 0.453125, + "learning_rate": 2.9754473574698293e-06, + "loss": 0.2654, + "step": 2860 + }, + { + "epoch": 1.198501872659176, + "grad_norm": 0.46484375, + "learning_rate": 2.9962546816479402e-06, + "loss": 0.2586, + "step": 2880 + }, + { + "epoch": 1.2068248023304202, + "grad_norm": 0.462890625, + "learning_rate": 3.0170620058260507e-06, + "loss": 0.2576, + "step": 2900 + }, + { + "epoch": 1.2068248023304202, + "eval_main_loss": 0.2663625478744507, + "eval_main_runtime": 6.3386, + "eval_main_samples_per_second": 29.975, + "eval_main_steps_per_second": 3.786, + "step": 2900 + }, + { + "epoch": 1.2068248023304202, + "eval_anatomy_loss": 2.876462936401367, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.503, + "eval_anatomy_steps_per_second": 3.752, + "step": 2900 + }, + { + "epoch": 1.2068248023304202, + "eval_college_mathematics_loss": 2.0814149379730225, + "eval_college_mathematics_runtime": 0.2676, + "eval_college_mathematics_samples_per_second": 7.473, + "eval_college_mathematics_steps_per_second": 3.736, + "step": 2900 + }, + { + "epoch": 1.2068248023304202, + "eval_international_law_loss": 3.108351707458496, + "eval_international_law_runtime": 0.2657, + "eval_international_law_samples_per_second": 7.527, + "eval_international_law_steps_per_second": 3.764, + "step": 2900 + }, + { + "epoch": 1.2151477320016646, + "grad_norm": 0.41015625, + "learning_rate": 3.0378693300041616e-06, + "loss": 0.2599, + "step": 2920 + }, + { + "epoch": 1.2234706616729087, + "grad_norm": 0.46875, + "learning_rate": 3.058676654182273e-06, + "loss": 0.2602, + "step": 2940 + }, + { + "epoch": 1.2317935913441531, + "grad_norm": 0.4609375, + "learning_rate": 3.079483978360383e-06, + "loss": 0.2564, + "step": 2960 + }, + { + "epoch": 1.2401165210153975, + "grad_norm": 0.42578125, + "learning_rate": 3.1002913025384938e-06, + "loss": 0.2551, + "step": 2980 + }, + { + "epoch": 1.2484394506866416, + "grad_norm": 0.416015625, + "learning_rate": 3.1210986267166042e-06, + "loss": 0.2551, + "step": 3000 + }, + { + "epoch": 1.2484394506866416, + "eval_main_loss": 0.2596937417984009, + "eval_main_runtime": 6.3395, + "eval_main_samples_per_second": 29.971, + "eval_main_steps_per_second": 3.786, + "step": 3000 + }, + { + "epoch": 1.2484394506866416, + "eval_anatomy_loss": 2.8675546646118164, + "eval_anatomy_runtime": 0.2666, + "eval_anatomy_samples_per_second": 7.501, + "eval_anatomy_steps_per_second": 3.751, + "step": 3000 + }, + { + "epoch": 1.2484394506866416, + "eval_college_mathematics_loss": 2.0764455795288086, + "eval_college_mathematics_runtime": 0.2661, + "eval_college_mathematics_samples_per_second": 7.515, + "eval_college_mathematics_steps_per_second": 3.758, + "step": 3000 + }, + { + "epoch": 1.2484394506866416, + "eval_international_law_loss": 3.1014602184295654, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.505, + "eval_international_law_steps_per_second": 3.752, + "step": 3000 + }, + { + "epoch": 1.256762380357886, + "grad_norm": 0.423828125, + "learning_rate": 3.141905950894715e-06, + "loss": 0.2528, + "step": 3020 + }, + { + "epoch": 1.2650853100291304, + "grad_norm": 0.408203125, + "learning_rate": 3.1627132750728256e-06, + "loss": 0.2555, + "step": 3040 + }, + { + "epoch": 1.2734082397003745, + "grad_norm": 0.404296875, + "learning_rate": 3.1835205992509364e-06, + "loss": 0.2527, + "step": 3060 + }, + { + "epoch": 1.2817311693716187, + "grad_norm": 0.41796875, + "learning_rate": 3.2043279234290473e-06, + "loss": 0.2523, + "step": 3080 + }, + { + "epoch": 1.290054099042863, + "grad_norm": 0.4453125, + "learning_rate": 3.2251352476071578e-06, + "loss": 0.2501, + "step": 3100 + }, + { + "epoch": 1.290054099042863, + "eval_main_loss": 0.2549309730529785, + "eval_main_runtime": 6.3572, + "eval_main_samples_per_second": 29.887, + "eval_main_steps_per_second": 3.775, + "step": 3100 + }, + { + "epoch": 1.290054099042863, + "eval_anatomy_loss": 2.862455368041992, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.507, + "eval_anatomy_steps_per_second": 3.753, + "step": 3100 + }, + { + "epoch": 1.290054099042863, + "eval_college_mathematics_loss": 2.0720179080963135, + "eval_college_mathematics_runtime": 0.2673, + "eval_college_mathematics_samples_per_second": 7.482, + "eval_college_mathematics_steps_per_second": 3.741, + "step": 3100 + }, + { + "epoch": 1.290054099042863, + "eval_international_law_loss": 3.095749855041504, + "eval_international_law_runtime": 0.2682, + "eval_international_law_samples_per_second": 7.457, + "eval_international_law_steps_per_second": 3.728, + "step": 3100 + }, + { + "epoch": 1.2983770287141074, + "grad_norm": 0.423828125, + "learning_rate": 3.2459425717852687e-06, + "loss": 0.2479, + "step": 3120 + }, + { + "epoch": 1.3066999583853516, + "grad_norm": 0.376953125, + "learning_rate": 3.266749895963379e-06, + "loss": 0.2454, + "step": 3140 + }, + { + "epoch": 1.315022888056596, + "grad_norm": 0.41796875, + "learning_rate": 3.28755722014149e-06, + "loss": 0.2428, + "step": 3160 + }, + { + "epoch": 1.3233458177278403, + "grad_norm": 0.404296875, + "learning_rate": 3.3083645443196004e-06, + "loss": 0.243, + "step": 3180 + }, + { + "epoch": 1.3316687473990845, + "grad_norm": 0.38671875, + "learning_rate": 3.3291718684977113e-06, + "loss": 0.2489, + "step": 3200 + }, + { + "epoch": 1.3316687473990845, + "eval_main_loss": 0.2516440749168396, + "eval_main_runtime": 6.3553, + "eval_main_samples_per_second": 29.896, + "eval_main_steps_per_second": 3.776, + "step": 3200 + }, + { + "epoch": 1.3316687473990845, + "eval_anatomy_loss": 2.8593392372131348, + "eval_anatomy_runtime": 0.2673, + "eval_anatomy_samples_per_second": 7.481, + "eval_anatomy_steps_per_second": 3.741, + "step": 3200 + }, + { + "epoch": 1.3316687473990845, + "eval_college_mathematics_loss": 2.0700876712799072, + "eval_college_mathematics_runtime": 0.2677, + "eval_college_mathematics_samples_per_second": 7.471, + "eval_college_mathematics_steps_per_second": 3.735, + "step": 3200 + }, + { + "epoch": 1.3316687473990845, + "eval_international_law_loss": 3.094395875930786, + "eval_international_law_runtime": 0.2675, + "eval_international_law_samples_per_second": 7.478, + "eval_international_law_steps_per_second": 3.739, + "step": 3200 + }, + { + "epoch": 1.3399916770703286, + "grad_norm": 0.34765625, + "learning_rate": 3.349979192675822e-06, + "loss": 0.2518, + "step": 3220 + }, + { + "epoch": 1.348314606741573, + "grad_norm": 0.412109375, + "learning_rate": 3.3707865168539327e-06, + "loss": 0.243, + "step": 3240 + }, + { + "epoch": 1.3566375364128174, + "grad_norm": 0.431640625, + "learning_rate": 3.3915938410320435e-06, + "loss": 0.246, + "step": 3260 + }, + { + "epoch": 1.3649604660840615, + "grad_norm": 0.322265625, + "learning_rate": 3.412401165210154e-06, + "loss": 0.249, + "step": 3280 + }, + { + "epoch": 1.373283395755306, + "grad_norm": 0.37890625, + "learning_rate": 3.433208489388265e-06, + "loss": 0.2439, + "step": 3300 + }, + { + "epoch": 1.373283395755306, + "eval_main_loss": 0.24985496699810028, + "eval_main_runtime": 6.3475, + "eval_main_samples_per_second": 29.933, + "eval_main_steps_per_second": 3.781, + "step": 3300 + }, + { + "epoch": 1.373283395755306, + "eval_anatomy_loss": 2.8591318130493164, + "eval_anatomy_runtime": 0.2662, + "eval_anatomy_samples_per_second": 7.514, + "eval_anatomy_steps_per_second": 3.757, + "step": 3300 + }, + { + "epoch": 1.373283395755306, + "eval_college_mathematics_loss": 2.067070722579956, + "eval_college_mathematics_runtime": 0.2676, + "eval_college_mathematics_samples_per_second": 7.475, + "eval_college_mathematics_steps_per_second": 3.738, + "step": 3300 + }, + { + "epoch": 1.373283395755306, + "eval_international_law_loss": 3.0916695594787598, + "eval_international_law_runtime": 0.2676, + "eval_international_law_samples_per_second": 7.475, + "eval_international_law_steps_per_second": 3.738, + "step": 3300 + }, + { + "epoch": 1.3816063254265503, + "grad_norm": 0.369140625, + "learning_rate": 3.4540158135663753e-06, + "loss": 0.2431, + "step": 3320 + }, + { + "epoch": 1.3899292550977944, + "grad_norm": 0.416015625, + "learning_rate": 3.474823137744486e-06, + "loss": 0.2437, + "step": 3340 + }, + { + "epoch": 1.3982521847690386, + "grad_norm": 0.353515625, + "learning_rate": 3.495630461922597e-06, + "loss": 0.2434, + "step": 3360 + }, + { + "epoch": 1.406575114440283, + "grad_norm": 0.376953125, + "learning_rate": 3.5164377861007075e-06, + "loss": 0.243, + "step": 3380 + }, + { + "epoch": 1.4148980441115273, + "grad_norm": 0.40625, + "learning_rate": 3.5372451102788184e-06, + "loss": 0.2447, + "step": 3400 + }, + { + "epoch": 1.4148980441115273, + "eval_main_loss": 0.24850161373615265, + "eval_main_runtime": 6.3478, + "eval_main_samples_per_second": 29.931, + "eval_main_steps_per_second": 3.781, + "step": 3400 + }, + { + "epoch": 1.4148980441115273, + "eval_anatomy_loss": 2.85494327545166, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.505, + "eval_anatomy_steps_per_second": 3.753, + "step": 3400 + }, + { + "epoch": 1.4148980441115273, + "eval_college_mathematics_loss": 2.071016550064087, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.496, + "eval_college_mathematics_steps_per_second": 3.748, + "step": 3400 + }, + { + "epoch": 1.4148980441115273, + "eval_international_law_loss": 3.0913243293762207, + "eval_international_law_runtime": 0.2682, + "eval_international_law_samples_per_second": 7.457, + "eval_international_law_steps_per_second": 3.729, + "step": 3400 + }, + { + "epoch": 1.4232209737827715, + "grad_norm": 0.3828125, + "learning_rate": 3.558052434456929e-06, + "loss": 0.2432, + "step": 3420 + }, + { + "epoch": 1.4315439034540158, + "grad_norm": 0.37890625, + "learning_rate": 3.5788597586350397e-06, + "loss": 0.2367, + "step": 3440 + }, + { + "epoch": 1.4398668331252602, + "grad_norm": 0.357421875, + "learning_rate": 3.59966708281315e-06, + "loss": 0.242, + "step": 3460 + }, + { + "epoch": 1.4481897627965044, + "grad_norm": 0.369140625, + "learning_rate": 3.620474406991261e-06, + "loss": 0.2382, + "step": 3480 + }, + { + "epoch": 1.4565126924677487, + "grad_norm": 0.34765625, + "learning_rate": 3.641281731169372e-06, + "loss": 0.2412, + "step": 3500 + }, + { + "epoch": 1.4565126924677487, + "eval_main_loss": 0.24757333099842072, + "eval_main_runtime": 6.3261, + "eval_main_samples_per_second": 30.034, + "eval_main_steps_per_second": 3.794, + "step": 3500 + }, + { + "epoch": 1.4565126924677487, + "eval_anatomy_loss": 2.8548152446746826, + "eval_anatomy_runtime": 0.2658, + "eval_anatomy_samples_per_second": 7.525, + "eval_anatomy_steps_per_second": 3.763, + "step": 3500 + }, + { + "epoch": 1.4565126924677487, + "eval_college_mathematics_loss": 2.0659983158111572, + "eval_college_mathematics_runtime": 0.2656, + "eval_college_mathematics_samples_per_second": 7.531, + "eval_college_mathematics_steps_per_second": 3.766, + "step": 3500 + }, + { + "epoch": 1.4565126924677487, + "eval_international_law_loss": 3.090118646621704, + "eval_international_law_runtime": 0.2671, + "eval_international_law_samples_per_second": 7.489, + "eval_international_law_steps_per_second": 3.745, + "step": 3500 + }, + { + "epoch": 1.4648356221389929, + "grad_norm": 0.375, + "learning_rate": 3.6620890553474824e-06, + "loss": 0.2393, + "step": 3520 + }, + { + "epoch": 1.4731585518102372, + "grad_norm": 0.365234375, + "learning_rate": 3.6828963795255933e-06, + "loss": 0.2408, + "step": 3540 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.322265625, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.2421, + "step": 3560 + }, + { + "epoch": 1.4898044111527258, + "grad_norm": 0.333984375, + "learning_rate": 3.7245110278818146e-06, + "loss": 0.2396, + "step": 3580 + }, + { + "epoch": 1.4981273408239701, + "grad_norm": 0.390625, + "learning_rate": 3.7453183520599255e-06, + "loss": 0.2409, + "step": 3600 + }, + { + "epoch": 1.4981273408239701, + "eval_main_loss": 0.24677349627017975, + "eval_main_runtime": 6.3243, + "eval_main_samples_per_second": 30.043, + "eval_main_steps_per_second": 3.795, + "step": 3600 + }, + { + "epoch": 1.4981273408239701, + "eval_anatomy_loss": 2.8548572063446045, + "eval_anatomy_runtime": 0.2652, + "eval_anatomy_samples_per_second": 7.542, + "eval_anatomy_steps_per_second": 3.771, + "step": 3600 + }, + { + "epoch": 1.4981273408239701, + "eval_college_mathematics_loss": 2.067040205001831, + "eval_college_mathematics_runtime": 0.2656, + "eval_college_mathematics_samples_per_second": 7.529, + "eval_college_mathematics_steps_per_second": 3.764, + "step": 3600 + }, + { + "epoch": 1.4981273408239701, + "eval_international_law_loss": 3.08817720413208, + "eval_international_law_runtime": 0.2659, + "eval_international_law_samples_per_second": 7.523, + "eval_international_law_steps_per_second": 3.761, + "step": 3600 + }, + { + "epoch": 1.5064502704952143, + "grad_norm": 0.3828125, + "learning_rate": 3.766125676238036e-06, + "loss": 0.24, + "step": 3620 + }, + { + "epoch": 1.5147732001664584, + "grad_norm": 0.37109375, + "learning_rate": 3.786933000416147e-06, + "loss": 0.2414, + "step": 3640 + }, + { + "epoch": 1.5230961298377028, + "grad_norm": 0.37109375, + "learning_rate": 3.8077403245942573e-06, + "loss": 0.2401, + "step": 3660 + }, + { + "epoch": 1.5314190595089472, + "grad_norm": 0.37890625, + "learning_rate": 3.828547648772369e-06, + "loss": 0.2373, + "step": 3680 + }, + { + "epoch": 1.5397419891801913, + "grad_norm": 0.353515625, + "learning_rate": 3.849354972950479e-06, + "loss": 0.2404, + "step": 3700 + }, + { + "epoch": 1.5397419891801913, + "eval_main_loss": 0.24623610079288483, + "eval_main_runtime": 6.3272, + "eval_main_samples_per_second": 30.029, + "eval_main_steps_per_second": 3.793, + "step": 3700 + }, + { + "epoch": 1.5397419891801913, + "eval_anatomy_loss": 2.8530216217041016, + "eval_anatomy_runtime": 0.266, + "eval_anatomy_samples_per_second": 7.518, + "eval_anatomy_steps_per_second": 3.759, + "step": 3700 + }, + { + "epoch": 1.5397419891801913, + "eval_college_mathematics_loss": 2.065014123916626, + "eval_college_mathematics_runtime": 0.2657, + "eval_college_mathematics_samples_per_second": 7.528, + "eval_college_mathematics_steps_per_second": 3.764, + "step": 3700 + }, + { + "epoch": 1.5397419891801913, + "eval_international_law_loss": 3.0873329639434814, + "eval_international_law_runtime": 0.2663, + "eval_international_law_samples_per_second": 7.51, + "eval_international_law_steps_per_second": 3.755, + "step": 3700 + }, + { + "epoch": 1.5480649188514357, + "grad_norm": 0.34765625, + "learning_rate": 3.8701622971285895e-06, + "loss": 0.2414, + "step": 3720 + }, + { + "epoch": 1.55638784852268, + "grad_norm": 0.384765625, + "learning_rate": 3.890969621306701e-06, + "loss": 0.2402, + "step": 3740 + }, + { + "epoch": 1.5647107781939242, + "grad_norm": 0.400390625, + "learning_rate": 3.911776945484811e-06, + "loss": 0.2394, + "step": 3760 + }, + { + "epoch": 1.5730337078651684, + "grad_norm": 0.328125, + "learning_rate": 3.932584269662922e-06, + "loss": 0.239, + "step": 3780 + }, + { + "epoch": 1.581356637536413, + "grad_norm": 0.384765625, + "learning_rate": 3.953391593841032e-06, + "loss": 0.2383, + "step": 3800 + }, + { + "epoch": 1.581356637536413, + "eval_main_loss": 0.24581117928028107, + "eval_main_runtime": 6.3278, + "eval_main_samples_per_second": 30.026, + "eval_main_steps_per_second": 3.793, + "step": 3800 + }, + { + "epoch": 1.581356637536413, + "eval_anatomy_loss": 2.8533191680908203, + "eval_anatomy_runtime": 0.2681, + "eval_anatomy_samples_per_second": 7.459, + "eval_anatomy_steps_per_second": 3.729, + "step": 3800 + }, + { + "epoch": 1.581356637536413, + "eval_college_mathematics_loss": 2.0673060417175293, + "eval_college_mathematics_runtime": 0.2658, + "eval_college_mathematics_samples_per_second": 7.526, + "eval_college_mathematics_steps_per_second": 3.763, + "step": 3800 + }, + { + "epoch": 1.581356637536413, + "eval_international_law_loss": 3.0861918926239014, + "eval_international_law_runtime": 0.2673, + "eval_international_law_samples_per_second": 7.482, + "eval_international_law_steps_per_second": 3.741, + "step": 3800 + }, + { + "epoch": 1.5896795672076571, + "grad_norm": 0.34375, + "learning_rate": 3.9741989180191435e-06, + "loss": 0.2399, + "step": 3820 + }, + { + "epoch": 1.5980024968789013, + "grad_norm": 0.33984375, + "learning_rate": 3.995006242197254e-06, + "loss": 0.2298, + "step": 3840 + }, + { + "epoch": 1.6063254265501457, + "grad_norm": 0.37109375, + "learning_rate": 4.015813566375364e-06, + "loss": 0.2409, + "step": 3860 + }, + { + "epoch": 1.61464835622139, + "grad_norm": 0.36328125, + "learning_rate": 4.036620890553476e-06, + "loss": 0.2397, + "step": 3880 + }, + { + "epoch": 1.6229712858926342, + "grad_norm": 0.35546875, + "learning_rate": 4.057428214731586e-06, + "loss": 0.2373, + "step": 3900 + }, + { + "epoch": 1.6229712858926342, + "eval_main_loss": 0.2453879863023758, + "eval_main_runtime": 6.3211, + "eval_main_samples_per_second": 30.058, + "eval_main_steps_per_second": 3.797, + "step": 3900 + }, + { + "epoch": 1.6229712858926342, + "eval_anatomy_loss": 2.8509092330932617, + "eval_anatomy_runtime": 0.2658, + "eval_anatomy_samples_per_second": 7.525, + "eval_anatomy_steps_per_second": 3.762, + "step": 3900 + }, + { + "epoch": 1.6229712858926342, + "eval_college_mathematics_loss": 2.0684049129486084, + "eval_college_mathematics_runtime": 0.2654, + "eval_college_mathematics_samples_per_second": 7.535, + "eval_college_mathematics_steps_per_second": 3.767, + "step": 3900 + }, + { + "epoch": 1.6229712858926342, + "eval_international_law_loss": 3.0845108032226562, + "eval_international_law_runtime": 0.2652, + "eval_international_law_samples_per_second": 7.542, + "eval_international_law_steps_per_second": 3.771, + "step": 3900 + }, + { + "epoch": 1.6312942155638783, + "grad_norm": 0.3203125, + "learning_rate": 4.078235538909697e-06, + "loss": 0.2356, + "step": 3920 + }, + { + "epoch": 1.639617145235123, + "grad_norm": 0.36328125, + "learning_rate": 4.099042863087807e-06, + "loss": 0.238, + "step": 3940 + }, + { + "epoch": 1.647940074906367, + "grad_norm": 0.3359375, + "learning_rate": 4.119850187265918e-06, + "loss": 0.2421, + "step": 3960 + }, + { + "epoch": 1.6562630045776112, + "grad_norm": 0.380859375, + "learning_rate": 4.140657511444029e-06, + "loss": 0.2401, + "step": 3980 + }, + { + "epoch": 1.6645859342488556, + "grad_norm": 0.341796875, + "learning_rate": 4.161464835622139e-06, + "loss": 0.2372, + "step": 4000 + }, + { + "epoch": 1.6645859342488556, + "eval_main_loss": 0.2449522763490677, + "eval_main_runtime": 6.3294, + "eval_main_samples_per_second": 30.019, + "eval_main_steps_per_second": 3.792, + "step": 4000 + }, + { + "epoch": 1.6645859342488556, + "eval_anatomy_loss": 2.8499295711517334, + "eval_anatomy_runtime": 0.2671, + "eval_anatomy_samples_per_second": 7.487, + "eval_anatomy_steps_per_second": 3.744, + "step": 4000 + }, + { + "epoch": 1.6645859342488556, + "eval_college_mathematics_loss": 2.0638344287872314, + "eval_college_mathematics_runtime": 0.2662, + "eval_college_mathematics_samples_per_second": 7.514, + "eval_college_mathematics_steps_per_second": 3.757, + "step": 4000 + }, + { + "epoch": 1.6645859342488556, + "eval_international_law_loss": 3.08413028717041, + "eval_international_law_runtime": 0.2655, + "eval_international_law_samples_per_second": 7.534, + "eval_international_law_steps_per_second": 3.767, + "step": 4000 + }, + { + "epoch": 1.6729088639201, + "grad_norm": 0.361328125, + "learning_rate": 4.1822721598002506e-06, + "loss": 0.2415, + "step": 4020 + }, + { + "epoch": 1.6812317935913441, + "grad_norm": 0.349609375, + "learning_rate": 4.203079483978361e-06, + "loss": 0.244, + "step": 4040 + }, + { + "epoch": 1.6895547232625883, + "grad_norm": 0.33984375, + "learning_rate": 4.2238868081564715e-06, + "loss": 0.2371, + "step": 4060 + }, + { + "epoch": 1.6978776529338329, + "grad_norm": 0.353515625, + "learning_rate": 4.244694132334582e-06, + "loss": 0.2371, + "step": 4080 + }, + { + "epoch": 1.706200582605077, + "grad_norm": 0.318359375, + "learning_rate": 4.265501456512693e-06, + "loss": 0.2389, + "step": 4100 + }, + { + "epoch": 1.706200582605077, + "eval_main_loss": 0.24446672201156616, + "eval_main_runtime": 6.3267, + "eval_main_samples_per_second": 30.031, + "eval_main_steps_per_second": 3.793, + "step": 4100 + }, + { + "epoch": 1.706200582605077, + "eval_anatomy_loss": 2.8472187519073486, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.508, + "eval_anatomy_steps_per_second": 3.754, + "step": 4100 + }, + { + "epoch": 1.706200582605077, + "eval_college_mathematics_loss": 2.0650761127471924, + "eval_college_mathematics_runtime": 0.2662, + "eval_college_mathematics_samples_per_second": 7.513, + "eval_college_mathematics_steps_per_second": 3.757, + "step": 4100 + }, + { + "epoch": 1.706200582605077, + "eval_international_law_loss": 3.085548162460327, + "eval_international_law_runtime": 0.2656, + "eval_international_law_samples_per_second": 7.529, + "eval_international_law_steps_per_second": 3.765, + "step": 4100 + }, + { + "epoch": 1.7145235122763212, + "grad_norm": 0.265625, + "learning_rate": 4.286308780690803e-06, + "loss": 0.2373, + "step": 4120 + }, + { + "epoch": 1.7228464419475655, + "grad_norm": 0.330078125, + "learning_rate": 4.307116104868914e-06, + "loss": 0.2394, + "step": 4140 + }, + { + "epoch": 1.73116937161881, + "grad_norm": 0.3515625, + "learning_rate": 4.3279234290470254e-06, + "loss": 0.2376, + "step": 4160 + }, + { + "epoch": 1.739492301290054, + "grad_norm": 0.330078125, + "learning_rate": 4.348730753225136e-06, + "loss": 0.2348, + "step": 4180 + }, + { + "epoch": 1.7478152309612984, + "grad_norm": 0.392578125, + "learning_rate": 4.369538077403246e-06, + "loss": 0.2377, + "step": 4200 + }, + { + "epoch": 1.7478152309612984, + "eval_main_loss": 0.2439066469669342, + "eval_main_runtime": 6.3271, + "eval_main_samples_per_second": 30.03, + "eval_main_steps_per_second": 3.793, + "step": 4200 + }, + { + "epoch": 1.7478152309612984, + "eval_anatomy_loss": 2.8513143062591553, + "eval_anatomy_runtime": 0.2661, + "eval_anatomy_samples_per_second": 7.517, + "eval_anatomy_steps_per_second": 3.758, + "step": 4200 + }, + { + "epoch": 1.7478152309612984, + "eval_college_mathematics_loss": 2.062347412109375, + "eval_college_mathematics_runtime": 0.2653, + "eval_college_mathematics_samples_per_second": 7.539, + "eval_college_mathematics_steps_per_second": 3.769, + "step": 4200 + }, + { + "epoch": 1.7478152309612984, + "eval_international_law_loss": 3.082612991333008, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.504, + "eval_international_law_steps_per_second": 3.752, + "step": 4200 + }, + { + "epoch": 1.7561381606325428, + "grad_norm": 0.322265625, + "learning_rate": 4.390345401581357e-06, + "loss": 0.2359, + "step": 4220 + }, + { + "epoch": 1.764461090303787, + "grad_norm": 0.392578125, + "learning_rate": 4.411152725759468e-06, + "loss": 0.2387, + "step": 4240 + }, + { + "epoch": 1.772784019975031, + "grad_norm": 0.330078125, + "learning_rate": 4.431960049937578e-06, + "loss": 0.2356, + "step": 4260 + }, + { + "epoch": 1.7811069496462755, + "grad_norm": 0.326171875, + "learning_rate": 4.452767374115689e-06, + "loss": 0.236, + "step": 4280 + }, + { + "epoch": 1.7894298793175198, + "grad_norm": 0.35546875, + "learning_rate": 4.4735746982938e-06, + "loss": 0.2359, + "step": 4300 + }, + { + "epoch": 1.7894298793175198, + "eval_main_loss": 0.2437724471092224, + "eval_main_runtime": 6.33, + "eval_main_samples_per_second": 30.016, + "eval_main_steps_per_second": 3.791, + "step": 4300 + }, + { + "epoch": 1.7894298793175198, + "eval_anatomy_loss": 2.847775936126709, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.505, + "eval_anatomy_steps_per_second": 3.752, + "step": 4300 + }, + { + "epoch": 1.7894298793175198, + "eval_college_mathematics_loss": 2.0655105113983154, + "eval_college_mathematics_runtime": 0.2655, + "eval_college_mathematics_samples_per_second": 7.534, + "eval_college_mathematics_steps_per_second": 3.767, + "step": 4300 + }, + { + "epoch": 1.7894298793175198, + "eval_international_law_loss": 3.084287166595459, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.515, + "eval_international_law_steps_per_second": 3.757, + "step": 4300 + }, + { + "epoch": 1.797752808988764, + "grad_norm": 0.322265625, + "learning_rate": 4.494382022471911e-06, + "loss": 0.2378, + "step": 4320 + }, + { + "epoch": 1.8060757386600084, + "grad_norm": 0.322265625, + "learning_rate": 4.515189346650021e-06, + "loss": 0.2356, + "step": 4340 + }, + { + "epoch": 1.8143986683312527, + "grad_norm": 0.369140625, + "learning_rate": 4.535996670828132e-06, + "loss": 0.2358, + "step": 4360 + }, + { + "epoch": 1.822721598002497, + "grad_norm": 0.34375, + "learning_rate": 4.556803995006243e-06, + "loss": 0.2379, + "step": 4380 + }, + { + "epoch": 1.831044527673741, + "grad_norm": 0.33984375, + "learning_rate": 4.5776113191843534e-06, + "loss": 0.2395, + "step": 4400 + }, + { + "epoch": 1.831044527673741, + "eval_main_loss": 0.24319638311862946, + "eval_main_runtime": 6.3267, + "eval_main_samples_per_second": 30.032, + "eval_main_steps_per_second": 3.793, + "step": 4400 + }, + { + "epoch": 1.831044527673741, + "eval_anatomy_loss": 2.846129894256592, + "eval_anatomy_runtime": 0.2666, + "eval_anatomy_samples_per_second": 7.501, + "eval_anatomy_steps_per_second": 3.751, + "step": 4400 + }, + { + "epoch": 1.831044527673741, + "eval_college_mathematics_loss": 2.0631587505340576, + "eval_college_mathematics_runtime": 0.2651, + "eval_college_mathematics_samples_per_second": 7.545, + "eval_college_mathematics_steps_per_second": 3.773, + "step": 4400 + }, + { + "epoch": 1.831044527673741, + "eval_international_law_loss": 3.0830471515655518, + "eval_international_law_runtime": 0.2668, + "eval_international_law_samples_per_second": 7.498, + "eval_international_law_steps_per_second": 3.749, + "step": 4400 + }, + { + "epoch": 1.8393674573449854, + "grad_norm": 0.29296875, + "learning_rate": 4.598418643362464e-06, + "loss": 0.2367, + "step": 4420 + }, + { + "epoch": 1.8476903870162298, + "grad_norm": 0.31640625, + "learning_rate": 4.619225967540575e-06, + "loss": 0.2356, + "step": 4440 + }, + { + "epoch": 1.856013316687474, + "grad_norm": 0.3359375, + "learning_rate": 4.640033291718685e-06, + "loss": 0.2386, + "step": 4460 + }, + { + "epoch": 1.8643362463587183, + "grad_norm": 0.337890625, + "learning_rate": 4.660840615896796e-06, + "loss": 0.236, + "step": 4480 + }, + { + "epoch": 1.8726591760299627, + "grad_norm": 0.32421875, + "learning_rate": 4.6816479400749066e-06, + "loss": 0.24, + "step": 4500 + }, + { + "epoch": 1.8726591760299627, + "eval_main_loss": 0.2428365796804428, + "eval_main_runtime": 6.3298, + "eval_main_samples_per_second": 30.017, + "eval_main_steps_per_second": 3.792, + "step": 4500 + }, + { + "epoch": 1.8726591760299627, + "eval_anatomy_loss": 2.846830368041992, + "eval_anatomy_runtime": 0.2658, + "eval_anatomy_samples_per_second": 7.526, + "eval_anatomy_steps_per_second": 3.763, + "step": 4500 + }, + { + "epoch": 1.8726591760299627, + "eval_college_mathematics_loss": 2.061974287033081, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.509, + "eval_college_mathematics_steps_per_second": 3.755, + "step": 4500 + }, + { + "epoch": 1.8726591760299627, + "eval_international_law_loss": 3.0783193111419678, + "eval_international_law_runtime": 0.267, + "eval_international_law_samples_per_second": 7.492, + "eval_international_law_steps_per_second": 3.746, + "step": 4500 + }, + { + "epoch": 1.8809821057012068, + "grad_norm": 0.353515625, + "learning_rate": 4.702455264253018e-06, + "loss": 0.2359, + "step": 4520 + }, + { + "epoch": 1.889305035372451, + "grad_norm": 0.310546875, + "learning_rate": 4.723262588431128e-06, + "loss": 0.2419, + "step": 4540 + }, + { + "epoch": 1.8976279650436954, + "grad_norm": 0.294921875, + "learning_rate": 4.744069912609239e-06, + "loss": 0.2353, + "step": 4560 + }, + { + "epoch": 1.9059508947149397, + "grad_norm": 0.361328125, + "learning_rate": 4.76487723678735e-06, + "loss": 0.2364, + "step": 4580 + }, + { + "epoch": 1.9142738243861839, + "grad_norm": 0.330078125, + "learning_rate": 4.78568456096546e-06, + "loss": 0.2397, + "step": 4600 + }, + { + "epoch": 1.9142738243861839, + "eval_main_loss": 0.24256636202335358, + "eval_main_runtime": 6.328, + "eval_main_samples_per_second": 30.025, + "eval_main_steps_per_second": 3.793, + "step": 4600 + }, + { + "epoch": 1.9142738243861839, + "eval_anatomy_loss": 2.8468308448791504, + "eval_anatomy_runtime": 0.2667, + "eval_anatomy_samples_per_second": 7.498, + "eval_anatomy_steps_per_second": 3.749, + "step": 4600 + }, + { + "epoch": 1.9142738243861839, + "eval_college_mathematics_loss": 2.06019926071167, + "eval_college_mathematics_runtime": 0.2655, + "eval_college_mathematics_samples_per_second": 7.534, + "eval_college_mathematics_steps_per_second": 3.767, + "step": 4600 + }, + { + "epoch": 1.9142738243861839, + "eval_international_law_loss": 3.079383373260498, + "eval_international_law_runtime": 0.266, + "eval_international_law_samples_per_second": 7.518, + "eval_international_law_steps_per_second": 3.759, + "step": 4600 + }, + { + "epoch": 1.9225967540574282, + "grad_norm": 0.328125, + "learning_rate": 4.806491885143571e-06, + "loss": 0.2299, + "step": 4620 + }, + { + "epoch": 1.9309196837286726, + "grad_norm": 0.314453125, + "learning_rate": 4.8272992093216814e-06, + "loss": 0.2363, + "step": 4640 + }, + { + "epoch": 1.9392426133999168, + "grad_norm": 0.298828125, + "learning_rate": 4.848106533499793e-06, + "loss": 0.2336, + "step": 4660 + }, + { + "epoch": 1.947565543071161, + "grad_norm": 0.326171875, + "learning_rate": 4.868913857677903e-06, + "loss": 0.2344, + "step": 4680 + }, + { + "epoch": 1.9558884727424053, + "grad_norm": 0.296875, + "learning_rate": 4.889721181856014e-06, + "loss": 0.2373, + "step": 4700 + }, + { + "epoch": 1.9558884727424053, + "eval_main_loss": 0.2421763837337494, + "eval_main_runtime": 6.3346, + "eval_main_samples_per_second": 29.994, + "eval_main_steps_per_second": 3.789, + "step": 4700 + }, + { + "epoch": 1.9558884727424053, + "eval_anatomy_loss": 2.8438565731048584, + "eval_anatomy_runtime": 0.2673, + "eval_anatomy_samples_per_second": 7.483, + "eval_anatomy_steps_per_second": 3.742, + "step": 4700 + }, + { + "epoch": 1.9558884727424053, + "eval_college_mathematics_loss": 2.0591747760772705, + "eval_college_mathematics_runtime": 0.2666, + "eval_college_mathematics_samples_per_second": 7.501, + "eval_college_mathematics_steps_per_second": 3.75, + "step": 4700 + }, + { + "epoch": 1.9558884727424053, + "eval_international_law_loss": 3.0781655311584473, + "eval_international_law_runtime": 0.2662, + "eval_international_law_samples_per_second": 7.512, + "eval_international_law_steps_per_second": 3.756, + "step": 4700 + }, + { + "epoch": 1.9642114024136497, + "grad_norm": 0.333984375, + "learning_rate": 4.910528506034125e-06, + "loss": 0.2338, + "step": 4720 + }, + { + "epoch": 1.9725343320848938, + "grad_norm": 0.3359375, + "learning_rate": 4.9313358302122346e-06, + "loss": 0.2396, + "step": 4740 + }, + { + "epoch": 1.9808572617561382, + "grad_norm": 0.35546875, + "learning_rate": 4.952143154390346e-06, + "loss": 0.2303, + "step": 4760 + }, + { + "epoch": 1.9891801914273826, + "grad_norm": 0.310546875, + "learning_rate": 4.972950478568456e-06, + "loss": 0.2332, + "step": 4780 + }, + { + "epoch": 1.9975031210986267, + "grad_norm": 0.283203125, + "learning_rate": 4.993757802746567e-06, + "loss": 0.2409, + "step": 4800 + }, + { + "epoch": 1.9975031210986267, + "eval_main_loss": 0.24170413613319397, + "eval_main_runtime": 6.3319, + "eval_main_samples_per_second": 30.007, + "eval_main_steps_per_second": 3.79, + "step": 4800 + }, + { + "epoch": 1.9975031210986267, + "eval_anatomy_loss": 2.843282699584961, + "eval_anatomy_runtime": 0.267, + "eval_anatomy_samples_per_second": 7.491, + "eval_anatomy_steps_per_second": 3.745, + "step": 4800 + }, + { + "epoch": 1.9975031210986267, + "eval_college_mathematics_loss": 2.059091329574585, + "eval_college_mathematics_runtime": 0.2658, + "eval_college_mathematics_samples_per_second": 7.525, + "eval_college_mathematics_steps_per_second": 3.762, + "step": 4800 + }, + { + "epoch": 1.9975031210986267, + "eval_international_law_loss": 3.0776219367980957, + "eval_international_law_runtime": 0.2663, + "eval_international_law_samples_per_second": 7.511, + "eval_international_law_steps_per_second": 3.756, + "step": 4800 + }, + { + "epoch": 2.005826050769871, + "grad_norm": 0.296875, + "learning_rate": 4.999993456981855e-06, + "loss": 0.2335, + "step": 4820 + }, + { + "epoch": 2.0141489804411155, + "grad_norm": 0.3203125, + "learning_rate": 4.999961409628488e-06, + "loss": 0.2309, + "step": 4840 + }, + { + "epoch": 2.0224719101123596, + "grad_norm": 0.3515625, + "learning_rate": 4.999902656502973e-06, + "loss": 0.2327, + "step": 4860 + }, + { + "epoch": 2.0307948397836038, + "grad_norm": 0.296875, + "learning_rate": 4.99981719823294e-06, + "loss": 0.2379, + "step": 4880 + }, + { + "epoch": 2.039117769454848, + "grad_norm": 0.28515625, + "learning_rate": 4.999705035731294e-06, + "loss": 0.2372, + "step": 4900 + }, + { + "epoch": 2.039117769454848, + "eval_main_loss": 0.24145889282226562, + "eval_main_runtime": 6.3292, + "eval_main_samples_per_second": 30.02, + "eval_main_steps_per_second": 3.792, + "step": 4900 + }, + { + "epoch": 2.039117769454848, + "eval_anatomy_loss": 2.841543197631836, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.506, + "eval_anatomy_steps_per_second": 3.753, + "step": 4900 + }, + { + "epoch": 2.039117769454848, + "eval_college_mathematics_loss": 2.058622360229492, + "eval_college_mathematics_runtime": 0.2665, + "eval_college_mathematics_samples_per_second": 7.503, + "eval_college_mathematics_steps_per_second": 3.752, + "step": 4900 + }, + { + "epoch": 2.039117769454848, + "eval_international_law_loss": 3.0742015838623047, + "eval_international_law_runtime": 0.267, + "eval_international_law_samples_per_second": 7.49, + "eval_international_law_steps_per_second": 3.745, + "step": 4900 + }, + { + "epoch": 2.0474406991260925, + "grad_norm": 0.3203125, + "learning_rate": 4.999566170196208e-06, + "loss": 0.236, + "step": 4920 + }, + { + "epoch": 2.0557636287973367, + "grad_norm": 0.26171875, + "learning_rate": 4.999400603111109e-06, + "loss": 0.2362, + "step": 4940 + }, + { + "epoch": 2.064086558468581, + "grad_norm": 0.279296875, + "learning_rate": 4.999208336244664e-06, + "loss": 0.2338, + "step": 4960 + }, + { + "epoch": 2.0724094881398254, + "grad_norm": 0.28515625, + "learning_rate": 4.998989371650758e-06, + "loss": 0.2336, + "step": 4980 + }, + { + "epoch": 2.0807324178110695, + "grad_norm": 0.30078125, + "learning_rate": 4.998743711668475e-06, + "loss": 0.233, + "step": 5000 + }, + { + "epoch": 2.0807324178110695, + "eval_main_loss": 0.2409505397081375, + "eval_main_runtime": 6.3309, + "eval_main_samples_per_second": 30.012, + "eval_main_steps_per_second": 3.791, + "step": 5000 + }, + { + "epoch": 2.0807324178110695, + "eval_anatomy_loss": 2.8420751094818115, + "eval_anatomy_runtime": 0.2657, + "eval_anatomy_samples_per_second": 7.527, + "eval_anatomy_steps_per_second": 3.764, + "step": 5000 + }, + { + "epoch": 2.0807324178110695, + "eval_college_mathematics_loss": 2.0578598976135254, + "eval_college_mathematics_runtime": 0.2655, + "eval_college_mathematics_samples_per_second": 7.534, + "eval_college_mathematics_steps_per_second": 3.767, + "step": 5000 + }, + { + "epoch": 2.0807324178110695, + "eval_international_law_loss": 3.0756309032440186, + "eval_international_law_runtime": 0.2656, + "eval_international_law_samples_per_second": 7.529, + "eval_international_law_steps_per_second": 3.765, + "step": 5000 + }, + { + "epoch": 2.0890553474823137, + "grad_norm": 0.291015625, + "learning_rate": 4.998471358922071e-06, + "loss": 0.227, + "step": 5020 + }, + { + "epoch": 2.097378277153558, + "grad_norm": 0.275390625, + "learning_rate": 4.998172316320947e-06, + "loss": 0.2354, + "step": 5040 + }, + { + "epoch": 2.1057012068248024, + "grad_norm": 0.322265625, + "learning_rate": 4.997846587059618e-06, + "loss": 0.2314, + "step": 5060 + }, + { + "epoch": 2.1140241364960466, + "grad_norm": 0.328125, + "learning_rate": 4.997494174617679e-06, + "loss": 0.2381, + "step": 5080 + }, + { + "epoch": 2.1223470661672907, + "grad_norm": 0.28515625, + "learning_rate": 4.997115082759764e-06, + "loss": 0.233, + "step": 5100 + }, + { + "epoch": 2.1223470661672907, + "eval_main_loss": 0.2407059371471405, + "eval_main_runtime": 6.33, + "eval_main_samples_per_second": 30.016, + "eval_main_steps_per_second": 3.791, + "step": 5100 + }, + { + "epoch": 2.1223470661672907, + "eval_anatomy_loss": 2.8402063846588135, + "eval_anatomy_runtime": 0.2666, + "eval_anatomy_samples_per_second": 7.503, + "eval_anatomy_steps_per_second": 3.752, + "step": 5100 + }, + { + "epoch": 2.1223470661672907, + "eval_college_mathematics_loss": 2.0568246841430664, + "eval_college_mathematics_runtime": 0.2661, + "eval_college_mathematics_samples_per_second": 7.517, + "eval_college_mathematics_steps_per_second": 3.758, + "step": 5100 + }, + { + "epoch": 2.1223470661672907, + "eval_international_law_loss": 3.073444128036499, + "eval_international_law_runtime": 0.2662, + "eval_international_law_samples_per_second": 7.514, + "eval_international_law_steps_per_second": 3.757, + "step": 5100 + }, + { + "epoch": 2.1306699958385353, + "grad_norm": 0.322265625, + "learning_rate": 4.996709315535515e-06, + "loss": 0.2334, + "step": 5120 + }, + { + "epoch": 2.1389929255097795, + "grad_norm": 0.2578125, + "learning_rate": 4.9962768772795274e-06, + "loss": 0.235, + "step": 5140 + }, + { + "epoch": 2.1473158551810236, + "grad_norm": 0.31640625, + "learning_rate": 4.995817772611314e-06, + "loss": 0.232, + "step": 5160 + }, + { + "epoch": 2.1556387848522682, + "grad_norm": 0.3046875, + "learning_rate": 4.995332006435246e-06, + "loss": 0.2399, + "step": 5180 + }, + { + "epoch": 2.1639617145235124, + "grad_norm": 0.294921875, + "learning_rate": 4.9948195839405085e-06, + "loss": 0.2345, + "step": 5200 + }, + { + "epoch": 2.1639617145235124, + "eval_main_loss": 0.2404136210680008, + "eval_main_runtime": 6.3338, + "eval_main_samples_per_second": 29.998, + "eval_main_steps_per_second": 3.789, + "step": 5200 + }, + { + "epoch": 2.1639617145235124, + "eval_anatomy_loss": 2.838599443435669, + "eval_anatomy_runtime": 0.2657, + "eval_anatomy_samples_per_second": 7.527, + "eval_anatomy_steps_per_second": 3.763, + "step": 5200 + }, + { + "epoch": 2.1639617145235124, + "eval_college_mathematics_loss": 2.056016683578491, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.51, + "eval_college_mathematics_steps_per_second": 3.755, + "step": 5200 + }, + { + "epoch": 2.1639617145235124, + "eval_international_law_loss": 3.0730698108673096, + "eval_international_law_runtime": 0.2655, + "eval_international_law_samples_per_second": 7.533, + "eval_international_law_steps_per_second": 3.767, + "step": 5200 + }, + { + "epoch": 2.1722846441947565, + "grad_norm": 0.333984375, + "learning_rate": 4.9942805106010415e-06, + "loss": 0.2301, + "step": 5220 + }, + { + "epoch": 2.1806075738660007, + "grad_norm": 0.296875, + "learning_rate": 4.993714792175483e-06, + "loss": 0.2344, + "step": 5240 + }, + { + "epoch": 2.1889305035372453, + "grad_norm": 0.3125, + "learning_rate": 4.993122434707103e-06, + "loss": 0.2359, + "step": 5260 + }, + { + "epoch": 2.1972534332084894, + "grad_norm": 0.34375, + "learning_rate": 4.992503444523746e-06, + "loss": 0.2366, + "step": 5280 + }, + { + "epoch": 2.2055763628797336, + "grad_norm": 0.302734375, + "learning_rate": 4.991857828237757e-06, + "loss": 0.2369, + "step": 5300 + }, + { + "epoch": 2.2055763628797336, + "eval_main_loss": 0.24032773077487946, + "eval_main_runtime": 6.3258, + "eval_main_samples_per_second": 30.036, + "eval_main_steps_per_second": 3.794, + "step": 5300 + }, + { + "epoch": 2.2055763628797336, + "eval_anatomy_loss": 2.839557409286499, + "eval_anatomy_runtime": 0.2661, + "eval_anatomy_samples_per_second": 7.515, + "eval_anatomy_steps_per_second": 3.758, + "step": 5300 + }, + { + "epoch": 2.2055763628797336, + "eval_college_mathematics_loss": 2.0543038845062256, + "eval_college_mathematics_runtime": 0.2669, + "eval_college_mathematics_samples_per_second": 7.494, + "eval_college_mathematics_steps_per_second": 3.747, + "step": 5300 + }, + { + "epoch": 2.2055763628797336, + "eval_international_law_loss": 3.0714683532714844, + "eval_international_law_runtime": 0.2671, + "eval_international_law_samples_per_second": 7.487, + "eval_international_law_steps_per_second": 3.743, + "step": 5300 + }, + { + "epoch": 2.2138992925509777, + "grad_norm": 0.287109375, + "learning_rate": 4.9911855927459175e-06, + "loss": 0.2338, + "step": 5320 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.3359375, + "learning_rate": 4.990486745229364e-06, + "loss": 0.2324, + "step": 5340 + }, + { + "epoch": 2.2305451518934665, + "grad_norm": 0.337890625, + "learning_rate": 4.989761293153516e-06, + "loss": 0.2362, + "step": 5360 + }, + { + "epoch": 2.2388680815647106, + "grad_norm": 0.2734375, + "learning_rate": 4.989009244267998e-06, + "loss": 0.2331, + "step": 5380 + }, + { + "epoch": 2.247191011235955, + "grad_norm": 0.322265625, + "learning_rate": 4.988230606606552e-06, + "loss": 0.235, + "step": 5400 + }, + { + "epoch": 2.247191011235955, + "eval_main_loss": 0.2399168312549591, + "eval_main_runtime": 6.3585, + "eval_main_samples_per_second": 29.881, + "eval_main_steps_per_second": 3.774, + "step": 5400 + }, + { + "epoch": 2.247191011235955, + "eval_anatomy_loss": 2.83823823928833, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.505, + "eval_anatomy_steps_per_second": 3.752, + "step": 5400 + }, + { + "epoch": 2.247191011235955, + "eval_college_mathematics_loss": 2.0524001121520996, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.497, + "eval_college_mathematics_steps_per_second": 3.748, + "step": 5400 + }, + { + "epoch": 2.247191011235955, + "eval_international_law_loss": 3.073040246963501, + "eval_international_law_runtime": 0.2681, + "eval_international_law_samples_per_second": 7.461, + "eval_international_law_steps_per_second": 3.73, + "step": 5400 + }, + { + "epoch": 2.2555139409071994, + "grad_norm": 0.29296875, + "learning_rate": 4.987425388486953e-06, + "loss": 0.2338, + "step": 5420 + }, + { + "epoch": 2.2638368705784435, + "grad_norm": 0.29296875, + "learning_rate": 4.986593598510924e-06, + "loss": 0.2309, + "step": 5440 + }, + { + "epoch": 2.272159800249688, + "grad_norm": 0.271484375, + "learning_rate": 4.985735245564039e-06, + "loss": 0.2325, + "step": 5460 + }, + { + "epoch": 2.2804827299209323, + "grad_norm": 0.34375, + "learning_rate": 4.984850338815631e-06, + "loss": 0.2319, + "step": 5480 + }, + { + "epoch": 2.2888056595921764, + "grad_norm": 0.333984375, + "learning_rate": 4.983938887718692e-06, + "loss": 0.2414, + "step": 5500 + }, + { + "epoch": 2.2888056595921764, + "eval_main_loss": 0.23984655737876892, + "eval_main_runtime": 6.3511, + "eval_main_samples_per_second": 29.916, + "eval_main_steps_per_second": 3.779, + "step": 5500 + }, + { + "epoch": 2.2888056595921764, + "eval_anatomy_loss": 2.8364694118499756, + "eval_anatomy_runtime": 0.2673, + "eval_anatomy_samples_per_second": 7.483, + "eval_anatomy_steps_per_second": 3.741, + "step": 5500 + }, + { + "epoch": 2.2888056595921764, + "eval_college_mathematics_loss": 2.051426410675049, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.496, + "eval_college_mathematics_steps_per_second": 3.748, + "step": 5500 + }, + { + "epoch": 2.2888056595921764, + "eval_international_law_loss": 3.0698070526123047, + "eval_international_law_runtime": 0.2666, + "eval_international_law_samples_per_second": 7.502, + "eval_international_law_steps_per_second": 3.751, + "step": 5500 + }, + { + "epoch": 2.2971285892634206, + "grad_norm": 0.30078125, + "learning_rate": 4.983000902009776e-06, + "loss": 0.2384, + "step": 5520 + }, + { + "epoch": 2.305451518934665, + "grad_norm": 0.306640625, + "learning_rate": 4.982036391708891e-06, + "loss": 0.2342, + "step": 5540 + }, + { + "epoch": 2.3137744486059093, + "grad_norm": 0.2734375, + "learning_rate": 4.98104536711939e-06, + "loss": 0.2402, + "step": 5560 + }, + { + "epoch": 2.3220973782771535, + "grad_norm": 0.26953125, + "learning_rate": 4.9800278388278715e-06, + "loss": 0.231, + "step": 5580 + }, + { + "epoch": 2.3304203079483976, + "grad_norm": 0.294921875, + "learning_rate": 4.978983817704051e-06, + "loss": 0.2428, + "step": 5600 + }, + { + "epoch": 2.3304203079483976, + "eval_main_loss": 0.23956555128097534, + "eval_main_runtime": 6.334, + "eval_main_samples_per_second": 29.997, + "eval_main_steps_per_second": 3.789, + "step": 5600 + }, + { + "epoch": 2.3304203079483976, + "eval_anatomy_loss": 2.8366129398345947, + "eval_anatomy_runtime": 0.268, + "eval_anatomy_samples_per_second": 7.464, + "eval_anatomy_steps_per_second": 3.732, + "step": 5600 + }, + { + "epoch": 2.3304203079483976, + "eval_college_mathematics_loss": 2.0517213344573975, + "eval_college_mathematics_runtime": 0.2675, + "eval_college_mathematics_samples_per_second": 7.477, + "eval_college_mathematics_steps_per_second": 3.738, + "step": 5600 + }, + { + "epoch": 2.3304203079483976, + "eval_international_law_loss": 3.0719730854034424, + "eval_international_law_runtime": 0.2653, + "eval_international_law_samples_per_second": 7.538, + "eval_international_law_steps_per_second": 3.769, + "step": 5600 + }, + { + "epoch": 2.338743237619642, + "grad_norm": 0.259765625, + "learning_rate": 4.977913314900659e-06, + "loss": 0.228, + "step": 5620 + }, + { + "epoch": 2.3470661672908864, + "grad_norm": 0.265625, + "learning_rate": 4.976816341853312e-06, + "loss": 0.2344, + "step": 5640 + }, + { + "epoch": 2.3553890969621305, + "grad_norm": 0.294921875, + "learning_rate": 4.975692910280397e-06, + "loss": 0.2305, + "step": 5660 + }, + { + "epoch": 2.363712026633375, + "grad_norm": 0.279296875, + "learning_rate": 4.974543032182943e-06, + "loss": 0.2372, + "step": 5680 + }, + { + "epoch": 2.3720349563046192, + "grad_norm": 0.294921875, + "learning_rate": 4.973366719844491e-06, + "loss": 0.2319, + "step": 5700 + }, + { + "epoch": 2.3720349563046192, + "eval_main_loss": 0.23942260444164276, + "eval_main_runtime": 6.3369, + "eval_main_samples_per_second": 29.983, + "eval_main_steps_per_second": 3.787, + "step": 5700 + }, + { + "epoch": 2.3720349563046192, + "eval_anatomy_loss": 2.8341376781463623, + "eval_anatomy_runtime": 0.2654, + "eval_anatomy_samples_per_second": 7.537, + "eval_anatomy_steps_per_second": 3.768, + "step": 5700 + }, + { + "epoch": 2.3720349563046192, + "eval_college_mathematics_loss": 2.0530576705932617, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.512, + "eval_college_mathematics_steps_per_second": 3.756, + "step": 5700 + }, + { + "epoch": 2.3720349563046192, + "eval_international_law_loss": 3.070324420928955, + "eval_international_law_runtime": 0.2662, + "eval_international_law_samples_per_second": 7.513, + "eval_international_law_steps_per_second": 3.757, + "step": 5700 + }, + { + "epoch": 2.3803578859758634, + "grad_norm": 0.30859375, + "learning_rate": 4.972163985830967e-06, + "loss": 0.2326, + "step": 5720 + }, + { + "epoch": 2.388680815647108, + "grad_norm": 0.296875, + "learning_rate": 4.970934842990546e-06, + "loss": 0.234, + "step": 5740 + }, + { + "epoch": 2.397003745318352, + "grad_norm": 0.28515625, + "learning_rate": 4.969679304453513e-06, + "loss": 0.231, + "step": 5760 + }, + { + "epoch": 2.4053266749895963, + "grad_norm": 0.33984375, + "learning_rate": 4.968397383632127e-06, + "loss": 0.2322, + "step": 5780 + }, + { + "epoch": 2.4136496046608404, + "grad_norm": 0.263671875, + "learning_rate": 4.967089094220473e-06, + "loss": 0.2276, + "step": 5800 + }, + { + "epoch": 2.4136496046608404, + "eval_main_loss": 0.23935824632644653, + "eval_main_runtime": 6.3314, + "eval_main_samples_per_second": 30.009, + "eval_main_steps_per_second": 3.791, + "step": 5800 + }, + { + "epoch": 2.4136496046608404, + "eval_anatomy_loss": 2.8363730907440186, + "eval_anatomy_runtime": 0.2672, + "eval_anatomy_samples_per_second": 7.484, + "eval_anatomy_steps_per_second": 3.742, + "step": 5800 + }, + { + "epoch": 2.4136496046608404, + "eval_college_mathematics_loss": 2.04972767829895, + "eval_college_mathematics_runtime": 0.2664, + "eval_college_mathematics_samples_per_second": 7.506, + "eval_college_mathematics_steps_per_second": 3.753, + "step": 5800 + }, + { + "epoch": 2.4136496046608404, + "eval_international_law_loss": 3.0698299407958984, + "eval_international_law_runtime": 0.2654, + "eval_international_law_samples_per_second": 7.536, + "eval_international_law_steps_per_second": 3.768, + "step": 5800 + }, + { + "epoch": 2.421972534332085, + "grad_norm": 0.2734375, + "learning_rate": 4.9657544501943175e-06, + "loss": 0.2349, + "step": 5820 + }, + { + "epoch": 2.430295464003329, + "grad_norm": 0.28125, + "learning_rate": 4.964393465810963e-06, + "loss": 0.236, + "step": 5840 + }, + { + "epoch": 2.4386183936745733, + "grad_norm": 0.248046875, + "learning_rate": 4.9630061556090855e-06, + "loss": 0.2362, + "step": 5860 + }, + { + "epoch": 2.4469413233458175, + "grad_norm": 0.240234375, + "learning_rate": 4.961592534408592e-06, + "loss": 0.2285, + "step": 5880 + }, + { + "epoch": 2.455264253017062, + "grad_norm": 0.30859375, + "learning_rate": 4.9601526173104544e-06, + "loss": 0.2346, + "step": 5900 + }, + { + "epoch": 2.455264253017062, + "eval_main_loss": 0.2392611801624298, + "eval_main_runtime": 6.3213, + "eval_main_samples_per_second": 30.057, + "eval_main_steps_per_second": 3.797, + "step": 5900 + }, + { + "epoch": 2.455264253017062, + "eval_anatomy_loss": 2.8343961238861084, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.505, + "eval_anatomy_steps_per_second": 3.753, + "step": 5900 + }, + { + "epoch": 2.455264253017062, + "eval_college_mathematics_loss": 2.0529263019561768, + "eval_college_mathematics_runtime": 0.2662, + "eval_college_mathematics_samples_per_second": 7.514, + "eval_college_mathematics_steps_per_second": 3.757, + "step": 5900 + }, + { + "epoch": 2.455264253017062, + "eval_international_law_loss": 3.068134069442749, + "eval_international_law_runtime": 0.2664, + "eval_international_law_samples_per_second": 7.507, + "eval_international_law_steps_per_second": 3.753, + "step": 5900 + }, + { + "epoch": 2.4635871826883062, + "grad_norm": 0.291015625, + "learning_rate": 4.958686419696548e-06, + "loss": 0.228, + "step": 5920 + }, + { + "epoch": 2.4719101123595504, + "grad_norm": 0.265625, + "learning_rate": 4.9571939572294914e-06, + "loss": 0.233, + "step": 5940 + }, + { + "epoch": 2.480233042030795, + "grad_norm": 0.259765625, + "learning_rate": 4.955675245852475e-06, + "loss": 0.2316, + "step": 5960 + }, + { + "epoch": 2.488555971702039, + "grad_norm": 0.33984375, + "learning_rate": 4.954130301789093e-06, + "loss": 0.2342, + "step": 5980 + }, + { + "epoch": 2.4968789013732833, + "grad_norm": 0.28125, + "learning_rate": 4.952559141543171e-06, + "loss": 0.233, + "step": 6000 + }, + { + "epoch": 2.4968789013732833, + "eval_main_loss": 0.23919633030891418, + "eval_main_runtime": 6.3217, + "eval_main_samples_per_second": 30.055, + "eval_main_steps_per_second": 3.796, + "step": 6000 + }, + { + "epoch": 2.4968789013732833, + "eval_anatomy_loss": 2.835357427597046, + "eval_anatomy_runtime": 0.2661, + "eval_anatomy_samples_per_second": 7.515, + "eval_anatomy_steps_per_second": 3.758, + "step": 6000 + }, + { + "epoch": 2.4968789013732833, + "eval_college_mathematics_loss": 2.0497984886169434, + "eval_college_mathematics_runtime": 0.2664, + "eval_college_mathematics_samples_per_second": 7.508, + "eval_college_mathematics_steps_per_second": 3.754, + "step": 6000 + }, + { + "epoch": 2.4968789013732833, + "eval_international_law_loss": 3.0689234733581543, + "eval_international_law_runtime": 0.2669, + "eval_international_law_samples_per_second": 7.494, + "eval_international_law_steps_per_second": 3.747, + "step": 6000 + }, + { + "epoch": 2.505201831044528, + "grad_norm": 0.271484375, + "learning_rate": 4.950961781898586e-06, + "loss": 0.2345, + "step": 6020 + }, + { + "epoch": 2.513524760715772, + "grad_norm": 0.2578125, + "learning_rate": 4.94933823991909e-06, + "loss": 0.2327, + "step": 6040 + }, + { + "epoch": 2.521847690387016, + "grad_norm": 0.34765625, + "learning_rate": 4.947688532948129e-06, + "loss": 0.2286, + "step": 6060 + }, + { + "epoch": 2.5301706200582608, + "grad_norm": 0.283203125, + "learning_rate": 4.9460126786086535e-06, + "loss": 0.2313, + "step": 6080 + }, + { + "epoch": 2.538493549729505, + "grad_norm": 0.25, + "learning_rate": 4.944310694802935e-06, + "loss": 0.2343, + "step": 6100 + }, + { + "epoch": 2.538493549729505, + "eval_main_loss": 0.23911188542842865, + "eval_main_runtime": 6.33, + "eval_main_samples_per_second": 30.016, + "eval_main_steps_per_second": 3.791, + "step": 6100 + }, + { + "epoch": 2.538493549729505, + "eval_anatomy_loss": 2.834503650665283, + "eval_anatomy_runtime": 0.2651, + "eval_anatomy_samples_per_second": 7.545, + "eval_anatomy_steps_per_second": 3.772, + "step": 6100 + }, + { + "epoch": 2.538493549729505, + "eval_college_mathematics_loss": 2.049175977706909, + "eval_college_mathematics_runtime": 0.2667, + "eval_college_mathematics_samples_per_second": 7.5, + "eval_college_mathematics_steps_per_second": 3.75, + "step": 6100 + }, + { + "epoch": 2.538493549729505, + "eval_international_law_loss": 3.0691072940826416, + "eval_international_law_runtime": 0.2666, + "eval_international_law_samples_per_second": 7.502, + "eval_international_law_steps_per_second": 3.751, + "step": 6100 + }, + { + "epoch": 2.546816479400749, + "grad_norm": 0.29296875, + "learning_rate": 4.942582599712369e-06, + "loss": 0.2356, + "step": 6120 + }, + { + "epoch": 2.555139409071993, + "grad_norm": 0.29296875, + "learning_rate": 4.940828411797287e-06, + "loss": 0.2369, + "step": 6140 + }, + { + "epoch": 2.5634623387432374, + "grad_norm": 0.306640625, + "learning_rate": 4.9390481497967545e-06, + "loss": 0.2287, + "step": 6160 + }, + { + "epoch": 2.571785268414482, + "grad_norm": 0.298828125, + "learning_rate": 4.937241832728373e-06, + "loss": 0.2288, + "step": 6180 + }, + { + "epoch": 2.580108198085726, + "grad_norm": 0.2890625, + "learning_rate": 4.9354094798880806e-06, + "loss": 0.2301, + "step": 6200 + }, + { + "epoch": 2.580108198085726, + "eval_main_loss": 0.23901519179344177, + "eval_main_runtime": 6.3573, + "eval_main_samples_per_second": 29.887, + "eval_main_steps_per_second": 3.775, + "step": 6200 + }, + { + "epoch": 2.580108198085726, + "eval_anatomy_loss": 2.8348350524902344, + "eval_anatomy_runtime": 0.2662, + "eval_anatomy_samples_per_second": 7.513, + "eval_anatomy_steps_per_second": 3.757, + "step": 6200 + }, + { + "epoch": 2.580108198085726, + "eval_college_mathematics_loss": 2.048042058944702, + "eval_college_mathematics_runtime": 0.2678, + "eval_college_mathematics_samples_per_second": 7.467, + "eval_college_mathematics_steps_per_second": 3.734, + "step": 6200 + }, + { + "epoch": 2.580108198085726, + "eval_international_law_loss": 3.0679852962493896, + "eval_international_law_runtime": 0.2673, + "eval_international_law_samples_per_second": 7.483, + "eval_international_law_steps_per_second": 3.742, + "step": 6200 + }, + { + "epoch": 2.5884311277569703, + "grad_norm": 0.251953125, + "learning_rate": 4.9335511108499344e-06, + "loss": 0.2333, + "step": 6220 + }, + { + "epoch": 2.596754057428215, + "grad_norm": 0.291015625, + "learning_rate": 4.931666745465915e-06, + "loss": 0.2275, + "step": 6240 + }, + { + "epoch": 2.605076987099459, + "grad_norm": 0.26953125, + "learning_rate": 4.929756403865706e-06, + "loss": 0.2303, + "step": 6260 + }, + { + "epoch": 2.613399916770703, + "grad_norm": 0.251953125, + "learning_rate": 4.927820106456481e-06, + "loss": 0.2314, + "step": 6280 + }, + { + "epoch": 2.6217228464419478, + "grad_norm": 0.2734375, + "learning_rate": 4.925857873922686e-06, + "loss": 0.2344, + "step": 6300 + }, + { + "epoch": 2.6217228464419478, + "eval_main_loss": 0.2388763278722763, + "eval_main_runtime": 6.3519, + "eval_main_samples_per_second": 29.912, + "eval_main_steps_per_second": 3.778, + "step": 6300 + }, + { + "epoch": 2.6217228464419478, + "eval_anatomy_loss": 2.8354506492614746, + "eval_anatomy_runtime": 0.2675, + "eval_anatomy_samples_per_second": 7.478, + "eval_anatomy_steps_per_second": 3.739, + "step": 6300 + }, + { + "epoch": 2.6217228464419478, + "eval_college_mathematics_loss": 2.0530946254730225, + "eval_college_mathematics_runtime": 0.2664, + "eval_college_mathematics_samples_per_second": 7.508, + "eval_college_mathematics_steps_per_second": 3.754, + "step": 6300 + }, + { + "epoch": 2.6217228464419478, + "eval_international_law_loss": 3.0681755542755127, + "eval_international_law_runtime": 0.2672, + "eval_international_law_samples_per_second": 7.485, + "eval_international_law_steps_per_second": 3.742, + "step": 6300 + }, + { + "epoch": 2.630045776113192, + "grad_norm": 0.287109375, + "learning_rate": 4.923869727225819e-06, + "loss": 0.23, + "step": 6320 + }, + { + "epoch": 2.638368705784436, + "grad_norm": 0.259765625, + "learning_rate": 4.921855687604206e-06, + "loss": 0.2337, + "step": 6340 + }, + { + "epoch": 2.6466916354556806, + "grad_norm": 0.267578125, + "learning_rate": 4.91981577657277e-06, + "loss": 0.2337, + "step": 6360 + }, + { + "epoch": 2.655014565126925, + "grad_norm": 0.2412109375, + "learning_rate": 4.917750015922809e-06, + "loss": 0.2363, + "step": 6380 + }, + { + "epoch": 2.663337494798169, + "grad_norm": 0.29296875, + "learning_rate": 4.915658427721755e-06, + "loss": 0.2317, + "step": 6400 + }, + { + "epoch": 2.663337494798169, + "eval_main_loss": 0.23897981643676758, + "eval_main_runtime": 6.3411, + "eval_main_samples_per_second": 29.963, + "eval_main_steps_per_second": 3.785, + "step": 6400 + }, + { + "epoch": 2.663337494798169, + "eval_anatomy_loss": 2.8342771530151367, + "eval_anatomy_runtime": 0.2668, + "eval_anatomy_samples_per_second": 7.495, + "eval_anatomy_steps_per_second": 3.748, + "step": 6400 + }, + { + "epoch": 2.663337494798169, + "eval_college_mathematics_loss": 2.0512688159942627, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.51, + "eval_college_mathematics_steps_per_second": 3.755, + "step": 6400 + }, + { + "epoch": 2.663337494798169, + "eval_international_law_loss": 3.0684025287628174, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.504, + "eval_international_law_steps_per_second": 3.752, + "step": 6400 + }, + { + "epoch": 2.671660424469413, + "grad_norm": 0.310546875, + "learning_rate": 4.9135410343129465e-06, + "loss": 0.2327, + "step": 6420 + }, + { + "epoch": 2.6799833541406572, + "grad_norm": 0.306640625, + "learning_rate": 4.911397858315382e-06, + "loss": 0.2279, + "step": 6440 + }, + { + "epoch": 2.688306283811902, + "grad_norm": 0.326171875, + "learning_rate": 4.909228922623482e-06, + "loss": 0.2287, + "step": 6460 + }, + { + "epoch": 2.696629213483146, + "grad_norm": 0.337890625, + "learning_rate": 4.907034250406846e-06, + "loss": 0.2288, + "step": 6480 + }, + { + "epoch": 2.70495214315439, + "grad_norm": 0.294921875, + "learning_rate": 4.904813865110002e-06, + "loss": 0.2308, + "step": 6500 + }, + { + "epoch": 2.70495214315439, + "eval_main_loss": 0.23884011805057526, + "eval_main_runtime": 6.3205, + "eval_main_samples_per_second": 30.061, + "eval_main_steps_per_second": 3.797, + "step": 6500 + }, + { + "epoch": 2.70495214315439, + "eval_anatomy_loss": 2.8351662158966064, + "eval_anatomy_runtime": 0.2663, + "eval_anatomy_samples_per_second": 7.51, + "eval_anatomy_steps_per_second": 3.755, + "step": 6500 + }, + { + "epoch": 2.70495214315439, + "eval_college_mathematics_loss": 2.0514416694641113, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.496, + "eval_college_mathematics_steps_per_second": 3.748, + "step": 6500 + }, + { + "epoch": 2.70495214315439, + "eval_international_law_loss": 3.0670132637023926, + "eval_international_law_runtime": 0.265, + "eval_international_law_samples_per_second": 7.546, + "eval_international_law_steps_per_second": 3.773, + "step": 6500 + }, + { + "epoch": 2.7132750728256347, + "grad_norm": 0.302734375, + "learning_rate": 4.902567790452158e-06, + "loss": 0.2308, + "step": 6520 + }, + { + "epoch": 2.721598002496879, + "grad_norm": 0.263671875, + "learning_rate": 4.900296050426947e-06, + "loss": 0.2373, + "step": 6540 + }, + { + "epoch": 2.729920932168123, + "grad_norm": 0.2197265625, + "learning_rate": 4.897998669302173e-06, + "loss": 0.2291, + "step": 6560 + }, + { + "epoch": 2.7382438618393676, + "grad_norm": 0.36328125, + "learning_rate": 4.895675671619549e-06, + "loss": 0.2369, + "step": 6580 + }, + { + "epoch": 2.746566791510612, + "grad_norm": 0.2578125, + "learning_rate": 4.893327082194436e-06, + "loss": 0.2353, + "step": 6600 + }, + { + "epoch": 2.746566791510612, + "eval_main_loss": 0.23877017199993134, + "eval_main_runtime": 6.35, + "eval_main_samples_per_second": 29.921, + "eval_main_steps_per_second": 3.78, + "step": 6600 + }, + { + "epoch": 2.746566791510612, + "eval_anatomy_loss": 2.832685947418213, + "eval_anatomy_runtime": 0.2666, + "eval_anatomy_samples_per_second": 7.502, + "eval_anatomy_steps_per_second": 3.751, + "step": 6600 + }, + { + "epoch": 2.746566791510612, + "eval_college_mathematics_loss": 2.0484695434570312, + "eval_college_mathematics_runtime": 0.2671, + "eval_college_mathematics_samples_per_second": 7.489, + "eval_college_mathematics_steps_per_second": 3.744, + "step": 6600 + }, + { + "epoch": 2.746566791510612, + "eval_international_law_loss": 3.0677947998046875, + "eval_international_law_runtime": 0.2669, + "eval_international_law_samples_per_second": 7.493, + "eval_international_law_steps_per_second": 3.746, + "step": 6600 + }, + { + "epoch": 2.754889721181856, + "grad_norm": 0.267578125, + "learning_rate": 4.890952926115581e-06, + "loss": 0.2308, + "step": 6620 + }, + { + "epoch": 2.7632126508531005, + "grad_norm": 0.2890625, + "learning_rate": 4.888553228744842e-06, + "loss": 0.233, + "step": 6640 + }, + { + "epoch": 2.7715355805243447, + "grad_norm": 0.306640625, + "learning_rate": 4.886128015716925e-06, + "loss": 0.2397, + "step": 6660 + }, + { + "epoch": 2.779858510195589, + "grad_norm": 0.3203125, + "learning_rate": 4.883677312939103e-06, + "loss": 0.2345, + "step": 6680 + }, + { + "epoch": 2.788181439866833, + "grad_norm": 0.291015625, + "learning_rate": 4.881201146590945e-06, + "loss": 0.2314, + "step": 6700 + }, + { + "epoch": 2.788181439866833, + "eval_main_loss": 0.23880185186862946, + "eval_main_runtime": 6.3571, + "eval_main_samples_per_second": 29.888, + "eval_main_steps_per_second": 3.775, + "step": 6700 + }, + { + "epoch": 2.788181439866833, + "eval_anatomy_loss": 2.83292555809021, + "eval_anatomy_runtime": 0.2673, + "eval_anatomy_samples_per_second": 7.483, + "eval_anatomy_steps_per_second": 3.741, + "step": 6700 + }, + { + "epoch": 2.788181439866833, + "eval_college_mathematics_loss": 2.048858880996704, + "eval_college_mathematics_runtime": 0.2675, + "eval_college_mathematics_samples_per_second": 7.478, + "eval_college_mathematics_steps_per_second": 3.739, + "step": 6700 + }, + { + "epoch": 2.788181439866833, + "eval_international_law_loss": 3.066660165786743, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.505, + "eval_international_law_steps_per_second": 3.752, + "step": 6700 + }, + { + "epoch": 2.796504369538077, + "grad_norm": 0.251953125, + "learning_rate": 4.878699543124031e-06, + "loss": 0.2328, + "step": 6720 + }, + { + "epoch": 2.8048272992093217, + "grad_norm": 0.291015625, + "learning_rate": 4.876172529261678e-06, + "loss": 0.2334, + "step": 6740 + }, + { + "epoch": 2.813150228880566, + "grad_norm": 0.28515625, + "learning_rate": 4.873620131998642e-06, + "loss": 0.2327, + "step": 6760 + }, + { + "epoch": 2.82147315855181, + "grad_norm": 0.314453125, + "learning_rate": 4.871042378600842e-06, + "loss": 0.2309, + "step": 6780 + }, + { + "epoch": 2.8297960882230546, + "grad_norm": 0.2890625, + "learning_rate": 4.8684392966050594e-06, + "loss": 0.2307, + "step": 6800 + }, + { + "epoch": 2.8297960882230546, + "eval_main_loss": 0.2386612743139267, + "eval_main_runtime": 6.3547, + "eval_main_samples_per_second": 29.899, + "eval_main_steps_per_second": 3.777, + "step": 6800 + }, + { + "epoch": 2.8297960882230546, + "eval_anatomy_loss": 2.83203125, + "eval_anatomy_runtime": 0.268, + "eval_anatomy_samples_per_second": 7.464, + "eval_anatomy_steps_per_second": 3.732, + "step": 6800 + }, + { + "epoch": 2.8297960882230546, + "eval_college_mathematics_loss": 2.0493810176849365, + "eval_college_mathematics_runtime": 0.2671, + "eval_college_mathematics_samples_per_second": 7.487, + "eval_college_mathematics_steps_per_second": 3.743, + "step": 6800 + }, + { + "epoch": 2.8297960882230546, + "eval_international_law_loss": 3.065821409225464, + "eval_international_law_runtime": 0.2667, + "eval_international_law_samples_per_second": 7.498, + "eval_international_law_steps_per_second": 3.749, + "step": 6800 + }, + { + "epoch": 2.8381190178942988, + "grad_norm": 0.302734375, + "learning_rate": 4.865810913818651e-06, + "loss": 0.2294, + "step": 6820 + }, + { + "epoch": 2.846441947565543, + "grad_norm": 0.291015625, + "learning_rate": 4.863157258319245e-06, + "loss": 0.2324, + "step": 6840 + }, + { + "epoch": 2.8547648772367875, + "grad_norm": 0.27734375, + "learning_rate": 4.8604783584544475e-06, + "loss": 0.2313, + "step": 6860 + }, + { + "epoch": 2.8630878069080317, + "grad_norm": 0.28515625, + "learning_rate": 4.857774242841536e-06, + "loss": 0.2281, + "step": 6880 + }, + { + "epoch": 2.871410736579276, + "grad_norm": 0.296875, + "learning_rate": 4.855044940367155e-06, + "loss": 0.2329, + "step": 6900 + }, + { + "epoch": 2.871410736579276, + "eval_main_loss": 0.23859336972236633, + "eval_main_runtime": 6.3286, + "eval_main_samples_per_second": 30.022, + "eval_main_steps_per_second": 3.792, + "step": 6900 + }, + { + "epoch": 2.871410736579276, + "eval_anatomy_loss": 2.833575487136841, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.507, + "eval_anatomy_steps_per_second": 3.753, + "step": 6900 + }, + { + "epoch": 2.871410736579276, + "eval_college_mathematics_loss": 2.0479042530059814, + "eval_college_mathematics_runtime": 0.2652, + "eval_college_mathematics_samples_per_second": 7.542, + "eval_college_mathematics_steps_per_second": 3.771, + "step": 6900 + }, + { + "epoch": 2.871410736579276, + "eval_international_law_loss": 3.0679168701171875, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.516, + "eval_international_law_steps_per_second": 3.758, + "step": 6900 + }, + { + "epoch": 2.8797336662505204, + "grad_norm": 0.275390625, + "learning_rate": 4.8522904801870065e-06, + "loss": 0.2309, + "step": 6920 + }, + { + "epoch": 2.8880565959217646, + "grad_norm": 0.330078125, + "learning_rate": 4.8495108917255385e-06, + "loss": 0.2347, + "step": 6940 + }, + { + "epoch": 2.8963795255930087, + "grad_norm": 0.296875, + "learning_rate": 4.846706204675632e-06, + "loss": 0.2301, + "step": 6960 + }, + { + "epoch": 2.904702455264253, + "grad_norm": 0.35546875, + "learning_rate": 4.843876448998283e-06, + "loss": 0.2354, + "step": 6980 + }, + { + "epoch": 2.9130253849354975, + "grad_norm": 0.314453125, + "learning_rate": 4.841021654922281e-06, + "loss": 0.2359, + "step": 7000 + }, + { + "epoch": 2.9130253849354975, + "eval_main_loss": 0.23874352872371674, + "eval_main_runtime": 6.316, + "eval_main_samples_per_second": 30.083, + "eval_main_steps_per_second": 3.8, + "step": 7000 + }, + { + "epoch": 2.9130253849354975, + "eval_anatomy_loss": 2.832608699798584, + "eval_anatomy_runtime": 0.2648, + "eval_anatomy_samples_per_second": 7.552, + "eval_anatomy_steps_per_second": 3.776, + "step": 7000 + }, + { + "epoch": 2.9130253849354975, + "eval_college_mathematics_loss": 2.0510120391845703, + "eval_college_mathematics_runtime": 0.2649, + "eval_college_mathematics_samples_per_second": 7.55, + "eval_college_mathematics_steps_per_second": 3.775, + "step": 7000 + }, + { + "epoch": 2.9130253849354975, + "eval_international_law_loss": 3.0677924156188965, + "eval_international_law_runtime": 0.2642, + "eval_international_law_samples_per_second": 7.57, + "eval_international_law_steps_per_second": 3.785, + "step": 7000 + }, + { + "epoch": 2.9213483146067416, + "grad_norm": 0.298828125, + "learning_rate": 4.838141852943891e-06, + "loss": 0.2289, + "step": 7020 + }, + { + "epoch": 2.9296712442779858, + "grad_norm": 0.330078125, + "learning_rate": 4.835237073826521e-06, + "loss": 0.2305, + "step": 7040 + }, + { + "epoch": 2.93799417394923, + "grad_norm": 0.279296875, + "learning_rate": 4.8323073486003976e-06, + "loss": 0.2312, + "step": 7060 + }, + { + "epoch": 2.9463171036204745, + "grad_norm": 0.28515625, + "learning_rate": 4.829352708562233e-06, + "loss": 0.2321, + "step": 7080 + }, + { + "epoch": 2.9546400332917186, + "grad_norm": 0.296875, + "learning_rate": 4.826373185274893e-06, + "loss": 0.2336, + "step": 7100 + }, + { + "epoch": 2.9546400332917186, + "eval_main_loss": 0.23865433037281036, + "eval_main_runtime": 6.3269, + "eval_main_samples_per_second": 30.031, + "eval_main_steps_per_second": 3.793, + "step": 7100 + }, + { + "epoch": 2.9546400332917186, + "eval_anatomy_loss": 2.8344128131866455, + "eval_anatomy_runtime": 0.2662, + "eval_anatomy_samples_per_second": 7.513, + "eval_anatomy_steps_per_second": 3.756, + "step": 7100 + }, + { + "epoch": 2.9546400332917186, + "eval_college_mathematics_loss": 2.0521576404571533, + "eval_college_mathematics_runtime": 0.2649, + "eval_college_mathematics_samples_per_second": 7.55, + "eval_college_mathematics_steps_per_second": 3.775, + "step": 7100 + }, + { + "epoch": 2.9546400332917186, + "eval_international_law_loss": 3.0670950412750244, + "eval_international_law_runtime": 0.2651, + "eval_international_law_samples_per_second": 7.545, + "eval_international_law_steps_per_second": 3.772, + "step": 7100 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.267578125, + "learning_rate": 4.823368810567056e-06, + "loss": 0.2332, + "step": 7120 + }, + { + "epoch": 2.9712858926342074, + "grad_norm": 0.31640625, + "learning_rate": 4.820339616532878e-06, + "loss": 0.2297, + "step": 7140 + }, + { + "epoch": 2.9796088223054515, + "grad_norm": 0.283203125, + "learning_rate": 4.817285635531641e-06, + "loss": 0.2311, + "step": 7160 + }, + { + "epoch": 2.9879317519766957, + "grad_norm": 0.30859375, + "learning_rate": 4.81420690018742e-06, + "loss": 0.231, + "step": 7180 + }, + { + "epoch": 2.9962546816479403, + "grad_norm": 0.31640625, + "learning_rate": 4.811103443388724e-06, + "loss": 0.2332, + "step": 7200 + }, + { + "epoch": 2.9962546816479403, + "eval_main_loss": 0.23862618207931519, + "eval_main_runtime": 6.3548, + "eval_main_samples_per_second": 29.899, + "eval_main_steps_per_second": 3.777, + "step": 7200 + }, + { + "epoch": 2.9962546816479403, + "eval_anatomy_loss": 2.834916114807129, + "eval_anatomy_runtime": 0.2679, + "eval_anatomy_samples_per_second": 7.464, + "eval_anatomy_steps_per_second": 3.732, + "step": 7200 + }, + { + "epoch": 2.9962546816479403, + "eval_college_mathematics_loss": 2.0544211864471436, + "eval_college_mathematics_runtime": 0.2676, + "eval_college_mathematics_samples_per_second": 7.475, + "eval_college_mathematics_steps_per_second": 3.737, + "step": 7200 + }, + { + "epoch": 2.9962546816479403, + "eval_international_law_loss": 3.067124843597412, + "eval_international_law_runtime": 0.267, + "eval_international_law_samples_per_second": 7.49, + "eval_international_law_steps_per_second": 3.745, + "step": 7200 + }, + { + "epoch": 3.0045776113191844, + "grad_norm": 0.3125, + "learning_rate": 4.807975298288149e-06, + "loss": 0.233, + "step": 7220 + }, + { + "epoch": 3.0129005409904286, + "grad_norm": 0.271484375, + "learning_rate": 4.804822498302021e-06, + "loss": 0.2302, + "step": 7240 + }, + { + "epoch": 3.0212234706616727, + "grad_norm": 0.244140625, + "learning_rate": 4.8016450771100455e-06, + "loss": 0.2335, + "step": 7260 + }, + { + "epoch": 3.0295464003329173, + "grad_norm": 0.26171875, + "learning_rate": 4.798443068654939e-06, + "loss": 0.2322, + "step": 7280 + }, + { + "epoch": 3.0378693300041615, + "grad_norm": 0.267578125, + "learning_rate": 4.795216507142074e-06, + "loss": 0.2298, + "step": 7300 + }, + { + "epoch": 3.0378693300041615, + "eval_main_loss": 0.23858603835105896, + "eval_main_runtime": 6.3548, + "eval_main_samples_per_second": 29.899, + "eval_main_steps_per_second": 3.777, + "step": 7300 + }, + { + "epoch": 3.0378693300041615, + "eval_anatomy_loss": 2.832512617111206, + "eval_anatomy_runtime": 0.2671, + "eval_anatomy_samples_per_second": 7.487, + "eval_anatomy_steps_per_second": 3.743, + "step": 7300 + }, + { + "epoch": 3.0378693300041615, + "eval_college_mathematics_loss": 2.0511765480041504, + "eval_college_mathematics_runtime": 0.2677, + "eval_college_mathematics_samples_per_second": 7.472, + "eval_college_mathematics_steps_per_second": 3.736, + "step": 7300 + }, + { + "epoch": 3.0378693300041615, + "eval_international_law_loss": 3.067012071609497, + "eval_international_law_runtime": 0.267, + "eval_international_law_samples_per_second": 7.491, + "eval_international_law_steps_per_second": 3.745, + "step": 7300 + }, + { + "epoch": 3.0461922596754056, + "grad_norm": 0.263671875, + "learning_rate": 4.791965427039109e-06, + "loss": 0.234, + "step": 7320 + }, + { + "epoch": 3.0545151893466502, + "grad_norm": 0.2734375, + "learning_rate": 4.788689863075622e-06, + "loss": 0.2319, + "step": 7340 + }, + { + "epoch": 3.0628381190178944, + "grad_norm": 0.31640625, + "learning_rate": 4.785389850242739e-06, + "loss": 0.2309, + "step": 7360 + }, + { + "epoch": 3.0711610486891385, + "grad_norm": 0.240234375, + "learning_rate": 4.78206542379276e-06, + "loss": 0.2283, + "step": 7380 + }, + { + "epoch": 3.0794839783603827, + "grad_norm": 0.279296875, + "learning_rate": 4.778716619238784e-06, + "loss": 0.23, + "step": 7400 + }, + { + "epoch": 3.0794839783603827, + "eval_main_loss": 0.23865698277950287, + "eval_main_runtime": 6.3366, + "eval_main_samples_per_second": 29.984, + "eval_main_steps_per_second": 3.788, + "step": 7400 + }, + { + "epoch": 3.0794839783603827, + "eval_anatomy_loss": 2.8329567909240723, + "eval_anatomy_runtime": 0.2666, + "eval_anatomy_samples_per_second": 7.502, + "eval_anatomy_steps_per_second": 3.751, + "step": 7400 + }, + { + "epoch": 3.0794839783603827, + "eval_college_mathematics_loss": 2.0505635738372803, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.509, + "eval_college_mathematics_steps_per_second": 3.755, + "step": 7400 + }, + { + "epoch": 3.0794839783603827, + "eval_international_law_loss": 3.06803035736084, + "eval_international_law_runtime": 0.2662, + "eval_international_law_samples_per_second": 7.512, + "eval_international_law_steps_per_second": 3.756, + "step": 7400 + }, + { + "epoch": 3.0878069080316273, + "grad_norm": 0.2412109375, + "learning_rate": 4.7753434723543266e-06, + "loss": 0.2308, + "step": 7420 + }, + { + "epoch": 3.0961298377028714, + "grad_norm": 0.28515625, + "learning_rate": 4.771946019172942e-06, + "loss": 0.2331, + "step": 7440 + }, + { + "epoch": 3.1044527673741156, + "grad_norm": 0.2890625, + "learning_rate": 4.768524295987835e-06, + "loss": 0.2383, + "step": 7460 + }, + { + "epoch": 3.11277569704536, + "grad_norm": 0.25, + "learning_rate": 4.765078339351472e-06, + "loss": 0.2327, + "step": 7480 + }, + { + "epoch": 3.1210986267166043, + "grad_norm": 0.3046875, + "learning_rate": 4.761608186075196e-06, + "loss": 0.2312, + "step": 7500 + }, + { + "epoch": 3.1210986267166043, + "eval_main_loss": 0.2385028600692749, + "eval_main_runtime": 6.3337, + "eval_main_samples_per_second": 29.998, + "eval_main_steps_per_second": 3.789, + "step": 7500 + }, + { + "epoch": 3.1210986267166043, + "eval_anatomy_loss": 2.834723949432373, + "eval_anatomy_runtime": 0.2659, + "eval_anatomy_samples_per_second": 7.522, + "eval_anatomy_steps_per_second": 3.761, + "step": 7500 + }, + { + "epoch": 3.1210986267166043, + "eval_college_mathematics_loss": 2.048158884048462, + "eval_college_mathematics_runtime": 0.2669, + "eval_college_mathematics_samples_per_second": 7.494, + "eval_college_mathematics_steps_per_second": 3.747, + "step": 7500 + }, + { + "epoch": 3.1210986267166043, + "eval_international_law_loss": 3.065969228744507, + "eval_international_law_runtime": 0.2652, + "eval_international_law_samples_per_second": 7.542, + "eval_international_law_steps_per_second": 3.771, + "step": 7500 + }, + { + "epoch": 3.1294215563878485, + "grad_norm": 0.30078125, + "learning_rate": 4.758113873228828e-06, + "loss": 0.2361, + "step": 7520 + }, + { + "epoch": 3.1377444860590926, + "grad_norm": 0.328125, + "learning_rate": 4.754595438140272e-06, + "loss": 0.2312, + "step": 7540 + }, + { + "epoch": 3.146067415730337, + "grad_norm": 0.265625, + "learning_rate": 4.75105291839512e-06, + "loss": 0.232, + "step": 7560 + }, + { + "epoch": 3.1543903454015814, + "grad_norm": 0.2353515625, + "learning_rate": 4.747486351836246e-06, + "loss": 0.2308, + "step": 7580 + }, + { + "epoch": 3.1627132750728255, + "grad_norm": 0.294921875, + "learning_rate": 4.743895776563403e-06, + "loss": 0.2334, + "step": 7600 + }, + { + "epoch": 3.1627132750728255, + "eval_main_loss": 0.23862504959106445, + "eval_main_runtime": 6.3275, + "eval_main_samples_per_second": 30.028, + "eval_main_steps_per_second": 3.793, + "step": 7600 + }, + { + "epoch": 3.1627132750728255, + "eval_anatomy_loss": 2.8308141231536865, + "eval_anatomy_runtime": 0.2657, + "eval_anatomy_samples_per_second": 7.528, + "eval_anatomy_steps_per_second": 3.764, + "step": 7600 + }, + { + "epoch": 3.1627132750728255, + "eval_college_mathematics_loss": 2.053100347518921, + "eval_college_mathematics_runtime": 0.2659, + "eval_college_mathematics_samples_per_second": 7.521, + "eval_college_mathematics_steps_per_second": 3.761, + "step": 7600 + }, + { + "epoch": 3.1627132750728255, + "eval_international_law_loss": 3.066927433013916, + "eval_international_law_runtime": 0.2659, + "eval_international_law_samples_per_second": 7.521, + "eval_international_law_steps_per_second": 3.76, + "step": 7600 + }, + { + "epoch": 3.17103620474407, + "grad_norm": 0.25390625, + "learning_rate": 4.740281230932817e-06, + "loss": 0.2345, + "step": 7620 + }, + { + "epoch": 3.1793591344153143, + "grad_norm": 0.236328125, + "learning_rate": 4.736642753556777e-06, + "loss": 0.2343, + "step": 7640 + }, + { + "epoch": 3.1876820640865584, + "grad_norm": 0.318359375, + "learning_rate": 4.732980383303223e-06, + "loss": 0.2327, + "step": 7660 + }, + { + "epoch": 3.1960049937578026, + "grad_norm": 0.30859375, + "learning_rate": 4.729294159295329e-06, + "loss": 0.2316, + "step": 7680 + }, + { + "epoch": 3.204327923429047, + "grad_norm": 0.310546875, + "learning_rate": 4.725584120911085e-06, + "loss": 0.2324, + "step": 7700 + }, + { + "epoch": 3.204327923429047, + "eval_main_loss": 0.2384970486164093, + "eval_main_runtime": 6.3308, + "eval_main_samples_per_second": 30.012, + "eval_main_steps_per_second": 3.791, + "step": 7700 + }, + { + "epoch": 3.204327923429047, + "eval_anatomy_loss": 2.8297290802001953, + "eval_anatomy_runtime": 0.2666, + "eval_anatomy_samples_per_second": 7.501, + "eval_anatomy_steps_per_second": 3.751, + "step": 7700 + }, + { + "epoch": 3.204327923429047, + "eval_college_mathematics_loss": 2.0536551475524902, + "eval_college_mathematics_runtime": 0.2648, + "eval_college_mathematics_samples_per_second": 7.552, + "eval_college_mathematics_steps_per_second": 3.776, + "step": 7700 + }, + { + "epoch": 3.204327923429047, + "eval_international_law_loss": 3.0671679973602295, + "eval_international_law_runtime": 0.2662, + "eval_international_law_samples_per_second": 7.513, + "eval_international_law_steps_per_second": 3.757, + "step": 7700 + }, + { + "epoch": 3.2126508531002913, + "grad_norm": 0.296875, + "learning_rate": 4.721850307782879e-06, + "loss": 0.2278, + "step": 7720 + }, + { + "epoch": 3.2209737827715355, + "grad_norm": 0.306640625, + "learning_rate": 4.718092759797073e-06, + "loss": 0.2338, + "step": 7740 + }, + { + "epoch": 3.22929671244278, + "grad_norm": 0.2734375, + "learning_rate": 4.714311517093573e-06, + "loss": 0.2315, + "step": 7760 + }, + { + "epoch": 3.237619642114024, + "grad_norm": 0.2412109375, + "learning_rate": 4.710506620065406e-06, + "loss": 0.2368, + "step": 7780 + }, + { + "epoch": 3.2459425717852683, + "grad_norm": 0.291015625, + "learning_rate": 4.706678109358285e-06, + "loss": 0.2386, + "step": 7800 + }, + { + "epoch": 3.2459425717852683, + "eval_main_loss": 0.23850186169147491, + "eval_main_runtime": 6.3235, + "eval_main_samples_per_second": 30.047, + "eval_main_steps_per_second": 3.795, + "step": 7800 + }, + { + "epoch": 3.2459425717852683, + "eval_anatomy_loss": 2.8328044414520264, + "eval_anatomy_runtime": 0.265, + "eval_anatomy_samples_per_second": 7.548, + "eval_anatomy_steps_per_second": 3.774, + "step": 7800 + }, + { + "epoch": 3.2459425717852683, + "eval_college_mathematics_loss": 2.048779249191284, + "eval_college_mathematics_runtime": 0.2654, + "eval_college_mathematics_samples_per_second": 7.535, + "eval_college_mathematics_steps_per_second": 3.768, + "step": 7800 + }, + { + "epoch": 3.2459425717852683, + "eval_international_law_loss": 3.0646255016326904, + "eval_international_law_runtime": 0.2657, + "eval_international_law_samples_per_second": 7.529, + "eval_international_law_steps_per_second": 3.764, + "step": 7800 + }, + { + "epoch": 3.2542655014565125, + "grad_norm": 0.2578125, + "learning_rate": 4.702826025870173e-06, + "loss": 0.2354, + "step": 7820 + }, + { + "epoch": 3.262588431127757, + "grad_norm": 0.29296875, + "learning_rate": 4.698950410750854e-06, + "loss": 0.231, + "step": 7840 + }, + { + "epoch": 3.2709113607990012, + "grad_norm": 0.275390625, + "learning_rate": 4.695051305401483e-06, + "loss": 0.232, + "step": 7860 + }, + { + "epoch": 3.2792342904702454, + "grad_norm": 0.341796875, + "learning_rate": 4.691128751474149e-06, + "loss": 0.2307, + "step": 7880 + }, + { + "epoch": 3.28755722014149, + "grad_norm": 0.3046875, + "learning_rate": 4.6871827908714345e-06, + "loss": 0.2306, + "step": 7900 + }, + { + "epoch": 3.28755722014149, + "eval_main_loss": 0.23848366737365723, + "eval_main_runtime": 6.3519, + "eval_main_samples_per_second": 29.912, + "eval_main_steps_per_second": 3.778, + "step": 7900 + }, + { + "epoch": 3.28755722014149, + "eval_anatomy_loss": 2.834224224090576, + "eval_anatomy_runtime": 0.2672, + "eval_anatomy_samples_per_second": 7.485, + "eval_anatomy_steps_per_second": 3.742, + "step": 7900 + }, + { + "epoch": 3.28755722014149, + "eval_college_mathematics_loss": 2.05056095123291, + "eval_college_mathematics_runtime": 0.2672, + "eval_college_mathematics_samples_per_second": 7.486, + "eval_college_mathematics_steps_per_second": 3.743, + "step": 7900 + }, + { + "epoch": 3.28755722014149, + "eval_international_law_loss": 3.0639660358428955, + "eval_international_law_runtime": 0.2668, + "eval_international_law_samples_per_second": 7.496, + "eval_international_law_steps_per_second": 3.748, + "step": 7900 + }, + { + "epoch": 3.295880149812734, + "grad_norm": 0.291015625, + "learning_rate": 4.6832134657459586e-06, + "loss": 0.2349, + "step": 7920 + }, + { + "epoch": 3.3042030794839783, + "grad_norm": 0.291015625, + "learning_rate": 4.679220818499932e-06, + "loss": 0.2316, + "step": 7940 + }, + { + "epoch": 3.3125260091552224, + "grad_norm": 0.236328125, + "learning_rate": 4.675204891784706e-06, + "loss": 0.2326, + "step": 7960 + }, + { + "epoch": 3.320848938826467, + "grad_norm": 0.259765625, + "learning_rate": 4.671165728500311e-06, + "loss": 0.2325, + "step": 7980 + }, + { + "epoch": 3.329171868497711, + "grad_norm": 0.306640625, + "learning_rate": 4.667103371795003e-06, + "loss": 0.2305, + "step": 8000 + }, + { + "epoch": 3.329171868497711, + "eval_main_loss": 0.23847457766532898, + "eval_main_runtime": 6.3564, + "eval_main_samples_per_second": 29.891, + "eval_main_steps_per_second": 3.776, + "step": 8000 + }, + { + "epoch": 3.329171868497711, + "eval_anatomy_loss": 2.8327555656433105, + "eval_anatomy_runtime": 0.2663, + "eval_anatomy_samples_per_second": 7.509, + "eval_anatomy_steps_per_second": 3.755, + "step": 8000 + }, + { + "epoch": 3.329171868497711, + "eval_college_mathematics_loss": 2.0528974533081055, + "eval_college_mathematics_runtime": 0.2665, + "eval_college_mathematics_samples_per_second": 7.505, + "eval_college_mathematics_steps_per_second": 3.753, + "step": 8000 + }, + { + "epoch": 3.329171868497711, + "eval_international_law_loss": 3.065342903137207, + "eval_international_law_runtime": 0.2694, + "eval_international_law_samples_per_second": 7.423, + "eval_international_law_steps_per_second": 3.712, + "step": 8000 + }, + { + "epoch": 3.3374947981689553, + "grad_norm": 0.3203125, + "learning_rate": 4.6630178650648005e-06, + "loss": 0.226, + "step": 8020 + }, + { + "epoch": 3.3458177278402, + "grad_norm": 0.28515625, + "learning_rate": 4.658909251953023e-06, + "loss": 0.2341, + "step": 8040 + }, + { + "epoch": 3.354140657511444, + "grad_norm": 0.2578125, + "learning_rate": 4.654777576349822e-06, + "loss": 0.232, + "step": 8060 + }, + { + "epoch": 3.3624635871826882, + "grad_norm": 0.271484375, + "learning_rate": 4.650622882391713e-06, + "loss": 0.2266, + "step": 8080 + }, + { + "epoch": 3.370786516853933, + "grad_norm": 0.271484375, + "learning_rate": 4.646445214461105e-06, + "loss": 0.2328, + "step": 8100 + }, + { + "epoch": 3.370786516853933, + "eval_main_loss": 0.23838673532009125, + "eval_main_runtime": 6.3504, + "eval_main_samples_per_second": 29.919, + "eval_main_steps_per_second": 3.779, + "step": 8100 + }, + { + "epoch": 3.370786516853933, + "eval_anatomy_loss": 2.8322196006774902, + "eval_anatomy_runtime": 0.266, + "eval_anatomy_samples_per_second": 7.518, + "eval_anatomy_steps_per_second": 3.759, + "step": 8100 + }, + { + "epoch": 3.370786516853933, + "eval_college_mathematics_loss": 2.0516698360443115, + "eval_college_mathematics_runtime": 0.2667, + "eval_college_mathematics_samples_per_second": 7.499, + "eval_college_mathematics_steps_per_second": 3.75, + "step": 8100 + }, + { + "epoch": 3.370786516853933, + "eval_international_law_loss": 3.0663843154907227, + "eval_international_law_runtime": 0.2677, + "eval_international_law_samples_per_second": 7.47, + "eval_international_law_steps_per_second": 3.735, + "step": 8100 + }, + { + "epoch": 3.379109446525177, + "grad_norm": 0.30078125, + "learning_rate": 4.642244617185827e-06, + "loss": 0.231, + "step": 8120 + }, + { + "epoch": 3.387432376196421, + "grad_norm": 0.267578125, + "learning_rate": 4.6380211354386475e-06, + "loss": 0.2312, + "step": 8140 + }, + { + "epoch": 3.3957553058676653, + "grad_norm": 0.263671875, + "learning_rate": 4.633774814336801e-06, + "loss": 0.2321, + "step": 8160 + }, + { + "epoch": 3.40407823553891, + "grad_norm": 0.333984375, + "learning_rate": 4.6295056992415026e-06, + "loss": 0.2298, + "step": 8180 + }, + { + "epoch": 3.412401165210154, + "grad_norm": 0.3125, + "learning_rate": 4.625213835757458e-06, + "loss": 0.2348, + "step": 8200 + }, + { + "epoch": 3.412401165210154, + "eval_main_loss": 0.23835770785808563, + "eval_main_runtime": 6.3288, + "eval_main_samples_per_second": 30.022, + "eval_main_steps_per_second": 3.792, + "step": 8200 + }, + { + "epoch": 3.412401165210154, + "eval_anatomy_loss": 2.8323183059692383, + "eval_anatomy_runtime": 0.2656, + "eval_anatomy_samples_per_second": 7.531, + "eval_anatomy_steps_per_second": 3.766, + "step": 8200 + }, + { + "epoch": 3.412401165210154, + "eval_college_mathematics_loss": 2.0520193576812744, + "eval_college_mathematics_runtime": 0.2676, + "eval_college_mathematics_samples_per_second": 7.474, + "eval_college_mathematics_steps_per_second": 3.737, + "step": 8200 + }, + { + "epoch": 3.412401165210154, + "eval_international_law_loss": 3.0666942596435547, + "eval_international_law_runtime": 0.2666, + "eval_international_law_samples_per_second": 7.502, + "eval_international_law_steps_per_second": 3.751, + "step": 8200 + }, + { + "epoch": 3.420724094881398, + "grad_norm": 0.263671875, + "learning_rate": 4.62089926973239e-06, + "loss": 0.2312, + "step": 8220 + }, + { + "epoch": 3.4290470245526423, + "grad_norm": 0.267578125, + "learning_rate": 4.616562047256536e-06, + "loss": 0.23, + "step": 8240 + }, + { + "epoch": 3.437369954223887, + "grad_norm": 0.28125, + "learning_rate": 4.612202214662161e-06, + "loss": 0.2325, + "step": 8260 + }, + { + "epoch": 3.445692883895131, + "grad_norm": 0.3046875, + "learning_rate": 4.6078198185230605e-06, + "loss": 0.2323, + "step": 8280 + }, + { + "epoch": 3.454015813566375, + "grad_norm": 0.28125, + "learning_rate": 4.603414905654069e-06, + "loss": 0.2306, + "step": 8300 + }, + { + "epoch": 3.454015813566375, + "eval_main_loss": 0.23845936357975006, + "eval_main_runtime": 6.3281, + "eval_main_samples_per_second": 30.025, + "eval_main_steps_per_second": 3.793, + "step": 8300 + }, + { + "epoch": 3.454015813566375, + "eval_anatomy_loss": 2.8332934379577637, + "eval_anatomy_runtime": 0.2651, + "eval_anatomy_samples_per_second": 7.543, + "eval_anatomy_steps_per_second": 3.772, + "step": 8300 + }, + { + "epoch": 3.454015813566375, + "eval_college_mathematics_loss": 2.0525290966033936, + "eval_college_mathematics_runtime": 0.2658, + "eval_college_mathematics_samples_per_second": 7.524, + "eval_college_mathematics_steps_per_second": 3.762, + "step": 8300 + }, + { + "epoch": 3.454015813566375, + "eval_international_law_loss": 3.0639939308166504, + "eval_international_law_runtime": 0.2664, + "eval_international_law_samples_per_second": 7.508, + "eval_international_law_steps_per_second": 3.754, + "step": 8300 + }, + { + "epoch": 3.46233874323762, + "grad_norm": 0.2734375, + "learning_rate": 4.5989875231105514e-06, + "loss": 0.2328, + "step": 8320 + }, + { + "epoch": 3.470661672908864, + "grad_norm": 0.25390625, + "learning_rate": 4.594537718187906e-06, + "loss": 0.2274, + "step": 8340 + }, + { + "epoch": 3.478984602580108, + "grad_norm": 0.287109375, + "learning_rate": 4.590065538421056e-06, + "loss": 0.2304, + "step": 8360 + }, + { + "epoch": 3.4873075322513527, + "grad_norm": 0.255859375, + "learning_rate": 4.585571031583946e-06, + "loss": 0.2292, + "step": 8380 + }, + { + "epoch": 3.495630461922597, + "grad_norm": 0.2734375, + "learning_rate": 4.581054245689026e-06, + "loss": 0.2309, + "step": 8400 + }, + { + "epoch": 3.495630461922597, + "eval_main_loss": 0.23835162818431854, + "eval_main_runtime": 6.3575, + "eval_main_samples_per_second": 29.886, + "eval_main_steps_per_second": 3.775, + "step": 8400 + }, + { + "epoch": 3.495630461922597, + "eval_anatomy_loss": 2.8338778018951416, + "eval_anatomy_runtime": 0.2661, + "eval_anatomy_samples_per_second": 7.516, + "eval_anatomy_steps_per_second": 3.758, + "step": 8400 + }, + { + "epoch": 3.495630461922597, + "eval_college_mathematics_loss": 2.053218126296997, + "eval_college_mathematics_runtime": 0.2688, + "eval_college_mathematics_samples_per_second": 7.442, + "eval_college_mathematics_steps_per_second": 3.721, + "step": 8400 + }, + { + "epoch": 3.495630461922597, + "eval_international_law_loss": 3.065638303756714, + "eval_international_law_runtime": 0.2664, + "eval_international_law_samples_per_second": 7.508, + "eval_international_law_steps_per_second": 3.754, + "step": 8400 + }, + { + "epoch": 3.503953391593841, + "grad_norm": 0.30859375, + "learning_rate": 4.576515228986743e-06, + "loss": 0.235, + "step": 8420 + }, + { + "epoch": 3.512276321265085, + "grad_norm": 0.328125, + "learning_rate": 4.571954029965024e-06, + "loss": 0.2322, + "step": 8440 + }, + { + "epoch": 3.5205992509363297, + "grad_norm": 0.322265625, + "learning_rate": 4.567370697348759e-06, + "loss": 0.2353, + "step": 8460 + }, + { + "epoch": 3.528922180607574, + "grad_norm": 0.27734375, + "learning_rate": 4.5627652800992765e-06, + "loss": 0.2312, + "step": 8480 + }, + { + "epoch": 3.537245110278818, + "grad_norm": 0.283203125, + "learning_rate": 4.558137827413825e-06, + "loss": 0.2317, + "step": 8500 + }, + { + "epoch": 3.537245110278818, + "eval_main_loss": 0.23841607570648193, + "eval_main_runtime": 6.3538, + "eval_main_samples_per_second": 29.903, + "eval_main_steps_per_second": 3.777, + "step": 8500 + }, + { + "epoch": 3.537245110278818, + "eval_anatomy_loss": 2.83097505569458, + "eval_anatomy_runtime": 0.2657, + "eval_anatomy_samples_per_second": 7.528, + "eval_anatomy_steps_per_second": 3.764, + "step": 8500 + }, + { + "epoch": 3.537245110278818, + "eval_college_mathematics_loss": 2.0496573448181152, + "eval_college_mathematics_runtime": 0.2674, + "eval_college_mathematics_samples_per_second": 7.48, + "eval_college_mathematics_steps_per_second": 3.74, + "step": 8500 + }, + { + "epoch": 3.537245110278818, + "eval_international_law_loss": 3.065972089767456, + "eval_international_law_runtime": 0.2675, + "eval_international_law_samples_per_second": 7.475, + "eval_international_law_steps_per_second": 3.738, + "step": 8500 + }, + { + "epoch": 3.545568039950062, + "grad_norm": 0.28515625, + "learning_rate": 4.5534883887250495e-06, + "loss": 0.235, + "step": 8520 + }, + { + "epoch": 3.553890969621307, + "grad_norm": 0.28125, + "learning_rate": 4.548817013700454e-06, + "loss": 0.2341, + "step": 8540 + }, + { + "epoch": 3.562213899292551, + "grad_norm": 0.294921875, + "learning_rate": 4.5441237522418804e-06, + "loss": 0.2295, + "step": 8560 + }, + { + "epoch": 3.570536828963795, + "grad_norm": 0.302734375, + "learning_rate": 4.53940865448497e-06, + "loss": 0.2318, + "step": 8580 + }, + { + "epoch": 3.5788597586350397, + "grad_norm": 0.30078125, + "learning_rate": 4.534671770798633e-06, + "loss": 0.2298, + "step": 8600 + }, + { + "epoch": 3.5788597586350397, + "eval_main_loss": 0.23844130337238312, + "eval_main_runtime": 6.3401, + "eval_main_samples_per_second": 29.968, + "eval_main_steps_per_second": 3.785, + "step": 8600 + }, + { + "epoch": 3.5788597586350397, + "eval_anatomy_loss": 2.8311820030212402, + "eval_anatomy_runtime": 0.2673, + "eval_anatomy_samples_per_second": 7.482, + "eval_anatomy_steps_per_second": 3.741, + "step": 8600 + }, + { + "epoch": 3.5788597586350397, + "eval_college_mathematics_loss": 2.051837921142578, + "eval_college_mathematics_runtime": 0.2661, + "eval_college_mathematics_samples_per_second": 7.517, + "eval_college_mathematics_steps_per_second": 3.758, + "step": 8600 + }, + { + "epoch": 3.5788597586350397, + "eval_international_law_loss": 3.0655837059020996, + "eval_international_law_runtime": 0.2666, + "eval_international_law_samples_per_second": 7.503, + "eval_international_law_steps_per_second": 3.751, + "step": 8600 + }, + { + "epoch": 3.587182688306284, + "grad_norm": 0.26953125, + "learning_rate": 4.529913151784504e-06, + "loss": 0.2311, + "step": 8620 + }, + { + "epoch": 3.595505617977528, + "grad_norm": 0.294921875, + "learning_rate": 4.525132848276405e-06, + "loss": 0.2341, + "step": 8640 + }, + { + "epoch": 3.6038285476487726, + "grad_norm": 0.296875, + "learning_rate": 4.520330911339805e-06, + "loss": 0.2343, + "step": 8660 + }, + { + "epoch": 3.6121514773200167, + "grad_norm": 0.28515625, + "learning_rate": 4.5155073922712665e-06, + "loss": 0.2324, + "step": 8680 + }, + { + "epoch": 3.620474406991261, + "grad_norm": 0.2119140625, + "learning_rate": 4.510662342597907e-06, + "loss": 0.2335, + "step": 8700 + }, + { + "epoch": 3.620474406991261, + "eval_main_loss": 0.23842211067676544, + "eval_main_runtime": 6.3337, + "eval_main_samples_per_second": 29.998, + "eval_main_steps_per_second": 3.789, + "step": 8700 + }, + { + "epoch": 3.620474406991261, + "eval_anatomy_loss": 2.8317272663116455, + "eval_anatomy_runtime": 0.2656, + "eval_anatomy_samples_per_second": 7.531, + "eval_anatomy_steps_per_second": 3.765, + "step": 8700 + }, + { + "epoch": 3.620474406991261, + "eval_college_mathematics_loss": 2.0518221855163574, + "eval_college_mathematics_runtime": 0.2661, + "eval_college_mathematics_samples_per_second": 7.515, + "eval_college_mathematics_steps_per_second": 3.758, + "step": 8700 + }, + { + "epoch": 3.620474406991261, + "eval_international_law_loss": 3.06583571434021, + "eval_international_law_runtime": 0.2669, + "eval_international_law_samples_per_second": 7.492, + "eval_international_law_steps_per_second": 3.746, + "step": 8700 + }, + { + "epoch": 3.628797336662505, + "grad_norm": 0.263671875, + "learning_rate": 4.505795814076842e-06, + "loss": 0.2318, + "step": 8720 + }, + { + "epoch": 3.6371202663337496, + "grad_norm": 0.2578125, + "learning_rate": 4.5009078586946355e-06, + "loss": 0.2333, + "step": 8740 + }, + { + "epoch": 3.645443196004994, + "grad_norm": 0.279296875, + "learning_rate": 4.495998528666741e-06, + "loss": 0.2311, + "step": 8760 + }, + { + "epoch": 3.653766125676238, + "grad_norm": 0.259765625, + "learning_rate": 4.491067876436949e-06, + "loss": 0.2322, + "step": 8780 + }, + { + "epoch": 3.662089055347482, + "grad_norm": 0.318359375, + "learning_rate": 4.486115954676821e-06, + "loss": 0.2342, + "step": 8800 + }, + { + "epoch": 3.662089055347482, + "eval_main_loss": 0.23842394351959229, + "eval_main_runtime": 6.3208, + "eval_main_samples_per_second": 30.06, + "eval_main_steps_per_second": 3.797, + "step": 8800 + }, + { + "epoch": 3.662089055347482, + "eval_anatomy_loss": 2.8346548080444336, + "eval_anatomy_runtime": 0.2658, + "eval_anatomy_samples_per_second": 7.526, + "eval_anatomy_steps_per_second": 3.763, + "step": 8800 + }, + { + "epoch": 3.662089055347482, + "eval_college_mathematics_loss": 2.0510027408599854, + "eval_college_mathematics_runtime": 0.2657, + "eval_college_mathematics_samples_per_second": 7.526, + "eval_college_mathematics_steps_per_second": 3.763, + "step": 8800 + }, + { + "epoch": 3.662089055347482, + "eval_international_law_loss": 3.0672590732574463, + "eval_international_law_runtime": 0.2669, + "eval_international_law_samples_per_second": 7.493, + "eval_international_law_steps_per_second": 3.747, + "step": 8800 + }, + { + "epoch": 3.6704119850187267, + "grad_norm": 0.25390625, + "learning_rate": 4.481142816285133e-06, + "loss": 0.2331, + "step": 8820 + }, + { + "epoch": 3.678734914689971, + "grad_norm": 0.2275390625, + "learning_rate": 4.476148514387305e-06, + "loss": 0.2343, + "step": 8840 + }, + { + "epoch": 3.687057844361215, + "grad_norm": 0.267578125, + "learning_rate": 4.471133102334836e-06, + "loss": 0.2323, + "step": 8860 + }, + { + "epoch": 3.6953807740324596, + "grad_norm": 0.310546875, + "learning_rate": 4.4660966337047325e-06, + "loss": 0.2326, + "step": 8880 + }, + { + "epoch": 3.7037037037037037, + "grad_norm": 0.26953125, + "learning_rate": 4.46103916229894e-06, + "loss": 0.235, + "step": 8900 + }, + { + "epoch": 3.7037037037037037, + "eval_main_loss": 0.2383807897567749, + "eval_main_runtime": 6.326, + "eval_main_samples_per_second": 30.035, + "eval_main_steps_per_second": 3.794, + "step": 8900 + }, + { + "epoch": 3.7037037037037037, + "eval_anatomy_loss": 2.830998420715332, + "eval_anatomy_runtime": 0.2656, + "eval_anatomy_samples_per_second": 7.531, + "eval_anatomy_steps_per_second": 3.765, + "step": 8900 + }, + { + "epoch": 3.7037037037037037, + "eval_college_mathematics_loss": 2.051891803741455, + "eval_college_mathematics_runtime": 0.2656, + "eval_college_mathematics_samples_per_second": 7.53, + "eval_college_mathematics_steps_per_second": 3.765, + "step": 8900 + }, + { + "epoch": 3.7037037037037037, + "eval_international_law_loss": 3.0654711723327637, + "eval_international_law_runtime": 0.2652, + "eval_international_law_samples_per_second": 7.541, + "eval_international_law_steps_per_second": 3.77, + "step": 8900 + }, + { + "epoch": 3.712026633374948, + "grad_norm": 0.26171875, + "learning_rate": 4.455960742143762e-06, + "loss": 0.2316, + "step": 8920 + }, + { + "epoch": 3.7203495630461925, + "grad_norm": 0.2373046875, + "learning_rate": 4.450861427489291e-06, + "loss": 0.2319, + "step": 8940 + }, + { + "epoch": 3.7286724927174366, + "grad_norm": 0.248046875, + "learning_rate": 4.445741272808821e-06, + "loss": 0.2296, + "step": 8960 + }, + { + "epoch": 3.7369954223886808, + "grad_norm": 0.2275390625, + "learning_rate": 4.440600332798269e-06, + "loss": 0.2317, + "step": 8980 + }, + { + "epoch": 3.7453183520599254, + "grad_norm": 0.28515625, + "learning_rate": 4.435438662375593e-06, + "loss": 0.237, + "step": 9000 + }, + { + "epoch": 3.7453183520599254, + "eval_main_loss": 0.23835314810276031, + "eval_main_runtime": 6.3284, + "eval_main_samples_per_second": 30.023, + "eval_main_steps_per_second": 3.792, + "step": 9000 + }, + { + "epoch": 3.7453183520599254, + "eval_anatomy_loss": 2.832582473754883, + "eval_anatomy_runtime": 0.2663, + "eval_anatomy_samples_per_second": 7.509, + "eval_anatomy_steps_per_second": 3.755, + "step": 9000 + }, + { + "epoch": 3.7453183520599254, + "eval_college_mathematics_loss": 2.0516104698181152, + "eval_college_mathematics_runtime": 0.2657, + "eval_college_mathematics_samples_per_second": 7.526, + "eval_college_mathematics_steps_per_second": 3.763, + "step": 9000 + }, + { + "epoch": 3.7453183520599254, + "eval_international_law_loss": 3.0663774013519287, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.517, + "eval_international_law_steps_per_second": 3.758, + "step": 9000 + }, + { + "epoch": 3.7536412817311695, + "grad_norm": 0.2412109375, + "learning_rate": 4.430256316680201e-06, + "loss": 0.2296, + "step": 9020 + }, + { + "epoch": 3.7619642114024137, + "grad_norm": 0.248046875, + "learning_rate": 4.425053351072365e-06, + "loss": 0.2348, + "step": 9040 + }, + { + "epoch": 3.770287141073658, + "grad_norm": 0.294921875, + "learning_rate": 4.419829821132629e-06, + "loss": 0.2318, + "step": 9060 + }, + { + "epoch": 3.778610070744902, + "grad_norm": 0.306640625, + "learning_rate": 4.414585782661215e-06, + "loss": 0.2311, + "step": 9080 + }, + { + "epoch": 3.7869330004161466, + "grad_norm": 0.2890625, + "learning_rate": 4.4093212916774245e-06, + "loss": 0.2342, + "step": 9100 + }, + { + "epoch": 3.7869330004161466, + "eval_main_loss": 0.23832525312900543, + "eval_main_runtime": 6.3321, + "eval_main_samples_per_second": 30.006, + "eval_main_steps_per_second": 3.79, + "step": 9100 + }, + { + "epoch": 3.7869330004161466, + "eval_anatomy_loss": 2.83284854888916, + "eval_anatomy_runtime": 0.2678, + "eval_anatomy_samples_per_second": 7.468, + "eval_anatomy_steps_per_second": 3.734, + "step": 9100 + }, + { + "epoch": 3.7869330004161466, + "eval_college_mathematics_loss": 2.0502493381500244, + "eval_college_mathematics_runtime": 0.2658, + "eval_college_mathematics_samples_per_second": 7.526, + "eval_college_mathematics_steps_per_second": 3.763, + "step": 9100 + }, + { + "epoch": 3.7869330004161466, + "eval_international_law_loss": 3.0675525665283203, + "eval_international_law_runtime": 0.2664, + "eval_international_law_samples_per_second": 7.507, + "eval_international_law_steps_per_second": 3.754, + "step": 9100 + }, + { + "epoch": 3.7952559300873907, + "grad_norm": 0.279296875, + "learning_rate": 4.404036404419045e-06, + "loss": 0.2319, + "step": 9120 + }, + { + "epoch": 3.803578859758635, + "grad_norm": 0.314453125, + "learning_rate": 4.398731177341747e-06, + "loss": 0.2277, + "step": 9140 + }, + { + "epoch": 3.8119017894298795, + "grad_norm": 0.2734375, + "learning_rate": 4.39340566711848e-06, + "loss": 0.2322, + "step": 9160 + }, + { + "epoch": 3.8202247191011236, + "grad_norm": 0.279296875, + "learning_rate": 4.388059930638865e-06, + "loss": 0.2358, + "step": 9180 + }, + { + "epoch": 3.8285476487723678, + "grad_norm": 0.271484375, + "learning_rate": 4.3826940250085925e-06, + "loss": 0.2254, + "step": 9200 + }, + { + "epoch": 3.8285476487723678, + "eval_main_loss": 0.2383318841457367, + "eval_main_runtime": 6.3299, + "eval_main_samples_per_second": 30.016, + "eval_main_steps_per_second": 3.792, + "step": 9200 + }, + { + "epoch": 3.8285476487723678, + "eval_anatomy_loss": 2.833568811416626, + "eval_anatomy_runtime": 0.2667, + "eval_anatomy_samples_per_second": 7.5, + "eval_anatomy_steps_per_second": 3.75, + "step": 9200 + }, + { + "epoch": 3.8285476487723678, + "eval_college_mathematics_loss": 2.055227756500244, + "eval_college_mathematics_runtime": 0.2648, + "eval_college_mathematics_samples_per_second": 7.554, + "eval_college_mathematics_steps_per_second": 3.777, + "step": 9200 + }, + { + "epoch": 3.8285476487723678, + "eval_international_law_loss": 3.0670459270477295, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.516, + "eval_international_law_steps_per_second": 3.758, + "step": 9200 + }, + { + "epoch": 3.8368705784436123, + "grad_norm": 0.298828125, + "learning_rate": 4.377308007548809e-06, + "loss": 0.2281, + "step": 9220 + }, + { + "epoch": 3.8451935081148565, + "grad_norm": 0.26171875, + "learning_rate": 4.371901935795504e-06, + "loss": 0.2323, + "step": 9240 + }, + { + "epoch": 3.8535164377861006, + "grad_norm": 0.2119140625, + "learning_rate": 4.3664758674988984e-06, + "loss": 0.228, + "step": 9260 + }, + { + "epoch": 3.8618393674573452, + "grad_norm": 0.2490234375, + "learning_rate": 4.361029860622822e-06, + "loss": 0.2315, + "step": 9280 + }, + { + "epoch": 3.8701622971285894, + "grad_norm": 0.263671875, + "learning_rate": 4.355563973344104e-06, + "loss": 0.231, + "step": 9300 + }, + { + "epoch": 3.8701622971285894, + "eval_main_loss": 0.23839576542377472, + "eval_main_runtime": 6.3285, + "eval_main_samples_per_second": 30.023, + "eval_main_steps_per_second": 3.792, + "step": 9300 + }, + { + "epoch": 3.8701622971285894, + "eval_anatomy_loss": 2.8333206176757812, + "eval_anatomy_runtime": 0.2661, + "eval_anatomy_samples_per_second": 7.516, + "eval_anatomy_steps_per_second": 3.758, + "step": 9300 + }, + { + "epoch": 3.8701622971285894, + "eval_college_mathematics_loss": 2.048659563064575, + "eval_college_mathematics_runtime": 0.2666, + "eval_college_mathematics_samples_per_second": 7.5, + "eval_college_mathematics_steps_per_second": 3.75, + "step": 9300 + }, + { + "epoch": 3.8701622971285894, + "eval_international_law_loss": 3.0655484199523926, + "eval_international_law_runtime": 0.2664, + "eval_international_law_samples_per_second": 7.508, + "eval_international_law_steps_per_second": 3.754, + "step": 9300 + }, + { + "epoch": 3.8784852267998335, + "grad_norm": 0.2373046875, + "learning_rate": 4.3500782640519375e-06, + "loss": 0.2261, + "step": 9320 + }, + { + "epoch": 3.8868081564710777, + "grad_norm": 0.294921875, + "learning_rate": 4.344572791347272e-06, + "loss": 0.2274, + "step": 9340 + }, + { + "epoch": 3.895131086142322, + "grad_norm": 0.302734375, + "learning_rate": 4.339047614042172e-06, + "loss": 0.2316, + "step": 9360 + }, + { + "epoch": 3.9034540158135664, + "grad_norm": 0.33203125, + "learning_rate": 4.3335027911592004e-06, + "loss": 0.23, + "step": 9380 + }, + { + "epoch": 3.9117769454848106, + "grad_norm": 0.30078125, + "learning_rate": 4.327938381930782e-06, + "loss": 0.2318, + "step": 9400 + }, + { + "epoch": 3.9117769454848106, + "eval_main_loss": 0.23833923041820526, + "eval_main_runtime": 6.3193, + "eval_main_samples_per_second": 30.067, + "eval_main_steps_per_second": 3.798, + "step": 9400 + }, + { + "epoch": 3.9117769454848106, + "eval_anatomy_loss": 2.8319694995880127, + "eval_anatomy_runtime": 0.266, + "eval_anatomy_samples_per_second": 7.52, + "eval_anatomy_steps_per_second": 3.76, + "step": 9400 + }, + { + "epoch": 3.9117769454848106, + "eval_college_mathematics_loss": 2.0500192642211914, + "eval_college_mathematics_runtime": 0.2666, + "eval_college_mathematics_samples_per_second": 7.502, + "eval_college_mathematics_steps_per_second": 3.751, + "step": 9400 + }, + { + "epoch": 3.9117769454848106, + "eval_international_law_loss": 3.0644402503967285, + "eval_international_law_runtime": 0.2672, + "eval_international_law_samples_per_second": 7.486, + "eval_international_law_steps_per_second": 3.743, + "step": 9400 + }, + { + "epoch": 3.9200998751560547, + "grad_norm": 0.271484375, + "learning_rate": 4.3223544457985735e-06, + "loss": 0.2315, + "step": 9420 + }, + { + "epoch": 3.9284228048272993, + "grad_norm": 0.2890625, + "learning_rate": 4.316751042412824e-06, + "loss": 0.2362, + "step": 9440 + }, + { + "epoch": 3.9367457344985435, + "grad_norm": 0.271484375, + "learning_rate": 4.311128231631745e-06, + "loss": 0.2343, + "step": 9460 + }, + { + "epoch": 3.9450686641697876, + "grad_norm": 0.328125, + "learning_rate": 4.305486073520865e-06, + "loss": 0.2349, + "step": 9480 + }, + { + "epoch": 3.9533915938410322, + "grad_norm": 0.275390625, + "learning_rate": 4.299824628352387e-06, + "loss": 0.2304, + "step": 9500 + }, + { + "epoch": 3.9533915938410322, + "eval_main_loss": 0.2382751852273941, + "eval_main_runtime": 6.3262, + "eval_main_samples_per_second": 30.034, + "eval_main_steps_per_second": 3.794, + "step": 9500 + }, + { + "epoch": 3.9533915938410322, + "eval_anatomy_loss": 2.832981586456299, + "eval_anatomy_runtime": 0.266, + "eval_anatomy_samples_per_second": 7.518, + "eval_anatomy_steps_per_second": 3.759, + "step": 9500 + }, + { + "epoch": 3.9533915938410322, + "eval_college_mathematics_loss": 2.0524649620056152, + "eval_college_mathematics_runtime": 0.2672, + "eval_college_mathematics_samples_per_second": 7.484, + "eval_college_mathematics_steps_per_second": 3.742, + "step": 9500 + }, + { + "epoch": 3.9533915938410322, + "eval_international_law_loss": 3.0676116943359375, + "eval_international_law_runtime": 0.2658, + "eval_international_law_samples_per_second": 7.525, + "eval_international_law_steps_per_second": 3.763, + "step": 9500 + }, + { + "epoch": 3.9617145235122764, + "grad_norm": 0.3125, + "learning_rate": 4.2941439566045536e-06, + "loss": 0.2308, + "step": 9520 + }, + { + "epoch": 3.9700374531835205, + "grad_norm": 0.291015625, + "learning_rate": 4.2884441189609915e-06, + "loss": 0.2317, + "step": 9540 + }, + { + "epoch": 3.978360382854765, + "grad_norm": 0.37109375, + "learning_rate": 4.282725176310065e-06, + "loss": 0.2318, + "step": 9560 + }, + { + "epoch": 3.9866833125260093, + "grad_norm": 0.26171875, + "learning_rate": 4.27698718974423e-06, + "loss": 0.2343, + "step": 9580 + }, + { + "epoch": 3.9950062421972534, + "grad_norm": 0.267578125, + "learning_rate": 4.271230220559378e-06, + "loss": 0.2294, + "step": 9600 + }, + { + "epoch": 3.9950062421972534, + "eval_main_loss": 0.23832967877388, + "eval_main_runtime": 6.3277, + "eval_main_samples_per_second": 30.027, + "eval_main_steps_per_second": 3.793, + "step": 9600 + }, + { + "epoch": 3.9950062421972534, + "eval_anatomy_loss": 2.8348305225372314, + "eval_anatomy_runtime": 0.2667, + "eval_anatomy_samples_per_second": 7.5, + "eval_anatomy_steps_per_second": 3.75, + "step": 9600 + }, + { + "epoch": 3.9950062421972534, + "eval_college_mathematics_loss": 2.0557780265808105, + "eval_college_mathematics_runtime": 0.2658, + "eval_college_mathematics_samples_per_second": 7.525, + "eval_college_mathematics_steps_per_second": 3.762, + "step": 9600 + }, + { + "epoch": 3.9950062421972534, + "eval_international_law_loss": 3.0656328201293945, + "eval_international_law_runtime": 0.2656, + "eval_international_law_samples_per_second": 7.53, + "eval_international_law_steps_per_second": 3.765, + "step": 9600 + }, + { + "epoch": 4.003329171868498, + "grad_norm": 0.27734375, + "learning_rate": 4.2654543302541796e-06, + "loss": 0.2352, + "step": 9620 + }, + { + "epoch": 4.011652101539742, + "grad_norm": 0.296875, + "learning_rate": 4.259659580529432e-06, + "loss": 0.2334, + "step": 9640 + }, + { + "epoch": 4.019975031210986, + "grad_norm": 0.2412109375, + "learning_rate": 4.253846033287398e-06, + "loss": 0.2308, + "step": 9660 + }, + { + "epoch": 4.028297960882231, + "grad_norm": 0.314453125, + "learning_rate": 4.248013750631143e-06, + "loss": 0.2353, + "step": 9680 + }, + { + "epoch": 4.036620890553475, + "grad_norm": 0.26953125, + "learning_rate": 4.242162794863872e-06, + "loss": 0.2312, + "step": 9700 + }, + { + "epoch": 4.036620890553475, + "eval_main_loss": 0.2384222000837326, + "eval_main_runtime": 6.3303, + "eval_main_samples_per_second": 30.014, + "eval_main_steps_per_second": 3.791, + "step": 9700 + }, + { + "epoch": 4.036620890553475, + "eval_anatomy_loss": 2.832493305206299, + "eval_anatomy_runtime": 0.2658, + "eval_anatomy_samples_per_second": 7.525, + "eval_anatomy_steps_per_second": 3.763, + "step": 9700 + }, + { + "epoch": 4.036620890553475, + "eval_college_mathematics_loss": 2.0524394512176514, + "eval_college_mathematics_runtime": 0.2664, + "eval_college_mathematics_samples_per_second": 7.508, + "eval_college_mathematics_steps_per_second": 3.754, + "step": 9700 + }, + { + "epoch": 4.036620890553475, + "eval_international_law_loss": 3.0663318634033203, + "eval_international_law_runtime": 0.2658, + "eval_international_law_samples_per_second": 7.525, + "eval_international_law_steps_per_second": 3.762, + "step": 9700 + }, + { + "epoch": 4.044943820224719, + "grad_norm": 0.298828125, + "learning_rate": 4.236293228488267e-06, + "loss": 0.2306, + "step": 9720 + }, + { + "epoch": 4.053266749895964, + "grad_norm": 0.298828125, + "learning_rate": 4.23040511420582e-06, + "loss": 0.2331, + "step": 9740 + }, + { + "epoch": 4.0615896795672075, + "grad_norm": 0.30859375, + "learning_rate": 4.224498514916152e-06, + "loss": 0.2312, + "step": 9760 + }, + { + "epoch": 4.069912609238452, + "grad_norm": 0.287109375, + "learning_rate": 4.218573493716359e-06, + "loss": 0.2321, + "step": 9780 + }, + { + "epoch": 4.078235538909696, + "grad_norm": 0.30078125, + "learning_rate": 4.212630113900322e-06, + "loss": 0.2333, + "step": 9800 + }, + { + "epoch": 4.078235538909696, + "eval_main_loss": 0.23829525709152222, + "eval_main_runtime": 6.3313, + "eval_main_samples_per_second": 30.01, + "eval_main_steps_per_second": 3.791, + "step": 9800 + }, + { + "epoch": 4.078235538909696, + "eval_anatomy_loss": 2.8339943885803223, + "eval_anatomy_runtime": 0.2661, + "eval_anatomy_samples_per_second": 7.515, + "eval_anatomy_steps_per_second": 3.757, + "step": 9800 + }, + { + "epoch": 4.078235538909696, + "eval_college_mathematics_loss": 2.0519936084747314, + "eval_college_mathematics_runtime": 0.2656, + "eval_college_mathematics_samples_per_second": 7.53, + "eval_college_mathematics_steps_per_second": 3.765, + "step": 9800 + }, + { + "epoch": 4.078235538909696, + "eval_international_law_loss": 3.06636905670166, + "eval_international_law_runtime": 0.2669, + "eval_international_law_samples_per_second": 7.494, + "eval_international_law_steps_per_second": 3.747, + "step": 9800 + }, + { + "epoch": 4.08655846858094, + "grad_norm": 0.287109375, + "learning_rate": 4.206668438958042e-06, + "loss": 0.2325, + "step": 9820 + }, + { + "epoch": 4.094881398252185, + "grad_norm": 0.294921875, + "learning_rate": 4.200688532574952e-06, + "loss": 0.2294, + "step": 9840 + }, + { + "epoch": 4.103204327923429, + "grad_norm": 0.275390625, + "learning_rate": 4.1946904586312485e-06, + "loss": 0.2326, + "step": 9860 + }, + { + "epoch": 4.111527257594673, + "grad_norm": 0.2578125, + "learning_rate": 4.188674281201198e-06, + "loss": 0.2348, + "step": 9880 + }, + { + "epoch": 4.119850187265918, + "grad_norm": 0.333984375, + "learning_rate": 4.182640064552456e-06, + "loss": 0.2324, + "step": 9900 + }, + { + "epoch": 4.119850187265918, + "eval_main_loss": 0.2383279949426651, + "eval_main_runtime": 6.3331, + "eval_main_samples_per_second": 30.001, + "eval_main_steps_per_second": 3.79, + "step": 9900 + }, + { + "epoch": 4.119850187265918, + "eval_anatomy_loss": 2.8347108364105225, + "eval_anatomy_runtime": 0.268, + "eval_anatomy_samples_per_second": 7.464, + "eval_anatomy_steps_per_second": 3.732, + "step": 9900 + }, + { + "epoch": 4.119850187265918, + "eval_college_mathematics_loss": 2.053701877593994, + "eval_college_mathematics_runtime": 0.266, + "eval_college_mathematics_samples_per_second": 7.519, + "eval_college_mathematics_steps_per_second": 3.759, + "step": 9900 + }, + { + "epoch": 4.119850187265918, + "eval_international_law_loss": 3.0677242279052734, + "eval_international_law_runtime": 0.2678, + "eval_international_law_samples_per_second": 7.469, + "eval_international_law_steps_per_second": 3.735, + "step": 9900 + }, + { + "epoch": 4.128173116937162, + "grad_norm": 0.244140625, + "learning_rate": 4.176587873145386e-06, + "loss": 0.2329, + "step": 9920 + }, + { + "epoch": 4.136496046608406, + "grad_norm": 0.3046875, + "learning_rate": 4.170517771632362e-06, + "loss": 0.2316, + "step": 9940 + }, + { + "epoch": 4.144818976279651, + "grad_norm": 0.2451171875, + "learning_rate": 4.164429824857086e-06, + "loss": 0.2336, + "step": 9960 + }, + { + "epoch": 4.1531419059508945, + "grad_norm": 0.255859375, + "learning_rate": 4.158324097853887e-06, + "loss": 0.2361, + "step": 9980 + }, + { + "epoch": 4.161464835622139, + "grad_norm": 0.26953125, + "learning_rate": 4.1522006558470365e-06, + "loss": 0.2359, + "step": 10000 + }, + { + "epoch": 4.161464835622139, + "eval_main_loss": 0.23838593065738678, + "eval_main_runtime": 6.3323, + "eval_main_samples_per_second": 30.005, + "eval_main_steps_per_second": 3.79, + "step": 10000 + }, + { + "epoch": 4.161464835622139, + "eval_anatomy_loss": 2.8330607414245605, + "eval_anatomy_runtime": 0.2671, + "eval_anatomy_samples_per_second": 7.488, + "eval_anatomy_steps_per_second": 3.744, + "step": 10000 + }, + { + "epoch": 4.161464835622139, + "eval_college_mathematics_loss": 2.04879093170166, + "eval_college_mathematics_runtime": 0.2649, + "eval_college_mathematics_samples_per_second": 7.551, + "eval_college_mathematics_steps_per_second": 3.776, + "step": 10000 + }, + { + "epoch": 4.161464835622139, + "eval_international_law_loss": 3.0646615028381348, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.506, + "eval_international_law_steps_per_second": 3.753, + "step": 10000 + }, + { + "epoch": 4.169787765293384, + "grad_norm": 0.322265625, + "learning_rate": 4.146059564250041e-06, + "loss": 0.2326, + "step": 10020 + }, + { + "epoch": 4.178110694964627, + "grad_norm": 0.2890625, + "learning_rate": 4.13990088866495e-06, + "loss": 0.2336, + "step": 10040 + }, + { + "epoch": 4.186433624635872, + "grad_norm": 0.310546875, + "learning_rate": 4.133724694881655e-06, + "loss": 0.2314, + "step": 10060 + }, + { + "epoch": 4.194756554307116, + "grad_norm": 0.306640625, + "learning_rate": 4.1275310488771855e-06, + "loss": 0.2286, + "step": 10080 + }, + { + "epoch": 4.20307948397836, + "grad_norm": 0.322265625, + "learning_rate": 4.1213200168149994e-06, + "loss": 0.2337, + "step": 10100 + }, + { + "epoch": 4.20307948397836, + "eval_main_loss": 0.23833875358104706, + "eval_main_runtime": 6.3313, + "eval_main_samples_per_second": 30.01, + "eval_main_steps_per_second": 3.791, + "step": 10100 + }, + { + "epoch": 4.20307948397836, + "eval_anatomy_loss": 2.8343758583068848, + "eval_anatomy_runtime": 0.2663, + "eval_anatomy_samples_per_second": 7.511, + "eval_anatomy_steps_per_second": 3.755, + "step": 10100 + }, + { + "epoch": 4.20307948397836, + "eval_college_mathematics_loss": 2.0506842136383057, + "eval_college_mathematics_runtime": 0.2649, + "eval_college_mathematics_samples_per_second": 7.551, + "eval_college_mathematics_steps_per_second": 3.775, + "step": 10100 + }, + { + "epoch": 4.20307948397836, + "eval_international_law_loss": 3.0656676292419434, + "eval_international_law_runtime": 0.267, + "eval_international_law_samples_per_second": 7.491, + "eval_international_law_steps_per_second": 3.745, + "step": 10100 + }, + { + "epoch": 4.211402413649605, + "grad_norm": 0.294921875, + "learning_rate": 4.115091665044284e-06, + "loss": 0.2332, + "step": 10120 + }, + { + "epoch": 4.219725343320849, + "grad_norm": 0.265625, + "learning_rate": 4.108846060099246e-06, + "loss": 0.2285, + "step": 10140 + }, + { + "epoch": 4.228048272992093, + "grad_norm": 0.3046875, + "learning_rate": 4.102583268698393e-06, + "loss": 0.2258, + "step": 10160 + }, + { + "epoch": 4.236371202663338, + "grad_norm": 0.328125, + "learning_rate": 4.096303357743834e-06, + "loss": 0.2307, + "step": 10180 + }, + { + "epoch": 4.2446941323345815, + "grad_norm": 0.2890625, + "learning_rate": 4.0900063943205485e-06, + "loss": 0.232, + "step": 10200 + }, + { + "epoch": 4.2446941323345815, + "eval_main_loss": 0.23833294212818146, + "eval_main_runtime": 6.3324, + "eval_main_samples_per_second": 30.004, + "eval_main_steps_per_second": 3.79, + "step": 10200 + }, + { + "epoch": 4.2446941323345815, + "eval_anatomy_loss": 2.8333046436309814, + "eval_anatomy_runtime": 0.2663, + "eval_anatomy_samples_per_second": 7.511, + "eval_anatomy_steps_per_second": 3.756, + "step": 10200 + }, + { + "epoch": 4.2446941323345815, + "eval_college_mathematics_loss": 2.0533056259155273, + "eval_college_mathematics_runtime": 0.2671, + "eval_college_mathematics_samples_per_second": 7.487, + "eval_college_mathematics_steps_per_second": 3.743, + "step": 10200 + }, + { + "epoch": 4.2446941323345815, + "eval_international_law_loss": 3.0654220581054688, + "eval_international_law_runtime": 0.2655, + "eval_international_law_samples_per_second": 7.533, + "eval_international_law_steps_per_second": 3.766, + "step": 10200 + }, + { + "epoch": 4.253017062005826, + "grad_norm": 0.263671875, + "learning_rate": 4.083692445695686e-06, + "loss": 0.2346, + "step": 10220 + }, + { + "epoch": 4.261339991677071, + "grad_norm": 0.2265625, + "learning_rate": 4.077361579317835e-06, + "loss": 0.2333, + "step": 10240 + }, + { + "epoch": 4.269662921348314, + "grad_norm": 0.2734375, + "learning_rate": 4.071013862816311e-06, + "loss": 0.2304, + "step": 10260 + }, + { + "epoch": 4.277985851019559, + "grad_norm": 0.2578125, + "learning_rate": 4.064649364000429e-06, + "loss": 0.235, + "step": 10280 + }, + { + "epoch": 4.286308780690804, + "grad_norm": 0.251953125, + "learning_rate": 4.058268150858779e-06, + "loss": 0.2325, + "step": 10300 + }, + { + "epoch": 4.286308780690804, + "eval_main_loss": 0.23827138543128967, + "eval_main_runtime": 6.325, + "eval_main_samples_per_second": 30.039, + "eval_main_steps_per_second": 3.794, + "step": 10300 + }, + { + "epoch": 4.286308780690804, + "eval_anatomy_loss": 2.832484006881714, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.509, + "eval_anatomy_steps_per_second": 3.754, + "step": 10300 + }, + { + "epoch": 4.286308780690804, + "eval_college_mathematics_loss": 2.050703287124634, + "eval_college_mathematics_runtime": 0.2662, + "eval_college_mathematics_samples_per_second": 7.512, + "eval_college_mathematics_steps_per_second": 3.756, + "step": 10300 + }, + { + "epoch": 4.286308780690804, + "eval_international_law_loss": 3.06673002243042, + "eval_international_law_runtime": 0.2654, + "eval_international_law_samples_per_second": 7.536, + "eval_international_law_steps_per_second": 3.768, + "step": 10300 + }, + { + "epoch": 4.294631710362047, + "grad_norm": 0.25390625, + "learning_rate": 4.051870291558505e-06, + "loss": 0.2288, + "step": 10320 + }, + { + "epoch": 4.302954640033292, + "grad_norm": 0.28125, + "learning_rate": 4.045455854444569e-06, + "loss": 0.232, + "step": 10340 + }, + { + "epoch": 4.3112775697045365, + "grad_norm": 0.255859375, + "learning_rate": 4.039024908039029e-06, + "loss": 0.2337, + "step": 10360 + }, + { + "epoch": 4.31960049937578, + "grad_norm": 0.3125, + "learning_rate": 4.0325775210402995e-06, + "loss": 0.2343, + "step": 10380 + }, + { + "epoch": 4.327923429047025, + "grad_norm": 0.267578125, + "learning_rate": 4.026113762322423e-06, + "loss": 0.2309, + "step": 10400 + }, + { + "epoch": 4.327923429047025, + "eval_main_loss": 0.23835839331150055, + "eval_main_runtime": 6.3532, + "eval_main_samples_per_second": 29.906, + "eval_main_steps_per_second": 3.778, + "step": 10400 + }, + { + "epoch": 4.327923429047025, + "eval_anatomy_loss": 2.8327348232269287, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.508, + "eval_anatomy_steps_per_second": 3.754, + "step": 10400 + }, + { + "epoch": 4.327923429047025, + "eval_college_mathematics_loss": 2.053480625152588, + "eval_college_mathematics_runtime": 0.2671, + "eval_college_mathematics_samples_per_second": 7.487, + "eval_college_mathematics_steps_per_second": 3.744, + "step": 10400 + }, + { + "epoch": 4.327923429047025, + "eval_international_law_loss": 3.0650157928466797, + "eval_international_law_runtime": 0.2677, + "eval_international_law_samples_per_second": 7.472, + "eval_international_law_steps_per_second": 3.736, + "step": 10400 + }, + { + "epoch": 4.3362463587182685, + "grad_norm": 0.26953125, + "learning_rate": 4.019633700934334e-06, + "loss": 0.2329, + "step": 10420 + }, + { + "epoch": 4.344569288389513, + "grad_norm": 0.302734375, + "learning_rate": 4.013137406099117e-06, + "loss": 0.2306, + "step": 10440 + }, + { + "epoch": 4.352892218060758, + "grad_norm": 0.271484375, + "learning_rate": 4.006624947213272e-06, + "loss": 0.2314, + "step": 10460 + }, + { + "epoch": 4.361215147732001, + "grad_norm": 0.259765625, + "learning_rate": 4.000096393845968e-06, + "loss": 0.2319, + "step": 10480 + }, + { + "epoch": 4.369538077403246, + "grad_norm": 0.271484375, + "learning_rate": 3.993551815738307e-06, + "loss": 0.2279, + "step": 10500 + }, + { + "epoch": 4.369538077403246, + "eval_main_loss": 0.23829132318496704, + "eval_main_runtime": 6.3595, + "eval_main_samples_per_second": 29.876, + "eval_main_steps_per_second": 3.774, + "step": 10500 + }, + { + "epoch": 4.369538077403246, + "eval_anatomy_loss": 2.8348493576049805, + "eval_anatomy_runtime": 0.2666, + "eval_anatomy_samples_per_second": 7.502, + "eval_anatomy_steps_per_second": 3.751, + "step": 10500 + }, + { + "epoch": 4.369538077403246, + "eval_college_mathematics_loss": 2.051111936569214, + "eval_college_mathematics_runtime": 0.2673, + "eval_college_mathematics_samples_per_second": 7.481, + "eval_college_mathematics_steps_per_second": 3.741, + "step": 10500 + }, + { + "epoch": 4.369538077403246, + "eval_international_law_loss": 3.0652830600738525, + "eval_international_law_runtime": 0.2673, + "eval_international_law_samples_per_second": 7.483, + "eval_international_law_steps_per_second": 3.742, + "step": 10500 + }, + { + "epoch": 4.3778610070744906, + "grad_norm": 0.267578125, + "learning_rate": 3.9869912828025735e-06, + "loss": 0.2286, + "step": 10520 + }, + { + "epoch": 4.386183936745734, + "grad_norm": 0.3203125, + "learning_rate": 3.980414865121486e-06, + "loss": 0.2314, + "step": 10540 + }, + { + "epoch": 4.394506866416979, + "grad_norm": 0.2578125, + "learning_rate": 3.973822632947455e-06, + "loss": 0.2295, + "step": 10560 + }, + { + "epoch": 4.4028297960882234, + "grad_norm": 0.337890625, + "learning_rate": 3.9672146567018275e-06, + "loss": 0.2325, + "step": 10580 + }, + { + "epoch": 4.411152725759467, + "grad_norm": 0.279296875, + "learning_rate": 3.9605910069741375e-06, + "loss": 0.2316, + "step": 10600 + }, + { + "epoch": 4.411152725759467, + "eval_main_loss": 0.23840609192848206, + "eval_main_runtime": 6.3547, + "eval_main_samples_per_second": 29.899, + "eval_main_steps_per_second": 3.777, + "step": 10600 + }, + { + "epoch": 4.411152725759467, + "eval_anatomy_loss": 2.8325912952423096, + "eval_anatomy_runtime": 0.2688, + "eval_anatomy_samples_per_second": 7.442, + "eval_anatomy_steps_per_second": 3.721, + "step": 10600 + }, + { + "epoch": 4.411152725759467, + "eval_college_mathematics_loss": 2.0505118370056152, + "eval_college_mathematics_runtime": 0.2654, + "eval_college_mathematics_samples_per_second": 7.537, + "eval_college_mathematics_steps_per_second": 3.769, + "step": 10600 + }, + { + "epoch": 4.411152725759467, + "eval_international_law_loss": 3.0657410621643066, + "eval_international_law_runtime": 0.2676, + "eval_international_law_samples_per_second": 7.475, + "eval_international_law_steps_per_second": 3.738, + "step": 10600 + }, + { + "epoch": 4.419475655430712, + "grad_norm": 0.287109375, + "learning_rate": 3.953951754521348e-06, + "loss": 0.2281, + "step": 10620 + }, + { + "epoch": 4.4277985851019555, + "grad_norm": 0.3046875, + "learning_rate": 3.947296970267098e-06, + "loss": 0.231, + "step": 10640 + }, + { + "epoch": 4.4361215147732, + "grad_norm": 0.2197265625, + "learning_rate": 3.940626725300949e-06, + "loss": 0.2298, + "step": 10660 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.26953125, + "learning_rate": 3.933941090877615e-06, + "loss": 0.2314, + "step": 10680 + }, + { + "epoch": 4.452767374115688, + "grad_norm": 0.287109375, + "learning_rate": 3.927240138416212e-06, + "loss": 0.234, + "step": 10700 + }, + { + "epoch": 4.452767374115688, + "eval_main_loss": 0.2382432222366333, + "eval_main_runtime": 6.321, + "eval_main_samples_per_second": 30.059, + "eval_main_steps_per_second": 3.797, + "step": 10700 + }, + { + "epoch": 4.452767374115688, + "eval_anatomy_loss": 2.833085536956787, + "eval_anatomy_runtime": 0.2641, + "eval_anatomy_samples_per_second": 7.574, + "eval_anatomy_steps_per_second": 3.787, + "step": 10700 + }, + { + "epoch": 4.452767374115688, + "eval_college_mathematics_loss": 2.0491299629211426, + "eval_college_mathematics_runtime": 0.2669, + "eval_college_mathematics_samples_per_second": 7.494, + "eval_college_mathematics_steps_per_second": 3.747, + "step": 10700 + }, + { + "epoch": 4.452767374115688, + "eval_international_law_loss": 3.064594268798828, + "eval_international_law_runtime": 0.267, + "eval_international_law_samples_per_second": 7.492, + "eval_international_law_steps_per_second": 3.746, + "step": 10700 + }, + { + "epoch": 4.461090303786933, + "grad_norm": 0.2734375, + "learning_rate": 3.920523939499487e-06, + "loss": 0.2293, + "step": 10720 + }, + { + "epoch": 4.4694132334581775, + "grad_norm": 0.28125, + "learning_rate": 3.913792565873061e-06, + "loss": 0.2285, + "step": 10740 + }, + { + "epoch": 4.477736163129421, + "grad_norm": 0.275390625, + "learning_rate": 3.907046089444654e-06, + "loss": 0.2337, + "step": 10760 + }, + { + "epoch": 4.486059092800666, + "grad_norm": 0.34375, + "learning_rate": 3.900284582283323e-06, + "loss": 0.2325, + "step": 10780 + }, + { + "epoch": 4.49438202247191, + "grad_norm": 0.369140625, + "learning_rate": 3.8935081166186935e-06, + "loss": 0.2343, + "step": 10800 + }, + { + "epoch": 4.49438202247191, + "eval_main_loss": 0.23841159045696259, + "eval_main_runtime": 6.3194, + "eval_main_samples_per_second": 30.066, + "eval_main_steps_per_second": 3.798, + "step": 10800 + }, + { + "epoch": 4.49438202247191, + "eval_anatomy_loss": 2.8321523666381836, + "eval_anatomy_runtime": 0.2659, + "eval_anatomy_samples_per_second": 7.521, + "eval_anatomy_steps_per_second": 3.76, + "step": 10800 + }, + { + "epoch": 4.49438202247191, + "eval_college_mathematics_loss": 2.050726890563965, + "eval_college_mathematics_runtime": 0.2654, + "eval_college_mathematics_samples_per_second": 7.536, + "eval_college_mathematics_steps_per_second": 3.768, + "step": 10800 + }, + { + "epoch": 4.49438202247191, + "eval_international_law_loss": 3.0631279945373535, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.517, + "eval_international_law_steps_per_second": 3.759, + "step": 10800 + }, + { + "epoch": 4.502704952143154, + "grad_norm": 0.314453125, + "learning_rate": 3.88671676484018e-06, + "loss": 0.2325, + "step": 10820 + }, + { + "epoch": 4.511027881814399, + "grad_norm": 0.322265625, + "learning_rate": 3.87991059949622e-06, + "loss": 0.2315, + "step": 10840 + }, + { + "epoch": 4.519350811485643, + "grad_norm": 0.29296875, + "learning_rate": 3.873089693293497e-06, + "loss": 0.2325, + "step": 10860 + }, + { + "epoch": 4.527673741156887, + "grad_norm": 0.25, + "learning_rate": 3.866254119096161e-06, + "loss": 0.2298, + "step": 10880 + }, + { + "epoch": 4.535996670828132, + "grad_norm": 0.275390625, + "learning_rate": 3.8594039499250545e-06, + "loss": 0.232, + "step": 10900 + }, + { + "epoch": 4.535996670828132, + "eval_main_loss": 0.2383212000131607, + "eval_main_runtime": 6.3117, + "eval_main_samples_per_second": 30.103, + "eval_main_steps_per_second": 3.802, + "step": 10900 + }, + { + "epoch": 4.535996670828132, + "eval_anatomy_loss": 2.833933115005493, + "eval_anatomy_runtime": 0.2655, + "eval_anatomy_samples_per_second": 7.534, + "eval_anatomy_steps_per_second": 3.767, + "step": 10900 + }, + { + "epoch": 4.535996670828132, + "eval_college_mathematics_loss": 2.0495407581329346, + "eval_college_mathematics_runtime": 0.2654, + "eval_college_mathematics_samples_per_second": 7.537, + "eval_college_mathematics_steps_per_second": 3.768, + "step": 10900 + }, + { + "epoch": 4.535996670828132, + "eval_international_law_loss": 3.0634400844573975, + "eval_international_law_runtime": 0.2654, + "eval_international_law_samples_per_second": 7.537, + "eval_international_law_steps_per_second": 3.768, + "step": 10900 + }, + { + "epoch": 4.544319600499376, + "grad_norm": 0.248046875, + "learning_rate": 3.852539258956931e-06, + "loss": 0.2345, + "step": 10920 + }, + { + "epoch": 4.55264253017062, + "grad_norm": 0.302734375, + "learning_rate": 3.845660119523671e-06, + "loss": 0.2315, + "step": 10940 + }, + { + "epoch": 4.5609654598418645, + "grad_norm": 0.27734375, + "learning_rate": 3.8387666051114995e-06, + "loss": 0.232, + "step": 10960 + }, + { + "epoch": 4.569288389513108, + "grad_norm": 0.30078125, + "learning_rate": 3.831858789360206e-06, + "loss": 0.2332, + "step": 10980 + }, + { + "epoch": 4.577611319184353, + "grad_norm": 0.26171875, + "learning_rate": 3.824936746062349e-06, + "loss": 0.2349, + "step": 11000 + }, + { + "epoch": 4.577611319184353, + "eval_main_loss": 0.23827816545963287, + "eval_main_runtime": 6.3591, + "eval_main_samples_per_second": 29.878, + "eval_main_steps_per_second": 3.774, + "step": 11000 + }, + { + "epoch": 4.577611319184353, + "eval_anatomy_loss": 2.83370304107666, + "eval_anatomy_runtime": 0.2663, + "eval_anatomy_samples_per_second": 7.51, + "eval_anatomy_steps_per_second": 3.755, + "step": 11000 + }, + { + "epoch": 4.577611319184353, + "eval_college_mathematics_loss": 2.051502227783203, + "eval_college_mathematics_runtime": 0.2674, + "eval_college_mathematics_samples_per_second": 7.478, + "eval_college_mathematics_steps_per_second": 3.739, + "step": 11000 + }, + { + "epoch": 4.577611319184353, + "eval_international_law_loss": 3.0672590732574463, + "eval_international_law_runtime": 0.268, + "eval_international_law_samples_per_second": 7.463, + "eval_international_law_steps_per_second": 3.731, + "step": 11000 + }, + { + "epoch": 4.585934248855597, + "grad_norm": 0.291015625, + "learning_rate": 3.818000549162474e-06, + "loss": 0.2318, + "step": 11020 + }, + { + "epoch": 4.594257178526841, + "grad_norm": 0.30078125, + "learning_rate": 3.811050272756324e-06, + "loss": 0.2348, + "step": 11040 + }, + { + "epoch": 4.602580108198086, + "grad_norm": 0.28515625, + "learning_rate": 3.804085991090044e-06, + "loss": 0.2342, + "step": 11060 + }, + { + "epoch": 4.61090303786933, + "grad_norm": 0.3125, + "learning_rate": 3.797107778559389e-06, + "loss": 0.2307, + "step": 11080 + }, + { + "epoch": 4.619225967540574, + "grad_norm": 0.2451171875, + "learning_rate": 3.7901157097089315e-06, + "loss": 0.228, + "step": 11100 + }, + { + "epoch": 4.619225967540574, + "eval_main_loss": 0.23824092745780945, + "eval_main_runtime": 6.3522, + "eval_main_samples_per_second": 29.911, + "eval_main_steps_per_second": 3.778, + "step": 11100 + }, + { + "epoch": 4.619225967540574, + "eval_anatomy_loss": 2.834642171859741, + "eval_anatomy_runtime": 0.2679, + "eval_anatomy_samples_per_second": 7.465, + "eval_anatomy_steps_per_second": 3.732, + "step": 11100 + }, + { + "epoch": 4.619225967540574, + "eval_college_mathematics_loss": 2.054388999938965, + "eval_college_mathematics_runtime": 0.2658, + "eval_college_mathematics_samples_per_second": 7.525, + "eval_college_mathematics_steps_per_second": 3.763, + "step": 11100 + }, + { + "epoch": 4.619225967540574, + "eval_international_law_loss": 3.0648648738861084, + "eval_international_law_runtime": 0.2679, + "eval_international_law_samples_per_second": 7.465, + "eval_international_law_steps_per_second": 3.732, + "step": 11100 + }, + { + "epoch": 4.627548897211819, + "grad_norm": 0.275390625, + "learning_rate": 3.7831098592312643e-06, + "loss": 0.2368, + "step": 11120 + }, + { + "epoch": 4.635871826883063, + "grad_norm": 0.26171875, + "learning_rate": 3.7760903019662008e-06, + "loss": 0.2292, + "step": 11140 + }, + { + "epoch": 4.644194756554307, + "grad_norm": 0.298828125, + "learning_rate": 3.7690571128999775e-06, + "loss": 0.2371, + "step": 11160 + }, + { + "epoch": 4.6525176862255515, + "grad_norm": 0.2734375, + "learning_rate": 3.7620103671644516e-06, + "loss": 0.2305, + "step": 11180 + }, + { + "epoch": 4.660840615896795, + "grad_norm": 0.26953125, + "learning_rate": 3.7549501400362996e-06, + "loss": 0.2265, + "step": 11200 + }, + { + "epoch": 4.660840615896795, + "eval_main_loss": 0.2383652925491333, + "eval_main_runtime": 6.3394, + "eval_main_samples_per_second": 29.971, + "eval_main_steps_per_second": 3.786, + "step": 11200 + }, + { + "epoch": 4.660840615896795, + "eval_anatomy_loss": 2.8311378955841064, + "eval_anatomy_runtime": 0.2675, + "eval_anatomy_samples_per_second": 7.477, + "eval_anatomy_steps_per_second": 3.738, + "step": 11200 + }, + { + "epoch": 4.660840615896795, + "eval_college_mathematics_loss": 2.052487373352051, + "eval_college_mathematics_runtime": 0.267, + "eval_college_mathematics_samples_per_second": 7.492, + "eval_college_mathematics_steps_per_second": 3.746, + "step": 11200 + }, + { + "epoch": 4.660840615896795, + "eval_international_law_loss": 3.064990758895874, + "eval_international_law_runtime": 0.2666, + "eval_international_law_samples_per_second": 7.501, + "eval_international_law_steps_per_second": 3.751, + "step": 11200 + }, + { + "epoch": 4.66916354556804, + "grad_norm": 0.27734375, + "learning_rate": 3.7478765069362122e-06, + "loss": 0.2371, + "step": 11220 + }, + { + "epoch": 4.677486475239284, + "grad_norm": 0.32421875, + "learning_rate": 3.7407895434280893e-06, + "loss": 0.2293, + "step": 11240 + }, + { + "epoch": 4.685809404910529, + "grad_norm": 0.333984375, + "learning_rate": 3.7336893252182343e-06, + "loss": 0.2329, + "step": 11260 + }, + { + "epoch": 4.694132334581773, + "grad_norm": 0.28515625, + "learning_rate": 3.72657592815454e-06, + "loss": 0.2364, + "step": 11280 + }, + { + "epoch": 4.702455264253017, + "grad_norm": 0.259765625, + "learning_rate": 3.719449428225685e-06, + "loss": 0.2309, + "step": 11300 + }, + { + "epoch": 4.702455264253017, + "eval_main_loss": 0.23832421004772186, + "eval_main_runtime": 6.3244, + "eval_main_samples_per_second": 30.042, + "eval_main_steps_per_second": 3.795, + "step": 11300 + }, + { + "epoch": 4.702455264253017, + "eval_anatomy_loss": 2.8343684673309326, + "eval_anatomy_runtime": 0.2667, + "eval_anatomy_samples_per_second": 7.5, + "eval_anatomy_steps_per_second": 3.75, + "step": 11300 + }, + { + "epoch": 4.702455264253017, + "eval_college_mathematics_loss": 2.049694776535034, + "eval_college_mathematics_runtime": 0.2665, + "eval_college_mathematics_samples_per_second": 7.504, + "eval_college_mathematics_steps_per_second": 3.752, + "step": 11300 + }, + { + "epoch": 4.702455264253017, + "eval_international_law_loss": 3.0649161338806152, + "eval_international_law_runtime": 0.2654, + "eval_international_law_samples_per_second": 7.536, + "eval_international_law_steps_per_second": 3.768, + "step": 11300 + }, + { + "epoch": 4.710778193924261, + "grad_norm": 0.31640625, + "learning_rate": 3.712309901560316e-06, + "loss": 0.2332, + "step": 11320 + }, + { + "epoch": 4.719101123595506, + "grad_norm": 0.279296875, + "learning_rate": 3.7051574244262412e-06, + "loss": 0.2321, + "step": 11340 + }, + { + "epoch": 4.72742405326675, + "grad_norm": 0.22265625, + "learning_rate": 3.6979920732296085e-06, + "loss": 0.2368, + "step": 11360 + }, + { + "epoch": 4.735746982937994, + "grad_norm": 0.25, + "learning_rate": 3.690813924514095e-06, + "loss": 0.2292, + "step": 11380 + }, + { + "epoch": 4.7440699126092385, + "grad_norm": 0.283203125, + "learning_rate": 3.6836230549600853e-06, + "loss": 0.231, + "step": 11400 + }, + { + "epoch": 4.7440699126092385, + "eval_main_loss": 0.23822040855884552, + "eval_main_runtime": 6.3505, + "eval_main_samples_per_second": 29.919, + "eval_main_steps_per_second": 3.779, + "step": 11400 + }, + { + "epoch": 4.7440699126092385, + "eval_anatomy_loss": 2.8312675952911377, + "eval_anatomy_runtime": 0.2674, + "eval_anatomy_samples_per_second": 7.48, + "eval_anatomy_steps_per_second": 3.74, + "step": 11400 + }, + { + "epoch": 4.7440699126092385, + "eval_college_mathematics_loss": 2.0531413555145264, + "eval_college_mathematics_runtime": 0.2667, + "eval_college_mathematics_samples_per_second": 7.498, + "eval_college_mathematics_steps_per_second": 3.749, + "step": 11400 + }, + { + "epoch": 4.7440699126092385, + "eval_international_law_loss": 3.0663352012634277, + "eval_international_law_runtime": 0.2678, + "eval_international_law_samples_per_second": 7.467, + "eval_international_law_steps_per_second": 3.733, + "step": 11400 + }, + { + "epoch": 4.752392842280483, + "grad_norm": 0.259765625, + "learning_rate": 3.676419541383855e-06, + "loss": 0.2289, + "step": 11420 + }, + { + "epoch": 4.760715771951727, + "grad_norm": 0.2734375, + "learning_rate": 3.6692034607367486e-06, + "loss": 0.2305, + "step": 11440 + }, + { + "epoch": 4.769038701622971, + "grad_norm": 0.263671875, + "learning_rate": 3.6619748901043583e-06, + "loss": 0.2279, + "step": 11460 + }, + { + "epoch": 4.777361631294216, + "grad_norm": 0.30078125, + "learning_rate": 3.6547339067057007e-06, + "loss": 0.2382, + "step": 11480 + }, + { + "epoch": 4.78568456096546, + "grad_norm": 0.328125, + "learning_rate": 3.647480587892391e-06, + "loss": 0.2302, + "step": 11500 + }, + { + "epoch": 4.78568456096546, + "eval_main_loss": 0.23836463689804077, + "eval_main_runtime": 6.3523, + "eval_main_samples_per_second": 29.911, + "eval_main_steps_per_second": 3.778, + "step": 11500 + }, + { + "epoch": 4.78568456096546, + "eval_anatomy_loss": 2.8309946060180664, + "eval_anatomy_runtime": 0.2675, + "eval_anatomy_samples_per_second": 7.477, + "eval_anatomy_steps_per_second": 3.738, + "step": 11500 + }, + { + "epoch": 4.78568456096546, + "eval_college_mathematics_loss": 2.0509681701660156, + "eval_college_mathematics_runtime": 0.2662, + "eval_college_mathematics_samples_per_second": 7.513, + "eval_college_mathematics_steps_per_second": 3.757, + "step": 11500 + }, + { + "epoch": 4.78568456096546, + "eval_international_law_loss": 3.06599760055542, + "eval_international_law_runtime": 0.2671, + "eval_international_law_samples_per_second": 7.487, + "eval_international_law_steps_per_second": 3.743, + "step": 11500 + }, + { + "epoch": 4.794007490636704, + "grad_norm": 0.2470703125, + "learning_rate": 3.640215011147815e-06, + "loss": 0.2276, + "step": 11520 + }, + { + "epoch": 4.802330420307948, + "grad_norm": 0.2734375, + "learning_rate": 3.632937254086308e-06, + "loss": 0.2298, + "step": 11540 + }, + { + "epoch": 4.810653349979193, + "grad_norm": 0.28125, + "learning_rate": 3.6256473944523175e-06, + "loss": 0.2362, + "step": 11560 + }, + { + "epoch": 4.818976279650437, + "grad_norm": 0.255859375, + "learning_rate": 3.6183455101195785e-06, + "loss": 0.2301, + "step": 11580 + }, + { + "epoch": 4.827299209321681, + "grad_norm": 0.30078125, + "learning_rate": 3.611031679090278e-06, + "loss": 0.2313, + "step": 11600 + }, + { + "epoch": 4.827299209321681, + "eval_main_loss": 0.23831431567668915, + "eval_main_runtime": 6.3233, + "eval_main_samples_per_second": 30.048, + "eval_main_steps_per_second": 3.795, + "step": 11600 + }, + { + "epoch": 4.827299209321681, + "eval_anatomy_loss": 2.8330864906311035, + "eval_anatomy_runtime": 0.2671, + "eval_anatomy_samples_per_second": 7.489, + "eval_anatomy_steps_per_second": 3.744, + "step": 11600 + }, + { + "epoch": 4.827299209321681, + "eval_college_mathematics_loss": 2.0517561435699463, + "eval_college_mathematics_runtime": 0.2651, + "eval_college_mathematics_samples_per_second": 7.544, + "eval_college_mathematics_steps_per_second": 3.772, + "step": 11600 + }, + { + "epoch": 4.827299209321681, + "eval_international_law_loss": 3.066469669342041, + "eval_international_law_runtime": 0.2658, + "eval_international_law_samples_per_second": 7.523, + "eval_international_law_steps_per_second": 3.762, + "step": 11600 + }, + { + "epoch": 4.8356221389929255, + "grad_norm": 0.2890625, + "learning_rate": 3.603705979494225e-06, + "loss": 0.2306, + "step": 11620 + }, + { + "epoch": 4.84394506866417, + "grad_norm": 0.287109375, + "learning_rate": 3.5963684895880123e-06, + "loss": 0.229, + "step": 11640 + }, + { + "epoch": 4.852267998335414, + "grad_norm": 0.341796875, + "learning_rate": 3.589019287754183e-06, + "loss": 0.2244, + "step": 11660 + }, + { + "epoch": 4.860590928006658, + "grad_norm": 0.326171875, + "learning_rate": 3.581658452500394e-06, + "loss": 0.2329, + "step": 11680 + }, + { + "epoch": 4.868913857677903, + "grad_norm": 0.2421875, + "learning_rate": 3.574286062458574e-06, + "loss": 0.2329, + "step": 11700 + }, + { + "epoch": 4.868913857677903, + "eval_main_loss": 0.2383066862821579, + "eval_main_runtime": 6.3157, + "eval_main_samples_per_second": 30.084, + "eval_main_steps_per_second": 3.8, + "step": 11700 + }, + { + "epoch": 4.868913857677903, + "eval_anatomy_loss": 2.832721710205078, + "eval_anatomy_runtime": 0.2659, + "eval_anatomy_samples_per_second": 7.52, + "eval_anatomy_steps_per_second": 3.76, + "step": 11700 + }, + { + "epoch": 4.868913857677903, + "eval_college_mathematics_loss": 2.048736095428467, + "eval_college_mathematics_runtime": 0.2649, + "eval_college_mathematics_samples_per_second": 7.549, + "eval_college_mathematics_steps_per_second": 3.774, + "step": 11700 + }, + { + "epoch": 4.868913857677903, + "eval_international_law_loss": 3.0647518634796143, + "eval_international_law_runtime": 0.2646, + "eval_international_law_samples_per_second": 7.558, + "eval_international_law_steps_per_second": 3.779, + "step": 11700 + }, + { + "epoch": 4.877236787349147, + "grad_norm": 0.259765625, + "learning_rate": 3.5669021963840863e-06, + "loss": 0.2342, + "step": 11720 + }, + { + "epoch": 4.885559717020391, + "grad_norm": 0.26171875, + "learning_rate": 3.559506933154886e-06, + "loss": 0.2326, + "step": 11740 + }, + { + "epoch": 4.893882646691635, + "grad_norm": 0.28515625, + "learning_rate": 3.552100351770679e-06, + "loss": 0.2283, + "step": 11760 + }, + { + "epoch": 4.90220557636288, + "grad_norm": 0.265625, + "learning_rate": 3.544682531352076e-06, + "loss": 0.2359, + "step": 11780 + }, + { + "epoch": 4.910528506034124, + "grad_norm": 0.314453125, + "learning_rate": 3.53725355113975e-06, + "loss": 0.2338, + "step": 11800 + }, + { + "epoch": 4.910528506034124, + "eval_main_loss": 0.23833368718624115, + "eval_main_runtime": 6.3601, + "eval_main_samples_per_second": 29.874, + "eval_main_steps_per_second": 3.774, + "step": 11800 + }, + { + "epoch": 4.910528506034124, + "eval_anatomy_loss": 2.8294174671173096, + "eval_anatomy_runtime": 0.2674, + "eval_anatomy_samples_per_second": 7.479, + "eval_anatomy_steps_per_second": 3.74, + "step": 11800 + }, + { + "epoch": 4.910528506034124, + "eval_college_mathematics_loss": 2.054882764816284, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.495, + "eval_college_mathematics_steps_per_second": 3.747, + "step": 11800 + }, + { + "epoch": 4.910528506034124, + "eval_international_law_loss": 3.0666239261627197, + "eval_international_law_runtime": 0.2669, + "eval_international_law_samples_per_second": 7.493, + "eval_international_law_steps_per_second": 3.746, + "step": 11800 + }, + { + "epoch": 4.918851435705369, + "grad_norm": 0.2451171875, + "learning_rate": 3.529813490493586e-06, + "loss": 0.2312, + "step": 11820 + }, + { + "epoch": 4.9271743653766125, + "grad_norm": 0.298828125, + "learning_rate": 3.5223624288918368e-06, + "loss": 0.2272, + "step": 11840 + }, + { + "epoch": 4.935497295047857, + "grad_norm": 0.390625, + "learning_rate": 3.514900445930273e-06, + "loss": 0.2308, + "step": 11860 + }, + { + "epoch": 4.943820224719101, + "grad_norm": 0.306640625, + "learning_rate": 3.507427621321331e-06, + "loss": 0.2286, + "step": 11880 + }, + { + "epoch": 4.952143154390345, + "grad_norm": 0.3046875, + "learning_rate": 3.4999440348932644e-06, + "loss": 0.2363, + "step": 11900 + }, + { + "epoch": 4.952143154390345, + "eval_main_loss": 0.23830710351467133, + "eval_main_runtime": 6.3539, + "eval_main_samples_per_second": 29.903, + "eval_main_steps_per_second": 3.777, + "step": 11900 + }, + { + "epoch": 4.952143154390345, + "eval_anatomy_loss": 2.832731008529663, + "eval_anatomy_runtime": 0.2683, + "eval_anatomy_samples_per_second": 7.453, + "eval_anatomy_steps_per_second": 3.727, + "step": 11900 + }, + { + "epoch": 4.952143154390345, + "eval_college_mathematics_loss": 2.0541601181030273, + "eval_college_mathematics_runtime": 0.2669, + "eval_college_mathematics_samples_per_second": 7.492, + "eval_college_mathematics_steps_per_second": 3.746, + "step": 11900 + }, + { + "epoch": 4.952143154390345, + "eval_international_law_loss": 3.064253091812134, + "eval_international_law_runtime": 0.2674, + "eval_international_law_samples_per_second": 7.48, + "eval_international_law_steps_per_second": 3.74, + "step": 11900 + }, + { + "epoch": 4.96046608406159, + "grad_norm": 0.271484375, + "learning_rate": 3.4924497665892886e-06, + "loss": 0.2364, + "step": 11920 + }, + { + "epoch": 4.968789013732834, + "grad_norm": 0.271484375, + "learning_rate": 3.484944896466727e-06, + "loss": 0.2331, + "step": 11940 + }, + { + "epoch": 4.977111943404078, + "grad_norm": 0.2734375, + "learning_rate": 3.4774295046961593e-06, + "loss": 0.2344, + "step": 11960 + }, + { + "epoch": 4.985434873075323, + "grad_norm": 0.28515625, + "learning_rate": 3.4699036715605595e-06, + "loss": 0.2307, + "step": 11980 + }, + { + "epoch": 4.9937578027465666, + "grad_norm": 0.294921875, + "learning_rate": 3.4623674774544435e-06, + "loss": 0.2271, + "step": 12000 + }, + { + "epoch": 4.9937578027465666, + "eval_main_loss": 0.23841014504432678, + "eval_main_runtime": 6.3245, + "eval_main_samples_per_second": 30.042, + "eval_main_steps_per_second": 3.795, + "step": 12000 + }, + { + "epoch": 4.9937578027465666, + "eval_anatomy_loss": 2.8334553241729736, + "eval_anatomy_runtime": 0.2654, + "eval_anatomy_samples_per_second": 7.535, + "eval_anatomy_steps_per_second": 3.767, + "step": 12000 + }, + { + "epoch": 4.9937578027465666, + "eval_college_mathematics_loss": 2.050748348236084, + "eval_college_mathematics_runtime": 0.2648, + "eval_college_mathematics_samples_per_second": 7.553, + "eval_college_mathematics_steps_per_second": 3.776, + "step": 12000 + }, + { + "epoch": 4.9937578027465666, + "eval_international_law_loss": 3.066818952560425, + "eval_international_law_runtime": 0.2674, + "eval_international_law_samples_per_second": 7.479, + "eval_international_law_steps_per_second": 3.74, + "step": 12000 + }, + { + "epoch": 5.002080732417811, + "grad_norm": 0.267578125, + "learning_rate": 3.454821002883007e-06, + "loss": 0.2312, + "step": 12020 + }, + { + "epoch": 5.010403662089056, + "grad_norm": 0.29296875, + "learning_rate": 3.4472643284612656e-06, + "loss": 0.2312, + "step": 12040 + }, + { + "epoch": 5.0187265917602994, + "grad_norm": 0.287109375, + "learning_rate": 3.439697534913197e-06, + "loss": 0.2339, + "step": 12060 + }, + { + "epoch": 5.027049521431544, + "grad_norm": 0.23046875, + "learning_rate": 3.4321207030708725e-06, + "loss": 0.2329, + "step": 12080 + }, + { + "epoch": 5.035372451102788, + "grad_norm": 0.2734375, + "learning_rate": 3.4245339138736023e-06, + "loss": 0.2298, + "step": 12100 + }, + { + "epoch": 5.035372451102788, + "eval_main_loss": 0.23836223781108856, + "eval_main_runtime": 6.3231, + "eval_main_samples_per_second": 30.049, + "eval_main_steps_per_second": 3.796, + "step": 12100 + }, + { + "epoch": 5.035372451102788, + "eval_anatomy_loss": 2.8340742588043213, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.507, + "eval_anatomy_steps_per_second": 3.754, + "step": 12100 + }, + { + "epoch": 5.035372451102788, + "eval_college_mathematics_loss": 2.0523436069488525, + "eval_college_mathematics_runtime": 0.2651, + "eval_college_mathematics_samples_per_second": 7.545, + "eval_college_mathematics_steps_per_second": 3.773, + "step": 12100 + }, + { + "epoch": 5.035372451102788, + "eval_international_law_loss": 3.0669960975646973, + "eval_international_law_runtime": 0.2656, + "eval_international_law_samples_per_second": 7.529, + "eval_international_law_steps_per_second": 3.765, + "step": 12100 + }, + { + "epoch": 5.043695380774032, + "grad_norm": 0.255859375, + "learning_rate": 3.416937248367061e-06, + "loss": 0.2311, + "step": 12120 + }, + { + "epoch": 5.052018310445277, + "grad_norm": 0.25390625, + "learning_rate": 3.409330787702428e-06, + "loss": 0.2297, + "step": 12140 + }, + { + "epoch": 5.060341240116521, + "grad_norm": 0.263671875, + "learning_rate": 3.4017146131355205e-06, + "loss": 0.2329, + "step": 12160 + }, + { + "epoch": 5.068664169787765, + "grad_norm": 0.267578125, + "learning_rate": 3.394088806025925e-06, + "loss": 0.2279, + "step": 12180 + }, + { + "epoch": 5.07698709945901, + "grad_norm": 0.267578125, + "learning_rate": 3.3864534478361235e-06, + "loss": 0.2314, + "step": 12200 + }, + { + "epoch": 5.07698709945901, + "eval_main_loss": 0.2382633239030838, + "eval_main_runtime": 6.3506, + "eval_main_samples_per_second": 29.919, + "eval_main_steps_per_second": 3.779, + "step": 12200 + }, + { + "epoch": 5.07698709945901, + "eval_anatomy_loss": 2.834725856781006, + "eval_anatomy_runtime": 0.2681, + "eval_anatomy_samples_per_second": 7.461, + "eval_anatomy_steps_per_second": 3.73, + "step": 12200 + }, + { + "epoch": 5.07698709945901, + "eval_college_mathematics_loss": 2.052021026611328, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.511, + "eval_college_mathematics_steps_per_second": 3.756, + "step": 12200 + }, + { + "epoch": 5.07698709945901, + "eval_international_law_loss": 3.0643866062164307, + "eval_international_law_runtime": 0.2663, + "eval_international_law_samples_per_second": 7.51, + "eval_international_law_steps_per_second": 3.755, + "step": 12200 + }, + { + "epoch": 5.0853100291302535, + "grad_norm": 0.2890625, + "learning_rate": 3.3788086201306295e-06, + "loss": 0.2321, + "step": 12220 + }, + { + "epoch": 5.093632958801498, + "grad_norm": 0.263671875, + "learning_rate": 3.371154404575116e-06, + "loss": 0.2346, + "step": 12240 + }, + { + "epoch": 5.101955888472743, + "grad_norm": 0.33203125, + "learning_rate": 3.3634908829355384e-06, + "loss": 0.2298, + "step": 12260 + }, + { + "epoch": 5.110278818143986, + "grad_norm": 0.330078125, + "learning_rate": 3.3558181370772657e-06, + "loss": 0.2324, + "step": 12280 + }, + { + "epoch": 5.118601747815231, + "grad_norm": 0.236328125, + "learning_rate": 3.3481362489642055e-06, + "loss": 0.2314, + "step": 12300 + }, + { + "epoch": 5.118601747815231, + "eval_main_loss": 0.23837122321128845, + "eval_main_runtime": 6.354, + "eval_main_samples_per_second": 29.902, + "eval_main_steps_per_second": 3.777, + "step": 12300 + }, + { + "epoch": 5.118601747815231, + "eval_anatomy_loss": 2.833470582962036, + "eval_anatomy_runtime": 0.2671, + "eval_anatomy_samples_per_second": 7.488, + "eval_anatomy_steps_per_second": 3.744, + "step": 12300 + }, + { + "epoch": 5.118601747815231, + "eval_college_mathematics_loss": 2.0529050827026367, + "eval_college_mathematics_runtime": 0.2665, + "eval_college_mathematics_samples_per_second": 7.505, + "eval_college_mathematics_steps_per_second": 3.752, + "step": 12300 + }, + { + "epoch": 5.118601747815231, + "eval_international_law_loss": 3.0643563270568848, + "eval_international_law_runtime": 0.2685, + "eval_international_law_samples_per_second": 7.45, + "eval_international_law_steps_per_second": 3.725, + "step": 12300 + }, + { + "epoch": 5.126924677486476, + "grad_norm": 0.310546875, + "learning_rate": 3.340445300657924e-06, + "loss": 0.2306, + "step": 12320 + }, + { + "epoch": 5.135247607157719, + "grad_norm": 0.267578125, + "learning_rate": 3.3327453743167763e-06, + "loss": 0.2299, + "step": 12340 + }, + { + "epoch": 5.143570536828964, + "grad_norm": 0.27734375, + "learning_rate": 3.3250365521950212e-06, + "loss": 0.2334, + "step": 12360 + }, + { + "epoch": 5.1518934665002085, + "grad_norm": 0.24609375, + "learning_rate": 3.317318916641952e-06, + "loss": 0.2327, + "step": 12380 + }, + { + "epoch": 5.160216396171452, + "grad_norm": 0.267578125, + "learning_rate": 3.309592550101005e-06, + "loss": 0.2314, + "step": 12400 + }, + { + "epoch": 5.160216396171452, + "eval_main_loss": 0.2383902668952942, + "eval_main_runtime": 6.3193, + "eval_main_samples_per_second": 30.067, + "eval_main_steps_per_second": 3.798, + "step": 12400 + }, + { + "epoch": 5.160216396171452, + "eval_anatomy_loss": 2.8342981338500977, + "eval_anatomy_runtime": 0.2654, + "eval_anatomy_samples_per_second": 7.536, + "eval_anatomy_steps_per_second": 3.768, + "step": 12400 + }, + { + "epoch": 5.160216396171452, + "eval_college_mathematics_loss": 2.054227113723755, + "eval_college_mathematics_runtime": 0.2669, + "eval_college_mathematics_samples_per_second": 7.493, + "eval_college_mathematics_steps_per_second": 3.746, + "step": 12400 + }, + { + "epoch": 5.160216396171452, + "eval_international_law_loss": 3.067544937133789, + "eval_international_law_runtime": 0.2651, + "eval_international_law_samples_per_second": 7.543, + "eval_international_law_steps_per_second": 3.772, + "step": 12400 + }, + { + "epoch": 5.168539325842697, + "grad_norm": 0.2734375, + "learning_rate": 3.3018575351088894e-06, + "loss": 0.2336, + "step": 12420 + }, + { + "epoch": 5.1768622555139405, + "grad_norm": 0.310546875, + "learning_rate": 3.2941139542946996e-06, + "loss": 0.2368, + "step": 12440 + }, + { + "epoch": 5.185185185185185, + "grad_norm": 0.279296875, + "learning_rate": 3.2863618903790346e-06, + "loss": 0.2295, + "step": 12460 + }, + { + "epoch": 5.19350811485643, + "grad_norm": 0.267578125, + "learning_rate": 3.2786014261731138e-06, + "loss": 0.2322, + "step": 12480 + }, + { + "epoch": 5.201831044527673, + "grad_norm": 0.30859375, + "learning_rate": 3.270832644577891e-06, + "loss": 0.2365, + "step": 12500 + }, + { + "epoch": 5.201831044527673, + "eval_main_loss": 0.23834168910980225, + "eval_main_runtime": 6.3151, + "eval_main_samples_per_second": 30.087, + "eval_main_steps_per_second": 3.8, + "step": 12500 + }, + { + "epoch": 5.201831044527673, + "eval_anatomy_loss": 2.833904981613159, + "eval_anatomy_runtime": 0.2651, + "eval_anatomy_samples_per_second": 7.543, + "eval_anatomy_steps_per_second": 3.772, + "step": 12500 + }, + { + "epoch": 5.201831044527673, + "eval_college_mathematics_loss": 2.05311918258667, + "eval_college_mathematics_runtime": 0.2671, + "eval_college_mathematics_samples_per_second": 7.489, + "eval_college_mathematics_steps_per_second": 3.744, + "step": 12500 + }, + { + "epoch": 5.201831044527673, + "eval_international_law_loss": 3.065603494644165, + "eval_international_law_runtime": 0.2657, + "eval_international_law_samples_per_second": 7.527, + "eval_international_law_steps_per_second": 3.764, + "step": 12500 + }, + { + "epoch": 5.210153974198918, + "grad_norm": 0.26171875, + "learning_rate": 3.263055628583174e-06, + "loss": 0.2315, + "step": 12520 + }, + { + "epoch": 5.218476903870163, + "grad_norm": 0.33203125, + "learning_rate": 3.25527046126673e-06, + "loss": 0.2295, + "step": 12540 + }, + { + "epoch": 5.226799833541406, + "grad_norm": 0.271484375, + "learning_rate": 3.247477225793406e-06, + "loss": 0.2349, + "step": 12560 + }, + { + "epoch": 5.235122763212651, + "grad_norm": 0.26953125, + "learning_rate": 3.239676005414234e-06, + "loss": 0.231, + "step": 12580 + }, + { + "epoch": 5.2434456928838955, + "grad_norm": 0.251953125, + "learning_rate": 3.231866883465548e-06, + "loss": 0.2316, + "step": 12600 + }, + { + "epoch": 5.2434456928838955, + "eval_main_loss": 0.2384309619665146, + "eval_main_runtime": 6.3556, + "eval_main_samples_per_second": 29.895, + "eval_main_steps_per_second": 3.776, + "step": 12600 + }, + { + "epoch": 5.2434456928838955, + "eval_anatomy_loss": 2.8339719772338867, + "eval_anatomy_runtime": 0.266, + "eval_anatomy_samples_per_second": 7.518, + "eval_anatomy_steps_per_second": 3.759, + "step": 12600 + }, + { + "epoch": 5.2434456928838955, + "eval_college_mathematics_loss": 2.048271417617798, + "eval_college_mathematics_runtime": 0.2672, + "eval_college_mathematics_samples_per_second": 7.486, + "eval_college_mathematics_steps_per_second": 3.743, + "step": 12600 + }, + { + "epoch": 5.2434456928838955, + "eval_international_law_loss": 3.0657992362976074, + "eval_international_law_runtime": 0.2694, + "eval_international_law_samples_per_second": 7.425, + "eval_international_law_steps_per_second": 3.712, + "step": 12600 + }, + { + "epoch": 5.251768622555139, + "grad_norm": 0.28125, + "learning_rate": 3.2240499433680866e-06, + "loss": 0.2337, + "step": 12620 + }, + { + "epoch": 5.260091552226384, + "grad_norm": 0.275390625, + "learning_rate": 3.2162252686261077e-06, + "loss": 0.2284, + "step": 12640 + }, + { + "epoch": 5.2684144818976275, + "grad_norm": 0.302734375, + "learning_rate": 3.2083929428264938e-06, + "loss": 0.2318, + "step": 12660 + }, + { + "epoch": 5.276737411568872, + "grad_norm": 0.30078125, + "learning_rate": 3.2005530496378596e-06, + "loss": 0.2267, + "step": 12680 + }, + { + "epoch": 5.285060341240117, + "grad_norm": 0.271484375, + "learning_rate": 3.1927056728096582e-06, + "loss": 0.235, + "step": 12700 + }, + { + "epoch": 5.285060341240117, + "eval_main_loss": 0.23832783102989197, + "eval_main_runtime": 6.3472, + "eval_main_samples_per_second": 29.934, + "eval_main_steps_per_second": 3.781, + "step": 12700 + }, + { + "epoch": 5.285060341240117, + "eval_anatomy_loss": 2.834886312484741, + "eval_anatomy_runtime": 0.2666, + "eval_anatomy_samples_per_second": 7.503, + "eval_anatomy_steps_per_second": 3.751, + "step": 12700 + }, + { + "epoch": 5.285060341240117, + "eval_college_mathematics_loss": 2.0527701377868652, + "eval_college_mathematics_runtime": 0.2661, + "eval_college_mathematics_samples_per_second": 7.515, + "eval_college_mathematics_steps_per_second": 3.758, + "step": 12700 + }, + { + "epoch": 5.285060341240117, + "eval_international_law_loss": 3.0642306804656982, + "eval_international_law_runtime": 0.2678, + "eval_international_law_samples_per_second": 7.468, + "eval_international_law_steps_per_second": 3.734, + "step": 12700 + }, + { + "epoch": 5.29338327091136, + "grad_norm": 0.2734375, + "learning_rate": 3.184850896171288e-06, + "loss": 0.2332, + "step": 12720 + }, + { + "epoch": 5.301706200582605, + "grad_norm": 0.26171875, + "learning_rate": 3.1769888036311924e-06, + "loss": 0.2293, + "step": 12740 + }, + { + "epoch": 5.31002913025385, + "grad_norm": 0.26953125, + "learning_rate": 3.1691194791759693e-06, + "loss": 0.2336, + "step": 12760 + }, + { + "epoch": 5.318352059925093, + "grad_norm": 0.296875, + "learning_rate": 3.161243006869471e-06, + "loss": 0.2298, + "step": 12780 + }, + { + "epoch": 5.326674989596338, + "grad_norm": 0.3125, + "learning_rate": 3.1533594708519067e-06, + "loss": 0.2353, + "step": 12800 + }, + { + "epoch": 5.326674989596338, + "eval_main_loss": 0.23834091424942017, + "eval_main_runtime": 6.345, + "eval_main_samples_per_second": 29.945, + "eval_main_steps_per_second": 3.783, + "step": 12800 + }, + { + "epoch": 5.326674989596338, + "eval_anatomy_loss": 2.8317482471466064, + "eval_anatomy_runtime": 0.2676, + "eval_anatomy_samples_per_second": 7.474, + "eval_anatomy_steps_per_second": 3.737, + "step": 12800 + }, + { + "epoch": 5.326674989596338, + "eval_college_mathematics_loss": 2.0519726276397705, + "eval_college_mathematics_runtime": 0.2665, + "eval_college_mathematics_samples_per_second": 7.506, + "eval_college_mathematics_steps_per_second": 3.753, + "step": 12800 + }, + { + "epoch": 5.326674989596338, + "eval_international_law_loss": 3.0644750595092773, + "eval_international_law_runtime": 0.2671, + "eval_international_law_samples_per_second": 7.487, + "eval_international_law_steps_per_second": 3.744, + "step": 12800 + }, + { + "epoch": 5.3349979192675825, + "grad_norm": 0.279296875, + "learning_rate": 3.145468955338942e-06, + "loss": 0.2312, + "step": 12820 + }, + { + "epoch": 5.343320848938826, + "grad_norm": 0.279296875, + "learning_rate": 3.1375715446208014e-06, + "loss": 0.2297, + "step": 12840 + }, + { + "epoch": 5.351643778610071, + "grad_norm": 0.275390625, + "learning_rate": 3.129667323061369e-06, + "loss": 0.2341, + "step": 12860 + }, + { + "epoch": 5.359966708281315, + "grad_norm": 0.30859375, + "learning_rate": 3.1217563750972827e-06, + "loss": 0.232, + "step": 12880 + }, + { + "epoch": 5.368289637952559, + "grad_norm": 0.265625, + "learning_rate": 3.1138387852370385e-06, + "loss": 0.2255, + "step": 12900 + }, + { + "epoch": 5.368289637952559, + "eval_main_loss": 0.23833264410495758, + "eval_main_runtime": 6.3508, + "eval_main_samples_per_second": 29.917, + "eval_main_steps_per_second": 3.779, + "step": 12900 + }, + { + "epoch": 5.368289637952559, + "eval_anatomy_loss": 2.8347291946411133, + "eval_anatomy_runtime": 0.2672, + "eval_anatomy_samples_per_second": 7.484, + "eval_anatomy_steps_per_second": 3.742, + "step": 12900 + }, + { + "epoch": 5.368289637952559, + "eval_college_mathematics_loss": 2.050865411758423, + "eval_college_mathematics_runtime": 0.267, + "eval_college_mathematics_samples_per_second": 7.49, + "eval_college_mathematics_steps_per_second": 3.745, + "step": 12900 + }, + { + "epoch": 5.368289637952559, + "eval_international_law_loss": 3.068070411682129, + "eval_international_law_runtime": 0.2669, + "eval_international_law_samples_per_second": 7.494, + "eval_international_law_steps_per_second": 3.747, + "step": 12900 + }, + { + "epoch": 5.376612567623804, + "grad_norm": 0.27734375, + "learning_rate": 3.10591463806008e-06, + "loss": 0.2328, + "step": 12920 + }, + { + "epoch": 5.384935497295048, + "grad_norm": 0.29296875, + "learning_rate": 3.0979840182159033e-06, + "loss": 0.2346, + "step": 12940 + }, + { + "epoch": 5.393258426966292, + "grad_norm": 0.275390625, + "learning_rate": 3.0900470104231456e-06, + "loss": 0.2319, + "step": 12960 + }, + { + "epoch": 5.401581356637537, + "grad_norm": 0.2890625, + "learning_rate": 3.0821036994686837e-06, + "loss": 0.2343, + "step": 12980 + }, + { + "epoch": 5.40990428630878, + "grad_norm": 0.259765625, + "learning_rate": 3.074154170206731e-06, + "loss": 0.2299, + "step": 13000 + }, + { + "epoch": 5.40990428630878, + "eval_main_loss": 0.23825454711914062, + "eval_main_runtime": 6.3483, + "eval_main_samples_per_second": 29.929, + "eval_main_steps_per_second": 3.781, + "step": 13000 + }, + { + "epoch": 5.40990428630878, + "eval_anatomy_loss": 2.83435320854187, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.505, + "eval_anatomy_steps_per_second": 3.752, + "step": 13000 + }, + { + "epoch": 5.40990428630878, + "eval_college_mathematics_loss": 2.0521814823150635, + "eval_college_mathematics_runtime": 0.2672, + "eval_college_mathematics_samples_per_second": 7.485, + "eval_college_mathematics_steps_per_second": 3.743, + "step": 13000 + }, + { + "epoch": 5.40990428630878, + "eval_international_law_loss": 3.065732479095459, + "eval_international_law_runtime": 0.2669, + "eval_international_law_samples_per_second": 7.494, + "eval_international_law_steps_per_second": 3.747, + "step": 13000 + }, + { + "epoch": 5.418227215980025, + "grad_norm": 0.28125, + "learning_rate": 3.066198507557923e-06, + "loss": 0.2285, + "step": 13020 + }, + { + "epoch": 5.4265501456512695, + "grad_norm": 0.259765625, + "learning_rate": 3.0582367965084183e-06, + "loss": 0.2353, + "step": 13040 + }, + { + "epoch": 5.434873075322513, + "grad_norm": 0.328125, + "learning_rate": 3.0502691221089846e-06, + "loss": 0.2346, + "step": 13060 + }, + { + "epoch": 5.443196004993758, + "grad_norm": 0.27734375, + "learning_rate": 3.042295569474096e-06, + "loss": 0.2344, + "step": 13080 + }, + { + "epoch": 5.451518934665002, + "grad_norm": 0.287109375, + "learning_rate": 3.03431622378102e-06, + "loss": 0.2324, + "step": 13100 + }, + { + "epoch": 5.451518934665002, + "eval_main_loss": 0.23835337162017822, + "eval_main_runtime": 6.3367, + "eval_main_samples_per_second": 29.984, + "eval_main_steps_per_second": 3.787, + "step": 13100 + }, + { + "epoch": 5.451518934665002, + "eval_anatomy_loss": 2.835411787033081, + "eval_anatomy_runtime": 0.2674, + "eval_anatomy_samples_per_second": 7.479, + "eval_anatomy_steps_per_second": 3.74, + "step": 13100 + }, + { + "epoch": 5.451518934665002, + "eval_college_mathematics_loss": 2.0492405891418457, + "eval_college_mathematics_runtime": 0.2657, + "eval_college_mathematics_samples_per_second": 7.527, + "eval_college_mathematics_steps_per_second": 3.764, + "step": 13100 + }, + { + "epoch": 5.451518934665002, + "eval_international_law_loss": 3.0660183429718018, + "eval_international_law_runtime": 0.2667, + "eval_international_law_samples_per_second": 7.499, + "eval_international_law_steps_per_second": 3.75, + "step": 13100 + }, + { + "epoch": 5.459841864336246, + "grad_norm": 0.3125, + "learning_rate": 3.0263311702689045e-06, + "loss": 0.2356, + "step": 13120 + }, + { + "epoch": 5.468164794007491, + "grad_norm": 0.298828125, + "learning_rate": 3.018340494237878e-06, + "loss": 0.2302, + "step": 13140 + }, + { + "epoch": 5.476487723678735, + "grad_norm": 0.287109375, + "learning_rate": 3.010344281048125e-06, + "loss": 0.2322, + "step": 13160 + }, + { + "epoch": 5.484810653349979, + "grad_norm": 0.279296875, + "learning_rate": 3.0023426161189828e-06, + "loss": 0.2329, + "step": 13180 + }, + { + "epoch": 5.493133583021224, + "grad_norm": 0.31640625, + "learning_rate": 2.994335584928028e-06, + "loss": 0.233, + "step": 13200 + }, + { + "epoch": 5.493133583021224, + "eval_main_loss": 0.23826710879802704, + "eval_main_runtime": 6.3331, + "eval_main_samples_per_second": 30.001, + "eval_main_steps_per_second": 3.79, + "step": 13200 + }, + { + "epoch": 5.493133583021224, + "eval_anatomy_loss": 2.8339459896087646, + "eval_anatomy_runtime": 0.2666, + "eval_anatomy_samples_per_second": 7.501, + "eval_anatomy_steps_per_second": 3.751, + "step": 13200 + }, + { + "epoch": 5.493133583021224, + "eval_college_mathematics_loss": 2.051870107650757, + "eval_college_mathematics_runtime": 0.2669, + "eval_college_mathematics_samples_per_second": 7.494, + "eval_college_mathematics_steps_per_second": 3.747, + "step": 13200 + }, + { + "epoch": 5.493133583021224, + "eval_international_law_loss": 3.066417694091797, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.517, + "eval_international_law_steps_per_second": 3.758, + "step": 13200 + }, + { + "epoch": 5.501456512692467, + "grad_norm": 0.283203125, + "learning_rate": 2.9863232730101616e-06, + "loss": 0.231, + "step": 13220 + }, + { + "epoch": 5.509779442363712, + "grad_norm": 0.314453125, + "learning_rate": 2.9783057659566945e-06, + "loss": 0.2367, + "step": 13240 + }, + { + "epoch": 5.5181023720349565, + "grad_norm": 0.267578125, + "learning_rate": 2.9702831494144354e-06, + "loss": 0.2299, + "step": 13260 + }, + { + "epoch": 5.526425301706201, + "grad_norm": 0.3203125, + "learning_rate": 2.9622555090847756e-06, + "loss": 0.2303, + "step": 13280 + }, + { + "epoch": 5.534748231377445, + "grad_norm": 0.232421875, + "learning_rate": 2.954222930722771e-06, + "loss": 0.2281, + "step": 13300 + }, + { + "epoch": 5.534748231377445, + "eval_main_loss": 0.23840035498142242, + "eval_main_runtime": 6.3399, + "eval_main_samples_per_second": 29.969, + "eval_main_steps_per_second": 3.786, + "step": 13300 + }, + { + "epoch": 5.534748231377445, + "eval_anatomy_loss": 2.832589864730835, + "eval_anatomy_runtime": 0.2666, + "eval_anatomy_samples_per_second": 7.501, + "eval_anatomy_steps_per_second": 3.751, + "step": 13300 + }, + { + "epoch": 5.534748231377445, + "eval_college_mathematics_loss": 2.051084518432617, + "eval_college_mathematics_runtime": 0.2678, + "eval_college_mathematics_samples_per_second": 7.468, + "eval_college_mathematics_steps_per_second": 3.734, + "step": 13300 + }, + { + "epoch": 5.534748231377445, + "eval_international_law_loss": 3.0671823024749756, + "eval_international_law_runtime": 0.2664, + "eval_international_law_samples_per_second": 7.509, + "eval_international_law_steps_per_second": 3.754, + "step": 13300 + }, + { + "epoch": 5.543071161048689, + "grad_norm": 0.2734375, + "learning_rate": 2.9461855001362298e-06, + "loss": 0.2315, + "step": 13320 + }, + { + "epoch": 5.551394090719933, + "grad_norm": 0.2060546875, + "learning_rate": 2.9381433031847946e-06, + "loss": 0.2309, + "step": 13340 + }, + { + "epoch": 5.559717020391178, + "grad_norm": 0.294921875, + "learning_rate": 2.9300964257790215e-06, + "loss": 0.2326, + "step": 13360 + }, + { + "epoch": 5.568039950062422, + "grad_norm": 0.25, + "learning_rate": 2.9220449538794676e-06, + "loss": 0.2342, + "step": 13380 + }, + { + "epoch": 5.576362879733666, + "grad_norm": 0.3125, + "learning_rate": 2.9139889734957698e-06, + "loss": 0.23, + "step": 13400 + }, + { + "epoch": 5.576362879733666, + "eval_main_loss": 0.23834313452243805, + "eval_main_runtime": 6.3321, + "eval_main_samples_per_second": 30.006, + "eval_main_steps_per_second": 3.79, + "step": 13400 + }, + { + "epoch": 5.576362879733666, + "eval_anatomy_loss": 2.8331494331359863, + "eval_anatomy_runtime": 0.2663, + "eval_anatomy_samples_per_second": 7.512, + "eval_anatomy_steps_per_second": 3.756, + "step": 13400 + }, + { + "epoch": 5.576362879733666, + "eval_college_mathematics_loss": 2.0503456592559814, + "eval_college_mathematics_runtime": 0.2656, + "eval_college_mathematics_samples_per_second": 7.529, + "eval_college_mathematics_steps_per_second": 3.764, + "step": 13400 + }, + { + "epoch": 5.576362879733666, + "eval_international_law_loss": 3.065586805343628, + "eval_international_law_runtime": 0.2652, + "eval_international_law_samples_per_second": 7.542, + "eval_international_law_steps_per_second": 3.771, + "step": 13400 + }, + { + "epoch": 5.5846858094049106, + "grad_norm": 0.296875, + "learning_rate": 2.9059285706857287e-06, + "loss": 0.2328, + "step": 13420 + }, + { + "epoch": 5.593008739076155, + "grad_norm": 0.322265625, + "learning_rate": 2.897863831554385e-06, + "loss": 0.2294, + "step": 13440 + }, + { + "epoch": 5.601331668747399, + "grad_norm": 0.29296875, + "learning_rate": 2.889794842253102e-06, + "loss": 0.2362, + "step": 13460 + }, + { + "epoch": 5.6096545984186434, + "grad_norm": 0.337890625, + "learning_rate": 2.881721688978647e-06, + "loss": 0.2309, + "step": 13480 + }, + { + "epoch": 5.617977528089888, + "grad_norm": 0.279296875, + "learning_rate": 2.8736444579722665e-06, + "loss": 0.2347, + "step": 13500 + }, + { + "epoch": 5.617977528089888, + "eval_main_loss": 0.238357812166214, + "eval_main_runtime": 6.3227, + "eval_main_samples_per_second": 30.051, + "eval_main_steps_per_second": 3.796, + "step": 13500 + }, + { + "epoch": 5.617977528089888, + "eval_anatomy_loss": 2.834318161010742, + "eval_anatomy_runtime": 0.2663, + "eval_anatomy_samples_per_second": 7.511, + "eval_anatomy_steps_per_second": 3.756, + "step": 13500 + }, + { + "epoch": 5.617977528089888, + "eval_college_mathematics_loss": 2.0517024993896484, + "eval_college_mathematics_runtime": 0.2655, + "eval_college_mathematics_samples_per_second": 7.532, + "eval_college_mathematics_steps_per_second": 3.766, + "step": 13500 + }, + { + "epoch": 5.617977528089888, + "eval_international_law_loss": 3.0657360553741455, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.506, + "eval_international_law_steps_per_second": 3.753, + "step": 13500 + }, + { + "epoch": 5.626300457761132, + "grad_norm": 0.2890625, + "learning_rate": 2.865563235518772e-06, + "loss": 0.2299, + "step": 13520 + }, + { + "epoch": 5.634623387432376, + "grad_norm": 0.259765625, + "learning_rate": 2.8574781079456065e-06, + "loss": 0.2336, + "step": 13540 + }, + { + "epoch": 5.64294631710362, + "grad_norm": 0.251953125, + "learning_rate": 2.8493891616219354e-06, + "loss": 0.2347, + "step": 13560 + }, + { + "epoch": 5.651269246774865, + "grad_norm": 0.23046875, + "learning_rate": 2.841296482957715e-06, + "loss": 0.2323, + "step": 13580 + }, + { + "epoch": 5.659592176446109, + "grad_norm": 0.23046875, + "learning_rate": 2.8332001584027724e-06, + "loss": 0.233, + "step": 13600 + }, + { + "epoch": 5.659592176446109, + "eval_main_loss": 0.23819133639335632, + "eval_main_runtime": 6.333, + "eval_main_samples_per_second": 30.002, + "eval_main_steps_per_second": 3.79, + "step": 13600 + }, + { + "epoch": 5.659592176446109, + "eval_anatomy_loss": 2.8354241847991943, + "eval_anatomy_runtime": 0.2659, + "eval_anatomy_samples_per_second": 7.522, + "eval_anatomy_steps_per_second": 3.761, + "step": 13600 + }, + { + "epoch": 5.659592176446109, + "eval_college_mathematics_loss": 2.0492091178894043, + "eval_college_mathematics_runtime": 0.2664, + "eval_college_mathematics_samples_per_second": 7.508, + "eval_college_mathematics_steps_per_second": 3.754, + "step": 13600 + }, + { + "epoch": 5.659592176446109, + "eval_international_law_loss": 3.063929557800293, + "eval_international_law_runtime": 0.266, + "eval_international_law_samples_per_second": 7.52, + "eval_international_law_steps_per_second": 3.76, + "step": 13600 + }, + { + "epoch": 5.667915106117353, + "grad_norm": 0.2392578125, + "learning_rate": 2.825100274445882e-06, + "loss": 0.2308, + "step": 13620 + }, + { + "epoch": 5.6762380357885975, + "grad_norm": 0.2890625, + "learning_rate": 2.8169969176138435e-06, + "loss": 0.2305, + "step": 13640 + }, + { + "epoch": 5.684560965459842, + "grad_norm": 0.271484375, + "learning_rate": 2.808890174470551e-06, + "loss": 0.2316, + "step": 13660 + }, + { + "epoch": 5.692883895131086, + "grad_norm": 0.302734375, + "learning_rate": 2.8007801316160767e-06, + "loss": 0.2328, + "step": 13680 + }, + { + "epoch": 5.70120682480233, + "grad_norm": 0.2421875, + "learning_rate": 2.79266687568574e-06, + "loss": 0.2303, + "step": 13700 + }, + { + "epoch": 5.70120682480233, + "eval_main_loss": 0.23830586671829224, + "eval_main_runtime": 6.3319, + "eval_main_samples_per_second": 30.007, + "eval_main_steps_per_second": 3.79, + "step": 13700 + }, + { + "epoch": 5.70120682480233, + "eval_anatomy_loss": 2.833200216293335, + "eval_anatomy_runtime": 0.2674, + "eval_anatomy_samples_per_second": 7.48, + "eval_anatomy_steps_per_second": 3.74, + "step": 13700 + }, + { + "epoch": 5.70120682480233, + "eval_college_mathematics_loss": 2.051815986633301, + "eval_college_mathematics_runtime": 0.2651, + "eval_college_mathematics_samples_per_second": 7.543, + "eval_college_mathematics_steps_per_second": 3.772, + "step": 13700 + }, + { + "epoch": 5.70120682480233, + "eval_international_law_loss": 3.0680174827575684, + "eval_international_law_runtime": 0.2674, + "eval_international_law_samples_per_second": 7.478, + "eval_international_law_steps_per_second": 3.739, + "step": 13700 + }, + { + "epoch": 5.709529754473575, + "grad_norm": 0.25, + "learning_rate": 2.784550493349185e-06, + "loss": 0.2286, + "step": 13720 + }, + { + "epoch": 5.717852684144819, + "grad_norm": 0.271484375, + "learning_rate": 2.776431071309453e-06, + "loss": 0.227, + "step": 13740 + }, + { + "epoch": 5.726175613816063, + "grad_norm": 0.3046875, + "learning_rate": 2.7683086963020566e-06, + "loss": 0.2313, + "step": 13760 + }, + { + "epoch": 5.734498543487308, + "grad_norm": 0.25390625, + "learning_rate": 2.7601834550940538e-06, + "loss": 0.2275, + "step": 13780 + }, + { + "epoch": 5.742821473158552, + "grad_norm": 0.2490234375, + "learning_rate": 2.7520554344831194e-06, + "loss": 0.2313, + "step": 13800 + }, + { + "epoch": 5.742821473158552, + "eval_main_loss": 0.23832766711711884, + "eval_main_runtime": 6.3298, + "eval_main_samples_per_second": 30.017, + "eval_main_steps_per_second": 3.792, + "step": 13800 + }, + { + "epoch": 5.742821473158552, + "eval_anatomy_loss": 2.833317995071411, + "eval_anatomy_runtime": 0.2657, + "eval_anatomy_samples_per_second": 7.527, + "eval_anatomy_steps_per_second": 3.763, + "step": 13800 + }, + { + "epoch": 5.742821473158552, + "eval_college_mathematics_loss": 2.052196502685547, + "eval_college_mathematics_runtime": 0.2671, + "eval_college_mathematics_samples_per_second": 7.487, + "eval_college_mathematics_steps_per_second": 3.743, + "step": 13800 + }, + { + "epoch": 5.742821473158552, + "eval_international_law_loss": 3.066879987716675, + "eval_international_law_runtime": 0.2654, + "eval_international_law_samples_per_second": 7.536, + "eval_international_law_steps_per_second": 3.768, + "step": 13800 + }, + { + "epoch": 5.751144402829796, + "grad_norm": 0.2451171875, + "learning_rate": 2.743924721296622e-06, + "loss": 0.2323, + "step": 13820 + }, + { + "epoch": 5.759467332501041, + "grad_norm": 0.345703125, + "learning_rate": 2.735791402390691e-06, + "loss": 0.2325, + "step": 13840 + }, + { + "epoch": 5.7677902621722845, + "grad_norm": 0.275390625, + "learning_rate": 2.727655564649293e-06, + "loss": 0.2308, + "step": 13860 + }, + { + "epoch": 5.776113191843529, + "grad_norm": 0.30078125, + "learning_rate": 2.719517294983299e-06, + "loss": 0.2337, + "step": 13880 + }, + { + "epoch": 5.784436121514773, + "grad_norm": 0.2109375, + "learning_rate": 2.7113766803295637e-06, + "loss": 0.2319, + "step": 13900 + }, + { + "epoch": 5.784436121514773, + "eval_main_loss": 0.23832297325134277, + "eval_main_runtime": 6.3179, + "eval_main_samples_per_second": 30.073, + "eval_main_steps_per_second": 3.799, + "step": 13900 + }, + { + "epoch": 5.784436121514773, + "eval_anatomy_loss": 2.8318614959716797, + "eval_anatomy_runtime": 0.2657, + "eval_anatomy_samples_per_second": 7.526, + "eval_anatomy_steps_per_second": 3.763, + "step": 13900 + }, + { + "epoch": 5.784436121514773, + "eval_college_mathematics_loss": 2.0509073734283447, + "eval_college_mathematics_runtime": 0.2664, + "eval_college_mathematics_samples_per_second": 7.508, + "eval_college_mathematics_steps_per_second": 3.754, + "step": 13900 + }, + { + "epoch": 5.784436121514773, + "eval_international_law_loss": 3.0668725967407227, + "eval_international_law_runtime": 0.2662, + "eval_international_law_samples_per_second": 7.513, + "eval_international_law_steps_per_second": 3.756, + "step": 13900 + }, + { + "epoch": 5.792759051186017, + "grad_norm": 0.298828125, + "learning_rate": 2.7032338076499882e-06, + "loss": 0.2317, + "step": 13920 + }, + { + "epoch": 5.801081980857262, + "grad_norm": 0.2890625, + "learning_rate": 2.695088763930596e-06, + "loss": 0.2333, + "step": 13940 + }, + { + "epoch": 5.809404910528506, + "grad_norm": 0.2890625, + "learning_rate": 2.6869416361806026e-06, + "loss": 0.2296, + "step": 13960 + }, + { + "epoch": 5.81772784019975, + "grad_norm": 0.287109375, + "learning_rate": 2.6787925114314885e-06, + "loss": 0.2286, + "step": 13980 + }, + { + "epoch": 5.826050769870995, + "grad_norm": 0.283203125, + "learning_rate": 2.6706414767360615e-06, + "loss": 0.233, + "step": 14000 + }, + { + "epoch": 5.826050769870995, + "eval_main_loss": 0.23830385506153107, + "eval_main_runtime": 6.3332, + "eval_main_samples_per_second": 30.001, + "eval_main_steps_per_second": 3.79, + "step": 14000 + }, + { + "epoch": 5.826050769870995, + "eval_anatomy_loss": 2.8321759700775146, + "eval_anatomy_runtime": 0.2656, + "eval_anatomy_samples_per_second": 7.53, + "eval_anatomy_steps_per_second": 3.765, + "step": 14000 + }, + { + "epoch": 5.826050769870995, + "eval_college_mathematics_loss": 2.050740957260132, + "eval_college_mathematics_runtime": 0.2664, + "eval_college_mathematics_samples_per_second": 7.509, + "eval_college_mathematics_steps_per_second": 3.754, + "step": 14000 + }, + { + "epoch": 5.826050769870995, + "eval_international_law_loss": 3.068516492843628, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.506, + "eval_international_law_steps_per_second": 3.753, + "step": 14000 + }, + { + "epoch": 5.834373699542239, + "grad_norm": 0.2734375, + "learning_rate": 2.6624886191675387e-06, + "loss": 0.2309, + "step": 14020 + }, + { + "epoch": 5.842696629213483, + "grad_norm": 0.2392578125, + "learning_rate": 2.6543340258186063e-06, + "loss": 0.229, + "step": 14040 + }, + { + "epoch": 5.851019558884728, + "grad_norm": 0.2890625, + "learning_rate": 2.6461777838004933e-06, + "loss": 0.2307, + "step": 14060 + }, + { + "epoch": 5.8593424885559715, + "grad_norm": 0.2353515625, + "learning_rate": 2.6380199802420414e-06, + "loss": 0.2294, + "step": 14080 + }, + { + "epoch": 5.867665418227216, + "grad_norm": 0.298828125, + "learning_rate": 2.629860702288773e-06, + "loss": 0.234, + "step": 14100 + }, + { + "epoch": 5.867665418227216, + "eval_main_loss": 0.23834149539470673, + "eval_main_runtime": 6.3326, + "eval_main_samples_per_second": 30.003, + "eval_main_steps_per_second": 3.79, + "step": 14100 + }, + { + "epoch": 5.867665418227216, + "eval_anatomy_loss": 2.8320865631103516, + "eval_anatomy_runtime": 0.2669, + "eval_anatomy_samples_per_second": 7.493, + "eval_anatomy_steps_per_second": 3.747, + "step": 14100 + }, + { + "epoch": 5.867665418227216, + "eval_college_mathematics_loss": 2.049973249435425, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.509, + "eval_college_mathematics_steps_per_second": 3.755, + "step": 14100 + }, + { + "epoch": 5.867665418227216, + "eval_international_law_loss": 3.065117120742798, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.515, + "eval_international_law_steps_per_second": 3.758, + "step": 14100 + }, + { + "epoch": 5.87598834789846, + "grad_norm": 0.265625, + "learning_rate": 2.6217000371019597e-06, + "loss": 0.2286, + "step": 14120 + }, + { + "epoch": 5.884311277569704, + "grad_norm": 0.2060546875, + "learning_rate": 2.6135380718576947e-06, + "loss": 0.2321, + "step": 14140 + }, + { + "epoch": 5.892634207240949, + "grad_norm": 0.30859375, + "learning_rate": 2.6053748937459565e-06, + "loss": 0.2322, + "step": 14160 + }, + { + "epoch": 5.900957136912194, + "grad_norm": 0.25, + "learning_rate": 2.597210589969682e-06, + "loss": 0.2275, + "step": 14180 + }, + { + "epoch": 5.909280066583437, + "grad_norm": 0.318359375, + "learning_rate": 2.5890452477438318e-06, + "loss": 0.2323, + "step": 14200 + }, + { + "epoch": 5.909280066583437, + "eval_main_loss": 0.2382332980632782, + "eval_main_runtime": 6.3329, + "eval_main_samples_per_second": 30.002, + "eval_main_steps_per_second": 3.79, + "step": 14200 + }, + { + "epoch": 5.909280066583437, + "eval_anatomy_loss": 2.8326704502105713, + "eval_anatomy_runtime": 0.2666, + "eval_anatomy_samples_per_second": 7.503, + "eval_anatomy_steps_per_second": 3.751, + "step": 14200 + }, + { + "epoch": 5.909280066583437, + "eval_college_mathematics_loss": 2.0510854721069336, + "eval_college_mathematics_runtime": 0.2658, + "eval_college_mathematics_samples_per_second": 7.524, + "eval_college_mathematics_steps_per_second": 3.762, + "step": 14200 + }, + { + "epoch": 5.909280066583437, + "eval_international_law_loss": 3.0666379928588867, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.506, + "eval_international_law_steps_per_second": 3.753, + "step": 14200 + }, + { + "epoch": 5.917602996254682, + "grad_norm": 0.275390625, + "learning_rate": 2.5808789542944585e-06, + "loss": 0.2338, + "step": 14220 + }, + { + "epoch": 5.925925925925926, + "grad_norm": 0.29296875, + "learning_rate": 2.572711796857779e-06, + "loss": 0.2326, + "step": 14240 + }, + { + "epoch": 5.93424885559717, + "grad_norm": 0.25390625, + "learning_rate": 2.564543862679238e-06, + "loss": 0.2278, + "step": 14260 + }, + { + "epoch": 5.942571785268415, + "grad_norm": 0.279296875, + "learning_rate": 2.556375239012578e-06, + "loss": 0.2372, + "step": 14280 + }, + { + "epoch": 5.9508947149396585, + "grad_norm": 0.287109375, + "learning_rate": 2.5482060131189058e-06, + "loss": 0.232, + "step": 14300 + }, + { + "epoch": 5.9508947149396585, + "eval_main_loss": 0.2383614033460617, + "eval_main_runtime": 6.3311, + "eval_main_samples_per_second": 30.01, + "eval_main_steps_per_second": 3.791, + "step": 14300 + }, + { + "epoch": 5.9508947149396585, + "eval_anatomy_loss": 2.8360040187835693, + "eval_anatomy_runtime": 0.2658, + "eval_anatomy_samples_per_second": 7.524, + "eval_anatomy_steps_per_second": 3.762, + "step": 14300 + }, + { + "epoch": 5.9508947149396585, + "eval_college_mathematics_loss": 2.0509347915649414, + "eval_college_mathematics_runtime": 0.2653, + "eval_college_mathematics_samples_per_second": 7.538, + "eval_college_mathematics_steps_per_second": 3.769, + "step": 14300 + }, + { + "epoch": 5.9508947149396585, + "eval_international_law_loss": 3.064924478530884, + "eval_international_law_runtime": 0.2668, + "eval_international_law_samples_per_second": 7.497, + "eval_international_law_steps_per_second": 3.749, + "step": 14300 + }, + { + "epoch": 5.959217644610903, + "grad_norm": 0.27734375, + "learning_rate": 2.540036272265764e-06, + "loss": 0.2343, + "step": 14320 + }, + { + "epoch": 5.967540574282148, + "grad_norm": 0.283203125, + "learning_rate": 2.5318661037261955e-06, + "loss": 0.233, + "step": 14340 + }, + { + "epoch": 5.975863503953391, + "grad_norm": 0.2734375, + "learning_rate": 2.5236955947778096e-06, + "loss": 0.2315, + "step": 14360 + }, + { + "epoch": 5.984186433624636, + "grad_norm": 0.2890625, + "learning_rate": 2.515524832701854e-06, + "loss": 0.2357, + "step": 14380 + }, + { + "epoch": 5.992509363295881, + "grad_norm": 0.310546875, + "learning_rate": 2.507353904782281e-06, + "loss": 0.2323, + "step": 14400 + }, + { + "epoch": 5.992509363295881, + "eval_main_loss": 0.23834244906902313, + "eval_main_runtime": 6.3321, + "eval_main_samples_per_second": 30.006, + "eval_main_steps_per_second": 3.79, + "step": 14400 + }, + { + "epoch": 5.992509363295881, + "eval_anatomy_loss": 2.830299139022827, + "eval_anatomy_runtime": 0.266, + "eval_anatomy_samples_per_second": 7.52, + "eval_anatomy_steps_per_second": 3.76, + "step": 14400 + }, + { + "epoch": 5.992509363295881, + "eval_college_mathematics_loss": 2.0514543056488037, + "eval_college_mathematics_runtime": 0.2665, + "eval_college_mathematics_samples_per_second": 7.504, + "eval_college_mathematics_steps_per_second": 3.752, + "step": 14400 + }, + { + "epoch": 5.992509363295881, + "eval_international_law_loss": 3.0648834705352783, + "eval_international_law_runtime": 0.2653, + "eval_international_law_samples_per_second": 7.539, + "eval_international_law_steps_per_second": 3.769, + "step": 14400 + }, + { + "epoch": 6.000832292967124, + "grad_norm": 0.2890625, + "learning_rate": 2.4991828983048126e-06, + "loss": 0.2355, + "step": 14420 + }, + { + "epoch": 6.009155222638369, + "grad_norm": 0.296875, + "learning_rate": 2.4910119005560123e-06, + "loss": 0.2307, + "step": 14440 + }, + { + "epoch": 6.017478152309613, + "grad_norm": 0.306640625, + "learning_rate": 2.4828409988223487e-06, + "loss": 0.2313, + "step": 14460 + }, + { + "epoch": 6.025801081980857, + "grad_norm": 0.28125, + "learning_rate": 2.4746702803892637e-06, + "loss": 0.2315, + "step": 14480 + }, + { + "epoch": 6.034124011652102, + "grad_norm": 0.25390625, + "learning_rate": 2.4664998325402442e-06, + "loss": 0.2318, + "step": 14500 + }, + { + "epoch": 6.034124011652102, + "eval_main_loss": 0.23824182152748108, + "eval_main_runtime": 6.3272, + "eval_main_samples_per_second": 30.029, + "eval_main_steps_per_second": 3.793, + "step": 14500 + }, + { + "epoch": 6.034124011652102, + "eval_anatomy_loss": 2.8344292640686035, + "eval_anatomy_runtime": 0.2661, + "eval_anatomy_samples_per_second": 7.517, + "eval_anatomy_steps_per_second": 3.758, + "step": 14500 + }, + { + "epoch": 6.034124011652102, + "eval_college_mathematics_loss": 2.0524632930755615, + "eval_college_mathematics_runtime": 0.2661, + "eval_college_mathematics_samples_per_second": 7.517, + "eval_college_mathematics_steps_per_second": 3.759, + "step": 14500 + }, + { + "epoch": 6.034124011652102, + "eval_international_law_loss": 3.0627036094665527, + "eval_international_law_runtime": 0.2662, + "eval_international_law_samples_per_second": 7.514, + "eval_international_law_steps_per_second": 3.757, + "step": 14500 + }, + { + "epoch": 6.0424469413233455, + "grad_norm": 0.30078125, + "learning_rate": 2.4583297425558848e-06, + "loss": 0.2297, + "step": 14520 + }, + { + "epoch": 6.05076987099459, + "grad_norm": 0.29296875, + "learning_rate": 2.4501600977129564e-06, + "loss": 0.2293, + "step": 14540 + }, + { + "epoch": 6.059092800665835, + "grad_norm": 0.291015625, + "learning_rate": 2.441990985283476e-06, + "loss": 0.2329, + "step": 14560 + }, + { + "epoch": 6.067415730337078, + "grad_norm": 0.287109375, + "learning_rate": 2.433822492533774e-06, + "loss": 0.2308, + "step": 14580 + }, + { + "epoch": 6.075738660008323, + "grad_norm": 0.306640625, + "learning_rate": 2.4256547067235577e-06, + "loss": 0.2313, + "step": 14600 + }, + { + "epoch": 6.075738660008323, + "eval_main_loss": 0.2382703721523285, + "eval_main_runtime": 6.3262, + "eval_main_samples_per_second": 30.034, + "eval_main_steps_per_second": 3.794, + "step": 14600 + }, + { + "epoch": 6.075738660008323, + "eval_anatomy_loss": 2.832831859588623, + "eval_anatomy_runtime": 0.2652, + "eval_anatomy_samples_per_second": 7.543, + "eval_anatomy_steps_per_second": 3.771, + "step": 14600 + }, + { + "epoch": 6.075738660008323, + "eval_college_mathematics_loss": 2.0489039421081543, + "eval_college_mathematics_runtime": 0.2646, + "eval_college_mathematics_samples_per_second": 7.56, + "eval_college_mathematics_steps_per_second": 3.78, + "step": 14600 + }, + { + "epoch": 6.075738660008323, + "eval_international_law_loss": 3.067589044570923, + "eval_international_law_runtime": 0.2662, + "eval_international_law_samples_per_second": 7.514, + "eval_international_law_steps_per_second": 3.757, + "step": 14600 + }, + { + "epoch": 6.084061589679568, + "grad_norm": 0.25, + "learning_rate": 2.4174877151049852e-06, + "loss": 0.2373, + "step": 14620 + }, + { + "epoch": 6.092384519350811, + "grad_norm": 0.275390625, + "learning_rate": 2.4093216049217315e-06, + "loss": 0.2279, + "step": 14640 + }, + { + "epoch": 6.100707449022056, + "grad_norm": 0.291015625, + "learning_rate": 2.4011564634080527e-06, + "loss": 0.2341, + "step": 14660 + }, + { + "epoch": 6.1090303786933005, + "grad_norm": 0.345703125, + "learning_rate": 2.3929923777878596e-06, + "loss": 0.2316, + "step": 14680 + }, + { + "epoch": 6.117353308364544, + "grad_norm": 0.25390625, + "learning_rate": 2.3848294352737837e-06, + "loss": 0.2363, + "step": 14700 + }, + { + "epoch": 6.117353308364544, + "eval_main_loss": 0.23824332654476166, + "eval_main_runtime": 6.3273, + "eval_main_samples_per_second": 30.028, + "eval_main_steps_per_second": 3.793, + "step": 14700 + }, + { + "epoch": 6.117353308364544, + "eval_anatomy_loss": 2.832658290863037, + "eval_anatomy_runtime": 0.2668, + "eval_anatomy_samples_per_second": 7.497, + "eval_anatomy_steps_per_second": 3.748, + "step": 14700 + }, + { + "epoch": 6.117353308364544, + "eval_college_mathematics_loss": 2.05214524269104, + "eval_college_mathematics_runtime": 0.2655, + "eval_college_mathematics_samples_per_second": 7.532, + "eval_college_mathematics_steps_per_second": 3.766, + "step": 14700 + }, + { + "epoch": 6.117353308364544, + "eval_international_law_loss": 3.068110942840576, + "eval_international_law_runtime": 0.2657, + "eval_international_law_samples_per_second": 7.528, + "eval_international_law_steps_per_second": 3.764, + "step": 14700 + }, + { + "epoch": 6.125676238035789, + "grad_norm": 0.31640625, + "learning_rate": 2.3766677230662413e-06, + "loss": 0.2342, + "step": 14720 + }, + { + "epoch": 6.1339991677070325, + "grad_norm": 0.267578125, + "learning_rate": 2.368507328352511e-06, + "loss": 0.2281, + "step": 14740 + }, + { + "epoch": 6.142322097378277, + "grad_norm": 0.263671875, + "learning_rate": 2.3603483383057944e-06, + "loss": 0.2332, + "step": 14760 + }, + { + "epoch": 6.150645027049522, + "grad_norm": 0.27734375, + "learning_rate": 2.352190840084288e-06, + "loss": 0.2308, + "step": 14780 + }, + { + "epoch": 6.158967956720765, + "grad_norm": 0.30859375, + "learning_rate": 2.3440349208302553e-06, + "loss": 0.2364, + "step": 14800 + }, + { + "epoch": 6.158967956720765, + "eval_main_loss": 0.2383221834897995, + "eval_main_runtime": 6.332, + "eval_main_samples_per_second": 30.006, + "eval_main_steps_per_second": 3.79, + "step": 14800 + }, + { + "epoch": 6.158967956720765, + "eval_anatomy_loss": 2.832653045654297, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.506, + "eval_anatomy_steps_per_second": 3.753, + "step": 14800 + }, + { + "epoch": 6.158967956720765, + "eval_college_mathematics_loss": 2.0538275241851807, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.509, + "eval_college_mathematics_steps_per_second": 3.755, + "step": 14800 + }, + { + "epoch": 6.158967956720765, + "eval_international_law_loss": 3.0637497901916504, + "eval_international_law_runtime": 0.2654, + "eval_international_law_samples_per_second": 7.536, + "eval_international_law_steps_per_second": 3.768, + "step": 14800 + }, + { + "epoch": 6.16729088639201, + "grad_norm": 0.26171875, + "learning_rate": 2.3358806676690855e-06, + "loss": 0.2316, + "step": 14820 + }, + { + "epoch": 6.1756138160632545, + "grad_norm": 0.265625, + "learning_rate": 2.327728167708377e-06, + "loss": 0.2321, + "step": 14840 + }, + { + "epoch": 6.183936745734498, + "grad_norm": 0.2109375, + "learning_rate": 2.3195775080369954e-06, + "loss": 0.2358, + "step": 14860 + }, + { + "epoch": 6.192259675405743, + "grad_norm": 0.296875, + "learning_rate": 2.3114287757241487e-06, + "loss": 0.2296, + "step": 14880 + }, + { + "epoch": 6.200582605076987, + "grad_norm": 0.29296875, + "learning_rate": 2.3032820578184567e-06, + "loss": 0.2284, + "step": 14900 + }, + { + "epoch": 6.200582605076987, + "eval_main_loss": 0.23826871812343597, + "eval_main_runtime": 6.3212, + "eval_main_samples_per_second": 30.058, + "eval_main_steps_per_second": 3.797, + "step": 14900 + }, + { + "epoch": 6.200582605076987, + "eval_anatomy_loss": 2.8343729972839355, + "eval_anatomy_runtime": 0.2658, + "eval_anatomy_samples_per_second": 7.526, + "eval_anatomy_steps_per_second": 3.763, + "step": 14900 + }, + { + "epoch": 6.200582605076987, + "eval_college_mathematics_loss": 2.051208257675171, + "eval_college_mathematics_runtime": 0.266, + "eval_college_mathematics_samples_per_second": 7.52, + "eval_college_mathematics_steps_per_second": 3.76, + "step": 14900 + }, + { + "epoch": 6.200582605076987, + "eval_international_law_loss": 3.064528465270996, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.505, + "eval_international_law_steps_per_second": 3.753, + "step": 14900 + }, + { + "epoch": 6.208905534748231, + "grad_norm": 0.2490234375, + "learning_rate": 2.29513744134702e-06, + "loss": 0.2272, + "step": 14920 + }, + { + "epoch": 6.217228464419476, + "grad_norm": 0.3046875, + "learning_rate": 2.286995013314488e-06, + "loss": 0.2331, + "step": 14940 + }, + { + "epoch": 6.22555139409072, + "grad_norm": 0.318359375, + "learning_rate": 2.2788548607021366e-06, + "loss": 0.2334, + "step": 14960 + }, + { + "epoch": 6.233874323761964, + "grad_norm": 0.2734375, + "learning_rate": 2.270717070466933e-06, + "loss": 0.2327, + "step": 14980 + }, + { + "epoch": 6.242197253433209, + "grad_norm": 0.306640625, + "learning_rate": 2.262581729540605e-06, + "loss": 0.2337, + "step": 15000 + }, + { + "epoch": 6.242197253433209, + "eval_main_loss": 0.238285630941391, + "eval_main_runtime": 6.3306, + "eval_main_samples_per_second": 30.013, + "eval_main_steps_per_second": 3.791, + "step": 15000 + }, + { + "epoch": 6.242197253433209, + "eval_anatomy_loss": 2.83296537399292, + "eval_anatomy_runtime": 0.2659, + "eval_anatomy_samples_per_second": 7.521, + "eval_anatomy_steps_per_second": 3.761, + "step": 15000 + }, + { + "epoch": 6.242197253433209, + "eval_college_mathematics_loss": 2.051861047744751, + "eval_college_mathematics_runtime": 0.2655, + "eval_college_mathematics_samples_per_second": 7.532, + "eval_college_mathematics_steps_per_second": 3.766, + "step": 15000 + }, + { + "epoch": 6.242197253433209, + "eval_international_law_loss": 3.066511631011963, + "eval_international_law_runtime": 0.2683, + "eval_international_law_samples_per_second": 7.455, + "eval_international_law_steps_per_second": 3.728, + "step": 15000 + }, + { + "epoch": 6.250520183104452, + "grad_norm": 0.291015625, + "learning_rate": 2.2544489248287218e-06, + "loss": 0.2295, + "step": 15020 + }, + { + "epoch": 6.258843112775697, + "grad_norm": 0.27734375, + "learning_rate": 2.246318743209753e-06, + "loss": 0.2312, + "step": 15040 + }, + { + "epoch": 6.2671660424469415, + "grad_norm": 0.2421875, + "learning_rate": 2.2381912715341528e-06, + "loss": 0.2327, + "step": 15060 + }, + { + "epoch": 6.275488972118185, + "grad_norm": 0.2890625, + "learning_rate": 2.2300665966234243e-06, + "loss": 0.2317, + "step": 15080 + }, + { + "epoch": 6.28381190178943, + "grad_norm": 0.30078125, + "learning_rate": 2.221944805269192e-06, + "loss": 0.2274, + "step": 15100 + }, + { + "epoch": 6.28381190178943, + "eval_main_loss": 0.23841296136379242, + "eval_main_runtime": 6.3599, + "eval_main_samples_per_second": 29.875, + "eval_main_steps_per_second": 3.774, + "step": 15100 + }, + { + "epoch": 6.28381190178943, + "eval_anatomy_loss": 2.8354194164276123, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.505, + "eval_anatomy_steps_per_second": 3.753, + "step": 15100 + }, + { + "epoch": 6.28381190178943, + "eval_college_mathematics_loss": 2.0525858402252197, + "eval_college_mathematics_runtime": 0.2681, + "eval_college_mathematics_samples_per_second": 7.461, + "eval_college_mathematics_steps_per_second": 3.73, + "step": 15100 + }, + { + "epoch": 6.28381190178943, + "eval_international_law_loss": 3.0661587715148926, + "eval_international_law_runtime": 0.2672, + "eval_international_law_samples_per_second": 7.484, + "eval_international_law_steps_per_second": 3.742, + "step": 15100 + }, + { + "epoch": 6.292134831460674, + "grad_norm": 0.267578125, + "learning_rate": 2.2138259842322794e-06, + "loss": 0.2311, + "step": 15120 + }, + { + "epoch": 6.300457761131918, + "grad_norm": 0.3203125, + "learning_rate": 2.2057102202417806e-06, + "loss": 0.2308, + "step": 15140 + }, + { + "epoch": 6.308780690803163, + "grad_norm": 0.28125, + "learning_rate": 2.1975975999941298e-06, + "loss": 0.232, + "step": 15160 + }, + { + "epoch": 6.317103620474407, + "grad_norm": 0.259765625, + "learning_rate": 2.1894882101521807e-06, + "loss": 0.2317, + "step": 15180 + }, + { + "epoch": 6.325426550145651, + "grad_norm": 0.275390625, + "learning_rate": 2.181382137344278e-06, + "loss": 0.2302, + "step": 15200 + }, + { + "epoch": 6.325426550145651, + "eval_main_loss": 0.23836155235767365, + "eval_main_runtime": 6.3527, + "eval_main_samples_per_second": 29.909, + "eval_main_steps_per_second": 3.778, + "step": 15200 + }, + { + "epoch": 6.325426550145651, + "eval_anatomy_loss": 2.8317906856536865, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.506, + "eval_anatomy_steps_per_second": 3.753, + "step": 15200 + }, + { + "epoch": 6.325426550145651, + "eval_college_mathematics_loss": 2.054029703140259, + "eval_college_mathematics_runtime": 0.2679, + "eval_college_mathematics_samples_per_second": 7.467, + "eval_college_mathematics_steps_per_second": 3.733, + "step": 15200 + }, + { + "epoch": 6.325426550145651, + "eval_international_law_loss": 3.0642471313476562, + "eval_international_law_runtime": 0.2663, + "eval_international_law_samples_per_second": 7.511, + "eval_international_law_steps_per_second": 3.755, + "step": 15200 + }, + { + "epoch": 6.333749479816896, + "grad_norm": 0.26171875, + "learning_rate": 2.173279468163331e-06, + "loss": 0.2322, + "step": 15220 + }, + { + "epoch": 6.34207240948814, + "grad_norm": 0.291015625, + "learning_rate": 2.1651802891658897e-06, + "loss": 0.2321, + "step": 15240 + }, + { + "epoch": 6.350395339159384, + "grad_norm": 0.345703125, + "learning_rate": 2.1570846868712227e-06, + "loss": 0.2378, + "step": 15260 + }, + { + "epoch": 6.3587182688306285, + "grad_norm": 0.25, + "learning_rate": 2.1489927477603884e-06, + "loss": 0.2301, + "step": 15280 + }, + { + "epoch": 6.367041198501873, + "grad_norm": 0.26953125, + "learning_rate": 2.1409045582753144e-06, + "loss": 0.234, + "step": 15300 + }, + { + "epoch": 6.367041198501873, + "eval_main_loss": 0.23826418817043304, + "eval_main_runtime": 6.3363, + "eval_main_samples_per_second": 29.986, + "eval_main_steps_per_second": 3.788, + "step": 15300 + }, + { + "epoch": 6.367041198501873, + "eval_anatomy_loss": 2.8337223529815674, + "eval_anatomy_runtime": 0.2673, + "eval_anatomy_samples_per_second": 7.483, + "eval_anatomy_steps_per_second": 3.741, + "step": 15300 + }, + { + "epoch": 6.367041198501873, + "eval_college_mathematics_loss": 2.0507359504699707, + "eval_college_mathematics_runtime": 0.2666, + "eval_college_mathematics_samples_per_second": 7.503, + "eval_college_mathematics_steps_per_second": 3.751, + "step": 15300 + }, + { + "epoch": 6.367041198501873, + "eval_international_law_loss": 3.065744161605835, + "eval_international_law_runtime": 0.2653, + "eval_international_law_samples_per_second": 7.538, + "eval_international_law_steps_per_second": 3.769, + "step": 15300 + }, + { + "epoch": 6.375364128173117, + "grad_norm": 0.318359375, + "learning_rate": 2.132820204817872e-06, + "loss": 0.2279, + "step": 15320 + }, + { + "epoch": 6.383687057844361, + "grad_norm": 0.251953125, + "learning_rate": 2.124739773748955e-06, + "loss": 0.2297, + "step": 15340 + }, + { + "epoch": 6.392009987515605, + "grad_norm": 0.275390625, + "learning_rate": 2.1166633513875563e-06, + "loss": 0.2296, + "step": 15360 + }, + { + "epoch": 6.40033291718685, + "grad_norm": 0.27734375, + "learning_rate": 2.1085910240098456e-06, + "loss": 0.2306, + "step": 15380 + }, + { + "epoch": 6.408655846858094, + "grad_norm": 0.2890625, + "learning_rate": 2.1005228778482484e-06, + "loss": 0.2319, + "step": 15400 + }, + { + "epoch": 6.408655846858094, + "eval_main_loss": 0.238305002450943, + "eval_main_runtime": 6.3303, + "eval_main_samples_per_second": 30.014, + "eval_main_steps_per_second": 3.791, + "step": 15400 + }, + { + "epoch": 6.408655846858094, + "eval_anatomy_loss": 2.8330578804016113, + "eval_anatomy_runtime": 0.2654, + "eval_anatomy_samples_per_second": 7.535, + "eval_anatomy_steps_per_second": 3.768, + "step": 15400 + }, + { + "epoch": 6.408655846858094, + "eval_college_mathematics_loss": 2.0521047115325928, + "eval_college_mathematics_runtime": 0.2667, + "eval_college_mathematics_samples_per_second": 7.498, + "eval_college_mathematics_steps_per_second": 3.749, + "step": 15400 + }, + { + "epoch": 6.408655846858094, + "eval_international_law_loss": 3.0654828548431396, + "eval_international_law_runtime": 0.4213, + "eval_international_law_samples_per_second": 4.748, + "eval_international_law_steps_per_second": 2.374, + "step": 15400 + }, + { + "epoch": 6.416978776529338, + "grad_norm": 0.265625, + "learning_rate": 2.0924589990905253e-06, + "loss": 0.2338, + "step": 15420 + }, + { + "epoch": 6.425301706200583, + "grad_norm": 0.2734375, + "learning_rate": 2.084399473878848e-06, + "loss": 0.2353, + "step": 15440 + }, + { + "epoch": 6.433624635871827, + "grad_norm": 0.28515625, + "learning_rate": 2.0763443883088833e-06, + "loss": 0.2326, + "step": 15460 + }, + { + "epoch": 6.441947565543071, + "grad_norm": 0.2255859375, + "learning_rate": 2.068293828428872e-06, + "loss": 0.2361, + "step": 15480 + }, + { + "epoch": 6.4502704952143155, + "grad_norm": 0.33203125, + "learning_rate": 2.0602478802387085e-06, + "loss": 0.2288, + "step": 15500 + }, + { + "epoch": 6.4502704952143155, + "eval_main_loss": 0.23823831975460052, + "eval_main_runtime": 6.3318, + "eval_main_samples_per_second": 30.007, + "eval_main_steps_per_second": 3.79, + "step": 15500 + }, + { + "epoch": 6.4502704952143155, + "eval_anatomy_loss": 2.834176540374756, + "eval_anatomy_runtime": 0.2663, + "eval_anatomy_samples_per_second": 7.51, + "eval_anatomy_steps_per_second": 3.755, + "step": 15500 + }, + { + "epoch": 6.4502704952143155, + "eval_college_mathematics_loss": 2.052133560180664, + "eval_college_mathematics_runtime": 0.2676, + "eval_college_mathematics_samples_per_second": 7.473, + "eval_college_mathematics_steps_per_second": 3.736, + "step": 15500 + }, + { + "epoch": 6.4502704952143155, + "eval_international_law_loss": 3.066138982772827, + "eval_international_law_runtime": 0.4176, + "eval_international_law_samples_per_second": 4.789, + "eval_international_law_steps_per_second": 2.394, + "step": 15500 + }, + { + "epoch": 6.45859342488556, + "grad_norm": 0.29296875, + "learning_rate": 2.0522066296890226e-06, + "loss": 0.2292, + "step": 15520 + }, + { + "epoch": 6.466916354556804, + "grad_norm": 0.314453125, + "learning_rate": 2.0441701626802647e-06, + "loss": 0.2313, + "step": 15540 + }, + { + "epoch": 6.475239284228048, + "grad_norm": 0.29296875, + "learning_rate": 2.036138565061779e-06, + "loss": 0.2322, + "step": 15560 + }, + { + "epoch": 6.483562213899292, + "grad_norm": 0.287109375, + "learning_rate": 2.0281119226308976e-06, + "loss": 0.2331, + "step": 15580 + }, + { + "epoch": 6.491885143570537, + "grad_norm": 0.26953125, + "learning_rate": 2.02009032113202e-06, + "loss": 0.232, + "step": 15600 + }, + { + "epoch": 6.491885143570537, + "eval_main_loss": 0.2383803129196167, + "eval_main_runtime": 6.3247, + "eval_main_samples_per_second": 30.041, + "eval_main_steps_per_second": 3.795, + "step": 15600 + }, + { + "epoch": 6.491885143570537, + "eval_anatomy_loss": 2.8346643447875977, + "eval_anatomy_runtime": 0.2658, + "eval_anatomy_samples_per_second": 7.524, + "eval_anatomy_steps_per_second": 3.762, + "step": 15600 + }, + { + "epoch": 6.491885143570537, + "eval_college_mathematics_loss": 2.0503625869750977, + "eval_college_mathematics_runtime": 0.2647, + "eval_college_mathematics_samples_per_second": 7.554, + "eval_college_mathematics_steps_per_second": 3.777, + "step": 15600 + }, + { + "epoch": 6.491885143570537, + "eval_international_law_loss": 3.066617012023926, + "eval_international_law_runtime": 0.2652, + "eval_international_law_samples_per_second": 7.542, + "eval_international_law_steps_per_second": 3.771, + "step": 15600 + }, + { + "epoch": 6.500208073241781, + "grad_norm": 0.271484375, + "learning_rate": 2.012073846255691e-06, + "loss": 0.2364, + "step": 15620 + }, + { + "epoch": 6.508531002913025, + "grad_norm": 0.291015625, + "learning_rate": 2.0040625836376937e-06, + "loss": 0.2308, + "step": 15640 + }, + { + "epoch": 6.51685393258427, + "grad_norm": 0.28515625, + "learning_rate": 1.9960566188581306e-06, + "loss": 0.2287, + "step": 15660 + }, + { + "epoch": 6.525176862255514, + "grad_norm": 0.3203125, + "learning_rate": 1.9880560374405107e-06, + "loss": 0.2299, + "step": 15680 + }, + { + "epoch": 6.533499791926758, + "grad_norm": 0.29296875, + "learning_rate": 1.980060924850836e-06, + "loss": 0.2325, + "step": 15700 + }, + { + "epoch": 6.533499791926758, + "eval_main_loss": 0.2383701652288437, + "eval_main_runtime": 6.3198, + "eval_main_samples_per_second": 30.064, + "eval_main_steps_per_second": 3.798, + "step": 15700 + }, + { + "epoch": 6.533499791926758, + "eval_anatomy_loss": 2.833786725997925, + "eval_anatomy_runtime": 0.2657, + "eval_anatomy_samples_per_second": 7.528, + "eval_anatomy_steps_per_second": 3.764, + "step": 15700 + }, + { + "epoch": 6.533499791926758, + "eval_college_mathematics_loss": 2.050478219985962, + "eval_college_mathematics_runtime": 0.2678, + "eval_college_mathematics_samples_per_second": 7.467, + "eval_college_mathematics_steps_per_second": 3.734, + "step": 15700 + }, + { + "epoch": 6.533499791926758, + "eval_international_law_loss": 3.06551194190979, + "eval_international_law_runtime": 0.2654, + "eval_international_law_samples_per_second": 7.535, + "eval_international_law_steps_per_second": 3.767, + "step": 15700 + }, + { + "epoch": 6.5418227215980025, + "grad_norm": 0.283203125, + "learning_rate": 1.972071366496685e-06, + "loss": 0.2301, + "step": 15720 + }, + { + "epoch": 6.550145651269247, + "grad_norm": 0.287109375, + "learning_rate": 1.964087447726306e-06, + "loss": 0.2336, + "step": 15740 + }, + { + "epoch": 6.558468580940491, + "grad_norm": 0.275390625, + "learning_rate": 1.956109253827702e-06, + "loss": 0.2355, + "step": 15760 + }, + { + "epoch": 6.566791510611735, + "grad_norm": 0.302734375, + "learning_rate": 1.9481368700277197e-06, + "loss": 0.2309, + "step": 15780 + }, + { + "epoch": 6.57511444028298, + "grad_norm": 0.267578125, + "learning_rate": 1.9401703814911394e-06, + "loss": 0.2334, + "step": 15800 + }, + { + "epoch": 6.57511444028298, + "eval_main_loss": 0.2383488267660141, + "eval_main_runtime": 6.3287, + "eval_main_samples_per_second": 30.022, + "eval_main_steps_per_second": 3.792, + "step": 15800 + }, + { + "epoch": 6.57511444028298, + "eval_anatomy_loss": 2.8328428268432617, + "eval_anatomy_runtime": 0.2649, + "eval_anatomy_samples_per_second": 7.551, + "eval_anatomy_steps_per_second": 3.776, + "step": 15800 + }, + { + "epoch": 6.57511444028298, + "eval_college_mathematics_loss": 2.052703619003296, + "eval_college_mathematics_runtime": 0.2652, + "eval_college_mathematics_samples_per_second": 7.54, + "eval_college_mathematics_steps_per_second": 3.77, + "step": 15800 + }, + { + "epoch": 6.57511444028298, + "eval_international_law_loss": 3.065504550933838, + "eval_international_law_runtime": 0.2679, + "eval_international_law_samples_per_second": 7.466, + "eval_international_law_steps_per_second": 3.733, + "step": 15800 + }, + { + "epoch": 6.583437369954224, + "grad_norm": 0.2578125, + "learning_rate": 1.9322098733197677e-06, + "loss": 0.2323, + "step": 15820 + }, + { + "epoch": 6.591760299625468, + "grad_norm": 0.283203125, + "learning_rate": 1.9242554305515208e-06, + "loss": 0.2327, + "step": 15840 + }, + { + "epoch": 6.600083229296713, + "grad_norm": 0.322265625, + "learning_rate": 1.9163071381595273e-06, + "loss": 0.2335, + "step": 15860 + }, + { + "epoch": 6.608406158967957, + "grad_norm": 0.2373046875, + "learning_rate": 1.908365081051212e-06, + "loss": 0.2286, + "step": 15880 + }, + { + "epoch": 6.616729088639201, + "grad_norm": 0.35546875, + "learning_rate": 1.9004293440673908e-06, + "loss": 0.2317, + "step": 15900 + }, + { + "epoch": 6.616729088639201, + "eval_main_loss": 0.23834270238876343, + "eval_main_runtime": 6.3247, + "eval_main_samples_per_second": 30.041, + "eval_main_steps_per_second": 3.795, + "step": 15900 + }, + { + "epoch": 6.616729088639201, + "eval_anatomy_loss": 2.832962989807129, + "eval_anatomy_runtime": 0.2669, + "eval_anatomy_samples_per_second": 7.492, + "eval_anatomy_steps_per_second": 3.746, + "step": 15900 + }, + { + "epoch": 6.616729088639201, + "eval_college_mathematics_loss": 2.0485119819641113, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.495, + "eval_college_mathematics_steps_per_second": 3.747, + "step": 15900 + }, + { + "epoch": 6.616729088639201, + "eval_international_law_loss": 3.066624164581299, + "eval_international_law_runtime": 0.2657, + "eval_international_law_samples_per_second": 7.527, + "eval_international_law_steps_per_second": 3.763, + "step": 15900 + }, + { + "epoch": 6.625052018310445, + "grad_norm": 0.31640625, + "learning_rate": 1.8925000119813669e-06, + "loss": 0.2292, + "step": 15920 + }, + { + "epoch": 6.6333749479816895, + "grad_norm": 0.251953125, + "learning_rate": 1.88457716949802e-06, + "loss": 0.231, + "step": 15940 + }, + { + "epoch": 6.641697877652934, + "grad_norm": 0.3828125, + "learning_rate": 1.8766609012529085e-06, + "loss": 0.2292, + "step": 15960 + }, + { + "epoch": 6.650020807324178, + "grad_norm": 0.2236328125, + "learning_rate": 1.8687512918113604e-06, + "loss": 0.2347, + "step": 15980 + }, + { + "epoch": 6.658343736995422, + "grad_norm": 0.28515625, + "learning_rate": 1.8608484256675702e-06, + "loss": 0.2292, + "step": 16000 + }, + { + "epoch": 6.658343736995422, + "eval_main_loss": 0.23837390542030334, + "eval_main_runtime": 6.3517, + "eval_main_samples_per_second": 29.913, + "eval_main_steps_per_second": 3.779, + "step": 16000 + }, + { + "epoch": 6.658343736995422, + "eval_anatomy_loss": 2.8328123092651367, + "eval_anatomy_runtime": 0.2671, + "eval_anatomy_samples_per_second": 7.488, + "eval_anatomy_steps_per_second": 3.744, + "step": 16000 + }, + { + "epoch": 6.658343736995422, + "eval_college_mathematics_loss": 2.0518500804901123, + "eval_college_mathematics_runtime": 0.2671, + "eval_college_mathematics_samples_per_second": 7.489, + "eval_college_mathematics_steps_per_second": 3.745, + "step": 16000 + }, + { + "epoch": 6.658343736995422, + "eval_international_law_loss": 3.0677542686462402, + "eval_international_law_runtime": 0.2658, + "eval_international_law_samples_per_second": 7.523, + "eval_international_law_steps_per_second": 3.762, + "step": 16000 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.244140625, + "learning_rate": 1.852952387243698e-06, + "loss": 0.2263, + "step": 16020 + }, + { + "epoch": 6.674989596337911, + "grad_norm": 0.21484375, + "learning_rate": 1.845063260888969e-06, + "loss": 0.2312, + "step": 16040 + }, + { + "epoch": 6.683312526009155, + "grad_norm": 0.298828125, + "learning_rate": 1.8371811308787655e-06, + "loss": 0.2325, + "step": 16060 + }, + { + "epoch": 6.6916354556804, + "grad_norm": 0.24609375, + "learning_rate": 1.829306081413736e-06, + "loss": 0.2292, + "step": 16080 + }, + { + "epoch": 6.699958385351644, + "grad_norm": 0.328125, + "learning_rate": 1.8214381966188898e-06, + "loss": 0.2314, + "step": 16100 + }, + { + "epoch": 6.699958385351644, + "eval_main_loss": 0.2383338063955307, + "eval_main_runtime": 6.3513, + "eval_main_samples_per_second": 29.915, + "eval_main_steps_per_second": 3.779, + "step": 16100 + }, + { + "epoch": 6.699958385351644, + "eval_anatomy_loss": 2.8324191570281982, + "eval_anatomy_runtime": 0.2674, + "eval_anatomy_samples_per_second": 7.479, + "eval_anatomy_steps_per_second": 3.74, + "step": 16100 + }, + { + "epoch": 6.699958385351644, + "eval_college_mathematics_loss": 2.0509135723114014, + "eval_college_mathematics_runtime": 0.2664, + "eval_college_mathematics_samples_per_second": 7.507, + "eval_college_mathematics_steps_per_second": 3.754, + "step": 16100 + }, + { + "epoch": 6.699958385351644, + "eval_international_law_loss": 3.066457748413086, + "eval_international_law_runtime": 0.2667, + "eval_international_law_samples_per_second": 7.499, + "eval_international_law_steps_per_second": 3.749, + "step": 16100 + }, + { + "epoch": 6.708281315022888, + "grad_norm": 0.2890625, + "learning_rate": 1.813577560542699e-06, + "loss": 0.2288, + "step": 16120 + }, + { + "epoch": 6.716604244694132, + "grad_norm": 0.28515625, + "learning_rate": 1.8057242571562034e-06, + "loss": 0.2331, + "step": 16140 + }, + { + "epoch": 6.7249271743653765, + "grad_norm": 0.302734375, + "learning_rate": 1.7978783703521102e-06, + "loss": 0.2371, + "step": 16160 + }, + { + "epoch": 6.733250104036621, + "grad_norm": 0.283203125, + "learning_rate": 1.7900399839438976e-06, + "loss": 0.236, + "step": 16180 + }, + { + "epoch": 6.741573033707866, + "grad_norm": 0.298828125, + "learning_rate": 1.782209181664924e-06, + "loss": 0.2352, + "step": 16200 + }, + { + "epoch": 6.741573033707866, + "eval_main_loss": 0.23834922909736633, + "eval_main_runtime": 6.3393, + "eval_main_samples_per_second": 29.972, + "eval_main_steps_per_second": 3.786, + "step": 16200 + }, + { + "epoch": 6.741573033707866, + "eval_anatomy_loss": 2.832639217376709, + "eval_anatomy_runtime": 0.2655, + "eval_anatomy_samples_per_second": 7.533, + "eval_anatomy_steps_per_second": 3.766, + "step": 16200 + }, + { + "epoch": 6.741573033707866, + "eval_college_mathematics_loss": 2.050755023956299, + "eval_college_mathematics_runtime": 0.2658, + "eval_college_mathematics_samples_per_second": 7.525, + "eval_college_mathematics_steps_per_second": 3.762, + "step": 16200 + }, + { + "epoch": 6.741573033707866, + "eval_international_law_loss": 3.0636000633239746, + "eval_international_law_runtime": 0.2666, + "eval_international_law_samples_per_second": 7.502, + "eval_international_law_steps_per_second": 3.751, + "step": 16200 + }, + { + "epoch": 6.749895963379109, + "grad_norm": 0.291015625, + "learning_rate": 1.774386047167529e-06, + "loss": 0.2301, + "step": 16220 + }, + { + "epoch": 6.758218893050354, + "grad_norm": 0.275390625, + "learning_rate": 1.7665706640221415e-06, + "loss": 0.236, + "step": 16240 + }, + { + "epoch": 6.766541822721598, + "grad_norm": 0.287109375, + "learning_rate": 1.7587631157163876e-06, + "loss": 0.2328, + "step": 16260 + }, + { + "epoch": 6.774864752392842, + "grad_norm": 0.296875, + "learning_rate": 1.7509634856541951e-06, + "loss": 0.2337, + "step": 16280 + }, + { + "epoch": 6.783187682064087, + "grad_norm": 0.283203125, + "learning_rate": 1.7431718571549092e-06, + "loss": 0.231, + "step": 16300 + }, + { + "epoch": 6.783187682064087, + "eval_main_loss": 0.23830153048038483, + "eval_main_runtime": 6.3187, + "eval_main_samples_per_second": 30.069, + "eval_main_steps_per_second": 3.798, + "step": 16300 + }, + { + "epoch": 6.783187682064087, + "eval_anatomy_loss": 2.8300514221191406, + "eval_anatomy_runtime": 0.267, + "eval_anatomy_samples_per_second": 7.49, + "eval_anatomy_steps_per_second": 3.745, + "step": 16300 + }, + { + "epoch": 6.783187682064087, + "eval_college_mathematics_loss": 2.052163600921631, + "eval_college_mathematics_runtime": 0.2643, + "eval_college_mathematics_samples_per_second": 7.568, + "eval_college_mathematics_steps_per_second": 3.784, + "step": 16300 + }, + { + "epoch": 6.783187682064087, + "eval_international_law_loss": 3.064903736114502, + "eval_international_law_runtime": 0.2653, + "eval_international_law_samples_per_second": 7.539, + "eval_international_law_steps_per_second": 3.77, + "step": 16300 + }, + { + "epoch": 6.7915106117353305, + "grad_norm": 0.26171875, + "learning_rate": 1.7353883134523975e-06, + "loss": 0.231, + "step": 16320 + }, + { + "epoch": 6.799833541406575, + "grad_norm": 0.259765625, + "learning_rate": 1.7276129376941594e-06, + "loss": 0.2344, + "step": 16340 + }, + { + "epoch": 6.80815647107782, + "grad_norm": 0.265625, + "learning_rate": 1.7198458129404433e-06, + "loss": 0.2312, + "step": 16360 + }, + { + "epoch": 6.8164794007490634, + "grad_norm": 0.2470703125, + "learning_rate": 1.7120870221633556e-06, + "loss": 0.2315, + "step": 16380 + }, + { + "epoch": 6.824802330420308, + "grad_norm": 0.251953125, + "learning_rate": 1.704336648245975e-06, + "loss": 0.2272, + "step": 16400 + }, + { + "epoch": 6.824802330420308, + "eval_main_loss": 0.23840200901031494, + "eval_main_runtime": 6.3137, + "eval_main_samples_per_second": 30.093, + "eval_main_steps_per_second": 3.801, + "step": 16400 + }, + { + "epoch": 6.824802330420308, + "eval_anatomy_loss": 2.8353002071380615, + "eval_anatomy_runtime": 0.2656, + "eval_anatomy_samples_per_second": 7.53, + "eval_anatomy_steps_per_second": 3.765, + "step": 16400 + }, + { + "epoch": 6.824802330420308, + "eval_college_mathematics_loss": 2.0492591857910156, + "eval_college_mathematics_runtime": 0.2643, + "eval_college_mathematics_samples_per_second": 7.568, + "eval_college_mathematics_steps_per_second": 3.784, + "step": 16400 + }, + { + "epoch": 6.824802330420308, + "eval_international_law_loss": 3.065768241882324, + "eval_international_law_runtime": 0.2656, + "eval_international_law_samples_per_second": 7.531, + "eval_international_law_steps_per_second": 3.765, + "step": 16400 + }, + { + "epoch": 6.833125260091553, + "grad_norm": 0.271484375, + "learning_rate": 1.6965947739814672e-06, + "loss": 0.2333, + "step": 16420 + }, + { + "epoch": 6.841448189762796, + "grad_norm": 0.298828125, + "learning_rate": 1.6888614820722024e-06, + "loss": 0.2353, + "step": 16440 + }, + { + "epoch": 6.849771119434041, + "grad_norm": 0.279296875, + "learning_rate": 1.681136855128866e-06, + "loss": 0.2247, + "step": 16460 + }, + { + "epoch": 6.858094049105285, + "grad_norm": 0.279296875, + "learning_rate": 1.6734209756695843e-06, + "loss": 0.2363, + "step": 16480 + }, + { + "epoch": 6.866416978776529, + "grad_norm": 0.255859375, + "learning_rate": 1.6657139261190364e-06, + "loss": 0.2328, + "step": 16500 + }, + { + "epoch": 6.866416978776529, + "eval_main_loss": 0.23840853571891785, + "eval_main_runtime": 6.3197, + "eval_main_samples_per_second": 30.065, + "eval_main_steps_per_second": 3.798, + "step": 16500 + }, + { + "epoch": 6.866416978776529, + "eval_anatomy_loss": 2.834096908569336, + "eval_anatomy_runtime": 0.2667, + "eval_anatomy_samples_per_second": 7.5, + "eval_anatomy_steps_per_second": 3.75, + "step": 16500 + }, + { + "epoch": 6.866416978776529, + "eval_college_mathematics_loss": 2.0504603385925293, + "eval_college_mathematics_runtime": 0.265, + "eval_college_mathematics_samples_per_second": 7.546, + "eval_college_mathematics_steps_per_second": 3.773, + "step": 16500 + }, + { + "epoch": 6.866416978776529, + "eval_international_law_loss": 3.065997838973999, + "eval_international_law_runtime": 0.2658, + "eval_international_law_samples_per_second": 7.525, + "eval_international_law_steps_per_second": 3.762, + "step": 16500 + }, + { + "epoch": 6.874739908447774, + "grad_norm": 0.322265625, + "learning_rate": 1.6580157888075766e-06, + "loss": 0.2319, + "step": 16520 + }, + { + "epoch": 6.8830628381190175, + "grad_norm": 0.2578125, + "learning_rate": 1.6503266459703566e-06, + "loss": 0.2363, + "step": 16540 + }, + { + "epoch": 6.891385767790262, + "grad_norm": 0.287109375, + "learning_rate": 1.6426465797464402e-06, + "loss": 0.2318, + "step": 16560 + }, + { + "epoch": 6.899708697461507, + "grad_norm": 0.23828125, + "learning_rate": 1.6349756721779348e-06, + "loss": 0.2304, + "step": 16580 + }, + { + "epoch": 6.90803162713275, + "grad_norm": 0.28125, + "learning_rate": 1.6273140052091097e-06, + "loss": 0.2345, + "step": 16600 + }, + { + "epoch": 6.90803162713275, + "eval_main_loss": 0.2384311705827713, + "eval_main_runtime": 6.3138, + "eval_main_samples_per_second": 30.093, + "eval_main_steps_per_second": 3.801, + "step": 16600 + }, + { + "epoch": 6.90803162713275, + "eval_anatomy_loss": 2.832714319229126, + "eval_anatomy_runtime": 0.2643, + "eval_anatomy_samples_per_second": 7.566, + "eval_anatomy_steps_per_second": 3.783, + "step": 16600 + }, + { + "epoch": 6.90803162713275, + "eval_college_mathematics_loss": 2.0529298782348633, + "eval_college_mathematics_runtime": 0.2651, + "eval_college_mathematics_samples_per_second": 7.543, + "eval_college_mathematics_steps_per_second": 3.772, + "step": 16600 + }, + { + "epoch": 6.90803162713275, + "eval_international_law_loss": 3.064915180206299, + "eval_international_law_runtime": 0.2668, + "eval_international_law_samples_per_second": 7.497, + "eval_international_law_steps_per_second": 3.748, + "step": 16600 + }, + { + "epoch": 6.916354556803995, + "grad_norm": 0.306640625, + "learning_rate": 1.6196616606855194e-06, + "loss": 0.231, + "step": 16620 + }, + { + "epoch": 6.92467748647524, + "grad_norm": 0.265625, + "learning_rate": 1.6120187203531351e-06, + "loss": 0.2332, + "step": 16640 + }, + { + "epoch": 6.933000416146483, + "grad_norm": 0.2451171875, + "learning_rate": 1.6043852658574666e-06, + "loss": 0.2305, + "step": 16660 + }, + { + "epoch": 6.941323345817728, + "grad_norm": 0.318359375, + "learning_rate": 1.596761378742689e-06, + "loss": 0.2346, + "step": 16680 + }, + { + "epoch": 6.949646275488972, + "grad_norm": 0.291015625, + "learning_rate": 1.589147140450778e-06, + "loss": 0.2263, + "step": 16700 + }, + { + "epoch": 6.949646275488972, + "eval_main_loss": 0.23828622698783875, + "eval_main_runtime": 6.3112, + "eval_main_samples_per_second": 30.105, + "eval_main_steps_per_second": 3.803, + "step": 16700 + }, + { + "epoch": 6.949646275488972, + "eval_anatomy_loss": 2.833873748779297, + "eval_anatomy_runtime": 0.2668, + "eval_anatomy_samples_per_second": 7.497, + "eval_anatomy_steps_per_second": 3.748, + "step": 16700 + }, + { + "epoch": 6.949646275488972, + "eval_college_mathematics_loss": 2.0528018474578857, + "eval_college_mathematics_runtime": 0.2655, + "eval_college_mathematics_samples_per_second": 7.534, + "eval_college_mathematics_steps_per_second": 3.767, + "step": 16700 + }, + { + "epoch": 6.949646275488972, + "eval_international_law_loss": 3.0665361881256104, + "eval_international_law_runtime": 0.2655, + "eval_international_law_samples_per_second": 7.533, + "eval_international_law_steps_per_second": 3.766, + "step": 16700 + }, + { + "epoch": 6.957969205160216, + "grad_norm": 0.271484375, + "learning_rate": 1.5815426323206345e-06, + "loss": 0.2324, + "step": 16720 + }, + { + "epoch": 6.966292134831461, + "grad_norm": 0.2421875, + "learning_rate": 1.5739479355872162e-06, + "loss": 0.2307, + "step": 16740 + }, + { + "epoch": 6.974615064502705, + "grad_norm": 0.2421875, + "learning_rate": 1.5663631313806726e-06, + "loss": 0.235, + "step": 16760 + }, + { + "epoch": 6.982937994173949, + "grad_norm": 0.212890625, + "learning_rate": 1.5587883007254741e-06, + "loss": 0.2266, + "step": 16780 + }, + { + "epoch": 6.991260923845194, + "grad_norm": 0.265625, + "learning_rate": 1.5512235245395514e-06, + "loss": 0.2316, + "step": 16800 + }, + { + "epoch": 6.991260923845194, + "eval_main_loss": 0.23832102119922638, + "eval_main_runtime": 6.3531, + "eval_main_samples_per_second": 29.907, + "eval_main_steps_per_second": 3.778, + "step": 16800 + }, + { + "epoch": 6.991260923845194, + "eval_anatomy_loss": 2.8318288326263428, + "eval_anatomy_runtime": 0.2673, + "eval_anatomy_samples_per_second": 7.483, + "eval_anatomy_steps_per_second": 3.741, + "step": 16800 + }, + { + "epoch": 6.991260923845194, + "eval_college_mathematics_loss": 2.0499653816223145, + "eval_college_mathematics_runtime": 0.2665, + "eval_college_mathematics_samples_per_second": 7.505, + "eval_college_mathematics_steps_per_second": 3.752, + "step": 16800 + }, + { + "epoch": 6.991260923845194, + "eval_international_law_loss": 3.06691312789917, + "eval_international_law_runtime": 0.2681, + "eval_international_law_samples_per_second": 7.46, + "eval_international_law_steps_per_second": 3.73, + "step": 16800 + }, + { + "epoch": 6.999583853516437, + "grad_norm": 0.251953125, + "learning_rate": 1.5436688836334274e-06, + "loss": 0.2314, + "step": 16820 + }, + { + "epoch": 7.007906783187682, + "grad_norm": 0.294921875, + "learning_rate": 1.5361244587093551e-06, + "loss": 0.2344, + "step": 16840 + }, + { + "epoch": 7.016229712858927, + "grad_norm": 0.2421875, + "learning_rate": 1.528590330360456e-06, + "loss": 0.2314, + "step": 16860 + }, + { + "epoch": 7.02455264253017, + "grad_norm": 0.2890625, + "learning_rate": 1.5210665790698592e-06, + "loss": 0.2334, + "step": 16880 + }, + { + "epoch": 7.032875572201415, + "grad_norm": 0.279296875, + "learning_rate": 1.513553285209838e-06, + "loss": 0.2308, + "step": 16900 + }, + { + "epoch": 7.032875572201415, + "eval_main_loss": 0.2382553517818451, + "eval_main_runtime": 6.3599, + "eval_main_samples_per_second": 29.875, + "eval_main_steps_per_second": 3.774, + "step": 16900 + }, + { + "epoch": 7.032875572201415, + "eval_anatomy_loss": 2.832185983657837, + "eval_anatomy_runtime": 0.2675, + "eval_anatomy_samples_per_second": 7.476, + "eval_anatomy_steps_per_second": 3.738, + "step": 16900 + }, + { + "epoch": 7.032875572201415, + "eval_college_mathematics_loss": 2.0515077114105225, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.51, + "eval_college_mathematics_steps_per_second": 3.755, + "step": 16900 + }, + { + "epoch": 7.032875572201415, + "eval_international_law_loss": 3.0670435428619385, + "eval_international_law_runtime": 0.2682, + "eval_international_law_samples_per_second": 7.458, + "eval_international_law_steps_per_second": 3.729, + "step": 16900 + }, + { + "epoch": 7.0411985018726595, + "grad_norm": 0.31640625, + "learning_rate": 1.5060505290409594e-06, + "loss": 0.237, + "step": 16920 + }, + { + "epoch": 7.049521431543903, + "grad_norm": 0.27734375, + "learning_rate": 1.4985583907112188e-06, + "loss": 0.2303, + "step": 16940 + }, + { + "epoch": 7.057844361215148, + "grad_norm": 0.26953125, + "learning_rate": 1.491076950255186e-06, + "loss": 0.2309, + "step": 16960 + }, + { + "epoch": 7.066167290886392, + "grad_norm": 0.255859375, + "learning_rate": 1.4836062875931534e-06, + "loss": 0.2307, + "step": 16980 + }, + { + "epoch": 7.074490220557636, + "grad_norm": 0.275390625, + "learning_rate": 1.4761464825302788e-06, + "loss": 0.2316, + "step": 17000 + }, + { + "epoch": 7.074490220557636, + "eval_main_loss": 0.2383512556552887, + "eval_main_runtime": 6.3399, + "eval_main_samples_per_second": 29.969, + "eval_main_steps_per_second": 3.786, + "step": 17000 + }, + { + "epoch": 7.074490220557636, + "eval_anatomy_loss": 2.8314664363861084, + "eval_anatomy_runtime": 0.2674, + "eval_anatomy_samples_per_second": 7.481, + "eval_anatomy_steps_per_second": 3.74, + "step": 17000 + }, + { + "epoch": 7.074490220557636, + "eval_college_mathematics_loss": 2.0510685443878174, + "eval_college_mathematics_runtime": 0.2673, + "eval_college_mathematics_samples_per_second": 7.482, + "eval_college_mathematics_steps_per_second": 3.741, + "step": 17000 + }, + { + "epoch": 7.074490220557636, + "eval_international_law_loss": 3.0641367435455322, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.504, + "eval_international_law_steps_per_second": 3.752, + "step": 17000 + }, + { + "epoch": 7.082813150228881, + "grad_norm": 0.291015625, + "learning_rate": 1.4686976147557332e-06, + "loss": 0.2291, + "step": 17020 + }, + { + "epoch": 7.091136079900124, + "grad_norm": 0.279296875, + "learning_rate": 1.461259763841853e-06, + "loss": 0.2321, + "step": 17040 + }, + { + "epoch": 7.099459009571369, + "grad_norm": 0.291015625, + "learning_rate": 1.4538330092432828e-06, + "loss": 0.2324, + "step": 17060 + }, + { + "epoch": 7.107781939242614, + "grad_norm": 0.294921875, + "learning_rate": 1.4464174302961343e-06, + "loss": 0.2357, + "step": 17080 + }, + { + "epoch": 7.116104868913857, + "grad_norm": 0.28125, + "learning_rate": 1.4390131062171378e-06, + "loss": 0.2319, + "step": 17100 + }, + { + "epoch": 7.116104868913857, + "eval_main_loss": 0.2382993996143341, + "eval_main_runtime": 6.329, + "eval_main_samples_per_second": 30.021, + "eval_main_steps_per_second": 3.792, + "step": 17100 + }, + { + "epoch": 7.116104868913857, + "eval_anatomy_loss": 2.833508014678955, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.506, + "eval_anatomy_steps_per_second": 3.753, + "step": 17100 + }, + { + "epoch": 7.116104868913857, + "eval_college_mathematics_loss": 2.0484731197357178, + "eval_college_mathematics_runtime": 0.2653, + "eval_college_mathematics_samples_per_second": 7.538, + "eval_college_mathematics_steps_per_second": 3.769, + "step": 17100 + }, + { + "epoch": 7.116104868913857, + "eval_international_law_loss": 3.0654804706573486, + "eval_international_law_runtime": 0.2664, + "eval_international_law_samples_per_second": 7.508, + "eval_international_law_steps_per_second": 3.754, + "step": 17100 + }, + { + "epoch": 7.124427798585102, + "grad_norm": 0.2890625, + "learning_rate": 1.4316201161027865e-06, + "loss": 0.227, + "step": 17120 + }, + { + "epoch": 7.1327507282563465, + "grad_norm": 0.259765625, + "learning_rate": 1.4242385389285068e-06, + "loss": 0.2312, + "step": 17140 + }, + { + "epoch": 7.14107365792759, + "grad_norm": 0.310546875, + "learning_rate": 1.416868453547802e-06, + "loss": 0.2329, + "step": 17160 + }, + { + "epoch": 7.149396587598835, + "grad_norm": 0.255859375, + "learning_rate": 1.4095099386914146e-06, + "loss": 0.2376, + "step": 17180 + }, + { + "epoch": 7.157719517270079, + "grad_norm": 0.294921875, + "learning_rate": 1.402163072966488e-06, + "loss": 0.2379, + "step": 17200 + }, + { + "epoch": 7.157719517270079, + "eval_main_loss": 0.2382867932319641, + "eval_main_runtime": 6.3286, + "eval_main_samples_per_second": 30.022, + "eval_main_steps_per_second": 3.792, + "step": 17200 + }, + { + "epoch": 7.157719517270079, + "eval_anatomy_loss": 2.8338112831115723, + "eval_anatomy_runtime": 0.267, + "eval_anatomy_samples_per_second": 7.491, + "eval_anatomy_steps_per_second": 3.746, + "step": 17200 + }, + { + "epoch": 7.157719517270079, + "eval_college_mathematics_loss": 2.050143241882324, + "eval_college_mathematics_runtime": 0.2655, + "eval_college_mathematics_samples_per_second": 7.532, + "eval_college_mathematics_steps_per_second": 3.766, + "step": 17200 + }, + { + "epoch": 7.157719517270079, + "eval_international_law_loss": 3.067023754119873, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.506, + "eval_international_law_steps_per_second": 3.753, + "step": 17200 + }, + { + "epoch": 7.166042446941323, + "grad_norm": 0.29296875, + "learning_rate": 1.3948279348557225e-06, + "loss": 0.234, + "step": 17220 + }, + { + "epoch": 7.174365376612568, + "grad_norm": 0.244140625, + "learning_rate": 1.3875046027165376e-06, + "loss": 0.2318, + "step": 17240 + }, + { + "epoch": 7.182688306283812, + "grad_norm": 0.294921875, + "learning_rate": 1.380193154780236e-06, + "loss": 0.2343, + "step": 17260 + }, + { + "epoch": 7.191011235955056, + "grad_norm": 0.2421875, + "learning_rate": 1.3728936691511704e-06, + "loss": 0.2304, + "step": 17280 + }, + { + "epoch": 7.199334165626301, + "grad_norm": 0.271484375, + "learning_rate": 1.3656062238059035e-06, + "loss": 0.2359, + "step": 17300 + }, + { + "epoch": 7.199334165626301, + "eval_main_loss": 0.23832285404205322, + "eval_main_runtime": 6.3287, + "eval_main_samples_per_second": 30.022, + "eval_main_steps_per_second": 3.792, + "step": 17300 + }, + { + "epoch": 7.199334165626301, + "eval_anatomy_loss": 2.833261251449585, + "eval_anatomy_runtime": 0.2661, + "eval_anatomy_samples_per_second": 7.515, + "eval_anatomy_steps_per_second": 3.757, + "step": 17300 + }, + { + "epoch": 7.199334165626301, + "eval_college_mathematics_loss": 2.0532398223876953, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.497, + "eval_college_mathematics_steps_per_second": 3.748, + "step": 17300 + }, + { + "epoch": 7.199334165626301, + "eval_international_law_loss": 3.065138816833496, + "eval_international_law_runtime": 0.2663, + "eval_international_law_samples_per_second": 7.51, + "eval_international_law_steps_per_second": 3.755, + "step": 17300 + }, + { + "epoch": 7.207657095297545, + "grad_norm": 0.251953125, + "learning_rate": 1.3583308965923778e-06, + "loss": 0.2289, + "step": 17320 + }, + { + "epoch": 7.215980024968789, + "grad_norm": 0.26171875, + "learning_rate": 1.3510677652290882e-06, + "loss": 0.2337, + "step": 17340 + }, + { + "epoch": 7.2243029546400335, + "grad_norm": 0.2314453125, + "learning_rate": 1.3438169073042415e-06, + "loss": 0.2328, + "step": 17360 + }, + { + "epoch": 7.232625884311277, + "grad_norm": 0.275390625, + "learning_rate": 1.3365784002749393e-06, + "loss": 0.2327, + "step": 17380 + }, + { + "epoch": 7.240948813982522, + "grad_norm": 0.2734375, + "learning_rate": 1.3293523214663428e-06, + "loss": 0.2336, + "step": 17400 + }, + { + "epoch": 7.240948813982522, + "eval_main_loss": 0.23834027349948883, + "eval_main_runtime": 6.3246, + "eval_main_samples_per_second": 30.041, + "eval_main_steps_per_second": 3.795, + "step": 17400 + }, + { + "epoch": 7.240948813982522, + "eval_anatomy_loss": 2.8343632221221924, + "eval_anatomy_runtime": 0.2656, + "eval_anatomy_samples_per_second": 7.53, + "eval_anatomy_steps_per_second": 3.765, + "step": 17400 + }, + { + "epoch": 7.240948813982522, + "eval_college_mathematics_loss": 2.052730083465576, + "eval_college_mathematics_runtime": 0.2666, + "eval_college_mathematics_samples_per_second": 7.503, + "eval_college_mathematics_steps_per_second": 3.752, + "step": 17400 + }, + { + "epoch": 7.240948813982522, + "eval_international_law_loss": 3.066589117050171, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.506, + "eval_international_law_steps_per_second": 3.753, + "step": 17400 + }, + { + "epoch": 7.249271743653766, + "grad_norm": 0.25390625, + "learning_rate": 1.3221387480708475e-06, + "loss": 0.2331, + "step": 17420 + }, + { + "epoch": 7.25759467332501, + "grad_norm": 0.271484375, + "learning_rate": 1.3149377571472655e-06, + "loss": 0.2362, + "step": 17440 + }, + { + "epoch": 7.265917602996255, + "grad_norm": 0.248046875, + "learning_rate": 1.3077494256199892e-06, + "loss": 0.2297, + "step": 17460 + }, + { + "epoch": 7.274240532667499, + "grad_norm": 0.29296875, + "learning_rate": 1.3005738302781839e-06, + "loss": 0.2319, + "step": 17480 + }, + { + "epoch": 7.282563462338743, + "grad_norm": 0.29296875, + "learning_rate": 1.2934110477749584e-06, + "loss": 0.2295, + "step": 17500 + }, + { + "epoch": 7.282563462338743, + "eval_main_loss": 0.23828759789466858, + "eval_main_runtime": 6.3264, + "eval_main_samples_per_second": 30.033, + "eval_main_steps_per_second": 3.794, + "step": 17500 + }, + { + "epoch": 7.282563462338743, + "eval_anatomy_loss": 2.8331925868988037, + "eval_anatomy_runtime": 0.2662, + "eval_anatomy_samples_per_second": 7.513, + "eval_anatomy_steps_per_second": 3.756, + "step": 17500 + }, + { + "epoch": 7.282563462338743, + "eval_college_mathematics_loss": 2.0508506298065186, + "eval_college_mathematics_runtime": 0.2652, + "eval_college_mathematics_samples_per_second": 7.542, + "eval_college_mathematics_steps_per_second": 3.771, + "step": 17500 + }, + { + "epoch": 7.282563462338743, + "eval_international_law_loss": 3.0639867782592773, + "eval_international_law_runtime": 0.2656, + "eval_international_law_samples_per_second": 7.532, + "eval_international_law_steps_per_second": 3.766, + "step": 17500 + }, + { + "epoch": 7.290886392009988, + "grad_norm": 0.263671875, + "learning_rate": 1.2862611546265469e-06, + "loss": 0.2287, + "step": 17520 + }, + { + "epoch": 7.299209321681232, + "grad_norm": 0.27734375, + "learning_rate": 1.279124227211498e-06, + "loss": 0.2313, + "step": 17540 + }, + { + "epoch": 7.307532251352476, + "grad_norm": 0.2890625, + "learning_rate": 1.2720003417698506e-06, + "loss": 0.2324, + "step": 17560 + }, + { + "epoch": 7.3158551810237205, + "grad_norm": 0.298828125, + "learning_rate": 1.2648895744023223e-06, + "loss": 0.2323, + "step": 17580 + }, + { + "epoch": 7.324178110694964, + "grad_norm": 0.318359375, + "learning_rate": 1.2577920010695015e-06, + "loss": 0.233, + "step": 17600 + }, + { + "epoch": 7.324178110694964, + "eval_main_loss": 0.2383326143026352, + "eval_main_runtime": 6.331, + "eval_main_samples_per_second": 30.011, + "eval_main_steps_per_second": 3.791, + "step": 17600 + }, + { + "epoch": 7.324178110694964, + "eval_anatomy_loss": 2.833402633666992, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.508, + "eval_anatomy_steps_per_second": 3.754, + "step": 17600 + }, + { + "epoch": 7.324178110694964, + "eval_college_mathematics_loss": 2.052766799926758, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.511, + "eval_college_mathematics_steps_per_second": 3.755, + "step": 17600 + }, + { + "epoch": 7.324178110694964, + "eval_international_law_loss": 3.0673341751098633, + "eval_international_law_runtime": 0.2663, + "eval_international_law_samples_per_second": 7.511, + "eval_international_law_steps_per_second": 3.756, + "step": 17600 + }, + { + "epoch": 7.332501040366209, + "grad_norm": 0.279296875, + "learning_rate": 1.2507076975910276e-06, + "loss": 0.2342, + "step": 17620 + }, + { + "epoch": 7.340823970037453, + "grad_norm": 0.322265625, + "learning_rate": 1.243636739644787e-06, + "loss": 0.2301, + "step": 17640 + }, + { + "epoch": 7.349146899708697, + "grad_norm": 0.28515625, + "learning_rate": 1.236579202766102e-06, + "loss": 0.2286, + "step": 17660 + }, + { + "epoch": 7.357469829379942, + "grad_norm": 0.275390625, + "learning_rate": 1.229535162346924e-06, + "loss": 0.2294, + "step": 17680 + }, + { + "epoch": 7.365792759051186, + "grad_norm": 0.35546875, + "learning_rate": 1.2225046936350324e-06, + "loss": 0.2349, + "step": 17700 + }, + { + "epoch": 7.365792759051186, + "eval_main_loss": 0.2383442372083664, + "eval_main_runtime": 6.3529, + "eval_main_samples_per_second": 29.908, + "eval_main_steps_per_second": 3.778, + "step": 17700 + }, + { + "epoch": 7.365792759051186, + "eval_anatomy_loss": 2.8332571983337402, + "eval_anatomy_runtime": 0.2669, + "eval_anatomy_samples_per_second": 7.493, + "eval_anatomy_steps_per_second": 3.746, + "step": 17700 + }, + { + "epoch": 7.365792759051186, + "eval_college_mathematics_loss": 2.0507707595825195, + "eval_college_mathematics_runtime": 0.267, + "eval_college_mathematics_samples_per_second": 7.492, + "eval_college_mathematics_steps_per_second": 3.746, + "step": 17700 + }, + { + "epoch": 7.365792759051186, + "eval_international_law_loss": 3.0661237239837646, + "eval_international_law_runtime": 0.2671, + "eval_international_law_samples_per_second": 7.488, + "eval_international_law_steps_per_second": 3.744, + "step": 17700 + }, + { + "epoch": 7.37411568872243, + "grad_norm": 0.2890625, + "learning_rate": 1.2154878717332235e-06, + "loss": 0.2334, + "step": 17720 + }, + { + "epoch": 7.3824386183936745, + "grad_norm": 0.25, + "learning_rate": 1.208484771598513e-06, + "loss": 0.2292, + "step": 17740 + }, + { + "epoch": 7.390761548064919, + "grad_norm": 0.29296875, + "learning_rate": 1.2014954680413334e-06, + "loss": 0.2273, + "step": 17760 + }, + { + "epoch": 7.399084477736163, + "grad_norm": 0.279296875, + "learning_rate": 1.1945200357247386e-06, + "loss": 0.2305, + "step": 17780 + }, + { + "epoch": 7.407407407407407, + "grad_norm": 0.271484375, + "learning_rate": 1.1875585491636e-06, + "loss": 0.2317, + "step": 17800 + }, + { + "epoch": 7.407407407407407, + "eval_main_loss": 0.23827992379665375, + "eval_main_runtime": 6.3476, + "eval_main_samples_per_second": 29.932, + "eval_main_steps_per_second": 3.781, + "step": 17800 + }, + { + "epoch": 7.407407407407407, + "eval_anatomy_loss": 2.832733631134033, + "eval_anatomy_runtime": 0.2672, + "eval_anatomy_samples_per_second": 7.484, + "eval_anatomy_steps_per_second": 3.742, + "step": 17800 + }, + { + "epoch": 7.407407407407407, + "eval_college_mathematics_loss": 2.052572250366211, + "eval_college_mathematics_runtime": 0.2671, + "eval_college_mathematics_samples_per_second": 7.489, + "eval_college_mathematics_steps_per_second": 3.744, + "step": 17800 + }, + { + "epoch": 7.407407407407407, + "eval_international_law_loss": 3.064929723739624, + "eval_international_law_runtime": 0.2672, + "eval_international_law_samples_per_second": 7.484, + "eval_international_law_steps_per_second": 3.742, + "step": 17800 + }, + { + "epoch": 7.415730337078652, + "grad_norm": 0.27734375, + "learning_rate": 1.180611082723814e-06, + "loss": 0.2316, + "step": 17820 + }, + { + "epoch": 7.424053266749896, + "grad_norm": 0.28515625, + "learning_rate": 1.1736777106215118e-06, + "loss": 0.2336, + "step": 17840 + }, + { + "epoch": 7.43237619642114, + "grad_norm": 0.255859375, + "learning_rate": 1.1667585069222554e-06, + "loss": 0.2324, + "step": 17860 + }, + { + "epoch": 7.440699126092385, + "grad_norm": 0.271484375, + "learning_rate": 1.1598535455402584e-06, + "loss": 0.2301, + "step": 17880 + }, + { + "epoch": 7.449022055763629, + "grad_norm": 0.30859375, + "learning_rate": 1.15296290023759e-06, + "loss": 0.2282, + "step": 17900 + }, + { + "epoch": 7.449022055763629, + "eval_main_loss": 0.2383432686328888, + "eval_main_runtime": 6.356, + "eval_main_samples_per_second": 29.893, + "eval_main_steps_per_second": 3.776, + "step": 17900 + }, + { + "epoch": 7.449022055763629, + "eval_anatomy_loss": 2.833138942718506, + "eval_anatomy_runtime": 0.2678, + "eval_anatomy_samples_per_second": 7.468, + "eval_anatomy_steps_per_second": 3.734, + "step": 17900 + }, + { + "epoch": 7.449022055763629, + "eval_college_mathematics_loss": 2.052255868911743, + "eval_college_mathematics_runtime": 0.2683, + "eval_college_mathematics_samples_per_second": 7.454, + "eval_college_mathematics_steps_per_second": 3.727, + "step": 17900 + }, + { + "epoch": 7.449022055763629, + "eval_international_law_loss": 3.065342664718628, + "eval_international_law_runtime": 0.266, + "eval_international_law_samples_per_second": 7.519, + "eval_international_law_steps_per_second": 3.76, + "step": 17900 + }, + { + "epoch": 7.457344985434873, + "grad_norm": 0.283203125, + "learning_rate": 1.1460866446233857e-06, + "loss": 0.2326, + "step": 17920 + }, + { + "epoch": 7.465667915106117, + "grad_norm": 0.26953125, + "learning_rate": 1.1392248521530692e-06, + "loss": 0.2321, + "step": 17940 + }, + { + "epoch": 7.4739908447773615, + "grad_norm": 0.3125, + "learning_rate": 1.132377596127554e-06, + "loss": 0.2294, + "step": 17960 + }, + { + "epoch": 7.482313774448606, + "grad_norm": 0.283203125, + "learning_rate": 1.1255449496924767e-06, + "loss": 0.2333, + "step": 17980 + }, + { + "epoch": 7.49063670411985, + "grad_norm": 0.267578125, + "learning_rate": 1.1187269858374017e-06, + "loss": 0.2293, + "step": 18000 + }, + { + "epoch": 7.49063670411985, + "eval_main_loss": 0.2383362054824829, + "eval_main_runtime": 6.3387, + "eval_main_samples_per_second": 29.974, + "eval_main_steps_per_second": 3.786, + "step": 18000 + }, + { + "epoch": 7.49063670411985, + "eval_anatomy_loss": 2.832285165786743, + "eval_anatomy_runtime": 0.267, + "eval_anatomy_samples_per_second": 7.492, + "eval_anatomy_steps_per_second": 3.746, + "step": 18000 + }, + { + "epoch": 7.49063670411985, + "eval_college_mathematics_loss": 2.0504801273345947, + "eval_college_mathematics_runtime": 0.2653, + "eval_college_mathematics_samples_per_second": 7.54, + "eval_college_mathematics_steps_per_second": 3.77, + "step": 18000 + }, + { + "epoch": 7.49063670411985, + "eval_international_law_loss": 3.0625662803649902, + "eval_international_law_runtime": 0.2658, + "eval_international_law_samples_per_second": 7.524, + "eval_international_law_steps_per_second": 3.762, + "step": 18000 + }, + { + "epoch": 7.498959633791094, + "grad_norm": 0.279296875, + "learning_rate": 1.1119237773950484e-06, + "loss": 0.2286, + "step": 18020 + }, + { + "epoch": 7.507282563462339, + "grad_norm": 0.2431640625, + "learning_rate": 1.1051353970405142e-06, + "loss": 0.2315, + "step": 18040 + }, + { + "epoch": 7.515605493133583, + "grad_norm": 0.341796875, + "learning_rate": 1.0983619172904935e-06, + "loss": 0.2284, + "step": 18060 + }, + { + "epoch": 7.523928422804827, + "grad_norm": 0.3046875, + "learning_rate": 1.0916034105025052e-06, + "loss": 0.2282, + "step": 18080 + }, + { + "epoch": 7.532251352476072, + "grad_norm": 0.248046875, + "learning_rate": 1.0848599488741208e-06, + "loss": 0.2277, + "step": 18100 + }, + { + "epoch": 7.532251352476072, + "eval_main_loss": 0.2383170872926712, + "eval_main_runtime": 6.3286, + "eval_main_samples_per_second": 30.023, + "eval_main_steps_per_second": 3.792, + "step": 18100 + }, + { + "epoch": 7.532251352476072, + "eval_anatomy_loss": 2.834378480911255, + "eval_anatomy_runtime": 0.2677, + "eval_anatomy_samples_per_second": 7.472, + "eval_anatomy_steps_per_second": 3.736, + "step": 18100 + }, + { + "epoch": 7.532251352476072, + "eval_college_mathematics_loss": 2.0512096881866455, + "eval_college_mathematics_runtime": 0.2666, + "eval_college_mathematics_samples_per_second": 7.501, + "eval_college_mathematics_steps_per_second": 3.75, + "step": 18100 + }, + { + "epoch": 7.532251352476072, + "eval_international_law_loss": 3.066354513168335, + "eval_international_law_runtime": 0.2663, + "eval_international_law_samples_per_second": 7.51, + "eval_international_law_steps_per_second": 3.755, + "step": 18100 + }, + { + "epoch": 7.540574282147316, + "grad_norm": 0.3203125, + "learning_rate": 1.078131604442193e-06, + "loss": 0.2295, + "step": 18120 + }, + { + "epoch": 7.54889721181856, + "grad_norm": 0.310546875, + "learning_rate": 1.0714184490820842e-06, + "loss": 0.2296, + "step": 18140 + }, + { + "epoch": 7.557220141489804, + "grad_norm": 0.2890625, + "learning_rate": 1.0647205545068992e-06, + "loss": 0.2311, + "step": 18160 + }, + { + "epoch": 7.5655430711610485, + "grad_norm": 0.259765625, + "learning_rate": 1.0580379922667241e-06, + "loss": 0.2333, + "step": 18180 + }, + { + "epoch": 7.573866000832293, + "grad_norm": 0.3046875, + "learning_rate": 1.0513708337478509e-06, + "loss": 0.2304, + "step": 18200 + }, + { + "epoch": 7.573866000832293, + "eval_main_loss": 0.2383020520210266, + "eval_main_runtime": 6.3287, + "eval_main_samples_per_second": 30.022, + "eval_main_steps_per_second": 3.792, + "step": 18200 + }, + { + "epoch": 7.573866000832293, + "eval_anatomy_loss": 2.834932804107666, + "eval_anatomy_runtime": 0.2668, + "eval_anatomy_samples_per_second": 7.496, + "eval_anatomy_steps_per_second": 3.748, + "step": 18200 + }, + { + "epoch": 7.573866000832293, + "eval_college_mathematics_loss": 2.0513229370117188, + "eval_college_mathematics_runtime": 0.2658, + "eval_college_mathematics_samples_per_second": 7.525, + "eval_college_mathematics_steps_per_second": 3.762, + "step": 18200 + }, + { + "epoch": 7.573866000832293, + "eval_international_law_loss": 3.0660512447357178, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.504, + "eval_international_law_steps_per_second": 3.752, + "step": 18200 + }, + { + "epoch": 7.582188930503538, + "grad_norm": 0.279296875, + "learning_rate": 1.044719150172028e-06, + "loss": 0.2259, + "step": 18220 + }, + { + "epoch": 7.590511860174781, + "grad_norm": 0.3125, + "learning_rate": 1.0380830125956897e-06, + "loss": 0.2273, + "step": 18240 + }, + { + "epoch": 7.598834789846026, + "grad_norm": 0.3515625, + "learning_rate": 1.0314624919092011e-06, + "loss": 0.2365, + "step": 18260 + }, + { + "epoch": 7.60715771951727, + "grad_norm": 0.322265625, + "learning_rate": 1.024857658836102e-06, + "loss": 0.2327, + "step": 18280 + }, + { + "epoch": 7.615480649188514, + "grad_norm": 0.294921875, + "learning_rate": 1.0182685839323475e-06, + "loss": 0.232, + "step": 18300 + }, + { + "epoch": 7.615480649188514, + "eval_main_loss": 0.23833827674388885, + "eval_main_runtime": 6.3315, + "eval_main_samples_per_second": 30.009, + "eval_main_steps_per_second": 3.791, + "step": 18300 + }, + { + "epoch": 7.615480649188514, + "eval_anatomy_loss": 2.8317782878875732, + "eval_anatomy_runtime": 0.2681, + "eval_anatomy_samples_per_second": 7.46, + "eval_anatomy_steps_per_second": 3.73, + "step": 18300 + }, + { + "epoch": 7.615480649188514, + "eval_college_mathematics_loss": 2.0530128479003906, + "eval_college_mathematics_runtime": 0.2671, + "eval_college_mathematics_samples_per_second": 7.489, + "eval_college_mathematics_steps_per_second": 3.744, + "step": 18300 + }, + { + "epoch": 7.615480649188514, + "eval_international_law_loss": 3.064087152481079, + "eval_international_law_runtime": 0.2649, + "eval_international_law_samples_per_second": 7.551, + "eval_international_law_steps_per_second": 3.776, + "step": 18300 + }, + { + "epoch": 7.623803578859759, + "grad_norm": 0.2734375, + "learning_rate": 1.0116953375855565e-06, + "loss": 0.2331, + "step": 18320 + }, + { + "epoch": 7.632126508531003, + "grad_norm": 0.2138671875, + "learning_rate": 1.0051379900142635e-06, + "loss": 0.2336, + "step": 18340 + }, + { + "epoch": 7.640449438202247, + "grad_norm": 0.298828125, + "learning_rate": 9.98596611267158e-07, + "loss": 0.2308, + "step": 18360 + }, + { + "epoch": 7.648772367873492, + "grad_norm": 0.267578125, + "learning_rate": 9.920712712223494e-07, + "loss": 0.2258, + "step": 18380 + }, + { + "epoch": 7.6570952975447355, + "grad_norm": 0.28125, + "learning_rate": 9.855620395866107e-07, + "loss": 0.2336, + "step": 18400 + }, + { + "epoch": 7.6570952975447355, + "eval_main_loss": 0.23823559284210205, + "eval_main_runtime": 6.3201, + "eval_main_samples_per_second": 30.063, + "eval_main_steps_per_second": 3.797, + "step": 18400 + }, + { + "epoch": 7.6570952975447355, + "eval_anatomy_loss": 2.834606409072876, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.508, + "eval_anatomy_steps_per_second": 3.754, + "step": 18400 + }, + { + "epoch": 7.6570952975447355, + "eval_college_mathematics_loss": 2.0500612258911133, + "eval_college_mathematics_runtime": 0.2665, + "eval_college_mathematics_samples_per_second": 7.506, + "eval_college_mathematics_steps_per_second": 3.753, + "step": 18400 + }, + { + "epoch": 7.6570952975447355, + "eval_international_law_loss": 3.0666239261627197, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.515, + "eval_international_law_steps_per_second": 3.757, + "step": 18400 + }, + { + "epoch": 7.66541822721598, + "grad_norm": 0.26953125, + "learning_rate": 9.790689858946374e-07, + "loss": 0.2301, + "step": 18420 + }, + { + "epoch": 7.673741156887225, + "grad_norm": 0.2490234375, + "learning_rate": 9.725921795083063e-07, + "loss": 0.2327, + "step": 18440 + }, + { + "epoch": 7.682064086558468, + "grad_norm": 0.2578125, + "learning_rate": 9.661316896159313e-07, + "loss": 0.2304, + "step": 18460 + }, + { + "epoch": 7.690387016229713, + "grad_norm": 0.263671875, + "learning_rate": 9.59687585231526e-07, + "loss": 0.2282, + "step": 18480 + }, + { + "epoch": 7.698709945900957, + "grad_norm": 0.287109375, + "learning_rate": 9.53259935194066e-07, + "loss": 0.2313, + "step": 18500 + }, + { + "epoch": 7.698709945900957, + "eval_main_loss": 0.23836714029312134, + "eval_main_runtime": 6.351, + "eval_main_samples_per_second": 29.917, + "eval_main_steps_per_second": 3.779, + "step": 18500 + }, + { + "epoch": 7.698709945900957, + "eval_anatomy_loss": 2.8336453437805176, + "eval_anatomy_runtime": 0.2666, + "eval_anatomy_samples_per_second": 7.502, + "eval_anatomy_steps_per_second": 3.751, + "step": 18500 + }, + { + "epoch": 7.698709945900957, + "eval_college_mathematics_loss": 2.0492594242095947, + "eval_college_mathematics_runtime": 0.2688, + "eval_college_mathematics_samples_per_second": 7.442, + "eval_college_mathematics_steps_per_second": 3.721, + "step": 18500 + }, + { + "epoch": 7.698709945900957, + "eval_international_law_loss": 3.0663108825683594, + "eval_international_law_runtime": 0.267, + "eval_international_law_samples_per_second": 7.489, + "eval_international_law_steps_per_second": 3.745, + "step": 18500 + }, + { + "epoch": 7.707032875572201, + "grad_norm": 0.302734375, + "learning_rate": 9.468488081667537e-07, + "loss": 0.2357, + "step": 18520 + }, + { + "epoch": 7.715355805243446, + "grad_norm": 0.3515625, + "learning_rate": 9.404542726362872e-07, + "loss": 0.231, + "step": 18540 + }, + { + "epoch": 7.72367873491469, + "grad_norm": 0.30078125, + "learning_rate": 9.34076396912123e-07, + "loss": 0.234, + "step": 18560 + }, + { + "epoch": 7.732001664585934, + "grad_norm": 0.2412109375, + "learning_rate": 9.277152491257515e-07, + "loss": 0.2341, + "step": 18580 + }, + { + "epoch": 7.740324594257179, + "grad_norm": 0.296875, + "learning_rate": 9.213708972299662e-07, + "loss": 0.2339, + "step": 18600 + }, + { + "epoch": 7.740324594257179, + "eval_main_loss": 0.23831741511821747, + "eval_main_runtime": 6.3514, + "eval_main_samples_per_second": 29.915, + "eval_main_steps_per_second": 3.779, + "step": 18600 + }, + { + "epoch": 7.740324594257179, + "eval_anatomy_loss": 2.8322861194610596, + "eval_anatomy_runtime": 0.2674, + "eval_anatomy_samples_per_second": 7.479, + "eval_anatomy_steps_per_second": 3.74, + "step": 18600 + }, + { + "epoch": 7.740324594257179, + "eval_college_mathematics_loss": 2.050147771835327, + "eval_college_mathematics_runtime": 0.267, + "eval_college_mathematics_samples_per_second": 7.49, + "eval_college_mathematics_steps_per_second": 3.745, + "step": 18600 + }, + { + "epoch": 7.740324594257179, + "eval_international_law_loss": 3.066422700881958, + "eval_international_law_runtime": 0.2662, + "eval_international_law_samples_per_second": 7.512, + "eval_international_law_steps_per_second": 3.756, + "step": 18600 + }, + { + "epoch": 7.7486475239284225, + "grad_norm": 0.271484375, + "learning_rate": 9.150434089981413e-07, + "loss": 0.234, + "step": 18620 + }, + { + "epoch": 7.756970453599667, + "grad_norm": 0.2578125, + "learning_rate": 9.087328520235028e-07, + "loss": 0.2348, + "step": 18640 + }, + { + "epoch": 7.765293383270912, + "grad_norm": 0.275390625, + "learning_rate": 9.024392937184096e-07, + "loss": 0.2315, + "step": 18660 + }, + { + "epoch": 7.773616312942155, + "grad_norm": 0.26953125, + "learning_rate": 8.961628013136351e-07, + "loss": 0.2295, + "step": 18680 + }, + { + "epoch": 7.7819392426134, + "grad_norm": 0.259765625, + "learning_rate": 8.899034418576413e-07, + "loss": 0.2329, + "step": 18700 + }, + { + "epoch": 7.7819392426134, + "eval_main_loss": 0.2382706105709076, + "eval_main_runtime": 6.3343, + "eval_main_samples_per_second": 29.996, + "eval_main_steps_per_second": 3.789, + "step": 18700 + }, + { + "epoch": 7.7819392426134, + "eval_anatomy_loss": 2.832164764404297, + "eval_anatomy_runtime": 0.2662, + "eval_anatomy_samples_per_second": 7.514, + "eval_anatomy_steps_per_second": 3.757, + "step": 18700 + }, + { + "epoch": 7.7819392426134, + "eval_college_mathematics_loss": 2.0492961406707764, + "eval_college_mathematics_runtime": 0.267, + "eval_college_mathematics_samples_per_second": 7.491, + "eval_college_mathematics_steps_per_second": 3.745, + "step": 18700 + }, + { + "epoch": 7.7819392426134, + "eval_international_law_loss": 3.065026044845581, + "eval_international_law_runtime": 0.2654, + "eval_international_law_samples_per_second": 7.537, + "eval_international_law_steps_per_second": 3.768, + "step": 18700 + }, + { + "epoch": 7.790262172284645, + "grad_norm": 0.25, + "learning_rate": 8.836612822158743e-07, + "loss": 0.2279, + "step": 18720 + }, + { + "epoch": 7.798585101955888, + "grad_norm": 0.318359375, + "learning_rate": 8.774363890700394e-07, + "loss": 0.2337, + "step": 18740 + }, + { + "epoch": 7.806908031627133, + "grad_norm": 0.234375, + "learning_rate": 8.712288289173937e-07, + "loss": 0.2348, + "step": 18760 + }, + { + "epoch": 7.8152309612983775, + "grad_norm": 0.263671875, + "learning_rate": 8.650386680700373e-07, + "loss": 0.2285, + "step": 18780 + }, + { + "epoch": 7.823553890969621, + "grad_norm": 0.26171875, + "learning_rate": 8.588659726541998e-07, + "loss": 0.2324, + "step": 18800 + }, + { + "epoch": 7.823553890969621, + "eval_main_loss": 0.23833107948303223, + "eval_main_runtime": 6.3262, + "eval_main_samples_per_second": 30.034, + "eval_main_steps_per_second": 3.794, + "step": 18800 + }, + { + "epoch": 7.823553890969621, + "eval_anatomy_loss": 2.8310635089874268, + "eval_anatomy_runtime": 0.2659, + "eval_anatomy_samples_per_second": 7.522, + "eval_anatomy_steps_per_second": 3.761, + "step": 18800 + }, + { + "epoch": 7.823553890969621, + "eval_college_mathematics_loss": 2.0509285926818848, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.496, + "eval_college_mathematics_steps_per_second": 3.748, + "step": 18800 + }, + { + "epoch": 7.823553890969621, + "eval_international_law_loss": 3.065098285675049, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.506, + "eval_international_law_steps_per_second": 3.753, + "step": 18800 + }, + { + "epoch": 7.831876820640866, + "grad_norm": 0.306640625, + "learning_rate": 8.527108086095375e-07, + "loss": 0.232, + "step": 18820 + }, + { + "epoch": 7.8401997503121095, + "grad_norm": 0.2890625, + "learning_rate": 8.465732416884312e-07, + "loss": 0.2316, + "step": 18840 + }, + { + "epoch": 7.848522679983354, + "grad_norm": 0.259765625, + "learning_rate": 8.404533374552751e-07, + "loss": 0.233, + "step": 18860 + }, + { + "epoch": 7.856845609654599, + "grad_norm": 0.314453125, + "learning_rate": 8.343511612857879e-07, + "loss": 0.2333, + "step": 18880 + }, + { + "epoch": 7.865168539325842, + "grad_norm": 0.298828125, + "learning_rate": 8.282667783663056e-07, + "loss": 0.235, + "step": 18900 + }, + { + "epoch": 7.865168539325842, + "eval_main_loss": 0.23828402161598206, + "eval_main_runtime": 6.3357, + "eval_main_samples_per_second": 29.989, + "eval_main_steps_per_second": 3.788, + "step": 18900 + }, + { + "epoch": 7.865168539325842, + "eval_anatomy_loss": 2.832879066467285, + "eval_anatomy_runtime": 0.2655, + "eval_anatomy_samples_per_second": 7.532, + "eval_anatomy_steps_per_second": 3.766, + "step": 18900 + }, + { + "epoch": 7.865168539325842, + "eval_college_mathematics_loss": 2.05184006690979, + "eval_college_mathematics_runtime": 0.2659, + "eval_college_mathematics_samples_per_second": 7.521, + "eval_college_mathematics_steps_per_second": 3.761, + "step": 18900 + }, + { + "epoch": 7.865168539325842, + "eval_international_law_loss": 3.065366506576538, + "eval_international_law_runtime": 0.2648, + "eval_international_law_samples_per_second": 7.553, + "eval_international_law_steps_per_second": 3.776, + "step": 18900 + }, + { + "epoch": 7.873491468997087, + "grad_norm": 0.2890625, + "learning_rate": 8.222002536930887e-07, + "loss": 0.2314, + "step": 18920 + }, + { + "epoch": 7.881814398668332, + "grad_norm": 0.26171875, + "learning_rate": 8.161516520716287e-07, + "loss": 0.2328, + "step": 18940 + }, + { + "epoch": 7.890137328339575, + "grad_norm": 0.279296875, + "learning_rate": 8.101210381159533e-07, + "loss": 0.229, + "step": 18960 + }, + { + "epoch": 7.89846025801082, + "grad_norm": 0.267578125, + "learning_rate": 8.041084762479376e-07, + "loss": 0.2307, + "step": 18980 + }, + { + "epoch": 7.9067831876820645, + "grad_norm": 0.291015625, + "learning_rate": 7.981140306966154e-07, + "loss": 0.2328, + "step": 19000 + }, + { + "epoch": 7.9067831876820645, + "eval_main_loss": 0.23827558755874634, + "eval_main_runtime": 6.3308, + "eval_main_samples_per_second": 30.012, + "eval_main_steps_per_second": 3.791, + "step": 19000 + }, + { + "epoch": 7.9067831876820645, + "eval_anatomy_loss": 2.8324079513549805, + "eval_anatomy_runtime": 0.2667, + "eval_anatomy_samples_per_second": 7.5, + "eval_anatomy_steps_per_second": 3.75, + "step": 19000 + }, + { + "epoch": 7.9067831876820645, + "eval_college_mathematics_loss": 2.050251007080078, + "eval_college_mathematics_runtime": 0.266, + "eval_college_mathematics_samples_per_second": 7.519, + "eval_college_mathematics_steps_per_second": 3.76, + "step": 19000 + }, + { + "epoch": 7.9067831876820645, + "eval_international_law_loss": 3.0664069652557373, + "eval_international_law_runtime": 0.2674, + "eval_international_law_samples_per_second": 7.478, + "eval_international_law_steps_per_second": 3.739, + "step": 19000 + }, + { + "epoch": 7.915106117353308, + "grad_norm": 0.306640625, + "learning_rate": 7.921377654974955e-07, + "loss": 0.2349, + "step": 19020 + }, + { + "epoch": 7.923429047024553, + "grad_norm": 0.30859375, + "learning_rate": 7.861797444918731e-07, + "loss": 0.2338, + "step": 19040 + }, + { + "epoch": 7.9317519766957965, + "grad_norm": 0.265625, + "learning_rate": 7.802400313261505e-07, + "loss": 0.2378, + "step": 19060 + }, + { + "epoch": 7.940074906367041, + "grad_norm": 0.302734375, + "learning_rate": 7.743186894511603e-07, + "loss": 0.2277, + "step": 19080 + }, + { + "epoch": 7.948397836038286, + "grad_norm": 0.318359375, + "learning_rate": 7.684157821214783e-07, + "loss": 0.2305, + "step": 19100 + }, + { + "epoch": 7.948397836038286, + "eval_main_loss": 0.23835448920726776, + "eval_main_runtime": 6.3314, + "eval_main_samples_per_second": 30.009, + "eval_main_steps_per_second": 3.791, + "step": 19100 + }, + { + "epoch": 7.948397836038286, + "eval_anatomy_loss": 2.8330698013305664, + "eval_anatomy_runtime": 0.2647, + "eval_anatomy_samples_per_second": 7.555, + "eval_anatomy_steps_per_second": 3.777, + "step": 19100 + }, + { + "epoch": 7.948397836038286, + "eval_college_mathematics_loss": 2.0542187690734863, + "eval_college_mathematics_runtime": 0.2653, + "eval_college_mathematics_samples_per_second": 7.537, + "eval_college_mathematics_steps_per_second": 3.769, + "step": 19100 + }, + { + "epoch": 7.948397836038286, + "eval_international_law_loss": 3.0674057006835938, + "eval_international_law_runtime": 0.2667, + "eval_international_law_samples_per_second": 7.499, + "eval_international_law_steps_per_second": 3.749, + "step": 19100 + }, + { + "epoch": 7.95672076570953, + "grad_norm": 0.2060546875, + "learning_rate": 7.625313723947592e-07, + "loss": 0.2358, + "step": 19120 + }, + { + "epoch": 7.965043695380774, + "grad_norm": 0.279296875, + "learning_rate": 7.566655231310551e-07, + "loss": 0.2339, + "step": 19140 + }, + { + "epoch": 7.9733666250520185, + "grad_norm": 0.26171875, + "learning_rate": 7.508182969921463e-07, + "loss": 0.2359, + "step": 19160 + }, + { + "epoch": 7.981689554723262, + "grad_norm": 0.333984375, + "learning_rate": 7.449897564408743e-07, + "loss": 0.2312, + "step": 19180 + }, + { + "epoch": 7.990012484394507, + "grad_norm": 0.287109375, + "learning_rate": 7.391799637404675e-07, + "loss": 0.2328, + "step": 19200 + }, + { + "epoch": 7.990012484394507, + "eval_main_loss": 0.238310769200325, + "eval_main_runtime": 6.3251, + "eval_main_samples_per_second": 30.039, + "eval_main_steps_per_second": 3.794, + "step": 19200 + }, + { + "epoch": 7.990012484394507, + "eval_anatomy_loss": 2.834007501602173, + "eval_anatomy_runtime": 0.266, + "eval_anatomy_samples_per_second": 7.518, + "eval_anatomy_steps_per_second": 3.759, + "step": 19200 + }, + { + "epoch": 7.990012484394507, + "eval_college_mathematics_loss": 2.0495834350585938, + "eval_college_mathematics_runtime": 0.2657, + "eval_college_mathematics_samples_per_second": 7.528, + "eval_college_mathematics_steps_per_second": 3.764, + "step": 19200 + }, + { + "epoch": 7.990012484394507, + "eval_international_law_loss": 3.065659523010254, + "eval_international_law_runtime": 0.2672, + "eval_international_law_samples_per_second": 7.484, + "eval_international_law_steps_per_second": 3.742, + "step": 19200 + }, + { + "epoch": 7.998335414065751, + "grad_norm": 0.232421875, + "learning_rate": 7.333889809538869e-07, + "loss": 0.2292, + "step": 19220 + }, + { + "epoch": 8.006658343736996, + "grad_norm": 0.263671875, + "learning_rate": 7.276168699431527e-07, + "loss": 0.2287, + "step": 19240 + }, + { + "epoch": 8.014981273408239, + "grad_norm": 0.333984375, + "learning_rate": 7.218636923686889e-07, + "loss": 0.2308, + "step": 19260 + }, + { + "epoch": 8.023304203079483, + "grad_norm": 0.3046875, + "learning_rate": 7.16129509688665e-07, + "loss": 0.2346, + "step": 19280 + }, + { + "epoch": 8.031627132750728, + "grad_norm": 0.263671875, + "learning_rate": 7.104143831583368e-07, + "loss": 0.2292, + "step": 19300 + }, + { + "epoch": 8.031627132750728, + "eval_main_loss": 0.2382253110408783, + "eval_main_runtime": 6.3318, + "eval_main_samples_per_second": 30.007, + "eval_main_steps_per_second": 3.79, + "step": 19300 + }, + { + "epoch": 8.031627132750728, + "eval_anatomy_loss": 2.833739757537842, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.505, + "eval_anatomy_steps_per_second": 3.752, + "step": 19300 + }, + { + "epoch": 8.031627132750728, + "eval_college_mathematics_loss": 2.053969144821167, + "eval_college_mathematics_runtime": 0.2672, + "eval_college_mathematics_samples_per_second": 7.484, + "eval_college_mathematics_steps_per_second": 3.742, + "step": 19300 + }, + { + "epoch": 8.031627132750728, + "eval_international_law_loss": 3.0664596557617188, + "eval_international_law_runtime": 0.2666, + "eval_international_law_samples_per_second": 7.503, + "eval_international_law_steps_per_second": 3.751, + "step": 19300 + }, + { + "epoch": 8.039950062421973, + "grad_norm": 0.23046875, + "learning_rate": 7.047183738293933e-07, + "loss": 0.2291, + "step": 19320 + }, + { + "epoch": 8.048272992093217, + "grad_norm": 0.294921875, + "learning_rate": 6.990415425493039e-07, + "loss": 0.2317, + "step": 19340 + }, + { + "epoch": 8.056595921764462, + "grad_norm": 0.28125, + "learning_rate": 6.933839499606709e-07, + "loss": 0.2325, + "step": 19360 + }, + { + "epoch": 8.064918851435705, + "grad_norm": 0.26171875, + "learning_rate": 6.877456565005783e-07, + "loss": 0.2344, + "step": 19380 + }, + { + "epoch": 8.07324178110695, + "grad_norm": 0.244140625, + "learning_rate": 6.82126722399948e-07, + "loss": 0.2284, + "step": 19400 + }, + { + "epoch": 8.07324178110695, + "eval_main_loss": 0.23833835124969482, + "eval_main_runtime": 6.3288, + "eval_main_samples_per_second": 30.021, + "eval_main_steps_per_second": 3.792, + "step": 19400 + }, + { + "epoch": 8.07324178110695, + "eval_anatomy_loss": 2.831571102142334, + "eval_anatomy_runtime": 0.266, + "eval_anatomy_samples_per_second": 7.519, + "eval_anatomy_steps_per_second": 3.76, + "step": 19400 + }, + { + "epoch": 8.07324178110695, + "eval_college_mathematics_loss": 2.052187442779541, + "eval_college_mathematics_runtime": 0.2652, + "eval_college_mathematics_samples_per_second": 7.542, + "eval_college_mathematics_steps_per_second": 3.771, + "step": 19400 + }, + { + "epoch": 8.07324178110695, + "eval_international_law_loss": 3.0645389556884766, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.516, + "eval_international_law_steps_per_second": 3.758, + "step": 19400 + }, + { + "epoch": 8.081564710778194, + "grad_norm": 0.25390625, + "learning_rate": 6.765272076828961e-07, + "loss": 0.2323, + "step": 19420 + }, + { + "epoch": 8.089887640449438, + "grad_norm": 0.265625, + "learning_rate": 6.709471721660904e-07, + "loss": 0.2336, + "step": 19440 + }, + { + "epoch": 8.098210570120683, + "grad_norm": 0.294921875, + "learning_rate": 6.653866754581159e-07, + "loss": 0.2299, + "step": 19460 + }, + { + "epoch": 8.106533499791928, + "grad_norm": 0.30859375, + "learning_rate": 6.598457769588315e-07, + "loss": 0.2282, + "step": 19480 + }, + { + "epoch": 8.11485642946317, + "grad_norm": 0.23828125, + "learning_rate": 6.5432453585874e-07, + "loss": 0.2324, + "step": 19500 + }, + { + "epoch": 8.11485642946317, + "eval_main_loss": 0.23833340406417847, + "eval_main_runtime": 6.3274, + "eval_main_samples_per_second": 30.028, + "eval_main_steps_per_second": 3.793, + "step": 19500 + }, + { + "epoch": 8.11485642946317, + "eval_anatomy_loss": 2.832037925720215, + "eval_anatomy_runtime": 0.2662, + "eval_anatomy_samples_per_second": 7.514, + "eval_anatomy_steps_per_second": 3.757, + "step": 19500 + }, + { + "epoch": 8.11485642946317, + "eval_college_mathematics_loss": 2.0525174140930176, + "eval_college_mathematics_runtime": 0.2659, + "eval_college_mathematics_samples_per_second": 7.521, + "eval_college_mathematics_steps_per_second": 3.76, + "step": 19500 + }, + { + "epoch": 8.11485642946317, + "eval_international_law_loss": 3.0670900344848633, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.506, + "eval_international_law_steps_per_second": 3.753, + "step": 19500 + }, + { + "epoch": 8.123179359134415, + "grad_norm": 0.25390625, + "learning_rate": 6.488230111383553e-07, + "loss": 0.2317, + "step": 19520 + }, + { + "epoch": 8.13150228880566, + "grad_norm": 0.3125, + "learning_rate": 6.433412615675705e-07, + "loss": 0.2324, + "step": 19540 + }, + { + "epoch": 8.139825218476904, + "grad_norm": 0.265625, + "learning_rate": 6.378793457050306e-07, + "loss": 0.234, + "step": 19560 + }, + { + "epoch": 8.148148148148149, + "grad_norm": 0.2451171875, + "learning_rate": 6.324373218975105e-07, + "loss": 0.234, + "step": 19580 + }, + { + "epoch": 8.156471077819392, + "grad_norm": 0.283203125, + "learning_rate": 6.270152482792844e-07, + "loss": 0.2307, + "step": 19600 + }, + { + "epoch": 8.156471077819392, + "eval_main_loss": 0.23835553228855133, + "eval_main_runtime": 6.3341, + "eval_main_samples_per_second": 29.996, + "eval_main_steps_per_second": 3.789, + "step": 19600 + }, + { + "epoch": 8.156471077819392, + "eval_anatomy_loss": 2.833484649658203, + "eval_anatomy_runtime": 0.2668, + "eval_anatomy_samples_per_second": 7.496, + "eval_anatomy_steps_per_second": 3.748, + "step": 19600 + }, + { + "epoch": 8.156471077819392, + "eval_college_mathematics_loss": 2.054138422012329, + "eval_college_mathematics_runtime": 0.2646, + "eval_college_mathematics_samples_per_second": 7.558, + "eval_college_mathematics_steps_per_second": 3.779, + "step": 19600 + }, + { + "epoch": 8.156471077819392, + "eval_international_law_loss": 3.066187620162964, + "eval_international_law_runtime": 0.2674, + "eval_international_law_samples_per_second": 7.478, + "eval_international_law_steps_per_second": 3.739, + "step": 19600 + }, + { + "epoch": 8.164794007490636, + "grad_norm": 0.30859375, + "learning_rate": 6.21613182771513e-07, + "loss": 0.2277, + "step": 19620 + }, + { + "epoch": 8.17311693716188, + "grad_norm": 0.26953125, + "learning_rate": 6.162311830816187e-07, + "loss": 0.2276, + "step": 19640 + }, + { + "epoch": 8.181439866833125, + "grad_norm": 0.31640625, + "learning_rate": 6.108693067026713e-07, + "loss": 0.2329, + "step": 19660 + }, + { + "epoch": 8.18976279650437, + "grad_norm": 0.291015625, + "learning_rate": 6.05527610912777e-07, + "loss": 0.2341, + "step": 19680 + }, + { + "epoch": 8.198085726175615, + "grad_norm": 0.28515625, + "learning_rate": 6.002061527744573e-07, + "loss": 0.2313, + "step": 19700 + }, + { + "epoch": 8.198085726175615, + "eval_main_loss": 0.23825512826442719, + "eval_main_runtime": 6.325, + "eval_main_samples_per_second": 30.04, + "eval_main_steps_per_second": 3.794, + "step": 19700 + }, + { + "epoch": 8.198085726175615, + "eval_anatomy_loss": 2.831984519958496, + "eval_anatomy_runtime": 0.2658, + "eval_anatomy_samples_per_second": 7.523, + "eval_anatomy_steps_per_second": 3.762, + "step": 19700 + }, + { + "epoch": 8.198085726175615, + "eval_college_mathematics_loss": 2.0513973236083984, + "eval_college_mathematics_runtime": 0.2657, + "eval_college_mathematics_samples_per_second": 7.528, + "eval_college_mathematics_steps_per_second": 3.764, + "step": 19700 + }, + { + "epoch": 8.198085726175615, + "eval_international_law_loss": 3.0651326179504395, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.506, + "eval_international_law_steps_per_second": 3.753, + "step": 19700 + }, + { + "epoch": 8.206408655846857, + "grad_norm": 0.32421875, + "learning_rate": 5.94904989134052e-07, + "loss": 0.2367, + "step": 19720 + }, + { + "epoch": 8.214731585518102, + "grad_norm": 0.296875, + "learning_rate": 5.896241766211011e-07, + "loss": 0.232, + "step": 19740 + }, + { + "epoch": 8.223054515189347, + "grad_norm": 0.31640625, + "learning_rate": 5.843637716477454e-07, + "loss": 0.2336, + "step": 19760 + }, + { + "epoch": 8.231377444860591, + "grad_norm": 0.318359375, + "learning_rate": 5.791238304081245e-07, + "loss": 0.235, + "step": 19780 + }, + { + "epoch": 8.239700374531836, + "grad_norm": 0.26953125, + "learning_rate": 5.73904408877772e-07, + "loss": 0.2298, + "step": 19800 + }, + { + "epoch": 8.239700374531836, + "eval_main_loss": 0.23827609419822693, + "eval_main_runtime": 6.324, + "eval_main_samples_per_second": 30.044, + "eval_main_steps_per_second": 3.795, + "step": 19800 + }, + { + "epoch": 8.239700374531836, + "eval_anatomy_loss": 2.8319296836853027, + "eval_anatomy_runtime": 0.2663, + "eval_anatomy_samples_per_second": 7.51, + "eval_anatomy_steps_per_second": 3.755, + "step": 19800 + }, + { + "epoch": 8.239700374531836, + "eval_college_mathematics_loss": 2.051198720932007, + "eval_college_mathematics_runtime": 0.2656, + "eval_college_mathematics_samples_per_second": 7.531, + "eval_college_mathematics_steps_per_second": 3.765, + "step": 19800 + }, + { + "epoch": 8.239700374531836, + "eval_international_law_loss": 3.0645523071289062, + "eval_international_law_runtime": 0.2649, + "eval_international_law_samples_per_second": 7.551, + "eval_international_law_steps_per_second": 3.775, + "step": 19800 + }, + { + "epoch": 8.24802330420308, + "grad_norm": 0.29296875, + "learning_rate": 5.687055628130219e-07, + "loss": 0.2318, + "step": 19820 + }, + { + "epoch": 8.256346233874323, + "grad_norm": 0.2373046875, + "learning_rate": 5.6352734775041e-07, + "loss": 0.231, + "step": 19840 + }, + { + "epoch": 8.264669163545568, + "grad_norm": 0.27734375, + "learning_rate": 5.58369819006084e-07, + "loss": 0.2331, + "step": 19860 + }, + { + "epoch": 8.272992093216812, + "grad_norm": 0.33203125, + "learning_rate": 5.532330316752091e-07, + "loss": 0.2343, + "step": 19880 + }, + { + "epoch": 8.281315022888057, + "grad_norm": 0.298828125, + "learning_rate": 5.481170406313799e-07, + "loss": 0.2327, + "step": 19900 + }, + { + "epoch": 8.281315022888057, + "eval_main_loss": 0.23821409046649933, + "eval_main_runtime": 6.3564, + "eval_main_samples_per_second": 29.891, + "eval_main_steps_per_second": 3.776, + "step": 19900 + }, + { + "epoch": 8.281315022888057, + "eval_anatomy_loss": 2.8317759037017822, + "eval_anatomy_runtime": 0.2667, + "eval_anatomy_samples_per_second": 7.5, + "eval_anatomy_steps_per_second": 3.75, + "step": 19900 + }, + { + "epoch": 8.281315022888057, + "eval_college_mathematics_loss": 2.051119804382324, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.497, + "eval_college_mathematics_steps_per_second": 3.748, + "step": 19900 + }, + { + "epoch": 8.281315022888057, + "eval_international_law_loss": 3.0645089149475098, + "eval_international_law_runtime": 0.2665, + "eval_international_law_samples_per_second": 7.506, + "eval_international_law_steps_per_second": 3.753, + "step": 19900 + }, + { + "epoch": 8.289637952559302, + "grad_norm": 0.28125, + "learning_rate": 5.430219005260387e-07, + "loss": 0.2335, + "step": 19920 + }, + { + "epoch": 8.297960882230544, + "grad_norm": 0.201171875, + "learning_rate": 5.379476657878834e-07, + "loss": 0.2349, + "step": 19940 + }, + { + "epoch": 8.306283811901789, + "grad_norm": 0.302734375, + "learning_rate": 5.328943906222955e-07, + "loss": 0.2327, + "step": 19960 + }, + { + "epoch": 8.314606741573034, + "grad_norm": 0.25390625, + "learning_rate": 5.278621290107533e-07, + "loss": 0.2318, + "step": 19980 + }, + { + "epoch": 8.322929671244278, + "grad_norm": 0.26171875, + "learning_rate": 5.228509347102593e-07, + "loss": 0.2314, + "step": 20000 + }, + { + "epoch": 8.322929671244278, + "eval_main_loss": 0.2383708357810974, + "eval_main_runtime": 6.3506, + "eval_main_samples_per_second": 29.918, + "eval_main_steps_per_second": 3.779, + "step": 20000 + }, + { + "epoch": 8.322929671244278, + "eval_anatomy_loss": 2.8334150314331055, + "eval_anatomy_runtime": 0.2681, + "eval_anatomy_samples_per_second": 7.459, + "eval_anatomy_steps_per_second": 3.73, + "step": 20000 + }, + { + "epoch": 8.322929671244278, + "eval_college_mathematics_loss": 2.049489974975586, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.496, + "eval_college_mathematics_steps_per_second": 3.748, + "step": 20000 + }, + { + "epoch": 8.322929671244278, + "eval_international_law_loss": 3.0655791759490967, + "eval_international_law_runtime": 0.2674, + "eval_international_law_samples_per_second": 7.48, + "eval_international_law_steps_per_second": 3.74, + "step": 20000 + }, + { + "epoch": 8.331252600915523, + "grad_norm": 0.30859375, + "learning_rate": 5.178608612527663e-07, + "loss": 0.2332, + "step": 20020 + }, + { + "epoch": 8.339575530586767, + "grad_norm": 0.2431640625, + "learning_rate": 5.12891961944601e-07, + "loss": 0.2299, + "step": 20040 + }, + { + "epoch": 8.34789846025801, + "grad_norm": 0.279296875, + "learning_rate": 5.079442898659017e-07, + "loss": 0.2326, + "step": 20060 + }, + { + "epoch": 8.356221389929255, + "grad_norm": 0.265625, + "learning_rate": 5.030178978700448e-07, + "loss": 0.2295, + "step": 20080 + }, + { + "epoch": 8.3645443196005, + "grad_norm": 0.24609375, + "learning_rate": 4.98112838583083e-07, + "loss": 0.2306, + "step": 20100 + }, + { + "epoch": 8.3645443196005, + "eval_main_loss": 0.23833905160427094, + "eval_main_runtime": 6.343, + "eval_main_samples_per_second": 29.954, + "eval_main_steps_per_second": 3.784, + "step": 20100 + }, + { + "epoch": 8.3645443196005, + "eval_anatomy_loss": 2.832882881164551, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.508, + "eval_anatomy_steps_per_second": 3.754, + "step": 20100 + }, + { + "epoch": 8.3645443196005, + "eval_college_mathematics_loss": 2.0530946254730225, + "eval_college_mathematics_runtime": 0.2677, + "eval_college_mathematics_samples_per_second": 7.472, + "eval_college_mathematics_steps_per_second": 3.736, + "step": 20100 + }, + { + "epoch": 8.3645443196005, + "eval_international_law_loss": 3.066558599472046, + "eval_international_law_runtime": 0.2659, + "eval_international_law_samples_per_second": 7.522, + "eval_international_law_steps_per_second": 3.761, + "step": 20100 + }, + { + "epoch": 8.372867249271744, + "grad_norm": 0.29296875, + "learning_rate": 4.932291644031844e-07, + "loss": 0.232, + "step": 20120 + }, + { + "epoch": 8.381190178942989, + "grad_norm": 0.287109375, + "learning_rate": 4.883669275000699e-07, + "loss": 0.2289, + "step": 20140 + }, + { + "epoch": 8.389513108614231, + "grad_norm": 0.265625, + "learning_rate": 4.835261798144569e-07, + "loss": 0.2359, + "step": 20160 + }, + { + "epoch": 8.397836038285476, + "grad_norm": 0.25390625, + "learning_rate": 4.787069730575067e-07, + "loss": 0.2288, + "step": 20180 + }, + { + "epoch": 8.40615896795672, + "grad_norm": 0.267578125, + "learning_rate": 4.739093587102686e-07, + "loss": 0.2314, + "step": 20200 + }, + { + "epoch": 8.40615896795672, + "eval_main_loss": 0.23831138014793396, + "eval_main_runtime": 6.3296, + "eval_main_samples_per_second": 30.018, + "eval_main_steps_per_second": 3.792, + "step": 20200 + }, + { + "epoch": 8.40615896795672, + "eval_anatomy_loss": 2.833329916000366, + "eval_anatomy_runtime": 0.266, + "eval_anatomy_samples_per_second": 7.519, + "eval_anatomy_steps_per_second": 3.759, + "step": 20200 + }, + { + "epoch": 8.40615896795672, + "eval_college_mathematics_loss": 2.0539867877960205, + "eval_college_mathematics_runtime": 0.2671, + "eval_college_mathematics_samples_per_second": 7.488, + "eval_college_mathematics_steps_per_second": 3.744, + "step": 20200 + }, + { + "epoch": 8.40615896795672, + "eval_international_law_loss": 3.067291498184204, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.515, + "eval_international_law_steps_per_second": 3.757, + "step": 20200 + }, + { + "epoch": 8.414481897627965, + "grad_norm": 0.232421875, + "learning_rate": 4.6913338802313165e-07, + "loss": 0.227, + "step": 20220 + }, + { + "epoch": 8.42280482729921, + "grad_norm": 0.318359375, + "learning_rate": 4.6437911201527686e-07, + "loss": 0.2312, + "step": 20240 + }, + { + "epoch": 8.431127756970454, + "grad_norm": 0.330078125, + "learning_rate": 4.596465814741341e-07, + "loss": 0.2295, + "step": 20260 + }, + { + "epoch": 8.439450686641697, + "grad_norm": 0.236328125, + "learning_rate": 4.5493584695483593e-07, + "loss": 0.2321, + "step": 20280 + }, + { + "epoch": 8.447773616312942, + "grad_norm": 0.306640625, + "learning_rate": 4.502469587796807e-07, + "loss": 0.2317, + "step": 20300 + }, + { + "epoch": 8.447773616312942, + "eval_main_loss": 0.23827072978019714, + "eval_main_runtime": 6.3259, + "eval_main_samples_per_second": 30.035, + "eval_main_steps_per_second": 3.794, + "step": 20300 + }, + { + "epoch": 8.447773616312942, + "eval_anatomy_loss": 2.834202766418457, + "eval_anatomy_runtime": 0.2657, + "eval_anatomy_samples_per_second": 7.527, + "eval_anatomy_steps_per_second": 3.764, + "step": 20300 + }, + { + "epoch": 8.447773616312942, + "eval_college_mathematics_loss": 2.0520122051239014, + "eval_college_mathematics_runtime": 0.2664, + "eval_college_mathematics_samples_per_second": 7.506, + "eval_college_mathematics_steps_per_second": 3.753, + "step": 20300 + }, + { + "epoch": 8.447773616312942, + "eval_international_law_loss": 3.065819025039673, + "eval_international_law_runtime": 0.266, + "eval_international_law_samples_per_second": 7.517, + "eval_international_law_steps_per_second": 3.759, + "step": 20300 + }, + { + "epoch": 8.456096545984186, + "grad_norm": 0.26171875, + "learning_rate": 4.4557996703759295e-07, + "loss": 0.2337, + "step": 20320 + }, + { + "epoch": 8.464419475655431, + "grad_norm": 0.267578125, + "learning_rate": 4.409349215835887e-07, + "loss": 0.2304, + "step": 20340 + }, + { + "epoch": 8.472742405326676, + "grad_norm": 0.27734375, + "learning_rate": 4.363118720382456e-07, + "loss": 0.2306, + "step": 20360 + }, + { + "epoch": 8.481065334997918, + "grad_norm": 0.263671875, + "learning_rate": 4.317108677871687e-07, + "loss": 0.2262, + "step": 20380 + }, + { + "epoch": 8.489388264669163, + "grad_norm": 0.275390625, + "learning_rate": 4.271319579804639e-07, + "loss": 0.2297, + "step": 20400 + }, + { + "epoch": 8.489388264669163, + "eval_main_loss": 0.23833975195884705, + "eval_main_runtime": 6.3573, + "eval_main_samples_per_second": 29.887, + "eval_main_steps_per_second": 3.775, + "step": 20400 + }, + { + "epoch": 8.489388264669163, + "eval_anatomy_loss": 2.8326468467712402, + "eval_anatomy_runtime": 0.2676, + "eval_anatomy_samples_per_second": 7.475, + "eval_anatomy_steps_per_second": 3.738, + "step": 20400 + }, + { + "epoch": 8.489388264669163, + "eval_college_mathematics_loss": 2.0523431301116943, + "eval_college_mathematics_runtime": 0.2667, + "eval_college_mathematics_samples_per_second": 7.5, + "eval_college_mathematics_steps_per_second": 3.75, + "step": 20400 + }, + { + "epoch": 8.489388264669163, + "eval_international_law_loss": 3.0675792694091797, + "eval_international_law_runtime": 0.2662, + "eval_international_law_samples_per_second": 7.512, + "eval_international_law_steps_per_second": 3.756, + "step": 20400 + }, + { + "epoch": 8.497711194340408, + "grad_norm": 0.26171875, + "learning_rate": 4.2257519153221736e-07, + "loss": 0.2366, + "step": 20420 + }, + { + "epoch": 8.506034124011652, + "grad_norm": 0.349609375, + "learning_rate": 4.180406171199644e-07, + "loss": 0.2301, + "step": 20440 + }, + { + "epoch": 8.514357053682897, + "grad_norm": 0.2734375, + "learning_rate": 4.1352828318417915e-07, + "loss": 0.2337, + "step": 20460 + }, + { + "epoch": 8.522679983354141, + "grad_norm": 0.265625, + "learning_rate": 4.090382379277499e-07, + "loss": 0.2325, + "step": 20480 + }, + { + "epoch": 8.531002913025384, + "grad_norm": 0.255859375, + "learning_rate": 4.045705293154664e-07, + "loss": 0.2335, + "step": 20500 + }, + { + "epoch": 8.531002913025384, + "eval_main_loss": 0.2383665144443512, + "eval_main_runtime": 6.3513, + "eval_main_samples_per_second": 29.915, + "eval_main_steps_per_second": 3.779, + "step": 20500 + }, + { + "epoch": 8.531002913025384, + "eval_anatomy_loss": 2.8339719772338867, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.507, + "eval_anatomy_steps_per_second": 3.753, + "step": 20500 + }, + { + "epoch": 8.531002913025384, + "eval_college_mathematics_loss": 2.0505967140197754, + "eval_college_mathematics_runtime": 0.2672, + "eval_college_mathematics_samples_per_second": 7.486, + "eval_college_mathematics_steps_per_second": 3.743, + "step": 20500 + }, + { + "epoch": 8.531002913025384, + "eval_international_law_loss": 3.065483808517456, + "eval_international_law_runtime": 0.2682, + "eval_international_law_samples_per_second": 7.457, + "eval_international_law_steps_per_second": 3.729, + "step": 20500 + }, + { + "epoch": 8.539325842696629, + "grad_norm": 0.251953125, + "learning_rate": 4.001252050735102e-07, + "loss": 0.2313, + "step": 20520 + }, + { + "epoch": 8.547648772367873, + "grad_norm": 0.2109375, + "learning_rate": 3.9570231268893975e-07, + "loss": 0.2308, + "step": 20540 + }, + { + "epoch": 8.555971702039118, + "grad_norm": 0.314453125, + "learning_rate": 3.9130189940918745e-07, + "loss": 0.2313, + "step": 20560 + }, + { + "epoch": 8.564294631710363, + "grad_norm": 0.310546875, + "learning_rate": 3.869240122415521e-07, + "loss": 0.233, + "step": 20580 + }, + { + "epoch": 8.572617561381607, + "grad_norm": 0.2265625, + "learning_rate": 3.8256869795269824e-07, + "loss": 0.2285, + "step": 20600 + }, + { + "epoch": 8.572617561381607, + "eval_main_loss": 0.2381974309682846, + "eval_main_runtime": 6.3331, + "eval_main_samples_per_second": 30.001, + "eval_main_steps_per_second": 3.79, + "step": 20600 + }, + { + "epoch": 8.572617561381607, + "eval_anatomy_loss": 2.835426092147827, + "eval_anatomy_runtime": 0.2654, + "eval_anatomy_samples_per_second": 7.537, + "eval_anatomy_steps_per_second": 3.769, + "step": 20600 + }, + { + "epoch": 8.572617561381607, + "eval_college_mathematics_loss": 2.0525310039520264, + "eval_college_mathematics_runtime": 0.2646, + "eval_college_mathematics_samples_per_second": 7.559, + "eval_college_mathematics_steps_per_second": 3.78, + "step": 20600 + }, + { + "epoch": 8.572617561381607, + "eval_international_law_loss": 3.064969539642334, + "eval_international_law_runtime": 0.265, + "eval_international_law_samples_per_second": 7.547, + "eval_international_law_steps_per_second": 3.774, + "step": 20600 + }, + { + "epoch": 8.58094049105285, + "grad_norm": 0.2578125, + "learning_rate": 3.782360030681578e-07, + "loss": 0.2336, + "step": 20620 + }, + { + "epoch": 8.589263420724095, + "grad_norm": 0.283203125, + "learning_rate": 3.73925973871829e-07, + "loss": 0.2336, + "step": 20640 + }, + { + "epoch": 8.59758635039534, + "grad_norm": 0.314453125, + "learning_rate": 3.696386564054863e-07, + "loss": 0.2337, + "step": 20660 + }, + { + "epoch": 8.605909280066584, + "grad_norm": 0.2412109375, + "learning_rate": 3.6537409646828505e-07, + "loss": 0.2329, + "step": 20680 + }, + { + "epoch": 8.614232209737828, + "grad_norm": 0.248046875, + "learning_rate": 3.611323396162758e-07, + "loss": 0.2354, + "step": 20700 + }, + { + "epoch": 8.614232209737828, + "eval_main_loss": 0.23833750188350677, + "eval_main_runtime": 6.3553, + "eval_main_samples_per_second": 29.896, + "eval_main_steps_per_second": 3.776, + "step": 20700 + }, + { + "epoch": 8.614232209737828, + "eval_anatomy_loss": 2.8340907096862793, + "eval_anatomy_runtime": 0.2683, + "eval_anatomy_samples_per_second": 7.455, + "eval_anatomy_steps_per_second": 3.728, + "step": 20700 + }, + { + "epoch": 8.614232209737828, + "eval_college_mathematics_loss": 2.0527515411376953, + "eval_college_mathematics_runtime": 0.2665, + "eval_college_mathematics_samples_per_second": 7.504, + "eval_college_mathematics_steps_per_second": 3.752, + "step": 20700 + }, + { + "epoch": 8.614232209737828, + "eval_international_law_loss": 3.065247058868408, + "eval_international_law_runtime": 0.2686, + "eval_international_law_samples_per_second": 7.447, + "eval_international_law_steps_per_second": 3.724, + "step": 20700 + }, + { + "epoch": 8.622555139409073, + "grad_norm": 0.27734375, + "learning_rate": 3.569134311619146e-07, + "loss": 0.2259, + "step": 20720 + }, + { + "epoch": 8.630878069080316, + "grad_norm": 0.26953125, + "learning_rate": 3.527174161735797e-07, + "loss": 0.2324, + "step": 20740 + }, + { + "epoch": 8.63920099875156, + "grad_norm": 0.28125, + "learning_rate": 3.4854433947509256e-07, + "loss": 0.2314, + "step": 20760 + }, + { + "epoch": 8.647523928422805, + "grad_norm": 0.28125, + "learning_rate": 3.4439424564523346e-07, + "loss": 0.2357, + "step": 20780 + }, + { + "epoch": 8.65584685809405, + "grad_norm": 0.234375, + "learning_rate": 3.402671790172718e-07, + "loss": 0.2318, + "step": 20800 + }, + { + "epoch": 8.65584685809405, + "eval_main_loss": 0.23831304907798767, + "eval_main_runtime": 6.3537, + "eval_main_samples_per_second": 29.904, + "eval_main_steps_per_second": 3.777, + "step": 20800 + }, + { + "epoch": 8.65584685809405, + "eval_anatomy_loss": 2.8343677520751953, + "eval_anatomy_runtime": 0.2677, + "eval_anatomy_samples_per_second": 7.472, + "eval_anatomy_steps_per_second": 3.736, + "step": 20800 + }, + { + "epoch": 8.65584685809405, + "eval_college_mathematics_loss": 2.0545074939727783, + "eval_college_mathematics_runtime": 0.2666, + "eval_college_mathematics_samples_per_second": 7.501, + "eval_college_mathematics_steps_per_second": 3.751, + "step": 20800 + }, + { + "epoch": 8.65584685809405, + "eval_international_law_loss": 3.0653839111328125, + "eval_international_law_runtime": 0.2681, + "eval_international_law_samples_per_second": 7.461, + "eval_international_law_steps_per_second": 3.731, + "step": 20800 + }, + { + "epoch": 8.664169787765294, + "grad_norm": 0.333984375, + "learning_rate": 3.361631836784898e-07, + "loss": 0.2319, + "step": 20820 + }, + { + "epoch": 8.672492717436537, + "grad_norm": 0.291015625, + "learning_rate": 3.320823034697074e-07, + "loss": 0.2332, + "step": 20840 + }, + { + "epoch": 8.680815647107782, + "grad_norm": 0.28515625, + "learning_rate": 3.280245819848224e-07, + "loss": 0.2346, + "step": 20860 + }, + { + "epoch": 8.689138576779026, + "grad_norm": 0.2333984375, + "learning_rate": 3.239900625703374e-07, + "loss": 0.2327, + "step": 20880 + }, + { + "epoch": 8.69746150645027, + "grad_norm": 0.291015625, + "learning_rate": 3.199787883248992e-07, + "loss": 0.2333, + "step": 20900 + }, + { + "epoch": 8.69746150645027, + "eval_main_loss": 0.23826073110103607, + "eval_main_runtime": 6.3163, + "eval_main_samples_per_second": 30.081, + "eval_main_steps_per_second": 3.8, + "step": 20900 + }, + { + "epoch": 8.69746150645027, + "eval_anatomy_loss": 2.8350343704223633, + "eval_anatomy_runtime": 0.2662, + "eval_anatomy_samples_per_second": 7.514, + "eval_anatomy_steps_per_second": 3.757, + "step": 20900 + }, + { + "epoch": 8.69746150645027, + "eval_college_mathematics_loss": 2.052778959274292, + "eval_college_mathematics_runtime": 0.2653, + "eval_college_mathematics_samples_per_second": 7.538, + "eval_college_mathematics_steps_per_second": 3.769, + "step": 20900 + }, + { + "epoch": 8.69746150645027, + "eval_international_law_loss": 3.064143180847168, + "eval_international_law_runtime": 0.2658, + "eval_international_law_samples_per_second": 7.523, + "eval_international_law_steps_per_second": 3.762, + "step": 20900 + }, + { + "epoch": 8.705784436121515, + "grad_norm": 0.24609375, + "learning_rate": 3.15990802098842e-07, + "loss": 0.2307, + "step": 20920 + }, + { + "epoch": 8.71410736579276, + "grad_norm": 0.2431640625, + "learning_rate": 3.12026146493721e-07, + "loss": 0.2292, + "step": 20940 + }, + { + "epoch": 8.722430295464003, + "grad_norm": 0.30078125, + "learning_rate": 3.0808486386186804e-07, + "loss": 0.2302, + "step": 20960 + }, + { + "epoch": 8.730753225135247, + "grad_norm": 0.271484375, + "learning_rate": 3.041669963059304e-07, + "loss": 0.2284, + "step": 20980 + }, + { + "epoch": 8.739076154806492, + "grad_norm": 0.31640625, + "learning_rate": 3.0027258567842525e-07, + "loss": 0.2315, + "step": 21000 + }, + { + "epoch": 8.739076154806492, + "eval_main_loss": 0.23824970424175262, + "eval_main_runtime": 6.3549, + "eval_main_samples_per_second": 29.898, + "eval_main_steps_per_second": 3.777, + "step": 21000 + }, + { + "epoch": 8.739076154806492, + "eval_anatomy_loss": 2.8310306072235107, + "eval_anatomy_runtime": 0.2668, + "eval_anatomy_samples_per_second": 7.497, + "eval_anatomy_steps_per_second": 3.749, + "step": 21000 + }, + { + "epoch": 8.739076154806492, + "eval_college_mathematics_loss": 2.0544137954711914, + "eval_college_mathematics_runtime": 0.2671, + "eval_college_mathematics_samples_per_second": 7.488, + "eval_college_mathematics_steps_per_second": 3.744, + "step": 21000 + }, + { + "epoch": 8.739076154806492, + "eval_international_law_loss": 3.0671803951263428, + "eval_international_law_runtime": 0.2679, + "eval_international_law_samples_per_second": 7.465, + "eval_international_law_steps_per_second": 3.733, + "step": 21000 + }, + { + "epoch": 8.747399084477737, + "grad_norm": 0.28515625, + "learning_rate": 2.9640167358129273e-07, + "loss": 0.2344, + "step": 21020 + }, + { + "epoch": 8.755722014148981, + "grad_norm": 0.30859375, + "learning_rate": 2.9255430136544886e-07, + "loss": 0.231, + "step": 21040 + }, + { + "epoch": 8.764044943820224, + "grad_norm": 0.3203125, + "learning_rate": 2.8873051013034695e-07, + "loss": 0.2341, + "step": 21060 + }, + { + "epoch": 8.772367873491469, + "grad_norm": 0.25, + "learning_rate": 2.84930340723536e-07, + "loss": 0.229, + "step": 21080 + }, + { + "epoch": 8.780690803162713, + "grad_norm": 0.275390625, + "learning_rate": 2.811538337402264e-07, + "loss": 0.2285, + "step": 21100 + }, + { + "epoch": 8.780690803162713, + "eval_main_loss": 0.23829315602779388, + "eval_main_runtime": 6.3571, + "eval_main_samples_per_second": 29.888, + "eval_main_steps_per_second": 3.775, + "step": 21100 + }, + { + "epoch": 8.780690803162713, + "eval_anatomy_loss": 2.8332481384277344, + "eval_anatomy_runtime": 0.2673, + "eval_anatomy_samples_per_second": 7.483, + "eval_anatomy_steps_per_second": 3.742, + "step": 21100 + }, + { + "epoch": 8.780690803162713, + "eval_college_mathematics_loss": 2.0525107383728027, + "eval_college_mathematics_runtime": 0.2662, + "eval_college_mathematics_samples_per_second": 7.512, + "eval_college_mathematics_steps_per_second": 3.756, + "step": 21100 + }, + { + "epoch": 8.780690803162713, + "eval_international_law_loss": 3.0667715072631836, + "eval_international_law_runtime": 0.2664, + "eval_international_law_samples_per_second": 7.507, + "eval_international_law_steps_per_second": 3.754, + "step": 21100 + }, + { + "epoch": 8.789013732833958, + "grad_norm": 0.306640625, + "learning_rate": 2.774010295228549e-07, + "loss": 0.2297, + "step": 21120 + }, + { + "epoch": 8.797336662505202, + "grad_norm": 0.283203125, + "learning_rate": 2.7367196816065397e-07, + "loss": 0.2364, + "step": 21140 + }, + { + "epoch": 8.805659592176447, + "grad_norm": 0.283203125, + "learning_rate": 2.699666894892236e-07, + "loss": 0.2285, + "step": 21160 + }, + { + "epoch": 8.81398252184769, + "grad_norm": 0.291015625, + "learning_rate": 2.662852330901053e-07, + "loss": 0.24, + "step": 21180 + }, + { + "epoch": 8.822305451518934, + "grad_norm": 0.27734375, + "learning_rate": 2.6262763829036197e-07, + "loss": 0.2316, + "step": 21200 + }, + { + "epoch": 8.822305451518934, + "eval_main_loss": 0.23829114437103271, + "eval_main_runtime": 6.3393, + "eval_main_samples_per_second": 29.972, + "eval_main_steps_per_second": 3.786, + "step": 21200 + }, + { + "epoch": 8.822305451518934, + "eval_anatomy_loss": 2.832653045654297, + "eval_anatomy_runtime": 0.2661, + "eval_anatomy_samples_per_second": 7.515, + "eval_anatomy_steps_per_second": 3.758, + "step": 21200 + }, + { + "epoch": 8.822305451518934, + "eval_college_mathematics_loss": 2.0520215034484863, + "eval_college_mathematics_runtime": 0.2666, + "eval_college_mathematics_samples_per_second": 7.502, + "eval_college_mathematics_steps_per_second": 3.751, + "step": 21200 + }, + { + "epoch": 8.822305451518934, + "eval_international_law_loss": 3.0661661624908447, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.515, + "eval_international_law_steps_per_second": 3.757, + "step": 21200 + }, + { + "epoch": 8.830628381190179, + "grad_norm": 0.2734375, + "learning_rate": 2.5899394416215305e-07, + "loss": 0.2294, + "step": 21220 + }, + { + "epoch": 8.838951310861423, + "grad_norm": 0.2421875, + "learning_rate": 2.5538418952232054e-07, + "loss": 0.2324, + "step": 21240 + }, + { + "epoch": 8.847274240532668, + "grad_norm": 0.271484375, + "learning_rate": 2.5179841293197476e-07, + "loss": 0.2331, + "step": 21260 + }, + { + "epoch": 8.855597170203911, + "grad_norm": 0.25, + "learning_rate": 2.482366526960786e-07, + "loss": 0.2328, + "step": 21280 + }, + { + "epoch": 8.863920099875156, + "grad_norm": 0.28515625, + "learning_rate": 2.446989468630434e-07, + "loss": 0.2363, + "step": 21300 + }, + { + "epoch": 8.863920099875156, + "eval_main_loss": 0.23828859627246857, + "eval_main_runtime": 6.3239, + "eval_main_samples_per_second": 30.045, + "eval_main_steps_per_second": 3.795, + "step": 21300 + }, + { + "epoch": 8.863920099875156, + "eval_anatomy_loss": 2.8329224586486816, + "eval_anatomy_runtime": 0.267, + "eval_anatomy_samples_per_second": 7.491, + "eval_anatomy_steps_per_second": 3.746, + "step": 21300 + }, + { + "epoch": 8.863920099875156, + "eval_college_mathematics_loss": 2.0518035888671875, + "eval_college_mathematics_runtime": 0.2657, + "eval_college_mathematics_samples_per_second": 7.527, + "eval_college_mathematics_steps_per_second": 3.764, + "step": 21300 + }, + { + "epoch": 8.863920099875156, + "eval_international_law_loss": 3.0643723011016846, + "eval_international_law_runtime": 0.2657, + "eval_international_law_samples_per_second": 7.528, + "eval_international_law_steps_per_second": 3.764, + "step": 21300 + }, + { + "epoch": 8.8722430295464, + "grad_norm": 0.27734375, + "learning_rate": 2.411853332243183e-07, + "loss": 0.2284, + "step": 21320 + }, + { + "epoch": 8.880565959217645, + "grad_norm": 0.28515625, + "learning_rate": 2.376958493139886e-07, + "loss": 0.2363, + "step": 21340 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 0.234375, + "learning_rate": 2.3423053240837518e-07, + "loss": 0.2325, + "step": 21360 + }, + { + "epoch": 8.897211818560134, + "grad_norm": 0.21875, + "learning_rate": 2.3078941952563466e-07, + "loss": 0.2328, + "step": 21380 + }, + { + "epoch": 8.905534748231377, + "grad_norm": 0.29296875, + "learning_rate": 2.2737254742536547e-07, + "loss": 0.23, + "step": 21400 + }, + { + "epoch": 8.905534748231377, + "eval_main_loss": 0.23842591047286987, + "eval_main_runtime": 6.3216, + "eval_main_samples_per_second": 30.056, + "eval_main_steps_per_second": 3.796, + "step": 21400 + }, + { + "epoch": 8.905534748231377, + "eval_anatomy_loss": 2.8358285427093506, + "eval_anatomy_runtime": 0.2663, + "eval_anatomy_samples_per_second": 7.509, + "eval_anatomy_steps_per_second": 3.755, + "step": 21400 + }, + { + "epoch": 8.905534748231377, + "eval_college_mathematics_loss": 2.0552380084991455, + "eval_college_mathematics_runtime": 0.2662, + "eval_college_mathematics_samples_per_second": 7.514, + "eval_college_mathematics_steps_per_second": 3.757, + "step": 21400 + }, + { + "epoch": 8.905534748231377, + "eval_international_law_loss": 3.0662267208099365, + "eval_international_law_runtime": 0.2673, + "eval_international_law_samples_per_second": 7.482, + "eval_international_law_steps_per_second": 3.741, + "step": 21400 + }, + { + "epoch": 8.913857677902621, + "grad_norm": 0.228515625, + "learning_rate": 2.2397995260821342e-07, + "loss": 0.2304, + "step": 21420 + }, + { + "epoch": 8.922180607573866, + "grad_norm": 0.326171875, + "learning_rate": 2.206116713154838e-07, + "loss": 0.2309, + "step": 21440 + }, + { + "epoch": 8.93050353724511, + "grad_norm": 0.263671875, + "learning_rate": 2.172677395287537e-07, + "loss": 0.2324, + "step": 21460 + }, + { + "epoch": 8.938826466916355, + "grad_norm": 0.3046875, + "learning_rate": 2.1394819296948616e-07, + "loss": 0.2306, + "step": 21480 + }, + { + "epoch": 8.947149396587598, + "grad_norm": 0.263671875, + "learning_rate": 2.106530670986498e-07, + "loss": 0.2323, + "step": 21500 + }, + { + "epoch": 8.947149396587598, + "eval_main_loss": 0.23836477100849152, + "eval_main_runtime": 6.3511, + "eval_main_samples_per_second": 29.916, + "eval_main_steps_per_second": 3.779, + "step": 21500 + }, + { + "epoch": 8.947149396587598, + "eval_anatomy_loss": 2.8358511924743652, + "eval_anatomy_runtime": 0.2678, + "eval_anatomy_samples_per_second": 7.468, + "eval_anatomy_steps_per_second": 3.734, + "step": 21500 + }, + { + "epoch": 8.947149396587598, + "eval_college_mathematics_loss": 2.0521907806396484, + "eval_college_mathematics_runtime": 0.2659, + "eval_college_mathematics_samples_per_second": 7.521, + "eval_college_mathematics_steps_per_second": 3.761, + "step": 21500 + }, + { + "epoch": 8.947149396587598, + "eval_international_law_loss": 3.0672295093536377, + "eval_international_law_runtime": 0.2671, + "eval_international_law_samples_per_second": 7.486, + "eval_international_law_steps_per_second": 3.743, + "step": 21500 + }, + { + "epoch": 8.955472326258842, + "grad_norm": 0.28515625, + "learning_rate": 2.0738239711634133e-07, + "loss": 0.2317, + "step": 21520 + }, + { + "epoch": 8.963795255930087, + "grad_norm": 0.2490234375, + "learning_rate": 2.0413621796140647e-07, + "loss": 0.2289, + "step": 21540 + }, + { + "epoch": 8.972118185601332, + "grad_norm": 0.28515625, + "learning_rate": 2.0091456431106854e-07, + "loss": 0.2357, + "step": 21560 + }, + { + "epoch": 8.980441115272576, + "grad_norm": 0.330078125, + "learning_rate": 1.977174705805582e-07, + "loss": 0.2299, + "step": 21580 + }, + { + "epoch": 8.98876404494382, + "grad_norm": 0.2353515625, + "learning_rate": 1.9454497092274565e-07, + "loss": 0.2327, + "step": 21600 + }, + { + "epoch": 8.98876404494382, + "eval_main_loss": 0.23839198052883148, + "eval_main_runtime": 6.3485, + "eval_main_samples_per_second": 29.928, + "eval_main_steps_per_second": 3.78, + "step": 21600 + }, + { + "epoch": 8.98876404494382, + "eval_anatomy_loss": 2.8339269161224365, + "eval_anatomy_runtime": 0.2668, + "eval_anatomy_samples_per_second": 7.495, + "eval_anatomy_steps_per_second": 3.747, + "step": 21600 + }, + { + "epoch": 8.98876404494382, + "eval_college_mathematics_loss": 2.0503244400024414, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.511, + "eval_college_mathematics_steps_per_second": 3.756, + "step": 21600 + }, + { + "epoch": 8.98876404494382, + "eval_international_law_loss": 3.066997766494751, + "eval_international_law_runtime": 0.2675, + "eval_international_law_samples_per_second": 7.477, + "eval_international_law_steps_per_second": 3.739, + "step": 21600 + }, + { + "epoch": 8.997086974615064, + "grad_norm": 0.3125, + "learning_rate": 1.9139709922777528e-07, + "loss": 0.234, + "step": 21620 + }, + { + "epoch": 9.005409904286308, + "grad_norm": 0.294921875, + "learning_rate": 1.8827388912270318e-07, + "loss": 0.2279, + "step": 21640 + }, + { + "epoch": 9.013732833957553, + "grad_norm": 0.3359375, + "learning_rate": 1.8517537397114066e-07, + "loss": 0.2288, + "step": 21660 + }, + { + "epoch": 9.022055763628797, + "grad_norm": 0.275390625, + "learning_rate": 1.8210158687289258e-07, + "loss": 0.2344, + "step": 21680 + }, + { + "epoch": 9.030378693300042, + "grad_norm": 0.267578125, + "learning_rate": 1.7905256066361037e-07, + "loss": 0.2313, + "step": 21700 + }, + { + "epoch": 9.030378693300042, + "eval_main_loss": 0.23835183680057526, + "eval_main_runtime": 6.3323, + "eval_main_samples_per_second": 30.005, + "eval_main_steps_per_second": 3.79, + "step": 21700 + }, + { + "epoch": 9.030378693300042, + "eval_anatomy_loss": 2.832937479019165, + "eval_anatomy_runtime": 0.2671, + "eval_anatomy_samples_per_second": 7.487, + "eval_anatomy_steps_per_second": 3.743, + "step": 21700 + }, + { + "epoch": 9.030378693300042, + "eval_college_mathematics_loss": 2.048555612564087, + "eval_college_mathematics_runtime": 0.2651, + "eval_college_mathematics_samples_per_second": 7.545, + "eval_college_mathematics_steps_per_second": 3.773, + "step": 21700 + }, + { + "epoch": 9.030378693300042, + "eval_international_law_loss": 3.0660414695739746, + "eval_international_law_runtime": 0.266, + "eval_international_law_samples_per_second": 7.519, + "eval_international_law_steps_per_second": 3.759, + "step": 21700 + }, + { + "epoch": 9.038701622971287, + "grad_norm": 0.255859375, + "learning_rate": 1.7602832791443648e-07, + "loss": 0.2293, + "step": 21720 + }, + { + "epoch": 9.04702455264253, + "grad_norm": 0.275390625, + "learning_rate": 1.7302892093165684e-07, + "loss": 0.2289, + "step": 21740 + }, + { + "epoch": 9.055347482313774, + "grad_norm": 0.328125, + "learning_rate": 1.700543717563591e-07, + "loss": 0.2313, + "step": 21760 + }, + { + "epoch": 9.063670411985019, + "grad_norm": 0.2890625, + "learning_rate": 1.6710471216408563e-07, + "loss": 0.2315, + "step": 21780 + }, + { + "epoch": 9.071993341656263, + "grad_norm": 0.2578125, + "learning_rate": 1.641799736644986e-07, + "loss": 0.2333, + "step": 21800 + }, + { + "epoch": 9.071993341656263, + "eval_main_loss": 0.2383333146572113, + "eval_main_runtime": 6.3262, + "eval_main_samples_per_second": 30.034, + "eval_main_steps_per_second": 3.794, + "step": 21800 + }, + { + "epoch": 9.071993341656263, + "eval_anatomy_loss": 2.831967353820801, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.504, + "eval_anatomy_steps_per_second": 3.752, + "step": 21800 + }, + { + "epoch": 9.071993341656263, + "eval_college_mathematics_loss": 2.050198554992676, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.51, + "eval_college_mathematics_steps_per_second": 3.755, + "step": 21800 + }, + { + "epoch": 9.071993341656263, + "eval_international_law_loss": 3.0654492378234863, + "eval_international_law_runtime": 0.2666, + "eval_international_law_samples_per_second": 7.503, + "eval_international_law_steps_per_second": 3.751, + "step": 21800 + }, + { + "epoch": 9.080316271327508, + "grad_norm": 0.333984375, + "learning_rate": 1.6128018750103975e-07, + "loss": 0.2335, + "step": 21820 + }, + { + "epoch": 9.088639200998752, + "grad_norm": 0.24609375, + "learning_rate": 1.5840538465059813e-07, + "loss": 0.2288, + "step": 21840 + }, + { + "epoch": 9.096962130669995, + "grad_norm": 0.26953125, + "learning_rate": 1.555555958231808e-07, + "loss": 0.2326, + "step": 21860 + }, + { + "epoch": 9.10528506034124, + "grad_norm": 0.263671875, + "learning_rate": 1.527308514615819e-07, + "loss": 0.2301, + "step": 21880 + }, + { + "epoch": 9.113607990012484, + "grad_norm": 0.31640625, + "learning_rate": 1.4993118174105842e-07, + "loss": 0.2275, + "step": 21900 + }, + { + "epoch": 9.113607990012484, + "eval_main_loss": 0.23825402557849884, + "eval_main_runtime": 6.3324, + "eval_main_samples_per_second": 30.005, + "eval_main_steps_per_second": 3.79, + "step": 21900 + }, + { + "epoch": 9.113607990012484, + "eval_anatomy_loss": 2.833402395248413, + "eval_anatomy_runtime": 0.2656, + "eval_anatomy_samples_per_second": 7.529, + "eval_anatomy_steps_per_second": 3.765, + "step": 21900 + }, + { + "epoch": 9.113607990012484, + "eval_college_mathematics_loss": 2.0520825386047363, + "eval_college_mathematics_runtime": 0.2664, + "eval_college_mathematics_samples_per_second": 7.508, + "eval_college_mathematics_steps_per_second": 3.754, + "step": 21900 + }, + { + "epoch": 9.113607990012484, + "eval_international_law_loss": 3.0643093585968018, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.515, + "eval_international_law_steps_per_second": 3.758, + "step": 21900 + }, + { + "epoch": 9.121930919683729, + "grad_norm": 0.275390625, + "learning_rate": 1.471566165690086e-07, + "loss": 0.2317, + "step": 21920 + }, + { + "epoch": 9.130253849354974, + "grad_norm": 0.25390625, + "learning_rate": 1.4440718558465294e-07, + "loss": 0.232, + "step": 21940 + }, + { + "epoch": 9.138576779026216, + "grad_norm": 0.27734375, + "learning_rate": 1.416829181587151e-07, + "loss": 0.2346, + "step": 21960 + }, + { + "epoch": 9.146899708697461, + "grad_norm": 0.212890625, + "learning_rate": 1.3898384339311038e-07, + "loss": 0.2318, + "step": 21980 + }, + { + "epoch": 9.155222638368706, + "grad_norm": 0.28125, + "learning_rate": 1.3630999012063467e-07, + "loss": 0.2328, + "step": 22000 + }, + { + "epoch": 9.155222638368706, + "eval_main_loss": 0.23840220272541046, + "eval_main_runtime": 6.333, + "eval_main_samples_per_second": 30.002, + "eval_main_steps_per_second": 3.79, + "step": 22000 + }, + { + "epoch": 9.155222638368706, + "eval_anatomy_loss": 2.8348910808563232, + "eval_anatomy_runtime": 0.2663, + "eval_anatomy_samples_per_second": 7.511, + "eval_anatomy_steps_per_second": 3.755, + "step": 22000 + }, + { + "epoch": 9.155222638368706, + "eval_college_mathematics_loss": 2.0505683422088623, + "eval_college_mathematics_runtime": 0.2666, + "eval_college_mathematics_samples_per_second": 7.501, + "eval_college_mathematics_steps_per_second": 3.75, + "step": 22000 + }, + { + "epoch": 9.155222638368706, + "eval_international_law_loss": 3.065016508102417, + "eval_international_law_runtime": 0.2668, + "eval_international_law_samples_per_second": 7.497, + "eval_international_law_steps_per_second": 3.748, + "step": 22000 + }, + { + "epoch": 9.16354556803995, + "grad_norm": 0.2734375, + "learning_rate": 1.3366138690465437e-07, + "loss": 0.2313, + "step": 22020 + }, + { + "epoch": 9.171868497711195, + "grad_norm": 0.294921875, + "learning_rate": 1.3103806203880493e-07, + "loss": 0.2352, + "step": 22040 + }, + { + "epoch": 9.18019142738244, + "grad_norm": 0.337890625, + "learning_rate": 1.2844004354668483e-07, + "loss": 0.2299, + "step": 22060 + }, + { + "epoch": 9.188514357053682, + "grad_norm": 0.3046875, + "learning_rate": 1.258673591815579e-07, + "loss": 0.2313, + "step": 22080 + }, + { + "epoch": 9.196837286724927, + "grad_norm": 0.267578125, + "learning_rate": 1.2332003642605868e-07, + "loss": 0.2304, + "step": 22100 + }, + { + "epoch": 9.196837286724927, + "eval_main_loss": 0.2382725030183792, + "eval_main_runtime": 6.3586, + "eval_main_samples_per_second": 29.881, + "eval_main_steps_per_second": 3.774, + "step": 22100 + }, + { + "epoch": 9.196837286724927, + "eval_anatomy_loss": 2.8336524963378906, + "eval_anatomy_runtime": 0.268, + "eval_anatomy_samples_per_second": 7.462, + "eval_anatomy_steps_per_second": 3.731, + "step": 22100 + }, + { + "epoch": 9.196837286724927, + "eval_college_mathematics_loss": 2.0524938106536865, + "eval_college_mathematics_runtime": 0.2678, + "eval_college_mathematics_samples_per_second": 7.468, + "eval_college_mathematics_steps_per_second": 3.734, + "step": 22100 + }, + { + "epoch": 9.196837286724927, + "eval_international_law_loss": 3.0638139247894287, + "eval_international_law_runtime": 0.2668, + "eval_international_law_samples_per_second": 7.495, + "eval_international_law_steps_per_second": 3.748, + "step": 22100 + }, + { + "epoch": 9.205160216396171, + "grad_norm": 0.29296875, + "learning_rate": 1.2079810249189415e-07, + "loss": 0.234, + "step": 22120 + }, + { + "epoch": 9.213483146067416, + "grad_norm": 0.2734375, + "learning_rate": 1.1830158431955841e-07, + "loss": 0.2319, + "step": 22140 + }, + { + "epoch": 9.22180607573866, + "grad_norm": 0.283203125, + "learning_rate": 1.1583050857804145e-07, + "loss": 0.2295, + "step": 22160 + }, + { + "epoch": 9.230129005409903, + "grad_norm": 0.29296875, + "learning_rate": 1.1338490166454386e-07, + "loss": 0.2315, + "step": 22180 + }, + { + "epoch": 9.238451935081148, + "grad_norm": 0.310546875, + "learning_rate": 1.109647897041985e-07, + "loss": 0.2329, + "step": 22200 + }, + { + "epoch": 9.238451935081148, + "eval_main_loss": 0.23832343518733978, + "eval_main_runtime": 6.3644, + "eval_main_samples_per_second": 29.854, + "eval_main_steps_per_second": 3.771, + "step": 22200 + }, + { + "epoch": 9.238451935081148, + "eval_anatomy_loss": 2.8315091133117676, + "eval_anatomy_runtime": 0.2675, + "eval_anatomy_samples_per_second": 7.477, + "eval_anatomy_steps_per_second": 3.739, + "step": 22200 + }, + { + "epoch": 9.238451935081148, + "eval_college_mathematics_loss": 2.051854133605957, + "eval_college_mathematics_runtime": 0.2658, + "eval_college_mathematics_samples_per_second": 7.525, + "eval_college_mathematics_steps_per_second": 3.763, + "step": 22200 + }, + { + "epoch": 9.238451935081148, + "eval_international_law_loss": 3.0672245025634766, + "eval_international_law_runtime": 0.2666, + "eval_international_law_samples_per_second": 7.501, + "eval_international_law_steps_per_second": 3.751, + "step": 22200 + }, + { + "epoch": 9.246774864752393, + "grad_norm": 0.283203125, + "learning_rate": 1.085701985497875e-07, + "loss": 0.2331, + "step": 22220 + }, + { + "epoch": 9.255097794423637, + "grad_norm": 0.2431640625, + "learning_rate": 1.062011537814675e-07, + "loss": 0.2341, + "step": 22240 + }, + { + "epoch": 9.263420724094882, + "grad_norm": 0.298828125, + "learning_rate": 1.0385768070649783e-07, + "loss": 0.2314, + "step": 22260 + }, + { + "epoch": 9.271743653766126, + "grad_norm": 0.31640625, + "learning_rate": 1.015398043589677e-07, + "loss": 0.2301, + "step": 22280 + }, + { + "epoch": 9.28006658343737, + "grad_norm": 0.2578125, + "learning_rate": 9.924754949953069e-08, + "loss": 0.2287, + "step": 22300 + }, + { + "epoch": 9.28006658343737, + "eval_main_loss": 0.23838910460472107, + "eval_main_runtime": 6.3361, + "eval_main_samples_per_second": 29.987, + "eval_main_steps_per_second": 3.788, + "step": 22300 + }, + { + "epoch": 9.28006658343737, + "eval_anatomy_loss": 2.8351857662200928, + "eval_anatomy_runtime": 0.2664, + "eval_anatomy_samples_per_second": 7.508, + "eval_anatomy_steps_per_second": 3.754, + "step": 22300 + }, + { + "epoch": 9.28006658343737, + "eval_college_mathematics_loss": 2.0533783435821533, + "eval_college_mathematics_runtime": 0.2663, + "eval_college_mathematics_samples_per_second": 7.51, + "eval_college_mathematics_steps_per_second": 3.755, + "step": 22300 + }, + { + "epoch": 9.28006658343737, + "eval_international_law_loss": 3.0666663646698, + "eval_international_law_runtime": 0.2658, + "eval_international_law_samples_per_second": 7.525, + "eval_international_law_steps_per_second": 3.762, + "step": 22300 + }, + { + "epoch": 9.288389513108614, + "grad_norm": 0.306640625, + "learning_rate": 9.698094061513868e-08, + "loss": 0.2336, + "step": 22320 + }, + { + "epoch": 9.296712442779858, + "grad_norm": 0.279296875, + "learning_rate": 9.474000191878163e-08, + "loss": 0.2304, + "step": 22340 + }, + { + "epoch": 9.305035372451103, + "grad_norm": 0.296875, + "learning_rate": 9.252475734922883e-08, + "loss": 0.2302, + "step": 22360 + }, + { + "epoch": 9.313358302122348, + "grad_norm": 0.31640625, + "learning_rate": 9.033523057077193e-08, + "loss": 0.2312, + "step": 22380 + }, + { + "epoch": 9.321681231793592, + "grad_norm": 0.30078125, + "learning_rate": 8.817144497297342e-08, + "loss": 0.2355, + "step": 22400 + }, + { + "epoch": 9.321681231793592, + "eval_main_loss": 0.23832879960536957, + "eval_main_runtime": 6.3273, + "eval_main_samples_per_second": 30.029, + "eval_main_steps_per_second": 3.793, + "step": 22400 + }, + { + "epoch": 9.321681231793592, + "eval_anatomy_loss": 2.8331458568573, + "eval_anatomy_runtime": 0.2668, + "eval_anatomy_samples_per_second": 7.497, + "eval_anatomy_steps_per_second": 3.748, + "step": 22400 + }, + { + "epoch": 9.321681231793592, + "eval_college_mathematics_loss": 2.0527069568634033, + "eval_college_mathematics_runtime": 0.2664, + "eval_college_mathematics_samples_per_second": 7.506, + "eval_college_mathematics_steps_per_second": 3.753, + "step": 22400 + }, + { + "epoch": 9.321681231793592, + "eval_international_law_loss": 3.0645482540130615, + "eval_international_law_runtime": 0.2664, + "eval_international_law_samples_per_second": 7.508, + "eval_international_law_steps_per_second": 3.754, + "step": 22400 + }, + { + "epoch": 9.330004161464835, + "grad_norm": 0.287109375, + "learning_rate": 8.603342367041578e-08, + "loss": 0.2334, + "step": 22420 + }, + { + "epoch": 9.33832709113608, + "grad_norm": 0.271484375, + "learning_rate": 8.392118950245581e-08, + "loss": 0.2309, + "step": 22440 + }, + { + "epoch": 9.346650020807324, + "grad_norm": 0.26953125, + "learning_rate": 8.183476503297982e-08, + "loss": 0.2301, + "step": 22460 + }, + { + "epoch": 9.354972950478569, + "grad_norm": 0.287109375, + "learning_rate": 7.977417255016162e-08, + "loss": 0.2335, + "step": 22480 + }, + { + "epoch": 9.363295880149813, + "grad_norm": 0.28125, + "learning_rate": 7.77394340662277e-08, + "loss": 0.2291, + "step": 22500 + }, + { + "epoch": 9.363295880149813, + "eval_main_loss": 0.23837444186210632, + "eval_main_runtime": 6.3339, + "eval_main_samples_per_second": 29.997, + "eval_main_steps_per_second": 3.789, + "step": 22500 + }, + { + "epoch": 9.363295880149813, + "eval_anatomy_loss": 2.834547758102417, + "eval_anatomy_runtime": 0.2678, + "eval_anatomy_samples_per_second": 7.467, + "eval_anatomy_steps_per_second": 3.734, + "step": 22500 + }, + { + "epoch": 9.363295880149813, + "eval_college_mathematics_loss": 2.0539867877960205, + "eval_college_mathematics_runtime": 0.2657, + "eval_college_mathematics_samples_per_second": 7.526, + "eval_college_mathematics_steps_per_second": 3.763, + "step": 22500 + }, + { + "epoch": 9.363295880149813, + "eval_international_law_loss": 3.0666635036468506, + "eval_international_law_runtime": 0.2663, + "eval_international_law_samples_per_second": 7.511, + "eval_international_law_steps_per_second": 3.756, + "step": 22500 + }, + { + "epoch": 9.371618809821056, + "grad_norm": 0.3125, + "learning_rate": 7.573057131721684e-08, + "loss": 0.23, + "step": 22520 + }, + { + "epoch": 9.3799417394923, + "grad_norm": 0.30859375, + "learning_rate": 7.374760576275397e-08, + "loss": 0.2319, + "step": 22540 + }, + { + "epoch": 9.388264669163545, + "grad_norm": 0.294921875, + "learning_rate": 7.179055858581586e-08, + "loss": 0.2296, + "step": 22560 + }, + { + "epoch": 9.39658759883479, + "grad_norm": 0.28515625, + "learning_rate": 6.985945069250766e-08, + "loss": 0.2351, + "step": 22580 + }, + { + "epoch": 9.404910528506035, + "grad_norm": 0.2734375, + "learning_rate": 6.795430271183929e-08, + "loss": 0.237, + "step": 22600 + }, + { + "epoch": 9.404910528506035, + "eval_main_loss": 0.23828138411045074, + "eval_main_runtime": 6.3303, + "eval_main_samples_per_second": 30.014, + "eval_main_steps_per_second": 3.791, + "step": 22600 + }, + { + "epoch": 9.404910528506035, + "eval_anatomy_loss": 2.8350436687469482, + "eval_anatomy_runtime": 0.2661, + "eval_anatomy_samples_per_second": 7.516, + "eval_anatomy_steps_per_second": 3.758, + "step": 22600 + }, + { + "epoch": 9.404910528506035, + "eval_college_mathematics_loss": 2.051068067550659, + "eval_college_mathematics_runtime": 0.2664, + "eval_college_mathematics_samples_per_second": 7.506, + "eval_college_mathematics_steps_per_second": 3.753, + "step": 22600 + }, + { + "epoch": 9.404910528506035, + "eval_international_law_loss": 3.065190315246582, + "eval_international_law_runtime": 0.2653, + "eval_international_law_samples_per_second": 7.537, + "eval_international_law_steps_per_second": 3.769, + "step": 22600 + }, + { + "epoch": 9.41323345817728, + "grad_norm": 0.291015625, + "learning_rate": 6.607513499550328e-08, + "loss": 0.233, + "step": 22620 + }, + { + "epoch": 9.421556387848522, + "grad_norm": 0.310546875, + "learning_rate": 6.422196761766031e-08, + "loss": 0.2309, + "step": 22640 + }, + { + "epoch": 9.429879317519767, + "grad_norm": 0.271484375, + "learning_rate": 6.239482037472156e-08, + "loss": 0.2313, + "step": 22660 + }, + { + "epoch": 9.438202247191011, + "grad_norm": 0.314453125, + "learning_rate": 6.059371278513942e-08, + "loss": 0.2308, + "step": 22680 + }, + { + "epoch": 9.446525176862256, + "grad_norm": 0.29296875, + "learning_rate": 5.881866408919912e-08, + "loss": 0.2296, + "step": 22700 + }, + { + "epoch": 9.446525176862256, + "eval_main_loss": 0.23832696676254272, + "eval_main_runtime": 6.3308, + "eval_main_samples_per_second": 30.012, + "eval_main_steps_per_second": 3.791, + "step": 22700 + }, + { + "epoch": 9.446525176862256, + "eval_anatomy_loss": 2.834747791290283, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.506, + "eval_anatomy_steps_per_second": 3.753, + "step": 22700 + }, + { + "epoch": 9.446525176862256, + "eval_college_mathematics_loss": 2.05255389213562, + "eval_college_mathematics_runtime": 0.2657, + "eval_college_mathematics_samples_per_second": 7.527, + "eval_college_mathematics_steps_per_second": 3.763, + "step": 22700 + }, + { + "epoch": 9.446525176862256, + "eval_international_law_loss": 3.064356803894043, + "eval_international_law_runtime": 0.266, + "eval_international_law_samples_per_second": 7.518, + "eval_international_law_steps_per_second": 3.759, + "step": 22700 + }, + { + "epoch": 9.4548481065335, + "grad_norm": 0.296875, + "learning_rate": 5.7069693248811566e-08, + "loss": 0.2266, + "step": 22720 + }, + { + "epoch": 9.463171036204745, + "grad_norm": 0.283203125, + "learning_rate": 5.5346818947311365e-08, + "loss": 0.2316, + "step": 22740 + }, + { + "epoch": 9.471493965875988, + "grad_norm": 0.265625, + "learning_rate": 5.3650059589258616e-08, + "loss": 0.2313, + "step": 22760 + }, + { + "epoch": 9.479816895547232, + "grad_norm": 0.26171875, + "learning_rate": 5.197943330024019e-08, + "loss": 0.2367, + "step": 22780 + }, + { + "epoch": 9.488139825218477, + "grad_norm": 0.32421875, + "learning_rate": 5.0334957926677917e-08, + "loss": 0.2302, + "step": 22800 + }, + { + "epoch": 9.488139825218477, + "eval_main_loss": 0.2384629100561142, + "eval_main_runtime": 6.3266, + "eval_main_samples_per_second": 30.032, + "eval_main_steps_per_second": 3.794, + "step": 22800 + }, + { + "epoch": 9.488139825218477, + "eval_anatomy_loss": 2.8347084522247314, + "eval_anatomy_runtime": 0.2653, + "eval_anatomy_samples_per_second": 7.54, + "eval_anatomy_steps_per_second": 3.77, + "step": 22800 + }, + { + "epoch": 9.488139825218477, + "eval_college_mathematics_loss": 2.0520851612091064, + "eval_college_mathematics_runtime": 0.267, + "eval_college_mathematics_samples_per_second": 7.492, + "eval_college_mathematics_steps_per_second": 3.746, + "step": 22800 + }, + { + "epoch": 9.488139825218477, + "eval_international_law_loss": 3.0670056343078613, + "eval_international_law_runtime": 0.2666, + "eval_international_law_samples_per_second": 7.501, + "eval_international_law_steps_per_second": 3.751, + "step": 22800 + }, + { + "epoch": 9.496462754889722, + "grad_norm": 0.26171875, + "learning_rate": 4.871665103563655e-08, + "loss": 0.2362, + "step": 22820 + }, + { + "epoch": 9.504785684560966, + "grad_norm": 0.298828125, + "learning_rate": 4.7124529914637226e-08, + "loss": 0.2338, + "step": 22840 + }, + { + "epoch": 9.513108614232209, + "grad_norm": 0.2353515625, + "learning_rate": 4.555861157147179e-08, + "loss": 0.2339, + "step": 22860 + }, + { + "epoch": 9.521431543903454, + "grad_norm": 0.259765625, + "learning_rate": 4.401891273402209e-08, + "loss": 0.2355, + "step": 22880 + }, + { + "epoch": 9.529754473574698, + "grad_norm": 0.26171875, + "learning_rate": 4.250544985008043e-08, + "loss": 0.2331, + "step": 22900 + }, + { + "epoch": 9.529754473574698, + "eval_main_loss": 0.2383948415517807, + "eval_main_runtime": 6.3315, + "eval_main_samples_per_second": 30.008, + "eval_main_steps_per_second": 3.791, + "step": 22900 + }, + { + "epoch": 9.529754473574698, + "eval_anatomy_loss": 2.832704782485962, + "eval_anatomy_runtime": 0.2655, + "eval_anatomy_samples_per_second": 7.532, + "eval_anatomy_steps_per_second": 3.766, + "step": 22900 + }, + { + "epoch": 9.529754473574698, + "eval_college_mathematics_loss": 2.0502614974975586, + "eval_college_mathematics_runtime": 0.2673, + "eval_college_mathematics_samples_per_second": 7.482, + "eval_college_mathematics_steps_per_second": 3.741, + "step": 22900 + }, + { + "epoch": 9.529754473574698, + "eval_international_law_loss": 3.0666775703430176, + "eval_international_law_runtime": 0.2657, + "eval_international_law_samples_per_second": 7.527, + "eval_international_law_steps_per_second": 3.764, + "step": 22900 + }, + { + "epoch": 9.538077403245943, + "grad_norm": 0.267578125, + "learning_rate": 4.1018239087174114e-08, + "loss": 0.2385, + "step": 22920 + }, + { + "epoch": 9.546400332917187, + "grad_norm": 0.2734375, + "learning_rate": 3.9557296332393413e-08, + "loss": 0.2306, + "step": 22940 + }, + { + "epoch": 9.554723262588432, + "grad_norm": 0.2353515625, + "learning_rate": 3.812263719222081e-08, + "loss": 0.2322, + "step": 22960 + }, + { + "epoch": 9.563046192259675, + "grad_norm": 0.2734375, + "learning_rate": 3.671427699236479e-08, + "loss": 0.2305, + "step": 22980 + }, + { + "epoch": 9.57136912193092, + "grad_norm": 0.30078125, + "learning_rate": 3.53322307775969e-08, + "loss": 0.2349, + "step": 23000 + }, + { + "epoch": 9.57136912193092, + "eval_main_loss": 0.23835638165473938, + "eval_main_runtime": 6.3283, + "eval_main_samples_per_second": 30.024, + "eval_main_steps_per_second": 3.792, + "step": 23000 + }, + { + "epoch": 9.57136912193092, + "eval_anatomy_loss": 2.8344595432281494, + "eval_anatomy_runtime": 0.2648, + "eval_anatomy_samples_per_second": 7.554, + "eval_anatomy_steps_per_second": 3.777, + "step": 23000 + }, + { + "epoch": 9.57136912193092, + "eval_college_mathematics_loss": 2.0526890754699707, + "eval_college_mathematics_runtime": 0.2671, + "eval_college_mathematics_samples_per_second": 7.488, + "eval_college_mathematics_steps_per_second": 3.744, + "step": 23000 + }, + { + "epoch": 9.57136912193092, + "eval_international_law_loss": 3.0659825801849365, + "eval_international_law_runtime": 0.2668, + "eval_international_law_samples_per_second": 7.496, + "eval_international_law_steps_per_second": 3.748, + "step": 23000 + }, + { + "epoch": 9.579692051602164, + "grad_norm": 0.24609375, + "learning_rate": 3.39765133115888e-08, + "loss": 0.2312, + "step": 23020 + }, + { + "epoch": 9.588014981273409, + "grad_norm": 0.28125, + "learning_rate": 3.264713907675687e-08, + "loss": 0.2308, + "step": 23040 + }, + { + "epoch": 9.596337910944653, + "grad_norm": 0.28515625, + "learning_rate": 3.134412227410677e-08, + "loss": 0.232, + "step": 23060 + }, + { + "epoch": 9.604660840615896, + "grad_norm": 0.3046875, + "learning_rate": 3.006747682308103e-08, + "loss": 0.2307, + "step": 23080 + }, + { + "epoch": 9.61298377028714, + "grad_norm": 0.291015625, + "learning_rate": 2.8817216361411438e-08, + "loss": 0.234, + "step": 23100 + }, + { + "epoch": 9.61298377028714, + "eval_main_loss": 0.23833413422107697, + "eval_main_runtime": 6.32, + "eval_main_samples_per_second": 30.063, + "eval_main_steps_per_second": 3.797, + "step": 23100 + }, + { + "epoch": 9.61298377028714, + "eval_anatomy_loss": 2.832688331604004, + "eval_anatomy_runtime": 0.2665, + "eval_anatomy_samples_per_second": 7.504, + "eval_anatomy_steps_per_second": 3.752, + "step": 23100 + }, + { + "epoch": 9.61298377028714, + "eval_college_mathematics_loss": 2.050745725631714, + "eval_college_mathematics_runtime": 0.2667, + "eval_college_mathematics_samples_per_second": 7.498, + "eval_college_mathematics_steps_per_second": 3.749, + "step": 23100 + }, + { + "epoch": 9.61298377028714, + "eval_international_law_loss": 3.0674684047698975, + "eval_international_law_runtime": 0.2656, + "eval_international_law_samples_per_second": 7.53, + "eval_international_law_steps_per_second": 3.765, + "step": 23100 + }, + { + "epoch": 9.621306699958385, + "grad_norm": 0.267578125, + "learning_rate": 2.7593354244972448e-08, + "loss": 0.2298, + "step": 23120 + }, + { + "epoch": 9.62962962962963, + "grad_norm": 0.298828125, + "learning_rate": 2.6395903547638825e-08, + "loss": 0.2305, + "step": 23140 + }, + { + "epoch": 9.637952559300874, + "grad_norm": 0.2578125, + "learning_rate": 2.5224877061146292e-08, + "loss": 0.2332, + "step": 23160 + }, + { + "epoch": 9.646275488972119, + "grad_norm": 0.287109375, + "learning_rate": 2.4080287294954706e-08, + "loss": 0.2306, + "step": 23180 + }, + { + "epoch": 9.654598418643362, + "grad_norm": 0.23828125, + "learning_rate": 2.2962146476114e-08, + "loss": 0.2315, + "step": 23200 + }, + { + "epoch": 9.654598418643362, + "eval_main_loss": 0.23836931586265564, + "eval_main_runtime": 6.3322, + "eval_main_samples_per_second": 30.005, + "eval_main_steps_per_second": 3.79, + "step": 23200 + }, + { + "epoch": 9.654598418643362, + "eval_anatomy_loss": 2.834285259246826, + "eval_anatomy_runtime": 0.2661, + "eval_anatomy_samples_per_second": 7.517, + "eval_anatomy_steps_per_second": 3.758, + "step": 23200 + }, + { + "epoch": 9.654598418643362, + "eval_college_mathematics_loss": 2.05277943611145, + "eval_college_mathematics_runtime": 0.2661, + "eval_college_mathematics_samples_per_second": 7.517, + "eval_college_mathematics_steps_per_second": 3.759, + "step": 23200 + }, + { + "epoch": 9.654598418643362, + "eval_international_law_loss": 3.065124273300171, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.517, + "eval_international_law_steps_per_second": 3.759, + "step": 23200 + }, + { + "epoch": 9.662921348314606, + "grad_norm": 0.26953125, + "learning_rate": 2.187046654913455e-08, + "loss": 0.2309, + "step": 23220 + }, + { + "epoch": 9.671244277985851, + "grad_norm": 0.26171875, + "learning_rate": 2.080525917585785e-08, + "loss": 0.2303, + "step": 23240 + }, + { + "epoch": 9.679567207657096, + "grad_norm": 0.2734375, + "learning_rate": 1.9766535735334102e-08, + "loss": 0.2316, + "step": 23260 + }, + { + "epoch": 9.68789013732834, + "grad_norm": 0.2734375, + "learning_rate": 1.875430732369954e-08, + "loss": 0.232, + "step": 23280 + }, + { + "epoch": 9.696213066999583, + "grad_norm": 0.294921875, + "learning_rate": 1.7768584754056796e-08, + "loss": 0.2298, + "step": 23300 + }, + { + "epoch": 9.696213066999583, + "eval_main_loss": 0.2383754402399063, + "eval_main_runtime": 6.3306, + "eval_main_samples_per_second": 30.013, + "eval_main_steps_per_second": 3.791, + "step": 23300 + }, + { + "epoch": 9.696213066999583, + "eval_anatomy_loss": 2.833543539047241, + "eval_anatomy_runtime": 0.2663, + "eval_anatomy_samples_per_second": 7.509, + "eval_anatomy_steps_per_second": 3.755, + "step": 23300 + }, + { + "epoch": 9.696213066999583, + "eval_college_mathematics_loss": 2.0523107051849365, + "eval_college_mathematics_runtime": 0.2657, + "eval_college_mathematics_samples_per_second": 7.527, + "eval_college_mathematics_steps_per_second": 3.763, + "step": 23300 + }, + { + "epoch": 9.696213066999583, + "eval_international_law_loss": 3.065718650817871, + "eval_international_law_runtime": 0.2663, + "eval_international_law_samples_per_second": 7.511, + "eval_international_law_steps_per_second": 3.756, + "step": 23300 + }, + { + "epoch": 9.704535996670828, + "grad_norm": 0.30859375, + "learning_rate": 1.6809378556361945e-08, + "loss": 0.2306, + "step": 23320 + }, + { + "epoch": 9.712858926342072, + "grad_norm": 0.30859375, + "learning_rate": 1.5876698977310145e-08, + "loss": 0.2311, + "step": 23340 + }, + { + "epoch": 9.721181856013317, + "grad_norm": 0.263671875, + "learning_rate": 1.497055598022601e-08, + "loss": 0.2291, + "step": 23360 + }, + { + "epoch": 9.729504785684561, + "grad_norm": 0.21875, + "learning_rate": 1.4090959244958402e-08, + "loss": 0.2285, + "step": 23380 + }, + { + "epoch": 9.737827715355806, + "grad_norm": 0.2890625, + "learning_rate": 1.3237918167776919e-08, + "loss": 0.2278, + "step": 23400 + }, + { + "epoch": 9.737827715355806, + "eval_main_loss": 0.23843945562839508, + "eval_main_runtime": 6.3292, + "eval_main_samples_per_second": 30.02, + "eval_main_steps_per_second": 3.792, + "step": 23400 + }, + { + "epoch": 9.737827715355806, + "eval_anatomy_loss": 2.8331258296966553, + "eval_anatomy_runtime": 0.2669, + "eval_anatomy_samples_per_second": 7.494, + "eval_anatomy_steps_per_second": 3.747, + "step": 23400 + }, + { + "epoch": 9.737827715355806, + "eval_college_mathematics_loss": 2.0540261268615723, + "eval_college_mathematics_runtime": 0.2653, + "eval_college_mathematics_samples_per_second": 7.539, + "eval_college_mathematics_steps_per_second": 3.769, + "step": 23400 + }, + { + "epoch": 9.737827715355806, + "eval_international_law_loss": 3.066096067428589, + "eval_international_law_runtime": 0.2663, + "eval_international_law_samples_per_second": 7.511, + "eval_international_law_steps_per_second": 3.755, + "step": 23400 + }, + { + "epoch": 9.746150645027049, + "grad_norm": 0.306640625, + "learning_rate": 1.2411441861269746e-08, + "loss": 0.2354, + "step": 23420 + }, + { + "epoch": 9.754473574698293, + "grad_norm": 0.251953125, + "learning_rate": 1.161153915424873e-08, + "loss": 0.2322, + "step": 23440 + }, + { + "epoch": 9.762796504369538, + "grad_norm": 0.224609375, + "learning_rate": 1.0838218591653348e-08, + "loss": 0.237, + "step": 23460 + }, + { + "epoch": 9.771119434040783, + "grad_norm": 0.3046875, + "learning_rate": 1.0091488434460506e-08, + "loss": 0.2304, + "step": 23480 + }, + { + "epoch": 9.779442363712027, + "grad_norm": 0.275390625, + "learning_rate": 9.371356659595431e-09, + "loss": 0.2284, + "step": 23500 + }, + { + "epoch": 9.779442363712027, + "eval_main_loss": 0.23842047154903412, + "eval_main_runtime": 6.3254, + "eval_main_samples_per_second": 30.038, + "eval_main_steps_per_second": 3.794, + "step": 23500 + }, + { + "epoch": 9.779442363712027, + "eval_anatomy_loss": 2.8333277702331543, + "eval_anatomy_runtime": 0.266, + "eval_anatomy_samples_per_second": 7.517, + "eval_anatomy_steps_per_second": 3.759, + "step": 23500 + }, + { + "epoch": 9.779442363712027, + "eval_college_mathematics_loss": 2.0542471408843994, + "eval_college_mathematics_runtime": 0.266, + "eval_college_mathematics_samples_per_second": 7.517, + "eval_college_mathematics_steps_per_second": 3.759, + "step": 23500 + }, + { + "epoch": 9.779442363712027, + "eval_international_law_loss": 3.0644288063049316, + "eval_international_law_runtime": 0.2657, + "eval_international_law_samples_per_second": 7.529, + "eval_international_law_steps_per_second": 3.764, + "step": 23500 + }, + { + "epoch": 9.787765293383272, + "grad_norm": 0.294921875, + "learning_rate": 8.677830959846756e-09, + "loss": 0.2291, + "step": 23520 + }, + { + "epoch": 9.796088223054515, + "grad_norm": 0.298828125, + "learning_rate": 8.010918743784624e-09, + "loss": 0.2362, + "step": 23540 + }, + { + "epoch": 9.80441115272576, + "grad_norm": 0.283203125, + "learning_rate": 7.370627135681319e-09, + "loss": 0.2299, + "step": 23560 + }, + { + "epoch": 9.812734082397004, + "grad_norm": 0.3046875, + "learning_rate": 6.75696297543521e-09, + "loss": 0.2291, + "step": 23580 + }, + { + "epoch": 9.821057012068248, + "grad_norm": 0.2890625, + "learning_rate": 6.169932818497482e-09, + "loss": 0.2299, + "step": 23600 + }, + { + "epoch": 9.821057012068248, + "eval_main_loss": 0.2384078949689865, + "eval_main_runtime": 6.3301, + "eval_main_samples_per_second": 30.015, + "eval_main_steps_per_second": 3.791, + "step": 23600 + }, + { + "epoch": 9.821057012068248, + "eval_anatomy_loss": 2.8333277702331543, + "eval_anatomy_runtime": 0.2668, + "eval_anatomy_samples_per_second": 7.495, + "eval_anatomy_steps_per_second": 3.748, + "step": 23600 + }, + { + "epoch": 9.821057012068248, + "eval_college_mathematics_loss": 2.0545294284820557, + "eval_college_mathematics_runtime": 0.2655, + "eval_college_mathematics_samples_per_second": 7.533, + "eval_college_mathematics_steps_per_second": 3.766, + "step": 23600 + }, + { + "epoch": 9.821057012068248, + "eval_international_law_loss": 3.0657763481140137, + "eval_international_law_runtime": 0.267, + "eval_international_law_samples_per_second": 7.492, + "eval_international_law_steps_per_second": 3.746, + "step": 23600 + }, + { + "epoch": 9.829379941739493, + "grad_norm": 0.314453125, + "learning_rate": 5.609542935802181e-09, + "loss": 0.2375, + "step": 23620 + }, + { + "epoch": 9.837702871410738, + "grad_norm": 0.236328125, + "learning_rate": 5.075799313699892e-09, + "loss": 0.2332, + "step": 23640 + }, + { + "epoch": 9.84602580108198, + "grad_norm": 0.3359375, + "learning_rate": 4.568707653892779e-09, + "loss": 0.2306, + "step": 23660 + }, + { + "epoch": 9.854348730753225, + "grad_norm": 0.29296875, + "learning_rate": 4.088273373373807e-09, + "loss": 0.2353, + "step": 23680 + }, + { + "epoch": 9.86267166042447, + "grad_norm": 0.32421875, + "learning_rate": 3.634501604370122e-09, + "loss": 0.2306, + "step": 23700 + }, + { + "epoch": 9.86267166042447, + "eval_main_loss": 0.23842915892601013, + "eval_main_runtime": 6.3292, + "eval_main_samples_per_second": 30.019, + "eval_main_steps_per_second": 3.792, + "step": 23700 + }, + { + "epoch": 9.86267166042447, + "eval_anatomy_loss": 2.8333277702331543, + "eval_anatomy_runtime": 0.2678, + "eval_anatomy_samples_per_second": 7.467, + "eval_anatomy_steps_per_second": 3.734, + "step": 23700 + }, + { + "epoch": 9.86267166042447, + "eval_college_mathematics_loss": 2.0545294284820557, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.495, + "eval_college_mathematics_steps_per_second": 3.747, + "step": 23700 + }, + { + "epoch": 9.86267166042447, + "eval_international_law_loss": 3.066246271133423, + "eval_international_law_runtime": 0.2662, + "eval_international_law_samples_per_second": 7.514, + "eval_international_law_steps_per_second": 3.757, + "step": 23700 + }, + { + "epoch": 9.870994590095714, + "grad_norm": 0.314453125, + "learning_rate": 3.2073971942864214e-09, + "loss": 0.2292, + "step": 23720 + }, + { + "epoch": 9.879317519766959, + "grad_norm": 0.287109375, + "learning_rate": 2.806964705654447e-09, + "loss": 0.2297, + "step": 23740 + }, + { + "epoch": 9.887640449438202, + "grad_norm": 0.26953125, + "learning_rate": 2.4332084160835766e-09, + "loss": 0.2313, + "step": 23760 + }, + { + "epoch": 9.895963379109446, + "grad_norm": 0.279296875, + "learning_rate": 2.086132318215861e-09, + "loss": 0.2343, + "step": 23780 + }, + { + "epoch": 9.90428630878069, + "grad_norm": 0.279296875, + "learning_rate": 1.765740119682169e-09, + "loss": 0.2333, + "step": 23800 + }, + { + "epoch": 9.90428630878069, + "eval_main_loss": 0.23843587934970856, + "eval_main_runtime": 6.3325, + "eval_main_samples_per_second": 30.004, + "eval_main_steps_per_second": 3.79, + "step": 23800 + }, + { + "epoch": 9.90428630878069, + "eval_anatomy_loss": 2.8333277702331543, + "eval_anatomy_runtime": 0.2662, + "eval_anatomy_samples_per_second": 7.514, + "eval_anatomy_steps_per_second": 3.757, + "step": 23800 + }, + { + "epoch": 9.90428630878069, + "eval_college_mathematics_loss": 2.0545294284820557, + "eval_college_mathematics_runtime": 0.2677, + "eval_college_mathematics_samples_per_second": 7.471, + "eval_college_mathematics_steps_per_second": 3.736, + "step": 23800 + }, + { + "epoch": 9.90428630878069, + "eval_international_law_loss": 3.066246271133423, + "eval_international_law_runtime": 0.2655, + "eval_international_law_samples_per_second": 7.534, + "eval_international_law_steps_per_second": 3.767, + "step": 23800 + }, + { + "epoch": 9.912609238451935, + "grad_norm": 0.28515625, + "learning_rate": 1.4720352430644402e-09, + "loss": 0.2361, + "step": 23820 + }, + { + "epoch": 9.92093216812318, + "grad_norm": 0.302734375, + "learning_rate": 1.205020825856551e-09, + "loss": 0.2316, + "step": 23840 + }, + { + "epoch": 9.929255097794425, + "grad_norm": 0.279296875, + "learning_rate": 9.64699720433504e-10, + "loss": 0.2324, + "step": 23860 + }, + { + "epoch": 9.937578027465667, + "grad_norm": 0.26953125, + "learning_rate": 7.510744940192327e-10, + "loss": 0.2323, + "step": 23880 + }, + { + "epoch": 9.945900957136912, + "grad_norm": 0.32421875, + "learning_rate": 5.64147428659123e-10, + "loss": 0.2323, + "step": 23900 + }, + { + "epoch": 9.945900957136912, + "eval_main_loss": 0.23843084275722504, + "eval_main_runtime": 6.3373, + "eval_main_samples_per_second": 29.981, + "eval_main_steps_per_second": 3.787, + "step": 23900 + }, + { + "epoch": 9.945900957136912, + "eval_anatomy_loss": 2.8333277702331543, + "eval_anatomy_runtime": 0.2674, + "eval_anatomy_samples_per_second": 7.479, + "eval_anatomy_steps_per_second": 3.74, + "step": 23900 + }, + { + "epoch": 9.945900957136912, + "eval_college_mathematics_loss": 2.0545294284820557, + "eval_college_mathematics_runtime": 0.2668, + "eval_college_mathematics_samples_per_second": 7.496, + "eval_college_mathematics_steps_per_second": 3.748, + "step": 23900 + }, + { + "epoch": 9.945900957136912, + "eval_international_law_loss": 3.066246271133423, + "eval_international_law_runtime": 0.2661, + "eval_international_law_samples_per_second": 7.517, + "eval_international_law_steps_per_second": 3.759, + "step": 23900 + }, + { + "epoch": 9.954223886808157, + "grad_norm": 0.310546875, + "learning_rate": 4.0392052119697654e-10, + "loss": 0.2338, + "step": 23920 + }, + { + "epoch": 9.962546816479401, + "grad_norm": 0.3125, + "learning_rate": 2.703954832528055e-10, + "loss": 0.2311, + "step": 23940 + }, + { + "epoch": 9.970869746150646, + "grad_norm": 0.2421875, + "learning_rate": 1.6357374120368197e-10, + "loss": 0.2312, + "step": 23960 + }, + { + "epoch": 9.979192675821889, + "grad_norm": 0.310546875, + "learning_rate": 8.345643617069243e-11, + "loss": 0.232, + "step": 23980 + }, + { + "epoch": 9.987515605493133, + "grad_norm": 0.240234375, + "learning_rate": 3.004442400422747e-11, + "loss": 0.2373, + "step": 24000 + }, + { + "epoch": 9.987515605493133, + "eval_main_loss": 0.23843084275722504, + "eval_main_runtime": 6.3378, + "eval_main_samples_per_second": 29.979, + "eval_main_steps_per_second": 3.787, + "step": 24000 + }, + { + "epoch": 9.987515605493133, + "eval_anatomy_loss": 2.8333277702331543, + "eval_anatomy_runtime": 0.2659, + "eval_anatomy_samples_per_second": 7.522, + "eval_anatomy_steps_per_second": 3.761, + "step": 24000 + }, + { + "epoch": 9.987515605493133, + "eval_college_mathematics_loss": 2.0545294284820557, + "eval_college_mathematics_runtime": 0.2659, + "eval_college_mathematics_samples_per_second": 7.521, + "eval_college_mathematics_steps_per_second": 3.761, + "step": 24000 + }, + { + "epoch": 9.987515605493133, + "eval_international_law_loss": 3.066246271133423, + "eval_international_law_runtime": 0.2672, + "eval_international_law_samples_per_second": 7.486, + "eval_international_law_steps_per_second": 3.743, + "step": 24000 + }, + { + "epoch": 9.995838535164378, + "grad_norm": 0.259765625, + "learning_rate": 3.3382752773203352e-12, + "loss": 0.2333, + "step": 24020 + } + ], + "logging_steps": 20, + "max_steps": 24030, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.7646735410967085e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}