{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "global_step": 44028, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "learning_rate": 1.9772871808848918e-05, "loss": 3.0067, "step": 500 }, { "epoch": 0.09, "learning_rate": 1.954574361769783e-05, "loss": 2.1686, "step": 1000 }, { "epoch": 0.14, "learning_rate": 1.9318615426546744e-05, "loss": 1.9283, "step": 1500 }, { "epoch": 0.18, "learning_rate": 1.909148723539566e-05, "loss": 1.8097, "step": 2000 }, { "epoch": 0.23, "learning_rate": 1.8864359044244573e-05, "loss": 1.6774, "step": 2500 }, { "epoch": 0.27, "learning_rate": 1.8637230853093485e-05, "loss": 1.6303, "step": 3000 }, { "epoch": 0.32, "learning_rate": 1.84101026619424e-05, "loss": 1.5638, "step": 3500 }, { "epoch": 0.36, "learning_rate": 1.8182974470791318e-05, "loss": 1.5224, "step": 4000 }, { "epoch": 0.41, "learning_rate": 1.795584627964023e-05, "loss": 1.4723, "step": 4500 }, { "epoch": 0.45, "learning_rate": 1.7728718088489147e-05, "loss": 1.4697, "step": 5000 }, { "epoch": 0.5, "learning_rate": 1.750158989733806e-05, "loss": 1.4104, "step": 5500 }, { "epoch": 0.55, "learning_rate": 1.7274461706186973e-05, "loss": 1.4252, "step": 6000 }, { "epoch": 0.59, "learning_rate": 1.704733351503589e-05, "loss": 1.3764, "step": 6500 }, { "epoch": 0.64, "learning_rate": 1.68202053238848e-05, "loss": 1.3702, "step": 7000 }, { "epoch": 0.68, "learning_rate": 1.6593077132733718e-05, "loss": 1.3184, "step": 7500 }, { "epoch": 0.73, "learning_rate": 1.636594894158263e-05, "loss": 1.2977, "step": 8000 }, { "epoch": 0.77, "learning_rate": 1.6138820750431547e-05, "loss": 1.3242, "step": 8500 }, { "epoch": 0.82, "learning_rate": 1.591169255928046e-05, "loss": 1.2685, "step": 9000 }, { "epoch": 0.86, "learning_rate": 1.5684564368129372e-05, "loss": 1.2796, "step": 9500 }, { "epoch": 0.91, "learning_rate": 1.545743617697829e-05, "loss": 1.2629, "step": 10000 }, { "epoch": 0.95, "learning_rate": 1.5230307985827202e-05, "loss": 1.2414, "step": 10500 }, { "epoch": 1.0, "learning_rate": 1.5003179794676118e-05, "loss": 1.2226, "step": 11000 }, { "epoch": 1.04, "learning_rate": 1.477605160352503e-05, "loss": 1.0322, "step": 11500 }, { "epoch": 1.09, "learning_rate": 1.4548923412373945e-05, "loss": 1.0281, "step": 12000 }, { "epoch": 1.14, "learning_rate": 1.4321795221222858e-05, "loss": 1.0143, "step": 12500 }, { "epoch": 1.18, "learning_rate": 1.4094667030071774e-05, "loss": 1.0324, "step": 13000 }, { "epoch": 1.23, "learning_rate": 1.3867538838920689e-05, "loss": 0.996, "step": 13500 }, { "epoch": 1.27, "learning_rate": 1.3640410647769601e-05, "loss": 1.0049, "step": 14000 }, { "epoch": 1.32, "learning_rate": 1.3413282456618518e-05, "loss": 0.9879, "step": 14500 }, { "epoch": 1.36, "learning_rate": 1.318615426546743e-05, "loss": 0.9806, "step": 15000 }, { "epoch": 1.41, "learning_rate": 1.2959026074316345e-05, "loss": 0.986, "step": 15500 }, { "epoch": 1.45, "learning_rate": 1.2731897883165258e-05, "loss": 0.9894, "step": 16000 }, { "epoch": 1.5, "learning_rate": 1.2504769692014174e-05, "loss": 1.0172, "step": 16500 }, { "epoch": 1.54, "learning_rate": 1.2277641500863089e-05, "loss": 0.9919, "step": 17000 }, { "epoch": 1.59, "learning_rate": 1.2050513309712001e-05, "loss": 1.0085, "step": 17500 }, { "epoch": 1.64, "learning_rate": 1.1823385118560918e-05, "loss": 0.9667, "step": 18000 }, { "epoch": 1.68, "learning_rate": 1.159625692740983e-05, "loss": 0.989, "step": 18500 }, { "epoch": 1.73, "learning_rate": 1.1369128736258745e-05, "loss": 0.985, "step": 19000 }, { "epoch": 1.77, "learning_rate": 1.1142000545107661e-05, "loss": 1.0043, "step": 19500 }, { "epoch": 1.82, "learning_rate": 1.0914872353956574e-05, "loss": 0.9636, "step": 20000 }, { "epoch": 1.86, "learning_rate": 1.0687744162805489e-05, "loss": 0.9883, "step": 20500 }, { "epoch": 1.91, "learning_rate": 1.0460615971654401e-05, "loss": 0.964, "step": 21000 }, { "epoch": 1.95, "learning_rate": 1.0233487780503318e-05, "loss": 0.951, "step": 21500 }, { "epoch": 2.0, "learning_rate": 1.000635958935223e-05, "loss": 0.9524, "step": 22000 }, { "epoch": 2.04, "learning_rate": 9.779231398201145e-06, "loss": 0.7409, "step": 22500 }, { "epoch": 2.09, "learning_rate": 9.55210320705006e-06, "loss": 0.7549, "step": 23000 }, { "epoch": 2.14, "learning_rate": 9.324975015898974e-06, "loss": 0.7276, "step": 23500 }, { "epoch": 2.18, "learning_rate": 9.097846824747889e-06, "loss": 0.7692, "step": 24000 }, { "epoch": 2.23, "learning_rate": 8.870718633596803e-06, "loss": 0.7532, "step": 24500 }, { "epoch": 2.27, "learning_rate": 8.643590442445718e-06, "loss": 0.7775, "step": 25000 }, { "epoch": 2.32, "learning_rate": 8.416462251294632e-06, "loss": 0.7352, "step": 25500 }, { "epoch": 2.36, "learning_rate": 8.189334060143545e-06, "loss": 0.7427, "step": 26000 }, { "epoch": 2.41, "learning_rate": 7.96220586899246e-06, "loss": 0.7589, "step": 26500 }, { "epoch": 2.45, "learning_rate": 7.735077677841374e-06, "loss": 0.7267, "step": 27000 }, { "epoch": 2.5, "learning_rate": 7.507949486690289e-06, "loss": 0.7502, "step": 27500 }, { "epoch": 2.54, "learning_rate": 7.280821295539203e-06, "loss": 0.7225, "step": 28000 }, { "epoch": 2.59, "learning_rate": 7.0536931043881176e-06, "loss": 0.7362, "step": 28500 }, { "epoch": 2.63, "learning_rate": 6.826564913237031e-06, "loss": 0.7324, "step": 29000 }, { "epoch": 2.68, "learning_rate": 6.599436722085946e-06, "loss": 0.7083, "step": 29500 }, { "epoch": 2.73, "learning_rate": 6.3723085309348594e-06, "loss": 0.7398, "step": 30000 }, { "epoch": 2.77, "learning_rate": 6.145180339783774e-06, "loss": 0.7333, "step": 30500 }, { "epoch": 2.82, "learning_rate": 5.918052148632689e-06, "loss": 0.7494, "step": 31000 }, { "epoch": 2.86, "learning_rate": 5.690923957481603e-06, "loss": 0.7604, "step": 31500 }, { "epoch": 2.91, "learning_rate": 5.4637957663305175e-06, "loss": 0.7495, "step": 32000 }, { "epoch": 2.95, "learning_rate": 5.236667575179431e-06, "loss": 0.7622, "step": 32500 }, { "epoch": 3.0, "learning_rate": 5.009539384028346e-06, "loss": 0.731, "step": 33000 }, { "epoch": 3.04, "learning_rate": 4.78241119287726e-06, "loss": 0.5705, "step": 33500 }, { "epoch": 3.09, "learning_rate": 4.555283001726175e-06, "loss": 0.5773, "step": 34000 }, { "epoch": 3.13, "learning_rate": 4.3281548105750885e-06, "loss": 0.5764, "step": 34500 }, { "epoch": 3.18, "learning_rate": 4.101026619424003e-06, "loss": 0.5782, "step": 35000 }, { "epoch": 3.23, "learning_rate": 3.8738984282729175e-06, "loss": 0.582, "step": 35500 }, { "epoch": 3.27, "learning_rate": 3.6467702371218316e-06, "loss": 0.5784, "step": 36000 }, { "epoch": 3.32, "learning_rate": 3.419642045970746e-06, "loss": 0.5942, "step": 36500 }, { "epoch": 3.36, "learning_rate": 3.1925138548196606e-06, "loss": 0.5642, "step": 37000 }, { "epoch": 3.41, "learning_rate": 2.9653856636685747e-06, "loss": 0.5889, "step": 37500 }, { "epoch": 3.45, "learning_rate": 2.7382574725174893e-06, "loss": 0.5779, "step": 38000 }, { "epoch": 3.5, "learning_rate": 2.5111292813664034e-06, "loss": 0.5782, "step": 38500 }, { "epoch": 3.54, "learning_rate": 2.2840010902153175e-06, "loss": 0.5962, "step": 39000 }, { "epoch": 3.59, "learning_rate": 2.056872899064232e-06, "loss": 0.5722, "step": 39500 }, { "epoch": 3.63, "learning_rate": 1.8297447079131463e-06, "loss": 0.5736, "step": 40000 }, { "epoch": 3.68, "learning_rate": 1.6026165167620606e-06, "loss": 0.569, "step": 40500 }, { "epoch": 3.72, "learning_rate": 1.375488325610975e-06, "loss": 0.5749, "step": 41000 }, { "epoch": 3.77, "learning_rate": 1.1483601344598892e-06, "loss": 0.5859, "step": 41500 }, { "epoch": 3.82, "learning_rate": 9.212319433088035e-07, "loss": 0.5674, "step": 42000 }, { "epoch": 3.86, "learning_rate": 6.94103752157718e-07, "loss": 0.5916, "step": 42500 }, { "epoch": 3.91, "learning_rate": 4.6697556100663217e-07, "loss": 0.5707, "step": 43000 }, { "epoch": 3.95, "learning_rate": 2.398473698555465e-07, "loss": 0.5624, "step": 43500 }, { "epoch": 4.0, "learning_rate": 1.2719178704460798e-08, "loss": 0.5748, "step": 44000 }, { "epoch": 4.0, "step": 44028, "total_flos": 5.176959239737958e+16, "train_loss": 0.9650859335002343, "train_runtime": 62054.88, "train_samples_per_second": 8.514, "train_steps_per_second": 0.71 } ], "max_steps": 44028, "num_train_epochs": 4, "total_flos": 5.176959239737958e+16, "trial_name": null, "trial_params": null }