{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.971563981042654, "eval_steps": 100, "global_step": 104, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018957345971563982, "grad_norm": 264.3919199015695, "learning_rate": 4.545454545454545e-08, "logits/chosen": 117.53560638427734, "logits/rejected": 126.8960952758789, "logps/chosen": -335.40118408203125, "logps/rejected": -439.16552734375, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.1895734597156398, "grad_norm": 268.0892149105686, "learning_rate": 4.545454545454545e-07, "logits/chosen": 135.0012969970703, "logits/rejected": 138.34600830078125, "logps/chosen": -395.8360595703125, "logps/rejected": -439.23095703125, "loss": 0.9693, "rewards/accuracies": 0.4513888955116272, "rewards/chosen": 0.008023276925086975, "rewards/margins": 0.0031940473709255457, "rewards/rejected": 0.004829231183975935, "step": 10 }, { "epoch": 0.3791469194312796, "grad_norm": 204.23414477809496, "learning_rate": 4.885348141000122e-07, "logits/chosen": 121.4811019897461, "logits/rejected": 125.18589782714844, "logps/chosen": -370.91253662109375, "logps/rejected": -425.2193298339844, "loss": 0.8003, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.07497303187847137, "rewards/margins": 0.33636990189552307, "rewards/rejected": -0.2613968253135681, "step": 20 }, { "epoch": 0.5687203791469194, "grad_norm": 173.7304699078016, "learning_rate": 4.5025027361734613e-07, "logits/chosen": 141.61241149902344, "logits/rejected": 135.17759704589844, "logps/chosen": -426.39410400390625, "logps/rejected": -472.140625, "loss": 0.7258, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6964428424835205, "rewards/margins": 0.9104781150817871, "rewards/rejected": -2.6069209575653076, "step": 30 }, { "epoch": 0.7582938388625592, "grad_norm": 189.807566233107, "learning_rate": 3.893311157806091e-07, "logits/chosen": 125.9510498046875, "logits/rejected": 114.42036437988281, "logps/chosen": -409.6640625, "logps/rejected": -436.3352966308594, "loss": 0.6779, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7734901905059814, "rewards/margins": 1.1495102643966675, "rewards/rejected": -3.9230003356933594, "step": 40 }, { "epoch": 0.9478672985781991, "grad_norm": 208.75706608415575, "learning_rate": 3.126631330646801e-07, "logits/chosen": 141.94976806640625, "logits/rejected": 145.94911193847656, "logps/chosen": -465.7303161621094, "logps/rejected": -547.1525268554688, "loss": 0.5763, "rewards/accuracies": 0.75, "rewards/chosen": -2.3293235301971436, "rewards/margins": 1.2119842767715454, "rewards/rejected": -3.5413079261779785, "step": 50 }, { "epoch": 1.1374407582938388, "grad_norm": 137.79002373576745, "learning_rate": 2.2891223348923882e-07, "logits/chosen": 134.63436889648438, "logits/rejected": 138.12661743164062, "logps/chosen": -450.57525634765625, "logps/rejected": -530.6395263671875, "loss": 0.3625, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.628477096557617, "rewards/margins": 2.0852205753326416, "rewards/rejected": -4.713697910308838, "step": 60 }, { "epoch": 1.3270142180094786, "grad_norm": 111.85723665841337, "learning_rate": 1.4754491880085317e-07, "logits/chosen": 128.9261932373047, "logits/rejected": 130.80894470214844, "logps/chosen": -414.4986877441406, "logps/rejected": -507.4915466308594, "loss": 0.2163, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.367741823196411, "rewards/margins": 2.3311409950256348, "rewards/rejected": -4.698883056640625, "step": 70 }, { "epoch": 1.5165876777251186, "grad_norm": 115.1507773079353, "learning_rate": 7.775827023107834e-08, "logits/chosen": 116.55960845947266, "logits/rejected": 133.2828369140625, "logps/chosen": -403.3376770019531, "logps/rejected": -510.32763671875, "loss": 0.1811, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.4843320846557617, "rewards/margins": 2.370164394378662, "rewards/rejected": -4.854496955871582, "step": 80 }, { "epoch": 1.7061611374407581, "grad_norm": 106.89250321890712, "learning_rate": 2.7440387297912122e-08, "logits/chosen": 116.5929183959961, "logits/rejected": 129.53289794921875, "logps/chosen": -430.5577087402344, "logps/rejected": -534.07861328125, "loss": 0.1604, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.206333637237549, "rewards/margins": 2.613506555557251, "rewards/rejected": -4.819840431213379, "step": 90 }, { "epoch": 1.8957345971563981, "grad_norm": 92.45827774262504, "learning_rate": 2.27878296044029e-09, "logits/chosen": 122.9133529663086, "logits/rejected": 122.8787612915039, "logps/chosen": -420.15228271484375, "logps/rejected": -503.9156188964844, "loss": 0.1516, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9088605642318726, "rewards/margins": 2.448718547821045, "rewards/rejected": -4.357579231262207, "step": 100 }, { "epoch": 1.8957345971563981, "eval_logits/chosen": 101.81689453125, "eval_logits/rejected": 95.97393798828125, "eval_logps/chosen": -416.4397888183594, "eval_logps/rejected": -440.0738830566406, "eval_loss": 0.5250489711761475, "eval_rewards/accuracies": 0.71875, "eval_rewards/chosen": -2.638577699661255, "eval_rewards/margins": 1.2615679502487183, "eval_rewards/rejected": -3.9001457691192627, "eval_runtime": 123.0399, "eval_samples_per_second": 6.096, "eval_steps_per_second": 0.195, "step": 100 }, { "epoch": 1.971563981042654, "step": 104, "total_flos": 0.0, "train_loss": 0.4740314678503917, "train_runtime": 2291.7376, "train_samples_per_second": 5.891, "train_steps_per_second": 0.045 } ], "logging_steps": 10, "max_steps": 104, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }