{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4e-08, "logits/chosen": -2.6577353477478027, "logits/rejected": -2.043900489807129, "logps/chosen": -505.98724365234375, "logps/rejected": -319.40179443359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 4.0000000000000003e-07, "logits/chosen": -2.477527141571045, "logits/rejected": -2.134815216064453, "logps/chosen": -285.37506103515625, "logps/rejected": -191.59552001953125, "loss": 0.6932, "rewards/accuracies": 0.2777777910232544, "rewards/chosen": -1.798523953766562e-05, "rewards/margins": -2.5926061425707303e-05, "rewards/rejected": 7.940820069052279e-06, "step": 10 }, { "epoch": 0.02, "learning_rate": 8.000000000000001e-07, "logits/chosen": -2.360628843307495, "logits/rejected": -2.1267056465148926, "logps/chosen": -271.4191589355469, "logps/rejected": -208.81991577148438, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.00010290103818988428, "rewards/margins": 2.105976818711497e-05, "rewards/rejected": 8.184127364074811e-05, "step": 20 }, { "epoch": 0.02, "learning_rate": 1.2000000000000002e-06, "logits/chosen": -2.278747797012329, "logits/rejected": -2.2493417263031006, "logps/chosen": -269.8002624511719, "logps/rejected": -288.9651794433594, "loss": 0.6931, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00028741464484483004, "rewards/margins": 0.00019061131752096117, "rewards/rejected": 9.68033418757841e-05, "step": 30 }, { "epoch": 0.03, "learning_rate": 1.6000000000000001e-06, "logits/chosen": -2.5087287425994873, "logits/rejected": -2.340841293334961, "logps/chosen": -210.5767059326172, "logps/rejected": -181.60897827148438, "loss": 0.6931, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0001595711219124496, "rewards/margins": 0.00010818429291248322, "rewards/rejected": 5.138682899996638e-05, "step": 40 }, { "epoch": 0.04, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -2.362971067428589, "logits/rejected": -2.338986873626709, "logps/chosen": -195.54049682617188, "logps/rejected": -211.3101806640625, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0001843376230681315, "rewards/margins": 0.00030581915052607656, "rewards/rejected": -0.0001214815056300722, "step": 50 }, { "epoch": 0.05, "learning_rate": 2.4000000000000003e-06, "logits/chosen": -2.495109796524048, "logits/rejected": -2.304320812225342, "logps/chosen": -244.63357543945312, "logps/rejected": -277.849853515625, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.000983591889962554, "rewards/margins": 1.5824800357222557e-05, "rewards/rejected": -0.0009994168067350984, "step": 60 }, { "epoch": 0.06, "learning_rate": 2.8000000000000003e-06, "logits/chosen": -2.2505688667297363, "logits/rejected": -2.2676665782928467, "logps/chosen": -229.9136199951172, "logps/rejected": -216.98001098632812, "loss": 0.6928, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0010834664572030306, "rewards/margins": 0.0009649285930208862, "rewards/rejected": -0.0020483951084315777, "step": 70 }, { "epoch": 0.06, "learning_rate": 3.2000000000000003e-06, "logits/chosen": -2.4005239009857178, "logits/rejected": -2.3944191932678223, "logps/chosen": -266.11859130859375, "logps/rejected": -262.9701232910156, "loss": 0.6925, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0014705440262332559, "rewards/margins": 0.002042059786617756, "rewards/rejected": -0.00351260369643569, "step": 80 }, { "epoch": 0.07, "learning_rate": 3.6000000000000003e-06, "logits/chosen": -2.3290226459503174, "logits/rejected": -1.9915828704833984, "logps/chosen": -268.29278564453125, "logps/rejected": -180.13323974609375, "loss": 0.6924, "rewards/accuracies": 0.625, "rewards/chosen": -0.002903540385887027, "rewards/margins": 0.002247781725600362, "rewards/rejected": -0.005151322111487389, "step": 90 }, { "epoch": 0.08, "learning_rate": 4.000000000000001e-06, "logits/chosen": -2.3091585636138916, "logits/rejected": -2.312863826751709, "logps/chosen": -275.3695373535156, "logps/rejected": -251.3533477783203, "loss": 0.6928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.003613454522565007, "rewards/margins": 0.0010787051869556308, "rewards/rejected": -0.004692160524427891, "step": 100 }, { "epoch": 0.09, "learning_rate": 4.4e-06, "logits/chosen": -2.258510112762451, "logits/rejected": -2.1448404788970947, "logps/chosen": -211.90158081054688, "logps/rejected": -180.40475463867188, "loss": 0.6919, "rewards/accuracies": 0.625, "rewards/chosen": 0.0001777430734364316, "rewards/margins": 0.0037381600122898817, "rewards/rejected": -0.003560416866093874, "step": 110 }, { "epoch": 0.1, "learning_rate": 4.800000000000001e-06, "logits/chosen": -2.158743381500244, "logits/rejected": -2.169588088989258, "logps/chosen": -238.22067260742188, "logps/rejected": -270.46417236328125, "loss": 0.6906, "rewards/accuracies": 0.625, "rewards/chosen": -0.0018882494186982512, "rewards/margins": 0.005054115317761898, "rewards/rejected": -0.006942364387214184, "step": 120 }, { "epoch": 0.1, "learning_rate": 4.999756310023261e-06, "logits/chosen": -2.4432594776153564, "logits/rejected": -2.315918445587158, "logps/chosen": -237.79501342773438, "logps/rejected": -196.10853576660156, "loss": 0.6911, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0036071953363716602, "rewards/margins": 0.005986797157675028, "rewards/rejected": -0.009593991562724113, "step": 130 }, { "epoch": 0.11, "learning_rate": 4.997807075247147e-06, "logits/chosen": -2.3266425132751465, "logits/rejected": -2.025289535522461, "logps/chosen": -207.1509552001953, "logps/rejected": -187.79754638671875, "loss": 0.6907, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.015933997929096222, "rewards/margins": 0.004653518553823233, "rewards/rejected": -0.02058752067387104, "step": 140 }, { "epoch": 0.12, "learning_rate": 4.993910125649561e-06, "logits/chosen": -2.303725481033325, "logits/rejected": -2.190458297729492, "logps/chosen": -239.7921600341797, "logps/rejected": -213.1902618408203, "loss": 0.6873, "rewards/accuracies": 0.625, "rewards/chosen": -0.01170186698436737, "rewards/margins": 0.013611750677227974, "rewards/rejected": -0.025313619524240494, "step": 150 }, { "epoch": 0.13, "learning_rate": 4.988068499954578e-06, "logits/chosen": -2.1805403232574463, "logits/rejected": -2.2514309883117676, "logps/chosen": -323.44732666015625, "logps/rejected": -342.3636779785156, "loss": 0.6899, "rewards/accuracies": 0.5, "rewards/chosen": -0.07079382240772247, "rewards/margins": 0.012775696814060211, "rewards/rejected": -0.08356951922178268, "step": 160 }, { "epoch": 0.14, "learning_rate": 4.980286753286196e-06, "logits/chosen": -2.377260446548462, "logits/rejected": -1.992498755455017, "logps/chosen": -361.740966796875, "logps/rejected": -316.5578308105469, "loss": 0.6815, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09483315795660019, "rewards/margins": -0.003298636060208082, "rewards/rejected": -0.09153451770544052, "step": 170 }, { "epoch": 0.14, "learning_rate": 4.970570953616383e-06, "logits/chosen": -2.125253677368164, "logits/rejected": -2.1080162525177, "logps/chosen": -345.4566955566406, "logps/rejected": -362.8345642089844, "loss": 0.6796, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.11889471858739853, "rewards/margins": 0.017375323921442032, "rewards/rejected": -0.13627007603645325, "step": 180 }, { "epoch": 0.15, "learning_rate": 4.958928677033465e-06, "logits/chosen": -1.872900366783142, "logits/rejected": -1.8023853302001953, "logps/chosen": -607.0579223632812, "logps/rejected": -631.1737060546875, "loss": 0.6814, "rewards/accuracies": 0.5, "rewards/chosen": -0.37075790762901306, "rewards/margins": 0.051843322813510895, "rewards/rejected": -0.42260122299194336, "step": 190 }, { "epoch": 0.16, "learning_rate": 4.9453690018345144e-06, "logits/chosen": -2.2707138061523438, "logits/rejected": -2.1255440711975098, "logps/chosen": -350.2916564941406, "logps/rejected": -372.66778564453125, "loss": 0.6812, "rewards/accuracies": 0.5, "rewards/chosen": -0.09592956304550171, "rewards/margins": 0.014601891860365868, "rewards/rejected": -0.11053146421909332, "step": 200 }, { "epoch": 0.17, "learning_rate": 4.9299025014463665e-06, "logits/chosen": -2.4437155723571777, "logits/rejected": -2.0841479301452637, "logps/chosen": -348.2846984863281, "logps/rejected": -332.79693603515625, "loss": 0.6716, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.07477747648954391, "rewards/margins": 0.06754375249147415, "rewards/rejected": -0.14232121407985687, "step": 210 }, { "epoch": 0.18, "learning_rate": 4.912541236180779e-06, "logits/chosen": -2.2131965160369873, "logits/rejected": -1.79428231716156, "logps/chosen": -453.1966247558594, "logps/rejected": -504.7650451660156, "loss": 0.6753, "rewards/accuracies": 0.5, "rewards/chosen": -0.2166297435760498, "rewards/margins": 0.09042102098464966, "rewards/rejected": -0.30705076456069946, "step": 220 }, { "epoch": 0.18, "learning_rate": 4.893298743830168e-06, "logits/chosen": -2.114473342895508, "logits/rejected": -2.1522789001464844, "logps/chosen": -447.54510498046875, "logps/rejected": -531.5616455078125, "loss": 0.6747, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.24870488047599792, "rewards/margins": 0.05732503533363342, "rewards/rejected": -0.30602994561195374, "step": 230 }, { "epoch": 0.19, "learning_rate": 4.8721900291112415e-06, "logits/chosen": -2.0410103797912598, "logits/rejected": -1.8702392578125, "logps/chosen": -417.52557373046875, "logps/rejected": -418.7784118652344, "loss": 0.6794, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.21090665459632874, "rewards/margins": 0.014016765169799328, "rewards/rejected": -0.22492341697216034, "step": 240 }, { "epoch": 0.2, "learning_rate": 4.849231551964771e-06, "logits/chosen": -1.9370673894882202, "logits/rejected": -1.845969796180725, "logps/chosen": -467.31817626953125, "logps/rejected": -516.6582641601562, "loss": 0.6586, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.2973518967628479, "rewards/margins": 0.04663931205868721, "rewards/rejected": -0.34399116039276123, "step": 250 }, { "epoch": 0.21, "learning_rate": 4.824441214720629e-06, "logits/chosen": -1.7340936660766602, "logits/rejected": -1.4695367813110352, "logps/chosen": -1242.895751953125, "logps/rejected": -1449.67138671875, "loss": 0.6468, "rewards/accuracies": 0.625, "rewards/chosen": -0.9694870114326477, "rewards/margins": 0.2230859100818634, "rewards/rejected": -1.192572832107544, "step": 260 }, { "epoch": 0.22, "learning_rate": 4.7978383481380865e-06, "logits/chosen": -1.9577367305755615, "logits/rejected": -1.7384374141693115, "logps/chosen": -651.7098999023438, "logps/rejected": -757.5595703125, "loss": 0.6567, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.40336376428604126, "rewards/margins": 0.14299169182777405, "rewards/rejected": -0.5463554263114929, "step": 270 }, { "epoch": 0.22, "learning_rate": 4.769443696332272e-06, "logits/chosen": -1.6682405471801758, "logits/rejected": -1.7225072383880615, "logps/chosen": -636.3197021484375, "logps/rejected": -823.1611328125, "loss": 0.654, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4699520468711853, "rewards/margins": 0.15158866345882416, "rewards/rejected": -0.6215407252311707, "step": 280 }, { "epoch": 0.23, "learning_rate": 4.7392794005985324e-06, "logits/chosen": -1.7123403549194336, "logits/rejected": -1.4536840915679932, "logps/chosen": -984.7986450195312, "logps/rejected": -1116.86328125, "loss": 0.6296, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7080470323562622, "rewards/margins": 0.18753795325756073, "rewards/rejected": -0.8955849409103394, "step": 290 }, { "epoch": 0.24, "learning_rate": 4.707368982147318e-06, "logits/chosen": -1.5288642644882202, "logits/rejected": -1.4273738861083984, "logps/chosen": -1024.952880859375, "logps/rejected": -1212.50439453125, "loss": 0.6232, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.734005331993103, "rewards/margins": 0.26781877875328064, "rewards/rejected": -1.0018240213394165, "step": 300 }, { "epoch": 0.25, "learning_rate": 4.673737323763048e-06, "logits/chosen": -1.244638204574585, "logits/rejected": -1.2806625366210938, "logps/chosen": -906.6008911132812, "logps/rejected": -1112.8660888671875, "loss": 0.6264, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.713959813117981, "rewards/margins": 0.16486592590808868, "rewards/rejected": -0.8788257837295532, "step": 310 }, { "epoch": 0.26, "learning_rate": 4.638410650401267e-06, "logits/chosen": -1.2582708597183228, "logits/rejected": -0.9609068632125854, "logps/chosen": -789.2811889648438, "logps/rejected": -924.0436401367188, "loss": 0.6751, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.53520667552948, "rewards/margins": 0.21356996893882751, "rewards/rejected": -0.7487767338752747, "step": 320 }, { "epoch": 0.26, "learning_rate": 4.601416508739211e-06, "logits/chosen": -1.5856783390045166, "logits/rejected": -1.6001255512237549, "logps/chosen": -586.2677001953125, "logps/rejected": -678.0260009765625, "loss": 0.6333, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.31349319219589233, "rewards/margins": 0.13062533736228943, "rewards/rejected": -0.44411858916282654, "step": 330 }, { "epoch": 0.27, "learning_rate": 4.562783745695738e-06, "logits/chosen": -1.710599660873413, "logits/rejected": -1.4391021728515625, "logps/chosen": -415.0328674316406, "logps/rejected": -529.9607543945312, "loss": 0.6341, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2211925983428955, "rewards/margins": 0.15037958323955536, "rewards/rejected": -0.3715721666812897, "step": 340 }, { "epoch": 0.28, "learning_rate": 4.522542485937369e-06, "logits/chosen": -1.8084720373153687, "logits/rejected": -1.506037712097168, "logps/chosen": -570.231201171875, "logps/rejected": -703.8088989257812, "loss": 0.6278, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.31438136100769043, "rewards/margins": 0.16334742307662964, "rewards/rejected": -0.4777289032936096, "step": 350 }, { "epoch": 0.29, "learning_rate": 4.4807241083879774e-06, "logits/chosen": -1.4908154010772705, "logits/rejected": -1.2486298084259033, "logps/chosen": -1008.43310546875, "logps/rejected": -1258.6966552734375, "loss": 0.6064, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7509051561355591, "rewards/margins": 0.28672417998313904, "rewards/rejected": -1.0376293659210205, "step": 360 }, { "epoch": 0.3, "learning_rate": 4.437361221760449e-06, "logits/chosen": -1.114203929901123, "logits/rejected": -1.0544617176055908, "logps/chosen": -935.9404296875, "logps/rejected": -1014.9347534179688, "loss": 0.6677, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7481463551521301, "rewards/margins": 0.08286388218402863, "rewards/rejected": -0.8310102224349976, "step": 370 }, { "epoch": 0.3, "learning_rate": 4.3924876391293915e-06, "logits/chosen": -0.82923823595047, "logits/rejected": -0.7714365124702454, "logps/chosen": -1037.5648193359375, "logps/rejected": -1317.840087890625, "loss": 0.6065, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8485895991325378, "rewards/margins": 0.2712119519710541, "rewards/rejected": -1.1198015213012695, "step": 380 }, { "epoch": 0.31, "learning_rate": 4.346138351564711e-06, "logits/chosen": -0.4438857138156891, "logits/rejected": -0.24459032714366913, "logps/chosen": -1160.9703369140625, "logps/rejected": -1448.4993896484375, "loss": 0.6151, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.9693711996078491, "rewards/margins": 0.3293563425540924, "rewards/rejected": -1.2987276315689087, "step": 390 }, { "epoch": 0.32, "learning_rate": 4.2983495008466285e-06, "logits/chosen": -1.0683261156082153, "logits/rejected": -1.0331823825836182, "logps/chosen": -835.9192504882812, "logps/rejected": -1117.007568359375, "loss": 0.672, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.6222294569015503, "rewards/margins": 0.26769739389419556, "rewards/rejected": -0.8899267911911011, "step": 400 }, { "epoch": 0.33, "learning_rate": 4.249158351283414e-06, "logits/chosen": -1.524524450302124, "logits/rejected": -1.3127562999725342, "logps/chosen": -700.7545166015625, "logps/rejected": -866.1266479492188, "loss": 0.6623, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4409456253051758, "rewards/margins": 0.15933458507061005, "rewards/rejected": -0.600280225276947, "step": 410 }, { "epoch": 0.34, "learning_rate": 4.198603260653792e-06, "logits/chosen": -1.8277499675750732, "logits/rejected": -1.6153600215911865, "logps/chosen": -648.5438842773438, "logps/rejected": -690.0060424804688, "loss": 0.6354, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3811865448951721, "rewards/margins": 0.06636995077133179, "rewards/rejected": -0.4475564956665039, "step": 420 }, { "epoch": 0.34, "learning_rate": 4.146723650296701e-06, "logits/chosen": -1.4686410427093506, "logits/rejected": -1.4586213827133179, "logps/chosen": -1219.008544921875, "logps/rejected": -1602.6322021484375, "loss": 0.6145, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9192007184028625, "rewards/margins": 0.37095293402671814, "rewards/rejected": -1.2901536226272583, "step": 430 }, { "epoch": 0.35, "learning_rate": 4.093559974371725e-06, "logits/chosen": -1.156723976135254, "logits/rejected": -0.8702109456062317, "logps/chosen": -1515.7261962890625, "logps/rejected": -1630.477783203125, "loss": 0.5933, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.331788420677185, "rewards/margins": 0.15234079957008362, "rewards/rejected": -1.4841291904449463, "step": 440 }, { "epoch": 0.36, "learning_rate": 4.039153688314146e-06, "logits/chosen": -1.4883257150650024, "logits/rejected": -1.2767736911773682, "logps/chosen": -1196.55517578125, "logps/rejected": -1548.430419921875, "loss": 0.6235, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9334648847579956, "rewards/margins": 0.3865690231323242, "rewards/rejected": -1.3200337886810303, "step": 450 }, { "epoch": 0.37, "learning_rate": 3.983547216509254e-06, "logits/chosen": -1.688001275062561, "logits/rejected": -1.5989247560501099, "logps/chosen": -713.7869873046875, "logps/rejected": -806.5700073242188, "loss": 0.6501, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.4852449297904968, "rewards/margins": 0.12311458587646484, "rewards/rejected": -0.6083595752716064, "step": 460 }, { "epoch": 0.38, "learning_rate": 3.92678391921108e-06, "logits/chosen": -1.6326487064361572, "logits/rejected": -1.5402270555496216, "logps/chosen": -553.8263549804688, "logps/rejected": -636.650390625, "loss": 0.6405, "rewards/accuracies": 0.5, "rewards/chosen": -0.3886161148548126, "rewards/margins": 0.10035456717014313, "rewards/rejected": -0.48897066712379456, "step": 470 }, { "epoch": 0.38, "learning_rate": 3.868908058731376e-06, "logits/chosen": -1.2314743995666504, "logits/rejected": -1.1016968488693237, "logps/chosen": -1220.119140625, "logps/rejected": -1464.5045166015625, "loss": 0.627, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.0543556213378906, "rewards/margins": 0.22365888953208923, "rewards/rejected": -1.2780145406723022, "step": 480 }, { "epoch": 0.39, "learning_rate": 3.8099647649251984e-06, "logits/chosen": -1.1137750148773193, "logits/rejected": -0.8115717768669128, "logps/chosen": -1465.813720703125, "logps/rejected": -1871.6728515625, "loss": 0.5908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2194162607192993, "rewards/margins": 0.45649608969688416, "rewards/rejected": -1.6759124994277954, "step": 490 }, { "epoch": 0.4, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -1.160712480545044, "logits/rejected": -1.0044102668762207, "logps/chosen": -1403.710693359375, "logps/rejected": -1854.1767578125, "loss": 0.6275, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2094306945800781, "rewards/margins": 0.44932323694229126, "rewards/rejected": -1.658753752708435, "step": 500 }, { "epoch": 0.41, "learning_rate": 3.689060522675689e-06, "logits/chosen": -1.4455738067626953, "logits/rejected": -1.3064179420471191, "logps/chosen": -1048.9417724609375, "logps/rejected": -1300.092529296875, "loss": 0.5972, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8072590827941895, "rewards/margins": 0.2859098017215729, "rewards/rejected": -1.0931689739227295, "step": 510 }, { "epoch": 0.42, "learning_rate": 3.627193851723577e-06, "logits/chosen": -1.168351650238037, "logits/rejected": -1.0151941776275635, "logps/chosen": -1202.1624755859375, "logps/rejected": -1375.6861572265625, "loss": 0.6332, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.973349928855896, "rewards/margins": 0.1988798826932907, "rewards/rejected": -1.1722297668457031, "step": 520 }, { "epoch": 0.42, "learning_rate": 3.564448228912682e-06, "logits/chosen": -1.2055370807647705, "logits/rejected": -1.1897690296173096, "logps/chosen": -1302.2347412109375, "logps/rejected": -1396.9376220703125, "loss": 0.6383, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.0556827783584595, "rewards/margins": 0.08378318697214127, "rewards/rejected": -1.1394660472869873, "step": 530 }, { "epoch": 0.43, "learning_rate": 3.5008725813922383e-06, "logits/chosen": -1.2467204332351685, "logits/rejected": -0.9966660737991333, "logps/chosen": -1237.6484375, "logps/rejected": -1357.805908203125, "loss": 0.6061, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.9747149348258972, "rewards/margins": 0.1926373690366745, "rewards/rejected": -1.167352318763733, "step": 540 }, { "epoch": 0.44, "learning_rate": 3.436516483539781e-06, "logits/chosen": -1.0729809999465942, "logits/rejected": -0.880784809589386, "logps/chosen": -1176.840576171875, "logps/rejected": -1703.155029296875, "loss": 0.5811, "rewards/accuracies": 0.625, "rewards/chosen": -0.9750388860702515, "rewards/margins": 0.547395646572113, "rewards/rejected": -1.5224344730377197, "step": 550 }, { "epoch": 0.45, "learning_rate": 3.3714301183045382e-06, "logits/chosen": -1.343149185180664, "logits/rejected": -1.3310397863388062, "logps/chosen": -1393.66748046875, "logps/rejected": -1949.219482421875, "loss": 0.593, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0957911014556885, "rewards/margins": 0.5795475244522095, "rewards/rejected": -1.6753385066986084, "step": 560 }, { "epoch": 0.46, "learning_rate": 3.3056642380762783e-06, "logits/chosen": -1.2191109657287598, "logits/rejected": -1.1489986181259155, "logps/chosen": -1165.780517578125, "logps/rejected": -1509.78173828125, "loss": 0.5879, "rewards/accuracies": 0.625, "rewards/chosen": -0.9111903309822083, "rewards/margins": 0.37165799736976624, "rewards/rejected": -1.2828481197357178, "step": 570 }, { "epoch": 0.46, "learning_rate": 3.2392701251101172e-06, "logits/chosen": -1.2665627002716064, "logits/rejected": -1.0979241132736206, "logps/chosen": -1186.7427978515625, "logps/rejected": -1492.552001953125, "loss": 0.601, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.9691814184188843, "rewards/margins": 0.35359352827072144, "rewards/rejected": -1.3227750062942505, "step": 580 }, { "epoch": 0.47, "learning_rate": 3.1722995515381644e-06, "logits/chosen": -1.089277982711792, "logits/rejected": -0.9281560778617859, "logps/chosen": -1328.541015625, "logps/rejected": -1635.415771484375, "loss": 0.5736, "rewards/accuracies": 0.5, "rewards/chosen": -1.0775573253631592, "rewards/margins": 0.35047078132629395, "rewards/rejected": -1.4280281066894531, "step": 590 }, { "epoch": 0.48, "learning_rate": 3.1048047389991693e-06, "logits/chosen": -1.0954294204711914, "logits/rejected": -0.9223726391792297, "logps/chosen": -1286.0498046875, "logps/rejected": -1733.7152099609375, "loss": 0.6187, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.0697466135025024, "rewards/margins": 0.43960708379745483, "rewards/rejected": -1.5093533992767334, "step": 600 }, { "epoch": 0.49, "learning_rate": 3.0368383179176584e-06, "logits/chosen": -1.2414100170135498, "logits/rejected": -1.0788267850875854, "logps/chosen": -808.4361572265625, "logps/rejected": -1251.864990234375, "loss": 0.5898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.566831648349762, "rewards/margins": 0.4803600311279297, "rewards/rejected": -1.0471916198730469, "step": 610 }, { "epoch": 0.5, "learning_rate": 2.9684532864643123e-06, "logits/chosen": -1.262020230293274, "logits/rejected": -1.148590087890625, "logps/chosen": -931.0750122070312, "logps/rejected": -1080.2626953125, "loss": 0.6143, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6786974668502808, "rewards/margins": 0.17992933094501495, "rewards/rejected": -0.8586267232894897, "step": 620 }, { "epoch": 0.5, "learning_rate": 2.8997029692295875e-06, "logits/chosen": -1.380244255065918, "logits/rejected": -1.1846643686294556, "logps/chosen": -1102.616943359375, "logps/rejected": -1461.800048828125, "loss": 0.6248, "rewards/accuracies": 0.625, "rewards/chosen": -0.832280158996582, "rewards/margins": 0.3882806599140167, "rewards/rejected": -1.2205607891082764, "step": 630 }, { "epoch": 0.51, "learning_rate": 2.8306409756428067e-06, "logits/chosen": -1.1531554460525513, "logits/rejected": -0.73602694272995, "logps/chosen": -1035.087158203125, "logps/rejected": -1619.76171875, "loss": 0.568, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7359131574630737, "rewards/margins": 0.6530500650405884, "rewards/rejected": -1.388963222503662, "step": 640 }, { "epoch": 0.52, "learning_rate": 2.761321158169134e-06, "logits/chosen": -1.1167099475860596, "logits/rejected": -0.873482346534729, "logps/chosen": -1361.4727783203125, "logps/rejected": -1630.313232421875, "loss": 0.5607, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0866434574127197, "rewards/margins": 0.33569416403770447, "rewards/rejected": -1.4223374128341675, "step": 650 }, { "epoch": 0.53, "learning_rate": 2.6917975703170466e-06, "logits/chosen": -0.7387585639953613, "logits/rejected": -0.3248792886734009, "logps/chosen": -1398.619384765625, "logps/rejected": -2202.85498046875, "loss": 0.5724, "rewards/accuracies": 0.75, "rewards/chosen": -1.1486316919326782, "rewards/margins": 0.7962583303451538, "rewards/rejected": -1.944890022277832, "step": 660 }, { "epoch": 0.54, "learning_rate": 2.6221244244890336e-06, "logits/chosen": -0.6219555139541626, "logits/rejected": -0.4327964782714844, "logps/chosen": -1998.990234375, "logps/rejected": -2370.507080078125, "loss": 0.576, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.755999207496643, "rewards/margins": 0.406169593334198, "rewards/rejected": -2.1621687412261963, "step": 670 }, { "epoch": 0.54, "learning_rate": 2.5523560497083927e-06, "logits/chosen": -0.9830737113952637, "logits/rejected": -0.8169624209403992, "logps/chosen": -1028.17578125, "logps/rejected": -1266.951904296875, "loss": 0.6217, "rewards/accuracies": 0.5, "rewards/chosen": -0.8314197659492493, "rewards/margins": 0.2545499801635742, "rewards/rejected": -1.0859696865081787, "step": 680 }, { "epoch": 0.55, "learning_rate": 2.482546849255096e-06, "logits/chosen": -1.0446968078613281, "logits/rejected": -0.7680097818374634, "logps/chosen": -963.1312255859375, "logps/rejected": -1276.6685791015625, "loss": 0.5595, "rewards/accuracies": 0.625, "rewards/chosen": -0.7486265301704407, "rewards/margins": 0.34474682807922363, "rewards/rejected": -1.0933732986450195, "step": 690 }, { "epoch": 0.56, "learning_rate": 2.4127512582437486e-06, "logits/chosen": -0.9387643933296204, "logits/rejected": -0.8350385427474976, "logps/chosen": -1224.355712890625, "logps/rejected": -1547.6513671875, "loss": 0.5682, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9411084055900574, "rewards/margins": 0.34462398290634155, "rewards/rejected": -1.285732388496399, "step": 700 }, { "epoch": 0.57, "learning_rate": 2.3430237011767166e-06, "logits/chosen": -0.7177497148513794, "logits/rejected": -0.5354570746421814, "logps/chosen": -1501.9898681640625, "logps/rejected": -2006.5120849609375, "loss": 0.5666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.279309630393982, "rewards/margins": 0.5385541319847107, "rewards/rejected": -1.8178638219833374, "step": 710 }, { "epoch": 0.58, "learning_rate": 2.2734185495055503e-06, "logits/chosen": -0.14772020280361176, "logits/rejected": 0.09259579330682755, "logps/chosen": -1891.1800537109375, "logps/rejected": -2354.27734375, "loss": 0.5845, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.666395902633667, "rewards/margins": 0.5180760025978088, "rewards/rejected": -2.184471845626831, "step": 720 }, { "epoch": 0.58, "learning_rate": 2.2039900792337477e-06, "logits/chosen": -0.7175928354263306, "logits/rejected": -0.47789135575294495, "logps/chosen": -1339.34765625, "logps/rejected": -1721.939453125, "loss": 0.6008, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.0828118324279785, "rewards/margins": 0.42446571588516235, "rewards/rejected": -1.5072776079177856, "step": 730 }, { "epoch": 0.59, "learning_rate": 2.134792428593971e-06, "logits/chosen": -1.1001973152160645, "logits/rejected": -0.7717048525810242, "logps/chosen": -975.1065673828125, "logps/rejected": -1379.069580078125, "loss": 0.5754, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7290834188461304, "rewards/margins": 0.4491938054561615, "rewards/rejected": -1.1782772541046143, "step": 740 }, { "epoch": 0.6, "learning_rate": 2.0658795558326745e-06, "logits/chosen": -0.9087737202644348, "logits/rejected": -0.9751186370849609, "logps/chosen": -962.6619873046875, "logps/rejected": -1202.0980224609375, "loss": 0.6048, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7626216411590576, "rewards/margins": 0.23821432888507843, "rewards/rejected": -1.0008360147476196, "step": 750 }, { "epoch": 0.61, "learning_rate": 1.997305197135089e-06, "logits/chosen": -0.6515249013900757, "logits/rejected": -0.5174766778945923, "logps/chosen": -1231.8673095703125, "logps/rejected": -1379.020263671875, "loss": 0.6427, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.020787000656128, "rewards/margins": 0.1627589762210846, "rewards/rejected": -1.1835458278656006, "step": 760 }, { "epoch": 0.62, "learning_rate": 1.9291228247233607e-06, "logits/chosen": -1.1023533344268799, "logits/rejected": -0.8313691020011902, "logps/chosen": -1354.640380859375, "logps/rejected": -1333.6419677734375, "loss": 0.6151, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0642707347869873, "rewards/margins": 0.0644897073507309, "rewards/rejected": -1.1287604570388794, "step": 770 }, { "epoch": 0.62, "learning_rate": 1.8613856051605242e-06, "logits/chosen": -0.8817610740661621, "logits/rejected": -0.6308177709579468, "logps/chosen": -1051.3311767578125, "logps/rejected": -1573.681396484375, "loss": 0.6084, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7915970087051392, "rewards/margins": 0.5443645715713501, "rewards/rejected": -1.3359615802764893, "step": 780 }, { "epoch": 0.63, "learning_rate": 1.7941463578928088e-06, "logits/chosen": -1.0322520732879639, "logits/rejected": -0.9708759188652039, "logps/chosen": -1105.9547119140625, "logps/rejected": -1240.3804931640625, "loss": 0.6281, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.8885539174079895, "rewards/margins": 0.14831490814685822, "rewards/rejected": -1.0368688106536865, "step": 790 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -0.8422597646713257, "logits/rejected": -0.9403928518295288, "logps/chosen": -1141.7333984375, "logps/rejected": -1371.9580078125, "loss": 0.5443, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8957103490829468, "rewards/margins": 0.22806143760681152, "rewards/rejected": -1.1237719058990479, "step": 800 }, { "epoch": 0.65, "learning_rate": 1.661371075624363e-06, "logits/chosen": -0.6579400300979614, "logits/rejected": -0.38535481691360474, "logps/chosen": -1287.460205078125, "logps/rejected": -1504.084228515625, "loss": 0.6025, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.095157265663147, "rewards/margins": 0.25700071454048157, "rewards/rejected": -1.3521578311920166, "step": 810 }, { "epoch": 0.66, "learning_rate": 1.5959385747947697e-06, "logits/chosen": -0.7758182287216187, "logits/rejected": -0.67876136302948, "logps/chosen": -1263.397216796875, "logps/rejected": -1797.307373046875, "loss": 0.5785, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.020475149154663, "rewards/margins": 0.5204795002937317, "rewards/rejected": -1.540954828262329, "step": 820 }, { "epoch": 0.66, "learning_rate": 1.5312110338697427e-06, "logits/chosen": -1.0213903188705444, "logits/rejected": -0.8918190002441406, "logps/chosen": -1065.27734375, "logps/rejected": -1240.601806640625, "loss": 0.6148, "rewards/accuracies": 0.5, "rewards/chosen": -0.8754297494888306, "rewards/margins": 0.1893172711133957, "rewards/rejected": -1.0647470951080322, "step": 830 }, { "epoch": 0.67, "learning_rate": 1.467238925438646e-06, "logits/chosen": -1.0888749361038208, "logits/rejected": -0.9006432294845581, "logps/chosen": -1159.646728515625, "logps/rejected": -1520.9205322265625, "loss": 0.5909, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.891791045665741, "rewards/margins": 0.37499967217445374, "rewards/rejected": -1.2667908668518066, "step": 840 }, { "epoch": 0.68, "learning_rate": 1.4040721330273063e-06, "logits/chosen": -0.7614107131958008, "logits/rejected": -0.4498376250267029, "logps/chosen": -1577.4564208984375, "logps/rejected": -2064.27587890625, "loss": 0.5638, "rewards/accuracies": 0.75, "rewards/chosen": -1.2812590599060059, "rewards/margins": 0.5300472974777222, "rewards/rejected": -1.811306357383728, "step": 850 }, { "epoch": 0.69, "learning_rate": 1.3417599122003464e-06, "logits/chosen": -1.0388845205307007, "logits/rejected": -0.6018816232681274, "logps/chosen": -1428.4647216796875, "logps/rejected": -2024.8896484375, "loss": 0.5664, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1658397912979126, "rewards/margins": 0.6608397364616394, "rewards/rejected": -1.8266795873641968, "step": 860 }, { "epoch": 0.7, "learning_rate": 1.280350852153168e-06, "logits/chosen": -0.6871947050094604, "logits/rejected": -0.6853007674217224, "logps/chosen": -1243.111572265625, "logps/rejected": -1829.707275390625, "loss": 0.5468, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0525375604629517, "rewards/margins": 0.5433839559555054, "rewards/rejected": -1.595921277999878, "step": 870 }, { "epoch": 0.7, "learning_rate": 1.2198928378235717e-06, "logits/chosen": -0.8195822834968567, "logits/rejected": -0.6704593896865845, "logps/chosen": -1643.8544921875, "logps/rejected": -2129.94189453125, "loss": 0.6271, "rewards/accuracies": 0.625, "rewards/chosen": -1.3954288959503174, "rewards/margins": 0.46880459785461426, "rewards/rejected": -1.8642336130142212, "step": 880 }, { "epoch": 0.71, "learning_rate": 1.160433012552508e-06, "logits/chosen": -1.0698474645614624, "logits/rejected": -0.9074158668518066, "logps/chosen": -1074.9571533203125, "logps/rejected": -1626.334228515625, "loss": 0.5905, "rewards/accuracies": 0.625, "rewards/chosen": -0.8372681736946106, "rewards/margins": 0.602739691734314, "rewards/rejected": -1.4400079250335693, "step": 890 }, { "epoch": 0.72, "learning_rate": 1.1020177413231334e-06, "logits/chosen": -1.0306494235992432, "logits/rejected": -0.7876461148262024, "logps/chosen": -1247.2880859375, "logps/rejected": -1743.0517578125, "loss": 0.5594, "rewards/accuracies": 0.625, "rewards/chosen": -1.0142000913619995, "rewards/margins": 0.5253391265869141, "rewards/rejected": -1.5395392179489136, "step": 900 }, { "epoch": 0.73, "learning_rate": 1.0446925746067768e-06, "logits/chosen": -0.7109737396240234, "logits/rejected": -0.4215407371520996, "logps/chosen": -1115.1761474609375, "logps/rejected": -1571.439697265625, "loss": 0.584, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8916100263595581, "rewards/margins": 0.515656590461731, "rewards/rejected": -1.407266616821289, "step": 910 }, { "epoch": 0.74, "learning_rate": 9.88502212844063e-07, "logits/chosen": -0.9104745984077454, "logits/rejected": -0.6836374402046204, "logps/chosen": -1387.066650390625, "logps/rejected": -2122.8583984375, "loss": 0.5959, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1135450601577759, "rewards/margins": 0.7385995388031006, "rewards/rejected": -1.8521445989608765, "step": 920 }, { "epoch": 0.74, "learning_rate": 9.334904715888496e-07, "logits/chosen": -0.7981353998184204, "logits/rejected": -0.47408953309059143, "logps/chosen": -1591.907958984375, "logps/rejected": -2268.22802734375, "loss": 0.5996, "rewards/accuracies": 0.625, "rewards/chosen": -1.3141790628433228, "rewards/margins": 0.6781451106071472, "rewards/rejected": -1.9923241138458252, "step": 930 }, { "epoch": 0.75, "learning_rate": 8.797002473421729e-07, "logits/chosen": -0.7773085832595825, "logits/rejected": -0.8014146089553833, "logps/chosen": -1200.51904296875, "logps/rejected": -1684.487060546875, "loss": 0.551, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.948921799659729, "rewards/margins": 0.48516377806663513, "rewards/rejected": -1.434085488319397, "step": 940 }, { "epoch": 0.76, "learning_rate": 8.271734841028553e-07, "logits/chosen": -0.7013689279556274, "logits/rejected": -0.48669877648353577, "logps/chosen": -1563.8013916015625, "logps/rejected": -1946.0748291015625, "loss": 0.5625, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3094342947006226, "rewards/margins": 0.41629427671432495, "rewards/rejected": -1.7257286310195923, "step": 950 }, { "epoch": 0.77, "learning_rate": 7.759511406608255e-07, "logits/chosen": -0.8420946002006531, "logits/rejected": -0.7498366832733154, "logps/chosen": -1459.593994140625, "logps/rejected": -1710.6732177734375, "loss": 0.5906, "rewards/accuracies": 0.625, "rewards/chosen": -1.2246272563934326, "rewards/margins": 0.26000121235847473, "rewards/rejected": -1.484628438949585, "step": 960 }, { "epoch": 0.78, "learning_rate": 7.260731586586983e-07, "logits/chosen": -0.6557348370552063, "logits/rejected": -0.6690261363983154, "logps/chosen": -1101.353759765625, "logps/rejected": -1434.849365234375, "loss": 0.5741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.974203884601593, "rewards/margins": 0.33285054564476013, "rewards/rejected": -1.3070546388626099, "step": 970 }, { "epoch": 0.78, "learning_rate": 6.775784314464717e-07, "logits/chosen": -1.0794492959976196, "logits/rejected": -0.7161605954170227, "logps/chosen": -1141.7310791015625, "logps/rejected": -1709.1611328125, "loss": 0.5292, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9028726816177368, "rewards/margins": 0.6150614619255066, "rewards/rejected": -1.5179340839385986, "step": 980 }, { "epoch": 0.79, "learning_rate": 6.305047737536707e-07, "logits/chosen": -0.6755369305610657, "logits/rejected": -0.619501531124115, "logps/chosen": -1471.1715087890625, "logps/rejected": -2048.2431640625, "loss": 0.5757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2707023620605469, "rewards/margins": 0.5611073970794678, "rewards/rejected": -1.8318097591400146, "step": 990 }, { "epoch": 0.8, "learning_rate": 5.848888922025553e-07, "logits/chosen": -0.8728944659233093, "logits/rejected": -1.02140212059021, "logps/chosen": -1593.0389404296875, "logps/rejected": -1743.603759765625, "loss": 0.5888, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3282172679901123, "rewards/margins": 0.18238051235675812, "rewards/rejected": -1.510597825050354, "step": 1000 }, { "epoch": 0.81, "learning_rate": 5.407663566854008e-07, "logits/chosen": -1.2148396968841553, "logits/rejected": -0.9437984228134155, "logps/chosen": -1290.468994140625, "logps/rejected": -1955.718017578125, "loss": 0.5642, "rewards/accuracies": 0.625, "rewards/chosen": -1.0306814908981323, "rewards/margins": 0.7142370939254761, "rewards/rejected": -1.7449188232421875, "step": 1010 }, { "epoch": 0.82, "learning_rate": 4.981715726281666e-07, "logits/chosen": -0.9041656255722046, "logits/rejected": -0.7784754633903503, "logps/chosen": -1627.837890625, "logps/rejected": -2467.47314453125, "loss": 0.5837, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3451955318450928, "rewards/margins": 0.8315626978874207, "rewards/rejected": -2.176758289337158, "step": 1020 }, { "epoch": 0.82, "learning_rate": 4.5713775416217884e-07, "logits/chosen": -0.508884608745575, "logits/rejected": -0.4718368649482727, "logps/chosen": -1368.59765625, "logps/rejected": -2104.90625, "loss": 0.563, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1711418628692627, "rewards/margins": 0.7298630475997925, "rewards/rejected": -1.9010050296783447, "step": 1030 }, { "epoch": 0.83, "learning_rate": 4.1769689822475147e-07, "logits/chosen": -0.5781607031822205, "logits/rejected": -0.6443449258804321, "logps/chosen": -1434.3941650390625, "logps/rejected": -2030.039306640625, "loss": 0.5716, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2758595943450928, "rewards/margins": 0.5422929525375366, "rewards/rejected": -1.818152666091919, "step": 1040 }, { "epoch": 0.84, "learning_rate": 3.798797596089351e-07, "logits/chosen": -0.8322515487670898, "logits/rejected": -0.5321582555770874, "logps/chosen": -1379.1712646484375, "logps/rejected": -1931.9146728515625, "loss": 0.55, "rewards/accuracies": 0.75, "rewards/chosen": -1.1463197469711304, "rewards/margins": 0.5763110518455505, "rewards/rejected": -1.7226308584213257, "step": 1050 }, { "epoch": 0.85, "learning_rate": 3.4371582698185636e-07, "logits/chosen": -0.9420675039291382, "logits/rejected": -0.7674391269683838, "logps/chosen": -1645.792236328125, "logps/rejected": -2230.914306640625, "loss": 0.5025, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3653483390808105, "rewards/margins": 0.5703693628311157, "rewards/rejected": -1.9357175827026367, "step": 1060 }, { "epoch": 0.86, "learning_rate": 3.092332998903416e-07, "logits/chosen": -0.8753671646118164, "logits/rejected": -0.8147989511489868, "logps/chosen": -1408.5784912109375, "logps/rejected": -2042.339599609375, "loss": 0.576, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.158811092376709, "rewards/margins": 0.6254128217697144, "rewards/rejected": -1.7842239141464233, "step": 1070 }, { "epoch": 0.86, "learning_rate": 2.764590667717562e-07, "logits/chosen": -0.6400734186172485, "logits/rejected": -0.46232056617736816, "logps/chosen": -1486.711669921875, "logps/rejected": -1575.9361572265625, "loss": 0.5985, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.3110835552215576, "rewards/margins": 0.12779514491558075, "rewards/rejected": -1.4388787746429443, "step": 1080 }, { "epoch": 0.87, "learning_rate": 2.454186839872158e-07, "logits/chosen": -0.616904616355896, "logits/rejected": -0.3994078040122986, "logps/chosen": -1475.5816650390625, "logps/rejected": -2202.4833984375, "loss": 0.5632, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2749942541122437, "rewards/margins": 0.7362874746322632, "rewards/rejected": -2.011281728744507, "step": 1090 }, { "epoch": 0.88, "learning_rate": 2.1613635589349756e-07, "logits/chosen": -0.8570725321769714, "logits/rejected": -0.636644184589386, "logps/chosen": -1252.1573486328125, "logps/rejected": -1564.4869384765625, "loss": 0.6027, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.0256534814834595, "rewards/margins": 0.33503228425979614, "rewards/rejected": -1.3606857061386108, "step": 1100 }, { "epoch": 0.89, "learning_rate": 1.8863491596921745e-07, "logits/chosen": -0.7760205268859863, "logits/rejected": -0.4805734157562256, "logps/chosen": -1414.7236328125, "logps/rejected": -1822.532958984375, "loss": 0.5691, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1799442768096924, "rewards/margins": 0.46841782331466675, "rewards/rejected": -1.648362159729004, "step": 1110 }, { "epoch": 0.9, "learning_rate": 1.629358090099639e-07, "logits/chosen": -0.7541752457618713, "logits/rejected": -0.5927517414093018, "logps/chosen": -1383.7718505859375, "logps/rejected": -1702.1370849609375, "loss": 0.5901, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.173661231994629, "rewards/margins": 0.3563198149204254, "rewards/rejected": -1.529981017112732, "step": 1120 }, { "epoch": 0.9, "learning_rate": 1.3905907440629752e-07, "logits/chosen": -0.8271347284317017, "logits/rejected": -0.5372225046157837, "logps/chosen": -1264.5931396484375, "logps/rejected": -1810.603759765625, "loss": 0.5606, "rewards/accuracies": 0.625, "rewards/chosen": -1.0567820072174072, "rewards/margins": 0.5766392946243286, "rewards/rejected": -1.633421540260315, "step": 1130 }, { "epoch": 0.91, "learning_rate": 1.1702333051763271e-07, "logits/chosen": -0.7219616770744324, "logits/rejected": -0.44911837577819824, "logps/chosen": -1458.1873779296875, "logps/rejected": -2138.194091796875, "loss": 0.5666, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1980092525482178, "rewards/margins": 0.7267267107963562, "rewards/rejected": -1.9247362613677979, "step": 1140 }, { "epoch": 0.92, "learning_rate": 9.684576015420277e-08, "logits/chosen": -0.6613628268241882, "logits/rejected": -0.41062062978744507, "logps/chosen": -1361.629150390625, "logps/rejected": -1998.5576171875, "loss": 0.5569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1172220706939697, "rewards/margins": 0.6446647644042969, "rewards/rejected": -1.7618869543075562, "step": 1150 }, { "epoch": 0.93, "learning_rate": 7.854209717842231e-08, "logits/chosen": -0.6104007959365845, "logits/rejected": -0.5428999662399292, "logps/chosen": -1572.9007568359375, "logps/rejected": -1579.3857421875, "loss": 0.5912, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3415987491607666, "rewards/margins": 0.03751998022198677, "rewards/rejected": -1.37911856174469, "step": 1160 }, { "epoch": 0.94, "learning_rate": 6.212661423609184e-08, "logits/chosen": -0.4355439245700836, "logits/rejected": -0.3004533052444458, "logps/chosen": -1881.150390625, "logps/rejected": -2220.178955078125, "loss": 0.6615, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6670087575912476, "rewards/margins": 0.3249967694282532, "rewards/rejected": -1.9920055866241455, "step": 1170 }, { "epoch": 0.94, "learning_rate": 4.761211162702117e-08, "logits/chosen": -0.7097476124763489, "logits/rejected": -0.6131819486618042, "logps/chosen": -1349.8577880859375, "logps/rejected": -1754.811279296875, "loss": 0.5665, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1261099576950073, "rewards/margins": 0.42261672019958496, "rewards/rejected": -1.5487267971038818, "step": 1180 }, { "epoch": 0.95, "learning_rate": 3.5009907323737826e-08, "logits/chosen": -0.6964675188064575, "logits/rejected": -0.35203319787979126, "logps/chosen": -1499.8782958984375, "logps/rejected": -2223.939697265625, "loss": 0.5449, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.267060399055481, "rewards/margins": 0.7385483980178833, "rewards/rejected": -2.0056090354919434, "step": 1190 }, { "epoch": 0.96, "learning_rate": 2.4329828146074096e-08, "logits/chosen": -0.9956095814704895, "logits/rejected": -0.6606365442276001, "logps/chosen": -1594.940673828125, "logps/rejected": -1980.383056640625, "loss": 0.5542, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3295806646347046, "rewards/margins": 0.44766488671302795, "rewards/rejected": -1.777245283126831, "step": 1200 }, { "epoch": 0.97, "learning_rate": 1.5580202098509078e-08, "logits/chosen": -0.9308391809463501, "logits/rejected": -0.8904244303703308, "logps/chosen": -1657.0823974609375, "logps/rejected": -1868.926025390625, "loss": 0.6277, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3185545206069946, "rewards/margins": 0.27054083347320557, "rewards/rejected": -1.5890953540802002, "step": 1210 }, { "epoch": 0.98, "learning_rate": 8.767851876239075e-09, "logits/chosen": -0.625472903251648, "logits/rejected": -0.4992523789405823, "logps/chosen": -1419.646240234375, "logps/rejected": -2127.7783203125, "loss": 0.5692, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.194054365158081, "rewards/margins": 0.7004168629646301, "rewards/rejected": -1.8944714069366455, "step": 1220 }, { "epoch": 0.98, "learning_rate": 3.8980895450474455e-09, "logits/chosen": -0.8765754699707031, "logits/rejected": -0.6600515842437744, "logps/chosen": -1335.862060546875, "logps/rejected": -1862.95703125, "loss": 0.6035, "rewards/accuracies": 0.625, "rewards/chosen": -1.117206335067749, "rewards/margins": 0.5470607280731201, "rewards/rejected": -1.6642669439315796, "step": 1230 }, { "epoch": 0.99, "learning_rate": 9.747123991141193e-10, "logits/chosen": -0.9478354454040527, "logits/rejected": -0.7755793929100037, "logps/chosen": -1195.031005859375, "logps/rejected": -1684.8265380859375, "loss": 0.5304, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9703825116157532, "rewards/margins": 0.4978027939796448, "rewards/rejected": -1.4681851863861084, "step": 1240 }, { "epoch": 1.0, "learning_rate": 0.0, "logits/chosen": -0.8340710401535034, "logits/rejected": -0.6496783494949341, "logps/chosen": -1408.339111328125, "logps/rejected": -1972.5435791015625, "loss": 0.5482, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1645435094833374, "rewards/margins": 0.58301842212677, "rewards/rejected": -1.7475616931915283, "step": 1250 }, { "epoch": 1.0, "step": 1250, "total_flos": 0.0, "train_loss": 0.6029718887329102, "train_runtime": 12868.9286, "train_samples_per_second": 1.166, "train_steps_per_second": 0.097 } ], "logging_steps": 10, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }