zephyr-7b-dpo-0k-15k-0.001-i1 / trainer_state.json
BraylonDash's picture
Model save
84a1b63 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 4e-08,
"logits/chosen": -2.6577353477478027,
"logits/rejected": -2.043900489807129,
"logps/chosen": -505.98724365234375,
"logps/rejected": -319.40179443359375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 4.0000000000000003e-07,
"logits/chosen": -2.477527141571045,
"logits/rejected": -2.134815216064453,
"logps/chosen": -285.37506103515625,
"logps/rejected": -191.59552001953125,
"loss": 0.6932,
"rewards/accuracies": 0.2777777910232544,
"rewards/chosen": -1.798523953766562e-05,
"rewards/margins": -2.5926061425707303e-05,
"rewards/rejected": 7.940820069052279e-06,
"step": 10
},
{
"epoch": 0.02,
"learning_rate": 8.000000000000001e-07,
"logits/chosen": -2.360628843307495,
"logits/rejected": -2.1267056465148926,
"logps/chosen": -271.4191589355469,
"logps/rejected": -208.81991577148438,
"loss": 0.6931,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.00010290103818988428,
"rewards/margins": 2.105976818711497e-05,
"rewards/rejected": 8.184127364074811e-05,
"step": 20
},
{
"epoch": 0.02,
"learning_rate": 1.2000000000000002e-06,
"logits/chosen": -2.278747797012329,
"logits/rejected": -2.2493417263031006,
"logps/chosen": -269.8002624511719,
"logps/rejected": -288.9651794433594,
"loss": 0.6931,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.00028741464484483004,
"rewards/margins": 0.00019061131752096117,
"rewards/rejected": 9.68033418757841e-05,
"step": 30
},
{
"epoch": 0.03,
"learning_rate": 1.6000000000000001e-06,
"logits/chosen": -2.5087287425994873,
"logits/rejected": -2.340841293334961,
"logps/chosen": -210.5767059326172,
"logps/rejected": -181.60897827148438,
"loss": 0.6931,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": 0.0001595711219124496,
"rewards/margins": 0.00010818429291248322,
"rewards/rejected": 5.138682899996638e-05,
"step": 40
},
{
"epoch": 0.04,
"learning_rate": 2.0000000000000003e-06,
"logits/chosen": -2.362971067428589,
"logits/rejected": -2.338986873626709,
"logps/chosen": -195.54049682617188,
"logps/rejected": -211.3101806640625,
"loss": 0.6931,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0001843376230681315,
"rewards/margins": 0.00030581915052607656,
"rewards/rejected": -0.0001214815056300722,
"step": 50
},
{
"epoch": 0.05,
"learning_rate": 2.4000000000000003e-06,
"logits/chosen": -2.495109796524048,
"logits/rejected": -2.304320812225342,
"logps/chosen": -244.63357543945312,
"logps/rejected": -277.849853515625,
"loss": 0.693,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.000983591889962554,
"rewards/margins": 1.5824800357222557e-05,
"rewards/rejected": -0.0009994168067350984,
"step": 60
},
{
"epoch": 0.06,
"learning_rate": 2.8000000000000003e-06,
"logits/chosen": -2.2505688667297363,
"logits/rejected": -2.2676665782928467,
"logps/chosen": -229.9136199951172,
"logps/rejected": -216.98001098632812,
"loss": 0.6928,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0010834664572030306,
"rewards/margins": 0.0009649285930208862,
"rewards/rejected": -0.0020483951084315777,
"step": 70
},
{
"epoch": 0.06,
"learning_rate": 3.2000000000000003e-06,
"logits/chosen": -2.4005239009857178,
"logits/rejected": -2.3944191932678223,
"logps/chosen": -266.11859130859375,
"logps/rejected": -262.9701232910156,
"loss": 0.6925,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0014705440262332559,
"rewards/margins": 0.002042059786617756,
"rewards/rejected": -0.00351260369643569,
"step": 80
},
{
"epoch": 0.07,
"learning_rate": 3.6000000000000003e-06,
"logits/chosen": -2.3290226459503174,
"logits/rejected": -1.9915828704833984,
"logps/chosen": -268.29278564453125,
"logps/rejected": -180.13323974609375,
"loss": 0.6924,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.002903540385887027,
"rewards/margins": 0.002247781725600362,
"rewards/rejected": -0.005151322111487389,
"step": 90
},
{
"epoch": 0.08,
"learning_rate": 4.000000000000001e-06,
"logits/chosen": -2.3091585636138916,
"logits/rejected": -2.312863826751709,
"logps/chosen": -275.3695373535156,
"logps/rejected": -251.3533477783203,
"loss": 0.6928,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.003613454522565007,
"rewards/margins": 0.0010787051869556308,
"rewards/rejected": -0.004692160524427891,
"step": 100
},
{
"epoch": 0.09,
"learning_rate": 4.4e-06,
"logits/chosen": -2.258510112762451,
"logits/rejected": -2.1448404788970947,
"logps/chosen": -211.90158081054688,
"logps/rejected": -180.40475463867188,
"loss": 0.6919,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0001777430734364316,
"rewards/margins": 0.0037381600122898817,
"rewards/rejected": -0.003560416866093874,
"step": 110
},
{
"epoch": 0.1,
"learning_rate": 4.800000000000001e-06,
"logits/chosen": -2.158743381500244,
"logits/rejected": -2.169588088989258,
"logps/chosen": -238.22067260742188,
"logps/rejected": -270.46417236328125,
"loss": 0.6906,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0018882494186982512,
"rewards/margins": 0.005054115317761898,
"rewards/rejected": -0.006942364387214184,
"step": 120
},
{
"epoch": 0.1,
"learning_rate": 4.999756310023261e-06,
"logits/chosen": -2.4432594776153564,
"logits/rejected": -2.315918445587158,
"logps/chosen": -237.79501342773438,
"logps/rejected": -196.10853576660156,
"loss": 0.6911,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0036071953363716602,
"rewards/margins": 0.005986797157675028,
"rewards/rejected": -0.009593991562724113,
"step": 130
},
{
"epoch": 0.11,
"learning_rate": 4.997807075247147e-06,
"logits/chosen": -2.3266425132751465,
"logits/rejected": -2.025289535522461,
"logps/chosen": -207.1509552001953,
"logps/rejected": -187.79754638671875,
"loss": 0.6907,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.015933997929096222,
"rewards/margins": 0.004653518553823233,
"rewards/rejected": -0.02058752067387104,
"step": 140
},
{
"epoch": 0.12,
"learning_rate": 4.993910125649561e-06,
"logits/chosen": -2.303725481033325,
"logits/rejected": -2.190458297729492,
"logps/chosen": -239.7921600341797,
"logps/rejected": -213.1902618408203,
"loss": 0.6873,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.01170186698436737,
"rewards/margins": 0.013611750677227974,
"rewards/rejected": -0.025313619524240494,
"step": 150
},
{
"epoch": 0.13,
"learning_rate": 4.988068499954578e-06,
"logits/chosen": -2.1805403232574463,
"logits/rejected": -2.2514309883117676,
"logps/chosen": -323.44732666015625,
"logps/rejected": -342.3636779785156,
"loss": 0.6899,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.07079382240772247,
"rewards/margins": 0.012775696814060211,
"rewards/rejected": -0.08356951922178268,
"step": 160
},
{
"epoch": 0.14,
"learning_rate": 4.980286753286196e-06,
"logits/chosen": -2.377260446548462,
"logits/rejected": -1.992498755455017,
"logps/chosen": -361.740966796875,
"logps/rejected": -316.5578308105469,
"loss": 0.6815,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.09483315795660019,
"rewards/margins": -0.003298636060208082,
"rewards/rejected": -0.09153451770544052,
"step": 170
},
{
"epoch": 0.14,
"learning_rate": 4.970570953616383e-06,
"logits/chosen": -2.125253677368164,
"logits/rejected": -2.1080162525177,
"logps/chosen": -345.4566955566406,
"logps/rejected": -362.8345642089844,
"loss": 0.6796,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.11889471858739853,
"rewards/margins": 0.017375323921442032,
"rewards/rejected": -0.13627007603645325,
"step": 180
},
{
"epoch": 0.15,
"learning_rate": 4.958928677033465e-06,
"logits/chosen": -1.872900366783142,
"logits/rejected": -1.8023853302001953,
"logps/chosen": -607.0579223632812,
"logps/rejected": -631.1737060546875,
"loss": 0.6814,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.37075790762901306,
"rewards/margins": 0.051843322813510895,
"rewards/rejected": -0.42260122299194336,
"step": 190
},
{
"epoch": 0.16,
"learning_rate": 4.9453690018345144e-06,
"logits/chosen": -2.2707138061523438,
"logits/rejected": -2.1255440711975098,
"logps/chosen": -350.2916564941406,
"logps/rejected": -372.66778564453125,
"loss": 0.6812,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.09592956304550171,
"rewards/margins": 0.014601891860365868,
"rewards/rejected": -0.11053146421909332,
"step": 200
},
{
"epoch": 0.17,
"learning_rate": 4.9299025014463665e-06,
"logits/chosen": -2.4437155723571777,
"logits/rejected": -2.0841479301452637,
"logps/chosen": -348.2846984863281,
"logps/rejected": -332.79693603515625,
"loss": 0.6716,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.07477747648954391,
"rewards/margins": 0.06754375249147415,
"rewards/rejected": -0.14232121407985687,
"step": 210
},
{
"epoch": 0.18,
"learning_rate": 4.912541236180779e-06,
"logits/chosen": -2.2131965160369873,
"logits/rejected": -1.79428231716156,
"logps/chosen": -453.1966247558594,
"logps/rejected": -504.7650451660156,
"loss": 0.6753,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.2166297435760498,
"rewards/margins": 0.09042102098464966,
"rewards/rejected": -0.30705076456069946,
"step": 220
},
{
"epoch": 0.18,
"learning_rate": 4.893298743830168e-06,
"logits/chosen": -2.114473342895508,
"logits/rejected": -2.1522789001464844,
"logps/chosen": -447.54510498046875,
"logps/rejected": -531.5616455078125,
"loss": 0.6747,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.24870488047599792,
"rewards/margins": 0.05732503533363342,
"rewards/rejected": -0.30602994561195374,
"step": 230
},
{
"epoch": 0.19,
"learning_rate": 4.8721900291112415e-06,
"logits/chosen": -2.0410103797912598,
"logits/rejected": -1.8702392578125,
"logps/chosen": -417.52557373046875,
"logps/rejected": -418.7784118652344,
"loss": 0.6794,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.21090665459632874,
"rewards/margins": 0.014016765169799328,
"rewards/rejected": -0.22492341697216034,
"step": 240
},
{
"epoch": 0.2,
"learning_rate": 4.849231551964771e-06,
"logits/chosen": -1.9370673894882202,
"logits/rejected": -1.845969796180725,
"logps/chosen": -467.31817626953125,
"logps/rejected": -516.6582641601562,
"loss": 0.6586,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.2973518967628479,
"rewards/margins": 0.04663931205868721,
"rewards/rejected": -0.34399116039276123,
"step": 250
},
{
"epoch": 0.21,
"learning_rate": 4.824441214720629e-06,
"logits/chosen": -1.7340936660766602,
"logits/rejected": -1.4695367813110352,
"logps/chosen": -1242.895751953125,
"logps/rejected": -1449.67138671875,
"loss": 0.6468,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9694870114326477,
"rewards/margins": 0.2230859100818634,
"rewards/rejected": -1.192572832107544,
"step": 260
},
{
"epoch": 0.22,
"learning_rate": 4.7978383481380865e-06,
"logits/chosen": -1.9577367305755615,
"logits/rejected": -1.7384374141693115,
"logps/chosen": -651.7098999023438,
"logps/rejected": -757.5595703125,
"loss": 0.6567,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.40336376428604126,
"rewards/margins": 0.14299169182777405,
"rewards/rejected": -0.5463554263114929,
"step": 270
},
{
"epoch": 0.22,
"learning_rate": 4.769443696332272e-06,
"logits/chosen": -1.6682405471801758,
"logits/rejected": -1.7225072383880615,
"logps/chosen": -636.3197021484375,
"logps/rejected": -823.1611328125,
"loss": 0.654,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.4699520468711853,
"rewards/margins": 0.15158866345882416,
"rewards/rejected": -0.6215407252311707,
"step": 280
},
{
"epoch": 0.23,
"learning_rate": 4.7392794005985324e-06,
"logits/chosen": -1.7123403549194336,
"logits/rejected": -1.4536840915679932,
"logps/chosen": -984.7986450195312,
"logps/rejected": -1116.86328125,
"loss": 0.6296,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7080470323562622,
"rewards/margins": 0.18753795325756073,
"rewards/rejected": -0.8955849409103394,
"step": 290
},
{
"epoch": 0.24,
"learning_rate": 4.707368982147318e-06,
"logits/chosen": -1.5288642644882202,
"logits/rejected": -1.4273738861083984,
"logps/chosen": -1024.952880859375,
"logps/rejected": -1212.50439453125,
"loss": 0.6232,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.734005331993103,
"rewards/margins": 0.26781877875328064,
"rewards/rejected": -1.0018240213394165,
"step": 300
},
{
"epoch": 0.25,
"learning_rate": 4.673737323763048e-06,
"logits/chosen": -1.244638204574585,
"logits/rejected": -1.2806625366210938,
"logps/chosen": -906.6008911132812,
"logps/rejected": -1112.8660888671875,
"loss": 0.6264,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.713959813117981,
"rewards/margins": 0.16486592590808868,
"rewards/rejected": -0.8788257837295532,
"step": 310
},
{
"epoch": 0.26,
"learning_rate": 4.638410650401267e-06,
"logits/chosen": -1.2582708597183228,
"logits/rejected": -0.9609068632125854,
"logps/chosen": -789.2811889648438,
"logps/rejected": -924.0436401367188,
"loss": 0.6751,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.53520667552948,
"rewards/margins": 0.21356996893882751,
"rewards/rejected": -0.7487767338752747,
"step": 320
},
{
"epoch": 0.26,
"learning_rate": 4.601416508739211e-06,
"logits/chosen": -1.5856783390045166,
"logits/rejected": -1.6001255512237549,
"logps/chosen": -586.2677001953125,
"logps/rejected": -678.0260009765625,
"loss": 0.6333,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.31349319219589233,
"rewards/margins": 0.13062533736228943,
"rewards/rejected": -0.44411858916282654,
"step": 330
},
{
"epoch": 0.27,
"learning_rate": 4.562783745695738e-06,
"logits/chosen": -1.710599660873413,
"logits/rejected": -1.4391021728515625,
"logps/chosen": -415.0328674316406,
"logps/rejected": -529.9607543945312,
"loss": 0.6341,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.2211925983428955,
"rewards/margins": 0.15037958323955536,
"rewards/rejected": -0.3715721666812897,
"step": 340
},
{
"epoch": 0.28,
"learning_rate": 4.522542485937369e-06,
"logits/chosen": -1.8084720373153687,
"logits/rejected": -1.506037712097168,
"logps/chosen": -570.231201171875,
"logps/rejected": -703.8088989257812,
"loss": 0.6278,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.31438136100769043,
"rewards/margins": 0.16334742307662964,
"rewards/rejected": -0.4777289032936096,
"step": 350
},
{
"epoch": 0.29,
"learning_rate": 4.4807241083879774e-06,
"logits/chosen": -1.4908154010772705,
"logits/rejected": -1.2486298084259033,
"logps/chosen": -1008.43310546875,
"logps/rejected": -1258.6966552734375,
"loss": 0.6064,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.7509051561355591,
"rewards/margins": 0.28672417998313904,
"rewards/rejected": -1.0376293659210205,
"step": 360
},
{
"epoch": 0.3,
"learning_rate": 4.437361221760449e-06,
"logits/chosen": -1.114203929901123,
"logits/rejected": -1.0544617176055908,
"logps/chosen": -935.9404296875,
"logps/rejected": -1014.9347534179688,
"loss": 0.6677,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.7481463551521301,
"rewards/margins": 0.08286388218402863,
"rewards/rejected": -0.8310102224349976,
"step": 370
},
{
"epoch": 0.3,
"learning_rate": 4.3924876391293915e-06,
"logits/chosen": -0.82923823595047,
"logits/rejected": -0.7714365124702454,
"logps/chosen": -1037.5648193359375,
"logps/rejected": -1317.840087890625,
"loss": 0.6065,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.8485895991325378,
"rewards/margins": 0.2712119519710541,
"rewards/rejected": -1.1198015213012695,
"step": 380
},
{
"epoch": 0.31,
"learning_rate": 4.346138351564711e-06,
"logits/chosen": -0.4438857138156891,
"logits/rejected": -0.24459032714366913,
"logps/chosen": -1160.9703369140625,
"logps/rejected": -1448.4993896484375,
"loss": 0.6151,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.9693711996078491,
"rewards/margins": 0.3293563425540924,
"rewards/rejected": -1.2987276315689087,
"step": 390
},
{
"epoch": 0.32,
"learning_rate": 4.2983495008466285e-06,
"logits/chosen": -1.0683261156082153,
"logits/rejected": -1.0331823825836182,
"logps/chosen": -835.9192504882812,
"logps/rejected": -1117.007568359375,
"loss": 0.672,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.6222294569015503,
"rewards/margins": 0.26769739389419556,
"rewards/rejected": -0.8899267911911011,
"step": 400
},
{
"epoch": 0.33,
"learning_rate": 4.249158351283414e-06,
"logits/chosen": -1.524524450302124,
"logits/rejected": -1.3127562999725342,
"logps/chosen": -700.7545166015625,
"logps/rejected": -866.1266479492188,
"loss": 0.6623,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4409456253051758,
"rewards/margins": 0.15933458507061005,
"rewards/rejected": -0.600280225276947,
"step": 410
},
{
"epoch": 0.34,
"learning_rate": 4.198603260653792e-06,
"logits/chosen": -1.8277499675750732,
"logits/rejected": -1.6153600215911865,
"logps/chosen": -648.5438842773438,
"logps/rejected": -690.0060424804688,
"loss": 0.6354,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.3811865448951721,
"rewards/margins": 0.06636995077133179,
"rewards/rejected": -0.4475564956665039,
"step": 420
},
{
"epoch": 0.34,
"learning_rate": 4.146723650296701e-06,
"logits/chosen": -1.4686410427093506,
"logits/rejected": -1.4586213827133179,
"logps/chosen": -1219.008544921875,
"logps/rejected": -1602.6322021484375,
"loss": 0.6145,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9192007184028625,
"rewards/margins": 0.37095293402671814,
"rewards/rejected": -1.2901536226272583,
"step": 430
},
{
"epoch": 0.35,
"learning_rate": 4.093559974371725e-06,
"logits/chosen": -1.156723976135254,
"logits/rejected": -0.8702109456062317,
"logps/chosen": -1515.7261962890625,
"logps/rejected": -1630.477783203125,
"loss": 0.5933,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.331788420677185,
"rewards/margins": 0.15234079957008362,
"rewards/rejected": -1.4841291904449463,
"step": 440
},
{
"epoch": 0.36,
"learning_rate": 4.039153688314146e-06,
"logits/chosen": -1.4883257150650024,
"logits/rejected": -1.2767736911773682,
"logps/chosen": -1196.55517578125,
"logps/rejected": -1548.430419921875,
"loss": 0.6235,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.9334648847579956,
"rewards/margins": 0.3865690231323242,
"rewards/rejected": -1.3200337886810303,
"step": 450
},
{
"epoch": 0.37,
"learning_rate": 3.983547216509254e-06,
"logits/chosen": -1.688001275062561,
"logits/rejected": -1.5989247560501099,
"logps/chosen": -713.7869873046875,
"logps/rejected": -806.5700073242188,
"loss": 0.6501,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.4852449297904968,
"rewards/margins": 0.12311458587646484,
"rewards/rejected": -0.6083595752716064,
"step": 460
},
{
"epoch": 0.38,
"learning_rate": 3.92678391921108e-06,
"logits/chosen": -1.6326487064361572,
"logits/rejected": -1.5402270555496216,
"logps/chosen": -553.8263549804688,
"logps/rejected": -636.650390625,
"loss": 0.6405,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3886161148548126,
"rewards/margins": 0.10035456717014313,
"rewards/rejected": -0.48897066712379456,
"step": 470
},
{
"epoch": 0.38,
"learning_rate": 3.868908058731376e-06,
"logits/chosen": -1.2314743995666504,
"logits/rejected": -1.1016968488693237,
"logps/chosen": -1220.119140625,
"logps/rejected": -1464.5045166015625,
"loss": 0.627,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.0543556213378906,
"rewards/margins": 0.22365888953208923,
"rewards/rejected": -1.2780145406723022,
"step": 480
},
{
"epoch": 0.39,
"learning_rate": 3.8099647649251984e-06,
"logits/chosen": -1.1137750148773193,
"logits/rejected": -0.8115717768669128,
"logps/chosen": -1465.813720703125,
"logps/rejected": -1871.6728515625,
"loss": 0.5908,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.2194162607192993,
"rewards/margins": 0.45649608969688416,
"rewards/rejected": -1.6759124994277954,
"step": 490
},
{
"epoch": 0.4,
"learning_rate": 3.7500000000000005e-06,
"logits/chosen": -1.160712480545044,
"logits/rejected": -1.0044102668762207,
"logps/chosen": -1403.710693359375,
"logps/rejected": -1854.1767578125,
"loss": 0.6275,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.2094306945800781,
"rewards/margins": 0.44932323694229126,
"rewards/rejected": -1.658753752708435,
"step": 500
},
{
"epoch": 0.41,
"learning_rate": 3.689060522675689e-06,
"logits/chosen": -1.4455738067626953,
"logits/rejected": -1.3064179420471191,
"logps/chosen": -1048.9417724609375,
"logps/rejected": -1300.092529296875,
"loss": 0.5972,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8072590827941895,
"rewards/margins": 0.2859098017215729,
"rewards/rejected": -1.0931689739227295,
"step": 510
},
{
"epoch": 0.42,
"learning_rate": 3.627193851723577e-06,
"logits/chosen": -1.168351650238037,
"logits/rejected": -1.0151941776275635,
"logps/chosen": -1202.1624755859375,
"logps/rejected": -1375.6861572265625,
"loss": 0.6332,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.973349928855896,
"rewards/margins": 0.1988798826932907,
"rewards/rejected": -1.1722297668457031,
"step": 520
},
{
"epoch": 0.42,
"learning_rate": 3.564448228912682e-06,
"logits/chosen": -1.2055370807647705,
"logits/rejected": -1.1897690296173096,
"logps/chosen": -1302.2347412109375,
"logps/rejected": -1396.9376220703125,
"loss": 0.6383,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -1.0556827783584595,
"rewards/margins": 0.08378318697214127,
"rewards/rejected": -1.1394660472869873,
"step": 530
},
{
"epoch": 0.43,
"learning_rate": 3.5008725813922383e-06,
"logits/chosen": -1.2467204332351685,
"logits/rejected": -0.9966660737991333,
"logps/chosen": -1237.6484375,
"logps/rejected": -1357.805908203125,
"loss": 0.6061,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.9747149348258972,
"rewards/margins": 0.1926373690366745,
"rewards/rejected": -1.167352318763733,
"step": 540
},
{
"epoch": 0.44,
"learning_rate": 3.436516483539781e-06,
"logits/chosen": -1.0729809999465942,
"logits/rejected": -0.880784809589386,
"logps/chosen": -1176.840576171875,
"logps/rejected": -1703.155029296875,
"loss": 0.5811,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9750388860702515,
"rewards/margins": 0.547395646572113,
"rewards/rejected": -1.5224344730377197,
"step": 550
},
{
"epoch": 0.45,
"learning_rate": 3.3714301183045382e-06,
"logits/chosen": -1.343149185180664,
"logits/rejected": -1.3310397863388062,
"logps/chosen": -1393.66748046875,
"logps/rejected": -1949.219482421875,
"loss": 0.593,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.0957911014556885,
"rewards/margins": 0.5795475244522095,
"rewards/rejected": -1.6753385066986084,
"step": 560
},
{
"epoch": 0.46,
"learning_rate": 3.3056642380762783e-06,
"logits/chosen": -1.2191109657287598,
"logits/rejected": -1.1489986181259155,
"logps/chosen": -1165.780517578125,
"logps/rejected": -1509.78173828125,
"loss": 0.5879,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9111903309822083,
"rewards/margins": 0.37165799736976624,
"rewards/rejected": -1.2828481197357178,
"step": 570
},
{
"epoch": 0.46,
"learning_rate": 3.2392701251101172e-06,
"logits/chosen": -1.2665627002716064,
"logits/rejected": -1.0979241132736206,
"logps/chosen": -1186.7427978515625,
"logps/rejected": -1492.552001953125,
"loss": 0.601,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.9691814184188843,
"rewards/margins": 0.35359352827072144,
"rewards/rejected": -1.3227750062942505,
"step": 580
},
{
"epoch": 0.47,
"learning_rate": 3.1722995515381644e-06,
"logits/chosen": -1.089277982711792,
"logits/rejected": -0.9281560778617859,
"logps/chosen": -1328.541015625,
"logps/rejected": -1635.415771484375,
"loss": 0.5736,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0775573253631592,
"rewards/margins": 0.35047078132629395,
"rewards/rejected": -1.4280281066894531,
"step": 590
},
{
"epoch": 0.48,
"learning_rate": 3.1048047389991693e-06,
"logits/chosen": -1.0954294204711914,
"logits/rejected": -0.9223726391792297,
"logps/chosen": -1286.0498046875,
"logps/rejected": -1733.7152099609375,
"loss": 0.6187,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.0697466135025024,
"rewards/margins": 0.43960708379745483,
"rewards/rejected": -1.5093533992767334,
"step": 600
},
{
"epoch": 0.49,
"learning_rate": 3.0368383179176584e-06,
"logits/chosen": -1.2414100170135498,
"logits/rejected": -1.0788267850875854,
"logps/chosen": -808.4361572265625,
"logps/rejected": -1251.864990234375,
"loss": 0.5898,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.566831648349762,
"rewards/margins": 0.4803600311279297,
"rewards/rejected": -1.0471916198730469,
"step": 610
},
{
"epoch": 0.5,
"learning_rate": 2.9684532864643123e-06,
"logits/chosen": -1.262020230293274,
"logits/rejected": -1.148590087890625,
"logps/chosen": -931.0750122070312,
"logps/rejected": -1080.2626953125,
"loss": 0.6143,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.6786974668502808,
"rewards/margins": 0.17992933094501495,
"rewards/rejected": -0.8586267232894897,
"step": 620
},
{
"epoch": 0.5,
"learning_rate": 2.8997029692295875e-06,
"logits/chosen": -1.380244255065918,
"logits/rejected": -1.1846643686294556,
"logps/chosen": -1102.616943359375,
"logps/rejected": -1461.800048828125,
"loss": 0.6248,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.832280158996582,
"rewards/margins": 0.3882806599140167,
"rewards/rejected": -1.2205607891082764,
"step": 630
},
{
"epoch": 0.51,
"learning_rate": 2.8306409756428067e-06,
"logits/chosen": -1.1531554460525513,
"logits/rejected": -0.73602694272995,
"logps/chosen": -1035.087158203125,
"logps/rejected": -1619.76171875,
"loss": 0.568,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7359131574630737,
"rewards/margins": 0.6530500650405884,
"rewards/rejected": -1.388963222503662,
"step": 640
},
{
"epoch": 0.52,
"learning_rate": 2.761321158169134e-06,
"logits/chosen": -1.1167099475860596,
"logits/rejected": -0.873482346534729,
"logps/chosen": -1361.4727783203125,
"logps/rejected": -1630.313232421875,
"loss": 0.5607,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.0866434574127197,
"rewards/margins": 0.33569416403770447,
"rewards/rejected": -1.4223374128341675,
"step": 650
},
{
"epoch": 0.53,
"learning_rate": 2.6917975703170466e-06,
"logits/chosen": -0.7387585639953613,
"logits/rejected": -0.3248792886734009,
"logps/chosen": -1398.619384765625,
"logps/rejected": -2202.85498046875,
"loss": 0.5724,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1486316919326782,
"rewards/margins": 0.7962583303451538,
"rewards/rejected": -1.944890022277832,
"step": 660
},
{
"epoch": 0.54,
"learning_rate": 2.6221244244890336e-06,
"logits/chosen": -0.6219555139541626,
"logits/rejected": -0.4327964782714844,
"logps/chosen": -1998.990234375,
"logps/rejected": -2370.507080078125,
"loss": 0.576,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.755999207496643,
"rewards/margins": 0.406169593334198,
"rewards/rejected": -2.1621687412261963,
"step": 670
},
{
"epoch": 0.54,
"learning_rate": 2.5523560497083927e-06,
"logits/chosen": -0.9830737113952637,
"logits/rejected": -0.8169624209403992,
"logps/chosen": -1028.17578125,
"logps/rejected": -1266.951904296875,
"loss": 0.6217,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8314197659492493,
"rewards/margins": 0.2545499801635742,
"rewards/rejected": -1.0859696865081787,
"step": 680
},
{
"epoch": 0.55,
"learning_rate": 2.482546849255096e-06,
"logits/chosen": -1.0446968078613281,
"logits/rejected": -0.7680097818374634,
"logps/chosen": -963.1312255859375,
"logps/rejected": -1276.6685791015625,
"loss": 0.5595,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7486265301704407,
"rewards/margins": 0.34474682807922363,
"rewards/rejected": -1.0933732986450195,
"step": 690
},
{
"epoch": 0.56,
"learning_rate": 2.4127512582437486e-06,
"logits/chosen": -0.9387643933296204,
"logits/rejected": -0.8350385427474976,
"logps/chosen": -1224.355712890625,
"logps/rejected": -1547.6513671875,
"loss": 0.5682,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9411084055900574,
"rewards/margins": 0.34462398290634155,
"rewards/rejected": -1.285732388496399,
"step": 700
},
{
"epoch": 0.57,
"learning_rate": 2.3430237011767166e-06,
"logits/chosen": -0.7177497148513794,
"logits/rejected": -0.5354570746421814,
"logps/chosen": -1501.9898681640625,
"logps/rejected": -2006.5120849609375,
"loss": 0.5666,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.279309630393982,
"rewards/margins": 0.5385541319847107,
"rewards/rejected": -1.8178638219833374,
"step": 710
},
{
"epoch": 0.58,
"learning_rate": 2.2734185495055503e-06,
"logits/chosen": -0.14772020280361176,
"logits/rejected": 0.09259579330682755,
"logps/chosen": -1891.1800537109375,
"logps/rejected": -2354.27734375,
"loss": 0.5845,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.666395902633667,
"rewards/margins": 0.5180760025978088,
"rewards/rejected": -2.184471845626831,
"step": 720
},
{
"epoch": 0.58,
"learning_rate": 2.2039900792337477e-06,
"logits/chosen": -0.7175928354263306,
"logits/rejected": -0.47789135575294495,
"logps/chosen": -1339.34765625,
"logps/rejected": -1721.939453125,
"loss": 0.6008,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.0828118324279785,
"rewards/margins": 0.42446571588516235,
"rewards/rejected": -1.5072776079177856,
"step": 730
},
{
"epoch": 0.59,
"learning_rate": 2.134792428593971e-06,
"logits/chosen": -1.1001973152160645,
"logits/rejected": -0.7717048525810242,
"logps/chosen": -975.1065673828125,
"logps/rejected": -1379.069580078125,
"loss": 0.5754,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.7290834188461304,
"rewards/margins": 0.4491938054561615,
"rewards/rejected": -1.1782772541046143,
"step": 740
},
{
"epoch": 0.6,
"learning_rate": 2.0658795558326745e-06,
"logits/chosen": -0.9087737202644348,
"logits/rejected": -0.9751186370849609,
"logps/chosen": -962.6619873046875,
"logps/rejected": -1202.0980224609375,
"loss": 0.6048,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.7626216411590576,
"rewards/margins": 0.23821432888507843,
"rewards/rejected": -1.0008360147476196,
"step": 750
},
{
"epoch": 0.61,
"learning_rate": 1.997305197135089e-06,
"logits/chosen": -0.6515249013900757,
"logits/rejected": -0.5174766778945923,
"logps/chosen": -1231.8673095703125,
"logps/rejected": -1379.020263671875,
"loss": 0.6427,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.020787000656128,
"rewards/margins": 0.1627589762210846,
"rewards/rejected": -1.1835458278656006,
"step": 760
},
{
"epoch": 0.62,
"learning_rate": 1.9291228247233607e-06,
"logits/chosen": -1.1023533344268799,
"logits/rejected": -0.8313691020011902,
"logps/chosen": -1354.640380859375,
"logps/rejected": -1333.6419677734375,
"loss": 0.6151,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.0642707347869873,
"rewards/margins": 0.0644897073507309,
"rewards/rejected": -1.1287604570388794,
"step": 770
},
{
"epoch": 0.62,
"learning_rate": 1.8613856051605242e-06,
"logits/chosen": -0.8817610740661621,
"logits/rejected": -0.6308177709579468,
"logps/chosen": -1051.3311767578125,
"logps/rejected": -1573.681396484375,
"loss": 0.6084,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7915970087051392,
"rewards/margins": 0.5443645715713501,
"rewards/rejected": -1.3359615802764893,
"step": 780
},
{
"epoch": 0.63,
"learning_rate": 1.7941463578928088e-06,
"logits/chosen": -1.0322520732879639,
"logits/rejected": -0.9708759188652039,
"logps/chosen": -1105.9547119140625,
"logps/rejected": -1240.3804931640625,
"loss": 0.6281,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.8885539174079895,
"rewards/margins": 0.14831490814685822,
"rewards/rejected": -1.0368688106536865,
"step": 790
},
{
"epoch": 0.64,
"learning_rate": 1.7274575140626318e-06,
"logits/chosen": -0.8422597646713257,
"logits/rejected": -0.9403928518295288,
"logps/chosen": -1141.7333984375,
"logps/rejected": -1371.9580078125,
"loss": 0.5443,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.8957103490829468,
"rewards/margins": 0.22806143760681152,
"rewards/rejected": -1.1237719058990479,
"step": 800
},
{
"epoch": 0.65,
"learning_rate": 1.661371075624363e-06,
"logits/chosen": -0.6579400300979614,
"logits/rejected": -0.38535481691360474,
"logps/chosen": -1287.460205078125,
"logps/rejected": -1504.084228515625,
"loss": 0.6025,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -1.095157265663147,
"rewards/margins": 0.25700071454048157,
"rewards/rejected": -1.3521578311920166,
"step": 810
},
{
"epoch": 0.66,
"learning_rate": 1.5959385747947697e-06,
"logits/chosen": -0.7758182287216187,
"logits/rejected": -0.67876136302948,
"logps/chosen": -1263.397216796875,
"logps/rejected": -1797.307373046875,
"loss": 0.5785,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.020475149154663,
"rewards/margins": 0.5204795002937317,
"rewards/rejected": -1.540954828262329,
"step": 820
},
{
"epoch": 0.66,
"learning_rate": 1.5312110338697427e-06,
"logits/chosen": -1.0213903188705444,
"logits/rejected": -0.8918190002441406,
"logps/chosen": -1065.27734375,
"logps/rejected": -1240.601806640625,
"loss": 0.6148,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8754297494888306,
"rewards/margins": 0.1893172711133957,
"rewards/rejected": -1.0647470951080322,
"step": 830
},
{
"epoch": 0.67,
"learning_rate": 1.467238925438646e-06,
"logits/chosen": -1.0888749361038208,
"logits/rejected": -0.9006432294845581,
"logps/chosen": -1159.646728515625,
"logps/rejected": -1520.9205322265625,
"loss": 0.5909,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.891791045665741,
"rewards/margins": 0.37499967217445374,
"rewards/rejected": -1.2667908668518066,
"step": 840
},
{
"epoch": 0.68,
"learning_rate": 1.4040721330273063e-06,
"logits/chosen": -0.7614107131958008,
"logits/rejected": -0.4498376250267029,
"logps/chosen": -1577.4564208984375,
"logps/rejected": -2064.27587890625,
"loss": 0.5638,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2812590599060059,
"rewards/margins": 0.5300472974777222,
"rewards/rejected": -1.811306357383728,
"step": 850
},
{
"epoch": 0.69,
"learning_rate": 1.3417599122003464e-06,
"logits/chosen": -1.0388845205307007,
"logits/rejected": -0.6018816232681274,
"logps/chosen": -1428.4647216796875,
"logps/rejected": -2024.8896484375,
"loss": 0.5664,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.1658397912979126,
"rewards/margins": 0.6608397364616394,
"rewards/rejected": -1.8266795873641968,
"step": 860
},
{
"epoch": 0.7,
"learning_rate": 1.280350852153168e-06,
"logits/chosen": -0.6871947050094604,
"logits/rejected": -0.6853007674217224,
"logps/chosen": -1243.111572265625,
"logps/rejected": -1829.707275390625,
"loss": 0.5468,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.0525375604629517,
"rewards/margins": 0.5433839559555054,
"rewards/rejected": -1.595921277999878,
"step": 870
},
{
"epoch": 0.7,
"learning_rate": 1.2198928378235717e-06,
"logits/chosen": -0.8195822834968567,
"logits/rejected": -0.6704593896865845,
"logps/chosen": -1643.8544921875,
"logps/rejected": -2129.94189453125,
"loss": 0.6271,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3954288959503174,
"rewards/margins": 0.46880459785461426,
"rewards/rejected": -1.8642336130142212,
"step": 880
},
{
"epoch": 0.71,
"learning_rate": 1.160433012552508e-06,
"logits/chosen": -1.0698474645614624,
"logits/rejected": -0.9074158668518066,
"logps/chosen": -1074.9571533203125,
"logps/rejected": -1626.334228515625,
"loss": 0.5905,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8372681736946106,
"rewards/margins": 0.602739691734314,
"rewards/rejected": -1.4400079250335693,
"step": 890
},
{
"epoch": 0.72,
"learning_rate": 1.1020177413231334e-06,
"logits/chosen": -1.0306494235992432,
"logits/rejected": -0.7876461148262024,
"logps/chosen": -1247.2880859375,
"logps/rejected": -1743.0517578125,
"loss": 0.5594,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0142000913619995,
"rewards/margins": 0.5253391265869141,
"rewards/rejected": -1.5395392179489136,
"step": 900
},
{
"epoch": 0.73,
"learning_rate": 1.0446925746067768e-06,
"logits/chosen": -0.7109737396240234,
"logits/rejected": -0.4215407371520996,
"logps/chosen": -1115.1761474609375,
"logps/rejected": -1571.439697265625,
"loss": 0.584,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.8916100263595581,
"rewards/margins": 0.515656590461731,
"rewards/rejected": -1.407266616821289,
"step": 910
},
{
"epoch": 0.74,
"learning_rate": 9.88502212844063e-07,
"logits/chosen": -0.9104745984077454,
"logits/rejected": -0.6836374402046204,
"logps/chosen": -1387.066650390625,
"logps/rejected": -2122.8583984375,
"loss": 0.5959,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1135450601577759,
"rewards/margins": 0.7385995388031006,
"rewards/rejected": -1.8521445989608765,
"step": 920
},
{
"epoch": 0.74,
"learning_rate": 9.334904715888496e-07,
"logits/chosen": -0.7981353998184204,
"logits/rejected": -0.47408953309059143,
"logps/chosen": -1591.907958984375,
"logps/rejected": -2268.22802734375,
"loss": 0.5996,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3141790628433228,
"rewards/margins": 0.6781451106071472,
"rewards/rejected": -1.9923241138458252,
"step": 930
},
{
"epoch": 0.75,
"learning_rate": 8.797002473421729e-07,
"logits/chosen": -0.7773085832595825,
"logits/rejected": -0.8014146089553833,
"logps/chosen": -1200.51904296875,
"logps/rejected": -1684.487060546875,
"loss": 0.551,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.948921799659729,
"rewards/margins": 0.48516377806663513,
"rewards/rejected": -1.434085488319397,
"step": 940
},
{
"epoch": 0.76,
"learning_rate": 8.271734841028553e-07,
"logits/chosen": -0.7013689279556274,
"logits/rejected": -0.48669877648353577,
"logps/chosen": -1563.8013916015625,
"logps/rejected": -1946.0748291015625,
"loss": 0.5625,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.3094342947006226,
"rewards/margins": 0.41629427671432495,
"rewards/rejected": -1.7257286310195923,
"step": 950
},
{
"epoch": 0.77,
"learning_rate": 7.759511406608255e-07,
"logits/chosen": -0.8420946002006531,
"logits/rejected": -0.7498366832733154,
"logps/chosen": -1459.593994140625,
"logps/rejected": -1710.6732177734375,
"loss": 0.5906,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2246272563934326,
"rewards/margins": 0.26000121235847473,
"rewards/rejected": -1.484628438949585,
"step": 960
},
{
"epoch": 0.78,
"learning_rate": 7.260731586586983e-07,
"logits/chosen": -0.6557348370552063,
"logits/rejected": -0.6690261363983154,
"logps/chosen": -1101.353759765625,
"logps/rejected": -1434.849365234375,
"loss": 0.5741,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.974203884601593,
"rewards/margins": 0.33285054564476013,
"rewards/rejected": -1.3070546388626099,
"step": 970
},
{
"epoch": 0.78,
"learning_rate": 6.775784314464717e-07,
"logits/chosen": -1.0794492959976196,
"logits/rejected": -0.7161605954170227,
"logps/chosen": -1141.7310791015625,
"logps/rejected": -1709.1611328125,
"loss": 0.5292,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.9028726816177368,
"rewards/margins": 0.6150614619255066,
"rewards/rejected": -1.5179340839385986,
"step": 980
},
{
"epoch": 0.79,
"learning_rate": 6.305047737536707e-07,
"logits/chosen": -0.6755369305610657,
"logits/rejected": -0.619501531124115,
"logps/chosen": -1471.1715087890625,
"logps/rejected": -2048.2431640625,
"loss": 0.5757,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2707023620605469,
"rewards/margins": 0.5611073970794678,
"rewards/rejected": -1.8318097591400146,
"step": 990
},
{
"epoch": 0.8,
"learning_rate": 5.848888922025553e-07,
"logits/chosen": -0.8728944659233093,
"logits/rejected": -1.02140212059021,
"logps/chosen": -1593.0389404296875,
"logps/rejected": -1743.603759765625,
"loss": 0.5888,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -1.3282172679901123,
"rewards/margins": 0.18238051235675812,
"rewards/rejected": -1.510597825050354,
"step": 1000
},
{
"epoch": 0.81,
"learning_rate": 5.407663566854008e-07,
"logits/chosen": -1.2148396968841553,
"logits/rejected": -0.9437984228134155,
"logps/chosen": -1290.468994140625,
"logps/rejected": -1955.718017578125,
"loss": 0.5642,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0306814908981323,
"rewards/margins": 0.7142370939254761,
"rewards/rejected": -1.7449188232421875,
"step": 1010
},
{
"epoch": 0.82,
"learning_rate": 4.981715726281666e-07,
"logits/chosen": -0.9041656255722046,
"logits/rejected": -0.7784754633903503,
"logps/chosen": -1627.837890625,
"logps/rejected": -2467.47314453125,
"loss": 0.5837,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3451955318450928,
"rewards/margins": 0.8315626978874207,
"rewards/rejected": -2.176758289337158,
"step": 1020
},
{
"epoch": 0.82,
"learning_rate": 4.5713775416217884e-07,
"logits/chosen": -0.508884608745575,
"logits/rejected": -0.4718368649482727,
"logps/chosen": -1368.59765625,
"logps/rejected": -2104.90625,
"loss": 0.563,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1711418628692627,
"rewards/margins": 0.7298630475997925,
"rewards/rejected": -1.9010050296783447,
"step": 1030
},
{
"epoch": 0.83,
"learning_rate": 4.1769689822475147e-07,
"logits/chosen": -0.5781607031822205,
"logits/rejected": -0.6443449258804321,
"logps/chosen": -1434.3941650390625,
"logps/rejected": -2030.039306640625,
"loss": 0.5716,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.2758595943450928,
"rewards/margins": 0.5422929525375366,
"rewards/rejected": -1.818152666091919,
"step": 1040
},
{
"epoch": 0.84,
"learning_rate": 3.798797596089351e-07,
"logits/chosen": -0.8322515487670898,
"logits/rejected": -0.5321582555770874,
"logps/chosen": -1379.1712646484375,
"logps/rejected": -1931.9146728515625,
"loss": 0.55,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1463197469711304,
"rewards/margins": 0.5763110518455505,
"rewards/rejected": -1.7226308584213257,
"step": 1050
},
{
"epoch": 0.85,
"learning_rate": 3.4371582698185636e-07,
"logits/chosen": -0.9420675039291382,
"logits/rejected": -0.7674391269683838,
"logps/chosen": -1645.792236328125,
"logps/rejected": -2230.914306640625,
"loss": 0.5025,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.3653483390808105,
"rewards/margins": 0.5703693628311157,
"rewards/rejected": -1.9357175827026367,
"step": 1060
},
{
"epoch": 0.86,
"learning_rate": 3.092332998903416e-07,
"logits/chosen": -0.8753671646118164,
"logits/rejected": -0.8147989511489868,
"logps/chosen": -1408.5784912109375,
"logps/rejected": -2042.339599609375,
"loss": 0.576,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.158811092376709,
"rewards/margins": 0.6254128217697144,
"rewards/rejected": -1.7842239141464233,
"step": 1070
},
{
"epoch": 0.86,
"learning_rate": 2.764590667717562e-07,
"logits/chosen": -0.6400734186172485,
"logits/rejected": -0.46232056617736816,
"logps/chosen": -1486.711669921875,
"logps/rejected": -1575.9361572265625,
"loss": 0.5985,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -1.3110835552215576,
"rewards/margins": 0.12779514491558075,
"rewards/rejected": -1.4388787746429443,
"step": 1080
},
{
"epoch": 0.87,
"learning_rate": 2.454186839872158e-07,
"logits/chosen": -0.616904616355896,
"logits/rejected": -0.3994078040122986,
"logps/chosen": -1475.5816650390625,
"logps/rejected": -2202.4833984375,
"loss": 0.5632,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2749942541122437,
"rewards/margins": 0.7362874746322632,
"rewards/rejected": -2.011281728744507,
"step": 1090
},
{
"epoch": 0.88,
"learning_rate": 2.1613635589349756e-07,
"logits/chosen": -0.8570725321769714,
"logits/rejected": -0.636644184589386,
"logps/chosen": -1252.1573486328125,
"logps/rejected": -1564.4869384765625,
"loss": 0.6027,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -1.0256534814834595,
"rewards/margins": 0.33503228425979614,
"rewards/rejected": -1.3606857061386108,
"step": 1100
},
{
"epoch": 0.89,
"learning_rate": 1.8863491596921745e-07,
"logits/chosen": -0.7760205268859863,
"logits/rejected": -0.4805734157562256,
"logps/chosen": -1414.7236328125,
"logps/rejected": -1822.532958984375,
"loss": 0.5691,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.1799442768096924,
"rewards/margins": 0.46841782331466675,
"rewards/rejected": -1.648362159729004,
"step": 1110
},
{
"epoch": 0.9,
"learning_rate": 1.629358090099639e-07,
"logits/chosen": -0.7541752457618713,
"logits/rejected": -0.5927517414093018,
"logps/chosen": -1383.7718505859375,
"logps/rejected": -1702.1370849609375,
"loss": 0.5901,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.173661231994629,
"rewards/margins": 0.3563198149204254,
"rewards/rejected": -1.529981017112732,
"step": 1120
},
{
"epoch": 0.9,
"learning_rate": 1.3905907440629752e-07,
"logits/chosen": -0.8271347284317017,
"logits/rejected": -0.5372225046157837,
"logps/chosen": -1264.5931396484375,
"logps/rejected": -1810.603759765625,
"loss": 0.5606,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0567820072174072,
"rewards/margins": 0.5766392946243286,
"rewards/rejected": -1.633421540260315,
"step": 1130
},
{
"epoch": 0.91,
"learning_rate": 1.1702333051763271e-07,
"logits/chosen": -0.7219616770744324,
"logits/rejected": -0.44911837577819824,
"logps/chosen": -1458.1873779296875,
"logps/rejected": -2138.194091796875,
"loss": 0.5666,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.1980092525482178,
"rewards/margins": 0.7267267107963562,
"rewards/rejected": -1.9247362613677979,
"step": 1140
},
{
"epoch": 0.92,
"learning_rate": 9.684576015420277e-08,
"logits/chosen": -0.6613628268241882,
"logits/rejected": -0.41062062978744507,
"logps/chosen": -1361.629150390625,
"logps/rejected": -1998.5576171875,
"loss": 0.5569,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.1172220706939697,
"rewards/margins": 0.6446647644042969,
"rewards/rejected": -1.7618869543075562,
"step": 1150
},
{
"epoch": 0.93,
"learning_rate": 7.854209717842231e-08,
"logits/chosen": -0.6104007959365845,
"logits/rejected": -0.5428999662399292,
"logps/chosen": -1572.9007568359375,
"logps/rejected": -1579.3857421875,
"loss": 0.5912,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -1.3415987491607666,
"rewards/margins": 0.03751998022198677,
"rewards/rejected": -1.37911856174469,
"step": 1160
},
{
"epoch": 0.94,
"learning_rate": 6.212661423609184e-08,
"logits/chosen": -0.4355439245700836,
"logits/rejected": -0.3004533052444458,
"logps/chosen": -1881.150390625,
"logps/rejected": -2220.178955078125,
"loss": 0.6615,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.6670087575912476,
"rewards/margins": 0.3249967694282532,
"rewards/rejected": -1.9920055866241455,
"step": 1170
},
{
"epoch": 0.94,
"learning_rate": 4.761211162702117e-08,
"logits/chosen": -0.7097476124763489,
"logits/rejected": -0.6131819486618042,
"logps/chosen": -1349.8577880859375,
"logps/rejected": -1754.811279296875,
"loss": 0.5665,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.1261099576950073,
"rewards/margins": 0.42261672019958496,
"rewards/rejected": -1.5487267971038818,
"step": 1180
},
{
"epoch": 0.95,
"learning_rate": 3.5009907323737826e-08,
"logits/chosen": -0.6964675188064575,
"logits/rejected": -0.35203319787979126,
"logps/chosen": -1499.8782958984375,
"logps/rejected": -2223.939697265625,
"loss": 0.5449,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.267060399055481,
"rewards/margins": 0.7385483980178833,
"rewards/rejected": -2.0056090354919434,
"step": 1190
},
{
"epoch": 0.96,
"learning_rate": 2.4329828146074096e-08,
"logits/chosen": -0.9956095814704895,
"logits/rejected": -0.6606365442276001,
"logps/chosen": -1594.940673828125,
"logps/rejected": -1980.383056640625,
"loss": 0.5542,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.3295806646347046,
"rewards/margins": 0.44766488671302795,
"rewards/rejected": -1.777245283126831,
"step": 1200
},
{
"epoch": 0.97,
"learning_rate": 1.5580202098509078e-08,
"logits/chosen": -0.9308391809463501,
"logits/rejected": -0.8904244303703308,
"logps/chosen": -1657.0823974609375,
"logps/rejected": -1868.926025390625,
"loss": 0.6277,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.3185545206069946,
"rewards/margins": 0.27054083347320557,
"rewards/rejected": -1.5890953540802002,
"step": 1210
},
{
"epoch": 0.98,
"learning_rate": 8.767851876239075e-09,
"logits/chosen": -0.625472903251648,
"logits/rejected": -0.4992523789405823,
"logps/chosen": -1419.646240234375,
"logps/rejected": -2127.7783203125,
"loss": 0.5692,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.194054365158081,
"rewards/margins": 0.7004168629646301,
"rewards/rejected": -1.8944714069366455,
"step": 1220
},
{
"epoch": 0.98,
"learning_rate": 3.8980895450474455e-09,
"logits/chosen": -0.8765754699707031,
"logits/rejected": -0.6600515842437744,
"logps/chosen": -1335.862060546875,
"logps/rejected": -1862.95703125,
"loss": 0.6035,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.117206335067749,
"rewards/margins": 0.5470607280731201,
"rewards/rejected": -1.6642669439315796,
"step": 1230
},
{
"epoch": 0.99,
"learning_rate": 9.747123991141193e-10,
"logits/chosen": -0.9478354454040527,
"logits/rejected": -0.7755793929100037,
"logps/chosen": -1195.031005859375,
"logps/rejected": -1684.8265380859375,
"loss": 0.5304,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9703825116157532,
"rewards/margins": 0.4978027939796448,
"rewards/rejected": -1.4681851863861084,
"step": 1240
},
{
"epoch": 1.0,
"learning_rate": 0.0,
"logits/chosen": -0.8340710401535034,
"logits/rejected": -0.6496783494949341,
"logps/chosen": -1408.339111328125,
"logps/rejected": -1972.5435791015625,
"loss": 0.5482,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.1645435094833374,
"rewards/margins": 0.58301842212677,
"rewards/rejected": -1.7475616931915283,
"step": 1250
},
{
"epoch": 1.0,
"step": 1250,
"total_flos": 0.0,
"train_loss": 0.6029718887329102,
"train_runtime": 12868.9286,
"train_samples_per_second": 1.166,
"train_steps_per_second": 0.097
}
],
"logging_steps": 10,
"max_steps": 1250,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}