{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9959925193694897, "eval_steps": 100, "global_step": 233, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004274646005877639, "grad_norm": 3.4727758395385346, "learning_rate": 4.166666666666666e-08, "logits/chosen": -0.9238853454589844, "logits/rejected": -0.9009266495704651, "logps/chosen": -211.83998107910156, "logps/rejected": -194.95265197753906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.008549292011755277, "grad_norm": 3.5000648062483686, "learning_rate": 8.333333333333333e-08, "logits/chosen": -0.9474210739135742, "logits/rejected": -0.9417086243629456, "logps/chosen": -160.0943603515625, "logps/rejected": -163.26644897460938, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.012823938017632914, "grad_norm": 3.8566721368935113, "learning_rate": 1.25e-07, "logits/chosen": -0.8552289009094238, "logits/rejected": -0.9027292132377625, "logps/chosen": -197.13523864746094, "logps/rejected": -191.77366638183594, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.00209163804538548, "rewards/margins": 0.0021166829392313957, "rewards/rejected": -2.5045330403372645e-05, "step": 3 }, { "epoch": 0.017098584023510555, "grad_norm": 3.527297888533762, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.9195055961608887, "logits/rejected": -0.9506024122238159, "logps/chosen": -175.96563720703125, "logps/rejected": -177.187255859375, "loss": 0.6931, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0005994887324050069, "rewards/margins": 0.001228818204253912, "rewards/rejected": -0.0006293297046795487, "step": 4 }, { "epoch": 0.02137323002938819, "grad_norm": 3.274108961837268, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -0.9131849408149719, "logits/rejected": -0.9851359128952026, "logps/chosen": -196.52279663085938, "logps/rejected": -209.4899444580078, "loss": 0.6936, "rewards/accuracies": 0.5, "rewards/chosen": -0.0014956831000745296, "rewards/margins": -0.002281556138768792, "rewards/rejected": 0.0007858729222789407, "step": 5 }, { "epoch": 0.02564787603526583, "grad_norm": 3.4643988401861643, "learning_rate": 2.5e-07, "logits/chosen": -1.0323811769485474, "logits/rejected": -1.0281962156295776, "logps/chosen": -175.13864135742188, "logps/rejected": -171.71237182617188, "loss": 0.6934, "rewards/accuracies": 0.34375, "rewards/chosen": -0.0016992997843772173, "rewards/margins": -0.0023347530514001846, "rewards/rejected": 0.000635453499853611, "step": 6 }, { "epoch": 0.029922522041143467, "grad_norm": 3.753822101296772, "learning_rate": 2.916666666666667e-07, "logits/chosen": -0.8140788078308105, "logits/rejected": -0.8268399238586426, "logps/chosen": -204.0390625, "logps/rejected": -210.50558471679688, "loss": 0.6926, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0002044451393885538, "rewards/margins": 0.0006307458970695734, "rewards/rejected": -0.0004263008013367653, "step": 7 }, { "epoch": 0.03419716804702111, "grad_norm": 3.1848827253835568, "learning_rate": 3.333333333333333e-07, "logits/chosen": -0.9922436475753784, "logits/rejected": -0.9979274868965149, "logps/chosen": -192.83494567871094, "logps/rejected": -200.88128662109375, "loss": 0.693, "rewards/accuracies": 0.59375, "rewards/chosen": 0.002161582000553608, "rewards/margins": 0.0022183258552104235, "rewards/rejected": -5.674359272234142e-05, "step": 8 }, { "epoch": 0.03847181405289874, "grad_norm": 3.7147220039656568, "learning_rate": 3.75e-07, "logits/chosen": -0.9252921342849731, "logits/rejected": -0.9685516357421875, "logps/chosen": -175.70448303222656, "logps/rejected": -180.89736938476562, "loss": 0.6927, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0024666767567396164, "rewards/margins": 0.002756566507741809, "rewards/rejected": -0.0002898902166634798, "step": 9 }, { "epoch": 0.04274646005877638, "grad_norm": 3.553251668230928, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.9595114588737488, "logits/rejected": -0.9833444356918335, "logps/chosen": -208.72735595703125, "logps/rejected": -214.8730926513672, "loss": 0.693, "rewards/accuracies": 0.4375, "rewards/chosen": 0.002427927916869521, "rewards/margins": 0.001882559503428638, "rewards/rejected": 0.0005453681806102395, "step": 10 }, { "epoch": 0.04702110606465402, "grad_norm": 3.4673888891096216, "learning_rate": 4.5833333333333327e-07, "logits/chosen": -0.986074686050415, "logits/rejected": -0.9903304576873779, "logps/chosen": -138.227783203125, "logps/rejected": -137.13824462890625, "loss": 0.6931, "rewards/accuracies": 0.40625, "rewards/chosen": 0.0006541174370795488, "rewards/margins": -0.0011124282609671354, "rewards/rejected": 0.0017665456980466843, "step": 11 }, { "epoch": 0.05129575207053166, "grad_norm": 3.412261478741586, "learning_rate": 5e-07, "logits/chosen": -0.896647036075592, "logits/rejected": -0.9640191197395325, "logps/chosen": -157.36685180664062, "logps/rejected": -180.9624481201172, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0007026732200756669, "rewards/margins": 0.0007272702641785145, "rewards/rejected": -2.459682582411915e-05, "step": 12 }, { "epoch": 0.055570398076409296, "grad_norm": 3.5015942981464434, "learning_rate": 5.416666666666666e-07, "logits/chosen": -0.8603953123092651, "logits/rejected": -0.8457555770874023, "logps/chosen": -190.04727172851562, "logps/rejected": -196.87872314453125, "loss": 0.6918, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0017774368170648813, "rewards/margins": 0.002889451337978244, "rewards/rejected": -0.0011120146373286843, "step": 13 }, { "epoch": 0.059845044082286934, "grad_norm": 3.3564122983724283, "learning_rate": 5.833333333333334e-07, "logits/chosen": -0.9946928024291992, "logits/rejected": -0.9674972295761108, "logps/chosen": -173.98526000976562, "logps/rejected": -167.90187072753906, "loss": 0.6926, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00402639526873827, "rewards/margins": 0.00235772505402565, "rewards/rejected": 0.0016686702147126198, "step": 14 }, { "epoch": 0.06411969008816458, "grad_norm": 3.6183547057085903, "learning_rate": 6.249999999999999e-07, "logits/chosen": -0.9302492737770081, "logits/rejected": -0.9131873846054077, "logps/chosen": -172.501953125, "logps/rejected": -165.2920684814453, "loss": 0.6914, "rewards/accuracies": 0.53125, "rewards/chosen": 0.008232050575315952, "rewards/margins": 0.0026909802109003067, "rewards/rejected": 0.005541070364415646, "step": 15 }, { "epoch": 0.06839433609404222, "grad_norm": 4.094403587057344, "learning_rate": 6.666666666666666e-07, "logits/chosen": -0.8987658023834229, "logits/rejected": -0.918194591999054, "logps/chosen": -182.8192901611328, "logps/rejected": -188.6702423095703, "loss": 0.6907, "rewards/accuracies": 0.625, "rewards/chosen": 0.0033055683597922325, "rewards/margins": 0.006322154775261879, "rewards/rejected": -0.003016585949808359, "step": 16 }, { "epoch": 0.07266898209991986, "grad_norm": 3.7556102735295602, "learning_rate": 7.083333333333334e-07, "logits/chosen": -0.7985554933547974, "logits/rejected": -0.8355307579040527, "logps/chosen": -218.515869140625, "logps/rejected": -218.05130004882812, "loss": 0.6907, "rewards/accuracies": 0.53125, "rewards/chosen": 0.002777060493826866, "rewards/margins": 0.0028633405454456806, "rewards/rejected": -8.627981878817081e-05, "step": 17 }, { "epoch": 0.07694362810579748, "grad_norm": 3.7267439469140835, "learning_rate": 7.5e-07, "logits/chosen": -1.0510368347167969, "logits/rejected": -1.1025066375732422, "logps/chosen": -187.49362182617188, "logps/rejected": -213.5237274169922, "loss": 0.6893, "rewards/accuracies": 0.46875, "rewards/chosen": 0.003624723292887211, "rewards/margins": 0.0034801624715328217, "rewards/rejected": 0.00014456117060035467, "step": 18 }, { "epoch": 0.08121827411167512, "grad_norm": 3.6435544044761947, "learning_rate": 7.916666666666666e-07, "logits/chosen": -1.0699188709259033, "logits/rejected": -1.0673398971557617, "logps/chosen": -185.699951171875, "logps/rejected": -175.41836547851562, "loss": 0.6889, "rewards/accuracies": 0.8125, "rewards/chosen": 0.01173433754593134, "rewards/margins": 0.009952141903340816, "rewards/rejected": 0.001782197505235672, "step": 19 }, { "epoch": 0.08549292011755276, "grad_norm": 3.56212142993403, "learning_rate": 8.333333333333333e-07, "logits/chosen": -0.9693958759307861, "logits/rejected": -1.0447947978973389, "logps/chosen": -160.5248260498047, "logps/rejected": -177.9250030517578, "loss": 0.6883, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0071954806335270405, "rewards/margins": 0.007437723223119974, "rewards/rejected": -0.00024224258959293365, "step": 20 }, { "epoch": 0.0897675661234304, "grad_norm": 3.599911110818667, "learning_rate": 8.75e-07, "logits/chosen": -0.8949970006942749, "logits/rejected": -0.9538885951042175, "logps/chosen": -155.24188232421875, "logps/rejected": -175.83969116210938, "loss": 0.6875, "rewards/accuracies": 0.75, "rewards/chosen": -0.0026197312399744987, "rewards/margins": 0.013713551685214043, "rewards/rejected": -0.016333281993865967, "step": 21 }, { "epoch": 0.09404221212930804, "grad_norm": 3.809589692220953, "learning_rate": 9.166666666666665e-07, "logits/chosen": -0.8426035642623901, "logits/rejected": -0.909124493598938, "logps/chosen": -162.82546997070312, "logps/rejected": -168.50677490234375, "loss": 0.6853, "rewards/accuracies": 0.625, "rewards/chosen": 0.014116955921053886, "rewards/margins": 0.008742437697947025, "rewards/rejected": 0.005374519154429436, "step": 22 }, { "epoch": 0.09831685813518568, "grad_norm": 3.8481134168387325, "learning_rate": 9.583333333333334e-07, "logits/chosen": -0.9963463544845581, "logits/rejected": -1.030158281326294, "logps/chosen": -212.16732788085938, "logps/rejected": -226.55050659179688, "loss": 0.6831, "rewards/accuracies": 0.6875, "rewards/chosen": -0.005188916344195604, "rewards/margins": 0.037282973527908325, "rewards/rejected": -0.04247189313173294, "step": 23 }, { "epoch": 0.10259150414106331, "grad_norm": 4.017050664188984, "learning_rate": 1e-06, "logits/chosen": -0.9375415444374084, "logits/rejected": -0.9786323308944702, "logps/chosen": -161.66697692871094, "logps/rejected": -171.43328857421875, "loss": 0.6793, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0014034155756235123, "rewards/margins": 0.029357939958572388, "rewards/rejected": -0.03076135367155075, "step": 24 }, { "epoch": 0.10686615014694095, "grad_norm": 4.219585279560121, "learning_rate": 9.999435142363483e-07, "logits/chosen": -0.9440574049949646, "logits/rejected": -0.97591233253479, "logps/chosen": -142.18214416503906, "logps/rejected": -145.74217224121094, "loss": 0.6753, "rewards/accuracies": 0.59375, "rewards/chosen": 0.006380043923854828, "rewards/margins": 0.02844325453042984, "rewards/rejected": -0.022063210606575012, "step": 25 }, { "epoch": 0.11114079615281859, "grad_norm": 4.465336404650454, "learning_rate": 9.997740697079592e-07, "logits/chosen": -0.907569408416748, "logits/rejected": -0.9431344270706177, "logps/chosen": -186.16468811035156, "logps/rejected": -188.70187377929688, "loss": 0.6698, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03334157541394234, "rewards/margins": 0.04588525742292404, "rewards/rejected": -0.07922682911157608, "step": 26 }, { "epoch": 0.11541544215869623, "grad_norm": 4.035688150172887, "learning_rate": 9.994917046996472e-07, "logits/chosen": -0.9081155061721802, "logits/rejected": -0.9375332593917847, "logps/chosen": -196.47586059570312, "logps/rejected": -210.2967071533203, "loss": 0.6745, "rewards/accuracies": 0.46875, "rewards/chosen": -0.06558644771575928, "rewards/margins": 0.024703964591026306, "rewards/rejected": -0.09029041230678558, "step": 27 }, { "epoch": 0.11969008816457387, "grad_norm": 4.589583975444085, "learning_rate": 9.990964830098245e-07, "logits/chosen": -0.9100086688995361, "logits/rejected": -0.9473557472229004, "logps/chosen": -183.28317260742188, "logps/rejected": -191.90957641601562, "loss": 0.6642, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0952601209282875, "rewards/margins": 0.06084320694208145, "rewards/rejected": -0.15610332787036896, "step": 28 }, { "epoch": 0.12396473417045151, "grad_norm": 4.479468138978008, "learning_rate": 9.985884939360872e-07, "logits/chosen": -1.1165940761566162, "logits/rejected": -1.1295504570007324, "logps/chosen": -166.12542724609375, "logps/rejected": -165.3243408203125, "loss": 0.6578, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11186902225017548, "rewards/margins": 0.050340794026851654, "rewards/rejected": -0.16220980882644653, "step": 29 }, { "epoch": 0.12823938017632916, "grad_norm": 4.703342289738615, "learning_rate": 9.97967852255038e-07, "logits/chosen": -0.9528751969337463, "logits/rejected": -0.9631531238555908, "logps/chosen": -254.89320373535156, "logps/rejected": -258.4338073730469, "loss": 0.6583, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22431208193302155, "rewards/margins": 0.050895195454359055, "rewards/rejected": -0.2752072513103485, "step": 30 }, { "epoch": 0.13251402618220678, "grad_norm": 4.869138630164683, "learning_rate": 9.972346981963546e-07, "logits/chosen": -1.059159755706787, "logits/rejected": -1.1036772727966309, "logps/chosen": -245.163330078125, "logps/rejected": -268.007568359375, "loss": 0.6513, "rewards/accuracies": 0.65625, "rewards/chosen": -0.28532153367996216, "rewards/margins": 0.1269759237766266, "rewards/rejected": -0.41229742765426636, "step": 31 }, { "epoch": 0.13678867218808444, "grad_norm": 4.841549329203085, "learning_rate": 9.96389197411104e-07, "logits/chosen": -0.9731124043464661, "logits/rejected": -1.025037169456482, "logps/chosen": -209.7532958984375, "logps/rejected": -234.02642822265625, "loss": 0.6414, "rewards/accuracies": 0.71875, "rewards/chosen": -0.20661108195781708, "rewards/margins": 0.18127745389938354, "rewards/rejected": -0.38788852095603943, "step": 32 }, { "epoch": 0.14106331819396206, "grad_norm": 4.527086707272363, "learning_rate": 9.954315409343168e-07, "logits/chosen": -0.9516006708145142, "logits/rejected": -1.0085594654083252, "logps/chosen": -233.74896240234375, "logps/rejected": -257.0697937011719, "loss": 0.6391, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3886350095272064, "rewards/margins": 0.19288921356201172, "rewards/rejected": -0.5815242528915405, "step": 33 }, { "epoch": 0.14533796419983971, "grad_norm": 5.675688535211087, "learning_rate": 9.943619451418224e-07, "logits/chosen": -0.9171434640884399, "logits/rejected": -0.9520907998085022, "logps/chosen": -232.1197967529297, "logps/rejected": -252.1339874267578, "loss": 0.6138, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4231939911842346, "rewards/margins": 0.2075667530298233, "rewards/rejected": -0.6307607293128967, "step": 34 }, { "epoch": 0.14961261020571734, "grad_norm": 4.555434497600014, "learning_rate": 9.931806517013612e-07, "logits/chosen": -0.9599072933197021, "logits/rejected": -0.9873026013374329, "logps/chosen": -235.87911987304688, "logps/rejected": -277.68585205078125, "loss": 0.6249, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5353493690490723, "rewards/margins": 0.26459625363349915, "rewards/rejected": -0.799945592880249, "step": 35 }, { "epoch": 0.15388725621159496, "grad_norm": 4.824318328500942, "learning_rate": 9.918879275179817e-07, "logits/chosen": -1.1668760776519775, "logits/rejected": -1.1293714046478271, "logps/chosen": -288.35406494140625, "logps/rejected": -298.5234375, "loss": 0.6005, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6688686609268188, "rewards/margins": 0.22619600594043732, "rewards/rejected": -0.8950645923614502, "step": 36 }, { "epoch": 0.15816190221747262, "grad_norm": 4.401543973490666, "learning_rate": 9.904840646737345e-07, "logits/chosen": -0.9521760940551758, "logits/rejected": -0.9997081756591797, "logps/chosen": -282.0852355957031, "logps/rejected": -336.65020751953125, "loss": 0.6319, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7313822507858276, "rewards/margins": 0.29729628562927246, "rewards/rejected": -1.0286785364151, "step": 37 }, { "epoch": 0.16243654822335024, "grad_norm": 5.15798330777588, "learning_rate": 9.889693803616791e-07, "logits/chosen": -1.0276933908462524, "logits/rejected": -1.045649766921997, "logps/chosen": -311.87677001953125, "logps/rejected": -334.0548095703125, "loss": 0.6, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0708937644958496, "rewards/margins": 0.2531777620315552, "rewards/rejected": -1.3240714073181152, "step": 38 }, { "epoch": 0.1667111942292279, "grad_norm": 4.8993868987417555, "learning_rate": 9.873442168142157e-07, "logits/chosen": -0.909888505935669, "logits/rejected": -0.9343925714492798, "logps/chosen": -254.18350219726562, "logps/rejected": -285.18243408203125, "loss": 0.5973, "rewards/accuracies": 0.75, "rewards/chosen": -0.9324233531951904, "rewards/margins": 0.22102315723896027, "rewards/rejected": -1.1534464359283447, "step": 39 }, { "epoch": 0.17098584023510552, "grad_norm": 4.965329298176294, "learning_rate": 9.856089412257604e-07, "logits/chosen": -0.8430695533752441, "logits/rejected": -0.8712520599365234, "logps/chosen": -278.5356750488281, "logps/rejected": -313.9254455566406, "loss": 0.5892, "rewards/accuracies": 0.75, "rewards/chosen": -1.097804069519043, "rewards/margins": 0.29625624418258667, "rewards/rejected": -1.3940601348876953, "step": 40 }, { "epoch": 0.17526048624098317, "grad_norm": 4.964663460213464, "learning_rate": 9.8376394566978e-07, "logits/chosen": -0.9349880218505859, "logits/rejected": -0.9195177555084229, "logps/chosen": -353.0047607421875, "logps/rejected": -375.0325927734375, "loss": 0.5905, "rewards/accuracies": 0.75, "rewards/chosen": -1.4320145845413208, "rewards/margins": 0.2865561842918396, "rewards/rejected": -1.7185708284378052, "step": 41 }, { "epoch": 0.1795351322468608, "grad_norm": 4.673829673061791, "learning_rate": 9.818096470102066e-07, "logits/chosen": -0.9460776448249817, "logits/rejected": -1.0075451135635376, "logps/chosen": -326.8829040527344, "logps/rejected": -359.4068908691406, "loss": 0.59, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3413481712341309, "rewards/margins": 0.43105652928352356, "rewards/rejected": -1.772404670715332, "step": 42 }, { "epoch": 0.18380977825273845, "grad_norm": 4.756386075263894, "learning_rate": 9.797464868072486e-07, "logits/chosen": -0.8998066186904907, "logits/rejected": -0.9348124265670776, "logps/chosen": -347.86090087890625, "logps/rejected": -441.64501953125, "loss": 0.5674, "rewards/accuracies": 0.8125, "rewards/chosen": -1.69937264919281, "rewards/margins": 0.7839919924736023, "rewards/rejected": -2.4833645820617676, "step": 43 }, { "epoch": 0.18808442425861607, "grad_norm": 5.666891206447458, "learning_rate": 9.775749312176248e-07, "logits/chosen": -0.8193731307983398, "logits/rejected": -0.8275444507598877, "logps/chosen": -334.3702697753906, "logps/rejected": -402.1867370605469, "loss": 0.592, "rewards/accuracies": 0.71875, "rewards/chosen": -1.655839204788208, "rewards/margins": 0.6519087553024292, "rewards/rejected": -2.3077480792999268, "step": 44 }, { "epoch": 0.19235907026449373, "grad_norm": 5.611553010112512, "learning_rate": 9.752954708892377e-07, "logits/chosen": -0.8545299172401428, "logits/rejected": -0.9027716517448425, "logps/chosen": -371.7701721191406, "logps/rejected": -439.71881103515625, "loss": 0.5779, "rewards/accuracies": 0.84375, "rewards/chosen": -1.9063547849655151, "rewards/margins": 0.5792344808578491, "rewards/rejected": -2.4855895042419434, "step": 45 }, { "epoch": 0.19663371627037135, "grad_norm": 5.062237682542423, "learning_rate": 9.729086208503173e-07, "logits/chosen": -0.9441611766815186, "logits/rejected": -0.956858217716217, "logps/chosen": -451.3914794921875, "logps/rejected": -498.17999267578125, "loss": 0.5592, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6227638721466064, "rewards/margins": 0.4997914731502533, "rewards/rejected": -3.1225552558898926, "step": 46 }, { "epoch": 0.200908362276249, "grad_norm": 5.547722907580694, "learning_rate": 9.70414920393052e-07, "logits/chosen": -0.8402402400970459, "logits/rejected": -0.8305561542510986, "logps/chosen": -410.6358642578125, "logps/rejected": -456.8866882324219, "loss": 0.5657, "rewards/accuracies": 0.71875, "rewards/chosen": -2.3196969032287598, "rewards/margins": 0.5383195281028748, "rewards/rejected": -2.8580164909362793, "step": 47 }, { "epoch": 0.20518300828212663, "grad_norm": 5.843768728466239, "learning_rate": 9.678149329517409e-07, "logits/chosen": -0.9230031967163086, "logits/rejected": -0.9459983706474304, "logps/chosen": -421.91253662109375, "logps/rejected": -464.15460205078125, "loss": 0.5158, "rewards/accuracies": 0.78125, "rewards/chosen": -2.3388688564300537, "rewards/margins": 0.5758055448532104, "rewards/rejected": -2.9146745204925537, "step": 48 }, { "epoch": 0.20945765428800428, "grad_norm": 6.600018288252386, "learning_rate": 9.651092459754877e-07, "logits/chosen": -0.7874542474746704, "logits/rejected": -0.7807765007019043, "logps/chosen": -553.3658447265625, "logps/rejected": -578.9154052734375, "loss": 0.5601, "rewards/accuracies": 0.65625, "rewards/chosen": -3.3889737129211426, "rewards/margins": 0.2704327702522278, "rewards/rejected": -3.6594066619873047, "step": 49 }, { "epoch": 0.2137323002938819, "grad_norm": 6.555644686187637, "learning_rate": 9.62298470795473e-07, "logits/chosen": -0.7596021890640259, "logits/rejected": -0.8105506896972656, "logps/chosen": -396.783935546875, "logps/rejected": -437.79541015625, "loss": 0.5798, "rewards/accuracies": 0.6875, "rewards/chosen": -2.4116053581237793, "rewards/margins": 0.3920546770095825, "rewards/rejected": -2.8036601543426514, "step": 50 }, { "epoch": 0.21800694629975956, "grad_norm": 6.744038117473701, "learning_rate": 9.59383242486827e-07, "logits/chosen": -0.8625648617744446, "logits/rejected": -0.8875184059143066, "logps/chosen": -505.1508483886719, "logps/rejected": -608.79248046875, "loss": 0.5389, "rewards/accuracies": 0.625, "rewards/chosen": -3.0446105003356934, "rewards/margins": 0.9900886416435242, "rewards/rejected": -4.034698963165283, "step": 51 }, { "epoch": 0.22228159230563718, "grad_norm": 5.565039920114929, "learning_rate": 9.56364219725138e-07, "logits/chosen": -0.8463042974472046, "logits/rejected": -0.8962733745574951, "logps/chosen": -499.99041748046875, "logps/rejected": -634.49072265625, "loss": 0.4915, "rewards/accuracies": 0.78125, "rewards/chosen": -3.071751832962036, "rewards/margins": 1.300065279006958, "rewards/rejected": -4.371817588806152, "step": 52 }, { "epoch": 0.22655623831151483, "grad_norm": 9.993449327747468, "learning_rate": 9.532420846376315e-07, "logits/chosen": -0.7763329744338989, "logits/rejected": -0.8177902698516846, "logps/chosen": -433.4925842285156, "logps/rejected": -530.1204223632812, "loss": 0.6104, "rewards/accuracies": 0.71875, "rewards/chosen": -2.5958311557769775, "rewards/margins": 0.9427847862243652, "rewards/rejected": -3.5386157035827637, "step": 53 }, { "epoch": 0.23083088431739246, "grad_norm": 6.914149355278302, "learning_rate": 9.500175426490454e-07, "logits/chosen": -0.7263307571411133, "logits/rejected": -0.7950284481048584, "logps/chosen": -590.033447265625, "logps/rejected": -696.9810791015625, "loss": 0.5291, "rewards/accuracies": 0.75, "rewards/chosen": -3.79225492477417, "rewards/margins": 0.9605345726013184, "rewards/rejected": -4.75278902053833, "step": 54 }, { "epoch": 0.2351055303232701, "grad_norm": 6.4385225722348425, "learning_rate": 9.466913223222465e-07, "logits/chosen": -0.73805832862854, "logits/rejected": -0.8121139407157898, "logps/chosen": -527.0991821289062, "logps/rejected": -672.7268676757812, "loss": 0.536, "rewards/accuracies": 0.84375, "rewards/chosen": -3.397312641143799, "rewards/margins": 1.3639640808105469, "rewards/rejected": -4.761276721954346, "step": 55 }, { "epoch": 0.23938017632914774, "grad_norm": 7.766777578194787, "learning_rate": 9.432641751936162e-07, "logits/chosen": -0.8009728193283081, "logits/rejected": -0.8259899020195007, "logps/chosen": -421.18414306640625, "logps/rejected": -515.0050659179688, "loss": 0.5853, "rewards/accuracies": 0.65625, "rewards/chosen": -2.522728204727173, "rewards/margins": 0.8710657954216003, "rewards/rejected": -3.393793821334839, "step": 56 }, { "epoch": 0.2436548223350254, "grad_norm": 6.5333907258413655, "learning_rate": 9.397368756032444e-07, "logits/chosen": -0.7609117031097412, "logits/rejected": -0.7754147052764893, "logps/chosen": -436.06427001953125, "logps/rejected": -512.485107421875, "loss": 0.5019, "rewards/accuracies": 0.65625, "rewards/chosen": -2.599217414855957, "rewards/margins": 0.6933461427688599, "rewards/rejected": -3.2925636768341064, "step": 57 }, { "epoch": 0.24792946834090301, "grad_norm": 7.290942942753059, "learning_rate": 9.36110220519976e-07, "logits/chosen": -0.7123927474021912, "logits/rejected": -0.7812705039978027, "logps/chosen": -428.41351318359375, "logps/rejected": -493.216552734375, "loss": 0.5486, "rewards/accuracies": 0.78125, "rewards/chosen": -2.501229763031006, "rewards/margins": 0.6336008906364441, "rewards/rejected": -3.1348307132720947, "step": 58 }, { "epoch": 0.25220411434678064, "grad_norm": 7.291539302989226, "learning_rate": 9.323850293613379e-07, "logits/chosen": -0.8743740916252136, "logits/rejected": -0.8304850459098816, "logps/chosen": -416.0941467285156, "logps/rejected": -461.4928283691406, "loss": 0.5248, "rewards/accuracies": 0.6875, "rewards/chosen": -2.469886064529419, "rewards/margins": 0.6079959869384766, "rewards/rejected": -3.0778818130493164, "step": 59 }, { "epoch": 0.2564787603526583, "grad_norm": 6.444086326444115, "learning_rate": 9.285621438083997e-07, "logits/chosen": -0.7638828754425049, "logits/rejected": -0.830043375492096, "logps/chosen": -462.22723388671875, "logps/rejected": -567.3082275390625, "loss": 0.496, "rewards/accuracies": 0.875, "rewards/chosen": -2.678903818130493, "rewards/margins": 0.9149044752120972, "rewards/rejected": -3.59380841255188, "step": 60 }, { "epoch": 0.26075340635853594, "grad_norm": 6.239441558672264, "learning_rate": 9.246424276156006e-07, "logits/chosen": -0.7686220407485962, "logits/rejected": -0.786496102809906, "logps/chosen": -426.57977294921875, "logps/rejected": -539.332763671875, "loss": 0.4872, "rewards/accuracies": 0.71875, "rewards/chosen": -2.421875, "rewards/margins": 1.0811634063720703, "rewards/rejected": -3.5030384063720703, "step": 61 }, { "epoch": 0.26502805236441357, "grad_norm": 6.955390195667037, "learning_rate": 9.206267664155906e-07, "logits/chosen": -0.8518524765968323, "logits/rejected": -0.8896721005439758, "logps/chosen": -490.898681640625, "logps/rejected": -571.5889282226562, "loss": 0.5328, "rewards/accuracies": 0.75, "rewards/chosen": -2.7304701805114746, "rewards/margins": 0.8642421364784241, "rewards/rejected": -3.594712495803833, "step": 62 }, { "epoch": 0.2693026983702912, "grad_norm": 6.71732914608112, "learning_rate": 9.165160675191271e-07, "logits/chosen": -0.7856395244598389, "logits/rejected": -0.8273566961288452, "logps/chosen": -406.04241943359375, "logps/rejected": -512.580810546875, "loss": 0.54, "rewards/accuracies": 0.84375, "rewards/chosen": -2.3751254081726074, "rewards/margins": 1.008442759513855, "rewards/rejected": -3.3835678100585938, "step": 63 }, { "epoch": 0.2735773443761689, "grad_norm": 8.23163566342515, "learning_rate": 9.123112597100757e-07, "logits/chosen": -0.7550954818725586, "logits/rejected": -0.7312250733375549, "logps/chosen": -428.09210205078125, "logps/rejected": -472.0053405761719, "loss": 0.5628, "rewards/accuracies": 0.71875, "rewards/chosen": -2.460500717163086, "rewards/margins": 0.6525804996490479, "rewards/rejected": -3.1130809783935547, "step": 64 }, { "epoch": 0.2778519903820465, "grad_norm": 6.484017150981526, "learning_rate": 9.080132930355566e-07, "logits/chosen": -0.7198902368545532, "logits/rejected": -0.7333334684371948, "logps/chosen": -447.80694580078125, "logps/rejected": -543.547119140625, "loss": 0.4951, "rewards/accuracies": 0.78125, "rewards/chosen": -2.3654160499572754, "rewards/margins": 1.1327059268951416, "rewards/rejected": -3.498121738433838, "step": 65 }, { "epoch": 0.2821266363879241, "grad_norm": 7.88288469592664, "learning_rate": 9.036231385912889e-07, "logits/chosen": -0.787277102470398, "logits/rejected": -0.8082758188247681, "logps/chosen": -542.9320678710938, "logps/rejected": -589.2691650390625, "loss": 0.5554, "rewards/accuracies": 0.6875, "rewards/chosen": -3.2829174995422363, "rewards/margins": 0.4686228036880493, "rewards/rejected": -3.751540184020996, "step": 66 }, { "epoch": 0.28640128239380175, "grad_norm": 6.87598900963406, "learning_rate": 8.991417883021779e-07, "logits/chosen": -0.7320197820663452, "logits/rejected": -0.7914742231369019, "logps/chosen": -322.4740295410156, "logps/rejected": -403.00982666015625, "loss": 0.489, "rewards/accuracies": 0.75, "rewards/chosen": -1.867960810661316, "rewards/margins": 0.7985786199569702, "rewards/rejected": -2.666539430618286, "step": 67 }, { "epoch": 0.29067592839967943, "grad_norm": 6.988332805689512, "learning_rate": 8.945702546981968e-07, "logits/chosen": -0.7299609780311584, "logits/rejected": -0.7391811013221741, "logps/chosen": -424.68255615234375, "logps/rejected": -520.7315673828125, "loss": 0.485, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4903666973114014, "rewards/margins": 0.9038018584251404, "rewards/rejected": -3.3941686153411865, "step": 68 }, { "epoch": 0.29495057440555705, "grad_norm": 7.25643319614823, "learning_rate": 8.899095706856121e-07, "logits/chosen": -0.8242793679237366, "logits/rejected": -0.8567203879356384, "logps/chosen": -416.467041015625, "logps/rejected": -556.6646118164062, "loss": 0.501, "rewards/accuracies": 0.8125, "rewards/chosen": -2.350522994995117, "rewards/margins": 1.3620076179504395, "rewards/rejected": -3.7125303745269775, "step": 69 }, { "epoch": 0.2992252204114347, "grad_norm": 7.602549324125367, "learning_rate": 8.851607893136064e-07, "logits/chosen": -0.7457299828529358, "logits/rejected": -0.7355296611785889, "logps/chosen": -458.4794006347656, "logps/rejected": -523.21484375, "loss": 0.4974, "rewards/accuracies": 0.6875, "rewards/chosen": -2.8070144653320312, "rewards/margins": 0.6739380955696106, "rewards/rejected": -3.480952739715576, "step": 70 }, { "epoch": 0.3034998664173123, "grad_norm": 7.842587956186825, "learning_rate": 8.803249835363484e-07, "logits/chosen": -0.7719243168830872, "logits/rejected": -0.8175538778305054, "logps/chosen": -391.3406982421875, "logps/rejected": -472.3711242675781, "loss": 0.5232, "rewards/accuracies": 0.75, "rewards/chosen": -2.26741099357605, "rewards/margins": 0.7575722336769104, "rewards/rejected": -3.0249834060668945, "step": 71 }, { "epoch": 0.3077745124231899, "grad_norm": 8.33328622593718, "learning_rate": 8.754032459705671e-07, "logits/chosen": -0.7375326752662659, "logits/rejected": -0.7411423921585083, "logps/chosen": -552.6005249023438, "logps/rejected": -658.5523071289062, "loss": 0.4689, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2394614219665527, "rewards/margins": 1.1419782638549805, "rewards/rejected": -4.381440162658691, "step": 72 }, { "epoch": 0.3120491584290676, "grad_norm": 8.00590533781815, "learning_rate": 8.703966886486818e-07, "logits/chosen": -0.7447977066040039, "logits/rejected": -0.8021827340126038, "logps/chosen": -528.2827758789062, "logps/rejected": -663.02099609375, "loss": 0.4719, "rewards/accuracies": 0.78125, "rewards/chosen": -3.100710868835449, "rewards/margins": 1.4192044734954834, "rewards/rejected": -4.519914627075195, "step": 73 }, { "epoch": 0.31632380443494523, "grad_norm": 9.911560282455111, "learning_rate": 8.653064427675469e-07, "logits/chosen": -0.7718651294708252, "logits/rejected": -0.7922145128250122, "logps/chosen": -473.9974365234375, "logps/rejected": -587.6644897460938, "loss": 0.559, "rewards/accuracies": 0.75, "rewards/chosen": -2.9299654960632324, "rewards/margins": 1.1681456565856934, "rewards/rejected": -4.098111152648926, "step": 74 }, { "epoch": 0.32059845044082286, "grad_norm": 7.86727024374843, "learning_rate": 8.601336584328658e-07, "logits/chosen": -0.6917619705200195, "logits/rejected": -0.6980517506599426, "logps/chosen": -500.0274963378906, "logps/rejected": -590.8597412109375, "loss": 0.4719, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1366689205169678, "rewards/margins": 0.9159411191940308, "rewards/rejected": -4.052610397338867, "step": 75 }, { "epoch": 0.3248730964467005, "grad_norm": 9.289091160075673, "learning_rate": 8.548795043993315e-07, "logits/chosen": -0.7438817620277405, "logits/rejected": -0.7294880747795105, "logps/chosen": -521.81005859375, "logps/rejected": -567.1892700195312, "loss": 0.5389, "rewards/accuracies": 0.6875, "rewards/chosen": -3.433103322982788, "rewards/margins": 0.4686957001686096, "rewards/rejected": -3.901798725128174, "step": 76 }, { "epoch": 0.32914774245257816, "grad_norm": 8.347011918030578, "learning_rate": 8.495451678065561e-07, "logits/chosen": -0.7081446647644043, "logits/rejected": -0.7084572315216064, "logps/chosen": -471.94879150390625, "logps/rejected": -576.2655639648438, "loss": 0.4923, "rewards/accuracies": 0.84375, "rewards/chosen": -2.9118268489837646, "rewards/margins": 1.087062954902649, "rewards/rejected": -3.9988901615142822, "step": 77 }, { "epoch": 0.3334223884584558, "grad_norm": 9.178664809436675, "learning_rate": 8.441318539108432e-07, "logits/chosen": -0.672901451587677, "logits/rejected": -0.6473367214202881, "logps/chosen": -446.5679931640625, "logps/rejected": -525.839599609375, "loss": 0.4831, "rewards/accuracies": 0.71875, "rewards/chosen": -2.884944200515747, "rewards/margins": 0.8302309513092041, "rewards/rejected": -3.7151753902435303, "step": 78 }, { "epoch": 0.3376970344643334, "grad_norm": 9.222651758068919, "learning_rate": 8.386407858128706e-07, "logits/chosen": -0.7438699007034302, "logits/rejected": -0.7355214357376099, "logps/chosen": -530.9581909179688, "logps/rejected": -658.5861206054688, "loss": 0.4838, "rewards/accuracies": 0.84375, "rewards/chosen": -3.4786720275878906, "rewards/margins": 1.2334851026535034, "rewards/rejected": -4.712156772613525, "step": 79 }, { "epoch": 0.34197168047021104, "grad_norm": 8.799544386144294, "learning_rate": 8.330732041813366e-07, "logits/chosen": -0.5365869402885437, "logits/rejected": -0.5661185383796692, "logps/chosen": -488.5903015136719, "logps/rejected": -571.9346923828125, "loss": 0.4732, "rewards/accuracies": 0.78125, "rewards/chosen": -3.123514175415039, "rewards/margins": 0.8028107285499573, "rewards/rejected": -3.9263250827789307, "step": 80 }, { "epoch": 0.3462463264760887, "grad_norm": 9.030244312954085, "learning_rate": 8.274303669726426e-07, "logits/chosen": -0.6187846660614014, "logits/rejected": -0.6990691423416138, "logps/chosen": -469.09161376953125, "logps/rejected": -603.3698120117188, "loss": 0.4698, "rewards/accuracies": 0.90625, "rewards/chosen": -3.157665729522705, "rewards/margins": 1.1742490530014038, "rewards/rejected": -4.331915378570557, "step": 81 }, { "epoch": 0.35052097248196634, "grad_norm": 8.841699455559503, "learning_rate": 8.217135491466636e-07, "logits/chosen": -0.473153293132782, "logits/rejected": -0.5449516177177429, "logps/chosen": -491.49249267578125, "logps/rejected": -654.7282104492188, "loss": 0.4764, "rewards/accuracies": 0.90625, "rewards/chosen": -3.222175121307373, "rewards/margins": 1.6088354587554932, "rewards/rejected": -4.831010341644287, "step": 82 }, { "epoch": 0.35479561848784397, "grad_norm": 11.098615894997318, "learning_rate": 8.159240423786819e-07, "logits/chosen": -0.6635532379150391, "logits/rejected": -0.6708536148071289, "logps/chosen": -529.8382568359375, "logps/rejected": -615.276123046875, "loss": 0.5068, "rewards/accuracies": 0.71875, "rewards/chosen": -3.458679437637329, "rewards/margins": 0.8304582238197327, "rewards/rejected": -4.289137840270996, "step": 83 }, { "epoch": 0.3590702644937216, "grad_norm": 10.69217432485512, "learning_rate": 8.100631547675416e-07, "logits/chosen": -0.5764239430427551, "logits/rejected": -0.6042333245277405, "logps/chosen": -538.3191528320312, "logps/rejected": -671.1778564453125, "loss": 0.463, "rewards/accuracies": 0.78125, "rewards/chosen": -3.707031726837158, "rewards/margins": 1.2638828754425049, "rewards/rejected": -4.970914363861084, "step": 84 }, { "epoch": 0.36334491049959927, "grad_norm": 10.389532135595351, "learning_rate": 8.041322105400921e-07, "logits/chosen": -0.5952804088592529, "logits/rejected": -0.5918059349060059, "logps/chosen": -468.830322265625, "logps/rejected": -555.058349609375, "loss": 0.4507, "rewards/accuracies": 0.75, "rewards/chosen": -3.2899224758148193, "rewards/margins": 0.8072735071182251, "rewards/rejected": -4.097196102142334, "step": 85 }, { "epoch": 0.3676195565054769, "grad_norm": 9.76247745990493, "learning_rate": 7.981325497519891e-07, "logits/chosen": -0.5011740922927856, "logits/rejected": -0.5748673677444458, "logps/chosen": -568.4143676757812, "logps/rejected": -665.891357421875, "loss": 0.473, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9403202533721924, "rewards/margins": 0.8801581859588623, "rewards/rejected": -4.820478439331055, "step": 86 }, { "epoch": 0.3718942025113545, "grad_norm": 9.276763138531336, "learning_rate": 7.920655279849171e-07, "logits/chosen": -0.6208050847053528, "logits/rejected": -0.6661792993545532, "logps/chosen": -454.78558349609375, "logps/rejected": -583.1072387695312, "loss": 0.439, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0670785903930664, "rewards/margins": 1.2415242195129395, "rewards/rejected": -4.308602333068848, "step": 87 }, { "epoch": 0.37616884851723215, "grad_norm": 8.47887809958896, "learning_rate": 7.859325160403071e-07, "logits/chosen": -0.5842097401618958, "logits/rejected": -0.6111244559288025, "logps/chosen": -513.4686279296875, "logps/rejected": -631.78515625, "loss": 0.4224, "rewards/accuracies": 0.6875, "rewards/chosen": -3.507704973220825, "rewards/margins": 1.1260159015655518, "rewards/rejected": -4.633721351623535, "step": 88 }, { "epoch": 0.3804434945231098, "grad_norm": 9.042970665585285, "learning_rate": 7.797348996296114e-07, "logits/chosen": -0.594511091709137, "logits/rejected": -0.5762075185775757, "logps/chosen": -528.5706787109375, "logps/rejected": -640.2254638671875, "loss": 0.4195, "rewards/accuracies": 0.78125, "rewards/chosen": -3.5566911697387695, "rewards/margins": 1.1777138710021973, "rewards/rejected": -4.734404563903809, "step": 89 }, { "epoch": 0.38471814052898745, "grad_norm": 9.591838712191619, "learning_rate": 7.734740790612136e-07, "logits/chosen": -0.5243846774101257, "logits/rejected": -0.5419484376907349, "logps/chosen": -597.5997924804688, "logps/rejected": -721.8450927734375, "loss": 0.4525, "rewards/accuracies": 0.8125, "rewards/chosen": -4.259735107421875, "rewards/margins": 1.2303788661956787, "rewards/rejected": -5.490115165710449, "step": 90 }, { "epoch": 0.3889927865348651, "grad_norm": 9.197817870604572, "learning_rate": 7.671514689240365e-07, "logits/chosen": -0.5726766586303711, "logits/rejected": -0.6172913312911987, "logps/chosen": -557.262939453125, "logps/rejected": -697.6727294921875, "loss": 0.4701, "rewards/accuracies": 0.78125, "rewards/chosen": -3.7903552055358887, "rewards/margins": 1.3101625442504883, "rewards/rejected": -5.100517749786377, "step": 91 }, { "epoch": 0.3932674325407427, "grad_norm": 11.581331107950847, "learning_rate": 7.607684977679283e-07, "logits/chosen": -0.6335964202880859, "logits/rejected": -0.6610329747200012, "logps/chosen": -519.31103515625, "logps/rejected": -657.5548706054688, "loss": 0.4294, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4710004329681396, "rewards/margins": 1.4494860172271729, "rewards/rejected": -4.920486927032471, "step": 92 }, { "epoch": 0.3975420785466204, "grad_norm": 10.631351767452296, "learning_rate": 7.543266077808892e-07, "logits/chosen": -0.427675724029541, "logits/rejected": -0.45514771342277527, "logps/chosen": -571.4942626953125, "logps/rejected": -717.7467041015625, "loss": 0.4636, "rewards/accuracies": 0.71875, "rewards/chosen": -3.948040723800659, "rewards/margins": 1.4244039058685303, "rewards/rejected": -5.372445106506348, "step": 93 }, { "epoch": 0.401816724552498, "grad_norm": 12.057725080831089, "learning_rate": 7.478272544632202e-07, "logits/chosen": -0.5969647765159607, "logits/rejected": -0.6751678586006165, "logps/chosen": -643.1666870117188, "logps/rejected": -773.894287109375, "loss": 0.4507, "rewards/accuracies": 0.75, "rewards/chosen": -4.552432537078857, "rewards/margins": 1.3130940198898315, "rewards/rejected": -5.8655266761779785, "step": 94 }, { "epoch": 0.40609137055837563, "grad_norm": 10.936442473029212, "learning_rate": 7.412719062986631e-07, "logits/chosen": -0.4887186288833618, "logits/rejected": -0.4894056022167206, "logps/chosen": -555.25341796875, "logps/rejected": -653.7760009765625, "loss": 0.4518, "rewards/accuracies": 0.71875, "rewards/chosen": -3.9043784141540527, "rewards/margins": 1.035258412361145, "rewards/rejected": -4.939637184143066, "step": 95 }, { "epoch": 0.41036601656425326, "grad_norm": 18.608338586356762, "learning_rate": 7.346620444226059e-07, "logits/chosen": -0.5932431221008301, "logits/rejected": -0.6164640784263611, "logps/chosen": -586.3929443359375, "logps/rejected": -703.4130249023438, "loss": 0.4449, "rewards/accuracies": 0.75, "rewards/chosen": -4.206478118896484, "rewards/margins": 1.083022117614746, "rewards/rejected": -5.2895002365112305, "step": 96 }, { "epoch": 0.41464066257013094, "grad_norm": 12.657787002066415, "learning_rate": 7.279991622874318e-07, "logits/chosen": -0.5697692632675171, "logits/rejected": -0.6259853839874268, "logps/chosen": -585.4468994140625, "logps/rejected": -732.0520629882812, "loss": 0.483, "rewards/accuracies": 0.78125, "rewards/chosen": -3.9570021629333496, "rewards/margins": 1.447950839996338, "rewards/rejected": -5.404953479766846, "step": 97 }, { "epoch": 0.41891530857600856, "grad_norm": 13.367477990049776, "learning_rate": 7.212847653250828e-07, "logits/chosen": -0.6540141105651855, "logits/rejected": -0.6565195322036743, "logps/chosen": -731.0140380859375, "logps/rejected": -848.3659057617188, "loss": 0.4767, "rewards/accuracies": 0.84375, "rewards/chosen": -5.200318336486816, "rewards/margins": 1.2101118564605713, "rewards/rejected": -6.410430908203125, "step": 98 }, { "epoch": 0.4231899545818862, "grad_norm": 10.139524171463488, "learning_rate": 7.145203706069182e-07, "logits/chosen": -0.7252554893493652, "logits/rejected": -0.7746644616127014, "logps/chosen": -669.2903442382812, "logps/rejected": -828.5930786132812, "loss": 0.3875, "rewards/accuracies": 0.84375, "rewards/chosen": -4.771048545837402, "rewards/margins": 1.5108307600021362, "rewards/rejected": -6.281879425048828, "step": 99 }, { "epoch": 0.4274646005877638, "grad_norm": 12.356398801857143, "learning_rate": 7.077075065009433e-07, "logits/chosen": -0.557784914970398, "logits/rejected": -0.5647093057632446, "logps/chosen": -607.4837646484375, "logps/rejected": -718.2990112304688, "loss": 0.4439, "rewards/accuracies": 0.71875, "rewards/chosen": -4.555881500244141, "rewards/margins": 1.080979585647583, "rewards/rejected": -5.636861801147461, "step": 100 }, { "epoch": 0.4274646005877638, "eval_logits/chosen": -0.5001155734062195, "eval_logits/rejected": -0.5150425434112549, "eval_logps/chosen": -704.6570434570312, "eval_logps/rejected": -829.715087890625, "eval_loss": 0.4167614281177521, "eval_rewards/accuracies": 0.8145161271095276, "eval_rewards/chosen": -4.996352672576904, "eval_rewards/margins": 1.3122578859329224, "eval_rewards/rejected": -6.308610916137695, "eval_runtime": 165.659, "eval_samples_per_second": 11.838, "eval_steps_per_second": 0.374, "step": 100 }, { "epoch": 0.4317392465936415, "grad_norm": 13.314006294995002, "learning_rate": 7.008477123264847e-07, "logits/chosen": -0.6598826050758362, "logits/rejected": -0.6927035450935364, "logps/chosen": -734.70556640625, "logps/rejected": -900.852294921875, "loss": 0.4118, "rewards/accuracies": 0.875, "rewards/chosen": -5.161342620849609, "rewards/margins": 1.7760246992111206, "rewards/rejected": -6.9373674392700195, "step": 101 }, { "epoch": 0.4360138925995191, "grad_norm": 13.821741873689282, "learning_rate": 6.939425380063923e-07, "logits/chosen": -0.6629341244697571, "logits/rejected": -0.7558687925338745, "logps/chosen": -699.57177734375, "logps/rejected": -886.5426025390625, "loss": 0.3874, "rewards/accuracies": 0.875, "rewards/chosen": -5.201181888580322, "rewards/margins": 1.6117980480194092, "rewards/rejected": -6.8129801750183105, "step": 102 }, { "epoch": 0.44028853860539674, "grad_norm": 14.52198562884218, "learning_rate": 6.869935437168449e-07, "logits/chosen": -0.4441612958908081, "logits/rejected": -0.4517134428024292, "logps/chosen": -648.8721313476562, "logps/rejected": -743.1588745117188, "loss": 0.4932, "rewards/accuracies": 0.625, "rewards/chosen": -4.671912670135498, "rewards/margins": 1.0545084476470947, "rewards/rejected": -5.726420879364014, "step": 103 }, { "epoch": 0.44456318461127436, "grad_norm": 15.339894722169653, "learning_rate": 6.80002299534838e-07, "logits/chosen": -0.719368577003479, "logits/rejected": -0.7461254596710205, "logps/chosen": -573.4705810546875, "logps/rejected": -651.0980224609375, "loss": 0.4402, "rewards/accuracies": 0.6875, "rewards/chosen": -3.8566808700561523, "rewards/margins": 0.8586393594741821, "rewards/rejected": -4.715320587158203, "step": 104 }, { "epoch": 0.448837830617152, "grad_norm": 10.956142784913482, "learning_rate": 6.72970385083438e-07, "logits/chosen": -0.641654372215271, "logits/rejected": -0.6621043682098389, "logps/chosen": -592.4070434570312, "logps/rejected": -721.1480102539062, "loss": 0.4013, "rewards/accuracies": 0.90625, "rewards/chosen": -4.002495288848877, "rewards/margins": 1.1794517040252686, "rewards/rejected": -5.181946754455566, "step": 105 }, { "epoch": 0.45311247662302967, "grad_norm": 14.08687818754259, "learning_rate": 6.658993891748759e-07, "logits/chosen": -0.6141338348388672, "logits/rejected": -0.5712395310401917, "logps/chosen": -525.6826171875, "logps/rejected": -657.1926879882812, "loss": 0.3788, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3286538124084473, "rewards/margins": 1.5682119131088257, "rewards/rejected": -4.896864891052246, "step": 106 }, { "epoch": 0.4573871226289073, "grad_norm": 12.007137757034995, "learning_rate": 6.587909094515663e-07, "logits/chosen": -0.6399226188659668, "logits/rejected": -0.6818464994430542, "logps/chosen": -515.7030639648438, "logps/rejected": -624.790283203125, "loss": 0.4432, "rewards/accuracies": 0.71875, "rewards/chosen": -3.5857155323028564, "rewards/margins": 0.9131308794021606, "rewards/rejected": -4.498846530914307, "step": 107 }, { "epoch": 0.4616617686347849, "grad_norm": 11.626806758384587, "learning_rate": 6.516465520251313e-07, "logits/chosen": -0.6572325229644775, "logits/rejected": -0.7261943221092224, "logps/chosen": -557.6213989257812, "logps/rejected": -685.3796997070312, "loss": 0.4302, "rewards/accuracies": 0.75, "rewards/chosen": -3.802943468093872, "rewards/margins": 1.3063392639160156, "rewards/rejected": -5.109282970428467, "step": 108 }, { "epoch": 0.46593641464066254, "grad_norm": 11.769626267692969, "learning_rate": 6.444679311135112e-07, "logits/chosen": -0.6812455058097839, "logits/rejected": -0.6769453287124634, "logps/chosen": -545.5555419921875, "logps/rejected": -670.9700317382812, "loss": 0.4633, "rewards/accuracies": 0.75, "rewards/chosen": -3.524083137512207, "rewards/margins": 1.1972600221633911, "rewards/rejected": -4.721343040466309, "step": 109 }, { "epoch": 0.4702110606465402, "grad_norm": 11.834467781345984, "learning_rate": 6.372566686762426e-07, "logits/chosen": -0.6734607219696045, "logits/rejected": -0.6938244104385376, "logps/chosen": -631.7657470703125, "logps/rejected": -778.4968872070312, "loss": 0.3988, "rewards/accuracies": 0.84375, "rewards/chosen": -4.416792869567871, "rewards/margins": 1.5345261096954346, "rewards/rejected": -5.951319217681885, "step": 110 }, { "epoch": 0.47448570665241785, "grad_norm": 10.600986700850507, "learning_rate": 6.30014394047988e-07, "logits/chosen": -0.7839672565460205, "logits/rejected": -0.7656916379928589, "logps/chosen": -520.810791015625, "logps/rejected": -590.8253173828125, "loss": 0.4064, "rewards/accuracies": 0.71875, "rewards/chosen": -3.642113447189331, "rewards/margins": 0.7910320162773132, "rewards/rejected": -4.433145999908447, "step": 111 }, { "epoch": 0.4787603526582955, "grad_norm": 13.974810391572062, "learning_rate": 6.227427435703995e-07, "logits/chosen": -0.6362528204917908, "logits/rejected": -0.7391636371612549, "logps/chosen": -589.657470703125, "logps/rejected": -778.1963500976562, "loss": 0.397, "rewards/accuracies": 0.90625, "rewards/chosen": -4.09708309173584, "rewards/margins": 1.6432350873947144, "rewards/rejected": -5.7403178215026855, "step": 112 }, { "epoch": 0.4830349986641731, "grad_norm": 12.06902931160219, "learning_rate": 6.154433602223978e-07, "logits/chosen": -0.7784813046455383, "logits/rejected": -0.8440088033676147, "logps/chosen": -634.3173828125, "logps/rejected": -829.8695068359375, "loss": 0.4383, "rewards/accuracies": 0.90625, "rewards/chosen": -4.257462024688721, "rewards/margins": 1.7875339984893799, "rewards/rejected": -6.0449957847595215, "step": 113 }, { "epoch": 0.4873096446700508, "grad_norm": 13.358425807533337, "learning_rate": 6.081178932489535e-07, "logits/chosen": -0.7081687450408936, "logits/rejected": -0.7073873281478882, "logps/chosen": -569.8103637695312, "logps/rejected": -694.3984375, "loss": 0.4252, "rewards/accuracies": 0.78125, "rewards/chosen": -3.89652943611145, "rewards/margins": 1.3301247358322144, "rewards/rejected": -5.226654529571533, "step": 114 }, { "epoch": 0.4915842906759284, "grad_norm": 11.586882789419233, "learning_rate": 6.00767997788451e-07, "logits/chosen": -0.5270929336547852, "logits/rejected": -0.5626642107963562, "logps/chosen": -693.6819458007812, "logps/rejected": -889.004150390625, "loss": 0.3575, "rewards/accuracies": 0.90625, "rewards/chosen": -4.817679405212402, "rewards/margins": 1.964691162109375, "rewards/rejected": -6.782370090484619, "step": 115 }, { "epoch": 0.49585893668180603, "grad_norm": 12.667062697493666, "learning_rate": 5.933953344987214e-07, "logits/chosen": -0.6200395226478577, "logits/rejected": -0.6530672311782837, "logps/chosen": -617.082763671875, "logps/rejected": -743.7801513671875, "loss": 0.394, "rewards/accuracies": 0.84375, "rewards/chosen": -4.380313873291016, "rewards/margins": 1.2766342163085938, "rewards/rejected": -5.656947612762451, "step": 116 }, { "epoch": 0.5001335826876837, "grad_norm": 13.06084918611884, "learning_rate": 5.860015691818292e-07, "logits/chosen": -0.5794460773468018, "logits/rejected": -0.6392884850502014, "logps/chosen": -523.0586547851562, "logps/rejected": -706.2977294921875, "loss": 0.3972, "rewards/accuracies": 0.8125, "rewards/chosen": -3.7082810401916504, "rewards/margins": 1.7129367589950562, "rewards/rejected": -5.421217918395996, "step": 117 }, { "epoch": 0.5044082286935613, "grad_norm": 14.158925761660381, "learning_rate": 5.78588372407695e-07, "logits/chosen": -0.591346025466919, "logits/rejected": -0.5808792114257812, "logps/chosen": -661.6780395507812, "logps/rejected": -753.6257934570312, "loss": 0.3814, "rewards/accuracies": 0.78125, "rewards/chosen": -4.592419147491455, "rewards/margins": 1.0543147325515747, "rewards/rejected": -5.646734237670898, "step": 118 }, { "epoch": 0.508682874699439, "grad_norm": 13.571207607427793, "learning_rate": 5.711574191366427e-07, "logits/chosen": -0.4889651834964752, "logits/rejected": -0.44250980019569397, "logps/chosen": -608.6267700195312, "logps/rejected": -910.5017700195312, "loss": 0.4381, "rewards/accuracies": 0.6875, "rewards/chosen": -4.152202606201172, "rewards/margins": 0.7526392936706543, "rewards/rejected": -4.904841423034668, "step": 119 }, { "epoch": 0.5129575207053166, "grad_norm": 10.741928061872432, "learning_rate": 5.637103883409525e-07, "logits/chosen": -0.5629594922065735, "logits/rejected": -0.6181632876396179, "logps/chosen": -604.459228515625, "logps/rejected": -852.540283203125, "loss": 0.3589, "rewards/accuracies": 0.90625, "rewards/chosen": -4.461546421051025, "rewards/margins": 2.3289871215820312, "rewards/rejected": -6.790533542633057, "step": 120 }, { "epoch": 0.5172321667111942, "grad_norm": 13.589230759795878, "learning_rate": 5.562489626255103e-07, "logits/chosen": -0.6361875534057617, "logits/rejected": -0.6799750924110413, "logps/chosen": -612.7998657226562, "logps/rejected": -803.4393920898438, "loss": 0.3612, "rewards/accuracies": 0.8125, "rewards/chosen": -4.2646894454956055, "rewards/margins": 1.7221603393554688, "rewards/rejected": -5.986849784851074, "step": 121 }, { "epoch": 0.5215068127170719, "grad_norm": 13.130136917819023, "learning_rate": 5.48774827847634e-07, "logits/chosen": -0.6019195914268494, "logits/rejected": -0.6733092665672302, "logps/chosen": -578.5673828125, "logps/rejected": -739.2437133789062, "loss": 0.3972, "rewards/accuracies": 0.84375, "rewards/chosen": -4.263480186462402, "rewards/margins": 1.4445068836212158, "rewards/rejected": -5.707987308502197, "step": 122 }, { "epoch": 0.5257814587229495, "grad_norm": 12.568726768016017, "learning_rate": 5.412896727361662e-07, "logits/chosen": -0.5387797951698303, "logits/rejected": -0.6281207799911499, "logps/chosen": -604.70703125, "logps/rejected": -767.2006225585938, "loss": 0.3866, "rewards/accuracies": 0.875, "rewards/chosen": -4.197920322418213, "rewards/margins": 1.5229953527450562, "rewards/rejected": -5.720915794372559, "step": 123 }, { "epoch": 0.5300561047288271, "grad_norm": 12.66743855092958, "learning_rate": 5.337951885099166e-07, "logits/chosen": -0.7120057940483093, "logits/rejected": -0.6868148446083069, "logps/chosen": -564.8189086914062, "logps/rejected": -678.998779296875, "loss": 0.4235, "rewards/accuracies": 0.75, "rewards/chosen": -3.940401077270508, "rewards/margins": 1.179326057434082, "rewards/rejected": -5.11972713470459, "step": 124 }, { "epoch": 0.5343307507347048, "grad_norm": 14.46489476063489, "learning_rate": 5.262930684955438e-07, "logits/chosen": -0.7230139970779419, "logits/rejected": -0.7383438348770142, "logps/chosen": -680.5816040039062, "logps/rejected": -828.6448974609375, "loss": 0.4321, "rewards/accuracies": 0.875, "rewards/chosen": -4.831777572631836, "rewards/margins": 1.4383575916290283, "rewards/rejected": -6.270134925842285, "step": 125 }, { "epoch": 0.5386053967405824, "grad_norm": 13.241409444585061, "learning_rate": 5.187850077449603e-07, "logits/chosen": -0.49940329790115356, "logits/rejected": -0.5305464267730713, "logps/chosen": -678.576416015625, "logps/rejected": -828.8834838867188, "loss": 0.3599, "rewards/accuracies": 0.8125, "rewards/chosen": -4.9777750968933105, "rewards/margins": 1.5080969333648682, "rewards/rejected": -6.485872268676758, "step": 126 }, { "epoch": 0.5428800427464601, "grad_norm": 15.600684361137235, "learning_rate": 5.11272702652346e-07, "logits/chosen": -0.766007125377655, "logits/rejected": -0.8170765042304993, "logps/chosen": -783.025146484375, "logps/rejected": -941.555908203125, "loss": 0.3807, "rewards/accuracies": 0.8125, "rewards/chosen": -5.462682247161865, "rewards/margins": 1.6811782121658325, "rewards/rejected": -7.143860816955566, "step": 127 }, { "epoch": 0.5471546887523377, "grad_norm": 12.46259382895567, "learning_rate": 5.03757850570861e-07, "logits/chosen": -0.6791242361068726, "logits/rejected": -0.6803139448165894, "logps/chosen": -693.43115234375, "logps/rejected": -792.7020263671875, "loss": 0.3952, "rewards/accuracies": 0.71875, "rewards/chosen": -4.814350128173828, "rewards/margins": 1.0507954359054565, "rewards/rejected": -5.865145206451416, "step": 128 }, { "epoch": 0.5514293347582153, "grad_norm": 14.68281824566638, "learning_rate": 4.962421494291391e-07, "logits/chosen": -0.6624226570129395, "logits/rejected": -0.8003214597702026, "logps/chosen": -641.5405883789062, "logps/rejected": -848.9283447265625, "loss": 0.3979, "rewards/accuracies": 0.90625, "rewards/chosen": -4.510129928588867, "rewards/margins": 1.8167277574539185, "rewards/rejected": -6.326857566833496, "step": 129 }, { "epoch": 0.555703980764093, "grad_norm": 14.156910665906969, "learning_rate": 4.88727297347654e-07, "logits/chosen": -0.6747015714645386, "logits/rejected": -0.6304070353507996, "logps/chosen": -673.6571655273438, "logps/rejected": -818.0390625, "loss": 0.3615, "rewards/accuracies": 0.8125, "rewards/chosen": -4.6810078620910645, "rewards/margins": 1.7004587650299072, "rewards/rejected": -6.381466388702393, "step": 130 }, { "epoch": 0.5599786267699706, "grad_norm": 13.513462308218608, "learning_rate": 4.812149922550397e-07, "logits/chosen": -0.5138005614280701, "logits/rejected": -0.5008392333984375, "logps/chosen": -603.177490234375, "logps/rejected": -718.6824951171875, "loss": 0.4195, "rewards/accuracies": 0.71875, "rewards/chosen": -4.324260234832764, "rewards/margins": 1.1721910238265991, "rewards/rejected": -5.496450901031494, "step": 131 }, { "epoch": 0.5642532727758482, "grad_norm": 13.859777475577925, "learning_rate": 4.7370693150445615e-07, "logits/chosen": -0.7230309247970581, "logits/rejected": -0.7601820826530457, "logps/chosen": -678.0418090820312, "logps/rejected": -836.7296142578125, "loss": 0.4123, "rewards/accuracies": 0.78125, "rewards/chosen": -4.642270565032959, "rewards/margins": 1.615240216255188, "rewards/rejected": -6.257511138916016, "step": 132 }, { "epoch": 0.5685279187817259, "grad_norm": 12.304755311149645, "learning_rate": 4.6620481149008364e-07, "logits/chosen": -0.5858466029167175, "logits/rejected": -0.5665376782417297, "logps/chosen": -551.0853271484375, "logps/rejected": -661.145263671875, "loss": 0.3952, "rewards/accuracies": 0.8125, "rewards/chosen": -4.0769124031066895, "rewards/margins": 1.150291919708252, "rewards/rejected": -5.227204322814941, "step": 133 }, { "epoch": 0.5728025647876035, "grad_norm": 13.318626812706627, "learning_rate": 4.5871032726383385e-07, "logits/chosen": -0.6011719703674316, "logits/rejected": -0.6539227962493896, "logps/chosen": -613.9602661132812, "logps/rejected": -801.2767333984375, "loss": 0.3028, "rewards/accuracies": 0.9375, "rewards/chosen": -4.4908127784729, "rewards/margins": 1.866058588027954, "rewards/rejected": -6.356871604919434, "step": 134 }, { "epoch": 0.5770772107934812, "grad_norm": 15.02191791227977, "learning_rate": 4.512251721523659e-07, "logits/chosen": -0.5807833671569824, "logits/rejected": -0.5801360607147217, "logps/chosen": -585.0089111328125, "logps/rejected": -690.991455078125, "loss": 0.4568, "rewards/accuracies": 0.65625, "rewards/chosen": -4.436345100402832, "rewards/margins": 0.9718096852302551, "rewards/rejected": -5.40815544128418, "step": 135 }, { "epoch": 0.5813518567993589, "grad_norm": 13.719743061532137, "learning_rate": 4.4375103737448967e-07, "logits/chosen": -0.6176421642303467, "logits/rejected": -0.5977914333343506, "logps/chosen": -647.6025390625, "logps/rejected": -780.4196166992188, "loss": 0.3489, "rewards/accuracies": 0.75, "rewards/chosen": -4.731601715087891, "rewards/margins": 1.3735716342926025, "rewards/rejected": -6.105173110961914, "step": 136 }, { "epoch": 0.5856265028052364, "grad_norm": 13.899011782478087, "learning_rate": 4.362896116590475e-07, "logits/chosen": -0.6031906604766846, "logits/rejected": -0.6842055320739746, "logps/chosen": -619.6199340820312, "logps/rejected": -823.1647338867188, "loss": 0.3829, "rewards/accuracies": 0.875, "rewards/chosen": -4.415548324584961, "rewards/margins": 1.8599358797073364, "rewards/rejected": -6.275484085083008, "step": 137 }, { "epoch": 0.5899011488111141, "grad_norm": 14.094976818965387, "learning_rate": 4.2884258086335745e-07, "logits/chosen": -0.5712490081787109, "logits/rejected": -0.6076186299324036, "logps/chosen": -635.4631958007812, "logps/rejected": -767.619873046875, "loss": 0.3952, "rewards/accuracies": 0.84375, "rewards/chosen": -4.872575759887695, "rewards/margins": 1.2816861867904663, "rewards/rejected": -6.154261589050293, "step": 138 }, { "epoch": 0.5941757948169917, "grad_norm": 15.923719497368527, "learning_rate": 4.2141162759230503e-07, "logits/chosen": -0.4579673409461975, "logits/rejected": -0.5088114738464355, "logps/chosen": -541.15185546875, "logps/rejected": -640.8451538085938, "loss": 0.3694, "rewards/accuracies": 0.6875, "rewards/chosen": -4.0685858726501465, "rewards/margins": 0.9016439914703369, "rewards/rejected": -4.970229625701904, "step": 139 }, { "epoch": 0.5984504408228694, "grad_norm": 15.626619862394913, "learning_rate": 4.139984308181708e-07, "logits/chosen": -0.6617997884750366, "logits/rejected": -0.6638819575309753, "logps/chosen": -747.4265747070312, "logps/rejected": -863.458251953125, "loss": 0.3971, "rewards/accuracies": 0.78125, "rewards/chosen": -5.498664379119873, "rewards/margins": 1.1953853368759155, "rewards/rejected": -6.694049835205078, "step": 140 }, { "epoch": 0.602725086828747, "grad_norm": 16.11770967148462, "learning_rate": 4.0660466550127853e-07, "logits/chosen": -0.7728097438812256, "logits/rejected": -0.8388174772262573, "logps/chosen": -708.0802001953125, "logps/rejected": -857.48486328125, "loss": 0.4029, "rewards/accuracies": 0.8125, "rewards/chosen": -5.071959972381592, "rewards/margins": 1.4134087562561035, "rewards/rejected": -6.485368251800537, "step": 141 }, { "epoch": 0.6069997328346246, "grad_norm": 13.358737574089647, "learning_rate": 3.9923200221154914e-07, "logits/chosen": -0.5902035236358643, "logits/rejected": -0.600926399230957, "logps/chosen": -655.1759033203125, "logps/rejected": -776.4323120117188, "loss": 0.4008, "rewards/accuracies": 0.75, "rewards/chosen": -4.78019905090332, "rewards/margins": 1.229543685913086, "rewards/rejected": -6.009742736816406, "step": 142 }, { "epoch": 0.6112743788405023, "grad_norm": 20.882100986314004, "learning_rate": 3.918821067510464e-07, "logits/chosen": -0.5608689188957214, "logits/rejected": -0.5520298480987549, "logps/chosen": -606.8939208984375, "logps/rejected": -732.2677001953125, "loss": 0.481, "rewards/accuracies": 0.8125, "rewards/chosen": -4.506969928741455, "rewards/margins": 1.263184666633606, "rewards/rejected": -5.77015495300293, "step": 143 }, { "epoch": 0.6155490248463799, "grad_norm": 15.142641307206523, "learning_rate": 3.845566397776021e-07, "logits/chosen": -0.5451078414916992, "logits/rejected": -0.5328483581542969, "logps/chosen": -578.9205932617188, "logps/rejected": -720.2644653320312, "loss": 0.3981, "rewards/accuracies": 0.78125, "rewards/chosen": -4.13190221786499, "rewards/margins": 1.3544728755950928, "rewards/rejected": -5.486375331878662, "step": 144 }, { "epoch": 0.6198236708522575, "grad_norm": 14.102739323418549, "learning_rate": 3.772572564296004e-07, "logits/chosen": -0.5815902948379517, "logits/rejected": -0.6612125635147095, "logps/chosen": -647.4329833984375, "logps/rejected": -800.5177001953125, "loss": 0.3771, "rewards/accuracies": 0.8125, "rewards/chosen": -4.570836067199707, "rewards/margins": 1.5209659337997437, "rewards/rejected": -6.091801643371582, "step": 145 }, { "epoch": 0.6240983168581352, "grad_norm": 16.783278936540377, "learning_rate": 3.699856059520118e-07, "logits/chosen": -0.5741180777549744, "logits/rejected": -0.6261047720909119, "logps/chosen": -518.5027465820312, "logps/rejected": -752.7945556640625, "loss": 0.339, "rewards/accuracies": 0.90625, "rewards/chosen": -3.5159010887145996, "rewards/margins": 2.189849615097046, "rewards/rejected": -5.705749988555908, "step": 146 }, { "epoch": 0.6283729628640128, "grad_norm": 16.241290219398206, "learning_rate": 3.627433313237576e-07, "logits/chosen": -0.6445101499557495, "logits/rejected": -0.6295093297958374, "logps/chosen": -611.40576171875, "logps/rejected": -746.1281127929688, "loss": 0.4584, "rewards/accuracies": 0.71875, "rewards/chosen": -4.344273090362549, "rewards/margins": 1.294838309288025, "rewards/rejected": -5.639111518859863, "step": 147 }, { "epoch": 0.6326476088698905, "grad_norm": 16.58217058287404, "learning_rate": 3.5553206888648885e-07, "logits/chosen": -0.5924898386001587, "logits/rejected": -0.6705700755119324, "logps/chosen": -561.9385986328125, "logps/rejected": -790.1613159179688, "loss": 0.3589, "rewards/accuracies": 0.875, "rewards/chosen": -3.8234567642211914, "rewards/margins": 2.038635015487671, "rewards/rejected": -5.862092018127441, "step": 148 }, { "epoch": 0.6369222548757681, "grad_norm": 13.631809000580427, "learning_rate": 3.483534479748688e-07, "logits/chosen": -0.6043068170547485, "logits/rejected": -0.6237097978591919, "logps/chosen": -599.3536376953125, "logps/rejected": -737.3873291015625, "loss": 0.3333, "rewards/accuracies": 0.78125, "rewards/chosen": -4.288971424102783, "rewards/margins": 1.361509919166565, "rewards/rejected": -5.650481224060059, "step": 149 }, { "epoch": 0.6411969008816457, "grad_norm": 13.48441747872976, "learning_rate": 3.412090905484337e-07, "logits/chosen": -0.5789849758148193, "logits/rejected": -0.5934211015701294, "logps/chosen": -637.4574584960938, "logps/rejected": -799.4398193359375, "loss": 0.3633, "rewards/accuracies": 0.875, "rewards/chosen": -4.6277079582214355, "rewards/margins": 1.6034901142120361, "rewards/rejected": -6.231198310852051, "step": 150 }, { "epoch": 0.6454715468875234, "grad_norm": 13.54705279065067, "learning_rate": 3.3410061082512417e-07, "logits/chosen": -0.7143419981002808, "logits/rejected": -0.7396361231803894, "logps/chosen": -644.5235595703125, "logps/rejected": -819.0350341796875, "loss": 0.3787, "rewards/accuracies": 0.84375, "rewards/chosen": -4.598511695861816, "rewards/margins": 1.7460615634918213, "rewards/rejected": -6.344573020935059, "step": 151 }, { "epoch": 0.649746192893401, "grad_norm": 13.97282414379501, "learning_rate": 3.270296149165619e-07, "logits/chosen": -0.7898523807525635, "logits/rejected": -0.78404700756073, "logps/chosen": -746.5989990234375, "logps/rejected": -925.236328125, "loss": 0.3681, "rewards/accuracies": 0.6875, "rewards/chosen": -5.577144145965576, "rewards/margins": 1.7709450721740723, "rewards/rejected": -7.348089218139648, "step": 152 }, { "epoch": 0.6540208388992786, "grad_norm": 15.30647255299837, "learning_rate": 3.1999770046516194e-07, "logits/chosen": -0.6549022197723389, "logits/rejected": -0.6652963161468506, "logps/chosen": -734.189453125, "logps/rejected": -882.783203125, "loss": 0.397, "rewards/accuracies": 0.84375, "rewards/chosen": -5.586746692657471, "rewards/margins": 1.5474714040756226, "rewards/rejected": -7.134217739105225, "step": 153 }, { "epoch": 0.6582954849051563, "grad_norm": 15.480021561153041, "learning_rate": 3.1300645628315526e-07, "logits/chosen": -0.6595125794410706, "logits/rejected": -0.684340238571167, "logps/chosen": -692.1060180664062, "logps/rejected": -861.9107055664062, "loss": 0.3519, "rewards/accuracies": 0.90625, "rewards/chosen": -5.179555416107178, "rewards/margins": 1.7508639097213745, "rewards/rejected": -6.930419445037842, "step": 154 }, { "epoch": 0.6625701309110339, "grad_norm": 14.370880370113698, "learning_rate": 3.060574619936075e-07, "logits/chosen": -0.6609420776367188, "logits/rejected": -0.6882165670394897, "logps/chosen": -755.37109375, "logps/rejected": -923.8385620117188, "loss": 0.3964, "rewards/accuracies": 0.84375, "rewards/chosen": -5.7476301193237305, "rewards/margins": 1.664482831954956, "rewards/rejected": -7.412113189697266, "step": 155 }, { "epoch": 0.6668447769169116, "grad_norm": 15.740330800217551, "learning_rate": 2.9915228767351535e-07, "logits/chosen": -0.6638086438179016, "logits/rejected": -0.6543954014778137, "logps/chosen": -691.0468139648438, "logps/rejected": -834.0437622070312, "loss": 0.3507, "rewards/accuracies": 0.8125, "rewards/chosen": -4.920919895172119, "rewards/margins": 1.5358891487121582, "rewards/rejected": -6.4568095207214355, "step": 156 }, { "epoch": 0.6711194229227893, "grad_norm": 16.901891917408413, "learning_rate": 2.922924934990568e-07, "logits/chosen": -0.7027104496955872, "logits/rejected": -0.7690137624740601, "logps/chosen": -740.62353515625, "logps/rejected": -911.9879150390625, "loss": 0.383, "rewards/accuracies": 0.84375, "rewards/chosen": -5.2726335525512695, "rewards/margins": 1.703195571899414, "rewards/rejected": -6.975828647613525, "step": 157 }, { "epoch": 0.6753940689286668, "grad_norm": 14.502221464305004, "learning_rate": 2.8547962939308186e-07, "logits/chosen": -0.7240225672721863, "logits/rejected": -0.743172287940979, "logps/chosen": -638.63720703125, "logps/rejected": -785.5480346679688, "loss": 0.3907, "rewards/accuracies": 0.90625, "rewards/chosen": -4.341987133026123, "rewards/margins": 1.3872699737548828, "rewards/rejected": -5.729257583618164, "step": 158 }, { "epoch": 0.6796687149345445, "grad_norm": 13.076811073919702, "learning_rate": 2.7871523467491725e-07, "logits/chosen": -0.5847674608230591, "logits/rejected": -0.6150667667388916, "logps/chosen": -558.647216796875, "logps/rejected": -742.8478393554688, "loss": 0.3777, "rewards/accuracies": 0.9375, "rewards/chosen": -3.8698740005493164, "rewards/margins": 1.8015196323394775, "rewards/rejected": -5.671393394470215, "step": 159 }, { "epoch": 0.6839433609404221, "grad_norm": 13.435743880369799, "learning_rate": 2.720008377125682e-07, "logits/chosen": -0.7234424352645874, "logits/rejected": -0.7753596305847168, "logps/chosen": -599.20947265625, "logps/rejected": -840.508544921875, "loss": 0.3559, "rewards/accuracies": 0.875, "rewards/chosen": -4.066052436828613, "rewards/margins": 2.2670648097991943, "rewards/rejected": -6.3331170082092285, "step": 160 }, { "epoch": 0.6882180069462998, "grad_norm": 14.725846441531868, "learning_rate": 2.6533795557739405e-07, "logits/chosen": -0.5986216068267822, "logits/rejected": -0.6119877099990845, "logps/chosen": -587.1053466796875, "logps/rejected": -751.2557983398438, "loss": 0.3576, "rewards/accuracies": 0.875, "rewards/chosen": -3.9571621417999268, "rewards/margins": 1.737083077430725, "rewards/rejected": -5.694245338439941, "step": 161 }, { "epoch": 0.6924926529521774, "grad_norm": 18.854853550251143, "learning_rate": 2.5872809370133704e-07, "logits/chosen": -0.7047430872917175, "logits/rejected": -0.7392921447753906, "logps/chosen": -541.9876098632812, "logps/rejected": -679.9630737304688, "loss": 0.3713, "rewards/accuracies": 0.8125, "rewards/chosen": -3.7241249084472656, "rewards/margins": 1.3988580703735352, "rewards/rejected": -5.122982978820801, "step": 162 }, { "epoch": 0.696767298958055, "grad_norm": 15.327216222305758, "learning_rate": 2.521727455367797e-07, "logits/chosen": -0.4683057963848114, "logits/rejected": -0.4958358705043793, "logps/chosen": -477.89453125, "logps/rejected": -642.5132446289062, "loss": 0.3217, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4097371101379395, "rewards/margins": 1.6171811819076538, "rewards/rejected": -5.026918411254883, "step": 163 }, { "epoch": 0.7010419449639327, "grad_norm": 15.107047920787036, "learning_rate": 2.456733922191108e-07, "logits/chosen": -0.6403992176055908, "logits/rejected": -0.7081367373466492, "logps/chosen": -535.940673828125, "logps/rejected": -721.487060546875, "loss": 0.3835, "rewards/accuracies": 0.875, "rewards/chosen": -3.6437511444091797, "rewards/margins": 1.7465698719024658, "rewards/rejected": -5.390320777893066, "step": 164 }, { "epoch": 0.7053165909698104, "grad_norm": 14.789037132111227, "learning_rate": 2.3923150223207173e-07, "logits/chosen": -0.6419979333877563, "logits/rejected": -0.6663538217544556, "logps/chosen": -611.316162109375, "logps/rejected": -779.5244140625, "loss": 0.357, "rewards/accuracies": 0.875, "rewards/chosen": -3.9267449378967285, "rewards/margins": 1.7568156719207764, "rewards/rejected": -5.683561325073242, "step": 165 }, { "epoch": 0.7095912369756879, "grad_norm": 11.678139693168967, "learning_rate": 2.3284853107596347e-07, "logits/chosen": -0.643075704574585, "logits/rejected": -0.677239179611206, "logps/chosen": -605.615478515625, "logps/rejected": -788.3361206054688, "loss": 0.3161, "rewards/accuracies": 0.84375, "rewards/chosen": -4.1840410232543945, "rewards/margins": 1.828648567199707, "rewards/rejected": -6.012689590454102, "step": 166 }, { "epoch": 0.7138658829815656, "grad_norm": 14.838448866889486, "learning_rate": 2.2652592093878665e-07, "logits/chosen": -0.5908488631248474, "logits/rejected": -0.6062439680099487, "logps/chosen": -623.344482421875, "logps/rejected": -775.658447265625, "loss": 0.3693, "rewards/accuracies": 0.8125, "rewards/chosen": -4.541435241699219, "rewards/margins": 1.4988759756088257, "rewards/rejected": -6.040311336517334, "step": 167 }, { "epoch": 0.7181405289874432, "grad_norm": 13.768660505175903, "learning_rate": 2.202651003703885e-07, "logits/chosen": -0.5698331594467163, "logits/rejected": -0.5812557339668274, "logps/chosen": -609.9556274414062, "logps/rejected": -810.02587890625, "loss": 0.3727, "rewards/accuracies": 0.875, "rewards/chosen": -4.267837047576904, "rewards/margins": 1.995699167251587, "rewards/rejected": -6.2635369300842285, "step": 168 }, { "epoch": 0.7224151749933209, "grad_norm": 12.956408292396839, "learning_rate": 2.1406748395969305e-07, "logits/chosen": -0.6224421858787537, "logits/rejected": -0.6563930511474609, "logps/chosen": -620.7723388671875, "logps/rejected": -781.536865234375, "loss": 0.3362, "rewards/accuracies": 0.875, "rewards/chosen": -4.473085403442383, "rewards/margins": 1.6361382007598877, "rewards/rejected": -6.109223365783691, "step": 169 }, { "epoch": 0.7266898209991985, "grad_norm": 15.331044331855917, "learning_rate": 2.0793447201508286e-07, "logits/chosen": -0.6418094635009766, "logits/rejected": -0.6333540678024292, "logps/chosen": -693.9434204101562, "logps/rejected": -780.4866333007812, "loss": 0.3273, "rewards/accuracies": 0.6875, "rewards/chosen": -5.1741743087768555, "rewards/margins": 0.9175342321395874, "rewards/rejected": -6.091708660125732, "step": 170 }, { "epoch": 0.7309644670050761, "grad_norm": 14.142847544336131, "learning_rate": 2.01867450248011e-07, "logits/chosen": -0.6202086210250854, "logits/rejected": -0.6713452935218811, "logps/chosen": -733.3516235351562, "logps/rejected": -912.0302734375, "loss": 0.3551, "rewards/accuracies": 0.75, "rewards/chosen": -5.687126636505127, "rewards/margins": 1.6365104913711548, "rewards/rejected": -7.32363748550415, "step": 171 }, { "epoch": 0.7352391130109538, "grad_norm": 14.223351065221193, "learning_rate": 1.9586778945990783e-07, "logits/chosen": -0.5605691075325012, "logits/rejected": -0.6431994438171387, "logps/chosen": -720.2319946289062, "logps/rejected": -907.8939819335938, "loss": 0.3518, "rewards/accuracies": 0.84375, "rewards/chosen": -5.346857070922852, "rewards/margins": 1.8520584106445312, "rewards/rejected": -7.198915481567383, "step": 172 }, { "epoch": 0.7395137590168315, "grad_norm": 14.283973424406769, "learning_rate": 1.899368452324584e-07, "logits/chosen": -0.8039106130599976, "logits/rejected": -0.8154680728912354, "logps/chosen": -702.9293823242188, "logps/rejected": -874.7135009765625, "loss": 0.369, "rewards/accuracies": 0.71875, "rewards/chosen": -5.166163444519043, "rewards/margins": 1.7623482942581177, "rewards/rejected": -6.928511142730713, "step": 173 }, { "epoch": 0.743788405022709, "grad_norm": 13.648679506818656, "learning_rate": 1.840759576213181e-07, "logits/chosen": -0.5466803908348083, "logits/rejected": -0.6139577031135559, "logps/chosen": -643.9953002929688, "logps/rejected": -851.5556640625, "loss": 0.3272, "rewards/accuracies": 0.96875, "rewards/chosen": -4.4559197425842285, "rewards/margins": 2.084817886352539, "rewards/rejected": -6.540737152099609, "step": 174 }, { "epoch": 0.7480630510285867, "grad_norm": 16.8663479372205, "learning_rate": 1.7828645085333644e-07, "logits/chosen": -0.6725043654441833, "logits/rejected": -0.7195257544517517, "logps/chosen": -706.1845703125, "logps/rejected": -901.8247680664062, "loss": 0.3902, "rewards/accuracies": 0.84375, "rewards/chosen": -5.220101356506348, "rewards/margins": 1.968306541442871, "rewards/rejected": -7.188408374786377, "step": 175 }, { "epoch": 0.7523376970344643, "grad_norm": 13.612220492425323, "learning_rate": 1.725696330273575e-07, "logits/chosen": -0.7102064490318298, "logits/rejected": -0.7330564260482788, "logps/chosen": -644.0183715820312, "logps/rejected": -833.1116943359375, "loss": 0.3007, "rewards/accuracies": 0.9375, "rewards/chosen": -4.664500713348389, "rewards/margins": 1.9048974514007568, "rewards/rejected": -6.569398403167725, "step": 176 }, { "epoch": 0.756612343040342, "grad_norm": 13.592118995239437, "learning_rate": 1.6692679581866332e-07, "logits/chosen": -0.5269302725791931, "logits/rejected": -0.562321662902832, "logps/chosen": -647.377685546875, "logps/rejected": -876.7937622070312, "loss": 0.3336, "rewards/accuracies": 0.90625, "rewards/chosen": -4.665207862854004, "rewards/margins": 2.2923545837402344, "rewards/rejected": -6.9575629234313965, "step": 177 }, { "epoch": 0.7608869890462197, "grad_norm": 13.100123198154126, "learning_rate": 1.6135921418712955e-07, "logits/chosen": -0.6257606744766235, "logits/rejected": -0.6605125069618225, "logps/chosen": -589.7310791015625, "logps/rejected": -762.8815307617188, "loss": 0.3326, "rewards/accuracies": 0.90625, "rewards/chosen": -4.342212200164795, "rewards/margins": 1.6928372383117676, "rewards/rejected": -6.035048961639404, "step": 178 }, { "epoch": 0.7651616350520972, "grad_norm": 24.95386459216583, "learning_rate": 1.558681460891567e-07, "logits/chosen": -0.6882709860801697, "logits/rejected": -0.7164211273193359, "logps/chosen": -729.4912719726562, "logps/rejected": -960.5554809570312, "loss": 0.3596, "rewards/accuracies": 0.90625, "rewards/chosen": -5.27896785736084, "rewards/margins": 2.3151795864105225, "rewards/rejected": -7.594147682189941, "step": 179 }, { "epoch": 0.7694362810579749, "grad_norm": 14.508183670705439, "learning_rate": 1.5045483219344385e-07, "logits/chosen": -0.4706317186355591, "logits/rejected": -0.4956177771091461, "logps/chosen": -686.981201171875, "logps/rejected": -892.425537109375, "loss": 0.3878, "rewards/accuracies": 0.875, "rewards/chosen": -5.139954090118408, "rewards/margins": 1.9214184284210205, "rewards/rejected": -7.061371803283691, "step": 180 }, { "epoch": 0.7737109270638525, "grad_norm": 13.601053697523751, "learning_rate": 1.4512049560066835e-07, "logits/chosen": -0.5556597113609314, "logits/rejected": -0.6318129301071167, "logps/chosen": -582.2374877929688, "logps/rejected": -793.1754760742188, "loss": 0.3245, "rewards/accuracies": 0.9375, "rewards/chosen": -4.254069805145264, "rewards/margins": 2.0026259422302246, "rewards/rejected": -6.256695747375488, "step": 181 }, { "epoch": 0.7779855730697302, "grad_norm": 18.93784031657066, "learning_rate": 1.3986634156713417e-07, "logits/chosen": -0.5807328224182129, "logits/rejected": -0.5342915654182434, "logps/chosen": -602.649658203125, "logps/rejected": -747.00732421875, "loss": 0.3884, "rewards/accuracies": 0.90625, "rewards/chosen": -4.532341957092285, "rewards/margins": 1.6315253973007202, "rewards/rejected": -6.163866996765137, "step": 182 }, { "epoch": 0.7822602190756078, "grad_norm": 16.431500936058505, "learning_rate": 1.34693557232453e-07, "logits/chosen": -0.6352940797805786, "logits/rejected": -0.6816412210464478, "logps/chosen": -679.6211547851562, "logps/rejected": -891.3534545898438, "loss": 0.3745, "rewards/accuracies": 0.84375, "rewards/chosen": -4.584761619567871, "rewards/margins": 2.096449375152588, "rewards/rejected": -6.681210994720459, "step": 183 }, { "epoch": 0.7865348650814854, "grad_norm": 17.829780427983952, "learning_rate": 1.2960331135131823e-07, "logits/chosen": -0.6611433029174805, "logits/rejected": -0.6609802842140198, "logps/chosen": -662.570068359375, "logps/rejected": -845.5758666992188, "loss": 0.3201, "rewards/accuracies": 0.78125, "rewards/chosen": -4.930952072143555, "rewards/margins": 1.9189307689666748, "rewards/rejected": -6.849882125854492, "step": 184 }, { "epoch": 0.7908095110873631, "grad_norm": 14.013649032140211, "learning_rate": 1.2459675402943288e-07, "logits/chosen": -0.5998414754867554, "logits/rejected": -0.6097269058227539, "logps/chosen": -691.530517578125, "logps/rejected": -900.010986328125, "loss": 0.2947, "rewards/accuracies": 0.875, "rewards/chosen": -5.21755838394165, "rewards/margins": 2.078969955444336, "rewards/rejected": -7.296527862548828, "step": 185 }, { "epoch": 0.7950841570932408, "grad_norm": 13.880919593847631, "learning_rate": 1.1967501646365146e-07, "logits/chosen": -0.6878648996353149, "logits/rejected": -0.7795136570930481, "logps/chosen": -629.9570922851562, "logps/rejected": -834.251708984375, "loss": 0.3317, "rewards/accuracies": 0.78125, "rewards/chosen": -4.382741451263428, "rewards/margins": 2.003836154937744, "rewards/rejected": -6.386577606201172, "step": 186 }, { "epoch": 0.7993588030991183, "grad_norm": 15.353298997567329, "learning_rate": 1.1483921068639351e-07, "logits/chosen": -0.6340612173080444, "logits/rejected": -0.6507092714309692, "logps/chosen": -723.2861938476562, "logps/rejected": -939.2916259765625, "loss": 0.3424, "rewards/accuracies": 0.8125, "rewards/chosen": -5.30019474029541, "rewards/margins": 2.066894769668579, "rewards/rejected": -7.36708927154541, "step": 187 }, { "epoch": 0.803633449104996, "grad_norm": 14.566838990350824, "learning_rate": 1.1009042931438783e-07, "logits/chosen": -0.6575983762741089, "logits/rejected": -0.699353814125061, "logps/chosen": -693.1535034179688, "logps/rejected": -887.3889770507812, "loss": 0.3477, "rewards/accuracies": 0.84375, "rewards/chosen": -5.220640182495117, "rewards/margins": 1.8395450115203857, "rewards/rejected": -7.060185432434082, "step": 188 }, { "epoch": 0.8079080951108736, "grad_norm": 17.580820182141434, "learning_rate": 1.0542974530180327e-07, "logits/chosen": -0.6650811433792114, "logits/rejected": -0.7292627692222595, "logps/chosen": -640.0955200195312, "logps/rejected": -821.3978881835938, "loss": 0.3859, "rewards/accuracies": 0.8125, "rewards/chosen": -4.4905171394348145, "rewards/margins": 1.7442741394042969, "rewards/rejected": -6.234791278839111, "step": 189 }, { "epoch": 0.8121827411167513, "grad_norm": 14.138278292631155, "learning_rate": 1.0085821169782199e-07, "logits/chosen": -0.6633419394493103, "logits/rejected": -0.7350410223007202, "logps/chosen": -550.8514404296875, "logps/rejected": -755.83056640625, "loss": 0.3497, "rewards/accuracies": 0.96875, "rewards/chosen": -3.9601831436157227, "rewards/margins": 1.8831403255462646, "rewards/rejected": -5.843323707580566, "step": 190 }, { "epoch": 0.8164573871226289, "grad_norm": 15.051759686903543, "learning_rate": 9.637686140871121e-08, "logits/chosen": -0.5633993148803711, "logits/rejected": -0.5642579793930054, "logps/chosen": -751.5147705078125, "logps/rejected": -917.2871704101562, "loss": 0.3641, "rewards/accuracies": 0.8125, "rewards/chosen": -5.359073162078857, "rewards/margins": 1.6620866060256958, "rewards/rejected": -7.021159648895264, "step": 191 }, { "epoch": 0.8207320331285065, "grad_norm": 16.652659340140023, "learning_rate": 9.198670696444338e-08, "logits/chosen": -0.6166589260101318, "logits/rejected": -0.6604666709899902, "logps/chosen": -641.2590942382812, "logps/rejected": -832.0524291992188, "loss": 0.3808, "rewards/accuracies": 0.75, "rewards/chosen": -4.821630954742432, "rewards/margins": 1.7895194292068481, "rewards/rejected": -6.611149787902832, "step": 192 }, { "epoch": 0.8250066791343842, "grad_norm": 14.1280180622041, "learning_rate": 8.768874028992429e-08, "logits/chosen": -0.6036140322685242, "logits/rejected": -0.6334025859832764, "logps/chosen": -613.3704223632812, "logps/rejected": -792.6769409179688, "loss": 0.3289, "rewards/accuracies": 0.90625, "rewards/chosen": -4.494506359100342, "rewards/margins": 1.7201225757598877, "rewards/rejected": -6.21462869644165, "step": 193 }, { "epoch": 0.8292813251402619, "grad_norm": 13.810909832189019, "learning_rate": 8.348393248087287e-08, "logits/chosen": -0.5372692346572876, "logits/rejected": -0.5193148255348206, "logps/chosen": -559.5550537109375, "logps/rejected": -737.1176147460938, "loss": 0.3486, "rewards/accuracies": 0.78125, "rewards/chosen": -4.210740089416504, "rewards/margins": 1.7630614042282104, "rewards/rejected": -5.973801612854004, "step": 194 }, { "epoch": 0.8335559711461394, "grad_norm": 13.364037119659574, "learning_rate": 7.937323358440934e-08, "logits/chosen": -0.7003932595252991, "logits/rejected": -0.6555891633033752, "logps/chosen": -687.826171875, "logps/rejected": -845.4681396484375, "loss": 0.3359, "rewards/accuracies": 0.8125, "rewards/chosen": -4.774288654327393, "rewards/margins": 1.7430295944213867, "rewards/rejected": -6.517318248748779, "step": 195 }, { "epoch": 0.8378306171520171, "grad_norm": 14.21133162771402, "learning_rate": 7.535757238439938e-08, "logits/chosen": -0.7010968923568726, "logits/rejected": -0.7519139647483826, "logps/chosen": -618.968017578125, "logps/rejected": -890.8580322265625, "loss": 0.2995, "rewards/accuracies": 0.8125, "rewards/chosen": -4.3997673988342285, "rewards/margins": 2.6948697566986084, "rewards/rejected": -7.094637393951416, "step": 196 }, { "epoch": 0.8421052631578947, "grad_norm": 14.29704216155569, "learning_rate": 7.143785619160026e-08, "logits/chosen": -0.8667165637016296, "logits/rejected": -0.9397881031036377, "logps/chosen": -635.5562133789062, "logps/rejected": -869.5856323242188, "loss": 0.2775, "rewards/accuracies": 1.0, "rewards/chosen": -4.424742221832275, "rewards/margins": 2.1135940551757812, "rewards/rejected": -6.538336277008057, "step": 197 }, { "epoch": 0.8463799091637724, "grad_norm": 15.830532425162781, "learning_rate": 6.761497063866206e-08, "logits/chosen": -0.715203583240509, "logits/rejected": -0.7201322317123413, "logps/chosen": -675.7356567382812, "logps/rejected": -823.666748046875, "loss": 0.3961, "rewards/accuracies": 0.84375, "rewards/chosen": -4.689385890960693, "rewards/margins": 1.436130404472351, "rewards/rejected": -6.125515937805176, "step": 198 }, { "epoch": 0.85065455516965, "grad_norm": 12.71855284005859, "learning_rate": 6.388977948002406e-08, "logits/chosen": -0.6863987445831299, "logits/rejected": -0.7026057839393616, "logps/chosen": -635.3793334960938, "logps/rejected": -810.3753051757812, "loss": 0.3249, "rewards/accuracies": 0.875, "rewards/chosen": -4.71897554397583, "rewards/margins": 1.7087393999099731, "rewards/rejected": -6.427714824676514, "step": 199 }, { "epoch": 0.8549292011755276, "grad_norm": 14.663842598726813, "learning_rate": 6.026312439675551e-08, "logits/chosen": -0.6016858220100403, "logits/rejected": -0.6423814296722412, "logps/chosen": -533.045654296875, "logps/rejected": -688.97705078125, "loss": 0.343, "rewards/accuracies": 0.90625, "rewards/chosen": -3.7861201763153076, "rewards/margins": 1.5125634670257568, "rewards/rejected": -5.298683166503906, "step": 200 }, { "epoch": 0.8549292011755276, "eval_logits/chosen": -0.5621978044509888, "eval_logits/rejected": -0.5776455998420715, "eval_logps/chosen": -698.124755859375, "eval_logps/rejected": -878.510498046875, "eval_loss": 0.3298446834087372, "eval_rewards/accuracies": 0.8951612710952759, "eval_rewards/chosen": -4.931028842926025, "eval_rewards/margins": 1.8655366897583008, "eval_rewards/rejected": -6.796565055847168, "eval_runtime": 148.1942, "eval_samples_per_second": 13.233, "eval_steps_per_second": 0.418, "step": 200 }, { "epoch": 0.8592038471814053, "grad_norm": 13.178173716498932, "learning_rate": 5.6735824806383945e-08, "logits/chosen": -0.8137510418891907, "logits/rejected": -0.8618326783180237, "logps/chosen": -762.1471557617188, "logps/rejected": -987.831787109375, "loss": 0.3117, "rewards/accuracies": 0.875, "rewards/chosen": -5.3300957679748535, "rewards/margins": 2.290207862854004, "rewards/rejected": -7.620304107666016, "step": 201 }, { "epoch": 0.863478493187283, "grad_norm": 15.015075721065077, "learning_rate": 5.3308677677753324e-08, "logits/chosen": -0.6007865071296692, "logits/rejected": -0.6010035872459412, "logps/chosen": -607.5897827148438, "logps/rejected": -783.7905883789062, "loss": 0.3751, "rewards/accuracies": 0.8125, "rewards/chosen": -4.329671382904053, "rewards/margins": 1.7175862789154053, "rewards/rejected": -6.047257423400879, "step": 202 }, { "epoch": 0.8677531391931605, "grad_norm": 17.440815026489904, "learning_rate": 4.9982457350954576e-08, "logits/chosen": -0.6124709844589233, "logits/rejected": -0.5998551249504089, "logps/chosen": -768.2071533203125, "logps/rejected": -909.81982421875, "loss": 0.337, "rewards/accuracies": 0.75, "rewards/chosen": -5.643802642822266, "rewards/margins": 1.5375871658325195, "rewards/rejected": -7.181390285491943, "step": 203 }, { "epoch": 0.8720277851990382, "grad_norm": 13.117492290397454, "learning_rate": 4.675791536236856e-08, "logits/chosen": -0.6630779504776001, "logits/rejected": -0.7016023397445679, "logps/chosen": -576.6640014648438, "logps/rejected": -781.410888671875, "loss": 0.3565, "rewards/accuracies": 0.90625, "rewards/chosen": -4.391974449157715, "rewards/margins": 2.0928616523742676, "rewards/rejected": -6.484836101531982, "step": 204 }, { "epoch": 0.8763024312049158, "grad_norm": 16.38485979196371, "learning_rate": 4.3635780274861864e-08, "logits/chosen": -0.6497581005096436, "logits/rejected": -0.7152493596076965, "logps/chosen": -604.6185302734375, "logps/rejected": -769.5409545898438, "loss": 0.3998, "rewards/accuracies": 0.84375, "rewards/chosen": -4.433528900146484, "rewards/margins": 1.5042006969451904, "rewards/rejected": -5.937729835510254, "step": 205 }, { "epoch": 0.8805770772107935, "grad_norm": 19.27882004478496, "learning_rate": 4.0616757513173115e-08, "logits/chosen": -0.5923482775688171, "logits/rejected": -0.6674529910087585, "logps/chosen": -730.7793579101562, "logps/rejected": -982.1051025390625, "loss": 0.3504, "rewards/accuracies": 0.84375, "rewards/chosen": -4.950905799865723, "rewards/margins": 2.3350894451141357, "rewards/rejected": -7.2859954833984375, "step": 206 }, { "epoch": 0.8848517232166712, "grad_norm": 12.70289041030948, "learning_rate": 3.7701529204526846e-08, "logits/chosen": -0.5751956105232239, "logits/rejected": -0.6102803349494934, "logps/chosen": -638.7919311523438, "logps/rejected": -788.8345336914062, "loss": 0.3242, "rewards/accuracies": 0.84375, "rewards/chosen": -4.849457740783691, "rewards/margins": 1.458791971206665, "rewards/rejected": -6.308249473571777, "step": 207 }, { "epoch": 0.8891263692225487, "grad_norm": 16.266039867395605, "learning_rate": 3.4890754024512246e-08, "logits/chosen": -0.6625305414199829, "logits/rejected": -0.6873234510421753, "logps/chosen": -717.0994873046875, "logps/rejected": -884.3046875, "loss": 0.3447, "rewards/accuracies": 0.75, "rewards/chosen": -5.226888179779053, "rewards/margins": 1.590306043624878, "rewards/rejected": -6.81719446182251, "step": 208 }, { "epoch": 0.8934010152284264, "grad_norm": 14.509559337608318, "learning_rate": 3.218506704825924e-08, "logits/chosen": -0.5810579061508179, "logits/rejected": -0.6108168363571167, "logps/chosen": -675.04150390625, "logps/rejected": -836.3502197265625, "loss": 0.3439, "rewards/accuracies": 0.78125, "rewards/chosen": -5.011352062225342, "rewards/margins": 1.6481198072433472, "rewards/rejected": -6.6594719886779785, "step": 209 }, { "epoch": 0.897675661234304, "grad_norm": 17.634651177078286, "learning_rate": 2.958507960694784e-08, "logits/chosen": -0.5604880452156067, "logits/rejected": -0.5632505416870117, "logps/chosen": -718.5660400390625, "logps/rejected": -901.1178588867188, "loss": 0.3646, "rewards/accuracies": 0.875, "rewards/chosen": -5.241810321807861, "rewards/margins": 1.8300410509109497, "rewards/rejected": -7.071850776672363, "step": 210 }, { "epoch": 0.9019503072401817, "grad_norm": 14.379481683528487, "learning_rate": 2.7091379149682682e-08, "logits/chosen": -0.7952392101287842, "logits/rejected": -0.8281516432762146, "logps/chosen": -715.3723754882812, "logps/rejected": -877.9434814453125, "loss": 0.3452, "rewards/accuracies": 0.78125, "rewards/chosen": -5.230836868286133, "rewards/margins": 1.5704870223999023, "rewards/rejected": -6.801323413848877, "step": 211 }, { "epoch": 0.9062249532460593, "grad_norm": 12.38379892339324, "learning_rate": 2.470452911076226e-08, "logits/chosen": -0.6212865114212036, "logits/rejected": -0.7052218317985535, "logps/chosen": -568.6085205078125, "logps/rejected": -806.6718139648438, "loss": 0.3225, "rewards/accuracies": 0.96875, "rewards/chosen": -4.2210798263549805, "rewards/margins": 2.2609493732452393, "rewards/rejected": -6.482028484344482, "step": 212 }, { "epoch": 0.9104995992519369, "grad_norm": 13.671352372902698, "learning_rate": 2.2425068782375378e-08, "logits/chosen": -0.708694577217102, "logits/rejected": -0.7406002283096313, "logps/chosen": -631.50537109375, "logps/rejected": -812.782958984375, "loss": 0.307, "rewards/accuracies": 0.875, "rewards/chosen": -4.614077091217041, "rewards/margins": 1.823103427886963, "rewards/rejected": -6.4371795654296875, "step": 213 }, { "epoch": 0.9147742452578146, "grad_norm": 14.646295067927662, "learning_rate": 2.025351319275137e-08, "logits/chosen": -0.6601104736328125, "logits/rejected": -0.6835007667541504, "logps/chosen": -590.8656005859375, "logps/rejected": -773.0665283203125, "loss": 0.3485, "rewards/accuracies": 0.84375, "rewards/chosen": -4.47109317779541, "rewards/margins": 1.788784384727478, "rewards/rejected": -6.259877681732178, "step": 214 }, { "epoch": 0.9190488912636923, "grad_norm": 13.735401688183746, "learning_rate": 1.8190352989793322e-08, "logits/chosen": -0.6832427978515625, "logits/rejected": -0.7862576246261597, "logps/chosen": -725.4074096679688, "logps/rejected": -984.6351318359375, "loss": 0.3231, "rewards/accuracies": 0.9375, "rewards/chosen": -5.303395748138428, "rewards/margins": 2.5184988975524902, "rewards/rejected": -7.821893692016602, "step": 215 }, { "epoch": 0.9233235372695698, "grad_norm": 15.601926902374625, "learning_rate": 1.623605433021985e-08, "logits/chosen": -0.7613773345947266, "logits/rejected": -0.8268823623657227, "logps/chosen": -661.9906005859375, "logps/rejected": -919.4451904296875, "loss": 0.3372, "rewards/accuracies": 0.84375, "rewards/chosen": -4.652050018310547, "rewards/margins": 2.513514995574951, "rewards/rejected": -7.1655659675598145, "step": 216 }, { "epoch": 0.9275981832754475, "grad_norm": 16.26278938580055, "learning_rate": 1.4391058774239629e-08, "logits/chosen": -0.6215861439704895, "logits/rejected": -0.684840977191925, "logps/chosen": -776.4395751953125, "logps/rejected": -996.853271484375, "loss": 0.326, "rewards/accuracies": 0.84375, "rewards/chosen": -5.608438491821289, "rewards/margins": 1.9786306619644165, "rewards/rejected": -7.587069511413574, "step": 217 }, { "epoch": 0.9318728292813251, "grad_norm": 14.96580166748689, "learning_rate": 1.2655783185784252e-08, "logits/chosen": -0.5001079440116882, "logits/rejected": -0.5907378792762756, "logps/chosen": -636.527099609375, "logps/rejected": -864.0882568359375, "loss": 0.3191, "rewards/accuracies": 0.90625, "rewards/chosen": -4.651758670806885, "rewards/margins": 2.046351432800293, "rewards/rejected": -6.698110103607178, "step": 218 }, { "epoch": 0.9361474752872028, "grad_norm": 16.265287798595743, "learning_rate": 1.1030619638320804e-08, "logits/chosen": -0.7028571963310242, "logits/rejected": -0.7375434041023254, "logps/chosen": -659.0259399414062, "logps/rejected": -862.6585693359375, "loss": 0.4146, "rewards/accuracies": 0.84375, "rewards/chosen": -4.674835205078125, "rewards/margins": 1.9501476287841797, "rewards/rejected": -6.624982833862305, "step": 219 }, { "epoch": 0.9404221212930804, "grad_norm": 18.704093009678243, "learning_rate": 9.515935326265378e-09, "logits/chosen": -0.6694950461387634, "logits/rejected": -0.7062525749206543, "logps/chosen": -723.8511962890625, "logps/rejected": -943.264892578125, "loss": 0.3529, "rewards/accuracies": 0.78125, "rewards/chosen": -5.229786396026611, "rewards/margins": 2.252092123031616, "rewards/rejected": -7.481878757476807, "step": 220 }, { "epoch": 0.944696767298958, "grad_norm": 14.334384768178085, "learning_rate": 8.11207248201834e-09, "logits/chosen": -0.5764753818511963, "logits/rejected": -0.5958765745162964, "logps/chosen": -652.0119018554688, "logps/rejected": -832.4238891601562, "loss": 0.3477, "rewards/accuracies": 0.8125, "rewards/chosen": -4.68829870223999, "rewards/margins": 1.8251862525939941, "rewards/rejected": -6.513484001159668, "step": 221 }, { "epoch": 0.9489714133048357, "grad_norm": 13.331601354010285, "learning_rate": 6.819348298638839e-09, "logits/chosen": -0.5833301544189453, "logits/rejected": -0.5820556879043579, "logps/chosen": -636.1702270507812, "logps/rejected": -775.4622192382812, "loss": 0.3313, "rewards/accuracies": 0.875, "rewards/chosen": -4.623978137969971, "rewards/margins": 1.5225858688354492, "rewards/rejected": -6.14656400680542, "step": 222 }, { "epoch": 0.9532460593107134, "grad_norm": 14.100864285899743, "learning_rate": 5.638054858177643e-09, "logits/chosen": -0.6460739374160767, "logits/rejected": -0.704484760761261, "logps/chosen": -666.4550170898438, "logps/rejected": -879.8217163085938, "loss": 0.3125, "rewards/accuracies": 0.875, "rewards/chosen": -4.722194194793701, "rewards/margins": 2.0787832736968994, "rewards/rejected": -6.8009772300720215, "step": 223 }, { "epoch": 0.957520705316591, "grad_norm": 14.340696291340606, "learning_rate": 4.568459065683205e-09, "logits/chosen": -0.6707419753074646, "logits/rejected": -0.6990107893943787, "logps/chosen": -594.0106201171875, "logps/rejected": -786.5427856445312, "loss": 0.2979, "rewards/accuracies": 0.84375, "rewards/chosen": -4.234139442443848, "rewards/margins": 1.8917232751846313, "rewards/rejected": -6.1258625984191895, "step": 224 }, { "epoch": 0.9617953513224686, "grad_norm": 16.87207011175805, "learning_rate": 3.6108025888958447e-09, "logits/chosen": -0.6689302921295166, "logits/rejected": -0.7221664786338806, "logps/chosen": -647.2908935546875, "logps/rejected": -820.8074340820312, "loss": 0.3808, "rewards/accuracies": 0.875, "rewards/chosen": -4.4107489585876465, "rewards/margins": 1.711435317993164, "rewards/rejected": -6.1221842765808105, "step": 225 }, { "epoch": 0.9660699973283462, "grad_norm": 15.439317380628985, "learning_rate": 2.7653018036454256e-09, "logits/chosen": -0.7938471436500549, "logits/rejected": -0.8257592916488647, "logps/chosen": -685.37255859375, "logps/rejected": -856.272705078125, "loss": 0.376, "rewards/accuracies": 0.875, "rewards/chosen": -4.837329864501953, "rewards/margins": 1.7641983032226562, "rewards/rejected": -6.601528167724609, "step": 226 }, { "epoch": 0.9703446433342239, "grad_norm": 15.341084234652751, "learning_rate": 2.0321477449619096e-09, "logits/chosen": -0.6373786330223083, "logits/rejected": -0.6358063220977783, "logps/chosen": -671.7559814453125, "logps/rejected": -808.6800537109375, "loss": 0.3511, "rewards/accuracies": 0.75, "rewards/chosen": -4.954955101013184, "rewards/margins": 1.4025709629058838, "rewards/rejected": -6.357525825500488, "step": 227 }, { "epoch": 0.9746192893401016, "grad_norm": 16.08851675396365, "learning_rate": 1.4115060639128818e-09, "logits/chosen": -0.7105420827865601, "logits/rejected": -0.7698283195495605, "logps/chosen": -771.7847900390625, "logps/rejected": -1010.12255859375, "loss": 0.3947, "rewards/accuracies": 0.8125, "rewards/chosen": -5.364959716796875, "rewards/margins": 2.1707944869995117, "rewards/rejected": -7.535754203796387, "step": 228 }, { "epoch": 0.9788939353459791, "grad_norm": 16.317073121348123, "learning_rate": 9.035169901754902e-10, "logits/chosen": -0.6789236664772034, "logits/rejected": -0.718908429145813, "logps/chosen": -675.6837158203125, "logps/rejected": -947.82958984375, "loss": 0.3487, "rewards/accuracies": 0.90625, "rewards/chosen": -4.9360833168029785, "rewards/margins": 2.77742600440979, "rewards/rejected": -7.713509559631348, "step": 229 }, { "epoch": 0.9831685813518568, "grad_norm": 14.913455884524495, "learning_rate": 5.082953003528456e-10, "logits/chosen": -0.7133156061172485, "logits/rejected": -0.7598533630371094, "logps/chosen": -662.4967651367188, "logps/rejected": -898.2874145507812, "loss": 0.3301, "rewards/accuracies": 0.875, "rewards/chosen": -4.673389911651611, "rewards/margins": 2.1788995265960693, "rewards/rejected": -6.852289199829102, "step": 230 }, { "epoch": 0.9874432273577345, "grad_norm": 13.34444116932449, "learning_rate": 2.2593029204076574e-10, "logits/chosen": -0.7370655536651611, "logits/rejected": -0.7865728139877319, "logps/chosen": -603.37109375, "logps/rejected": -828.8916015625, "loss": 0.3555, "rewards/accuracies": 0.90625, "rewards/chosen": -4.3032660484313965, "rewards/margins": 2.1432766914367676, "rewards/rejected": -6.446542739868164, "step": 231 }, { "epoch": 0.9917178733636121, "grad_norm": 14.986063948498115, "learning_rate": 5.648576365169244e-11, "logits/chosen": -0.7017238140106201, "logits/rejected": -0.7418109178543091, "logps/chosen": -690.6285400390625, "logps/rejected": -863.64208984375, "loss": 0.352, "rewards/accuracies": 0.90625, "rewards/chosen": -5.160223960876465, "rewards/margins": 1.6970188617706299, "rewards/rejected": -6.857243061065674, "step": 232 }, { "epoch": 0.9959925193694897, "grad_norm": 14.725841841963547, "learning_rate": 0.0, "logits/chosen": -0.540886402130127, "logits/rejected": -0.5910319089889526, "logps/chosen": -539.7564697265625, "logps/rejected": -689.2047119140625, "loss": 0.3329, "rewards/accuracies": 0.8125, "rewards/chosen": -4.104672431945801, "rewards/margins": 1.427870750427246, "rewards/rejected": -5.532542705535889, "step": 233 }, { "epoch": 0.9959925193694897, "step": 233, "total_flos": 0.0, "train_loss": 0.45488236134655996, "train_runtime": 10577.9564, "train_samples_per_second": 5.66, "train_steps_per_second": 0.022 } ], "logging_steps": 1, "max_steps": 233, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }